Example #1
    def __init__(self,
                 fileobj,
                 no_record_parse=False,
                 verify_http=False,
                 arc2warc=False,
                 ensure_http_headers=False,
                 block_size=BUFF_SIZE):

        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        self.member_info = None
        self.no_record_parse = no_record_parse
        self.ensure_http_headers = ensure_http_headers

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()
        self.next_line = None

        self.err_count = 0
        self.record = None

        self.the_iter = self._iterate_records()
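This snippet is only the constructor of warcio's ArchiveIterator (the full class appears in the final example below). For context, a minimal usage sketch, assuming a local file named example.warc.gz:

from warcio.archiveiterator import ArchiveIterator

with open('example.warc.gz', 'rb') as stream:
    for record in ArchiveIterator(stream):
        # rec_headers holds the parsed WARC (or ARC) headers of each record
        if record.rec_type == 'response':
            print(record.rec_headers.get_header('WARC-Target-URI'))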
Example #2
def get_rendered_original_stream(warc_filename,
                                 warc_offset,
                                 compressedendoffset,
                                 payload_only=True):
    """
    Grabs a resource.
    """
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset and int(compressedendoffset) > 0:
        url = "%s&length=%s" % (url, compressedendoffset)
    r = requests.get(url, stream=True)
    # We handle decoding etc.
    r.raw.decode_content = False
    logger.debug("Loading from: %s" % r.url)
    logger.debug("Got status code %s" % r.status_code)
    # Return the payload, or the record:
    if payload_only:
        # Parse the WARC, return the payload:
        rl = ArcWarcRecordLoader()
        record = rl.parse_record_stream(
            DecompressingBufferedReader(stream=r.raw))
        #return record.raw_stream, record.content_type
        return record.content_stream(), record.content_type
    else:
        # This makes sure we only get the first GZip chunk:
        s = DecompressingBufferedReader(stream=r.raw)
        warc_record = s.read()
        return warc_record, 'application/warc'
Example #3
def test_s3_read_2():
    pytest.importorskip('boto3')

    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')

    buff = res.read()
    assert len(buff) == 2082

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'<!DOCTYPE html>\n'
Example #4
def test_compress_invalid():
    result = compress('ABCDEFG' * 1)
    # cut-off part of the block
    result = result[:-2] + b'xyz'

    x = DecompressingBufferedReader(BytesIO(result), block_size=16)
    b = x.read(3)
    assert b == b'ABC'

    assert b'DE' == x.read()
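The compress helper used here (and in the gzip mixing example below) is not shown in these snippets; presumably it gzip-compresses a string, along these lines (a sketch, assuming the standard zlib gzip container):

import zlib

def compress(buff):
    # wbits = MAX_WBITS + 16 selects the gzip container; 9 = max compression
    compressobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
    compressed = compressobj.compress(buff.encode('utf-8'))
    compressed += compressobj.flush()
    return compressed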
Example #5
def test_s3_read_1():
    pytest.importorskip('boto3')

    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                             offset=53235662,
                             length=2526)

    buff = res.read()
    assert len(buff) == 2526

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'WARC/1.0\r\n'
    assert reader.readline() == b'WARC-Type: response\r\n'
Example #6
    def test_record_video_metadata(self):
        pytest.importorskip('youtube_dl')
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        params = {'param.recorder.user': '******',
                  'param.recorder.coll': 'VIDEO',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        resp = self._test_warc_write(recorder_app,
            'www.youtube.com', '/v/BfBgWtAIbRc', '&' + urlencode(params),
            link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('USER:VIDEO:warc')
        assert len(warcs) == 1

        filename = list(warcs.values())[0]

        with open(filename, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(decomp)

        status_headers = record.rec_headers
        assert status_headers.get_header('WARC-Type') == 'metadata'
        assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
Example #7
def fetch_warc_record(capture, warc_download_prefix):
    for field in ('url', 'filename', 'offset', 'length'):
        if field not in capture:  # pragma: no cover
            raise ValueError('capture must contain '+field)

    url = capture['url']
    filename = capture['filename']
    offset = int(capture['offset'])
    length = int(capture['length'])

    warc_url = warc_download_prefix + '/' + filename
    headers = {'Range': 'bytes={}-{}'.format(offset, offset+length-1)}

    resp = myrequests_get(warc_url, headers=headers)
    record_bytes = resp.content
    stream = DecompressingBufferedReader(BytesIO(record_bytes))
    record = ArcWarcRecordLoader().parse_record_stream(stream)

    for header in ('WARC-Source-URI', 'WARC-Source-Range'):
        if record.rec_headers.get_header(header):  # pragma: no cover
            print('Surprised that {} was already set in this WARC record'.format(header), file=sys.stderr)

    warc_target_uri = record.rec_headers.get_header('WARC-Target-URI')
    if url != warc_target_uri:  # pragma: no cover
        print('Surprised that WARC-Target-URI {} is not the capture url {}'.format(warc_target_uri, url), file=sys.stderr)

    record.rec_headers.replace_header('WARC-Source-URI', warc_url)
    record.rec_headers.replace_header('WARC-Source-Range', 'bytes={}-{}'.format(offset, offset+length-1))
    return record
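A possible call site for fetch_warc_record, assuming a capture dict as returned by a CDX index lookup (all values here are hypothetical):

capture = {
    'url': 'http://example.com/',
    'filename': 'crawl-data/example/example.warc.gz',  # hypothetical path
    'offset': '53235662',
    'length': '2526',
}
record = fetch_warc_record(capture, 'https://data.commoncrawl.org')
print(record.rec_headers.get_header('WARC-Source-URI'))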
Example #8
def test_brotli():
    brotli_buff = b'[\xff\xaf\x02\xc0"y\\\xfbZ\x8cB;\xf4%U\x19Z\x92\x99\xb15\xc8\x19\x9e\x9e\n{K\x90\xb9<\x98\xc8\t@\xf3\xe6\xd9M\xe4me\x1b\'\x87\x13_\xa6\xe90\x96{<\x15\xd8S\x1c'

    with closing(
            DecompressingBufferedReader(BytesIO(brotli_buff),
                                        decomp_type='br')) as x:
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
Example #9
    def test_record_custom_record(self):
        dedup_index = self._get_dedup_index(user=False)

        warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'

        buff = b'Some Data'

        testapp = webtest.TestApp(recorder_app)
        headers = {'content-type': 'text/plain', 'WARC-Custom': 'foo'}

        resp = testapp.put(req_url, headers=headers, params=buff)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        r = FakeStrictRedis.from_url('redis://localhost/2')

        warcs = r.hgetall('META:warc')
        assert len(warcs) == 1

        warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

        with open(warcs[warc_key], 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            record = ArcWarcRecordLoader().parse_record_stream(
                decomp, ensure_http_headers=True)

        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header(
            'WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        assert status_headers.get_header(
            'WARC-Block-Digest') == status_headers.get_header(
                'WARC-Payload-Digest')
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == buff

        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2

        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

        writer.close()
        assert len(writer.fh_cache) == 0
Example #10
    def test_generate_response_gzip(self):
        writer = FixedTestWARCWriter(gzip=True)

        record = self._sample_response(writer)

        writer.write_record(record)

        gzip_buff = writer.get_contents()

        self._validate_record_content_len(BytesIO(gzip_buff))

        stream = writer.get_stream()
        stream = DecompressingBufferedReader(stream)

        buff = stream.read()
        assert len(buff) > len(gzip_buff)

        assert buff.decode('utf-8') == RESPONSE_RECORD
Example #11
    def test_record_param_user_coll_revisit(self):
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()

        recorder_app = RecorderApp(
            self.upstream_url,
            PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

        self._test_all_warcs('/warcs/USER/COLL/', 1)

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org', '/user-agent',
            '&param.recorder.user=USER&param.recorder.coll=COLL')

        assert '"user-agent": "{0}"'.format(UA) in resp.text
        #assert b'HTTP/1.1 200 OK' in resp.body
        #assert b'"foo": "bar"' in resp.body

        self._test_all_warcs('/warcs/USER/COLL/', 2)

        # Test Redis CDX
        r = FakeStrictRedis.from_url('redis://localhost/2')

        res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/',
                            '(org,httpbin,')
        assert len(res) == 2

        if b'warc/revisit' in res[0]:
            cdx = CDXObject(res[0])
        else:
            cdx = CDXObject(res[1])

        assert cdx['urlkey'] == 'org,httpbin)/user-agent'
        assert cdx['mime'] == 'warc/revisit'
        assert cdx['offset'] == '0'
        assert cdx['filename'].startswith(to_path('USER/COLL/'))
        assert cdx['filename'].endswith('.warc.gz')

        fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

        warcs = r.hgetall('USER:COLL:warc')
        assert len(warcs) == 2
        assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode(
            'utf-8')

        with open(fullwarc, 'rb') as fh:
            decomp = DecompressingBufferedReader(fh)
            # Test refers-to headers
            status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
            assert status_headers.get_header('WARC-Type') == 'revisit'
            assert status_headers.get_header(
                'WARC-Target-URI') == 'http://httpbin.org/user-agent'
            assert status_headers.get_header('WARC-Date') != ''
            assert status_headers.get_header(
                'WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
            assert status_headers.get_header('WARC-Refers-To-Date') != ''
Example #12
    def test_read_from_stream_no_content_length(self, record_sampler, is_gzip,
                                                builder_factory):
        writer = FixedTestWARCWriter(gzip=is_gzip)
        builder = builder_factory(writer)

        record_maker, record_string = record_sampler
        full_record = record_maker(builder)

        stream = BytesIO()
        record_no_cl = self._conv_to_streaming_record(record_string,
                                                      full_record.rec_type)

        if is_gzip:
            gzip_stream = GzippingWrapper(stream)
            gzip_stream.write(record_no_cl.encode('utf-8'))
            gzip_stream.flush()
        else:
            stream.write(record_no_cl.encode('utf-8'))

        # parse to verify http headers + payload matches sample record
        # but not rec headers (missing content-length)
        stream.seek(0)
        parsed_record = ArcWarcRecordLoader().parse_record_stream(
            DecompressingBufferedReader(stream))

        if 'Content-Disposition' not in record_string:
            assert full_record.http_headers == parsed_record.http_headers
        assert full_record.raw_stream.read() == parsed_record.raw_stream.read()
        assert full_record.rec_headers != parsed_record.rec_headers

        # parse and write
        stream.seek(0)
        parsed_record = ArcWarcRecordLoader().parse_record_stream(
            DecompressingBufferedReader(stream))

        writer.write_record(parsed_record)

        stream = DecompressingBufferedReader(writer.get_stream())
        buff = stream.read()

        # assert written record matches expected response record
        # with content-length, digests computed
        assert buff.decode('utf-8') == record_string
Example #13
def test_brotli_very_small_chunk():
    brotli_buff = b'[\xff\xaf\x02\xc0"y\\\xfbZ\x8cB;\xf4%U\x19Z\x92\x99\xb15\xc8\x19\x9e\x9e\n{K\x90\xb9<\x98\xc8\t@\xf3\xe6\xd9M\xe4me\x1b\'\x87\x13_\xa6\xe90\x96{<\x15\xd8S\x1c'

    # read 3 bytes at a time; read() must be called multiple times before the decompressor has enough data to return anything
    with closing(
            DecompressingBufferedReader(BytesIO(brotli_buff),
                                        decomp_type='br',
                                        block_size=3)) as x:
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
Example #14
def lookupRecord(url):
    """ Look up URL in database. """
    try:
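        # urlmap: assumed global mapping of url -> (filename, offset, length)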
        filename, offset, length = urlmap[url]
        with open(filename, 'rb') as stream:
            stream.seek(offset, 0)
            buf = BytesIO(stream.read(length))
            loader = ArcWarcRecordLoader()
            return loader.parse_record_stream(DecompressingBufferedReader(buf))
    except KeyError:
        return None
Example #15
    def decompress_and_recompress(self, stream, output):
        with tempfile.TemporaryFile() as tout:
            decomp = DecompressingBufferedReader(stream)

            # decompress entire file to temp file
            stream.seek(0)
            shutil.copyfileobj(decomp, tout)

            # attempt to compress and write temp
            tout.seek(0)
            self.load_and_write(tout, output)
Example #16
def test_compress_mix():
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'),
                                    decomp_type='gzip')
    b = x.read()
    assert b == b'ABC'
    x.read_next_member()
    assert x.read() == b'123'
Example #17
    def test_generate_record(self, record_sampler, is_gzip, builder_factory):
        writer = FixedTestWARCWriter(gzip=is_gzip)

        builder = builder_factory(writer)
        record_maker, record_string = record_sampler
        record = record_maker(builder)

        writer.write_record(record)

        raw_buff = writer.get_contents()

        self._validate_record_content_len(BytesIO(raw_buff))

        stream = DecompressingBufferedReader(writer.get_stream())

        buff = stream.read()

        if is_gzip:
            assert len(buff) > len(raw_buff)
        else:
            assert len(buff) == len(raw_buff)

        assert buff.decode('utf-8') == record_string

        # assert parsing record matches as well
        stream = DecompressingBufferedReader(writer.get_stream())
        parsed_record = ArcWarcRecordLoader().parse_record_stream(stream)
        writer2 = FixedTestWARCWriter(gzip=False)
        writer2.write_record(parsed_record)
        assert writer2.get_contents().decode('utf-8') == record_string

        # verify parts of record
        stream = DecompressingBufferedReader(writer.get_stream())
        parsed_record = ArcWarcRecordLoader().parse_record_stream(stream)

        content_buff = parsed_record.content_stream().read().decode('utf-8')
        assert content_buff in record_string

        rec_type = parsed_record.rec_type

        # verify http_headers

        # match original
        assert record.http_headers == parsed_record.http_headers

        if parsed_record.http_headers:
            assert rec_type in ('response', 'request', 'revisit')
        else:
            # empty revisit
            if rec_type == 'revisit':
                assert len(content_buff) == 0
            else:
                assert len(content_buff) == parsed_record.length
Example #18
def get_rendered_original_stream(warc_filename, warc_offset,
                                 compressedendoffset):
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)
    r = requests.get(url, stream=True)
    logger.debug("Loading from: %s" % r.url)
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(DecompressingBufferedReader(stream=r.raw))

    return record.raw_stream, record.content_type
Example #19
    def load(self, url, offset, length, no_record_parse=False):
        """ Load a single record from given url at offset with length
        and parse as either warc or arc record
        """
        try:
            length = int(length)
        except (ValueError, TypeError):
            length = -1

        stream = self.loader.load(url, int(offset), length)
        decomp_type = 'gzip'

        # Create decompressing stream
        stream = DecompressingBufferedReader(stream=stream,
                                             decomp_type=decomp_type,
                                             block_size=self.block_size)

        return self.parse_record_stream(stream,
                                        no_record_parse=no_record_parse)
Example #20
    def test_warcinfo_record(self, is_gzip):
        writer = FixedTestWARCWriter(gzip=is_gzip)

        record = sample_warcinfo(writer)

        writer.write_record(record)
        reader = DecompressingBufferedReader(writer.get_stream())

        parsed_record = ArcWarcRecordLoader().parse_record_stream(reader)

        assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
        assert parsed_record.rec_headers.get_header(
            'Content-Type') == 'application/warc-fields'
        assert parsed_record.rec_headers.get_header(
            'WARC-Filename') == 'testfile.warc.gz'

        buff = parsed_record.content_stream().read().decode('utf-8')

        assert 'json-metadata: {"foo": "bar"}\r\n' in buff
        assert 'format: WARC File Format 1.0\r\n' in buff
Example #21
    def test_warcinfo_record(self, is_gzip, builder_factory):
        writer = FixedTestWARCWriter(gzip=is_gzip)
        builder = builder_factory(writer)

        record = sample_warcinfo(builder)

        writer.write_record(record)
        reader = DecompressingBufferedReader(writer.get_stream())

        parsed_record = ArcWarcRecordLoader().parse_record_stream(reader)

        assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
        assert parsed_record.rec_headers.get_header(
            'Content-Type') == 'application/warc-fields'
        assert parsed_record.rec_headers.get_header(
            'WARC-Filename') == 'testfile.warc.gz'
        assert parsed_record.rec_headers.get_header(
            'WARC-Block-Digest') == 'sha1:GAD6P5BTZPRU57ICXEYUJZGCURZYABID'

        buff = parsed_record.content_stream().read().decode('utf-8')

        assert 'json-metadata: {"foo": "bar"}\r\n' in buff
        assert 'format: WARC File Format 1.0\r\n' in buff
Example #22
class ArchiveIterator(six.Iterator):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.

    """

    GZIP_ERR_MSG = """
    ERROR: non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-member gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip member and concatenated together.

    This file is likely still valid and can be fixed by running:

    warcio recompress <path/to/file> <path/to/new_file>

"""

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
"""

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False,
                 ensure_http_headers=False, block_size=BUFF_SIZE):

        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        self.member_info = None
        self.no_record_parse = no_record_parse
        self.ensure_http_headers = ensure_http_headers

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()
        self.next_line = None

        self.err_count = 0

        self.the_iter = self._iterate_records()

    def __iter__(self):
        return self.the_iter

    def __next__(self):
        return six.next(self.the_iter)

    def _iterate_records(self):
        """ iterate over each record
        """
        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise ArchiveLoadFailed(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

          count empty_size so that it can be subtracted from
          the record length for uncompressed

          if first line read is not blank, likely error in WARC/ARC,
          display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))
                    self.err_count += 1

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        curr_offset = self.offset

        while True:
            b = record.raw_stream.read(BUFF_SIZE)
            if not b:
                break

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse,
                                                 self.ensure_http_headers)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record
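A sketch of how the error path above surfaces in practice, assuming a WARC that was gzipped as one single stream rather than one gzip member per record (filename hypothetical):

from warcio.archiveiterator import ArchiveIterator
from warcio.exceptions import ArchiveLoadFailed

with open('single-stream.warc.gz', 'rb') as stream:
    try:
        for record in ArchiveIterator(stream):
            record.content_stream().read()
    except ArchiveLoadFailed:
        # non-chunked gzip detected; repairable via:
        #   warcio recompress single-stream.warc.gz fixed.warc.gz
        raise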