Example 1
def test_err_compress_mix():
    # error: compressed member, followed by not compressed -- considered invalid
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type='gzip')
    b = x.read()
    assert b == b'ABC'
    x.read_next_member()
    with pytest.raises(zlib.error):
        x.read()
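
The `compress` helper used in this test is not shown in the snippet. A minimal sketch that would produce the single gzip member the test expects (an assumption; the project's real helper may differ):

import zlib

def compress(buff):
    # hypothetical helper: gzip-compress a str into one gzip member
    # (wbits = MAX_WBITS + 16 selects the gzip container format)
    data = buff.encode('utf-8')
    compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
    return compressor.compress(data) + compressor.flush()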
Example 2
    def iter_records(self, block_size=16384):
        """ iterate over each record
        """

        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        is_valid = True

        while True:
            try:
                record = self._next_record(self.next_line)
                if not is_valid:
                    self._raise_err()

                yield record
            except EOFError:
                break

            self.read_to_end(record)

            if self.reader.decompressor:
                is_valid = self.reader.read_next_member()
Example 3
def test_s3_read_1():
    pytest.importorskip('boto')

    res = BlockLoader().load('s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                             offset=53235662,
                             length=2526)

    buff = res.read()
    assert len(buff) == 2526

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'WARC/1.0\r\n'
    assert reader.readline() == b'WARC-Type: response\r\n'
Example 4
def test_s3_read_1():
    pytest.importorskip('boto')

    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                             offset=53235662,
                             length=2526)

    buff = res.read()
    assert len(buff) == 2526

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'WARC/1.0\r\n'
    assert reader.readline() == b'WARC-Type: response\r\n'
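
Both S3 tests fetch a raw byte range covering exactly one gzip member (one WARC record); such offset/length pairs typically come from a CDX index. A hedged local stand-in for the same pattern, using an ordinary file instead of BlockLoader (the helper name and path are hypothetical):

from io import BytesIO

def load_record_range(path, offset, length):
    # read the still-compressed byte range covering a single record,
    # then wrap it for decompression, mirroring the tests above
    with open(path, 'rb') as fh:
        fh.seek(offset)
        buff = fh.read(length)
    return DecompressingBufferedReader(BytesIO(buff))

# reader = load_record_range('example.warc.gz', 53235662, 2526)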
Example 5
    def __call__(self, block_size=16384):
        """ iterate over each record
        """

        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                curr_offset = self.fh.tell()
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break
Example 6
    def iter_records(self, block_size=16384):
        """ iterate over each record
        """

        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        is_valid = True

        while True:
            try:
                record = self._next_record(self.next_line)
                if not is_valid:
                    self._raise_err()

                yield record
            except EOFError:
                break

            self.read_to_end(record)

            if self.reader.decompressor:
                is_valid = self.reader.read_next_member()
Example 7
    def iter_records(self):
        """ iterate over each record
        """

        decomp_type = 'gzip'
        block_size = 16384

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        next_line = None

        while True:
            try:
                record = self._next_record(next_line)
                yield record
            except EOFError:
                break

            self.read_to_end(record)

            # for non-compressed, consume blank lines here
            if not self.reader.decompressor:
                next_line = self._consume_blanklines()
                if next_line is None:
                    # at end of file
                    break

            # reset reader for next member
            else:
                self.reader.read_next_member()
Example 8
def test_err_compress_mix():
    # error: compressed member, followed by not compressed -- considered invalid
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type='gzip')
    b = x.read()
    assert b == b'ABC'
    x.read_next_member()
    assert x.read() == b''
Example 9
    def _check_encoding(self, rewritten_headers, stream, enc):
        if (rewritten_headers.contains_removed_header('content-encoding',
                                                      enc)):

            #optimize: if already a ChunkedDataReader, add the encoding
            if isinstance(stream, ChunkedDataReader):
                stream.set_decomp(enc)
            else:
                stream = DecompressingBufferedReader(stream, decomp_type=enc)

            rewritten_headers.status_headers.remove_header('content-length')

        return stream
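
The helper above swaps in a decompressing wrapper once a content-encoding header has been removed. A self-contained sketch of that core step (the gzip_bytes helper is hypothetical, built on the standard zlib module):

import zlib
from io import BytesIO

def gzip_bytes(data):
    # hypothetical helper: produce a gzip-compressed body
    c = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
    return c.compress(data) + c.flush()

# once 'content-encoding: gzip' is stripped from the headers, the body
# must be transparently decompressed before any rewriting
body = BytesIO(gzip_bytes(b'<html></html>'))
stream = DecompressingBufferedReader(body, decomp_type='gzip')
assert stream.read() == b'<html></html>'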
Example 10
def extract_text(entry):
    buff_reader = entry.buffer
    if not buff_reader:
        return b''

    buff_reader.seek(0)

    if entry.record.status_headers.get_header('content-encoding'):
        buff_reader = DecompressingBufferedReader(buff_reader)

    buff = b''
    while True:
        new_buff = buff_reader.read()
        if not new_buff:
            break

        buff += new_buff

    if is_binary_string(buff):
        return b''

    return buff
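
The is_binary_string helper is not defined in this snippet. A common heuristic, offered here only as a hypothetical stand-in, is to treat any buffer containing NUL bytes as binary:

def is_binary_string(buff):
    # hypothetical stand-in: data containing NUL bytes is assumed binary
    return b'\x00' in buff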
Example 11
    def iter_records(self, block_size=16384):
        """ iterate over each record
        """

        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                curr_offset = self.fh.tell()
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break
Example 12
    def load(self, url, offset, length):
        """ Load a single record from given url at offset with length
        and parse as either warc or arc record
        """
        try:
            length = int(length)
        except (ValueError, TypeError):
            length = -1

        stream = self.loader.load(url, int(offset), length)
        decomp_type = 'gzip'

        # Create decompressing stream
        stream = DecompressingBufferedReader(stream=stream,
                                             decomp_type=decomp_type,
                                             block_size=self.block_size)

        return self.parse_record_stream(stream)
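
A hedged usage sketch for this loader; the path is hypothetical, and in practice the offset/length values come from an index lookup:

# assumes 'loader' is an instance of the class defining load() above
record = loader.load('example.warc.gz', offset='53235662', length='2526')
# the parsed record is expected to expose its payload as a stream
payload = record.stream.read()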
Example 13
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset):
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)
    logger.info("Requesting copy from HDFS: %s " % url)
    r = requests.get(url, stream=True)
    logger.info("Loading from: %s" % r.url)
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    logger.info("Passing response to parser...")
    record = rl.parse_record_stream(DecompressingBufferedReader(stream=r.raw))
    logger.info("RESULT:")
    logger.info(record)

    logger.info("Returning stream...")
    return record.stream, record.content_type
Example 14
def get_rendered_original(url,
                          type='screenshot',
                          target_timestamp=30001201235900):
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the extended URIs
    i.e. 'screenshot:http://' and replaces them with 'http://screenshot:http://'
    """
    # Query URL
    qurl = "%s:%s" % (type, url)
    # Query CDX Server for the item
    #logger.info("Querying CDX for prefix...")
    warc_filename, warc_offset, compressedendoffset = lookup_in_cdx(
        qurl, target_timestamp)

    # If not found, say so:
    if warc_filename is None:
        return None

    # Grab the payload from the WARC and return it.
    WEBHDFS_PREFIX = os.environ['WEBHDFS_PREFIX']
    WEBHDFS_USER = os.environ['WEBHDFS_USER']
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)
    #logger.info("Requesting copy from HDFS: %s " % url)
    r = requests.get(url, stream=True)
    #logger.info("Loading from: %s" % r.url)
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    #logger.info("Passing response to parser...")
    record = rl.parse_record_stream(DecompressingBufferedReader(stream=r.raw))
    #logger.info("RESULT:")
    #logger.info(record)

    #logger.info("Returning stream...")
    return (record.stream, record.content_type)
Example 15
def get_rendered_original():
    """
    Grabs a rendered resource.

    Only reason Wayback can't do this is that it does not like the extended URIs
    i.e. 'screenshot:http://' and replaces them with 'http://screenshot:http://'
    """
    url = request.args.get('url')
    app.logger.debug("Got URL: %s" % url)
    #
    type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s" % type)

    # Query URL
    qurl = "%s:%s" % (type, url)
    # Query CDX Server for the item
    (warc_filename, warc_offset) = lookup_in_cdx(qurl)

    # If not found, say so:
    if warc_filename is None:
        abort(404)

    # Grab the payload from the WARC and return it.
    r = requests.get("%s%s%s?op=OPEN&user.name=%s&offset=%s" %
                     (systems().webhdfs, h3().hdfs_root_folder, warc_filename,
                      webhdfs().user, warc_offset))
    app.logger.info("Loading from: %s" % r.url)
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(
        DecompressingBufferedReader(stream=io.BytesIO(r.content)))
    print(record)
    print(record.length)
    print(record.stream.limit)

    return send_file(record.stream, mimetype=record.content_type)
Example 16
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.

    """

    def __init__(self, fileobj):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader()
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.member_info = None

    def iter_records(self):
        """ iterate over each record
        """

        decomp_type = 'gzip'
        block_size = 16384

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        next_line = None

        while True:
            try:
                record = self._next_record(next_line)
                yield record
            except EOFError:
                break

            self.read_to_end(record)

            # for non-compressed, consume blank lines here
            if not self.reader.decompressor:
                next_line = self._consume_blanklines()
                if next_line is None:
                    # at end of file
                    break

            # reset reader for next member
            else:
                self.reader.read_next_member()

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length
        """
        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None

            if not line.rstrip():  # works for both bytes and str lines
                self.offset = self.fh.tell() - self.reader.rem_length()
                continue

            return line

    def read_to_end(self, record, compute_digest=False):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """
        if self.member_info:
            return self.member_info

        if compute_digest:
            digester = hashlib.sha1()
        else:
            digester = None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if digester:
                digester.update(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        if self.reader.decompressor:
            self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        length = self.offset - curr_offset

        if compute_digest:
            digest = base64.b32encode(digester.digest())
        else:
            digest = None

        self.member_info = (curr_offset, length, digest)
        return self.member_info

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record
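
A hedged usage sketch for this variant: build a simple index by letting read_to_end() report each member's (offset, length, digest). Calling read_to_end() in the loop body sets member_info first, so the generator's own call then returns early. The filename and the rec_headers attribute are assumptions:

with open('example.warc.gz', 'rb') as fh:
    it = ArchiveIterator(fh)
    for record in it.iter_records():
        offset, length, digest = it.read_to_end(record, compute_digest=True)
        uri = record.rec_headers.get_header('WARC-Target-URI')
        print(uri, offset, length, digest)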
Example 17
    def rewrite_content(self, urlrewriter, headers, stream,
                        head_insert_func=None, urlkey='',
                        cdx=None):

        wb_url = urlrewriter.wburl

        if (wb_url.is_identity or
             (not head_insert_func and wb_url.is_banner_only)):
            status_headers, stream = self.sanitize_content(headers, stream)
            return (status_headers, self.stream_to_gen(stream), False)

        if wb_url.is_banner_only:
            urlrewriter = None

        rule = self.ruleset.get_first_match(urlkey)

        (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                            rule,
                                                            headers,
                                                            stream)

        status_headers = rewritten_headers.status_headers

        # use rewritten headers, but no further rewriting needed
        if rewritten_headers.text_type is None:
            return (status_headers, self.stream_to_gen(stream), False)

        # Handle text content rewriting
        # ====================================================================
        # special case -- need to ungzip the body

        text_type = rewritten_headers.text_type

        # if a known js/css modifier is specified, resolve the
        # text_type for that context
        mod = wb_url.mod

        stream_raw = False
        encoding = None
        first_buff = ''

        if (rewritten_headers.contains_removed_header('content-encoding',
                                                      'gzip')):

            #optimize: if already a ChunkedDataReader, add gzip
            if isinstance(stream, ChunkedDataReader):
                stream.set_decomp('gzip')
            else:
                stream = DecompressingBufferedReader(stream)

        if mod == 'js_':
            text_type, stream = self._resolve_text_type('js',
                                                        text_type,
                                                        stream)
        elif mod == 'cs_':
            text_type, stream = self._resolve_text_type('css',
                                                        text_type,
                                                        stream)

        rewriter_class = rule.rewriters[text_type]

        # for html, need to perform header insert, supply js, css, xml
        # rewriters
        if text_type == 'html':
            head_insert_str = ''
            charset = rewritten_headers.charset

            # if no charset set, attempt to extract from first 1024
            if not rewritten_headers.charset:
                first_buff = stream.read(1024)
                charset = self._extract_html_charset(first_buff,
                                                     status_headers)

            if head_insert_func:
                if not charset:
                    charset = 'utf-8'
                head_insert_str = head_insert_func(rule, cdx)
                head_insert_str = head_insert_str.encode(charset)

            if wb_url.is_banner_only:
                gen = self._head_insert_only_gen(head_insert_str,
                                                 stream,
                                                 first_buff)

                content_len = headers.get_header('Content-Length')
                try:
                    content_len = int(content_len)
                except Exception:
                    content_len = None

                if content_len and content_len >= 0:
                    content_len = str(content_len + len(head_insert_str))
                    status_headers.replace_header('Content-Length',
                                                  content_len)

                return (status_headers, gen, False)

            rewriter = rewriter_class(urlrewriter,
                                      js_rewriter_class=rule.rewriters['js'],
                                      css_rewriter_class=rule.rewriters['css'],
                                      head_insert=head_insert_str,
                                      defmod=self.defmod,
                                      parse_comments=rule.parse_comments)

        else:
            if wb_url.is_banner_only:
                return (status_headers, self.stream_to_gen(stream), False)

            # apply one of (js, css, xml) rewriters
            rewriter = rewriter_class(urlrewriter)

        # align to line end for all non-html rewriting
        align = (text_type != 'html')

        # Create rewriting generator
        gen = self.rewrite_text_stream_to_gen(stream,
                                              rewrite_func=rewriter.rewrite,
                                              final_read_func=rewriter.close,
                                              first_buff=first_buff,
                                              align_to_line=align)

        return (status_headers, gen, True)
Example 18
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.

    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
"""

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
"""


    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http)
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.member_info = None
        self.no_record_parse = no_record_parse

    def iter_records(self, block_size=16384):
        """ iterate over each record
        """

        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                curr_offset = self.fh.tell()
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

          count empty_size so that it can be subtracted from
          the record length for uncompressed

          if first line read is not blank, likely error in WARC/ARC,
          display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()

            if not stripped or first_line:
                empty_size += len(line)

                if stripped:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record
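
This variant replaces compute_digest with a payload_callback hook. A hedged sketch that streams each record payload through a SHA-1 digester without buffering it (the filename is hypothetical):

import hashlib

with open('example.warc.gz', 'rb') as fh:
    it = ArchiveIterator(fh)
    for record in it.iter_records():
        digester = hashlib.sha1()
        it.read_to_end(record, payload_callback=digester.update)
        offset, length = it.member_info
        print(offset, length, digester.hexdigest())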
Example 19
    def rewrite_content(self,
                        urlrewriter,
                        headers,
                        stream,
                        head_insert_func=None,
                        urlkey='',
                        cdx=None):

        wb_url = urlrewriter.wburl

        if (wb_url.is_identity
                or (not head_insert_func and wb_url.is_banner_only)):
            status_headers, stream = self.sanitize_content(headers, stream)
            return (status_headers, self.stream_to_gen(stream), False)

        if wb_url.is_banner_only:
            urlrewriter = None

        rule = self.ruleset.get_first_match(urlkey)

        (rewritten_headers,
         stream) = self._rewrite_headers(urlrewriter, rule, headers, stream)

        status_headers = rewritten_headers.status_headers

        # use rewritten headers, but no further rewriting needed
        if rewritten_headers.text_type is None:
            return (status_headers, self.stream_to_gen(stream), False)

        # Handle text content rewriting
        # ====================================================================
        # special case -- need to ungzip the body

        text_type = rewritten_headers.text_type

        # if a known js/css modifier is specified, resolve the
        # text_type for that context
        mod = wb_url.mod

        stream_raw = False
        encoding = None
        first_buff = None

        if (rewritten_headers.contains_removed_header('content-encoding',
                                                      'gzip')):

            #optimize: if already a ChunkedDataReader, add gzip
            if isinstance(stream, ChunkedDataReader):
                stream.set_decomp('gzip')
            else:
                stream = DecompressingBufferedReader(stream)

        if mod == 'js_':
            text_type, stream = self._resolve_text_type(
                'js', text_type, stream)
        elif mod == 'cs_':
            text_type, stream = self._resolve_text_type(
                'css', text_type, stream)

        rewriter_class = rule.rewriters[text_type]

        # for html, need to perform header insert, supply js, css, xml
        # rewriters
        if text_type == 'html':
            head_insert_str = ''

            if head_insert_func:
                head_insert_str = head_insert_func(rule, cdx)
                head_insert_str = head_insert_str.encode('utf-8')

            if wb_url.is_banner_only:
                gen = self._head_insert_only_gen(head_insert_str, stream)

                content_len = headers.get_header('Content-Length')
                try:
                    content_len = int(content_len)
                except Exception:
                    content_len = None

                if content_len and content_len >= 0:
                    content_len = str(content_len + len(head_insert_str))
                    status_headers.replace_header('Content-Length',
                                                  content_len)

                return (status_headers, gen, False)

            rewriter = rewriter_class(urlrewriter,
                                      js_rewriter_class=rule.rewriters['js'],
                                      css_rewriter_class=rule.rewriters['css'],
                                      head_insert=head_insert_str,
                                      defmod=self.defmod,
                                      parse_comments=rule.parse_comments)

        else:
            if wb_url.is_banner_only:
                return (status_headers, self.stream_to_gen(stream), False)

            # apply one of (js, css, xml) rewriters
            rewriter = rewriter_class(urlrewriter)

        # Create rewriting generator
        gen = self.rewrite_text_stream_to_gen(stream,
                                              rewrite_func=rewriter.rewrite,
                                              final_read_func=rewriter.close,
                                              first_buff=first_buff)

        return (status_headers, gen, True)
Example 20
def test_brotli():
    with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
        x = DecompressingBufferedReader(fh, decomp_type='br')
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
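
A hedged sketch of how the quickfox_repeated.compressed fixture could be generated, assuming the third-party brotli package is available:

import brotli  # assumption: the 'brotli' package is installed

data = b'The quick brown fox jumps over the lazy dog' * 4096
with open('quickfox_repeated.compressed', 'wb') as fh:
    fh.write(brotli.compress(data))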
Example 21
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.

    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
"""

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
"""

    def __init__(self,
                 fileobj,
                 no_record_parse=False,
                 verify_http=False,
                 arc2warc=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        self.member_info = None
        self.no_record_parse = no_record_parse

    def __call__(self, block_size=16384):
        """ iterate over each record
        """

        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                curr_offset = self.fh.tell()
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

          count empty_size so that it can be subtracted from
          the record length for uncompressed

          if first line read is not blank, likely error in WARC/ARC,
          display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = (self.fh.tell() -
                                  self.reader.rem_length() - empty_size)
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)
        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader, next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record
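
Because this variant implements __call__, the instance itself is invoked to iterate. A hedged usage sketch (the filename and rec_type attribute are assumptions):

# the instance is called directly, since iteration is implemented via
# __call__ rather than a named method
with open('example.arc.gz', 'rb') as fh:
    for record in ArchiveIterator(fh, arc2warc=True)():
        print(record.rec_type)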
Example 22
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.

    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
    """

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http)
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.member_info = None
        self.no_record_parse = no_record_parse

    def iter_records(self, block_size=16384):
        """ iterate over each record
        """

        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        is_valid = True

        while True:
            try:
                record = self._next_record(self.next_line)
                if not is_valid:
                    self._raise_err()

                yield record
            except EOFError:
                break

            self.read_to_end(record)

            if self.reader.decompressor:
                is_valid = self.reader.read_next_member()

    def _raise_err(self):
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

          count empty_size so that it can be subtracted from
          the record length for uncompressed
        """
        empty_size = 0
        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            if not line.rstrip():  # works for both bytes and str lines
                empty_size += len(line)
                continue

            return line, empty_size

    def read_to_end(self, record, compute_digest=False):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        if compute_digest:
            digester = hashlib.sha1()
        else:
            digester = None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if digester:
                digester.update(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        if compute_digest:
            digest = base64.b32encode(digester.digest())
        else:
            digest = None

        self.member_info = (curr_offset, length, digest)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record
Example 23
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.

    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
    """

    def __init__(self, fileobj, no_record_parse=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader()
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.member_info = None
        self.no_record_parse = no_record_parse

    def iter_records(self, block_size=16384):
        """ iterate over each record
        """

        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        is_valid = True

        while True:
            try:
                record = self._next_record(self.next_line)
                if not is_valid:
                    self._raise_err()

                yield record
            except EOFError:
                break

            self.read_to_end(record)

            if self.reader.decompressor:
                is_valid = self.reader.read_next_member()

    def _raise_err(self):
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

          count empty_size so that it can be subtracted from
          the record length for uncompressed
        """
        empty_size = 0
        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            if not line.rstrip():  # works for both bytes and str lines
                empty_size += len(line)
                continue

            return line, empty_size

    def read_to_end(self, record, compute_digest=False):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        if compute_digest:
            digester = hashlib.sha1()
        else:
            digester = None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if digester:
                digester.update(b)
        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        if compute_digest:
            digest = base64.b32encode(digester.digest())
        else:
            digest = None

        self.member_info = (curr_offset, length, digest)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader, next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record