Example #1
    def iter_records(self, block_size=16384):
        """ iterate over each record
        """

        # every member of a multi-record .warc.gz / .arc.gz is a gzip member
        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  decomp_type=decomp_type,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        is_valid = True

        while True:
            try:
                record = self._next_record(self.next_line)
                # a record was parsed past an invalid member boundary on the
                # previous pass: report the error instead of mis-parsing
                if not is_valid:
                    self._raise_err()

                yield record
            except EOFError:
                break

            self.read_to_end(record)

            if self.reader.decompressor:
                # False if the following bytes are not a valid gzip member
                is_valid = self.reader.read_next_member()
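
The iterator above assumes each record is stored as its own gzip member, with read_next_member() advancing across member boundaries. A minimal sketch of that contract, assuming pywb's pywb.utils.bufferedreaders import path (the payloads are illustrative):

from io import BytesIO
import zlib

from pywb.utils.bufferedreaders import DecompressingBufferedReader


def gzip_member(data):
    # wbits=31 selects the gzip container, producing one standalone member
    compressor = zlib.compressobj(9, zlib.DEFLATED, 31)
    return compressor.compress(data) + compressor.flush()


stream = BytesIO(gzip_member(b'record-1') + gzip_member(b'record-2'))
reader = DecompressingBufferedReader(stream, decomp_type='gzip')

assert reader.read() == b'record-1'   # read() stops at the member boundary
assert reader.read_next_member()      # True: another member follows
assert reader.read() == b'record-2'
assert not reader.read_next_member()  # False: stream exhausted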
Example #2
def test_err_compress_mix():
    # error case: a compressed member followed by uncompressed data -- considered invalid
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type='gzip')
    b = x.read()
    assert b == b'ABC'
    x.read_next_member()
    assert x.read() == b''
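
Note that the trailing b'123' is never surfaced: read() returns b'' once the valid member is exhausted. The record iterators in Examples #1 and #5 rely on this member-boundary check to raise an explicit error rather than silently mis-parsing a malformed archive.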
Example #3
def test_s3_read_1():
    pytest.importorskip('boto')

    res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                             offset=53235662,
                             length=2526)

    buff = res.read()
    assert len(buff) == 2526

    reader = DecompressingBufferedReader(BytesIO(buff))
    assert reader.readline() == b'WARC/1.0\r\n'
    assert reader.readline() == b'WARC-Type: response\r\n'
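
BlockLoader hides the ranged S3 read; the same offset/length pattern can be reproduced against any byte-addressable source. A minimal sketch, assuming the same pywb import path and a hypothetical local copy of the crawl file:

from io import BytesIO

from pywb.utils.bufferedreaders import DecompressingBufferedReader

# hypothetical local file; offset and length would come from a CDX index
with open('example.warc.gz', 'rb') as fh:
    fh.seek(53235662)
    buff = fh.read(2526)

reader = DecompressingBufferedReader(BytesIO(buff))
print(reader.readline())  # e.g. b'WARC/1.0\r\n'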
Example #4
    def _check_encoding(self, rewritten_headers, stream, enc):
        if (rewritten_headers.contains_removed_header('content-encoding',
                                                      enc)):

            # optimization: if already a ChunkedDataReader, just add the encoding
            if isinstance(stream, ChunkedDataReader):
                stream.set_decomp(enc)
            else:
                stream = DecompressingBufferedReader(stream, decomp_type=enc)

            rewritten_headers.status_headers.remove_header('content-length')

        return stream
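
The optimization comment above is the key design point: if the body is already being dechunked, set_decomp() adds decompression to the existing ChunkedDataReader instead of stacking a second buffered reader on top of it. Content-Length is removed in either case, since the decompressed body no longer matches the original header.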
Example #5
    def __call__(self, block_size=16384):
        """ iterate over each record
        """

        # every member of a multi-record .warc.gz / .arc.gz is a gzip member
        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  decomp_type=decomp_type,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break
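
Compared with Example #1, this variant separates three outcomes after each record: another gzip member follows, so iteration continues; a clean EOF was reached, so the loop breaks; or a record was parsed but no new member started, which means several records were packed into a single gzip member and is reported on the next pass via _raise_invalid_gzip_err().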
Example #6
    def load(self, url, offset, length):
        """ Load a single record from given url at offset with length
        and parse as either warc or arc record
        """
        try:
            length = int(length)
        except (ValueError, TypeError):
            # no usable length given: read to the end of the stream
            length = -1

        stream = self.loader.load(url, int(offset), length)
        decomp_type = 'gzip'

        # Create decompressing stream
        stream = DecompressingBufferedReader(stream=stream,
                                             decomp_type=decomp_type,
                                             block_size=self.block_size)

        return self.parse_record_stream(stream)
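
A sketch of how this loader is typically driven, assuming the method lives on pywb's ArcWarcRecordLoader; the path and the CDX-derived offset/length are hypothetical (they arrive as strings from a CDX line, which is why the method coerces them with int()):

loader = ArcWarcRecordLoader()

record = loader.load('file://path/to/example.warc.gz',
                     offset='53235662',
                     length='2526')

print(record.content_type)  # e.g. 'application/http; msgtype=response'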
Example #7
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset):
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)
    logger.info("Requesting copy from HDFS: %s " % url)
    r = requests.get(url, stream=True)
    logger.info("Loading from: %s" % r.url)
    # keep the raw stream compressed; DecompressingBufferedReader handles the gzip below
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    logger.info("Passing response to parser...")
    record = rl.parse_record_stream(DecompressingBufferedReader(stream=r.raw))
    logger.info("RESULT:")
    logger.info(record)

    logger.info("Returning stream...")
    return record.stream, record.content_type
Example #8
def get_rendered_original(url,
                          type='screenshot',
                          target_timestamp=30001201235900):
    """
    Grabs a rendered resource.

    The only reason Wayback can't do this itself is that it does not like the
    extended URIs, i.e. 'screenshot:http://', and replaces them with
    'http://screenshot:http://'.
    """
    # Query URL
    qurl = "%s:%s" % (type, url)
    # Query CDX Server for the item
    #logger.info("Querying CDX for prefix...")
    warc_filename, warc_offset, compressedendoffset = lookup_in_cdx(
        qurl, target_timestamp)

    # If not found, say so:
    if warc_filename is None:
        return None

    # Grab the payload from the WARC and return it.
    WEBHDFS_PREFIX = os.environ['WEBHDFS_PREFIX']
    WEBHDFS_USER = os.environ['WEBHDFS_USER']
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)
    #logger.info("Requesting copy from HDFS: %s " % url)
    r = requests.get(url, stream=True)
    #logger.info("Loading from: %s" % r.url)
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    #logger.info("Passing response to parser...")
    record = rl.parse_record_stream(DecompressingBufferedReader(stream=r.raw))
    #logger.info("RESULT:")
    #logger.info(record)

    #logger.info("Returning stream...")
    return (record.stream, record.content_type)
Example #9
def extract_text(entry):
    buff_reader = entry.buffer
    if not buff_reader:
        return b''

    buff_reader.seek(0)

    # if the captured body is still content-encoded, decompress it transparently
    if entry.record.status_headers.get_header('content-encoding'):
        buff_reader = DecompressingBufferedReader(buff_reader)

    # accumulate chunks and join once, rather than repeated bytes concatenation
    chunks = []
    while True:
        new_buff = buff_reader.read()
        if not new_buff:
            break

        chunks.append(new_buff)

    buff = b''.join(chunks)

    if is_binary_string(buff):
        return b''

    return buff
Example #10
def get_rendered_original():
    """
    Grabs a rendered resource.

    The only reason Wayback can't do this itself is that it does not like the
    extended URIs, i.e. 'screenshot:http://', and replaces them with
    'http://screenshot:http://'.
    """
    url = request.args.get('url')
    app.logger.debug("Got URL: %s" % url)
    #
    type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s" % type)

    # Query URL
    qurl = "%s:%s" % (type, url)
    # Query CDX Server for the item
    (warc_filename, warc_offset) = lookup_in_cdx(qurl)

    # If not found, say so:
    if warc_filename is None:
        abort(404)

    # Grab the payload from the WARC and return it.
    r = requests.get("%s%s%s?op=OPEN&user.name=%s&offset=%s" %
                     (systems().webhdfs, h3().hdfs_root_folder, warc_filename,
                      webhdfs().user, warc_offset))
    app.logger.info("Loading from: %s" % r.url)
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(
        DecompressingBufferedReader(stream=io.BytesIO(r.content)))
    app.logger.debug(record)
    app.logger.debug(record.length)
    app.logger.debug(record.stream.limit)

    return send_file(record.stream, mimetype=record.content_type)
Example #11
    def rewrite_content(self,
                        urlrewriter,
                        headers,
                        stream,
                        head_insert_func=None,
                        urlkey='',
                        cdx=None):

        wb_url = urlrewriter.wburl

        if (wb_url.is_identity
                or (not head_insert_func and wb_url.is_banner_only)):
            status_headers, stream = self.sanitize_content(headers, stream)
            return (status_headers, self.stream_to_gen(stream), False)

        if wb_url.is_banner_only:
            urlrewriter = None

        rule = self.ruleset.get_first_match(urlkey)

        (rewritten_headers,
         stream) = self._rewrite_headers(urlrewriter, rule, headers, stream)

        status_headers = rewritten_headers.status_headers

        # use rewritten headers, but no further rewriting needed
        if rewritten_headers.text_type is None:
            return (status_headers, self.stream_to_gen(stream), False)

        # Handle text content rewriting
        # ====================================================================
        # special case -- need to ungzip the body

        text_type = rewritten_headers.text_type

        # if a known js/css modifier is specified, resolve whether the content
        # should be run through that rewriter instead of the default text_type
        mod = wb_url.mod

        first_buff = None

        if (rewritten_headers.contains_removed_header('content-encoding',
                                                      'gzip')):

            # optimization: if already a ChunkedDataReader, just add gzip decompression
            if isinstance(stream, ChunkedDataReader):
                stream.set_decomp('gzip')
            else:
                stream = DecompressingBufferedReader(stream)

        if mod == 'js_':
            text_type, stream = self._resolve_text_type(
                'js', text_type, stream)
        elif mod == 'cs_':
            text_type, stream = self._resolve_text_type(
                'css', text_type, stream)

        rewriter_class = rule.rewriters[text_type]

        # for html, need to perform header insert, supply js, css, xml
        # rewriters
        if text_type == 'html':
            head_insert_str = ''

            if head_insert_func:
                head_insert_str = head_insert_func(rule, cdx)
                head_insert_str = head_insert_str.encode('utf-8')

            if wb_url.is_banner_only:
                gen = self._head_insert_only_gen(head_insert_str, stream)

                content_len = headers.get_header('Content-Length')
                try:
                    content_len = int(content_len)
                except Exception:
                    content_len = None

                if content_len and content_len >= 0:
                    content_len = str(content_len + len(head_insert_str))
                    status_headers.replace_header('Content-Length',
                                                  content_len)

                return (status_headers, gen, False)

            rewriter = rewriter_class(urlrewriter,
                                      js_rewriter_class=rule.rewriters['js'],
                                      css_rewriter_class=rule.rewriters['css'],
                                      head_insert=head_insert_str,
                                      defmod=self.defmod,
                                      parse_comments=rule.parse_comments)

        else:
            if wb_url.is_banner_only:
                return (status_headers, self.stream_to_gen(stream), False)

            # apply one of (js, css, xml) rewriters
            rewriter = rewriter_class(urlrewriter)

        # Create rewriting generator
        gen = self.rewrite_text_stream_to_gen(stream,
                                              rewrite_func=rewriter.rewrite,
                                              final_read_func=rewriter.close,
                                              first_buff=first_buff)

        return (status_headers, gen, True)
Example #12
def test_brotli():
    with open(get_test_dir() + 'text_content/quickfox_repeated.compressed', 'rb') as fh:
        x = DecompressingBufferedReader(fh, decomp_type='br')
        assert x.read() == b'The quick brown fox jumps over the lazy dog' * 4096
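
A minor caveat: decomp_type='br' depends on an optional brotli package, so a test like this is usually guarded with pytest.importorskip('brotli'), in the same way Example #3 guards its boto dependency.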