Beispiel #1
0
def test_compress_mix():
    """Read a gzip member, then the uncompressed bytes that follow it."""
    # One compressed member ('ABC') immediately followed by raw bytes.
    payload = compress('ABC') + b'123'
    reader = DecompressingBufferedReader(BytesIO(payload), decomp_type='gzip')

    # The first read yields the decompressed member only.
    first = reader.read()
    assert first == b'ABC'

    # After advancing to the next member, the trailing bytes come back
    # unchanged.
    reader.read_next_member()
    trailing = reader.read()
    assert trailing == b'123'
Beispiel #2
0
def test_compress_invalid():
    """Check reads against a compressed block whose tail has been damaged."""
    compressed = compress('ABCDEFG')
    # Swap the last two bytes for junk so the block is cut off.
    damaged = compressed[:-2] + b'xyz'

    reader = DecompressingBufferedReader(BytesIO(damaged), block_size=16)

    # A bounded read returns the first three decompressed bytes.
    head = reader.read(3)
    assert head == b'ABC'

    # An unbounded read then returns only b'DE'.
    rest = reader.read()
    assert b'DE' == rest
def get_rendered_original_stream(warc_filename,
                                 warc_offset,
                                 compressedendoffset,
                                 payload_only=True):
    """
    Fetch one WARC record over WebHDFS and return its content.

    Args:
        warc_filename: Path of the WARC file, appended to WEBHDFS_PREFIX.
            When None, nothing is fetched.
        warc_offset: Value sent as the ``offset`` query parameter.
        compressedendoffset: Value sent as the ``length`` query parameter;
            only added when it is truthy and converts to a positive int.
        payload_only: When true, parse the record and return its content
            stream; otherwise return the bytes read from the decompressing
            reader.

    Returns:
        A two-element tuple:
        ``(None, None)`` when ``warc_filename`` is None;
        ``(record.content_stream(), record.content_type)`` when
        ``payload_only`` is true (the stream is still backed by the open
        HTTP response, so the caller is responsible for consuming it);
        ``(bytes, 'application/warc')`` otherwise.
    """
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Build the WebHDFS OPEN request for the requested byte range.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset and int(compressedendoffset) > 0:
        url = "%s&length=%s" % (url, compressedendoffset)
    r = requests.get(url, stream=True)
    # We handle decoding etc. ourselves, so keep the raw bytes untouched.
    r.raw.decode_content = False
    # NOTE(review): the status code is only logged, never checked - confirm
    # whether non-2xx responses should be rejected here.
    logger.debug("Loading from: %s", r.url)
    logger.debug("Got status code %s", r.status_code)

    if payload_only:
        # Parse the WARC and hand back the payload stream. The response must
        # stay open for the caller to read from, so it is only closed here
        # when parsing fails.
        try:
            record = ArcWarcRecordLoader().parse_record_stream(
                DecompressingBufferedReader(stream=r.raw))
            return record.content_stream(), record.content_type
        except Exception:
            r.close()
            raise

    # This makes sure we only get the first GZip chunk. Everything needed has
    # been read into memory once read() returns, so always release the
    # streamed connection afterwards.
    try:
        warc_record = DecompressingBufferedReader(stream=r.raw).read()
        return warc_record, 'application/warc'
    finally:
        r.close()
Beispiel #4
0
    def test_generate_record(self, record_sampler, is_gzip, builder_factory):
        """Write a sampled record and check it survives reading and parsing."""
        writer = FixedTestWARCWriter(gzip=is_gzip)

        # The sampler provides a record factory plus the expected serialized
        # text of the record it produces.
        builder = builder_factory(writer)
        make_record, expected_text = record_sampler
        record = make_record(builder)

        writer.write_record(record)

        written = writer.get_contents()
        self._validate_record_content_len(BytesIO(written))

        # Read everything back through the decompressing reader.
        decoded = DecompressingBufferedReader(writer.get_stream()).read()

        if is_gzip:
            # The decompressed output is expected to be longer than what
            # was written.
            assert len(decoded) > len(written)
        else:
            assert len(decoded) == len(written)

        assert decoded.decode('utf-8') == expected_text

        # Parsing the record and re-writing it without gzip must reproduce
        # the expected text as well.
        reparse_input = DecompressingBufferedReader(writer.get_stream())
        reparsed = ArcWarcRecordLoader().parse_record_stream(reparse_input)
        plain_writer = FixedTestWARCWriter(gzip=False)
        plain_writer.write_record(reparsed)
        assert plain_writer.get_contents().decode('utf-8') == expected_text

        # Parse once more to inspect the individual parts of the record.
        parts_input = DecompressingBufferedReader(writer.get_stream())
        parsed = ArcWarcRecordLoader().parse_record_stream(parts_input)

        body_text = parsed.content_stream().read().decode('utf-8')
        assert body_text in expected_text

        rec_type = parsed.rec_type

        # The parsed http headers must match those of the original record.
        assert record.http_headers == parsed.http_headers

        if parsed.http_headers:
            assert rec_type in ('response', 'request', 'revisit')
        elif rec_type == 'revisit':
            # A revisit without http headers carries no content.
            assert len(body_text) == 0
        else:
            assert len(body_text) == parsed.length
Beispiel #5
0
    def test_generate_response_gzip(self):
        """Write a sample response with gzip on and verify the decoded text."""
        warc_writer = FixedTestWARCWriter(gzip=True)

        response = self._sample_response(warc_writer)
        warc_writer.write_record(response)

        compressed = warc_writer.get_contents()
        self._validate_record_content_len(BytesIO(compressed))

        # Read the written record back through the decompressing reader.
        reader = DecompressingBufferedReader(warc_writer.get_stream())
        decoded = reader.read()

        # The decompressed record is expected to be longer than the gzip bytes.
        assert len(decoded) > len(compressed)

        assert decoded.decode('utf-8') == RESPONSE_RECORD