Example #1
0
    def test_generate_record(self, record_sampler, is_gzip, builder_factory):
        """Write one sampled record and check that it round-trips intact.

        ``record_sampler`` unpacks to a ``(record_maker, record_string)``
        pair: the maker is called with the builder to produce the record,
        and the string is what the serialized record is compared against.
        ``builder_factory`` is called with the writer to obtain that builder,
        and ``is_gzip`` is forwarded as the writer's ``gzip`` option.

        NOTE(review): the three arguments are presumably pytest
        fixtures/params -- confirm against the enclosing test class.
        """
        writer = FixedTestWARCWriter(gzip=is_gzip)
        make_record, expected_text = record_sampler
        original = make_record(builder_factory(writer))

        writer.write_record(original)

        raw_bytes = writer.get_contents()
        self._validate_record_content_len(BytesIO(raw_bytes))

        # Read the whole output back through the decompressing reader.
        decoded_bytes = DecompressingBufferedReader(writer.get_stream()).read()

        # With gzip enabled the bytes read back must outnumber the raw
        # contents; without it the two lengths must agree exactly.
        if not is_gzip:
            assert len(decoded_bytes) == len(raw_bytes)
        else:
            assert len(raw_bytes) < len(decoded_bytes)

        assert decoded_bytes.decode('utf-8') == expected_text

        # Parsing the output and re-writing it uncompressed must reproduce
        # the expected serialization as well.
        reparsed = ArcWarcRecordLoader().parse_record_stream(
            DecompressingBufferedReader(writer.get_stream()))
        plain_writer = FixedTestWARCWriter(gzip=False)
        plain_writer.write_record(reparsed)
        assert plain_writer.get_contents().decode('utf-8') == expected_text

        # Parse once more, this time to inspect individual parts of the record.
        parsed = ArcWarcRecordLoader().parse_record_stream(
            DecompressingBufferedReader(writer.get_stream()))

        payload_text = parsed.content_stream().read().decode('utf-8')
        assert payload_text in expected_text

        rec_type = parsed.rec_type

        # The parsed HTTP headers must equal those of the record as built.
        assert original.http_headers == parsed.http_headers

        if parsed.http_headers:
            assert rec_type in ('response', 'request', 'revisit')
        elif rec_type == 'revisit':
            # empty revisit
            assert not payload_text
        else:
            assert len(payload_text) == parsed.length
Example #2
0
    def test_warcinfo_record(self, is_gzip):
        """Write the sample warcinfo record and verify what is parsed back.

        The record comes from ``sample_warcinfo(writer)``; ``is_gzip`` is
        forwarded as the writer's ``gzip`` option. The parsed record must
        carry the expected WARC headers, and its decoded content must contain
        the expected field lines.
        """
        writer = FixedTestWARCWriter(gzip=is_gzip)
        writer.write_record(sample_warcinfo(writer))

        stream = DecompressingBufferedReader(writer.get_stream())
        parsed = ArcWarcRecordLoader().parse_record_stream(stream)

        # Header name -> value the parsed record is required to report.
        expected_headers = (
            ('WARC-Type', 'warcinfo'),
            ('Content-Type', 'application/warc-fields'),
            ('WARC-Filename', 'testfile.warc.gz'),
        )
        for header_name, header_value in expected_headers:
            assert parsed.rec_headers.get_header(header_name) == header_value

        body_text = parsed.content_stream().read().decode('utf-8')

        # Each of these lines must appear verbatim in the record content.
        expected_lines = (
            'json-metadata: {"foo": "bar"}\r\n',
            'format: WARC File Format 1.0\r\n',
        )
        for field_line in expected_lines:
            assert field_line in body_text
Example #3
0
    def test_warcinfo_record(self, is_gzip, builder_factory):
        """Build, write and re-parse the sample warcinfo record.

        ``builder_factory`` is called with the writer to obtain the builder
        handed to ``sample_warcinfo``; ``is_gzip`` is forwarded as the
        writer's ``gzip`` option. The parsed record must carry the expected
        WARC headers (including the block digest), and its decoded content
        must contain the expected field lines.
        """
        writer = FixedTestWARCWriter(gzip=is_gzip)
        warcinfo = sample_warcinfo(builder_factory(writer))
        writer.write_record(warcinfo)

        stream = DecompressingBufferedReader(writer.get_stream())
        parsed = ArcWarcRecordLoader().parse_record_stream(stream)

        # Header name -> value the parsed record is required to report.
        expected_headers = (
            ('WARC-Type', 'warcinfo'),
            ('Content-Type', 'application/warc-fields'),
            ('WARC-Filename', 'testfile.warc.gz'),
            ('WARC-Block-Digest', 'sha1:GAD6P5BTZPRU57ICXEYUJZGCURZYABID'),
        )
        for header_name, header_value in expected_headers:
            assert parsed.rec_headers.get_header(header_name) == header_value

        body_text = parsed.content_stream().read().decode('utf-8')

        # Each of these lines must appear verbatim in the record content.
        expected_lines = (
            'json-metadata: {"foo": "bar"}\r\n',
            'format: WARC File Format 1.0\r\n',
        )
        for field_line in expected_lines:
            assert field_line in body_text