def test_cdxj_empty(self):
    """Check that indexing a zero-byte input stream writes nothing."""
    index_buffer = StringIO()
    write_cdx_index(index_buffer, BytesIO(), {"filename": "empty.warc.gz"})
    assert index_buffer.getvalue() == ""
def test_cdxj_middle_empty_records(self):
    """Check indexing of a stream with empty gzip members between WARC data.

    The stream is laid out as: empty member, ``example.warc.gz``, two
    empty members, ``example.warc.gz`` again. The index produced for it
    must contain exactly four lines.
    """
    # A 20-byte gzip member (10-byte header, empty deflate block, zero
    # CRC32 and zero ISIZE), i.e. one that decompresses to nothing.
    blank_member = b"\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"

    with open(os.path.join(TEST_DIR, "example.warc.gz"), "rb") as source:
        warc_bytes = source.read()

    # BytesIO created from initial bytes is already positioned at offset 0.
    stream = BytesIO(
        b"".join(
            (blank_member, warc_bytes, blank_member, blank_member, warc_bytes)
        )
    )

    index_buffer = StringIO()
    write_cdx_index(index_buffer, stream, {"filename": "empty.warc.gz"})

    lines = index_buffer.getvalue().rstrip().split("\n")
    assert len(lines) == 4, lines
def index_all(self, **opts):
    """Run ``write_cdx_index`` over ``TEST_DIR`` and return the text written.

    Any keyword arguments are gathered into a dict and passed through as
    the options argument of ``write_cdx_index``.
    """
    sink = StringIO()
    # The directory itself is passed as the only input path, rather than
    # listing each file inside it individually.
    write_cdx_index(sink, [TEST_DIR], opts)
    return sink.getvalue()
def index_file(self, filename, **opts):
    """Run ``write_cdx_index`` on one file in ``TEST_DIR`` and return the text.

    ``filename`` is joined onto ``TEST_DIR``; any keyword arguments are
    gathered into a dict and passed through as the options argument.
    """
    target_path = os.path.join(TEST_DIR, filename)
    buffer = StringIO()
    write_cdx_index(buffer, target_path, opts)
    return buffer.getvalue()