def test_err_compress_mix():
    # error: compressed member, followed by not compressed -- considered invalid
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'),
                                    decomp_type='gzip')
    b = x.read()
    assert b == b'ABC'
    x.read_next_member()
    assert x.read() == b''

def test_err_compress_mix():
    # error: compressed member, followed by not compressed -- considered invalid
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'),
                                    decomp_type='gzip')
    b = x.read()
    b = x.read_next_member()
    with pytest.raises(zlib.error):
        x.read()

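# A minimal companion sketch, not from the original tests: the well-formed multi-member
# case that the error cases above contrast with. It assumes only what the tests above
# already use -- compress() gzip-compresses a text string to bytes, and the reader
# exposes read() and read_next_member().
def test_compress_two_members_sketch():
    x = DecompressingBufferedReader(BytesIO(compress('ABC') + compress('123')),
                                    decomp_type='gzip')
    assert x.read() == b'ABC'       # first gzip member
    assert x.read_next_member()     # reset decompressor for the next member
    assert x.read() == b'123'       # second member decompresses cleanly
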
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
    """

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
    """

    def __init__(self, fileobj, no_record_parse=False, verify_http=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http)
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.member_info = None
        self.no_record_parse = no_record_parse

    def iter_records(self, block_size=16384):
        """ iterate over each record
        """
        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                curr_offset = self.fh.tell()
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record
            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be subtracted from the record length
        for uncompressed

        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()
            if stripped == '' or first_line:
                empty_size += len(line)

                if stripped != '':
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it with the data read
        """
        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record

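# Digest sketch, not from the original source: this version of read_to_end() takes a
# payload_callback instead of computing a digest itself, so a caller can supply a
# hashlib updater. The function name is hypothetical; hashlib/base64 are assumed to be
# imported, mirroring the compute_digest logic of the other versions in this file.
def sha1_b32_digest_sketch(archive_iterator, record):
    digester = hashlib.sha1()
    archive_iterator.read_to_end(record, payload_callback=digester.update)
    return base64.b32encode(digester.digest())
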
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
    """

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
    """

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        self.member_info = None
        self.no_record_parse = no_record_parse

    def __call__(self, block_size=16384):
        """ iterate over each record
        """
        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                curr_offset = self.fh.tell()
                record = self._next_record(self.next_line)
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record
            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be subtracted from the record length
        for uncompressed

        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()
            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it with the data read
        """
        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record

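# Usage sketch, not from the original source: in this version the record generator is
# obtained by calling the instance (__call__) rather than iter_records(). The function
# name and file path are hypothetical; record.format is the attribute the class itself
# tracks for each parsed record.
def print_record_formats_sketch(path):
    with open(path, 'rb') as fh:
        for record in ArchiveIterator(fh, arc2warc=True)():
            print(record.format)
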
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
    """

    def __init__(self, fileobj, no_record_parse=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader()
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.member_info = None
        self.no_record_parse = no_record_parse

    def iter_records(self, block_size=16384):
        """ iterate over each record
        """
        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        is_valid = True

        while True:
            try:
                record = self._next_record(self.next_line)
                if not is_valid:
                    self._raise_err()

                yield record
            except EOFError:
                break

            self.read_to_end(record)

            if self.reader.decompressor:
                is_valid = self.reader.read_next_member()

    def _raise_err(self):
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be subtracted from the record length
        for uncompressed
        """
        empty_size = 0

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            if line.rstrip() == '':
                empty_size += len(line)
                continue

            return line, empty_size

    def read_to_end(self, record, compute_digest=False):
        """ Read remainder of the stream
        If a digester is included, update it with the data read
        """
        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        if compute_digest:
            digester = hashlib.sha1()
        else:
            digester = None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if digester:
                digester.update(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        if compute_digest:
            digest = base64.b32encode(digester.digest())
        else:
            digest = None

        self.member_info = (curr_offset, length, digest)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record

class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
    """

    def __init__(self, fileobj, no_record_parse=False, verify_http=False):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http)
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.member_info = None
        self.no_record_parse = no_record_parse

    def iter_records(self, block_size=16384):
        """ iterate over each record
        """
        decomp_type = 'gzip'

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        is_valid = True

        while True:
            try:
                record = self._next_record(self.next_line)
                if not is_valid:
                    self._raise_err()

                yield record
            except EOFError:
                break

            self.read_to_end(record)

            if self.reader.decompressor:
                is_valid = self.reader.read_next_member()

    def _raise_err(self):
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be subtracted from the record length
        for uncompressed
        """
        empty_size = 0

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            if line.rstrip() == '':
                empty_size += len(line)
                continue

            return line, empty_size

    def read_to_end(self, record, compute_digest=False):
        """ Read remainder of the stream
        If a digester is included, update it with the data read
        """
        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        if compute_digest:
            digester = hashlib.sha1()
        else:
            digester = None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if digester:
                digester.update(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        if compute_digest:
            digest = base64.b32encode(digester.digest())
        else:
            digest = None

        self.member_info = (curr_offset, length, digest)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record

class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    def __init__(self, fileobj):
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader()
        self.reader = None

        self.offset = 0
        self.known_format = None

        self.member_info = None

    def iter_records(self):
        """ iterate over each record
        """
        decomp_type = 'gzip'
        block_size = 16384

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()
        next_line = None

        while True:
            try:
                record = self._next_record(next_line)
                yield record
            except EOFError:
                break

            self.read_to_end(record)

            # for non-compressed, consume blank lines here
            if not self.reader.decompressor:
                next_line = self._consume_blanklines()
                if next_line is None:
                    # at end of file
                    break

            # reset reader for next member
            else:
                self.reader.read_next_member()

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length
        """
        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None

            if line.rstrip() == '':
                self.offset = self.fh.tell() - self.reader.rem_length()
                continue

            return line

    def read_to_end(self, record, compute_digest=False):
        """ Read remainder of the stream
        If a digester is included, update it with the data read
        """
        if self.member_info:
            return self.member_info

        if compute_digest:
            digester = hashlib.sha1()
        else:
            digester = None

        num = 0
        curr_offset = self.offset

        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if digester:
                digester.update(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        if self.reader.decompressor:
            self._consume_blanklines()

        self.offset = self.fh.tell() - self.reader.rem_length()
        length = self.offset - curr_offset

        if compute_digest:
            digest = base64.b32encode(digester.digest())
        else:
            digest = None

        self.member_info = (curr_offset, length, digest)
        return self.member_info

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record

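# Index-building sketch, not from the original source: this earliest version returns
# (offset, length, digest) from read_to_end(), so a caller can collect a simple offset
# index while iterating. The function name is hypothetical; it uses only methods defined
# in the class above, and the explicit read_to_end() call is safe because the generator's
# own call returns early once member_info is set.
def build_offset_index_sketch(fh):
    entries = []
    aiter = ArchiveIterator(fh)
    for record in aiter.iter_records():
        offset, length, digest = aiter.read_to_end(record, compute_digest=True)
        entries.append((offset, length, digest))
    return entries
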