def disco_input_stream(stream, size, url, ignore_corrupt=False):
    """Input stream for Disco's internal compression format.

    Generator that reads a sequence of hunks from *stream* and yields the
    objects pickled inside each hunk.

    Each hunk starts with a one-byte marker followed by a 13-byte
    little-endian header (``'<BIQ'``: compression flag, CRC32 of the
    uncompressed payload, payload size in bytes) and then the payload.
    If the marker byte is below 128, the rest of the stream is delegated
    to ``old_netstr_reader`` instead.  Iteration ends at end of stream
    (no marker byte available) or at a hunk whose size is zero.

    :param stream: binary file-like object providing ``read(n)``.
    :param size: passed through unchanged to ``old_netstr_reader``.
    :param url: passed to ``old_netstr_reader`` and attached to every
        ``DataError`` raised here.
    :param ignore_corrupt: if true, a hunk that fails decompression or the
        checksum test is skipped, and unpickling errors are swallowed,
        instead of raising ``DataError``.
    :raises DataError: if a hunk header cannot be read or parsed, or if
        corruption is detected and *ignore_corrupt* is false.
    """
    # Imports are function-local in the original; keep them that way.
    # NOTE(review): presumably so the function stays self-contained when
    # it is shipped to workers -- confirm before hoisting.
    from disco.compat import BytesIO, int_of_byte
    from disco.compat import pickle_load
    import struct
    import zlib

    # NOTE: ``offset`` accumulates payload sizes only; the 14 marker/header
    # bytes of each hunk are not counted in the positions reported below.
    offset = 0
    while True:
        header = stream.read(1)
        if not header:
            return
        if int_of_byte(header[0]) < 128:
            # Not a hunk marker: hand the stream, including the byte that
            # was already consumed, to the old reader.
            for e in old_netstr_reader(stream, size, url, header):
                yield e
            return
        try:
            is_compressed, checksum, hunk_size = struct.unpack(
                '<BIQ', stream.read(13))
        except Exception:
            # A short read makes struct.unpack fail.  ``Exception`` (rather
            # than a bare ``except``) keeps reporting read/parse failures
            # as DataError without trapping KeyboardInterrupt/SystemExit.
            raise DataError("Truncated data at {0} bytes".format(offset), url)
        if not hunk_size:
            return
        hunk = stream.read(hunk_size)
        try:
            data = zlib.decompress(hunk) if is_compressed else hunk
            if checksum != (zlib.crc32(data) & 0xFFFFFFFF):
                raise ValueError("Checksum does not match")
        except (ValueError, zlib.error) as e:
            if not ignore_corrupt:
                raise DataError(
                    "Corrupted data between bytes {0}-{1}: {2}".format(
                        offset, offset + hunk_size, e), url)
            # Skip the corrupt hunk entirely.  Without this reset a hunk
            # that decompressed fine but failed the checksum would still
            # be unpickled below.
            data = b''
        offset += hunk_size
        hunk = BytesIO(data)
        # NOTE(security): the payload is unpickled; presumably pickle_load
        # wraps pickle.load, which can execute arbitrary code -- only feed
        # this function data from a trusted source.
        while True:
            try:
                yield pickle_load(hunk)
            except EOFError:
                # End of this hunk's payload; move on to the next hunk.
                break
            except UnpicklingError as e:
                if not ignore_corrupt:
                    raise DataError(
                        "Corrupted data between bytes {0}-{1}: {2}".format(
                            offset - hunk_size, offset, e), url)
def disco_input_stream(stream, size, url, ignore_corrupt=False):
    """Input stream for Disco's internal compression format.

    Generator that reads a sequence of hunks from *stream* and yields the
    objects pickled inside each hunk.

    Each hunk starts with a one-byte marker followed by a 13-byte
    little-endian header (``'<BIQ'``: compression flag, CRC32 of the
    uncompressed payload, payload size in bytes) and then the payload.
    If the marker byte is below 128, the rest of the stream is delegated
    to ``old_netstr_reader`` instead.  Iteration ends at end of stream
    (no marker byte available) or at a hunk whose size is zero.

    :param stream: binary file-like object providing ``read(n)``.
    :param size: passed through unchanged to ``old_netstr_reader``.
    :param url: passed to ``old_netstr_reader`` and attached to every
        ``DataError`` raised here.
    :param ignore_corrupt: if true, a hunk that fails decompression or the
        checksum test is skipped, and unpickling errors are swallowed,
        instead of raising ``DataError``.
    :raises DataError: if a hunk header cannot be read or parsed, or if
        corruption is detected and *ignore_corrupt* is false.
    """
    # Imports are function-local in the original; keep them that way.
    # NOTE(review): presumably so the function stays self-contained when
    # it is shipped to workers -- confirm before hoisting.
    from disco.compat import BytesIO, int_of_byte
    from disco.compat import pickle_load
    import struct
    import zlib

    # NOTE: ``offset`` accumulates payload sizes only; the 14 marker/header
    # bytes of each hunk are not counted in the positions reported below.
    offset = 0
    while True:
        header = stream.read(1)
        if not header:
            return
        if int_of_byte(header[0]) < 128:
            # Not a hunk marker: hand the stream, including the byte that
            # was already consumed, to the old reader.
            for e in old_netstr_reader(stream, size, url, header):
                yield e
            return
        try:
            is_compressed, checksum, hunk_size = struct.unpack(
                '<BIQ', stream.read(13))
        except Exception:
            # A short read makes struct.unpack fail.  ``Exception`` (rather
            # than a bare ``except``) keeps reporting read/parse failures
            # as DataError without trapping KeyboardInterrupt/SystemExit.
            raise DataError("Truncated data at {0} bytes".format(offset), url)
        if not hunk_size:
            return
        hunk = stream.read(hunk_size)
        try:
            data = zlib.decompress(hunk) if is_compressed else hunk
            if checksum != (zlib.crc32(data) & 0xFFFFFFFF):
                raise ValueError("Checksum does not match")
        except (ValueError, zlib.error) as e:
            if not ignore_corrupt:
                raise DataError(
                    "Corrupted data between bytes {0}-{1}: {2}".format(
                        offset, offset + hunk_size, e), url)
            # Skip the corrupt hunk entirely.  Without this reset a hunk
            # that decompressed fine but failed the checksum would still
            # be unpickled below.
            data = b''
        offset += hunk_size
        hunk = BytesIO(data)
        # NOTE(security): the payload is unpickled; presumably pickle_load
        # wraps pickle.load, which can execute arbitrary code -- only feed
        # this function data from a trusted source.
        while True:
            try:
                yield pickle_load(hunk)
            except EOFError:
                # End of this hunk's payload; move on to the next hunk.
                break
            except UnpicklingError as e:
                if not ignore_corrupt:
                    raise DataError(
                        "Corrupted data between bytes {0}-{1}: {2}".format(
                            offset - hunk_size, offset, e), url)
def read(handle):
    """Yield successive unpickled objects from *handle*.

    Objects are loaded one at a time with ``pickle_load``; the ``EOFError``
    raised once the data runs out ends the iteration.
    """
    try:
        while True:
            yield pickle_load(handle)
    except EOFError:
        return