Example #1
0
File: task_io.py Project: saa/disco
def disco_input_stream(stream, size, url, ignore_corrupt=False):
    """Input stream for Disco's internal compression format.

    Generator that reads *stream* hunk by hunk and yields every object
    unpickled from each hunk's payload.

    Every hunk starts with a one-byte marker.  A marker value below 128
    means the data is not in the hunk format; the remainder of the stream is
    then handed to ``old_netstr_reader`` and its items are yielded instead.
    Otherwise the marker is followed by a 13-byte little-endian header
    (``'<BIQ'``): a compression flag, the CRC32 of the uncompressed payload
    and the size in bytes of the payload as stored in the stream.  An empty
    read where a marker is expected, or a payload size of zero, ends the
    stream.

    :param stream: object whose ``read(n)`` returns bytes.
    :param size: forwarded unchanged to ``old_netstr_reader``.
    :param url: passed to ``DataError`` and to ``old_netstr_reader``.
    :param ignore_corrupt: when true, decompression failures, checksum
        mismatches and unpickling errors do not raise ``DataError``.
    :raises DataError: if the 13-byte header cannot be read and unpacked,
        or (unless *ignore_corrupt* is true) if a hunk fails to decompress,
        fails its checksum, or contains data that cannot be unpickled.
    """
    from disco.compat import BytesIO, int_of_byte
    from disco.compat import pickle_load
    import struct
    import zlib

    # NOTE(review): ``offset`` only accumulates payload sizes; the 14 marker
    # and header bytes of each hunk are not counted in reported positions.
    offset = 0
    while True:
        header = stream.read(1)
        if not header:
            return
        if int_of_byte(header[0]) < 128:
            for record in old_netstr_reader(stream, size, url, header):
                yield record
            return
        try:
            is_compressed, checksum, hunk_size = struct.unpack(
                '<BIQ', stream.read(13))
        except Exception:
            # A short read makes struct.unpack fail.  Catch Exception rather
            # than using a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are not reported as truncated data.
            raise DataError("Truncated data at {0} bytes".format(offset), url)
        if not hunk_size:
            return
        hunk = stream.read(hunk_size)
        # Stays empty when decompression fails and the error is ignored, so
        # such a hunk yields nothing.
        data = b''
        try:
            data = zlib.decompress(hunk) if is_compressed else hunk
            if checksum != (zlib.crc32(data) & 0xFFFFFFFF):
                raise ValueError("Checksum does not match")
        except (ValueError, zlib.error) as e:
            # NOTE(review): on a checksum mismatch ``data`` has already been
            # assigned, so with ignore_corrupt the hunk is still unpickled
            # below -- confirm this best-effort behaviour is intended.
            if not ignore_corrupt:
                raise DataError(
                    "Corrupted data between bytes {0}-{1}: {2}".format(
                        offset, offset + hunk_size, e), url)
        offset += hunk_size
        hunk = BytesIO(data)
        while True:
            try:
                yield pickle_load(hunk)
            except EOFError:
                # Payload exhausted; move on to the next hunk.
                break
            # NOTE(review): UnpicklingError is not imported in this function;
            # it has to be available at module level.
            except UnpicklingError as e:
                if not ignore_corrupt:
                    raise DataError(
                        "Corrupted data between bytes {0}-{1}: {2}".format(
                            offset - hunk_size, offset, e), url)
Example #2
0
def disco_input_stream(stream, size, url, ignore_corrupt = False):
    """Input stream for Disco's internal compression format.

    Generator yielding the objects unpickled from each hunk read from
    *stream*.

    A hunk begins with a single marker byte.  If its value is below 128 the
    input is not hunk-formatted, and the rest of the stream is read through
    ``old_netstr_reader`` instead.  Otherwise 13 more bytes are unpacked as
    little-endian ``'<BIQ'``: compression flag, CRC32 of the uncompressed
    payload, and the stored payload size in bytes.  Reading stops when no
    marker byte is available or when a hunk declares a size of zero.

    :param stream: object whose ``read(n)`` returns bytes.
    :param size: forwarded unchanged to ``old_netstr_reader``.
    :param url: passed to ``DataError`` and to ``old_netstr_reader``.
    :param ignore_corrupt: when true, decompression failures, checksum
        mismatches and unpickling errors do not raise ``DataError``.
    :raises DataError: if the 13-byte header cannot be read and unpacked,
        or (unless *ignore_corrupt* is true) if a hunk fails to decompress,
        fails its checksum, or contains data that cannot be unpickled.
    """
    from disco.compat import BytesIO, int_of_byte
    from disco.compat import pickle_load
    import struct
    import zlib

    # NOTE(review): only payload sizes are added to ``offset``; the marker
    # and header bytes of each hunk are not part of reported positions.
    offset = 0
    while True:
        header = stream.read(1)
        if not header:
            return
        if int_of_byte(header[0]) < 128:
            for record in old_netstr_reader(stream, size, url, header):
                yield record
            return
        try:
            is_compressed, checksum, hunk_size =\
                struct.unpack('<BIQ', stream.read(13))
        except Exception:
            # struct.unpack fails on a short read.  ``except Exception``
            # (not a bare ``except``) keeps KeyboardInterrupt and SystemExit
            # from being turned into a "Truncated data" error.
            raise DataError("Truncated data at {0} bytes".format(offset), url)
        if not hunk_size:
            return
        hunk = stream.read(hunk_size)
        # Remains empty if decompression fails and the failure is ignored.
        data = b''
        try:
            data = zlib.decompress(hunk) if is_compressed else hunk
            if checksum != (zlib.crc32(data) & 0xFFFFFFFF):
                raise ValueError("Checksum does not match")
        except (ValueError, zlib.error) as e:
            # NOTE(review): when only the checksum fails, ``data`` is already
            # set and is still unpickled below if ignore_corrupt is true --
            # confirm that is the intended behaviour.
            if not ignore_corrupt:
                raise DataError("Corrupted data between bytes {0}-{1}: {2}"
                                .format(offset, offset + hunk_size, e), url)
        offset += hunk_size
        hunk = BytesIO(data)
        while True:
            try:
                yield pickle_load(hunk)
            except EOFError:
                # End of this hunk's payload; continue with the next hunk.
                break
            # NOTE(review): UnpicklingError is not imported here and must be
            # provided by the enclosing module.
            except UnpicklingError as e:
                if not ignore_corrupt:
                    raise DataError("Corrupted data between bytes {0}-{1}: {2}"
                                    .format(offset - hunk_size, offset, e), url)
Example #3
0
 def read(handle):
     """Yield each object that ``pickle_load`` reads from *handle*.

     Iteration ends quietly once ``EOFError`` is raised.
     """
     try:
         while True:
             yield pickle_load(handle)
     except EOFError:
         # EOFError marks the end of the input, not a failure.
         pass
Example #4
0
File: util.py Project: yuj/disco
 def read(handle):
     """Generate the objects loaded from *handle* by ``pickle_load``.

     The generator finishes as soon as ``EOFError`` is raised.
     """
     try:
         while True:
             yield pickle_load(handle)
     except EOFError:
         # Treated as the normal end of the sequence.
         return