def _unpack(self, execute=EX_CONSTRUCT):
    typ, n, obj = self._read_header(execute)

    if execute == EX_READ_ARRAY_HEADER:
        if typ != TYPE_ARRAY:
            raise UnpackValueError("Expected array")
        return n
    if execute == EX_READ_MAP_HEADER:
        if typ != TYPE_MAP:
            raise UnpackValueError("Expected map")
        return n
    # TODO should we eliminate the recursion?
    if typ == TYPE_ARRAY:
        if execute == EX_SKIP:
            for i in range(n):
                # TODO check whether we need to call `list_hook`
                self._unpack(EX_SKIP)
            return
        ret = newlist_hint(n)
        for i in range(n):
            ret.append(self._unpack(EX_CONSTRUCT))
        if self._list_hook is not None:
            ret = self._list_hook(ret)
        # TODO is the interaction between `list_hook` and `use_list` ok?
        return ret if self._use_list else tuple(ret)
    if typ == TYPE_MAP:
        if execute == EX_SKIP:
            for i in range(n):
                # TODO check whether we need to call hooks
                self._unpack(EX_SKIP)
                self._unpack(EX_SKIP)
            return
        if self._object_pairs_hook is not None:
            ret = self._object_pairs_hook(
                (self._unpack(EX_CONSTRUCT), self._unpack(EX_CONSTRUCT))
                for _ in range(n))
        else:
            ret = {}
            for _ in range(n):
                key = self._unpack(EX_CONSTRUCT)
                ret[key] = self._unpack(EX_CONSTRUCT)
            if self._object_hook is not None:
                ret = self._object_hook(ret)
        return ret
    if execute == EX_SKIP:
        return
    if typ == TYPE_RAW:
        if self._encoding is not None:
            obj = obj.decode(self._encoding, self._unicode_errors)
        elif self._raw:
            obj = bytes(obj)
        else:
            obj = obj.decode('utf_8')
        return obj
    if typ == TYPE_EXT:
        return self._ext_hook(n, bytes(obj))
    if typ == TYPE_BIN:
        return bytes(obj)
    assert typ == TYPE_IMMEDIATE
    return obj

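# The hooks consulted above (list_hook, object_pairs_hook, object_hook,
# ext_hook) are all reachable through msgpack's public API; a small
# round-trip sketch, assuming the msgpack package is installed:
from collections import OrderedDict
import msgpack

packed = msgpack.packb({"a": [1, 2, 3]})

# object_pairs_hook sees the (key, value) stream before dict construction;
# use_list=False returns arrays as tuples, matching the use_list branch above.
obj = msgpack.unpackb(packed, object_pairs_hook=OrderedDict,
                      use_list=False, raw=False)
print(obj)  # OrderedDict([('a', (1, 2, 3))])
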
def row(fileIter, lencols, compression, level):
    if hasattr(sys, 'pypy_version_info'):
        from __pypy__ import newlist_hint
    else:
        newlist_hint = lambda size: []
    try:
        import msgpack
    except ImportError:
        import marshal as msgpack
    serializer = msgpack
    exitGen = False
    while not exitGen:
        mrows = newlist_hint(lencols)
        if lencols == 0:
            (yield)
        try:
            for i in xrange(lencols):
                mrows.append((yield))
        except GeneratorExit:
            exitGen = True
        output = StringIO.StringIO()
        colnum = len(schema)
        output.truncate(0)
        output.write(struct.pack('!B', 1))
        output.write(struct.pack('!B', 0))
        headindex = [0 for _ in xrange((colnum * 2) + 1)]
        type = '!' + 'i' * len(headindex)
        output.write(struct.pack(type, *headindex))
        output.write(serializer.dumps(mrows))
        fileIter.write(output.getvalue())

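# row (like the other writers here) is a push-style coroutine: prime it with
# next(), send() one row at a time, and close() to flush the final partial
# block. A self-contained sketch of that protocol, with a hypothetical
# block_writer and a plain list standing in for fileIter:
def block_writer(sink, rows_per_block):
    done = False
    while not done:
        block = []
        try:
            for _ in range(rows_per_block):
                block.append((yield))  # caller pushes one row per send()
        except GeneratorExit:
            done = True
        if block:
            sink.append(block)  # flush whatever accumulated

out = []
w = block_writer(out, 2)
next(w)  # prime the generator up to its first yield
for r in [(1, 'a'), (2, 'b'), (3, 'c')]:
    w.send(r)
w.close()  # flushes the trailing partial block
print(out)  # [[(1, 'a'), (2, 'b')], [(3, 'c')]]
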
def _unpack(self, execute=EX_CONSTRUCT):
    typ, n, obj = self._read_header(execute)

    if execute == EX_READ_ARRAY_HEADER:
        if typ != TYPE_ARRAY:
            raise UnpackValueError("Expected array")
        return n
    if execute == EX_READ_MAP_HEADER:
        if typ != TYPE_MAP:
            raise UnpackValueError("Expected map")
        return n
    # TODO should we eliminate the recursion?
    if typ == TYPE_ARRAY:
        if execute == EX_SKIP:
            for i in xrange(n):
                # TODO check whether we need to call `list_hook`
                self._unpack(EX_SKIP)
            return
        ret = newlist_hint(n)
        for i in xrange(n):
            ret.append(self._unpack(EX_CONSTRUCT))
        if self._list_hook is not None:
            ret = self._list_hook(ret)
        # TODO is the interaction between `list_hook` and `use_list` ok?
        return ret if self._use_list else tuple(ret)
    if typ == TYPE_MAP:
        if execute == EX_SKIP:
            for i in xrange(n):
                # TODO check whether we need to call hooks
                self._unpack(EX_SKIP)
                self._unpack(EX_SKIP)
            return
        if self._object_pairs_hook is not None:
            ret = self._object_pairs_hook(
                (self._unpack(EX_CONSTRUCT), self._unpack(EX_CONSTRUCT))
                for _ in xrange(n))
        else:
            ret = {}
            for _ in xrange(n):
                key = self._unpack(EX_CONSTRUCT)
                ret[key] = self._unpack(EX_CONSTRUCT)
            if self._object_hook is not None:
                ret = self._object_hook(ret)
        return ret
    if execute == EX_SKIP:
        return
    if typ == TYPE_RAW:
        if self._encoding is not None:
            obj = obj.decode(self._encoding, self._unicode_errors)
        else:
            obj = bytes(obj)
        return obj
    if typ == TYPE_EXT:
        return self._ext_hook(n, bytes(obj))
    if typ == TYPE_BIN:
        return bytes(obj)
    assert typ == TYPE_IMMEDIATE
    return obj

def _filter_tuple(func, seq):
    length = len(seq)
    result = newlist_hint(length)
    for i in range(length):
        # Again, must call __getitem__, at least there are tests.
        item = seq[i]
        if func(item):
            result.append(item)
    return tuple(result)

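# Outside PyPy, newlist_hint is only a pre-sizing hint, so _filter_tuple is
# equivalent to tuple(filter(func, seq)); the indexed loop exists to force
# __getitem__ calls. A quick check, assuming the plain-list fallback:
newlist_hint = lambda sizehint: []

assert _filter_tuple(lambda x: x % 2, (1, 2, 3, 4, 5)) == (1, 3, 5)
assert _filter_tuple(bool, ()) == ()
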
def __fetch_one_row(self):
    num_cols = _lib.sqlite3_data_count(self.__statement._statement)
    row = newlist_hint(num_cols)
    for i in xrange(num_cols):
        if self.__connection._detect_types:
            converter = self.__row_cast_map[i]
        else:
            converter = None

        if converter is not None:
            blob = _lib.sqlite3_column_blob(self.__statement._statement, i)
            if not blob:
                val = None
            else:
                blob_len = _lib.sqlite3_column_bytes(
                    self.__statement._statement, i)
                val = _ffi.buffer(blob, blob_len)[:]
                val = converter(val)
        else:
            typ = _lib.sqlite3_column_type(self.__statement._statement, i)
            if typ == _lib.SQLITE_NULL:
                val = None
            elif typ == _lib.SQLITE_INTEGER:
                val = _lib.sqlite3_column_int64(
                    self.__statement._statement, i)
                val = int(val)
            elif typ == _lib.SQLITE_FLOAT:
                val = _lib.sqlite3_column_double(
                    self.__statement._statement, i)
            elif typ == _lib.SQLITE_TEXT:
                text = _lib.sqlite3_column_text(
                    self.__statement._statement, i)
                text_len = _lib.sqlite3_column_bytes(
                    self.__statement._statement, i)
                val = _ffi.buffer(text, text_len)[:]
                try:
                    val = self.__connection.text_factory(val)
                except Exception:
                    column_name = _lib.sqlite3_column_name(
                        self.__statement._statement, i)
                    if column_name:
                        column_name = _ffi.string(column_name).decode('utf-8')
                    else:
                        column_name = "<unknown column name>"
                    val = val.decode('ascii', 'replace')
                    raise OperationalError(
                        "Could not decode to UTF-8 column '%s' with text '%s'" % (
                            column_name, val))
            elif typ == _lib.SQLITE_BLOB:
                blob = _lib.sqlite3_column_blob(
                    self.__statement._statement, i)
                blob_len = _lib.sqlite3_column_bytes(
                    self.__statement._statement, i)
                val = _BLOB_TYPE(_ffi.buffer(blob, blob_len)[:])
        row.append(val)
    return tuple(row)

def _filter_string(func, string, str_type):
    if func is bool and type(string) is str_type:
        return string
    length = len(string)
    result = newlist_hint(length)
    for i in range(length):
        # You must call __getitem__ on the strings, simply iterating doesn't
        # work :/
        item = string[i]
        if func(item):
            if not isinstance(item, str_type):
                raise TypeError("__getitem__ returned a non-string type")
            result.append(item)
    return str_type().join(result)

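# The func-is-bool shortcut is sound because every item of a built-in string
# is a one-character string, which is always truthy, so filter(bool, s) can
# never drop anything. Usage, assuming the plain-list newlist_hint fallback:
newlist_hint = lambda sizehint: []

assert _filter_string(str.isalpha, "a1b2c3", str) == "abc"
assert _filter_string(bool, "abc", str) == "abc"  # fast path: returned as-is
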
def __fetch_one_row(self):
    num_cols = _lib.sqlite3_data_count(self.__statement._statement)
    row = newlist_hint(num_cols)
    for i in xrange(num_cols):
        if self.__connection._detect_types:
            converter = self.__row_cast_map[i]
        else:
            converter = None

        if converter is not None:
            blob = _lib.sqlite3_column_blob(self.__statement._statement, i)
            if not blob:
                val = None
            else:
                blob_len = _lib.sqlite3_column_bytes(
                    self.__statement._statement, i)
                val = _ffi.buffer(blob, blob_len)[:]
                val = converter(val)
        else:
            typ = _lib.sqlite3_column_type(self.__statement._statement, i)
            if typ == _lib.SQLITE_NULL:
                val = None
            elif typ == _lib.SQLITE_INTEGER:
                val = _lib.sqlite3_column_int64(
                    self.__statement._statement, i)
                val = int(val)
            elif typ == _lib.SQLITE_FLOAT:
                val = _lib.sqlite3_column_double(
                    self.__statement._statement, i)
            elif typ == _lib.SQLITE_TEXT:
                text = _lib.sqlite3_column_text(
                    self.__statement._statement, i)
                text_len = _lib.sqlite3_column_bytes(
                    self.__statement._statement, i)
                val = _ffi.buffer(text, text_len)[:]
                val = self.__connection.text_factory(val)
            elif typ == _lib.SQLITE_BLOB:
                blob = _lib.sqlite3_column_blob(
                    self.__statement._statement, i)
                blob_len = _lib.sqlite3_column_bytes(
                    self.__statement._statement, i)
                val = _BLOB_TYPE(_ffi.buffer(blob, blob_len)[:])
        row.append(val)
    return tuple(row)

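# Both __fetch_one_row variants above implement the machinery behind the
# public detect_types switch: __row_cast_map supplies a converter per column,
# NULL columns bypass it, and the first variant additionally reports UTF-8
# decode failures from text_factory. Observable from user code through the
# standard sqlite3 API (the "POINT" converter here is a hypothetical example):
import sqlite3

sqlite3.register_converter("POINT", lambda b: tuple(map(int, b.split(b","))))

con = sqlite3.connect(":memory:", detect_types=sqlite3.PARSE_DECLTYPES)
con.execute("CREATE TABLE t (p POINT)")
con.execute("INSERT INTO t VALUES (?)", ("1,2",))
print(con.execute("SELECT p FROM t").fetchone())  # ((1, 2),)
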
def sorteddictpercol(fileIter, lencols, compression, level):
    output = StringIO.StringIO()
    if split:
        output.write(struct.pack('!B', 0))
        cPickle.dump(schema[1:], output, 1)
        colnum = len(schema) - 1
        cz = output.getvalue()
        fileIter.write(struct.pack('!i', len(cz)))
        fileIter.write(cz)
    else:
        colnum = len(schema)
        fileIter.write(struct.pack('!B', 0))
        cPickle.dump(schema, fileIter, 1)
    if hasattr(sys, 'pypy_version_info'):
        from __pypy__ import newlist_hint
    else:
        newlist_hint = lambda size: []
    paxcols = []
    blocknum = 0
    # tempio = cStringIO.StringIO()
    # fastPickler = cPickle.Pickler(tempio, 2)
    # fastPickler.fast = 1
    exitGen = False
    compress = zlib.compress
    if compression == BZ2:
        compress = bz2.compress
    if lencols == 0:
        (yield)
    while not exitGen:
        output.truncate(0)
        mrows = newlist_hint(lencols)
        try:
            for i in xrange(lencols):
                mrows.append((yield))
        except GeneratorExit:
            exitGen = True
        count = len(mrows)
        output.write(struct.pack('!B', 1))
        if compression == BZ2:
            output.write(struct.pack('!B', 0))
        else:
            output.write(struct.pack('!B', 1))
        headindex = [0 for _ in xrange((colnum * 2) + 1)]
        type = '!' + 'i' * len(headindex)
        output.write(struct.pack(type, *headindex))
        if mrows != []:
            for i, col in enumerate(
                    ([x[c] for x in mrows] for c in xrange(colnum))):
                if blocknum == 0:
                    s = sorted(set(col))
                    lens = len(s)
                    if lens > 50 * 1.0 * count / 100:
                        paxcols.append(i)
                        l = output.tell()
                        # tempio.truncate(0)
                        # fastPickler.dump(col)
                        output.write(compress(serializer.dumps(col), level))
                        headindex[i * 2] = output.tell() - l
                    else:
                        coldict = dict(((x, y) for y, x in enumerate(s)))
                        l = output.tell()
                        # tempio.truncate(0)
                        # fastPickler.dump(s)
                        output.write(compress(serializer.dumps(s), level))
                        headindex[i * 2] = output.tell() - l
                        if lens > 1:
                            if lens < 256:
                                output.write(compress(
                                    array('B', [coldict[y] for y in col]).tostring(),
                                    level))
                            else:
                                output.write(compress(
                                    array('H', [coldict[y] for y in col]).tostring(),
                                    level))
                            headindex[i * 2 + 1] = output.tell() - l - headindex[i * 2]
                else:
                    if i in paxcols:
                        l = output.tell()
                        # tempio.truncate(0)
                        # fastPickler.dump(col)
                        output.write(compress(serializer.dumps(col), level))
                        headindex[i * 2] = output.tell() - l
                    else:
                        s = sorted(set(col))
                        lens = len(s)
                        coldict = dict(((x, y) for y, x in enumerate(s)))
                        l = output.tell()
                        # tempio.truncate(0)
                        # fastPickler.dump(s)
                        output.write(compress(serializer.dumps(s), level))
                        headindex[i * 2] = output.tell() - l
                        if lens > 1:
                            if lens < 256:
                                output.write(compress(
                                    array('B', [coldict[y] for y in col]).tostring(),
                                    level))
                            else:
                                output.write(compress(
                                    array('H', [coldict[y] for y in col]).tostring(),
                                    level))
                            headindex[i * 2 + 1] = output.tell() - l - headindex[i * 2]
            blocknum = 1
        headindex[colnum * 2] = count
        output.seek(0)
        type = '!' + 'i' * len(headindex)
        output.write(struct.pack('!B', 1))
        if compression == BZ2:
            output.write(struct.pack('!B', 0))
        else:
            output.write(struct.pack('!B', 1))
        output.write(struct.pack(type, *headindex))
        cz = output.getvalue()
        fileIter.write(struct.pack('!i', len(cz)))
        fileIter.write(cz)
    fileIter.close()

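# The writer above frames each block as: a '!i' byte-length prefix, two flag
# bytes, a (colnum * 2 + 1)-slot '!i' header (per-column value/index sizes,
# row count in the last slot), then the compressed column payloads. A minimal
# reader sketch, assuming zlib compression and msgpack serialization; it
# skips the dictionary-index sections rather than decoding them:
import struct
import zlib
import msgpack

def read_block(f, colnum):
    (blen,) = struct.unpack('!i', f.read(4))
    block = f.read(blen)
    kind, zlib_flag = struct.unpack('!BB', block[:2])  # 2nd byte: 1=zlib, 0=bz2
    nints = colnum * 2 + 1
    head = struct.unpack('!%di' % nints, block[2:2 + 4 * nints])
    rowcount = head[-1]                    # last header slot holds the row count
    body = block[2 + 4 * nints:]
    cols, pos = [], 0
    for i in range(colnum):
        vlen, ilen = head[2 * i], head[2 * i + 1]  # value-part / index-part sizes
        values = msgpack.loads(zlib.decompress(body[pos:pos + vlen]))
        pos += vlen + ilen                 # index array (if any) follows the values
        cols.append(values)
    return rowcount, cols
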
def _unpack(self, execute=EX_CONSTRUCT):
    typ, n, obj = self._read_header(execute)

    if execute == EX_READ_ARRAY_HEADER:
        if typ != TYPE_ARRAY:
            raise ValueError("Expected array")
        return n
    if execute == EX_READ_MAP_HEADER:
        if typ != TYPE_MAP:
            raise ValueError("Expected map")
        return n
    # TODO should we eliminate the recursion?
    if typ == TYPE_ARRAY:
        if execute == EX_SKIP:
            for i in xrange(n):
                # TODO check whether we need to call `list_hook`
                self._unpack(EX_SKIP)
            return
        ret = newlist_hint(n)
        for i in xrange(n):
            ret.append(self._unpack(EX_CONSTRUCT))
        if self._list_hook is not None:
            ret = self._list_hook(ret)
        # TODO is the interaction between `list_hook` and `use_list` ok?
        return ret if self._use_list else tuple(ret)
    if typ == TYPE_MAP:
        if execute == EX_SKIP:
            for i in xrange(n):
                # TODO check whether we need to call hooks
                self._unpack(EX_SKIP)
                self._unpack(EX_SKIP)
            return
        if self._object_pairs_hook is not None:
            ret = self._object_pairs_hook(
                (self._unpack(EX_CONSTRUCT), self._unpack(EX_CONSTRUCT))
                for _ in xrange(n)
            )
        else:
            ret = {}
            for _ in xrange(n):
                key = self._unpack(EX_CONSTRUCT)
                if self._strict_map_key and type(key) not in (unicode, bytes):
                    raise ValueError(
                        "%s is not allowed for map key" % str(type(key))
                    )
                if not PY2 and type(key) is str:
                    key = sys.intern(key)
                ret[key] = self._unpack(EX_CONSTRUCT)
            if self._object_hook is not None:
                ret = self._object_hook(ret)
        return ret
    if execute == EX_SKIP:
        return
    if typ == TYPE_RAW:
        if self._raw:
            obj = bytes(obj)
        else:
            obj = obj.decode("utf_8", self._unicode_errors)
        return obj
    if typ == TYPE_BIN:
        return bytes(obj)
    if typ == TYPE_EXT:
        if n == -1:  # timestamp
            ts = Timestamp.from_bytes(bytes(obj))
            if self._timestamp == 1:
                return ts.to_unix()
            elif self._timestamp == 2:
                return ts.to_unix_nano()
            elif self._timestamp == 3:
                return ts.to_datetime()
            else:
                return ts
        else:
            return self._ext_hook(n, bytes(obj))
    assert typ == TYPE_IMMEDIATE
    return obj

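# The n == -1 branch handles msgpack's reserved ext type for timestamps; the
# timestamp option (0=Timestamp, 1=unix float, 2=unix nanoseconds,
# 3=datetime) picks the returned representation. Mirrored in the public API,
# assuming msgpack >= 1.0:
import datetime
import msgpack

now = datetime.datetime.now(datetime.timezone.utc)
packed = msgpack.packb(now, datetime=True)   # encoded as ext type -1

print(msgpack.unpackb(packed, timestamp=0))  # msgpack.Timestamp(...)
print(msgpack.unpackb(packed, timestamp=3))  # timezone-aware datetime again
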
def __init__(self, length_hint):
    self.length_hint = length_hint
    self.list = newlist_hint(length_hint)

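# The portable guard used throughout these snippets, extracted: on PyPy
# newlist_hint preallocates list storage for the expected final length,
# while on CPython the hint is simply ignored and lists grow as usual.
import sys

if hasattr(sys, 'pypy_version_info'):
    from __pypy__ import newlist_hint
else:
    newlist_hint = lambda sizehint: []
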
def spac(fileIter, lencols, compression, level):
    indexes_size = [0] * (len(schema) - 1)
    values_size = [0] * (len(schema) - 1)
    valcount = [0] * (len(schema) - 1)
    output = StringIO.StringIO()
    check = 0
    globaldict = [None] * (len(schema) - 1)
    index_init = [0 for _ in xrange(3)]
    if split:
        output.write(struct.pack('!B', 0))
        cPickle.dump(schema[1:], output, 1)
        colnum = len(schema) - 1
        cz = output.getvalue()
        fileIter.write(struct.pack('!i', len(cz)))
        fileIter.write(cz)
    else:
        colnum = len(schema)
        fileIter.write(struct.pack('!B', 0))
        cPickle.dump(schema, fileIter, 1)
    if hasattr(sys, 'pypy_version_info'):
        from __pypy__ import newlist_hint
    else:
        newlist_hint = lambda size: []
    paxcols = []
    blocknum = 0
    # tempio = cStringIO.StringIO()
    # fastPickler = cPickle.Pickler(tempio, 2)
    # fastPickler.fast = 1
    exitGen = False
    compress = zlib.compress
    compress2 = bz2.compress
    if compression == BZ2:
        compress = bz2.compress
    if lencols == 0:
        (yield)
    while not exitGen:
        output.truncate(0)
        mrows = newlist_hint(lencols)
        try:
            for i in xrange(lencols):
                mrows.append((yield))
        except GeneratorExit:
            exitGen = True
        count = len(mrows)
        output.write(struct.pack('!B', 1))
        if compression == BZ2:
            output.write(struct.pack('!B', 0))
        else:
            output.write(struct.pack('!B', 1))
        headindex = [0 for _ in xrange((colnum * 4) + 1)]
        # headindex2 = [0 for _ in xrange(colnum * 4)]
        type = '!' + 'i' * len(headindex)
        output.write(struct.pack(type, *headindex))
        # type = '!' + 'i' * len(headindex2)
        # output.write(struct.pack(type, *headindex2))
        if mrows != []:
            for i, col in enumerate(
                    ([x[c] for x in mrows] for c in xrange(colnum))):
                if blocknum == 0:
                    globaldict[i] = sorted(set(col))
                    lens = len(globaldict[i])
                    if lens > 50 * 1.0 * count / 100:
                        paxcols.append(i)
                        l = output.tell()
                        headindex[i * 4 + 3] = l
                        output.write(compress(serializer.dumps(col), level))
                        headindex[i * 4] = output.tell() - l
                    else:
                        coldict = dict(
                            ((x, y) for y, x in enumerate(globaldict[i])))
                        l = output.tell()
                        headindex[i * 4 + 3] = l
                        output.write(
                            compress(serializer.dumps(globaldict[i]), level))
                        valcount[i] += lens
                        headindex[i * 4] = output.tell() - l
                        values_size[i] += headindex[i * 4]
                        if lens > 1:
                            l1 = output.tell()
                            if lens < 256:
                                output.write(compress(
                                    array('B', [coldict[y] for y in col]).tostring(),
                                    level))
                            elif lens < 65536:
                                output.write(compress(
                                    array('H', [coldict[y] for y in col]).tostring(),
                                    level))
                            else:
                                print 'lala'
                                output.write(compress(
                                    array('i', [coldict[y] for y in col]).tostring(),
                                    level))
                            indexes_size[i] += output.tell() - l1
                            headindex[i * 4 + 1] = output.tell() - l - headindex[i * 4]
                else:
                    if i in paxcols:
                        l = output.tell()
                        headindex[i * 4 + 3] = l
                        output.write(compress(serializer.dumps(col), level))
                        headindex[i * 4] = output.tell() - l
                    else:
                        so = sorted(set(col))
                        setcol = set(so)
                        # Compute the values not yet in the global dictionary.
                        diff = setcol - set(globaldict[i])
                        lens = len(globaldict[i])
                        if lens > 2147483647 or (
                                lens < 65535 and lens + len(diff) > 65535) or (
                                lens < 256 and lens + len(diff) >= 256) or (
                                len(setcol) == 1) or (
                                lens >= 256 and len(setcol) < 256):
                            headindex[i * 4 + 2] = 0
                            # sdc again
                            globaldict[i] = so
                            coldict = dict(
                                ((x, y) for y, x in enumerate(globaldict[i])))
                            l = output.tell()
                            headindex[i * 4 + 3] = l
                            output.write(
                                compress(serializer.dumps(globaldict[i]), level))
                            lens = len(globaldict[i])
                            headindex[i * 4] = output.tell() - l
                            valcount[i] += lens
                            values_size[i] += headindex[i * 4]
                            if lens > 1:
                                l1 = output.tell()
                                if lens < 256:
                                    output.write(compress(
                                        array('B', [coldict[y] for y in col]).tostring(),
                                        level))
                                elif lens < 65536:
                                    output.write(compress(
                                        array('H', [coldict[y] for y in col]).tostring(),
                                        level))
                                else:
                                    output.write(compress(
                                        array('i', [coldict[y] for y in col]).tostring(),
                                        level))
                                indexes_size[i] += output.tell() - l1
                                headindex[i * 4 + 1] = output.tell() - l - headindex[i * 4]
                        else:
                            headindex[i * 4 + 2] = 1
                            dif = sorted(diff)
                            if dif:
                                # Extend the global dictionary with the new values.
                                globaldict[i] = globaldict[i] + dif
                                # globaldict[i] = set(dif).union(globaldict[i])
                            d = 0
                            t = output.tell()
                            if len(globaldict[i]) != 0:
                                if len(globaldict[i]) < 256:
                                    indextype = 'B'
                                elif len(globaldict[i]) < 65536:
                                    indextype = 'H'
                                else:
                                    print 'lala'
                                    indextype = 'i'
                                l = output.tell()
                                headindex[i * 4 + 3] = l
                                output.write(
                                    compress(serializer.dumps(dif), level))
                                valcount[i] += len(dif)
                                headindex[i * 4] = output.tell() - l
                                values_size[i] += headindex[i * 4]
                                lens = len(globaldict[i])
                                coldict = dict(
                                    ((x, y) for y, x in enumerate(globaldict[i])))
                                l = output.tell()
                                if lens > 1:
                                    l1 = output.tell()
                                    output.write(compress(
                                        array(indextype,
                                              [coldict[y] for y in col]).tostring(),
                                        level))
                                    indexes_size[i] += output.tell() - l1
                                    headindex[i * 4 + 1] = output.tell() - l
            blocknum += 1
        headindex[colnum * 4] = count
        output.seek(0)
        type = '!' + 'i' * len(headindex)
        # type2 = '!' + 'i' * len(headindex2)
        output.write(struct.pack('!B', 1))
        if compression == BZ2:
            output.write(struct.pack('!B', 0))
        else:
            output.write(struct.pack('!B', 1))
        output.write(struct.pack(type, *headindex))
        cz = output.getvalue()
        fileIter.write(struct.pack('!i', len(cz)))
        fileIter.write(cz)
    # print indexes_size
    # print values_size
    # print valcount
    fileIter.close()

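# spac extends sorteddictpercol with a growable per-column dictionary: each
# block stores only the values not yet in globaldict[i], plus an index array
# whose element width ('B'/'H'/'i') tracks the dictionary size. The core
# encode/decode step, in isolation:
from array import array

col = ['b', 'a', 'b', 'c', 'a']
values = sorted(set(col))                 # this block's dictionary
coldict = {x: y for y, x in enumerate(values)}

# One unsigned byte per row while the dictionary holds fewer than 256 entries.
indextype = 'B' if len(values) < 256 else ('H' if len(values) < 65536 else 'i')
indexes = array(indextype, [coldict[y] for y in col])

decoded = [values[j] for j in indexes]
assert decoded == col
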
def _fb_unpack(self, execute=EX_CONSTRUCT, write_bytes=None):
    typ = TYPE_IMMEDIATE
    c = self._fb_read(1, write_bytes)
    b = ord(c)
    if b & 0b10000000 == 0:
        obj = b
    elif b & 0b11100000 == 0b11100000:
        obj = struct.unpack("b", c)[0]
    elif b & 0b11100000 == 0b10100000:
        n = b & 0b00011111
        obj = self._fb_read(n, write_bytes)
        typ = TYPE_RAW
    elif b & 0b11110000 == 0b10010000:
        n = b & 0b00001111
        typ = TYPE_ARRAY
    elif b & 0b11110000 == 0b10000000:
        n = b & 0b00001111
        typ = TYPE_MAP
    elif b == 0xc0:
        obj = None
    elif b == 0xc2:
        obj = False
    elif b == 0xc3:
        obj = True
    elif b == 0xca:
        obj = struct.unpack(">f", self._fb_read(4, write_bytes))[0]
    elif b == 0xcb:
        obj = struct.unpack(">d", self._fb_read(8, write_bytes))[0]
    elif b == 0xcc:
        obj = struct.unpack("B", self._fb_read(1, write_bytes))[0]
    elif b == 0xcd:
        obj = struct.unpack(">H", self._fb_read(2, write_bytes))[0]
    elif b == 0xce:
        obj = struct.unpack(">I", self._fb_read(4, write_bytes))[0]
    elif b == 0xcf:
        obj = struct.unpack(">Q", self._fb_read(8, write_bytes))[0]
    elif b == 0xd0:
        obj = struct.unpack("b", self._fb_read(1, write_bytes))[0]
    elif b == 0xd1:
        obj = struct.unpack(">h", self._fb_read(2, write_bytes))[0]
    elif b == 0xd2:
        obj = struct.unpack(">i", self._fb_read(4, write_bytes))[0]
    elif b == 0xd3:
        obj = struct.unpack(">q", self._fb_read(8, write_bytes))[0]
    elif b == 0xda:
        n = struct.unpack(">H", self._fb_read(2, write_bytes))[0]
        obj = self._fb_read(n, write_bytes)
        typ = TYPE_RAW
    elif b == 0xdb:
        n = struct.unpack(">I", self._fb_read(4, write_bytes))[0]
        obj = self._fb_read(n, write_bytes)
        typ = TYPE_RAW
    elif b == 0xdc:
        n = struct.unpack(">H", self._fb_read(2, write_bytes))[0]
        typ = TYPE_ARRAY
    elif b == 0xdd:
        n = struct.unpack(">I", self._fb_read(4, write_bytes))[0]
        typ = TYPE_ARRAY
    elif b == 0xde:
        n = struct.unpack(">H", self._fb_read(2, write_bytes))[0]
        typ = TYPE_MAP
    elif b == 0xdf:
        n = struct.unpack(">I", self._fb_read(4, write_bytes))[0]
        typ = TYPE_MAP
    else:
        raise UnpackValueError("Unknown header: 0x%x" % b)

    if execute == EX_READ_ARRAY_HEADER:
        if typ != TYPE_ARRAY:
            raise UnpackValueError("Expected array")
        return n
    if execute == EX_READ_MAP_HEADER:
        if typ != TYPE_MAP:
            raise UnpackValueError("Expected map")
        return n
    # TODO should we eliminate the recursion?
    if typ == TYPE_ARRAY:
        if execute == EX_SKIP:
            for i in xrange(n):
                # TODO check whether we need to call `list_hook`
                self._fb_unpack(EX_SKIP, write_bytes)
            return
        ret = newlist_hint(n)
        for i in xrange(n):
            ret.append(self._fb_unpack(EX_CONSTRUCT, write_bytes))
        if self._list_hook is not None:
            ret = self._list_hook(ret)
        # TODO is the interaction between `list_hook` and `use_list` ok?
        return ret if self._use_list else tuple(ret)
    if typ == TYPE_MAP:
        if execute == EX_SKIP:
            for i in xrange(n):
                # TODO check whether we need to call hooks
                self._fb_unpack(EX_SKIP, write_bytes)
                self._fb_unpack(EX_SKIP, write_bytes)
            return
        if self._object_pairs_hook is not None:
            ret = self._object_pairs_hook(
                (self._fb_unpack(EX_CONSTRUCT, write_bytes),
                 self._fb_unpack(EX_CONSTRUCT, write_bytes))
                for _ in xrange(n)
            )
        else:
            ret = {}
            for _ in xrange(n):
                key = self._fb_unpack(EX_CONSTRUCT, write_bytes)
                ret[key] = self._fb_unpack(EX_CONSTRUCT, write_bytes)
            if self._object_hook is not None:
                ret = self._object_hook(ret)
        return ret
    if execute == EX_SKIP:
        return
    if typ == TYPE_RAW:
        if self._encoding is not None:
            obj = obj.decode(self._encoding, self._unicode_errors)
        return obj
    assert typ == TYPE_IMMEDIATE
    return obj

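# The if/elif ladder above dispatches on the msgpack type tag carried in the
# first byte. Hand-decoding a fixarray with the same bit tests:
import struct

data = b'\x93\x01\x02\xcd\x01\x00'        # fixarray of 3: [1, 2, 256]

b = data[0]
assert b & 0b11110000 == 0b10010000       # 0x90..0x9f: fixarray
n = b & 0b00001111                        # element count: 3

assert data[1] & 0b10000000 == 0          # 0x01: positive fixint 1
assert data[3] == 0xcd                    # uint16 marker
(big,) = struct.unpack(">H", data[4:6])   # big-endian payload
print(n, data[1], data[2], big)           # 3 1 2 256
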
def _unpack(self, command: int = _CMD_CONSTRUCT,
            data_type: typing.Optional[int] = None):
    obj_type, n, obj, obj_dt = self._read_header(data_type)

    # Type checking
    if command == _CMD_READ_ARRAY_HEADER:
        if obj_type != _TYPE_ARRAY and obj_type != _TYPE_MARRAY:
            raise ValueError('Expected ARRAY')
        return n
    elif command == _CMD_READ_MAP_HEADER:
        if obj_type != _TYPE_MAP:
            raise ValueError('Expected MAP')
        return n

    # Unpacking ARRAY
    if obj_type == _TYPE_ARRAY:
        # Skip over every element in the ARRAY
        if command == _CMD_SKIP:
            for _ in range(n):
                self._unpack(_CMD_SKIP)
            return
        ret = newlist_hint(n)
        for _ in range(n):
            ret.append(self._unpack(_CMD_CONSTRUCT, data_type=obj_dt))
        if self._list_hook is not None:
            ret = self._list_hook(ret)
        return ret
    # Unpacking MARRAY
    elif obj_type == _TYPE_MARRAY:
        if command == _CMD_SKIP:
            for _ in range(n):
                self._unpack(_CMD_SKIP)
            return
        ret = newlist_hint(n)
        for _ in range(n):
            ret.append(self._unpack(_CMD_CONSTRUCT))
        if self._list_hook is not None:
            ret = self._list_hook(ret)
        return ret
    # Unpacking MAP
    elif obj_type == _TYPE_MAP:
        if command == _CMD_SKIP:
            for _ in range(n):
                self._unpack(_CMD_SKIP)
                self._unpack(_CMD_SKIP)
            return
        if self._object_pairs_hook is not None:
            ret = self._object_pairs_hook(
                (self._unpack(_CMD_CONSTRUCT), self._unpack(_CMD_CONSTRUCT))
                for _ in range(n))
        else:
            ret = {}
            for _ in range(n):
                key = self._unpack(_CMD_CONSTRUCT)
                ret[key] = self._unpack(_CMD_CONSTRUCT)
            if self._object_hook is not None:
                ret = self._object_hook(ret)
        return ret
    if command == _CMD_SKIP:
        return
    # Unpacking STR
    if obj_type == _TYPE_STR:
        return obj.decode('utf-8')
    # Unpacking BIN
    elif obj_type == _TYPE_BIN:
        return bytes(obj)
    # Unpacking EXT
    elif obj_type == _TYPE_EXT:
        return self._ext_hook(n, bytes(obj))
    # Unpacking INT
    assert obj_type == _TYPE_IMMEDIATE
    return obj