def _unpack(self, execute=EX_CONSTRUCT):
        """Decode a single msgpack object from the stream.

        `execute` selects the mode:
          - EX_CONSTRUCT: build and return the Python object.
          - EX_SKIP: consume the object's bytes, return None.
          - EX_READ_ARRAY_HEADER / EX_READ_MAP_HEADER: validate the
            container type and return only its element count.

        Raises UnpackValueError when a header-read mode encounters the
        wrong container type.
        """
        typ, n, obj = self._read_header(execute)

        # Header-only modes: return the length without reading elements.
        if execute == EX_READ_ARRAY_HEADER:
            if typ != TYPE_ARRAY:
                raise UnpackValueError("Expected array")
            return n
        if execute == EX_READ_MAP_HEADER:
            if typ != TYPE_MAP:
                raise UnpackValueError("Expected map")
            return n
        # TODO should we eliminate the recursion?
        if typ == TYPE_ARRAY:
            if execute == EX_SKIP:
                for i in range(n):
                    # TODO check whether we need to call `list_hook`
                    self._unpack(EX_SKIP)
                return
            # newlist_hint presumably pre-sizes the list on PyPy — confirm.
            ret = newlist_hint(n)
            for i in range(n):
                ret.append(self._unpack(EX_CONSTRUCT))
            if self._list_hook is not None:
                ret = self._list_hook(ret)
            # TODO is the interaction between `list_hook` and `use_list` ok?
            return ret if self._use_list else tuple(ret)
        if typ == TYPE_MAP:
            if execute == EX_SKIP:
                for i in range(n):
                    # TODO check whether we need to call hooks
                    # Skip both key and value of each pair.
                    self._unpack(EX_SKIP)
                    self._unpack(EX_SKIP)
                return
            if self._object_pairs_hook is not None:
                # Feed (key, value) pairs lazily; each tuple evaluates the
                # key before the value, preserving stream order.
                ret = self._object_pairs_hook(
                    (self._unpack(EX_CONSTRUCT), self._unpack(EX_CONSTRUCT))
                    for _ in range(n))
            else:
                ret = {}
                for _ in range(n):
                    key = self._unpack(EX_CONSTRUCT)
                    ret[key] = self._unpack(EX_CONSTRUCT)
                if self._object_hook is not None:
                    ret = self._object_hook(ret)
            return ret
        # Scalar types below carry no nested payload, so skipping is free.
        if execute == EX_SKIP:
            return
        if typ == TYPE_RAW:
            # Explicit encoding wins; `_raw` keeps bytes; otherwise UTF-8.
            if self._encoding is not None:
                obj = obj.decode(self._encoding, self._unicode_errors)
            elif self._raw:
                obj = bytes(obj)
            else:
                obj = obj.decode('utf_8')
            return obj
        if typ == TYPE_EXT:
            # `n` is passed through as the ext hook's first argument —
            # presumably the ext type code set by _read_header; confirm.
            return self._ext_hook(n, bytes(obj))
        if typ == TYPE_BIN:
            return bytes(obj)
        assert typ == TYPE_IMMEDIATE
        return obj
Exemple #2
0
    def row(fileIter, lencols, compression, level):
        """Generator coroutine: receives rows via send() and writes them
        to `fileIter` as serialized row-wise blocks of `lencols` rows.

        NOTE(review): `compression` and `level` are accepted but never
        used here — presumably kept for interface parity with the other
        writer coroutines; confirm.
        """
        # Prefer PyPy's pre-sized list allocator when available.
        if hasattr(sys, 'pypy_version_info'):
            from __pypy__ import newlist_hint

        else:
            newlist_hint = lambda size: []
        # Fall back to marshal when msgpack is missing; both expose a
        # compatible dumps().
        try:
            import msgpack
        except ImportError:
            import marshal as msgpack
        serializer = msgpack
        exitGen = False
        while not exitGen:
            mrows = newlist_hint(lencols)
            if lencols == 0:
                (yield)
            try:
                # Collect up to `lencols` rows; close() ends the loop
                # after the partial batch below is flushed.
                for i in xrange(lencols):
                    mrows.append((yield))
            except GeneratorExit:
                exitGen = True
            output = StringIO.StringIO()
            # `schema` comes from the enclosing scope — assumed to be the
            # column schema; confirm against the caller.
            colnum = len(schema)
            # NOTE(review): truncate(0) on a freshly created StringIO is
            # redundant.
            output.truncate(0)

            # Block header: two flag bytes, then (colnum * 2) + 1 ints of
            # index placeholders, then the serialized rows.
            output.write(struct.pack('!B', 1))
            output.write(struct.pack('!B', 0))
            headindex = [0 for _ in xrange((colnum * 2) + 1)]
            type = '!' + 'i' * len(headindex)
            output.write(struct.pack(type, *headindex))
            output.write(serializer.dumps(mrows))

            fileIter.write(output.getvalue())
Exemple #3
0
    def _unpack(self, execute=EX_CONSTRUCT):
        """Decode a single msgpack object from the stream.

        `execute` selects the mode:
          - EX_CONSTRUCT: build and return the Python object.
          - EX_SKIP: consume the object's bytes, return None.
          - EX_READ_ARRAY_HEADER / EX_READ_MAP_HEADER: validate the
            container type and return only its element count.

        Raises UnpackValueError when a header-read mode encounters the
        wrong container type.
        """
        typ, n, obj = self._read_header(execute)

        # Header-only modes: return the length without reading elements.
        if execute == EX_READ_ARRAY_HEADER:
            if typ != TYPE_ARRAY:
                raise UnpackValueError("Expected array")
            return n
        if execute == EX_READ_MAP_HEADER:
            if typ != TYPE_MAP:
                raise UnpackValueError("Expected map")
            return n
        # TODO should we eliminate the recursion?
        if typ == TYPE_ARRAY:
            if execute == EX_SKIP:
                for i in xrange(n):
                    # TODO check whether we need to call `list_hook`
                    self._unpack(EX_SKIP)
                return
            # newlist_hint presumably pre-sizes the list on PyPy — confirm.
            ret = newlist_hint(n)
            for i in xrange(n):
                ret.append(self._unpack(EX_CONSTRUCT))
            if self._list_hook is not None:
                ret = self._list_hook(ret)
            # TODO is the interaction between `list_hook` and `use_list` ok?
            return ret if self._use_list else tuple(ret)
        if typ == TYPE_MAP:
            if execute == EX_SKIP:
                for i in xrange(n):
                    # TODO check whether we need to call hooks
                    # Skip both key and value of each pair.
                    self._unpack(EX_SKIP)
                    self._unpack(EX_SKIP)
                return
            if self._object_pairs_hook is not None:
                # Feed (key, value) pairs lazily; each tuple evaluates the
                # key before the value, preserving stream order.
                ret = self._object_pairs_hook(
                    (self._unpack(EX_CONSTRUCT),
                     self._unpack(EX_CONSTRUCT))
                    for _ in xrange(n))
            else:
                ret = {}
                for _ in xrange(n):
                    key = self._unpack(EX_CONSTRUCT)
                    ret[key] = self._unpack(EX_CONSTRUCT)
                if self._object_hook is not None:
                    ret = self._object_hook(ret)
            return ret
        # Scalar types below carry no nested payload, so skipping is free.
        if execute == EX_SKIP:
            return
        if typ == TYPE_RAW:
            # Explicit encoding wins; otherwise keep raw bytes.
            if self._encoding is not None:
                obj = obj.decode(self._encoding, self._unicode_errors)
            else:
                obj = bytes(obj)
            return obj
        if typ == TYPE_EXT:
            # `n` is passed through as the ext hook's first argument —
            # presumably the ext type code set by _read_header; confirm.
            return self._ext_hook(n, bytes(obj))
        if typ == TYPE_BIN:
            return bytes(obj)
        assert typ == TYPE_IMMEDIATE
        return obj
Exemple #4
0
def _filter_tuple(func, seq):
    """Return a tuple of the items of *seq* for which func(item) is true.

    Elements are fetched via explicit indexing so that __getitem__ is
    invoked for every position (the tests rely on this).
    """
    n = len(seq)
    kept = newlist_hint(n)
    idx = 0
    while idx < n:
        # Explicit __getitem__ call — required by the tests.
        candidate = seq[idx]
        if func(candidate):
            kept.append(candidate)
        idx += 1
    return tuple(kept)
Exemple #5
0
    def __fetch_one_row(self):
        """Read the current result row from the prepared SQLite statement.

        Returns a tuple with one converted Python value per result column.

        Raises OperationalError when text_factory fails on a TEXT column.
        """
        num_cols = _lib.sqlite3_data_count(self.__statement._statement)
        row = newlist_hint(num_cols)
        for i in xrange(num_cols):
            # With detect_types enabled, a per-column converter was
            # prepared earlier in __row_cast_map; otherwise fall back to
            # SQLite's own dynamic typing below.
            if self.__connection._detect_types:
                converter = self.__row_cast_map[i]
            else:
                converter = None

            if converter is not None:
                # Read the raw column bytes and let the registered
                # converter build the value; NULL stays None.
                blob = _lib.sqlite3_column_blob(self.__statement._statement, i)
                if not blob:
                    val = None
                else:
                    blob_len = _lib.sqlite3_column_bytes(
                        self.__statement._statement, i)
                    val = _ffi.buffer(blob, blob_len)[:]
                    val = converter(val)
            else:
                # Dispatch on the column's dynamic SQLite storage class.
                typ = _lib.sqlite3_column_type(self.__statement._statement, i)
                if typ == _lib.SQLITE_NULL:
                    val = None
                elif typ == _lib.SQLITE_INTEGER:
                    val = _lib.sqlite3_column_int64(
                        self.__statement._statement, i)
                    val = int(val)
                elif typ == _lib.SQLITE_FLOAT:
                    val = _lib.sqlite3_column_double(
                        self.__statement._statement, i)
                elif typ == _lib.SQLITE_TEXT:
                    text = _lib.sqlite3_column_text(
                        self.__statement._statement, i)
                    text_len = _lib.sqlite3_column_bytes(
                        self.__statement._statement, i)
                    val = _ffi.buffer(text, text_len)[:]
                    try:
                        val = self.__connection.text_factory(val)
                    except Exception:
                        # Re-raise with the column name so decode failures
                        # are diagnosable.
                        column_name = _lib.sqlite3_column_name(
                            self.__statement._statement, i)
                        if column_name:
                            column_name = _ffi.string(column_name).decode(
                                'utf-8')
                        else:
                            column_name = "<unknown column name>"
                        val = val.decode('ascii', 'replace')
                        raise OperationalError(
                            "Could not decode to UTF-8 column '%s' with text '%s'"
                            % (column_name, val))
                elif typ == _lib.SQLITE_BLOB:
                    blob = _lib.sqlite3_column_blob(
                        self.__statement._statement, i)
                    blob_len = _lib.sqlite3_column_bytes(
                        self.__statement._statement, i)
                    val = _BLOB_TYPE(_ffi.buffer(blob, blob_len)[:])
            row.append(val)
        return tuple(row)
Exemple #6
0
def _filter_string(func, string, str_type):
    """Return the characters of *string* kept by *func*, joined as str_type.

    Characters are fetched via explicit indexing because __getitem__ must
    be invoked for each position; items that are not str_type instances
    raise TypeError.
    """
    # Fast path: truth-filtering an exact str_type returns it unchanged.
    if func is bool and type(string) is str_type:
        return string
    n = len(string)
    pieces = newlist_hint(n)
    idx = 0
    while idx < n:
        # Explicit __getitem__ call — plain iteration is not sufficient.
        ch = string[idx]
        idx += 1
        if not func(ch):
            continue
        if not isinstance(ch, str_type):
            raise TypeError("__getitem__ returned a non-string type")
        pieces.append(ch)
    return str_type().join(pieces)
Exemple #7
0
    def __fetch_one_row(self):
        """Read the current result row from the prepared SQLite statement.

        Returns a tuple with one converted Python value per result column.

        Raises OperationalError when text_factory fails on a TEXT column.
        """
        num_cols = _lib.sqlite3_data_count(self.__statement._statement)
        row = newlist_hint(num_cols)
        for i in xrange(num_cols):
            # With detect_types enabled, a per-column converter was
            # prepared earlier in __row_cast_map; otherwise fall back to
            # SQLite's own dynamic typing below.
            if self.__connection._detect_types:
                converter = self.__row_cast_map[i]
            else:
                converter = None

            if converter is not None:
                # Read the raw column bytes and let the registered
                # converter build the value; NULL stays None.
                blob = _lib.sqlite3_column_blob(self.__statement._statement, i)
                if not blob:
                    val = None
                else:
                    blob_len = _lib.sqlite3_column_bytes(self.__statement._statement, i)
                    val = _ffi.buffer(blob, blob_len)[:]
                    val = converter(val)
            else:
                # Dispatch on the column's dynamic SQLite storage class.
                typ = _lib.sqlite3_column_type(self.__statement._statement, i)
                if typ == _lib.SQLITE_NULL:
                    val = None
                elif typ == _lib.SQLITE_INTEGER:
                    val = _lib.sqlite3_column_int64(self.__statement._statement, i)
                    val = int(val)
                elif typ == _lib.SQLITE_FLOAT:
                    val = _lib.sqlite3_column_double(self.__statement._statement, i)
                elif typ == _lib.SQLITE_TEXT:
                    text = _lib.sqlite3_column_text(self.__statement._statement, i)
                    text_len = _lib.sqlite3_column_bytes(self.__statement._statement, i)
                    val = _ffi.buffer(text, text_len)[:]
                    try:
                        val = self.__connection.text_factory(val)
                    except Exception:
                        # Re-raise with the column name so decode failures
                        # are diagnosable.
                        column_name = _lib.sqlite3_column_name(
                            self.__statement._statement, i)
                        if column_name:
                            column_name = _ffi.string(column_name).decode('utf-8')
                        else:
                            column_name = "<unknown column name>"
                        val = val.decode('ascii', 'replace')
                        raise OperationalError(
                            "Could not decode to UTF-8 column '%s' with text '%s'" % (
                                column_name, val))
                elif typ == _lib.SQLITE_BLOB:
                    blob = _lib.sqlite3_column_blob(self.__statement._statement, i)
                    blob_len = _lib.sqlite3_column_bytes(self.__statement._statement, i)
                    val = _BLOB_TYPE(_ffi.buffer(blob, blob_len)[:])
            row.append(val)
        return tuple(row)
Exemple #8
0
    def __fetch_one_row(self):
        """Read the current result row from the prepared SQLite statement.

        Returns a tuple with one converted Python value per result column.
        Unlike the variants that guard text_factory, this version lets
        text_factory exceptions propagate unchanged.
        """
        num_cols = _lib.sqlite3_data_count(self.__statement._statement)
        row = newlist_hint(num_cols)
        for i in xrange(num_cols):
            # With detect_types enabled, a per-column converter was
            # prepared earlier in __row_cast_map; otherwise fall back to
            # SQLite's own dynamic typing below.
            if self.__connection._detect_types:
                converter = self.__row_cast_map[i]
            else:
                converter = None

            if converter is not None:
                # Read the raw column bytes and let the registered
                # converter build the value; NULL stays None.
                blob = _lib.sqlite3_column_blob(self.__statement._statement, i)
                if not blob:
                    val = None
                else:
                    blob_len = _lib.sqlite3_column_bytes(
                        self.__statement._statement, i)
                    val = _ffi.buffer(blob, blob_len)[:]
                    val = converter(val)
            else:
                # Dispatch on the column's dynamic SQLite storage class.
                typ = _lib.sqlite3_column_type(self.__statement._statement, i)
                if typ == _lib.SQLITE_NULL:
                    val = None
                elif typ == _lib.SQLITE_INTEGER:
                    val = _lib.sqlite3_column_int64(
                        self.__statement._statement, i)
                    val = int(val)
                elif typ == _lib.SQLITE_FLOAT:
                    val = _lib.sqlite3_column_double(
                        self.__statement._statement, i)
                elif typ == _lib.SQLITE_TEXT:
                    text = _lib.sqlite3_column_text(
                        self.__statement._statement, i)
                    text_len = _lib.sqlite3_column_bytes(
                        self.__statement._statement, i)
                    val = _ffi.buffer(text, text_len)[:]
                    val = self.__connection.text_factory(val)
                elif typ == _lib.SQLITE_BLOB:
                    blob = _lib.sqlite3_column_blob(
                        self.__statement._statement, i)
                    blob_len = _lib.sqlite3_column_bytes(
                        self.__statement._statement, i)
                    val = _BLOB_TYPE(_ffi.buffer(blob, blob_len)[:])
            row.append(val)
        return tuple(row)
Exemple #9
0
    def __fetch_one_row(self):
        """Read the current result row from the prepared SQLite statement.

        Returns a tuple with one converted Python value per result column.
        Unlike the variants that guard text_factory, this version lets
        text_factory exceptions propagate unchanged.
        """
        num_cols = _lib.sqlite3_data_count(self.__statement._statement)
        row = newlist_hint(num_cols)
        for i in xrange(num_cols):
            # With detect_types enabled, a per-column converter was
            # prepared earlier in __row_cast_map; otherwise fall back to
            # SQLite's own dynamic typing below.
            if self.__connection._detect_types:
                converter = self.__row_cast_map[i]
            else:
                converter = None

            if converter is not None:
                # Read the raw column bytes and let the registered
                # converter build the value; NULL stays None.
                blob = _lib.sqlite3_column_blob(self.__statement._statement, i)
                if not blob:
                    val = None
                else:
                    blob_len = _lib.sqlite3_column_bytes(self.__statement._statement, i)
                    val = _ffi.buffer(blob, blob_len)[:]
                    val = converter(val)
            else:
                # Dispatch on the column's dynamic SQLite storage class.
                typ = _lib.sqlite3_column_type(self.__statement._statement, i)
                if typ == _lib.SQLITE_NULL:
                    val = None
                elif typ == _lib.SQLITE_INTEGER:
                    val = _lib.sqlite3_column_int64(self.__statement._statement, i)
                    val = int(val)
                elif typ == _lib.SQLITE_FLOAT:
                    val = _lib.sqlite3_column_double(self.__statement._statement, i)
                elif typ == _lib.SQLITE_TEXT:
                    text = _lib.sqlite3_column_text(self.__statement._statement, i)
                    text_len = _lib.sqlite3_column_bytes(self.__statement._statement, i)
                    val = _ffi.buffer(text, text_len)[:]
                    val = self.__connection.text_factory(val)
                elif typ == _lib.SQLITE_BLOB:
                    blob = _lib.sqlite3_column_blob(self.__statement._statement, i)
                    blob_len = _lib.sqlite3_column_bytes(self.__statement._statement, i)
                    val = _BLOB_TYPE(_ffi.buffer(blob, blob_len)[:])
            row.append(val)
        return tuple(row)
Exemple #10
0
    def sorteddictpercol(fileIter, lencols, compression, level):
        """Generator coroutine: receives rows via send() and writes them
        to `fileIter` as dictionary-compressed columnar blocks.

        For each block of `lencols` rows, every column is either stored
        "PAX-style" (serialized as-is, when it has many distinct values)
        or dictionary-encoded (sorted distinct values plus an index
        array).  That per-column choice is made on the first block only
        and reused afterwards (`paxcols`).

        NOTE(review): `split`, `schema`, `serializer` and `BZ2` come from
        the enclosing scope — confirm their meaning against the caller.
        """
        output = StringIO.StringIO()
        # File prologue: the schema, pickled either into the block stream
        # (split mode, length-prefixed) or directly into fileIter.
        if split:
            output.write(struct.pack('!B', 0))
            cPickle.dump(schema[1:], output, 1)
            colnum = len(schema) - 1
            cz = output.getvalue()
            fileIter.write(struct.pack('!i', len(cz)))
            fileIter.write(cz)

        else:
            colnum = len(schema)
            fileIter.write(struct.pack('!B', 0))
            cPickle.dump(schema, fileIter, 1)
        # Prefer PyPy's pre-sized list allocator when available.
        if hasattr(sys, 'pypy_version_info'):
            from __pypy__ import newlist_hint

        else:
            newlist_hint = lambda size: []
        paxcols = []
        blocknum = 0

        #        tempio = cStringIO.StringIO()
        #        fastPickler = cPickle.Pickler(tempio, 2)
        #        fastPickler.fast = 1
        exitGen = False
        compress = zlib.compress
        if compression == BZ2:
            compress = bz2.compress
        if lencols == 0:
            (yield)

        while not exitGen:
            output.truncate(0)
            mrows = newlist_hint(lencols)
            try:
                # Collect up to `lencols` rows; close() ends the loop
                # after the partial batch below is flushed.
                for i in xrange(lencols):
                    mrows.append((yield))
            except GeneratorExit:
                exitGen = True

            count = len(mrows)
            # Provisional block header: version byte, compression flag,
            # and a placeholder index that is rewritten at the end once
            # the per-column offsets are known.
            output.write(struct.pack('!B', 1))
            if compression == BZ2:
                output.write(struct.pack('!B', 0))
            else:
                output.write(struct.pack('!B', 1))

            headindex = [0 for _ in xrange((colnum * 2) + 1)]
            type = '!' + 'i' * len(headindex)
            output.write(struct.pack(type, *headindex))

            if mrows != []:

                # Transpose the row batch and handle one column at a time.
                for i, col in enumerate(
                    ([x[c] for x in mrows] for c in xrange(colnum))):

                    if blocknum == 0:
                        # First block: decide the encoding per column.
                        s = sorted(set(col))
                        lens = len(s)
                        if lens > 50 * 1.0 * count / 100:
                            # More than 50% distinct values: dictionary
                            # encoding will not pay off, store as-is.
                            paxcols.append(i)
                            l = output.tell()
                            #                            tempio.truncate(0)
                            #                            fastPickler.dump(col)
                            output.write(compress(serializer.dumps(col),
                                                  level))
                            headindex[i * 2] = output.tell() - l
                        else:
                            # Dictionary encoding: write the sorted
                            # distinct values, then (if more than one)
                            # an index array — 8-bit when the dictionary
                            # fits in 256 entries, else 16-bit.
                            coldict = dict(((x, y) for y, x in enumerate(s)))
                            l = output.tell()
                            #                            tempio.truncate(0)
                            #                            fastPickler.dump(s)
                            output.write(compress(serializer.dumps(s), level))
                            headindex[i * 2] = output.tell() - l
                            if lens > 1:
                                if lens < 256:
                                    output.write(
                                        compress(
                                            array('B',
                                                  [coldict[y]
                                                   for y in col]).tostring(),
                                            level))
                                else:
                                    output.write(
                                        compress(
                                            array('H',
                                                  [coldict[y]
                                                   for y in col]).tostring(),
                                            level))
                            headindex[i * 2 +
                                      1] = output.tell() - l - headindex[i * 2]
                    else:
                        # Subsequent blocks: reuse the first block's
                        # per-column decision.
                        if i in paxcols:
                            l = output.tell()
                            #                            tempio.truncate(0)
                            #                            fastPickler.dump(col)
                            output.write(compress(serializer.dumps(col),
                                                  level))
                            headindex[i * 2] = output.tell() - l
                        else:
                            s = sorted(set(col))
                            lens = len(s)
                            coldict = dict(((x, y) for y, x in enumerate(s)))
                            l = output.tell()
                            #                            tempio.truncate(0)
                            #                            fastPickler.dump(s)
                            output.write(compress(serializer.dumps(s), level))
                            headindex[i * 2] = output.tell() - l
                            if lens > 1:
                                if lens < 256:
                                    output.write(
                                        compress(
                                            array('B',
                                                  [coldict[y]
                                                   for y in col]).tostring(),
                                            level))
                                else:
                                    output.write(
                                        compress(
                                            array('H',
                                                  [coldict[y]
                                                   for y in col]).tostring(),
                                            level))
                            headindex[i * 2 +
                                      1] = output.tell() - l - headindex[i * 2]

                # Rewind and rewrite the header with the real offsets and
                # row count, then emit the length-prefixed block.
                blocknum = 1
                headindex[colnum * 2] = count
                output.seek(0)
                type = '!' + 'i' * len(headindex)
                output.write(struct.pack('!B', 1))
                if compression == BZ2:
                    output.write(struct.pack('!B', 0))
                else:
                    output.write(struct.pack('!B', 1))
                output.write(struct.pack(type, *headindex))
                cz = output.getvalue()
                fileIter.write(struct.pack('!i', len(cz)))
                fileIter.write(cz)
        fileIter.close()
Exemple #11
0
    def _unpack(self, execute=EX_CONSTRUCT):
        """Decode a single msgpack object from the stream.

        `execute` selects the mode:
          - EX_CONSTRUCT: build and return the Python object.
          - EX_SKIP: consume the object's bytes, return None.
          - EX_READ_ARRAY_HEADER / EX_READ_MAP_HEADER: validate the
            container type and return only its element count.

        Raises ValueError when a header-read mode encounters the wrong
        container type, or when strict_map_key rejects a map key.
        """
        typ, n, obj = self._read_header(execute)

        # Header-only modes: return the length without reading elements.
        if execute == EX_READ_ARRAY_HEADER:
            if typ != TYPE_ARRAY:
                raise ValueError("Expected array")
            return n
        if execute == EX_READ_MAP_HEADER:
            if typ != TYPE_MAP:
                raise ValueError("Expected map")
            return n
        # TODO should we eliminate the recursion?
        if typ == TYPE_ARRAY:
            if execute == EX_SKIP:
                for i in xrange(n):
                    # TODO check whether we need to call `list_hook`
                    self._unpack(EX_SKIP)
                return
            # newlist_hint presumably pre-sizes the list on PyPy — confirm.
            ret = newlist_hint(n)
            for i in xrange(n):
                ret.append(self._unpack(EX_CONSTRUCT))
            if self._list_hook is not None:
                ret = self._list_hook(ret)
            # TODO is the interaction between `list_hook` and `use_list` ok?
            return ret if self._use_list else tuple(ret)
        if typ == TYPE_MAP:
            if execute == EX_SKIP:
                for i in xrange(n):
                    # TODO check whether we need to call hooks
                    # Skip both key and value of each pair.
                    self._unpack(EX_SKIP)
                    self._unpack(EX_SKIP)
                return
            if self._object_pairs_hook is not None:
                # Feed (key, value) pairs lazily; each tuple evaluates the
                # key before the value, preserving stream order.
                ret = self._object_pairs_hook(
                    (self._unpack(EX_CONSTRUCT), self._unpack(EX_CONSTRUCT))
                    for _ in xrange(n)
                )
            else:
                ret = {}
                for _ in xrange(n):
                    key = self._unpack(EX_CONSTRUCT)
                    # strict_map_key restricts keys to str/bytes.
                    if self._strict_map_key and type(key) not in (unicode, bytes):
                        raise ValueError(
                            "%s is not allowed for map key" % str(type(key))
                        )
                    # Intern str keys so repeated keys share one object
                    # and dict lookups hit the identity fast path.
                    if not PY2 and type(key) is str:
                        key = sys.intern(key)
                    ret[key] = self._unpack(EX_CONSTRUCT)
                if self._object_hook is not None:
                    ret = self._object_hook(ret)
            return ret
        # Scalar types below carry no nested payload, so skipping is free.
        if execute == EX_SKIP:
            return
        if typ == TYPE_RAW:
            # `_raw` keeps bytes; otherwise decode as UTF-8.
            if self._raw:
                obj = bytes(obj)
            else:
                obj = obj.decode("utf_8", self._unicode_errors)
            return obj
        if typ == TYPE_BIN:
            return bytes(obj)
        if typ == TYPE_EXT:
            if n == -1:  # timestamp
                # `_timestamp` selects the representation: 1 = unix
                # seconds, 2 = unix nanoseconds, 3 = datetime, else the
                # Timestamp object itself.
                ts = Timestamp.from_bytes(bytes(obj))
                if self._timestamp == 1:
                    return ts.to_unix()
                elif self._timestamp == 2:
                    return ts.to_unix_nano()
                elif self._timestamp == 3:
                    return ts.to_datetime()
                else:
                    return ts
            else:
                # `n` is passed through as the ext hook's first argument —
                # presumably the ext type code; confirm.
                return self._ext_hook(n, bytes(obj))
        assert typ == TYPE_IMMEDIATE
        return obj
Exemple #12
0
 def __init__(self, length_hint):
     """Initialize with an expected size; the backing list is allocated
     via newlist_hint so PyPy can pre-size it."""
     self.list = newlist_hint(length_hint)
     self.length_hint = length_hint
Exemple #13
0
    def spac(fileIter, lencols, compression, level):
        indexes_size = [0] * (len(schema) - 1)
        values_size = [0] * (len(schema) - 1)
        valcount = [0] * (len(schema) - 1)
        output = StringIO.StringIO()
        check = 0
        globaldict = [None] * (len(schema) - 1)
        index_init = [0 for _ in xrange(3)]

        if split:
            output.write(struct.pack('!B', 0))
            cPickle.dump(schema[1:], output, 1)
            colnum = len(schema) - 1
            cz = output.getvalue()

            fileIter.write(struct.pack('!i', len(cz)))
            fileIter.write(cz)

        else:
            colnum = len(schema)
            fileIter.write(struct.pack('!B', 0))
            cPickle.dump(schema, fileIter, 1)
        if hasattr(sys, 'pypy_version_info'):
            from __pypy__ import newlist_hint

        else:
            newlist_hint = lambda size: []
        paxcols = []
        blocknum = 0

        #        tempio = cStringIO.StringIO()
        #        fastPickler = cPickle.Pickler(tempio, 2)
        #        fastPickler.fast = 1
        exitGen = False
        compress = zlib.compress

        compress2 = bz2.compress
        if compression == BZ2:
            compress = bz2.compress
        if lencols == 0:
            (yield)

        while not exitGen:
            output.truncate(0)
            mrows = newlist_hint(lencols)
            try:
                for i in xrange(lencols):
                    mrows.append((yield))
            except GeneratorExit:
                exitGen = True

            count = len(mrows)
            output.write(struct.pack('!B', 1))
            if compression == BZ2:
                output.write(struct.pack('!B', 0))
            else:
                output.write(struct.pack('!B', 1))

            headindex = [0 for _ in xrange((colnum * 4) + 1)]
            #headindex2 = [0 for _ in xrange(colnum*4)]
            type = '!' + 'i' * len(headindex)
            output.write(struct.pack(type, *headindex))
            #type = '!'+'i'*len(headindex2)
            #output.write(struct.pack(type, *headindex2))

            if mrows != []:

                for i, col in enumerate(
                    ([x[c] for x in mrows] for c in xrange(colnum))):
                    if blocknum == 0:
                        globaldict[i] = sorted(set(col))
                        lens = len(globaldict[i])
                        if lens > 50 * 1.0 * count / 100:
                            paxcols.append(i)
                            l = output.tell()
                            headindex[i * 4 + 3] = l
                            output.write(compress(serializer.dumps(col),
                                                  level))
                            headindex[i * 4] = output.tell() - l
                        else:
                            coldict = dict(
                                ((x, y) for y, x in enumerate(globaldict[i])))
                            l = output.tell()
                            headindex[i * 4 + 3] = l
                            output.write(
                                compress(serializer.dumps(globaldict[i]),
                                         level))
                            valcount[i] += lens
                            headindex[i * 4] = output.tell() - l
                            values_size[i] += headindex[i * 4]
                            if lens > 1:
                                l1 = output.tell()
                                if lens < 256:
                                    output.write(
                                        compress(
                                            array('B',
                                                  [coldict[y]
                                                   for y in col]).tostring(),
                                            level))
                                elif lens < 65536:
                                    output.write(
                                        compress(
                                            array('H',
                                                  [coldict[y]
                                                   for y in col]).tostring(),
                                            level))
                                else:
                                    print 'lala'
                                    output.write(
                                        compress(
                                            array('i',
                                                  [coldict[y]
                                                   for y in col]).tostring(),
                                            level))
                                    indexes_size[i] += output.tell() - l1
                            headindex[i * 4 +
                                      1] = output.tell() - l - headindex[i * 4]

                    else:
                        if i in paxcols:
                            l = output.tell()
                            headindex[i * 4 + 3] = l
                            output.write(compress(serializer.dumps(col),
                                                  level))
                            headindex[i * 4] = output.tell() - l
                        else:
                            so = sorted(set(col))
                            setcol = set(so)

                            diff = setcol - set(
                                globaldict[i]
                            )  # upologismos newn diaforetikwn val
                            lens = len(globaldict[i])
                            if lens > 2147483647 or (
                                    lens < 65535 and lens + len(diff) > 65535
                            ) or (lens < 256 and lens + len(diff) >= 256
                                  ) or len(setcol) == 1 or (lens >= 256 and
                                                            len(setcol) < 256):
                                headindex[i * 4 + 2] = 0
                                # sdc again
                                globaldict[i] = so
                                coldict = dict(
                                    ((x, y)
                                     for y, x in enumerate(globaldict[i])))
                                l = output.tell()
                                headindex[i * 4 + 3] = l
                                output.write(
                                    compress(serializer.dumps(globaldict[i]),
                                             level))
                                lens = len(globaldict[i])
                                headindex[i * 4] = output.tell() - l
                                valcount[i] += lens
                                values_size[i] += headindex[i * 3]
                                if lens > 1:
                                    l1 = output.tell()
                                    if lens < 256:
                                        output.write(
                                            compress(
                                                array(
                                                    'B',
                                                    [coldict[y]
                                                     for y in col]).tostring(),
                                                level))
                                    elif lens < 65536:
                                        output.write(
                                            compress(
                                                array(
                                                    'H',
                                                    [coldict[y]
                                                     for y in col]).tostring(),
                                                level))
                                    else:
                                        output.write(
                                            compress(
                                                array(
                                                    'i',
                                                    [coldict[y]
                                                     for y in col]).tostring(),
                                                level))
                                    indexes_size[i] += output.tell() - l1
                                headindex[
                                    i * 4 +
                                    1] = output.tell() - l - headindex[i * 4]
                            else:
                                headindex[i * 4 + 2] = 1
                                dif = sorted(diff)
                                if dif:
                                    globaldict[i] = globaldict[
                                        i] + dif  # upologismos neou global dict
                                    #globaldict[i] = set(dif).union(globaldict[i])

                                d = 0
                                t = output.tell()

                                if len(globaldict[i]) != 0:
                                    if len(globaldict[i]) < 256:
                                        indextype = 'B'
                                    elif len(globaldict[i]) < 65536:
                                        indextype = 'H'
                                    else:
                                        print 'lala'
                                        indextype = 'i'
                                l = output.tell()
                                headindex[i * 4 + 3] = l
                                output.write(
                                    compress(serializer.dumps(dif), level))

                                valcount[i] += len(dif)
                                headindex[i * 4] = output.tell() - l
                                values_size[i] += headindex[i * 4]

                                lens = len(globaldict[i])

                                coldict = dict(
                                    ((x, y)
                                     for y, x in enumerate(globaldict[i])))
                                l = output.tell()

                                if lens > 1:
                                    l1 = output.tell()
                                    output.write(
                                        compress(
                                            array(indextype,
                                                  [coldict[y]
                                                   for y in col]).tostring(),
                                            level))
                                    indexes_size[i] += output.tell() - l1
                                headindex[i * 4 + 1] = output.tell() - l

                blocknum += 1
                headindex[colnum * 4] = count
                output.seek(0)
                type = '!' + 'i' * len(headindex)
                #type2 = '!'+'i'*len(headindex2)
                output.write(struct.pack('!B', 1))
                if compression == BZ2:
                    output.write(struct.pack('!B', 0))
                else:
                    output.write(struct.pack('!B', 1))
                output.write(struct.pack(type, *headindex))
                cz = output.getvalue()
                fileIter.write(struct.pack('!i', len(cz)))
                fileIter.write(cz)

    #print indexes_size
    #print values_size
    #print valcount
        fileIter.close()
# Exemple #14
# 0
    def sorteddictpercol(fileIter, lencols, compression, level):
        """Generator coroutine that writes rows to *fileIter* column-wise.

        Rows are pushed in with ``send()`` and grouped into blocks of
        ``lencols`` rows.  Each block is transposed into columns; a column is
        stored either raw ("pax": pickled and compressed as-is) or
        dictionary-encoded (a sorted dictionary of distinct values followed by
        a packed array of per-row indexes).  The raw-vs-dictionary choice is
        made once, on the first block, and remembered in ``paxcols``.
        Closing the generator flushes the final partial block and closes
        ``fileIter``.

        Parameters:
            fileIter: writable binary file-like object; closed on exit.
            lencols: number of rows per block (0 means just wait for input).
            compression: BZ2 selects bz2; anything else selects zlib.
            level: compression level passed to the compressor.

        NOTE(review): relies on the enclosing scope for ``split``, ``schema``,
        ``serializer`` and ``BZ2`` -- confirm against the outer function.
        """
        output = StringIO.StringIO()
        # Emit the pickled schema first, tagged with a 0 marker byte.
        if split:
            output.write(struct.pack('!B', 0))
            cPickle.dump(schema[1:], output, 1)
            colnum = len(schema) - 1
            cz = output.getvalue()
            # Length-prefixed chunk: 4-byte big-endian size, then payload.
            fileIter.write(struct.pack('!i', len(cz)))
            fileIter.write(cz)

        else:
            colnum = len(schema)
            fileIter.write(struct.pack('!B', 0))
            cPickle.dump(schema, fileIter, 1)
        # PyPy can pre-size lists; on CPython fall back to a plain list.
        if hasattr(sys, 'pypy_version_info'):
            from __pypy__ import newlist_hint

        else:
            newlist_hint = lambda size: []
        paxcols = []  # column indexes stored raw (decided on block 0)
        blocknum = 0

        #        tempio = cStringIO.StringIO()
        #        fastPickler = cPickle.Pickler(tempio, 2)
        #        fastPickler.fast = 1
        exitGen = False
        compress = zlib.compress
        if compression == BZ2:
            compress = bz2.compress
        if lencols == 0:
            (yield)

        while not exitGen:
            # Python 2 StringIO: truncate(0) also rewinds the position.
            output.truncate(0)
            mrows = newlist_hint(lencols)
            try:
                for i in xrange(lencols):
                    mrows.append((yield))
            except GeneratorExit:
                # Generator closed: flush whatever rows we already have.
                exitGen = True

            count = len(mrows)
            # Block header: 1 = data block, then 0/1 = bz2/zlib flag.
            output.write(struct.pack('!B', 1))
            if compression == BZ2:
                output.write(struct.pack('!B', 0))
            else:
                output.write(struct.pack('!B', 1))

            # Two ints per column (dict size, index size) plus the row count;
            # written as zeros now and patched in place after the columns.
            headindex = [0 for _ in xrange((colnum * 2) + 1)]
            type = '!' + 'i' * len(headindex)
            output.write(struct.pack(type, *headindex))

            if mrows != []:

                # Transpose the row block into columns, one column at a time.
                for i, col in enumerate(([x[c] for x in mrows] for c in xrange(colnum))):

                    if blocknum == 0:
                        s = sorted(set(col))
                        lens = len(s)
                        # More than 50% distinct values: dictionary encoding
                        # would not pay off, store the column raw ("pax").
                        if lens > 50 * 1.0 * count / 100:
                            paxcols.append(i)
                            l = output.tell()
                            #                            tempio.truncate(0)
                            #                            fastPickler.dump(col)
                            output.write(compress(serializer.dumps(col), level))
                            headindex[i * 2] = output.tell() - l
                        else:
                            # Dictionary encoding: map each distinct value to
                            # its rank in the sorted dictionary.
                            coldict = dict(((x, y) for y, x in enumerate(s)))
                            l = output.tell()
                            #                            tempio.truncate(0)
                            #                            fastPickler.dump(s)
                            output.write(compress(serializer.dumps(s), level))
                            headindex[i * 2] = output.tell() - l
                            if lens > 1:
                                # Index width chosen from dictionary size.
                                # NOTE(review): assumes < 65536 distinct values
                                # per column -- array('H') raises OverflowError
                                # beyond that.  TODO confirm against the reader.
                                if lens < 256:
                                    output.write(compress(array('B', [coldict[y] for y in col]).tostring(), level))
                                else:
                                    output.write(compress(array('H', [coldict[y] for y in col]).tostring(), level))
                            headindex[i * 2 + 1] = output.tell() - l - headindex[i * 2]
                    else:
                        # Later blocks reuse the per-column decision of block 0.
                        if i in paxcols:
                            l = output.tell()
                            #                            tempio.truncate(0)
                            #                            fastPickler.dump(col)
                            output.write(compress(serializer.dumps(col), level))
                            headindex[i * 2] = output.tell() - l
                        else:
                            s = sorted(set(col))
                            lens = len(s)
                            coldict = dict(((x, y) for y, x in enumerate(s)))
                            l = output.tell()
                            #                            tempio.truncate(0)
                            #                            fastPickler.dump(s)
                            output.write(compress(serializer.dumps(s), level))
                            headindex[i * 2] = output.tell() - l
                            if lens > 1:
                                if lens < 256:
                                    output.write(compress(array('B', [coldict[y] for y in col]).tostring(), level))
                                else:
                                    output.write(compress(array('H', [coldict[y] for y in col]).tostring(), level))
                            headindex[i * 2 + 1] = output.tell() - l - headindex[i * 2]

                blocknum = 1
                headindex[colnum * 2] = count
                # Rewind and rewrite the block header with the real sizes.
                output.seek(0)
                type = '!' + 'i' * len(headindex)
                output.write(struct.pack('!B', 1))
                if compression == BZ2:
                    output.write(struct.pack('!B', 0))
                else:
                    output.write(struct.pack('!B', 1))
                output.write(struct.pack(type, *headindex))
                cz = output.getvalue()
                fileIter.write(struct.pack('!i', len(cz)))
                fileIter.write(cz)
        fileIter.close()
 def _fb_unpack(self, execute=EX_CONSTRUCT, write_bytes=None):
     """Decode one value from the stream, recursing for containers.

     ``execute`` selects the mode: EX_CONSTRUCT builds Python objects,
     EX_SKIP consumes the encoded bytes without building anything, and
     the EX_READ_ARRAY_HEADER / EX_READ_MAP_HEADER modes return only the
     element count of a container header.  ``write_bytes`` is forwarded
     to ``_fb_read`` so consumed bytes can be copied elsewhere.

     NOTE(review): the type bytes match the MessagePack wire format --
     confirm against the project's protocol documentation.
     """
     typ = TYPE_IMMEDIATE
     c = self._fb_read(1, write_bytes)
     b = ord(c)
     # Single-byte ("fixed") encodings are identified by high-bit patterns.
     if   b & 0b10000000 == 0:
         obj = b  # small positive int, value is the byte itself
     elif b & 0b11100000 == 0b11100000:
         obj = struct.unpack("b", c)[0]  # small negative int (signed byte)
     elif b & 0b11100000 == 0b10100000:
         n = b & 0b00011111  # short raw: length in the low 5 bits
         obj = self._fb_read(n, write_bytes)
         typ = TYPE_RAW
     elif b & 0b11110000 == 0b10010000:
         n = b & 0b00001111  # short array: length in the low 4 bits
         typ = TYPE_ARRAY
     elif b & 0b11110000 == 0b10000000:
         n = b & 0b00001111  # short map: pair count in the low 4 bits
         typ = TYPE_MAP
     elif b == 0xc0:
         obj = None
     elif b == 0xc2:
         obj = False
     elif b == 0xc3:
         obj = True
     elif b == 0xca:
         obj = struct.unpack(">f", self._fb_read(4, write_bytes))[0]  # 4-byte BE float
     elif b == 0xcb:
         obj = struct.unpack(">d", self._fb_read(8, write_bytes))[0]  # 8-byte BE float
     elif b == 0xcc:
         obj = struct.unpack("B", self._fb_read(1, write_bytes))[0]  # uint8
     elif b == 0xcd:
         obj = struct.unpack(">H", self._fb_read(2, write_bytes))[0]  # uint16
     elif b == 0xce:
         obj = struct.unpack(">I", self._fb_read(4, write_bytes))[0]  # uint32
     elif b == 0xcf:
         obj = struct.unpack(">Q", self._fb_read(8, write_bytes))[0]  # uint64
     elif b == 0xd0:
         obj = struct.unpack("b", self._fb_read(1, write_bytes))[0]  # int8
     elif b == 0xd1:
         obj = struct.unpack(">h", self._fb_read(2, write_bytes))[0]  # int16
     elif b == 0xd2:
         obj = struct.unpack(">i", self._fb_read(4, write_bytes))[0]  # int32
     elif b == 0xd3:
         obj = struct.unpack(">q", self._fb_read(8, write_bytes))[0]  # int64
     elif b == 0xda:
         n = struct.unpack(">H", self._fb_read(2, write_bytes))[0]  # raw, 16-bit length
         obj = self._fb_read(n, write_bytes)
         typ = TYPE_RAW
     elif b == 0xdb:
         n = struct.unpack(">I", self._fb_read(4, write_bytes))[0]  # raw, 32-bit length
         obj = self._fb_read(n, write_bytes)
         typ = TYPE_RAW
     elif b == 0xdc:
         n = struct.unpack(">H", self._fb_read(2, write_bytes))[0]  # array, 16-bit length
         typ = TYPE_ARRAY
     elif b == 0xdd:
         n = struct.unpack(">I", self._fb_read(4, write_bytes))[0]  # array, 32-bit length
         typ = TYPE_ARRAY
     elif b == 0xde:
         n = struct.unpack(">H", self._fb_read(2, write_bytes))[0]  # map, 16-bit pair count
         typ = TYPE_MAP
     elif b == 0xdf:
         n = struct.unpack(">I", self._fb_read(4, write_bytes))[0]  # map, 32-bit pair count
         typ = TYPE_MAP
     else:
         raise UnpackValueError("Unknown header: 0x%x" % b)
     # Header-only modes return the element count without consuming the body.
     if execute == EX_READ_ARRAY_HEADER:
         if typ != TYPE_ARRAY:
             raise UnpackValueError("Expected array")
         return n
     if execute == EX_READ_MAP_HEADER:
         if typ != TYPE_MAP:
             raise UnpackValueError("Expected map")
         return n
     # TODO should we eliminate the recursion?
     if typ == TYPE_ARRAY:
         if execute == EX_SKIP:
             for i in xrange(n):
                 # TODO check whether we need to call `list_hook`
                 self._fb_unpack(EX_SKIP, write_bytes)
             return
         ret = newlist_hint(n)
         for i in xrange(n):
             ret.append(self._fb_unpack(EX_CONSTRUCT, write_bytes))
         if self._list_hook is not None:
             ret = self._list_hook(ret)
         # TODO is the interaction between `list_hook` and `use_list` ok?
         return ret if self._use_list else tuple(ret)
     if typ == TYPE_MAP:
         if execute == EX_SKIP:
             for i in xrange(n):
                 # TODO check whether we need to call hooks
                 self._fb_unpack(EX_SKIP, write_bytes)
                 self._fb_unpack(EX_SKIP, write_bytes)
             return
         if self._object_pairs_hook is not None:
             ret = self._object_pairs_hook(
                     (self._fb_unpack(EX_CONSTRUCT, write_bytes),
                      self._fb_unpack(EX_CONSTRUCT, write_bytes))
                         for _ in xrange(n)
                     )
         else:
             ret = {}
             for _ in xrange(n):
                 key = self._fb_unpack(EX_CONSTRUCT, write_bytes)
                 ret[key] = self._fb_unpack(EX_CONSTRUCT, write_bytes)
             if self._object_hook is not None:
                 ret = self._object_hook(ret)
         return ret
     if execute == EX_SKIP:
         return
     if typ == TYPE_RAW:
         # Raw bytes become unicode only when an encoding was configured.
         if self._encoding is not None:
             obj = obj.decode(self._encoding, self._unicode_errors)
         return obj
     assert typ == TYPE_IMMEDIATE
     return obj
# Exemple #16
# 0
    def _unpack(self,
                command: int = _CMD_CONSTRUCT,
                data_type: typing.Optional[int] = None):
        """Read one encoded value and act on it according to *command*.

        _CMD_CONSTRUCT builds a Python object, _CMD_SKIP consumes the
        value without building it, and the _CMD_READ_*_HEADER commands
        return only the element count of a container header.
        *data_type*, when given, is passed through to the header reader
        for typed-array elements.
        """
        kind, length, payload, elem_dt = self._read_header(data_type)

        # Header-only commands: validate the container kind, return its size.
        if command == _CMD_READ_ARRAY_HEADER:
            if kind not in (_TYPE_ARRAY, _TYPE_MARRAY):
                raise ValueError('Expected ARRAY')
            return length
        if command == _CMD_READ_MAP_HEADER:
            if kind != _TYPE_MAP:
                raise ValueError('Expected MAP')
            return length

        # ARRAY: elements share the header's element type.
        if kind == _TYPE_ARRAY:
            if command == _CMD_SKIP:
                for _ in range(length):
                    self._unpack(_CMD_SKIP)
                return
            items = [self._unpack(_CMD_CONSTRUCT, data_type=elem_dt)
                     for _ in range(length)]
            if self._list_hook is not None:
                items = self._list_hook(items)
            return items

        # MARRAY: mixed array, each element carries its own type.
        if kind == _TYPE_MARRAY:
            if command == _CMD_SKIP:
                for _ in range(length):
                    self._unpack(_CMD_SKIP)
                return
            items = [self._unpack(_CMD_CONSTRUCT) for _ in range(length)]
            if self._list_hook is not None:
                items = self._list_hook(items)
            return items

        # MAP: *length* key/value pairs.
        if kind == _TYPE_MAP:
            if command == _CMD_SKIP:
                for _ in range(length):
                    self._unpack(_CMD_SKIP)  # key
                    self._unpack(_CMD_SKIP)  # value
                return
            if self._object_pairs_hook is not None:
                result = self._object_pairs_hook(
                    (self._unpack(_CMD_CONSTRUCT), self._unpack(_CMD_CONSTRUCT))
                    for _ in range(length))
            else:
                result = {}
                for _ in range(length):
                    k = self._unpack(_CMD_CONSTRUCT)
                    result[k] = self._unpack(_CMD_CONSTRUCT)
                if self._object_hook is not None:
                    result = self._object_hook(result)
            return result

        # Scalars: nothing to recurse into when skipping.
        if command == _CMD_SKIP:
            return

        if kind == _TYPE_STR:
            return payload.decode('utf-8')
        if kind == _TYPE_BIN:
            return bytes(payload)
        if kind == _TYPE_EXT:
            return self._ext_hook(length, bytes(payload))

        # Anything else is an immediate value decoded by the header reader.
        assert kind == _TYPE_IMMEDIATE
        return payload