Beispiel #1
0
    def _init_dict(self, data, axes, dtype=None):
        """
        Build a consolidated BlockManager from a dict-like ``data``.

        ``axes[0]`` is taken as the item labels.  When it is not None,
        ``data`` is reduced to the entries whose key is contained in it;
        otherwise the labels are built from ``data.keys()`` passed through
        ``_try_sort``.

        NOTE(review): in the code visible here ``index`` is read by the
        ``if index is None`` test before anything in this function assigns
        it, ``intersect`` and ``homogenized`` are read without any visible
        assignment or parameter, and ``items`` and ``dtype`` are never used
        after being set/received.  Unless those names are provided at module
        level outside this view, the method cannot run as written -- confirm
        against the full file before relying on it.
        """
        items = axes[0]

        # prefilter if items passed
        if items is not None:
            items = _ensure_index(items)
            # NOTE(review): ``iteritems`` is the Python 2 dict API.
            data = dict((k, v) for k, v in data.iteritems() if k in items)
        else:
            items = Index(_try_sort(data.keys()))

        # figure out the index, if necessary
        # NOTE(review): ``index`` has no earlier assignment in this function.
        if index is None:
            index = extract_index(data)

        # don't force copy because getting jammed in an ndarray anyway
        # homogenized = _homogenize(data, index, columns, dtype)

        # NOTE(review): ``intersect`` is not a parameter or local of this
        # function -- presumably a leftover from another signature; verify.
        data, index, columns = _homogenize(data, intersect=intersect)

        # segregates dtypes and forms blocks matching to columns
        # NOTE(review): ``homogenized`` is only assigned in the commented-out
        # line above; the live code binds the result to ``data`` instead.
        blocks = form_blocks(homogenized, index, columns)

        # consolidate for now
        mgr = BlockManager(blocks, [columns, index])
        return mgr.consolidate()
Beispiel #2
0
    def test_get(self):
        """Check ``mgr.get`` returns the matching row of a single 3x3 block."""
        labels = list('abc')
        cols = Index(labels)
        values = np.random.rand(3, 3)
        block = make_block(values=values.copy(), placement=np.arange(3))
        mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])

        # First pass: explicit non-fastpath lookups, in label order.
        for row, label in enumerate(labels):
            assert_almost_equal(mgr.get(label, fastpath=False), values[row])

        # Second pass: default lookups, compared through internal_values().
        for row, label in enumerate(labels):
            assert_almost_equal(mgr.get(label).internal_values(), values[row])
Beispiel #3
0
    def test_duplicate_ref_loc_failure(self):
        """Overlapping block locations must be rejected; distinct ones accepted."""
        source = create_mgr('a:bool; a: f8')

        axes = source.axes
        blocks = source.blocks
        first, second = blocks[0], blocks[1]

        # Both blocks claim location 0: building a manager has to fail.
        first.mgr_locs = np.array([0])
        second.mgr_locs = np.array([0])
        with self.assertRaises(AssertionError):
            BlockManager(blocks, axes)

        # Disjoint locations: construction succeeds and item 1 is reachable.
        first.mgr_locs = np.array([0])
        second.mgr_locs = np.array([1])
        rebuilt = BlockManager(blocks, axes)
        rebuilt.iget(1)
Beispiel #4
0
    def test_equals_block_order_different_dtypes(self):
        """``equals`` must not depend on the order of the blocks (GH 9330)."""
        descriptions = (
            "a:i8;b:f8",  # basic case
            "a:i8;b:f8;c:c8;d:b",  # many types
            "a:i8;e:dt;f:td;g:string",  # more types
            "a:i8;b:category;c:category2;d:category2",  # categories
            "c:sparse;d:sparse_na;b:f8",  # sparse
        )

        for description in descriptions:
            reference = create_mgr(description)
            # Every ordering of the same blocks has to compare equal to the
            # reference manager, in both directions.
            for ordering in itertools.permutations(reference.blocks):
                shuffled = BlockManager(ordering, reference.axes)
                self.assertTrue(reference.equals(shuffled))
                self.assertTrue(shuffled.equals(reference))
Beispiel #5
0
 def test_equals(self, mgr_string):
     """A manager equals one built from the same blocks in reverse order."""
     # unique items
     original = create_mgr(mgr_string)
     reversed_blocks = original.blocks[::-1]
     mirrored = BlockManager(reversed_blocks, original.axes)
     assert original.equals(mirrored)
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Dispatches on ``obj[u'typ']`` and rebuilds the object described by the
    remaining fields.  A mapping without a ``typ`` entry, or with a ``typ``
    that no branch recognises, is returned unchanged.

    Parameters
    ----------
    obj : dict-like
        Decoded payload; must provide ``.get`` and item access.

    Returns
    -------
    object
        The reconstructed object, or ``obj`` itself when nothing applies.

    Notes
    -----
    Several branches look up the class to instantiate with
    ``globals()[obj[u'klass']]``, i.e. the class name comes straight from the
    payload.  Only decode data from a trusted source.
    """

    typ = obj.get(u'typ')
    if typ is None:
        return obj
    elif typ == u'timestamp':
        return Timestamp(obj[u'value'], tz=obj[u'tz'], offset=obj[u'offset'])
    elif typ == u'nat':
        return NaT
    elif typ == u'period':
        return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq'])
    elif typ == u'index':
        dtype = dtype_for(obj[u'dtype'])
        data = unconvert(obj[u'data'], dtype,
                         obj.get(u'compress'))
        return globals()[obj[u'klass']](data, dtype=dtype, name=obj[u'name'])
    elif typ == u'range_index':
        return globals()[obj[u'klass']](obj[u'start'],
                                        obj[u'stop'],
                                        obj[u'step'],
                                        name=obj[u'name'])
    elif typ == u'multi_index':
        dtype = dtype_for(obj[u'dtype'])
        data = unconvert(obj[u'data'], dtype,
                         obj.get(u'compress'))
        data = [tuple(x) for x in data]
        return globals()[obj[u'klass']].from_tuples(data, names=obj[u'names'])
    elif typ == u'period_index':
        data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
        d = dict(name=obj[u'name'], freq=obj[u'freq'])
        return globals()[obj[u'klass']](data, **d)
    elif typ == u'datetime_index':
        data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
        d = dict(name=obj[u'name'], freq=obj[u'freq'], verify_integrity=False)
        result = globals()[obj[u'klass']](data, **d)
        tz = obj[u'tz']

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize('UTC').tz_convert(tz)
        return result

    elif typ == u'category':
        from_codes = globals()[obj[u'klass']].from_codes
        return from_codes(codes=obj[u'codes'],
                          categories=obj[u'categories'],
                          ordered=obj[u'ordered'],
                          name=obj[u'name'])

    elif typ == u'series':
        dtype = dtype_for(obj[u'dtype'])
        pd_dtype = pandas_dtype(dtype)
        np_dtype = pandas_dtype(dtype).base

        index = obj[u'index']
        result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype,
                                                    obj[u'compress']),
                                          index=index,
                                          dtype=np_dtype,
                                          name=obj[u'name'])
        # The series is built on the base dtype; re-attach the timezone
        # afterwards when the stored dtype carries one.
        tz = getattr(pd_dtype, 'tz', None)
        if tz:
            result = result.dt.tz_localize('UTC').dt.tz_convert(tz)
        return result

    elif typ == u'block_manager':
        axes = obj[u'axes']

        def create_block(b):
            values = unconvert(b[u'values'], dtype_for(b[u'dtype']),
                               b[u'compress']).reshape(b[u'shape'])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u'locs' in b:
                placement = b[u'locs']
            else:
                placement = axes[0].get_indexer(b[u'items'])
            return make_block(values=values,
                              klass=getattr(internals, b[u'klass']),
                              placement=placement,
                              dtype=b[u'dtype'])

        blocks = [create_block(b) for b in obj[u'blocks']]
        return globals()[obj[u'klass']](BlockManager(blocks, axes))
    elif typ == u'datetime':
        return parse(obj[u'data'])
    elif typ == u'datetime64':
        return np.datetime64(parse(obj[u'data']))
    elif typ == u'date':
        return parse(obj[u'data']).date()
    elif typ == u'timedelta':
        return timedelta(*obj[u'data'])
    elif typ == u'timedelta64':
        return np.timedelta64(int(obj[u'data']))
    # NOTE: 'sparse_series', 'sparse_dataframe' and 'sparse_panel' payloads
    # have no branch here and fall through to the final ``return obj``.
    elif typ == u'block_index':
        return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'],
                                        obj[u'blengths'])
    elif typ == u'int_index':
        return globals()[obj[u'klass']](obj[u'length'], obj[u'indices'])
    elif typ == u'ndarray':
        return unconvert(obj[u'data'], np.typeDict[obj[u'dtype']],
                         obj.get(u'compress')).reshape(obj[u'shape'])
    elif typ == u'np_scalar':
        if obj.get(u'sub_typ') == u'np_complex':
            return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype'])
        else:
            dtype = dtype_for(obj[u'dtype'])
            try:
                return dtype(obj[u'data'])
            except (ValueError, TypeError):
                # dtype is not directly callable with this value; go through
                # its scalar type instead.
                return dtype.type(obj[u'data'])
    elif typ == u'np_complex':
        # Convert the parts separately: concatenating them into
        # "<real>+<imag>j" yields a malformed literal (e.g. "1.0+-2.0j")
        # whenever the imaginary part is negative.
        return complex(float(obj[u'real']), float(obj[u'imag']))
    else:
        # Unrecognised ``typ``: hand the payload back untouched.
        return obj
Beispiel #7
0
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Dispatches on ``obj[u"typ"]`` and rebuilds the object described by the
    remaining fields.  A mapping without a ``typ`` entry, or with a ``typ``
    that no branch recognises, is returned unchanged.

    Parameters
    ----------
    obj : dict-like
        Decoded payload; must provide ``.get`` and item access.

    Returns
    -------
    object
        The reconstructed object, or ``obj`` itself when nothing applies.

    Raises
    ------
    ValueError
        For a ``period_index`` payload without a frequency (non-legacy
        path), and for blocks whose class is ``DatetimeTZBlock``.

    Notes
    -----
    Several branches look up the class to instantiate with
    ``globals()[obj[u"klass"]]``, i.e. the class name comes straight from the
    payload.  Only decode data from a trusted source.
    """

    typ = obj.get(u"typ")
    if typ is None:
        return obj
    elif typ == u"timestamp":
        # Payloads store the frequency under "freq", or under "offset" when
        # "freq" is absent.
        freq = obj[u"freq"] if "freq" in obj else obj[u"offset"]
        return Timestamp(obj[u"value"], tz=obj[u"tz"], freq=freq)
    elif typ == u"nat":
        return NaT
    elif typ == u"period":
        return Period(ordinal=obj[u"ordinal"], freq=obj[u"freq"])
    elif typ == u"index":
        dtype = dtype_for(obj[u"dtype"])
        data = unconvert(obj[u"data"], dtype, obj.get(u"compress"))
        return globals()[obj[u"klass"]](data, dtype=dtype, name=obj[u"name"])
    elif typ == u"range_index":
        return globals()[obj[u"klass"]](obj[u"start"],
                                        obj[u"stop"],
                                        obj[u"step"],
                                        name=obj[u"name"])
    elif typ == u"multi_index":
        dtype = dtype_for(obj[u"dtype"])
        data = unconvert(obj[u"data"], dtype, obj.get(u"compress"))
        data = [tuple(x) for x in data]
        return globals()[obj[u"klass"]].from_tuples(data, names=obj[u"names"])
    elif typ == u"period_index":
        data = unconvert(obj[u"data"], np.int64, obj.get(u"compress"))
        d = dict(name=obj[u"name"], freq=obj[u"freq"])
        if _is_pandas_legacy_version:
            # legacy
            return globals()[obj[u"klass"]](data, **d)
        else:
            freq = d['freq']
            if freq is None:
                raise ValueError(
                    'freq is not specified and cannot be inferred')
            values = [Period(ordinal=x, freq=freq) for x in data]
            # Pass the stored name along so it survives the round trip, as
            # it does on the legacy path.
            return PeriodIndex(values, name=d['name'])
    elif typ == u"datetime_index":
        data = unconvert(obj[u"data"], np.int64, obj.get(u"compress"))
        d = dict(name=obj[u"name"],
                 freq=obj[u"freq"])  #, verify_integrity=False)
        result = globals()[obj[u"klass"]](data, **d)
        tz = obj[u"tz"]

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize("UTC").tz_convert(tz)
        return result

    elif typ == u"category":
        from_codes = globals()[obj[u"klass"]].from_codes
        return from_codes(codes=obj[u"codes"],
                          categories=obj[u"categories"],
                          ordered=obj[u"ordered"])

    elif typ == u"series":
        dtype = dtype_for(obj[u"dtype"])
        pd_dtype = pandas_dtype(dtype)

        index = obj[u"index"]
        result = globals()[obj[u"klass"]](
            unconvert(obj[u"data"], dtype, obj[u"compress"]),
            index=index,
            dtype=pd_dtype,
            name=obj[u"name"],
        )
        return result

    elif typ == u"block_manager":
        axes = obj[u"axes"]

        def create_block(b):
            values = _safe_reshape(
                unconvert(b[u"values"], dtype_for(b[u"dtype"]),
                          b[u"compress"]),
                b[u"shape"],
            )

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u"locs" in b:
                placement = b[u"locs"]
            else:
                placement = axes[0].get_indexer(b[u"items"])
            klass = getattr(internals, b[u"klass"])
            if klass == DatetimeTZBlock:
                raise ValueError(
                    "Lost the ability to parse datetime with timezone. Sorry")

            return make_block(
                values=values.copy(),
                klass=klass,
                placement=placement,
                dtype=b[u"dtype"],
            )

        blocks = [create_block(b) for b in obj[u"blocks"]]
        return globals()[obj[u"klass"]](BlockManager(blocks, axes))
    elif typ == u"datetime":
        return parse(obj[u"data"])
    elif typ == u"datetime64":
        return np.datetime64(parse(obj[u"data"]))
    elif typ == u"date":
        return parse(obj[u"data"]).date()
    elif typ == u"timedelta":
        return timedelta(*obj[u"data"])
    elif typ == u"timedelta64":
        return np.timedelta64(int(obj[u"data"]))
    # NOTE: 'sparse_series', 'sparse_dataframe' and 'sparse_panel' payloads
    # have no branch here and fall through to the final ``return obj``.
    elif typ == u"block_index":
        return globals()[obj[u"klass"]](obj[u"length"], obj[u"blocs"],
                                        obj[u"blengths"])
    elif typ == u"int_index":
        return globals()[obj[u"klass"]](obj[u"length"], obj[u"indices"])
    elif typ == u"ndarray":
        return unconvert(obj[u"data"], np.typeDict[obj[u"dtype"]],
                         obj.get(u"compress")).reshape(obj[u"shape"])
    elif typ == u"np_scalar":
        if obj.get(u"sub_typ") == u"np_complex":
            return c2f(obj[u"real"], obj[u"imag"], obj[u"dtype"])
        else:
            dtype = dtype_for(obj[u"dtype"])
            try:
                return dtype(obj[u"data"])
            except (ValueError, TypeError):
                # dtype is not directly callable with this value; go through
                # its scalar type instead.
                return dtype.type(obj[u"data"])
    elif typ == u"np_complex":
        # Convert the parts separately: concatenating them into
        # "<real>+<imag>j" yields a malformed literal (e.g. "1.0+-2.0j")
        # whenever the imaginary part is negative.
        return complex(float(obj[u"real"]), float(obj[u"imag"]))
    else:
        # Unrecognised ``typ``: hand the payload back untouched.
        return obj
Beispiel #8
0
def create_mgr(descr, item_shape=None):
    """
    Build a BlockManager from a compact string description.

    The description resembles an np.matrix initializer, for example::

        a,b,c: f8; d,e,f: i8

    Syntax:

    * the supported datatype ids are those accepted by `create_block`
    * components are separated by semicolons
    * every component has the form `NAME,NAME,NAME: DTYPE_ID`
    * whitespace around colons and semicolons is stripped
    * components sharing a DTYPE_ID end up in one block
    * append '-SUFFIX' to keep same-dtype components in separate blocks::

        'a:f8-1; b:f8-2; c:f8-foobar'

    When ``item_shape`` is None it defaults to ``(N, )``.
    """
    if item_shape is None:
        item_shape = (N, )

    labels = []
    placements_by_block = {}
    for component in descr.split(";"):
        component = component.strip()
        if not component:
            continue
        head, _, tail = component.partition(":")
        block_key = tail.strip()
        names = head.strip().split(",")

        # Positions continue from however many labels were collected so far.
        start = len(labels)
        labels.extend(names)
        positions = list(np.arange(len(names)) + start)
        placements_by_block.setdefault(block_key, []).extend(positions)

    mgr_items = Index(labels)

    blocks = []
    consumed = 0
    for block_key, positions in placements_by_block.items():
        # Anything after the first '-' is only a suffix to force a new block.
        type_id = block_key.split("-")[0]
        new_block = create_block(type_id,
                                 positions,
                                 item_shape=item_shape,
                                 num_offset=consumed)
        blocks.append(new_block)
        consumed += len(positions)

    ordered = sorted(blocks, key=lambda blk: blk.mgr_locs[0])
    other_axes = [Index(np.arange(n)) for n in item_shape]
    return BlockManager(tuple(ordered), [mgr_items] + other_axes)
Beispiel #9
0
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Dispatches on ``obj['typ']`` and rebuilds the object described by the
    remaining fields.  A mapping without a ``typ`` entry, or with a ``typ``
    that no branch recognises, is returned unchanged.

    Parameters
    ----------
    obj : dict-like
        Decoded payload; must provide ``.get`` and item access.

    Returns
    -------
    object
        The reconstructed object, or ``obj`` itself when nothing applies.

    Notes
    -----
    Several branches look up the class to instantiate with
    ``globals()[obj['klass']]``, i.e. the class name comes straight from the
    payload.  Only decode data from a trusted source.
    """

    typ = obj.get('typ')
    if typ is None:
        return obj
    elif typ == 'timestamp':
        return Timestamp(obj['value'], tz=obj['tz'], offset=obj['offset'])
    elif typ == 'period':
        return Period(ordinal=obj['ordinal'], freq=obj['freq'])
    elif typ == 'index':
        dtype = dtype_for(obj['dtype'])
        data = unconvert(obj['data'], np.typeDict[obj['dtype']],
                         obj.get('compress'))
        return globals()[obj['klass']](data, dtype=dtype, name=obj['name'])
    elif typ == 'multi_index':
        data = unconvert(obj['data'], np.typeDict[obj['dtype']],
                         obj.get('compress'))
        data = [tuple(x) for x in data]
        return globals()[obj['klass']].from_tuples(data, names=obj['names'])
    elif typ == 'period_index':
        data = unconvert(obj['data'], np.int64, obj.get('compress'))
        return globals()[obj['klass']](data,
                                       name=obj['name'],
                                       freq=obj['freq'])
    elif typ == 'datetime_index':
        data = unconvert(obj['data'], np.int64, obj.get('compress'))
        result = globals()[obj['klass']](data,
                                         freq=obj['freq'],
                                         name=obj['name'])
        tz = obj['tz']

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize('UTC').tz_convert(tz)
        return result
    elif typ == 'series':
        dtype = dtype_for(obj['dtype'])
        index = obj['index']
        return globals()[obj['klass']](unconvert(obj['data'], dtype,
                                                 obj['compress']),
                                       index=index,
                                       name=obj['name'])
    elif typ == 'block_manager':
        axes = obj['axes']

        def create_block(b):
            dtype = dtype_for(b['dtype'])
            return make_block(unconvert(b['values'], dtype,
                                        b['compress']).reshape(b['shape']),
                              b['items'],
                              axes[0],
                              klass=getattr(internals, b['klass']))

        blocks = [create_block(b) for b in obj['blocks']]
        return globals()[obj['klass']](BlockManager(blocks, axes))
    elif typ == 'datetime':
        return parse(obj['data'])
    elif typ == 'datetime64':
        return np.datetime64(parse(obj['data']))
    elif typ == 'date':
        return parse(obj['data']).date()
    elif typ == 'timedelta':
        return timedelta(*obj['data'])
    elif typ == 'timedelta64':
        return np.timedelta64(int(obj['data']))
    # NOTE: 'sparse_series', 'sparse_dataframe' and 'sparse_panel' payloads
    # have no branch here and fall through to the final ``return obj``.
    elif typ == 'block_index':
        return globals()[obj['klass']](obj['length'], obj['blocs'],
                                       obj['blengths'])
    elif typ == 'int_index':
        return globals()[obj['klass']](obj['length'], obj['indices'])
    elif typ == 'ndarray':
        return unconvert(obj['data'], np.typeDict[obj['dtype']],
                         obj.get('compress')).reshape(obj['shape'])
    elif typ == 'np_scalar':
        if obj.get('sub_typ') == 'np_complex':
            return c2f(obj['real'], obj['imag'], obj['dtype'])
        else:
            dtype = dtype_for(obj['dtype'])
            try:
                return dtype(obj['data'])
            except (ValueError, TypeError):
                # dtype is not directly callable with this value; go through
                # its scalar type instead.
                return dtype.type(obj['data'])
    elif typ == 'np_complex':
        # Convert the parts separately: concatenating them into
        # "<real>+<imag>j" yields a malformed literal (e.g. "1.0+-2.0j")
        # whenever the imaginary part is negative.
        return complex(float(obj['real']), float(obj['imag']))
    else:
        # Unrecognised ``typ``: hand the payload back untouched.
        return obj
Beispiel #10
0
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Dispatches on ``obj["typ"]`` and rebuilds the object described by the
    remaining fields.  A mapping without a ``typ`` entry, or with a ``typ``
    that no branch recognises, is returned unchanged.

    Parameters
    ----------
    obj : dict-like
        Decoded payload; must provide ``.get`` and item access.

    Returns
    -------
    object
        The reconstructed object, or ``obj`` itself when nothing applies.

    Notes
    -----
    Some branches look up the class to instantiate with
    ``globals()[obj["klass"]]``, i.e. the class name comes straight from the
    payload.  Only decode data from a trusted source.
    """

    typ = obj.get("typ")
    if typ is None:
        return obj
    elif typ == "timestamp":
        # Payloads store the frequency under "freq", or under "offset" when
        # "freq" is absent.
        freq = obj["freq"] if "freq" in obj else obj["offset"]
        return Timestamp(obj["value"], tz=obj["tz"], freq=freq)
    elif typ == "nat":
        return NaT
    elif typ == "period":
        return Period(ordinal=obj["ordinal"], freq=obj["freq"])
    elif typ == "index":
        dtype = dtype_for(obj["dtype"])
        data = unconvert(obj["data"], dtype, obj.get("compress"))
        return Index(data, dtype=dtype, name=obj["name"])
    elif typ == "range_index":
        return RangeIndex(obj["start"],
                          obj["stop"],
                          obj["step"],
                          name=obj["name"])
    elif typ == "multi_index":
        dtype = dtype_for(obj["dtype"])
        data = unconvert(obj["data"], dtype, obj.get("compress"))
        data = [tuple(x) for x in data]
        return MultiIndex.from_tuples(data, names=obj["names"])
    elif typ == "period_index":
        data = unconvert(obj["data"], np.int64, obj.get("compress"))
        d = dict(name=obj["name"], freq=obj["freq"])
        # The frequency goes to the PeriodArray; only the name is left for
        # the index constructor.
        freq = d.pop("freq", None)
        return PeriodIndex(PeriodArray(data, freq), **d)

    elif typ == "datetime_index":
        data = unconvert(obj["data"], np.int64, obj.get("compress"))
        d = dict(name=obj["name"], freq=obj["freq"])
        result = DatetimeIndex(data, **d)
        tz = obj["tz"]

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize("UTC").tz_convert(tz)
        return result

    elif typ in ("interval_index", "interval_array"):
        return globals()[obj["klass"]].from_arrays(obj["left"],
                                                   obj["right"],
                                                   obj["closed"],
                                                   name=obj["name"])
    elif typ == "category":
        from_codes = globals()[obj["klass"]].from_codes
        return from_codes(codes=obj["codes"],
                          categories=obj["categories"],
                          ordered=obj["ordered"])

    elif typ == "interval":
        return Interval(obj["left"], obj["right"], obj["closed"])
    elif typ == "series":
        dtype = dtype_for(obj["dtype"])
        index = obj["index"]
        data = unconvert(obj["data"], dtype, obj["compress"])
        return Series(data, index=index, dtype=dtype, name=obj["name"])

    elif typ == "block_manager":
        axes = obj["axes"]

        def create_block(b):
            values = _safe_reshape(
                unconvert(b["values"], dtype_for(b["dtype"]), b["compress"]),
                b["shape"])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if "locs" in b:
                placement = b["locs"]
            else:
                placement = axes[0].get_indexer(b["items"])

            if is_datetime64tz_dtype(b["dtype"]):
                assert isinstance(values, np.ndarray), type(values)
                assert values.dtype == "M8[ns]", values.dtype
                values = DatetimeArray(values, dtype=b["dtype"])

            return make_block(
                values=values,
                klass=getattr(internals, b["klass"]),
                placement=placement,
                dtype=b["dtype"],
            )

        blocks = [create_block(b) for b in obj["blocks"]]
        return globals()[obj["klass"]](BlockManager(blocks, axes))
    elif typ == "datetime":
        return parse(obj["data"])
    elif typ == "datetime64":
        return np.datetime64(parse(obj["data"]))
    elif typ == "date":
        return parse(obj["data"]).date()
    elif typ == "timedelta":
        return timedelta(*obj["data"])
    elif typ == "timedelta64":
        return np.timedelta64(int(obj["data"]))
    # NOTE: 'sparse_series' and 'sparse_dataframe' payloads have no branch
    # here and fall through to the final ``return obj``.
    elif typ == "block_index":
        return globals()[obj["klass"]](obj["length"], obj["blocs"],
                                       obj["blengths"])
    elif typ == "int_index":
        return globals()[obj["klass"]](obj["length"], obj["indices"])
    elif typ == "ndarray":
        return unconvert(obj["data"], np.typeDict[obj["dtype"]],
                         obj.get("compress")).reshape(obj["shape"])
    elif typ == "np_scalar":
        if obj.get("sub_typ") == "np_complex":
            return c2f(obj["real"], obj["imag"], obj["dtype"])
        else:
            dtype = dtype_for(obj["dtype"])
            try:
                return dtype(obj["data"])
            except (ValueError, TypeError):
                # dtype is not directly callable with this value; go through
                # its scalar type instead.
                return dtype.type(obj["data"])
    elif typ == "np_complex":
        # Convert the parts separately: concatenating them into
        # "<real>+<imag>j" yields a malformed literal (e.g. "1.0+-2.0j")
        # whenever the imaginary part is negative.
        return complex(float(obj["real"]), float(obj["imag"]))
    else:
        # Unrecognised ``typ``: hand the payload back untouched.
        return obj
Beispiel #11
0
def _blank_categorical(cats, key):
    """Return a zero-length Categorical with the categories chosen for ``key``."""
    if cats is None or key not in cats:
        # No information for this column: fall back to a default range.
        categories = RangeIndex(0, 2**14)
    elif isinstance(cats[key], int):
        # An integer means "this many categories": use temporary labels.
        categories = RangeIndex(0, cats[key])
    else:  # explicit labels list
        categories = cats[key]
    return Categorical([], categories=categories, fastpath=True)


def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
    """
    Create empty DataFrame to assign into

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        One entry per column; a comma-separated string is split into its
        entries. The entry 'category' marks a categorical column.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo']}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integer, `{'col': 5}`,
        will generate temporary labels using range. If None, or the column
        name is missing, a default range of 2**14 categories is used.
    cols: list of labels
        assigned column names, including categorical ones. When None, the
        columns are numbered ``0 .. len(types) - 1``.
    index_type: dtype-like, 'category', None or False
        When given (not None/False), an index of this type is allocated too.
    index_name: label
        Name of the index; required whenever ``index_type`` is given.

    Returns
    -------
    - dataframe with correct shape and data-types
    - dict of numpy views onto the columns of the dataframe, keyed by column
        name (plus '<name>-catdef' entries for categoricals). Assign to
        these.

    Raises
    ------
    ValueError
        If ``index_type`` is given without ``index_name``.
    """
    df = DataFrame()
    views = {}

    if isinstance(types, STR_TYPE):
        types = types.split(',')
    # Default labels are positional; this must come after the split above so
    # that the count reflects the number of columns.
    cols = cols if cols is not None else range(len(types))
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[str(col)] = _blank_categorical(cats, col)
        else:
            df[str(col)] = np.empty(0, dtype=t)

    if index_type is not None and index_type is not False:
        if index_name is None:
            raise ValueError('If using an index, must give an index name')
        if str(index_type) == 'category':
            c = _blank_categorical(cats, index_name)
            vals = np.empty(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            # Swap in the full-size codes buffer so that writes through the
            # view land in the index.
            index._data._codes = vals
            views[index_name] = vals
        else:
            index = np.empty(size, dtype=index_type)
            views[index_name] = index

        axes = [df._data.axes[0], index]
    else:
        axes = [df._data.axes[0], RangeIndex(size)]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code,
                                 categories=categories,
                                 fastpath=True)
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)

        new_block = block.make_block_same_class(values=values)
        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if str(dtype) == 'category':
                views[col] = block.values._codes
                views[col + '-catdef'] = block.values
            else:
                views[col] = block.values[i]

    if index_name is not None and index_name is not False:
        df.index.name = index_name
    if str(index_type) == 'category':
        views[index_name + '-catdef'] = df._data.axes[1].values
    return df, views
Beispiel #12
0
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Rebuilds the Python / numpy / pandas object described by a decoded
    mapping whose ``'typ'`` entry names the kind of object that was
    serialized.

    Parameters
    ----------
    obj : dict-like
        Decoded mapping; it must support ``.get``.  Which other keys are
        read depends on the value stored under ``'typ'``.

    Returns
    -------
    object
        The reconstructed object.  ``obj`` itself is returned unchanged
        when it has no ``'typ'`` entry or when the tag is not one of the
        tags handled below.

    Notes
    -----
    Several branches resolve a class through ``globals()[obj['klass']]``,
    i.e. the payload chooses which module-level name gets called.
    NOTE(review): only feed this function data from a trusted source, or
    restrict ``klass`` to a whitelist first.
    """

    typ = obj.get('typ')
    if typ is None:
        return obj
    elif typ == 'timestamp':
        # fall back to the 'offset' key when 'freq' is absent
        freq = obj['freq'] if 'freq' in obj else obj['offset']
        return Timestamp(obj['value'], tz=obj['tz'], freq=freq)
    elif typ == 'nat':
        return NaT
    elif typ == 'period':
        return Period(ordinal=obj['ordinal'], freq=obj['freq'])
    elif typ == 'index':
        dtype = dtype_for(obj['dtype'])
        data = unconvert(obj['data'], dtype,
                         obj.get('compress'))
        return Index(data, dtype=dtype, name=obj['name'])
    elif typ == 'range_index':
        return RangeIndex(obj['start'],
                          obj['stop'],
                          obj['step'],
                          name=obj['name'])
    elif typ == 'multi_index':
        dtype = dtype_for(obj['dtype'])
        data = unconvert(obj['data'], dtype,
                         obj.get('compress'))
        data = [tuple(x) for x in data]
        return MultiIndex.from_tuples(data, names=obj['names'])
    elif typ == 'period_index':
        data = unconvert(obj['data'], np.int64, obj.get('compress'))
        return PeriodIndex(PeriodArray(data, obj['freq']), name=obj['name'])

    elif typ == 'datetime_index':
        data = unconvert(obj['data'], np.int64, obj.get('compress'))
        result = DatetimeIndex(data, name=obj['name'], freq=obj['freq'])
        tz = obj['tz']

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize('UTC').tz_convert(tz)
        return result

    elif typ in ('interval_index', 'interval_array'):
        # NOTE(review): class name comes from the payload (see Notes)
        return globals()[obj['klass']].from_arrays(obj['left'],
                                                   obj['right'],
                                                   obj['closed'],
                                                   name=obj['name'])
    elif typ == 'category':
        # NOTE(review): class name comes from the payload (see Notes)
        from_codes = globals()[obj['klass']].from_codes
        return from_codes(codes=obj['codes'],
                          categories=obj['categories'],
                          ordered=obj['ordered'])

    elif typ == 'interval':
        return Interval(obj['left'], obj['right'], obj['closed'])
    elif typ == 'series':
        dtype = dtype_for(obj['dtype'])
        pd_dtype = pandas_dtype(dtype)

        index = obj['index']
        result = Series(unconvert(obj['data'], dtype, obj['compress']),
                        index=index,
                        dtype=pd_dtype,
                        name=obj['name'])
        return result

    elif typ == 'block_manager':
        axes = obj['axes']

        def create_block(b):
            # rebuild one block from its serialized description ``b``
            values = _safe_reshape(unconvert(
                b['values'], dtype_for(b['dtype']),
                b['compress']), b['shape'])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if 'locs' in b:
                placement = b['locs']
            else:
                placement = axes[0].get_indexer(b['items'])

            if is_datetime64tz_dtype(b['dtype']):
                assert isinstance(values, np.ndarray), type(values)
                assert values.dtype == 'M8[ns]', values.dtype
                values = DatetimeArray(values, dtype=b['dtype'])

            return make_block(values=values,
                              klass=getattr(internals, b['klass']),
                              placement=placement,
                              dtype=b['dtype'])

        blocks = [create_block(b) for b in obj['blocks']]
        # NOTE(review): class name comes from the payload (see Notes)
        return globals()[obj['klass']](BlockManager(blocks, axes))
    elif typ == 'datetime':
        return parse(obj['data'])
    elif typ == 'datetime64':
        return np.datetime64(parse(obj['data']))
    elif typ == 'date':
        return parse(obj['data']).date()
    elif typ == 'timedelta':
        return timedelta(*obj['data'])
    elif typ == 'timedelta64':
        return np.timedelta64(int(obj['data']))
    # The sparse tags below are currently disabled; payloads carrying them
    # fall through to the final branch and are returned unchanged.
    # elif typ == 'sparse_series':
    #    dtype = dtype_for(obj['dtype'])
    #    return SparseSeries(
    #        unconvert(obj['sp_values'], dtype, obj['compress']),
    #        sparse_index=obj['sp_index'], index=obj['index'],
    #        fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
    # elif typ == 'sparse_dataframe':
    #    return SparseDataFrame(
    #        obj['data'], columns=obj['columns'],
    #        default_fill_value=obj['default_fill_value'],
    #        default_kind=obj['default_kind']
    #    )
    # elif typ == 'sparse_panel':
    #    return SparsePanel(
    #        obj['data'], items=obj['items'],
    #        default_fill_value=obj['default_fill_value'],
    #        default_kind=obj['default_kind'])
    elif typ == 'block_index':
        # NOTE(review): class name comes from the payload (see Notes)
        return globals()[obj['klass']](obj['length'], obj['blocs'],
                                       obj['blengths'])
    elif typ == 'int_index':
        # NOTE(review): class name comes from the payload (see Notes)
        return globals()[obj['klass']](obj['length'], obj['indices'])
    elif typ == 'ndarray':
        # np.sctypeDict is the mapping that the removed ``np.typeDict``
        # alias used to point at
        return unconvert(obj['data'], np.sctypeDict[obj['dtype']],
                         obj.get('compress')).reshape(obj['shape'])
    elif typ == 'np_scalar':
        if obj.get('sub_typ') == 'np_complex':
            return c2f(obj['real'], obj['imag'], obj['dtype'])
        else:
            dtype = dtype_for(obj['dtype'])
            try:
                return dtype(obj['data'])
            except (ValueError, TypeError):
                return dtype.type(obj['data'])
    elif typ == 'np_complex':
        # Build the value from its two parts instead of parsing
        # "<real>+<imag>j": with a negative imaginary part that string
        # becomes e.g. "1.0+-2.0j", which complex() rejects as malformed.
        return complex(float(obj['real']), float(obj['imag']))
    else:
        # unrecognised tag: hand the payload back untouched
        return obj
Beispiel #13
0
 def _init_arrays(self, arrays, arr_names, axes):
     """Return a consolidated BlockManager built from ``arrays``.

     ``form_blocks`` segregates the arrays by dtype and forms blocks
     matching to columns; those blocks are wrapped in a BlockManager over
     ``axes``, and the result of its ``consolidate()`` call is returned.
     """
     return BlockManager(form_blocks(arrays, arr_names, axes),
                         axes).consolidate()
Beispiel #14
0
def empty(types,
          size,
          cats=None,
          cols=None,
          index_types=None,
          index_names=None,
          timezones=None):
    """
    Allocate an uninitialised DataFrame together with writable views

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        One entry per column; the entry 'category' marks a categorical
        column. A comma-separated string is split into its entries.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Labels for categorical columns, e.g., {1: ['mary', 'mo']}. If the
        value is an integer, e.g. `{'col': 5}`, temporary labels
        ``RangeIndex(0, 5)`` are generated. If None, or the column is
        missing from the dict, ``RangeIndex(0, 2**14)`` is used.
    cols: list of labels
        assigned column names, including categorical ones. Defaults to
        ``range(len(types))``. Names are converted to text.
    index_types: list
        Types of the index column(s), paired with `index_names`.
    index_names: list
        Names of the index column(s); an entry of None raises ValueError.
    timezones: dict {col: timezone_str}
        for datetime ("M" kind) columns, localize the pandas column to this
        timezone.

    Returns
    -------
    - dataframe with ``size`` rows and the requested columns
    - dict of views keyed by column / index name. Assign to these.
      Categorical entries additionally get a ``'<name>-catdef'`` key
      holding the Categorical. When more than one index is requested the
      dataframe keeps a RangeIndex and the list of index objects is stored
      under ``'__fastparquet_multiindex__'``.
    """
    if isinstance(types, STR_TYPE):
        types = types.split(',')
    if cols is None:
        cols = range(len(types))
    timezones = timezones or {}
    views = {}

    def categories_for(col):
        # pick the category labels to use for ``col``
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        labels = cats[col]
        if isinstance(labels, int):
            return RangeIndex(0, labels)
        return labels  # explicit labels list

    # index objects, one per requested index column
    # (timezones are not applied to index columns here)
    indexes = []
    if index_names:
        for idx_type, idx_col in zip(index_types, index_names):
            if idx_col is None:
                raise ValueError('If using an index, must give an index name')
            if str(idx_type) != 'category':
                buf = np.empty(size, dtype=idx_type)
                idx = Index(buf)
                views[idx_col] = idx.values
            else:
                proto = Categorical([],
                                    categories=categories_for(idx_col),
                                    fastpath=True)
                codes = np.zeros(size, dtype=proto.codes.dtype)
                idx = CategoricalIndex(proto)
                # swap in a codes buffer of the right length
                idx._data._codes = codes
                views[idx_col] = codes
                views[idx_col + '-catdef'] = idx._data
            idx.name = _index_name(idx_col)
            indexes.append(idx)

    # zero-length template frame: only used to obtain the block layout
    template = OrderedDict()
    for col_type, col in zip(types, cols):
        name = six.text_type(col)
        if str(col_type) == 'category':
            template[name] = Categorical([],
                                         categories=categories_for(col),
                                         fastpath=True)
            continue
        arr = np.empty(0, dtype=col_type)
        if arr.dtype.kind == "M" and name in timezones:
            arr = Series(arr).dt.tz_localize(timezones[name])
        template[name] = arr

    df = DataFrame(template)

    # re-create every block of the template with room for ``size`` rows
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            codes = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=codes,
                                 categories=block.values.categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            values = np.empty(shape=(size, ),
                              dtype=block.values.values.dtype)
            new_block = block.make_block_same_class(values=values,
                                                    dtype=block.values.dtype)
        else:
            values = np.empty(shape=(block.values.shape[0], size),
                              dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)

        blocks.append(new_block)

    # choose the row axis and assemble the final frame
    row_axis = RangeIndex(size)
    if len(indexes) == 1:
        row_axis = indexes[0]
    elif len(indexes) > 1:
        views['__fastparquet_multiindex__'] = indexes

    df = DataFrame(BlockManager(blocks, [df._data.axes[0], row_axis]))

    # expose one view per column
    for block in df._data.blocks:
        locs = block.mgr_locs.indexer
        if isinstance(locs, slice):
            locs = range(locs.start, locs.stop, locs.step)
        for pos, loc in enumerate(locs):
            name = df.columns[loc]
            if is_categorical_dtype(block.dtype):
                views[name] = block.values._codes
                views[name + '-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[name] = block.values.values
            else:
                views[name] = block.values[pos]

    return df, views
Beispiel #15
0
    def _read_panel_table(self, group, where=None):
        """
        Read the ``table`` node of ``group`` back into a Panel.

        Parameters
        ----------
        group : object exposing a ``table`` attribute
            The table's ``_v_attrs`` supply ``fields``, ``index_kind`` and
            ``columns_kind``.
        where : optional
            Passed through to ``Selection`` to restrict what is read.

        Returns
        -------
        wp
            The Panel built from the selected rows, with its minor axis
            reindexed to the sorted intersection with ``sel.column_filter``
            when such a filter is set.
        """
        table = group.table

        # create the selection
        sel = Selection(table, where, table._v_attrs.index_kind)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind)
        values = sel.values['values']

        major = Factor.from_array(index)
        minor = Factor.from_array(columns)

        # one integer key per (major, minor) pair
        J, K = len(major.levels), len(minor.levels)
        key = major.labels * K + minor.labels

        if len(unique(key)) == len(key):
            # no duplicated (major, minor) pairs: build the 3d block directly
            sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K)
            sorter = com._ensure_platform_int(sorter)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K),
                                       major_labels, minor_labels)

            mgr = BlockManager([block],
                               [block.ref_items, major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            # duplicated pairs: go through a long-format frame and keep one
            # row per (index, column) tuple.  Per the notice below the most
            # recently appended row is meant to win -- TODO confirm against
            # the semantics of ``match``.
            if not self._quiet:  # pragma: no cover
                print(
                    'Duplicate entries in table, taking most recently '
                    'appended')

            # reconstruct
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            tuple_index = long_index._tuple_index

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = match(unique_tuples, tuple_index)
            indexer = com._ensure_platform_int(indexer)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Beispiel #16
0
def empty(types, size, cats=None, cols=None, index_types=None, index_names=None,
          timezones=None):
    """
    Allocate an uninitialised DataFrame together with writable views

    In the simplest case, returns a Pandas dataframe of the given size,
    with columns of the given names and types. The second return value
    `views` is a dictionary of arrays into which you can assign values.

    For categorical columns, you get two entries: if the column name is
    "col", you get both "col" (the category codes) and "col-catdef" (the
    Categorical holding the labels).

    For a single categorical index, the "-catdef" entry is the index's
    Categorical; the original guidance is to set its categories with

    ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)``

    With more than one index column a MultiIndex is assembled by hand, even
    if the individual index types are not categories. Each index column
    gets an integer codes array under its name and a "-catdef" entry that
    is a Dummy instance whose ``_set_categories`` attribute is a callable
    accepting the Index of level values. That callable installs the values
    as the level on first use and raises RuntimeError if a later call
    supplies different values.

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        One entry per column; the entry 'category' marks a categorical
        column. A comma-separated string is split into its entries.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Labels for categorical columns, e.g., {1: ['mary', 'mo']}. If the
        value is an integer, e.g. `{'col': 5}`, temporary labels
        ``RangeIndex(0, 5)`` are generated. If None, or the column is
        missing from the dict, ``RangeIndex(0, 2**14)`` is used.
    cols: list of labels
        assigned column names, including categorical ones. Defaults to
        ``range(len(types))``. Names are converted to text.
    index_types: list of str
        For one or more index columns, make them have this type. If None
        (or empty), the index will be the default RangeIndex.
    index_names: list of str
        Names of the index column(s), if using. Names matching
        ``__index_level_<n>__`` are replaced by None on the final index.
    timezones: dict {col: timezone_str}
        for datetime ("M" kind) columns, localize the pandas column to this
        timezone; the view for such a column is an 'M8[ns]' array.

    Returns
    -------
    - dataframe with ``size`` rows and the requested columns
    - dict of views keyed by column / index name (plus the "-catdef"
      entries described above). Assign to these.
    """
    if isinstance(types, STR_TYPE):
        types = types.split(',')
    if cols is None:
        cols = range(len(types))
    timezones = timezones or {}
    views = {}

    def categories_for(col):
        # pick the category labels to use for ``col``
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        labels = cats[col]
        if isinstance(labels, int):
            return RangeIndex(0, labels)
        return labels  # explicit labels list

    # zero-length template frame: only used to obtain the block layout
    template = OrderedDict()
    for col_type, col in zip(types, cols):
        name = six.text_type(col)
        if str(col_type) == 'category':
            template[name] = Categorical([], categories=categories_for(col),
                                         fastpath=True)
            continue
        arr = np.empty(0, dtype=col_type)
        if arr.dtype.kind == "M" and name in timezones:
            arr = Series(arr).dt.tz_localize(timezones[name])
        template[name] = arr

    df = DataFrame(template)

    # build the row axis: default range, single index, or multi-index
    if not index_types:
        row_index = RangeIndex(size)
    elif len(index_types) == 1:
        idx_type, idx_col = index_types[0], index_names[0]
        if idx_col is None:
            raise ValueError('If using an index, must give an index name')
        if str(idx_type) == 'category':
            proto = Categorical([], categories=categories_for(idx_col),
                                fastpath=True)
            codes = np.zeros(size, dtype=proto.codes.dtype)
            row_index = CategoricalIndex(proto)
            # swap in a codes buffer of the right length
            row_index._data._codes = codes
            views[idx_col] = codes
            views[idx_col+'-catdef'] = row_index._data
        else:
            row_index = Index(np.empty(size, dtype=idx_type))
            views[idx_col] = row_index.values
    else:
        # start from a placeholder MultiIndex and fill its internals by hand
        row_index = MultiIndex([[]], [[]])
        row_index._levels = []
        row_index._labels = []
        row_index._codes = []
        # which private attribute receives the codes depends on the pandas
        # version
        has_codes = LooseVersion(pdver) >= LooseVersion("0.24.0")
        for level, idx_col in enumerate(index_names):
            row_index._levels.append(Index([None]))

            def set_cats(values, i=level, col=idx_col, **kwargs):
                # install ``values`` as level ``i``; later calls must agree
                values.name = col
                current = row_index._levels[i]
                if current[0] is None:
                    row_index._levels[i] = values
                elif not current.equals(values):
                    raise RuntimeError("Different dictionaries encountered"
                                       " while building categorical")

            holder = Dummy()
            holder._set_categories = set_cats

            codes = np.zeros(size, dtype=int)
            if has_codes:
                row_index._codes = list(row_index._codes) + [codes]
            else:
                row_index._labels.append(codes)
            views[idx_col] = codes
            views[idx_col+'-catdef'] = holder

    axes = [df._data.axes[0], row_index]

    # re-create every block of the template with room for ``size`` rows
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            codes = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=codes,
                                 categories=block.values.categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            values = np.empty(shape=(size, ), dtype='M8[ns]')
            new_block = block.make_block_same_class(
                    values=values, dtype=block.values.dtype)
        else:
            values = np.empty(shape=(block.values.shape[0], size),
                              dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)

        blocks.append(new_block)

    # assemble the final frame
    df = DataFrame(BlockManager(blocks, axes))

    # expose one view per column
    for block in df._data.blocks:
        locs = block.mgr_locs.indexer
        if isinstance(locs, slice):
            locs = range(locs.start, locs.stop, locs.step)
        for pos, loc in enumerate(locs):
            name = df.columns[loc]
            if is_categorical_dtype(block.dtype):
                views[name] = block.values._codes
                views[name+'-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[name] = np.asarray(block.values, dtype='M8[ns]')
            else:
                views[name] = block.values[pos]

    if index_names:
        df.index.names = [
            None if re.match(r'__index_level_\d+__', name) else name
            for name in index_names
        ]
    return df, views
Beispiel #17
0
def block_concat(dfs, idx, columns):
    """Assemble ``dfs`` into a single DataFrame through one BlockManager.

    The blocks produced by ``iter_blocks(dfs)`` are placed in a
    BlockManager whose axes are ``[columns, idx]``; the DataFrame wrapping
    that manager is copied and the copy is returned.
    """
    axes = [columns, idx]
    return pd.DataFrame(BlockManager(iter_blocks(dfs), axes)).copy()