Example #1
def test_validate_ndim():
    values = np.array([1.0, 2.0])
    placement = slice(2)
    msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"

    with pytest.raises(ValueError, match=msg):
        make_block(values, placement, ndim=2)
Example #2
def test_validate_ndim():
    values = np.array([1.0, 2.0])
    placement = slice(2)
    msg = "Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"

    with tm.assert_raises_regex(ValueError, msg):
        make_block(values, placement, ndim=2)
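Both tests exercise the same guard: make_block rejects values whose ndim does not match the requested block ndim. For contrast, a minimal sketch of a call that passes the check (assuming pandas.core.internals.make_block is importable, as in the tests above):

import numpy as np
from pandas.core.internals import make_block

# reshape so that values.ndim matches ndim=2
values = np.array([1.0, 2.0]).reshape(1, 2)
block = make_block(values, placement=slice(1), ndim=2)  # one item, two columns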
Example #3
def _reconstruct_block(item):
    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks

    block_arr = item['block']
    placement = item['placement']
    if 'dictionary' in item:
        cat = pd.Categorical.from_codes(block_arr,
                                        categories=item['dictionary'],
                                        ordered=item['ordered'])
        block = _int.make_block(cat, placement=placement,
                                klass=_int.CategoricalBlock)
    elif 'timezone' in item:
        dtype = _make_datetimetz(item['timezone'])
        block = _int.make_block(block_arr, placement=placement,
                                klass=_int.DatetimeTZBlock,
                                dtype=dtype)
    elif 'object' in item:
        block = _int.make_block(builtin_pickle.loads(block_arr),
                                placement=placement, klass=_int.ObjectBlock)
    else:
        block = _int.make_block(block_arr, placement=placement)

    return block
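For reference, the item dictionaries this helper consumes look roughly like the following (illustrative payloads only; pyarrow's actual serialized format may differ):

import numpy as np

plain_item = {'block': np.array([[1, 2, 3]]), 'placement': [0]}
categorical_item = {'block': np.array([0, 1, 0]),        # codes
                    'placement': [1],
                    'dictionary': np.array(['a', 'b']),  # categories
                    'ordered': False}
tz_item = {'block': np.array([0, 1, 2], dtype='M8[ns]'),
           'placement': [2],
           'timezone': 'UTC'}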
Example #4
    def _concat_blocks(self, blocks):
        values_list = [b.values for b in blocks if b is not None]
        concat_values = com._concat_compat(values_list, axis=self.axis)

        if self.axis > 0:
            # Not safe to remove this check, need to profile
            if not _all_indexes_same([b.items for b in blocks]):
                raise Exception("dtypes are not consistent throughout " "DataFrames")
            return make_block(concat_values, blocks[0].items, self.new_axes[0])
        else:

            offsets = np.r_[0, np.cumsum([len(x._data.axes[0]) for x in self.objs])]
            indexer = np.concatenate(
                [offsets[i] + b.ref_locs
                 for i, b in enumerate(blocks)
                 if b is not None]
            )
            if self.ignore_index:
                concat_items = indexer
            else:
                concat_items = self.new_axes[0].take(indexer)

            if self.ignore_index:
                ref_items = self._get_fresh_axis()
                return make_block(concat_values, concat_items, ref_items)

            block = make_block(concat_values, concat_items, self.new_axes[0])

            # we need to set the ref_locs in this block so we have the mapping
            # as we now have a non-unique index across dtypes, and we need to
            # map the column location to the block location
            # GH3602
            if not self.new_axes[0].is_unique:
                block._ref_locs = indexer

            return block
Example #5
    def _concat_blocks(self, blocks):
        values_list = [b.values for b in blocks if b is not None]
        concat_values = com._concat_compat(values_list, axis=self.axis)

        if self.axis > 0:
            # Not safe to remove this check, need to profile
            if not _all_indexes_same([b.items for b in blocks]):
                raise Exception('dtypes are not consistent throughout '
                                'DataFrames')
            return make_block(concat_values, blocks[0].items, self.new_axes[0])
        else:
            offsets = np.r_[0, np.cumsum([len(x._data.axes[0]) for
                                            x in self.objs])]
            indexer = np.concatenate([offsets[i] + b.ref_locs
                                      for i, b in enumerate(blocks)
                                      if b is not None])
            if self.ignore_index:
                concat_items = indexer
            else:
                concat_items = self.new_axes[0].take(indexer)

            if self.ignore_index:
                ref_items = self._get_fresh_axis()
                return make_block(concat_values, concat_items, ref_items)

            return make_block(concat_values, concat_items, self.new_axes[0])
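The offsets/indexer arithmetic above maps each block's ref_locs into positions on the concatenated axis; a small worked sketch with made-up sizes:

import numpy as np

# two source frames contributing 3 and 2 items
offsets = np.r_[0, np.cumsum([3, 2])]            # -> array([0, 3, 5])
b1_ref_locs = np.array([0, 2])                   # block from frame 1
b2_ref_locs = np.array([1])                      # block from frame 2
indexer = np.concatenate([offsets[0] + b1_ref_locs,
                          offsets[1] + b2_ref_locs])
# -> array([0, 2, 4]): where this dtype's items sit in new_axes[0]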
Example #6
def _unstack_frame(obj, level):
    from pandas.core.internals import BlockManager, make_block

    if obj._is_mixed_type:
        unstacker = _Unstacker(np.empty(obj.shape, dtype=bool),  # dummy
                               obj.index, level=level,
                               value_columns=obj.columns)
        new_columns = unstacker.get_new_columns()
        new_index = unstacker.get_new_index()
        new_axes = [new_columns, new_index]

        new_blocks = []
        mask_blocks = []
        for blk in obj._data.blocks:
            bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
                                    value_columns=blk.items)
            new_items = bunstacker.get_new_columns()
            new_values, mask = bunstacker.get_new_values()

            mblk = make_block(mask.T, new_items, new_columns)
            mask_blocks.append(mblk)

            newb = make_block(new_values.T, new_items, new_columns)
            new_blocks.append(newb)

        result = DataFrame(BlockManager(new_blocks, new_axes))
        mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
        return result.ix[:, mask_frame.sum(0) > 0]
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               value_columns=obj.columns)
        return unstacker.get_result()
Example #7
    def test_merge(self):
        avals = randn(2, 10)
        bvals = randn(2, 10)

        ref_cols = Index(['e', 'a', 'b', 'd', 'f'])

        ablock = make_block(avals, ref_cols.get_indexer(['e', 'b']))
        bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd']))
        merged = ablock.merge(bblock)
        assert_almost_equal(merged.mgr_locs, [0, 1, 2, 3])
        assert_almost_equal(merged.values[[0, 2]], avals)
        assert_almost_equal(merged.values[[1, 3]], bvals)
Example #8
    def test_merge(self):
        avals = randn(2, 10)
        bvals = randn(2, 10)

        ref_cols = Index(['e', 'a', 'b', 'd', 'f'])

        ablock = make_block(avals, ref_cols.get_indexer(['e', 'b']))
        bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd']))
        merged = ablock.merge(bblock)
        tm.assert_numpy_array_equal(merged.mgr_locs.as_array,
                                    np.array([0, 1, 2, 3], dtype=np.int64))
        tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals))
        tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals))
Example #9
    def _concat_blocks(self, blocks):
        concat_values = np.concatenate([b.values for b in blocks],
                                       axis=self.axis)

        if self.axis > 0:
            # Not safe to remove this check, need to profile
            if not _all_indexes_same([b.items for b in blocks]):
                raise Exception('dtypes are not consistent throughout '
                                'DataFrames')
            return make_block(concat_values, blocks[0].items, self.new_axes[0])
        else:
            concat_items = _concat_indexes([b.items for b in blocks])
            # TODO: maybe want to "take" from the new columns?
            return make_block(concat_values, concat_items, self.new_axes[0])
Example #10
    def _merge_blocks(self, merge_chunks):
        """
        merge_chunks -> [(_JoinUnit, Block)]
        """
        funit, fblock = merge_chunks[0]
        fidx = funit.indexer

        out_shape = list(fblock.values.shape)

        n = len(fidx) if fidx is not None else out_shape[self.axis]

        out_shape[0] = sum(len(blk) for unit, blk in merge_chunks)
        out_shape[self.axis] = n

        # Should use Fortran order??
        out = np.empty(out_shape, dtype=fblock.values.dtype)

        sofar = 0
        for unit, blk in merge_chunks:
            out_chunk = out[sofar : sofar + len(blk)]

            if unit.indexer is None:
                # is this really faster than assigning to arr.flat?
                com.take_fast(blk.values, np.arange(n, dtype=np.int64),
                              None, False, axis=self.axis, out=out_chunk)
            else:
                # write out the values to the result array
                com.take_fast(blk.values, unit.indexer, None, False,
                              axis=self.axis, out=out_chunk)

            sofar += len(blk)

        # does not sort
        new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
        return make_block(out, new_block_items, self.result_items)
Example #11
def _upcast_blocks(blocks):
    """
    Upcast and consolidate if necessary
    """
    new_blocks = []
    for block in blocks:
        if isinstance(block, IntBlock):
            newb = make_block(block.values.astype(float), block.items, block.ref_items, placement=block._ref_locs)
        elif isinstance(block, BoolBlock):
            newb = make_block(block.values.astype(object), block.items, block.ref_items, placement=block._ref_locs)
        else:
            newb = block
        new_blocks.append(newb)

    # use any ref_items
    return _consolidate(new_blocks, newb.ref_items)
Example #12
    def _merge_blocks(self, merge_chunks):
        """
        merge_chunks -> [(_JoinUnit, Block)]
        """
        funit, fblock = merge_chunks[0]
        fidx = funit.indexer

        out_shape = list(fblock.values.shape)

        n = len(fidx) if fidx is not None else out_shape[self.axis]

        out_shape[0] = sum(len(blk) for unit, blk in merge_chunks)
        out_shape[self.axis] = n

        # Should use Fortran order??
        block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
        out = np.empty(out_shape, dtype=block_dtype)

        sofar = 0
        for unit, blk in merge_chunks:
            out_chunk = out[sofar: sofar + len(blk)]
            com.take_nd(blk.values, unit.indexer, self.axis, out=out_chunk)
            sofar += len(blk)

        # does not sort
        new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
        return make_block(out, new_block_items, self.result_items)
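The out_shape bookkeeping above stacks blocks along axis 0 while reindexing to n positions along self.axis; a worked sketch with made-up sizes:

import numpy as np

# two blocks of 2 and 3 items, each shaped (items, 10), joined with a
# length-4 indexer along axis=1
out_shape = list((2, 10))   # first block's shape
out_shape[0] = 2 + 3        # total items across merge_chunks
out_shape[1] = 4            # n = len(indexer)
out = np.empty(out_shape, dtype=np.float64)   # -> shape (5, 4)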
Example #13
def block2d_to_block3d(values, items, shape, major_labels, minor_labels,
                       ref_items=None):
    """
    Developer method for pivoting DataFrame -> Panel. Used in HDFStore and
    DataFrame.to_panel
    """
    from pandas.core.internals import make_block
    panel_shape = (len(items),) + shape

    # TODO: lexsort depth needs to be 2!!

    # Create observation selection vector using major and minor
    # labels, for converting to panel format.
    selector = minor_labels + shape[1] * major_labels
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)

    pvalues = np.empty(panel_shape, dtype=values.dtype)
    if not issubclass(pvalues.dtype.type, (np.integer, np.bool_)):
        pvalues.fill(np.nan)
    elif not mask.all():
        pvalues = com._maybe_upcast(pvalues)
        pvalues.fill(np.nan)

    values = values
    for i in xrange(len(items)):
        pvalues[i].flat[mask] = values[:, i]

    if ref_items is None:
        ref_items = items

    return make_block(pvalues, items, ref_items)
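The selector line flattens (major, minor) label pairs into offsets of the raveled (n_major, n_minor) grid; a worked sketch:

import numpy as np

shape = (2, 3)                        # (n_major, n_minor)
major_labels = np.array([0, 0, 1])
minor_labels = np.array([0, 2, 1])
selector = minor_labels + shape[1] * major_labels   # -> array([0, 2, 4])
mask = np.zeros(np.prod(shape), dtype=bool)
mask.put(selector, True)              # marks the observed cells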
Example #14
def block2d_to_blocknd(values, items, shape, labels, ref_items=None):
    """ pivot to the labels shape """
    from pandas.core.internals import make_block
    panel_shape = (len(items),) + shape

    # TODO: lexsort depth needs to be 2!!

    # Create observation selection vector using major and minor
    # labels, for converting to panel format.
    selector = factor_indexer(shape[1:], labels)
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)

    if mask.all():
        pvalues = np.empty(panel_shape, dtype=values.dtype)
    else:
        dtype, fill_value = _maybe_promote(values.dtype)
        pvalues = np.empty(panel_shape, dtype=dtype)
        pvalues.fill(fill_value)

    values = values
    for i in xrange(len(items)):
        pvalues[i].flat[mask] = values[:, i]

    if ref_items is None:
        ref_items = items

    return make_block(pvalues, items, ref_items)
Example #15
        def create_block(b):
            dtype = dtype_for(b["dtype"])
            return make_block(
                unconvert(b["values"], dtype, b["compress"]).reshape(b["shape"]),
                b["items"],
                axes[0],
                klass=getattr(internals, b["klass"]),
            )
Example #16
    def test_split(self):
        # GH#37799
        values = np.random.randn(3, 4)
        blk = make_block(values, placement=[3, 1, 6], ndim=2)
        result = blk._split()

        # check that we get views, not copies
        values[:] = -9999
        assert (blk.values == -9999).all()

        assert len(result) == 3
        expected = [
            make_block(values[[0]], placement=[3], ndim=2),
            make_block(values[[1]], placement=[1], ndim=2),
            make_block(values[[2]], placement=[6], ndim=2),
        ]
        for res, exp in zip(result, expected):
            assert_block_equal(res, exp)
Example #17
def test_make_block_no_pandas_array():
    # https://github.com/pandas-dev/pandas/pull/24866
    arr = pd.array([1, 2])

    # PandasArray, no dtype
    result = make_block(arr, slice(len(arr)))
    assert result.is_integer is True
    assert result.is_extension is False

    # PandasArray, PandasDtype
    result = make_block(arr, slice(len(arr)), dtype=arr.dtype)
    assert result.is_integer is True
    assert result.is_extension is False

    # ndarray, PandasDtype
    result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype)
    assert result.is_integer is True
    assert result.is_extension is False
Example #18
def _upcast_blocks(blocks):
    """
    Upcast and consolidate if necessary
    """
    new_blocks = []
    for block in blocks:
        if isinstance(block, IntBlock):
            newb = make_block(block.values.astype(float), block.items,
                              block.ref_items)
        elif isinstance(block, BoolBlock):
            newb = make_block(block.values.astype(object), block.items,
                              block.ref_items)
        else:
            newb = block
        new_blocks.append(newb)

    # use any ref_items
    return _consolidate(new_blocks, newb.ref_items)
Example #19
    def test_get(self):
        cols = Index(list("abc"))
        values = np.random.rand(3, 3)
        block = make_block(values=values.copy(), placement=np.arange(3))
        mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])

        tm.assert_almost_equal(mgr.get("a").internal_values(), values[0])
        tm.assert_almost_equal(mgr.get("b").internal_values(), values[1])
        tm.assert_almost_equal(mgr.get("c").internal_values(), values[2])
Example #20
def test_make_block_no_pandas_array():
    # https://github.com/pandas-dev/pandas/pull/24866
    arr = pd.arrays.PandasArray(np.array([1, 2]))

    # PandasArray, no dtype
    result = make_block(arr, slice(len(arr)))
    assert result.is_integer is True
    assert result.is_extension is False

    # PandasArray, PandasDtype
    result = make_block(arr, slice(len(arr)), dtype=arr.dtype)
    assert result.is_integer is True
    assert result.is_extension is False

    # ndarray, PandasDtype
    result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype)
    assert result.is_integer is True
    assert result.is_extension is False
Example #21
    def _concat_blocks(self, blocks):

        values_list = [b.get_values() for b in blocks if b is not None]
        concat_values = com._concat_compat(values_list, axis=self.axis)

        if self.axis > 0:
            # Not safe to remove this check, need to profile
            if not _all_indexes_same([b.items for b in blocks]):
                # TODO: Either profile this piece or remove.
                # FIXME: Need to figure out how to test whether this line
                #        exists or does not...(unclear if even possible
                #        or maybe would require performance test)
                raise PandasError('dtypes are not consistent throughout '
                                  'DataFrames')
            return make_block(concat_values,
                              blocks[0].items,
                              self.new_axes[0],
                              placement=blocks[0]._ref_locs)
        else:

            offsets = np.r_[
                0, np.cumsum([len(x._data.axes[0]) for x in self.objs])]
            indexer = np.concatenate([
                offsets[i] + b.ref_locs for i, b in enumerate(blocks)
                if b is not None
            ])
            if self.ignore_index:
                concat_items = indexer
            else:
                concat_items = self.new_axes[0].take(indexer)

            if self.ignore_index:
                ref_items = self._get_fresh_axis()
                return make_block(concat_values, concat_items, ref_items)

            block = make_block(concat_values, concat_items, self.new_axes[0])

            # we need to set the ref_locs in this block so we have the mapping
            # as we now have a non-unique index across dtypes, and we need to
            # map the column location to the block location
            # GH3602
            if not self.new_axes[0].is_unique:
                block.set_ref_locs(indexer)

            return block
Example #22
    def _concat_blocks(self, blocks):

        values_list = [b.get_values() for b in blocks if b is not None]
        concat_values = com._concat_compat(values_list, axis=self.axis)

        if self.axis > 0:
            # Not safe to remove this check, need to profile
            if not _all_indexes_same([b.items for b in blocks]):
                # TODO: Either profile this piece or remove.
                # FIXME: Need to figure out how to test whether this line
                #        exists or does not...(unclear if even possible
                #        or maybe would require performance test)
                raise PandasError('dtypes are not consistent throughout '
                                  'DataFrames')
            return make_block(concat_values,
                              blocks[0].items,
                              self.new_axes[0],
                              placement=blocks[0]._ref_locs)
        else:

            offsets = np.r_[0, np.cumsum([len(x._data.axes[0]) for
                                          x in self.objs])]
            indexer = np.concatenate([offsets[i] + b.ref_locs
                                      for i, b in enumerate(blocks)
                                      if b is not None])
            if self.ignore_index:
                concat_items = indexer
            else:
                concat_items = self.new_axes[0].take(indexer)

            if self.ignore_index:
                ref_items = self._get_fresh_axis()
                return make_block(concat_values, concat_items, ref_items)

            block = make_block(concat_values, concat_items, self.new_axes[0])

            # we need to set the ref_locs in this block so we have the mapping
            # as we now have a non-unique index across dtypes, and we need to
            # map the column location to the block location
            # GH3602
            if not self.new_axes[0].is_unique:
                block.set_ref_locs(indexer)

            return block
Example #23
    def _cython_agg_general(self, how):
        obj = self._obj_with_exclusions
        if self.axis == 1:
            obj = obj.T

        new_blocks = []

        for block in obj._data.blocks:
            values = block.values.T
            if not issubclass(values.dtype.type, (np.number, np.bool_)):
                continue

            values = com._ensure_float64(values)
            result, counts = self.grouper.aggregate(values, how)

            mask = counts > 0
            if len(mask) > 0:
                result = result[mask]
            newb = make_block(result.T, block.items, block.ref_items)
            new_blocks.append(newb)

        if len(new_blocks) == 0:
            raise GroupByError('No numeric types to aggregate')

        agg_axis = 0 if self.axis == 1 else 1
        agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

        if sum(len(x.items) for x in new_blocks) == len(agg_labels):
            output_keys = agg_labels
        else:
            all_items = []
            for b in new_blocks:
                all_items.extend(b.items)
            output_keys = agg_labels[agg_labels.isin(all_items)]

        if not self.as_index:
            index = np.arange(new_blocks[0].values.shape[1])
            mgr = BlockManager(new_blocks, [output_keys, index])
            result = DataFrame(mgr)

            group_levels = self.grouper.get_group_levels()
            zipped = zip(self.grouper.names, group_levels)

            for i, (name, labels) in enumerate(zipped):
                result.insert(i, name, labels)
            result = result.consolidate()
        else:
            index = self.grouper.result_index
            mgr = BlockManager(new_blocks, [output_keys, index])
            result = DataFrame(mgr)

        if self.axis == 1:
            result = result.T

        return result
Example #25
def _unstack_frame(obj, level, fill_value=None):
    from pandas.core.internals import BlockManager, make_block

    if obj._is_mixed_type:
        unstacker = _Unstacker(
            np.empty(obj.shape, dtype=bool),  # dummy
            obj.index,
            level=level,
            value_columns=obj.columns)
        new_columns = unstacker.get_new_columns()
        new_index = unstacker.get_new_index()
        new_axes = [new_columns, new_index]

        new_blocks = []
        mask_blocks = []
        for blk in obj._data.blocks:
            blk_items = obj._data.items[blk.mgr_locs.indexer]
            bunstacker = _Unstacker(blk.values.T,
                                    obj.index,
                                    level=level,
                                    value_columns=blk_items,
                                    fill_value=fill_value)
            new_items = bunstacker.get_new_columns()
            new_placement = new_columns.get_indexer(new_items)
            new_values, mask = bunstacker.get_new_values()

            mblk = make_block(mask.T, placement=new_placement)
            mask_blocks.append(mblk)

            newb = make_block(new_values.T, placement=new_placement)
            new_blocks.append(newb)

        result = DataFrame(BlockManager(new_blocks, new_axes))
        mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
        return result.loc[:, mask_frame.sum(0) > 0]
    else:
        unstacker = _Unstacker(obj.values,
                               obj.index,
                               level=level,
                               value_columns=obj.columns,
                               fill_value=fill_value)
        return unstacker.get_result()
Example #26
        def create_block(b):
            values = unconvert(b['values'], dtype_for(b['dtype']),
                               b['compress']).reshape(b['shape'])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if 'locs' in b:
                placement = b['locs']
            else:
                placement = axes[0].get_indexer(b['items'])
            return make_block(values=values,
                              klass=getattr(internals, b['klass']),
                              placement=placement)
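The locs-versus-items comment reflects a real limitation: Index.get_indexer requires unique labels, so round-tripping frames with duplicate column names must fall back on stored positions. A quick sketch of the failure mode:

import pandas as pd

idx = pd.Index(['a', 'a', 'b'])
try:
    idx.get_indexer(['b'])
except Exception as err:
    print(type(err).__name__)  # InvalidIndexError on modern pandas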
Example #28
    def test_get(self):
        cols = Index(list('abc'))
        values = np.random.rand(3, 3)
        block = make_block(values=values.copy(), placement=np.arange(3))
        mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])

        assert_almost_equal(mgr.get('a', fastpath=False), values[0])
        assert_almost_equal(mgr.get('b', fastpath=False), values[1])
        assert_almost_equal(mgr.get('c', fastpath=False), values[2])
        assert_almost_equal(mgr.get('a').internal_values(), values[0])
        assert_almost_equal(mgr.get('b').internal_values(), values[1])
        assert_almost_equal(mgr.get('c').internal_values(), values[2])
Example #30
        def create_block(b):
            values = _safe_reshape(unconvert(b[u"values"], dtype_for(b[u"dtype"]), b[u"compress"]), b[u"shape"])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u"locs" in b:
                placement = b[u"locs"]
            else:
                placement = axes[0].get_indexer(b[u"items"])
            return make_block(
                values=values,
                klass=getattr(internals, b[u"klass"]),
                placement=placement,
                dtype=b[u"dtype"],
            )
Example #31
def test_make_block_no_pandas_array():
    # https://github.com/pandas-dev/pandas/pull/24866
    arr = pd.arrays.PandasArray(np.array([1, 2]))

    # PandasArray, no dtype
    result = make_block(arr, slice(len(arr)), ndim=arr.ndim)
    assert result.dtype.kind in ["i", "u"]
    assert result.is_extension is False

    # PandasArray, PandasDtype
    result = make_block(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim)
    assert result.dtype.kind in ["i", "u"]
    assert result.is_extension is False

    # ndarray, PandasDtype
    result = make_block(arr.to_numpy(),
                        slice(len(arr)),
                        dtype=arr.dtype,
                        ndim=arr.ndim)
    assert result.dtype.kind in ["i", "u"]
    assert result.is_extension is False
Example #32
    def _concat_blocks(self, blocks):
        concat_values = np.concatenate([b.values for b in blocks],
                                       axis=self.axis)

        if self.axis > 0:
            # Not safe to remove this check, need to profile
            if not _all_indexes_same([b.items for b in blocks]):
                raise Exception('dtypes are not consistent throughout '
                                'DataFrames')
            return make_block(concat_values, blocks[0].items, self.new_axes[0])
        else:
            all_items = [b.items for b in blocks]
            if self.axis == 0 and self.keys is not None:
                offsets = np.r_[0, np.cumsum([len(x._data.axes[self.axis]) for
                                              x in self.objs])]
                indexer = np.concatenate([offsets[i] + b.ref_locs
                                          for i, b in enumerate(blocks)])
                concat_items = self.new_axes[0].take(indexer)
            else:
                concat_items = _concat_indexes(all_items)

            return make_block(concat_values, concat_items, self.new_axes[0])
Example #33
File: pandas.py Project: dask/partd
def block_from_header_bytes(header, bytes):
    placement, dtype, shape, (extension_type, extension_values) = header
    values = pnp.deserialize(pnp.decompress(bytes, dtype), dtype,
                             copy=True).reshape(shape)
    if extension_type == 'categorical_type':
        values = pd.Categorical.from_codes(values,
                                           extension_values[1],
                                           ordered=extension_values[0])
    elif extension_type == 'datetime64_tz_type':
        tz_info = extension_values[0]
        values = pd.DatetimeIndex(values).tz_localize('utc').tz_convert(
            tz_info)
    return make_block(values, placement=placement)
Example #34
    def _concat_blocks(self, blocks):
        concat_values = np.concatenate([b.values for b in blocks],
                                       axis=self.axis)

        if self.axis > 0:
            # Not safe to remove this check, need to profile
            if not _all_indexes_same([b.items for b in blocks]):
                raise Exception('dtypes are not consistent throughout '
                                'DataFrames')
            return make_block(concat_values, blocks[0].items, self.new_axes[0])
        else:
            all_items = [b.items for b in blocks]
            if self.axis == 0 and self.keys is not None:
                offsets = np.r_[0, np.cumsum([len(x._data.axes[self.axis]) for
                                              x in self.objs])]
                indexer = np.concatenate([offsets[i] + b.ref_locs
                                          for i, b in enumerate(blocks)])
                concat_items = self.new_axes[0].take(indexer)
            else:
                concat_items = _concat_indexes(all_items)

            return make_block(concat_values, concat_items, self.new_axes[0])
Example #36
    def _merge_blocks(self, lblk, rblk):
        lidx = self.lindexer
        ridx = self.rindexer

        n = lblk.values.shape[self.axis] if lidx is None else len(lidx)
        lk = len(lblk.items)
        rk = len(rblk.items)

        out_shape = list(lblk.shape)
        out_shape[0] = lk + rk
        out_shape[self.axis] = n

        out = np.empty(out_shape, dtype=lblk.values.dtype)

        # is this really faster than assigning to arr.flat?
        if lidx is None:
            # out[:lk] = lblk.values
            com.take_fast(lblk.values,
                          np.arange(n, dtype='i4'),
                          None,
                          False,
                          axis=self.axis,
                          out=out[:lk])
        else:
            # write out the values to the result array
            com.take_fast(lblk.values,
                          lidx,
                          None,
                          False,
                          axis=self.axis,
                          out=out[:lk])
        if ridx is None:
            # out[lk:] = lblk.values
            com.take_fast(rblk.values,
                          np.arange(n, dtype='i4'),
                          None,
                          False,
                          axis=self.axis,
                          out=out[lk:])
        else:
            com.take_fast(rblk.values,
                          ridx,
                          None,
                          False,
                          axis=self.axis,
                          out=out[lk:])

        # does not sort
        new_items = lblk.items.append(rblk.items)
        return make_block(out, new_items, self.result_items)
Example #37
def _reconstruct_block(item):
    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks

    block_arr = item['block']
    placement = item['placement']
    if 'dictionary' in item:
        cat = pd.Categorical.from_codes(block_arr,
                                        categories=item['dictionary'],
                                        ordered=item['ordered'])
        block = _int.make_block(cat, placement=placement,
                                klass=_int.CategoricalBlock,
                                fastpath=True)
    elif 'timezone' in item:
        dtype = _make_datetimetz(item['timezone'])
        block = _int.make_block(block_arr, placement=placement,
                                klass=_int.DatetimeTZBlock,
                                dtype=dtype, fastpath=True)
    else:
        block = _int.make_block(block_arr, placement=placement)

    return block
Example #38
    def _concat_blocks(self, blocks):
        values_list = [b.values for b in blocks if b is not None]
        concat_values = com._concat_compat(values_list, axis=self.axis)

        if self.axis > 0:
            # Not safe to remove this check, need to profile
            if not _all_indexes_same([b.items for b in blocks]):
                raise Exception('dtypes are not consistent throughout '
                                'DataFrames')
            return make_block(concat_values, blocks[0].items, self.new_axes[0])
        else:

            offsets = np.r_[
                0, np.cumsum([len(x._data.axes[0]) for x in self.objs])]
            indexer = np.concatenate([
                offsets[i] + b.ref_locs for i, b in enumerate(blocks)
                if b is not None
            ])
            if self.ignore_index:
                concat_items = indexer
            else:
                concat_items = self.new_axes[0].take(indexer)

            if self.ignore_index:
                ref_items = self._get_fresh_axis()
                return make_block(concat_values, concat_items, ref_items)

            block = make_block(concat_values, concat_items, self.new_axes[0])

            # we need to set the ref_locs in this block so we have the mapping
            # as we now have a non-unique index across dtypes, and we need to
            # map the column location to the block location
            # GH3602
            if not self.new_axes[0].is_unique:
                block._ref_locs = indexer

            return block
Example #39
def _unstack_frame(obj, level, fill_value=None):
    from pandas.core.internals import BlockManager, make_block

    if obj._is_mixed_type:
        unstacker = _Unstacker(np.empty(obj.shape, dtype=bool),  # dummy
                               obj.index, level=level,
                               value_columns=obj.columns)
        new_columns = unstacker.get_new_columns()
        new_index = unstacker.get_new_index()
        new_axes = [new_columns, new_index]

        new_blocks = []
        mask_blocks = []
        for blk in obj._data.blocks:
            blk_items = obj._data.items[blk.mgr_locs.indexer]
            bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
                                    value_columns=blk_items,
                                    fill_value=fill_value)
            new_items = bunstacker.get_new_columns()
            new_placement = new_columns.get_indexer(new_items)
            new_values, mask = bunstacker.get_new_values()

            mblk = make_block(mask.T, placement=new_placement)
            mask_blocks.append(mblk)

            newb = make_block(new_values.T, placement=new_placement)
            new_blocks.append(newb)

        result = obj._constructor(BlockManager(new_blocks, new_axes))
        mask_frame = obj._constructor(BlockManager(mask_blocks, new_axes))
        return result.ix[:, mask_frame.sum(0) > 0]
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               value_columns=obj.columns,
                               fill_value=fill_value)
        return unstacker.get_result()
Example #40
    def _read_block_manager(self, group):
        ndim = group._v_attrs.ndim

        axes = []
        for i in xrange(ndim):
            ax = self._read_index(group, 'axis%d' % i)
            axes.append(ax)

        items = axes[0]
        blocks = []
        for i in range(group._v_attrs.nblocks):
            blk_items = self._read_index(group, 'block%d_items' % i)
            values = _read_array(group, 'block%d_values' % i)
            blk = make_block(values, blk_items, items)
            blocks.append(blk)

        return BlockManager(blocks, axes)
Example #41
    def _merge_blocks(self, merge_chunks):
        """
        merge_chunks -> [(_JoinUnit, Block)]
        """
        funit, fblock = merge_chunks[0]
        fidx = funit.indexer

        out_shape = list(fblock.get_values().shape)

        n = len(fidx) if fidx is not None else out_shape[self.axis]

        out_shape[0] = sum(blk.get_merge_length()
                           for unit, blk in merge_chunks)
        out_shape[self.axis] = n

        # Should use Fortran order??
        block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
        out = np.empty(out_shape, dtype=block_dtype)

        sofar = 0
        for unit, blk in merge_chunks:
            out_chunk = out[sofar:sofar + len(blk)]
            com.take_nd(blk.get_values(),
                        unit.indexer,
                        self.axis,
                        out=out_chunk)
            sofar += len(blk)

        # does not sort
        new_block_items = _concat_indexes([b.items for _, b in merge_chunks])

        # need to set placement if we have a non-unique result
        # calculate by the existing placement plus the offset in the result set
        placement = None
        if not self.result_items.is_unique:
            nchunks = len(merge_chunks)
            offsets = np.array([0] + [len(self.result_items) // nchunks] *
                               (nchunks - 1)).cumsum()
            placement = []
            for (unit, blk), offset in zip(merge_chunks, offsets):
                placement.extend(blk.ref_locs + offset)

        return make_block(out,
                          new_block_items,
                          self.result_items,
                          placement=placement)
Example #43
    def _merge_blocks(self, merge_chunks):
        """
        merge_chunks -> [(_JoinUnit, Block)]
        """
        funit, fblock = merge_chunks[0]
        fidx = funit.indexer

        out_shape = list(fblock.values.shape)

        n = len(fidx) if fidx is not None else out_shape[self.axis]

        out_shape[0] = sum(len(blk) for unit, blk in merge_chunks)
        out_shape[self.axis] = n

        # Should use Fortran order??
        block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
        out = np.empty(out_shape, dtype=block_dtype)

        sofar = 0
        for unit, blk in merge_chunks:
            out_chunk = out[sofar:sofar + len(blk)]

            if unit.indexer is None:
                # is this really faster than assigning to arr.flat?
                com.take_fast(blk.values,
                              np.arange(n, dtype=np.int64),
                              None,
                              False,
                              axis=self.axis,
                              out=out_chunk)
            else:
                # write out the values to the result array
                com.take_fast(blk.values,
                              unit.indexer,
                              None,
                              False,
                              axis=self.axis,
                              out=out_chunk)

            sofar += len(blk)

        # does not sort
        new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
        return make_block(out, new_block_items, self.result_items)
Example #44
    def _read_block_manager(self, group):
        from pandas.core.internals import BlockManager, make_block

        ndim = group._v_attrs.ndim
        nblocks = group._v_attrs.nblocks

        axes = []
        for i in xrange(ndim):
            ax = self._read_index(group, "axis%d" % i)
            axes.append(ax)

        items = axes[0]
        blocks = []
        for i in range(group._v_attrs.nblocks):
            blk_items = self._read_index(group, "block%d_items" % i)
            values = _read_array(group, "block%d_values" % i)
            blk = make_block(values, blk_items, items)
            blocks.append(blk)

        return BlockManager(blocks, axes)
Example #45
        def create_block(b):
            values = _safe_reshape(
                unconvert(b['values'], dtype_for(b['dtype']), b['compress']),
                b['shape'])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if 'locs' in b:
                placement = b['locs']
            else:
                placement = axes[0].get_indexer(b['items'])

            if is_datetime64tz_dtype(b['dtype']):
                assert isinstance(values, np.ndarray), type(values)
                assert values.dtype == 'M8[ns]', values.dtype
                values = DatetimeArray(values, dtype=b['dtype'])

            return make_block(values=values,
                              klass=getattr(internals, b['klass']),
                              placement=placement,
                              dtype=b['dtype'])
Example #46
    def _init_matrix(self, data, axes, dtype=None, copy=False):
        values = _prep_ndarray(data, copy=copy)

        if dtype is not None:
            try:
                values = values.astype(dtype)
            except Exception:
                raise ValueError('failed to cast to %s' % dtype)

        shape = values.shape
        fixed_axes = []
        for i, ax in enumerate(axes):
            if ax is None:
                ax = _default_index(shape[i])
            else:
                ax = _ensure_index(ax)
            fixed_axes.append(ax)

        items = fixed_axes[0]
        block = make_block(values, items, items)
        return BlockManager([block], fixed_axes)
Example #48
        def create_block(b):
            values = _safe_reshape(unconvert(
                b[u'values'], dtype_for(b[u'dtype']),
                b[u'compress']), b[u'shape'])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u'locs' in b:
                placement = b[u'locs']
            else:
                placement = axes[0].get_indexer(b[u'items'])

            if is_datetime64tz_dtype(b[u'dtype']):
                assert isinstance(values, np.ndarray), type(values)
                assert values.dtype == 'M8[ns]', values.dtype
                values = DatetimeArray(values, dtype=b[u'dtype'])

            return make_block(values=values,
                              klass=getattr(internals, b[u'klass']),
                              placement=placement,
                              dtype=b[u'dtype'])
Example #49
        def create_block(b):
            values = _safe_reshape(
                unconvert(b[u"values"], dtype_for(b[u"dtype"]), b[u"compress"]),
                b[u"shape"],
            )

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u"locs" in b:
                placement = b[u"locs"]
            else:
                placement = axes[0].get_indexer(b[u"items"])
            klass = getattr(internals, b[u"klass"])
            if klass == DatetimeTZBlock:
                raise ValueError(
                    "Lost the ability to parse datetime with timezone. Sorry"
                )

            return make_block(
                values=values.copy(),
                klass=klass,
                placement=placement,
                dtype=b[u"dtype"],
            )
Example #50
    def _merge_blocks(self, merge_chunks):
        """
        merge_chunks -> [(_JoinUnit, Block)]
        """
        funit, fblock = merge_chunks[0]
        fidx = funit.indexer

        out_shape = list(fblock.get_values().shape)

        n = len(fidx) if fidx is not None else out_shape[self.axis]

        merge_lengths = list(blk.get_merge_length() for unit, blk in merge_chunks)
        out_shape[0] = sum(merge_lengths)
        out_shape[self.axis] = n

        # Should use Fortran order??
        block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
        out = np.empty(out_shape, dtype=block_dtype)

        sofar = 0
        for unit, blk in merge_chunks:
            out_chunk = out[sofar : sofar + len(blk)]
            com.take_nd(blk.get_values(), unit.indexer, self.axis, out=out_chunk)
            sofar += len(blk)

        # does not sort
        new_block_items = _concat_indexes([b.items for _, b in merge_chunks])

        # need to set placement if we have a non-unique result
        # calculate by the existing placement plus the offset in the result set
        placement = None
        if not self.result_items.is_unique:
            placement = []
            offsets = np.append(np.array([0]), self.offsets.cumsum()[:-1])
            for (unit, blk), offset in zip(merge_chunks, offsets):
                placement.extend(blk.ref_locs + offset)

        return make_block(out, new_block_items, self.result_items, placement=placement)
Example #51
def block2d_to_block3d(values,
                       items,
                       shape,
                       major_labels,
                       minor_labels,
                       ref_items=None):
    """
    Developer method for pivoting DataFrame -> Panel. Used in HDFStore and
    DataFrame.to_panel
    """
    from pandas.core.internals import make_block
    panel_shape = (len(items), ) + shape

    # TODO: lexsort depth needs to be 2!!

    # Create observation selection vector using major and minor
    # labels, for converting to panel format.
    selector = minor_labels + shape[1] * major_labels
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)

    pvalues = np.empty(panel_shape, dtype=values.dtype)
    if not issubclass(pvalues.dtype.type, (np.integer, np.bool_)):
        pvalues.fill(np.nan)
    elif not mask.all():
        pvalues = com._maybe_upcast(pvalues)
        pvalues.fill(np.nan)

    values = values
    for i in xrange(len(items)):
        pvalues[i].flat[mask] = values[:, i]

    if ref_items is None:
        ref_items = items

    return make_block(pvalues, items, ref_items)
Example #52
    def _merge_blocks(self, lblk, rblk):
        lidx = self.lindexer
        ridx = self.rindexer

        n = lblk.values.shape[self.axis] if lidx is None else len(lidx)
        lk = len(lblk.items)
        rk = len(rblk.items)

        out_shape = list(lblk.shape)
        out_shape[0] = lk + rk
        out_shape[self.axis] = n

        out = np.empty(out_shape, dtype=lblk.values.dtype)

        # is this really faster than assigning to arr.flat?
        if lidx is None:
            # out[:lk] = lblk.values
            com.take_fast(lblk.values, np.arange(n, dtype='i4'),
                          None, False,
                          axis=self.axis, out=out[:lk])
        else:
            # write out the values to the result array
            com.take_fast(lblk.values, lidx, None, False,
                          axis=self.axis, out=out[:lk])
        if ridx is None:
            # out[lk:] = lblk.values
            com.take_fast(rblk.values, np.arange(n, dtype='i4'),
                          None, False,
                          axis=self.axis, out=out[lk:])
        else:
            com.take_fast(rblk.values, ridx, None, False,
                          axis=self.axis, out=out[lk:])

        # does not sort
        new_items = lblk.items.append(rblk.items)
        return make_block(out, new_items, self.result_items)
Example #53
def _reconstruct_block(item, columns=None, extension_columns=None):
    """
    Construct a pandas Block from the `item` dictionary coming from pyarrow's
    serialization or returned by arrow::python::ConvertTableToPandas.

    This function takes care of converting dictionary types to pandas
    categorical, Timestamp-with-timezones to the proper pandas Block, and
    conversion to pandas ExtensionBlock

    Parameters
    ----------
    item : dict
        For basic types, this is a dictionary in the form of
        {'block': np.ndarray of values, 'placement': pandas block placement}.
        Additional keys are present for other types (dictionary, timezone,
        object).
    columns :
        Column names of the table being constructed, used for extension types
    extension_columns : dict
        Dictionary of {column_name: pandas_dtype} that includes all columns
        and corresponding dtypes that will be converted to a pandas
        ExtensionBlock.

    Returns
    -------
    pandas Block

    """
    import pandas.core.internals as _int

    block_arr = item.get('block', None)
    placement = item['placement']
    if 'dictionary' in item:
        cat = _pandas_api.categorical_type.from_codes(
            block_arr, categories=item['dictionary'], ordered=item['ordered'])
        block = _int.make_block(cat,
                                placement=placement,
                                klass=_int.CategoricalBlock)
    elif 'timezone' in item:
        dtype = make_datetimetz(item['timezone'])
        block = _int.make_block(block_arr,
                                placement=placement,
                                klass=_int.DatetimeTZBlock,
                                dtype=dtype)
    elif 'object' in item:
        block = _int.make_block(builtin_pickle.loads(block_arr),
                                placement=placement,
                                klass=_int.ObjectBlock)
    elif 'py_array' in item:
        # create ExtensionBlock
        arr = item['py_array']
        assert len(placement) == 1
        name = columns[placement[0]]
        pandas_dtype = extension_columns[name]
        if not hasattr(pandas_dtype, '__from_arrow__'):
            raise ValueError("This column does not support to be converted "
                             "to a pandas ExtensionArray")
        pd_ext_arr = pandas_dtype.__from_arrow__(arr)
        block = _int.make_block(pd_ext_arr,
                                placement=placement,
                                klass=_int.ExtensionBlock)
    else:
        block = _int.make_block(block_arr, placement=placement)

    return block
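The hasattr check above probes pandas' __from_arrow__ protocol, which extension dtypes implement to convert Arrow data into their ExtensionArray; for example, the nullable integer dtype supports it (sketch assumes pyarrow is installed):

import pyarrow as pa
import pandas as pd

arr = pa.array([1, 2, None], type=pa.int64())
dtype = pd.Int64Dtype()
assert hasattr(dtype, '__from_arrow__')
ext_arr = dtype.__from_arrow__(arr)   # -> IntegerArray([1, 2, <NA>])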
Example #54
    def test_interval_can_hold_element_emptylist(self, dtype, element):
        arr = np.array([1, 3, 4], dtype=dtype)
        ii = IntervalIndex.from_breaks(arr)
        blk = make_block(ii._data, [1], ndim=2)

        assert blk._can_hold_element([])
Example #55
def create_block(typestr, placement, item_shape=None, num_offset=0):
    """
    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    """
    placement = BlockPlacement(placement)
    num_items = len(placement)

    if item_shape is None:
        item_shape = (N, )

    shape = (num_items, ) + item_shape

    mat = get_numeric_mat(shape)

    if typestr in ('float', 'f8', 'f4', 'f2', 'int', 'i8', 'i4', 'i2', 'i1',
                   'uint', 'u8', 'u4', 'u2', 'u1'):
        values = mat.astype(typestr) + num_offset
    elif typestr in ('complex', 'c16', 'c8'):
        values = 1.j * (mat.astype(typestr) + num_offset)
    elif typestr in ('object', 'string', 'O'):
        values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset],
                            shape)
    elif typestr in (
            'b',
            'bool',
    ):
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in ('datetime', 'dt', 'M8[ns]'):
        values = (mat * 1e9).astype('M8[ns]')
    elif typestr.startswith('M8[ns'):
        # datetime with tz
        m = re.search(r'M8\[ns,\s*(\w+\/?\w*)\]', typestr)
        assert m is not None, "incompatible typestr -> {0}".format(typestr)
        tz = m.groups()[0]
        assert num_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
    elif typestr in ('timedelta', 'td', 'm8[ns]'):
        values = (mat * 1).astype('m8[ns]')
    elif typestr in ('category', ):
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr in ('category2', ):
        values = Categorical(
            ['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'c', 'd'])
    elif typestr in ('sparse', 'sparse_na'):
        # FIXME: doesn't support num_rows != 10
        assert shape[-1] == 10
        assert all(s == 1 for s in shape[:-1])
        if typestr.endswith('_na'):
            fill_value = np.nan
        else:
            fill_value = 0.0
        values = SparseArray(
            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6],
            fill_value=fill_value)
        arr = values.sp_values.view()
        arr += (num_offset - 1)
    else:
        raise ValueError('Unsupported typestr: "%s"' % typestr)

    return make_block(values, placement=placement, ndim=len(shape))
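A few hypothetical calls showing what the helper returns, assuming the module-level N and the imports this fixture factory expects are in scope:

blk = create_block('f8', [0, 2])                  # float64 block: 2 items x N values each
tz_blk = create_block('M8[ns, US/Eastern]', [3])  # tz-aware datetime block, single item only
cat_blk = create_block('category', [0])           # Categorical values (the fixed
                                                  # 10-element data assumes N == 10)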
Example #56
    def test_period_can_hold_element_emptylist(self):
        pi = period_range("2016", periods=3, freq="A")
        blk = make_block(pi._data, [1], ndim=2)

        assert blk._can_hold_element([])
Example #57
def test_deprecated_fastpath():
    # GH#19265
    values = np.random.rand(3, 3)
    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        make_block(values, placement=np.arange(3), fastpath=True)
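Dropping the keyword gives the same block without the warning; as the test name suggests, fastpath was deprecated in GH#19265 and no longer has any effect:

make_block(values, placement=np.arange(3))   # same result, no DeprecationWarning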
Example #58
def create_block(typestr, placement, item_shape=None, num_offset=0):
    """
    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    """
    placement = BlockPlacement(placement)
    num_items = len(placement)

    if item_shape is None:
        item_shape = (N,)

    shape = (num_items,) + item_shape

    mat = get_numeric_mat(shape)

    if typestr in (
        "float",
        "f8",
        "f4",
        "f2",
        "int",
        "i8",
        "i4",
        "i2",
        "i1",
        "uint",
        "u8",
        "u4",
        "u2",
        "u1",
    ):
        values = mat.astype(typestr) + num_offset
    elif typestr in ("complex", "c16", "c8"):
        values = 1.0j * (mat.astype(typestr) + num_offset)
    elif typestr in ("object", "string", "O"):
        values = np.reshape([f"A{i:d}" for i in mat.ravel() + num_offset], shape)
    elif typestr in ("b", "bool"):
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in ("datetime", "dt", "M8[ns]"):
        values = (mat * 1e9).astype("M8[ns]")
    elif typestr.startswith("M8[ns"):
        # datetime with tz
        m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr)
        assert m is not None, f"incompatible typestr -> {typestr}"
        tz = m.groups()[0]
        assert num_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
    elif typestr in ("timedelta", "td", "m8[ns]"):
        values = (mat * 1).astype("m8[ns]")
    elif typestr in ("category",):
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr in ("category2",):
        values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"])
    elif typestr in ("sparse", "sparse_na"):
        # FIXME: doesn't support num_rows != 10
        assert shape[-1] == 10
        assert all(s == 1 for s in shape[:-1])
        if typestr.endswith("_na"):
            fill_value = np.nan
        else:
            fill_value = 0.0
        values = SparseArray(
            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6],
            fill_value=fill_value,
        )
        arr = values.sp_values.view()
        arr += num_offset - 1
    else:
        raise ValueError(f'Unsupported typestr: "{typestr}"')

    return make_block(values, placement=placement, ndim=len(shape))
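The tz-aware branch accepts strings such as "M8[ns, UTC]" or "M8[ns, US/Eastern]"; the regex only extracts the zone name. A quick standalone check of that parsing:

import re

m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", "M8[ns, US/Eastern]")
assert m is not None
assert m.groups()[0] == "US/Eastern"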
Example #59
def table_to_blockmanager(table, nthreads=1):
    import pandas.core.internals as _int
    from pyarrow.compat import DatetimeTZDtype
    import pyarrow.lib as lib

    block_table = table

    index_columns = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    if metadata is not None and b'pandas' in metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']

    for name in index_columns:
        i = schema.get_field_index(name)
        if i != -1:
            col = table.column(i)
            index_name = (None if is_unnamed_index_level(name) else name)
            values = col.to_pandas().values
            if not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(values)
            index_names.append(index_name)
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(name))

    result = lib.table_to_blocks(block_table, nthreads)

    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=False,
                                 fastpath=True)
            block = _int.make_block(cat,
                                    placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = DatetimeTZDtype('ns', tz=item['timezone'])
            block = _int.make_block(block_arr,
                                    placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype,
                                    fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    axes = [[column.name for column in block_table.itercolumns()], index]

    return _int.BlockManager(blocks, axes)
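A sketch of the round trip this helper enables, assuming the older pyarrow/pandas versions it targets (where a DataFrame can be built directly from a BlockManager):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'a': [1, 2, 3]},
                  index=pd.Index(['x', 'y', 'z'], name='k'))
table = pa.Table.from_pandas(df)    # stores index columns + dtypes in schema metadata
mgr = table_to_blockmanager(table)  # strips the index column, rebuilds blocks and index
roundtripped = pd.DataFrame(mgr)    # equivalent to the original df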
Example #60
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata
    columns_metadata = None

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table
    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(
                backwards_compatible_index_name(raw_name, logical_name))
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name))

    # Convert the Arrow table to blocks using the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'],
                                 fastpath=True)
            block = _int.make_block(cat,
                                    placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr,
                                    placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype,
                                    fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns_metadata is not None:
        columns_name_dict = dict(
            (str(x['name']), x['name']) for x in columns_metadata)
        columns_values = [
            columns_name_dict[y] if y in columns_name_dict.keys() else y
            for y in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x, )

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:

        # Get levels and labels, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Convert each level to the dtype provided in the metadata
        levels_dtypes = [(level, col_index.get('numpy_type', level.dtype))
                         for level, col_index in zip_longest(
                             levels, column_indexes, fillvalue={})]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]

        columns = pd.MultiIndex(levels=new_levels,
                                labels=labels,
                                names=columns.names)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
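For orientation, the b'pandas' metadata blob that drives all of the index and column reconstruction above is JSON attached to the Arrow schema. Roughly, for a one-column frame with a named index (the field names here are illustrative of the layout Table.from_pandas wrote at the time):

pandas_metadata = {
    'index_columns': ['__index_level_0__'],
    'column_indexes': [],
    'columns': [
        {'name': 'a', 'field_name': 'a',
         'pandas_type': 'int64', 'numpy_type': 'int64', 'metadata': None},
        {'name': 'k', 'field_name': '__index_level_0__',
         'pandas_type': 'unicode', 'numpy_type': 'object', 'metadata': None},
    ],
    'pandas_version': '0.23.0',
}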