def test_validate_ndim():
    values = np.array([1.0, 2.0])
    placement = slice(2)
    msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"

    with pytest.raises(ValueError, match=msg):
        make_block(values, placement, ndim=2)

def test_validate_ndim():
    values = np.array([1.0, 2.0])
    placement = slice(2)
    msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"

    with tm.assert_raises_regex(ValueError, msg):
        make_block(values, placement, ndim=2)

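# --- Added illustration (not part of the original tests) ---
# A minimal sketch of the happy path the two variants above guard: with a
# genuinely 2-D values array, the same call constructs a block without
# raising. Assumes numpy as np and make_block imported as in the tests.
def sketch_validate_ndim_ok():
    values = np.array([[1.0, 2.0]])   # shape (1, 2), so values.ndim == 2
    block = make_block(values, placement=slice(1), ndim=2)
    assert block.ndim == 2
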
def _reconstruct_block(item):
    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    block_arr = item['block']
    placement = item['placement']
    if 'dictionary' in item:
        cat = pd.Categorical.from_codes(block_arr,
                                        categories=item['dictionary'],
                                        ordered=item['ordered'])
        block = _int.make_block(cat, placement=placement,
                                klass=_int.CategoricalBlock)
    elif 'timezone' in item:
        dtype = _make_datetimetz(item['timezone'])
        block = _int.make_block(block_arr, placement=placement,
                                klass=_int.DatetimeTZBlock,
                                dtype=dtype)
    elif 'object' in item:
        block = _int.make_block(builtin_pickle.loads(block_arr),
                                placement=placement,
                                klass=_int.ObjectBlock)
    else:
        block = _int.make_block(block_arr, placement=placement)

    return block

def _concat_blocks(self, blocks):
    values_list = [b.values for b in blocks if b is not None]
    concat_values = com._concat_compat(values_list, axis=self.axis)

    if self.axis > 0:
        # Not safe to remove this check, need to profile
        if not _all_indexes_same([b.items for b in blocks]):
            raise Exception("dtypes are not consistent throughout "
                            "DataFrames")
        return make_block(concat_values, blocks[0].items, self.new_axes[0])
    else:
        offsets = np.r_[0, np.cumsum([len(x._data.axes[0])
                                      for x in self.objs])]
        indexer = np.concatenate([offsets[i] + b.ref_locs
                                  for i, b in enumerate(blocks)
                                  if b is not None])
        if self.ignore_index:
            concat_items = indexer
        else:
            concat_items = self.new_axes[0].take(indexer)

        if self.ignore_index:
            ref_items = self._get_fresh_axis()
            return make_block(concat_values, concat_items, ref_items)

        block = make_block(concat_values, concat_items, self.new_axes[0])

        # we need to set the ref_locs in this block so we have the mapping
        # as we now have a non-unique index across dtypes, and we need to
        # map the column location to the block location
        # GH3602
        if not self.new_axes[0].is_unique:
            block._ref_locs = indexer

        return block

def _concat_blocks(self, blocks):
    values_list = [b.values for b in blocks if b is not None]
    concat_values = com._concat_compat(values_list, axis=self.axis)

    if self.axis > 0:
        # Not safe to remove this check, need to profile
        if not _all_indexes_same([b.items for b in blocks]):
            raise Exception('dtypes are not consistent throughout '
                            'DataFrames')
        return make_block(concat_values, blocks[0].items, self.new_axes[0])
    else:
        offsets = np.r_[0, np.cumsum([len(x._data.axes[0])
                                      for x in self.objs])]
        indexer = np.concatenate([offsets[i] + b.ref_locs
                                  for i, b in enumerate(blocks)
                                  if b is not None])
        if self.ignore_index:
            concat_items = indexer
        else:
            concat_items = self.new_axes[0].take(indexer)

        if self.ignore_index:
            ref_items = self._get_fresh_axis()
            return make_block(concat_values, concat_items, ref_items)

        return make_block(concat_values, concat_items, self.new_axes[0])

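# --- Added illustration (not part of the original source) ---
# A worked sketch of the offsets/indexer arithmetic in the axis-0 branch of
# _concat_blocks above, using hypothetical frame widths: each source frame
# contributes len(x._data.axes[0]) items, and a block's ref_locs are shifted
# by the cumulative width of the frames before it.
def sketch_concat_indexer():
    import numpy as np

    frame_widths = [2, 3]                        # hypothetical item counts
    offsets = np.r_[0, np.cumsum(frame_widths)]  # array([0, 2, 5])
    block_ref_locs = [np.array([0, 1]), np.array([1, 2])]
    indexer = np.concatenate(
        [offsets[i] + locs for i, locs in enumerate(block_ref_locs)]
    )
    return indexer  # array([0, 1, 3, 4]) in the concatenated item axis
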
def _unstack_frame(obj, level):
    from pandas.core.internals import BlockManager, make_block

    if obj._is_mixed_type:
        unstacker = _Unstacker(np.empty(obj.shape, dtype=bool),  # dummy
                               obj.index, level=level,
                               value_columns=obj.columns)
        new_columns = unstacker.get_new_columns()
        new_index = unstacker.get_new_index()
        new_axes = [new_columns, new_index]

        new_blocks = []
        mask_blocks = []
        for blk in obj._data.blocks:
            bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
                                    value_columns=blk.items)
            new_items = bunstacker.get_new_columns()
            new_values, mask = bunstacker.get_new_values()

            mblk = make_block(mask.T, new_items, new_columns)
            mask_blocks.append(mblk)

            newb = make_block(new_values.T, new_items, new_columns)
            new_blocks.append(newb)

        result = DataFrame(BlockManager(new_blocks, new_axes))
        mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
        return result.ix[:, mask_frame.sum(0) > 0]
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               value_columns=obj.columns)
        return unstacker.get_result()

def test_merge(self):
    avals = randn(2, 10)
    bvals = randn(2, 10)

    ref_cols = Index(['e', 'a', 'b', 'd', 'f'])

    ablock = make_block(avals, ref_cols.get_indexer(['e', 'b']))
    bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd']))
    merged = ablock.merge(bblock)
    assert_almost_equal(merged.mgr_locs, [0, 1, 2, 3])
    assert_almost_equal(merged.values[[0, 2]], avals)
    assert_almost_equal(merged.values[[1, 3]], bvals)

def test_merge(self):
    avals = randn(2, 10)
    bvals = randn(2, 10)

    ref_cols = Index(['e', 'a', 'b', 'd', 'f'])

    ablock = make_block(avals, ref_cols.get_indexer(['e', 'b']))
    bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd']))
    merged = ablock.merge(bblock)
    tm.assert_numpy_array_equal(merged.mgr_locs.as_array,
                                np.array([0, 1, 2, 3], dtype=np.int64))
    tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals))
    tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals))

def _concat_blocks(self, blocks):
    concat_values = np.concatenate([b.values for b in blocks],
                                   axis=self.axis)
    if self.axis > 0:
        # Not safe to remove this check, need to profile
        if not _all_indexes_same([b.items for b in blocks]):
            raise Exception('dtypes are not consistent throughout '
                            'DataFrames')
        return make_block(concat_values, blocks[0].items, self.new_axes[0])
    else:
        concat_items = _concat_indexes([b.items for b in blocks])
        # TODO: maybe want to "take" from the new columns?
        return make_block(concat_values, concat_items, self.new_axes[0])

def _merge_blocks(self, merge_chunks):
    """
    merge_chunks -> [(_JoinUnit, Block)]
    """
    funit, fblock = merge_chunks[0]
    fidx = funit.indexer

    out_shape = list(fblock.values.shape)

    n = len(fidx) if fidx is not None else out_shape[self.axis]

    out_shape[0] = sum(len(blk) for unit, blk in merge_chunks)
    out_shape[self.axis] = n

    # Should use Fortran order??
    out = np.empty(out_shape, dtype=fblock.values.dtype)

    sofar = 0
    for unit, blk in merge_chunks:
        out_chunk = out[sofar:sofar + len(blk)]

        if unit.indexer is None:
            # is this really faster than assigning to arr.flat?
            com.take_fast(blk.values, np.arange(n, dtype=np.int64),
                          None, False, axis=self.axis, out=out_chunk)
        else:
            # write out the values to the result array
            com.take_fast(blk.values, unit.indexer,
                          None, False, axis=self.axis, out=out_chunk)

        sofar += len(blk)

    # does not sort
    new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
    return make_block(out, new_block_items, self.result_items)

def _upcast_blocks(blocks):
    """
    Upcast and consolidate if necessary
    """
    new_blocks = []
    for block in blocks:
        if isinstance(block, IntBlock):
            newb = make_block(block.values.astype(float), block.items,
                              block.ref_items, placement=block._ref_locs)
        elif isinstance(block, BoolBlock):
            newb = make_block(block.values.astype(object), block.items,
                              block.ref_items, placement=block._ref_locs)
        else:
            newb = block
        new_blocks.append(newb)

    # use any ref_items
    return _consolidate(new_blocks, newb.ref_items)

def _merge_blocks(self, merge_chunks):
    """
    merge_chunks -> [(_JoinUnit, Block)]
    """
    funit, fblock = merge_chunks[0]
    fidx = funit.indexer

    out_shape = list(fblock.values.shape)

    n = len(fidx) if fidx is not None else out_shape[self.axis]

    out_shape[0] = sum(len(blk) for unit, blk in merge_chunks)
    out_shape[self.axis] = n

    # Should use Fortran order??
    block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
    out = np.empty(out_shape, dtype=block_dtype)

    sofar = 0
    for unit, blk in merge_chunks:
        out_chunk = out[sofar:sofar + len(blk)]
        com.take_nd(blk.values, unit.indexer, self.axis, out=out_chunk)
        sofar += len(blk)

    # does not sort
    new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
    return make_block(out, new_block_items, self.result_items)

def block2d_to_block3d(values, items, shape, major_labels, minor_labels,
                       ref_items=None):
    """
    Developer method for pivoting DataFrame -> Panel. Used in HDFStore and
    DataFrame.to_panel
    """
    from pandas.core.internals import make_block
    panel_shape = (len(items),) + shape

    # TODO: lexsort depth needs to be 2!!

    # Create observation selection vector using major and minor
    # labels, for converting to panel format.
    selector = minor_labels + shape[1] * major_labels
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)

    pvalues = np.empty(panel_shape, dtype=values.dtype)
    if not issubclass(pvalues.dtype.type, (np.integer, np.bool_)):
        pvalues.fill(np.nan)
    elif not mask.all():
        pvalues = com._maybe_upcast(pvalues)
        pvalues.fill(np.nan)

    values = values
    for i in xrange(len(items)):
        pvalues[i].flat[mask] = values[:, i]

    if ref_items is None:
        ref_items = items

    return make_block(pvalues, items, ref_items)

def block2d_to_blocknd(values, items, shape, labels, ref_items=None):
    """ pivot to the labels shape """
    from pandas.core.internals import make_block
    panel_shape = (len(items),) + shape

    # TODO: lexsort depth needs to be 2!!

    # Create observation selection vector using major and minor
    # labels, for converting to panel format.
    selector = factor_indexer(shape[1:], labels)
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)

    if mask.all():
        pvalues = np.empty(panel_shape, dtype=values.dtype)
    else:
        dtype, fill_value = _maybe_promote(values.dtype)
        pvalues = np.empty(panel_shape, dtype=dtype)
        pvalues.fill(fill_value)

    values = values
    for i in xrange(len(items)):
        pvalues[i].flat[mask] = values[:, i]

    if ref_items is None:
        ref_items = items

    return make_block(pvalues, items, ref_items)

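# --- Added illustration (not part of the original source) ---
# A tiny numeric sketch of the selector/mask trick shared by the two pivot
# helpers above: each observed (major, minor) label pair maps to flat position
# major * shape[1] + minor, and the mask records which panel cells were
# actually observed. All inputs are hypothetical.
def sketch_pivot_selector():
    import numpy as np

    shape = (2, 3)                     # (n_major, n_minor)
    major_labels = np.array([0, 0, 1])
    minor_labels = np.array([0, 2, 1])
    selector = minor_labels + shape[1] * major_labels  # array([0, 2, 4])
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)
    return mask.reshape(shape)         # True at the observed cells
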
def create_block(b):
    dtype = dtype_for(b["dtype"])
    return make_block(
        unconvert(b["values"], dtype, b["compress"]).reshape(b["shape"]),
        b["items"],
        axes[0],
        klass=getattr(internals, b["klass"]),
    )

def test_split(self):
    # GH#37799
    values = np.random.randn(3, 4)
    blk = make_block(values, placement=[3, 1, 6], ndim=2)
    result = blk._split()

    # check that we get views, not copies
    values[:] = -9999
    assert (blk.values == -9999).all()

    assert len(result) == 3
    expected = [
        make_block(values[[0]], placement=[3], ndim=2),
        make_block(values[[1]], placement=[1], ndim=2),
        make_block(values[[2]], placement=[6], ndim=2),
    ]
    for res, exp in zip(result, expected):
        assert_block_equal(res, exp)

def test_make_block_no_pandas_array():
    # https://github.com/pandas-dev/pandas/pull/24866
    arr = pd.array([1, 2])

    # PandasArray, no dtype
    result = make_block(arr, slice(len(arr)))
    assert result.is_integer is True
    assert result.is_extension is False

    # PandasArray, PandasDtype
    result = make_block(arr, slice(len(arr)), dtype=arr.dtype)
    assert result.is_integer is True
    assert result.is_extension is False

    # ndarray, PandasDtype
    result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype)
    assert result.is_integer is True
    assert result.is_extension is False

def _upcast_blocks(blocks):
    """
    Upcast and consolidate if necessary
    """
    new_blocks = []
    for block in blocks:
        if isinstance(block, IntBlock):
            newb = make_block(block.values.astype(float), block.items,
                              block.ref_items)
        elif isinstance(block, BoolBlock):
            newb = make_block(block.values.astype(object), block.items,
                              block.ref_items)
        else:
            newb = block
        new_blocks.append(newb)

    # use any ref_items
    return _consolidate(new_blocks, newb.ref_items)

def test_get(self):
    cols = Index(list("abc"))
    values = np.random.rand(3, 3)
    block = make_block(values=values.copy(), placement=np.arange(3))
    mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])

    tm.assert_almost_equal(mgr.get("a").internal_values(), values[0])
    tm.assert_almost_equal(mgr.get("b").internal_values(), values[1])
    tm.assert_almost_equal(mgr.get("c").internal_values(), values[2])

def test_make_block_no_pandas_array():
    # https://github.com/pandas-dev/pandas/pull/24866
    arr = pd.arrays.PandasArray(np.array([1, 2]))

    # PandasArray, no dtype
    result = make_block(arr, slice(len(arr)))
    assert result.is_integer is True
    assert result.is_extension is False

    # PandasArray, PandasDtype
    result = make_block(arr, slice(len(arr)), dtype=arr.dtype)
    assert result.is_integer is True
    assert result.is_extension is False

    # ndarray, PandasDtype
    result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype)
    assert result.is_integer is True
    assert result.is_extension is False

def _concat_blocks(self, blocks):
    values_list = [b.get_values() for b in blocks if b is not None]
    concat_values = com._concat_compat(values_list, axis=self.axis)

    if self.axis > 0:
        # Not safe to remove this check, need to profile
        if not _all_indexes_same([b.items for b in blocks]):
            # TODO: Either profile this piece or remove.
            # FIXME: Need to figure out how to test whether this line exists
            # or does not...(unclear if even possible or maybe would require
            # performance test)
            raise PandasError('dtypes are not consistent throughout '
                              'DataFrames')
        return make_block(concat_values, blocks[0].items, self.new_axes[0],
                          placement=blocks[0]._ref_locs)
    else:
        offsets = np.r_[0, np.cumsum([len(x._data.axes[0])
                                      for x in self.objs])]
        indexer = np.concatenate([offsets[i] + b.ref_locs
                                  for i, b in enumerate(blocks)
                                  if b is not None])
        if self.ignore_index:
            concat_items = indexer
        else:
            concat_items = self.new_axes[0].take(indexer)

        if self.ignore_index:
            ref_items = self._get_fresh_axis()
            return make_block(concat_values, concat_items, ref_items)

        block = make_block(concat_values, concat_items, self.new_axes[0])

        # we need to set the ref_locs in this block so we have the mapping
        # as we now have a non-unique index across dtypes, and we need to
        # map the column location to the block location
        # GH3602
        if not self.new_axes[0].is_unique:
            block.set_ref_locs(indexer)

        return block

def _cython_agg_general(self, how):
    obj = self._obj_with_exclusions
    if self.axis == 1:
        obj = obj.T

    new_blocks = []

    for block in obj._data.blocks:
        values = block.values.T
        if not issubclass(values.dtype.type, (np.number, np.bool_)):
            continue

        values = com._ensure_float64(values)
        result, counts = self.grouper.aggregate(values, how)

        mask = counts > 0
        if len(mask) > 0:
            result = result[mask]
        newb = make_block(result.T, block.items, block.ref_items)
        new_blocks.append(newb)

    if len(new_blocks) == 0:
        raise GroupByError('No numeric types to aggregate')

    agg_axis = 0 if self.axis == 1 else 1
    agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

    if sum(len(x.items) for x in new_blocks) == len(agg_labels):
        output_keys = agg_labels
    else:
        all_items = []
        for b in new_blocks:
            all_items.extend(b.items)
        output_keys = agg_labels[agg_labels.isin(all_items)]

    if not self.as_index:
        index = np.arange(new_blocks[0].values.shape[1])
        mgr = BlockManager(new_blocks, [output_keys, index])
        result = DataFrame(mgr)

        group_levels = self.grouper.get_group_levels()
        zipped = zip(self.grouper.names, group_levels)

        for i, (name, labels) in enumerate(zipped):
            result.insert(i, name, labels)

        result = result.consolidate()
    else:
        index = self.grouper.result_index
        mgr = BlockManager(new_blocks, [output_keys, index])
        result = DataFrame(mgr)

    if self.axis == 1:
        result = result.T

    return result

def _unstack_frame(obj, level, fill_value=None):
    from pandas.core.internals import BlockManager, make_block

    if obj._is_mixed_type:
        unstacker = _Unstacker(
            np.empty(obj.shape, dtype=bool),  # dummy
            obj.index, level=level, value_columns=obj.columns)
        new_columns = unstacker.get_new_columns()
        new_index = unstacker.get_new_index()
        new_axes = [new_columns, new_index]

        new_blocks = []
        mask_blocks = []
        for blk in obj._data.blocks:
            blk_items = obj._data.items[blk.mgr_locs.indexer]
            bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
                                    value_columns=blk_items,
                                    fill_value=fill_value)
            new_items = bunstacker.get_new_columns()
            new_placement = new_columns.get_indexer(new_items)
            new_values, mask = bunstacker.get_new_values()

            mblk = make_block(mask.T, placement=new_placement)
            mask_blocks.append(mblk)

            newb = make_block(new_values.T, placement=new_placement)
            new_blocks.append(newb)

        result = DataFrame(BlockManager(new_blocks, new_axes))
        mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
        return result.loc[:, mask_frame.sum(0) > 0]
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               value_columns=obj.columns,
                               fill_value=fill_value)
        return unstacker.get_result()

def create_block(b):
    values = unconvert(b['values'], dtype_for(b['dtype']),
                       b['compress']).reshape(b['shape'])

    # locs handles duplicate column names, and should be used instead
    # of items; see GH 9618
    if 'locs' in b:
        placement = b['locs']
    else:
        placement = axes[0].get_indexer(b['items'])
    return make_block(values=values,
                      klass=getattr(internals, b['klass']),
                      placement=placement)

def test_get(self):
    cols = Index(list('abc'))
    values = np.random.rand(3, 3)
    block = make_block(values=values.copy(), placement=np.arange(3))
    mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])

    assert_almost_equal(mgr.get('a', fastpath=False), values[0])
    assert_almost_equal(mgr.get('b', fastpath=False), values[1])
    assert_almost_equal(mgr.get('c', fastpath=False), values[2])
    assert_almost_equal(mgr.get('a').internal_values(), values[0])
    assert_almost_equal(mgr.get('b').internal_values(), values[1])
    assert_almost_equal(mgr.get('c').internal_values(), values[2])

def create_block(b):
    values = _safe_reshape(unconvert(b[u"values"], dtype_for(b[u"dtype"]),
                                     b[u"compress"]), b[u"shape"])

    # locs handles duplicate column names, and should be used instead
    # of items; see GH 9618
    if u"locs" in b:
        placement = b[u"locs"]
    else:
        placement = axes[0].get_indexer(b[u"items"])
    return make_block(
        values=values,
        klass=getattr(internals, b[u"klass"]),
        placement=placement,
        dtype=b[u"dtype"],
    )

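# --- Added illustration (not part of the original source) ---
# Why the unpackers above prefer the serialized 'locs' over label lookup
# (GH 9618): with duplicate column names, Index.get_indexer cannot map labels
# back to unique positions, so pandas refuses to reindex. A hedged standalone
# demonstration:
def sketch_why_locs_over_items():
    import pandas as pd

    idx = pd.Index(['a', 'a', 'b'])    # duplicate labels
    try:
        idx.get_indexer(['a', 'b'])    # ambiguous for the duplicated 'a'
    except Exception as exc:           # pandas raises for non-unique indexes
        return type(exc).__name__
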
def test_make_block_no_pandas_array():
    # https://github.com/pandas-dev/pandas/pull/24866
    arr = pd.arrays.PandasArray(np.array([1, 2]))

    # PandasArray, no dtype
    result = make_block(arr, slice(len(arr)), ndim=arr.ndim)
    assert result.dtype.kind in ["i", "u"]
    assert result.is_extension is False

    # PandasArray, PandasDtype
    result = make_block(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim)
    assert result.dtype.kind in ["i", "u"]
    assert result.is_extension is False

    # ndarray, PandasDtype
    result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype,
                        ndim=arr.ndim)
    assert result.dtype.kind in ["i", "u"]
    assert result.is_extension is False

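# --- Added illustration (not part of the original tests) ---
# A condensed sketch of what all three variants above assert: a numpy-backed
# PandasArray is unboxed into a plain ndarray-backed block rather than an
# ExtensionBlock. Mirrors the calls of the newest variant; assumes the same
# imports as the tests.
def sketch_pandas_array_is_unboxed():
    arr = pd.arrays.PandasArray(np.array([1, 2]))
    blk = make_block(arr, slice(len(arr)), ndim=arr.ndim)
    assert blk.is_extension is False
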
def _concat_blocks(self, blocks):
    concat_values = np.concatenate([b.values for b in blocks],
                                   axis=self.axis)
    if self.axis > 0:
        # Not safe to remove this check, need to profile
        if not _all_indexes_same([b.items for b in blocks]):
            raise Exception('dtypes are not consistent throughout '
                            'DataFrames')
        return make_block(concat_values, blocks[0].items, self.new_axes[0])
    else:
        all_items = [b.items for b in blocks]

        if self.axis == 0 and self.keys is not None:
            offsets = np.r_[0, np.cumsum([len(x._data.axes[self.axis])
                                          for x in self.objs])]
            indexer = np.concatenate([offsets[i] + b.ref_locs
                                      for i, b in enumerate(blocks)])
            concat_items = self.new_axes[0].take(indexer)
        else:
            concat_items = _concat_indexes(all_items)

        return make_block(concat_values, concat_items, self.new_axes[0])

def block_from_header_bytes(header, bytes):
    placement, dtype, shape, (extension_type, extension_values) = header

    values = pnp.deserialize(pnp.decompress(bytes, dtype), dtype,
                             copy=True).reshape(shape)
    if extension_type == 'categorical_type':
        values = pd.Categorical.from_codes(values, extension_values[1],
                                           ordered=extension_values[0])
    elif extension_type == 'datetime64_tz_type':
        tz_info = extension_values[0]
        values = pd.DatetimeIndex(values).tz_localize('utc').tz_convert(
            tz_info)
    return make_block(values, placement=placement)

def _merge_blocks(self, lblk, rblk):
    lidx = self.lindexer
    ridx = self.rindexer

    n = lblk.values.shape[self.axis] if lidx is None else len(lidx)
    lk = len(lblk.items)
    rk = len(rblk.items)

    out_shape = list(lblk.shape)
    out_shape[0] = lk + rk
    out_shape[self.axis] = n

    out = np.empty(out_shape, dtype=lblk.values.dtype)

    # is this really faster than assigning to arr.flat?
    if lidx is None:
        # out[:lk] = lblk.values
        com.take_fast(lblk.values, np.arange(n, dtype='i4'),
                      None, False, axis=self.axis, out=out[:lk])
    else:
        # write out the values to the result array
        com.take_fast(lblk.values, lidx, None, False,
                      axis=self.axis, out=out[:lk])

    if ridx is None:
        # out[lk:] = lblk.values
        com.take_fast(rblk.values, np.arange(n, dtype='i4'),
                      None, False, axis=self.axis, out=out[lk:])
    else:
        com.take_fast(rblk.values, ridx, None, False,
                      axis=self.axis, out=out[lk:])

    # does not sort
    new_items = lblk.items.append(rblk.items)
    return make_block(out, new_items, self.result_items)

def _reconstruct_block(item):
    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    block_arr = item['block']
    placement = item['placement']
    if 'dictionary' in item:
        cat = pd.Categorical.from_codes(block_arr,
                                        categories=item['dictionary'],
                                        ordered=item['ordered'])
        block = _int.make_block(cat, placement=placement,
                                klass=_int.CategoricalBlock,
                                fastpath=True)
    elif 'timezone' in item:
        dtype = _make_datetimetz(item['timezone'])
        block = _int.make_block(block_arr, placement=placement,
                                klass=_int.DatetimeTZBlock,
                                dtype=dtype, fastpath=True)
    else:
        block = _int.make_block(block_arr, placement=placement)

    return block

def _concat_blocks(self, blocks):
    values_list = [b.values for b in blocks if b is not None]
    concat_values = com._concat_compat(values_list, axis=self.axis)

    if self.axis > 0:
        # Not safe to remove this check, need to profile
        if not _all_indexes_same([b.items for b in blocks]):
            raise Exception('dtypes are not consistent throughout '
                            'DataFrames')
        return make_block(concat_values, blocks[0].items, self.new_axes[0])
    else:
        offsets = np.r_[0, np.cumsum([len(x._data.axes[0])
                                      for x in self.objs])]
        indexer = np.concatenate([offsets[i] + b.ref_locs
                                  for i, b in enumerate(blocks)
                                  if b is not None])
        if self.ignore_index:
            concat_items = indexer
        else:
            concat_items = self.new_axes[0].take(indexer)

        if self.ignore_index:
            ref_items = self._get_fresh_axis()
            return make_block(concat_values, concat_items, ref_items)

        block = make_block(concat_values, concat_items, self.new_axes[0])

        # we need to set the ref_locs in this block so we have the mapping
        # as we now have a non-unique index across dtypes, and we need to
        # map the column location to the block location
        # GH3602
        if not self.new_axes[0].is_unique:
            block._ref_locs = indexer

        return block

def _unstack_frame(obj, level, fill_value=None):
    from pandas.core.internals import BlockManager, make_block

    if obj._is_mixed_type:
        unstacker = _Unstacker(np.empty(obj.shape, dtype=bool),  # dummy
                               obj.index, level=level,
                               value_columns=obj.columns)
        new_columns = unstacker.get_new_columns()
        new_index = unstacker.get_new_index()
        new_axes = [new_columns, new_index]

        new_blocks = []
        mask_blocks = []
        for blk in obj._data.blocks:
            blk_items = obj._data.items[blk.mgr_locs.indexer]
            bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
                                    value_columns=blk_items,
                                    fill_value=fill_value)
            new_items = bunstacker.get_new_columns()
            new_placement = new_columns.get_indexer(new_items)
            new_values, mask = bunstacker.get_new_values()

            mblk = make_block(mask.T, placement=new_placement)
            mask_blocks.append(mblk)

            newb = make_block(new_values.T, placement=new_placement)
            new_blocks.append(newb)

        result = obj._constructor(BlockManager(new_blocks, new_axes))
        mask_frame = obj._constructor(BlockManager(mask_blocks, new_axes))
        return result.ix[:, mask_frame.sum(0) > 0]
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               value_columns=obj.columns,
                               fill_value=fill_value)
        return unstacker.get_result()

def _read_block_manager(self, group):
    ndim = group._v_attrs.ndim

    axes = []
    for i in xrange(ndim):
        ax = self._read_index(group, 'axis%d' % i)
        axes.append(ax)

    items = axes[0]
    blocks = []
    for i in range(group._v_attrs.nblocks):
        blk_items = self._read_index(group, 'block%d_items' % i)
        values = _read_array(group, 'block%d_values' % i)
        blk = make_block(values, blk_items, items)
        blocks.append(blk)

    return BlockManager(blocks, axes)

def _merge_blocks(self, merge_chunks):
    """
    merge_chunks -> [(_JoinUnit, Block)]
    """
    funit, fblock = merge_chunks[0]
    fidx = funit.indexer

    out_shape = list(fblock.get_values().shape)

    n = len(fidx) if fidx is not None else out_shape[self.axis]

    out_shape[0] = sum(blk.get_merge_length() for unit, blk in merge_chunks)
    out_shape[self.axis] = n

    # Should use Fortran order??
    block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
    out = np.empty(out_shape, dtype=block_dtype)

    sofar = 0
    for unit, blk in merge_chunks:
        out_chunk = out[sofar:sofar + len(blk)]
        com.take_nd(blk.get_values(), unit.indexer, self.axis,
                    out=out_chunk)
        sofar += len(blk)

    # does not sort
    new_block_items = _concat_indexes([b.items for _, b in merge_chunks])

    # need to set placement if we have a non-unique result
    # calculate by the existing placement plus the offset in the result set
    placement = None
    if not self.result_items.is_unique:
        nchunks = len(merge_chunks)
        offsets = np.array([0] + [len(self.result_items) / nchunks] *
                           (nchunks - 1)).cumsum()
        placement = []
        for (unit, blk), offset in zip(merge_chunks, offsets):
            placement.extend(blk.ref_locs + offset)

    return make_block(out, new_block_items, self.result_items,
                      placement=placement)

def _merge_blocks(self, merge_chunks):
    """
    merge_chunks -> [(_JoinUnit, Block)]
    """
    funit, fblock = merge_chunks[0]
    fidx = funit.indexer

    out_shape = list(fblock.values.shape)

    n = len(fidx) if fidx is not None else out_shape[self.axis]

    out_shape[0] = sum(len(blk) for unit, blk in merge_chunks)
    out_shape[self.axis] = n

    # Should use Fortran order??
    block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
    out = np.empty(out_shape, dtype=block_dtype)

    sofar = 0
    for unit, blk in merge_chunks:
        out_chunk = out[sofar:sofar + len(blk)]

        if unit.indexer is None:
            # is this really faster than assigning to arr.flat?
            com.take_fast(blk.values, np.arange(n, dtype=np.int64),
                          None, False, axis=self.axis, out=out_chunk)
        else:
            # write out the values to the result array
            com.take_fast(blk.values, unit.indexer,
                          None, False, axis=self.axis, out=out_chunk)

        sofar += len(blk)

    # does not sort
    new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
    return make_block(out, new_block_items, self.result_items)

def _read_block_manager(self, group):
    from pandas.core.internals import BlockManager, make_block

    ndim = group._v_attrs.ndim
    nblocks = group._v_attrs.nblocks

    axes = []
    for i in xrange(ndim):
        ax = self._read_index(group, "axis%d" % i)
        axes.append(ax)

    items = axes[0]
    blocks = []
    for i in range(group._v_attrs.nblocks):
        blk_items = self._read_index(group, "block%d_items" % i)
        values = _read_array(group, "block%d_values" % i)
        blk = make_block(values, blk_items, items)
        blocks.append(blk)

    return BlockManager(blocks, axes)

def create_block(b):
    values = _safe_reshape(
        unconvert(b['values'], dtype_for(b['dtype']),
                  b['compress']), b['shape'])

    # locs handles duplicate column names, and should be used instead
    # of items; see GH 9618
    if 'locs' in b:
        placement = b['locs']
    else:
        placement = axes[0].get_indexer(b['items'])

    if is_datetime64tz_dtype(b['dtype']):
        assert isinstance(values, np.ndarray), type(values)
        assert values.dtype == 'M8[ns]', values.dtype
        values = DatetimeArray(values, dtype=b['dtype'])

    return make_block(values=values,
                      klass=getattr(internals, b['klass']),
                      placement=placement,
                      dtype=b['dtype'])

def _init_matrix(self, data, axes, dtype=None, copy=False):
    values = _prep_ndarray(data, copy=copy)

    if dtype is not None:
        try:
            values = values.astype(dtype)
        except Exception:
            raise ValueError('failed to cast to %s' % dtype)

    shape = values.shape
    fixed_axes = []
    for i, ax in enumerate(axes):
        if ax is None:
            ax = _default_index(shape[i])
        else:
            ax = _ensure_index(ax)
        fixed_axes.append(ax)

    items = fixed_axes[0]
    block = make_block(values, items, items)
    return BlockManager([block], fixed_axes)

def create_block(b):
    values = _safe_reshape(unconvert(
        b[u'values'], dtype_for(b[u'dtype']),
        b[u'compress']), b[u'shape'])

    # locs handles duplicate column names, and should be used instead
    # of items; see GH 9618
    if u'locs' in b:
        placement = b[u'locs']
    else:
        placement = axes[0].get_indexer(b[u'items'])

    if is_datetime64tz_dtype(b[u'dtype']):
        assert isinstance(values, np.ndarray), type(values)
        assert values.dtype == 'M8[ns]', values.dtype
        values = DatetimeArray(values, dtype=b[u'dtype'])

    return make_block(values=values,
                      klass=getattr(internals, b[u'klass']),
                      placement=placement,
                      dtype=b[u'dtype'])

def create_block(b):
    values = _safe_reshape(
        unconvert(b[u"values"], dtype_for(b[u"dtype"]), b[u"compress"]),
        b[u"shape"],
    )

    # locs handles duplicate column names, and should be used instead
    # of items; see GH 9618
    if u"locs" in b:
        placement = b[u"locs"]
    else:
        placement = axes[0].get_indexer(b[u"items"])

    klass = getattr(internals, b[u"klass"])
    if klass == DatetimeTZBlock:
        raise ValueError("Lost the ability to parse datetime with timezone. Sorry")

    return make_block(
        values=values.copy(),
        klass=getattr(internals, b[u"klass"]),
        placement=placement,
        dtype=b[u"dtype"],
    )

def _merge_blocks(self, merge_chunks):
    """
    merge_chunks -> [(_JoinUnit, Block)]
    """
    funit, fblock = merge_chunks[0]
    fidx = funit.indexer

    out_shape = list(fblock.get_values().shape)

    n = len(fidx) if fidx is not None else out_shape[self.axis]

    merge_lengths = list(blk.get_merge_length()
                         for unit, blk in merge_chunks)
    out_shape[0] = sum(merge_lengths)
    out_shape[self.axis] = n

    # Should use Fortran order??
    block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
    out = np.empty(out_shape, dtype=block_dtype)

    sofar = 0
    for unit, blk in merge_chunks:
        out_chunk = out[sofar:sofar + len(blk)]
        com.take_nd(blk.get_values(), unit.indexer, self.axis,
                    out=out_chunk)
        sofar += len(blk)

    # does not sort
    new_block_items = _concat_indexes([b.items for _, b in merge_chunks])

    # need to set placement if we have a non-unique result
    # calculate by the existing placement plus the offset in the result set
    placement = None
    if not self.result_items.is_unique:
        placement = []
        offsets = np.append(np.array([0]), self.offsets.cumsum()[:-1])
        for (unit, blk), offset in zip(merge_chunks, offsets):
            placement.extend(blk.ref_locs + offset)

    return make_block(out, new_block_items, self.result_items,
                      placement=placement)

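# --- Added illustration (not part of the original source) ---
# The placement arithmetic from the non-unique branch above, worked with
# hypothetical numbers: chunk offsets are the shifted cumulative sum of chunk
# widths, and each block's ref_locs are translated by its chunk's offset.
def sketch_placement_offsets():
    import numpy as np

    chunk_widths = np.array([2, 3])
    offsets = np.append(np.array([0]), chunk_widths.cumsum()[:-1])  # [0, 2]
    ref_locs = [np.array([0, 1]), np.array([0, 1, 2])]
    placement = []
    for locs, offset in zip(ref_locs, offsets):
        placement.extend(locs + offset)
    return placement  # [0, 1, 2, 3, 4]
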
def _reconstruct_block(item, columns=None, extension_columns=None):
    """
    Construct a pandas Block from the `item` dictionary coming from pyarrow's
    serialization or returned by arrow::python::ConvertTableToPandas.

    This function takes care of converting dictionary types to pandas
    categorical, Timestamp-with-timezones to the proper pandas Block, and
    conversion to pandas ExtensionBlock

    Parameters
    ----------
    item : dict
        For basic types, this is a dictionary in the form of
        {'block': np.ndarray of values, 'placement': pandas block placement}.
        Additional keys are present for other types (dictionary, timezone,
        object).
    columns :
        Column names of the table being constructed, used for extension types
    extension_columns : dict
        Dictionary of {column_name: pandas_dtype} that includes all columns
        and corresponding dtypes that will be converted to a pandas
        ExtensionBlock.

    Returns
    -------
    pandas Block
    """
    import pandas.core.internals as _int

    block_arr = item.get('block', None)
    placement = item['placement']
    if 'dictionary' in item:
        cat = _pandas_api.categorical_type.from_codes(
            block_arr, categories=item['dictionary'],
            ordered=item['ordered'])
        block = _int.make_block(cat, placement=placement,
                                klass=_int.CategoricalBlock)
    elif 'timezone' in item:
        dtype = make_datetimetz(item['timezone'])
        block = _int.make_block(block_arr, placement=placement,
                                klass=_int.DatetimeTZBlock,
                                dtype=dtype)
    elif 'object' in item:
        block = _int.make_block(builtin_pickle.loads(block_arr),
                                placement=placement,
                                klass=_int.ObjectBlock)
    elif 'py_array' in item:
        # create ExtensionBlock
        arr = item['py_array']
        assert len(placement) == 1
        name = columns[placement[0]]
        pandas_dtype = extension_columns[name]
        if not hasattr(pandas_dtype, '__from_arrow__'):
            raise ValueError("This column does not support to be converted "
                             "to a pandas ExtensionArray")
        pd_ext_arr = pandas_dtype.__from_arrow__(arr)
        block = _int.make_block(pd_ext_arr, placement=placement,
                                klass=_int.ExtensionBlock)
    else:
        block = _int.make_block(block_arr, placement=placement)

    return block

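# --- Added illustration (not part of the original source) ---
# A hypothetical payload of the basic kind documented above; real dicts come
# from arrow::python::ConvertTableToPandas, so the values here are
# illustrative only. A dictionary-encoded column would instead carry
# 'dictionary' and 'ordered' keys alongside integer codes in 'block'.
def sketch_reconstruct_numeric_block():
    import numpy as np

    item = {'block': np.array([[1.0, 2.0, 3.0]]), 'placement': [0]}
    return _reconstruct_block(item)
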
def test_interval_can_hold_element_emptylist(self, dtype, element):
    arr = np.array([1, 3, 4], dtype=dtype)
    ii = IntervalIndex.from_breaks(arr)
    blk = make_block(ii._data, [1], ndim=2)

    assert blk._can_hold_element([])

def create_block(typestr, placement, item_shape=None, num_offset=0):
    """
    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    """
    placement = BlockPlacement(placement)
    num_items = len(placement)

    if item_shape is None:
        item_shape = (N, )

    shape = (num_items, ) + item_shape

    mat = get_numeric_mat(shape)

    if typestr in ('float', 'f8', 'f4', 'f2',
                   'int', 'i8', 'i4', 'i2', 'i1',
                   'uint', 'u8', 'u4', 'u2', 'u1'):
        values = mat.astype(typestr) + num_offset
    elif typestr in ('complex', 'c16', 'c8'):
        values = 1.j * (mat.astype(typestr) + num_offset)
    elif typestr in ('object', 'string', 'O'):
        values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset],
                            shape)
    elif typestr in ('b', 'bool', ):
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in ('datetime', 'dt', 'M8[ns]'):
        values = (mat * 1e9).astype('M8[ns]')
    elif typestr.startswith('M8[ns'):
        # datetime with tz
        m = re.search(r'M8\[ns,\s*(\w+\/?\w*)\]', typestr)
        assert m is not None, "incompatible typestr -> {0}".format(typestr)
        tz = m.groups()[0]
        assert num_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
    elif typestr in ('timedelta', 'td', 'm8[ns]'):
        values = (mat * 1).astype('m8[ns]')
    elif typestr in ('category', ):
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr in ('category2', ):
        values = Categorical(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'c',
                              'd'])
    elif typestr in ('sparse', 'sparse_na'):
        # FIXME: doesn't support num_rows != 10
        assert shape[-1] == 10
        assert all(s == 1 for s in shape[:-1])
        if typestr.endswith('_na'):
            fill_value = np.nan
        else:
            fill_value = 0.0
        values = SparseArray([fill_value, fill_value, 1, 2, 3, fill_value,
                              4, 5, fill_value, 6], fill_value=fill_value)
        arr = values.sp_values.view()
        arr += (num_offset - 1)
    else:
        raise ValueError('Unsupported typestr: "%s"' % typestr)

    return make_block(values, placement=placement, ndim=len(shape))

def test_period_can_hold_element_emptylist(self):
    pi = period_range("2016", periods=3, freq="A")
    blk = make_block(pi._data, [1], ndim=2)

    assert blk._can_hold_element([])

def test_deprecated_fastpath():
    # GH#19265
    values = np.random.rand(3, 3)
    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        make_block(values, placement=np.arange(3), fastpath=True)

def create_block(typestr, placement, item_shape=None, num_offset=0):
    """
    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    """
    placement = BlockPlacement(placement)
    num_items = len(placement)

    if item_shape is None:
        item_shape = (N,)

    shape = (num_items,) + item_shape

    mat = get_numeric_mat(shape)

    if typestr in (
        "float", "f8", "f4", "f2",
        "int", "i8", "i4", "i2", "i1",
        "uint", "u8", "u4", "u2", "u1",
    ):
        values = mat.astype(typestr) + num_offset
    elif typestr in ("complex", "c16", "c8"):
        values = 1.0j * (mat.astype(typestr) + num_offset)
    elif typestr in ("object", "string", "O"):
        values = np.reshape([f"A{i:d}" for i in mat.ravel() + num_offset],
                            shape)
    elif typestr in ("b", "bool"):
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in ("datetime", "dt", "M8[ns]"):
        values = (mat * 1e9).astype("M8[ns]")
    elif typestr.startswith("M8[ns"):
        # datetime with tz
        m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr)
        assert m is not None, f"incompatible typestr -> {typestr}"
        tz = m.groups()[0]
        assert num_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
    elif typestr in ("timedelta", "td", "m8[ns]"):
        values = (mat * 1).astype("m8[ns]")
    elif typestr in ("category",):
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr in ("category2",):
        values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c",
                              "d"])
    elif typestr in ("sparse", "sparse_na"):
        # FIXME: doesn't support num_rows != 10
        assert shape[-1] == 10
        assert all(s == 1 for s in shape[:-1])
        if typestr.endswith("_na"):
            fill_value = np.nan
        else:
            fill_value = 0.0
        values = SparseArray(
            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5,
             fill_value, 6],
            fill_value=fill_value,
        )
        arr = values.sp_values.view()
        arr += num_offset - 1
    else:
        raise ValueError(f'Unsupported typestr: "{typestr}"')

    return make_block(values, placement=placement, ndim=len(shape))

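# --- Added usage sketch (not part of the original fixture module) ---
# How the factory above is typically driven; relies on the module-level N row
# count that create_block assumes. Both calls are illustrative, not copied
# from a real test.
def sketch_create_block_usage():
    blk = create_block("f8", [0, 2])                  # float64 on columns 0, 2
    tz_blk = create_block("M8[ns, US/Eastern]", [1])  # one tz-aware column
    return blk, tz_blk
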
def table_to_blockmanager(table, nthreads=1):
    import pandas.core.internals as _int
    from pyarrow.compat import DatetimeTZDtype
    import pyarrow.lib as lib

    block_table = table

    index_columns = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    if metadata is not None and b'pandas' in metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']

        for name in index_columns:
            i = schema.get_field_index(name)
            if i != -1:
                col = table.column(i)
                index_name = (None if is_unnamed_index_level(name)
                              else name)
                values = col.to_pandas().values
                if not values.flags.writeable:
                    # ARROW-1054: in pandas 0.19.2, factorize will reject
                    # non-writeable arrays when calling
                    # MultiIndex.from_arrays
                    values = values.copy()
                index_arrays.append(values)
                index_names.append(index_name)
                block_table = block_table.remove_column(
                    block_table.schema.get_field_index(name))

    result = lib.table_to_blocks(block_table, nthreads)

    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=False, fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = DatetimeTZDtype('ns', tz=item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    axes = [[column.name for column in block_table.itercolumns()], index]

    return _int.BlockManager(blocks, axes)

def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata
    columns_metadata = None

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table
    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(
                backwards_compatible_index_name(raw_name, logical_name))
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name))

    # Convert an arrow table to Block from the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'], fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns_metadata is not None:
        columns_name_dict = dict(
            (str(x['name']), x['name']) for x in columns_metadata)
        columns_values = [
            columns_name_dict[y] if y in columns_name_dict.keys() else y
            for y in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:
        # Get levels and labels, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Convert each level to the dtype provided in the metadata
        levels_dtypes = [(level, col_index.get('numpy_type', level.dtype))
                         for level, col_index in zip_longest(
                             levels, column_indexes, fillvalue={})]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]
        columns = pd.MultiIndex(levels=new_levels, labels=labels,
                                names=columns.names)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)