def pandas_dataframe_resolver(obj, resolver):
    """Resolve a vineyard dataframe object into a ``pandas.DataFrame``.

    Column arrays resolved from vineyard are wrapped directly into pandas
    ``Block`` objects (rather than going through the ``pd.DataFrame``
    constructor) so the underlying buffers are reused without copying.

    Parameters
    ----------
    obj:
        The vineyard object to resolve; exposes ``.meta`` (string-keyed
        metadata) and ``.member(name)`` for sub-objects.
    resolver:
        Driver whose ``run`` resolves member objects (e.g. into numpy arrays).

    Returns
    -------
    pandas.DataFrame
    """
    meta = obj.meta
    columns = from_json(meta['columns_'])
    if not columns:
        # no columns recorded: an empty dataframe
        return pd.DataFrame()
    names = []
    # ensure zero-copy
    blocks = []
    index_size = 0
    for idx, _ in enumerate(columns):
        # column names and values are stored as parallel members keyed by index
        names.append(from_json(meta['__values_-key-%d' % idx]))
        np_value = resolver.run(obj.member('__values_-value-%d' % idx))
        index_size = len(np_value)
        # ndim: 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
        # NOTE(review): the truthiness test suggests BlockPlacement may be None
        # on pandas versions that accept a plain slice — confirm import site.
        if BlockPlacement:
            placement = BlockPlacement(slice(idx, idx + 1, 1))
        else:
            placement = slice(idx, idx + 1, 1)
        if DatetimeArray is not None and isinstance(np_value, DatetimeArray):
            # datetime-like columns keep their extension array, reshaped to 2D
            values = np_value.reshape(1, -1)
            # presumably carried so the vineyard buffer backing the array
            # stays alive as long as the block does — verify against resolver
            setattr(values, '__vineyard_ref', getattr(np_value, '__vineyard_ref', None))
            block = DatetimeLikeBlock(values, placement, ndim=2)
        else:
            values = np.expand_dims(np_value, 0).view(ndarray)
            setattr(values, '__vineyard_ref', getattr(np_value, '__vineyard_ref', None))
            block = Block(values, placement, ndim=2)
        blocks.append(block)
    if 'index_' in meta:
        index = resolver.run(obj.member('index_'))
    else:
        # no explicit index stored: default to a 0..n-1 range
        index = pd.RangeIndex(index_size)
    return pd.DataFrame(BlockManager(blocks, [ensure_index(names), index]))
def _concat_managers_axis0(mgrs_indexers, axes: list[Index], copy: bool) -> BlockManager:
    """
    concat_managers specialized to concat_axis=0, with reindexing already
    having been done in _maybe_reindex_columns_na_proxy.

    Stacks the blocks of every manager, shifting each block's row
    placement by a running offset so the result covers the concatenated axis.
    """
    # remember, per manager, whether reindexing produced new arrays (so a
    # further defensive copy would be redundant)
    had_reindexers = {
        i: len(mgrs_indexers[i][1]) > 0 for i in range(len(mgrs_indexers))
    }
    mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
    mgrs = [x[0] for x in mgrs_indexers]

    offset = 0
    blocks = []
    for i, mgr in enumerate(mgrs):
        # If we already reindexed, then we definitely don't need another copy
        made_copy = had_reindexers[i]
        for blk in mgr.blocks:
            if made_copy:
                nb = blk.copy(deep=False)
            elif copy:
                nb = blk.copy()
            else:
                # by slicing instead of copy(deep=False), we get a new array
                # object, see test_concat_copy
                nb = blk.getitem_block(slice(None))
            # shift the block's row locations into the concatenated result
            nb._mgr_locs = nb._mgr_locs.add(offset)
            blocks.append(nb)

        offset += len(mgr.items)
    return BlockManager(tuple(blocks), axes)
def _iter_block_pairs(left: BlockManager, right: BlockManager) -> Iterator[BlockPairInfo]:
    """Yield a ``BlockPairInfo`` for every aligned block pair of two managers.

    For each block of ``left``, the corresponding slice of ``right`` is taken
    along axis 0 (as views, ``only_slice=True``) and paired up after being
    brought to matching shapes.
    """
    # At this point we have already checked the parent DataFrames for
    # assert rframe._indexed_same(lframe)
    for left_blk in left.blocks:
        placements = left_blk.mgr_locs
        left_is_ea = left_blk.values.ndim == 1

        matching_rblks = right._slice_take_blocks_ax0(
            placements.indexer, only_slice=True
        )

        # Assertions are disabled for performance, but should hold:
        #  if left_is_ea:
        #    assert len(placements) == 1, placements
        #    assert len(matching_rblks) == 1, matching_rblks
        #    assert matching_rblks[0].shape[0] == 1, matching_rblks[0].shape

        for right_blk in matching_rblks:
            right_is_ea = right_blk.values.ndim == 1
            lvals, rvals = _get_same_shape_values(
                left_blk, right_blk, left_is_ea, right_is_ea
            )
            yield BlockPairInfo(
                lvals, rvals, placements, left_is_ea, right_is_ea, right_blk
            )
def pandas_dataframe_resolver(obj, resolver):
    """Materialize a vineyard dataframe as a ``pandas.DataFrame`` without
    copying the column buffers.

    Each resolved column array is wrapped into a 2-D pandas ``Block`` and the
    blocks are assembled into a ``BlockManager`` directly.
    """
    meta = obj.meta
    columns = from_json(meta['columns_'])
    if not columns:
        return pd.DataFrame()

    # ensure zero-copy
    blocks = []
    index_size = 0
    for col_idx, _ in enumerate(columns):
        column_array = resolver.run(obj.member('__values_-value-%d' % col_idx))
        index_size = len(column_array)

        # ndim: 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
        loc = slice(col_idx, col_idx + 1, 1)
        placement = BlockPlacement(loc) if BlockPlacement else loc

        block_values = np.expand_dims(column_array, 0).view(ndarray)
        block_values.__vineyard_ref = getattr(column_array, '__vineyard_ref', None)
        blocks.append(Block(block_values, placement, ndim=2))

    if 'index_' in meta:
        index = resolver.run(obj.member('index_'))
    else:
        index = pd.RangeIndex(index_size)
    return pd.DataFrame(BlockManager(blocks, [pd.Index(columns), index]))
def concatenate_block_managers(mgrs_indexers, axes, concat_axis: int, copy: bool) -> BlockManager:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    BlockManager
    """
    concat_plans = [
        _get_mgr_concatenation_plan(mgr, indexers)
        for mgr, indexers in mgrs_indexers
    ]
    concat_plan = _combine_concat_plans(concat_plans, concat_axis)
    blocks = []

    for placement, join_units in concat_plan:
        if len(join_units) == 1 and not join_units[0].indexers:
            # single unit, no reindexing: reuse the existing block's values
            b = join_units[0].block
            values = b.values
            if copy:
                values = values.copy()
            else:
                values = values.view()
            b = b.make_block_same_class(values, placement=placement)
        elif _is_uniform_join_units(join_units):
            # all units have compatible blocks: concatenate the values directly
            blk = join_units[0].block
            vals = [ju.block.values for ju in join_units]

            if not blk.is_extension:
                values = concat_compat(vals, axis=blk.ndim - 1)
            else:
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(vals)
                if not isinstance(values, ExtensionArray):
                    # concat may have produced a plain ndarray; restore 2D shape
                    values = values.reshape(1, len(values))

            b = make_block(values, placement=placement, ndim=blk.ndim)
        else:
            # heterogeneous units: generic (upcasting) join-unit concatenation
            b = make_block(
                _concatenate_join_units(join_units, concat_axis, copy=copy),
                placement=placement,
                ndim=len(axes),
            )
        blocks.append(b)

    return BlockManager(blocks, axes)
def concatenate_block_managers(mgrs_indexers, axes, concat_axis: int, copy: bool) -> BlockManager:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    BlockManager
    """
    concat_plans = [
        _get_mgr_concatenation_plan(mgr, indexers)
        for mgr, indexers in mgrs_indexers
    ]
    concat_plan = _combine_concat_plans(concat_plans, concat_axis)
    blocks = []

    for placement, join_units in concat_plan:
        if len(join_units) == 1 and not join_units[0].indexers:
            # single unit, no reindexing: reuse the existing block's values
            b = join_units[0].block
            values = b.values
            if copy:
                values = values.copy()
            else:
                values = values.view()
            b = b.make_block_same_class(values, placement=placement)
        elif _is_uniform_join_units(join_units):
            # all units have compatible blocks: concatenate the values directly
            blk = join_units[0].block
            vals = [ju.block.values for ju in join_units]

            if not blk.is_extension or blk.is_datetimetz or blk.is_categorical:
                # datetimetz and categorical can have the same type but multiple
                # dtypes, concatting does not necessarily preserve dtype
                values = concat_compat(vals, axis=blk.ndim - 1)
            else:
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(vals)

            b = make_block(values, placement=placement, ndim=blk.ndim)
        else:
            # heterogeneous units: generic (upcasting) join-unit concatenation
            b = make_block(
                _concatenate_join_units(join_units, concat_axis, copy=copy),
                placement=placement,
            )
        blocks.append(b)

    return BlockManager(blocks, axes)
def dataframe_resolver(obj, resolver):
    """Resolve a vineyard dataframe object into a ``pandas.DataFrame``.

    Column arrays resolved from vineyard are wrapped into pandas ``Block``
    objects directly (zero-copy) instead of going through the ``pd.DataFrame``
    constructor.

    Parameters
    ----------
    obj:
        Vineyard object exposing ``.meta`` and ``.member(name)``.
    resolver:
        Driver whose ``run`` resolves member objects into numpy arrays.

    Returns
    -------
    pandas.DataFrame
    """
    meta = obj.meta
    columns = json.loads(meta['columns_'])
    if not columns:
        return pd.DataFrame()
    # ensure zero-copy
    blocks = []
    index_size = 0
    for idx, name in enumerate(columns):
        np_value = resolver.run(obj.member('__values_-value-%d' % idx))
        index_size = len(np_value)
        # ndim: 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame;
        # pass ndim=2 explicitly, consistent with the sibling resolvers.
        blocks.append(
            Block(np.expand_dims(np_value, 0), slice(idx, idx + 1, 1), ndim=2))
    # BlockManager axes should be pandas Index objects; RangeIndex also avoids
    # materializing an np.arange positional index.
    return pd.DataFrame(
        BlockManager(blocks, [pd.Index(columns), pd.RangeIndex(index_size)]))
def pandas_dataframe_resolver(obj, resolver):
    """Resolve a vineyard dataframe object into a ``pandas.DataFrame``.

    Column arrays resolved from vineyard are wrapped into pandas ``Block``
    objects directly (zero-copy).

    Parameters
    ----------
    obj:
        Vineyard object exposing ``.meta`` and ``.member(name)``.
    resolver:
        Driver whose ``run`` resolves member objects into numpy arrays.

    Returns
    -------
    pandas.DataFrame
    """
    meta = obj.meta
    columns = from_json(meta['columns_'])
    if not columns:
        # check columns *before* resolving 'index_': an empty dataframe needs
        # no member resolution at all.
        return pd.DataFrame()
    index = resolver.run(obj.member('index_'))
    # ensure zero-copy
    blocks = []
    for idx, name in enumerate(columns):
        np_value = resolver.run(obj.member('__values_-value-%d' % idx))
        # ndim: 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
        blocks.append(
            Block(np.expand_dims(np_value, 0), slice(idx, idx + 1, 1), ndim=2))
    return pd.DataFrame(BlockManager(blocks, [columns, index]))
def execute(cls, ctx, op):
    """Fetch the dataframe chunk referenced by ``op`` from vineyard and store
    it in the execution context, reusing the column buffers (zero-copy).
    """
    if vineyard is None:
        raise RuntimeError('vineyard is not available')

    client = vineyard.connect(op.vineyard_socket)
    # chunk has no tensor chunk
    df_chunk = client.get(op.object_id)

    if not df_chunk.columns:
        ctx[op.outputs[0].key] = pd.DataFrame()
        return

    # ensure zero-copy
    blocks = []
    num_rows = 0
    for pos, column in enumerate(df_chunk.columns):
        column_data = df_chunk[column].numpy()
        num_rows = len(column_data)
        blocks.append(
            Block(np.expand_dims(column_data, 0), slice(pos, pos + 1, 1)))
    ctx[op.outputs[0].key] = pd.DataFrame(
        BlockManager(blocks, [df_chunk.columns, np.arange(num_rows)]))
def concatenate_managers( mgrs_indexers, axes: list[Index], concat_axis: int, copy: bool ) -> Manager: """ Concatenate block managers into one. Parameters ---------- mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples axes : list of Index concat_axis : int copy : bool Returns ------- BlockManager """ # TODO(ArrayManager) this assumes that all managers are of the same type if isinstance(mgrs_indexers[0][0], ArrayManager): return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers ] concat_plan = _combine_concat_plans(concat_plans, concat_axis) blocks = [] for placement, join_units in concat_plan: unit = join_units[0] blk = unit.block if len(join_units) == 1 and not join_units[0].indexers: values = blk.values if copy: values = values.copy() else: values = values.view() fastpath = True elif _is_uniform_join_units(join_units): vals = [ju.block.values for ju in join_units] if not blk.is_extension: # _is_uniform_join_units ensures a single dtype, so # we can use np.concatenate, which is more performant # than concat_compat values = np.concatenate(vals, axis=blk.ndim - 1) else: # TODO(EA2D): special-casing not needed with 2D EAs values = concat_compat(vals, axis=1) values = ensure_block_shape(values, blk.ndim) values = ensure_wrapped_if_datetimelike(values) fastpath = blk.values.dtype == values.dtype else: values = _concatenate_join_units(join_units, concat_axis, copy=copy) fastpath = False if fastpath: b = blk.make_block_same_class(values, placement=placement) else: b = new_block(values, placement=placement, ndim=len(axes)) blocks.append(b) return BlockManager(tuple(blocks), axes)
def concatenate_managers(mgrs_indexers, axes: list[Index], concat_axis: int, copy: bool) -> Manager:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    BlockManager
    """
    # TODO(ArrayManager) this assumes that all managers are of the same type
    if isinstance(mgrs_indexers[0][0], ArrayManager):
        return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)

    # Assertions disabled for performance
    # for tup in mgrs_indexers:
    #     # caller is responsible for ensuring this
    #     indexers = tup[1]
    #     assert concat_axis not in indexers

    if concat_axis == 0:
        # row-wise concat has a dedicated, simpler implementation
        return _concat_managers_axis0(mgrs_indexers, axes, copy)

    mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)

    # Assertion disabled for performance
    # assert all(not x[1] for x in mgrs_indexers)

    concat_plans = [
        _get_mgr_concatenation_plan(mgr) for mgr, _ in mgrs_indexers
    ]
    concat_plan = _combine_concat_plans(concat_plans)
    blocks = []

    for placement, join_units in concat_plan:
        unit = join_units[0]
        blk = unit.block

        # Assertion disabled for performance
        # assert len(join_units) == len(mgrs_indexers)
        if len(join_units) == 1:
            # single unit: reuse the existing block's values
            values = blk.values
            if copy:
                values = values.copy()
            else:
                values = values.view()
            fastpath = True
        elif _is_uniform_join_units(join_units):
            vals = [ju.block.values for ju in join_units]

            if not blk.is_extension:
                # _is_uniform_join_units ensures a single dtype, so
                # we can use np.concatenate, which is more performant
                # than concat_compat
                values = np.concatenate(vals, axis=1)
            else:
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(vals, axis=1)
                values = ensure_block_shape(values, ndim=2)

            values = ensure_wrapped_if_datetimelike(values)

            # fastpath only when the concatenated dtype matches the template
            # block's dtype, so make_block_same_class is safe
            fastpath = blk.values.dtype == values.dtype
        else:
            values = _concatenate_join_units(join_units, copy=copy)
            fastpath = False

        if fastpath:
            b = blk.make_block_same_class(values, placement=placement)
        else:
            b = new_block_2d(values, placement=placement)

        blocks.append(b)

    return BlockManager(tuple(blocks), axes)
def concatenate_block_managers(mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool) -> Manager:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    BlockManager
    """
    if isinstance(mgrs_indexers[0][0], ArrayManager):
        if concat_axis == 1:
            # TODO for now only fastpath without indexers
            mgrs = [t[0] for t in mgrs_indexers]
            arrays = [
                concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0)
                for j in range(len(mgrs[0].arrays))
            ]
            return ArrayManager(arrays, [axes[1], axes[0]])
        elif concat_axis == 0:
            # stacking along axis 0: the result is just all arrays chained
            mgrs = [t[0] for t in mgrs_indexers]
            arrays = list(
                itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
            return ArrayManager(arrays, [axes[1], axes[0]])

    concat_plans = [
        _get_mgr_concatenation_plan(mgr, indexers)
        for mgr, indexers in mgrs_indexers
    ]
    concat_plan = _combine_concat_plans(concat_plans, concat_axis)
    blocks = []

    for placement, join_units in concat_plan:
        if len(join_units) == 1 and not join_units[0].indexers:
            # single unit, no reindexing: reuse the existing block's values
            b = join_units[0].block
            values = b.values
            if copy:
                values = values.copy()
            else:
                values = values.view()
            b = b.make_block_same_class(values, placement=placement)
        elif _is_uniform_join_units(join_units):
            blk = join_units[0].block
            vals = [ju.block.values for ju in join_units]

            if not blk.is_extension:
                # _is_uniform_join_units ensures a single dtype, so
                # we can use np.concatenate, which is more performant
                # than concat_compat
                values = np.concatenate(vals, axis=blk.ndim - 1)
            else:
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(vals)
                if not isinstance(values, ExtensionArray):
                    # concat may have produced a plain ndarray; restore 2D shape
                    values = values.reshape(1, len(values))

            if blk.values.dtype == values.dtype:
                # Fast-path
                b = blk.make_block_same_class(values, placement=placement)
            else:
                b = make_block(values, placement=placement, ndim=blk.ndim)
        else:
            # heterogeneous units: generic (upcasting) join-unit concatenation
            b = make_block(
                _concatenate_join_units(join_units, concat_axis, copy=copy),
                placement=placement,
                ndim=len(axes),
            )
        blocks.append(b)

    return BlockManager(blocks, axes)