def mgr_to_mgr(mgr, typ: str): """ Convert to specific type of Manager. Does not copy if the type is already correct. Does not guarantee a copy otherwise. """ new_mgr: Manager if typ == "block": if isinstance(mgr, BlockManager): new_mgr = mgr else: new_mgr = arrays_to_mgr(mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], typ="block") elif typ == "array": if isinstance(mgr, ArrayManager): new_mgr = mgr else: arrays = [ mgr.iget_values(i).copy() for i in range(len(mgr.axes[0])) ] new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) else: raise ValueError( f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") return new_mgr
def mgr_to_mgr(mgr, typ: str, copy: bool = True): """ Convert to specific type of Manager. Does not copy if the type is already correct. Does not guarantee a copy otherwise. `copy` keyword only controls whether conversion from Block->ArrayManager copies the 1D arrays. """ new_mgr: Manager if typ == "block": if isinstance(mgr, BlockManager): new_mgr = mgr else: if mgr.ndim == 2: new_mgr = arrays_to_mgr( mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block" ) else: new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index) elif typ == "array": if isinstance(mgr, ArrayManager): new_mgr = mgr else: if mgr.ndim == 2: arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))] if copy: arrays = [arr.copy() for arr in arrays] new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) else: array = mgr.internal_values() if copy: array = array.copy() new_mgr = SingleArrayManager([array], [mgr.index]) else: raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") return new_mgr
def arrays_to_mgr( arrays, columns: Index, index, *, dtype: DtypeObj | None = None, verify_integrity: bool = True, typ: str | None = None, consolidate: bool = True, ) -> Manager: """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. """ if verify_integrity: # figure out the index, if necessary if index is None: index = _extract_index(arrays) else: index = ensure_index(index) # don't force copy because getting jammed in an ndarray anyway arrays = _homogenize(arrays, index, dtype) # _homogenize ensures # - all(len(x) == len(index) for x in arrays) # - all(x.ndim == 1 for x in arrays) # - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays) # - all(type(x) is not PandasArray for x in arrays) else: index = ensure_index(index) arrays = [extract_array(x, extract_numpy=True) for x in arrays] # Reached via DataFrame._from_arrays; we do validation here for arr in arrays: if (not isinstance(arr, (np.ndarray, ExtensionArray)) or arr.ndim != 1 or len(arr) != len(index)): raise ValueError( "Arrays must be 1-dimensional np.ndarray or ExtensionArray " "with length matching len(index)") columns = ensure_index(columns) if len(columns) != len(arrays): raise ValueError("len(arrays) must match len(columns)") # from BlockManager perspective axes = [columns, index] if typ == "block": return create_block_manager_from_column_arrays(arrays, axes, consolidate=consolidate) elif typ == "array": return ArrayManager(arrays, [index, columns]) else: raise ValueError( f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
def concatenate_array_managers(mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool) -> Manager: """ Concatenate array managers into one. Parameters ---------- mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples axes : list of Index concat_axis : int copy : bool Returns ------- ArrayManager """ # reindex all arrays mgrs = [] for mgr, indexers in mgrs_indexers: for ax, indexer in indexers.items(): mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True) mgrs.append(mgr) if concat_axis == 1: # concatting along the rows -> concat the reindexed arrays # TODO(ArrayManager) doesn't yet preserve the correct dtype arrays = [ concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))]) for j in range(len(mgrs[0].arrays)) ] return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False) else: # concatting along the columns -> combine reindexed arrays in a single manager assert concat_axis == 0 arrays = list( itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False)
def arrays_to_mgr( arrays, arr_names, index, columns, *, dtype: DtypeObj | None = None, verify_integrity: bool = True, typ: str | None = None, consolidate: bool = True, ) -> Manager: """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. """ arr_names = ensure_index(arr_names) if verify_integrity: # figure out the index, if necessary if index is None: index = _extract_index(arrays) else: index = ensure_index(index) # don't force copy because getting jammed in an ndarray anyway arrays = _homogenize(arrays, index, dtype) else: index = ensure_index(index) columns = ensure_index(columns) # from BlockManager perspective axes = [columns, index] if typ == "block": return create_block_manager_from_arrays(arrays, arr_names, axes, consolidate=consolidate) elif typ == "array": if len(columns) != len(arrays): assert len(arrays) == 0 arrays = [np.array([], dtype=object) for _ in range(len(columns))] return ArrayManager(arrays, [index, columns]) else: raise ValueError( f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
def _concatenate_array_managers(mgrs_indexers, axes: list[Index], concat_axis: int, copy: bool) -> Manager: """ Concatenate array managers into one. Parameters ---------- mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples axes : list of Index concat_axis : int copy : bool Returns ------- ArrayManager """ # reindex all arrays mgrs = [] for mgr, indexers in mgrs_indexers: axis1_made_copy = False for ax, indexer in indexers.items(): mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True) if ax == 1 and indexer is not None: axis1_made_copy = True if copy and concat_axis == 0 and not axis1_made_copy: # for concat_axis 1 we will always get a copy through concat_arrays mgr = mgr.copy() mgrs.append(mgr) if concat_axis == 1: # concatting along the rows -> concat the reindexed arrays # TODO(ArrayManager) doesn't yet preserve the correct dtype arrays = [ concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))]) for j in range(len(mgrs[0].arrays)) ] else: # concatting along the columns -> combine reindexed arrays in a single manager assert concat_axis == 0 arrays = list( itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) return new_mgr
def arrays_to_mgr( arrays, arr_names, index, columns, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, typ: Optional[str] = None, ): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. """ arr_names = ensure_index(arr_names) if verify_integrity: # figure out the index, if necessary if index is None: index = extract_index(arrays) else: index = ensure_index(index) # don't force copy because getting jammed in an ndarray anyway arrays = _homogenize(arrays, index, dtype) columns = ensure_index(columns) else: columns = ensure_index(columns) index = ensure_index(index) # from BlockManager perspective axes = [columns, index] if typ == "block": return create_block_manager_from_arrays(arrays, arr_names, axes) elif typ == "array": return ArrayManager(arrays, [index, columns]) else: raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = Index([values.name]) if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) # if the array preparation does a copy -> avoid this for ArrayManager, # since the copy is done on conversion to 1D arrays copy_on_sanitize = False if typ == "array" else copy vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype): # GH#19157 if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1: # GH#12513 a EA dtype passed with a 2D array, split into # multiple EAs that view the values # error: No overload variant of "__getitem__" of "ExtensionArray" # matches argument type "Tuple[slice, int]" values = [ values[:, n] # type: ignore[call-overload] for n in range(values.shape[1]) ] else: values = [values] if columns is None: columns = Index(range(len(values))) else: columns = ensure_index(columns) return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ) elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ, PeriodDtype values = extract_array(values, extract_numpy=True) if copy: values = values.copy() if values.ndim == 1: values = values.reshape(-1, 1) else: # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy_on_sanitize) if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape flat = values.ravel() # GH#40110 see similar check inside sanitize_array rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") values = sanitize_array(flat, None, dtype=dtype, copy=copy_on_sanitize, raise_cast_failure=rcf) values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes(values.shape[0], values.shape[1], index=index, columns=columns) _check_values_indices_shape_match(values, index, columns) if typ == "array": if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) if dtype is None and is_object_dtype(values.dtype): arrays = [ ensure_wrapped_if_datetimelike( maybe_infer_to_datetimelike(values[:, i])) for i in range(values.shape[1]) ] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) arrays = [values[:, i] for i in range(values.shape[1])] if copy: arrays = [arr.copy() for arr in arrays] return ArrayManager(arrays, [index, columns], verify_integrity=False) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): obj_columns = list(values) maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): dvals_list = [ ensure_block_shape(dval, 2) for dval in maybe_datetime ] block_values = [ new_block_2d(dvals_list[n], placement=BlockPlacement(n)) for n in range(len(dvals_list)) ] else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp) block_values = [nb] else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp) block_values = [nb] if len(columns) == 0: block_values = [] return create_block_manager_from_blocks(block_values, [columns, index], verify_integrity=False)
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = Index([values.name]) if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: # GH#12513 a EA dtype passed with a 2D array, split into # multiple EAs that view the values values = [values[:, n] for n in range(values.shape[1])] else: values = [values] if columns is None: columns = Index(range(len(values))) return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ values = extract_array(values, extract_numpy=True) if copy: values = values.copy() if values.ndim == 1: values = values.reshape(-1, 1) else: # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape flat = values.ravel() if not is_integer_dtype(dtype): # TODO: skipping integer_dtype is needed to keep the tests passing, # not clear it is correct # Note: we really only need _try_cast, but keeping to exposed funcs values = sanitize_array(flat, None, dtype=dtype, copy=copy, raise_cast_failure=True) else: try: values = construct_1d_ndarray_preserving_na(flat, dtype=dtype, copy=False) except IntCastingNaNError: # following Series, we ignore the dtype and retain floating # values instead of casting nans to meaningless ints pass values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes(values.shape[0], values.shape[1], index=index, columns=columns) _check_values_indices_shape_match(values, index, columns) if typ == "array": if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) if dtype is None and is_object_dtype(values.dtype): arrays = [ ensure_wrapped_if_datetimelike( maybe_infer_to_datetimelike(values[:, i].copy())) for i in range(values.shape[1]) ] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) arrays = [values[:, i].copy() for i in range(values.shape[1])] return ArrayManager(arrays, [index, columns], verify_integrity=False) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] # TODO: What about re-joining object columns? block_values = [ new_block(dvals_list[n], placement=n, ndim=2) for n in range(len(dvals_list)) ] else: datelike_vals = maybe_infer_to_datetimelike(values) nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) block_values = [nb] else: nb = new_block(values, placement=slice(len(columns)), ndim=2) block_values = [nb] if len(columns) == 0: block_values = [] return create_block_manager_from_blocks(block_values, [columns, index])
def concatenate_block_managers(mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool) -> Manager: """ Concatenate block managers into one. Parameters ---------- mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples axes : list of Index concat_axis : int copy : bool Returns ------- BlockManager """ if isinstance(mgrs_indexers[0][0], ArrayManager): if concat_axis == 1: # TODO for now only fastpath without indexers mgrs = [t[0] for t in mgrs_indexers] arrays = [ concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0) for j in range(len(mgrs[0].arrays)) ] return ArrayManager(arrays, [axes[1], axes[0]]) elif concat_axis == 0: mgrs = [t[0] for t in mgrs_indexers] arrays = list( itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) return ArrayManager(arrays, [axes[1], axes[0]]) concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers ] concat_plan = _combine_concat_plans(concat_plans, concat_axis) blocks = [] for placement, join_units in concat_plan: if len(join_units) == 1 and not join_units[0].indexers: b = join_units[0].block values = b.values if copy: values = values.copy() else: values = values.view() b = b.make_block_same_class(values, placement=placement) elif _is_uniform_join_units(join_units): blk = join_units[0].block vals = [ju.block.values for ju in join_units] if not blk.is_extension: # _is_uniform_join_units ensures a single dtype, so # we can use np.concatenate, which is more performant # than concat_compat values = np.concatenate(vals, axis=blk.ndim - 1) else: # TODO(EA2D): special-casing not needed with 2D EAs values = concat_compat(vals) if not isinstance(values, ExtensionArray): values = values.reshape(1, len(values)) if blk.values.dtype == values.dtype: # Fast-path b = blk.make_block_same_class(values, placement=placement) else: b = make_block(values, placement=placement, ndim=blk.ndim) else: b = make_block( _concatenate_join_units(join_units, concat_axis, copy=copy), placement=placement, ndim=len(axes), ) blocks.append(b) return BlockManager(blocks, axes)