def _concatenate_join_units( join_units: list[JoinUnit], concat_axis: int, copy: bool ) -> ArrayLike: """ Concatenate values from several join units along selected axis. """ if concat_axis == 0 and len(join_units) > 1: # Concatenating join units along ax0 is handled in _merge_blocks. raise AssertionError("Concatenating join units along axis0") empty_dtype = _get_empty_dtype(join_units) has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) to_concat = [ ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) for ju in join_units ] if len(to_concat) == 1: # Only one block, nothing to concatenate. concat_values = to_concat[0] if copy: if isinstance(concat_values, np.ndarray): # non-reindexed (=not yet copied) arrays are made into a view # in JoinUnit.get_reindexed_values if concat_values.base is not None: concat_values = concat_values.copy() else: concat_values = concat_values.copy() elif any(is_1d_only_ea_dtype(t.dtype) for t in to_concat): # TODO(EA2D): special case not needed if all EAs used HybridBlocks # NB: we are still assuming here that Hybrid blocks have shape (1, N) # concatting with at least one EA means we are concatting a single column # the non-EA values are 2D arrays with shape (1, n) # error: No overload variant of "__getitem__" of "ExtensionArray" matches # argument type "Tuple[int, slice]" to_concat = [ t if is_1d_only_ea_dtype(t.dtype) else t[0, :] # type: ignore[call-overload] for t in to_concat ] concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) concat_values = ensure_block_shape(concat_values, 2) else: concat_values = concat_compat(to_concat, axis=concat_axis) return concat_values
def unstack(obj, level, fill_value=None): if isinstance(level, (tuple, list)): if len(level) != 1: # _unstack_multiple only handles MultiIndexes, # and isn't needed for a single level return _unstack_multiple(obj, level, fill_value=fill_value) else: level = level[0] # Prioritize integer interpretation (GH #21677): if not is_integer(level) and not level == "__placeholder__": level = obj.index._get_level_number(level) if isinstance(obj, DataFrame): if isinstance(obj.index, MultiIndex): return _unstack_frame(obj, level, fill_value=fill_value) else: return obj.T.stack(dropna=False) elif not isinstance(obj.index, MultiIndex): # GH 36113 # Give nicer error messages when unstack a Series whose # Index is not a MultiIndex. raise ValueError( f"index must be a MultiIndex to unstack, {type(obj.index)} was passed" ) else: if is_1d_only_ea_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker( obj.index, level=level, constructor=obj._constructor_expanddim ) return unstacker.get_result( obj._values, value_columns=None, fill_value=fill_value )
def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike: """ Construct an np.ndarray or ExtensionArray of the given dtype and shape holding all-NA values. """ if is_datetime64tz_dtype(dtype): # NaT here is analogous to dtype.na_value below i8values = np.full(shape, NaT.value) return DatetimeArray(i8values, dtype=dtype) elif is_1d_only_ea_dtype(dtype): dtype = cast(ExtensionDtype, dtype) cls = dtype.construct_array_type() missing_arr = cls._from_sequence([], dtype=dtype) nrows = shape[-1] taker = -1 * np.ones((nrows, ), dtype=np.intp) return missing_arr.take(taker, allow_fill=True, fill_value=dtype.na_value) elif isinstance(dtype, ExtensionDtype): # TODO: no tests get here, a handful would if we disabled # the dt64tz special-case above (which is faster) cls = dtype.construct_array_type() missing_arr = cls._empty(shape=shape, dtype=dtype) missing_arr[:] = dtype.na_value return missing_arr else: # NB: we should never get here with dtype integer or bool; # if we did, the missing_arr.fill would cast to gibberish missing_arr = np.empty(shape, dtype=dtype) fill_value = _dtype_to_na_value(dtype) missing_arr.fill(fill_value) return missing_arr
def cython_operation( self, *, values: ArrayLike, axis: int, min_count: int = -1, comp_ids: np.ndarray, ngroups: int, **kwargs, ) -> ArrayLike: """ Call our cython function, with appropriate pre- and post- processing. """ if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: assert axis == 1, axis elif not is_1d_only_ea_dtype(values.dtype): # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 0 dtype = values.dtype is_numeric = is_numeric_dtype(dtype) # can we do this operation with our cython functions # if not raise NotImplementedError self._disallow_invalid_ops(dtype, is_numeric) if not isinstance(values, np.ndarray): # i.e. ExtensionArray return self._ea_wrap_cython_operation( values, min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, **kwargs, ) return self._cython_op_ndim_compat( values, min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, mask=None, **kwargs, )
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = Index([values.name]) if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) # if the array preparation does a copy -> avoid this for ArrayManager, # since the copy is done on conversion to 1D arrays copy_on_sanitize = False if typ == "array" else copy vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype): # GH#19157 if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1: # GH#12513 a EA dtype passed with a 2D array, split into # multiple EAs that view the values # error: No overload variant of "__getitem__" of "ExtensionArray" # matches argument type "Tuple[slice, int]" values = [ values[:, n] # type: ignore[call-overload] for n in range(values.shape[1]) ] else: values = [values] if columns is None: columns = Index(range(len(values))) else: columns = ensure_index(columns) return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ) elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ, PeriodDtype values = extract_array(values, extract_numpy=True) if copy: values = values.copy() if values.ndim == 1: values = values.reshape(-1, 1) else: # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy_on_sanitize) if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape flat = values.ravel() # GH#40110 see similar check inside sanitize_array rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") values = sanitize_array(flat, None, dtype=dtype, copy=copy_on_sanitize, raise_cast_failure=rcf) values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes(values.shape[0], values.shape[1], index=index, columns=columns) _check_values_indices_shape_match(values, index, columns) if typ == "array": if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) if dtype is None and is_object_dtype(values.dtype): arrays = [ ensure_wrapped_if_datetimelike( maybe_infer_to_datetimelike(values[:, i])) for i in range(values.shape[1]) ] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) arrays = [values[:, i] for i in range(values.shape[1])] if copy: arrays = [arr.copy() for arr in arrays] return ArrayManager(arrays, [index, columns], verify_integrity=False) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): obj_columns = list(values) maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): dvals_list = [ ensure_block_shape(dval, 2) for dval in maybe_datetime ] block_values = [ new_block_2d(dvals_list[n], placement=BlockPlacement(n)) for n in range(len(dvals_list)) ] else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp) block_values = [nb] else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp) block_values = [nb] if len(columns) == 0: block_values = [] return create_block_manager_from_blocks(block_values, [columns, index], verify_integrity=False)
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value values = self.block.get_values() else: fill_value = upcasted_na if self.is_valid_na_for(empty_dtype): blk_dtype = getattr(self.block, "dtype", None) if blk_dtype == np.dtype("object"): # we want to avoid filling with np.nan if we are # using None; we already know that we are all # nulls values = self.block.values.ravel(order="K") if len(values) and values[0] is None: fill_value = None if is_datetime64tz_dtype(empty_dtype): i8values = np.full(self.shape, fill_value.value) return DatetimeArray(i8values, dtype=empty_dtype) elif is_extension_array_dtype(blk_dtype): pass elif is_1d_only_ea_dtype(empty_dtype): empty_dtype = cast(ExtensionDtype, empty_dtype) cls = empty_dtype.construct_array_type() missing_arr = cls._from_sequence([], dtype=empty_dtype) ncols, nrows = self.shape assert ncols == 1, ncols empty_arr = -1 * np.ones((nrows,), dtype=np.intp) return missing_arr.take( empty_arr, allow_fill=True, fill_value=fill_value ) else: # NB: we should never get here with empty_dtype integer or bool; # if we did, the missing_arr.fill would cast to gibberish empty_dtype = cast(np.dtype, empty_dtype) missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) return missing_arr if (not self.indexers) and (not self.block._can_consolidate): # preserve these for validation in concat_compat return self.block.values if self.block.is_bool: # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values else: # No dtype upcasting is done here, it will be performed during # concatenation itself. values = self.block.values if not self.indexers: # If there's no indexing to be done, we want to signal outside # code that this array must be copied explicitly. This is done # by returning a view and checking `retval.base`. values = values.view() else: for ax, indexer in self.indexers.items(): values = algos.take_nd(values, indexer, axis=ax) return values
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = Index([values.name]) if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: # GH#12513 a EA dtype passed with a 2D array, split into # multiple EAs that view the values values = [values[:, n] for n in range(values.shape[1])] else: values = [values] if columns is None: columns = Index(range(len(values))) return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ values = extract_array(values, extract_numpy=True) if copy: values = values.copy() if values.ndim == 1: values = values.reshape(-1, 1) else: # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape flat = values.ravel() if not is_integer_dtype(dtype): # TODO: skipping integer_dtype is needed to keep the tests passing, # not clear it is correct # Note: we really only need _try_cast, but keeping to exposed funcs values = sanitize_array(flat, None, dtype=dtype, copy=copy, raise_cast_failure=True) else: try: values = construct_1d_ndarray_preserving_na(flat, dtype=dtype, copy=False) except IntCastingNaNError: # following Series, we ignore the dtype and retain floating # values instead of casting nans to meaningless ints pass values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes(values.shape[0], values.shape[1], index=index, columns=columns) _check_values_indices_shape_match(values, index, columns) if typ == "array": if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) if dtype is None and is_object_dtype(values.dtype): arrays = [ ensure_wrapped_if_datetimelike( maybe_infer_to_datetimelike(values[:, i].copy())) for i in range(values.shape[1]) ] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) arrays = [values[:, i].copy() for i in range(values.shape[1])] return ArrayManager(arrays, [index, columns], verify_integrity=False) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] # TODO: What about re-joining object columns? block_values = [ new_block(dvals_list[n], placement=n, ndim=2) for n in range(len(dvals_list)) ] else: datelike_vals = maybe_infer_to_datetimelike(values) nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) block_values = [nb] else: nb = new_block(values, placement=slice(len(columns)), ndim=2) block_values = [nb] if len(columns) == 0: block_values = [] return create_block_manager_from_blocks(block_values, [columns, index])
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: values: ArrayLike if upcasted_na is None and not self.is_na: # No upcasting is necessary fill_value = self.block.fill_value values = self.block.get_values() else: fill_value = upcasted_na if self.is_na: if is_datetime64tz_dtype(empty_dtype): i8values = np.full(self.shape, fill_value.value) return DatetimeArray(i8values, dtype=empty_dtype) elif is_1d_only_ea_dtype(empty_dtype): empty_dtype = cast(ExtensionDtype, empty_dtype) cls = empty_dtype.construct_array_type() missing_arr = cls._from_sequence([], dtype=empty_dtype) ncols, nrows = self.shape assert ncols == 1, ncols empty_arr = -1 * np.ones((nrows, ), dtype=np.intp) return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value) elif isinstance(empty_dtype, ExtensionDtype): # TODO: no tests get here, a handful would if we disabled # the dt64tz special-case above (which is faster) cls = empty_dtype.construct_array_type() missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype) missing_arr[:] = fill_value return missing_arr else: # NB: we should never get here with empty_dtype integer or bool; # if we did, the missing_arr.fill would cast to gibberish missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) return missing_arr if (not self.indexers) and (not self.block._can_consolidate): # preserve these for validation in concat_compat return self.block.values if self.block.is_bool: # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values else: # No dtype upcasting is done here, it will be performed during # concatenation itself. values = self.block.values if not self.indexers: # If there's no indexing to be done, we want to signal outside # code that this array must be copied explicitly. This is done # by returning a view and checking `retval.base`. values = values.view() else: for ax, indexer in self.indexers.items(): values = algos.take_nd(values, indexer, axis=ax) return values