def test_is_sparse(check_scipy):
    """
    Check ``com.is_sparse`` on pandas sparse containers, a dense ndarray
    and, when ``check_scipy`` is truthy, a scipy sparse matrix.
    """
    data = [1, 2, 3]

    # pandas' own sparse containers must be reported as sparse
    assert com.is_sparse(pd.SparseArray(data))
    assert com.is_sparse(pd.SparseSeries(data))

    # a plain ndarray must not
    assert not com.is_sparse(np.array(data))

    if not check_scipy:
        return

    # imported lazily so the checks above run without scipy installed
    import scipy.sparse

    # scipy sparse matrices are expected to be rejected by com.is_sparse
    assert not com.is_sparse(scipy.sparse.bsr_matrix(data))
def test_is_sparse():
    """
    Check ``com.is_sparse`` on pandas sparse containers, a dense ndarray
    and a scipy sparse matrix (skipped when scipy cannot be imported).
    """
    sample = [1, 2, 3]

    assert com.is_sparse(pd.SparseArray(sample))
    assert com.is_sparse(pd.SparseSeries(sample))
    assert not com.is_sparse(np.array(sample))

    # importorskip is reached only after the assertions above have passed,
    # so this test is reported as skipped only when they pass AND scipy is
    # not installed.
    scipy_sparse = pytest.importorskip("scipy.sparse")
    assert not com.is_sparse(scipy_sparse.bsr_matrix(sample))
def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
    """
    Column-wise concat of a dense frame with a sparsified frame must match
    the all-dense concat once the sparse result columns are cast to Sparse.
    """
    # See GH16874, GH18914 and #18686 for why this should be a DataFrame
    from pandas.core.dtypes.common import is_sparse

    frames = [self.dense1, self.dense3]
    dense_part = frames[dense_idx]
    to_sparsify = frames[sparse_idx]

    sparse_frame = [dense_part, to_sparsify.to_sparse(fill_value=fill_value)]
    dense_frame = [dense_part, to_sparsify]

    # Two passes: the input order is flipped at the end of each pass so both
    # sparse + dense and dense + sparse are exercised.
    for _ in range(2):
        res = pd.concat(sparse_frame, axis=1)
        exp = pd.concat(dense_frame, axis=1)

        # cast the expected columns that came out sparse in the result
        for pos, result_dtype in enumerate(res.dtypes):
            if is_sparse(result_dtype):
                exp.iloc[:, pos] = exp.iloc[:, pos].astype("Sparse")

        for column in dense_part.columns:
            # NOTE(review): when both indices pick the same frame the labels
            # are presumably duplicated, so a label selects a DataFrame
            if dense_idx == sparse_idx:
                tm.assert_frame_equal(res[column], exp[column])
            else:
                tm.assert_series_equal(res[column], exp[column])

        tm.assert_frame_equal(res, exp)

        sparse_frame, dense_frame = sparse_frame[::-1], dense_frame[::-1]
def is_na(self):
    """
    Report whether this unit contributes only missing values.

    Returns True when there is no block, False when the block cannot hold
    NA or holds sparse values, and otherwise whether every inspected value
    is NA.
    """
    block = self.block
    if block is None:
        return True
    if not block._can_hold_na:
        return False

    values = block.values
    if block.is_categorical:
        values_flat = values.categories
    elif is_sparse(block.values.dtype):
        return False
    elif block.is_extension:
        values_flat = values
    else:
        values_flat = values.ravel(order='K')

    # Usually a small fraction of the values is enough to see that a block
    # is NOT null, so scan in chunks and stop at the first chunk containing
    # a non-NA value.  The 1000 floor was chosen rather arbitrarily.
    total_len = values_flat.shape[0]
    chunk_len = max(total_len // 40, 1000)
    return all(
        isna(values_flat[start:start + chunk_len]).all()
        for start in range(0, total_len, chunk_len)
    )
def _disallow_invalid_ops(self, values: ArrayLike, how: str):
    """
    Reject dtype/operation pairs the cython functions cannot be used for.

    Parameters
    ----------
    values : ArrayLike
        Only ``values.dtype`` is inspected.
    how : str
        Name of the requested operation.

    Raises
    ------
    NotImplementedError
        The operation is either invalid for this dtype, or valid but not
        implemented in cython.
    """
    dtype = values.dtype

    # categoricals are only 1d, so we are not setup for dim transforming
    if is_categorical_dtype(dtype) or is_sparse(dtype):
        raise NotImplementedError(f"{dtype} dtype not supported")

    if is_datetime64_any_dtype(dtype):
        # operations that are invalid entirely, e.g. adding datetimes
        if how in ("add", "prod", "cumsum", "cumprod"):
            raise NotImplementedError(
                f"datetime64 type does not support {how} operations")
        return

    if is_timedelta64_dtype(dtype) and how in ("prod", "cumprod"):
        raise NotImplementedError(
            f"timedelta64 type does not support {how} operations")
def _disallow_invalid_ops(
    self, dtype: DtypeObj, how: str, is_numeric: bool = False
):
    """
    Reject dtype/operation pairs the cython functions cannot be used for.

    Parameters
    ----------
    dtype : DtypeObj
        Dtype of the values the operation would run on.
    how : str
        Name of the requested operation.
    is_numeric : bool, default False
        When True the check is skipped entirely.

    Raises
    ------
    NotImplementedError
        The operation is either invalid for this dtype, or valid but not
        implemented in cython.
    """
    if is_numeric:
        # fastpath: never an invalid op for those dtypes
        return

    unsupported_dtype = is_categorical_dtype(dtype) or is_sparse(dtype)
    if unsupported_dtype:
        # categoricals are only 1d, so we are not setup for dim transforming
        raise NotImplementedError(f"{dtype} dtype not supported")

    if is_datetime64_any_dtype(dtype):
        # operations that are invalid entirely, e.g. adding datetimes
        if how in ("add", "prod", "cumsum", "cumprod"):
            raise NotImplementedError(
                f"datetime64 type does not support {how} operations"
            )
    elif is_timedelta64_dtype(dtype) and how in ("prod", "cumprod"):
        raise NotImplementedError(
            f"timedelta64 type does not support {how} operations"
        )
def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
    """
    Concatenating dense and sparsified frames along axis=1 must equal the
    dense concat after the sparse result columns are cast to Sparse.
    """
    # See GH16874, GH18914 and #18686 for why this should be a DataFrame
    from pandas.core.dtypes.common import is_sparse

    frames = [self.dense1, self.dense3]
    sparse_frame = [
        frames[dense_idx],
        frames[sparse_idx].to_sparse(fill_value=fill_value),
    ]
    dense_frame = [frames[dense_idx], frames[sparse_idx]]
    same_source = dense_idx == sparse_idx

    # Run twice, reversing the inputs in between, to cover both
    # sparse + dense and dense + sparse.
    for _ in range(2):
        res = pd.concat(sparse_frame, axis=1)
        exp = pd.concat(dense_frame, axis=1)

        sparse_positions = [
            pos for pos, result_dtype in enumerate(res.dtypes)
            if is_sparse(result_dtype)
        ]
        for pos in sparse_positions:
            exp.iloc[:, pos] = exp.iloc[:, pos].astype("Sparse")

        for column in frames[dense_idx].columns:
            got, want = res[column], exp[column]
            if same_source:
                tm.assert_frame_equal(got, want)
            else:
                tm.assert_series_equal(got, want)

        tm.assert_frame_equal(res, exp)

        sparse_frame.reverse()
        dense_frame.reverse()
def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool): """ Convert input to numpy ndarray and optionally cast to a given dtype. Parameters ---------- arr : ndarray, list, tuple, iterator (catchall) Excludes: ExtensionArray, Series, Index. dtype : np.dtype, ExtensionDtype or None copy : bool If False, don't copy the data if not needed. raise_cast_failure : bool If True, and if a dtype is specified, raise errors during casting. Otherwise an object array is returned. """ # perf shortcut as this is the most common case if isinstance(arr, np.ndarray): if maybe_castable(arr) and not copy and dtype is None: return arr if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)): # create an extension array from its dtype # DatetimeTZ case needs to go through maybe_cast_to_datetime but # SparseDtype does not array_type = dtype.construct_array_type()._from_sequence subarr = array_type(arr, dtype=dtype, copy=copy) return subarr if is_object_dtype(dtype) and not isinstance(arr, np.ndarray): subarr = construct_1d_object_array_from_listlike(arr) return subarr try: # GH#15832: Check if we are requesting a numeric dtype and # that we can convert the data to the requested dtype. if is_integer_dtype(dtype): # this will raise if we have e.g. floats maybe_cast_to_integer_array(arr, dtype) subarr = arr else: subarr = maybe_cast_to_datetime(arr, dtype) if not isinstance(subarr, (ABCExtensionArray, ABCIndex)): subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) except OutOfBoundsDatetime: # in case of out of bound datetime64 -> always raise raise except (ValueError, TypeError) as err: if dtype is not None and raise_cast_failure: raise elif "Cannot cast" in str(err): # via _disallow_mismatched_datetimelike raise else: subarr = np.array(arr, dtype=object, copy=copy) return subarr
def quantile_ea_compat(values: ExtensionArray, qs, interpolation: str,
                       axis: int) -> ExtensionArray:
    """
    ExtensionArray compatibility layer for quantile_with_mask.

    A 1D ExtensionArray of shape (N,) is presented to the non-EA code as if
    it had shape (1, N).

    Parameters
    ----------
    values : ExtensionArray
    qs : a scalar or list of the quantiles to be computed
    interpolation: str
    axis : int

    Returns
    -------
    ExtensionArray
    """
    # TODO(EA2D): make-believe not needed with 2D EAs

    # asarray needed for Sparse, see GH#24600
    mask = np.atleast_2d(np.asarray(values.isna()))

    arr, fill_value = values._values_for_factorize()
    arr = np.atleast_2d(arr)

    result = quantile_with_mask(arr, mask, fill_value, qs, interpolation, axis)

    if is_sparse(values.dtype):
        # sparse input: the raw result is handed back without rebuilding
        # error: Incompatible return value type (got "ndarray", expected
        # "ExtensionArray")
        return result  # type: ignore[return-value]

    # shape[0] should be 1 as long as EAs are 1D
    if result.ndim == 1:
        # i.e. qs was originally a scalar
        assert result.shape == (1, ), result.shape
        return type(values)._from_factorized(result, values)

    assert result.shape == (1, len(qs)), result.shape
    return type(values)._from_factorized(result[0], values)
def __init__(self, values, index, level=-1, value_columns=None,
             fill_value=None, constructor=None):
    """
    Prepare the state needed to unstack ``values`` over one level of
    ``index``.

    Parameters
    ----------
    values : array-like
        1-D input is turned into a single column.
    index : Index
        Unused levels are removed before use.
    level : int or label, default -1
    value_columns : optional
        Required when ``values`` has more than one column.
    fill_value : optional
    constructor : optional
        Result class; defaults to SparseDataFrame for sparse input and
        DataFrame otherwise.

    Raises
    ------
    ValueError
        For multi-column data without ``value_columns``, or when ``level``
        refers ambiguously to a duplicated MultiIndex level name.
    """
    self.is_categorical = None
    self.is_sparse = is_sparse(values)

    if values.ndim == 1:
        if isinstance(values, Categorical):
            self.is_categorical = values
            values = np.array(values)
        elif self.is_sparse:
            # XXX: Makes SparseArray *dense*, but it's supposedly
            # a single column at a time, so it's "doable"
            values = values.values
        values = values[:, np.newaxis]

    self.values = values
    self.value_columns = value_columns
    self.fill_value = fill_value

    if constructor is not None:
        self.constructor = constructor
    else:
        self.constructor = SparseDataFrame if self.is_sparse else DataFrame

    if value_columns is None and values.shape[1] != 1:  # pragma: no cover
        raise ValueError('must pass column labels for multi-column data')

    self.index = index.remove_unused_levels()

    if (isinstance(self.index, MultiIndex)
            and index._reference_duplicate_name(level)):
        msg = ("Ambiguous reference to {level}. The index "
               "names are not unique.".format(level=level))
        raise ValueError(msg)

    self.level = self.index._get_level_number(level)

    # when index includes `nan`, need to lift levels/strides by 1
    if -1 in self.index.labels[self.level]:
        self.lift = 1
    else:
        self.lift = 0

    # split the unstacked level (and its name) from the remaining ones
    remaining_levels = list(self.index.levels)
    remaining_names = list(self.index.names)
    self.new_index_levels = remaining_levels
    self.new_index_names = remaining_names
    self.removed_name = remaining_names.pop(self.level)
    self.removed_level = remaining_levels.pop(self.level)
    # taken from the index as passed in, i.e. before unused levels were
    # removed
    self.removed_level_full = index.levels[self.level]

    self._make_sorted_values_labels()
    self._make_selectors()
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Equivalent of ``arr.astype(dtype)`` that also covers the special cases
    a plain ``astype`` gets wrong.
    """
    if is_dtype_equal(arr.dtype, dtype):
        return arr

    categorical_to_int = (
        is_categorical_dtype(arr.dtype)
        and isinstance(dtype, np.dtype)
        and np.issubdtype(dtype, np.integer)
    )
    if categorical_to_int:
        # problem case: categorical of int -> gives int as result dtype,
        # but categorical can contain NAs -> fall back to object dtype
        try:
            converted = arr.astype(dtype, copy=False)
        except ValueError:
            converted = arr.astype(object, copy=False)
        return converted

    if is_sparse(arr) and not is_sparse(dtype):
        # problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array
        sparse_arr = cast(SparseArray, arr)
        return sparse_arr.to_dense().astype(dtype, copy=False)

    if (
        isinstance(arr, np.ndarray)
        and arr.dtype.kind in ("m", "M")
        and dtype is np.dtype("object")
    ):
        # wrap datetime-likes in EA to ensure astype(object) gives
        # Timestamp/Timedelta. This can happen when concat_compat is called
        # directly on arrays (when arrays are not coming from
        # Index/Series._values), eg in BlockManager.quantile
        arr = ensure_wrapped_if_datetimelike(arr)

    if isinstance(dtype, ExtensionDtype) and isinstance(arr, np.ndarray):
        # numpy's astype cannot handle ExtensionDtypes
        return pd_array(arr, dtype=dtype, copy=False)

    return arr.astype(dtype, copy=False)
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Equivalent of ``arr.astype(dtype)`` that also covers the special cases
    a plain ``astype`` gets wrong.
    """
    if is_categorical_dtype(arr.dtype):
        if isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.integer):
            # problem case: categorical of int -> gives int as result dtype,
            # but categorical can contain NAs -> fall back to object dtype
            try:
                converted = arr.astype(dtype, copy=False)
            except ValueError:
                converted = arr.astype(object, copy=False)
            return converted

    if is_sparse(arr) and not is_sparse(dtype):
        # problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array
        sparse_arr = cast(SparseArray, arr)
        return sparse_arr.to_dense().astype(dtype, copy=False)

    is_ndarray = isinstance(arr, np.ndarray)
    if (
        is_ndarray
        and arr.dtype.kind in ("m", "M")
        and dtype is np.dtype("object")
    ):
        # wrap datetime-likes in EA to ensure astype(object) gives
        # Timestamp/Timedelta. This can happen when concat_compat is called
        # directly on arrays (when arrays are not coming from
        # Index/Series._values), eg in BlockManager.quantile
        arr = ensure_wrapped_if_datetimelike(arr)

    # re-check the type: the wrapping above may have replaced the ndarray
    if is_extension_array_dtype(dtype) and isinstance(arr, np.ndarray):
        # numpy's astype cannot handle ExtensionDtypes
        return pd_array(arr, dtype=dtype, copy=False)

    # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has incompatible
    # type "Union[dtype[Any], ExtensionDtype]"; expected a numpy dtype-like
    return arr.astype(dtype, copy=False)  # type: ignore[arg-type]
def _get_series_result_type(result):
    """
    Pick the class to build from a Series concat.

    ``result`` is either a dict (Series concatenated along axis 1) or
    array-like.
    """
    if isinstance(result, dict):
        # dict input: the result is frame-like; sparse only if every value is
        every_value_sparse = all(
            is_sparse(col) for col in compat.itervalues(result)
        )
        if every_value_sparse:
            from pandas.core.sparse.api import SparseDataFrame
            return SparseDataFrame

        from pandas.core.frame import DataFrame
        return DataFrame

    # array-like input: the result is series-like
    if is_sparse(result):
        from pandas.core.sparse.api import SparseSeries
        return SparseSeries

    from pandas.core.series import Series
    return Series
def _get_frame_result_type(result, objs):
    """
    Pick the return value for a DataFrame-like concat.

    SparseDataFrame is returned when ``result`` has blocks and either all
    of them are sparse or every input is a SparseDataFrame; otherwise the
    first input that is not a SparseDataFrame is returned.
    """
    def _is_sparse_frame(obj):
        return isinstance(obj, ABCSparseDataFrame)

    blocks = result.blocks
    if blocks:
        if (all(is_sparse(blk) for blk in blocks)
                or all(_is_sparse_frame(obj) for obj in objs)):
            from pandas.core.sparse.api import SparseDataFrame
            return SparseDataFrame

    return next(obj for obj in objs if not _is_sparse_frame(obj))
def _get_frame_result_type(result, objs):
    """
    Pick the return value for a DataFrame-like concat.

    Returns SparseDataFrame when ``result`` has blocks and either all blocks
    are sparse or all inputs are SparseDataFrames; otherwise returns the
    first input that is not a SparseDataFrame.
    """
    blocks = result.blocks
    use_sparse = blocks and (
        all(is_sparse(blk) for blk in blocks)
        or all(isinstance(obj, ABCSparseDataFrame) for obj in objs)
    )
    if use_sparse:
        from pandas.core.sparse.api import SparseDataFrame
        return SparseDataFrame

    non_sparse_inputs = (
        obj for obj in objs if not isinstance(obj, ABCSparseDataFrame)
    )
    return next(non_sparse_inputs)
def quantile_ea_compat(values: ExtensionArray, qs, interpolation: str,
                       axis: int) -> ExtensionArray:
    """
    ExtensionArray compatibility layer for quantile_with_mask.

    A 1D ExtensionArray of shape (N,) is presented to the non-EA code as if
    it had shape (1, N).

    Parameters
    ----------
    values : ExtensionArray
    qs : a scalar or list of the quantiles to be computed
    interpolation: str
    axis : int

    Returns
    -------
    ExtensionArray
    """
    # TODO(EA2D): make-believe not needed with 2D EAs
    ea_type = type(values)

    # asarray needed for Sparse, see GH#24600
    na_mask = np.atleast_2d(np.asarray(values.isna()))

    flat, fill_value = values._values_for_factorize()
    result = quantile_with_mask(
        np.atleast_2d(flat), na_mask, fill_value, qs, interpolation, axis
    )

    if not is_sparse(values.dtype):
        # shape[0] should be 1 as long as EAs are 1D
        if result.ndim == 1:
            # i.e. qs was originally a scalar
            assert result.shape == (1, ), result.shape
            row = result
        else:
            assert result.shape == (1, len(qs)), result.shape
            row = result[0]
        result = ea_type._from_factorized(row, values)

    return result
def _quantile_ea_compat(
    values: ExtensionArray, qs: np.ndarray, interpolation: str
) -> ExtensionArray:
    """
    ExtensionArray compatibility layer for _quantile_with_mask.

    A 1D ExtensionArray of shape (N,) is presented to the non-EA code as if
    it had shape (1, N).

    Parameters
    ----------
    values : ExtensionArray
    qs : np.ndarray[float64]
    interpolation: str

    Returns
    -------
    ExtensionArray
    """
    # TODO(EA2D): make-believe not needed with 2D EAs

    # asarray needed for Sparse, see GH#24600
    mask = np.atleast_2d(np.asarray(values.isna()))

    arr, fill_value = values._values_for_factorize()
    arr = np.atleast_2d(arr)

    result = _quantile_with_mask(arr, mask, fill_value, qs, interpolation)

    if is_sparse(values.dtype):
        # sparse input: the raw result is handed back without rebuilding
        # error: Incompatible return value type (got "ndarray", expected
        # "ExtensionArray")
        return result  # type: ignore[return-value]

    if values.ndim == 2:
        # i.e. DatetimeArray
        return type(values)._from_factorized(result, values)

    # shape[0] should be 1 as long as EAs are 1D
    assert result.shape == (1, len(qs)), result.shape
    return type(values)._from_factorized(result[0], values)
def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
    """
    Reject dtype/operation pairs the cython functions cannot be used for.

    The operation name is read from ``self.how``.

    Parameters
    ----------
    dtype : DtypeObj
    is_numeric : bool, default False
        When True the check is skipped entirely.

    Raises
    ------
    TypeError
        The operation is not supported for this dtype.
    NotImplementedError
        The dtype/operation pair is not handled by the cython path.
    """
    how = self.how

    if is_numeric:
        # fastpath: never an invalid op for those dtypes
        return

    if isinstance(dtype, CategoricalDtype):
        if how in ("add", "prod", "cumsum", "cumprod"):
            raise TypeError(
                f"{dtype} type does not support {how} operations")
        # NotImplementedError for methods that can fall back to a
        # non-cython implementation: only "rank" is implemented in cython,
        # and only for ordered categoricals.
        # TODO: TypeError for the unordered case?
        if how != "rank" or not dtype.ordered:
            raise NotImplementedError(f"{dtype} dtype not supported")
        return

    if is_sparse(dtype):
        raise NotImplementedError(f"{dtype} dtype not supported")

    if is_datetime64_any_dtype(dtype):
        # TODO: same for period_dtype? no for these methods with Period
        # operations that are invalid entirely, e.g. adding datetimes
        if how in ("add", "prod", "cumsum", "cumprod"):
            raise TypeError(
                f"datetime64 type does not support {how} operations")
        return

    if is_timedelta64_dtype(dtype) and how in ("prod", "cumprod"):
        raise TypeError(
            f"timedelta64 type does not support {how} operations")
def _get_sliced_frame_result_type(data, obj):
    """
    Pick the Series class for data sliced out of a frame.

    Parameters
    ----------
    data : array-like
    obj : DataFrame

    Returns
    -------
    SparseSeries when ``data`` is sparse, otherwise
    ``obj._constructor_sliced``.
    """
    if not is_sparse(data):
        return obj._constructor_sliced

    # imported only on the sparse path
    from pandas.core.sparse.api import SparseSeries
    return SparseSeries
def is_na(self) -> bool:
    """
    Report whether this unit contributes only missing values.

    Returns True when there is no block, False when the block cannot hold
    NA or holds sparse values, and otherwise the result of ``isna_all`` on
    the block's values.
    """
    block = self.block
    if block is None:
        return True
    if not block._can_hold_na:
        return False

    values = block.values
    if is_sparse(block.values.dtype):
        return False

    # TODO(EA2D): no need for special case with 2D EAs
    # extension values are passed as they are; anything else is flattened
    values_flat = values if block.is_extension else values.ravel(order="K")

    return isna_all(values_flat)
def get_dtype_kinds(l):
    """
    Collect the kinds present in a list of arrays.

    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """
    def _kind_of(arr):
        # The checks are ordered; the first one that matches wins.
        dtype = arr.dtype
        if is_categorical_dtype(dtype):
            return 'category'
        if is_sparse(arr):
            return 'sparse'
        if isinstance(arr, ABCRangeIndex):
            return 'range'
        if is_datetimetz(arr):
            # if to_concat contains different tz,
            # the result must be object dtype
            return str(arr.dtype)
        if is_datetime64_dtype(dtype):
            return 'datetime'
        if is_timedelta64_dtype(dtype):
            return 'timedelta'
        if is_object_dtype(dtype):
            return 'object'
        if is_bool_dtype(dtype):
            return 'bool'
        if is_period_dtype(dtype) or is_interval_dtype(dtype):
            return str(arr.dtype)
        return dtype.kind

    return set(_kind_of(arr) for arr in l)
def _get_series_result_type(result, objs=None):
    """
    Pick the class to build from a Series concat.

    ``result`` is either a dict (Series concatenated along axis 1) or an
    object exposing ``_block``; in the latter case ``objs`` must be given.
    """
    if isinstance(result, dict):
        # concat Series with axis 1
        every_value_sparse = all(
            is_sparse(col) for col in compat.itervalues(result)
        )
        if every_value_sparse:
            from pandas.core.sparse.api import SparseDataFrame
            return SparseDataFrame

        from pandas.core.frame import DataFrame
        return DataFrame

    # otherwise it is a SingleBlockManager (axis = 0)
    if not result._block.is_sparse:
        return objs[0]._constructor

    from pandas.core.sparse.api import SparseSeries
    return SparseSeries
def __init__(self, values, index, level=-1, value_columns=None,
             fill_value=None):
    """
    Prepare the state needed to unstack ``values`` over one level of
    ``index``.

    Parameters
    ----------
    values : array-like
        1-D input is turned into a single column.
    index : Index
        Must expose ``levels``, ``names`` and ``labels``.
    level : int or label, default -1
    value_columns : optional
        Required when ``values`` has more than one column.
    fill_value : optional

    Raises
    ------
    ValueError
        For multi-column data without ``value_columns``.
    """
    self.is_categorical = None
    self.is_sparse = is_sparse(values)

    if values.ndim == 1:
        if isinstance(values, Categorical):
            self.is_categorical = values
            values = np.array(values)
        elif self.is_sparse:
            # XXX: Makes SparseArray *dense*, but it's supposedly
            # a single column at a time, so it's "doable"
            values = values.values
        values = values[:, np.newaxis]

    self.values = values
    self.value_columns = value_columns
    self.fill_value = fill_value

    if value_columns is None and values.shape[1] != 1:  # pragma: no cover
        raise ValueError('must pass column labels for multi-column data')

    self.index = index
    self.level = self.index._get_level_number(level)

    # when index includes `nan`, need to lift levels/strides by 1
    if -1 in self.index.labels[self.level]:
        self.lift = 1
    else:
        self.lift = 0

    # split the unstacked level (and its name) from the remaining ones
    remaining_levels = list(index.levels)
    remaining_names = list(index.names)
    self.new_index_levels = remaining_levels
    self.new_index_names = remaining_names
    self.removed_name = remaining_names.pop(self.level)
    self.removed_level = remaining_levels.pop(self.level)

    self._make_sorted_values_labels()
    self._make_selectors()
def get_dtype_kinds(l):
    """
    Collect the kinds present in a list of arrays.

    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """
    def _classify(arr):
        # The checks are ordered; the first one that matches wins.
        dtype = arr.dtype
        if is_categorical_dtype(dtype):
            return "category"
        if is_sparse(arr):
            return "sparse"
        if isinstance(arr, ABCRangeIndex):
            return "range"
        if is_datetime64tz_dtype(arr):
            # if to_concat contains different tz,
            # the result must be object dtype
            return str(arr.dtype)
        if is_datetime64_dtype(dtype):
            return "datetime"
        if is_timedelta64_dtype(dtype):
            return "timedelta"
        if is_object_dtype(dtype):
            return "object"
        if is_bool_dtype(dtype):
            return "bool"
        if is_extension_array_dtype(dtype):
            return str(arr.dtype)
        return dtype.kind

    return {_classify(arr) for arr in l}
def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
    """
    Map a dtype to the name of its upcast class.

    The checks are ordered; the first one that matches decides the result.
    """
    if is_categorical_dtype(dtype):
        upcast_cls = "category"
    elif is_datetime64tz_dtype(dtype):
        upcast_cls = "datetimetz"
    elif is_extension_array_dtype(dtype):
        upcast_cls = "extension"
    elif issubclass(dtype.type, np.bool_):
        upcast_cls = "bool"
    elif issubclass(dtype.type, np.object_):
        upcast_cls = "object"
    elif is_datetime64_dtype(dtype):
        upcast_cls = "datetime"
    elif is_timedelta64_dtype(dtype):
        upcast_cls = "timedelta"
    elif is_sparse(dtype):
        # NOTE(review): sparse dtypes are presumably extension dtypes and
        # would then already be caught by the "extension" branch above;
        # confirm whether this branch is reachable
        sparse_dtype = cast("SparseDtype", dtype)
        upcast_cls = sparse_dtype.subtype.name
    elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
        upcast_cls = dtype.name
    else:
        upcast_cls = "float"
    return upcast_cls
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs):
    """
    Run the cython function selected by ``kind`` and ``how`` over ``values``.

    Parameters
    ----------
    kind : str
        Either "transform" or "aggregate" (asserted).
    values : array-like
        Must expose ``ndim`` and ``dtype``; 1-D input is handled as a
        single column.
    how : str
        Name of the operation; also the lookup key into
        ``self._cython_arity`` and ``self._name_functions``.
    axis : int
        For non-1-D input, a value > 0 must be 1 and makes the values be
        transposed before (and swapped back after) the computation.
    min_count : int, default -1
        Forwarded to ``self._aggregate`` only.
    **kwargs
        Forwarded to ``self._transform`` only.

    Returns
    -------
    tuple
        ``(result, names)`` where ``names`` is
        ``self._name_functions[how]()`` when ``how`` is registered there
        and None otherwise.

    Raises
    ------
    NotImplementedError
        For categorical or sparse values, for datetime64/timedelta64 values
        combined with an unsupported ``how``, for an arity above 1 with
        non-1-D values, or when no cython function is available for
        non-numeric values.
    """
    assert kind in ["transform", "aggregate"]

    # can we do this operation with our cython functions
    # if not raise NotImplementedError

    # we raise NotImplemented if this is an invalid operation
    # entirely, e.g. adding datetimes

    # categoricals are only 1d, so we
    # are not setup for dim transforming
    if is_categorical_dtype(values) or is_sparse(values):
        raise NotImplementedError(
            "{} are not support in cython ops".format(values.dtype))
    elif is_datetime64_any_dtype(values):
        if how in ["add", "prod", "cumsum", "cumprod"]:
            raise NotImplementedError(
                "datetime64 type does not support {} "
                "operations".format(how))
    elif is_timedelta64_dtype(values):
        if how in ["prod", "cumprod"]:
            raise NotImplementedError(
                "timedelta64 type does not support {} "
                "operations".format(how))

    # number of output columns per input column; 1 unless registered
    arity = self._cython_arity.get(how, 1)

    vdim = values.ndim
    swapped = False
    if vdim == 1:
        # treat 1-D input as a single column
        values = values[:, None]
        out_shape = (self.ngroups, arity)
    else:
        if axis > 0:
            swapped = True
            assert axis == 1, axis
            values = values.T
        if arity > 1:
            raise NotImplementedError("arity of more than 1 is not "
                                      "supported for the 'how' argument")
        out_shape = (self.ngroups, ) + values.shape[1:]

    is_datetimelike = needs_i8_conversion(values.dtype)
    is_numeric = is_numeric_dtype(values.dtype)

    # coerce the values to a representation the cython functions accept
    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(values.dtype):
        values = ensure_float64(values)
    elif is_integer_dtype(values):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int_or_float(values)
    elif is_numeric and not is_complex_dtype(values):
        values = ensure_float64(values)
    else:
        values = values.astype(object)

    try:
        func = self._get_cython_function(kind, how, values, is_numeric)
    except NotImplementedError:
        # numeric values get one retry as float64; anything else propagates
        if is_numeric:
            values = ensure_float64(values)
            func = self._get_cython_function(kind, how, values, is_numeric)
        else:
            raise

    if how == "rank":
        out_dtype = "float"
    else:
        if is_numeric:
            # same kind and item size as the (possibly coerced) values
            out_dtype = "{kind}{itemsize}".format(
                kind=values.dtype.kind, itemsize=values.dtype.itemsize)
        else:
            out_dtype = "object"

    labels, _, _ = self.group_info

    if kind == "aggregate":
        result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                             fill_value=np.nan)
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(
            result, counts, values, labels, func, is_numeric,
            is_datetimelike, min_count,
        )
    elif kind == "transform":
        result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                             fill_value=np.nan)

        # TODO: min_count
        result = self._transform(result, values, labels, func, is_numeric,
                                 is_datetimelike, **kwargs)

    # integer results that contain the iNaT marker are converted to float64
    # with NaN in those positions
    if is_integer_dtype(result) and not is_datetimelike:
        mask = result == iNaT
        if mask.any():
            result = result.astype("float64")
            result[mask] = np.nan

    # `counts` is only bound on the aggregate path, which the first operand
    # of this condition guarantees
    if kind == "aggregate" and self._filter_empty_groups and not counts.all():
        if result.ndim == 2:
            try:
                result = lib.row_bool_subset(result, (counts > 0).view(np.uint8))
            except ValueError:
                result = lib.row_bool_subset_object(
                    ensure_object(result), (counts > 0).view(np.uint8))
        else:
            result = result[counts > 0]

    # undo the single-column reshape applied to 1-D input
    if vdim == 1 and arity == 1:
        result = result[:, 0]

    if how in self._name_functions:
        # TODO
        names = self._name_functions[how]()
    else:
        names = None

    if swapped:
        result = result.swapaxes(0, axis)

    return result, names
def _get_empty_dtype_and_na(join_units):
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Parameters
    ----------
    join_units : sequence
        Each unit must expose ``block`` (possibly None), ``dtype`` and
        ``is_na``.

    Returns
    -------
    dtype
    na

    Raises
    ------
    AssertionError
        When none of the rules below yields a dtype.
    """
    # a single unit without a block: float64 / NaN
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.float64, np.nan

    if _is_uniform_reindex(join_units):
        # FIXME: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    # collect one dtype per unit; units without a block keep None
    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    # bucket the dtypes by upcast class, keeping all-NA units separate
    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        # ordered checks: the first match decides the class
        if is_categorical_dtype(dtype):
            upcast_cls = "category"
        elif is_datetime64tz_dtype(dtype):
            upcast_cls = "datetimetz"
        elif issubclass(dtype.type, np.bool_):
            upcast_cls = "bool"
        elif issubclass(dtype.type, np.object_):
            upcast_cls = "object"
        elif is_datetime64_dtype(dtype):
            upcast_cls = "datetime"
        elif is_timedelta64_dtype(dtype):
            upcast_cls = "timedelta"
        elif is_sparse(dtype):
            upcast_cls = dtype.subtype.name
        elif is_extension_array_dtype(dtype):
            upcast_cls = "object"
        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
            upcast_cls = dtype.name
        else:
            upcast_cls = "float"

        # Null blocks should not influence upcast class selection, unless there
        # are only null blocks, when same upcasting rules must be applied to
        # null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # TODO: de-duplicate with maybe_promote?
    # create the result
    # (classes are tested in priority order; the first one present wins)
    if "object" in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif "bool" in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif "category" in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif "datetimetz" in upcast_classes:
        # GH-25014. We use NaT instead of iNaT, since this eventually
        # ends up in DatetimeArray.take, which does not allow iNaT.
        dtype = upcast_classes["datetimetz"]
        return dtype[0], tslibs.NaT
    elif "datetime" in upcast_classes:
        return np.dtype("M8[ns]"), np.datetime64("NaT", "ns")
    elif "timedelta" in upcast_classes:
        return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")
    else:  # pragma
        try:
            # iterating the mapping yields its keys, i.e. the class names
            # collected above
            g = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_), np.nan
        else:
            if is_float_dtype(g):
                return g, g.type(np.nan)
            elif is_numeric_dtype(g):
                if has_none_blocks:
                    return np.float64, np.nan
                else:
                    return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
def get_empty_dtype_and_na(join_units):
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Parameters
    ----------
    join_units : sequence
        Each unit must expose ``block`` (possibly None), ``dtype`` and
        ``is_na``.

    Returns
    -------
    dtype
    na

    Raises
    ------
    AssertionError
        When none of the rules below yields a dtype.
    """
    # a single unit without a block: float64 / NaN
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.float64, np.nan

    if is_uniform_reindex(join_units):
        # XXX: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    # collect one dtype per unit; units without a block keep None
    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    # bucket the dtypes by upcast class, keeping all-NA units separate
    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        # ordered checks: the first match decides the class
        if is_categorical_dtype(dtype):
            upcast_cls = 'category'
        elif is_datetimetz(dtype):
            upcast_cls = 'datetimetz'
        elif issubclass(dtype.type, np.bool_):
            upcast_cls = 'bool'
        elif issubclass(dtype.type, np.object_):
            upcast_cls = 'object'
        elif is_datetime64_dtype(dtype):
            upcast_cls = 'datetime'
        elif is_timedelta64_dtype(dtype):
            upcast_cls = 'timedelta'
        elif is_sparse(dtype):
            upcast_cls = dtype.subtype.name
        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
            upcast_cls = dtype.name
        else:
            upcast_cls = 'float'

        # Null blocks should not influence upcast class selection, unless there
        # are only null blocks, when same upcasting rules must be applied to
        # null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # create the result
    # (classes are tested in priority order; the first one present wins)
    if 'object' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'bool' in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif 'category' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'datetimetz' in upcast_classes:
        # the first collected tz-aware dtype is used as the result dtype
        dtype = upcast_classes['datetimetz']
        return dtype[0], tslibs.iNaT
    elif 'datetime' in upcast_classes:
        return np.dtype('M8[ns]'), tslibs.iNaT
    elif 'timedelta' in upcast_classes:
        return np.dtype('m8[ns]'), tslibs.iNaT
    else:  # pragma
        try:
            # iterating the mapping yields its keys, i.e. the class names
            # collected above
            g = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_), np.nan
        else:
            if is_float_dtype(g):
                return g, g.type(np.nan)
            elif is_numeric_dtype(g):
                if has_none_blocks:
                    return np.float64, np.nan
                else:
                    return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
def _cython_operation(
    self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs
) -> Tuple[np.ndarray, Optional[List[str]]]:
    """
    Returns the values of a cython operation as a Tuple of [data, names].

    Names is only useful when dealing with 2D results, like ohlc
    (see self._name_functions).

    Parameters
    ----------
    kind : str
        Either "transform" or "aggregate" (asserted).
    values : array-like
        Data to operate on; at most 2 dimensions are supported.
    how : str
        Name of the operation. It is used to look up the arity in
        ``self._cython_arity`` (default 1), the result names in
        ``self._name_functions`` and the cython function itself.
    axis : int
        Must be 1 when ``values`` is 2-dimensional (asserted).
    min_count : int, default -1
        Forwarded to ``self._aggregate``; not used for "transform".
    **kwargs
        Forwarded to ``self._transform``.

    Returns
    -------
    result
        Output of the operation, post-processed as described inline.
    names : list of str or None
        ``self._name_functions.get(how)``.

    Raises
    ------
    NotImplementedError
        If ``values`` has more than 2 dimensions, is categorical or sparse,
        is datetime64 with how in add/prod/cumsum/cumprod, is timedelta64
        with how in prod/cumprod, or is 2-dimensional while the arity of
        ``how`` exceeds 1.
    """
    assert kind in ["transform", "aggregate"]
    # Kept untouched so the original dtype can be restored at the end.
    orig_values = values

    if values.ndim > 2:
        raise NotImplementedError("number of dimensions is currently limited to 2")
    elif values.ndim == 2:
        # Note: it is *not* the case that axis is always 0 for 1-dim values,
        #  as we can have 1D ExtensionArrays that we need to treat as 2D
        assert axis == 1, axis

    # can we do this operation with our cython functions
    # if not raise NotImplementedError

    # we raise NotImplemented if this is an invalid operation
    # entirely, e.g. adding datetimes

    # categoricals are only 1d, so we
    # are not setup for dim transforming
    if is_categorical_dtype(values) or is_sparse(values):
        raise NotImplementedError(f"{values.dtype} dtype not supported")
    elif is_datetime64_any_dtype(values):
        if how in ["add", "prod", "cumsum", "cumprod"]:
            raise NotImplementedError(
                f"datetime64 type does not support {how} operations"
            )
    elif is_timedelta64_dtype(values):
        if how in ["prod", "cumprod"]:
            raise NotImplementedError(
                f"timedelta64 type does not support {how} operations"
            )

    if is_datetime64tz_dtype(values.dtype):
        # Cast to naive; we'll cast back at the end of the function
        # TODO: possible need to reshape? kludge can be avoided when
        #  2D EA is allowed.
        values = values.view("M8[ns]")

    # Both flags are computed after the tz-aware view above, i.e. from the
    # dtype that is actually handed to the cython function.
    is_datetimelike = needs_i8_conversion(values.dtype)
    is_numeric = is_numeric_dtype(values.dtype)

    # Coerce the input to a dtype the cython functions can work with.
    if is_datetimelike:
        # datetimelike data is processed through its int64 view
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(values.dtype):
        values = ensure_float64(values)
    elif is_integer_dtype(values):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int_or_float(values)
    elif is_numeric and not is_complex_dtype(values):
        values = ensure_float64(values)
    else:
        values = values.astype(object)

    arity = self._cython_arity.get(how, 1)

    # Remember the input dimensionality; `values` is made 2-D below and the
    # result is reduced again further down based on `vdim`.
    vdim = values.ndim
    swapped = False
    if vdim == 1:
        # 1-D input gets a trailing axis of length 1
        values = values[:, None]
        out_shape = (self.ngroups, arity)
    else:
        if axis > 0:
            # transpose now, swap the axes back on the result later
            swapped = True
            assert axis == 1, axis
            values = values.T
        if arity > 1:
            raise NotImplementedError(
                "arity of more than 1 is not supported for the 'how' argument"
            )
        out_shape = (self.ngroups,) + values.shape[1:]

    # The lookup may hand back a different `values` object, so rebind it.
    func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)

    if how == "rank":
        out_dtype = "float"
    else:
        if is_numeric:
            # e.g. "f8" / "i8": dtype kind followed by item size in bytes
            out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
        else:
            out_dtype = "object"

    codes, _, _ = self.group_info

    if kind == "aggregate":
        result = _maybe_fill(
            np.empty(out_shape, dtype=out_dtype), fill_value=np.nan
        )
        # presumably filled with per-group observation counts by
        # self._aggregate -- it is consulted below to drop empty groups
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(result, counts, values, codes, func, min_count)
    elif kind == "transform":
        result = _maybe_fill(
            np.empty_like(values, dtype=out_dtype), fill_value=np.nan
        )

        # TODO: min_count
        result = self._transform(
            result, values, codes, func, is_datetimelike, **kwargs
        )

    if is_integer_dtype(result) and not is_datetimelike:
        # iNaT entries in an integer result are turned into NaN, which
        # requires a float result
        mask = result == iNaT
        if mask.any():
            result = result.astype("float64")
            result[mask] = np.nan
    elif (
        how == "add"
        and is_integer_dtype(orig_values.dtype)
        and is_extension_array_dtype(orig_values.dtype)
    ):
        # We need this to ensure that Series[Int64Dtype].resample().sum()
        # remains int64 dtype.
        # Two options for avoiding this special case
        # 1. mask-aware ops and avoid casting to float with NaN above
        # 2. specify the result dtype when calling this method
        result = result.astype("int64")

    if kind == "aggregate" and self._filter_empty_groups and not counts.all():
        # keep only the rows whose count is non-zero
        assert result.ndim != 2
        result = result[counts > 0]

    if vdim == 1 and arity == 1:
        # undo the trailing axis added for 1-D input
        result = result[:, 0]

    names: Optional[List[str]] = self._name_functions.get(how, None)

    if swapped:
        result = result.swapaxes(0, axis)

    if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype(
        orig_values.dtype
    ):
        # We need to use the constructors directly for these dtypes
        # since numpy won't recognize them
        # https://github.com/pandas-dev/pandas/issues/31471
        result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
    elif is_datetimelike and kind == "aggregate":
        result = result.astype(orig_values.dtype)

    return result, names
def get_empty_dtype_and_na(join_units):
    """
    Pick the result dtype and the N/A marker for concatenating join units.

    The N/A marker may be None, which signals that no casting was involved.

    Parameters
    ----------
    join_units : sequence
        Units to concatenate; ``block``, ``dtype`` and ``is_na`` are read
        from each of them.

    Returns
    -------
    dtype
    na

    Raises
    ------
    AssertionError
        If none of the rules below yields a dtype.
    """

    def _upcast_class_of(dtype):
        # Map a dtype onto the label used to group it for upcasting.
        # The order of the checks matters: the first match wins.
        if is_categorical_dtype(dtype):
            return 'category'
        if is_datetime64tz_dtype(dtype):
            return 'datetimetz'
        if issubclass(dtype.type, np.bool_):
            return 'bool'
        if issubclass(dtype.type, np.object_):
            return 'object'
        if is_datetime64_dtype(dtype):
            return 'datetime'
        if is_timedelta64_dtype(dtype):
            return 'timedelta'
        if is_sparse(dtype):
            return dtype.subtype.name
        if is_extension_array_dtype(dtype):
            # any other extension dtype is grouped with object
            return 'object'
        if is_float_dtype(dtype) or is_numeric_dtype(dtype):
            return dtype.name
        return 'float'

    # A lone unit without a block is treated as all-missing float data.
    if len(join_units) == 1 and join_units[0].block is None:
        return np.float64, np.nan

    if is_uniform_reindex(join_units):
        # XXX: integrate property
        first_block = join_units[0].block
        return first_block.dtype, first_block.fill_value

    # Collect one dtype per unit; units without a block contribute None.
    has_none_blocks = False
    dtypes = []
    for unit in join_units:
        if unit.block is None:
            has_none_blocks = True
            dtypes.append(None)
        else:
            dtypes.append(unit.dtype)

    # Null units must not steer the choice of upcast class unless every
    # unit is null; they are therefore bucketed separately and only used
    # as a fallback.
    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue
        upcast_cls = _upcast_class_of(dtype)
        bucket = null_upcast_classes if unit.is_na else upcast_classes
        bucket[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # Resolve the result, highest-priority class first.
    if 'object' in upcast_classes:
        return np.dtype(np.object_), np.nan
    if 'bool' in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        return np.dtype(np.bool_), None
    if 'category' in upcast_classes:
        return np.dtype(np.object_), np.nan
    if 'datetimetz' in upcast_classes:
        return upcast_classes['datetimetz'][0], tslibs.iNaT
    if 'datetime' in upcast_classes:
        return np.dtype('M8[ns]'), tslibs.iNaT
    if 'timedelta' in upcast_classes:
        return np.dtype('m8[ns]'), tslibs.iNaT

    # Remaining classes are keyed by dtype name, so the keys themselves are
    # handed to NumPy to find a common type.
    # NOTE(review): np.find_common_type is deprecated in recent NumPy
    # releases -- confirm the NumPy range this file has to support.
    try:
        g = np.find_common_type(upcast_classes, [])
    except TypeError:
        # At least one is an ExtensionArray
        return np.dtype(np.object_), np.nan

    if is_float_dtype(g):
        return g, g.type(np.nan)
    if is_numeric_dtype(g):
        if has_none_blocks:
            return np.float64, np.nan
        return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
def _concat_sparse(to_concat, axis=0, typs=None):
    """
    Concatenate a mix of sparse and dense arrays, each holding a single dtype.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes
        Computed with ``get_dtype_kinds(to_concat)`` when None.

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    from pandas.core.sparse.array import SparseArray, _make_index

    def _densify(arr):
        # Sparse inputs are replaced by their dense values; every input is
        # then flattened and, for axis > 0, lifted to at least 2-D.
        dense = arr.get_values() if isinstance(arr, SparseArray) else arr
        dense = dense.ravel()
        return np.atleast_2d(dense) if axis > 0 else dense

    if typs is None:
        typs = get_dtype_kinds(to_concat)

    if len(typs) == 1:
        # All inputs are of one kind: if they also share a single
        # fill_value they can be stitched together without densifying.
        distinct_fills = {arr.fill_value for arr in to_concat}
        if len(distinct_fills) == 1:
            value_chunks = [arr.sp_values for arr in to_concat]
            int_indexes = [arr.sp_index.to_int_index() for arr in to_concat]

            # Shift every input's sparse positions by the total length of
            # the inputs that precede it.
            shifted = []
            offset = 0
            for int_index in int_indexes:
                shifted.append(int_index.indices + offset)
                offset += int_index.length

            merged_values = np.concatenate(value_chunks)
            merged_indices = np.concatenate(shifted)
            merged_index = _make_index(offset, merged_indices,
                                       kind=to_concat[0].sp_index)

            return SparseArray(merged_values, sparse_index=merged_index,
                               fill_value=to_concat[0].fill_value)

    # From here on the input may mix sparse and dense arrays and may carry
    # different fill_values; at least one input has to be sparse.
    sparses = [arr for arr in to_concat if is_sparse(arr)]
    fill_values = [arr.fill_value for arr in sparses]
    sp_indexes = [arr.sp_index for arr in sparses]

    # densify and regular concat
    dense_parts = [_densify(arr) for arr in to_concat]
    result = np.concatenate(dense_parts, axis=axis)

    if typs - {'sparse', 'f', 'i'}:
        # something other than sparse / float / int is involved:
        # coerce to object
        return result.astype('object')

    # Only sparse and dense numerics: sparsify again, using the first
    # sparse input's fill_value and SparseIndex.
    return SparseArray(result.ravel(), fill_value=fill_values[0],
                       kind=sp_indexes[0])
def _try_cast(
    arr: Union[list, np.ndarray],
    dtype: Optional[DtypeObj],
    copy: bool,
    raise_cast_failure: bool,
) -> ArrayLike:
    """
    Turn a list or ndarray into an array, casting to ``dtype`` when given.

    Parameters
    ----------
    arr : ndarray or list
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    is_ndarray = isinstance(arr, np.ndarray)

    # Fast path for the most common case: a castable ndarray that needs
    # neither a copy nor a dtype change is handed back untouched.
    if is_ndarray and maybe_castable(arr.dtype) and not copy and dtype is None:
        # error: Incompatible return value type (got "ndarray", expected
        # "ExtensionArray")
        return arr  # type: ignore[return-value]

    if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)):
        # Build the extension array straight from its dtype.
        # DatetimeTZ has to go through maybe_cast_to_datetime below, but
        # SparseDtype does not.
        from_sequence = dtype.construct_array_type()._from_sequence
        return from_sequence(arr, dtype=dtype, copy=copy)

    if is_object_dtype(dtype) and not is_ndarray:
        return construct_1d_object_array_from_listlike(arr)

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # Called for validation only; this will raise if we have
            # e.g. floats.
            # error: Argument 2 to "maybe_cast_to_integer_array" has
            # incompatible type "Union[dtype, ExtensionDtype, None]";
            # expected "Union[ExtensionDtype, str, dtype, Type[str],
            # Type[float], Type[int], Type[complex], Type[bool],
            # Type[object]]"
            maybe_cast_to_integer_array(arr, dtype)  # type: ignore[arg-type]
            subarr = arr
        else:
            subarr = maybe_cast_to_datetime(arr, dtype)
            if dtype is not None and dtype.kind == "M":
                return subarr

        if not isinstance(subarr, ABCExtensionArray):
            subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy)
    except OutOfBoundsDatetime:
        # in case of out of bound datetime64 -> always raise
        raise
    except (ValueError, TypeError) as err:
        caller_wants_errors = dtype is not None and raise_cast_failure
        # "Cannot cast" comes via _disallow_mismatched_datetimelike and is
        # re-raised regardless of raise_cast_failure.
        if caller_wants_errors or "Cannot cast" in str(err):
            raise
        # Otherwise fall back to an object array.
        return np.array(arr, dtype=object, copy=copy)

    return subarr
def _cython_operation(self, kind: str, values, how, axis, min_count=-1, **kwargs):
    """
    Run the cython operation `how` on `values` and return (result, names).

    Parameters
    ----------
    kind : str
        Either "transform" or "aggregate" (asserted).
    values : array-like
        Data to operate on.
    how : str
        Name of the operation. It is used to look up the arity in
        ``self._cython_arity`` (default 1), the result names in
        ``self._name_functions`` and the cython function itself.
    axis : int
        For input with more than one dimension, a value > 0 must be 1
        (asserted); the input is then transposed before the operation.
    min_count : int, default -1
        Forwarded to ``self._aggregate``; not used for "transform".
    **kwargs
        Forwarded to ``self._transform``.

    Returns
    -------
    result
        Output of the operation, post-processed as described inline.
    names : list of str or None
        ``self._name_functions[how]()`` if `how` is registered there,
        otherwise None.

    Raises
    ------
    NotImplementedError
        If ``values`` is categorical or sparse, is datetime64 with how in
        add/prod/cumsum/cumprod, is timedelta64 with how in prod/cumprod,
        is not 1-dimensional while the arity of ``how`` exceeds 1, or if
        no cython function is available for non-numeric values.
    """
    assert kind in ["transform", "aggregate"]
    # Kept untouched so the original dtype can be restored at the end.
    orig_values = values

    # can we do this operation with our cython functions
    # if not raise NotImplementedError

    # we raise NotImplemented if this is an invalid operation
    # entirely, e.g. adding datetimes

    # categoricals are only 1d, so we
    # are not setup for dim transforming
    if is_categorical_dtype(values) or is_sparse(values):
        raise NotImplementedError(
            "{dtype} dtype not supported".format(dtype=values.dtype))
    elif is_datetime64_any_dtype(values):
        if how in ["add", "prod", "cumsum", "cumprod"]:
            raise NotImplementedError(
                "datetime64 type does not support {how} operations".format(
                    how=how))
    elif is_timedelta64_dtype(values):
        if how in ["prod", "cumprod"]:
            raise NotImplementedError(
                "timedelta64 type does not support {how} operations".
                format(how=how))

    if is_datetime64tz_dtype(values.dtype):
        # Cast to naive; we'll cast back at the end of the function
        # TODO: possible need to reshape?  kludge can be avoided when
        #  2D EA is allowed.
        values = values.view("M8[ns]")

    # Both flags are computed after the tz-aware view above, i.e. from the
    # dtype that is actually handed to the cython function.
    is_datetimelike = needs_i8_conversion(values.dtype)
    is_numeric = is_numeric_dtype(values.dtype)

    # Coerce the input to a dtype the cython functions can work with.
    if is_datetimelike:
        # datetimelike data is processed through its int64 view
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(values.dtype):
        values = ensure_float64(values)
    elif is_integer_dtype(values):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int_or_float(values)
    elif is_numeric and not is_complex_dtype(values):
        values = ensure_float64(values)
    else:
        values = values.astype(object)

    arity = self._cython_arity.get(how, 1)

    # Remember the input dimensionality; `values` is made 2-D below and the
    # result is reduced again further down based on `vdim`.
    vdim = values.ndim
    swapped = False
    if vdim == 1:
        # 1-D input gets a trailing axis of length 1
        values = values[:, None]
        out_shape = (self.ngroups, arity)
    else:
        if axis > 0:
            # transpose now, swap the axes back on the result later
            swapped = True
            assert axis == 1, axis
            values = values.T
        if arity > 1:
            raise NotImplementedError(
                "arity of more than 1 is not supported for the 'how' argument"
            )
        out_shape = (self.ngroups, ) + values.shape[1:]

    try:
        func = self._get_cython_function(kind, how, values, is_numeric)
    except NotImplementedError:
        # No function for the current dtype: for numeric data retry once
        # after casting to float64 (or to complex when the values are
        # inferred as complex); anything else is re-raised.
        if is_numeric:
            try:
                values = ensure_float64(values)
            except TypeError:
                if lib.infer_dtype(values, skipna=False) == "complex":
                    values = values.astype(complex)
                else:
                    raise
            func = self._get_cython_function(kind, how, values, is_numeric)
        else:
            raise

    if how == "rank":
        out_dtype = "float"
    else:
        if is_numeric:
            # e.g. "f8" / "i8": dtype kind followed by item size in bytes
            out_dtype = "{kind}{itemsize}".format(
                kind=values.dtype.kind, itemsize=values.dtype.itemsize)
        else:
            out_dtype = "object"

    labels, _, _ = self.group_info

    if kind == "aggregate":
        result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                             fill_value=np.nan)
        # presumably filled with per-group observation counts by
        # self._aggregate -- it is consulted below to drop empty groups
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(result, counts, values, labels, func,
                                 is_datetimelike, min_count)
    elif kind == "transform":
        result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                             fill_value=np.nan)

        # TODO: min_count
        result = self._transform(result, values, labels, func,
                                 is_datetimelike, **kwargs)

    if is_integer_dtype(result) and not is_datetimelike:
        # iNaT entries in an integer result are turned into NaN, which
        # requires a float result
        mask = result == iNaT
        if mask.any():
            result = result.astype("float64")
            result[mask] = np.nan

    if kind == "aggregate" and self._filter_empty_groups and not counts.all(
    ):
        # keep only the rows whose count is non-zero
        assert result.ndim != 2
        result = result[counts > 0]

    if vdim == 1 and arity == 1:
        # undo the trailing axis added for 1-D input
        result = result[:, 0]

    if how in self._name_functions:
        # entries of self._name_functions are callables producing the names
        names = self._name_functions[how]()  # type: Optional[List[str]]
    else:
        names = None

    if swapped:
        result = result.swapaxes(0, axis)

    if is_datetime64tz_dtype(orig_values.dtype):
        # rebuild through the original array type so the tz-aware dtype
        # dropped by the naive view above is restored
        result = type(orig_values)(result.astype(np.int64),
                                   dtype=orig_values.dtype)
    elif is_datetimelike and kind == "aggregate":
        result = result.astype(orig_values.dtype)

    return result, names