def _create_from_codes(self, codes, categories=None, ordered=None, name=None): """ *this is an internal non-public method* create the correct categorical from codes Parameters ---------- codes : new codes categories : optional categories, defaults to existing ordered : optional ordered attribute, defaults to existing name : optional name attribute, defaults to existing Returns ------- CategoricalIndex """ from pandas.core.arrays import Categorical if categories is None: categories = self.categories if ordered is None: ordered = self.ordered if name is None: name = self.name cat = Categorical.from_codes(codes, categories=categories, ordered=self.ordered) return CategoricalIndex(cat, name=name)
def _create_categorical(self, data, categories=None, ordered=None, dtype=None): """ *this is an internal non-public method* create the correct categorical from data and the properties Parameters ---------- data : data for new Categorical categories : optional categories, defaults to existing ordered : optional ordered attribute, defaults to existing dtype : CategoricalDtype, defaults to existing Returns ------- Categorical """ if (isinstance(data, (ABCSeries, type(self))) and is_categorical_dtype(data)): data = data.values if not isinstance(data, ABCCategorical): if ordered is None and dtype is None: ordered = False from pandas.core.arrays import Categorical data = Categorical(data, categories=categories, ordered=ordered, dtype=dtype) else: if categories is not None: data = data.set_categories(categories, ordered=ordered) elif ordered is not None and ordered != data.ordered: data = data.set_ordered(ordered) if isinstance(dtype, CategoricalDtype): # we want to silently ignore dtype='category' data = data._set_dtype(dtype) return data
def groups(self): return self.index.groupby(Categorical.from_codes(self.labels, self.group_index))
def __init__( self, index, grouper=None, obj=None, name=None, level=None, sort=True, observed=False, in_axis=False, ): self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) self.all_grouper = None self.index = index self.sort = sort self.obj = obj self.observed = observed self.in_axis = in_axis # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name if isinstance(grouper, MultiIndex): self.grouper = grouper.values # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level if level is not None: if not isinstance(level, int): if level not in index.names: raise AssertionError("Level {} not in index".format(level)) level = index.names.index(level) if self.name is None: self.name = index.names[level] self.grouper, self._labels, self._group_index = index._get_grouper_for_level( # noqa: E501 self.grouper, level) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get labels elif isinstance(self.grouper, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False) if self.name is None: self.name = grouper.result_index.name self.obj = self.grouper.obj self.grouper = grouper._get_grouper() else: if self.grouper is None and self.name is not None: self.grouper = self.obj[self.name] elif isinstance(self.grouper, (list, tuple)): self.grouper = com.asarray_tuplesafe(self.grouper) # a passed Categorical elif is_categorical_dtype(self.grouper): self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed) categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes self._labels = self.grouper.codes if observed: codes = algorithms.unique1d(self.grouper.codes) codes = codes[codes != -1] if sort or self.grouper.ordered: codes = np.sort(codes) else: codes = np.arange(len(categories)) self._group_index = CategoricalIndex( Categorical.from_codes(codes=codes, categories=categories, ordered=self.grouper.ordered)) # we are done if isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)): if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError( "Grouper for '{}' not 1-dimensional".format(t)) self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): errmsg = ("Grouper result violates len(labels) == " "len(data)\nresult: %s" % pprint_thing(self.grouper)) self.grouper = None # Try for sanity raise AssertionError(errmsg) # if we have a date/time-like grouper, make sure that we have # Timestamps like if getattr(self.grouper, "dtype", None) is not None: if is_datetime64_dtype(self.grouper): self.grouper = self.grouper.astype("datetime64[ns]") elif is_timedelta64_dtype(self.grouper): self.grouper = self.grouper.astype("timedelta64[ns]")
def __init__(self, index, grouper=None, obj=None, name=None, level=None, sort=True, observed=False, in_axis=False): self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) self.all_grouper = None self.index = index self.sort = sort self.obj = obj self.observed = observed self.in_axis = in_axis # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name if isinstance(grouper, MultiIndex): self.grouper = grouper.values # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level if level is not None: if not isinstance(level, int): if level not in index.names: raise AssertionError('Level {} not in index'.format(level)) level = index.names.index(level) if self.name is None: self.name = index.names[level] self.grouper, self._labels, self._group_index = \ index._get_grouper_for_level(self.grouper, level) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get labels elif isinstance(self.grouper, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False) if self.name is None: self.name = grouper.result_index.name self.obj = self.grouper.obj self.grouper = grouper._get_grouper() else: if self.grouper is None and self.name is not None: self.grouper = self.obj[self.name] elif isinstance(self.grouper, (list, tuple)): self.grouper = com.asarray_tuplesafe(self.grouper) # a passed Categorical elif is_categorical_dtype(self.grouper): from pandas.core.groupby.categorical import recode_for_groupby self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed) categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes self._labels = self.grouper.codes if observed: codes = algorithms.unique1d(self.grouper.codes) codes = codes[codes != -1] if sort or self.grouper.ordered: codes = np.sort(codes) else: codes = np.arange(len(categories)) self._group_index = CategoricalIndex( Categorical.from_codes( codes=codes, categories=categories, ordered=self.grouper.ordered)) # we are done if isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)): if getattr(self.grouper, 'ndim', 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError( "Grouper for '{}' not 1-dimensional".format(t)) self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): errmsg = ('Grouper result violates len(labels) == ' 'len(data)\nresult: %s' % pprint_thing(self.grouper)) self.grouper = None # Try for sanity raise AssertionError(errmsg) # if we have a date/time-like grouper, make sure that we have # Timestamps like if getattr(self.grouper, 'dtype', None) is not None: if is_datetime64_dtype(self.grouper): from pandas import to_datetime self.grouper = to_datetime(self.grouper) elif is_timedelta64_dtype(self.grouper): from pandas import to_timedelta self.grouper = to_timedelta(self.grouper)
def groups(self) -> dict[Hashable, np.ndarray]: return self.index.groupby( Categorical.from_codes(self.codes, self.group_index))
def lexsort_indexer(keys, orders=None, na_position: str = "last", key: Optional[Callable] = None): """ Performs lexical sorting on a set of keys Parameters ---------- keys : sequence of arrays Sequence of ndarrays to be sorted by the indexer orders : boolean or list of booleans, optional Determines the sorting order for each element in keys. If a list, it must be the same length as keys. This determines whether the corresponding element in keys should be sorted in ascending (True) or descending (False) order. if bool, applied to all elements as above. if None, defaults to True. na_position : {'first', 'last'}, default 'last' Determines placement of NA elements in the sorted list ("last" or "first") key : Callable, optional Callable key function applied to every element in keys before sorting .. versionadded:: 1.0.0 """ from pandas.core.arrays import Categorical labels = [] shape = [] if isinstance(orders, bool): orders = [orders] * len(keys) elif orders is None: orders = [True] * len(keys) keys = [ensure_key_mapped(k, key) for k in keys] for k, order in zip(keys, orders): # we are already a Categorical if is_categorical_dtype(k): cat = k # create the Categorical else: cat = Categorical(k, ordered=True) if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {na_position}") n = len(cat.categories) codes = cat.codes.copy() mask = cat.codes == -1 if order: # ascending if na_position == "last": codes = np.where(mask, n, codes) elif na_position == "first": codes += 1 else: # not order means descending if na_position == "last": codes = np.where(mask, n, n - codes - 1) elif na_position == "first": codes = np.where(mask, 0, n - codes) if mask.any(): n += 1 shape.append(n) labels.append(codes) return indexer_from_factorized(labels, shape)
def _cast_types(self, values, cast_type, column): """ Cast values to specified type Parameters ---------- values : ndarray cast_type : string or np.dtype dtype to cast values to column : string column name - used only for error reporting Returns ------- converted : ndarray """ if is_categorical_dtype(cast_type): known_cats = (isinstance(cast_type, CategoricalDtype) and cast_type.categories is not None) if not is_object_dtype(values) and not known_cats: # TODO: this is for consistency with # c-parser which parses all categories # as strings values = astype_nansafe(values, np.dtype(str)) cats = Index(values).unique().dropna() values = Categorical._from_inferred_categories( cats, cats.get_indexer(values), cast_type, true_values=self.true_values) # use the EA's implementation of casting elif is_extension_array_dtype(cast_type): # ensure cast_type is an actual dtype and not a string cast_type = pandas_dtype(cast_type) array_type = cast_type.construct_array_type() try: if is_bool_dtype(cast_type): return array_type._from_sequence_of_strings( values, dtype=cast_type, true_values=self.true_values, false_values=self.false_values, ) else: return array_type._from_sequence_of_strings( values, dtype=cast_type) except NotImplementedError as err: raise NotImplementedError( f"Extension Array: {array_type} must implement " "_from_sequence_of_strings in order to be used in parser methods" ) from err else: try: values = astype_nansafe(values, cast_type, copy=True, skipna=True) except ValueError as err: raise ValueError( f"Unable to convert column {column} to type {cast_type}" ) from err return values
def groups(self) -> dict: return self.index.groupby( Categorical.from_codes(self.codes, self.group_index))
def lexsort_indexer(keys, orders=None, na_position: str = "last", key: Callable | None = None) -> npt.NDArray[np.intp]: """ Performs lexical sorting on a set of keys Parameters ---------- keys : sequence of arrays Sequence of ndarrays to be sorted by the indexer orders : bool or list of booleans, optional Determines the sorting order for each element in keys. If a list, it must be the same length as keys. This determines whether the corresponding element in keys should be sorted in ascending (True) or descending (False) order. if bool, applied to all elements as above. if None, defaults to True. na_position : {'first', 'last'}, default 'last' Determines placement of NA elements in the sorted list ("last" or "first") key : Callable, optional Callable key function applied to every element in keys before sorting .. versionadded:: 1.0.0 Returns ------- np.ndarray[np.intp] """ from pandas.core.arrays import Categorical labels = [] shape = [] if isinstance(orders, bool): orders = [orders] * len(keys) elif orders is None: orders = [True] * len(keys) keys = [ensure_key_mapped(k, key) for k in keys] for k, order in zip(keys, orders): with warnings.catch_warnings(): # TODO(2.0): unnecessary once deprecation is enforced # GH#45618 don't issue warning user can't do anything about warnings.filterwarnings("ignore", ".*SparseArray.*", category=FutureWarning) cat = Categorical(k, ordered=True) if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {na_position}") n = len(cat.categories) codes = cat.codes.copy() mask = cat.codes == -1 if order: # ascending if na_position == "last": codes = np.where(mask, n, codes) elif na_position == "first": codes += 1 else: # not order means descending if na_position == "last": codes = np.where(mask, n, n - codes - 1) elif na_position == "first": codes = np.where(mask, 0, n - codes) if mask.any(): n += 1 shape.append(n) labels.append(codes) return indexer_from_factorized(labels, tuple(shape))
def __init__( self, index: Index, grouper=None, obj: FrameOrSeries | None = None, name: Hashable = None, level=None, sort: bool = True, observed: bool = False, in_axis: bool = False, dropna: bool = True, ): self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) self.all_grouper = None self.index = index self.sort = sort self.obj = obj self.observed = observed self.in_axis = in_axis self.dropna = dropna # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level if level is not None: if not isinstance(level, int): if level not in index.names: raise AssertionError(f"Level {level} not in index") level = index.names.index(level) if self.name is None: self.name = index.names[level] ( self.grouper, self._codes, self._group_index, ) = index._get_grouper_for_level(self.grouper, level) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes elif isinstance(self.grouper, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) _, grouper, _ = self.grouper._get_grouper( # error: Value of type variable "FrameOrSeries" of "_get_grouper" # of "Grouper" cannot be "Optional[FrameOrSeries]" self.obj, # type: ignore[type-var] validate=False, ) if self.name is None: self.name = grouper.result_index.name self.obj = self.grouper.obj self.grouper = grouper._get_grouper() else: # a passed Categorical if is_categorical_dtype(self.grouper): self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed) categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes self._codes = self.grouper.codes if observed: codes = algorithms.unique1d(self.grouper.codes) codes = codes[codes != -1] if sort or self.grouper.ordered: codes = np.sort(codes) else: codes = np.arange(len(categories)) self._group_index = CategoricalIndex( Categorical.from_codes(codes=codes, categories=categories, ordered=self.grouper.ordered), name=self.name, ) # we are done elif isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)): if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): grper = pprint_thing(self.grouper) errmsg = ("Grouper result violates len(labels) == " f"len(data)\nresult: {grper}") self.grouper = None # Try for sanity raise AssertionError(errmsg) if isinstance(self.grouper, np.ndarray): # if we have a date/time-like grouper, make sure that we have # Timestamps like self.grouper = sanitize_to_nanoseconds(self.grouper)