def get_indexer(self, target, method=None, limit=None, tolerance=None): from pandas.core.arrays.categorical import _recode_for_categories method = missing.clean_reindex_fill_method(method) target = ibase.ensure_index(target) if self.is_unique and self.equals(target): return np.arange(len(self), dtype='intp') if method == 'pad' or method == 'backfill': raise NotImplementedError("method='pad' and method='backfill' not " "implemented yet for CategoricalIndex") elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for CategoricalIndex') if (isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target)): if self.values.equals(target.values): # we have the same codes codes = target.codes else: codes = _recode_for_categories(target.codes, target.categories, self.values.categories) else: if isinstance(target, CategoricalIndex): code_indexer = self.categories.get_indexer(target.categories) codes = take_1d(code_indexer, target.codes, fill_value=-1) else: codes = self.categories.get_indexer(target) indexer, _ = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer)
def get_indexer(self, target, method=None, limit=None, tolerance=None): from pandas.core.arrays.categorical import _recode_for_categories method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) if self.is_unique and self.equals(target): return np.arange(len(self), dtype='intp') if method == 'pad' or method == 'backfill': raise NotImplementedError("method='pad' and method='backfill' not " "implemented yet for CategoricalIndex") elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for CategoricalIndex') if (isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target)): if self.values.equals(target.values): # we have the same codes codes = target.codes else: codes = _recode_for_categories(target.codes, target.categories, self.values.categories) else: if isinstance(target, CategoricalIndex): code_indexer = self.categories.get_indexer(target.categories) codes = take_1d(code_indexer, target.codes, fill_value=-1) else: codes = self.categories.get_indexer(target) indexer, _ = self._engine.get_indexer_non_unique(codes) return _ensure_platform_int(indexer)
def test_recode_to_categories(self, codes, old, new, expected): codes = np.asanyarray(codes, dtype=np.int8) expected = np.asanyarray(expected, dtype=np.int8) old = Index(old) new = Index(new) result = _recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected)
def test_recode_to_categories(self, codes, old, new, expected): codes = np.asanyarray(codes, dtype=np.int8) expected = np.asanyarray(expected, dtype=np.int8) old = Index(old) new = Index(new) result = _recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected)
def test_recode_to_categories_large(self): N = 1000 codes = np.arange(N) old = Index(codes) expected = np.arange(N - 1, -1, -1, dtype=np.int16) new = Index(expected) result = _recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected)
def test_recode_to_categories_large(self): N = 1000 codes = np.arange(N) old = Index(codes) expected = np.arange(N - 1, -1, -1, dtype=np.int16) new = Index(expected) result = _recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected)
def union_categoricals(to_union, sort_categories=False, ignore_order=False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categorical, CategoricalIndex, or Series with dtype='category' sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order: boolean, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. .. versionadded:: 0.20.0 Returns ------- result : Categorical Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError Empty list of categoricals passed Notes ----- To learn more about categories, see `link <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__ Examples -------- >>> from pandas.api.types import union_categoricals If you want to combine categoricals that do not necessarily have the same categories, `union_categoricals` will combine a list-like of categoricals. The new categories will be the union of the categories being combined. >>> a = pd.Categorical(["b", "c"]) >>> b = pd.Categorical(["a", "b"]) >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] By default, the resulting categories will be ordered as they appear in the `categories` of the data. If you want the categories to be lexsorted, use `sort_categories=True` argument. >>> union_categoricals([a, b], sort_categories=True) [b, c, a, b] Categories (3, object): [a, b, c] `union_categoricals` also works with the case of combining two categoricals of the same categories and order information (e.g. what you could also `append` for). >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "a"], ordered=True) >>> union_categoricals([a, b]) [a, b, a, b, a] Categories (2, object): [a < b] Raises `TypeError` because the categories are ordered and not identical. >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "c"], ordered=True) >>> union_categoricals([a, b]) TypeError: to union ordered Categoricals, all categories must be the same New in version 0.20.0 Ordered categoricals with different categories or orderings can be combined by using the `ignore_ordered=True` argument. >>> a = pd.Categorical(["a", "b", "c"], ordered=True) >>> b = pd.Categorical(["c", "b", "a"], ordered=True) >>> union_categoricals([a, b], ignore_order=True) [a, b, c, c, b, a] Categories (3, object): [a, b, c] `union_categoricals` also works with a `CategoricalIndex`, or `Series` containing categorical data, but note that the resulting array will always be a plain `Categorical` >>> a = pd.Series(["b", "c"], dtype='category') >>> b = pd.Series(["a", "b"], dtype='category') >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] """ from pandas import Index, Categorical, CategoricalIndex, Series from pandas.core.arrays.categorical import _recode_for_categories if len(to_union) == 0: raise ValueError('No Categoricals to union') def _maybe_unwrap(x): if isinstance(x, (CategoricalIndex, Series)): return x.values elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all( is_dtype_equal(other.categories.dtype, first.categories.dtype) for other in to_union[1:]): raise TypeError("dtype of categories must be the same") ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered if all( first.categories.equals(other.categories) for other in to_union[1:]): new_codes = np.concatenate([c.codes for c in to_union]) else: codes = [first.codes] + [ _recode_for_categories(other.codes, other.categories, first.categories) for other in to_union[1:] ] new_codes = np.concatenate(codes) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " "ordered Categoricals") if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) from pandas.core.algorithms import take_1d new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) if sort_categories: categories = categories.sort_values() new_codes = [] for c in to_union: new_codes.append( _recode_for_categories(c.codes, c.categories, categories)) new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") raise TypeError(msg) else: raise TypeError('Categorical.ordered must be the same') if ignore_order: ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def union_categoricals(to_union, sort_categories=False, ignore_order=False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categorical, CategoricalIndex, or Series with dtype='category' sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order: boolean, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. .. versionadded:: 0.20.0 Returns ------- result : Categorical Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError Empty list of categoricals passed Notes ----- To learn more about categories, see `link <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__ Examples -------- >>> from pandas.api.types import union_categoricals If you want to combine categoricals that do not necessarily have the same categories, `union_categoricals` will combine a list-like of categoricals. The new categories will be the union of the categories being combined. >>> a = pd.Categorical(["b", "c"]) >>> b = pd.Categorical(["a", "b"]) >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] By default, the resulting categories will be ordered as they appear in the `categories` of the data. If you want the categories to be lexsorted, use `sort_categories=True` argument. >>> union_categoricals([a, b], sort_categories=True) [b, c, a, b] Categories (3, object): [a, b, c] `union_categoricals` also works with the case of combining two categoricals of the same categories and order information (e.g. what you could also `append` for). >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "a"], ordered=True) >>> union_categoricals([a, b]) [a, b, a, b, a] Categories (2, object): [a < b] Raises `TypeError` because the categories are ordered and not identical. >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "c"], ordered=True) >>> union_categoricals([a, b]) TypeError: to union ordered Categoricals, all categories must be the same New in version 0.20.0 Ordered categoricals with different categories or orderings can be combined by using the `ignore_ordered=True` argument. >>> a = pd.Categorical(["a", "b", "c"], ordered=True) >>> b = pd.Categorical(["c", "b", "a"], ordered=True) >>> union_categoricals([a, b], ignore_order=True) [a, b, c, c, b, a] Categories (3, object): [a, b, c] `union_categoricals` also works with a `CategoricalIndex`, or `Series` containing categorical data, but note that the resulting array will always be a plain `Categorical` >>> a = pd.Series(["b", "c"], dtype='category') >>> b = pd.Series(["a", "b"], dtype='category') >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] """ from pandas import Index, Categorical, CategoricalIndex, Series from pandas.core.arrays.categorical import _recode_for_categories if len(to_union) == 0: raise ValueError('No Categoricals to union') def _maybe_unwrap(x): if isinstance(x, (CategoricalIndex, Series)): return x.values elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype) for other in to_union[1:]): raise TypeError("dtype of categories must be the same") ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered if all(first.categories.equals(other.categories) for other in to_union[1:]): new_codes = np.concatenate([c.codes for c in to_union]) else: codes = [first.codes] + [_recode_for_categories(other.codes, other.categories, first.categories) for other in to_union[1:]] new_codes = np.concatenate(codes) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " "ordered Categoricals") if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) from pandas.core.algorithms import take_1d new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) if sort_categories: categories = categories.sort_values() new_codes = [] for c in to_union: new_codes.append(_recode_for_categories(c.codes, c.categories, categories)) new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") raise TypeError(msg) else: raise TypeError('Categorical.ordered must be the same') if ignore_order: ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def recode_for_groupby(c, sort, observed): """ Code the categories to ensure we can groupby for categoricals. If observed=True, we return a new Categorical with the observed categories only. If sort=False, return a copy of self, coded with categories as returned by .unique(), followed by any categories not appearing in the data. If sort=True, return self. This method is needed solely to ensure the categorical index of the GroupBy result has categories in the order of appearance in the data (GH-8868). Parameters ---------- c : Categorical sort : boolean The value of the sort parameter groupby was called with. observed : boolean Account only for the observed values Returns ------- New Categorical If sort=False, the new categories are set to the order of appearance in codes (unless ordered=True, in which case the original order is preserved), followed by any unrepresented categories in the original order. Categorical or None If we are observed, return the original categorical, otherwise None """ # we only care about observed values if observed: unique_codes = unique1d(c.codes) take_codes = unique_codes[unique_codes != -1] if c.ordered: take_codes = np.sort(take_codes) # we recode according to the uniques categories = c.categories.take(take_codes) codes = _recode_for_categories(c.codes, c.categories, categories) # return a new categorical that maps our new codes # and categories dtype = CategoricalDtype(categories, ordered=c.ordered) return Categorical(codes, dtype=dtype, fastpath=True), c # Already sorted according to c.categories; all is fine if sort: return c, None # sort=False should order groups in as-encountered order (GH-8868) cat = c.unique() # But for groupby to work, all categories should be present, # including those missing from the data (GH-13179), which .unique() # above dropped cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)]) return c.reorder_categories(cat.categories), None
def recode_for_groupby(c, sort, observed): """ Code the categories to ensure we can groupby for categoricals. If observed=True, we return a new Categorical with the observed categories only. If sort=False, return a copy of self, coded with categories as returned by .unique(), followed by any categories not appearing in the data. If sort=True, return self. This method is needed solely to ensure the categorical index of the GroupBy result has categories in the order of appearance in the data (GH-8868). Parameters ---------- c : Categorical sort : boolean The value of the sort parameter groupby was called with. observed : boolean Account only for the observed values Returns ------- New Categorical If sort=False, the new categories are set to the order of appearance in codes (unless ordered=True, in which case the original order is preserved), followed by any unrepresented categories in the original order. Categorical or None If we are observed, return the original categorical, otherwise None """ # we only care about observed values if observed: unique_codes = unique1d(c.codes) take_codes = unique_codes[unique_codes != -1] if c.ordered: take_codes = np.sort(take_codes) # we recode according to the uniques categories = c.categories.take(take_codes) codes = _recode_for_categories(c.codes, c.categories, categories) # return a new categorical that maps our new codes # and categories dtype = CategoricalDtype(categories, ordered=c.ordered) return Categorical(codes, dtype=dtype, fastpath=True), c # Already sorted according to c.categories; all is fine if sort: return c, None # sort=False should order groups in as-encountered order (GH-8868) cat = c.unique() # But for groupby to work, all categories should be present, # including those missing from the data (GH-13179), which .unique() # above dropped cat = cat.add_categories( c.categories[~c.categories.isin(cat.categories)]) return c.reorder_categories(cat.categories), None