def test_merge(self):
    """Merging two blocks interleaves their rows by manager location."""
    left_vals = randn(2, 10)
    right_vals = randn(2, 10)

    columns = Index(['e', 'a', 'b', 'd', 'f'])
    left_locs = columns.get_indexer(['e', 'b'])
    right_locs = columns.get_indexer(['a', 'd'])

    left = make_block(left_vals, left_locs)
    right = make_block(right_vals, right_locs)
    result = left.merge(right)

    # The merged block is checked at locations 0..3, with the left rows
    # at positions 0 and 2 and the right rows at positions 1 and 3.
    assert_almost_equal(result.mgr_locs, [0, 1, 2, 3])
    assert_almost_equal(result.values[[0, 2]], left_vals)
    assert_almost_equal(result.values[[1, 3]], right_vals)
def test_get_indexer():
    """Exercise MultiIndex.get_indexer for exact, pad and backfill lookups."""
    levels = [Index(lrange(4)), Index(lrange(2))]
    labels = [
        np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp),
        np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp),
    ]
    index = MultiIndex(levels=levels, labels=labels)

    idx1 = index[:5]
    idx2 = index[[1, 3, 5]]

    # exact matching: the last entry of idx2 is not present in idx1
    exact = idx1.get_indexer(idx2)
    assert_almost_equal(exact, np.array([1, 3, -1], dtype=np.intp))

    # each fill method is checked forwards, reversed, and via its alias
    fill_cases = [
        ('pad', 'ffill', [-1, 0, 0, 1, 1]),
        ('backfill', 'bfill', [0, 0, 1, 1, 2]),
    ]
    for method, alias, raw_expected in fill_cases:
        forward = idx2.get_indexer(idx1, method=method)
        expected = np.array(raw_expected, dtype=np.intp)
        assert_almost_equal(forward, expected)

        backward = idx2.get_indexer(idx1[::-1], method=method)
        assert_almost_equal(backward, expected[::-1])

        via_alias = idx2.get_indexer(idx1, method=alias)
        assert_almost_equal(forward, via_alias)

    # pass non-MultiIndex
    from_values = idx1.get_indexer(idx2.values)
    from_index = idx1.get_indexer(idx2)
    assert_almost_equal(from_values, from_index)

    missing = idx1.get_indexer([1, 2, 3])
    assert (missing == [-1, -1, -1]).all()

    # create index with duplicates
    duplicated = Index(lrange(10) + lrange(10))
    target = Index(lrange(20))

    msg = "Reindexing only valid with uniquely valued Index objects"
    with tm.assert_raises_regex(InvalidIndexError, msg):
        duplicated.get_indexer(target)
def test_merge(self):
    """Merging two blocks interleaves their rows by manager location."""
    left_vals = randn(2, 10)
    right_vals = randn(2, 10)

    columns = Index(["e", "a", "b", "d", "f"])
    left = make_block(left_vals, columns.get_indexer(["e", "b"]))
    right = make_block(right_vals, columns.get_indexer(["a", "d"]))

    result = left.merge(right)

    # Left rows are expected at positions 0 and 2, right rows at 1 and 3.
    assert_almost_equal(result.mgr_locs, [0, 1, 2, 3])
    for positions, source in (([0, 2], left_vals), ([1, 3], right_vals)):
        assert_almost_equal(result.values[positions], source)
def union_categoricals(to_union):
    """
    Concatenate unordered Categoricals, unioning their categories.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
        Unordered result whose categories are the unique categories of
        the inputs, in order of first appearance

    Raises
    ------
    TypeError
        If any of the categoricals are ordered or all do not
        have the same dtype
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    head = to_union[0]
    tail = to_union[1:]

    if any(cat.ordered for cat in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    same_dtype = all(
        is_dtype_equal(cat.categories.dtype, head.categories.dtype)
        for cat in to_union
    )
    if not same_dtype:
        raise TypeError("dtype of categories must be the same")

    combined = head.categories.append([cat.categories for cat in tail])
    categories = Index(combined.unique())

    def _recode(cat):
        # Translate codes into positions within the unioned categories.
        if len(cat.categories) == 0:
            # must be all NaN
            return cat.codes
        positions = categories.get_indexer(cat.categories)
        return take_1d(positions, cat.codes, fill_value=-1)

    new_codes = np.concatenate([_recode(cat) for cat in to_union])
    return Categorical(new_codes, categories=categories, ordered=False,
                       fastpath=True)
def test_merge(self):
    """Merging two blocks interleaves their rows by manager location."""
    left_vals = randn(2, 10)
    right_vals = randn(2, 10)

    columns = Index(['e', 'a', 'b', 'd', 'f'])
    left = make_block(left_vals, columns.get_indexer(['e', 'b']))
    right = make_block(right_vals, columns.get_indexer(['a', 'd']))
    result = left.merge(right)

    expected_locs = np.array([0, 1, 2, 3], dtype=np.int64)
    tm.assert_numpy_array_equal(result.mgr_locs.as_array, expected_locs)

    # Left rows are expected at positions 0 and 2, right rows at 1 and 3.
    tm.assert_numpy_array_equal(result.values[[0, 2]], np.array(left_vals))
    tm.assert_numpy_array_equal(result.values[[1, 3]], np.array(right_vals))
def test_get_indexer_strings_raises(self):
    """get_indexer on a string Index raises TypeError for distance-based lookups."""
    index = Index(["b", "c"])
    target = ["a", "b", "c", "d"]

    msg = r"unsupported operand type\(s\) for -: 'str' and 'str'"

    # "nearest", and "pad" with either a scalar or list-like tolerance,
    # are all expected to fail with the same subtraction error.
    failing_kwargs = (
        {"method": "nearest"},
        {"method": "pad", "tolerance": 2},
        {"method": "pad", "tolerance": [2, 2, 2, 2]},
    )
    for kwargs in failing_kwargs:
        with pytest.raises(TypeError, match=msg):
            index.get_indexer(target, **kwargs)
def _gen_query_anndata(
    data: Union[MultimodalData, UnimodalData],
    ref_features: pd.Index,
    obs_columns: Optional[List[str]] = None,
    matkey: str = "counts",
) -> anndata.AnnData:
    """
    Build a query AnnData whose columns follow a reference feature index.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.
    ref_features: ``pd.Index``
        A pandas index of reference feature names
    obs_columns: ``List[str]``
        A list of obs keys that should be included in the new anndata.
        When ``None`` or empty, the whole ``data.obs`` is used.
    matkey: ``str``, optional, default: ``"counts"``
        Matrix key for the raw count

    Returns
    -------
    An AnnData object.
    """
    mat = data.get_matrix(matkey)
    obs_field = data.obs[obs_columns] if obs_columns else data.obs
    var_field = pd.DataFrame(index=ref_features)

    # Position of each feature of `data` in the reference index; -1 if absent.
    indexer = ref_features.get_indexer(data.var_names)
    # Count the stored entries whose column maps to a reference feature.
    n_kept = (indexer[mat.indices] >= 0).sum()
    csr_parts = _select_csr(mat.data, mat.indices, mat.indptr, indexer, n_kept)

    n_obs = mat.shape[0]
    X = csr_matrix(tuple(csr_parts), shape=(n_obs, ref_features.size))
    X.sort_indices()

    return anndata.AnnData(X=X, obs=obs_field, var=var_field)
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """
    Concatenate Categorical-like objects, unioning their categories.

    The categories of every input must share one dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order: boolean, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

        .. versionadded:: 0.20.0

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical, CategoricalIndex, Series

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        # Reduce Series / CategoricalIndex to the underlying Categorical.
        if isinstance(x, Categorical):
            return x
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first, rest = to_union[0], to_union[1:]

    for other in rest:
        if not is_dtype_equal(other.categories.dtype,
                              first.categories.dtype):
            raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in rest):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in to_union])

        if sort_categories:
            if ordered and not ignore_order:
                raise TypeError("Cannot use sort_categories=True with "
                                "ordered Categoricals")
            if not categories.is_monotonic_increasing:
                categories = categories.sort_values()
                indexer = categories.get_indexer(first.categories)
                new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or not any(c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in rest])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        def _recode(c):
            if len(c.categories) == 0:
                # must be all NaN
                return c.codes
            positions = categories.get_indexer(c.categories)
            return take_1d(positions, c.codes, fill_value=-1)

        new_codes = np.concatenate([_recode(c) for c in to_union])
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        raise TypeError('Categorical.ordered must be the same')

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered,
                       fastpath=True)
def test_get_indexer_nearest(self, method, tolerance, indexer, expected):
    """Check get_indexer on a 0..9 integer Index against the expected positions."""
    index = Index(np.arange(10))
    expected_arr = np.array(expected, dtype=np.intp)

    result = index.get_indexer(indexer, method=method, tolerance=tolerance)

    tm.assert_numpy_array_equal(result, expected_arr)
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """
    Combine list-like of Categorical-like, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order: boolean, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

        .. versionadded:: 0.20.0

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
        - an input is not a Categorical, CategoricalIndex or Series
    ValueError
        Empty list of categoricals passed

    Notes
    -----

    To learn more about categories, see `link
    <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__

    Examples
    --------

    >>> from pandas.api.types import union_categoricals

    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> union_categoricals([a, b], sort_categories=True)
    [b, c, a, b]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    [a, b, a, b, a]
    Categories (2, object): [a < b]

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    [a, b, c, c, b, a]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]
    """
    from pandas import Index, Categorical, CategoricalIndex, Series
    # Imported once here rather than inside each branch / loop iteration.
    from pandas.core.algorithms import take_1d

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        # Series and CategoricalIndex carry their Categorical in `.values`.
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
               for other in to_union[1:]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in to_union])

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with "
                            "ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # Remap the concatenated codes onto the sorted categories.
            indexer = categories.get_indexer(first.categories)
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = []
        for c in to_union:
            if len(c.categories) > 0:
                indexer = categories.get_indexer(c.categories)
                new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
            else:
                # must be all NaN
                new_codes.append(c.codes)
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        else:
            raise TypeError('Categorical.ordered must be the same')

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered,
                       fastpath=True)
def union_categoricals(to_union):
    """
    Concatenate Categoricals, unioning their categories.

    The categories of every input must share one dtype. Inputs with
    identical categorical dtype are concatenated as-is and keep the
    `ordered` flag of the first input; otherwise every input must be
    unordered and the result is unordered.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
        A single array; when categories are unioned they are ordered as
        they appear in the list

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]
    others = to_union[1:]

    same_dtype = all(
        is_dtype_equal(c.categories.dtype, first.categories.dtype)
        for c in to_union
    )
    if not same_dtype:
        raise TypeError("dtype of categories must be the same")

    if all(first.is_dtype_equal(other) for other in others):
        # identical categories: codes can be concatenated directly
        stacked = np.concatenate([c.codes for c in to_union])
        return Categorical(stacked, categories=first.categories,
                           ordered=first.ordered, fastpath=True)

    if any(c.ordered for c in to_union):
        # to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        raise TypeError('Categorical.ordered must be the same')

    # not ordered: union the categories and recode every input
    combined = first.categories.append([c.categories for c in others])
    categories = Index(combined.unique())

    def _recode(c):
        if len(c.categories) == 0:
            # must be all NaN
            return c.codes
        positions = categories.get_indexer(c.categories)
        return take_1d(positions, c.codes, fill_value=-1)

    new_codes = np.concatenate([_recode(c) for c in to_union])
    return Categorical(new_codes, categories=categories, ordered=False,
                       fastpath=True)
def union_categoricals(to_union):
    """
    Concatenate Categoricals, unioning their categories.

    The categories of every input must share one dtype. Inputs with
    identical categorical dtype are concatenated as-is and keep the
    `ordered` flag of the first input; otherwise every input must be
    unordered and the result is unordered.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
        A single array; when categories are unioned they are ordered as
        they appear in the list

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]
    remaining = to_union[1:]

    for c in to_union:
        if not is_dtype_equal(c.categories.dtype, first.categories.dtype):
            raise TypeError("dtype of categories must be the same")

    identical = all(first.is_dtype_equal(other) for other in remaining)
    if identical:
        codes = np.concatenate([c.codes for c in to_union])
        return Categorical(codes, categories=first.categories,
                           ordered=first.ordered, fastpath=True)

    ordered_flags = [c.ordered for c in to_union]
    if any(ordered_flags):
        # to show a proper error message
        if all(ordered_flags):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        raise TypeError('Categorical.ordered must be the same')

    # not ordered: build the unioned categories, then recode each input
    unique_cats = first.categories.append(
        [c.categories for c in remaining]).unique()
    categories = Index(unique_cats)

    pieces = []
    for c in to_union:
        if len(c.categories) == 0:
            # must be all NaN
            pieces.append(c.codes)
            continue
        positions = categories.get_indexer(c.categories)
        pieces.append(take_1d(positions, c.codes, fill_value=-1))

    new_codes = np.concatenate(pieces)
    return Categorical(new_codes, categories=categories, ordered=False,
                       fastpath=True)
def test_get_indexer_strings(self, method, expected):
    """Check get_indexer with a fill method on a two-element string Index."""
    labels = ["a", "b", "c", "d"]
    index = Index(["b", "c"])

    result = index.get_indexer(labels, method=method)

    tm.assert_numpy_array_equal(result, expected)
def gen_reindexer(new_var: pd.Index, cur_var: pd.Index, *, fill_value=0):
    """
    Given a new set of var_names, and a current set, generates a function which will reindex
    a matrix to be aligned with the new set.

    Parameters
    ----------
    new_var
        Target column labels; the reindexed matrix has ``len(new_var)`` columns.
    cur_var
        Column labels of the matrices that will be passed to the returned function.
    fill_value
        Default value written into columns of ``new_var`` that are absent
        from ``cur_var``. Can be overridden per call of the returned function.

    Returns
    -------
    A function ``reindexer(X, fill_value=fill_value)`` returning ``X`` with its
    columns rearranged to follow ``new_var``.

    Usage
    -----

    >>> a = AnnData(sparse.eye(3), var=pd.DataFrame(index=list("abc")))
    >>> b = AnnData(sparse.eye(2), var=pd.DataFrame(index=list("ba")))
    >>> reindexer = gen_reindexer(a.var_names, b.var_names)
    >>> sparse.vstack([a.X, reindexer(b.X)]).toarray()
    array([[1., 0., 0.],
           [0., 1., 0.],
           [0., 0., 1.],
           [0., 1., 0.],
           [1., 0., 0.]], dtype=float32)
    >>> reindexer_nan = gen_reindexer(a.var_names, b.var_names, fill_value=np.nan)
    >>> sparse.vstack([a.X, reindexer_nan(b.X)]).toarray()
    array([[ 1.,  0.,  0.],
           [ 0.,  1.,  0.],
           [ 0.,  0.,  1.],
           [ 0.,  1., nan],
           [ 1.,  0., nan]], dtype=float32)
    """
    new_size = len(new_var)
    old_size = len(cur_var)
    # Position of every current column in the new layout (-1 when dropped).
    new_pts = new_var.get_indexer(cur_var)
    cur_pts = np.arange(len(new_pts))

    mask = new_pts != -1

    new_pts = new_pts[mask]
    cur_pts = cur_pts[mask]

    def reindexer(X, fill_value=fill_value):
        # np.can_cast raises TypeError for bare Python scalars on NumPy >= 2,
        # so test the smallest dtype able to hold the value instead. This
        # matches the value-based answer older NumPy gave for scalars.
        if not np.can_cast(np.min_scalar_type(fill_value), X.dtype):
            out_dtype = np.promote_types(np.array(fill_value).dtype, X.dtype)
        else:
            out_dtype = X.dtype

        # Selection matrix: entry (i, j) is 1 when current column i
        # becomes new column j.
        idxmtx = sparse.coo_matrix(
            (np.ones(len(new_pts), dtype=int), (cur_pts, new_pts)),
            shape=(old_size, new_size),
            dtype=out_dtype,
        )
        out = X @ idxmtx

        # Columns absent from cur_var come out as zeros; overwrite them only
        # when a different fill value was requested.
        if fill_value != 0:
            to_fill = new_var.get_indexer(new_var.difference(cur_var))
            if len(to_fill) > 0:
                # More efficient to set columns on csc
                if sparse.issparse(out):
                    out = sparse.csc_matrix(out)
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
                    out[:, to_fill] = fill_value

        return out

    return reindexer
def union_categoricals(to_union, sort_categories=False):
    """
    Concatenate Categorical-like objects, unioning their categories.

    The categories of every input must share one dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical, CategoricalIndex, Series

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        # Reduce Series / CategoricalIndex to the underlying Categorical.
        if isinstance(x, Categorical):
            return x
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first, rest = to_union[0], to_union[1:]

    for other in rest:
        if not is_dtype_equal(other.categories.dtype,
                              first.categories.dtype):
            raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in rest):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in to_union])

        if sort_categories:
            if ordered:
                raise TypeError("Cannot use sort_categories=True with "
                                "ordered Categoricals")
            if not categories.is_monotonic_increasing:
                categories = categories.sort_values()
                indexer = categories.get_indexer(first.categories)
                new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif not any(c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in rest])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        def _recode(c):
            if len(c.categories) == 0:
                # must be all NaN
                return c.codes
            positions = categories.get_indexer(c.categories)
            return take_1d(positions, c.codes, fill_value=-1)

        new_codes = np.concatenate([_recode(c) for c in to_union])
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        raise TypeError('Categorical.ordered must be the same')

    return Categorical(new_codes, categories=categories, ordered=ordered,
                       fastpath=True)
def _process_pd_index(base_idx: pd.Index, index_1d: pd.Index) -> List[int]: if index_1d.has_duplicates: index_1d = index_1d.drop_duplicates() indexer = base_idx.get_indexer(index_1d) indexer = indexer[indexer >= 0] return indexer
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """
    Combine list-like of Categorical-like, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order: boolean, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

        .. versionadded:: 0.20.0

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
        - an input is not a Categorical, CategoricalIndex or Series
    ValueError
        Empty list of categoricals passed

    Notes
    -----

    To learn more about categories, see `link
    <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__

    Examples
    --------

    >>> from pandas.api.types import union_categoricals

    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> union_categoricals([a, b], sort_categories=True)
    [b, c, a, b]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    [a, b, a, b, a]
    Categories (2, object): [a < b]

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    [a, b, c, c, b, a]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]
    """
    from pandas import Index, Categorical, CategoricalIndex, Series
    # Imported once here rather than inside each branch / loop iteration.
    from pandas.core.algorithms import take_1d

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        # Series and CategoricalIndex carry their Categorical in `.values`.
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
               for other in to_union[1:]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in to_union])

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with "
                            "ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # Remap the concatenated codes onto the sorted categories.
            indexer = categories.get_indexer(first.categories)
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = []
        for c in to_union:
            if len(c.categories) > 0:
                indexer = categories.get_indexer(c.categories)
                new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
            else:
                # must be all NaN
                new_codes.append(c.codes)
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        else:
            raise TypeError('Categorical.ordered must be the same')

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered,
                       fastpath=True)