def test_union_categoricals_nan(self): # GH 13759 res = union_categoricals([pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])]) exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical(['A', 'B']), pd.Categorical(['B', 'B', np.nan])]) exp = Categorical(['A', 'B', 'B', 'B', np.nan]) tm.assert_categorical_equal(res, exp) val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.NaT] val2 = [pd.NaT, pd.Timestamp('2011-01-01'), pd.Timestamp('2011-02-01')] res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) exp = Categorical(val1 + val2, categories=[pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.Timestamp('2011-02-01')]) tm.assert_categorical_equal(res, exp) # all NaN res = union_categoricals([pd.Categorical([np.nan, np.nan]), pd.Categorical(['X'])]) exp = Categorical([np.nan, np.nan, 'X']) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])]) exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp)
def test_union_categoricals_nan(self): # GH 13759 res = union_categoricals([pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])]) exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical(['A', 'B']), pd.Categorical(['B', 'B', np.nan])]) exp = Categorical(['A', 'B', 'B', 'B', np.nan]) tm.assert_categorical_equal(res, exp) val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.NaT] val2 = [pd.NaT, pd.Timestamp('2011-01-01'), pd.Timestamp('2011-02-01')] res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) exp = Categorical(val1 + val2, categories=[pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.Timestamp('2011-02-01')]) tm.assert_categorical_equal(res, exp) # all NaN res = union_categoricals([pd.Categorical([np.nan, np.nan]), pd.Categorical(['X'])]) exp = Categorical([np.nan, np.nan, 'X']) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])]) exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp)
def test_union_categorical_match_types(self): # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) msg = "dtype of categories must be the same" with pytest.raises(TypeError, match=msg): union_categoricals([s, s2])
def test_union_categoricals_empty(self): # GH 13759 res = union_categoricals([pd.Categorical([]), pd.Categorical([])]) exp = Categorical([]) tm.assert_categorical_equal(res, exp) res = union_categoricals([Categorical([]), Categorical(["1"])]) exp = Categorical(["1"]) tm.assert_categorical_equal(res, exp)
def test_union_categoricals_empty(self): # GH 13759 res = union_categoricals([pd.Categorical([]), pd.Categorical([])]) exp = Categorical([]) tm.assert_categorical_equal(res, exp) res = union_categoricals([Categorical([]), Categorical(['1'])]) exp = Categorical(['1']) tm.assert_categorical_equal(res, exp)
def test_union_categorical_same_category(self): # check fastpath c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) res = union_categoricals([c1, c2]) exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4]) tm.assert_categorical_equal(res, exp) c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"]) c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"]) res = union_categoricals([c1, c2]) exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"]) tm.assert_categorical_equal(res, exp)
def test_union_categoricals_sort(self): # GH 13846 c1 = Categorical(["x", "y", "z"]) c2 = Categorical(["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"]) tm.assert_categorical_equal(result, expected) # fastpath c1 = Categorical(["a", "b"], categories=["b", "a", "c"]) c2 = Categorical(["b", "c"], categories=["b", "a", "c"]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) c1 = Categorical(["a", "b"], categories=["c", "a", "b"]) c2 = Categorical(["b", "c"], categories=["c", "a", "b"]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) # fastpath - skip resort c1 = Categorical(["a", "b"], categories=["a", "b", "c"]) c2 = Categorical(["b", "c"], categories=["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) c1 = Categorical(["x", np.nan]) c2 = Categorical([np.nan, "b"]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"]) tm.assert_categorical_equal(result, expected) c1 = Categorical([np.nan]) c2 = Categorical([np.nan]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical([np.nan, np.nan]) tm.assert_categorical_equal(result, expected) c1 = Categorical([]) c2 = Categorical([]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical([]) tm.assert_categorical_equal(result, expected) c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True) c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True) msg = "Cannot use sort_categories=True with ordered Categoricals" with pytest.raises(TypeError, match=msg): union_categoricals([c1, c2], sort_categories=True)
def test_union_categoricals_empty(self): # GH 13759 res = union_categoricals([pd.Categorical([]), pd.Categorical([])]) exp = Categorical([]) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical([]), pd.Categorical([1.0])]) exp = Categorical([1.0]) tm.assert_categorical_equal(res, exp) # to make dtype equal nanc = pd.Categorical(np.array([np.nan], dtype=np.float64)) res = union_categoricals([nanc, pd.Categorical([])]) tm.assert_categorical_equal(res, nanc)
def test_union_categoricals_sort(self): # GH 13846 c1 = Categorical(['x', 'y', 'z']) c2 = Categorical(['a', 'b', 'c']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], categories=['a', 'b', 'c', 'x', 'y', 'z']) tm.assert_categorical_equal(result, expected) # fastpath c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b']) c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) # fastpath - skip resort c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) c1 = Categorical(['x', np.nan]) c2 = Categorical([np.nan, 'b']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['x', np.nan, np.nan, 'b'], categories=['b', 'x']) tm.assert_categorical_equal(result, expected) c1 = Categorical([np.nan]) c2 = Categorical([np.nan]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical([np.nan, np.nan], categories=[]) tm.assert_categorical_equal(result, expected) c1 = Categorical([]) c2 = Categorical([]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical([]) tm.assert_categorical_equal(result, expected) c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) with tm.assertRaises(TypeError): union_categoricals([c1, c2], sort_categories=True)
def test_union_categoricals_sort(self): # GH 13846 c1 = Categorical(['x', 'y', 'z']) c2 = Categorical(['a', 'b', 'c']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], categories=['a', 'b', 'c', 'x', 'y', 'z']) tm.assert_categorical_equal(result, expected) # fastpath c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b']) c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) # fastpath - skip resort c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) c1 = Categorical(['x', np.nan]) c2 = Categorical([np.nan, 'b']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['x', np.nan, np.nan, 'b'], categories=['b', 'x']) tm.assert_categorical_equal(result, expected) c1 = Categorical([np.nan]) c2 = Categorical([np.nan]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical([np.nan, np.nan], categories=[]) tm.assert_categorical_equal(result, expected) c1 = Categorical([]) c2 = Categorical([]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical([]) tm.assert_categorical_equal(result, expected) c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) with tm.assertRaises(TypeError): union_categoricals([c1, c2], sort_categories=True)
def test_union_categorical_same_category(self): # check fastpath c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) res = union_categoricals([c1, c2]) exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4]) tm.assert_categorical_equal(res, exp) c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z']) c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z']) res = union_categoricals([c1, c2]) exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'], categories=['x', 'y', 'z']) tm.assert_categorical_equal(res, exp)
def test_union_categorical_same_category(self): # check fastpath c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) res = union_categoricals([c1, c2]) exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4]) tm.assert_categorical_equal(res, exp) c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z']) c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z']) res = union_categoricals([c1, c2]) exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'], categories=['x', 'y', 'z']) tm.assert_categorical_equal(res, exp)
def test_union_categoricals_sort_false_one_nan(self): c1 = Categorical(["x", np.nan]) c2 = Categorical([np.nan, "b"]) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"]) tm.assert_categorical_equal(result, expected)
def find_common_type_cat(types): """ Find a common data type among the given dtypes. Parameters ---------- types : array-like Array of dtypes. Returns ------- pandas.core.dtypes.dtypes.ExtensionDtype or np.dtype or None `dtype` that is common for all passed `types`. """ if all(isinstance(t, pandas.CategoricalDtype) for t in types): if all(t.ordered for t in types): return pandas.CategoricalDtype( np.sort(np.unique([c for t in types for c in t.categories])[0]), ordered=True, ) return union_categoricals( [pandas.Categorical([], dtype=t) for t in types], sort_categories=all(t.ordered for t in types), ).dtype else: return find_common_type(list(types))
def test_union_categorical_same_category_str(self): c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"]) c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"]) res = union_categoricals([c1, c2]) exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"]) tm.assert_categorical_equal(res, exp)
def test_union_categoricals_sort_false(self): # GH 13846 c1 = Categorical(["x", "y", "z"]) c2 = Categorical(["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical(["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]) tm.assert_categorical_equal(result, expected)
def test_union_categoricals_sort_false_skipresort(self): # fastpath - skip resort c1 = Categorical(["a", "b"], categories=["a", "b", "c"]) c2 = Categorical(["b", "c"], categories=["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected)
def test_union_categorical_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19096 c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c']) result = union_categoricals([c1, c2]) expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected)
def test_union_categoricals_sort_false_ordered_true(self): c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True) c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical(["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True) tm.assert_categorical_equal(result, expected)
def test_union_categorical_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19096 c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c']) result = union_categoricals([c1, c2]) expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected)
def test_union_categorical_ordered_appearance(self): # new categories ordered by appearance s = Categorical(["x", "y", "z"]) s2 = Categorical(["a", "b", "c"]) result = union_categoricals([s, s2]) expected = Categorical(["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]) tm.assert_categorical_equal(result, expected)
def test_union_categoricals_empty(self): # GH 13759 res = union_categoricals([pd.Categorical([]), pd.Categorical([])]) exp = Categorical([]) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical([]), pd.Categorical([1.0])]) exp = Categorical([1.0]) tm.assert_categorical_equal(res, exp) # to make dtype equal nanc = pd.Categorical(np.array([np.nan], dtype=np.float64)) res = union_categoricals([nanc, pd.Categorical([])]) tm.assert_categorical_equal(res, nanc)
def test_union_categorical(self): # GH 13361 data = [ (list("abc"), list("abd"), list("abcabd")), ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), ( ["b", "b", np.nan, "a"], ["a", np.nan, "c"], ["b", "b", np.nan, "a", "a", np.nan, "c"], ), ( pd.date_range("2014-01-01", "2014-01-05"), pd.date_range("2014-01-06", "2014-01-07"), pd.date_range("2014-01-01", "2014-01-07"), ), ( pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"), pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"), pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"), ), ( pd.period_range("2014-01-01", "2014-01-05"), pd.period_range("2014-01-06", "2014-01-07"), pd.period_range("2014-01-01", "2014-01-07"), ), ] for a, b, combined in data: for box in [Categorical, CategoricalIndex, Series]: result = union_categoricals( [box(Categorical(a)), box(Categorical(b))]) expected = Categorical(combined) tm.assert_categorical_equal(result, expected) # new categories ordered by appearance s = Categorical(["x", "y", "z"]) s2 = Categorical(["a", "b", "c"]) result = union_categoricals([s, s2]) expected = Categorical(["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]) tm.assert_categorical_equal(result, expected) s = Categorical([0, 1.2, 2], ordered=True) s2 = Categorical([0, 1.2, 2], ordered=True) result = union_categoricals([s, s2]) expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) tm.assert_categorical_equal(result, expected) # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) msg = "dtype of categories must be the same" with pytest.raises(TypeError, match=msg): union_categoricals([s, s2]) msg = "No Categoricals to union" with pytest.raises(ValueError, match=msg): union_categoricals([])
def test_union_categorical_unwrap(self): # GH 14173 c1 = Categorical(["a", "b"]) c2 = pd.Series(["b", "c"], dtype="category") result = union_categoricals([c1, c2]) expected = Categorical(["a", "b", "b", "c"]) tm.assert_categorical_equal(result, expected) c2 = CategoricalIndex(c2) result = union_categoricals([c1, c2]) tm.assert_categorical_equal(result, expected) c1 = Series(c1) result = union_categoricals([c1, c2]) tm.assert_categorical_equal(result, expected) with pytest.raises(TypeError): union_categoricals([c1, ["a", "b", "c"]])
def test_union_categorical_unwrap(self): # GH 14173 c1 = Categorical(['a', 'b']) c2 = pd.Series(['b', 'c'], dtype='category') result = union_categoricals([c1, c2]) expected = Categorical(['a', 'b', 'b', 'c']) tm.assert_categorical_equal(result, expected) c2 = CategoricalIndex(c2) result = union_categoricals([c1, c2]) tm.assert_categorical_equal(result, expected) c1 = Series(c1) result = union_categoricals([c1, c2]) tm.assert_categorical_equal(result, expected) with tm.assertRaises(TypeError): union_categoricals([c1, ['a', 'b', 'c']])
def test_union_categorical_unwrap(self): # GH 14173 c1 = Categorical(['a', 'b']) c2 = pd.Series(['b', 'c'], dtype='category') result = union_categoricals([c1, c2]) expected = Categorical(['a', 'b', 'b', 'c']) tm.assert_categorical_equal(result, expected) c2 = CategoricalIndex(c2) result = union_categoricals([c1, c2]) tm.assert_categorical_equal(result, expected) c1 = Series(c1) result = union_categoricals([c1, c2]) tm.assert_categorical_equal(result, expected) with tm.assertRaises(TypeError): union_categoricals([c1, ['a', 'b', 'c']])
def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: """ Concatenate chunks of data read with low_memory=True. The tricky part is handling Categoricals, where different chunks may have different inferred categories. """ names = list(chunks[0].keys()) warning_columns = [] result = {} for name in names: arrs = [chunk.pop(name) for chunk in chunks] # Check each arr for consistent types. dtypes = {a.dtype for a in arrs} # TODO: shouldn't we exclude all EA dtypes here? numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} if len(numpy_dtypes) > 1: # error: Argument 1 to "find_common_type" has incompatible type # "Set[Any]"; expected "Sequence[Union[dtype[Any], None, type, # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]" common_type = np.find_common_type( numpy_dtypes, # type: ignore[arg-type] [], ) # error: Non-overlapping equality check (left operand type: "dtype[Any]", # right operand type: "Type[object]") if common_type == object: # type: ignore[comparison-overlap] warning_columns.append(str(name)) dtype = dtypes.pop() if is_categorical_dtype(dtype): result[name] = union_categoricals(arrs, sort_categories=False) else: if isinstance(dtype, ExtensionDtype): # TODO: concat_compat? array_type = dtype.construct_array_type() # error: Argument 1 to "_concat_same_type" of "ExtensionArray" # has incompatible type "List[Union[ExtensionArray, ndarray]]"; # expected "Sequence[ExtensionArray]" result[name] = array_type._concat_same_type( arrs # type: ignore[arg-type] ) else: result[name] = np.concatenate(arrs) if warning_columns: warning_names = ",".join(warning_columns) warning_message = " ".join( [ f"Columns ({warning_names}) have mixed types. " f"Specify dtype option on import or set low_memory=False." ] ) warnings.warn(warning_message, DtypeWarning, stacklevel=8) return result
def test_union_categorical_unwrap(self): # GH 14173 c1 = Categorical(["a", "b"]) c2 = Series(["b", "c"], dtype="category") result = union_categoricals([c1, c2]) expected = Categorical(["a", "b", "b", "c"]) tm.assert_categorical_equal(result, expected) c2 = CategoricalIndex(c2) result = union_categoricals([c1, c2]) tm.assert_categorical_equal(result, expected) c1 = Series(c1) result = union_categoricals([c1, c2]) tm.assert_categorical_equal(result, expected) msg = "all components to combine must be Categorical" with pytest.raises(TypeError, match=msg): union_categoricals([c1, ["a", "b", "c"]])
def test_union_categoricals_nan(self): # GH 13759 res = union_categoricals( [pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])] ) exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) tm.assert_categorical_equal(res, exp) res = union_categoricals( [pd.Categorical(["A", "B"]), pd.Categorical(["B", "B", np.nan])] ) exp = Categorical(["A", "B", "B", "B", np.nan]) tm.assert_categorical_equal(res, exp) val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT] val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")] res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) exp = Categorical( val1 + val2, categories=[ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.Timestamp("2011-02-01"), ], ) tm.assert_categorical_equal(res, exp) # all NaN res = union_categoricals( [ pd.Categorical(np.array([np.nan, np.nan], dtype=object)), pd.Categorical(["X"]), ] ) exp = Categorical([np.nan, np.nan, "X"]) tm.assert_categorical_equal(res, exp) res = union_categoricals( [pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])] ) exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp)
def test_union_categorical(self): # GH 13361 data = [ (list('abc'), list('abd'), list('abcabd')), ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), (pd.date_range('2014-01-01', '2014-01-05'), pd.date_range('2014-01-06', '2014-01-07'), pd.date_range('2014-01-01', '2014-01-07')), (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), (pd.period_range('2014-01-01', '2014-01-05'), pd.period_range('2014-01-06', '2014-01-07'), pd.period_range('2014-01-01', '2014-01-07')), ] for a, b, combined in data: for box in [Categorical, CategoricalIndex, Series]: result = union_categoricals([box(Categorical(a)), box(Categorical(b))]) expected = Categorical(combined) tm.assert_categorical_equal(result, expected, check_category_order=True) # new categories ordered by appearance s = Categorical(['x', 'y', 'z']) s2 = Categorical(['a', 'b', 'c']) result = union_categoricals([s, s2]) expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], categories=['x', 'y', 'z', 'a', 'b', 'c']) tm.assert_categorical_equal(result, expected) s = Categorical([0, 1.2, 2], ordered=True) s2 = Categorical([0, 1.2, 2], ordered=True) result = union_categoricals([s, s2]) expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) tm.assert_categorical_equal(result, expected) # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) msg = 'dtype of categories must be the same' with tm.assert_raises_regex(TypeError, msg): union_categoricals([s, s2]) msg = 'No Categoricals to union' with tm.assert_raises_regex(ValueError, msg): union_categoricals([])
def test_union_categoricals_ordered(self): c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False) msg = 'Categorical.ordered must be the same' with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2]) res = union_categoricals([c1, c1]) exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3, np.nan], ordered=True) c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) res = union_categoricals([c1, c2]) exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) msg = "to union ordered Categoricals, all categories must be the same" with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2])
def test_union_categoricals_ordered(self): c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False) msg = 'Categorical.ordered must be the same' with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2]) res = union_categoricals([c1, c1]) exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3, np.nan], ordered=True) c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) res = union_categoricals([c1, c2]) exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) msg = "to union ordered Categoricals, all categories must be the same" with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2])
def test_union_categorical(self): # GH 13361 data = [ (list('abc'), list('abd'), list('abcabd')), ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), (pd.date_range('2014-01-01', '2014-01-05'), pd.date_range('2014-01-06', '2014-01-07'), pd.date_range('2014-01-01', '2014-01-07')), (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), (pd.period_range('2014-01-01', '2014-01-05'), pd.period_range('2014-01-06', '2014-01-07'), pd.period_range('2014-01-01', '2014-01-07')), ] for a, b, combined in data: for box in [Categorical, CategoricalIndex, Series]: result = union_categoricals([box(Categorical(a)), box(Categorical(b))]) expected = Categorical(combined) tm.assert_categorical_equal(result, expected, check_category_order=True) # new categories ordered by appearance s = Categorical(['x', 'y', 'z']) s2 = Categorical(['a', 'b', 'c']) result = union_categoricals([s, s2]) expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], categories=['x', 'y', 'z', 'a', 'b', 'c']) tm.assert_categorical_equal(result, expected) s = Categorical([0, 1.2, 2], ordered=True) s2 = Categorical([0, 1.2, 2], ordered=True) result = union_categoricals([s, s2]) expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) tm.assert_categorical_equal(result, expected) # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) msg = 'dtype of categories must be the same' with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([s, s2]) msg = 'No Categoricals to union' with tm.assertRaisesRegexp(ValueError, msg): union_categoricals([])
def find_common_type_cat(types): if all(isinstance(t, pandas.CategoricalDtype) for t in types): if all(t.ordered for t in types): return pandas.CategoricalDtype( np.sort(np.unique([c for t in types for c in t.categories])[0]), ordered=True, ) return union_categoricals( [pandas.Categorical([], dtype=t) for t in types], sort_categories=all(t.ordered for t in types), ).dtype else: return find_common_type(types)
def test_union_categoricals_sort_false(self): # GH 13846 c1 = Categorical(["x", "y", "z"]) c2 = Categorical(["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical(["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]) tm.assert_categorical_equal(result, expected) # fastpath c1 = Categorical(["a", "b"], categories=["b", "a", "c"]) c2 = Categorical(["b", "c"], categories=["b", "a", "c"]) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"]) tm.assert_categorical_equal(result, expected) # fastpath - skip resort c1 = Categorical(["a", "b"], categories=["a", "b", "c"]) c2 = Categorical(["b", "c"], categories=["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) c1 = Categorical(["x", np.nan]) c2 = Categorical([np.nan, "b"]) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"]) tm.assert_categorical_equal(result, expected) c1 = Categorical([np.nan]) c2 = Categorical([np.nan]) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical([np.nan, np.nan]) tm.assert_categorical_equal(result, expected) c1 = Categorical([]) c2 = Categorical([]) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical([]) tm.assert_categorical_equal(result, expected) c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True) c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True) result = union_categoricals([c1, c2], sort_categories=False) expected = Categorical(["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True) tm.assert_categorical_equal(result, expected)
def test_union_categorical(self, a, b, combined, box): # GH 13361 result = union_categoricals([box(Categorical(a)), box(Categorical(b))]) expected = Categorical(combined) tm.assert_categorical_equal(result, expected)
def test_union_categoricals_ignore_order(self): # GH 15219 c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False) res = union_categoricals([c1, c2], ignore_order=True) exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) msg = 'Categorical.ordered must be the same' with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2], ignore_order=False) res = union_categoricals([c1, c1], ignore_order=True) exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) res = union_categoricals([c1, c1], ignore_order=False) exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3, np.nan], ordered=True) c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) res = union_categoricals([c1, c2], ignore_order=True) exp = Categorical([1, 2, 3, np.nan, 3, 2]) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) res = union_categoricals([c1, c2], ignore_order=True) exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True) exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([4, 5, 6], ordered=True) result = union_categoricals([c1, c2], ignore_order=True) expected = Categorical([1, 2, 3, 4, 5, 6]) tm.assert_categorical_equal(result, expected) msg = "to union ordered Categoricals, all categories must be the same" with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2], ignore_order=False) with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2])
def test_union_categorical_ordered_true(self): s = Categorical([0, 1.2, 2], ordered=True) s2 = Categorical([0, 1.2, 2], ordered=True) result = union_categoricals([s, s2]) expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) tm.assert_categorical_equal(result, expected)
def test_union_categorical_empty(self): msg = "No Categoricals to union" with pytest.raises(ValueError, match=msg): union_categoricals([])
def test_union_categoricals_ignore_order(self): # GH 15219 c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False) res = union_categoricals([c1, c2], ignore_order=True) exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) msg = 'Categorical.ordered must be the same' with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2], ignore_order=False) res = union_categoricals([c1, c1], ignore_order=True) exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) res = union_categoricals([c1, c1], ignore_order=False) exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3, np.nan], ordered=True) c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) res = union_categoricals([c1, c2], ignore_order=True) exp = Categorical([1, 2, 3, np.nan, 3, 2]) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) res = union_categoricals([c1, c2], ignore_order=True) exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True) exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([4, 5, 6], ordered=True) result = union_categoricals([c1, c2], ignore_order=True) expected = Categorical([1, 2, 3, 4, 5, 6]) tm.assert_categorical_equal(result, expected) msg = "to union ordered Categoricals, all categories must be the same" with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2], ignore_order=False) with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2])