def test_union_categoricals_nan(self): # GH 13759 res = union_categoricals([pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])]) exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical(['A', 'B']), pd.Categorical(['B', 'B', np.nan])]) exp = Categorical(['A', 'B', 'B', 'B', np.nan]) tm.assert_categorical_equal(res, exp) val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.NaT] val2 = [pd.NaT, pd.Timestamp('2011-01-01'), pd.Timestamp('2011-02-01')] res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) exp = Categorical(val1 + val2, categories=[pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.Timestamp('2011-02-01')]) tm.assert_categorical_equal(res, exp) # all NaN res = union_categoricals([pd.Categorical([np.nan, np.nan]), pd.Categorical(['X'])]) exp = Categorical([np.nan, np.nan, 'X']) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])]) exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp)
def test_deprecation_access_func(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): from pandas.types.concat import union_categoricals c1 = pd.Categorical(list('aabc')) c2 = pd.Categorical(list('abcd')) union_categoricals([c1, c2], sort_categories=True, ignore_order=True)
def test_deprecation_access_func(self): with tm.assert_produces_warning( FutureWarning, check_stacklevel=False): from pandas.types.concat import union_categoricals c1 = pd.Categorical(list('aabc')) c2 = pd.Categorical(list('abcd')) union_categoricals( [c1, c2], sort_categories=True, ignore_order=True)
def test_union_categoricals_sort(self): # GH 13846 c1 = Categorical(['x', 'y', 'z']) c2 = Categorical(['a', 'b', 'c']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], categories=['a', 'b', 'c', 'x', 'y', 'z']) tm.assert_categorical_equal(result, expected) # fastpath c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b']) c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) # fastpath - skip resort c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) c1 = Categorical(['x', np.nan]) c2 = Categorical([np.nan, 'b']) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical(['x', np.nan, np.nan, 'b'], categories=['b', 'x']) tm.assert_categorical_equal(result, expected) c1 = Categorical([np.nan]) c2 = Categorical([np.nan]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical([np.nan, np.nan], categories=[]) tm.assert_categorical_equal(result, expected) c1 = Categorical([]) c2 = Categorical([]) result = union_categoricals([c1, c2], sort_categories=True) expected = Categorical([]) tm.assert_categorical_equal(result, expected) c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) with tm.assertRaises(TypeError): union_categoricals([c1, c2], sort_categories=True)
def test_union_categoricals_empty(self): # GH 13759 res = union_categoricals([pd.Categorical([]), pd.Categorical([])]) exp = Categorical([]) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical([]), pd.Categorical([1.0])]) exp = Categorical([1.0]) tm.assert_categorical_equal(res, exp) # to make dtype equal nanc = pd.Categorical(np.array([np.nan], dtype=np.float64)) res = union_categoricals([nanc, pd.Categorical([])]) tm.assert_categorical_equal(res, nanc)
def test_union_categorical_same_category(self): # check fastpath c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) res = union_categoricals([c1, c2]) exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4]) tm.assert_categorical_equal(res, exp) c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z']) c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z']) res = union_categoricals([c1, c2]) exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'], categories=['x', 'y', 'z']) tm.assert_categorical_equal(res, exp)
def test_union_categorical_unwrap(self): # GH 14173 c1 = Categorical(['a', 'b']) c2 = pd.Series(['b', 'c'], dtype='category') result = union_categoricals([c1, c2]) expected = Categorical(['a', 'b', 'b', 'c']) tm.assert_categorical_equal(result, expected) c2 = CategoricalIndex(c2) result = union_categoricals([c1, c2]) tm.assert_categorical_equal(result, expected) c1 = Series(c1) result = union_categoricals([c1, c2]) tm.assert_categorical_equal(result, expected) with tm.assertRaises(TypeError): union_categoricals([c1, ['a', 'b', 'c']])
def test_union_categoricals_ordered(self): c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False) msg = 'Categorical.ordered must be the same' with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2]) res = union_categoricals([c1, c1]) exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3, np.nan], ordered=True) c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) res = union_categoricals([c1, c2]) exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) msg = "to union ordered Categoricals, all categories must be the same" with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2])
def test_union_categorical(self): # GH 13361 data = [ (list('abc'), list('abd'), list('abcabd')), ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), (pd.date_range('2014-01-01', '2014-01-05'), pd.date_range('2014-01-06', '2014-01-07'), pd.date_range('2014-01-01', '2014-01-07')), (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), (pd.period_range('2014-01-01', '2014-01-05'), pd.period_range('2014-01-06', '2014-01-07'), pd.period_range('2014-01-01', '2014-01-07')), ] for a, b, combined in data: for box in [Categorical, CategoricalIndex, Series]: result = union_categoricals([box(Categorical(a)), box(Categorical(b))]) expected = Categorical(combined) tm.assert_categorical_equal(result, expected, check_category_order=True) # new categories ordered by appearance s = Categorical(['x', 'y', 'z']) s2 = Categorical(['a', 'b', 'c']) result = union_categoricals([s, s2]) expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], categories=['x', 'y', 'z', 'a', 'b', 'c']) tm.assert_categorical_equal(result, expected) s = Categorical([0, 1.2, 2], ordered=True) s2 = Categorical([0, 1.2, 2], ordered=True) result = union_categoricals([s, s2]) expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) tm.assert_categorical_equal(result, expected) # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) msg = 'dtype of categories must be the same' with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([s, s2]) msg = 'No Categoricals to union' with tm.assertRaisesRegexp(ValueError, msg): union_categoricals([])
def test_union_categoricals_nan(self): # GH 13759 res = union_categoricals( [pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])]) exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) tm.assert_categorical_equal(res, exp) res = union_categoricals( [pd.Categorical(['A', 'B']), pd.Categorical(['B', 'B', np.nan])]) exp = Categorical(['A', 'B', 'B', 'B', np.nan]) tm.assert_categorical_equal(res, exp) val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.NaT] val2 = [pd.NaT, pd.Timestamp('2011-01-01'), pd.Timestamp('2011-02-01')] res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) exp = Categorical(val1 + val2, categories=[ pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.Timestamp('2011-02-01') ]) tm.assert_categorical_equal(res, exp) # all NaN res = union_categoricals( [pd.Categorical([np.nan, np.nan]), pd.Categorical(['X'])]) exp = Categorical([np.nan, np.nan, 'X']) tm.assert_categorical_equal(res, exp) res = union_categoricals([ pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan]) ]) exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp)
def test_union_categorical(self): # GH 13361 data = [ (list('abc'), list('abd'), list('abcabd')), ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), (pd.date_range('2014-01-01', '2014-01-05'), pd.date_range('2014-01-06', '2014-01-07'), pd.date_range('2014-01-01', '2014-01-07')), (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), (pd.period_range('2014-01-01', '2014-01-05'), pd.period_range('2014-01-06', '2014-01-07'), pd.period_range('2014-01-01', '2014-01-07')), ] for a, b, combined in data: for box in [Categorical, CategoricalIndex, Series]: result = union_categoricals( [box(Categorical(a)), box(Categorical(b))]) expected = Categorical(combined) tm.assert_categorical_equal(result, expected, check_category_order=True) # new categories ordered by appearance s = Categorical(['x', 'y', 'z']) s2 = Categorical(['a', 'b', 'c']) result = union_categoricals([s, s2]) expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], categories=['x', 'y', 'z', 'a', 'b', 'c']) tm.assert_categorical_equal(result, expected) s = Categorical([0, 1.2, 2], ordered=True) s2 = Categorical([0, 1.2, 2], ordered=True) result = union_categoricals([s, s2]) expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) tm.assert_categorical_equal(result, expected) # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) msg = 'dtype of categories must be the same' with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([s, s2]) msg = 'No Categoricals to union' with tm.assertRaisesRegexp(ValueError, msg): union_categoricals([])
def test_union_categorical(self): # GH 13361 data = [ (list('abc'), list('abd'), list('abcabd')), ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), (pd.date_range('2014-01-01', '2014-01-05'), pd.date_range('2014-01-06', '2014-01-07'), pd.date_range('2014-01-01', '2014-01-07')), (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), (pd.period_range('2014-01-01', '2014-01-05'), pd.period_range('2014-01-06', '2014-01-07'), pd.period_range('2014-01-01', '2014-01-07')), ] for a, b, combined in data: result = union_categoricals([Categorical(a), Categorical(b)]) expected = Categorical(combined) tm.assert_categorical_equal(result, expected, check_category_order=True) # new categories ordered by appearance s = Categorical(['x', 'y', 'z']) s2 = Categorical(['a', 'b', 'c']) result = union_categoricals([s, s2]).categories expected = Index(['x', 'y', 'z', 'a', 'b', 'c']) tm.assert_index_equal(result, expected) # can't be ordered s = Categorical([0, 1.2, 2], ordered=True) s2 = Categorical([0, 1.2, 2], ordered=True) with tm.assertRaises(TypeError): union_categoricals([s, s2]) # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) with tm.assertRaises(TypeError): union_categoricals([s, s2]) with tm.assertRaises(ValueError): union_categoricals([])
def time_union(self): union_categoricals([self.a, self.b])
def test_union_categoricals_ignore_order(self): # GH 15219 c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False) res = union_categoricals([c1, c2], ignore_order=True) exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) msg = 'Categorical.ordered must be the same' with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2], ignore_order=False) res = union_categoricals([c1, c1], ignore_order=True) exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) res = union_categoricals([c1, c1], ignore_order=False) exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3, np.nan], ordered=True) c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) res = union_categoricals([c1, c2], ignore_order=True) exp = Categorical([1, 2, 3, np.nan, 3, 2]) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) res = union_categoricals([c1, c2], ignore_order=True) exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True) exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([4, 5, 6], ordered=True) result = union_categoricals([c1, c2], ignore_order=True) expected = Categorical([1, 2, 3, 4, 5, 6]) tm.assert_categorical_equal(result, expected) msg = "to union ordered Categoricals, all categories must be the same" with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2], ignore_order=False) with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2])
def concat(dfs, axis=0, join='outer', uniform=False): """Concatenate, handling some edge cases: - Unions categoricals between partitions - Ignores empty partitions Parameters ---------- dfs : list of DataFrame, Series, or Index axis : int or str, optional join : str, optional uniform : bool, optional Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to True if all arguments have the same columns and dtypes (but not necessarily categories). Default is False. """ if axis == 1: return pd.concat(dfs, axis=axis, join=join) if len(dfs) == 1: return dfs[0] # Support concatenating indices along axis 0 if isinstance(dfs[0], pd.Index): if isinstance(dfs[0], pd.CategoricalIndex): return pd.CategoricalIndex(union_categoricals(dfs), name=dfs[0].name) return dfs[0].append(dfs[1:]) # Handle categorical index separately if isinstance(dfs[0].index, pd.CategoricalIndex): dfs2 = [df.reset_index(drop=True) for df in dfs] ind = concat([df.index for df in dfs]) else: dfs2 = dfs ind = None # Concatenate the partitions together, handling categories as needed if (isinstance(dfs2[0], pd.DataFrame) if uniform else any(isinstance(df, pd.DataFrame) for df in dfs2)): if uniform: dfs3 = dfs2 cat_mask = dfs2[0].dtypes == 'category' else: # When concatenating mixed dataframes and series on axis 1, Pandas # converts series to dataframes with a single column named 0, then # concatenates. dfs3 = [df if isinstance(df, pd.DataFrame) else df.rename(0).to_frame() for df in dfs2] cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T for df in dfs3], join=join).any() if cat_mask.any(): not_cat = cat_mask[~cat_mask].index out = pd.concat([df[df.columns.intersection(not_cat)] for df in dfs3], join=join) for col in cat_mask.index.difference(not_cat): # Find an example of categoricals in this column for df in dfs3: sample = df.get(col) if sample is not None: break # Extract partitions, subbing in missing if needed parts = [] for df in dfs3: if col in df.columns: parts.append(df[col]) else: codes = np.full(len(df), -1, dtype='i8') data = pd.Categorical.from_codes(codes, sample.cat.categories, sample.cat.ordered) parts.append(data) out[col] = union_categoricals(parts) out = out.reindex_axis(cat_mask.index, axis=1) else: out = pd.concat(dfs3, join=join) else: if is_categorical_dtype(dfs2[0].dtype): if ind is None: ind = concat([df.index for df in dfs2]) return pd.Series(union_categoricals(dfs2), index=ind, name=dfs2[0].name) out = pd.concat(dfs2, join=join) # Re-add the index if needed if ind is not None: out.index = ind return out
def concat(dfs, axis=0, join='outer', uniform=False): """Concatenate, handling some edge cases: - Unions categoricals between partitions - Ignores empty partitions Parameters ---------- dfs : list of DataFrame, Series, or Index axis : int or str, optional join : str, optional uniform : bool, optional Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to True if all arguments have the same columns and dtypes (but not necessarily categories). Default is False. """ if axis == 1: return pd.concat(dfs, axis=axis, join=join) # Support concatenating indices along axis 0 if isinstance(dfs[0], pd.Index): if isinstance(dfs[0], pd.CategoricalIndex): return pd.CategoricalIndex(union_categoricals(dfs), name=dfs[0].name) return dfs[0].append(dfs[1:]) # Handle categorical index separately if isinstance(dfs[0].index, pd.CategoricalIndex): dfs2 = [df.reset_index(drop=True) for df in dfs] ind = concat([df.index for df in dfs]) else: dfs2 = dfs ind = None # Concatenate the partitions together, handling categories as needed if (isinstance(dfs2[0], pd.DataFrame) if uniform else any( isinstance(df, pd.DataFrame) for df in dfs2)): if uniform: dfs3 = dfs2 cat_mask = dfs2[0].dtypes == 'category' else: # When concatenating mixed dataframes and series on axis 1, Pandas # converts series to dataframes with a single column named 0, then # concatenates. dfs3 = [ df if isinstance(df, pd.DataFrame) else df.rename(0).to_frame() for df in dfs2 ] cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T for df in dfs3], join=join).any() if cat_mask.any(): not_cat = cat_mask[~cat_mask].index out = pd.concat( [df[df.columns.intersection(not_cat)] for df in dfs3], join=join) for col in cat_mask.index.difference(not_cat): # Find an example of categoricals in this column for df in dfs3: sample = df.get(col) if sample is not None: break # Extract partitions, subbing in missing if needed parts = [] for df in dfs3: if col in df.columns: parts.append(df[col]) else: codes = np.full(len(df), -1, dtype='i8') data = pd.Categorical.from_codes( codes, sample.cat.categories, sample.cat.ordered) parts.append(data) out[col] = union_categoricals(parts) out = out.reindex_axis(cat_mask.index, axis=1) else: out = pd.concat(dfs3, join=join) else: if is_categorical_dtype(dfs2[0].dtype): if ind is None: ind = concat([df.index for df in dfs2]) return pd.Series(union_categoricals(dfs2), index=ind, name=dfs2[0].name) out = pd.concat(dfs2, join=join) # Re-add the index if needed if ind is not None: out.index = ind return out
def time_union_categorical(self): union_categoricals([self.a, self.b])