Exemple #1
0
    def test_union_categoricals_nan(self):
        # GH 13759
        res = union_categoricals([pd.Categorical([1, 2, np.nan]),
                                  pd.Categorical([3, 2, np.nan])])
        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical(['A', 'B']),
                                  pd.Categorical(['B', 'B', np.nan])])
        exp = Categorical(['A', 'B', 'B', 'B', np.nan])
        tm.assert_categorical_equal(res, exp)

        val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
                pd.NaT]
        val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
                pd.Timestamp('2011-02-01')]

        res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
        exp = Categorical(val1 + val2,
                          categories=[pd.Timestamp('2011-01-01'),
                                      pd.Timestamp('2011-03-01'),
                                      pd.Timestamp('2011-02-01')])
        tm.assert_categorical_equal(res, exp)

        # all NaN
        res = union_categoricals([pd.Categorical([np.nan, np.nan]),
                                  pd.Categorical(['X'])])
        exp = Categorical([np.nan, np.nan, 'X'])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical([np.nan, np.nan]),
                                  pd.Categorical([np.nan, np.nan])])
        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(res, exp)
    def test_union_categoricals_nan(self):
        # GH 13759
        res = union_categoricals([pd.Categorical([1, 2, np.nan]),
                                  pd.Categorical([3, 2, np.nan])])
        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical(['A', 'B']),
                                  pd.Categorical(['B', 'B', np.nan])])
        exp = Categorical(['A', 'B', 'B', 'B', np.nan])
        tm.assert_categorical_equal(res, exp)

        val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
                pd.NaT]
        val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
                pd.Timestamp('2011-02-01')]

        res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
        exp = Categorical(val1 + val2,
                          categories=[pd.Timestamp('2011-01-01'),
                                      pd.Timestamp('2011-03-01'),
                                      pd.Timestamp('2011-02-01')])
        tm.assert_categorical_equal(res, exp)

        # all NaN
        res = union_categoricals([pd.Categorical([np.nan, np.nan]),
                                  pd.Categorical(['X'])])
        exp = Categorical([np.nan, np.nan, 'X'])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical([np.nan, np.nan]),
                                  pd.Categorical([np.nan, np.nan])])
        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(res, exp)
Exemple #3
0
 def test_union_categorical_match_types(self):
     # must exactly match types
     s = Categorical([0, 1.2, 2])
     s2 = Categorical([2, 3, 4])
     msg = "dtype of categories must be the same"
     with pytest.raises(TypeError, match=msg):
         union_categoricals([s, s2])
    def test_union_categoricals_empty(self):
        # GH 13759
        res = union_categoricals([pd.Categorical([]), pd.Categorical([])])
        exp = Categorical([])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([Categorical([]), Categorical(["1"])])
        exp = Categorical(["1"])
        tm.assert_categorical_equal(res, exp)
    def test_union_categoricals_empty(self):
        # GH 13759
        res = union_categoricals([pd.Categorical([]),
                                  pd.Categorical([])])
        exp = Categorical([])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([Categorical([]),
                                  Categorical(['1'])])
        exp = Categorical(['1'])
        tm.assert_categorical_equal(res, exp)
    def test_union_categorical_same_category(self):
        # check fastpath
        c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"])
        c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"])
        res = union_categoricals([c1, c2])
        exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"])
        tm.assert_categorical_equal(res, exp)
    def test_union_categoricals_sort(self):
        # GH 13846
        c1 = Categorical(["x", "y", "z"])
        c2 = Categorical(["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["x", "y", "z", "a", "b", "c"],
                               categories=["a", "b", "c", "x", "y", "z"])
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
        c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["a", "b", "b", "c"],
                               categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["a", "b"], categories=["c", "a", "b"])
        c2 = Categorical(["b", "c"], categories=["c", "a", "b"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["a", "b", "b", "c"],
                               categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
        c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["a", "b", "b", "c"],
                               categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["x", np.nan])
        c2 = Categorical([np.nan, "b"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["x", np.nan, np.nan, "b"],
                               categories=["b", "x"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
        c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
        msg = "Cannot use sort_categories=True with ordered Categoricals"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], sort_categories=True)
    def test_union_categoricals_empty(self):
        # GH 13759
        res = union_categoricals([pd.Categorical([]), pd.Categorical([])])
        exp = Categorical([])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical([]), pd.Categorical([1.0])])
        exp = Categorical([1.0])
        tm.assert_categorical_equal(res, exp)

        # to make dtype equal
        nanc = pd.Categorical(np.array([np.nan], dtype=np.float64))
        res = union_categoricals([nanc, pd.Categorical([])])
        tm.assert_categorical_equal(res, nanc)
    def test_union_categoricals_sort(self):
        # GH 13846
        c1 = Categorical(['x', 'y', 'z'])
        c2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['a', 'b', 'c', 'x', 'y', 'z'])
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b'])
        c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['x', np.nan])
        c2 = Categorical([np.nan, 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', np.nan, np.nan, 'b'],
                               categories=['b', 'x'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([np.nan, np.nan], categories=[])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
        with tm.assertRaises(TypeError):
            union_categoricals([c1, c2], sort_categories=True)
    def test_union_categoricals_sort(self):
        # GH 13846
        c1 = Categorical(['x', 'y', 'z'])
        c2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['a', 'b', 'c', 'x', 'y', 'z'])
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b'])
        c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['x', np.nan])
        c2 = Categorical([np.nan, 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', np.nan, np.nan, 'b'],
                               categories=['b', 'x'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([np.nan, np.nan], categories=[])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
        with tm.assertRaises(TypeError):
            union_categoricals([c1, c2], sort_categories=True)
    def test_union_categorical_same_category(self):
        # check fastpath
        c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                          categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
        c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
        res = union_categoricals([c1, c2])
        exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                          categories=['x', 'y', 'z'])
        tm.assert_categorical_equal(res, exp)
    def test_union_categorical_same_category(self):
        # check fastpath
        c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                          categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
        c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
        res = union_categoricals([c1, c2])
        exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                          categories=['x', 'y', 'z'])
        tm.assert_categorical_equal(res, exp)
Exemple #13
0
 def test_union_categoricals_sort_false_one_nan(self):
     c1 = Categorical(["x", np.nan])
     c2 = Categorical([np.nan, "b"])
     result = union_categoricals([c1, c2], sort_categories=False)
     expected = Categorical(["x", np.nan, np.nan, "b"],
                            categories=["x", "b"])
     tm.assert_categorical_equal(result, expected)
Exemple #14
0
def find_common_type_cat(types):
    """
    Find a common data type among the given dtypes.

    Parameters
    ----------
    types : array-like
        Array of dtypes.

    Returns
    -------
    pandas.core.dtypes.dtypes.ExtensionDtype or
    np.dtype or
    None
        `dtype` that is common for all passed `types`.
    """
    if all(isinstance(t, pandas.CategoricalDtype) for t in types):
        if all(t.ordered for t in types):
            return pandas.CategoricalDtype(
                np.sort(np.unique([c for t in types for c in t.categories])[0]),
                ordered=True,
            )
        return union_categoricals(
            [pandas.Categorical([], dtype=t) for t in types],
            sort_categories=all(t.ordered for t in types),
        ).dtype
    else:
        return find_common_type(list(types))
Exemple #15
0
 def test_union_categorical_same_category_str(self):
     c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"])
     c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"])
     res = union_categoricals([c1, c2])
     exp = Categorical(["z", "z", "z", "x", "x", "x"],
                       categories=["x", "y", "z"])
     tm.assert_categorical_equal(res, exp)
Exemple #16
0
 def test_union_categoricals_sort_false(self):
     # GH 13846
     c1 = Categorical(["x", "y", "z"])
     c2 = Categorical(["a", "b", "c"])
     result = union_categoricals([c1, c2], sort_categories=False)
     expected = Categorical(["x", "y", "z", "a", "b", "c"],
                            categories=["x", "y", "z", "a", "b", "c"])
     tm.assert_categorical_equal(result, expected)
Exemple #17
0
 def test_union_categoricals_sort_false_skipresort(self):
     # fastpath - skip resort
     c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
     c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
     result = union_categoricals([c1, c2], sort_categories=False)
     expected = Categorical(["a", "b", "b", "c"],
                            categories=["a", "b", "c"])
     tm.assert_categorical_equal(result, expected)
 def test_union_categorical_same_categories_different_order(self):
     # https://github.com/pandas-dev/pandas/issues/19096
     c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])
     c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])
     result = union_categoricals([c1, c2])
     expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
                            categories=['a', 'b', 'c'])
     tm.assert_categorical_equal(result, expected)
Exemple #19
0
 def test_union_categoricals_sort_false_ordered_true(self):
     c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
     c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
     result = union_categoricals([c1, c2], sort_categories=False)
     expected = Categorical(["b", "a", "a", "c"],
                            categories=["b", "a", "c"],
                            ordered=True)
     tm.assert_categorical_equal(result, expected)
 def test_union_categorical_same_categories_different_order(self):
     # https://github.com/pandas-dev/pandas/issues/19096
     c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])
     c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])
     result = union_categoricals([c1, c2])
     expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
                            categories=['a', 'b', 'c'])
     tm.assert_categorical_equal(result, expected)
Exemple #21
0
 def test_union_categorical_ordered_appearance(self):
     # new categories ordered by appearance
     s = Categorical(["x", "y", "z"])
     s2 = Categorical(["a", "b", "c"])
     result = union_categoricals([s, s2])
     expected = Categorical(["x", "y", "z", "a", "b", "c"],
                            categories=["x", "y", "z", "a", "b", "c"])
     tm.assert_categorical_equal(result, expected)
    def test_union_categoricals_empty(self):
        # GH 13759
        res = union_categoricals([pd.Categorical([]),
                                  pd.Categorical([])])
        exp = Categorical([])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical([]),
                                  pd.Categorical([1.0])])
        exp = Categorical([1.0])
        tm.assert_categorical_equal(res, exp)

        # to make dtype equal
        nanc = pd.Categorical(np.array([np.nan], dtype=np.float64))
        res = union_categoricals([nanc,
                                  pd.Categorical([])])
        tm.assert_categorical_equal(res, nanc)
    def test_union_categorical(self):
        # GH 13361
        data = [
            (list("abc"), list("abd"), list("abcabd")),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
            (
                ["b", "b", np.nan, "a"],
                ["a", np.nan, "c"],
                ["b", "b", np.nan, "a", "a", np.nan, "c"],
            ),
            (
                pd.date_range("2014-01-01", "2014-01-05"),
                pd.date_range("2014-01-06", "2014-01-07"),
                pd.date_range("2014-01-01", "2014-01-07"),
            ),
            (
                pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"),
                pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"),
                pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"),
            ),
            (
                pd.period_range("2014-01-01", "2014-01-05"),
                pd.period_range("2014-01-06", "2014-01-07"),
                pd.period_range("2014-01-01", "2014-01-07"),
            ),
        ]

        for a, b, combined in data:
            for box in [Categorical, CategoricalIndex, Series]:
                result = union_categoricals(
                    [box(Categorical(a)),
                     box(Categorical(b))])
                expected = Categorical(combined)
                tm.assert_categorical_equal(result, expected)

        # new categories ordered by appearance
        s = Categorical(["x", "y", "z"])
        s2 = Categorical(["a", "b", "c"])
        result = union_categoricals([s, s2])
        expected = Categorical(["x", "y", "z", "a", "b", "c"],
                               categories=["x", "y", "z", "a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        result = union_categoricals([s, s2])
        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # must exactly match types
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        msg = "dtype of categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([s, s2])

        msg = "No Categoricals to union"
        with pytest.raises(ValueError, match=msg):
            union_categoricals([])
    def test_union_categorical_unwrap(self):
        # GH 14173
        c1 = Categorical(["a", "b"])
        c2 = pd.Series(["b", "c"], dtype="category")
        result = union_categoricals([c1, c2])
        expected = Categorical(["a", "b", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c2 = CategoricalIndex(c2)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        c1 = Series(c1)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        with pytest.raises(TypeError):
            union_categoricals([c1, ["a", "b", "c"]])
    def test_union_categorical_unwrap(self):
        # GH 14173
        c1 = Categorical(['a', 'b'])
        c2 = pd.Series(['b', 'c'], dtype='category')
        result = union_categoricals([c1, c2])
        expected = Categorical(['a', 'b', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c2 = CategoricalIndex(c2)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        c1 = Series(c1)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        with tm.assertRaises(TypeError):
            union_categoricals([c1, ['a', 'b', 'c']])
    def test_union_categorical_unwrap(self):
        # GH 14173
        c1 = Categorical(['a', 'b'])
        c2 = pd.Series(['b', 'c'], dtype='category')
        result = union_categoricals([c1, c2])
        expected = Categorical(['a', 'b', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c2 = CategoricalIndex(c2)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        c1 = Series(c1)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        with tm.assertRaises(TypeError):
            union_categoricals([c1, ['a', 'b', 'c']])
Exemple #27
0
def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
    """
    Concatenate chunks of data read with low_memory=True.

    The tricky part is handling Categoricals, where different chunks
    may have different inferred categories.
    """
    names = list(chunks[0].keys())
    warning_columns = []

    result = {}
    for name in names:
        arrs = [chunk.pop(name) for chunk in chunks]
        # Check each arr for consistent types.
        dtypes = {a.dtype for a in arrs}
        # TODO: shouldn't we exclude all EA dtypes here?
        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
        if len(numpy_dtypes) > 1:
            # error: Argument 1 to "find_common_type" has incompatible type
            # "Set[Any]"; expected "Sequence[Union[dtype[Any], None, type,
            # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]"
            common_type = np.find_common_type(
                numpy_dtypes,  # type: ignore[arg-type]
                [],
            )
            # error: Non-overlapping equality check (left operand type: "dtype[Any]",
            # right operand type: "Type[object]")
            if common_type == object:  # type: ignore[comparison-overlap]
                warning_columns.append(str(name))

        dtype = dtypes.pop()
        if is_categorical_dtype(dtype):
            result[name] = union_categoricals(arrs, sort_categories=False)
        else:
            if isinstance(dtype, ExtensionDtype):
                # TODO: concat_compat?
                array_type = dtype.construct_array_type()
                # error: Argument 1 to "_concat_same_type" of "ExtensionArray"
                # has incompatible type "List[Union[ExtensionArray, ndarray]]";
                # expected "Sequence[ExtensionArray]"
                result[name] = array_type._concat_same_type(
                    arrs  # type: ignore[arg-type]
                )
            else:
                result[name] = np.concatenate(arrs)

    if warning_columns:
        warning_names = ",".join(warning_columns)
        warning_message = " ".join(
            [
                f"Columns ({warning_names}) have mixed types. "
                f"Specify dtype option on import or set low_memory=False."
            ]
        )
        warnings.warn(warning_message, DtypeWarning, stacklevel=8)
    return result
    def test_union_categorical_unwrap(self):
        # GH 14173
        c1 = Categorical(["a", "b"])
        c2 = Series(["b", "c"], dtype="category")
        result = union_categoricals([c1, c2])
        expected = Categorical(["a", "b", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c2 = CategoricalIndex(c2)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        c1 = Series(c1)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        msg = "all components to combine must be Categorical"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, ["a", "b", "c"]])
    def test_union_categoricals_nan(self):
        # GH 13759
        res = union_categoricals(
            [pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])]
        )
        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals(
            [pd.Categorical(["A", "B"]), pd.Categorical(["B", "B", np.nan])]
        )
        exp = Categorical(["A", "B", "B", "B", np.nan])
        tm.assert_categorical_equal(res, exp)

        val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT]
        val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")]

        res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
        exp = Categorical(
            val1 + val2,
            categories=[
                pd.Timestamp("2011-01-01"),
                pd.Timestamp("2011-03-01"),
                pd.Timestamp("2011-02-01"),
            ],
        )
        tm.assert_categorical_equal(res, exp)

        # all NaN
        res = union_categoricals(
            [
                pd.Categorical(np.array([np.nan, np.nan], dtype=object)),
                pd.Categorical(["X"]),
            ]
        )
        exp = Categorical([np.nan, np.nan, "X"])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals(
            [pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])]
        )
        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(res, exp)
Exemple #30
0
    def test_union_categorical(self):
        # GH 13361
        data = [
            (list('abc'), list('abd'), list('abcabd')),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),

            (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
             ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),

            (pd.date_range('2014-01-01', '2014-01-05'),
             pd.date_range('2014-01-06', '2014-01-07'),
             pd.date_range('2014-01-01', '2014-01-07')),

            (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
             pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
             pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),

            (pd.period_range('2014-01-01', '2014-01-05'),
             pd.period_range('2014-01-06', '2014-01-07'),
             pd.period_range('2014-01-01', '2014-01-07')),
        ]

        for a, b, combined in data:
            for box in [Categorical, CategoricalIndex, Series]:
                result = union_categoricals([box(Categorical(a)),
                                             box(Categorical(b))])
                expected = Categorical(combined)
                tm.assert_categorical_equal(result, expected,
                                            check_category_order=True)

        # new categories ordered by appearance
        s = Categorical(['x', 'y', 'z'])
        s2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([s, s2])
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        result = union_categoricals([s, s2])
        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # must exactly match types
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        msg = 'dtype of categories must be the same'
        with tm.assert_raises_regex(TypeError, msg):
            union_categoricals([s, s2])

        msg = 'No Categoricals to union'
        with tm.assert_raises_regex(ValueError, msg):
            union_categoricals([])
    def test_union_categoricals_ordered(self):
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        msg = 'Categorical.ordered must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])

        res = union_categoricals([c1, c1])
        exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])
    def test_union_categoricals_ordered(self):
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        msg = 'Categorical.ordered must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])

        res = union_categoricals([c1, c1])
        exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])
    def test_union_categorical(self):
        # GH 13361
        data = [
            (list('abc'), list('abd'), list('abcabd')),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),

            (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
             ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),

            (pd.date_range('2014-01-01', '2014-01-05'),
             pd.date_range('2014-01-06', '2014-01-07'),
             pd.date_range('2014-01-01', '2014-01-07')),

            (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
             pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
             pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),

            (pd.period_range('2014-01-01', '2014-01-05'),
             pd.period_range('2014-01-06', '2014-01-07'),
             pd.period_range('2014-01-01', '2014-01-07')),
        ]

        for a, b, combined in data:
            for box in [Categorical, CategoricalIndex, Series]:
                result = union_categoricals([box(Categorical(a)),
                                             box(Categorical(b))])
                expected = Categorical(combined)
                tm.assert_categorical_equal(result, expected,
                                            check_category_order=True)

        # new categories ordered by appearance
        s = Categorical(['x', 'y', 'z'])
        s2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([s, s2])
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        result = union_categoricals([s, s2])
        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # must exactly match types
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        msg = 'dtype of categories must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([s, s2])

        msg = 'No Categoricals to union'
        with tm.assertRaisesRegexp(ValueError, msg):
            union_categoricals([])
Exemple #34
0
def find_common_type_cat(types):
    if all(isinstance(t, pandas.CategoricalDtype) for t in types):
        if all(t.ordered for t in types):
            return pandas.CategoricalDtype(
                np.sort(np.unique([c for t in types for c in t.categories])[0]),
                ordered=True,
            )
        return union_categoricals(
            [pandas.Categorical([], dtype=t) for t in types],
            sort_categories=all(t.ordered for t in types),
        ).dtype
    else:
        return find_common_type(types)
Exemple #35
0
    def test_union_categoricals_sort_false(self):
        # GH 13846
        c1 = Categorical(["x", "y", "z"])
        c2 = Categorical(["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["x", "y", "z", "a", "b", "c"],
                               categories=["x", "y", "z", "a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
        c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["a", "b", "b", "c"],
                               categories=["b", "a", "c"])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
        c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["a", "b", "b", "c"],
                               categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["x", np.nan])
        c2 = Categorical([np.nan, "b"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["x", np.nan, np.nan, "b"],
                               categories=["x", "b"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
        c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["b", "a", "a", "c"],
                               categories=["b", "a", "c"],
                               ordered=True)
        tm.assert_categorical_equal(result, expected)
Exemple #36
0
 def test_union_categorical(self, a, b, combined, box):
     # GH 13361
     result = union_categoricals([box(Categorical(a)), box(Categorical(b))])
     expected = Categorical(combined)
     tm.assert_categorical_equal(result, expected)
    def test_union_categoricals_ignore_order(self):
        # GH 15219
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        msg = 'Categorical.ordered must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2], ignore_order=False)

        res = union_categoricals([c1, c1], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c1, c1], ignore_order=False)
        exp = Categorical([1, 2, 3, 1, 2, 3],
                          categories=[1, 2, 3],
                          ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, np.nan, 3, 2])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c2, c1],
                                 ignore_order=True,
                                 sort_categories=True)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([4, 5, 6], ordered=True)
        result = union_categoricals([c1, c2], ignore_order=True)
        expected = Categorical([1, 2, 3, 4, 5, 6])
        tm.assert_categorical_equal(result, expected)

        msg = "to union ordered Categoricals, all categories must be the same"
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2], ignore_order=False)

        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])
Exemple #38
0
 def test_union_categorical_ordered_true(self):
     s = Categorical([0, 1.2, 2], ordered=True)
     s2 = Categorical([0, 1.2, 2], ordered=True)
     result = union_categoricals([s, s2])
     expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
     tm.assert_categorical_equal(result, expected)
Exemple #39
0
 def test_union_categorical_empty(self):
     msg = "No Categoricals to union"
     with pytest.raises(ValueError, match=msg):
         union_categoricals([])
    def test_union_categoricals_ignore_order(self):
        # GH 15219
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        msg = 'Categorical.ordered must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2], ignore_order=False)

        res = union_categoricals([c1, c1], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c1, c1], ignore_order=False)
        exp = Categorical([1, 2, 3, 1, 2, 3],
                          categories=[1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, np.nan, 3, 2])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c2, c1], ignore_order=True,
                                 sort_categories=True)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([4, 5, 6], ordered=True)
        result = union_categoricals([c1, c2], ignore_order=True)
        expected = Categorical([1, 2, 3, 4, 5, 6])
        tm.assert_categorical_equal(result, expected)

        msg = "to union ordered Categoricals, all categories must be the same"
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2], ignore_order=False)

        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])