def test_union_categoricals_nan(self):
        # GH 13759
        res = union_categoricals([pd.Categorical([1, 2, np.nan]),
                                  pd.Categorical([3, 2, np.nan])])
        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical(['A', 'B']),
                                  pd.Categorical(['B', 'B', np.nan])])
        exp = Categorical(['A', 'B', 'B', 'B', np.nan])
        tm.assert_categorical_equal(res, exp)

        val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
                pd.NaT]
        val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
                pd.Timestamp('2011-02-01')]

        res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
        exp = Categorical(val1 + val2,
                          categories=[pd.Timestamp('2011-01-01'),
                                      pd.Timestamp('2011-03-01'),
                                      pd.Timestamp('2011-02-01')])
        tm.assert_categorical_equal(res, exp)

        # all NaN
        res = union_categoricals([pd.Categorical([np.nan, np.nan]),
                                  pd.Categorical(['X'])])
        exp = Categorical([np.nan, np.nan, 'X'])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical([np.nan, np.nan]),
                                  pd.Categorical([np.nan, np.nan])])
        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(res, exp)
Example #2
0
 def test_deprecation_access_func(self):
     with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
         from pandas.types.concat import union_categoricals
         c1 = pd.Categorical(list('aabc'))
         c2 = pd.Categorical(list('abcd'))
         union_categoricals([c1, c2],
                            sort_categories=True,
                            ignore_order=True)
Example #3
0
 def test_deprecation_access_func(self):
     with tm.assert_produces_warning(
             FutureWarning, check_stacklevel=False):
         from pandas.types.concat import union_categoricals
         c1 = pd.Categorical(list('aabc'))
         c2 = pd.Categorical(list('abcd'))
         union_categoricals(
             [c1, c2],
             sort_categories=True,
             ignore_order=True)
Example #4
0
    def test_union_categoricals_sort(self):
        # GH 13846
        c1 = Categorical(['x', 'y', 'z'])
        c2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['a', 'b', 'c', 'x', 'y', 'z'])
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b'])
        c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['x', np.nan])
        c2 = Categorical([np.nan, 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', np.nan, np.nan, 'b'],
                               categories=['b', 'x'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([np.nan, np.nan], categories=[])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
        with tm.assertRaises(TypeError):
            union_categoricals([c1, c2], sort_categories=True)
    def test_union_categoricals_sort(self):
        # GH 13846
        c1 = Categorical(['x', 'y', 'z'])
        c2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['a', 'b', 'c', 'x', 'y', 'z'])
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b'])
        c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['a', 'b', 'b', 'c'],
                               categories=['a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['x', np.nan])
        c2 = Categorical([np.nan, 'b'])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(['x', np.nan, np.nan, 'b'],
                               categories=['b', 'x'])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([np.nan, np.nan], categories=[])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
        with tm.assertRaises(TypeError):
            union_categoricals([c1, c2], sort_categories=True)
Example #6
0
    def test_union_categoricals_empty(self):
        # GH 13759
        res = union_categoricals([pd.Categorical([]), pd.Categorical([])])
        exp = Categorical([])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical([]), pd.Categorical([1.0])])
        exp = Categorical([1.0])
        tm.assert_categorical_equal(res, exp)

        # to make dtype equal
        nanc = pd.Categorical(np.array([np.nan], dtype=np.float64))
        res = union_categoricals([nanc, pd.Categorical([])])
        tm.assert_categorical_equal(res, nanc)
    def test_union_categorical_same_category(self):
        # check fastpath
        c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                          categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
        c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
        res = union_categoricals([c1, c2])
        exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                          categories=['x', 'y', 'z'])
        tm.assert_categorical_equal(res, exp)
Example #8
0
    def test_union_categorical_same_category(self):
        # check fastpath
        c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                          categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
        c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
        res = union_categoricals([c1, c2])
        exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                          categories=['x', 'y', 'z'])
        tm.assert_categorical_equal(res, exp)
    def test_union_categoricals_empty(self):
        # GH 13759
        res = union_categoricals([pd.Categorical([]),
                                  pd.Categorical([])])
        exp = Categorical([])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([pd.Categorical([]),
                                  pd.Categorical([1.0])])
        exp = Categorical([1.0])
        tm.assert_categorical_equal(res, exp)

        # to make dtype equal
        nanc = pd.Categorical(np.array([np.nan], dtype=np.float64))
        res = union_categoricals([nanc,
                                  pd.Categorical([])])
        tm.assert_categorical_equal(res, nanc)
Example #10
0
    def test_union_categorical_unwrap(self):
        # GH 14173
        c1 = Categorical(['a', 'b'])
        c2 = pd.Series(['b', 'c'], dtype='category')
        result = union_categoricals([c1, c2])
        expected = Categorical(['a', 'b', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c2 = CategoricalIndex(c2)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        c1 = Series(c1)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        with tm.assertRaises(TypeError):
            union_categoricals([c1, ['a', 'b', 'c']])
    def test_union_categorical_unwrap(self):
        # GH 14173
        c1 = Categorical(['a', 'b'])
        c2 = pd.Series(['b', 'c'], dtype='category')
        result = union_categoricals([c1, c2])
        expected = Categorical(['a', 'b', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        c2 = CategoricalIndex(c2)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        c1 = Series(c1)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        with tm.assertRaises(TypeError):
            union_categoricals([c1, ['a', 'b', 'c']])
Example #12
0
    def test_union_categoricals_ordered(self):
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        msg = 'Categorical.ordered must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])

        res = union_categoricals([c1, c1])
        exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])
    def test_union_categorical(self):
        # GH 13361
        data = [
            (list('abc'), list('abd'), list('abcabd')),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),

            (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'],
             ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),

            (pd.date_range('2014-01-01', '2014-01-05'),
             pd.date_range('2014-01-06', '2014-01-07'),
             pd.date_range('2014-01-01', '2014-01-07')),

            (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
             pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
             pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),

            (pd.period_range('2014-01-01', '2014-01-05'),
             pd.period_range('2014-01-06', '2014-01-07'),
             pd.period_range('2014-01-01', '2014-01-07')),
        ]

        for a, b, combined in data:
            for box in [Categorical, CategoricalIndex, Series]:
                result = union_categoricals([box(Categorical(a)),
                                             box(Categorical(b))])
                expected = Categorical(combined)
                tm.assert_categorical_equal(result, expected,
                                            check_category_order=True)

        # new categories ordered by appearance
        s = Categorical(['x', 'y', 'z'])
        s2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([s, s2])
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        result = union_categoricals([s, s2])
        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # must exactly match types
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        msg = 'dtype of categories must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([s, s2])

        msg = 'No Categoricals to union'
        with tm.assertRaisesRegexp(ValueError, msg):
            union_categoricals([])
    def test_union_categoricals_ordered(self):
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        msg = 'Categorical.ordered must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])

        res = union_categoricals([c1, c1])
        exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])
Example #15
0
    def test_union_categoricals_nan(self):
        # GH 13759
        res = union_categoricals(
            [pd.Categorical([1, 2, np.nan]),
             pd.Categorical([3, 2, np.nan])])
        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals(
            [pd.Categorical(['A', 'B']),
             pd.Categorical(['B', 'B', np.nan])])
        exp = Categorical(['A', 'B', 'B', 'B', np.nan])
        tm.assert_categorical_equal(res, exp)

        val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.NaT]
        val2 = [pd.NaT, pd.Timestamp('2011-01-01'), pd.Timestamp('2011-02-01')]

        res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
        exp = Categorical(val1 + val2,
                          categories=[
                              pd.Timestamp('2011-01-01'),
                              pd.Timestamp('2011-03-01'),
                              pd.Timestamp('2011-02-01')
                          ])
        tm.assert_categorical_equal(res, exp)

        # all NaN
        res = union_categoricals(
            [pd.Categorical([np.nan, np.nan]),
             pd.Categorical(['X'])])
        exp = Categorical([np.nan, np.nan, 'X'])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([
            pd.Categorical([np.nan, np.nan]),
            pd.Categorical([np.nan, np.nan])
        ])
        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(res, exp)
Example #16
0
    def test_union_categorical(self):
        # GH 13361
        data = [
            (list('abc'), list('abd'), list('abcabd')),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
            (['b', 'b', np.nan,
              'a'], ['a', np.nan,
                     'c'], ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']),
            (pd.date_range('2014-01-01', '2014-01-05'),
             pd.date_range('2014-01-06', '2014-01-07'),
             pd.date_range('2014-01-01', '2014-01-07')),
            (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
             pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
             pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),
            (pd.period_range('2014-01-01', '2014-01-05'),
             pd.period_range('2014-01-06', '2014-01-07'),
             pd.period_range('2014-01-01', '2014-01-07')),
        ]

        for a, b, combined in data:
            for box in [Categorical, CategoricalIndex, Series]:
                result = union_categoricals(
                    [box(Categorical(a)),
                     box(Categorical(b))])
                expected = Categorical(combined)
                tm.assert_categorical_equal(result,
                                            expected,
                                            check_category_order=True)

        # new categories ordered by appearance
        s = Categorical(['x', 'y', 'z'])
        s2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([s, s2])
        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_categorical_equal(result, expected)

        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        result = union_categoricals([s, s2])
        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # must exactly match types
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        msg = 'dtype of categories must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([s, s2])

        msg = 'No Categoricals to union'
        with tm.assertRaisesRegexp(ValueError, msg):
            union_categoricals([])
Example #17
0
    def test_union_categorical(self):
        # GH 13361
        data = [
            (list('abc'), list('abd'), list('abcabd')),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),

            (pd.date_range('2014-01-01', '2014-01-05'),
             pd.date_range('2014-01-06', '2014-01-07'),
             pd.date_range('2014-01-01', '2014-01-07')),

            (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
             pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
             pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),

            (pd.period_range('2014-01-01', '2014-01-05'),
             pd.period_range('2014-01-06', '2014-01-07'),
             pd.period_range('2014-01-01', '2014-01-07')),
        ]

        for a, b, combined in data:
            result = union_categoricals([Categorical(a), Categorical(b)])
            expected = Categorical(combined)
            tm.assert_categorical_equal(result, expected,
                                        check_category_order=True)

        # new categories ordered by appearance
        s = Categorical(['x', 'y', 'z'])
        s2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([s, s2]).categories
        expected = Index(['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_index_equal(result, expected)

        # can't be ordered
        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        with tm.assertRaises(TypeError):
            union_categoricals([s, s2])

        # must exactly match types
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        with tm.assertRaises(TypeError):
            union_categoricals([s, s2])

        with tm.assertRaises(ValueError):
            union_categoricals([])
Example #18
0
    def test_union_categorical(self):
        # GH 13361
        data = [
            (list('abc'), list('abd'), list('abcabd')),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
            (pd.date_range('2014-01-01', '2014-01-05'),
             pd.date_range('2014-01-06', '2014-01-07'),
             pd.date_range('2014-01-01', '2014-01-07')),
            (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
             pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
             pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),
            (pd.period_range('2014-01-01', '2014-01-05'),
             pd.period_range('2014-01-06', '2014-01-07'),
             pd.period_range('2014-01-01', '2014-01-07')),
        ]

        for a, b, combined in data:
            result = union_categoricals([Categorical(a), Categorical(b)])
            expected = Categorical(combined)
            tm.assert_categorical_equal(result,
                                        expected,
                                        check_category_order=True)

        # new categories ordered by appearance
        s = Categorical(['x', 'y', 'z'])
        s2 = Categorical(['a', 'b', 'c'])
        result = union_categoricals([s, s2]).categories
        expected = Index(['x', 'y', 'z', 'a', 'b', 'c'])
        tm.assert_index_equal(result, expected)

        # can't be ordered
        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        with tm.assertRaises(TypeError):
            union_categoricals([s, s2])

        # must exactly match types
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        with tm.assertRaises(TypeError):
            union_categoricals([s, s2])

        with tm.assertRaises(ValueError):
            union_categoricals([])
Example #19
0
 def time_union(self):
     union_categoricals([self.a, self.b])
Example #20
0
    def test_union_categoricals_ignore_order(self):
        # GH 15219
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        msg = 'Categorical.ordered must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2], ignore_order=False)

        res = union_categoricals([c1, c1], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c1, c1], ignore_order=False)
        exp = Categorical([1, 2, 3, 1, 2, 3],
                          categories=[1, 2, 3],
                          ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, np.nan, 3, 2])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c2, c1],
                                 ignore_order=True,
                                 sort_categories=True)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([4, 5, 6], ordered=True)
        result = union_categoricals([c1, c2], ignore_order=True)
        expected = Categorical([1, 2, 3, 4, 5, 6])
        tm.assert_categorical_equal(result, expected)

        msg = "to union ordered Categoricals, all categories must be the same"
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2], ignore_order=False)

        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])
    def test_union_categoricals_ignore_order(self):
        # GH 15219
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        msg = 'Categorical.ordered must be the same'
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2], ignore_order=False)

        res = union_categoricals([c1, c1], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c1, c1], ignore_order=False)
        exp = Categorical([1, 2, 3, 1, 2, 3],
                          categories=[1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, np.nan, 3, 2])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c2, c1], ignore_order=True,
                                 sort_categories=True)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([4, 5, 6], ordered=True)
        result = union_categoricals([c1, c2], ignore_order=True)
        expected = Categorical([1, 2, 3, 4, 5, 6])
        tm.assert_categorical_equal(result, expected)

        msg = "to union ordered Categoricals, all categories must be the same"
        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2], ignore_order=False)

        with tm.assertRaisesRegexp(TypeError, msg):
            union_categoricals([c1, c2])
Example #22
0
def concat(dfs, axis=0, join='outer', uniform=False):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join)

    if len(dfs) == 1:
        return dfs[0]

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    if isinstance(dfs[0].index, pd.CategoricalIndex):
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.rename(0).to_frame() for df in dfs2]
            cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                  for df in dfs3], join=join).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join)
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
            out = out.reindex_axis(cat_mask.index, axis=1)
        else:
            out = pd.concat(dfs3, join=join)
    else:
        if is_categorical_dtype(dfs2[0].dtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2), index=ind,
                             name=dfs2[0].name)
        out = pd.concat(dfs2, join=join)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
Example #23
0
def concat(dfs, axis=0, join='outer', uniform=False):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join)

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    if isinstance(dfs[0].index, pd.CategoricalIndex):
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else any(
            isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [
                df
                if isinstance(df, pd.DataFrame) else df.rename(0).to_frame()
                for df in dfs2
            ]
            cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                  for df in dfs3],
                                 join=join).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            out = pd.concat(
                [df[df.columns.intersection(not_cat)] for df in dfs3],
                join=join)
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(
                            codes, sample.cat.categories, sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
            out = out.reindex_axis(cat_mask.index, axis=1)
        else:
            out = pd.concat(dfs3, join=join)
    else:
        if is_categorical_dtype(dfs2[0].dtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2),
                             index=ind,
                             name=dfs2[0].name)
        out = pd.concat(dfs2, join=join)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
Example #24
0
 def time_union_categorical(self):
     union_categoricals([self.a, self.b])