Example #1
0
    def test_loc_with_non_string_categories(self, idx_values, ordered_fixture):
        # GH-17569
        cat_idx = CategoricalIndex(idx_values, ordered=ordered_fixture)
        df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx)
        sl = slice(idx_values[0], idx_values[1])

        # scalar selection
        result = df.loc[idx_values[0]]
        expected = Series(["foo"], index=["A"], name=idx_values[0])
        tm.assert_series_equal(result, expected)

        # list selection
        result = df.loc[idx_values[:2]]
        expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
        tm.assert_frame_equal(result, expected)

        # slice selection
        result = df.loc[sl]
        expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
        tm.assert_frame_equal(result, expected)

        # scalar assignment
        result = df.copy()
        result.loc[idx_values[0]] = "qux"
        expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx)
        tm.assert_frame_equal(result, expected)

        # list assignment
        result = df.copy()
        result.loc[idx_values[:2], "A"] = ["qux", "qux2"]
        expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
        tm.assert_frame_equal(result, expected)

        # slice assignment
        result = df.copy()
        result.loc[sl, "A"] = ["qux", "qux2"]
        expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
        tm.assert_frame_equal(result, expected)
Example #2
0
    def test_map(self):
        ci = CategoricalIndex(list("ABABC"),
                              categories=list("CBA"),
                              ordered=True)
        result = ci.map(lambda x: x.lower())
        exp = CategoricalIndex(list("ababc"),
                               categories=list("cba"),
                               ordered=True)
        tm.assert_index_equal(result, exp)

        ci = CategoricalIndex(list("ABABC"),
                              categories=list("BAC"),
                              ordered=False,
                              name="XXX")
        result = ci.map(lambda x: x.lower())
        exp = CategoricalIndex(list("ababc"),
                               categories=list("bac"),
                               ordered=False,
                               name="XXX")
        tm.assert_index_equal(result, exp)

        # GH 12766: Return an index not an array
        tm.assert_index_equal(
            ci.map(lambda x: 1),
            Index(np.array([1] * 5, dtype=np.int64), name="XXX"))

        # change categories dtype
        ci = CategoricalIndex(list("ABABC"),
                              categories=list("BAC"),
                              ordered=False)

        def f(x):
            return {"A": 10, "B": 20, "C": 30}.get(x)

        result = ci.map(f)
        exp = CategoricalIndex([10, 20, 10, 20, 30],
                               categories=[20, 10, 30],
                               ordered=False)
        tm.assert_index_equal(result, exp)

        result = ci.map(Series([10, 20, 30], index=["A", "B", "C"]))
        tm.assert_index_equal(result, exp)

        result = ci.map({"A": 10, "B": 20, "C": 30})
        tm.assert_index_equal(result, exp)
Example #3
0
def test_series_groupby_value_counts_on_categorical():
    """Grouped ``value_counts`` on a categorical Series lists every category.

    GH38672. The single value "a" is grouped under key 0; category "b" is
    expected in the result with a count of 0.
    """
    ser = Series(Categorical(["a"], categories=["a", "b"]))

    group_level = [0, 0]
    category_level = CategoricalIndex(
        ["a", "b"], categories=["a", "b"], ordered=False, dtype="category"
    )

    # Expected:
    # 0  a    1
    #    b    0
    # dtype: int64
    expected = Series(
        data=[1, 0],
        index=MultiIndex.from_arrays([group_level, category_level]),
    )

    tm.assert_series_equal(ser.groupby([0]).value_counts(), expected)
Example #4
0
    def test_get_indexer_non_unique(self):
        """Compare ``CategoricalIndex.get_indexer`` with the plain Index result.

        The reference value is the first element returned by
        ``get_indexer_non_unique`` on an Index holding the same values.
        """
        np.random.seed(123456789)

        cat_index = CategoricalIndex(
            list("aabbca"), categories=list("cab"), ordered=False
        )
        plain_index = Index(np.array(cat_index))

        # Random draws from the index values, of several lengths.
        random_targets = [
            plain_index[np.random.randint(0, len(cat_index), size=size)]
            for size in (1, 2, 5, len(cat_index))
        ]

        # see gh-17323
        #
        # Even when indexer is equal to the
        # members in the index, we should
        # respect duplicates instead of taking
        # the fast-track path.
        duplicated_targets = [list("aabbca"), list("aababca")]

        for target in random_targets + duplicated_targets:
            expected = plain_index.get_indexer_non_unique(target)[0]
            actual = cat_index.get_indexer(target)
            tm.assert_numpy_array_equal(expected, actual)
Example #5
0
def test_empty_prod():
    """Grouped ``prod`` over an empty category follows ``min_count``.

    See https://github.com/pandas-dev/pandas/issues/18678. Category "c" has
    no rows, so its product is 1 unless ``min_count=1`` turns it into NaN.
    """
    frame = DataFrame(
        {
            "A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]),
            "B": [1, 2, 1],
        }
    )
    categories = CategoricalIndex(["a", "b", "c"], name="A")

    def grouped_b():
        # Fresh groupby selection for every assertion, as in a direct call.
        return frame.groupby("A", observed=False).B

    # 1 by default
    tm.assert_series_equal(
        grouped_b().prod(), Series([2, 1, 1], categories, name="B")
    )

    # min_count=0
    tm.assert_series_equal(
        grouped_b().prod(min_count=0), Series([2, 1, 1], categories, name="B")
    )

    # min_count=1
    tm.assert_series_equal(
        grouped_b().prod(min_count=1),
        Series([2, 1, np.nan], categories, name="B"),
    )
Example #6
0
    def test_get_indexer_requires_unique(self):
        """``get_indexer`` on a non-unique CategoricalIndex must raise."""
        np.random.seed(123456789)

        cat_index = CategoricalIndex(
            list("aabbca"), categories=list("cab"), ordered=False
        )
        plain_index = Index(np.array(cat_index))

        msg = "Reindexing only valid with uniquely valued Index objects"

        def check_raises(target):
            with pytest.raises(InvalidIndexError, match=msg):
                cat_index.get_indexer(target)

        # Random draws from the index values, of several lengths.
        for size in (1, 2, 5, len(cat_index)):
            check_raises(
                plain_index[np.random.randint(0, len(cat_index), size=size)]
            )

        # see gh-17323
        #
        # Even when indexer is equal to the
        # members in the index, we should
        # respect duplicates instead of taking
        # the fast-track path.
        for target in (list("aabbca"), list("aababca")):
            check_raises(target)
Example #7
0
    def test_categorical_index_repr_period(self):
        """Check ``repr`` of a CategoricalIndex built from period ranges.

        Hourly ranges of 1, 2, 3 and 5 periods, the 5-period range appended
        to itself, and a monthly range are each compared against a literal
        expected string.
        """
        # test all length
        idx = period_range('2011-01-01 09:00', freq='H', periods=1)
        i = CategoricalIndex(Categorical(idx))
        exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')"""  # noqa
        assert repr(i) == exp

        idx = period_range('2011-01-01 09:00', freq='H', periods=2)
        i = CategoricalIndex(Categorical(idx))
        exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')"""  # noqa
        assert repr(i) == exp

        idx = period_range('2011-01-01 09:00', freq='H', periods=3)
        i = CategoricalIndex(Categorical(idx))
        exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')"""  # noqa
        assert repr(i) == exp

        # From five periods on, the expected repr spans several lines; the
        # leading whitespace inside the literal is part of the expected text.
        idx = period_range('2011-01-01 09:00', freq='H', periods=5)
        i = CategoricalIndex(Categorical(idx))
        exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
                  '2011-01-01 12:00', '2011-01-01 13:00'],
                 categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')"""  # noqa

        assert repr(i) == exp

        # Same five periods repeated twice: ten values, five categories.
        i = CategoricalIndex(Categorical(idx.append(idx)))
        exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
                  '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00',
                  '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00',
                  '2011-01-01 13:00'],
                 categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')"""  # noqa

        assert repr(i) == exp

        # Monthly frequency.
        idx = period_range('2011-01', freq='M', periods=5)
        i = CategoricalIndex(Categorical(idx))
        exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')"""  # noqa
        assert repr(i) == exp
Example #8
0
    def test_reindex_dtype(self):
        """Check the type of the index returned by ``CategoricalIndex.reindex``.

        A list target is expected to give a plain Index and a Categorical
        target a CategoricalIndex, with or without an unused category "d"
        on the source index.
        """
        labels = ["a", "b", "c", "a"]
        expected_indexer = np.array([0, 3, 2], dtype=np.intp)

        def check(index, target, expected):
            res, indexer = index.reindex(target)
            tm.assert_index_equal(res, expected, exact=True)
            tm.assert_numpy_array_equal(indexer, expected_indexer)

        # Categories inferred from the data.
        check(CategoricalIndex(labels), ["a", "c"], Index(["a", "a", "c"]))
        check(
            CategoricalIndex(labels),
            Categorical(["a", "c"]),
            CategoricalIndex(["a", "a", "c"], categories=["a", "c"]),
        )

        # Explicit categories including the unused "d".
        wider = ["a", "b", "c", "d"]
        check(
            CategoricalIndex(labels, categories=wider),
            ["a", "c"],
            Index(["a", "a", "c"], dtype="object"),
        )
        check(
            CategoricalIndex(labels, categories=wider),
            Categorical(["a", "c"]),
            CategoricalIndex(["a", "a", "c"], categories=["a", "c"]),
        )
Example #9
0
    def test_reindex_dtype(self):
        """Check result type and warning of ``CategoricalIndex.reindex``.

        Every call on the non-unique source index is expected to emit a
        FutureWarning matching "non-unique".
        """
        # GH#11586
        labels = ["a", "b", "c", "a"]
        wider = ["a", "b", "c", "d"]
        expected_indexer = np.array([0, 3, 2], dtype=np.intp)

        # (categories of the source index, reindex target, expected index)
        cases = [
            (None, ["a", "c"], Index(["a", "a", "c"])),
            (
                None,
                Categorical(["a", "c"]),
                CategoricalIndex(["a", "a", "c"], categories=["a", "c"]),
            ),
            (wider, ["a", "c"], Index(["a", "a", "c"], dtype="object")),
            (
                wider,
                Categorical(["a", "c"]),
                CategoricalIndex(["a", "a", "c"], categories=["a", "c"]),
            ),
        ]

        for categories, target, expected in cases:
            ci = CategoricalIndex(labels, categories=categories)
            with tm.assert_produces_warning(FutureWarning, match="non-unique"):
                res, indexer = ci.reindex(target)

            tm.assert_index_equal(res, expected, exact=True)
            tm.assert_numpy_array_equal(indexer, expected_indexer)
Example #10
0
    def test_groupby_sort_categorical_datetimelike(self):
        """Group by a datetime-valued categorical with ``sort`` on and off.

        GH10505. With an ordered categorical, both ``sort=True`` and
        ``sort=False`` are expected to follow the category order. With an
        unordered categorical, ``sort=False`` is expected to follow the
        explicit category order given by first appearance in the data.
        """
        # use same data as test_groupby_sort_categorical, which category is
        # corresponding to datetime.month
        df = DataFrame(
            {
                'dt': [
                    datetime(2011, 7, 1),
                    datetime(2011, 7, 1),
                    datetime(2011, 2, 1),
                    datetime(2011, 5, 1),
                    datetime(2011, 2, 1),
                    datetime(2011, 1, 1),
                    datetime(2011, 5, 1)
                ],
                'foo': [10, 8, 5, 6, 4, 1, 7],
                'bar': [10, 20, 30, 40, 50, 60, 70]
            },
            columns=['dt', 'foo', 'bar'])

        col = 'dt'
        value_columns = ['foo', 'bar']

        # Dates in ascending order, with the first row seen for each date.
        sorted_dates = [
            datetime(2011, 1, 1),
            datetime(2011, 2, 1),
            datetime(2011, 5, 1),
            datetime(2011, 7, 1)
        ]
        sorted_rows = [[1, 60], [5, 30], [6, 40], [10, 10]]

        # Dates in order of first appearance in ``df``.
        appearance_dates = [
            datetime(2011, 7, 1),
            datetime(2011, 2, 1),
            datetime(2011, 5, 1),
            datetime(2011, 1, 1)
        ]
        appearance_rows = [[10, 10], [5, 30], [6, 40], [1, 60]]

        # ordered=True
        df['dt'] = Categorical(df['dt'], ordered=True)
        result_sort = DataFrame(sorted_rows, columns=value_columns)
        result_sort.index = CategoricalIndex(sorted_dates,
                                             name='dt',
                                             ordered=True)

        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
        # when categories is ordered, group is ordered by category's order
        assert_frame_equal(result_sort, df.groupby(col, sort=False).first())

        # ordered = False
        df['dt'] = Categorical(df['dt'], ordered=False)
        result_sort = DataFrame(sorted_rows, columns=value_columns)
        result_sort.index = CategoricalIndex(sorted_dates, name='dt')

        result_nosort = DataFrame(appearance_rows, columns=value_columns)
        result_nosort.index = CategoricalIndex(appearance_dates,
                                               categories=appearance_dates,
                                               name='dt')

        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
        assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
Example #11
0
 def test_categorical_categories(self):
     # GH17884
     c1 = CategoricalDtype(Categorical(['a', 'b']))
     tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
     c1 = CategoricalDtype(CategoricalIndex(['a', 'b']))
     tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
Example #12
0
def test_basic():
    """Exercise groupby on categorical keys with ``observed=False``.

    Covers mean/sum aggregation with unused categories, transform, filter
    and apply on ``pd.cut`` bins, and describe on randomly generated data.
    """

    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"],
                       ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    # Category "d" has no rows, so its expected mean is NaN.
    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    # Category "z" has no rows, so its expected sum is 0.
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        # First row of the group after dropping duplicate names.
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    # The applied result is compared against an object-dtype name column.
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    # Each value falls in its own bin, so per-group reductions used with
    # transform are expected to return the input unchanged.
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all), df['a'])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    # NOTE: the random data below is not seeded; the expected values are
    # derived from the same draws, so only their relationship is checked.
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    # Reference: group by the plain label array, then reorder the rows to
    # the category order.
    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels,
                               categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    # Reference: reorder rows by category code, then describe with
    # sort=False.
    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels,
                           ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(exp_cats, sort=False,
                                observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    # After stacking, each of the 4 categories is expected 8 times on level
    # 0, once per describe statistic on level 1.
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] *
                4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
Example #13
0
 def test_constructor_interval_values_mismatched_dtype(self):
     dti = date_range("2016-01-01", periods=3)
     ii = IntervalIndex.from_breaks(dti)
     result = Index(ii, dtype="category")
     expected = CategoricalIndex(ii)
     tm.assert_index_equal(result, expected)
Example #14
0
 def test_constructor_period_values_mismatched_dtype(self):
     pi = period_range("2016-01-01", periods=3, freq="D")
     result = Index(pi, dtype="category")
     expected = CategoricalIndex(pi)
     tm.assert_index_equal(result, expected)
Example #15
0
    def test_get_indexer_same_categories_same_order(self):
        ci = CategoricalIndex(["a", "b"], categories=["a", "b"])

        result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"]))
        expected = np.array([1, 1], dtype="intp")
        tm.assert_numpy_array_equal(result, expected)
Example #16
0
    def test_construction(self):
        """Build a CategoricalIndex from several kinds of input.

        Inputs are a list, a Categorical and another CategoricalIndex, with
        and without explicit categories; categories, codes and the ordered
        flag of each result are checked.
        """
        labels = list("aabbca")
        base = CategoricalIndex(labels,
                                categories=list("abcd"),
                                ordered=False)
        categories = base.categories

        # Codes when every label is a category, and when "c" is not one.
        all_known = np.array([0, 0, 1, 1, 2, 0], dtype="int8")
        c_missing = np.array([0, 0, 1, 1, -1, 0], dtype="int8")

        def check(index, expected_categories, expected_codes):
            tm.assert_index_equal(index.categories, expected_categories)
            tm.assert_numpy_array_equal(index.codes, expected_codes)

        # Index() on the index itself or on its values gives it back.
        for source in (base, base.values):
            wrapped = Index(source)
            tm.assert_index_equal(wrapped, base, exact=True)
            assert not wrapped.ordered

        # empty
        empty = CategoricalIndex(categories=categories)
        check(empty, Index(categories), np.array([], dtype="int8"))
        assert not empty.ordered

        # passing categories
        from_list = CategoricalIndex(labels, categories=categories)
        check(from_list, Index(categories), all_known)

        # From a Categorical, with inferred and then explicit categories.
        cat = Categorical(labels)
        from_cat = CategoricalIndex(cat)
        check(from_cat, Index(list("abc")), all_known)
        assert not from_cat.ordered

        recategorised = CategoricalIndex(cat, categories=categories)
        check(recategorised, Index(categories), all_known)
        assert not recategorised.ordered

        # From another CategoricalIndex.
        rebuilt = CategoricalIndex(cat, categories=list("abcd"))
        copied = CategoricalIndex(rebuilt)
        check(copied, Index(categories), all_known)
        assert not copied.ordered

        # Narrowing the categories turns "c" into code -1.
        narrowed = CategoricalIndex(rebuilt, categories=list("ab"))
        check(narrowed, Index(list("ab")), c_missing)
        assert not narrowed.ordered

        narrowed_ordered = CategoricalIndex(rebuilt,
                                            categories=list("ab"),
                                            ordered=True)
        check(narrowed_ordered, Index(list("ab")), c_missing)
        assert narrowed_ordered.ordered

        # Passing dtype="category" on top must not change the result.
        result = CategoricalIndex(rebuilt, categories=list("ab"), ordered=True)
        expected = CategoricalIndex(rebuilt,
                                    categories=list("ab"),
                                    ordered=True,
                                    dtype="category")
        tm.assert_index_equal(result, expected, exact=True)

        # turn me to an Index
        as_plain = Index(np.array(rebuilt))
        assert isinstance(as_plain, Index)
        assert not isinstance(as_plain, CategoricalIndex)
Example #17
0
    idx = index
    idx_non_unique = idx[[0, 0, 1, 2]]

    check_intersection_commutative(idx, idx_non_unique)
    assert idx.intersection(idx_non_unique).is_unique


@pytest.mark.parametrize(
    "cls",
    [
        Int64Index,
        Float64Index,
        DatetimeIndex,
        CategoricalIndex,
        lambda x: CategoricalIndex(x, categories=set(x)),
        TimedeltaIndex,
        lambda x: Index(x, dtype=object),
        UInt64Index,
    ],
)
def test_union_duplicate_index_subsets_of_each_other(cls):
    """Union of two indexes that both contain duplicates (GH#31326).

    ``cls`` builds the two operands and, except for categorical operands,
    the expected result. The union is checked with the default ``sort``
    and with ``sort=False``.
    """
    # GH#31326
    a = cls([1, 2, 2, 3])
    b = cls([3, 3, 4])
    expected = cls([1, 2, 2, 3, 3, 4])
    if isinstance(a, CategoricalIndex):
        # Categorical operands are compared against a plain Index.
        expected = Index([1, 2, 2, 3, 3, 4])
    result = a.union(b)
    tm.assert_index_equal(result, expected)
    result = a.union(b, sort=False)
    # Previously this result was computed but never checked.
    tm.assert_index_equal(result, expected)
Example #18
0
    def test_take_fill_value(self):
        """``CategoricalIndex.take`` with and without ``fill_value``.

        GH 12631. With ``fill_value=True`` a -1 indexer entry is expected to
        produce NaN; otherwise it selects the last element.
        """
        indexer = np.array([1, 0, -1])

        def check(result, expected):
            tm.assert_index_equal(result, expected)
            tm.assert_categorical_equal(result.values, expected.values)

        # numeric category
        idx = CategoricalIndex([1, 2, 3], name="xxx")
        check(idx.take(indexer), CategoricalIndex([2, 1, 3], name="xxx"))

        # fill_value
        check(
            idx.take(indexer, fill_value=True),
            CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx"),
        )

        # allow_fill=False
        check(
            idx.take(indexer, allow_fill=False, fill_value=True),
            CategoricalIndex([2, 1, 3], name="xxx"),
        )

        # object category
        idx = CategoricalIndex(list("CBA"),
                               categories=list("ABC"),
                               ordered=True,
                               name="xxx")
        check(
            idx.take(indexer),
            CategoricalIndex(list("BCA"),
                             categories=list("ABC"),
                             ordered=True,
                             name="xxx"),
        )

        # fill_value
        check(
            idx.take(indexer, fill_value=True),
            CategoricalIndex(["B", "C", np.nan],
                             categories=list("ABC"),
                             ordered=True,
                             name="xxx"),
        )

        # allow_fill=False
        check(
            idx.take(indexer, allow_fill=False, fill_value=True),
            CategoricalIndex(list("BCA"),
                             categories=list("ABC"),
                             ordered=True,
                             name="xxx"),
        )

        # Negative entries other than -1 are rejected when filling.
        msg = ("When allow_fill=True and fill_value is not None, "
               "all indices must be >= -1")
        for bad_position in (-2, -5):
            with pytest.raises(ValueError, match=msg):
                idx.take(np.array([1, 0, bad_position]), fill_value=True)

        # Without filling, an out-of-range position is an IndexError.
        msg = "index -5 is out of bounds for (axis 0 with )?size 3"
        with pytest.raises(IndexError, match=msg):
            idx.take(np.array([1, -5]))
Example #19
0
 def test_constructor_timedelta64_values_mismatched_dtype(self):
     # check we don't silently ignore the dtype keyword
     tdi = timedelta_range("4 Days", periods=5)
     result = Index(tdi, dtype="category")
     expected = CategoricalIndex(tdi)
     tm.assert_index_equal(result, expected)
Example #20
0
 def test_get_loc_unique(self):
     cidx = CategoricalIndex(list("abc"))
     result = cidx.get_loc("b")
     assert result == 1
Example #21
0
 def test_constructor_categorical_to_object(self):
     # GH#32167 Categorical data and dtype=object should return object-dtype
     ci = CategoricalIndex(range(5))
     result = Index(ci, dtype=object)
     assert not isinstance(result, CategoricalIndex)
Example #22
0
 def test_get_loc_monotonic_nonunique(self):
     cidx = CategoricalIndex(list("abbc"))
     result = cidx.get_loc("b")
     expected = slice(1, 3, None)
     assert result == expected
    def test_string_categorical_index_repr(self):
        # short
        idx = CategoricalIndex(["a", "bb", "ccc"])
        expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')"""  # noqa
        assert repr(idx) == expected

        # multiple lines
        idx = CategoricalIndex(["a", "bb", "ccc"] * 10)
        expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
                  'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb',
                  'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
                 categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')"""

        assert repr(idx) == expected

        # truncated
        idx = CategoricalIndex(["a", "bb", "ccc"] * 100)
        expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
                  ...
                  'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
                 categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)"""  # noqa

        assert repr(idx) == expected

        # larger categories
        idx = CategoricalIndex(list("abcdefghijklmmo"))
        expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                  'm', 'm', 'o'],
                 categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')"""  # noqa

        assert repr(idx) == expected

        # short
        idx = CategoricalIndex(["あ", "いい", "ううう"])
        expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa
        assert repr(idx) == expected

        # multiple lines
        idx = CategoricalIndex(["あ", "いい", "ううう"] * 10)
        expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
                  'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""

        assert repr(idx) == expected

        # truncated
        idx = CategoricalIndex(["あ", "いい", "ううう"] * 100)
        expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
                  ...
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa

        assert repr(idx) == expected

        # larger categories
        idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
        expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し',
                  'す', 'せ', 'そ'],
                 categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')"""  # noqa

        assert repr(idx) == expected

        # Enable Unicode option -----------------------------------------
        with cf.option_context("display.unicode.east_asian_width", True):

            # short
            idx = CategoricalIndex(["あ", "いい", "ううう"])
            expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa
            assert repr(idx) == expected

            # multiple lines
            idx = CategoricalIndex(["あ", "いい", "ううう"] * 10)
            expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
                  'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""

            assert repr(idx) == expected

            # truncated
            idx = CategoricalIndex(["あ", "いい", "ううう"] * 100)
            expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ',
                  ...
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
                  'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa

            assert repr(idx) == expected

            # larger categories
            idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
            expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ',
                  'さ', 'し', 'す', 'せ', 'そ'],
                 categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')"""  # noqa

            assert repr(idx) == expected
Example #24
0
 def test_get_loc_nonmonotonic_nonunique(self):
     cidx = CategoricalIndex(list("abcb"))
     result = cidx.get_loc("b")
     expected = np.array([False, True, False, True], dtype=bool)
     tm.assert_numpy_array_equal(result, expected)
Example #25
0
    def test_describe(self):
        # string type
        desc = self.factor.describe()
        assert self.factor.ordered
        exp_index = CategoricalIndex(["a", "b", "c"],
                                     name="categories",
                                     ordered=self.factor.ordered)
        expected = DataFrame(
            {
                "counts": [3, 2, 3],
                "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]
            },
            index=exp_index)
        tm.assert_frame_equal(desc, expected)

        # check unused categories
        cat = self.factor.copy()
        cat.set_categories(["a", "b", "c", "d"], inplace=True)
        desc = cat.describe()

        exp_index = CategoricalIndex(list("abcd"),
                                     ordered=self.factor.ordered,
                                     name="categories")
        expected = DataFrame(
            {
                "counts": [3, 2, 3, 0],
                "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]
            },
            index=exp_index,
        )
        tm.assert_frame_equal(desc, expected)

        # check an integer one
        cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
        desc = cat.describe()
        exp_index = CategoricalIndex([1, 2, 3],
                                     ordered=cat.ordered,
                                     name="categories")
        expected = DataFrame(
            {
                "counts": [5, 3, 3],
                "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]
            },
            index=exp_index,
        )
        tm.assert_frame_equal(desc, expected)

        # https://github.com/pandas-dev/pandas/issues/3678
        # describe should work with NaN
        cat = Categorical([np.nan, 1, 2, 2])
        desc = cat.describe()
        expected = DataFrame(
            {
                "counts": [1, 2, 1],
                "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]
            },
            index=CategoricalIndex([1, 2, np.nan],
                                   categories=[1, 2],
                                   name="categories"),
        )
        tm.assert_frame_equal(desc, expected)
Example #26
0
 def test_contains_nan(self):
     ci = CategoricalIndex(list("aabbca") + [np.nan],
                           categories=list("cabdef"))
     assert np.nan in ci
Example #27
0
    def test_loc_listlike_dtypes(self):
        """Check list-like ``.loc`` selection on frames with a CategoricalIndex.

        Three index layouts are exercised: unique labels, repeated labels,
        and repeated labels with declared-but-unused categories.  For each
        one the selected frame is compared (index type included) against a
        hand-built expectation, and selecting a label outside the categories
        is expected to raise ``KeyError``.
        """
        # GH 11586
        missing_msg = ("a list-indexer must only include values that are "
                       "in the categories")

        def check(frame, labels, col_a, col_b, want_index):
            # Select first, then build the expectation and compare strictly.
            got = frame.loc[labels]
            want = DataFrame({"A": col_a, "B": col_b}, index=want_index)
            tm.assert_frame_equal(got, want, check_index_type=True)

        def check_missing(frame):
            # "x" is not one of the categories in any of the layouts below.
            with tm.assert_raises_regex(KeyError, missing_msg):
                frame.loc[["a", "x"]]

        # unique categories and codes
        index = CategoricalIndex(["a", "b", "c"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)

        # unique slice
        check(df, ["a", "b"], [1, 2], [4, 5],
              CategoricalIndex(["a", "b"], categories=index.categories))

        # duplicated slice
        check(df, ["a", "a", "b"], [1, 1, 2], [4, 4, 5],
              CategoricalIndex(["a", "a", "b"], categories=index.categories))

        check_missing(df)

        # duplicated categories and codes
        index = CategoricalIndex(["a", "b", "a"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)

        # unique slice
        check(df, ["a", "b"], [1, 3, 2], [4, 6, 5],
              CategoricalIndex(["a", "a", "b"]))

        # duplicated slice
        check(df, ["a", "a", "b"], [1, 3, 1, 3, 2], [4, 6, 4, 6, 5],
              CategoricalIndex(["a", "a", "a", "a", "b"]))

        check_missing(df)

        # contains unused category
        all_cats = list("abcde")
        index = CategoricalIndex(["a", "b", "a", "c"], categories=all_cats)
        df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)

        check(df, ["a", "b"], [1, 3, 2], [5, 7, 6],
              CategoricalIndex(["a", "a", "b"], categories=list("abcde")))

        # "e" is a declared category with no rows; a NaN row is expected.
        check(df, ["a", "e"], [1, 3, np.nan], [5, 7, np.nan],
              CategoricalIndex(["a", "a", "e"], categories=list("abcde")))

        # duplicated slice
        check(df, ["a", "a", "b"], [1, 3, 1, 3, 2], [5, 7, 5, 7, 6],
              CategoricalIndex(["a", "a", "a", "a", "b"],
                               categories=list("abcde")))

        check_missing(df)
Example #28
0
 def test_contains_interval(self, item, expected):
     # GH 23705
     ci = CategoricalIndex(IntervalIndex.from_breaks(range(3)))
     result = item in ci
     assert result is expected
Example #29
0
    def test_loc_listlike_dtypes(self):
        """Check list-like ``.loc`` selection on frames with a CategoricalIndex.

        Each section builds a frame, runs a table of (labels, expected data,
        expected index) cases through ``.loc`` with strict index-type
        checking, and then confirms that asking for a label outside the
        categories raises ``KeyError`` with the expected message.
        """
        # GH 11586
        msg = "a list-indexer must only include values that are in the categories"

        # unique categories and codes
        index = CategoricalIndex(["a", "b", "c"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)
        cases = [
            # unique slice
            (
                ["a", "b"],
                {"A": [1, 2], "B": [4, 5]},
                CategoricalIndex(["a", "b"], categories=index.categories),
            ),
            # duplicated slice
            (
                ["a", "a", "b"],
                {"A": [1, 1, 2], "B": [4, 4, 5]},
                CategoricalIndex(["a", "a", "b"], categories=index.categories),
            ),
        ]
        for labels, data, exp_index in cases:
            res = df.loc[labels]
            exp = DataFrame(data, index=exp_index)
            tm.assert_frame_equal(res, exp, check_index_type=True)

        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]

        # duplicated categories and codes
        index = CategoricalIndex(["a", "b", "a"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)
        cases = [
            # unique slice
            (
                ["a", "b"],
                {"A": [1, 3, 2], "B": [4, 6, 5]},
                CategoricalIndex(["a", "a", "b"]),
            ),
            # duplicated slice
            (
                ["a", "a", "b"],
                {"A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5]},
                CategoricalIndex(["a", "a", "a", "a", "b"]),
            ),
        ]
        for labels, data, exp_index in cases:
            res = df.loc[labels]
            exp = DataFrame(data, index=exp_index)
            tm.assert_frame_equal(res, exp, check_index_type=True)

        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]

        # contains unused category
        index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
        df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)
        cases = [
            (
                ["a", "b"],
                {"A": [1, 3, 2], "B": [5, 7, 6]},
                CategoricalIndex(["a", "a", "b"], categories=list("abcde")),
            ),
            # "e" is a declared category with no rows; a NaN row is expected
            (
                ["a", "e"],
                {"A": [1, 3, np.nan], "B": [5, 7, np.nan]},
                CategoricalIndex(["a", "a", "e"], categories=list("abcde")),
            ),
            # duplicated slice
            (
                ["a", "a", "b"],
                {"A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6]},
                CategoricalIndex(
                    ["a", "a", "a", "a", "b"], categories=list("abcde")
                ),
            ),
        ]
        for labels, data, exp_index in cases:
            res = df.loc[labels]
            exp = DataFrame(data, index=exp_index)
            tm.assert_frame_equal(res, exp, check_index_type=True)

        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]
Example #30
0
 def test_construction_empty_with_bool_categories(self):
     # see GH#22702
     cat = CategoricalIndex([], categories=[True, False])
     categories = sorted(cat.categories.tolist())
     assert categories == [False, True]