Beispiel #1
0
    def test_get_indexer_same_categories_same_order(self):
        ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'])

        result = ci.get_indexer(CategoricalIndex(['b', 'b'],
                                                 categories=['a', 'b']))
        expected = np.array([1, 1], dtype='intp')
        tm.assert_numpy_array_equal(result, expected)
Beispiel #2
0
    def test_get_indexer_same_categories_different_order(self):
        # https://github.com/pandas-dev/pandas/issues/19551
        ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'])

        result = ci.get_indexer(CategoricalIndex(['b', 'b'],
                                                 categories=['b', 'a']))
        expected = np.array([1, 1], dtype='intp')
        tm.assert_numpy_array_equal(result, expected)
Beispiel #3
0
    def test_constructor_with_index(self):
        ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
        tm.assert_categorical_equal(ci.values, Categorical(ci))

        ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
        tm.assert_categorical_equal(ci.values,
                                    Categorical(ci.astype(object),
                                                categories=ci.categories))
Beispiel #4
0
 def test_get_indexer_array(self):
     arr = np.array([Timestamp('1999-12-31 00:00:00'),
                     Timestamp('2000-12-31 00:00:00')], dtype=object)
     cats = [Timestamp('1999-12-31 00:00:00'),
             Timestamp('2000-12-31 00:00:00')]
     ci = CategoricalIndex(cats,
                           categories=cats,
                           ordered=False, dtype='category')
     result = ci.get_indexer(arr)
     expected = np.array([0, 1], dtype='intp')
     tm.assert_numpy_array_equal(result, expected)
Beispiel #5
0
    def test_get_indexer_same_categories_same_order(self):
        ci = CategoricalIndex(["a", "b"], categories=["a", "b"])

        result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"]))
        expected = np.array([1, 1], dtype="intp")
        tm.assert_numpy_array_equal(result, expected)
Beispiel #6
0
def test_observed(observed):
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df["C"] = ["foo", "bar"] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(["A", "B", "C"], observed=observed)
    exp_index = MultiIndex.from_arrays([cat1, cat2, ["foo", "bar"] * 2],
                                       names=["A", "B", "C"])
    expected = DataFrame({
        "values": Series([1, 2, 3, 4], index=exp_index)
    }).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(expected,
                                                  [cat1, cat2, ["foo", "bar"]],
                                                  list("ABC"))

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(["A", "B"], observed=observed)
    exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
    expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(expected, [cat1, cat2],
                                                  list("AB"))

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {
        "cat":
        Categorical(["a", "b", "a", "b"],
                    categories=["a", "b", "c"],
                    ordered=True),
        "ints": [1, 1, 2, 2],
        "val": [10, 20, 30, 40],
    }
    df = DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = CategoricalIndex(list("ab"),
                                 name="cat",
                                 categories=list("abc"),
                                 ordered=True)
    expected = DataFrame({
        "ints": [1.5, 1.5],
        "val": [20.0, 30]
    },
                         index=exp_index)
    if not observed:
        index = CategoricalIndex(list("abc"),
                                 name="cat",
                                 categories=list("abc"),
                                 ordered=True)
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg("mean")
    expected = DataFrame({
        "val": [10, 30, 20, 40],
        "cat":
        Categorical(["a", "a", "b", "b"],
                    categories=["a", "b", "c"],
                    ordered=True),
        "ints": [1, 2, 1, 2],
    }).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(expected,
                                                  [df.cat.values, [1, 2]],
                                                  ["cat", "ints"])

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {
        "foo": [10, 8, 4, 8, 4, 1, 1],
        "bar": [10, 20, 30, 40, 50, 60, 70],
        "baz": ["d", "c", "e", "a", "a", "d", "c"],
    }
    df = DataFrame(d)
    cat = pd.cut(df["foo"], np.linspace(0, 10, 3))
    df["range"] = cat
    groups = df.groupby(["range", "baz"], as_index=False, observed=observed)
    result = groups.agg("mean")

    groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed)
    expected = groups2.agg("mean").reset_index()
    tm.assert_frame_equal(result, expected)
Beispiel #7
0
def test_sort_datetimelike():
    # GH10505

    # use same data as test_groupby_sort_categorical, which category is
    # corresponding to datetime.month
    df = DataFrame(
        {
            "dt": [
                datetime(2011, 7, 1),
                datetime(2011, 7, 1),
                datetime(2011, 2, 1),
                datetime(2011, 5, 1),
                datetime(2011, 2, 1),
                datetime(2011, 1, 1),
                datetime(2011, 5, 1),
            ],
            "foo": [10, 8, 5, 6, 4, 1, 7],
            "bar": [10, 20, 30, 40, 50, 60, 70],
        },
        columns=["dt", "foo", "bar"],
    )

    # ordered=True
    df["dt"] = Categorical(df["dt"], ordered=True)
    index = [
        datetime(2011, 1, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 7, 1),
    ]
    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                            columns=["foo", "bar"])
    result_sort.index = CategoricalIndex(index, name="dt", ordered=True)

    index = [
        datetime(2011, 7, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 1, 1),
    ]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=["foo", "bar"])
    result_nosort.index = CategoricalIndex(index,
                                           categories=index,
                                           name="dt",
                                           ordered=True)

    col = "dt"
    assert_frame_equal(result_sort,
                       df.groupby(col, sort=True, observed=False).first())

    # when categories is ordered, group is ordered by category's order
    assert_frame_equal(result_sort,
                       df.groupby(col, sort=False, observed=False).first())

    # ordered = False
    df["dt"] = Categorical(df["dt"], ordered=False)
    index = [
        datetime(2011, 1, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 7, 1),
    ]
    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                            columns=["foo", "bar"])
    result_sort.index = CategoricalIndex(index, name="dt")

    index = [
        datetime(2011, 7, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 1, 1),
    ]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=["foo", "bar"])
    result_nosort.index = CategoricalIndex(index, categories=index, name="dt")

    col = "dt"
    assert_frame_equal(result_sort,
                       df.groupby(col, sort=True, observed=False).first())
    assert_frame_equal(result_nosort,
                       df.groupby(col, sort=False, observed=False).first())
Beispiel #8
0
 def test_reindex_empty_index(self):
     # See GH16770
     c = CategoricalIndex([])
     res, indexer = c.reindex(["a", "b"])
     tm.assert_index_equal(res, Index(["a", "b"]), exact=True)
     tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp))
Beispiel #9
0
    def test_categorial_datetimelike(self, method):
        i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")])

        result = method(i)[0]
        assert isinstance(result, Timestamp)
Beispiel #10
0
 def test_constructor_timedelta64_values_mismatched_dtype(self):
     # check we don't silently ignore the dtype keyword
     tdi = timedelta_range("4 Days", periods=5)
     result = Index(tdi, dtype="category")
     expected = CategoricalIndex(tdi)
     tm.assert_index_equal(result, expected)
Beispiel #11
0
 def test_constructor_categorical_to_object(self):
     # GH#32167 Categorical data and dtype=object should return object-dtype
     ci = CategoricalIndex(range(5))
     result = Index(ci, dtype=object)
     assert not isinstance(result, CategoricalIndex)
Beispiel #12
0
    def test_getitem_bool_mask_categorical_index(self):

        df3 = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
            },
            index=CategoricalIndex(
                [1, 1, 2, 1, 3, 2],
                dtype=CategoricalDtype([3, 2, 1], ordered=True),
                name="B",
            ),
        )
        df4 = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
            },
            index=CategoricalIndex(
                [1, 1, 2, 1, 3, 2],
                dtype=CategoricalDtype([3, 2, 1], ordered=False),
                name="B",
            ),
        )

        result = df3[df3.index == "a"]
        expected = df3.iloc[[]]
        tm.assert_frame_equal(result, expected)

        result = df4[df4.index == "a"]
        expected = df4.iloc[[]]
        tm.assert_frame_equal(result, expected)

        result = df3[df3.index == 1]
        expected = df3.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df4[df4.index == 1]
        expected = df4.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        # since we have an ordered categorical

        # CategoricalIndex([1, 1, 2, 1, 3, 2],
        #         categories=[3, 2, 1],
        #         ordered=True,
        #         name='B')
        result = df3[df3.index < 2]
        expected = df3.iloc[[4]]
        tm.assert_frame_equal(result, expected)

        result = df3[df3.index > 1]
        expected = df3.iloc[[]]
        tm.assert_frame_equal(result, expected)

        # unordered
        # cannot be compared

        # CategoricalIndex([1, 1, 2, 1, 3, 2],
        #         categories=[3, 2, 1],
        #         ordered=False,
        #         name='B')
        msg = "Unordered Categoricals can only compare equality or not"
        with pytest.raises(TypeError, match=msg):
            df4[df4.index < 2]
        with pytest.raises(TypeError, match=msg):
            df4[df4.index > 1]
Beispiel #13
0
 def test_format_different_scalar_lengths(self):
     # GH#35439
     idx = CategoricalIndex(["aaaaaaaaa", "b"])
     expected = ["aaaaaaaaa", "b"]
     assert idx.format() == expected
Beispiel #14
0
def test_invalid_complib(setup_path):
    df = DataFrame(np.random.rand(4, 5),
                   index=list("abcd"),
                   columns=list("ABCDE"))
    with tm.ensure_clean(setup_path) as path:
        msg = r"complib only supports \[.*\] compression."
        with pytest.raises(ValueError, match=msg):
            df.to_hdf(path, "df", complib="foolib")


@pytest.mark.parametrize(
    "idx",
    [
        date_range("2019", freq="D", periods=3, tz="UTC"),
        CategoricalIndex(list("abc")),
    ],
)
def test_to_hdf_multiindex_extension_dtype(idx, setup_path):
    # GH 7775
    mi = MultiIndex.from_arrays([idx, idx])
    df = DataFrame(0, index=mi, columns=["a"])
    with ensure_clean_path(setup_path) as path:
        with pytest.raises(NotImplementedError, match="Saving a MultiIndex"):
            df.to_hdf(path, "df")


def test_unsuppored_hdf_file_error(datapath):
    # GH 9539
    data_path = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5")
    message = (r"Dataset\(s\) incompatible with Pandas data types, "
Beispiel #15
0
    def test_equals_categorical(self):
        ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
        ci2 = CategoricalIndex(["a", "b"],
                               categories=["a", "b", "c"],
                               ordered=True)

        assert ci1.equals(ci1)
        assert not ci1.equals(ci2)
        assert ci1.equals(ci1.astype(object))
        assert ci1.astype(object).equals(ci1)

        assert (ci1 == ci1).all()
        assert not (ci1 != ci1).all()
        assert not (ci1 > ci1).all()
        assert not (ci1 < ci1).all()
        assert (ci1 <= ci1).all()
        assert (ci1 >= ci1).all()

        assert not (ci1 == 1).all()
        assert (ci1 == Index(["a", "b"])).all()
        assert (ci1 == ci1.values).all()

        # invalid comparisons
        with pytest.raises(ValueError, match="Lengths must match"):
            ci1 == Index(["a", "b", "c"])

        msg = "Categoricals can only be compared if 'categories' are the same"
        with pytest.raises(TypeError, match=msg):
            ci1 == ci2
        with pytest.raises(TypeError, match=msg):
            ci1 == Categorical(ci1.values, ordered=False)
        with pytest.raises(TypeError, match=msg):
            ci1 == Categorical(ci1.values, categories=list("abc"))

        # tests
        # make sure that we are testing for category inclusion properly
        ci = CategoricalIndex(list("aabca"), categories=["c", "a", "b"])
        assert not ci.equals(list("aabca"))
        # Same categories, but different order
        # Unordered
        assert ci.equals(CategoricalIndex(list("aabca")))
        # Ordered
        assert not ci.equals(CategoricalIndex(list("aabca"), ordered=True))
        assert ci.equals(ci.copy())

        ci = CategoricalIndex(list("aabca") + [np.nan],
                              categories=["c", "a", "b"])
        assert not ci.equals(list("aabca"))
        assert not ci.equals(CategoricalIndex(list("aabca")))
        assert ci.equals(ci.copy())

        ci = CategoricalIndex(list("aabca") + [np.nan],
                              categories=["c", "a", "b"])
        assert not ci.equals(list("aabca") + [np.nan])
        assert ci.equals(CategoricalIndex(list("aabca") + [np.nan]))
        assert not ci.equals(
            CategoricalIndex(list("aabca") + [np.nan], ordered=True))
        assert ci.equals(ci.copy())
Beispiel #16
0
    def test_reindex_with_categoricalindex(self):
        df = DataFrame(
            {
                "A": np.arange(3, dtype="int64"),
            },
            index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"),
        )

        # reindexing
        # convert to a regular index
        result = df.reindex(["a", "b", "e"])
        expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
            "B"
        )
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b"])
        expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["e"])
        expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["d"])
        expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # since we are actually reindexing with a Categorical
        # then return a Categorical
        cats = list("cabe")

        result = df.reindex(Categorical(["a", "e"], categories=cats))
        expected = DataFrame(
            {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))}
        ).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(Categorical(["a"], categories=cats))
        expected = DataFrame(
            {"A": [0], "B": Series(list("a")).astype(CDT(cats))}
        ).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b", "e"])
        expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
            "B"
        )
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b"])
        expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["e"])
        expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # give back the type of categorical that we received
        result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True))
        expected = DataFrame(
            {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))}
        ).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
        expected = DataFrame(
            {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
        ).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        df2 = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
            },
            index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
        )
        # passed duplicate indexers are not allowed
        msg = "cannot reindex on an axis with duplicate labels"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning, match="non-unique"):
                df2.reindex(["a", "b"])

        # args NotImplemented ATM
        msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
        with pytest.raises(NotImplementedError, match=msg.format("method")):
            df.reindex(["a"], method="ffill")
        with pytest.raises(NotImplementedError, match=msg.format("level")):
            df.reindex(["a"], level=1)
        with pytest.raises(NotImplementedError, match=msg.format("limit")):
            df.reindex(["a"], limit=2)
Beispiel #17
0
class TestDataFrameSelectReindex:
    # These are specific reindex-based tests; other indexing tests should go in
    # test_indexing

    def test_reindex_copies(self):
        # based on asv time_reindex_axis1
        N = 10
        df = DataFrame(np.random.randn(N * 10, N))
        cols = np.arange(N)
        np.random.shuffle(cols)

        result = df.reindex(columns=cols, copy=True)
        assert not np.shares_memory(result[0]._values, df[0]._values)

        # pass both columns and index
        result2 = df.reindex(columns=cols, index=df.index, copy=True)
        assert not np.shares_memory(result2[0]._values, df[0]._values)

    def test_reindex_date_fill_value(self):
        # passing date to dt64 is deprecated
        arr = date_range("2016-01-01", periods=6).values.reshape(3, 2)
        df = DataFrame(arr, columns=["A", "B"], index=range(3))

        ts = df.iloc[0, 0]
        fv = ts.date()

        with tm.assert_produces_warning(FutureWarning):
            res = df.reindex(index=range(4), columns=["A", "B", "C"], fill_value=fv)

        expected = DataFrame(
            {"A": df["A"].tolist() + [ts], "B": df["B"].tolist() + [ts], "C": [ts] * 4}
        )
        tm.assert_frame_equal(res, expected)

        # same with a datetime-castable str
        res = df.reindex(
            index=range(4), columns=["A", "B", "C"], fill_value="2016-01-01"
        )
        tm.assert_frame_equal(res, expected)

    def test_reindex_with_multi_index(self):
        # https://github.com/pandas-dev/pandas/issues/29896
        # tests for reindexing a multi-indexed DataFrame with a new MultiIndex
        #
        # confirms that we can reindex a multi-indexed DataFrame with a new
        # MultiIndex object correctly when using no filling, backfilling, and
        # padding
        #
        # The DataFrame, `df`, used in this test is:
        #       c
        #  a b
        # -1 0  A
        #    1  B
        #    2  C
        #    3  D
        #    4  E
        #    5  F
        #    6  G
        #  0 0  A
        #    1  B
        #    2  C
        #    3  D
        #    4  E
        #    5  F
        #    6  G
        #  1 0  A
        #    1  B
        #    2  C
        #    3  D
        #    4  E
        #    5  F
        #    6  G
        #
        # and the other MultiIndex, `new_multi_index`, is:
        # 0: 0 0.5
        # 1:   2.0
        # 2:   5.0
        # 3:   5.8
        df = DataFrame(
            {
                "a": [-1] * 7 + [0] * 7 + [1] * 7,
                "b": list(range(7)) * 3,
                "c": ["A", "B", "C", "D", "E", "F", "G"] * 3,
            }
        ).set_index(["a", "b"])
        new_index = [0.5, 2.0, 5.0, 5.8]
        new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"])

        # reindexing w/o a `method` value
        reindexed = df.reindex(new_multi_index)
        expected = DataFrame(
            {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]}
        ).set_index(["a", "b"])
        tm.assert_frame_equal(expected, reindexed)

        # reindexing with backfilling
        expected = DataFrame(
            {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]}
        ).set_index(["a", "b"])
        reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill")
        tm.assert_frame_equal(expected, reindexed_with_backfilling)

        reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill")
        tm.assert_frame_equal(expected, reindexed_with_backfilling)

        # reindexing with padding
        expected = DataFrame(
            {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]}
        ).set_index(["a", "b"])
        reindexed_with_padding = df.reindex(new_multi_index, method="pad")
        tm.assert_frame_equal(expected, reindexed_with_padding)

        reindexed_with_padding = df.reindex(new_multi_index, method="ffill")
        tm.assert_frame_equal(expected, reindexed_with_padding)

    @pytest.mark.parametrize(
        "method,expected_values",
        [
            ("nearest", [0, 1, 1, 2]),
            ("pad", [np.nan, 0, 1, 1]),
            ("backfill", [0, 1, 2, 2]),
        ],
    )
    def test_reindex_methods(self, method, expected_values):
        df = DataFrame({"x": list(range(5))})
        target = np.array([-0.1, 0.9, 1.1, 1.5])

        expected = DataFrame({"x": expected_values}, index=target)
        actual = df.reindex(target, method=method)
        tm.assert_frame_equal(expected, actual)

        actual = df.reindex(target, method=method, tolerance=1)
        tm.assert_frame_equal(expected, actual)
        actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1])
        tm.assert_frame_equal(expected, actual)

        e2 = expected[::-1]
        actual = df.reindex(target[::-1], method=method)
        tm.assert_frame_equal(e2, actual)

        new_order = [3, 0, 2, 1]
        e2 = expected.iloc[new_order]
        actual = df.reindex(target[new_order], method=method)
        tm.assert_frame_equal(e2, actual)

        switched_method = (
            "pad" if method == "backfill" else "backfill" if method == "pad" else method
        )
        actual = df[::-1].reindex(target, method=switched_method)
        tm.assert_frame_equal(expected, actual)

    def test_reindex_methods_nearest_special(self):
        df = DataFrame({"x": list(range(5))})
        target = np.array([-0.1, 0.9, 1.1, 1.5])

        expected = DataFrame({"x": [0, 1, 1, np.nan]}, index=target)
        actual = df.reindex(target, method="nearest", tolerance=0.2)
        tm.assert_frame_equal(expected, actual)

        expected = DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target)
        actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1])
        tm.assert_frame_equal(expected, actual)

    def test_reindex_nearest_tz(self, tz_aware_fixture):
        # GH26683
        tz = tz_aware_fixture
        idx = date_range("2019-01-01", periods=5, tz=tz)
        df = DataFrame({"x": list(range(5))}, index=idx)

        expected = df.head(3)
        actual = df.reindex(idx[:3], method="nearest")
        tm.assert_frame_equal(expected, actual)

    def test_reindex_nearest_tz_empty_frame(self):
        # https://github.com/pandas-dev/pandas/issues/31964
        dti = pd.DatetimeIndex(["2016-06-26 14:27:26+00:00"])
        df = DataFrame(index=pd.DatetimeIndex(["2016-07-04 14:00:59+00:00"]))
        expected = DataFrame(index=dti)
        result = df.reindex(dti, method="nearest")
        tm.assert_frame_equal(result, expected)

    def test_reindex_frame_add_nat(self):
        rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s")
        df = DataFrame({"A": np.random.randn(len(rng)), "B": rng})

        result = df.reindex(range(15))
        assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]"))

        mask = com.isna(result)["B"]
        assert mask[-5:].all()
        assert not mask[:-5].any()

    @pytest.mark.parametrize(
        "method, exp_values",
        [("ffill", [0, 1, 2, 3]), ("bfill", [1.0, 2.0, 3.0, np.nan])],
    )
    def test_reindex_frame_tz_ffill_bfill(self, frame_or_series, method, exp_values):
        # GH#38566
        obj = frame_or_series(
            [0, 1, 2, 3],
            index=date_range("2020-01-01 00:00:00", periods=4, freq="H", tz="UTC"),
        )
        new_index = date_range("2020-01-01 00:01:00", periods=4, freq="H", tz="UTC")
        result = obj.reindex(new_index, method=method, tolerance=pd.Timedelta("1 hour"))
        expected = frame_or_series(exp_values, index=new_index)
        tm.assert_equal(result, expected)

    def test_reindex_limit(self):
        # GH 28631
        data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]]
        exp_data = [
            ["A", "A", "A"],
            ["B", "B", "B"],
            ["C", "C", "C"],
            ["D", "D", "D"],
            ["D", "D", "D"],
            [np.nan, np.nan, np.nan],
        ]
        df = DataFrame(data)
        result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1)
        expected = DataFrame(exp_data)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "idx, check_index_type",
        [
            [["C", "B", "A"], True],
            [["F", "C", "A", "D"], True],
            [["A"], True],
            [["A", "B", "C"], True],
            [["C", "A", "B"], True],
            [["C", "B"], True],
            [["C", "A"], True],
            [["A", "B"], True],
            [["B", "A", "C"], True],
            # reindex by these causes different MultiIndex levels
            [["D", "F"], False],
            [["A", "C", "B"], False],
        ],
    )
    def test_reindex_level_verify_first_level(self, idx, check_index_type):
        df = DataFrame(
            {
                "jim": list("B" * 4 + "A" * 2 + "C" * 3),
                "joe": list("abcdeabcd")[::-1],
                "jolie": [10, 20, 30] * 3,
                "joline": np.random.randint(0, 1000, 9),
            }
        )
        icol = ["jim", "joe", "jolie"]

        def f(val):
            return np.nonzero((df["jim"] == val).to_numpy())[0]

        i = np.concatenate(list(map(f, idx)))
        left = df.set_index(icol).reindex(idx, level="jim")
        right = df.iloc[i].set_index(icol)
        tm.assert_frame_equal(left, right, check_index_type=check_index_type)

    @pytest.mark.parametrize(
        "idx",
        [
            ("mid",),
            ("mid", "btm"),
            ("mid", "btm", "top"),
            ("mid",),
            ("mid", "top"),
            ("mid", "top", "btm"),
            ("btm",),
            ("btm", "mid"),
            ("btm", "mid", "top"),
            ("btm",),
            ("btm", "top"),
            ("btm", "top", "mid"),
            ("top",),
            ("top", "mid"),
            ("top", "mid", "btm"),
            ("top",),
            ("top", "btm"),
            ("top", "btm", "mid"),
        ],
    )
    def test_reindex_level_verify_first_level_repeats(self, idx):
        df = DataFrame(
            {
                "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7,
                "joe": ["3rd"] * 2
                + ["1st"] * 3
                + ["2nd"] * 3
                + ["1st"] * 2
                + ["3rd"] * 3
                + ["1st"] * 2
                + ["3rd"] * 3
                + ["2nd"] * 2,
                # this needs to be jointly unique with jim and joe or
                # reindexing will fail ~1.5% of the time, this works
                # out to needing unique groups of same size as joe
                "jolie": np.concatenate(
                    [
                        np.random.choice(1000, x, replace=False)
                        for x in [2, 3, 3, 2, 3, 2, 3, 2]
                    ]
                ),
                "joline": np.random.randn(20).round(3) * 10,
            }
        )
        icol = ["jim", "joe", "jolie"]

        def f(val):
            return np.nonzero((df["jim"] == val).to_numpy())[0]

        i = np.concatenate(list(map(f, idx)))
        left = df.set_index(icol).reindex(idx, level="jim")
        right = df.iloc[i].set_index(icol)
        tm.assert_frame_equal(left, right)

    @pytest.mark.parametrize(
        "idx, indexer",
        [
            [
                ["1st", "2nd", "3rd"],
                [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17],
            ],
            [
                ["3rd", "2nd", "1st"],
                [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14],
            ],
            [["2nd", "3rd"], [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17]],
            [["3rd", "1st"], [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14]],
        ],
    )
    def test_reindex_level_verify_repeats(self, idx, indexer):
        df = DataFrame(
            {
                "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7,
                "joe": ["3rd"] * 2
                + ["1st"] * 3
                + ["2nd"] * 3
                + ["1st"] * 2
                + ["3rd"] * 3
                + ["1st"] * 2
                + ["3rd"] * 3
                + ["2nd"] * 2,
                # this needs to be jointly unique with jim and joe or
                # reindexing will fail ~1.5% of the time, this works
                # out to needing unique groups of same size as joe
                "jolie": np.concatenate(
                    [
                        np.random.choice(1000, x, replace=False)
                        for x in [2, 3, 3, 2, 3, 2, 3, 2]
                    ]
                ),
                "joline": np.random.randn(20).round(3) * 10,
            }
        )
        icol = ["jim", "joe", "jolie"]
        left = df.set_index(icol).reindex(idx, level="joe")
        right = df.iloc[indexer].set_index(icol)
        tm.assert_frame_equal(left, right)

    @pytest.mark.parametrize(
        "idx, indexer, check_index_type",
        [
            [list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6], True],
            [list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6], True],
            [list("abc"), [3, 2, 1, 8, 7, 6], True],
            [list("eca"), [1, 3, 4, 6, 8], True],
            [list("edc"), [0, 1, 4, 5, 6], True],
            [list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6], True],
            [list("edwq"), [0, 4, 5], True],
            [list("wq"), [], False],
        ],
    )
    def test_reindex_level_verify(self, idx, indexer, check_index_type):
        df = DataFrame(
            {
                "jim": list("B" * 4 + "A" * 2 + "C" * 3),
                "joe": list("abcdeabcd")[::-1],
                "jolie": [10, 20, 30] * 3,
                "joline": np.random.randint(0, 1000, 9),
            }
        )
        icol = ["jim", "joe", "jolie"]
        left = df.set_index(icol).reindex(idx, level="joe")
        right = df.iloc[indexer].set_index(icol)
        tm.assert_frame_equal(left, right, check_index_type=check_index_type)

    def test_non_monotonic_reindex_methods(self):
        dr = date_range("2013-08-01", periods=6, freq="B")
        data = np.random.randn(6, 1)
        df = DataFrame(data, index=dr, columns=list("A"))
        df_rev = DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A"))
        # index is not monotonic increasing or decreasing
        msg = "index must be monotonic increasing or decreasing"
        with pytest.raises(ValueError, match=msg):
            df_rev.reindex(df.index, method="pad")
        with pytest.raises(ValueError, match=msg):
            df_rev.reindex(df.index, method="ffill")
        with pytest.raises(ValueError, match=msg):
            df_rev.reindex(df.index, method="bfill")
        with pytest.raises(ValueError, match=msg):
            df_rev.reindex(df.index, method="nearest")

    def test_reindex_sparse(self):
        # https://github.com/pandas-dev/pandas/issues/35286
        df = DataFrame(
            {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))}
        )
        result = df.reindex([0, 2])
        expected = DataFrame(
            {
                "A": [0.0, np.nan],
                "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)),
            },
            index=[0, 2],
        )
        tm.assert_frame_equal(result, expected)

    def test_reindex(self, float_frame):
        datetime_series = tm.makeTimeSeries(nper=30)

        newFrame = float_frame.reindex(datetime_series.index)

        for col in newFrame.columns:
            for idx, val in newFrame[col].items():
                if idx in float_frame.index:
                    if np.isnan(val):
                        assert np.isnan(float_frame[col][idx])
                    else:
                        assert val == float_frame[col][idx]
                else:
                    assert np.isnan(val)

        for col, series in newFrame.items():
            assert tm.equalContents(series.index, newFrame.index)
        emptyFrame = float_frame.reindex(Index([]))
        assert len(emptyFrame.index) == 0

        # Cython code should be unit-tested directly
        nonContigFrame = float_frame.reindex(datetime_series.index[::2])

        for col in nonContigFrame.columns:
            for idx, val in nonContigFrame[col].items():
                if idx in float_frame.index:
                    if np.isnan(val):
                        assert np.isnan(float_frame[col][idx])
                    else:
                        assert val == float_frame[col][idx]
                else:
                    assert np.isnan(val)

        for col, series in nonContigFrame.items():
            assert tm.equalContents(series.index, nonContigFrame.index)

        # corner cases

        # Same index, copies values but not index if copy=False
        newFrame = float_frame.reindex(float_frame.index, copy=False)
        assert newFrame.index is float_frame.index

        # length zero
        newFrame = float_frame.reindex([])
        assert newFrame.empty
        assert len(newFrame.columns) == len(float_frame.columns)

        # length zero with columns reindexed with non-empty index
        newFrame = float_frame.reindex([])
        newFrame = newFrame.reindex(float_frame.index)
        assert len(newFrame.index) == len(float_frame.index)
        assert len(newFrame.columns) == len(float_frame.columns)

        # pass non-Index
        newFrame = float_frame.reindex(list(datetime_series.index))
        expected = datetime_series.index._with_freq(None)
        tm.assert_index_equal(newFrame.index, expected)

        # copy with no axes
        result = float_frame.reindex()
        tm.assert_frame_equal(result, float_frame)
        assert result is not float_frame

    def test_reindex_nan(self):
        df = DataFrame(
            [[1, 2], [3, 5], [7, 11], [9, 23]],
            index=[2, np.nan, 1, 5],
            columns=["joe", "jim"],
        )

        i, j = [np.nan, 5, 5, np.nan, 1, 2, np.nan], [1, 3, 3, 1, 2, 0, 1]
        tm.assert_frame_equal(df.reindex(i), df.iloc[j])

        df.index = df.index.astype("object")
        tm.assert_frame_equal(df.reindex(i), df.iloc[j], check_index_type=False)

        # GH10388
        df = DataFrame(
            {
                "other": ["a", "b", np.nan, "c"],
                "date": ["2015-03-22", np.nan, "2012-01-08", np.nan],
                "amount": [2, 3, 4, 5],
            }
        )

        df["date"] = pd.to_datetime(df.date)
        df["delta"] = (pd.to_datetime("2015-06-18") - df["date"]).shift(1)

        left = df.set_index(["delta", "other", "date"]).reset_index()
        right = df.reindex(columns=["delta", "other", "date", "amount"])
        tm.assert_frame_equal(left, right)

    def test_reindex_name_remains(self):
        s = Series(np.random.rand(10))
        df = DataFrame(s, index=np.arange(len(s)))
        i = Series(np.arange(10), name="iname")

        df = df.reindex(i)
        assert df.index.name == "iname"

        df = df.reindex(Index(np.arange(10), name="tmpname"))
        assert df.index.name == "tmpname"

        s = Series(np.random.rand(10))
        df = DataFrame(s.T, index=np.arange(len(s)))
        i = Series(np.arange(10), name="iname")
        df = df.reindex(columns=i)
        assert df.columns.name == "iname"

    def test_reindex_int(self, int_frame):
        smaller = int_frame.reindex(int_frame.index[::2])

        assert smaller["A"].dtype == np.int64

        bigger = smaller.reindex(int_frame.index)
        assert bigger["A"].dtype == np.float64

        smaller = int_frame.reindex(columns=["A", "B"])
        assert smaller["A"].dtype == np.int64

    def test_reindex_columns(self, float_frame):
        new_frame = float_frame.reindex(columns=["A", "B", "E"])

        tm.assert_series_equal(new_frame["B"], float_frame["B"])
        assert np.isnan(new_frame["E"]).all()
        assert "C" not in new_frame

        # Length zero
        new_frame = float_frame.reindex(columns=[])
        assert new_frame.empty

    def test_reindex_columns_method(self):

        # GH 14992, reindexing over columns ignored method
        df = DataFrame(
            data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]],
            index=[1, 2, 4],
            columns=[1, 2, 4],
            dtype=float,
        )

        # default method
        result = df.reindex(columns=range(6))
        expected = DataFrame(
            data=[
                [np.nan, 11, 12, np.nan, 13, np.nan],
                [np.nan, 21, 22, np.nan, 23, np.nan],
                [np.nan, 31, 32, np.nan, 33, np.nan],
            ],
            index=[1, 2, 4],
            columns=range(6),
            dtype=float,
        )
        tm.assert_frame_equal(result, expected)

        # method='ffill'
        result = df.reindex(columns=range(6), method="ffill")
        expected = DataFrame(
            data=[
                [np.nan, 11, 12, 12, 13, 13],
                [np.nan, 21, 22, 22, 23, 23],
                [np.nan, 31, 32, 32, 33, 33],
            ],
            index=[1, 2, 4],
            columns=range(6),
            dtype=float,
        )
        tm.assert_frame_equal(result, expected)

        # method='bfill'
        result = df.reindex(columns=range(6), method="bfill")
        expected = DataFrame(
            data=[
                [11, 11, 12, 13, 13, np.nan],
                [21, 21, 22, 23, 23, np.nan],
                [31, 31, 32, 33, 33, np.nan],
            ],
            index=[1, 2, 4],
            columns=range(6),
            dtype=float,
        )
        tm.assert_frame_equal(result, expected)

    def test_reindex_axes(self):
        # GH 3317, reindexing by both axes loses freq of the index
        df = DataFrame(
            np.ones((3, 3)),
            index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)],
            columns=["a", "b", "c"],
        )
        time_freq = date_range("2012-01-01", "2012-01-03", freq="d")
        some_cols = ["a", "b"]

        index_freq = df.reindex(index=time_freq).index.freq
        both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq
        seq_freq = df.reindex(index=time_freq).reindex(columns=some_cols).index.freq
        assert index_freq == both_freq
        assert index_freq == seq_freq

    def test_reindex_fill_value(self):
        df = DataFrame(np.random.randn(10, 4))

        # axis=0
        result = df.reindex(list(range(15)))
        assert np.isnan(result.values[-5:]).all()

        result = df.reindex(range(15), fill_value=0)
        expected = df.reindex(range(15)).fillna(0)
        tm.assert_frame_equal(result, expected)

        # axis=1
        result = df.reindex(columns=range(5), fill_value=0.0)
        expected = df.copy()
        expected[4] = 0.0
        tm.assert_frame_equal(result, expected)

        result = df.reindex(columns=range(5), fill_value=0)
        expected = df.copy()
        expected[4] = 0
        tm.assert_frame_equal(result, expected)

        result = df.reindex(columns=range(5), fill_value="foo")
        expected = df.copy()
        expected[4] = "foo"
        tm.assert_frame_equal(result, expected)

        # other dtypes
        df["foo"] = "foo"
        result = df.reindex(range(15), fill_value=0)
        expected = df.reindex(range(15)).fillna(0)
        tm.assert_frame_equal(result, expected)

    def test_reindex_dups(self):

        # GH4746, reindex on duplicate index error messages
        arr = np.random.randn(10)
        df = DataFrame(arr, index=[1, 2, 3, 4, 5, 1, 2, 3, 4, 5])

        # set index is ok
        result = df.copy()
        result.index = list(range(len(df)))
        expected = DataFrame(arr, index=list(range(len(df))))
        tm.assert_frame_equal(result, expected)

        # reindex fails
        msg = "cannot reindex on an axis with duplicate labels"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning, match="non-unique"):
                df.reindex(index=list(range(len(df))))

    def test_reindex_with_duplicate_columns(self):

        # reindex is invalid!
        df = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
        )
        msg = "cannot reindex on an axis with duplicate labels"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning, match="non-unique"):
                df.reindex(columns=["bar"])
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning, match="non-unique"):
                df.reindex(columns=["bar", "foo"])

    def test_reindex_axis_style(self):
        # https://github.com/pandas-dev/pandas/issues/12392
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        expected = DataFrame(
            {"A": [1, 2, np.nan], "B": [4, 5, np.nan]}, index=[0, 1, 3]
        )
        result = df.reindex([0, 1, 3])
        tm.assert_frame_equal(result, expected)

        result = df.reindex([0, 1, 3], axis=0)
        tm.assert_frame_equal(result, expected)

        result = df.reindex([0, 1, 3], axis="index")
        tm.assert_frame_equal(result, expected)

    def test_reindex_positional_warns(self):
        # https://github.com/pandas-dev/pandas/issues/12392
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        expected = DataFrame({"A": [1.0, 2], "B": [4.0, 5], "C": [np.nan, np.nan]})
        with tm.assert_produces_warning(FutureWarning):
            result = df.reindex([0, 1], ["A", "B", "C"])

        tm.assert_frame_equal(result, expected)

    def test_reindex_axis_style_raises(self):
        # https://github.com/pandas-dev/pandas/issues/12392
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
            df.reindex([0, 1], ["A"], axis=1)

        with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
            df.reindex([0, 1], ["A"], axis="index")

        with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
            df.reindex(index=[0, 1], axis="index")

        with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
            df.reindex(index=[0, 1], axis="columns")

        with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
            df.reindex(columns=[0, 1], axis="columns")

        with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
            df.reindex(index=[0, 1], columns=[0, 1], axis="columns")

        with pytest.raises(TypeError, match="Cannot specify all"):
            df.reindex([0, 1], [0], ["A"])

        # Mixing styles
        with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
            df.reindex(index=[0, 1], axis="index")

        with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
            df.reindex(index=[0, 1], axis="columns")

        # Duplicates
        with pytest.raises(TypeError, match="multiple values"):
            df.reindex([0, 1], labels=[0, 1])

    def test_reindex_single_named_indexer(self):
        # https://github.com/pandas-dev/pandas/issues/12392
        df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]})
        result = df.reindex([0, 1], columns=["A"])
        expected = DataFrame({"A": [1, 2]})
        tm.assert_frame_equal(result, expected)

    def test_reindex_api_equivalence(self):
        # https://github.com/pandas-dev/pandas/issues/12392
        # equivalence of the labels/axis and index/columns API's
        df = DataFrame(
            [[1, 2, 3], [3, 4, 5], [5, 6, 7]],
            index=["a", "b", "c"],
            columns=["d", "e", "f"],
        )

        res1 = df.reindex(["b", "a"])
        res2 = df.reindex(index=["b", "a"])
        res3 = df.reindex(labels=["b", "a"])
        res4 = df.reindex(labels=["b", "a"], axis=0)
        res5 = df.reindex(["b", "a"], axis=0)
        for res in [res2, res3, res4, res5]:
            tm.assert_frame_equal(res1, res)

        res1 = df.reindex(columns=["e", "d"])
        res2 = df.reindex(["e", "d"], axis=1)
        res3 = df.reindex(labels=["e", "d"], axis=1)
        for res in [res2, res3]:
            tm.assert_frame_equal(res1, res)

        with tm.assert_produces_warning(FutureWarning) as m:
            res1 = df.reindex(["b", "a"], ["e", "d"])
        assert "reindex" in str(m[0].message)
        res2 = df.reindex(columns=["e", "d"], index=["b", "a"])
        res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1)
        for res in [res2, res3]:
            tm.assert_frame_equal(res1, res)

    def test_reindex_boolean(self):
        frame = DataFrame(
            np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2]
        )

        reindexed = frame.reindex(np.arange(10))
        assert reindexed.values.dtype == np.object_
        assert isna(reindexed[0][1])

        reindexed = frame.reindex(columns=range(3))
        assert reindexed.values.dtype == np.object_
        assert isna(reindexed[1]).all()

    def test_reindex_objects(self, float_string_frame):
        reindexed = float_string_frame.reindex(columns=["foo", "A", "B"])
        assert "foo" in reindexed

        reindexed = float_string_frame.reindex(columns=["A", "B"])
        assert "foo" not in reindexed

    def test_reindex_corner(self, int_frame):
        index = Index(["a", "b", "c"])
        dm = DataFrame({}).reindex(index=[1, 2, 3])
        reindexed = dm.reindex(columns=index)
        tm.assert_index_equal(reindexed.columns, index)

        # ints are weird
        smaller = int_frame.reindex(columns=["A", "B", "E"])
        assert smaller["E"].dtype == np.float64

    def test_reindex_with_nans(self):
        df = DataFrame(
            [[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]],
            columns=["a", "b"],
            index=[100.0, 101.0, np.nan, 102.0, 103.0],
        )

        result = df.reindex(index=[101.0, 102.0, 103.0])
        expected = df.iloc[[1, 3, 4]]
        tm.assert_frame_equal(result, expected)

        result = df.reindex(index=[103.0])
        expected = df.iloc[[4]]
        tm.assert_frame_equal(result, expected)

        result = df.reindex(index=[101.0])
        expected = df.iloc[[1]]
        tm.assert_frame_equal(result, expected)

    def test_reindex_multi(self):
        df = DataFrame(np.random.randn(3, 3))

        result = df.reindex(index=range(4), columns=range(4))
        expected = df.reindex(list(range(4))).reindex(columns=range(4))

        tm.assert_frame_equal(result, expected)

        df = DataFrame(np.random.randint(0, 10, (3, 3)))

        result = df.reindex(index=range(4), columns=range(4))
        expected = df.reindex(list(range(4))).reindex(columns=range(4))

        tm.assert_frame_equal(result, expected)

        df = DataFrame(np.random.randint(0, 10, (3, 3)))

        result = df.reindex(index=range(2), columns=range(2))
        expected = df.reindex(range(2)).reindex(columns=range(2))

        tm.assert_frame_equal(result, expected)

        df = DataFrame(np.random.randn(5, 3) + 1j, columns=["a", "b", "c"])

        result = df.reindex(index=[0, 1], columns=["a", "b"])
        expected = df.reindex([0, 1]).reindex(columns=["a", "b"])

        tm.assert_frame_equal(result, expected)

    def test_reindex_multi_categorical_time(self):
        # https://github.com/pandas-dev/pandas/issues/21390
        midx = MultiIndex.from_product(
            [
                Categorical(["a", "b", "c"]),
                Categorical(date_range("2012-01-01", periods=3, freq="H")),
            ]
        )
        df = DataFrame({"a": range(len(midx))}, index=midx)
        df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]]

        result = df2.reindex(midx)
        expected = DataFrame({"a": [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx)
        tm.assert_frame_equal(result, expected)

    def test_reindex_with_categoricalindex(self):
        df = DataFrame(
            {
                "A": np.arange(3, dtype="int64"),
            },
            index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"),
        )

        # reindexing
        # convert to a regular index
        result = df.reindex(["a", "b", "e"])
        expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
            "B"
        )
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b"])
        expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["e"])
        expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["d"])
        expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # since we are actually reindexing with a Categorical
        # then return a Categorical
        cats = list("cabe")

        result = df.reindex(Categorical(["a", "e"], categories=cats))
        expected = DataFrame(
            {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))}
        ).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(Categorical(["a"], categories=cats))
        expected = DataFrame(
            {"A": [0], "B": Series(list("a")).astype(CDT(cats))}
        ).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b", "e"])
        expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
            "B"
        )
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b"])
        expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["e"])
        expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # give back the type of categorical that we received
        result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True))
        expected = DataFrame(
            {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))}
        ).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
        expected = DataFrame(
            {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
        ).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        df2 = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
            },
            index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
        )
        # passed duplicate indexers are not allowed
        msg = "cannot reindex on an axis with duplicate labels"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning, match="non-unique"):
                df2.reindex(["a", "b"])

        # args NotImplemented ATM
        msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
        with pytest.raises(NotImplementedError, match=msg.format("method")):
            df.reindex(["a"], method="ffill")
        with pytest.raises(NotImplementedError, match=msg.format("level")):
            df.reindex(["a"], level=1)
        with pytest.raises(NotImplementedError, match=msg.format("limit")):
            df.reindex(["a"], limit=2)

    def test_reindex_signature(self):
        sig = inspect.signature(DataFrame.reindex)
        parameters = set(sig.parameters)
        assert parameters == {
            "self",
            "labels",
            "index",
            "columns",
            "axis",
            "limit",
            "copy",
            "level",
            "method",
            "fill_value",
            "tolerance",
        }

    def test_reindex_multiindex_ffill_added_rows(self):
        # GH#23693
        # reindex added rows with nan values even when fill method was specified
        mi = MultiIndex.from_tuples([("a", "b"), ("d", "e")])
        df = DataFrame([[0, 7], [3, 4]], index=mi, columns=["x", "y"])
        mi2 = MultiIndex.from_tuples([("a", "b"), ("d", "e"), ("h", "i")])
        result = df.reindex(mi2, axis=0, method="ffill")
        expected = DataFrame([[0, 7], [3, 4], [3, 4]], index=mi2, columns=["x", "y"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "kwargs",
        [
            {"method": "pad", "tolerance": timedelta(seconds=9)},
            {"method": "backfill", "tolerance": timedelta(seconds=9)},
            {"method": "nearest"},
            {"method": None},
        ],
    )
    def test_reindex_empty_frame(self, kwargs):
        # GH#27315
        idx = date_range(start="2020", freq="30s", periods=3)
        df = DataFrame([], index=Index([], name="time"), columns=["a"])
        result = df.reindex(idx, **kwargs)
        expected = DataFrame({"a": [pd.NA] * 3}, index=idx)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "src_idx",
        [
            Index([]),
            CategoricalIndex([]),
        ],
    )
    @pytest.mark.parametrize(
        "cat_idx",
        [
            # No duplicates
            Index([]),
            CategoricalIndex([]),
            Index(["A", "B"]),
            CategoricalIndex(["A", "B"]),
            # Duplicates: GH#38906
            Index(["A", "A"]),
            CategoricalIndex(["A", "A"]),
        ],
    )
    def test_reindex_empty(self, src_idx, cat_idx):
        df = DataFrame(columns=src_idx, index=["K"], dtype="f8")

        result = df.reindex(columns=cat_idx)
        expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8")
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]"])
    def test_reindex_datetimelike_to_object(self, dtype):
        # GH#39755 dont cast dt64/td64 to ints
        mi = MultiIndex.from_product([list("ABCDE"), range(2)])

        dti = date_range("2016-01-01", periods=10)
        fv = np.timedelta64("NaT", "ns")
        if dtype == "m8[ns]":
            dti = dti - dti[0]
            fv = np.datetime64("NaT", "ns")

        ser = Series(dti, index=mi)
        ser[::3] = pd.NaT

        df = ser.unstack()

        index = df.index.append(Index([1]))
        columns = df.columns.append(Index(["foo"]))

        res = df.reindex(index=index, columns=columns, fill_value=fv)

        expected = DataFrame(
            {
                0: df[0].tolist() + [fv],
                1: df[1].tolist() + [fv],
                "foo": np.array(["NaT"] * 6, dtype=fv.dtype),
            },
            index=index,
        )
        assert (res.dtypes[[0, 1]] == object).all()
        assert res.iloc[0, 0] is pd.NaT
        assert res.iloc[-1, 0] is fv
        assert res.iloc[-1, 1] is fv
        tm.assert_frame_equal(res, expected)

    @pytest.mark.parametrize(
        "index_df,index_res,index_exp",
        [
            (
                CategoricalIndex([], categories=["A"]),
                Index(["A"]),
                Index(["A"]),
            ),
            (
                CategoricalIndex([], categories=["A"]),
                Index(["B"]),
                Index(["B"]),
            ),
            (
                CategoricalIndex([], categories=["A"]),
                CategoricalIndex(["A"]),
                CategoricalIndex(["A"]),
            ),
            (
                CategoricalIndex([], categories=["A"]),
                CategoricalIndex(["B"]),
                CategoricalIndex(["B"]),
            ),
        ],
    )
    def test_reindex_not_category(self, index_df, index_res, index_exp):
        # GH#28690
        df = DataFrame(index=index_df)
        result = df.reindex(index=index_res)
        expected = DataFrame(index=index_exp)
        tm.assert_frame_equal(result, expected)
Beispiel #18
0
def makeCategoricalIndex(k=10, n=3, name=None, **kwargs):
    """make a length k index or n categories"""
    x = rands_array(nchars=4, size=n)
    return CategoricalIndex(
        Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs
    )
Beispiel #19
0
 def test_constructor_period_values_mismatched_dtype(self):
     pi = period_range("2016-01-01", periods=3, freq="D")
     result = Index(pi, dtype="category")
     expected = CategoricalIndex(pi)
     tm.assert_index_equal(result, expected)
Beispiel #20
0
 def test_categorical_categories(self):
     # GH17884
     c1 = CategoricalDtype(Categorical(["a", "b"]))
     tm.assert_index_equal(c1.categories, pd.Index(["a", "b"]))
     c1 = CategoricalDtype(CategoricalIndex(["a", "b"]))
     tm.assert_index_equal(c1.categories, pd.Index(["a", "b"]))
Beispiel #21
0
 def test_constructor_interval_values_mismatched_dtype(self):
     dti = date_range("2016-01-01", periods=3)
     ii = IntervalIndex.from_breaks(dti)
     result = Index(ii, dtype="category")
     expected = CategoricalIndex(ii)
     tm.assert_index_equal(result, expected)
Beispiel #22
0
    def test_loc_listlike_dtypes(self):
        # GH 11586

        # unique categories and codes
        index = CategoricalIndex(['a', 'b', 'c'])
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)

        # unique slice
        res = df.loc[['a', 'b']]
        exp_index = CategoricalIndex(['a', 'b'], categories=index.categories)
        exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index)
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[['a', 'a', 'b']]

        exp_index = CategoricalIndex(['a', 'a', 'b'],
                                     categories=index.categories)
        exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index)
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = ('a list-indexer must only include '
               'values that are in the categories')
        with pytest.raises(KeyError, match=msg):
            df.loc[['a', 'x']]

        # duplicated categories and codes
        index = CategoricalIndex(['a', 'b', 'a'])
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index)

        # unique slice
        res = df.loc[['a', 'b']]
        exp = DataFrame({
            'A': [1, 3, 2],
            'B': [4, 6, 5]
        },
                        index=CategoricalIndex(['a', 'a', 'b']))
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[['a', 'a', 'b']]
        exp = DataFrame({
            'A': [1, 3, 1, 3, 2],
            'B': [4, 6, 4, 6, 5]
        },
                        index=CategoricalIndex(['a', 'a', 'a', 'a', 'b']))
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = ('a list-indexer must only include values '
               'that are in the categories')
        with pytest.raises(KeyError, match=msg):
            df.loc[['a', 'x']]

        # contains unused category
        index = CategoricalIndex(['a', 'b', 'a', 'c'],
                                 categories=list('abcde'))
        df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index)

        res = df.loc[['a', 'b']]
        exp = DataFrame({
            'A': [1, 3, 2],
            'B': [5, 7, 6]
        },
                        index=CategoricalIndex(['a', 'a', 'b'],
                                               categories=list('abcde')))
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = df.loc[['a', 'e']]
        exp = DataFrame({
            'A': [1, 3, np.nan],
            'B': [5, 7, np.nan]
        },
                        index=CategoricalIndex(['a', 'a', 'e'],
                                               categories=list('abcde')))
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[['a', 'a', 'b']]
        exp = DataFrame({
            'A': [1, 3, 1, 3, 2],
            'B': [5, 7, 5, 7, 6]
        },
                        index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'],
                                               categories=list('abcde')))
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = ('a list-indexer must only include values '
               'that are in the categories')
        with pytest.raises(KeyError, match=msg):
            df.loc[['a', 'x']]
Beispiel #23
0
    def test_construction(self):

        ci = CategoricalIndex(list("aabbca"), categories=list("abcd"), ordered=False)
        categories = ci.categories

        result = Index(ci)
        tm.assert_index_equal(result, ci, exact=True)
        assert not result.ordered

        result = Index(ci.values)
        tm.assert_index_equal(result, ci, exact=True)
        assert not result.ordered

        # empty
        result = CategoricalIndex(categories=categories)
        tm.assert_index_equal(result.categories, Index(categories))
        tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8"))
        assert not result.ordered

        # passing categories
        result = CategoricalIndex(list("aabbca"), categories=categories)
        tm.assert_index_equal(result.categories, Index(categories))
        tm.assert_numpy_array_equal(
            result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8")
        )

        c = Categorical(list("aabbca"))
        result = CategoricalIndex(c)
        tm.assert_index_equal(result.categories, Index(list("abc")))
        tm.assert_numpy_array_equal(
            result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8")
        )
        assert not result.ordered

        result = CategoricalIndex(c, categories=categories)
        tm.assert_index_equal(result.categories, Index(categories))
        tm.assert_numpy_array_equal(
            result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8")
        )
        assert not result.ordered

        ci = CategoricalIndex(c, categories=list("abcd"))
        result = CategoricalIndex(ci)
        tm.assert_index_equal(result.categories, Index(categories))
        tm.assert_numpy_array_equal(
            result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8")
        )
        assert not result.ordered

        result = CategoricalIndex(ci, categories=list("ab"))
        tm.assert_index_equal(result.categories, Index(list("ab")))
        tm.assert_numpy_array_equal(
            result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8")
        )
        assert not result.ordered

        result = CategoricalIndex(ci, categories=list("ab"), ordered=True)
        tm.assert_index_equal(result.categories, Index(list("ab")))
        tm.assert_numpy_array_equal(
            result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8")
        )
        assert result.ordered

        result = CategoricalIndex(ci, categories=list("ab"), ordered=True)
        expected = CategoricalIndex(
            ci, categories=list("ab"), ordered=True, dtype="category"
        )
        tm.assert_index_equal(result, expected, exact=True)

        # turn me to an Index
        result = Index(np.array(ci))
        assert isinstance(result, Index)
        assert not isinstance(result, CategoricalIndex)
    def test_groupby(self):

        cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                           categories=["a", "b", "c", "d"],
                           ordered=True)
        data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

        exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
        expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
        result = data.groupby("b").mean()
        tm.assert_frame_equal(result, expected)

        raw_cat1 = Categorical(["a", "a", "b", "b"],
                               categories=["a", "b", "z"],
                               ordered=True)
        raw_cat2 = Categorical(["c", "d", "c", "d"],
                               categories=["c", "d", "y"],
                               ordered=True)
        df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]})

        # single grouper
        gb = df.groupby("A")
        exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
        expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
        result = gb.sum()
        tm.assert_frame_equal(result, expected)

        # multiple groupers
        gb = df.groupby(['A', 'B'])
        exp_index = pd.MultiIndex.from_product([
            Categorical(["a", "b", "z"], ordered=True),
            Categorical(["c", "d", "y"], ordered=True)
        ],
                                               names=['A', 'B'])
        expected = DataFrame(
            {'values': [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan]},
            index=exp_index)
        result = gb.sum()
        tm.assert_frame_equal(result, expected)

        # multiple groupers with a non-cat
        df = df.copy()
        df['C'] = ['foo', 'bar'] * 2
        gb = df.groupby(['A', 'B', 'C'])
        exp_index = pd.MultiIndex.from_product([
            Categorical(["a", "b", "z"], ordered=True),
            Categorical(["c", "d", "y"], ordered=True), ['foo', 'bar']
        ],
                                               names=['A', 'B', 'C'])
        expected = DataFrame({
            'values': Series(np.nan, index=exp_index)
        }).sort_index()
        expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4]
        result = gb.sum()
        tm.assert_frame_equal(result, expected)

        # GH 8623
        x = DataFrame(
            [[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']],
            columns=['person_id', 'person_name'])
        x['person_name'] = Categorical(x.person_name)

        g = x.groupby(['person_id'])
        result = g.transform(lambda x: x)
        tm.assert_frame_equal(result, x[['person_name']])

        result = x.drop_duplicates('person_name')
        expected = x.iloc[[0, 1]]
        tm.assert_frame_equal(result, expected)

        def f(x):
            return x.drop_duplicates('person_name').iloc[0]

        result = g.apply(f)
        expected = x.iloc[[0, 1]].copy()
        expected.index = Index([1, 2], name='person_id')
        expected['person_name'] = expected['person_name'].astype('object')
        tm.assert_frame_equal(result, expected)

        # GH 9921
        # Monotonic
        df = DataFrame({"a": [5, 15, 25]})
        c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

        result = df.a.groupby(c).transform(sum)
        tm.assert_series_equal(result, df['a'])

        tm.assert_series_equal(
            df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
        tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
        tm.assert_frame_equal(
            df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']])

        # Filter
        tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a'])
        tm.assert_frame_equal(df.groupby(c).filter(np.all), df)

        # Non-monotonic
        df = DataFrame({"a": [5, 15, 25, -5]})
        c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

        result = df.a.groupby(c).transform(sum)
        tm.assert_series_equal(result, df['a'])

        tm.assert_series_equal(
            df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
        tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
        tm.assert_frame_equal(
            df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']])

        # GH 9603
        df = DataFrame({'a': [1, 0, 0, 0]})
        c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
        result = df.groupby(c).apply(len)

        exp_index = CategoricalIndex(c.values.categories,
                                     ordered=c.values.ordered)
        expected = Series([1, 0, 0, 0], index=exp_index)
        expected.index.name = 'a'
        tm.assert_series_equal(result, expected)
Beispiel #25
0
    def test_reindex_dtype(self):
        c = CategoricalIndex(["a", "b", "c", "a"])
        res, indexer = c.reindex(["a", "c"])
        tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))

        c = CategoricalIndex(["a", "b", "c", "a"])
        res, indexer = c.reindex(Categorical(["a", "c"]))

        exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
        tm.assert_index_equal(res, exp, exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))

        c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
        res, indexer = c.reindex(["a", "c"])
        exp = Index(["a", "a", "c"], dtype="object")
        tm.assert_index_equal(res, exp, exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))

        c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
        res, indexer = c.reindex(Categorical(["a", "c"]))
        exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
        tm.assert_index_equal(res, exp, exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
    def test_groupby_sort_categorical_datetimelike(self):
        # GH10505

        # use same data as test_groupby_sort_categorical, which category is
        # corresponding to datetime.month
        df = DataFrame(
            {
                'dt': [
                    datetime(2011, 7, 1),
                    datetime(2011, 7, 1),
                    datetime(2011, 2, 1),
                    datetime(2011, 5, 1),
                    datetime(2011, 2, 1),
                    datetime(2011, 1, 1),
                    datetime(2011, 5, 1)
                ],
                'foo': [10, 8, 5, 6, 4, 1, 7],
                'bar': [10, 20, 30, 40, 50, 60, 70]
            },
            columns=['dt', 'foo', 'bar'])

        # ordered=True
        df['dt'] = Categorical(df['dt'], ordered=True)
        index = [
            datetime(2011, 1, 1),
            datetime(2011, 2, 1),
            datetime(2011, 5, 1),
            datetime(2011, 7, 1)
        ]
        result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                                columns=['foo', 'bar'])
        result_sort.index = CategoricalIndex(index, name='dt', ordered=True)

        index = [
            datetime(2011, 7, 1),
            datetime(2011, 2, 1),
            datetime(2011, 5, 1),
            datetime(2011, 1, 1)
        ]
        result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                                  columns=['foo', 'bar'])
        result_nosort.index = CategoricalIndex(index,
                                               categories=index,
                                               name='dt',
                                               ordered=True)

        col = 'dt'
        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
        # when categories is ordered, group is ordered by category's order
        assert_frame_equal(result_sort, df.groupby(col, sort=False).first())

        # ordered = False
        df['dt'] = Categorical(df['dt'], ordered=False)
        index = [
            datetime(2011, 1, 1),
            datetime(2011, 2, 1),
            datetime(2011, 5, 1),
            datetime(2011, 7, 1)
        ]
        result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                                columns=['foo', 'bar'])
        result_sort.index = CategoricalIndex(index, name='dt')

        index = [
            datetime(2011, 7, 1),
            datetime(2011, 2, 1),
            datetime(2011, 5, 1),
            datetime(2011, 1, 1)
        ]
        result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                                  columns=['foo', 'bar'])
        result_nosort.index = CategoricalIndex(index,
                                               categories=index,
                                               name='dt')

        col = 'dt'
        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
        assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
Beispiel #27
0
             ("foo", "two", "min"),
             ("foo", "two", "max"),
             ("bar", "one", "min"),
             ("bar", "one", "max"),
             ("bar", "three", "min"),
             ("bar", "three", "max"),
         ],
         names=["A", "B", None],
     ),
     [1, 1, 3, 3, 2, 2, 4, 4],
 ),
 (
     False,
     MultiIndex.from_product(
         [
             CategoricalIndex(["bar", "foo"], ordered=False),
             CategoricalIndex(["one", "three", "two"], ordered=False),
             Index(["min", "max"]),
         ],
         names=["A", "B", None],
     ),
     [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
 ),
 (
     None,
     MultiIndex.from_product(
         [
             CategoricalIndex(["bar", "foo"], ordered=False),
             CategoricalIndex(["one", "three", "two"], ordered=False),
             Index(["min", "max"]),
         ],
Beispiel #28
0
    def test_loc_listlike_dtypes(self):
        # GH 11586

        # unique categories and codes
        index = CategoricalIndex(["a", "b", "c"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)

        # unique slice
        res = df.loc[["a", "b"]]
        exp_index = CategoricalIndex(["a", "b"], categories=index.categories)
        exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index)
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]

        exp_index = CategoricalIndex(["a", "a", "b"],
                                     categories=index.categories)
        exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index)
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = "a list-indexer must only include " "values that are in the categories"
        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]

        # duplicated categories and codes
        index = CategoricalIndex(["a", "b", "a"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)

        # unique slice
        res = df.loc[["a", "b"]]
        exp = DataFrame({
            "A": [1, 3, 2],
            "B": [4, 6, 5]
        },
                        index=CategoricalIndex(["a", "a", "b"]))
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]
        exp = DataFrame(
            {
                "A": [1, 3, 1, 3, 2],
                "B": [4, 6, 4, 6, 5]
            },
            index=CategoricalIndex(["a", "a", "a", "a", "b"]),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = "a list-indexer must only include values " "that are in the categories"
        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]

        # contains unused category
        index = CategoricalIndex(["a", "b", "a", "c"],
                                 categories=list("abcde"))
        df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)

        res = df.loc[["a", "b"]]
        exp = DataFrame(
            {
                "A": [1, 3, 2],
                "B": [5, 7, 6]
            },
            index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = df.loc[["a", "e"]]
        exp = DataFrame(
            {
                "A": [1, 3, np.nan],
                "B": [5, 7, np.nan]
            },
            index=CategoricalIndex(["a", "a", "e"], categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]
        exp = DataFrame(
            {
                "A": [1, 3, 1, 3, 2],
                "B": [5, 7, 5, 7, 6]
            },
            index=CategoricalIndex(["a", "a", "a", "a", "b"],
                                   categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = "a list-indexer must only include values " "that are in the categories"
        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]
Beispiel #29
0
def test_basic():

    cats = Categorical(
        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True,
    )
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True)
    expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
    expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame(
        [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
        columns=["person_id", "person_name"],
    )
    x["person_name"] = Categorical(x.person_name)

    g = x.groupby(["person_id"], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[["person_name"]])

    result = x.drop_duplicates("person_name")
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates("person_name").iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name="person_id")
    expected["person_name"] = expected["person_name"].astype("object")
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df["a"])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df["a"])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[["a"]])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[["a"]])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all), df["a"])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df["a"])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df["a"])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[["a"]])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[["a"]])

    # GH 9603
    df = DataFrame({"a": [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ["foo", "bar", "baz", "qux"]
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels,
                               categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels,
                           ordered=True,
                           categories=["foo", "bar", "baz", "qux"])
    expected = ord_data.groupby(exp_cats, sort=False,
                                observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
    exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] *
                4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
Beispiel #30
0
    def test_describe(self):
        # string type
        desc = self.factor.describe()
        assert self.factor.ordered
        exp_index = CategoricalIndex(["a", "b", "c"],
                                     name="categories",
                                     ordered=self.factor.ordered)
        expected = DataFrame(
            {
                "counts": [3, 2, 3],
                "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]
            },
            index=exp_index)
        tm.assert_frame_equal(desc, expected)

        # check unused categories
        cat = self.factor.copy()
        cat.set_categories(["a", "b", "c", "d"], inplace=True)
        desc = cat.describe()

        exp_index = CategoricalIndex(list("abcd"),
                                     ordered=self.factor.ordered,
                                     name="categories")
        expected = DataFrame(
            {
                "counts": [3, 2, 3, 0],
                "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]
            },
            index=exp_index,
        )
        tm.assert_frame_equal(desc, expected)

        # check an integer one
        cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
        desc = cat.describe()
        exp_index = CategoricalIndex([1, 2, 3],
                                     ordered=cat.ordered,
                                     name="categories")
        expected = DataFrame(
            {
                "counts": [5, 3, 3],
                "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]
            },
            index=exp_index,
        )
        tm.assert_frame_equal(desc, expected)

        # https://github.com/pandas-dev/pandas/issues/3678
        # describe should work with NaN
        cat = Categorical([np.nan, 1, 2, 2])
        desc = cat.describe()
        expected = DataFrame(
            {
                "counts": [1, 2, 1],
                "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]
            },
            index=CategoricalIndex([1, 2, np.nan],
                                   categories=[1, 2],
                                   name="categories"),
        )
        tm.assert_frame_equal(desc, expected)
Beispiel #31
0
 def test_construction_empty_with_bool_categories(self):
     # see GH#22702
     cat = CategoricalIndex([], categories=[True, False])
     categories = sorted(cat.categories.tolist())
     assert categories == [False, True]
Beispiel #32
0
    idx = index
    idx_non_unique = idx[[0, 0, 1, 2]]

    check_intersection_commutative(idx, idx_non_unique)
    assert idx.intersection(idx_non_unique).is_unique


@pytest.mark.parametrize(
    "cls",
    [
        Int64Index,
        Float64Index,
        DatetimeIndex,
        CategoricalIndex,
        lambda x: CategoricalIndex(x, categories=set(x)),
        TimedeltaIndex,
        lambda x: Index(x, dtype=object),
        UInt64Index,
    ],
)
def test_union_duplicate_index_subsets_of_each_other(cls):
    # GH#31326
    a = cls([1, 2, 2, 3])
    b = cls([3, 3, 4])
    expected = cls([1, 2, 2, 3, 3, 4])
    if isinstance(a, CategoricalIndex):
        expected = Index([1, 2, 2, 3, 3, 4])
    result = a.union(b)
    tm.assert_index_equal(result, expected)
    result = a.union(b, sort=False)
Beispiel #33
0
    def test_string_categorical_index_repr(self):
        # short
        idx = CategoricalIndex(["a", "bb", "ccc"])
        expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')"""  # noqa:E501
        assert repr(idx) == expected

        # multiple lines
        idx = CategoricalIndex(["a", "bb", "ccc"] * 10)
        expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
                  'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb',
                  'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
                 categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')"""

        assert repr(idx) == expected

        # truncated
        idx = CategoricalIndex(["a", "bb", "ccc"] * 100)
        expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
                  ...
                  'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
                 categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)"""  # noqa:E501

        assert repr(idx) == expected

        # larger categories
        idx = CategoricalIndex(list("abcdefghijklmmo"))
        expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                  'm', 'm', 'o'],
                 categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')"""  # noqa:E501

        assert repr(idx) == expected

        # short
        idx = CategoricalIndex(["あ", "いい", "ううう"])
        expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa:E501
        assert repr(idx) == expected

        # multiple lines
        idx = CategoricalIndex(["あ", "いい", "ううう"] * 10)
        expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
                  'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""

        assert repr(idx) == expected

        # truncated
        idx = CategoricalIndex(["あ", "いい", "ううう"] * 100)
        expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
                  ...
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa:E501

        assert repr(idx) == expected

        # larger categories
        idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
        expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し',
                  'す', 'せ', 'そ'],
                 categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')"""  # noqa:E501

        assert repr(idx) == expected

        # Enable Unicode option -----------------------------------------
        with cf.option_context("display.unicode.east_asian_width", True):

            # short
            idx = CategoricalIndex(["あ", "いい", "ううう"])
            expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa:E501
            assert repr(idx) == expected

            # multiple lines
            idx = CategoricalIndex(["あ", "いい", "ううう"] * 10)
            expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
                  'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""

            assert repr(idx) == expected

            # truncated
            idx = CategoricalIndex(["あ", "いい", "ううう"] * 100)
            expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ',
                  ...
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
                  'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa:E501

            assert repr(idx) == expected

            # larger categories
            idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
            expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ',
                  'さ', 'し', 'す', 'せ', 'そ'],
                 categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')"""  # noqa:E501

            assert repr(idx) == expected
Beispiel #34
0
    def test_get_indexer(self):

        idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
        idx2 = CategoricalIndex(list("abf"))

        for indexer in [idx2, list("abf"), Index(list("abf"))]:
            r1 = idx1.get_indexer(idx2)
            tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp))

        msg = ("method='pad' and method='backfill' not implemented yet for "
               "CategoricalIndex")
        with pytest.raises(NotImplementedError, match=msg):
            idx2.get_indexer(idx1, method="pad")
        with pytest.raises(NotImplementedError, match=msg):
            idx2.get_indexer(idx1, method="backfill")

        msg = "method='nearest' not implemented yet for CategoricalIndex"
        with pytest.raises(NotImplementedError, match=msg):
            idx2.get_indexer(idx1, method="nearest")