Example #1
def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    test_params = [
        (pd.CategoricalDtype(),
         pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"), None),
        (pd.DatetimeTZDtype(tz='UTC'),
         pd.Series(pd.date_range(start="20200101", end="20200301"),
                   dtype="datetime64[ns, utc]"), None),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (pd.StringDtype(), pd.Series(["foo", "bar", "baz"],
                                     dtype="string"), None),
        (pd.PeriodDtype(freq='D'),
         pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')), None),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100)).where(lambda s: s < 5,
                                        other=np.nan).astype("Sparse[float]"),
            {
                "nullable": True
            },
        ),
        (pd.BooleanDtype(), pd.Series([1, 0, 0, 1, 1], dtype="boolean"), None),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        )
    ]
    for dtype, data, series_kwargs in test_params:
        series_kwargs = {} if series_kwargs is None else series_kwargs
        series_schema = SeriesSchema(pandas_dtype=dtype, **series_kwargs)
        assert isinstance(series_schema.validate(data), pd.Series)
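
For orientation, the parametrized happy path above reduces to the minimal sketch below. It assumes the same pandera version the snippet targets, whose SeriesSchema accepts the pandas_dtype keyword shown in the example; this is an illustration, not part of the test module.

import pandas as pd
from pandera import SeriesSchema

# One case from the parametrization above: a categorical Series validated
# against a schema built from the matching extension dtype.
schema = SeriesSchema(pandas_dtype=pd.CategoricalDtype())
validated = schema.validate(pd.Series(["a", "a", "b"], dtype="category"))
assert isinstance(validated, pd.Series)
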
Example #2
def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
    # Note: This does not handle null values in the interval column.
    # However, this exact sequence (calling __from_arrow__ on the output of
    # self.to_arrow) is currently the best known way to convert interval
    # types into pandas (trying to convert the underlying numerical columns
    # directly is problematic), so we're stuck with this for now.
    return pd.Series(pd.IntervalDtype().__from_arrow__(self.to_arrow()),
                     index=index)
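
As a standalone illustration of the __from_arrow__ call used above, here is a sketch with pa.array standing in for self.to_arrow(), mirroring Example #5 further down; it assumes pyarrow is installed and a pandas version that accepts a raw struct array here.

import pandas as pd
import pyarrow as pa

# Arrow struct array with the {"left", "right"} interval layout.
arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}])

# IntervalDtype.__from_arrow__ turns it into an IntervalArray, which is then
# wrapped in a Series exactly as the method above does.
ser = pd.Series(pd.IntervalDtype("int64", closed="neither").__from_arrow__(arr))
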
Example #3
    def test_compare_scalar_na(self, op, array, nulls_fixture):
        result = op(array, nulls_fixture)
        expected = self.elementwise_comparison(op, array, nulls_fixture)

        if nulls_fixture is pd.NA and array.dtype != pd.IntervalDtype("int"):
            pytest.xfail("broken for non-integer IntervalArray; see GH 31882")

        tm.assert_numpy_array_equal(result, expected)
Example #4
    def test_compare_scalar_na(self, op, array, nulls_fixture, request):
        result = op(array, nulls_fixture)
        expected = self.elementwise_comparison(op, array, nulls_fixture)

        if nulls_fixture is pd.NA and array.dtype != pd.IntervalDtype("int64"):
            mark = pytest.mark.xfail(
                reason="broken for non-integer IntervalArray; see GH 31882")
            request.node.add_marker(mark)

        tm.assert_numpy_array_equal(result, expected)
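
The difference between Example #3 and Example #4 is how the expected failure is signalled: imperative pytest.xfail() aborts the test at that point, so the assertion below it never runs, while attaching an xfail marker via request.node.add_marker lets the body run to completion and report xfail or xpass. A minimal, self-contained sketch of the marker approach (not tied to pandas):

import pytest

def test_marker_lets_the_body_run(request):
    # Adding the marker at runtime is equivalent to decorating the test with
    # @pytest.mark.xfail, but the condition can depend on fixture values.
    request.node.add_marker(pytest.mark.xfail(reason="illustrative only"))
    assert 1 + 1 == 3  # still executed; reported as xfail instead of a failure
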
Example #5
def test_from_arrow_from_raw_struct_array():
    # in case pyarrow lost the Interval extension type (eg on parquet roundtrip
    # with datetime64[ns] subtype, see GH-45881), still allow conversion
    # from arrow to IntervalArray
    import pyarrow as pa

    arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}])
    dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither")

    result = dtype.__from_arrow__(arr)
    expected = IntervalArray.from_breaks(np.array([0, 1, 2], dtype="int64"),
                                         closed="neither")
    tm.assert_extension_array_equal(result, expected)

    result = dtype.__from_arrow__(pa.chunked_array([arr]))
    tm.assert_extension_array_equal(result, expected)
Example #6
class TestDataFrameAppend:
    @pytest.mark.filterwarnings(
        "ignore:.*append method is deprecated.*:FutureWarning")
    def test_append_multiindex(self, multiindex_dataframe_random_data,
                               frame_or_series):
        obj = multiindex_dataframe_random_data
        obj = tm.get_obj(obj, frame_or_series)

        a = obj[:5]
        b = obj[5:]

        result = a.append(b)
        tm.assert_equal(result, obj)

    def test_append_empty_list(self):
        # GH 28769
        df = DataFrame()
        result = df._append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df

        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])
        result = df._append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df  # ._append() should return a new object

    def test_append_series_dict(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        series = df.loc[4]
        msg = "Indexes have overlapping values"
        with pytest.raises(ValueError, match=msg):
            df._append(series, verify_integrity=True)

        series.name = None
        msg = "Can only append a Series if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df._append(series, verify_integrity=True)

        result = df._append(series[::-1], ignore_index=True)
        expected = df._append(
            DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True
        )
        tm.assert_frame_equal(result, expected)

        # dict
        result = df._append(series.to_dict(), ignore_index=True)
        tm.assert_frame_equal(result, expected)

        result = df._append(series[::-1][:3], ignore_index=True)
        expected = df._append(
            DataFrame({0: series[::-1][:3]}).T, ignore_index=True, sort=True
        )
        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        msg = "Can only append a dict if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df._append(series.to_dict())

        # can append when name set
        row = df.loc[4]
        row.name = 5
        result = df._append(row)
        expected = df._append(df[-1:], ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_append_list_of_series_dicts(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [x.to_dict() for idx, x in df.iterrows()]

        result = df._append(dicts, ignore_index=True)
        expected = df._append(df, ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # different columns
        dicts = [
            {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4},
            {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8},
        ]
        result = df._append(dicts, ignore_index=True, sort=True)
        expected = df._append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_list_retain_index_name(self):
        df = DataFrame([[1, 2], [3, 4]],
                       index=pd.Index(["a", "b"], name="keepthisname"))

        serc = Series([5, 6], name="c")

        expected = DataFrame(
            [[1, 2], [3, 4], [5, 6]],
            index=pd.Index(["a", "b", "c"], name="keepthisname"),
        )

        # append series
        result = df._append(serc)
        tm.assert_frame_equal(result, expected)

        # append list of series
        result = df._append([serc])
        tm.assert_frame_equal(result, expected)

    def test_append_missing_cols(self):
        # GH22252
        # exercise the conditional branch in append method where the data
        # to be appended is a list and does not contain all columns that are in
        # the target DataFrame
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [{"foo": 9}, {"bar": 10}]
        result = df._append(dicts, ignore_index=True, sort=True)

        expected = df._append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_empty_dataframe(self):

        # Empty df append empty df
        df1 = DataFrame()
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-empty df append empty df
        df1 = DataFrame(np.random.randn(5, 2))
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Empty df with columns append empty df
        df1 = DataFrame(columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-Empty df with columns append empty df
        df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

    def test_append_dtypes(self, using_array_manager):

        # GH 5754
        # row appends of different dtypes (so need to do by-item)
        # can sometimes infer the correct type

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5))
        df2 = DataFrame()
        result = df1._append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": "foo"}, index=range(1, 2))
        result = df1._append(df2)
        expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2))
        result = df1._append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        if using_array_manager:
            # TODO(ArrayManager) decide on exact casting rules in concat
            # With ArrayManager, all-NaN float is not ignored
            expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object)
        result = df1._append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        if using_array_manager:
            # With ArrayManager, all-NaN float is not ignored
            expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": np.nan}, index=range(1))
        df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2))
        result = df1._append(df2)
        expected = DataFrame(
            {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")})
        if using_array_manager:
            # With ArrayManager, all-NaN float is not ignored
            expected = expected.astype(object)
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object)
        result = df1._append(df2)
        expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"])
    def test_append_timestamps_aware_or_naive(self, tz_naive_fixture,
                                              timestamp):
        # GH 30238
        tz = tz_naive_fixture
        df = DataFrame([Timestamp(timestamp, tz=tz)])
        result = df._append(df.iloc[0]).iloc[-1]
        expected = Series(Timestamp(timestamp, tz=tz), name=0)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "data, dtype",
        [
            ([1], pd.Int64Dtype()),
            ([1], pd.CategoricalDtype()),
            ([pd.Interval(left=0, right=5)], pd.IntervalDtype()),
            ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")),
            ([1], pd.SparseDtype()),
        ],
    )
    def test_other_dtypes(self, data, dtype, using_array_manager):
        df = DataFrame(data, dtype=dtype)

        warn = None
        if using_array_manager and isinstance(dtype, pd.SparseDtype):
            warn = FutureWarning

        with tm.assert_produces_warning(warn, match="astype from SparseDtype"):
            result = df._append(df.iloc[0]).iloc[-1]

        expected = Series(data, name=0, dtype=dtype)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"])
    def test_append_numpy_bug_1681(self, dtype):
        # another datetime64 bug
        if dtype == "datetime64[ns]":
            index = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
        else:
            index = timedelta_range("1 days", "10 days", freq="2D")

        df = DataFrame()
        other = DataFrame({"A": "foo", "B": index}, index=index)

        result = df._append(other)
        assert (result["B"] == index).all()

    @pytest.mark.filterwarnings("ignore:The values in the array:RuntimeWarning"
                                )
    def test_multiindex_column_append_multiple(self):
        # GH 29699
        df = DataFrame(
            [[1, 11], [2, 12], [3, 13]],
            columns=pd.MultiIndex.from_tuples([("multi", "col1"),
                                               ("multi", "col2")],
                                              names=["level1", None]),
        )
        df2 = df.copy()
        for i in range(1, 10):
            df[i, "colA"] = 10
            df = df._append(df2, ignore_index=True)
            result = df["multi"]
            expected = DataFrame({
                "col1": [1, 2, 3] * (i + 1),
                "col2": [11, 12, 13] * (i + 1)
            })
            tm.assert_frame_equal(result, expected)

    def test_append_raises_future_warning(self):
        # GH#35407
        df1 = DataFrame([[1, 2], [3, 4]])
        df2 = DataFrame([[5, 6], [7, 8]])
        with tm.assert_produces_warning(FutureWarning):
            df1.append(df2)
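
Example #6 filters the deprecation warning and calls the private _append; in user code the documented replacement for the deprecated DataFrame.append is pd.concat, roughly as sketched here (my own illustration, not part of the test suite):

import pandas as pd

df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["a", "b"])

# pd.concat(..., ignore_index=True) replaces the deprecated
# df1.append(df2, ignore_index=True).
result = pd.concat([df1, df2], ignore_index=True)
assert list(result["a"]) == [1, 3, 5, 7]
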
Example #7
class TestDataFrameAppend:
    def test_append_empty_list(self):
        # GH 28769
        df = DataFrame()
        result = df.append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df

        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])
        result = df.append([])
        expected = df
        tm.assert_frame_equal(result, expected)
        assert result is not df  # .append() should return a new object

    def test_append_series_dict(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        series = df.loc[4]
        msg = "Indexes have overlapping values"
        with pytest.raises(ValueError, match=msg):
            df.append(series, verify_integrity=True)

        series.name = None
        msg = "Can only append a Series if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df.append(series, verify_integrity=True)

        result = df.append(series[::-1], ignore_index=True)
        expected = df.append(
            DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True
        )
        tm.assert_frame_equal(result, expected)

        # dict
        result = df.append(series.to_dict(), ignore_index=True)
        tm.assert_frame_equal(result, expected)

        result = df.append(series[::-1][:3], ignore_index=True)
        expected = df.append(
            DataFrame({0: series[::-1][:3]}).T, ignore_index=True, sort=True
        )
        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # can append when name set
        row = df.loc[4]
        row.name = 5
        result = df.append(row)
        expected = df.append(df[-1:], ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_append_list_of_series_dicts(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [x.to_dict() for idx, x in df.iterrows()]

        result = df.append(dicts, ignore_index=True)
        expected = df.append(df, ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # different columns
        dicts = [
            {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4},
            {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8},
        ]
        result = df.append(dicts, ignore_index=True, sort=True)
        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_missing_cols(self):
        # GH22252
        # exercise the conditional branch in append method where the data
        # to be appended is a list and does not contain all columns that are in
        # the target DataFrame
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        dicts = [{"foo": 9}, {"bar": 10}]
        with tm.assert_produces_warning(None):
            result = df.append(dicts, ignore_index=True, sort=True)

        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        tm.assert_frame_equal(result, expected)

    def test_append_empty_dataframe(self):

        # Empty df append empty df
        df1 = DataFrame()
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-empty df append empty df
        df1 = DataFrame(np.random.randn(5, 2))
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Empty df with columns append empty df
        df1 = DataFrame(columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        # Non-Empty df with columns append empty df
        df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

    def test_append_dtypes(self):

        # GH 5754
        # row appends of different dtypes (so need to do by-item)
        # can sometimes infer the correct type

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5))
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": "foo"}, index=range(1, 2))
        result = df1.append(df2)
        expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2))
        result = df1.append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object)
        result = df1.append(df2)
        expected = DataFrame(
            {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": np.nan}, index=range(1))
        df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2))
        result = df1.append(df2)
        expected = DataFrame(
            {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")})
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
        df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object)
        result = df1.append(df2)
        expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"])
    def test_append_timestamps_aware_or_naive(self, tz_naive_fixture,
                                              timestamp):
        # GH 30238
        tz = tz_naive_fixture
        df = pd.DataFrame([pd.Timestamp(timestamp, tz=tz)])
        result = df.append(df.iloc[0]).iloc[-1]
        expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "data, dtype",
        [
            ([1], pd.Int64Dtype()),
            ([1], pd.CategoricalDtype()),
            ([pd.Interval(left=0, right=5)], pd.IntervalDtype()),
            ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")),
            ([1], pd.SparseDtype()),
        ],
    )
    def test_other_dtypes(self, data, dtype):
        df = pd.DataFrame(data, dtype=dtype)
        result = df.append(df.iloc[0]).iloc[-1]
        expected = pd.Series(data, name=0, dtype=dtype)
        tm.assert_series_equal(result, expected)
Example #8
class TestSeriesReplace:
    def test_replace_explicit_none(self):
        # GH#36984 if the user explicitly passes value=None, give it to them
        ser = pd.Series([0, 0, ""], dtype=object)
        result = ser.replace("", None)
        expected = pd.Series([0, 0, None], dtype=object)
        tm.assert_series_equal(result, expected)

        df = pd.DataFrame(np.zeros((3, 3)))
        df.iloc[2, 2] = ""
        result = df.replace("", None)
        expected = pd.DataFrame(
            {
                0: np.zeros(3),
                1: np.zeros(3),
                2: np.array([0.0, 0.0, None], dtype=object),
            }
        )
        assert expected.iloc[2, 2] is None
        tm.assert_frame_equal(result, expected)

        # GH#19998 same thing with object dtype
        ser = pd.Series([10, 20, 30, "a", "a", "b", "a"])
        result = ser.replace("a", None)
        expected = pd.Series([10, 20, 30, None, None, "b", None])
        assert expected.iloc[-1] is None
        tm.assert_series_equal(result, expected)

    def test_replace_numpy_nan(self, nulls_fixture):
        # GH#45725 ensure numpy.nan can be replaced with all other null types
        to_replace = np.nan
        value = nulls_fixture
        dtype = object
        ser = pd.Series([to_replace], dtype=dtype)
        expected = pd.Series([value], dtype=dtype)

        result = ser.replace({to_replace: value}).astype(dtype=dtype)
        tm.assert_series_equal(result, expected)
        assert result.dtype == dtype

        # same thing but different calling convention
        result = ser.replace(to_replace, value).astype(dtype=dtype)
        tm.assert_series_equal(result, expected)
        assert result.dtype == dtype

    def test_replace_noop_doesnt_downcast(self):
        # GH#44498
        ser = pd.Series([None, None, pd.Timestamp("2021-12-16 17:31")], dtype=object)
        res = ser.replace({np.nan: None})  # should be a no-op
        tm.assert_series_equal(res, ser)
        assert res.dtype == object

        # same thing but different calling convention
        res = ser.replace(np.nan, None)
        tm.assert_series_equal(res, ser)
        assert res.dtype == object

    def test_replace(self):
        N = 100
        ser = pd.Series(np.random.randn(N))
        ser[0:4] = np.nan
        ser[6:10] = 0

        # replace list with a single value
        return_value = ser.replace([np.nan], -1, inplace=True)
        assert return_value is None

        exp = ser.fillna(-1)
        tm.assert_series_equal(ser, exp)

        rs = ser.replace(0.0, np.nan)
        ser[ser == 0.0] = np.nan
        tm.assert_series_equal(rs, ser)

        ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object)
        ser[:5] = np.nan
        ser[6:10] = "foo"
        ser[20:30] = "bar"

        # replace list with a single value
        rs = ser.replace([np.nan, "foo", "bar"], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
        assert return_value is None

        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()

    def test_replace_nan_with_inf(self):
        ser = pd.Series([np.nan, 0, np.inf])
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))

        ser = pd.Series([np.nan, 0, "foo", "bar", np.inf, None, pd.NaT])
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))
        filled = ser.copy()
        filled[4] = 0
        tm.assert_series_equal(ser.replace(np.inf, 0), filled)

    def test_replace_listlike_value_listlike_target(self, datetime_series):
        ser = pd.Series(datetime_series.index)
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))

        # malformed
        msg = r"Replacement lists must match in length\. Expecting 3 got 2"
        with pytest.raises(ValueError, match=msg):
            ser.replace([1, 2, 3], [np.nan, 0])

        # ser is dt64 so can't hold 1 or 2, so this replace is a no-op
        result = ser.replace([1, 2], [np.nan, 0])
        tm.assert_series_equal(result, ser)

        ser = pd.Series([0, 1, 2, 3, 4])
        result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])
        tm.assert_series_equal(result, pd.Series([4, 3, 2, 1, 0]))

    def test_replace_gh5319(self):
        # API change from 0.12?
        # GH 5319
        ser = pd.Series([0, np.nan, 2, 3, 4])
        expected = ser.ffill()
        result = ser.replace([np.nan])
        tm.assert_series_equal(result, expected)

        ser = pd.Series([0, np.nan, 2, 3, 4])
        expected = ser.ffill()
        result = ser.replace(np.nan)
        tm.assert_series_equal(result, expected)

    def test_replace_datetime64(self):
        # GH 5797
        ser = pd.Series(pd.date_range("20130101", periods=5))
        expected = ser.copy()
        expected.loc[2] = pd.Timestamp("20120101")
        result = ser.replace({pd.Timestamp("20130103"): pd.Timestamp("20120101")})
        tm.assert_series_equal(result, expected)
        result = ser.replace(pd.Timestamp("20130103"), pd.Timestamp("20120101"))
        tm.assert_series_equal(result, expected)

    def test_replace_nat_with_tz(self):
        # GH 11792: Test with replacing NaT in a list with tz data
        ts = pd.Timestamp("2015/01/01", tz="UTC")
        s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")])
        result = s.replace([np.nan, pd.NaT], pd.Timestamp.min)
        expected = pd.Series([pd.Timestamp.min, ts], dtype=object)
        tm.assert_series_equal(expected, result)

    def test_replace_timedelta_td64(self):
        tdi = pd.timedelta_range(0, periods=5)
        ser = pd.Series(tdi)

        # Using a single dict argument means we go through replace_list
        result = ser.replace({ser[1]: ser[3]})

        expected = pd.Series([ser[0], ser[3], ser[2], ser[3], ser[4]])
        tm.assert_series_equal(result, expected)

    def test_replace_with_single_list(self):
        ser = pd.Series([0, 1, 2, 3, 4])
        result = ser.replace([1, 2, 3])
        tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4]))

        s = ser.copy()
        return_value = s.replace([1, 2, 3], inplace=True)
        assert return_value is None
        tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4]))

        # make sure things don't get corrupted when fillna call fails
        s = ser.copy()
        msg = (
            r"Invalid fill method\. Expecting pad \(ffill\) or backfill "
            r"\(bfill\)\. Got crash_cymbal"
        )
        with pytest.raises(ValueError, match=msg):
            return_value = s.replace([1, 2, 3], inplace=True, method="crash_cymbal")
            assert return_value is None
        tm.assert_series_equal(s, ser)

    def test_replace_mixed_types(self):
        ser = pd.Series(np.arange(5), dtype="int64")

        def check_replace(to_rep, val, expected):
            sc = ser.copy()
            result = ser.replace(to_rep, val)
            return_value = sc.replace(to_rep, val, inplace=True)
            assert return_value is None
            tm.assert_series_equal(expected, result)
            tm.assert_series_equal(expected, sc)

        # 3.0 can still be held in our int64 series, so we do not upcast GH#44940
        tr, v = [3], [3.0]
        check_replace(tr, v, ser)
        # Note this matches what we get with the scalars 3 and 3.0
        check_replace(tr[0], v[0], ser)

        # MUST upcast to float
        e = pd.Series([0, 1, 2, 3.5, 4])
        tr, v = [3], [3.5]
        check_replace(tr, v, e)

        # casts to object
        e = pd.Series([0, 1, 2, 3.5, "a"])
        tr, v = [3, 4], [3.5, "a"]
        check_replace(tr, v, e)

        # again casts to object
        e = pd.Series([0, 1, 2, 3.5, pd.Timestamp("20130101")])
        tr, v = [3, 4], [3.5, pd.Timestamp("20130101")]
        check_replace(tr, v, e)

        # casts to object
        e = pd.Series([0, 1, 2, 3.5, True], dtype="object")
        tr, v = [3, 4], [3.5, True]
        check_replace(tr, v, e)

        # test an object with dates + floats + integers + strings
        dr = pd.Series(pd.date_range("1/1/2001", "1/10/2001", freq="D"))
        result = dr.astype(object).replace([dr[0], dr[1], dr[2]], [1.0, 2, "a"])
        expected = pd.Series([1.0, 2, "a"] + dr[3:].tolist(), dtype=object)
        tm.assert_series_equal(result, expected)

    def test_replace_bool_with_string_no_op(self):
        s = pd.Series([True, False, True])
        result = s.replace("fun", "in-the-sun")
        tm.assert_series_equal(s, result)

    def test_replace_bool_with_string(self):
        # nonexistent elements
        s = pd.Series([True, False, True])
        result = s.replace(True, "2u")
        expected = pd.Series(["2u", False, "2u"])
        tm.assert_series_equal(expected, result)

    def test_replace_bool_with_bool(self):
        s = pd.Series([True, False, True])
        result = s.replace(True, False)
        expected = pd.Series([False] * len(s))
        tm.assert_series_equal(expected, result)

    def test_replace_with_dict_with_bool_keys(self):
        s = pd.Series([True, False, True])
        result = s.replace({"asdf": "asdb", True: "yes"})
        expected = pd.Series(["yes", False, "yes"])
        tm.assert_series_equal(result, expected)

    def test_replace_Int_with_na(self, any_int_ea_dtype):
        # GH 38267
        result = pd.Series([0, None], dtype=any_int_ea_dtype).replace(0, pd.NA)
        expected = pd.Series([pd.NA, pd.NA], dtype=any_int_ea_dtype)
        tm.assert_series_equal(result, expected)
        result = pd.Series([0, 1], dtype=any_int_ea_dtype).replace(0, pd.NA)
        result.replace(1, pd.NA, inplace=True)
        tm.assert_series_equal(result, expected)

    def test_replace2(self):
        N = 100
        ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object)
        ser[:5] = np.nan
        ser[6:10] = "foo"
        ser[20:30] = "bar"

        # replace list with a single value
        rs = ser.replace([np.nan, "foo", "bar"], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
        assert return_value is None
        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()

    def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype):
        # GH 32621, GH#44940
        ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)
        expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype)
        result = ser.replace({"one": "1", "two": "2"})
        tm.assert_series_equal(expected, result)

    def test_replace_with_empty_dictlike(self):
        # GH 15289
        s = pd.Series(list("abcd"))
        tm.assert_series_equal(s, s.replace({}))

        with tm.assert_produces_warning(FutureWarning):
            empty_series = pd.Series([])
        tm.assert_series_equal(s, s.replace(empty_series))

    def test_replace_string_with_number(self):
        # GH 15743
        s = pd.Series([1, 2, 3])
        result = s.replace("2", np.nan)
        expected = pd.Series([1, 2, 3])
        tm.assert_series_equal(expected, result)

    def test_replace_replacer_equals_replacement(self):
        # GH 20656
        # make sure all replacers are matching against original values
        s = pd.Series(["a", "b"])
        expected = pd.Series(["b", "a"])
        result = s.replace({"a": "b", "b": "a"})
        tm.assert_series_equal(expected, result)

    def test_replace_unicode_with_number(self):
        # GH 15743
        s = pd.Series([1, 2, 3])
        result = s.replace("2", np.nan)
        expected = pd.Series([1, 2, 3])
        tm.assert_series_equal(expected, result)

    def test_replace_mixed_types_with_string(self):
        # Testing mixed
        s = pd.Series([1, 2, 3, "4", 4, 5])
        result = s.replace([2, "4"], np.nan)
        expected = pd.Series([1, np.nan, 3, np.nan, 4, 5])
        tm.assert_series_equal(expected, result)

    @pytest.mark.parametrize(
        "categorical, numeric",
        [
            (pd.Categorical(["A"], categories=["A", "B"]), [1]),
            (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]),
        ],
    )
    def test_replace_categorical(self, categorical, numeric):
        # GH 24971, GH#23305
        ser = pd.Series(categorical)
        result = ser.replace({"A": 1, "B": 2})
        expected = pd.Series(numeric).astype("category")
        if 2 not in expected.cat.categories:
            # i.e. categories should be [1, 2] even if there are no "B"s present
            # GH#44940
            expected = expected.cat.add_categories(2)
        tm.assert_series_equal(expected, result)

    def test_replace_categorical_single(self):
        # GH 26988
        dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
        s = pd.Series(dti)
        c = s.astype("category")

        expected = c.copy()
        expected = expected.cat.add_categories("foo")
        expected[2] = "foo"
        expected = expected.cat.remove_unused_categories()
        assert c[2] != "foo"

        result = c.replace(c[2], "foo")
        tm.assert_series_equal(expected, result)
        assert c[2] != "foo"  # ensure non-inplace call does not alter original

        return_value = c.replace(c[2], "foo", inplace=True)
        assert return_value is None
        tm.assert_series_equal(expected, c)

        first_value = c[0]
        return_value = c.replace(c[1], c[0], inplace=True)
        assert return_value is None
        assert c[0] == c[1] == first_value  # test replacing with existing value

    def test_replace_with_no_overflowerror(self):
        # GH 25616
        # casts to object without Exception from OverflowError
        s = pd.Series([0, 1, 2, 3, 4])
        result = s.replace([3], ["100000000000000000000"])
        expected = pd.Series([0, 1, 2, "100000000000000000000", 4])
        tm.assert_series_equal(result, expected)

        s = pd.Series([0, "100000000000000000000", "100000000000000000001"])
        result = s.replace(["100000000000000000000"], [1])
        expected = pd.Series([0, 1, "100000000000000000001"])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "ser, to_replace, exp",
        [
            ([1, 2, 3], {1: 2, 2: 3, 3: 4}, [2, 3, 4]),
            (["1", "2", "3"], {"1": "2", "2": "3", "3": "4"}, ["2", "3", "4"]),
        ],
    )
    def test_replace_commutative(self, ser, to_replace, exp):
        # GH 16051
        # DataFrame.replace() overwrites when values are non-numeric

        series = pd.Series(ser)

        expected = pd.Series(exp)
        result = series.replace(to_replace)

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "ser, exp", [([1, 2, 3], [1, True, 3]), (["x", 2, 3], ["x", True, 3])]
    )
    def test_replace_no_cast(self, ser, exp):
        # GH 9113
        # BUG: replace int64 dtype with bool coerces to int64

        series = pd.Series(ser)
        result = series.replace(2, True)
        expected = pd.Series(exp)

        tm.assert_series_equal(result, expected)

    def test_replace_invalid_to_replace(self):
        # GH 18634
        # API: replace() should raise an exception if invalid argument is given
        series = pd.Series(["a", "b", "c "])
        msg = (
            r"Expecting 'to_replace' to be either a scalar, array-like, "
            r"dict or None, got invalid type.*"
        )
        with pytest.raises(TypeError, match=msg):
            series.replace(lambda x: x.strip())

    @pytest.mark.parametrize("frame", [False, True])
    def test_replace_nonbool_regex(self, frame):
        obj = pd.Series(["a", "b", "c "])
        if frame:
            obj = obj.to_frame()

        msg = "'to_replace' must be 'None' if 'regex' is not a bool"
        with pytest.raises(ValueError, match=msg):
            obj.replace(to_replace=["a"], regex="foo")

    @pytest.mark.parametrize("frame", [False, True])
    def test_replace_empty_copy(self, frame):
        obj = pd.Series([], dtype=np.float64)
        if frame:
            obj = obj.to_frame()

        res = obj.replace(4, 5, inplace=True)
        assert res is None

        res = obj.replace(4, 5, inplace=False)
        tm.assert_equal(res, obj)
        assert res is not obj

    def test_replace_only_one_dictlike_arg(self, fixed_now_ts):
        # GH#33340

        ser = pd.Series([1, 2, "A", fixed_now_ts, True])
        to_replace = {0: 1, 2: "A"}
        value = "foo"
        msg = "Series.replace cannot use dict-like to_replace and non-None value"
        with pytest.raises(ValueError, match=msg):
            ser.replace(to_replace, value)

        to_replace = 1
        value = {0: "foo", 2: "bar"}
        msg = "Series.replace cannot use dict-value and non-None to_replace"
        with pytest.raises(ValueError, match=msg):
            ser.replace(to_replace, value)

    def test_replace_extension_other(self, frame_or_series):
        # https://github.com/pandas-dev/pandas/issues/34530
        obj = frame_or_series(pd.array([1, 2, 3], dtype="Int64"))
        result = obj.replace("", "")  # no exception
        # should not have changed dtype
        tm.assert_equal(obj, result)

    def _check_replace_with_method(self, ser: pd.Series):
        df = ser.to_frame()

        res = ser.replace(ser[1], method="pad")
        expected = pd.Series([ser[0], ser[0]] + list(ser[2:]), dtype=ser.dtype)
        tm.assert_series_equal(res, expected)

        res_df = df.replace(ser[1], method="pad")
        tm.assert_frame_equal(res_df, expected.to_frame())

        ser2 = ser.copy()
        res2 = ser2.replace(ser[1], method="pad", inplace=True)
        assert res2 is None
        tm.assert_series_equal(ser2, expected)

        res_df2 = df.replace(ser[1], method="pad", inplace=True)
        assert res_df2 is None
        tm.assert_frame_equal(df, expected.to_frame())

    def test_replace_ea_dtype_with_method(self, any_numeric_ea_dtype):
        arr = pd.array([1, 2, pd.NA, 4], dtype=any_numeric_ea_dtype)
        ser = pd.Series(arr)

        self._check_replace_with_method(ser)

    @pytest.mark.parametrize("as_categorical", [True, False])
    def test_replace_interval_with_method(self, as_categorical):
        # in particular interval that can't hold NA

        idx = pd.IntervalIndex.from_breaks(range(4))
        ser = pd.Series(idx)
        if as_categorical:
            ser = ser.astype("category")

        self._check_replace_with_method(ser)

    @pytest.mark.parametrize("as_period", [True, False])
    @pytest.mark.parametrize("as_categorical", [True, False])
    def test_replace_datetimelike_with_method(self, as_period, as_categorical):
        idx = pd.date_range("2016-01-01", periods=5, tz="US/Pacific")
        if as_period:
            idx = idx.tz_localize(None).to_period("D")

        ser = pd.Series(idx)
        ser.iloc[-2] = pd.NaT
        if as_categorical:
            ser = ser.astype("category")

        self._check_replace_with_method(ser)

    def test_replace_with_compiled_regex(self):
        # https://github.com/pandas-dev/pandas/issues/35680
        s = pd.Series(["a", "b", "c"])
        regex = re.compile("^a$")
        result = s.replace({regex: "z"}, regex=True)
        expected = pd.Series(["z", "b", "c"])
        tm.assert_series_equal(result, expected)

    def test_pandas_replace_na(self):
        # GH#43344
        ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA], dtype="string")
        regex_mapping = {
            "AA": "CC",
            "BB": "CC",
            "EE": "CC",
            "CC": "CC-REPL",
        }
        result = ser.replace(regex_mapping, regex=True)
        exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string")
        tm.assert_series_equal(result, exp)

    @pytest.mark.parametrize(
        "dtype, input_data, to_replace, expected_data",
        [
            ("bool", [True, False], {True: False}, [False, False]),
            ("int64", [1, 2], {1: 10, 2: 20}, [10, 20]),
            ("Int64", [1, 2], {1: 10, 2: 20}, [10, 20]),
            ("float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]),
            ("Float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]),
            ("string", ["one", "two"], {"one": "1", "two": "2"}, ["1", "2"]),
            (
                pd.IntervalDtype("int64"),
                IntervalArray([pd.Interval(1, 2), pd.Interval(2, 3)]),
                {pd.Interval(1, 2): pd.Interval(10, 20)},
                IntervalArray([pd.Interval(10, 20), pd.Interval(2, 3)]),
            ),
            (
                pd.IntervalDtype("float64"),
                IntervalArray([pd.Interval(1.0, 2.7), pd.Interval(2.8, 3.1)]),
                {pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8)},
                IntervalArray([pd.Interval(10.6, 20.8), pd.Interval(2.8, 3.1)]),
            ),
            (
                pd.PeriodDtype("M"),
                [pd.Period("2020-05", freq="M")],
                {pd.Period("2020-05", freq="M"): pd.Period("2020-06", freq="M")},
                [pd.Period("2020-06", freq="M")],
            ),
        ],
    )
    def test_replace_dtype(self, dtype, input_data, to_replace, expected_data):
        # GH#33484
        ser = pd.Series(input_data, dtype=dtype)
        result = ser.replace(to_replace)
        expected = pd.Series(expected_data, dtype=dtype)
        tm.assert_series_equal(result, expected)

    def test_replace_string_dtype(self):
        # GH#40732, GH#44940
        ser = pd.Series(["one", "two", np.nan], dtype="string")
        res = ser.replace({"one": "1", "two": "2"})
        expected = pd.Series(["1", "2", np.nan], dtype="string")
        tm.assert_series_equal(res, expected)

        # GH#31644
        ser2 = pd.Series(["A", np.nan], dtype="string")
        res2 = ser2.replace("A", "B")
        expected2 = pd.Series(["B", np.nan], dtype="string")
        tm.assert_series_equal(res2, expected2)

        ser3 = pd.Series(["A", "B"], dtype="string")
        res3 = ser3.replace("A", pd.NA)
        expected3 = pd.Series([pd.NA, "B"], dtype="string")
        tm.assert_series_equal(res3, expected3)

    def test_replace_string_dtype_list_to_replace(self):
        # GH#41215, GH#44940
        ser = pd.Series(["abc", "def"], dtype="string")
        res = ser.replace(["abc", "any other string"], "xyz")
        expected = pd.Series(["xyz", "def"], dtype="string")
        tm.assert_series_equal(res, expected)

    def test_replace_string_dtype_regex(self):
        # GH#31644
        ser = pd.Series(["A", "B"], dtype="string")
        res = ser.replace(r".", "C", regex=True)
        expected = pd.Series(["C", "C"], dtype="string")
        tm.assert_series_equal(res, expected)

    def test_replace_nullable_numeric(self):
        # GH#40732, GH#44940

        floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype())
        assert floats.replace({1.0: 9}).dtype == floats.dtype
        assert floats.replace(1.0, 9).dtype == floats.dtype
        assert floats.replace({1.0: 9.0}).dtype == floats.dtype
        assert floats.replace(1.0, 9.0).dtype == floats.dtype

        res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0])
        assert res.dtype == floats.dtype

        ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype())
        assert ints.replace({1: 9}).dtype == ints.dtype
        assert ints.replace(1, 9).dtype == ints.dtype
        assert ints.replace({1: 9.0}).dtype == ints.dtype
        assert ints.replace(1, 9.0).dtype == ints.dtype

        # nullable (for now) raises instead of casting
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace({1: 9.5})
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace(1, 9.5)

    @pytest.mark.parametrize("regex", [False, True])
    def test_replace_regex_dtype_series(self, regex):
        # GH-48644
        series = pd.Series(["0"])
        expected = pd.Series([1])
        result = series.replace(to_replace="0", value=1, regex=regex)
        tm.assert_series_equal(result, expected)

    def test_replace_different_int_types(self, any_int_numpy_dtype):
        # GH#45311
        labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype)

        maps = pd.Series([0, 2, 1], dtype=any_int_numpy_dtype)
        map_dict = {old: new for (old, new) in zip(maps.values, maps.index)}

        result = labs.replace(map_dict)
        expected = labs.replace({0: 0, 2: 1, 1: 2})
        tm.assert_series_equal(result, expected)
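
The two calling conventions exercised in test_replace and test_replace2 above are interchangeable; a condensed sketch of that equivalence (my own illustration):

import numpy as np
import pandas as pd

ser = pd.Series([np.nan, "foo", "bar", 1], dtype=object)

# Dict form and the two-list form produce the same result.
by_dict = ser.replace({np.nan: -1, "foo": -2, "bar": -3})
by_lists = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
pd.testing.assert_series_equal(by_dict, by_lists)
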
Example #9
class TestSeriesConvertDtypes:
    # The answerdict has keys that have 4 tuples, corresponding to the arguments
    # infer_objects, convert_string, convert_integer, convert_boolean
    # This allows all 16 possible combinations to be tested.  Since common
    # combinations expect the same answer, this provides an easy way to list
    # all the possibilities
    @pytest.mark.parametrize(
        "data, maindtype, answerdict",
        [
            (
                [1, 2, 3],
                np.dtype("int32"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int32",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("int32"),
                },
            ),
            (
                [1, 2, 3],
                np.dtype("int64"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("int64"),
                },
            ),
            (
                ["x", "y", "z"],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, ),
                        (True, False),
                        (True, False),
                    ):
                    pd.StringDtype(),
                    ((True, False), (False, ), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [True, False, np.nan],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, ),
                    ):
                    pd.BooleanDtype(),
                    ((True, False), (True, False), (True, False), (False, )):
                    np.dtype("O"),
                },
            ),
            (
                ["h", "i", np.nan],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, ),
                        (True, False),
                        (True, False),
                    ):
                    pd.StringDtype(),
                    ((True, False), (False, ), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (  # GH32117
                ["h", "i", 1],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("O"),
                },
            ),
            (
                [10, np.nan, 20],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("float"),
                },
            ),
            (
                [np.nan, 100.5, 200],
                np.dtype("float"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("float"),
                },
            ),
            (
                [3, 4, 5],
                "Int8",
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    "Int8"
                },
            ),
            (
                [[1, 2], [3, 4], [5]],
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("O"),
                },
            ),
            (
                [4, 5, 6],
                np.dtype("uint32"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "UInt32",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("uint32"),
                },
            ),
            (
                [-10, 12, 13],
                np.dtype("i1"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int8",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("i1"),
                },
            ),
            (
                [1, 2.0],
                object,
                {
                    ((True, ), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, ), (True, False), (False, ), (True, False)):
                    np.dtype("float"),
                    ((False, ), (True, False), (True, False), (True, False)):
                    np.dtype("object"),
                },
            ),
            (
                [1, 2.5],
                object,
                {
                    ((True, ), (True, False), (True, False), (True, False)):
                    np.dtype("float"),
                    ((False, ), (True, False), (True, False), (True, False)):
                    np.dtype("object"),
                },
            ),
            (
                ["a", "b"],
                pd.CategoricalDtype(),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.CategoricalDtype(),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                pd.DatetimeTZDtype(tz="UTC"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.DatetimeTZDtype(tz="UTC"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                "datetime64[ns]",
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("datetime64[ns]"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                object,
                {
                    (
                        (True, ),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("datetime64[ns]"),
                    (
                        (False, ),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("O"),
                },
            ),
            (
                pd.period_range("1/1/2011", freq="M", periods=3),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.PeriodDtype("M"),
                },
            ),
            (
                pd.arrays.IntervalArray([pd.Interval(0, 1),
                                         pd.Interval(1, 5)]),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.IntervalDtype("int64"),
                },
            ),
        ],
    )
    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
        else:
            series = pd.Series(data)
        answers = {
            k: a
            for (kk, a) in answerdict.items() for k in product(*kk)
        }

        ns = series.convert_dtypes(*params)
        expected_dtype = answers[tuple(params)]
        expected = pd.Series(series.values, dtype=expected_dtype)
        tm.assert_series_equal(ns, expected)

        # Test that it is a copy
        copy = series.copy(deep=True)
        if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]:
            msg = "Cannot set float NaN to integer-backed IntervalArray"
            with pytest.raises(ValueError, match=msg):
                ns[ns.notna()] = np.nan
        else:
            ns[ns.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, copy)

    def test_convert_string_dtype(self):
        # https://github.com/pandas-dev/pandas/issues/31731 -> converting columns
        # that are already string dtype
        df = pd.DataFrame({
            "A": ["a", "b", pd.NA],
            "B": ["ä", "ö", "ü"]
        },
                          dtype="string")
        result = df.convert_dtypes()
        tm.assert_frame_equal(df, result)

    def test_convert_bool_dtype(self):
        # GH32287
        df = pd.DataFrame({"A": pd.array([True])})
        tm.assert_frame_equal(df, df.convert_dtypes())
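The test above drives Series.convert_dtypes with every combination of its four boolean flags; each answerdict key is a tuple of option tuples that product() expands into the concrete flag combinations it covers. A small self-contained sketch of that expansion and of the flags' effect (the sample answerdict below is illustrative, not taken from the test):

from itertools import product

import numpy as np
import pandas as pd

# Each key holds one option tuple per convert_dtypes flag
# (infer_objects, convert_string, convert_integer, convert_boolean);
# product(*key) enumerates every concrete flag combination that key covers.
answerdict = {
    ((True, False), (True, False), (True,), (True, False)): "Int64",
    ((True, False), (True, False), (False,), (True, False)): np.dtype("int64"),
}
answers = {combo: dtype
           for key, dtype in answerdict.items()
           for combo in product(*key)}
assert len(answers) == 16  # all 2 ** 4 flag combinations are covered

# The flag positions map onto Series.convert_dtypes keyword arguments:
series = pd.Series([1, 2, 3], dtype="int64")
assert series.convert_dtypes(convert_integer=True).dtype == pd.Int64Dtype()
assert series.convert_dtypes(convert_integer=False).dtype == np.dtype("int64")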
Beispiel #10
0
    def to_pandas_dtype(self):
        import pandas as pd

        return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.inclusive)
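Beispiel #10 builds the pandas dtype from a numeric subtype plus the inclusive/closed side, passed positionally to pd.IntervalDtype. A minimal sketch of what the resulting dtype looks like on the pandas side (assumes a pandas version where the closed side is part of IntervalDtype; the sample breaks are made up):

import numpy as np
import pandas as pd

# Subtype first, closed side second -- the same positional form as
# pd.IntervalDtype("int64", "right") used elsewhere in these examples.
dtype = pd.IntervalDtype("int64", "right")
series = pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3], closed="right"))

assert isinstance(series.dtype, pd.IntervalDtype)
assert series.dtype.subtype == np.dtype("int64")
assert series.dtype == dtype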
Beispiel #11
0
 def __post_init__(self):
     object.__setattr__(self, "type",
                        pd.IntervalDtype(subtype=self.subtype))
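The __post_init__ in Beispiel #11 runs on a frozen dataclass, where normal attribute assignment raises FrozenInstanceError, so the derived pandas dtype is attached via object.__setattr__. A self-contained sketch of the pattern (the class and field names are invented for illustration):

import dataclasses

import numpy as np
import pandas as pd

@dataclasses.dataclass(frozen=True)
class IntervalSpec:
    subtype: np.dtype
    # Not part of __init__; derived once in __post_init__.
    type: pd.IntervalDtype = dataclasses.field(init=False)

    def __post_init__(self):
        # Direct assignment would raise FrozenInstanceError on a frozen
        # dataclass, so bypass the frozen __setattr__ explicitly.
        object.__setattr__(self, "type",
                           pd.IntervalDtype(subtype=self.subtype))

spec = IntervalSpec(subtype=np.dtype("int64"))
assert spec.type == pd.IntervalDtype(subtype=np.int64)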
Beispiel #12
0
test_cases = [
    (
        pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
        object,
        np.dtype("datetime64[ns]"),
        {
            ("infer_objects", False): np.dtype("object")
        },
    ),
    (pd.period_range("1/1/2011", freq="M",
                     periods=3), None, pd.PeriodDtype("M"), {}),
    (
        pd.arrays.IntervalArray([pd.Interval(0, 1),
                                 pd.Interval(1, 5)]),
        None,
        pd.IntervalDtype("int64", "right"),
        {},
    ),
]


class TestSeriesConvertDtypes:
    @pytest.mark.parametrize(
        "data, maindtype, expected_default, expected_other",
        test_cases,
    )
    @pytest.mark.parametrize("params", product(*[(True, False)] * 5))
    def test_convert_dtypes(self, data, maindtype, params, expected_default,
                            expected_other):
        warn = None
        if (hasattr(data, "dtype") and data.dtype == "M8[ns]"
Beispiel #13
0
timedelta_dtypes = {
    datetime.timedelta: "timedelta64",
    np.timedelta64: "timedelta64",
    pd.Timedelta: "timedelta64",
    pa.Timedelta: "timedelta64",
}

period_dtypes = {pd.PeriodDtype(freq="D"): "period[D]"}
# Series.astype does not accept a string alias for SparseDtype.
sparse_dtypes = {
    pd.SparseDtype: pd.SparseDtype(),
    pd.SparseDtype(np.float64): pd.SparseDtype(np.float64),
}
interval_dtypes = {pd.IntervalDtype(subtype=np.int64): "interval[int64]"}

dtype_fixtures: List[Tuple[Dict, List]] = [
    (int_dtypes, [-1]),
    (nullable_int_dtypes, [-1, None]),
    (uint_dtypes, [1]),
    (nullable_uint_dtypes, [1, None]),
    (float_dtypes, [1.0]),
    (complex_dtypes, [complex(1)]),
    (boolean_dtypes, [True, False]),
    (nullable_boolean_dtypes, [True, None]),
    (string_dtypes, ["A", "B"]),
    (object_dtypes, ["A", "B"]),
    (nullable_string_dtypes, [1, 2, None]),
    (category_dtypes, [1, 2, None]),
]
Beispiel #14
0
test_cases = [
    (
        pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
        object,
        np.dtype("datetime64[ns]"),
        {
            ("infer_objects", False): np.dtype("object")
        },
    ),
    (pd.period_range("1/1/2011", freq="M",
                     periods=3), None, pd.PeriodDtype("M"), {}),
    (
        pd.arrays.IntervalArray([pd.Interval(0, 1),
                                 pd.Interval(1, 5)]),
        None,
        pd.IntervalDtype("int64"),
        {},
    ),
]


class TestSeriesConvertDtypes:
    @pytest.mark.parametrize(
        "data, maindtype, expected_default, expected_other",
        test_cases,
    )
    @pytest.mark.parametrize("params", product(*[(True, False)] * 5))
    def test_convert_dtypes(self, data, maindtype, params, expected_default,
                            expected_other):
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
Beispiel #15
0
        ("<M8[s]", np.dtype("<M8[s]")),
        (cudf.ListDtype("int64"), cudf.ListDtype("int64")),
        ("category", cudf.CategoricalDtype()),
        (
            cudf.CategoricalDtype(categories=("a", "b", "c")),
            cudf.CategoricalDtype(categories=("a", "b", "c")),
        ),
        (
            pd.CategoricalDtype(categories=("a", "b", "c")),
            cudf.CategoricalDtype(categories=("a", "b", "c")),
        ),
        (
            # this is a pandas.core.arrays.numpy_.PandasDtype...
            pd.array([1], dtype="int16").dtype,
            np.dtype("int16"),
        ),
        (pd.IntervalDtype("int"), cudf.IntervalDtype("int64")),
        (cudf.IntervalDtype("int"), cudf.IntervalDtype("int64")),
        (pd.IntervalDtype("int64"), cudf.IntervalDtype("int64")),
    ],
)
def test_dtype(in_dtype, expect):
    assert_eq(cudf.dtype(in_dtype), expect)


@pytest.mark.parametrize("in_dtype",
                         ["complex", np.complex128, complex, "S", "a", "V"])
def test_dtype_raise(in_dtype):
    with pytest.raises(TypeError):
        cudf.dtype(in_dtype)
Beispiel #16
0
class TestSeriesConvertDtypes:
    # The answerdict keys are 4-tuples of option tuples, corresponding to the
    # arguments infer_objects, convert_string, convert_integer, convert_boolean.
    # This allows all 16 possible combinations to be tested.  Since common
    # combinations expect the same answer, this provides an easy way to list
    # all the possibilities.
    @pytest.mark.parametrize(
        "data, maindtype, answerdict",
        [
            (
                [1, 2, 3],
                np.dtype("int32"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int32",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("int32"),
                },
            ),
            (
                [1, 2, 3],
                np.dtype("int64"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("int64"),
                },
            ),
            (
                ["x", "y", "z"],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, ),
                        (True, False),
                        (True, False),
                    ):
                    pd.StringDtype(),
                    ((True, False), (False, ), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [True, False, np.nan],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, ),
                    ):
                    pd.BooleanDtype(),
                    ((True, False), (True, False), (True, False), (False, )):
                    np.dtype("O"),
                },
            ),
            (
                ["h", "i", np.nan],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, ),
                        (True, False),
                        (True, False),
                    ):
                    pd.StringDtype(),
                    ((True, False), (False, ), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [10, np.nan, 20],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("float"),
                },
            ),
            (
                [np.nan, 100.5, 200],
                np.dtype("float"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("float"),
                },
            ),
            (
                [3, 4, 5],
                "Int8",
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    "Int8"
                },
            ),
            (
                [[1, 2], [3, 4], [5]],
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("O"),
                },
            ),
            (
                [4, 5, 6],
                np.dtype("uint32"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "UInt32",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("uint32"),
                },
            ),
            (
                [-10, 12, 13],
                np.dtype("i1"),
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int8",
                    ((True, False), (True, False), (False, ), (True, False)):
                    np.dtype("i1"),
                },
            ),
            (
                [1, 2.0],
                object,
                {
                    ((True, False), (True, False), (True, ), (True, False)):
                    "Int64",
                    ((True, ), (True, False), (False, ), (True, False)):
                    np.dtype("float"),
                    ((False, ), (True, False), (False, ), (True, False)):
                    np.dtype("object"),
                },
            ),
            (
                ["a", "b"],
                pd.CategoricalDtype(),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.CategoricalDtype(),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                pd.DatetimeTZDtype(tz="UTC"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.DatetimeTZDtype(tz="UTC"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                "datetime64[ns]",
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("datetime64[ns]"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                object,
                {
                    (
                        (True, ),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("datetime64[ns]"),
                    (
                        (False, ),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    np.dtype("O"),
                },
            ),
            (
                pd.period_range("1/1/2011", freq="M", periods=3),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.PeriodDtype("M"),
                },
            ),
            (
                pd.arrays.IntervalArray([pd.Interval(0, 1),
                                         pd.Interval(1, 5)]),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ):
                    pd.IntervalDtype("int64"),
                },
            ),
        ],
    )
    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
        else:
            series = pd.Series(data)
        answers = {
            k: a
            for (kk, a) in answerdict.items() for k in product(*kk)
        }

        ns = series.convert_dtypes(*params)
        expected_dtype = answers[tuple(params)]
        expected = pd.Series(series.values, dtype=expected_dtype)
        tm.assert_series_equal(ns, expected)

        # Test that it is a copy
        copy = series.copy(deep=True)
        ns[ns.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, copy)