def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    # Each case: (extension dtype, a Series already holding that dtype,
    # extra SeriesSchema kwargs or None).
    cases = [
        (
            pd.CategoricalDtype(),
            pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
            None,
        ),
        (
            pd.DatetimeTZDtype(tz='UTC'),
            pd.Series(
                pd.date_range(start="20200101", end="20200301"),
                dtype="datetime64[ns, utc]",
            ),
            None,
        ),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (pd.StringDtype(), pd.Series(["foo", "bar", "baz"], dtype="string"), None),
        (
            pd.PeriodDtype(freq='D'),
            pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')),
            None,
        ),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100)).where(lambda s: s < 5, other=np.nan).astype("Sparse[float]"),
            {"nullable": True},
        ),
        (pd.BooleanDtype(), pd.Series([1, 0, 0, 1, 1], dtype="boolean"), None),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        ),
    ]

    for extension_dtype, sample, extra_kwargs in cases:
        kwargs = extra_kwargs if extra_kwargs is not None else {}
        schema = SeriesSchema(pandas_dtype=extension_dtype, **kwargs)
        # Validation of a well-typed series must succeed and return a Series.
        assert isinstance(schema.validate(sample), pd.Series)
def _infer_column( series: pd.Series, given_format: Optional[str], try_fallback: Optional[Column] ) -> Column: """ Build a valid `Column` for the given Series, or raise `ValueError`. The logic: determine the `ColumnType` class of `series` (e.g., `ColumnType.Number`) and then try to initialize it with `given_format`. If the format is invalid, raise `ValueError` because the user tried to create something invalid. If `try_fallback` is given and of the correct `ColumnType` class, use `try_fallback`. Otherwise, construct `Column` with default format. """ # Determine ColumnType class, based on pandas/numpy `dtype`. dtype = series.dtype if is_numeric_dtype(dtype): if given_format is not None: parse_number_format(given_format) return Column(series.name, ColumnType.Number(format=given_format)) elif try_fallback is not None and isinstance( try_fallback.type, ColumnType.Number ): return try_fallback else: return Column(series.name, ColumnType.Number(format="{:,}")) elif is_datetime64_dtype(dtype): if given_format is not None: raise ValueError( '"format" not allowed for column "%s" because it is of type "timestamp"' % (series.name,) ) return Column(series.name, ColumnType.Timestamp()) elif pd.PeriodDtype(freq="D") == dtype: if given_format is not None: if given_format not in {"day", "week", "month", "quarter", "year"}: raise ValueError( 'Unit must be "day", "week", "month", "quarter" or "year"; got %r for column "%s"' % (given_format, series.name) ) return Column(series.name, ColumnType.Date(unit=given_format)) elif try_fallback is not None and isinstance( try_fallback.type, ColumnType.Date ): return try_fallback else: return Column(series.name, ColumnType.Date(unit="day")) elif dtype == object or dtype == "category": if given_format is not None: raise ValueError( '"format" not allowed for column "%s" because it is of type "text"' % (series.name,) ) return Column(series.name, ColumnType.Text()) else: raise ValueError(f"Unknown dtype: {dtype}")
def series_to_arrow_array(series: pd.Series) -> pa.Array:
    """
    Convert a Pandas series to an in-memory Arrow array.
    """
    if hasattr(series, "cat"):
        # Categorical series: encode as an Arrow dictionary array. The
        # categories are converted by a recursive call (they are a
        # Series-like collection of plain values).
        return pa.DictionaryArray.from_arrays(
            # Pandas categorical value "-1" means None
            pa.Array.from_pandas(series.cat.codes, mask=(series.cat.codes == -1)),
            series_to_arrow_array(series.cat.categories),
        )
    elif pd.PeriodDtype(freq="D") == series.dtype:
        # Daily periods: store each period's ordinal (days since epoch) as an
        # Arrow date32 value; NaT becomes a null.
        return pa.array(
            [(None if v is pd.NaT else v.ordinal) for v in series], type=pa.date32()
        )
    else:
        # Everything else: delegate dtype translation to _dtype_to_arrow_type.
        return pa.array(series, type=_dtype_to_arrow_type(series.dtype))
class TestSeriesReplace:
    """Tests for Series.replace: scalars, lists, dicts, regex, dtypes, inplace."""

    def test_replace_explicit_none(self):
        # GH#36984 if the user explicitly passes value=None, give it to them
        ser = pd.Series([0, 0, ""], dtype=object)
        result = ser.replace("", None)
        expected = pd.Series([0, 0, None], dtype=object)
        tm.assert_series_equal(result, expected)

        df = pd.DataFrame(np.zeros((3, 3)))
        df.iloc[2, 2] = ""
        result = df.replace("", None)
        expected = pd.DataFrame(
            {
                0: np.zeros(3),
                1: np.zeros(3),
                2: np.array([0.0, 0.0, None], dtype=object),
            }
        )
        assert expected.iloc[2, 2] is None
        tm.assert_frame_equal(result, expected)

        # GH#19998 same thing with object dtype
        ser = pd.Series([10, 20, 30, "a", "a", "b", "a"])
        result = ser.replace("a", None)
        expected = pd.Series([10, 20, 30, None, None, "b", None])
        assert expected.iloc[-1] is None
        tm.assert_series_equal(result, expected)

    def test_replace_numpy_nan(self, nulls_fixture):
        # GH#45725 ensure numpy.nan can be replaced with all other null types
        to_replace = np.nan
        value = nulls_fixture
        dtype = object
        ser = pd.Series([to_replace], dtype=dtype)
        expected = pd.Series([value], dtype=dtype)

        result = ser.replace({to_replace: value}).astype(dtype=dtype)
        tm.assert_series_equal(result, expected)
        assert result.dtype == dtype

        # same thing but different calling convention
        result = ser.replace(to_replace, value).astype(dtype=dtype)
        tm.assert_series_equal(result, expected)
        assert result.dtype == dtype

    def test_replace_noop_doesnt_downcast(self):
        # GH#44498
        ser = pd.Series([None, None, pd.Timestamp("2021-12-16 17:31")], dtype=object)
        res = ser.replace({np.nan: None})  # should be a no-op
        tm.assert_series_equal(res, ser)
        assert res.dtype == object

        # same thing but different calling convention
        res = ser.replace(np.nan, None)
        tm.assert_series_equal(res, ser)
        assert res.dtype == object

    def test_replace(self):
        N = 100
        ser = pd.Series(np.random.randn(N))
        ser[0:4] = np.nan
        ser[6:10] = 0

        # replace list with a single value
        return_value = ser.replace([np.nan], -1, inplace=True)
        assert return_value is None

        exp = ser.fillna(-1)
        tm.assert_series_equal(ser, exp)

        rs = ser.replace(0.0, np.nan)
        ser[ser == 0.0] = np.nan
        tm.assert_series_equal(rs, ser)

        ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object)
        ser[:5] = np.nan
        ser[6:10] = "foo"
        ser[20:30] = "bar"

        # replace list with a single value
        rs = ser.replace([np.nan, "foo", "bar"], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
        assert return_value is None

        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()

    def test_replace_nan_with_inf(self):
        ser = pd.Series([np.nan, 0, np.inf])
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))

        ser = pd.Series([np.nan, 0, "foo", "bar", np.inf, None, pd.NaT])
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))
        filled = ser.copy()
        filled[4] = 0
        tm.assert_series_equal(ser.replace(np.inf, 0), filled)

    def test_replace_listlike_value_listlike_target(self, datetime_series):
        ser = pd.Series(datetime_series.index)
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))

        # malformed
        msg = r"Replacement lists must match in length\. Expecting 3 got 2"
        with pytest.raises(ValueError, match=msg):
            ser.replace([1, 2, 3], [np.nan, 0])

        # ser is dt64 so can't hold 1 or 2, so this replace is a no-op
        result = ser.replace([1, 2], [np.nan, 0])
        tm.assert_series_equal(result, ser)

        ser = pd.Series([0, 1, 2, 3, 4])
        result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])
        tm.assert_series_equal(result, pd.Series([4, 3, 2, 1, 0]))

    def test_replace_gh5319(self):
        # API change from 0.12?
        # GH 5319
        ser = pd.Series([0, np.nan, 2, 3, 4])
        expected = ser.ffill()
        result = ser.replace([np.nan])
        tm.assert_series_equal(result, expected)

        ser = pd.Series([0, np.nan, 2, 3, 4])
        expected = ser.ffill()
        result = ser.replace(np.nan)
        tm.assert_series_equal(result, expected)

    def test_replace_datetime64(self):
        # GH 5797
        ser = pd.Series(pd.date_range("20130101", periods=5))
        expected = ser.copy()
        expected.loc[2] = pd.Timestamp("20120101")
        result = ser.replace({pd.Timestamp("20130103"): pd.Timestamp("20120101")})
        tm.assert_series_equal(result, expected)
        result = ser.replace(pd.Timestamp("20130103"), pd.Timestamp("20120101"))
        tm.assert_series_equal(result, expected)

    def test_replace_nat_with_tz(self):
        # GH 11792: Test with replacing NaT in a list with tz data
        ts = pd.Timestamp("2015/01/01", tz="UTC")
        s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")])
        result = s.replace([np.nan, pd.NaT], pd.Timestamp.min)
        expected = pd.Series([pd.Timestamp.min, ts], dtype=object)
        tm.assert_series_equal(expected, result)

    def test_replace_timedelta_td64(self):
        tdi = pd.timedelta_range(0, periods=5)
        ser = pd.Series(tdi)

        # Using a single dict argument means we go through replace_list
        result = ser.replace({ser[1]: ser[3]})

        expected = pd.Series([ser[0], ser[3], ser[2], ser[3], ser[4]])
        tm.assert_series_equal(result, expected)

    def test_replace_with_single_list(self):
        ser = pd.Series([0, 1, 2, 3, 4])
        result = ser.replace([1, 2, 3])
        tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4]))

        s = ser.copy()
        return_value = s.replace([1, 2, 3], inplace=True)
        assert return_value is None
        tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4]))

        # make sure things don't get corrupted when fillna call fails
        s = ser.copy()
        msg = (
            r"Invalid fill method\. Expecting pad \(ffill\) or backfill "
            r"\(bfill\)\. Got crash_cymbal"
        )
        with pytest.raises(ValueError, match=msg):
            return_value = s.replace([1, 2, 3], inplace=True, method="crash_cymbal")
        assert return_value is None
        tm.assert_series_equal(s, ser)

    def test_replace_mixed_types(self):
        ser = pd.Series(np.arange(5), dtype="int64")

        def check_replace(to_rep, val, expected):
            sc = ser.copy()
            result = ser.replace(to_rep, val)
            return_value = sc.replace(to_rep, val, inplace=True)
            assert return_value is None
            tm.assert_series_equal(expected, result)
            tm.assert_series_equal(expected, sc)

        # 3.0 can still be held in our int64 series, so we do not upcast GH#44940
        tr, v = [3], [3.0]
        check_replace(tr, v, ser)
        # Note this matches what we get with the scalars 3 and 3.0
        check_replace(tr[0], v[0], ser)

        # MUST upcast to float
        e = pd.Series([0, 1, 2, 3.5, 4])
        tr, v = [3], [3.5]
        check_replace(tr, v, e)

        # casts to object
        e = pd.Series([0, 1, 2, 3.5, "a"])
        tr, v = [3, 4], [3.5, "a"]
        check_replace(tr, v, e)

        # again casts to object
        e = pd.Series([0, 1, 2, 3.5, pd.Timestamp("20130101")])
        tr, v = [3, 4], [3.5, pd.Timestamp("20130101")]
        check_replace(tr, v, e)

        # casts to object
        e = pd.Series([0, 1, 2, 3.5, True], dtype="object")
        tr, v = [3, 4], [3.5, True]
        check_replace(tr, v, e)

        # test an object with dates + floats + integers + strings
        dr = pd.Series(pd.date_range("1/1/2001", "1/10/2001", freq="D"))
        result = dr.astype(object).replace([dr[0], dr[1], dr[2]], [1.0, 2, "a"])
        expected = pd.Series([1.0, 2, "a"] + dr[3:].tolist(), dtype=object)
        tm.assert_series_equal(result, expected)

    def test_replace_bool_with_string_no_op(self):
        s = pd.Series([True, False, True])
        result = s.replace("fun", "in-the-sun")
        tm.assert_series_equal(s, result)

    def test_replace_bool_with_string(self):
        # nonexistent elements
        s = pd.Series([True, False, True])
        result = s.replace(True, "2u")
        expected = pd.Series(["2u", False, "2u"])
        tm.assert_series_equal(expected, result)

    def test_replace_bool_with_bool(self):
        s = pd.Series([True, False, True])
        result = s.replace(True, False)
        expected = pd.Series([False] * len(s))
        tm.assert_series_equal(expected, result)

    def test_replace_with_dict_with_bool_keys(self):
        s = pd.Series([True, False, True])
        result = s.replace({"asdf": "asdb", True: "yes"})
        expected = pd.Series(["yes", False, "yes"])
        tm.assert_series_equal(result, expected)

    def test_replace_Int_with_na(self, any_int_ea_dtype):
        # GH 38267
        result = pd.Series([0, None], dtype=any_int_ea_dtype).replace(0, pd.NA)
        expected = pd.Series([pd.NA, pd.NA], dtype=any_int_ea_dtype)
        tm.assert_series_equal(result, expected)
        result = pd.Series([0, 1], dtype=any_int_ea_dtype).replace(0, pd.NA)
        result.replace(1, pd.NA, inplace=True)
        tm.assert_series_equal(result, expected)

    def test_replace2(self):
        N = 100
        ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object)
        ser[:5] = np.nan
        ser[6:10] = "foo"
        ser[20:30] = "bar"

        # replace list with a single value
        rs = ser.replace([np.nan, "foo", "bar"], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
        assert return_value is None
        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()

    def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype):
        # GH 32621, GH#44940
        ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)
        expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype)
        result = ser.replace({"one": "1", "two": "2"})
        tm.assert_series_equal(expected, result)

    def test_replace_with_empty_dictlike(self):
        # GH 15289
        s = pd.Series(list("abcd"))
        tm.assert_series_equal(s, s.replace({}))

        with tm.assert_produces_warning(FutureWarning):
            empty_series = pd.Series([])
        tm.assert_series_equal(s, s.replace(empty_series))

    def test_replace_string_with_number(self):
        # GH 15743
        s = pd.Series([1, 2, 3])
        result = s.replace("2", np.nan)
        expected = pd.Series([1, 2, 3])
        tm.assert_series_equal(expected, result)

    def test_replace_replacer_equals_replacement(self):
        # GH 20656
        # make sure all replacers are matching against original values
        s = pd.Series(["a", "b"])
        expected = pd.Series(["b", "a"])
        result = s.replace({"a": "b", "b": "a"})
        tm.assert_series_equal(expected, result)

    def test_replace_unicode_with_number(self):
        # GH 15743
        s = pd.Series([1, 2, 3])
        result = s.replace("2", np.nan)
        expected = pd.Series([1, 2, 3])
        tm.assert_series_equal(expected, result)

    def test_replace_mixed_types_with_string(self):
        # Testing mixed
        s = pd.Series([1, 2, 3, "4", 4, 5])
        result = s.replace([2, "4"], np.nan)
        expected = pd.Series([1, np.nan, 3, np.nan, 4, 5])
        tm.assert_series_equal(expected, result)

    @pytest.mark.parametrize(
        "categorical, numeric",
        [
            (pd.Categorical(["A"], categories=["A", "B"]), [1]),
            (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]),
        ],
    )
    def test_replace_categorical(self, categorical, numeric):
        # GH 24971, GH#23305
        ser = pd.Series(categorical)
        result = ser.replace({"A": 1, "B": 2})
        expected = pd.Series(numeric).astype("category")
        if 2 not in expected.cat.categories:
            # i.e. categories should be [1, 2] even if there are no "B"s present
            # GH#44940
            expected = expected.cat.add_categories(2)
        tm.assert_series_equal(expected, result)

    def test_replace_categorical_single(self):
        # GH 26988
        dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
        s = pd.Series(dti)
        c = s.astype("category")

        expected = c.copy()
        expected = expected.cat.add_categories("foo")
        expected[2] = "foo"
        expected = expected.cat.remove_unused_categories()
        assert c[2] != "foo"

        result = c.replace(c[2], "foo")
        tm.assert_series_equal(expected, result)
        assert c[2] != "foo"  # ensure non-inplace call does not alter original

        return_value = c.replace(c[2], "foo", inplace=True)
        assert return_value is None
        tm.assert_series_equal(expected, c)

        first_value = c[0]
        return_value = c.replace(c[1], c[0], inplace=True)
        assert return_value is None
        assert c[0] == c[1] == first_value  # test replacing with existing value

    def test_replace_with_no_overflowerror(self):
        # GH 25616
        # casts to object without Exception from OverflowError
        s = pd.Series([0, 1, 2, 3, 4])
        result = s.replace([3], ["100000000000000000000"])
        expected = pd.Series([0, 1, 2, "100000000000000000000", 4])
        tm.assert_series_equal(result, expected)

        s = pd.Series([0, "100000000000000000000", "100000000000000000001"])
        result = s.replace(["100000000000000000000"], [1])
        expected = pd.Series([0, 1, "100000000000000000001"])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "ser, to_replace, exp",
        [
            ([1, 2, 3], {1: 2, 2: 3, 3: 4}, [2, 3, 4]),
            (["1", "2", "3"], {"1": "2", "2": "3", "3": "4"}, ["2", "3", "4"]),
        ],
    )
    def test_replace_commutative(self, ser, to_replace, exp):
        # GH 16051
        # DataFrame.replace() overwrites when values are non-numeric
        series = pd.Series(ser)

        expected = pd.Series(exp)
        result = series.replace(to_replace)

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "ser, exp", [([1, 2, 3], [1, True, 3]), (["x", 2, 3], ["x", True, 3])]
    )
    def test_replace_no_cast(self, ser, exp):
        # GH 9113
        # BUG: replace int64 dtype with bool coerces to int64
        series = pd.Series(ser)
        result = series.replace(2, True)
        expected = pd.Series(exp)

        tm.assert_series_equal(result, expected)

    def test_replace_invalid_to_replace(self):
        # GH 18634
        # API: replace() should raise an exception if invalid argument is given
        series = pd.Series(["a", "b", "c "])
        msg = (
            r"Expecting 'to_replace' to be either a scalar, array-like, "
            r"dict or None, got invalid type.*"
        )
        with pytest.raises(TypeError, match=msg):
            series.replace(lambda x: x.strip())

    @pytest.mark.parametrize("frame", [False, True])
    def test_replace_nonbool_regex(self, frame):
        obj = pd.Series(["a", "b", "c "])
        if frame:
            obj = obj.to_frame()

        msg = "'to_replace' must be 'None' if 'regex' is not a bool"
        with pytest.raises(ValueError, match=msg):
            obj.replace(to_replace=["a"], regex="foo")

    @pytest.mark.parametrize("frame", [False, True])
    def test_replace_empty_copy(self, frame):
        obj = pd.Series([], dtype=np.float64)
        if frame:
            obj = obj.to_frame()

        res = obj.replace(4, 5, inplace=True)
        assert res is None

        res = obj.replace(4, 5, inplace=False)
        tm.assert_equal(res, obj)
        assert res is not obj

    def test_replace_only_one_dictlike_arg(self, fixed_now_ts):
        # GH#33340
        ser = pd.Series([1, 2, "A", fixed_now_ts, True])
        to_replace = {0: 1, 2: "A"}
        value = "foo"
        msg = "Series.replace cannot use dict-like to_replace and non-None value"
        with pytest.raises(ValueError, match=msg):
            ser.replace(to_replace, value)

        to_replace = 1
        value = {0: "foo", 2: "bar"}
        msg = "Series.replace cannot use dict-value and non-None to_replace"
        with pytest.raises(ValueError, match=msg):
            ser.replace(to_replace, value)

    def test_replace_extension_other(self, frame_or_series):
        # https://github.com/pandas-dev/pandas/issues/34530
        obj = frame_or_series(pd.array([1, 2, 3], dtype="Int64"))
        result = obj.replace("", "")  # no exception
        # should not have changed dtype
        tm.assert_equal(obj, result)

    def _check_replace_with_method(self, ser: pd.Series):
        # Shared driver for the *_with_method tests below: verifies
        # replace(..., method="pad") on Series and DataFrame, both copies
        # and inplace.
        df = ser.to_frame()

        res = ser.replace(ser[1], method="pad")
        expected = pd.Series([ser[0], ser[0]] + list(ser[2:]), dtype=ser.dtype)
        tm.assert_series_equal(res, expected)

        res_df = df.replace(ser[1], method="pad")
        tm.assert_frame_equal(res_df, expected.to_frame())

        ser2 = ser.copy()
        res2 = ser2.replace(ser[1], method="pad", inplace=True)
        assert res2 is None
        tm.assert_series_equal(ser2, expected)

        res_df2 = df.replace(ser[1], method="pad", inplace=True)
        assert res_df2 is None
        tm.assert_frame_equal(df, expected.to_frame())

    def test_replace_ea_dtype_with_method(self, any_numeric_ea_dtype):
        arr = pd.array([1, 2, pd.NA, 4], dtype=any_numeric_ea_dtype)
        ser = pd.Series(arr)

        self._check_replace_with_method(ser)

    @pytest.mark.parametrize("as_categorical", [True, False])
    def test_replace_interval_with_method(self, as_categorical):
        # in particular interval that can't hold NA
        idx = pd.IntervalIndex.from_breaks(range(4))
        ser = pd.Series(idx)
        if as_categorical:
            ser = ser.astype("category")
        self._check_replace_with_method(ser)

    @pytest.mark.parametrize("as_period", [True, False])
    @pytest.mark.parametrize("as_categorical", [True, False])
    def test_replace_datetimelike_with_method(self, as_period, as_categorical):
        idx = pd.date_range("2016-01-01", periods=5, tz="US/Pacific")
        if as_period:
            idx = idx.tz_localize(None).to_period("D")
        ser = pd.Series(idx)
        ser.iloc[-2] = pd.NaT
        if as_categorical:
            ser = ser.astype("category")
        self._check_replace_with_method(ser)

    def test_replace_with_compiled_regex(self):
        # https://github.com/pandas-dev/pandas/issues/35680
        s = pd.Series(["a", "b", "c"])
        regex = re.compile("^a$")
        result = s.replace({regex: "z"}, regex=True)
        expected = pd.Series(["z", "b", "c"])
        tm.assert_series_equal(result, expected)

    def test_pandas_replace_na(self):
        # GH#43344
        ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA], dtype="string")
        regex_mapping = {
            "AA": "CC",
            "BB": "CC",
            "EE": "CC",
            "CC": "CC-REPL",
        }
        result = ser.replace(regex_mapping, regex=True)
        exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string")
        tm.assert_series_equal(result, exp)

    @pytest.mark.parametrize(
        "dtype, input_data, to_replace, expected_data",
        [
            ("bool", [True, False], {True: False}, [False, False]),
            ("int64", [1, 2], {1: 10, 2: 20}, [10, 20]),
            ("Int64", [1, 2], {1: 10, 2: 20}, [10, 20]),
            ("float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]),
            ("Float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]),
            ("string", ["one", "two"], {"one": "1", "two": "2"}, ["1", "2"]),
            (
                pd.IntervalDtype("int64"),
                IntervalArray([pd.Interval(1, 2), pd.Interval(2, 3)]),
                {pd.Interval(1, 2): pd.Interval(10, 20)},
                IntervalArray([pd.Interval(10, 20), pd.Interval(2, 3)]),
            ),
            (
                pd.IntervalDtype("float64"),
                IntervalArray([pd.Interval(1.0, 2.7), pd.Interval(2.8, 3.1)]),
                {pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8)},
                IntervalArray([pd.Interval(10.6, 20.8), pd.Interval(2.8, 3.1)]),
            ),
            (
                pd.PeriodDtype("M"),
                [pd.Period("2020-05", freq="M")],
                {pd.Period("2020-05", freq="M"): pd.Period("2020-06", freq="M")},
                [pd.Period("2020-06", freq="M")],
            ),
        ],
    )
    def test_replace_dtype(self, dtype, input_data, to_replace, expected_data):
        # GH#33484
        ser = pd.Series(input_data, dtype=dtype)
        result = ser.replace(to_replace)
        expected = pd.Series(expected_data, dtype=dtype)
        tm.assert_series_equal(result, expected)

    def test_replace_string_dtype(self):
        # GH#40732, GH#44940
        ser = pd.Series(["one", "two", np.nan], dtype="string")
        res = ser.replace({"one": "1", "two": "2"})
        expected = pd.Series(["1", "2", np.nan], dtype="string")
        tm.assert_series_equal(res, expected)

        # GH#31644
        ser2 = pd.Series(["A", np.nan], dtype="string")
        res2 = ser2.replace("A", "B")
        expected2 = pd.Series(["B", np.nan], dtype="string")
        tm.assert_series_equal(res2, expected2)

        ser3 = pd.Series(["A", "B"], dtype="string")
        res3 = ser3.replace("A", pd.NA)
        expected3 = pd.Series([pd.NA, "B"], dtype="string")
        tm.assert_series_equal(res3, expected3)

    def test_replace_string_dtype_list_to_replace(self):
        # GH#41215, GH#44940
        ser = pd.Series(["abc", "def"], dtype="string")
        res = ser.replace(["abc", "any other string"], "xyz")
        expected = pd.Series(["xyz", "def"], dtype="string")
        tm.assert_series_equal(res, expected)

    def test_replace_string_dtype_regex(self):
        # GH#31644
        ser = pd.Series(["A", "B"], dtype="string")
        res = ser.replace(r".", "C", regex=True)
        expected = pd.Series(["C", "C"], dtype="string")
        tm.assert_series_equal(res, expected)

    def test_replace_nullable_numeric(self):
        # GH#40732, GH#44940
        floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype())
        assert floats.replace({1.0: 9}).dtype == floats.dtype
        assert floats.replace(1.0, 9).dtype == floats.dtype
        assert floats.replace({1.0: 9.0}).dtype == floats.dtype
        assert floats.replace(1.0, 9.0).dtype == floats.dtype

        res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0])
        assert res.dtype == floats.dtype

        ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype())
        assert ints.replace({1: 9}).dtype == ints.dtype
        assert ints.replace(1, 9).dtype == ints.dtype
        assert ints.replace({1: 9.0}).dtype == ints.dtype
        assert ints.replace(1, 9.0).dtype == ints.dtype

        # nullable (for now) raises instead of casting
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace({1: 9.5})
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace(1, 9.5)

    @pytest.mark.parametrize("regex", [False, True])
    def test_replace_regex_dtype_series(self, regex):
        # GH-48644
        series = pd.Series(["0"])
        expected = pd.Series([1])
        result = series.replace(to_replace="0", value=1, regex=regex)
        tm.assert_series_equal(result, expected)

    def test_replace_different_int_types(self, any_int_numpy_dtype):
        # GH#45311
        labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype)

        maps = pd.Series([0, 2, 1], dtype=any_int_numpy_dtype)
        map_dict = {old: new for (old, new) in zip(maps.values, maps.index)}

        result = labs.replace(map_dict)
        expected = labs.replace({0: 0, 2: 1, 1: 2})
        tm.assert_series_equal(result, expected)
class TestSeriesConvertDtypes:
    """Tests for Series.convert_dtypes across all flag combinations."""

    # The answerdict has keys that have 4 tuples, corresponding to the arguments
    # infer_objects, convert_string, convert_integer, convert_boolean
    # This allows all 16 possible combinations to be tested. Since common
    # combinations expect the same answer, this provides an easy way to list
    # all the possibilities
    @pytest.mark.parametrize(
        "data, maindtype, answerdict",
        [
            (
                [1, 2, 3],
                np.dtype("int32"),
                {
                    ((True, False), (True, False), (True,), (True, False)): "Int32",
                    ((True, False), (True, False), (False,), (True, False)): np.dtype(
                        "int32"
                    ),
                },
            ),
            (
                [1, 2, 3],
                np.dtype("int64"),
                {
                    ((True, False), (True, False), (True,), (True, False)): "Int64",
                    ((True, False), (True, False), (False,), (True, False)): np.dtype(
                        "int64"
                    ),
                },
            ),
            (
                ["x", "y", "z"],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True,),
                        (True, False),
                        (True, False),
                    ): pd.StringDtype(),
                    ((True, False), (False,), (True, False), (True, False)): np.dtype(
                        "O"
                    ),
                },
            ),
            (
                [True, False, np.nan],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True,),
                    ): pd.BooleanDtype(),
                    ((True, False), (True, False), (True, False), (False,)): np.dtype(
                        "O"
                    ),
                },
            ),
            (
                ["h", "i", np.nan],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True,),
                        (True, False),
                        (True, False),
                    ): pd.StringDtype(),
                    ((True, False), (False,), (True, False), (True, False)): np.dtype(
                        "O"
                    ),
                },
            ),
            (  # GH32117
                ["h", "i", 1],
                np.dtype("O"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): np.dtype("O"),
                },
            ),
            (
                [10, np.nan, 20],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True,), (True, False)): "Int64",
                    ((True, False), (True, False), (False,), (True, False)): np.dtype(
                        "float"
                    ),
                },
            ),
            (
                [np.nan, 100.5, 200],
                np.dtype("float"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): np.dtype("float"),
                },
            ),
            (
                [3, 4, 5],
                "Int8",
                {
                    ((True, False), (True, False), (True, False), (True, False)): "Int8"
                },
            ),
            (
                [[1, 2], [3, 4], [5]],
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): np.dtype("O"),
                },
            ),
            (
                [4, 5, 6],
                np.dtype("uint32"),
                {
                    ((True, False), (True, False), (True,), (True, False)): "UInt32",
                    ((True, False), (True, False), (False,), (True, False)): np.dtype(
                        "uint32"
                    ),
                },
            ),
            (
                [-10, 12, 13],
                np.dtype("i1"),
                {
                    ((True, False), (True, False), (True,), (True, False)): "Int8",
                    ((True, False), (True, False), (False,), (True, False)): np.dtype(
                        "i1"
                    ),
                },
            ),
            (
                [1, 2.0],
                object,
                {
                    ((True,), (True, False), (True,), (True, False)): "Int64",
                    ((True,), (True, False), (False,), (True, False)): np.dtype(
                        "float"
                    ),
                    ((False,), (True, False), (True, False), (True, False)): np.dtype(
                        "object"
                    ),
                },
            ),
            (
                [1, 2.5],
                object,
                {
                    ((True,), (True, False), (True, False), (True, False)): np.dtype(
                        "float"
                    ),
                    ((False,), (True, False), (True, False), (True, False)): np.dtype(
                        "object"
                    ),
                },
            ),
            (
                ["a", "b"],
                pd.CategoricalDtype(),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): pd.CategoricalDtype(),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                pd.DatetimeTZDtype(tz="UTC"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): pd.DatetimeTZDtype(tz="UTC"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                "datetime64[ns]",
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): np.dtype("datetime64[ns]"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                object,
                {
                    (
                        (True,),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): np.dtype("datetime64[ns]"),
                    (
                        (False,),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): np.dtype("O"),
                },
            ),
            (
                pd.period_range("1/1/2011", freq="M", periods=3),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): pd.PeriodDtype("M"),
                },
            ),
            (
                pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): pd.IntervalDtype("int64"),
                },
            ),
        ],
    )
    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
        else:
            series = pd.Series(data)
        # Expand each answerdict key (a 4-tuple of allowed flag values) into
        # every concrete flag combination it covers.
        answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)}

        ns = series.convert_dtypes(*params)
        expected_dtype = answers[tuple(params)]
        expected = pd.Series(series.values, dtype=expected_dtype)
        tm.assert_series_equal(ns, expected)

        # Test that it is a copy
        copy = series.copy(deep=True)
        if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]:
            msg = "Cannot set float NaN to integer-backed IntervalArray"
            with pytest.raises(ValueError, match=msg):
                ns[ns.notna()] = np.nan
        else:
            ns[ns.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, copy)

    def test_convert_string_dtype(self):
        # https://github.com/pandas-dev/pandas/issues/31731 -> converting columns
        # that are already string dtype
        df = pd.DataFrame(
            {"A": ["a", "b", pd.NA], "B": ["ä", "ö", "ü"]}, dtype="string"
        )
        result = df.convert_dtypes()
        tm.assert_frame_equal(df, result)

    def test_convert_bool_dtype(self):
        # GH32287
        df = pd.DataFrame({"A": pd.array([True])})
        tm.assert_frame_equal(df, df.convert_dtypes())
import numpy as np import pytest import pandas as pd from pandas import ( DatetimeIndex, Index, ) import pandas._testing as tm dtlike_dtypes = [ np.dtype("timedelta64[ns]"), np.dtype("datetime64[ns]"), pd.DatetimeTZDtype("ns", "Asia/Tokyo"), pd.PeriodDtype("ns"), ] @pytest.mark.parametrize("ldtype", dtlike_dtypes) @pytest.mark.parametrize("rdtype", dtlike_dtypes) def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): vals = np.tile(3600 * 10**9 * np.arange(3), 2) def construct(dtype): if dtype is dtlike_dtypes[-1]: # PeriodArray will try to cast ints to strings return DatetimeIndex(vals).astype(dtype) return Index(vals, dtype=dtype) left = construct(ldtype)
def to_pandas_dtype(self):
    """Return the pandas extension dtype equivalent to this type.

    Maps this object's period frequency (``self.freq``) to a
    ``pandas.PeriodDtype``.
    """
    # Imported lazily so pandas is only required when this conversion is used.
    import pandas as pd

    return pd.PeriodDtype(freq=self.freq)
pd.DatetimeTZDtype(tz="UTC"), {}, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), "datetime64[ns]", np.dtype("datetime64[ns]"), {}, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), object, np.dtype("datetime64[ns]"), {("infer_objects", False): np.dtype("object")}, ), (pd.period_range("1/1/2011", freq="M", periods=3), None, pd.PeriodDtype("M"), {}), ( pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), None, pd.IntervalDtype("int64", "right"), {}, ), ] class TestSeriesConvertDtypes: @pytest.mark.parametrize( "data, maindtype, expected_default, expected_other", test_cases, ) @pytest.mark.parametrize("params", product(*[(True, False)] * 5))
res = df.reset_index() tm.assert_frame_equal(res, expected) # roundtrip res = expected.set_index(["level_0", "level_1"]).reset_index() tm.assert_frame_equal(res, expected) @pytest.mark.parametrize( "array, dtype", [ (["a", "b"], object), ( pd.period_range("12-1-2000", periods=2, freq="Q-DEC"), pd.PeriodDtype(freq="Q-DEC"), ), ], ) def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): # GH 19602 - Preserve dtype on empty DataFrame with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) def test_reset_index_empty_frame_with_datetime64_multiindex(): # https://github.com/pandas-dev/pandas/issues/35606 idx = MultiIndex( levels=[[Timestamp("2020-07-20 00:00:00")], [3, 4]],
def __post_init__(self): object.__setattr__(self, "type", pd.PeriodDtype(freq=self.freq))
def generate_type_mapper(
    pd_boolean=None,
    pd_integer=None,
    pd_string=None,
    pd_date_type=None,
    pd_timestamp_type=None,
):
    """Specifies the pyarrow data types mapping to corresponding Pandas data types.

    Args:
        pd_boolean: if not None, use the new Pandas bool type. Defaults to None.
        pd_integer: if not None, use the new Pandas nullable integer type
            rather than defaulting to floats. Defaults to None.
        pd_string: if not None, use the new Pandas str type. Defaults to None.
        pd_date_type: if ``"pd_period"``, map ``date64`` to a period dtype.
            Defaults to None.
        pd_timestamp_type: if ``"pd_period"``, map timestamps to period dtypes
            of matching resolution. Defaults to None.

    Returns:
        The ``.get`` method of the pyarrow-to-pandas mapping (suitable for the
        ``types_mapper`` argument of ``pyarrow.Table.to_pandas``), or None if
        no mapping was requested.
    """
    # NOTE: named "mapping" (not "tm") to avoid shadowing the conventional
    # `pandas._testing as tm` alias; built in place rather than re-copying
    # the whole dict on every branch.
    mapping = {}
    if pd_boolean:
        mapping[pa.bool_()] = pd.BooleanDtype()
    if pd_string:
        mapping[pa.string()] = pd.StringDtype()
    if pd_integer:
        # Every integer width (signed and unsigned) collapses to the
        # nullable 64-bit extension type.
        mapping.update(
            {
                pa.int8(): pd.Int64Dtype(),
                pa.int16(): pd.Int64Dtype(),
                pa.int32(): pd.Int64Dtype(),
                pa.int64(): pd.Int64Dtype(),
                pa.uint8(): pd.Int64Dtype(),
                pa.uint16(): pd.Int64Dtype(),
                pa.uint32(): pd.Int64Dtype(),
                pa.uint64(): pd.Int64Dtype(),
            }
        )
    else:
        # No brackets for either keys or values in this dictionary
        # This lets types_mapper understand the numpy data type
        mapping.update(
            {
                pa.int8: np.float64,
                pa.int16: np.float64,
                pa.int32: np.float64,
                pa.int64: np.float64,
                pa.uint8: np.float64,
                pa.uint16: np.float64,
                pa.uint32: np.float64,
                pa.uint64: np.float64,
            }
        )
    if pd_date_type == "pd_period":
        mapping[pa.date64()] = pd.PeriodDtype("ms")
    if pd_timestamp_type == "pd_period":
        mapping.update(
            {
                pa.timestamp("s"): pd.PeriodDtype("s"),
                pa.timestamp("ms"): pd.PeriodDtype("ms"),
                pa.timestamp("us"): pd.PeriodDtype("us"),
                pa.timestamp("ns"): pd.PeriodDtype("ns"),
            }
        )
    if mapping:
        return mapping.get
    return None
( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), "datetime64[ns]", np.dtype("datetime64[ns]"), {}, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), object, np.dtype("datetime64[ns]"), { ("infer_objects", False): np.dtype("object") }, ), (pd.period_range("1/1/2011", freq="M", periods=3), None, pd.PeriodDtype("M"), {}), ( pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), None, pd.IntervalDtype("int64", "right"), {}, ), ] class TestSeriesConvertDtypes: @pytest.mark.parametrize( "data, maindtype, expected_default, expected_other", test_cases, )
pa.Timestamp: "datetime64[ns]", pd.DatetimeTZDtype(tz="CET"): "datetime64[ns, CET]", pandas_engine.DateTime: "datetime64[ns]", pandas_engine.DateTime(unit="ns", tz="CET"): "datetime64[ns, CET]", # type: ignore } timedelta_dtypes = { datetime.timedelta: "timedelta64", datetime.timedelta: "timedelta64", np.timedelta64: "timedelta64", pd.Timedelta: "timedelta64", pa.Timedelta: "timedelta64", } period_dtypes = {pd.PeriodDtype(freq="D"): "period[D]"} # Series.astype does not accept a string alias for SparseDtype. sparse_dtypes = { pd.SparseDtype: pd.SparseDtype(), pd.SparseDtype(np.float64): pd.SparseDtype(np.float64), } interval_dtypes = {pd.IntervalDtype(subtype=np.int64): "interval[int64]"} dtype_fixtures: List[Tuple[Dict, List]] = [ (int_dtypes, [-1]), (nullable_int_dtypes, [-1, None]), (uint_dtypes, [1]), (nullable_uint_dtypes, [1, None]), (float_dtypes, [1.0]), (complex_dtypes, [complex(1)]), (boolean_dtypes, [True, False]),
class TestSeriesConvertDtypes: # The answerdict has keys that have 4 tuples, corresponding to the arguments # infer_objects, convert_string, convert_integer, convert_boolean # This allows all 16 possible combinations to be tested. Since common # combinations expect the same answer, this provides an easy way to list # all the possibilities @pytest.mark.parametrize( "data, maindtype, answerdict", [ ( [1, 2, 3], np.dtype("int32"), { ((True, False), (True, False), (True, ), (True, False)): "Int32", ((True, False), (True, False), (False, ), (True, False)): np.dtype("int32"), }, ), ( [1, 2, 3], np.dtype("int64"), { ((True, False), (True, False), (True, ), (True, False)): "Int64", ((True, False), (True, False), (False, ), (True, False)): np.dtype("int64"), }, ), ( ["x", "y", "z"], np.dtype("O"), { ( (True, False), (True, ), (True, False), (True, False), ): pd.StringDtype(), ((True, False), (False, ), (True, False), (True, False)): np.dtype("O"), }, ), ( [True, False, np.nan], np.dtype("O"), { ( (True, False), (True, False), (True, False), (True, ), ): pd.BooleanDtype(), ((True, False), (True, False), (True, False), (False, )): np.dtype("O"), }, ), ( ["h", "i", np.nan], np.dtype("O"), { ( (True, False), (True, ), (True, False), (True, False), ): pd.StringDtype(), ((True, False), (False, ), (True, False), (True, False)): np.dtype("O"), }, ), ( [10, np.nan, 20], np.dtype("float"), { ((True, False), (True, False), (True, ), (True, False)): "Int64", ((True, False), (True, False), (False, ), (True, False)): np.dtype("float"), }, ), ( [np.nan, 100.5, 200], np.dtype("float"), { ( (True, False), (True, False), (True, False), (True, False), ): np.dtype("float"), }, ), ( [3, 4, 5], "Int8", { ((True, False), (True, False), (True, False), (True, False)): "Int8" }, ), ( [[1, 2], [3, 4], [5]], None, { ( (True, False), (True, False), (True, False), (True, False), ): np.dtype("O"), }, ), ( [4, 5, 6], np.dtype("uint32"), { ((True, False), (True, False), (True, ), (True, False)): "UInt32", 
((True, False), (True, False), (False, ), (True, False)): np.dtype("uint32"), }, ), ( [-10, 12, 13], np.dtype("i1"), { ((True, False), (True, False), (True, ), (True, False)): "Int8", ((True, False), (True, False), (False, ), (True, False)): np.dtype("i1"), }, ), ( [1, 2.0], object, { ((True, False), (True, False), (True, ), (True, False)): "Int64", ((True, ), (True, False), (False, ), (True, False)): np.dtype("float"), ((False, ), (True, False), (False, ), (True, False)): np.dtype("object"), }, ), ( ["a", "b"], pd.CategoricalDtype(), { ( (True, False), (True, False), (True, False), (True, False), ): pd.CategoricalDtype(), }, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), pd.DatetimeTZDtype(tz="UTC"), { ( (True, False), (True, False), (True, False), (True, False), ): pd.DatetimeTZDtype(tz="UTC"), }, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), "datetime64[ns]", { ( (True, False), (True, False), (True, False), (True, False), ): np.dtype("datetime64[ns]"), }, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), object, { ( (True, ), (True, False), (True, False), (True, False), ): np.dtype("datetime64[ns]"), ( (False, ), (True, False), (True, False), (True, False), ): np.dtype("O"), }, ), ( pd.period_range("1/1/2011", freq="M", periods=3), None, { ( (True, False), (True, False), (True, False), (True, False), ): pd.PeriodDtype("M"), }, ), ( pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), None, { ( (True, False), (True, False), (True, False), (True, False), ): pd.IntervalDtype("int64"), }, ), ], ) @pytest.mark.parametrize("params", product(*[(True, False)] * 4)) def test_convert_dtypes(self, data, maindtype, params, answerdict): if maindtype is not None: series = pd.Series(data, dtype=maindtype) else: series = pd.Series(data) answers = { k: a for (kk, a) in answerdict.items() for k in product(*kk) } ns = series.convert_dtypes(*params) expected_dtype = answers[tuple(params)] expected = pd.Series(series.values, 
dtype=expected_dtype) tm.assert_series_equal(ns, expected) # Test that it is a copy copy = series.copy(deep=True) ns[ns.notna()] = np.nan # Make sure original not changed tm.assert_series_equal(series, copy)
np.array([1.0, 2.0], dtype="float64"), None, FloatingArray._from_sequence([1.0, 2.0]), ), # String alias passes through to NumPy ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), # Period alias ( [pd.Period("2000", "D"), pd.Period("2001", "D")], "Period[D]", period_array(["2000", "2001"], freq="D"), ), # Period dtype ( [pd.Period("2000", "D")], pd.PeriodDtype("D"), period_array(["2000"], freq="D"), ), # Datetime (naive) ( [1, 2], np.dtype("datetime64[ns]"), DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), ), ( np.array([1, 2], dtype="datetime64[ns]"), None, DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), ), ( pd.DatetimeIndex(["2000", "2001"]),
([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), ([1, 2], np.dtype('float32'), PandasArray(np.array([1., 2.0], dtype=np.dtype('float32')))), (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), # String alias passes through to NumPy ([1, 2], 'float32', PandasArray(np.array([1, 2], dtype='float32'))), # Period alias ([pd.Period('2000', 'D'), pd.Period('2001', 'D') ], 'Period[D]', period_array(['2000', '2001'], freq='D')), # Period dtype ([pd.Period('2000', 'D') ], pd.PeriodDtype('D'), period_array(['2000'], freq='D')), # Datetime (naive) ([1, 2], np.dtype('datetime64[ns]'), pd.arrays.DatetimeArray._from_sequence( np.array([1, 2], dtype='datetime64[ns]'))), (np.array([1, 2], dtype='datetime64[ns]'), None, pd.arrays.DatetimeArray._from_sequence( np.array([1, 2], dtype='datetime64[ns]'))), (pd.DatetimeIndex(['2000', '2001']), np.dtype('datetime64[ns]'), pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), (pd.DatetimeIndex(['2000', '2001']), None, pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), (['2000', '2001'], np.dtype('datetime64[ns]'), pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])),
class TestDataFrameAppend: def test_append_empty_list(self): # GH 28769 df = DataFrame() result = df.append([]) expected = df tm.assert_frame_equal(result, expected) assert result is not df df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) result = df.append([]) expected = df tm.assert_frame_equal(result, expected) assert result is not df # .append() should return a new object def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) series = df.loc[4] msg = "Indexes have overlapping values" with pytest.raises(ValueError, match=msg): df.append(series, verify_integrity=True) series.name = None msg = "Can only append a Series if ignore_index=True" with pytest.raises(TypeError, match=msg): df.append(series, verify_integrity=True) result = df.append(series[::-1], ignore_index=True) expected = df.append(DataFrame({ 0: series[::-1] }, index=df.columns).T, ignore_index=True) tm.assert_frame_equal(result, expected) # dict result = df.append(series.to_dict(), ignore_index=True) tm.assert_frame_equal(result, expected) result = df.append(series[::-1][:3], ignore_index=True) expected = df.append(DataFrame({ 0: series[::-1][:3] }).T, ignore_index=True, sort=True) tm.assert_frame_equal(result, expected.loc[:, result.columns]) # can append when name set row = df.loc[4] row.name = 5 result = df.append(row) expected = df.append(df[-1:], ignore_index=True) tm.assert_frame_equal(result, expected) def test_append_list_of_series_dicts(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) dicts = [x.to_dict() for idx, x in df.iterrows()] result = df.append(dicts, ignore_index=True) expected = df.append(df, ignore_index=True) tm.assert_frame_equal(result, expected) # different columns dicts = [ { "foo": 1, "bar": 2, "baz": 3, "peekaboo": 4 }, { "foo": 5, "bar": 6, "baz": 7, "peekaboo": 8 }, ] result = df.append(dicts, ignore_index=True, sort=True) expected = 
df.append(DataFrame(dicts), ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) def test_append_missing_cols(self): # GH22252 # exercise the conditional branch in append method where the data # to be appended is a list and does not contain all columns that are in # the target DataFrame df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) dicts = [{"foo": 9}, {"bar": 10}] with tm.assert_produces_warning(None): result = df.append(dicts, ignore_index=True, sort=True) expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) def test_append_empty_dataframe(self): # Empty df append empty df df1 = DataFrame() df2 = DataFrame() result = df1.append(df2) expected = df1.copy() tm.assert_frame_equal(result, expected) # Non-empty df append empty df df1 = DataFrame(np.random.randn(5, 2)) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() tm.assert_frame_equal(result, expected) # Empty df with columns append empty df df1 = DataFrame(columns=["bar", "foo"]) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() tm.assert_frame_equal(result, expected) # Non-Empty df with columns append empty df df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"]) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() tm.assert_frame_equal(result, expected) def test_append_dtypes(self): # GH 5754 # row appends of different dtypes (so need to do by-item) # can sometimes infer the correct type df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5)) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) df2 = DataFrame({"bar": "foo"}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]}) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) df2 = 
DataFrame({"bar": np.nan}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) result = df1.append(df2) expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": np.nan}, index=range(1)) df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame( {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object) result = df1.append(df2) expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"]) def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): # GH 30238 tz = tz_naive_fixture df = pd.DataFrame([pd.Timestamp(timestamp, tz=tz)]) result = df.append(df.iloc[0]).iloc[-1] expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "data, dtype", [ ([1], pd.Int64Dtype()), ([1], pd.CategoricalDtype()), ([pd.Interval(left=0, right=5)], pd.IntervalDtype()), ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")), ([1], pd.SparseDtype()), ], ) def test_other_dtypes(self, data, dtype): df = pd.DataFrame(data, dtype=dtype) result = df.append(df.iloc[0]).iloc[-1] expected = pd.Series(data, name=0, dtype=dtype) tm.assert_series_equal(result, expected)
# Basic NumPy defaults. ([1, 2], None, PandasArray(np.array([1, 2]))), ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), ([1, 2], np.dtype('float32'), PandasArray(np.array([1., 2.0], dtype=np.dtype('float32')))), (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), # String alias passes through to NumPy ([1, 2], 'float32', PandasArray(np.array([1, 2], dtype='float32'))), # Period alias ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]', period_array(['2000', '2001'], freq='D')), # Period dtype ([pd.Period('2000', 'D')], pd.PeriodDtype('D'), period_array(['2000'], freq='D')), # Datetime (naive) ([1, 2], np.dtype('datetime64[ns]'), pd.arrays.DatetimeArray._from_sequence( np.array([1, 2], dtype='datetime64[ns]'))), (np.array([1, 2], dtype='datetime64[ns]'), None, pd.arrays.DatetimeArray._from_sequence( np.array([1, 2], dtype='datetime64[ns]'))), (pd.DatetimeIndex(['2000', '2001']), np.dtype('datetime64[ns]'), pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), (pd.DatetimeIndex(['2000', '2001']), None,
class TestDataFrameAppend: @pytest.mark.filterwarnings( "ignore:.*append method is deprecated.*:FutureWarning") def test_append_multiindex(self, multiindex_dataframe_random_data, frame_or_series): obj = multiindex_dataframe_random_data obj = tm.get_obj(obj, frame_or_series) a = obj[:5] b = obj[5:] result = a.append(b) tm.assert_equal(result, obj) def test_append_empty_list(self): # GH 28769 df = DataFrame() result = df._append([]) expected = df tm.assert_frame_equal(result, expected) assert result is not df df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) result = df._append([]) expected = df tm.assert_frame_equal(result, expected) assert result is not df # ._append() should return a new object def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) series = df.loc[4] msg = "Indexes have overlapping values" with pytest.raises(ValueError, match=msg): df._append(series, verify_integrity=True) series.name = None msg = "Can only append a Series if ignore_index=True" with pytest.raises(TypeError, match=msg): df._append(series, verify_integrity=True) result = df._append(series[::-1], ignore_index=True) expected = df._append(DataFrame({ 0: series[::-1] }, index=df.columns).T, ignore_index=True) tm.assert_frame_equal(result, expected) # dict result = df._append(series.to_dict(), ignore_index=True) tm.assert_frame_equal(result, expected) result = df._append(series[::-1][:3], ignore_index=True) expected = df._append(DataFrame({ 0: series[::-1][:3] }).T, ignore_index=True, sort=True) tm.assert_frame_equal(result, expected.loc[:, result.columns]) msg = "Can only append a dict if ignore_index=True" with pytest.raises(TypeError, match=msg): df._append(series.to_dict()) # can append when name set row = df.loc[4] row.name = 5 result = df._append(row) expected = df._append(df[-1:], ignore_index=True) tm.assert_frame_equal(result, expected) def test_append_list_of_series_dicts(self): df = 
DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) dicts = [x.to_dict() for idx, x in df.iterrows()] result = df._append(dicts, ignore_index=True) expected = df._append(df, ignore_index=True) tm.assert_frame_equal(result, expected) # different columns dicts = [ { "foo": 1, "bar": 2, "baz": 3, "peekaboo": 4 }, { "foo": 5, "bar": 6, "baz": 7, "peekaboo": 8 }, ] result = df._append(dicts, ignore_index=True, sort=True) expected = df._append(DataFrame(dicts), ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) def test_append_list_retain_index_name(self): df = DataFrame([[1, 2], [3, 4]], index=pd.Index(["a", "b"], name="keepthisname")) serc = Series([5, 6], name="c") expected = DataFrame( [[1, 2], [3, 4], [5, 6]], index=pd.Index(["a", "b", "c"], name="keepthisname"), ) # append series result = df._append(serc) tm.assert_frame_equal(result, expected) # append list of series result = df._append([serc]) tm.assert_frame_equal(result, expected) def test_append_missing_cols(self): # GH22252 # exercise the conditional branch in append method where the data # to be appended is a list and does not contain all columns that are in # the target DataFrame df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) dicts = [{"foo": 9}, {"bar": 10}] result = df._append(dicts, ignore_index=True, sort=True) expected = df._append(DataFrame(dicts), ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) def test_append_empty_dataframe(self): # Empty df append empty df df1 = DataFrame() df2 = DataFrame() result = df1._append(df2) expected = df1.copy() tm.assert_frame_equal(result, expected) # Non-empty df append empty df df1 = DataFrame(np.random.randn(5, 2)) df2 = DataFrame() result = df1._append(df2) expected = df1.copy() tm.assert_frame_equal(result, expected) # Empty df with columns append empty df df1 = DataFrame(columns=["bar", "foo"]) df2 = DataFrame() result = df1._append(df2) expected = df1.copy() 
tm.assert_frame_equal(result, expected) # Non-Empty df with columns append empty df df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"]) df2 = DataFrame() result = df1._append(df2) expected = df1.copy() tm.assert_frame_equal(result, expected) def test_append_dtypes(self, using_array_manager): # GH 5754 # row appends of different dtypes (so need to do by-item) # can sometimes infer the correct type df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5)) df2 = DataFrame() result = df1._append(df2) expected = df1.copy() tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) df2 = DataFrame({"bar": "foo"}, index=range(1, 2)) result = df1._append(df2) expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]}) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) df2 = DataFrame({"bar": np.nan}, index=range(1, 2)) result = df1._append(df2) expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}) if using_array_manager: # TODO(ArrayManager) decide on exact casting rules in concat # With ArrayManager, all-NaN float is not ignored expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) result = df1._append(df2) expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}) if using_array_manager: # With ArrayManager, all-NaN float is not ignored expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": np.nan}, index=range(1)) df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) result = df1._append(df2) expected = DataFrame( {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}) if using_array_manager: # With ArrayManager, all-NaN float is not ignored expected = expected.astype(object) 
tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object) result = df1._append(df2) expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"]) def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): # GH 30238 tz = tz_naive_fixture df = DataFrame([Timestamp(timestamp, tz=tz)]) result = df._append(df.iloc[0]).iloc[-1] expected = Series(Timestamp(timestamp, tz=tz), name=0) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "data, dtype", [ ([1], pd.Int64Dtype()), ([1], pd.CategoricalDtype()), ([pd.Interval(left=0, right=5)], pd.IntervalDtype()), ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")), ([1], pd.SparseDtype()), ], ) def test_other_dtypes(self, data, dtype, using_array_manager): df = DataFrame(data, dtype=dtype) warn = None if using_array_manager and isinstance(dtype, pd.SparseDtype): warn = FutureWarning with tm.assert_produces_warning(warn, match="astype from SparseDtype"): result = df._append(df.iloc[0]).iloc[-1] expected = Series(data, name=0, dtype=dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) def test_append_numpy_bug_1681(self, dtype): # another datetime64 bug if dtype == "datetime64[ns]": index = date_range("2011/1/1", "2012/1/1", freq="W-FRI") else: index = timedelta_range("1 days", "10 days", freq="2D") df = DataFrame() other = DataFrame({"A": "foo", "B": index}, index=index) result = df._append(other) assert (result["B"] == index).all() @pytest.mark.filterwarnings("ignore:The values in the array:RuntimeWarning" ) def test_multiindex_column_append_multiple(self): # GH 29699 df = DataFrame( [[1, 11], [2, 12], [3, 13]], columns=pd.MultiIndex.from_tuples([("multi", "col1"), ("multi", "col2")], 
names=["level1", None]), ) df2 = df.copy() for i in range(1, 10): df[i, "colA"] = 10 df = df._append(df2, ignore_index=True) result = df["multi"] expected = DataFrame({ "col1": [1, 2, 3] * (i + 1), "col2": [11, 12, 13] * (i + 1) }) tm.assert_frame_equal(result, expected) def test_append_raises_future_warning(self): # GH#35407 df1 = DataFrame([[1, 2], [3, 4]]) df2 = DataFrame([[5, 6], [7, 8]]) with tm.assert_produces_warning(FutureWarning): df1.append(df2)
except (ImportError, TypeError, ValueError): pass numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32, np.float64, np.int16, np.int8, np.uint16, np.uint8] datetime_dtypes = [np.datetime64, np.timedelta64] string_dtypes = [np.object] try: extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype, pd.Int32Dtype, pd.Int64Dtype, pd.UInt8Dtype, pd.UInt16Dtype, pd.UInt32Dtype, pd.UInt64Dtype, pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype('ns', 'UTC'), pd.PeriodDtype('D')] except AttributeError: extension_dtypes = [] def setup(*args, **kwargs): # This function just needs to be imported into each benchmark file to # set up the random seed before each function. # http://asv.readthedocs.io/en/latest/writing_benchmarks.html np.random.seed(1234) class BaseIO(object): """ Base class for IO benchmarks """