def test_interval_array_error_and_warning():
    """
    Check the ``closed`` -> ``inclusive`` keyword transition on IntervalArray.

    Passing both keywords is expected to raise a TypeError (with a
    FutureWarning produced by the same call); passing only ``closed`` is
    expected to produce a FutureWarning carrying the deprecation message.
    """
    # GH 40245

    def build_intervals():
        # A fresh list for every constructor call, as in the inline literals.
        return [Interval(0, 1), Interval(1, 5)]

    both_keywords_msg = "Can only specify 'closed' or 'inclusive', not both."
    with pytest.raises(
        TypeError, match=both_keywords_msg
    ), tm.assert_produces_warning(FutureWarning):
        IntervalArray(build_intervals(), closed="both", inclusive="both")

    # NOTE(review): the doubled apostrophe after 'closed' is part of the
    # pattern being matched; presumably it mirrors the emitted warning text.
    deprecation_msg = "the 'closed'' keyword is deprecated, use 'inclusive' instead."
    with tm.assert_produces_warning(FutureWarning, match=deprecation_msg):
        IntervalArray(build_intervals(), closed="both")
def test_interval_array_error_and_warning():
    """
    Check how IntervalArray treats the deprecated ``closed`` keyword.

    Supplying ``closed`` together with ``inclusive`` is expected to raise a
    ValueError; supplying ``closed`` alone is expected to produce a
    FutureWarning pointing at ``inclusive``.
    """
    # GH 40245

    def build_intervals():
        # A fresh list for every constructor call, as in the inline literals.
        return [Interval(0, 1), Interval(1, 5)]

    conflict_msg = (
        "Deprecated argument `closed` cannot "
        "be passed if argument `inclusive` is not None"
    )
    with pytest.raises(ValueError, match=conflict_msg):
        IntervalArray(build_intervals(), closed="both", inclusive="both")

    deprecation_msg = "Argument `closed` is deprecated in favor of `inclusive`"
    # The stack level of the warning is deliberately not checked here.
    warning_check = tm.assert_produces_warning(
        FutureWarning, match=deprecation_msg, check_stacklevel=False
    )
    with warning_check:
        IntervalArray(build_intervals(), closed="both")
def test_dtype_closed_mismatch():
    """
    A ``closed`` keyword that disagrees with ``dtype.closed`` must raise.

    The same ValueError is expected from both the IntervalIndex and the
    IntervalArray constructors.
    """
    # GH#38394 closed specified in both dtype and IntervalIndex constructor
    left_closed_dtype = IntervalDtype(np.int64, "left")
    expected_error = "closed keyword does not match dtype.closed"

    # Index first, then array: same order as the original checks.
    for constructor in (IntervalIndex, IntervalArray):
        with pytest.raises(ValueError, match=expected_error):
            constructor([], dtype=left_closed_dtype, closed="neither")
def test_dtype_inclusive_mismatch():
    """
    An ``inclusive`` keyword that disagrees with the dtype must raise.

    The same ValueError is expected from both the IntervalIndex and the
    IntervalArray constructors.
    """
    # GH#38394
    left_dtype = IntervalDtype(np.int64, "left")
    expected_error = "inclusive keyword does not match dtype.inclusive"

    # Index first, then array: same order as the original checks.
    for constructor in (IntervalIndex, IntervalArray):
        with pytest.raises(ValueError, match=expected_error):
            constructor([], dtype=left_dtype, inclusive="neither")
def test_setitem_empty_indexer(self, data, box_in_series):
    """
    Assigning through an empty integer indexer must leave ``data`` unchanged.

    ``data`` is optionally wrapped in a Series first. When the incoming
    object is exactly an IntervalArray, the assigned value is an empty
    IntervalArray; otherwise an empty list is assigned.
    """
    # Decide on the array class before ``data`` may be rebound to a Series.
    is_interval_array = type(data) == IntervalArray
    if box_in_series:
        data = pd.Series(data)
    snapshot = data.copy()

    empty_value = IntervalArray([], "right") if is_interval_array else []
    no_positions = np.array([], dtype=int)
    data[no_positions] = empty_value

    self.assert_equal(data, snapshot)
def array(
    data: Union[Sequence[object], AnyArrayLike],
    dtype: Optional[Dtype] = None,
    copy: bool = True,
) -> "ExtensionArray":
    """
    Create an array.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =====================================
        Scalar Type                    Array Type
        ============================== =====================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        :class:`int`                   :class:`pandas.arrays.IntegerArray`
        :class:`str`                   :class:`pandas.arrays.StringArray`
        :class:`bool`                  :class:`pandas.arrays.BooleanArray`
        ============================== =====================================

        For all other cases, NumPy's usual inference rules will be used.

        .. versionchanged:: 1.0.0

           Pandas infers nullable-integer dtype for integer data,
           string dtype for string data, and nullable-boolean dtype
           for boolean data.

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series.array : Extract the array stored within a Series.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.PandasArray` backed by a NumPy
    array.

    >>> pd.array(['a', 'b'], dtype=str)
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Finally, Pandas has arrays that mostly overlap with NumPy

      * :class:`arrays.DatetimeArray`
      * :class:`arrays.TimedeltaArray`

    When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
    passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
    rather than a ``PandasArray``. This is for symmetry with the case of
    timezone-aware data, which NumPy does not natively support.

    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
    <DatetimeArray>
    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
    Length: 2, dtype: datetime64[ns]

    >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
    <TimedeltaArray>
    ['0 days 01:00:00', '0 days 02:00:00']
    Length: 2, dtype: timedelta64[ns]

    Examples
    --------
    If a dtype is not specified, pandas will infer the best dtype from the values.
    See the description of `dtype` for the types pandas infers for.

    >>> pd.array([1, 2])
    <IntegerArray>
    [1, 2]
    Length: 2, dtype: Int64

    >>> pd.array([1, 2, np.nan])
    <IntegerArray>
    [1, 2, <NA>]
    Length: 3, dtype: Int64

    >>> pd.array(["a", None, "c"])
    <StringArray>
    ['a', <NA>, 'c']
    Length: 3, dtype: string

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    [a, b, a]
    Categories (2, object): [a, b]

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    [a, b, a]
    Categories (3, object): [a < b < c]

    If pandas does not infer a dedicated extension type a
    :class:`arrays.PandasArray` is returned.

    >>> pd.array([1.1, 2.2])
    <PandasArray>
    [1.1, 2.2]
    Length: 2, dtype: float64

    As mentioned in the "Notes" section, new extension types may be added
    in the future (by pandas or 3rd party libraries), causing the return
    value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype`
    as a NumPy dtype if you need to ensure there's no future change in
    behavior.

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int32

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    from pandas.core.arrays import (
        period_array,
        BooleanArray,
        IntegerArray,
        IntervalArray,
        PandasArray,
        DatetimeArray,
        TimedeltaArray,
        StringArray,
    )

    # Scalars are rejected up front.
    if lib.is_scalar(data):
        raise ValueError(f"Cannot pass scalar '{data}' to 'pandas.array'.")

    # Without an explicit dtype, pandas containers supply their own.
    pandas_containers = (ABCSeries, ABCIndexClass, ABCExtensionArray)
    if dtype is None and isinstance(data, pandas_containers):
        dtype = data.dtype

    data = extract_array(data, extract_numpy=True)

    if isinstance(dtype, str):
        # The registry lookup returns None for not-found dtypes; in that
        # case the string itself is kept.
        registered = registry.find(dtype)
        dtype = registered if registered else dtype

    if is_extension_array_dtype(dtype):
        array_type = cast(ExtensionDtype, dtype).construct_array_type()
        return array_type._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        kind = lib.infer_dtype(data, skipna=True)

        if kind == "period":
            try:
                return period_array(data, copy=copy)
            except IncompatibleFrequency:
                # Possibly a mixture of frequencies: fall through to the
                # ndarray-backed result at the end instead of raising.
                pass

        elif kind == "interval":
            try:
                return IntervalArray(data, copy=copy)
            except ValueError:
                # Possibly a mixture of `closed`: fall through to the
                # ndarray-backed result at the end instead of raising.
                pass

        elif kind.startswith("datetime"):
            # datetime, datetime64
            try:
                return DatetimeArray._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to PandasArray
                pass

        elif kind.startswith("timedelta"):
            # timedelta, timedelta64
            return TimedeltaArray._from_sequence(data, copy=copy)

        else:
            # Remaining inferred kinds that map straight onto one array type.
            simple_kinds = {
                "string": StringArray,
                "integer": IntegerArray,
                "boolean": BooleanArray,
            }
            if kind in simple_kinds:
                return simple_kinds[kind]._from_sequence(data, copy=copy)

    # For datetime64[ns] and timedelta64[ns] dtypes pandas overrides NumPy
    # and returns its own datetime-like arrays.
    if is_datetime64_ns_dtype(dtype):
        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
    if is_timedelta64_ns_dtype(dtype):
        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)

    return PandasArray._from_sequence(data, dtype=dtype, copy=copy)
def data():
    """Return an IntervalArray built from ``make_data()`` for semantics tests."""
    # NOTE(review): the previous docstring called this a "Length-100
    # PeriodArray"; the array type was wrong, and the length depends on
    # make_data(), which is defined elsewhere -- confirm before relying on it.
    return IntervalArray(make_data())
class TestSeriesReplace:
    """Tests for ``Series.replace`` (a few cases also exercise ``DataFrame.replace``)."""

    def test_replace_explicit_none(self):
        """Replacing with an explicitly passed ``None`` yields ``None`` values."""
        # GH#36984 if the user explicitly passes value=None, give it to them
        ser = pd.Series([0, 0, ""], dtype=object)
        result = ser.replace("", None)
        expected = pd.Series([0, 0, None], dtype=object)
        tm.assert_series_equal(result, expected)

        df = pd.DataFrame(np.zeros((3, 3)))
        df.iloc[2, 2] = ""
        result = df.replace("", None)
        expected = pd.DataFrame(
            {
                0: np.zeros(3),
                1: np.zeros(3),
                2: np.array([0.0, 0.0, None], dtype=object),
            }
        )
        assert expected.iloc[2, 2] is None
        tm.assert_frame_equal(result, expected)

        # GH#19998 same thing with object dtype
        ser = pd.Series([10, 20, 30, "a", "a", "b", "a"])
        result = ser.replace("a", None)
        expected = pd.Series([10, 20, 30, None, None, "b", None])
        assert expected.iloc[-1] is None
        tm.assert_series_equal(result, expected)

    def test_replace_numpy_nan(self, nulls_fixture):
        """``np.nan`` in an object Series can be replaced by the fixture's null."""
        # GH#45725 ensure numpy.nan can be replaced with all other null types
        to_replace = np.nan
        value = nulls_fixture
        dtype = object
        ser = pd.Series([to_replace], dtype=dtype)
        expected = pd.Series([value], dtype=dtype)

        result = ser.replace({to_replace: value}).astype(dtype=dtype)
        tm.assert_series_equal(result, expected)
        assert result.dtype == dtype

        # same thing but different calling convention
        result = ser.replace(to_replace, value).astype(dtype=dtype)
        tm.assert_series_equal(result, expected)
        assert result.dtype == dtype

    def test_replace_noop_doesnt_downcast(self):
        """A replace that changes nothing keeps the object dtype."""
        # GH#44498
        ser = pd.Series([None, None, pd.Timestamp("2021-12-16 17:31")], dtype=object)
        res = ser.replace({np.nan: None})  # should be a no-op
        tm.assert_series_equal(res, ser)
        assert res.dtype == object

        # same thing but different calling convention
        res = ser.replace(np.nan, None)
        tm.assert_series_equal(res, ser)
        assert res.dtype == object

    def test_replace(self):
        """Basic list, dict, two-list and inplace replacement on float and object data."""
        N = 100
        ser = pd.Series(np.random.randn(N))
        ser[0:4] = np.nan
        ser[6:10] = 0

        # replace list with a single value
        return_value = ser.replace([np.nan], -1, inplace=True)
        assert return_value is None

        exp = ser.fillna(-1)
        tm.assert_series_equal(ser, exp)

        rs = ser.replace(0.0, np.nan)
        ser[ser == 0.0] = np.nan
        tm.assert_series_equal(rs, ser)

        ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object)
        ser[:5] = np.nan
        ser[6:10] = "foo"
        ser[20:30] = "bar"

        # replace list with a single value
        rs = ser.replace([np.nan, "foo", "bar"], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
        assert return_value is None

        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()

    def test_replace_nan_with_inf(self):
        """Replacing NaN matches ``fillna``; replacing ``np.inf`` touches only that entry."""
        ser = pd.Series([np.nan, 0, np.inf])
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))

        ser = pd.Series([np.nan, 0, "foo", "bar", np.inf, None, pd.NaT])
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))
        filled = ser.copy()
        filled[4] = 0
        tm.assert_series_equal(ser.replace(np.inf, 0), filled)

    def test_replace_listlike_value_listlike_target(self, datetime_series):
        """List-to-list replacement: length mismatch raises, matching lists map pairwise."""
        ser = pd.Series(datetime_series.index)
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))

        # malformed
        msg = r"Replacement lists must match in length\. Expecting 3 got 2"
        with pytest.raises(ValueError, match=msg):
            ser.replace([1, 2, 3], [np.nan, 0])

        # ser is dt64 so can't hold 1 or 2, so this replace is a no-op
        result = ser.replace([1, 2], [np.nan, 0])
        tm.assert_series_equal(result, ser)

        ser = pd.Series([0, 1, 2, 3, 4])
        result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])
        tm.assert_series_equal(result, pd.Series([4, 3, 2, 1, 0]))

    def test_replace_gh5319(self):
        """``replace`` given only NaN as ``to_replace`` is expected to equal ``ffill``."""
        # API change from 0.12?
        # GH 5319
        ser = pd.Series([0, np.nan, 2, 3, 4])
        expected = ser.ffill()
        result = ser.replace([np.nan])
        tm.assert_series_equal(result, expected)

        ser = pd.Series([0, np.nan, 2, 3, 4])
        expected = ser.ffill()
        result = ser.replace(np.nan)
        tm.assert_series_equal(result, expected)

    def test_replace_datetime64(self):
        """A Timestamp can be replaced by another, via dict or scalar arguments."""
        # GH 5797
        ser = pd.Series(pd.date_range("20130101", periods=5))
        expected = ser.copy()
        expected.loc[2] = pd.Timestamp("20120101")
        result = ser.replace({pd.Timestamp("20130103"): pd.Timestamp("20120101")})
        tm.assert_series_equal(result, expected)
        result = ser.replace(pd.Timestamp("20130103"), pd.Timestamp("20120101"))
        tm.assert_series_equal(result, expected)

    def test_replace_nat_with_tz(self):
        """Replacing NaT next to a tz-aware Timestamp gives an object-dtype result."""
        # GH 11792: Test with replacing NaT in a list with tz data
        ts = pd.Timestamp("2015/01/01", tz="UTC")
        s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")])
        result = s.replace([np.nan, pd.NaT], pd.Timestamp.min)
        expected = pd.Series([pd.Timestamp.min, ts], dtype=object)
        tm.assert_series_equal(expected, result)

    def test_replace_timedelta_td64(self):
        """Dict-based replacement of one timedelta value by another."""
        tdi = pd.timedelta_range(0, periods=5)
        ser = pd.Series(tdi)

        # Using a single dict argument means we go through replace_list
        result = ser.replace({ser[1]: ser[3]})

        expected = pd.Series([ser[0], ser[3], ser[2], ser[3], ser[4]])
        tm.assert_series_equal(result, expected)

    def test_replace_with_single_list(self):
        """A lone list as ``to_replace``; an invalid ``method`` raises and leaves data intact."""
        ser = pd.Series([0, 1, 2, 3, 4])
        result = ser.replace([1, 2, 3])
        tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4]))

        s = ser.copy()
        return_value = s.replace([1, 2, 3], inplace=True)
        assert return_value is None
        tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4]))

        # make sure things don't get corrupted when fillna call fails
        s = ser.copy()
        msg = (
            r"Invalid fill method\. Expecting pad \(ffill\) or backfill "
            r"\(bfill\)\. Got crash_cymbal"
        )
        with pytest.raises(ValueError, match=msg):
            return_value = s.replace([1, 2, 3], inplace=True, method="crash_cymbal")
            # Not reached when the call above raises as expected.
            assert return_value is None
        tm.assert_series_equal(s, ser)

    def test_replace_mixed_types(self):
        """Result dtype of replacing int64 values with floats, strings, timestamps, bools."""
        ser = pd.Series(np.arange(5), dtype="int64")

        def check_replace(to_rep, val, expected):
            # Run the same replacement both out-of-place and inplace and
            # compare each outcome against ``expected``.
            sc = ser.copy()
            result = ser.replace(to_rep, val)
            return_value = sc.replace(to_rep, val, inplace=True)
            assert return_value is None
            tm.assert_series_equal(expected, result)
            tm.assert_series_equal(expected, sc)

        # 3.0 can still be held in our int64 series, so we do not upcast GH#44940
        tr, v = [3], [3.0]
        check_replace(tr, v, ser)
        # Note this matches what we get with the scalars 3 and 3.0
        check_replace(tr[0], v[0], ser)

        # MUST upcast to float
        e = pd.Series([0, 1, 2, 3.5, 4])
        tr, v = [3], [3.5]
        check_replace(tr, v, e)

        # casts to object
        e = pd.Series([0, 1, 2, 3.5, "a"])
        tr, v = [3, 4], [3.5, "a"]
        check_replace(tr, v, e)

        # again casts to object
        e = pd.Series([0, 1, 2, 3.5, pd.Timestamp("20130101")])
        tr, v = [3, 4], [3.5, pd.Timestamp("20130101")]
        check_replace(tr, v, e)

        # casts to object
        e = pd.Series([0, 1, 2, 3.5, True], dtype="object")
        tr, v = [3, 4], [3.5, True]
        check_replace(tr, v, e)

        # test an object with dates + floats + integers + strings
        dr = pd.Series(pd.date_range("1/1/2001", "1/10/2001", freq="D"))
        result = dr.astype(object).replace([dr[0], dr[1], dr[2]], [1.0, 2, "a"])
        expected = pd.Series([1.0, 2, "a"] + dr[3:].tolist(), dtype=object)
        tm.assert_series_equal(result, expected)

    def test_replace_bool_with_string_no_op(self):
        """Replacing a string that is absent from a bool Series changes nothing."""
        s = pd.Series([True, False, True])
        result = s.replace("fun", "in-the-sun")
        tm.assert_series_equal(s, result)

    def test_replace_bool_with_string(self):
        """Replacing ``True`` by a string in a bool Series."""
        # nonexistent elements
        s = pd.Series([True, False, True])
        result = s.replace(True, "2u")
        expected = pd.Series(["2u", False, "2u"])
        tm.assert_series_equal(expected, result)

    def test_replace_bool_with_bool(self):
        """Replacing ``True`` by ``False`` in a bool Series."""
        s = pd.Series([True, False, True])
        result = s.replace(True, False)
        expected = pd.Series([False] * len(s))
        tm.assert_series_equal(expected, result)

    def test_replace_with_dict_with_bool_keys(self):
        """A dict mixing string and bool keys only replaces the matching bools."""
        s = pd.Series([True, False, True])
        result = s.replace({"asdf": "asdb", True: "yes"})
        expected = pd.Series(["yes", False, "yes"])
        tm.assert_series_equal(result, expected)

    def test_replace_Int_with_na(self, any_int_ea_dtype):
        """Replacing values by ``pd.NA`` keeps the fixture's integer extension dtype."""
        # GH 38267
        result = pd.Series([0, None], dtype=any_int_ea_dtype).replace(0, pd.NA)
        expected = pd.Series([pd.NA, pd.NA], dtype=any_int_ea_dtype)
        tm.assert_series_equal(result, expected)
        result = pd.Series([0, 1], dtype=any_int_ea_dtype).replace(0, pd.NA)
        result.replace(1, pd.NA, inplace=True)
        tm.assert_series_equal(result, expected)

    def test_replace2(self):
        """List, dict, two-list and inplace replacement on an object Series."""
        N = 100
        ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object)
        ser[:5] = np.nan
        ser[6:10] = "foo"
        ser[20:30] = "bar"

        # replace list with a single value
        rs = ser.replace([np.nan, "foo", "bar"], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
        assert return_value is None
        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()

    def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype):
        """Dict replacement keeps the fixture's nullable string dtype."""
        # GH 32621, GH#44940
        ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)
        expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype)
        result = ser.replace({"one": "1", "two": "2"})
        tm.assert_series_equal(expected, result)

    def test_replace_with_empty_dictlike(self):
        """An empty dict or empty Series as ``to_replace`` leaves the Series unchanged."""
        # GH 15289
        s = pd.Series(list("abcd"))
        tm.assert_series_equal(s, s.replace({}))

        # Building the empty Series is itself expected to emit a FutureWarning.
        with tm.assert_produces_warning(FutureWarning):
            empty_series = pd.Series([])
        tm.assert_series_equal(s, s.replace(empty_series))

    def test_replace_string_with_number(self):
        """Replacing the string "2" in an integer Series is a no-op."""
        # GH 15743
        s = pd.Series([1, 2, 3])
        result = s.replace("2", np.nan)
        expected = pd.Series([1, 2, 3])
        tm.assert_series_equal(expected, result)

    def test_replace_replacer_equals_replacement(self):
        """Swapping two values through one dict does not chain the replacements."""
        # GH 20656
        # make sure all replacers are matching against original values
        s = pd.Series(["a", "b"])
        expected = pd.Series(["b", "a"])
        result = s.replace({"a": "b", "b": "a"})
        tm.assert_series_equal(expected, result)

    def test_replace_unicode_with_number(self):
        """Same check as ``test_replace_string_with_number``: "2" does not match 2."""
        # GH 15743
        s = pd.Series([1, 2, 3])
        result = s.replace("2", np.nan)
        expected = pd.Series([1, 2, 3])
        tm.assert_series_equal(expected, result)

    def test_replace_mixed_types_with_string(self):
        """In a mixed Series, the int 2 and the string "4" are replaced but the int 4 is not."""
        # Testing mixed
        s = pd.Series([1, 2, 3, "4", 4, 5])
        result = s.replace([2, "4"], np.nan)
        expected = pd.Series([1, np.nan, 3, np.nan, 4, 5])
        tm.assert_series_equal(expected, result)

    @pytest.mark.parametrize(
        "categorical, numeric",
        [
            (pd.Categorical(["A"], categories=["A", "B"]), [1]),
            (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]),
        ],
    )
    def test_replace_categorical(self, categorical, numeric):
        """Dict replacement on a categorical Series maps the categories too."""
        # GH 24971, GH#23305
        ser = pd.Series(categorical)
        result = ser.replace({"A": 1, "B": 2})
        expected = pd.Series(numeric).astype("category")
        if 2 not in expected.cat.categories:
            # i.e. categories should be [1, 2] even if there are no "B"s present
            # GH#44940
            expected = expected.cat.add_categories(2)
        tm.assert_series_equal(expected, result)

    def test_replace_categorical_single(self):
        """Replacing a single value of a tz-aware categorical, out-of-place and inplace."""
        # GH 26988
        dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
        s = pd.Series(dti)
        c = s.astype("category")

        expected = c.copy()
        expected = expected.cat.add_categories("foo")
        expected[2] = "foo"
        expected = expected.cat.remove_unused_categories()
        assert c[2] != "foo"

        result = c.replace(c[2], "foo")
        tm.assert_series_equal(expected, result)
        assert c[2] != "foo"  # ensure non-inplace call does not alter original

        return_value = c.replace(c[2], "foo", inplace=True)
        assert return_value is None
        tm.assert_series_equal(expected, c)

        first_value = c[0]
        return_value = c.replace(c[1], c[0], inplace=True)
        assert return_value is None
        assert c[0] == c[1] == first_value  # test replacing with existing value

    def test_replace_with_no_overflowerror(self):
        """Replacement involving very long digit strings does not raise."""
        # GH 25616
        # casts to object without Exception from OverflowError
        s = pd.Series([0, 1, 2, 3, 4])
        result = s.replace([3], ["100000000000000000000"])
        expected = pd.Series([0, 1, 2, "100000000000000000000", 4])
        tm.assert_series_equal(result, expected)

        s = pd.Series([0, "100000000000000000000", "100000000000000000001"])
        result = s.replace(["100000000000000000000"], [1])
        expected = pd.Series([0, 1, "100000000000000000001"])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "ser, to_replace, exp",
        [
            ([1, 2, 3], {1: 2, 2: 3, 3: 4}, [2, 3, 4]),
            (["1", "2", "3"], {"1": "2", "2": "3", "3": "4"}, ["2", "3", "4"]),
        ],
    )
    def test_replace_commutative(self, ser, to_replace, exp):
        """A chained mapping (1->2, 2->3, ...) replaces each original value exactly once."""
        # GH 16051
        # DataFrame.replace() overwrites when values are non-numeric

        series = pd.Series(ser)

        expected = pd.Series(exp)
        result = series.replace(to_replace)

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "ser, exp", [([1, 2, 3], [1, True, 3]), (["x", 2, 3], ["x", True, 3])]
    )
    def test_replace_no_cast(self, ser, exp):
        """Replacing 2 with ``True`` keeps ``True`` as a bool in the result."""
        # GH 9113
        # BUG: replace int64 dtype with bool coerces to int64

        series = pd.Series(ser)
        result = series.replace(2, True)
        expected = pd.Series(exp)

        tm.assert_series_equal(result, expected)

    def test_replace_invalid_to_replace(self):
        """A callable passed as ``to_replace`` raises TypeError."""
        # GH 18634
        # API: replace() should raise an exception if invalid argument is given
        series = pd.Series(["a", "b", "c "])
        msg = (
            r"Expecting 'to_replace' to be either a scalar, array-like, "
            r"dict or None, got invalid type.*"
        )
        with pytest.raises(TypeError, match=msg):
            series.replace(lambda x: x.strip())

    @pytest.mark.parametrize("frame", [False, True])
    def test_replace_nonbool_regex(self, frame):
        """A non-bool ``regex`` combined with ``to_replace`` raises ValueError."""
        obj = pd.Series(["a", "b", "c "])
        if frame:
            obj = obj.to_frame()

        msg = "'to_replace' must be 'None' if 'regex' is not a bool"
        with pytest.raises(ValueError, match=msg):
            obj.replace(to_replace=["a"], regex="foo")

    @pytest.mark.parametrize("frame", [False, True])
    def test_replace_empty_copy(self, frame):
        """On an empty object, inplace returns None and out-of-place returns a new equal object."""
        obj = pd.Series([], dtype=np.float64)
        if frame:
            obj = obj.to_frame()

        res = obj.replace(4, 5, inplace=True)
        assert res is None

        res = obj.replace(4, 5, inplace=False)
        tm.assert_equal(res, obj)
        assert res is not obj

    def test_replace_only_one_dictlike_arg(self, fixed_now_ts):
        """Dict-like ``to_replace`` with a value, or scalar with a dict value, raises."""
        # GH#33340
        ser = pd.Series([1, 2, "A", fixed_now_ts, True])
        to_replace = {0: 1, 2: "A"}
        value = "foo"
        msg = "Series.replace cannot use dict-like to_replace and non-None value"
        with pytest.raises(ValueError, match=msg):
            ser.replace(to_replace, value)

        to_replace = 1
        value = {0: "foo", 2: "bar"}
        msg = "Series.replace cannot use dict-value and non-None to_replace"
        with pytest.raises(ValueError, match=msg):
            ser.replace(to_replace, value)

    def test_replace_extension_other(self, frame_or_series):
        """Replacing an empty string in Int64 data neither raises nor changes the object."""
        # https://github.com/pandas-dev/pandas/issues/34530
        obj = frame_or_series(pd.array([1, 2, 3], dtype="Int64"))
        result = obj.replace("", "")  # no exception
        # should not have changed dtype
        tm.assert_equal(obj, result)

    def _check_replace_with_method(self, ser: pd.Series):
        """
        Shared check for ``replace(ser[1], method="pad")``.

        The expected result has position 1 filled with ``ser[0]``; it is
        verified for the Series and for its one-column frame, both
        out-of-place and inplace.
        """
        df = ser.to_frame()

        res = ser.replace(ser[1], method="pad")
        expected = pd.Series([ser[0], ser[0]] + list(ser[2:]), dtype=ser.dtype)
        tm.assert_series_equal(res, expected)

        res_df = df.replace(ser[1], method="pad")
        tm.assert_frame_equal(res_df, expected.to_frame())

        ser2 = ser.copy()
        res2 = ser2.replace(ser[1], method="pad", inplace=True)
        assert res2 is None
        tm.assert_series_equal(ser2, expected)

        res_df2 = df.replace(ser[1], method="pad", inplace=True)
        assert res_df2 is None
        tm.assert_frame_equal(df, expected.to_frame())

    def test_replace_ea_dtype_with_method(self, any_numeric_ea_dtype):
        """Run the ``method="pad"`` check on a numeric extension array containing NA."""
        arr = pd.array([1, 2, pd.NA, 4], dtype=any_numeric_ea_dtype)
        ser = pd.Series(arr)

        self._check_replace_with_method(ser)

    @pytest.mark.parametrize("as_categorical", [True, False])
    def test_replace_interval_with_method(self, as_categorical):
        """Run the ``method="pad"`` check on interval data, optionally categorical."""
        # in particular interval that can't hold NA

        idx = pd.IntervalIndex.from_breaks(range(4))
        ser = pd.Series(idx)
        if as_categorical:
            ser = ser.astype("category")

        self._check_replace_with_method(ser)

    @pytest.mark.parametrize("as_period", [True, False])
    @pytest.mark.parametrize("as_categorical", [True, False])
    def test_replace_datetimelike_with_method(self, as_period, as_categorical):
        """Run the ``method="pad"`` check on tz-aware datetime or period data with a NaT."""
        idx = pd.date_range("2016-01-01", periods=5, tz="US/Pacific")
        if as_period:
            idx = idx.tz_localize(None).to_period("D")

        ser = pd.Series(idx)
        ser.iloc[-2] = pd.NaT
        if as_categorical:
            ser = ser.astype("category")

        self._check_replace_with_method(ser)

    def test_replace_with_compiled_regex(self):
        """A compiled pattern can be used as a dict key with ``regex=True``."""
        # https://github.com/pandas-dev/pandas/issues/35680
        s = pd.Series(["a", "b", "c"])
        regex = re.compile("^a$")
        result = s.replace({regex: "z"}, regex=True)
        expected = pd.Series(["z", "b", "c"])
        tm.assert_series_equal(result, expected)

    def test_pandas_replace_na(self):
        """Regex dict replacement on a "string" Series that contains ``pd.NA``."""
        # GH#43344
        ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA], dtype="string")
        regex_mapping = {
            "AA": "CC",
            "BB": "CC",
            "EE": "CC",
            "CC": "CC-REPL",
        }
        result = ser.replace(regex_mapping, regex=True)
        exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string")
        tm.assert_series_equal(result, exp)

    @pytest.mark.parametrize(
        "dtype, input_data, to_replace, expected_data",
        [
            ("bool", [True, False], {True: False}, [False, False]),
            ("int64", [1, 2], {1: 10, 2: 20}, [10, 20]),
            ("Int64", [1, 2], {1: 10, 2: 20}, [10, 20]),
            ("float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]),
            ("Float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]),
            ("string", ["one", "two"], {"one": "1", "two": "2"}, ["1", "2"]),
            (
                pd.IntervalDtype("int64"),
                IntervalArray([pd.Interval(1, 2), pd.Interval(2, 3)]),
                {pd.Interval(1, 2): pd.Interval(10, 20)},
                IntervalArray([pd.Interval(10, 20), pd.Interval(2, 3)]),
            ),
            (
                pd.IntervalDtype("float64"),
                IntervalArray([pd.Interval(1.0, 2.7), pd.Interval(2.8, 3.1)]),
                {pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8)},
                IntervalArray([pd.Interval(10.6, 20.8), pd.Interval(2.8, 3.1)]),
            ),
            (
                pd.PeriodDtype("M"),
                [pd.Period("2020-05", freq="M")],
                {pd.Period("2020-05", freq="M"): pd.Period("2020-06", freq="M")},
                [pd.Period("2020-06", freq="M")],
            ),
        ],
    )
    def test_replace_dtype(self, dtype, input_data, to_replace, expected_data):
        """Dict replacement keeps the Series dtype for each parametrized dtype."""
        # GH#33484
        ser = pd.Series(input_data, dtype=dtype)
        result = ser.replace(to_replace)
        expected = pd.Series(expected_data, dtype=dtype)
        tm.assert_series_equal(result, expected)

    def test_replace_string_dtype(self):
        """Dict, scalar and ``pd.NA`` replacement all keep the "string" dtype."""
        # GH#40732, GH#44940
        ser = pd.Series(["one", "two", np.nan], dtype="string")
        res = ser.replace({"one": "1", "two": "2"})
        expected = pd.Series(["1", "2", np.nan], dtype="string")
        tm.assert_series_equal(res, expected)

        # GH#31644
        ser2 = pd.Series(["A", np.nan], dtype="string")
        res2 = ser2.replace("A", "B")
        expected2 = pd.Series(["B", np.nan], dtype="string")
        tm.assert_series_equal(res2, expected2)

        ser3 = pd.Series(["A", "B"], dtype="string")
        res3 = ser3.replace("A", pd.NA)
        expected3 = pd.Series([pd.NA, "B"], dtype="string")
        tm.assert_series_equal(res3, expected3)

    def test_replace_string_dtype_list_to_replace(self):
        """List ``to_replace`` with one absent entry still keeps the "string" dtype."""
        # GH#41215, GH#44940
        ser = pd.Series(["abc", "def"], dtype="string")
        res = ser.replace(["abc", "any other string"], "xyz")
        expected = pd.Series(["xyz", "def"], dtype="string")
        tm.assert_series_equal(res, expected)

    def test_replace_string_dtype_regex(self):
        """Regex replacement keeps the "string" dtype."""
        # GH#31644
        ser = pd.Series(["A", "B"], dtype="string")
        res = ser.replace(r".", "C", regex=True)
        expected = pd.Series(["C", "C"], dtype="string")
        tm.assert_series_equal(res, expected)

    def test_replace_nullable_numeric(self):
        """Nullable Float64/Int64 keep their dtype; a non-integer value into Int64 raises."""
        # GH#40732, GH#44940

        floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype())
        assert floats.replace({1.0: 9}).dtype == floats.dtype
        assert floats.replace(1.0, 9).dtype == floats.dtype
        assert floats.replace({1.0: 9.0}).dtype == floats.dtype
        assert floats.replace(1.0, 9.0).dtype == floats.dtype

        res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0])
        assert res.dtype == floats.dtype

        ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype())
        assert ints.replace({1: 9}).dtype == ints.dtype
        assert ints.replace(1, 9).dtype == ints.dtype
        assert ints.replace({1: 9.0}).dtype == ints.dtype
        assert ints.replace(1, 9.0).dtype == ints.dtype

        # nullable (for now) raises instead of casting
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace({1: 9.5})
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace(1, 9.5)

    @pytest.mark.parametrize("regex", [False, True])
    def test_replace_regex_dtype_series(self, regex):
        """Replacing the string "0" by the int 1 matches ``pd.Series([1])`` with or without regex."""
        # GH-48644
        series = pd.Series(["0"])
        expected = pd.Series([1])
        result = series.replace(to_replace="0", value=1, regex=regex)
        tm.assert_series_equal(result, expected)

    def test_replace_different_int_types(self, any_int_numpy_dtype):
        """A mapping built from typed values and index labels behaves like the plain-int dict."""
        # GH#45311
        labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype)

        maps = pd.Series([0, 2, 1], dtype=any_int_numpy_dtype)
        map_dict = {old: new for (old, new) in zip(maps.values, maps.index)}

        result = labs.replace(map_dict)
        expected = labs.replace({0: 0, 2: 1, 1: 2})
        tm.assert_series_equal(result, expected)
def array(data,  # type: Sequence[object]
          dtype=None,  # type: Optional[Union[str, np.dtype, ExtensionDtype]]
          copy=True,  # type: bool
          ):
    # type: (...) -> ExtensionArray
    """
    Create an array.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. `data` is expected to represent
        a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =====================================
        Scalar Type                    Array Type
        ============================== =====================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        ============================== =====================================

        For all other cases, NumPy's usual inference rules will be used.

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.PandasArray` backed by a NumPy
    array.

    >>> pd.array(['a', 'b'], dtype=str)
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Or use the dedicated constructor for the array you're expecting, and
    wrap that in a PandasArray

    >>> pd.array(np.array(['a', 'b'], dtype='<U1'))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Examples
    --------
    If a dtype is not specified, `data` is passed through to
    :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned.

    >>> pd.array([1, 2])
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int64

    Or the NumPy dtype can be specified

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int32

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    [a, b, a]
    Categories (2, object): [a, b]

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    [a, b, a]
    Categories (3, object): [a < b < c]

    Because omitting the `dtype` passes the data through to NumPy,
    a mixture of valid integers and NA will return a floating-point
    NumPy array.

    >>> pd.array([1, 2, np.nan])
    <PandasArray>
    [1.0,  2.0, nan]
    Length: 3, dtype: float64

    To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify
    the dtype:

    >>> pd.array([1, 2, np.nan], dtype='Int64')
    <IntegerArray>
    [1, 2, NaN]
    Length: 3, dtype: Int64

    Pandas will infer an ExtensionArray for some types of data:

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    from pandas.core.arrays import (
        DatetimeArrayMixin,
        ExtensionArray,
        IntervalArray,
        PandasArray,
        TimedeltaArrayMixin,
        period_array,
    )
    from pandas.core.internals.arrays import extract_array

    # Scalars are rejected up front, before any unwrapping is attempted.
    if lib.is_scalar(data):
        raise ValueError(
            "Cannot pass scalar '{}' to 'pandas.array'.".format(data)
        )

    data = extract_array(data, extract_numpy=True)

    # With no explicit dtype, an ExtensionArray input dictates the dtype.
    if dtype is None and isinstance(data, ExtensionArray):
        dtype = data.dtype

    if isinstance(dtype, compat.string_types):
        # The registry lookup returns None for not-found dtypes; in that
        # case the string itself is kept as the dtype.
        registered = registry.find(dtype)
        if registered:
            dtype = registered

    if is_extension_array_dtype(dtype):
        array_type = dtype.construct_array_type()
        return array_type._from_sequence(data, dtype=dtype, copy=copy)

    # An explicit dtype that is not an extension dtype: no inference needed.
    if dtype is not None:
        return PandasArray._from_sequence(data, dtype=dtype, copy=copy)

    # No dtype at all: try the dedicated array types based on what the
    # values look like, falling through to PandasArray when none applies.
    kind = lib.infer_dtype(data)

    if kind == 'period':
        try:
            return period_array(data, copy=copy)
        except tslibs.IncompatibleFrequency:
            # We may have a mixture of frequencies.
            # We choose to return an ndarray, rather than raising.
            pass
    elif kind == 'interval':
        try:
            return IntervalArray(data, copy=copy)
        except ValueError:
            # We may have a mixture of `closed` here.
            # We choose to return an ndarray, rather than raising.
            pass
    elif kind.startswith('datetime'):
        # datetime, datetime64
        try:
            return DatetimeArrayMixin._from_sequence(data, copy=copy)
        except ValueError:
            # Mixture of timezones, fall back to PandasArray
            pass
    elif kind.startswith('timedelta'):
        # timedelta, timedelta64
        return TimedeltaArrayMixin._from_sequence(data, copy=copy)

    # TODO(BooleanArray): handle this type

    return PandasArray._from_sequence(data, dtype=dtype, copy=copy)
def gen(count):
    """
    Yield ``count`` IntervalArray objects, one at a time.

    Each array is built from its own ``make_data()`` call, which is only
    made when that item is requested from the generator.
    """
    for _index in range(count):
        fresh = make_data()
        yield IntervalArray(fresh)