def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) if any_string_dtype == "object": empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: empty_int = Series(dtype="Int64") empty_bool = Series(dtype="boolean") empty_object = Series(dtype=object) empty_bytes = Series(dtype=object) empty_df = DataFrame() # GH7241 # (extract) on empty series tm.assert_series_equal(empty_str, empty.str.cat(empty)) assert "" == empty.str.cat() tm.assert_series_equal(empty_str, empty.str.title()) tm.assert_series_equal(empty_int, empty.str.count("a")) with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): tm.assert_series_equal(empty_bool, empty.str.contains("a")) with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): tm.assert_series_equal(empty_bool, empty.str.startswith("a")) with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): tm.assert_series_equal(empty_bool, empty.str.endswith("a")) tm.assert_series_equal(empty_str, empty.str.lower()) tm.assert_series_equal(empty_str, empty.str.upper()) with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): tm.assert_series_equal(empty_str, empty.str.replace("a", "b")) tm.assert_series_equal(empty_str, empty.str.repeat(3)) with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( DataFrame(columns=[0], dtype=any_string_dtype), empty.str.extract("()", expand=True), ) tm.assert_frame_equal( DataFrame(columns=[0, 1], dtype=any_string_dtype), empty.str.extract("()()", expand=True), ) tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False)) tm.assert_frame_equal( DataFrame(columns=[0, 1], dtype=any_string_dtype), empty.str.extract("()()", expand=False), ) tm.assert_frame_equal(empty_df, empty.str.get_dummies()) tm.assert_series_equal(empty_str, empty_str.str.join("")) with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): tm.assert_series_equal(empty_int, empty.str.len()) tm.assert_series_equal(empty_object, empty_str.str.findall("a")) tm.assert_series_equal(empty_int, empty.str.find("a")) tm.assert_series_equal(empty_int, empty.str.rfind("a")) tm.assert_series_equal(empty_str, empty.str.pad(42)) tm.assert_series_equal(empty_str, empty.str.center(42)) tm.assert_series_equal(empty_object, empty.str.split("a")) tm.assert_series_equal(empty_object, empty.str.rsplit("a")) tm.assert_series_equal(empty_object, empty.str.partition("a", expand=False)) tm.assert_frame_equal(empty_df, empty.str.partition("a")) tm.assert_series_equal(empty_object, empty.str.rpartition("a", expand=False)) tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): tm.assert_series_equal(empty_str, empty.str.strip()) with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): tm.assert_series_equal(empty_str, empty.str.lstrip()) with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) tm.assert_series_equal(empty_bool, empty.str.isalpha()) tm.assert_series_equal(empty_bool, empty.str.isdigit()) with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under2p0): tm.assert_series_equal(empty_bool, empty.str.isspace()) tm.assert_series_equal(empty_bool, empty.str.islower()) tm.assert_series_equal(empty_bool, empty.str.isupper()) tm.assert_series_equal(empty_bool, empty.str.istitle()) tm.assert_series_equal(empty_bool, empty.str.isnumeric()) tm.assert_series_equal(empty_bool, empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) tm.assert_series_equal(empty_str, empty.str.normalize("NFC")) table = str.maketrans("a", "b") tm.assert_series_equal(empty_str, empty.str.translate(table))
def setup(self, keep): self.s = Series(np.random.randint(1, 10, 100000))
def time_constructor(self, data): Series(data=self.data, index=self.idx)
def test_replace_datetimetz(self): # GH 11326 # behaving poorly when presented with a datetime64[ns, tz] df = DataFrame({ 'A': date_range('20130101', periods=3, tz='US/Eastern'), 'B': [0, np.nan, 2] }) result = df.replace(np.nan, 1) expected = DataFrame({ 'A': date_range('20130101', periods=3, tz='US/Eastern'), 'B': Series([0, 1, 2], dtype='float64') }) assert_frame_equal(result, expected) result = df.fillna(1) assert_frame_equal(result, expected) result = df.replace(0, np.nan) expected = DataFrame({ 'A': date_range('20130101', periods=3, tz='US/Eastern'), 'B': [np.nan, np.nan, 2] }) assert_frame_equal(result, expected) result = df.replace(Timestamp('20130102', tz='US/Eastern'), Timestamp('20130104', tz='US/Eastern')) expected = DataFrame({ 'A': [ Timestamp('20130101', tz='US/Eastern'), Timestamp('20130104', tz='US/Eastern'), Timestamp('20130103', tz='US/Eastern') ], 'B': [0, np.nan, 2] }) assert_frame_equal(result, expected) result = df.copy() result.iloc[1, 0] = np.nan result = result.replace({'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern')) assert_frame_equal(result, expected) # coerce to object result = df.copy() result.iloc[1, 0] = np.nan result = result.replace({'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific')) expected = DataFrame({ 'A': [ Timestamp('20130101', tz='US/Eastern'), Timestamp('20130104', tz='US/Pacific'), Timestamp('20130103', tz='US/Eastern') ], 'B': [0, np.nan, 2] }) assert_frame_equal(result, expected) result = df.copy() result.iloc[1, 0] = np.nan result = result.replace({'A': np.nan}, Timestamp('20130104')) expected = DataFrame({ 'A': [ Timestamp('20130101', tz='US/Eastern'), Timestamp('20130104'), Timestamp('20130103', tz='US/Eastern') ], 'B': [0, np.nan, 2] }) assert_frame_equal(result, expected)
def hash_pandas_object( obj, index: bool = True, encoding: str = "utf8", hash_key: Optional[str] = _default_hash_key, categorize: bool = True, ): """ Return a data hash of the Index/Series/DataFrame. Parameters ---------- index : bool, default True Include the index in the hash (if Series/DataFrame). encoding : str, default 'utf8' Encoding for data & key when strings. hash_key : str, default _default_hash_key Hash_key for string key to encode. categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. Returns ------- Series of uint64, same length as the object """ from pandas import Series if hash_key is None: hash_key = _default_hash_key if isinstance(obj, ABCMultiIndex): return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) elif isinstance(obj, ABCIndexClass): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False ) h = Series(h, index=obj, dtype="uint64", copy=False) elif isinstance(obj, ABCSeries): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False ) if index: index_iter = ( hash_pandas_object( obj.index, index=False, encoding=encoding, hash_key=hash_key, categorize=categorize, ).values for _ in [None] ) arrays = itertools.chain([h], index_iter) h = _combine_hash_arrays(arrays, 2) h = Series(h, index=obj.index, dtype="uint64", copy=False) elif isinstance(obj, ABCDataFrame): hashes = (hash_array(series.values) for _, series in obj.items()) num_items = len(obj.columns) if index: index_hash_generator = ( hash_pandas_object( obj.index, index=False, encoding=encoding, hash_key=hash_key, categorize=categorize, ).values # noqa for _ in [None] ) num_items += 1 # keep `hashes` specifically a generator to keep mypy happy _hashes = itertools.chain(hashes, index_hash_generator) hashes = (x for x in _hashes) h = _combine_hash_arrays(hashes, num_items) h = Series(h, index=obj.index, dtype="uint64", copy=False) else: raise TypeError(f"Unexpected type for hashing {type(obj)}") return h
def test_float_index_at_iat(self): s = Series([1, 2, 3], index=[0.1, 0.2, 0.3]) for el, item in s.items(): assert s.at[el] == item for i in range(len(s)): assert s.iat[i] == i + 1
def test_coercion_with_setitem(self, start_data, expected_result): start_series = Series(start_data) start_series[0] = None expected_series = Series(expected_result) tm.assert_series_equal(start_series, expected_series)
def test_string_slice_out_of_bounds(any_string_dtype): ser = Series(["foo", "b", "ba"], dtype=any_string_dtype) result = ser.str[1] expected = Series(["o", np.nan, "a"], dtype=any_string_dtype) tm.assert_series_equal(result, expected)
def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") expected = ser.map(lambda x: x.decode("utf-8")) tm.assert_series_equal(result, expected)
def test_removesuffix(any_string_dtype, suffix, expected): ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype) result = ser.str.removesuffix(suffix) ser_expected = Series(expected, dtype=any_string_dtype) tm.assert_series_equal(result, ser_expected)
def test_string_slice_out_of_bounds_nested(): ser = Series([(1, 2), (1, ), (3, 4, 5)]) result = ser.str[1] expected = Series([2, np.nan, 4]) tm.assert_series_equal(result, expected)
def test_strip_lstrip_rstrip(any_string_dtype, method, exp): ser = Series([" aa ", " bb \n", np.nan, "cc "], dtype=any_string_dtype) result = getattr(ser.str, method)() expected = Series(exp, dtype=any_string_dtype) tm.assert_series_equal(result, expected)
def test_slice(start, stop, step, expected, any_string_dtype): ser = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"], dtype=any_string_dtype) result = ser.str.slice(start, stop, step) expected = Series(expected, dtype=any_string_dtype) tm.assert_series_equal(result, expected)
def test_spilt_join_roundtrip(any_string_dtype): ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = ser.str.split("_").str.join("_") expected = ser.astype(object) tm.assert_series_equal(result, expected)
def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df df = DataFrame( { "FC": ["a", "b", "a", "b", "a", "b"], "PF": [0, 0, 0, 0, 1, 1], "col1": list(range(6)), "col2": list(range(6, 12)), } ) df.iloc[1, 0] = np.nan df2 = df.copy() mask = ~df2.FC.isna() cols = ["col1", "col2"] dft = df2 * 2 dft.iloc[3, 3] = np.nan expected = DataFrame( { "FC": ["a", np.nan, "a", "b", "a", "b"], "PF": [0, 0, 0, 0, 1, 1], "col1": Series([0, 1, 4, 6, 8, 10]), "col2": [12, 7, 16, np.nan, 20, 22], } ) # frame on rhs df2.loc[mask, cols] = dft.loc[mask, cols] tm.assert_frame_equal(df2, expected) df2.loc[mask, cols] = dft.loc[mask, cols] tm.assert_frame_equal(df2, expected) # with an ndarray on rhs # coerces to float64 because values has float64 dtype # GH 14001 expected = DataFrame( { "FC": ["a", np.nan, "a", "b", "a", "b"], "PF": [0, 0, 0, 0, 1, 1], "col1": [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], "col2": [12, 7, 16, np.nan, 20, 22], } ) df2 = df.copy() df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) # broadcasting on the rhs is required df = DataFrame( dict( A=[1, 2, 0, 0, 0], B=[0, 0, 0, 10, 11], C=[0, 0, 0, 10, 11], D=[3, 4, 5, 6, 7], ) ) expected = df.copy() mask = expected["A"] == 0 for col in ["A", "B"]: expected.loc[mask, col] = df["D"] df.loc[df["A"] == 0, ["A", "B"]] = df["D"] tm.assert_frame_equal(df, expected)
def test_str_accessor_no_new_attributes(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/10673 ser = Series(list("aabbcde"), dtype=any_string_dtype) with pytest.raises(AttributeError, match="You cannot add any new attribute"): ser.str.xlabel = "a"
class TestMisc: def test_float_index_to_mixed(self): df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)}) df["a"] = 10 tm.assert_frame_equal( DataFrame({0.0: df[0.0], 1.0: df[1.0], "a": [10] * 10}), df ) def test_float_index_non_scalar_assignment(self): df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0]) df.loc[df.index[:2]] = 1 expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index) tm.assert_frame_equal(expected, df) df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0]) df2 = df.copy() df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) def test_float_index_at_iat(self): s = Series([1, 2, 3], index=[0.1, 0.2, 0.3]) for el, item in s.items(): assert s.at[el] == item for i in range(len(s)): assert s.iat[i] == i + 1 def test_mixed_index_assignment(self): # GH 19860 s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) s.at["a"] = 11 assert s.iat[0] == 11 s.at[1] = 22 assert s.iat[3] == 22 def test_mixed_index_no_fallback(self): # GH 19860 s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) with pytest.raises(KeyError, match="^0$"): s.at[0] with pytest.raises(KeyError, match="^4$"): s.at[4] def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. covers both uniform data-type & multi-type cases def run_tests(df, rhs, right): # label, index, slice lbl_one, idx_one, slice_one = list("bcd"), [1, 2, 3], slice(1, 4) lbl_two, idx_two, slice_two = ["joe", "jolie"], [1, 2], slice(1, 3) left = df.copy() left.loc[lbl_one, lbl_two] = rhs tm.assert_frame_equal(left, right) left = df.copy() left.iloc[idx_one, idx_two] = rhs tm.assert_frame_equal(left, right) left = df.copy() left.iloc[slice_one, slice_two] = rhs tm.assert_frame_equal(left, right) xs = np.arange(20).reshape(5, 4) cols = ["jim", "joe", "jolie", "joline"] df = DataFrame(xs, columns=cols, index=list("abcde")) # right hand side; permute the indices and multiplpy by -2 rhs = -2 * df.iloc[3:0:-1, 2:0:-1] # expected `right` result; just multiply by -2 right = df.copy() right.iloc[1:4, 1:3] *= -2 # run tests with uniform dtypes run_tests(df, rhs, right) # make frames multi-type & re-run tests for frame in [df, rhs, right]: frame["joe"] = frame["joe"].astype("float64") frame["jolie"] = frame["jolie"].map("@{}".format) run_tests(df, rhs, right) def test_str_label_slicing_with_negative_step(self): SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) if not idx.is_integer: # For integer indices, .loc and plain getitem are position-based. tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) for idx in [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)]: idx = Index(idx) s = Series(np.arange(20), index=idx) assert_slices_equivalent(SLC[idx[9] :: -1], SLC[9::-1]) assert_slices_equivalent(SLC[: idx[9] : -1], SLC[:8:-1]) assert_slices_equivalent(SLC[idx[13] : idx[9] : -1], SLC[13:8:-1]) assert_slices_equivalent(SLC[idx[9] : idx[13] : -1], SLC[:0]) def test_slice_with_zero_step_raises(self): s = Series(np.arange(20), index=_mklbl("A", 20)) with pytest.raises(ValueError, match="slice step cannot be zero"): s[::0] with pytest.raises(ValueError, match="slice step cannot be zero"): s.loc[::0] def test_indexing_assignment_dict_already_exists(self): df = DataFrame({"x": [1, 2, 6], "y": [2, 2, 8], "z": [-5, 0, 5]}).set_index("z") expected = df.copy() rhs = dict(x=9, y=99) df.loc[5] = rhs expected.loc[5] = [9, 99] tm.assert_frame_equal(df, expected) def test_indexing_dtypes_on_empty(self): # Check that .iloc returns correct dtypes GH9983 df = DataFrame({"a": [1, 2, 3], "b": ["b", "b2", "b3"]}) df2 = df.iloc[[], :] assert df2.loc[:, "a"].dtype == np.int64 tm.assert_series_equal(df2.loc[:, "a"], df2.iloc[:, 0]) @pytest.mark.parametrize("size", [5, 999999, 1000000]) def test_range_in_series_indexing(self, size): # range can cause an indexing error # GH 11652 s = Series(index=range(size), dtype=np.float64) s.loc[range(1)] = 42 tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) s.loc[range(2)] = 43 tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1])) @pytest.mark.parametrize( "slc", [ pd.IndexSlice[:, :], pd.IndexSlice[:, 1], pd.IndexSlice[1, :], pd.IndexSlice[[1], [1]], pd.IndexSlice[1, [1]], pd.IndexSlice[[1], 1], pd.IndexSlice[1], pd.IndexSlice[1, 1], slice(None, None, None), [0, 1], np.array([0, 1]), Series([0, 1]), ], ) def test_non_reducing_slice(self, slc): df = DataFrame([[0, 1], [2, 3]]) tslice_ = non_reducing_slice(slc) assert isinstance(df.loc[tslice_], DataFrame) def test_list_slice(self): # like dataframe getitem slices = [["A"], Series(["A"]), np.array(["A"])] df = DataFrame({"A": [1, 2], "B": [3, 4]}, index=["A", "B"]) expected = pd.IndexSlice[:, ["A"]] for subset in slices: result = non_reducing_slice(subset) tm.assert_frame_equal(df.loc[result], df.loc[expected]) def test_maybe_numeric_slice(self): df = DataFrame({"A": [1, 2], "B": ["c", "d"], "C": [True, False]}) result = maybe_numeric_slice(df, slice_=None) expected = pd.IndexSlice[:, ["A"]] assert result == expected result = maybe_numeric_slice(df, None, include_bool=True) expected = pd.IndexSlice[:, ["A", "C"]] result = maybe_numeric_slice(df, [1]) expected = [1] assert result == expected def test_partial_boolean_frame_indexing(self): # GH 17170 df = DataFrame( np.arange(9.0).reshape(3, 3), index=list("abc"), columns=list("ABC") ) index_df = DataFrame(1, index=list("ab"), columns=list("AB")) result = df[index_df.notnull()] expected = DataFrame( np.array([[0.0, 1.0, np.nan], [3.0, 4.0, np.nan], [np.nan] * 3]), index=list("abc"), columns=list("ABC"), ) tm.assert_frame_equal(result, expected) def test_no_reference_cycle(self): df = DataFrame({"a": [0, 1], "b": [2, 3]}) for name in ("loc", "iloc", "at", "iat"): getattr(df, name) wr = weakref.ref(df) del df assert wr() is None
def test_cat_on_bytes_raises(): lhs = Series(np.array(list("abc"), "S1").astype(object)) rhs = Series(np.array(list("def"), "S1").astype(object)) msg = "Cannot use .str.cat with values of inferred dtype 'bytes'" with pytest.raises(TypeError, match=msg): lhs.str.cat(rhs)
def test_slice_with_zero_step_raises(self): s = Series(np.arange(20), index=_mklbl("A", 20)) with pytest.raises(ValueError, match="slice step cannot be zero"): s[::0] with pytest.raises(ValueError, match="slice step cannot be zero"): s.loc[::0]
def test_str_accessor_in_apply_func(): # https://github.com/pandas-dev/pandas/issues/38979 df = DataFrame(zip("abc", "def")) expected = Series(["A/D", "B/E", "C/F"]) result = df.apply(lambda f: "/".join(f.str.upper()), axis=1) tm.assert_series_equal(result, expected)
def test_coercion_with_loc_and_series(self, start_data, expected_result): start_series = Series(start_data) start_series.loc[start_series == start_series[0]] = None expected_series = Series(expected_result) tm.assert_series_equal(start_series, expected_series)
# -*- coding: utf-8 -*- """ Created on Mon Dec 25 08:20:27 2017 by Dhiraj Upadhyaya """ #Categories import pandas as pd from pandas import Series import numpy as np s = Series(["a", "b", "c", "a"], dtype="category") s s.cat.categories s.cat.ordered type(s) df = pd.DataFrame({'A': ['a', 'b', 'c', 'a']}) df df['B'] = df['A'].astype('category') df type(df['B']) #pd.core.common.is_categorical_dtype(df.A.dtype) df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) df labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] df = pd.DataFrame({ 'a': [1, 2, 3, 4, 5], 'b': ['yes', 'no', 'yes', 'no', 'absent'] })
def test_replace_mixed(self): mf = self.mixed_frame mf.iloc[5:20, mf.columns.get_loc('foo')] = np.nan mf.iloc[-10:, mf.columns.get_loc('A')] = np.nan result = self.mixed_frame.replace(np.nan, -18) expected = self.mixed_frame.fillna(value=-18) assert_frame_equal(result, expected) assert_frame_equal(result.replace(-18, np.nan), self.mixed_frame) result = self.mixed_frame.replace(np.nan, -1e8) expected = self.mixed_frame.fillna(value=-1e8) assert_frame_equal(result, expected) assert_frame_equal(result.replace(-1e8, np.nan), self.mixed_frame) # int block upcasting df = DataFrame({ 'A': Series([1.0, 2.0], dtype='float64'), 'B': Series([0, 1], dtype='int64') }) expected = DataFrame({ 'A': Series([1.0, 2.0], dtype='float64'), 'B': Series([0.5, 1], dtype='float64') }) result = df.replace(0, 0.5) assert_frame_equal(result, expected) df.replace(0, 0.5, inplace=True) assert_frame_equal(df, expected) # int block splitting df = DataFrame({ 'A': Series([1.0, 2.0], dtype='float64'), 'B': Series([0, 1], dtype='int64'), 'C': Series([1, 2], dtype='int64') }) expected = DataFrame({ 'A': Series([1.0, 2.0], dtype='float64'), 'B': Series([0.5, 1], dtype='float64'), 'C': Series([1, 2], dtype='int64') }) result = df.replace(0, 0.5) assert_frame_equal(result, expected) # to object block upcasting df = DataFrame({ 'A': Series([1.0, 2.0], dtype='float64'), 'B': Series([0, 1], dtype='int64') }) expected = DataFrame({ 'A': Series([1, 'foo'], dtype='object'), 'B': Series([0, 1], dtype='int64') }) result = df.replace(2, 'foo') assert_frame_equal(result, expected) expected = DataFrame({ 'A': Series(['foo', 'bar'], dtype='object'), 'B': Series([0, 'foo'], dtype='object') }) result = df.replace([1, 2], ['foo', 'bar']) assert_frame_equal(result, expected) # test case from df = DataFrame({ 'A': Series([3, 0], dtype='int64'), 'B': Series([0, 3], dtype='int64') }) result = df.replace(3, df.mean().to_dict()) expected = df.copy().astype('float64') m = df.mean() expected.iloc[0, 0] = m[0] expected.iloc[1, 1] = m[1] assert_frame_equal(result, expected)
def test_missing_labels_inside_loc_matched_in_error_message(): # GH34272 s = Series({"a": 1, "b": 2, "c": 3}) error_message_regex = "missing_0.*missing_1.*missing_2" with pytest.raises(KeyError, match=error_message_regex): s.loc[["a", "b", "missing_0", "c", "missing_1", "missing_2"]]
def test_get_loc_single_level(self, single_level_multiindex): single_level = single_level_multiindex s = Series(np.random.randn(len(single_level)), index=single_level) for k in single_level.values: s[k]
class TestFancy: """ pure get/set item & fancy indexing """ def test_setitem_ndarray_1d(self): # GH5508 # len of indexer vs length of the 1d ndarray df = DataFrame(index=Index(np.arange(1, 11))) df["foo"] = np.zeros(10, dtype=np.float64) df["bar"] = np.zeros(10, dtype=complex) # invalid msg = ( "cannot set using a multi-index selection " "indexer with a different length than the value" ) with pytest.raises(ValueError, match=msg): df.loc[df.index[2:5], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0]) # valid df.loc[df.index[2:6], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0]) result = df.loc[df.index[2:6], "bar"] expected = Series( [2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], name="bar" ) tm.assert_series_equal(result, expected) # dtype getting changed? df = DataFrame(index=Index(np.arange(1, 11))) df["foo"] = np.zeros(10, dtype=np.float64) df["bar"] = np.zeros(10, dtype=complex) msg = "Must have equal len keys and value when setting with an iterable" with pytest.raises(ValueError, match=msg): df[2:5] = np.arange(1, 4) * 1j @pytest.mark.parametrize( "obj", [ lambda i: Series(np.arange(len(i)), index=i), lambda i: DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), ], ids=["Series", "DataFrame"], ) @pytest.mark.parametrize( "idxr, idxr_id", [ (lambda x: x, "getitem"), (lambda x: x.loc, "loc"), (lambda x: x.iloc, "iloc"), ], ) def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): # GH 25567 obj = obj(index) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) msg = "|".join( [ r"Buffer has wrong number of dimensions \(expected 1, got 3\)", "Cannot index with multidimensional key", r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]", "Index data must be 1-dimensional", "positional indexers are out-of-bounds", "Indexing a MultiIndex with a multidimensional key is not implemented", ] ) potential_errors = (IndexError, ValueError, NotImplementedError) with pytest.raises(potential_errors, match=msg): with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): idxr[nd3] @pytest.mark.parametrize( "obj", [ lambda i: Series(np.arange(len(i)), index=i), lambda i: DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), ], ids=["Series", "DataFrame"], ) @pytest.mark.parametrize( "idxr, idxr_id", [ (lambda x: x, "setitem"), (lambda x: x.loc, "loc"), (lambda x: x.iloc, "iloc"), ], ) def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): # GH 25567 obj = obj(index) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) if (len(index) == 0) and (idxr_id == "iloc") and isinstance(obj, pd.DataFrame): # gh-32896 pytest.skip("This is currently failing. There's an xfailed test below.") if idxr_id == "iloc": err = ValueError msg = f"Cannot set values with ndim > {obj.ndim}" elif ( isinstance(index, pd.IntervalIndex) and idxr_id == "setitem" and obj.ndim == 1 ): err = AttributeError msg = ( "'pandas._libs.interval.IntervalTree' object has no attribute 'get_loc'" ) else: err = ValueError msg = r"Buffer has wrong number of dimensions \(expected 1, got 3\)|" with pytest.raises(err, match=msg): idxr[nd3] = 0 @pytest.mark.xfail(reason="gh-32896") def test_setitem_ndarray_3d_does_not_fail_for_iloc_empty_dataframe(self): # when fixing this, please remove the pytest.skip in test_setitem_ndarray_3d i = Index([]) obj = DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i) nd3 = np.random.randint(5, size=(2, 2, 2)) msg = f"Cannot set values with ndim > {obj.ndim}" with pytest.raises(ValueError, match=msg): obj.iloc[nd3] = 0 def test_inf_upcast(self): # GH 16957 # We should be able to use np.inf as a key # np.inf should cause an index to convert to float # Test with np.inf in rows df = DataFrame(columns=[0]) df.loc[1] = 1 df.loc[2] = 2 df.loc[np.inf] = 3 # make sure we can look up the value assert df.loc[np.inf, 0] == 3 result = df.index expected = pd.Float64Index([1, 2, np.inf]) tm.assert_index_equal(result, expected) # Test with np.inf in columns df = DataFrame() df.loc[0, 0] = 1 df.loc[1, 1] = 2 df.loc[0, np.inf] = 3 result = df.columns expected = pd.Float64Index([0, 1, np.inf]) tm.assert_index_equal(result, expected) def test_setitem_dtype_upcast(self): # GH3216 df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) df["c"] = np.nan assert df["c"].dtype == np.float64 df.loc[0, "c"] = "foo" expected = DataFrame( [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}] ) tm.assert_frame_equal(df, expected) # GH10280 df = DataFrame( np.arange(6, dtype="int64").reshape(2, 3), index=list("ab"), columns=["foo", "bar", "baz"], ) for val in [3.14, "wxyz"]: left = df.copy() left.loc["a", "bar"] = val right = DataFrame( [[0, val, 2], [3, 4, 5]], index=list("ab"), columns=["foo", "bar", "baz"], ) tm.assert_frame_equal(left, right) assert is_integer_dtype(left["foo"]) assert is_integer_dtype(left["baz"]) left = DataFrame( np.arange(6, dtype="int64").reshape(2, 3) / 10.0, index=list("ab"), columns=["foo", "bar", "baz"], ) left.loc["a", "bar"] = "wxyz" right = DataFrame( [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]], index=list("ab"), columns=["foo", "bar", "baz"], ) tm.assert_frame_equal(left, right) assert is_float_dtype(left["foo"]) assert is_float_dtype(left["baz"]) def test_dups_fancy_indexing(self): # GH 3455 df = tm.makeCustomDataframe(10, 3) df.columns = ["a", "a", "b"] result = df[["b", "a"]].columns expected = Index(["b", "a", "a"]) tm.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")) df.head() str(df) result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]]) result.columns = list("aaaaaaa") # TODO(wesm): unused? df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")}, index=["A", "A", "B", "C"], ) rows = ["C", "B"] expected = DataFrame( {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows ) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ["C", "B", "E"] with pytest.raises(KeyError, match="with any missing labels"): df.loc[rows] # see GH5553, make sure we use the right indexer rows = ["F", "G", "H", "C", "B", "E"] with pytest.raises(KeyError, match="with any missing labels"): df.loc[rows] # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD")) with pytest.raises( KeyError, match=re.escape( "\"None of [Index(['E'], dtype='object')] are in the [index]\"" ), ): dfnu.loc[["E"]] # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) with pytest.raises(KeyError, match="with any missing labels"): df.loc[[0, 8, 0]] df = DataFrame({"A": list("abc")}) with pytest.raises(KeyError, match="with any missing labels"): df.loc[[0, 8, 0]] # non unique with non unique selector df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"]) with pytest.raises(KeyError, match="with any missing labels"): df.loc[["A", "A", "E"]] def test_dups_fancy_indexing2(self): # GH 5835 # dups on index and missing values df = DataFrame(np.random.randn(5, 5), columns=["A", "B", "B", "B", "A"]) with pytest.raises(KeyError, match="with any missing labels"): df.loc[:, ["A", "B", "C"]] # GH 6504, multi-axis indexing df = DataFrame( np.random.randn(9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=["a", "b"] ) expected = df.iloc[0:6] result = df.loc[[1, 2]] tm.assert_frame_equal(result, expected) expected = df result = df.loc[:, ["a", "b"]] tm.assert_frame_equal(result, expected) expected = df.iloc[0:6, :] result = df.loc[[1, 2], ["a", "b"]] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("case", [lambda s: s, lambda s: s.loc]) def test_duplicate_int_indexing(self, case): # GH 17347 s = Series(range(3), index=[1, 1, 3]) expected = s[1] result = case(s)[[1]] tm.assert_series_equal(result, expected) def test_indexing_mixed_frame_bug(self): # GH3492 df = DataFrame( {"a": {1: "aaa", 2: "bbb", 3: "ccc"}, "b": {1: 111, 2: 222, 3: 333}} ) # this works, new column is created correctly df["test"] = df["a"].apply(lambda x: "_" if x == "aaa" else x) # this does not work, ie column test is not changed idx = df["test"] == "_" temp = df.loc[idx, "a"].apply(lambda x: "-----" if x == "aaa" else x) df.loc[idx, "test"] = temp assert df.iloc[0, 2] == "-----" def test_multitype_list_index_access(self): # GH 10610 df = DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23]) with pytest.raises(KeyError, match=re.escape("'[-8, 26] not in index'")): df[[22, 26, -8]] assert df[21].shape[0] == df.shape[0] def test_set_index_nan(self): # GH 3586 df = DataFrame( { "PRuid": { 17: "nonQC", 18: "nonQC", 19: "nonQC", 20: "10", 21: "11", 22: "12", 23: "13", 24: "24", 25: "35", 26: "46", 27: "47", 28: "48", 29: "59", 30: "10", }, "QC": { 17: 0.0, 18: 0.0, 19: 0.0, 20: np.nan, 21: np.nan, 22: np.nan, 23: np.nan, 24: 1.0, 25: np.nan, 26: np.nan, 27: np.nan, 28: np.nan, 29: np.nan, 30: np.nan, }, "data": { 17: 7.9544899999999998, 18: 8.0142609999999994, 19: 7.8591520000000008, 20: 0.86140349999999999, 21: 0.87853110000000001, 22: 0.8427041999999999, 23: 0.78587700000000005, 24: 0.73062459999999996, 25: 0.81668560000000001, 26: 0.81927080000000008, 27: 0.80705009999999999, 28: 0.81440240000000008, 29: 0.80140849999999997, 30: 0.81307740000000006, }, "year": { 17: 2006, 18: 2007, 19: 2008, 20: 1985, 21: 1985, 22: 1985, 23: 1985, 24: 1985, 25: 1985, 26: 1985, 27: 1985, 28: 1985, 29: 1985, 30: 1986, }, } ).reset_index() result = ( df.set_index(["year", "PRuid", "QC"]) .reset_index() .reindex(columns=df.columns) ) tm.assert_frame_equal(result, df) def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df df = DataFrame( { "FC": ["a", "b", "a", "b", "a", "b"], "PF": [0, 0, 0, 0, 1, 1], "col1": list(range(6)), "col2": list(range(6, 12)), } ) df.iloc[1, 0] = np.nan df2 = df.copy() mask = ~df2.FC.isna() cols = ["col1", "col2"] dft = df2 * 2 dft.iloc[3, 3] = np.nan expected = DataFrame( { "FC": ["a", np.nan, "a", "b", "a", "b"], "PF": [0, 0, 0, 0, 1, 1], "col1": Series([0, 1, 4, 6, 8, 10]), "col2": [12, 7, 16, np.nan, 20, 22], } ) # frame on rhs df2.loc[mask, cols] = dft.loc[mask, cols] tm.assert_frame_equal(df2, expected) df2.loc[mask, cols] = dft.loc[mask, cols] tm.assert_frame_equal(df2, expected) # with an ndarray on rhs # coerces to float64 because values has float64 dtype # GH 14001 expected = DataFrame( { "FC": ["a", np.nan, "a", "b", "a", "b"], "PF": [0, 0, 0, 0, 1, 1], "col1": [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], "col2": [12, 7, 16, np.nan, 20, 22], } ) df2 = df.copy() df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) # broadcasting on the rhs is required df = DataFrame( dict( A=[1, 2, 0, 0, 0], B=[0, 0, 0, 10, 11], C=[0, 0, 0, 10, 11], D=[3, 4, 5, 6, 7], ) ) expected = df.copy() mask = expected["A"] == 0 for col in ["A", "B"]: expected.loc[mask, col] = df["D"] df.loc[df["A"] == 0, ["A", "B"]] = df["D"] tm.assert_frame_equal(df, expected) def test_setitem_list(self): # GH 6043 # iloc with a list df = DataFrame(index=[0, 1], columns=[0]) df.iloc[1, 0] = [1, 2, 3] df.iloc[1, 0] = [1, 2] result = DataFrame(index=[0, 1], columns=[0]) result.iloc[1, 0] = [1, 2] tm.assert_frame_equal(result, df) # iloc with an object class TO: def __init__(self, value): self.value = value def __str__(self) -> str: return f"[{self.value}]" __repr__ = __str__ def __eq__(self, other) -> bool: return self.value == other.value def view(self): return self df = DataFrame(index=[0, 1], columns=[0]) df.iloc[1, 0] = TO(1) df.iloc[1, 0] = TO(2) result = DataFrame(index=[0, 1], columns=[0]) result.iloc[1, 0] = TO(2) tm.assert_frame_equal(result, df) # remains object dtype even after setting it back df = DataFrame(index=[0, 1], columns=[0]) df.iloc[1, 0] = TO(1) df.iloc[1, 0] = np.nan result = DataFrame(index=[0, 1], columns=[0]) tm.assert_frame_equal(result, df) def test_string_slice(self): # GH 14424 # string indexing against datetimelike with object # dtype should properly raises KeyError df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object)) assert df.index._is_all_dates with pytest.raises(KeyError, match="'2011'"): df["2011"] with pytest.raises(KeyError, match="'2011'"): with tm.assert_produces_warning(FutureWarning): # This does an is_all_dates check df.loc["2011", 0] df = DataFrame() assert not df.index._is_all_dates with pytest.raises(KeyError, match="'2011'"): df["2011"] with pytest.raises(KeyError, match="'2011'"): df.loc["2011", 0] def test_astype_assignment(self): # GH4312 (iloc) df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) df = df_orig.copy() df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) tm.assert_frame_equal(df, expected) df = df_orig.copy() df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True) expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) tm.assert_frame_equal(df, expected) # GH5702 (loc) df = df_orig.copy() df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64) expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) tm.assert_frame_equal(df, expected) # full replacements / no nans df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) df.iloc[:, 0] = df["A"].astype(np.int64) expected = DataFrame({"A": [1, 2, 3, 4]}) tm.assert_frame_equal(df, expected) df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) df.loc[:, "A"] = df["A"].astype(np.int64) expected = DataFrame({"A": [1, 2, 3, 4]}) tm.assert_frame_equal(df, expected) def test_index_type_coercion(self): # GH 11836 # if we have an index type and set it with something that looks # to numpy like the same, but is actually, not # (e.g. setting with a float or string '0') # then we need to coerce to object # integer indexes for s in [Series(range(5)), Series(range(5), index=range(1, 6))]: assert s.index.is_integer() for indexer in [lambda x: x.loc, lambda x: x]: s2 = s.copy() indexer(s2)[0.1] = 0 assert s2.index.is_floating() assert indexer(s2)[0.1] == 0 s2 = s.copy() indexer(s2)[0.0] = 0 exp = s.index if 0 not in s: exp = Index(s.index.tolist() + [0]) tm.assert_index_equal(s2.index, exp) s2 = s.copy() indexer(s2)["0"] = 0 assert s2.index.is_object() for s in [Series(range(5), index=np.arange(5.0))]: assert s.index.is_floating() for idxr in [lambda x: x.loc, lambda x: x]: s2 = s.copy() idxr(s2)[0.1] = 0 assert s2.index.is_floating() assert idxr(s2)[0.1] == 0 s2 = s.copy() idxr(s2)[0.0] = 0 tm.assert_index_equal(s2.index, s.index) s2 = s.copy() idxr(s2)["0"] = 0 assert s2.index.is_object()
def setup(self, dtype): N = 10**5 data = np.array([1] * N + [2] * N + [3] * N).astype(dtype) self.s = Series(data)
def test_duplicate_int_indexing(self, case): # GH 17347 s = Series(range(3), index=[1, 1, 3]) expected = s[1] result = case(s)[[1]] tm.assert_series_equal(result, expected)
def setup(self, n): self.s = Series(np.random.randn(n))
def test_repeat_with_null(any_string_dtype, arg, repeat): # GH: 31632 ser = Series(["a", arg], dtype=any_string_dtype) result = ser.str.repeat([3, repeat]) expected = Series(["aaa", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected)