def test_endswith_nullable_string_dtype(nullable_string_dtype, na):
    """Check ``Series.str.endswith`` with an ``na`` fill on nullable string dtypes.

    Missing entries are expected to come back as ``na`` and the result is
    expected to use the ``boolean`` dtype.  The second pattern ends in ``.``;
    the expected values treat that dot literally (``"regex"`` does not match).
    """
    # The PerformanceWarning is only anticipated for pyarrow-backed strings
    # when ``pa_version_under4p0`` is set.
    expect_perf_warning = (
        nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0
    )
    ser = Series(
        ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
        dtype=nullable_string_dtype,
    )

    with tm.maybe_produces_warning(PerformanceWarning, expect_perf_warning):
        outcome = ser.str.endswith("foo", na=na)
    wanted = Series(
        [False, na, False, False, True, na, True, False, False], dtype="boolean"
    )
    tm.assert_series_equal(outcome, wanted)

    with tm.maybe_produces_warning(PerformanceWarning, expect_perf_warning):
        outcome = ser.str.endswith("rege.", na=na)
    wanted = Series(
        [False, na, False, False, False, na, False, False, True], dtype="boolean"
    )
    tm.assert_series_equal(outcome, wanted)
def test_nunique_null(null_obj, index_or_series_obj):
    """Check ``nunique`` after writing ``null_obj`` into the first two slots.

    The object is rebuilt from its values repeated 1, 2, ... times.  Objects
    rejected by ``allow_na_ops`` and ``MultiIndex`` objects are skipped.
    """
    source = index_or_series_obj

    # Guard clauses: pytest.skip raises, so these behave like if/elif.
    if not allow_na_ops(source):
        pytest.skip("type doesn't allow for NA operations")
    if isinstance(source, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    raw = source._values
    raw[0:2] = null_obj

    constructor = type(source)
    repeated = np.repeat(raw, range(1, len(raw) + 1))
    rebuilt = constructor(repeated, dtype=source.dtype)

    if isinstance(rebuilt, pd.CategoricalIndex):
        n_categories = len(rebuilt.categories)
        assert rebuilt.nunique() == n_categories
        assert rebuilt.nunique(dropna=False) == n_categories + 1
        return

    expect_perf_warning = (
        pa_version_under2p0
        and str(index_or_series_obj.dtype) == "string[pyarrow]"
    )

    with tm.maybe_produces_warning(PerformanceWarning, expect_perf_warning):
        n_unique = len(rebuilt.unique())

    # Dropping NA removes one distinct value from the count.
    with tm.maybe_produces_warning(PerformanceWarning, expect_perf_warning):
        assert rebuilt.nunique() == max(0, n_unique - 1)

    with tm.maybe_produces_warning(PerformanceWarning, expect_perf_warning):
        assert rebuilt.nunique(dropna=False) == max(0, n_unique)
def test_replace_moar(any_string_dtype):
    """Check ``Series.str.replace`` for plain, case-insensitive and regex calls."""
    # PR #1179
    is_pyarrow = any_string_dtype == "string[pyarrow]"
    is_old_pyarrow = is_pyarrow and pa_version_under4p0

    ser = Series(
        ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
        dtype=any_string_dtype,
    )

    # Each case: (positional args, keyword args, warning condition, expected).
    cases = [
        (
            ("A", "YYY"),
            {},
            is_old_pyarrow,
            ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"],
        ),
        (
            ("A", "YYY"),
            {"case": False},
            is_pyarrow,
            [
                "YYY",
                "B",
                "C",
                "YYYYYYbYYY",
                "BYYYcYYY",
                "",
                np.nan,
                "CYYYBYYY",
                "dog",
                "cYYYt",
            ],
        ),
        (
            ("^.a|dog", "XX-XX "),
            {"case": False, "regex": True},
            is_pyarrow,
            [
                "A",
                "B",
                "C",
                "XX-XX ba",
                "XX-XX ca",
                "",
                np.nan,
                "XX-XX BA",
                "XX-XX ",
                "XX-XX t",
            ],
        ),
    ]

    for args, kwargs, warn_condition, wanted_values in cases:
        with tm.maybe_produces_warning(PerformanceWarning, warn_condition):
            outcome = ser.str.replace(*args, **kwargs)
        wanted = Series(wanted_values, dtype=any_string_dtype)
        tm.assert_series_equal(outcome, wanted)
def test_isin(dtype, fixed_now_ts):
    """Check ``Series.isin`` on a three-element string Series with a missing value.

    Covers a partial match, a candidate list containing ``pd.NA``, an empty
    candidate list, and a candidate list mixing a string with ``fixed_now_ts``.
    """
    ser = pd.Series(["a", "b", None], dtype=dtype)

    # Each case: (candidate values, expected membership flags).
    cases = [
        (["a", "c"], [True, False, False]),
        (["a", pd.NA], [True, False, True]),
        ([], [False, False, False]),
        (["a", fixed_now_ts], [True, False, False]),
    ]

    for candidates, flags in cases:
        # NOTE(review): this compares the ``dtype`` fixture itself with the bare
        # string "pyarrow"; confirm the comparison can ever be true for the
        # fixture values in use (a storage check may have been intended).
        with tm.maybe_produces_warning(
            PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0
        ):
            outcome = ser.isin(candidates)
        tm.assert_series_equal(outcome, pd.Series(flags))
def test_contains_moar(any_string_dtype):
    """Check ``Series.str.contains`` for several patterns, with and without ``case``.

    The expected dtype is ``object`` for object-dtype input and ``boolean``
    otherwise; the missing entry is expected to stay missing.
    """
    # PR #1179
    is_pyarrow = any_string_dtype == "string[pyarrow]"
    is_old_pyarrow = is_pyarrow and pa_version_under4p0

    if any_string_dtype == "object":
        result_dtype = "object"
    else:
        result_dtype = "boolean"

    ser = Series(
        ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
        dtype=any_string_dtype,
    )

    # Each case: (pattern, keyword args, warning condition, expected values).
    cases = [
        (
            "a",
            {},
            is_old_pyarrow,
            [False, False, False, True, True, False, np.nan, False, False, True],
        ),
        (
            "a",
            {"case": False},
            is_pyarrow,
            [True, False, False, True, True, False, np.nan, True, False, True],
        ),
        (
            "Aa",
            {},
            is_old_pyarrow,
            [False, False, False, True, False, False, np.nan, False, False, False],
        ),
        (
            "ba",
            {},
            is_old_pyarrow,
            [False, False, False, True, False, False, np.nan, False, False, False],
        ),
        (
            "ba",
            {"case": False},
            is_pyarrow,
            [False, False, False, True, True, False, np.nan, True, False, False],
        ),
    ]

    for pattern, kwargs, warn_condition, wanted_values in cases:
        with tm.maybe_produces_warning(PerformanceWarning, warn_condition):
            outcome = ser.str.contains(pattern, **kwargs)
        wanted = Series(wanted_values, dtype=result_dtype)
        tm.assert_series_equal(outcome, wanted)
def test_replace_not_case_sensitive_not_regex(any_string_dtype):
    """Check ``str.replace`` with ``case=False`` combined with ``regex=False``.

    The second call uses the pattern ``"a."``; the expected values only change
    the entries containing a literal dot.
    """
    # https://github.com/pandas-dev/pandas/issues/41602
    is_pyarrow = any_string_dtype == "string[pyarrow]"
    ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype)

    with tm.maybe_produces_warning(PerformanceWarning, is_pyarrow):
        outcome = ser.str.replace("a", "c", case=False, regex=False)
    wanted = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(outcome, wanted)

    with tm.maybe_produces_warning(PerformanceWarning, is_pyarrow):
        outcome = ser.str.replace("a.", "c.", case=False, regex=False)
    wanted = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_match(any_string_dtype):
    """Check ``Series.str.match`` against several inputs and patterns.

    The expected dtype is ``object`` for object-dtype input and ``boolean``
    otherwise; missing entries are expected to stay missing.
    """
    # New match behavior introduced in 0.13
    if any_string_dtype == "object":
        result_dtype = "object"
    else:
        result_dtype = "boolean"

    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0

    plain = ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]
    with_caret = ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"]

    # Each case: (input data, pattern, expected values).
    cases = [
        (
            ["fooBAD__barBAD", np.nan, "foo"],
            ".*(BAD[_]+).*(BAD)",
            [True, np.nan, False],
        ),
        (plain, ".*BAD[_]+.*BAD", [True, True, np.nan, False]),
        (plain, "BAD[_]+.*BAD", [False, True, np.nan, False]),
        (with_caret, "^BAD[_]+.*BAD", [False, False, np.nan, False]),
        (with_caret, "\\^BAD[_]+.*BAD", [False, True, np.nan, False]),
    ]

    for data, pattern, wanted_values in cases:
        ser = Series(data, dtype=any_string_dtype)
        with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
            outcome = ser.str.match(pattern)
        wanted = Series(wanted_values, dtype=result_dtype)
        tm.assert_series_equal(outcome, wanted)
def test_unique(index_or_series_obj):
    """Check ``unique`` on the fixture object with element i repeated i+1 times.

    The expectation is rebuilt from ``dict.fromkeys`` over the repeated values
    and compared with an assertion chosen by the kind of object.
    """
    repeated = np.repeat(index_or_series_obj, range(1, len(index_or_series_obj) + 1))

    expect_perf_warning = (
        pa_version_under2p0
        and str(index_or_series_obj.dtype) == "string[pyarrow]"
    )
    with tm.maybe_produces_warning(PerformanceWarning, expect_perf_warning):
        outcome = repeated.unique()

    # dict.fromkeys preserves the order
    distinct = list(dict.fromkeys(repeated.values))

    if isinstance(repeated, pd.MultiIndex):
        wanted = pd.MultiIndex.from_tuples(distinct)
        wanted.names = repeated.names
        tm.assert_index_equal(outcome, wanted, exact=True)
        return

    if not isinstance(repeated, pd.Index):
        # Non-Index objects are compared as plain numpy arrays.
        wanted = np.array(distinct)
        tm.assert_numpy_array_equal(outcome, wanted)
        return

    if repeated._is_backward_compat_public_numeric_index:
        wanted = NumericIndex(distinct, dtype=repeated.dtype)
        tm.assert_index_equal(outcome, wanted, exact=True)
        return

    wanted = pd.Index(distinct, dtype=repeated.dtype)
    if is_datetime64tz_dtype(repeated.dtype):
        wanted = wanted.normalize()
    tm.assert_index_equal(outcome, wanted, exact=True)
def test_replace_regex_single_character(regex, any_string_dtype):
    """Check that ``str.replace(".", "a")`` replaces literal dots for every ``regex``.

    With ``regex=None`` a ``FutureWarning`` carrying the message below is
    required; otherwise only a conditional ``PerformanceWarning`` is allowed for.
    """
    # https://github.com/pandas-dev/pandas/pull/24809
    # The current behavior is to treat single character patterns as literal strings,
    # even when ``regex`` is set to ``True``.
    ser = Series(["a.b", ".", "b", np.nan, ""], dtype=any_string_dtype)
    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0

    if regex is not None:
        warning_ctx = tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow)
    else:
        future_msg = re.escape(
            "The default value of regex will change from True to False in a future "
            "version. In addition, single character regular expressions will *not* "
            "be treated as literal strings when regex=True."
        )
        # Extra warnings are tolerated only in the old-pyarrow configuration.
        warning_ctx = tm.assert_produces_warning(
            FutureWarning,
            match=future_msg,
            raise_on_extra_warnings=not is_old_pyarrow,
        )

    with warning_ctx:
        outcome = ser.str.replace(".", "a", regex=regex)

    wanted = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_replace_compiled_regex(any_string_dtype):
    """Check ``str.replace`` with a compiled pattern, unlimited and with ``n=1``."""
    # GH 15446
    is_pyarrow = any_string_dtype == "string[pyarrow]"
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

    # test with compiled regex
    compiled = re.compile(r"BAD_*")

    with tm.maybe_produces_warning(PerformanceWarning, is_pyarrow):
        outcome = ser.str.replace(compiled, "", regex=True)
    wanted = Series(["foobar", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(outcome, wanted)

    with tm.maybe_produces_warning(PerformanceWarning, is_pyarrow):
        outcome = ser.str.replace(compiled, "", n=1, regex=True)
    wanted = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_match_case_kwarg(any_string_dtype):
    """Check ``str.match`` with ``case=False``: every entry is expected to match."""
    if any_string_dtype == "object":
        result_dtype = np.bool_
    else:
        result_dtype = "boolean"

    ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
    is_pyarrow = any_string_dtype == "string[pyarrow]"

    with tm.maybe_produces_warning(PerformanceWarning, is_pyarrow):
        outcome = ser.str.match("ab", case=False)

    wanted = Series([True, True, True, True], dtype=result_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_replace_compiled_regex_unicode(any_string_dtype):
    """Check ``str.replace`` with a compiled ``re.UNICODE`` look-around pattern.

    The comma between two word characters (the second one non-ASCII) is
    expected to be replaced by ``", "``.
    """
    source_text = b"abcd,\xc3\xa0".decode("utf-8")
    wanted_text = b"abcd, \xc3\xa0".decode("utf-8")

    ser = Series([source_text], dtype=any_string_dtype)
    wanted = Series([wanted_text], dtype=any_string_dtype)

    compiled = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
    is_pyarrow = any_string_dtype == "string[pyarrow]"

    with tm.maybe_produces_warning(PerformanceWarning, is_pyarrow):
        outcome = ser.str.replace(compiled, ", ")
    tm.assert_series_equal(outcome, wanted)
def test_nunique(index_or_series_obj):
    """Check that ``nunique(dropna=False)`` equals ``len(unique())``.

    The fixture object is first expanded so that element i appears i+1 times.
    """
    repeat_counts = range(1, len(index_or_series_obj) + 1)
    repeated = np.repeat(index_or_series_obj, repeat_counts)

    expect_perf_warning = (
        pa_version_under2p0
        and str(index_or_series_obj.dtype) == "string[pyarrow]"
    )
    with tm.maybe_produces_warning(PerformanceWarning, expect_perf_warning):
        n_distinct = len(repeated.unique())

    assert repeated.nunique(dropna=False) == n_distinct
def test_replace_max_replacements(any_string_dtype):
    """Check that ``n=1`` limits ``str.replace`` to one substitution.

    Exercised once with a regex pattern and once with a literal pattern.
    """
    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

    # Regex pattern: the first "BAD" plus its trailing underscores is removed.
    wanted = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.replace("BAD[_]*", "", n=1, regex=True)
    tm.assert_series_equal(outcome, wanted)

    # Literal pattern: only the first "BAD" is removed.
    wanted = Series(["foo__barBAD", np.nan], dtype=any_string_dtype)
    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.replace("BAD", "", n=1, regex=False)
    tm.assert_series_equal(outcome, wanted)
def test_dropna_array(self, data_missing):
    """Check that ``dropna`` on ``data_missing`` equals selecting position 1."""
    # The PerformanceWarning is only anticipated for pyarrow storage when
    # ``pa_version_under6p0`` is set.
    expect_perf_warning = (
        pa_version_under6p0 and data_missing.dtype.storage == "pyarrow"
    )
    with tm.maybe_produces_warning(PerformanceWarning, expect_perf_warning):
        outcome = data_missing.dropna()

    wanted = data_missing[[1]]
    self.assert_extension_array_equal(outcome, wanted)
def test_replace_callable_named_groups(any_string_dtype):
    """Check ``str.replace`` with a callable that reads a named regex group."""
    # test regex named groups
    ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)
    pattern = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"

    def swap_middle(match):
        # Replacement text is the case-swapped "middle" group.
        return match.group("middle").swapcase()

    is_pyarrow = any_string_dtype == "string[pyarrow]"
    with tm.maybe_produces_warning(PerformanceWarning, is_pyarrow):
        outcome = ser.str.replace(pattern, swap_middle, regex=True)

    wanted = Series(["bAR", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp):
    """Check the string accessor method named ``method`` called with ``"x"``.

    ``exp`` supplies the expected values for that method.
    """
    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0
    ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype)

    strip_method = getattr(ser.str, method)
    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = strip_method("x")

    wanted = Series(exp, dtype=any_string_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_strip_lstrip_rstrip(any_string_dtype, method, exp):
    """Check the string accessor method named ``method`` called without arguments.

    ``exp`` supplies the expected values for that method.
    """
    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0
    ser = Series([" aa ", " bb \n", np.nan, "cc "], dtype=any_string_dtype)

    strip_method = getattr(ser.str, method)
    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = strip_method()

    wanted = Series(exp, dtype=any_string_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_replace_literal(regex, expected, any_string_dtype):
    """Check ``str.replace("f.", "ba")`` for the given ``regex`` setting.

    ``expected`` supplies the expected values for that setting.
    """
    # GH16808 literal replace (regex=False vs regex=True)
    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0

    ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype)
    wanted = Series(expected, dtype=any_string_dtype)

    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.replace("f.", "ba", regex=regex)
    tm.assert_series_equal(outcome, wanted)
def test_replace_compiled_regex_callable(any_string_dtype):
    """Check ``str.replace`` with a compiled pattern, a callable and ``n=2``."""
    # test with callable
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

    def swap_whole_match(match):
        # Replacement text is the case-swapped full match.
        return match.group(0).swapcase()

    compiled = re.compile("[a-z][A-Z]{2}")
    is_pyarrow = any_string_dtype == "string[pyarrow]"

    with tm.maybe_produces_warning(PerformanceWarning, is_pyarrow):
        outcome = ser.str.replace(compiled, swap_whole_match, n=2)

    wanted = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_replace_callable_raises(any_string_dtype, repl):
    """Check that ``str.replace`` with the callable ``repl`` raises ``TypeError``.

    The error message must match the argument-count pattern below.
    """
    # GH 15055
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
    is_pyarrow = any_string_dtype == "string[pyarrow]"

    # test with wrong number of arguments, raising an error
    arity_msg = (
        r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
        r"(?(3)required )positional arguments?"
    )

    # Same nesting as two stacked ``with`` statements: raises outermost.
    with pytest.raises(TypeError, match=arity_msg), tm.maybe_produces_warning(
        PerformanceWarning, is_pyarrow
    ):
        ser.str.replace("a", repl)
def test_fullmatch_na_kwarg(any_string_dtype):
    """Check ``str.fullmatch`` with ``na=False``: the missing entry becomes False."""
    if any_string_dtype == "object":
        result_dtype = np.bool_
    else:
        result_dtype = "boolean"

    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0
    ser = Series(
        ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"],
        dtype=any_string_dtype,
    )

    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False)

    wanted = Series([True, False, False, False], dtype=result_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_match_na_kwarg(any_string_dtype):
    """Check ``str.match`` with ``na=False`` and with the default ``na``.

    With ``na=False`` the missing entry is expected to be False; with the
    default it is expected to stay missing.
    """
    # GH #6609
    is_object = any_string_dtype == "object"
    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0
    ser = Series(["a", "b", np.nan], dtype=any_string_dtype)

    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.match("a", na=False)
    filled_dtype = np.bool_ if is_object else "boolean"
    wanted = Series([True, False, False], dtype=filled_dtype)
    tm.assert_series_equal(outcome, wanted)

    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.match("a")
    missing_dtype = "object" if is_object else "boolean"
    wanted = Series([True, False, np.nan], dtype=missing_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_contains_nan(any_string_dtype):
    """Check ``str.contains`` on an all-missing Series for several ``na`` values.

    Covers ``na=False``, ``na=True``, ``na="foo"`` and the default ``na``.
    """
    # PR #14171
    is_object = any_string_dtype == "object"
    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0
    ser = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype)

    filled_dtype = np.bool_ if is_object else "boolean"

    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.contains("foo", na=False)
    wanted = Series([False, False, False], dtype=filled_dtype)
    tm.assert_series_equal(outcome, wanted)

    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.contains("foo", na=True)
    wanted = Series([True, True, True], dtype=filled_dtype)
    tm.assert_series_equal(outcome, wanted)

    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.contains("foo", na="foo")
    # Object dtype is expected to keep the fill value as given; the other
    # dtypes are expected to yield True.
    if is_object:
        wanted = Series(["foo", "foo", "foo"], dtype=np.object_)
    else:
        wanted = Series([True, True, True], dtype="boolean")
    tm.assert_series_equal(outcome, wanted)

    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.contains("foo")
    missing_dtype = "object" if is_object else "boolean"
    wanted = Series([np.nan, np.nan, np.nan], dtype=missing_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_len(any_string_dtype):
    """Check ``Series.str.len`` including a missing value, a newline and "あ".

    The expected dtype is ``float64`` for object-dtype input and ``Int64``
    otherwise.
    """
    if any_string_dtype == "object":
        result_dtype = "float64"
    else:
        result_dtype = "Int64"

    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0
    ser = Series(
        ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"],
        dtype=any_string_dtype,
    )

    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.len()

    wanted = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=result_dtype)
    tm.assert_series_equal(outcome, wanted)
def test_contains_na_kwarg_for_nullable_string_dtype(
    nullable_string_dtype, na, expected, regex
):
    """Check ``str.contains("a", na=na, regex=regex)`` on nullable string dtypes.

    ``expected`` is the value anticipated in the position of the missing entry.
    """
    # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416
    ser = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype)

    # The warning is only anticipated on the regex path of old pyarrow.
    expect_perf_warning = (
        nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 and regex
    )
    with tm.maybe_produces_warning(PerformanceWarning, expect_perf_warning):
        outcome = ser.str.contains("a", na=na, regex=regex)

    wanted = Series([True, False, False, True, expected], dtype="boolean")
    tm.assert_series_equal(outcome, wanted)
def test_fullmatch_case_kwarg(any_string_dtype):
    """Check ``str.fullmatch`` with ``case=True``, ``case=False`` and ``re.IGNORECASE``.

    The ``case=False`` and ``flags=re.IGNORECASE`` calls share one expectation.
    """
    if any_string_dtype == "object":
        result_dtype = np.bool_
    else:
        result_dtype = "boolean"

    is_pyarrow = any_string_dtype == "string[pyarrow]"
    is_old_pyarrow = is_pyarrow and pa_version_under4p0
    ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)

    case_sensitive = Series([True, False, False, False], dtype=result_dtype)
    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        outcome = ser.str.fullmatch("ab", case=True)
    tm.assert_series_equal(outcome, case_sensitive)

    case_insensitive = Series([True, True, False, False], dtype=result_dtype)
    with tm.maybe_produces_warning(PerformanceWarning, is_pyarrow):
        outcome = ser.str.fullmatch("ab", case=False)
    tm.assert_series_equal(outcome, case_insensitive)

    with tm.maybe_produces_warning(PerformanceWarning, is_pyarrow):
        outcome = ser.str.fullmatch("ab", flags=re.IGNORECASE)
    tm.assert_series_equal(outcome, case_insensitive)
def test_pipe_failures(any_string_dtype):
    """Check that ``"|"`` works as a separator in ``split`` and a literal in ``replace``."""
    # #2119
    ser = Series(["A|B|C"], dtype=any_string_dtype)

    pieces = ser.str.split("|")
    wanted_pieces = Series([["A", "B", "C"]], dtype=object)
    tm.assert_series_equal(pieces, wanted_pieces)

    is_old_pyarrow = any_string_dtype == "string[pyarrow]" and pa_version_under4p0
    with tm.maybe_produces_warning(PerformanceWarning, is_old_pyarrow):
        replaced = ser.str.replace("|", " ", regex=False)
    wanted_replaced = Series(["A B C"], dtype=any_string_dtype)
    tm.assert_series_equal(replaced, wanted_replaced)
def test_flags_kwarg(any_string_dtype):
    """Check that ``flags=re.IGNORECASE`` is honoured by several ``.str`` methods.

    The pattern only lists upper-case letters, so lower-case addresses match
    only when the flag is passed through.  Covers ``extract``, ``match``,
    ``fullmatch``, ``findall``, ``count`` and ``contains``.
    """
    # The inputs must be address-shaped text: the assertions below require the
    # first entry to split into ("dave", "google", "com").  Masked placeholders
    # such as "*****@*****.**" contain characters outside the pattern's
    # character classes and can never match.
    data = {
        "Dave": "dave@google.com",
        "Steve": "steve@gmail.com",
        "Rob": "rob@gmail.com",
        "Wes": np.nan,
    }
    data = Series(data, dtype=any_string_dtype)

    pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

    using_pyarrow = any_string_dtype == "string[pyarrow]"

    result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
    assert result.iloc[0].tolist() == ["dave", "google", "com"]

    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow):
        result = data.str.match(pat, flags=re.IGNORECASE)
    assert result[0]

    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow):
        result = data.str.fullmatch(pat, flags=re.IGNORECASE)
    assert result[0]

    result = data.str.findall(pat, flags=re.IGNORECASE)
    assert result[0][0] == ("dave", "google", "com")

    result = data.str.count(pat, flags=re.IGNORECASE)
    assert result[0] == 1

    # ``contains`` with a grouped pattern must warn about the match groups;
    # extra warnings are tolerated only for pyarrow-backed strings.
    msg = "has match groups"
    with tm.assert_produces_warning(
        UserWarning, match=msg, raise_on_extra_warnings=not using_pyarrow
    ):
        result = data.str.contains(pat, flags=re.IGNORECASE)
    assert result[0]
def test_ismethods(method, expected, any_string_dtype):
    """Check the ``.str`` predicate named ``method`` against ``expected``.

    The result is also compared with calling the same-named ``str`` method on
    each element.
    """
    if any_string_dtype == "object":
        result_dtype = "bool"
    else:
        result_dtype = "boolean"

    ser = Series(
        ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", "  "],
        dtype=any_string_dtype,
    )
    wanted = Series(expected, dtype=result_dtype)

    # The warning is only anticipated for ``isspace`` on old pyarrow.
    expect_perf_warning = (
        any_string_dtype == "string[pyarrow]"
        and pa_version_under2p0
        and method == "isspace"
    )
    predicate = getattr(ser.str, method)
    with tm.maybe_produces_warning(PerformanceWarning, expect_perf_warning):
        outcome = predicate()
    tm.assert_series_equal(outcome, wanted)

    # compare with standard library
    stdlib_flags = [getattr(item, method)() for item in ser]
    assert list(outcome) == stdlib_flags