def test_match_substring():
    """Exercise ``pc.match_substring`` with and without ``ignore_case``.

    Covers a plain ASCII pattern first, then an accented pattern matched
    both case-insensitively and case-sensitively.  In every case the null
    input slot is expected to come back as null.
    """
    ascii_values = pa.array(["ab", "abc", "ba", None])
    matched = pc.match_substring(ascii_values, "ab")
    assert pa.array([True, True, False, None]).equals(matched)

    # Same accented input for both runs; only ``ignore_case`` differs.
    accented_values = pa.array(["áB", "Ábc", "ba", None])
    scenarios = (
        (True, [True, True, False, None]),
        (False, [False, False, False, None]),
    )
    for fold_case, outcome in scenarios:
        matched = pc.match_substring(accented_values, "áb", ignore_case=fold_case)
        assert pa.array(outcome).equals(matched)
result = lib.map_infer_mask(arr, f, mask.view("uint8"), convert=False, na_value=na_value) return self._from_sequence(result) else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. .encode returns bytes # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True): if not regex and case: result = pc.match_substring(self._data, pat) result = BooleanDtype().__from_arrow__(result) if not isna(na): result[isna(result)] = bool(na) return result else: return super()._str_contains(pat, case, flags, na, regex) def _str_startswith(self, pat, na=None): if hasattr(pc, "match_substring_regex"): result = pc.match_substring_regex(self._data, "^" + re.escape(pat)) result = BooleanDtype().__from_arrow__(result) if not isna(na): result[isna(result)] = bool(na) return result else:
pat, case=True, flags=0, na=np.nan, regex: bool = True): if flags: return super()._str_contains(pat, case, flags, na, regex) if regex: if pa_version_under4p0 or case is False: return super()._str_contains(pat, case, flags, na, regex) else: result = pc.match_substring_regex(self._data, pat) else: if case: result = pc.match_substring(self._data, pat) else: result = pc.match_substring(pc.utf8_upper(self._data), pat.upper()) result = BooleanDtype().__from_arrow__(result) if not isna(na): result[isna(result)] = bool(na) return result def _str_startswith(self, pat: str, na=None): if pa_version_under4p0: return super()._str_startswith(pat, na) pat = "^" + re.escape(pat) return self._str_contains(pat, na=na, regex=True)
def test_match_substring():
    """Check that ``pc.match_substring`` flags elements containing ``"ab"``.

    The null input slot is expected to come back as null.
    """
    haystack = pa.array(["ab", "abc", "ba", None])
    found = pc.match_substring(haystack, "ab")
    wanted = pa.array([True, True, False, None])
    assert wanted.equals(found)
# Only available in pandas 1.2+ from pandas.core.strings.object_array import ObjectStringArrayMixin class _IntermediateExtensionArray(ExtensionArray, ObjectStringArrayMixin): pass except ImportError: class _IntermediateExtensionArray(ExtensionArray): # type: ignore pass


class StringSupportingExtensionArray(_IntermediateExtensionArray):
    """Extension array whose string operations can use Arrow compute kernels.

    ``_str_contains`` uses ``pc.match_substring`` when it applies; the other
    string hooks defined here defer to the parent class.
    """

    def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
        """Test whether each element contains ``pat``.

        The Arrow kernel is used only for a literal, case-sensitive search
        and only when ``pc`` exposes ``match_substring``; otherwise the
        parent implementation handles the call.
        """
        # NOTE(review): the Arrow branch does not read ``flags`` or ``na``;
        # confirm that ignoring them there is intended.
        use_arrow = not regex and case and hasattr(pc, "match_substring")
        if not use_arrow:
            return super()._str_contains(pat, case, flags, na, regex)
        return type(self)(pc.match_substring(self.data, pat), dtype=pa.bool_())

    def _str_map(self, *args, **kwargs):
        """Run the parent ``_str_map`` and wrap its result in this array type."""
        mapped = super()._str_map(*args, **kwargs)
        return type(self)(mapped)

    def _str_startswith(self, pat, na=None):
        """Delegate ``startswith`` to the parent implementation."""
        # TODO: This is currently not implemented in Arrow but only directly
        # in the fr_strx accessor.
        return super()._str_startswith(pat, na)

    def _str_endswith(self, pat, na=None):
        """Delegate ``endswith`` to the parent implementation."""
        # TODO: This is currently not implemented in Arrow but only directly
        # in the fr_strx accessor.
        return super()._str_endswith(pat, na)