def test_match_substring():
    """Check ``pc.match_substring`` with and without ``ignore_case``."""
    # Plain ASCII input with the default matching; null entries stay null.
    ascii_values = pa.array(["ab", "abc", "ba", None])
    matched = pc.match_substring(ascii_values, "ab")
    assert pa.array([True, True, False, None]).equals(matched)

    # Accented, mixed-case input exercised under both ignore_case settings.
    accented_values = pa.array(["áB", "Ábc", "ba", None])
    outcomes = (
        (True, [True, True, False, None]),
        (False, [False, False, False, None]),
    )
    for ignore_case, wanted in outcomes:
        matched = pc.match_substring(
            accented_values, "áb", ignore_case=ignore_case
        )
        assert pa.array(wanted).equals(matched)
Exemple #2
0
            result = lib.map_infer_mask(arr,
                                        f,
                                        mask.view("uint8"),
                                        convert=False,
                                        na_value=na_value)
            return self._from_sequence(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))

    def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
        """Test whether each element contains ``pat``.

        A literal, case-sensitive search (``regex`` falsy and ``case``
        truthy) is evaluated with ``pc.match_substring`` on ``self._data``;
        every other combination is forwarded to the parent implementation.

        On the literal path, missing entries of the result are replaced by
        ``bool(na)`` unless ``na`` itself is missing.
        """
        # Anything that is a regex or case-insensitive goes to the parent.
        if regex or not case:
            return super()._str_contains(pat, case, flags, na, regex)

        matches = BooleanDtype().__from_arrow__(
            pc.match_substring(self._data, pat)
        )
        if not isna(na):
            matches[isna(matches)] = bool(na)
        return matches

    def _str_startswith(self, pat, na=None):
        if hasattr(pc, "match_substring_regex"):
            result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
            result = BooleanDtype().__from_arrow__(result)
            if not isna(na):
                result[isna(result)] = bool(na)
            return result
        else:
Exemple #3
0
                      pat,
                      case=True,
                      flags=0,
                      na=np.nan,
                      regex: bool = True):
        if flags:
            return super()._str_contains(pat, case, flags, na, regex)

        if regex:
            if pa_version_under4p0 or case is False:
                return super()._str_contains(pat, case, flags, na, regex)
            else:
                result = pc.match_substring_regex(self._data, pat)
        else:
            if case:
                result = pc.match_substring(self._data, pat)
            else:
                result = pc.match_substring(pc.utf8_upper(self._data),
                                            pat.upper())
        result = BooleanDtype().__from_arrow__(result)
        if not isna(na):
            result[isna(result)] = bool(na)
        return result

    def _str_startswith(self, pat: str, na=None):
        """Test whether each element starts with the literal ``pat``.

        When ``pa_version_under4p0`` is set, the parent implementation is
        used. Otherwise ``pat`` is regex-escaped, anchored with ``^`` and
        handed to ``self._str_contains`` with ``regex=True``.
        """
        if not pa_version_under4p0:
            # Escape first so the literal cannot be read as regex syntax.
            anchored = "^" + re.escape(pat)
            return self._str_contains(anchored, na=na, regex=True)
        return super()._str_startswith(pat, na)
Exemple #4
0
def test_match_substring():
    """Check ``pc.match_substring`` on a small array containing a null."""
    values = pa.array(["ab", "abc", "ba", None])
    wanted = pa.array([True, True, False, None])
    # The null input entry is expected to stay null in the result.
    assert wanted.equals(pc.match_substring(values, "ab"))
Exemple #5
0
    # Only available in pandas 1.2+
    from pandas.core.strings.object_array import ObjectStringArrayMixin

    # Variant of the intermediate base class that also inherits from
    # ObjectStringArrayMixin; defined when the import above succeeds.
    class _IntermediateExtensionArray(ExtensionArray, ObjectStringArrayMixin):
        pass

except ImportError:

    # Fallback variant without the string mixin; defined in the
    # ``except ImportError`` branch, i.e. when the mixin cannot be imported.
    class _IntermediateExtensionArray(ExtensionArray):  # type: ignore
        pass


class StringSupportingExtensionArray(_IntermediateExtensionArray):
    """Extension array whose string hooks prefer Arrow compute kernels.

    Hooks without an Arrow-backed path here simply defer to the parent class.
    """

    def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
        """Test whether each element contains ``pat``.

        A literal, case-sensitive search uses ``pc.match_substring`` when
        that kernel exists; everything else is forwarded to the parent.
        Note that ``na`` is not applied on the Arrow path.
        """
        arrow_fast_path = not regex and case and hasattr(pc, "match_substring")
        if not arrow_fast_path:
            return super()._str_contains(pat, case, flags, na, regex)

        matched = pc.match_substring(self.data, pat)
        return type(self)(matched, dtype=pa.bool_())

    def _str_map(self, *args, **kwargs):
        """Run the parent ``_str_map`` and wrap its result in this type."""
        mapped = super()._str_map(*args, **kwargs)
        return type(self)(mapped)

    def _str_startswith(self, pat, na=None):
        """Defer to the parent implementation of ``_str_startswith``."""
        # TODO: Arrow has no implementation for this yet; one exists only
        # directly in the fr_strx accessor.
        return super()._str_startswith(pat, na)

    def _str_endswith(self, pat, na=None):
        """Defer to the parent implementation of ``_str_endswith``."""
        # TODO: Arrow has no implementation for this yet; one exists only
        # directly in the fr_strx accessor.
        return super()._str_endswith(pat, na)
Exemple #6
0
    # Only available in pandas 1.2+
    from pandas.core.strings.object_array import ObjectStringArrayMixin

    # Variant of the intermediate base class that also inherits from
    # ObjectStringArrayMixin; defined when the import above succeeds.
    class _IntermediateExtensionArray(ExtensionArray, ObjectStringArrayMixin):
        pass


except ImportError:

    # Fallback variant without the string mixin; defined in the
    # ``except ImportError`` branch, i.e. when the mixin cannot be imported.
    class _IntermediateExtensionArray(ExtensionArray):  # type: ignore
        pass


class StringSupportingExtensionArray(_IntermediateExtensionArray):
    """Extension array that routes some string hooks through Arrow compute.

    Any hook lacking an Arrow-backed path falls through to the parent class.
    """

    def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
        """Test whether each element contains ``pat``.

        The Arrow kernel ``pc.match_substring`` handles literal,
        case-sensitive searches when it is available; all other requests go
        to the parent implementation. ``na`` is not used on the Arrow path.
        """
        # Regex or case-insensitive searches are the parent's job.
        if regex or not case:
            return super()._str_contains(pat, case, flags, na, regex)
        # The kernel may be absent from the installed ``pc`` module.
        if not hasattr(pc, "match_substring"):
            return super()._str_contains(pat, case, flags, na, regex)

        hits = pc.match_substring(self.data, pat)
        return type(self)(hits, dtype=pa.bool_())

    def _str_map(self, *args, **kwargs):
        """Apply the parent ``_str_map`` and re-wrap the result in this type."""
        parent_result = super()._str_map(*args, **kwargs)
        return type(self)(parent_result)

    def _str_startswith(self, pat, na=None):
        """Forward to the parent ``_str_startswith``."""
        # TODO: Not implemented in Arrow so far, only directly in the
        # fr_strx accessor.
        return super()._str_startswith(pat, na)

    def _str_endswith(self, pat, na=None):
        """Forward to the parent ``_str_endswith``."""
        # TODO: Not implemented in Arrow so far, only directly in the
        # fr_strx accessor.
        return super()._str_endswith(pat, na)