def _str_len(self):
    """
    Compute per-element string lengths.

    Returns
    -------
    The result of ``pc.utf8_length`` on the backing data, converted through
    ``Int64Dtype().__from_arrow__``. When ``pa_version_under4p0`` is set, a
    fallback performance warning is issued and the parent class
    implementation is used instead.
    """
    if not pa_version_under4p0:
        lengths = pc.utf8_length(self._data)
        return Int64Dtype().__from_arrow__(lengths)

    # The native kernel is only used from pyarrow 4 onwards.
    fallback_performancewarning(version="4")
    return super()._str_len()
def argsort(
    self,
    ascending: bool = True,
    kind: str = "quicksort",
    na_position: str = "last",
    *args,
    **kwargs,
) -> np.ndarray:
    """
    Return the indices that would sort this array.

    Parameters
    ----------
    ascending : bool, default True
        Sort in ascending order when True, descending otherwise.
    kind : str, default "quicksort"
        Only forwarded to the parent implementation on the fallback path;
        the pyarrow path does not use it.
    na_position : str, default "last"
        ``"last"`` maps to pyarrow's ``"at_end"`` null placement and
        ``"first"`` to ``"at_start"``. Any other value takes the fallback
        path.
    *args, **kwargs
        Accepted for signature compatibility; not used by this method.

    Returns
    -------
    np.ndarray
        Sort indices with dtype ``np.intp``.
    """
    order = "ascending" if ascending else "descending"
    null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
    if null_placement is None or pa_version_under7p0:
        # Although pc.array_sort_indices exists in version 6
        # there's a bug that affects the pa.ChunkedArray backing
        # https://issues.apache.org/jira/browse/ARROW-12042
        fallback_performancewarning("7")
        return super().argsort(ascending=ascending, kind=kind, na_position=na_position)

    result = pc.array_sort_indices(
        self._data, order=order, null_placement=null_placement
    )
    # Every pyarrow older than 7 has already returned through the fallback
    # above, so the former ``pa_version_under2p0`` conversion branch
    # (``result.to_pandas().values``) could never run here and was removed.
    return result.to_numpy().astype(np.intp, copy=False)
def _str_isspace(self):
    """
    Evaluate ``pc.utf8_is_space`` on every element.

    Returns
    -------
    The kernel result converted through ``BooleanDtype().__from_arrow__``.
    When ``pa_version_under2p0`` is set, a fallback performance warning is
    issued and the parent class implementation is used instead.
    """
    if not pa_version_under2p0:
        is_space = pc.utf8_is_space(self._data)
        return BooleanDtype().__from_arrow__(is_space)

    # The native kernel is only used from pyarrow 2 onwards.
    fallback_performancewarning(version="2")
    return super()._str_isspace()
def _str_endswith(self, pat: str, na=None):
    """
    Test whether each element ends with the literal string ``pat``.

    Parameters
    ----------
    pat : str
        Literal suffix. It is regex-escaped and anchored with ``$`` before
        being handed to ``_str_contains`` in regex mode.
    na : object, default None
        Forwarded unchanged to ``_str_contains`` (or to the parent
        implementation on the fallback path).

    Returns
    -------
    Whatever ``_str_contains`` returns, or the parent class result when
    ``pa_version_under4p0`` is set.
    """
    if not pa_version_under4p0:
        # Escape first so regex metacharacters in ``pat`` match literally,
        # then anchor the pattern to the end of the string.
        anchored_pattern = re.escape(pat) + "$"
        return self._str_contains(anchored_pattern, na=na, regex=True)

    fallback_performancewarning(version="4")
    return super()._str_endswith(pat, na)
def isin(self, values):
    """
    Check element-wise membership of this array in ``values``.

    Parameters
    ----------
    values : iterable
        Candidate values. Each is converted with
        ``pa.scalar(value, from_pandas=True)``; only scalars whose type is
        string or null are kept for the lookup.

    Returns
    -------
    np.ndarray
        Boolean array with one entry per element of ``self``.
    """
    if pa_version_under2p0:
        fallback_performancewarning(version="2")
        return super().isin(values)

    accepted_types = (pa.string(), pa.null())
    value_set = []
    for value in values:
        pa_scalar = pa.scalar(value, from_pandas=True)
        if pa_scalar.type in accepted_types:
            value_set.append(pa_scalar.as_py())

    # With an empty value_set, pyarrow 3.0.0 segfaults and pyarrow 2.0.0
    # returns True for null values, so short-circuit to an all-False array.
    if not value_set:
        return np.zeros(len(self), dtype=bool)

    # In pyarrow 2.0.0 skip_null is ignored but is a required keyword; in
    # pyarrow 3.0.0+ passing it raises "unexpected keyword argument".
    extra_kwargs = {"skip_null": True} if pa_version_under3p0 else {}

    result = pc.is_in(self._data, value_set=pa.array(value_set), **extra_kwargs)
    # pyarrow 2.0.0 returned nulls, so the dtype is given explicitly to
    # convert those nulls to False.
    return np.array(result, dtype=np.bool_)
def _str_rstrip(self, to_strip=None):
    """
    Strip trailing characters from each element.

    Parameters
    ----------
    to_strip : optional, default None
        When None, ``pc.utf8_rtrim_whitespace`` is applied. Otherwise the
        value is passed as ``characters`` to ``pc.utf8_rtrim``.

    Returns
    -------
    A new array of the same type as ``self`` wrapping the kernel result, or
    the parent class result when ``pa_version_under4p0`` is set.
    """
    if pa_version_under4p0:
        fallback_performancewarning(version="4")
        return super()._str_rstrip(to_strip)

    trimmed = (
        pc.utf8_rtrim_whitespace(self._data)
        if to_strip is None
        else pc.utf8_rtrim(self._data, characters=to_strip)
    )
    return type(self)(trimmed)
def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
    """
    Compute the ArrowExtensionArray of unique values.

    Uses ``pc.unique`` on the backing data. When ``pa_version_under2p0`` is
    set, a fallback performance warning is issued and the parent class
    implementation is used instead.

    Returns
    -------
    ArrowExtensionArray
    """
    if not pa_version_under2p0:
        distinct = pc.unique(self._data)
        return type(self)(distinct)

    fallback_performancewarning(version="2")
    return super().unique()
def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
    """
    Return ArrowExtensionArray without NA values.

    Uses ``pc.drop_null`` on the backing data. When ``pa_version_under6p0``
    is set, a fallback performance warning is issued and the parent class
    implementation is used instead.

    Returns
    -------
    ArrowExtensionArray
    """
    if not pa_version_under6p0:
        without_nulls = pc.drop_null(self._data)
        return type(self)(without_nulls)

    fallback_performancewarning(version="6")
    return super().dropna()
return type(self)(result) else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. .encode returns bytes # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): if flags: fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) if regex: if pa_version_under4p0 or case is False: fallback_performancewarning(version="4") return super()._str_contains(pat, case, flags, na, regex) else: result = pc.match_substring_regex(self._data, pat) else: if case: result = pc.match_substring(self._data, pat) else: result = pc.match_substring(pc.utf8_upper(self._data), pat.upper()) result = BooleanDtype().__from_arrow__(result)