def _str_upper(self):
    """Return a new instance of this object's class holding upper-cased data.

    ``self._data`` is run through ``pc.utf8_upper`` and the kernel output is
    handed to ``type(self)``, so the caller receives the same class it
    called the method on rather than the raw kernel result.
    """
    result_cls = type(self)
    upper_cased = pc.utf8_upper(self._data)
    return result_cls(upper_cased)
# NOTE(review): this line is a whitespace-collapsed excerpt. In this form it is
# not valid Python (`udf def ...`), and everything after the first '#' is read
# as comment text, so the registry.add(...) calls and the std helpers are
# inert. The original line breaks must be restored before this can run.
#
# Contents, in order:
#   * tail of a method (header not visible here) that stores `udf` in
#     self.scalar_udfs under the key udf.name
#   * __getitem__(name): returns self.scalar_udfs[name] if present, otherwise
#     self.agg_udfs[name] if present, otherwise raises
#     Exception("Could not find UDF named ...")
#   * registry pre-population: scalar UDFs "lower" / "upper" cast the column
#     to string() and apply compute.utf8_lower / compute.utf8_upper; aggregate
#     UDFs "count" / "avg" / "sum" apply compute.count / mean / sum and cast
#     the result to float64()
#   * start of the online-std helpers: std_init returns [0, 0., 0];
#     std_update increments s[0] and moves s[1] by
#     (v - s[1]) / s[0]. The remainder of std_update is cut off in this view.
self.scalar_udfs[udf.name] = udf def __getitem__(self, name): if name in self.scalar_udfs: return self.scalar_udfs[name] if name in self.agg_udfs: return self.agg_udfs[name] raise Exception("Could not find UDF named %s" % name) # # Prepopulate registry with simple functions # registry = UDFRegistry.registry() registry.add(ScalarUDF("lower", 1, lambda col: compute.utf8_lower(col.cast(string())))) registry.add(ScalarUDF("upper", 1, lambda col: compute.utf8_upper(col.cast(string())))) # # Prepopulate with incremental aggregation functions # registry.add(AggUDF("count", 1, lambda col: compute.count(col).cast(float64()))) registry.add(AggUDF("avg", 1, lambda col: compute.mean(col).cast(float64()))) registry.add(AggUDF("sum", 1, lambda col: compute.sum(col).cast(float64()))) # Welford's algorithm for online std std_init = lambda: [0, 0., 0] def std_update(s, v): s[0] += 1 d = v - s[1] s[1] += d / s[0]
# NOTE(review): whitespace-collapsed excerpt; it starts in the middle of the
# `_str_contains` parameter list and stops inside `_str_endswith`, so it does
# not parse as it stands. The original line breaks must be restored.
#
# Visible remainder of _str_contains:
#   * truthy `flags`               -> defer to super()._str_contains(...)
#   * regex=True                   -> defer to super() when pa_version_under4p0
#                                     is set or `case is False`; otherwise use
#                                     pc.match_substring_regex(self._data, pat)
#   * regex=False, case truthy     -> pc.match_substring(self._data, pat)
#   * regex=False, case falsy      -> upper-case both the data (pc.utf8_upper)
#                                     and the pattern (pat.upper()) before
#                                     pc.match_substring
#   The kernel result is converted with BooleanDtype().__from_arrow__; when
#   `na` is not NA, entries where isna(result) holds are set to bool(na).
#   NOTE(review): upper-casing both sides is presumably meant as a
#   case-insensitive compare - confirm it is adequate for non-ASCII text.
#
# _str_startswith: defers to super() when pa_version_under4p0 is set; otherwise
# anchors the literal prefix as "^" + re.escape(pat) and delegates to
# self._str_contains(..., regex=True), forwarding `na`.
#
# _str_endswith: only the header and the version check are visible here.
flags=0, na=np.nan, regex: bool = True): if flags: return super()._str_contains(pat, case, flags, na, regex) if regex: if pa_version_under4p0 or case is False: return super()._str_contains(pat, case, flags, na, regex) else: result = pc.match_substring_regex(self._data, pat) else: if case: result = pc.match_substring(self._data, pat) else: result = pc.match_substring(pc.utf8_upper(self._data), pat.upper()) result = BooleanDtype().__from_arrow__(result) if not isna(na): result[isna(result)] = bool(na) return result def _str_startswith(self, pat: str, na=None): if pa_version_under4p0: return super()._str_startswith(pat, na) pat = "^" + re.escape(pat) return self._str_contains(pat, na=na, regex=True) def _str_endswith(self, pat: str, na=None): if pa_version_under4p0:
def _expr_kernel(self, arguments: Any, table: ArrowTable) -> Any:
    """Apply ``pc.utf8_upper`` to the unpacked ``arguments``.

    Args:
        arguments: Iterable that is star-unpacked into the kernel call.
        table: Accepted as part of the signature; not read by this method.

    Returns:
        Whatever ``pc.utf8_upper`` returns for the given arguments.
    """
    upper_kernel = pc.utf8_upper
    return upper_kernel(*arguments)