def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
    """
    Return a scalar result of performing the reduction operation.

    The reduction is dispatched to the ``pyarrow.compute`` function of the
    same name (after translating a few pandas names to their pyarrow
    equivalents). ``sem`` has no direct pyarrow kernel and is computed as
    ``stddev / sqrt(count)``.

    Parameters
    ----------
    name : str
        Name of the function, supported values are:
        { any, all, min, max, sum, mean, median, prod,
        std, var, sem, kurt, skew }.
    skipna : bool, default True
        If True, skip NaN values.
    **kwargs
        Additional keyword arguments passed to the reduction function.
        Currently, `ddof` is the only supported kwarg.

    Returns
    -------
    scalar
        The Python value of the reduction, or ``self.dtype.na_value`` when
        the pyarrow result is null.

    Raises
    ------
    TypeError : subclass does not define reductions, or the pyarrow
        function rejected the call (AttributeError, NotImplementedError
        and TypeError from pyarrow are re-raised as TypeError).
    """
    if name == "sem":

        # The parameter must be called ``skip_nulls`` because the shared
        # call site below passes it by that keyword.
        def pyarrow_meth(data, skip_nulls, **kwargs):
            # Standard error of the mean: the (ddof-adjusted) standard
            # deviation divided by sqrt(n), where n is the number of
            # valid (non-null) observations. ``pc.count`` counts only
            # valid values by default.
            numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
            denominator = pc.sqrt_checked(pc.count(data))
            return pc.divide_checked(numerator, denominator)

    else:
        # Translate pandas reduction names to pyarrow.compute names;
        # anything not listed is looked up under its own name.
        pyarrow_name = {
            "median": "approximate_median",
            "prod": "product",
            "std": "stddev",
            "var": "variance",
        }.get(name, name)
        # error: Incompatible types in assignment
        # (expression has type "Optional[Any]", variable has type
        # "Callable[[Any, Any, KwArg(Any)], Any]")
        pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
        if pyarrow_meth is None:
            # Let ExtensionArray._reduce raise the TypeError
            return super()._reduce(name, skipna=skipna, **kwargs)

    try:
        result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
    except (AttributeError, NotImplementedError, TypeError) as err:
        msg = (
            f"'{type(self).__name__}' with dtype {self.dtype} "
            f"does not support reduction '{name}' with pyarrow "
            f"version {pa.__version__}. '{name}' may be supported by "
            f"upgrading pyarrow."
        )
        raise TypeError(msg) from err
    # A null pyarrow scalar maps to the dtype's missing-value sentinel.
    if pc.is_null(result).as_py():
        return self.dtype.na_value
    return result.as_py()
def test_compare_string_scalar(typ):
    """Compare a string array (or chunked array) against a string scalar.

    ``typ`` selects the container under test: ``"array"`` builds a plain
    ``pa.array``; any other value builds a single-chunk ``pa.chunked_array``.
    Every comparison kernel is expected to propagate the null element.
    """
    is_plain_array = typ == "array"

    def con(values):
        # Build the container kind selected by ``typ``.
        if is_plain_array:
            return pa.array(values)
        return pa.chunked_array([values])

    strings = con(['a', 'b', 'c', None])
    pivot = pa.scalar('b')

    assert pc.equal(strings, pivot).equals(con([False, True, False, None]))

    if is_plain_array:
        # Comparing against a null scalar yields null in every slot.
        null_pivot = pa.scalar(None, type="string")
        all_null = pc.is_null(pc.equal(strings, null_pivot))
        assert all_null.equals(con([True, True, True, True]))

    # Remaining kernels, checked in the same order as before; the kernel is
    # looked up lazily so each lookup happens right before its assertion.
    ordered_cases = (
        ("not_equal", [True, False, True, None]),
        ("less", [True, False, False, None]),
        ("less_equal", [True, True, False, None]),
        ("greater", [False, False, True, None]),
        ("greater_equal", [False, True, True, None]),
    )
    for kernel_name, expected in ordered_cases:
        outcome = getattr(pc, kernel_name)(strings, pivot)
        assert outcome.equals(con(expected))
def test_is_null():
    """Check ``is_null`` / ``is_valid`` on arrays and chunked arrays.

    For plain arrays the method results are also compared against the
    equivalent ``pyarrow.compute`` kernels.
    """
    arr = pa.array([1, 2, 3, None])

    result = arr.is_null()
    expected = pa.array([False, False, False, True])
    assert result.equals(expected)
    assert result.equals(pc.is_null(arr))

    result = arr.is_valid()
    expected = pa.array([True, True, True, False])
    assert result.equals(expected)
    assert result.equals(pc.is_valid(arr))

    # Same checks on a chunked array; the chunk layout is preserved.
    arr = pa.chunked_array([[1, 2], [3, None]])

    result = arr.is_null()
    expected = pa.chunked_array([[False, False], [False, True]])
    assert result.equals(expected)

    result = arr.is_valid()
    expected = pa.chunked_array([[True, True], [True, False]])
    assert result.equals(expected)
), ) # remove unused columns table = table.select(["ix", "x", "y", "title", "first_author_name", "date", "language"]) # truncate the title after 101 characters (matching display logic) truncated_title = pc.utf8_replace_slice(table.column("title"), start=101, stop=1000, replacement="") table = table.set_column(table.schema.get_field_index("title"), "title", truncated_title) # ensure all dictionaries in the file use the same key/value mappings table = table.unify_dictionaries() # filter out non-numeric dates (e.g. null, "1850-1853") # matches the hack in index.js:37 mask = pc.invert(pc.is_null(table.column("date"))) table = table.filter(mask) # sorting by the date improves the loading aesthetics # comment this out to exactly match the original appearance indices = pc.sort_indices(table, sort_keys=[("date", "ascending")]) table = pc.take(table, indices) # after sorting replace ix with an accurate row index indices = pc.sort_indices(table, sort_keys=[("date", "ascending")]) table = table.set_column(table.schema.get_field_index("ix"), "ix", pc.cast(indices, pa.uint32())) temp_path.unlink() local = fs.LocalFileSystem()