def first(*, array: pa.Array, group_splits: np.array, **kwargs) -> pa.Array:
    """Pick one value per group: the value at each group's start offset.

    Nulls are filtered out of ``array`` first, so the value picked for a
    group is the first non-null one. A group whose start and end offsets
    coincide (no non-null values) yields null.

    Args:
        array: Values to aggregate.
        group_splits: Group boundaries, passed through to
            ``nonnull_group_splits``.
        **kwargs: Accepted and ignored.

    Returns:
        A ``pa.Array`` with ``len(nonnull_group_splits(...)) + 1`` entries.
    """
    valid_only = array.filter(array.is_valid())
    # Assumes nonnull_group_splits() re-expresses the group boundaries as
    # offsets into the null-filtered values -- TODO confirm in its definition.
    boundaries = nonnull_group_splits(array, group_splits)

    group_begin = np.insert(boundaries, 0, 0)
    group_end = np.append(boundaries, len(valid_only))
    is_empty = np.equal(group_begin, group_end)

    # Masked positions become NULL indices; taking index NULL gives NULL.
    take_at = pa.array(group_begin, type=pa.int64(), mask=is_empty)
    return valid_only.take(take_at)
def nunique(*, array: pa.Array, group_splits: np.array, **kwargs) -> pa.Array:
    """Count the distinct non-null values in each group.

    Args:
        array: Values to aggregate; nulls are filtered out before counting.
        group_splits: Group boundaries, passed through to
            ``nonnull_group_splits``.
        **kwargs: Accepted and ignored.

    Returns:
        A ``pa.Array`` built from int64 counts, one per piece produced by
        splitting the non-null values at the non-null boundaries.
    """
    # Assumes nonnull_group_splits() returns split offsets valid for the
    # null-filtered values -- TODO confirm in its definition.
    boundaries = nonnull_group_splits(array, group_splits)
    valid_only = array.filter(array.is_valid())
    flat = valid_only.to_numpy(zero_copy_only=False)

    chunks = np.split(flat, boundaries)
    distinct_sizes = (np.unique(chunk).size for chunk in chunks)
    # Splitting at k offsets produces k + 1 pieces, hence the count.
    per_group = np.fromiter(
        distinct_sizes,
        dtype=np.int64,
        count=len(boundaries) + 1,
    )
    return pa.array(per_group)
def ufunc_caller(*, array: pa.Array, group_splits: np.array, **kwargs) -> pa.Array:
    """Run ``call_ufunc`` over the non-null values of each group.

    ``force_otype`` and ``call_ufunc`` are not parameters; they are resolved
    from the surrounding scope (presumably an enclosing factory -- confirm).

    Args:
        array: Values to aggregate; nulls are filtered out first.
        group_splits: Group boundaries, passed through to
            ``nonnull_group_splits``.
        **kwargs: Accepted and ignored.

    Returns:
        A ``pa.Array`` built from the first value ``call_ufunc`` returns,
        masked by the second.
    """
    boundaries = nonnull_group_splits(array, group_splits)
    flat = array.filter(array.is_valid()).to_numpy(zero_copy_only=False)

    # NOTE(review): this is a truthiness test, not ``is not None`` -- a
    # falsy-but-set force_otype falls back to the values' dtype. Confirm
    # that is intended.
    result_dtype = force_otype or flat.dtype

    # "Zero" value handed to call_ufunc: empty string for unicode input,
    # otherwise the dtype's default-constructed scalar.
    filler = "" if pa.types.is_unicode(array.type) else result_dtype.type()

    outcome = call_ufunc(flat, boundaries, result_dtype, filler)
    values, empty_mask = outcome
    return pa.array(values, mask=empty_mask)