def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
    """Convert a pyarrow (Chunked)Array to a numpy array.

    Fixed-shape extension arrays are converted chunk-by-chunk so the storage
    dtype is preserved; rows with ragged shapes, object dtype, or NaN
    placeholders fall back to a single object-dtype array.

    Fix: ``np.object`` (deprecated NumPy 1.20, removed 1.24) replaced with the
    builtin ``object`` — same dtype, works on all NumPy versions.
    """
    if isinstance(pa_array, pa.ChunkedArray):
        if isinstance(pa_array.type, _ArrayXDExtensionType):
            # don't call to_pylist() to preserve dtype of the fixed-size array
            zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)
            if pa_array.type.shape[0] is None:
                # first dimension is dynamic: one numpy array per row
                array: List = [
                    row
                    for chunk in pa_array.chunks
                    for row in chunk.to_list_of_numpy(zero_copy_only=zero_copy_only)
                ]
            else:
                array: List = [
                    row
                    for chunk in pa_array.chunks
                    for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
                ]
        else:
            # zero-copy is only possible when no chunk contains nulls
            zero_copy_only = _is_zero_copy_only(pa_array.type) and all(
                not _is_array_with_nulls(chunk) for chunk in pa_array.chunks
            )
            array: List = [
                row
                for chunk in pa_array.chunks
                for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
            ]
    else:
        if isinstance(pa_array.type, _ArrayXDExtensionType):
            # don't call to_pylist() to preserve dtype of the fixed-size array
            zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)
            if pa_array.type.shape[0] is None:
                array: List = pa_array.to_list_of_numpy(zero_copy_only=zero_copy_only)
            else:
                array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only)
        else:
            zero_copy_only = _is_zero_copy_only(pa_array.type) and not _is_array_with_nulls(pa_array)
            array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
    if len(array) > 0:
        if any(
            (isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape))
            or (isinstance(x, float) and np.isnan(x))
            for x in array
        ):
            # ragged shapes / NaN placeholders: force an object-dtype array
            return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": object})
    return np.array(array, copy=False, **self.np_array_kwargs)
def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
    """Convert a pyarrow (Chunked)Array to a numpy array.

    Extension (ArrayXD) arrays go through ``to_pylist`` to support dynamic
    dimensions; rows with mismatched shapes or object dtype fall back to a
    single object-dtype array.

    Fix: ``np.object`` (deprecated NumPy 1.20, removed 1.24) replaced with the
    builtin ``object`` — same dtype, works on all NumPy versions.
    """
    zero_copy_only = _is_zero_copy_only(pa_array.type)
    if isinstance(pa_array, pa.ChunkedArray):
        # don't call to_numpy() directly or we end up with a np.array with dtype object
        # call to_numpy on the chunks instead
        # for ArrayExtensionArray call py_list directly to support dynamic dimensions
        if isinstance(pa_array.type, _ArrayXDExtensionType):
            array: List = [row for chunk in pa_array.chunks for row in chunk.to_pylist()]
        else:
            array: List = [
                row
                for chunk in pa_array.chunks
                for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
            ]
    else:
        # cast to list of arrays or we end up with a np.array with dtype object
        # for ArrayExtensionArray call py_list directly to support dynamic dimensions
        if isinstance(pa_array.type, _ArrayXDExtensionType):
            array: List = pa_array.to_pylist()
        else:
            array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
    if len(array) > 0:
        if any(
            isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape)
            for x in array
        ):
            # rows with mismatched shapes can only form an object-dtype array
            return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": object})
    return np.array(array, copy=False, **self.np_array_kwargs)
def make_groupable_array(
        array: pa.Array, date_granularity: Optional[DateGranularity]) -> pa.Array:
    """Given an input array, return the array we will group by.

    This is for handling DEPRECATED date conversions. The idea is: with input
    value "2021-03-01T21:12:21.231212312Z", a "year" group should be
    "2021-01-01Z".
    """
    if date_granularity is None:
        return array

    def round_via_integer_math(unit: str, adjust) -> pa.Array:
        # Convert to integer counts of `unit`, apply the integer adjustment,
        # then convert back to nanosecond datetimes.
        as_ns = array.to_numpy(zero_copy_only=False)
        counts = as_ns.astype(f"datetime64[{unit}]").astype(int)
        rounded = adjust(counts).astype(f"datetime64[{unit}]").astype("datetime64[ns]")
        # the integer round-trip turned nulls into ... not-null. Restore them.
        rounded[np.isnan(as_ns)] = "NaT"
        return pa.array(rounded)

    if date_granularity == DateGranularity.QUARTER:
        # round the month number down to a multiple of 3
        return round_via_integer_math("M", lambda m: np.floor_divide(m, 3) * 3)

    if date_granularity == DateGranularity.WEEK:
        # numpy "week" is counted from the Epoch -- which happens to be a
        # Thursday. But ISO weeks start Monday, not Thursday -- and so Numpy's
        # "W" type is useless.
        #
        # We do integer math: add 3 to each date and then floor-divide by 7.
        # That makes "1970-01-01 [Thursday] + 3" => Sunday -- so when we
        # floor-divide, everything from Monday to Sunday falls in the same
        # bucket. We could group by this ... but we convert back to day and
        # subtract the 3, so the group can be formatted.
        return round_via_integer_math("D", lambda d: np.floor_divide(d + 3, 7) * 7 - 3)

    # Every other granularity maps directly onto a numpy datetime unit:
    # truncation is a plain dtype round-trip.
    unit = date_granularity.numpy_unit
    truncated = (
        array.to_numpy(zero_copy_only=False)
        .astype(f"datetime64[{unit}]")
        .astype("datetime64[ns]")
    )
    return pa.array(truncated)
def _replace_with_indices(
    cls,
    chunk: pa.Array,
    indices: npt.NDArray[np.intp],
    value: npt.NDArray[Any],
) -> pa.Array:
    """
    Replace items selected with a set of positional indices.

    Analogous to pyarrow.compute.replace_with_mask, except that replacement
    positions are identified via indices rather than a mask.

    Parameters
    ----------
    chunk : pa.Array
    indices : npt.NDArray[np.intp]
    value : npt.NDArray[Any]
        Replacement value(s).

    Returns
    -------
    pa.Array
    """
    count = len(indices)
    if count == 0:
        return chunk

    first, last = indices[[0, -1]]
    if last - first == count - 1:
        # fast path: the indices form one contiguous run, so stitch together
        # head + replacement + tail instead of building a boolean mask
        pieces = [
            chunk[:first],
            pa.array(value, type=chunk.type, from_pandas=True),
            chunk[last + 1:],
        ]
        pieces = [piece for piece in pieces if len(piece)]
        if len(pieces) == 1:
            return pieces[0]
        return pa.concat_arrays(pieces)

    mask = np.zeros(len(chunk), dtype=np.bool_)
    mask[indices] = True

    if pa_version_under5p0:
        # pc.replace_with_mask is unavailable: round-trip through numpy
        values = chunk.to_numpy(zero_copy_only=False)
        values[mask] = value
        return pa.array(values, type=chunk.type)

    if isna(value).all():
        # all replacements are NA: express the replacement as a null fill
        return pc.if_else(mask, None, chunk)

    return pc.replace_with_mask(chunk, mask, value)
def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
    """Convert a pyarrow (Chunked)Array to a numpy array.

    Chunked arrays are converted chunk-by-chunk (calling ``to_numpy`` on the
    ChunkedArray directly would yield an object-dtype array).

    Fix: when rows have differing shapes (or are already object dtype),
    ``np.array(array, copy=False, ...)`` cannot build a regular ndarray —
    explicitly fall back to an object-dtype array instead of failing, matching
    the sibling ``_arrow_array_to_numpy`` variants in this file.
    """
    zero_copy_only = _is_zero_copy_only(pa_array.type)
    if isinstance(pa_array, pa.ChunkedArray):
        # don't call to_numpy() directly or we end up with a np.array with dtype object
        # call to_numpy on the chunks instead
        array: List[np.ndarray] = [
            row
            for chunk in pa_array.chunks
            for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
        ]
    else:
        # cast to list of arrays or we end up with a np.array with dtype object
        array: List[np.ndarray] = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
    if array and any(
        isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape)
        for x in array
    ):
        # ragged rows can only form an object-dtype array
        return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": object})
    return np.array(array, copy=False, **self.np_array_kwargs)