Example #1
0
 def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
     """Convert a pyarrow array (plain or chunked) into a NumPy array.

     The rows of ``pa_array`` are gathered first and then passed to
     ``np.array`` together with ``self.np_array_kwargs``. If any row is an
     object-dtype array, an array whose shape differs from the first row's,
     or a float NaN, the result is built with ``dtype=object`` instead
     (overriding any ``dtype`` found in ``self.np_array_kwargs``).

     Args:
         pa_array: the ``pa.Array`` or ``pa.ChunkedArray`` to convert.

     Returns:
         The NumPy array built from the rows of ``pa_array``.
     """
     if isinstance(pa_array, pa.ChunkedArray):
         if isinstance(pa_array.type, _ArrayXDExtensionType):
             # don't call to_pylist() to preserve dtype of the fixed-size array
             zero_copy_only = _is_zero_copy_only(
                 pa_array.type.storage_dtype, unnest=True)
             if pa_array.type.shape[0] is None:
                 # no fixed first dimension: convert each chunk to a list of
                 # per-row arrays and flatten across chunks
                 array = [
                     row for chunk in pa_array.chunks
                     for row in chunk.to_list_of_numpy(
                         zero_copy_only=zero_copy_only)
                 ]
             else:
                 array = [
                     row for chunk in pa_array.chunks
                     for row in chunk.to_numpy(
                         zero_copy_only=zero_copy_only)
                 ]
         else:
             # zero-copy is requested only if the type allows it AND no chunk
             # is reported as containing nulls
             zero_copy_only = _is_zero_copy_only(pa_array.type) and all(
                 not _is_array_with_nulls(chunk)
                 for chunk in pa_array.chunks)
             array = [
                 row for chunk in pa_array.chunks
                 for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
             ]
     else:
         if isinstance(pa_array.type, _ArrayXDExtensionType):
             # don't call to_pylist() to preserve dtype of the fixed-size array
             zero_copy_only = _is_zero_copy_only(
                 pa_array.type.storage_dtype, unnest=True)
             if pa_array.type.shape[0] is None:
                 array = pa_array.to_list_of_numpy(
                     zero_copy_only=zero_copy_only)
             else:
                 array = pa_array.to_numpy(zero_copy_only=zero_copy_only)
         else:
             zero_copy_only = _is_zero_copy_only(
                 pa_array.type) and not _is_array_with_nulls(pa_array)
             array = pa_array.to_numpy(
                 zero_copy_only=zero_copy_only).tolist()

     # `array` may itself be an ndarray here, so test its length explicitly
     # instead of relying on truthiness.
     if len(array) > 0:
         # Builtin `object` is used for the dtype checks: the `np.object`
         # alias is not available in current NumPy releases.
         needs_object_dtype = any(
             (isinstance(x, np.ndarray) and
              (x.dtype == object or x.shape != array[0].shape)) or
             (isinstance(x, float) and np.isnan(x))
             for x in array)
         if needs_object_dtype:
             # NOTE(review): NumPy 2.x raises for copy=False when a copy is
             # unavoidable -- confirm the supported NumPy range.
             return np.array(array,
                             copy=False,
                             **{**self.np_array_kwargs, "dtype": object})
     return np.array(array, copy=False, **self.np_array_kwargs)
Example #2
0
 def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
     """Convert a pyarrow array (plain or chunked) into a NumPy array.

     Rows are collected into a list and passed to ``np.array`` together with
     ``self.np_array_kwargs``. If any row is an object-dtype array or an
     array whose shape differs from the first row's, the result is built
     with ``dtype=object`` instead (overriding any ``dtype`` found in
     ``self.np_array_kwargs``).

     Args:
         pa_array: the ``pa.Array`` or ``pa.ChunkedArray`` to convert.

     Returns:
         The NumPy array built from the rows of ``pa_array``.
     """
     zero_copy_only = _is_zero_copy_only(pa_array.type)
     if isinstance(pa_array, pa.ChunkedArray):
         # don't call to_numpy() directly or we end up with a np.array with dtype object
         # call to_numpy on the chunks instead
         # for ArrayExtensionArray call py_list directly to support dynamic dimensions
         if isinstance(pa_array.type, _ArrayXDExtensionType):
             array = [
                 row for chunk in pa_array.chunks
                 for row in chunk.to_pylist()
             ]
         else:
             array = [
                 row for chunk in pa_array.chunks
                 for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
             ]
     else:
         # cast to list of arrays or we end up with a np.array with dtype object
         # for ArrayExtensionArray call py_list directly to support dynamic dimensions
         if isinstance(pa_array.type, _ArrayXDExtensionType):
             array = pa_array.to_pylist()
         else:
             array = pa_array.to_numpy(
                 zero_copy_only=zero_copy_only).tolist()

     if len(array) > 0:
         # Builtin `object` is used for the dtype checks: the `np.object`
         # alias is not available in current NumPy releases.
         needs_object_dtype = any(
             isinstance(x, np.ndarray) and
             (x.dtype == object or x.shape != array[0].shape)
             for x in array)
         if needs_object_dtype:
             # NOTE(review): NumPy 2.x raises for copy=False when a copy is
             # unavoidable -- confirm the supported NumPy range.
             return np.array(array,
                             copy=False,
                             **{**self.np_array_kwargs, "dtype": object})
     return np.array(array, copy=False, **self.np_array_kwargs)
Example #3
0
def _floor_to_unit_multiple(np_datetimes, unit, step, offset=0):
    """Floor datetimes to a multiple of `step` `unit`s, as datetime64[ns].

    `np_datetimes` is a NumPy datetime64 array and `unit` is a NumPy datetime
    unit code such as "M" or "D". Each value is converted to its integer count
    of `unit`, shifted by `offset`, floored to a multiple of `step`, shifted
    back, and finally converted to nanosecond precision. NaT inputs come out
    as NaT.
    """
    unit_dtype = f"datetime64[{unit}]"
    unit_numbers = np_datetimes.astype(unit_dtype).astype(int)
    rounded_numbers = (
        np.floor_divide(unit_numbers + offset, step) * step - offset)
    np_rounded_ns = rounded_numbers.astype(unit_dtype).astype("datetime64[ns]")
    # converting to int made nulls into ... not-null. Make them null again
    np_rounded_ns[np.isnat(np_datetimes)] = "NaT"
    return np_rounded_ns


def make_groupable_array(
        array: pa.Array,
        date_granularity: Optional[DateGranularity]) -> pa.Array:
    """Given an input array, return the array we will group by.

    This is for handling DEPRECATED date conversions. The idea is: with input
    value "2021-03-01T21:12:21.231212312Z", a "year" group should be
    "2021-01-01Z".

    When `date_granularity` is None, `array` is returned unchanged. Otherwise
    the values are truncated to the start of their period and returned as a
    new array built from datetime64[ns] values.
    """
    if date_granularity is None:
        return array

    if date_granularity == DateGranularity.QUARTER:
        # A quarter is a 3-month bucket: floor the month number to a multiple
        # of 3.
        np_datetimes = array.to_numpy(zero_copy_only=False)
        return pa.array(_floor_to_unit_multiple(np_datetimes, "M", 3))

    if date_granularity == DateGranularity.WEEK:
        # numpy "week" is counted from the Epoch -- which happens to be a
        # Thursday. But ISO weeks start Monday, not Thursday -- and so Numpy's
        # "W" type is useless.
        #
        # We do integer math: add 3 to each day number and then floor-divide
        # by 7, so everything from Monday to Sunday falls in the same bucket.
        # We could group by this ... but we convert back to day and subtract
        # the 3, so the group can be formatted (it is the week's Monday).
        np_datetimes = array.to_numpy(zero_copy_only=False)
        return pa.array(
            _floor_to_unit_multiple(np_datetimes, "D", 7, offset=3))

    # Every other granularity maps directly onto a NumPy datetime unit, so
    # casting to that unit and back to nanoseconds does the truncation.
    freq = date_granularity.numpy_unit
    np_rounded_ns = (array.to_numpy(zero_copy_only=False).astype(
        f"datetime64[{freq}]").astype("datetime64[ns]"))
    return pa.array(np_rounded_ns)
Example #4
0
    def _replace_with_indices(
        cls,
        chunk: pa.Array,
        indices: npt.NDArray[np.intp],
        value: npt.NDArray[Any],
    ) -> pa.Array:
        """
        Replace items selected with a set of positional indices.

        Analogous to pyarrow.compute.replace_with_mask, except that replacement
        positions are identified via indices rather than a mask.

        Parameters
        ----------
        chunk : pa.Array
        indices : npt.NDArray[np.intp]
        value : npt.NDArray[Any]
            Replacement value(s).

        Returns
        -------
        pa.Array
        """
        n = len(indices)

        if n == 0:
            return chunk

        start, stop = indices[[0, -1]]

        if (stop - start) == (n - 1):
            # fast path for a contiguous set of indices
            arrays = [
                chunk[:start],
                pa.array(value, type=chunk.type, from_pandas=True),
                chunk[stop + 1:],
            ]
            arrays = [arr for arr in arrays if len(arr)]
            if len(arrays) == 1:
                return arrays[0]
            return pa.concat_arrays(arrays)

        mask = np.zeros(len(chunk), dtype=np.bool_)
        mask[indices] = True

        if pa_version_under5p0:
            arr = chunk.to_numpy(zero_copy_only=False)
            arr[mask] = value
            return pa.array(arr, type=chunk.type)

        if isna(value).all():
            return pc.if_else(mask, None, chunk)

        return pc.replace_with_mask(chunk, mask, value)
Example #5
0
 def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
     """Turn a pyarrow array (plain or chunked) into a NumPy array.

     The rows are first collected into a Python list and then passed to
     ``np.array`` along with ``self.np_array_kwargs``.

     Args:
         pa_array: the ``pa.Array`` or ``pa.ChunkedArray`` to convert.

     Returns:
         The NumPy array built from the rows of ``pa_array``.
     """
     allow_zero_copy_only = _is_zero_copy_only(pa_array.type)
     if not isinstance(pa_array, pa.ChunkedArray):
         # go through a plain list first; otherwise the final np.array ends
         # up with dtype object
         rows: List[np.ndarray] = pa_array.to_numpy(
             zero_copy_only=allow_zero_copy_only).tolist()
     else:
         # converting the ChunkedArray in one go would also give dtype
         # object, so convert chunk by chunk and flatten the rows
         rows = []
         for chunk in pa_array.chunks:
             rows.extend(chunk.to_numpy(zero_copy_only=allow_zero_copy_only))
     return np.array(rows, copy=False, **self.np_array_kwargs)