Example 1
def or_array_nparray(a: pa.Array, b: np.ndarray) -> pa.Array:
    """Perform ``pa.Array | np.ndarray``."""
    output_length = len(a) // 8
    if len(a) % 8 != 0:
        output_length += 1

    if a.null_count == 0:
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned_with_numpy_nonnull(len(a),
                                               a.buffers()[1], a.offset, b,
                                               result)
        return pa.Array.from_buffers(pa.bool_(), len(a),
                                     [None, pa.py_buffer(result)], 0)
    else:
        result = np.zeros(output_length, dtype=np.uint8)
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        null_count = bitmap_or_unaligned_with_numpy(len(a),
                                                    a.buffers()[0],
                                                    a.buffers()[1], a.offset,
                                                    b, result, valid_bits)
        return pa.Array.from_buffers(
            pa.bool_(),
            len(a),
            [pa.py_buffer(valid_bits),
             pa.py_buffer(result)],
            null_count,
        )
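A minimal usage sketch (hypothetical data; the numba kernels bitmap_or_unaligned_with_numpy_nonnull and bitmap_or_unaligned_with_numpy are assumed to be importable from the same module):

import numpy as np
import pyarrow as pa

a = pa.array([True, False, True], type=pa.bool_())   # no nulls -> fast path
b = np.array([False, False, True])
combined = or_array_nparray(a, b)
# expected: a pyarrow BooleanArray of length 3, [True, False, True]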
Example 2
 def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
     zero_copy_only = _is_zero_copy_only(pa_array.type)
     if isinstance(pa_array, pa.ChunkedArray):
         # don't call to_numpy() directly or we end up with a np.array with dtype object
         # call to_numpy on the chunks instead
         # for ArrayExtensionArray call py_list directly to support dynamic dimensions
         if isinstance(pa_array.type, _ArrayXDExtensionType):
             array: List = [
                 row for chunk in pa_array.chunks
                 for row in chunk.to_pylist()
             ]
         else:
             array: List = [
                 row for chunk in pa_array.chunks
                 for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
             ]
     else:
         # cast to list of arrays or we end up with a np.array with dtype object
         # for ArrayExtensionArray call py_list directly to support dynamic dimensions
         if isinstance(pa_array.type, _ArrayXDExtensionType):
             array: List = pa_array.to_pylist()
         else:
             array: List = pa_array.to_numpy(
                 zero_copy_only=zero_copy_only).tolist()
     if len(array) > 0:
         if any(
                 isinstance(x, np.ndarray) and (
                     x.dtype == object or x.shape != array[0].shape)
                 for x in array):
             return np.array(array,
                             copy=False,
                             **{
                                 **self.np_array_kwargs, "dtype": object
                             })
     return np.array(array, copy=False, **self.np_array_kwargs)
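As the comments above note, converting chunk by chunk avoids an object-dtype result; a small standalone sketch of that step (plain pyarrow, no extension types):

import numpy as np
import pyarrow as pa

chunked = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])])
rows = [row for chunk in chunked.chunks
        for row in chunk.to_numpy(zero_copy_only=False)]
np.array(rows)  # array([1, 2, 3, 4]) with an integer dtype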
Example 3
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Check for each element in the data whether it contains the pattern ``pat``.

    This implementation does basic byte-by-byte comparison and is independent
    of any locales or encodings.
    """
    # Convert to UTF-8 bytes
    pat_bytes: bytes = pat.encode()

    offsets_buffer, data_buffer = _extract_string_buffers(data)

    if data.null_count == 0:
        valid_buffer = np.empty(0, dtype=np.uint8)
    else:
        valid_buffer = _buffer_to_view(data.buffers()[0])

    output = _text_contains_case_sensitive_numba(len(data), valid_buffer,
                                                 data.offset, offsets_buffer,
                                                 data_buffer, pat_bytes)

    if data.null_count == 0:
        output_valid = None
    else:
        output_valid = data.buffers()[0].slice(data.offset // 8)
        if data.offset % 8 != 0:
            output_valid = shift_unaligned_bitmap(output_valid,
                                                  data.offset % 8, len(data))

    buffers = [output_valid, pa.py_buffer(output)]
    return pa.Array.from_buffers(pa.bool_(), len(data), buffers,
                                 data.null_count)
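A usage sketch, assuming the numba kernel _text_contains_case_sensitive_numba and the helpers above (_extract_string_buffers, _buffer_to_view, shift_unaligned_bitmap) are available:

import pyarrow as pa

data = pa.array(["spam", "ham", None])
mask = _text_contains_case_sensitive(data, "am")
# expected: a pyarrow BooleanArray [True, True, null]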
Example 4
def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        # we have to coerce before combining chunks, because pyarrow panics if
        # offsets overflow
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pyarrow does not seem to support casting from list to largelist
            # so we convert to large list ourselves and do the re-alloc on the polars/arrow side
            chunks = []
            for arr in array.iterchunks():
                chunks.append(pl.from_arrow(arr).to_arrow())
            array = pa.chunked_array(chunks)

        array = array.combine_chunks()
    return array
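The timestamp branch boils down to this cast chain, which can be reproduced with pyarrow.compute alone (illustrative value; date64 physically stores milliseconds since the epoch):

from datetime import datetime, timezone
import pyarrow as pa
import pyarrow.compute as pc

ts = pa.array([datetime(2021, 3, 1, 12, tzinfo=timezone.utc)],
              type=pa.timestamp("us", tz="UTC"))
ms = pc.cast(pc.cast(ts, pa.timestamp("ms"), safe=False), pa.int64())
pc.cast(ms, pa.date64())  # timezone information is dropped along the way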
Example 5
def arrow_array_to_array_of_proto(
        arrow_type: pa.DataType,
        arrow_array: pa.Array) -> List[Value_pb2.Value]:
    values = []
    if isinstance(arrow_type, pa.ListType):
        proto_list_class = ARROW_LIST_TYPE_TO_PROTO_LIST_CLASS[
            arrow_type.value_type]
        proto_field_name = ARROW_LIST_TYPE_TO_PROTO_FIELD[
            arrow_type.value_type]

        if arrow_type.value_type == PA_TIMESTAMP_TYPE:
            arrow_array = arrow_array.cast(pa.list_(pa.int64()))

        for v in arrow_array.tolist():
            values.append(
                Value_pb2.Value(**{proto_field_name: proto_list_class(val=v)}))
    else:
        proto_field_name = ARROW_TYPE_TO_PROTO_FIELD[arrow_type]

        if arrow_type == PA_TIMESTAMP_TYPE:
            arrow_array = arrow_array.cast(pa.int64())

        for v in arrow_array.tolist():
            values.append(Value_pb2.Value(**{proto_field_name: v}))

    return values
Example 6
def first(*, array: pa.Array, group_splits: np.array, **kwargs) -> pa.Array:
    nonnull_values = array.filter(array.is_valid())
    nonnull_splits = nonnull_group_splits(array, group_splits)
    starts = np.insert(nonnull_splits, 0, 0)
    ends = np.append(nonnull_splits, len(nonnull_values))
    nulls = starts == ends
    indices = pa.array(starts, pa.int64(), mask=nulls)
    return nonnull_values.take(indices)  # taking index NULL gives NULL
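A worked example, assuming nonnull_group_splits from Example 29 below is importable:

import numpy as np
import pyarrow as pa

array = pa.array([None, 1, None, 2, None])
group_splits = np.array([1, 2, 3])   # groups: [None], [1], [None], [2, None]
first(array=array, group_splits=group_splits)
# -> [null, 1, null, 2]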
Example 7
def _extract_string_buffers(arr: pa.Array) -> Tuple[np.ndarray, np.ndarray]:
    start = arr.offset
    end = arr.offset + len(arr)

    offsets = np.asanyarray(arr.buffers()[1]).view(np.int32)[start:end + 1]
    data = np.asanyarray(arr.buffers()[2]).view(np.uint8)

    return offsets, data
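On a small string array the two returned views look like this:

import pyarrow as pa

arr = pa.array(["ab", "cde"])
offsets, data = _extract_string_buffers(arr)
offsets                        # array([0, 2, 5], dtype=int32)
data[:offsets[-1]].tobytes()   # b'abcde'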
Example 8
def _downcast_array(array: pa.Array) -> pa.Array:
    if array.type in (pa.float64(), ):
        array = array.cast(pa.float32())
    elif array.type in (pa.int64(), ):
        array = array.cast(pa.uint16())
    elif array.type in (pa.string(), pa.bool_()):
        pass
    else:
        raise Exception(f"Did not downcast array with type '{array.type}'.")
    return array
Example 9
def nunique(*, array: pa.Array, group_splits: np.array, **kwargs) -> pa.Array:
    nonnull_splits = nonnull_group_splits(array, group_splits)
    nonnull_values = array.filter(
        array.is_valid()).to_numpy(zero_copy_only=False)
    counts = np.fromiter(
        (np.unique(subarr).size
         for subarr in np.split(nonnull_values, nonnull_splits)),
        dtype=np.int64,
        count=len(nonnull_splits) + 1,
    )
    return pa.array(counts)
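A worked example with the same null layout used in the nonnull_group_splits comments (Example 29):

import numpy as np
import pyarrow as pa

array = pa.array([None, 1, None, 2, None])
group_splits = np.array([1, 2, 3])   # groups: [None], [1], [None], [2, None]
nunique(array=array, group_splits=group_splits)
# -> [0, 1, 0, 1]   (nulls are not counted as distinct values)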
Example 10
def _extract_data_buffer_as_np_array(array: pa.Array) -> np.ndarray:
    """Extract the data buffer of a numeric-typed pyarrow.Array as an np.ndarray."""
    dtype = array.type.to_pandas_dtype()
    start = array.offset
    end = array.offset + len(array)
    if pa.types.is_boolean(array.type):
        return np.unpackbits(_buffer_to_view(array.buffers()[1]).view(
            np.uint8),
                             bitorder="little")[start:end].astype(bool)
    else:
        return _buffer_to_view(array.buffers()[1]).view(dtype)[start:end]
Example 11
def reencode_dictionary_array(array: pa.Array) -> pa.Array:
    if len(array.indices) <= len(array.dictionary):
        # Groupby often reduces the number of values considerably. Let's shy
        # away from dictionary when it gives us literally nothing.
        return array.cast(pa.utf8())

    used = np.zeros(len(array.dictionary), np.bool_)
    used[array.indices] = True
    if np.all(used):
        return array  # no edit

    return array.cast(pa.utf8()).dictionary_encode()  # TODO optimize
Example 12
 def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
     if isinstance(pa_array, pa.ChunkedArray):
         if isinstance(pa_array.type, _ArrayXDExtensionType):
             # don't call to_pylist() to preserve dtype of the fixed-size array
             zero_copy_only = _is_zero_copy_only(
                 pa_array.type.storage_dtype, unnest=True)
             if pa_array.type.shape[0] is None:
                 array: List = [
                     row for chunk in pa_array.chunks
                     for row in chunk.to_list_of_numpy(
                         zero_copy_only=zero_copy_only)
                 ]
             else:
                 array: List = [
                     row for chunk in pa_array.chunks
                     for row in chunk.to_numpy(
                         zero_copy_only=zero_copy_only)
                 ]
         else:
             zero_copy_only = _is_zero_copy_only(pa_array.type) and all(
                 not _is_array_with_nulls(chunk)
                 for chunk in pa_array.chunks)
             array: List = [
                 row for chunk in pa_array.chunks
                 for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
             ]
     else:
         if isinstance(pa_array.type, _ArrayXDExtensionType):
             # don't call to_pylist() to preserve dtype of the fixed-size array
             zero_copy_only = _is_zero_copy_only(
                 pa_array.type.storage_dtype, unnest=True)
             if pa_array.type.shape[0] is None:
                 array: List = pa_array.to_list_of_numpy(
                     zero_copy_only=zero_copy_only)
             else:
                 array: List = pa_array.to_numpy(
                     zero_copy_only=zero_copy_only)
         else:
             zero_copy_only = _is_zero_copy_only(
                 pa_array.type) and not _is_array_with_nulls(pa_array)
             array: List = pa_array.to_numpy(
                 zero_copy_only=zero_copy_only).tolist()
     if len(array) > 0:
         if any((isinstance(x, np.ndarray) and
                 (x.dtype == object or x.shape != array[0].shape)) or (
                     isinstance(x, float) and np.isnan(x)) for x in array):
             return np.array(array,
                             copy=False,
                             **{
                                 **self.np_array_kwargs, "dtype": object
                             })
     return np.array(array, copy=False, **self.np_array_kwargs)
Example 13
 def from_arrow(cls, data: pa.Array):
     dtype = Decimal64Dtype.from_arrow(data.type)
     mask_buf = data.buffers()[0]
     mask = (mask_buf if mask_buf is None else pa_mask_buffer_to_mask(
         mask_buf, len(data)))
     data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int64"))
     data_64 = data_128[::2].copy()
     return cls(
         data=Buffer(data_64.view("uint8")),
         size=len(data),
         dtype=dtype,
         mask=mask,
     )
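The low-word extraction can be sanity-checked with numpy alone (cudf's Buffer, Decimal64Dtype and cupy are what the method itself assumes):

from decimal import Decimal
import numpy as np
import pyarrow as pa

arr = pa.array([Decimal("1"), Decimal("2")], type=pa.decimal128(5, 0))
raw = np.frombuffer(arr.buffers()[1], dtype=np.int64, count=2 * len(arr))
raw[::2]   # low 64 bits of each little-endian 128-bit value: array([1, 2])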
Example 14
def flatten_nested(
    array: pa.Array, return_parent_indices: bool = False
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Flattens all the list arrays nesting an array.

  If `array` is not list-like, it is returned as is.

  Args:
    array: pa.Array to flatten.
    return_parent_indices: If True, also returns the parent indices array.

  Returns:
    A tuple. The first term is the flattened array. The second term is None
    if `return_parent_indices` is False; otherwise it's a parent indices array
    parallel to the flattened array: if parent_indices[i] = j, then
    flattened_array[i] belongs to the j-th element of the input array.
  """
  parent_indices = None

  while is_list_like(array.type):
    if return_parent_indices:
      cur_parent_indices = array_util.GetFlattenedArrayParentIndices(
          array).to_numpy()
      if parent_indices is None:
        parent_indices = cur_parent_indices
      else:
        parent_indices = parent_indices[cur_parent_indices]
    array = array.flatten()

  # the array was not nested in the first place.
  if return_parent_indices and parent_indices is None:
    parent_indices = np.arange(len(array))
  return array, parent_indices
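To make the parent-index contract concrete, pyarrow.compute.list_parent_indices plays the same role as array_util.GetFlattenedArrayParentIndices in this sketch:

import pyarrow as pa
import pyarrow.compute as pc

nested = pa.array([[1, 2], [], [3]])
flat = nested.flatten()             # [1, 2, 3]
pc.list_parent_indices(nested)      # [0, 0, 2]: flat[i] belongs to nested[parent[i]]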
Example 15
    def _get_validity_buffer(
        self, arr: pa.Array
    ) -> Optional[Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str,
                                                     str]]]:
        """
        Get column's validity buffer.

        Parameters
        ----------
        arr : pa.Array
            PyArrow array holding column's data.

        Returns
        -------
        tuple or None
            Tuple of ``OmnisciProtocolBuffer`` and protocol dtype representation of the buffer's underlying data.
            None if column is non-nullable (``self.describe_null == ColumnNullType.NON_NULLABLE``).
        """
        # According to the Arrow's memory layout, the validity buffer is always present at zero position:
        # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
        validity_buffer = arr.buffers()[0]
        if validity_buffer is None:
            return None

        # If present, the validity buffer is always a bit-mask.
        data_size = self._get_buffer_size(bit_width=1)
        return (
            OmnisciProtocolBuffer(validity_buffer, data_size),
            (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE),
        )
Example 16
    def _get_data_buffer(
        self, arr: pa.Array
    ) -> Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str, str]]:
        """
        Get column's data buffer.

        Parameters
        ----------
        arr : pa.Array
            PyArrow array holding column's data.

        Returns
        -------
        tuple
            Tuple of ``OmnisciProtocolBuffer`` and protocol dtype representation of the buffer's underlying data.
        """
        if self.dtype[0] == DTypeKind.CATEGORICAL:
            # For dictionary data the buffer has to return categories codes
            arr = arr.indices

        arrow_type = self._dtype_from_pyarrow(arr.type)
        buff_size = (
            self._get_buffer_size(
                bit_width=arrow_type[1]) if self.dtype[0] != DTypeKind.STRING
            # We don't chunk string buffers as it would require modifying offset values,
            # so just return the whole data buffer for every chunk.
            else None)

        return (
            # According to the Arrow's memory layout, the data buffer is always present
            # at the last position of `.buffers()`:
            # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
            OmnisciProtocolBuffer(arr.buffers()[-1], buff_size),
            arrow_type,
        )
Example 17
    def _recursion_helper(
        query_path: types.FeaturePath, array: pa.Array,
        example_indices: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Recursion helper."""
        if not query_path:
            return array, example_indices
        array_type = array.type
        if (not is_list_like(array_type)
                or not pa.types.is_struct(array_type.value_type)):
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    query_path, array_type))
        flat_struct_array = array.flatten()
        flat_indices = None
        if example_indices is not None:
            flat_indices = example_indices[
                array_util.GetFlattenedArrayParentIndices(array).to_numpy()]

        step = query_path.steps()[0]
        try:
            child_array = flat_struct_array.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        relative_path = types.FeaturePath(query_path.steps()[1:])
        return _recursion_helper(relative_path, child_array, flat_indices)
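The shape of data this expects (a list of structs) and the flatten/field steps can be illustrated with plain pyarrow:

import pyarrow as pa

arr = pa.array([[{"f": 1}, {"f": 2}], [{"f": 3}]])
flat_struct = arr.flatten()     # StructArray of length 3
flat_struct.field("f")          # [1, 2, 3]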
Example 18
def _text_strip(data: pa.Array, to_strip) -> pa.Array:
    """
    Strip the characters of ``to_strip`` from start and end of each element in the data.
    """
    if len(data) == 0:
        return data

    offsets, data_buffer = _extract_string_buffers(data)

    valid_buffer = data.buffers()[0]
    valid_offset = data.offset
    builder = StringArrayBuilder(max(len(data_buffer), len(data)))

    _do_strip(
        valid_buffer,
        valid_offset,
        offsets,
        data_buffer,
        len(data),
        to_strip,
        inout_builder=builder,
    )

    result_array = finalize_string_array(builder, pa.string())
    return result_array
Example 19
  def _ListArrayToTensor(
      self, list_array: pa.Array,
      produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
    """Converts a ListArray to a dense tensor."""
    values = list_array.flatten()
    batch_size = len(list_array)
    expected_num_elements = batch_size * self._unbatched_flat_len
    if len(values) != expected_num_elements:
      raise ValueError(
          "Unable to convert a {} to a tensor of type spec {}: size mismatch. "
          "Expected {} elements but got {}. "
          "If your data type is tf.Example, make sure that the feature "
          "is always present, and have the same length in all the examples. "
          "TFX users should make sure there is no data anomaly for the feature."
          .format(
              type(list_array), self.type_spec, expected_num_elements,
              len(values)))
    actual_shape = list(self._shape)
    actual_shape[0] = batch_size
    if self._convert_to_binary_fn is not None:
      values = self._convert_to_binary_fn(values)
    values_np = np.asarray(values).reshape(actual_shape)
    if produce_eager_tensors:
      return tf.convert_to_tensor(values_np)

    return values_np
Example 20
 def _recursion_helper(
     feature_path: types.FeaturePath, array: pa.Array,
     weights: Optional[np.ndarray]
 ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
     """Recursion helper."""
     array_type = array.type
     if is_list_like(array_type) and pa.types.is_struct(
             array_type.value_type):
         if not enumerate_leaves_only:
             yield (feature_path, array, weights)
         flat_struct_array = array.flatten()
         flat_weights = None
         if weights is not None:
             flat_weights = weights[
                 array_util.GetFlattenedArrayParentIndices(
                     array).to_numpy()]
         for field in flat_struct_array.type:
             field_name = field.name
             # use "yield from" after PY 3.3.
             for e in _recursion_helper(feature_path.child(field_name),
                                        flat_struct_array.field(field_name),
                                        flat_weights):
                 yield e
     else:
         yield (feature_path, array, weights)
Example 21
 def ufunc_caller(*, array: pa.Array, group_splits: np.array,
                  **kwargs) -> pa.Array:
     nonnull_splits = nonnull_group_splits(array, group_splits)
     nonnull_values = array.filter(
         array.is_valid()).to_numpy(zero_copy_only=False)
     if force_otype:
         otype = force_otype
     else:
         otype = nonnull_values.dtype
     if pa.types.is_unicode(array.type):
         zero = ""
     else:
         zero = otype.type()
     np_result, np_empty_indices = call_ufunc(nonnull_values,
                                              nonnull_splits, otype, zero)
     return pa.array(np_result, mask=np_empty_indices)
Example 22
def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    # simplest solution is to cast to (large)-string arrays
    # this is copy and expensive
    elif isinstance(array.type, pa.DictionaryType):
        if pa.types.is_string(array.type.value_type):
            array = pa.compute.cast(array, pa.large_utf8())
        else:
            raise ValueError(
                "polars does not support dictionary encoded types other than strings"
            )

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            array = pa.compute.cast(array, pa.large_list(array.type.value_type))
        array = array.combine_chunks()
    return array
Example 23
def ToSingletonListArray(array: pa.Array):
    """Converts an array of `type` to a `ListArray<type>`.

  Where result[i] is null if array[i] is null; [array[i]] otherwise.

  Args:
    array: an arrow Array.
  Returns:
    a ListArray.
  """
    array_size = len(array)
    # fast path: values are not copied.
    if array.null_count == 0:
        return pa.ListArray.from_arrays(
            pa.array(np.arange(0, array_size + 1, dtype=np.int32)), array)

    # null_mask[i] = 1 iff array[i] is null.
    null_mask = np.asarray(GetArrayNullBitmapAsByteArray(array))
    # presence_mask[i] = 0 iff array[i] is null
    presence_mask = np.subtract(1, null_mask, dtype=np.uint8)
    offsets_np = np.zeros((array_size + 1, ), np.int32)
    np.cumsum(presence_mask, out=offsets_np[1:])

    # This is the null mask over offsets (but ListArray.from_arrays() uses it as
    # the null mask for the ListArray), so its length is array_size +1, but the
    # last element is always False.
    list_array_null_mask = np.zeros((array_size + 1, ), np.bool_)
    list_array_null_mask[:array_size] = null_mask.view(np.bool_)
    values_non_null = array.take(pa.array(np.flatnonzero(presence_mask)))
    return pa.ListArray.from_arrays(
        pa.array(offsets_np, mask=list_array_null_mask), values_non_null)
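For instance, with one null input element (GetArrayNullBitmapAsByteArray comes from the surrounding array_util module):

import pyarrow as pa

ToSingletonListArray(pa.array([1, None, 3]))
# -> ListArray [[1], null, [3]]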
Example 24
    def update(self,
               feature_array: pa.Array,
               values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial numeric statistics using the input value."""

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_array:
            return

        flattened_value_array = feature_array.flatten()
        # Note: to_numpy will fail if flattened_value_array is empty.
        if not flattened_value_array:
            return
        values = np.asarray(flattened_value_array)
        nan_mask = np.isnan(values)
        self.num_nan += np.sum(nan_mask)
        non_nan_mask = ~nan_mask
        values_no_nan = values[non_nan_mask]

        # We do this check to avoid failing in np.min/max with empty array.
        if values_no_nan.size == 0:
            return
        # This is to avoid integer overflow when computing sum or sum of squares.
        values_no_nan_as_double = values_no_nan.astype(np.float64)
        self.sum += np.sum(values_no_nan_as_double)
        self.sum_of_squares += np.sum(values_no_nan_as_double *
                                      values_no_nan_as_double)
        # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
        # to numpy 1.16
        curr_min = np.min(values_no_nan)
        curr_max = np.max(values_no_nan)
        self.min = min(self.min, curr_min)
        self.max = max(self.max, curr_max)
        if curr_min == float('-inf') or curr_max == float('inf'):
            finite_values = values_no_nan[np.isfinite(values_no_nan)]
            if finite_values.size > 0:
                self.finite_min = min(self.finite_min, np.min(finite_values))
                self.finite_max = max(self.finite_max, np.max(finite_values))

        self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
        self.quantiles_summary = values_quantiles_combiner.add_input(
            self.quantiles_summary,
            [values_no_nan, np.ones_like(values_no_nan)])
        if weights is not None:
            value_parent_indices = np.asarray(
                array_util.GetFlattenedArrayParentIndices(feature_array))
            flat_weights = weights[value_parent_indices]
            flat_weights_no_nan = flat_weights[non_nan_mask]
            weighted_values = flat_weights_no_nan * values_no_nan
            self.weighted_sum += np.sum(weighted_values)
            self.weighted_sum_of_squares += np.sum(weighted_values *
                                                   values_no_nan)
            self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
                self.weighted_quantiles_summary,
                [values_no_nan, flat_weights_no_nan])
            self.weighted_total_num_values += np.sum(flat_weights_no_nan)
Example 25
def make_groupable_array(
        array: pa.Array,
        date_granularity: Optional[DateGranularity]) -> pa.Array:
    """Given an input array, return the array we will group by.

    This is for handling DEPRECATED date conversions. The idea is: with input
    value "2021-03-01T21:12:21.231212312Z", a "year" group should be
    "2021-01-01Z".
    """
    if date_granularity is None:
        return array

    if date_granularity == DateGranularity.QUARTER:
        np_datetime_ns = array.to_numpy(zero_copy_only=False)
        np_datetime_m = np_datetime_ns.astype("datetime64[M]").astype(int)
        rounded_month_numbers = np.floor_divide(np_datetime_m, 3) * 3
        np_rounded_ns = rounded_month_numbers.astype("datetime64[M]").astype(
            "datetime64[ns]")
        # converting to int made nulls into ... not-null. Make them null again
        np_rounded_ns[np.isnan(np_datetime_ns)] = "NaT"
        return pa.array(np_rounded_ns)

    if date_granularity == DateGranularity.WEEK:
        # numpy "week" is counted from the Epoch -- which happens to be a
        # Thursday. But ISO weeks start Monday, not Thursday -- and so Numpy's
        # "W" type is useless.
        #
        # We do integer math: add 3 to each date and then floor-divide by 7.
        # That makes "1970-01-01 [Thursday] + 3" => Sunday -- so when we
        # floor-divide, everything from Monday to Sunday falls in the same
        # bucket. We could group by this ... but we convert back to day and
        # subtract the 3, so the group can be formatted.
        np_datetime_ns = array.to_numpy(zero_copy_only=False)
        np_datetime_d = np_datetime_ns.astype("datetime64[D]").astype(int)
        rounded_day_numbers = np.floor_divide(np_datetime_d + 3, 7) * 7 - 3
        np_rounded_ns = rounded_day_numbers.astype("datetime64[D]").astype(
            "datetime64[ns]")
        # converting to int made nulls into ... not-null. Make them null again
        np_rounded_ns[np.isnan(np_datetime_ns)] = "NaT"
        return pa.array(np_rounded_ns)

    freq = date_granularity.numpy_unit
    np_rounded_ns = (array.to_numpy(zero_copy_only=False).astype(
        f"datetime64[{freq}]").astype("datetime64[ns]"))
    return pa.array(np_rounded_ns)
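The week arithmetic is easiest to check against a couple of concrete dates:

import numpy as np

days = np.array(["2021-03-01", "2021-03-07"], dtype="datetime64[D]").astype(int)
# 2021-03-01 is a Monday and 2021-03-07 the following Sunday; both land in one bucket
rounded = np.floor_divide(days + 3, 7) * 7 - 3
rounded.astype("datetime64[D]")   # ['2021-03-01', '2021-03-01']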
Example 26
    def add_input(self, accumulator: _PartialNLStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialNLStats:
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values which
        should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if feature_path not in self._valid_feature_paths:
            accumulator.invalidate = True
            return accumulator

        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator

        if feature_type not in self._feature_type_fns:
            accumulator.invalidate = True
            return accumulator

        feature_type_fn = self._feature_type_fns[feature_type]

        vocab = None
        rvocab = None
        if self._nld_vocabularies[feature_path]:
            vocab_name = self._nld_vocabularies[feature_path]
            vocab = self._vocabs[vocab_name]
            rvocab = self._rvocabs[vocab_name]

        excluded_string_tokens = self._nld_excluded_string_tokens[feature_path]
        excluded_int_tokens = self._nld_excluded_int_tokens[feature_path]
        oov_string_tokens = self._nld_oov_string_tokens[feature_path]
        int_tokens = self._nld_specified_int_tokens[feature_path]
        string_tokens = self._nld_specified_str_tokens[feature_path]
        sequence_length_excluded_int_tokens = (
            self._nld_sequence_length_excluded_int_tokens[feature_path])
        sequence_length_excluded_string_tokens = (
            self._nld_sequence_length_excluded_string_tokens[feature_path])

        # TODO(b/175875824): Benchmark and optimize performance.
        for row in feature_array.to_pylist():
            if row is not None:
                feature_type_fn(row, accumulator, excluded_string_tokens,
                                excluded_int_tokens, oov_string_tokens, vocab,
                                rvocab, int_tokens, string_tokens,
                                sequence_length_excluded_int_tokens,
                                sequence_length_excluded_string_tokens,
                                self._num_histogram_buckets)
        return accumulator
Example 27
    def _replace_with_indices(
        cls,
        chunk: pa.Array,
        indices: npt.NDArray[np.intp],
        value: npt.NDArray[Any],
    ) -> pa.Array:
        """
        Replace items selected with a set of positional indices.

        Analogous to pyarrow.compute.replace_with_mask, except that replacement
        positions are identified via indices rather than a mask.

        Parameters
        ----------
        chunk : pa.Array
        indices : npt.NDArray[np.intp]
        value : npt.NDArray[Any]
            Replacement value(s).

        Returns
        -------
        pa.Array
        """
        n = len(indices)

        if n == 0:
            return chunk

        start, stop = indices[[0, -1]]

        if (stop - start) == (n - 1):
            # fast path for a contiguous set of indices
            arrays = [
                chunk[:start],
                pa.array(value, type=chunk.type, from_pandas=True),
                chunk[stop + 1:],
            ]
            arrays = [arr for arr in arrays if len(arr)]
            if len(arrays) == 1:
                return arrays[0]
            return pa.concat_arrays(arrays)

        mask = np.zeros(len(chunk), dtype=np.bool_)
        mask[indices] = True

        if pa_version_under5p0:
            arr = chunk.to_numpy(zero_copy_only=False)
            arr[mask] = value
            return pa.array(arr, type=chunk.type)

        if isna(value).all():
            return pc.if_else(mask, None, chunk)

        return pc.replace_with_mask(chunk, mask, value)
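The contiguous-indices fast path amounts to slicing and concatenating, e.g. (illustrative values):

import numpy as np
import pyarrow as pa

chunk = pa.array([1, 2, 3, 4, 5])
value = np.array([20, 30])           # replaces positions 1 and 2
pa.concat_arrays([chunk[:1],
                  pa.array(value, type=chunk.type, from_pandas=True),
                  chunk[3:]])
# -> [1, 20, 30, 4, 5]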
Example 28
def or_array_array(a: pa.Array, b: pa.Array) -> pa.Array:
    """Perform ``pyarrow.Array | pyarrow.Array``."""
    output_length = len(a) // 8
    if len(a) % 8 != 0:
        output_length += 1

    if a.null_count == 0 and b.null_count == 0:
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(len(a),
                            a.buffers()[1], a.offset,
                            b.buffers()[1], b.offset, result)
        return pa.Array.from_buffers(pa.bool_(), len(a),
                                     [None, pa.py_buffer(result)], 0)
    elif a.null_count == 0:
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(len(a),
                            a.buffers()[1], a.offset,
                            b.buffers()[1], b.offset, result)
        # b has nulls, mark all occasions of b(None) & a(True) as True -> valid_bits = a.data or b.valid_bits
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(len(a),
                            a.buffers()[1], a.offset,
                            b.buffers()[0], b.offset, valid_bits)
        return pa.Array.from_buffers(
            pa.bool_(), len(a),
            [pa.py_buffer(valid_bits),
             pa.py_buffer(result)])
    elif b.null_count == 0:
        return or_array_array(b, a)
    else:
        result = np.zeros(output_length, dtype=np.uint8)
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        null_count = masked_bitmap_or_unaligned(
            len(a),
            a.buffers()[0],
            a.buffers()[1],
            a.offset,
            b.buffers()[0],
            b.buffers()[1],
            b.offset,
            result,
            valid_bits,
        )
        return pa.Array.from_buffers(
            pa.bool_(),
            len(a),
            [pa.py_buffer(valid_bits),
             pa.py_buffer(result)],
            null_count,
        )
Example 29
def nonnull_group_splits(array: pa.Array, group_splits: np.array) -> np.array:
    # in an array [null, 1, null, 2, null]
    # with group_splits [1, 2, 3], groups are [null], [1], [null], [2, null]
    # n_nulls_by_index will be [1, 1, 2, 2, 3]
    n_nulls_by_index = np.cumsum(
        array.is_null().to_numpy(zero_copy_only=False),
        dtype=np.min_scalar_type(-len(array)),
    )
    # non-null array is [1, 2]
    # we want groups [], [1], [], [2]
    # we want nonnull_group_splits [0, 1, 1]
    return group_splits - n_nulls_by_index[group_splits - 1]
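The comments above correspond to this concrete call:

import numpy as np
import pyarrow as pa

array = pa.array([None, 1, None, 2, None])
group_splits = np.array([1, 2, 3])
nonnull_group_splits(array, group_splits)   # array([0, 1, 1])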
Example 30
def _extract_isnull_bitmap(arr: pa.Array, offset: int, length: int):
    """
    Extract isnull bitmap with offset and padding.

    Ensures that even when pyarrow does return an empty bitmap that a filled
    one will be returned.
    """
    buf = _buffer_to_view(arr.buffers()[0])
    if len(buf) > 0:
        return buf[offset:offset + length]
    else:
        return np.full(length, fill_value=255, dtype=np.uint8)