Example #1
0
  def list_lengths(self, path: types.FeaturePath) -> np.ndarray:
    """Returns a numpy array containing the length of each feature list.

    If the requested path is not present in the record batch wrapped by the
    InputBatch, the returned array will consist of zeros, and be of length equal
    to the number of rows in the record batch.

    Args:
      path: The path for which to return list lengths.

    Returns:
      An ndarray containing the lengths of each nested list. The returned
      ndarray will be of shape (N,) where N is the number of rows in the
      referenced array (or in the record batch, if the path cannot be found).

    Raises:
      ValueError: When the referenced array is neither a ListArray nor null.
    """
    cache_key = ('list_lengths({})', path)
    cached = self._cache.get(cache_key)
    if cached is not None:
      return cached
    num_rows = self._record_batch.num_rows
    try:
      array, _ = arrow_util.get_array(
          self._record_batch, path, return_example_indices=False)
    except KeyError:
      # Path not in the record batch: one zero per row.
      result = np.full(num_rows, 0)
    else:
      if pa.types.is_null(array.type):
        result = np.full(num_rows, 0)
      elif arrow_util.is_list_like(array.type):
        result = np.asarray(array_util.ListLengthsFromListArray(array))
      else:
        raise ValueError('Can only compute list lengths on list arrays, found '
                         '{}'.format(array.type))
    self._cache[cache_key] = result
    return result
def DecodedExamplesToRecordBatch(
        decoded_examples: List[types.Example]) -> pa.RecordBatch:
    """Converts a list of types.Example to an Arrow RecordBatch.

    where types.Example is Dict[Union[bytes, unicode], Union[None, np.ndarray]]
    The result record batch has M rows and N columns where M is the number of
    examples in the list and N is the number of unique features in the examples.
    Each column is either a ListArray<primitive|string|binary> or a NullArray.
    None and missing feature handling:
      - if a feature's value is None in an example, then its corresponding
        column in the result batch will have a null at the corresponding
        position.
      - if a feature's value is always None across all the examples in the
        input list, then its corresponding column in the result batch will be
        a NullArray.
      - if an example does not contain a feature (in the universe of features),
        then the column of that feature will have a null at the corresponding
        position.

    Args:
      decoded_examples: a list of types.Example
        (Dict[Union[bytes, unicode], Union[None, np.ndarray]]).

    Returns:
      a pa.RecordBatch.

    Raises:
      ValueError: when the conversion fails.
      TypeError: when some of the output columns are not of supported types.
    """
    if not decoded_examples:
        return pa.RecordBatch.from_arrays([], [])

    # pa.array() on a list of dicts infers a StructArray whose fields are the
    # union of all feature names seen across the examples.
    struct_array = pa.array(decoded_examples)
    if not pa.types.is_struct(struct_array.type):
        raise ValueError("Unexpected Arrow type created from input")
    field_names = [f.name for f in struct_array.type]
    if not field_names:
        return _GetEmptyRecordBatch(len(decoded_examples))
    value_arrays = struct_array.flatten()
    # Validate that each non-null column is a list of a supported value type.
    for name, array in zip(field_names, value_arrays):
        if pa.types.is_null(array.type):
            # All-None feature: a NullArray column is allowed as-is.
            continue
        if not arrow_util.is_list_like(array.type):
            raise TypeError(
                "Expected list arrays for field {} but got {}".format(
                    name, array.type))
        value_type = array.type.value_type
        if (not pa.types.is_integer(value_type)
                and not pa.types.is_floating(value_type)
                and not arrow_util.is_binary_like(value_type)
                and not pa.types.is_null(value_type)):
            raise TypeError("Type not supported: {} {}".format(
                name, array.type))

    return pa.RecordBatch.from_arrays(value_arrays, field_names)
  def update(self,
             feature_path: types.FeaturePath,
             feature_array: pa.Array,
             feature_type: types.FeatureNameStatisticsType,
             make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
             weights: Optional[np.ndarray] = None) -> None:
    """Update the partial common statistics using the input value.

    Args:
      feature_path: Path of the feature whose statistics are being updated.
      feature_array: Arrow (possibly nested list) array of feature values.
      feature_type: The statistics type of the feature; must be consistent
        across calls for the same feature.
      make_quantiles_sketch_fn: Factory for a quantiles sketch, used to lazily
        build per-nest-level presence/valency stats on first update.
      weights: Optional per-row weights aligned with the outermost level of
        `feature_array`.

    Raises:
      TypeError: If `feature_type` conflicts with a previously seen type.
      ValueError: If the nest level of `feature_array` conflicts with a
        previously seen array for this feature.
    """
    if self.type is None:
      self.type = feature_type  # pytype: disable=annotation-type-mismatch
    elif feature_type is not None and self.type != feature_type:
      raise TypeError('Cannot determine the type of feature %s. '
                      'Found values of types %s and %s.' %
                      (feature_path, self.type, feature_type))

    nest_level = arrow_util.get_nest_level(feature_array.type)
    if self.presence_and_valency_stats is None:
      # Lazily allocate one stats accumulator per nest level.
      self.presence_and_valency_stats = [
          _PresenceAndValencyStats(make_quantiles_sketch_fn)
          for _ in range(nest_level)
      ]
    elif nest_level != len(self.presence_and_valency_stats):
      raise ValueError('Inconsistent nestedness in feature {}: {} vs {}'.format(
          feature_path, nest_level, len(self.presence_and_valency_stats)))

    # And there's nothing we can collect in this case.
    if not feature_array:
      return

    level = 0
    while arrow_util.is_list_like(feature_array.type):
      # Fix: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
      # `np.bool_` is the supported boolean scalar type for .view().
      presence_mask = ~np.asarray(
          array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(
              np.bool_)
      num_values = np.asarray(
          array_util.ListLengthsFromListArray(feature_array))
      num_values_not_none = num_values[presence_mask]
      self.presence_and_valency_stats[level].update(feature_array,
                                                    presence_mask, num_values,
                                                    num_values_not_none,
                                                    weights)
      flattened = feature_array.flatten()
      if weights is not None:
        # Propagate each row's weight down to its flattened child values.
        parent_indices = array_util.GetFlattenedArrayParentIndices(
            feature_array).to_numpy()
        weights = weights[parent_indices]
      feature_array = flattened
      level += 1
def get_feature_type_from_arrow_type(
        feature_path: types.FeaturePath,
        arrow_type: pa.DataType) -> Optional[types.FeatureNameStatisticsType]:
    """Get feature type from Arrow type.

    Args:
      feature_path: path of the feature.
      arrow_type: Arrow DataType.

    Returns:
      A statistics_pb2.FeatureNameStatistics.Type value or None if arrow_type
      is null (which means it cannot be determined for now).

    Raises:
      TypeError: if the type is not supported.
    """
    if pa.types.is_null(arrow_type):
        return None
    if not arrow_util.is_list_like(arrow_type):
        raise TypeError(
            'Expected feature column to be a '
            '(Large)List<primitive|struct> or null, but feature {} '
            'was {}.'.format(feature_path, arrow_type))

    inner_type = arrow_util.get_innermost_nested_type(arrow_type)
    # Map the innermost value type to a statistics type via a predicate table.
    dispatch = (
        (pa.types.is_integer, statistics_pb2.FeatureNameStatistics.INT),
        (pa.types.is_floating, statistics_pb2.FeatureNameStatistics.FLOAT),
        (arrow_util.is_binary_like, statistics_pb2.FeatureNameStatistics.STRING),
        (pa.types.is_struct, statistics_pb2.FeatureNameStatistics.STRUCT),
    )
    for predicate, stats_type in dispatch:
        if predicate(inner_type):
            return stats_type
    if pa.types.is_null(inner_type):
        # Innermost type is null: type cannot be determined yet.
        return None

    raise TypeError('Feature {} has unsupported arrow type: {}'.format(
        feature_path, arrow_type))
Example #5
0
  def testIsListLike(self):
    """Checks is_list_like accepts (large) list types and rejects scalars."""
    list_types = [pa.list_(pa.int64()), pa.large_list(pa.int64())]
    scalar_types = [pa.binary(), pa.int64(), pa.large_string()]

    for data_type in list_types:
      self.assertTrue(arrow_util.is_list_like(data_type))
    for data_type in scalar_types:
      self.assertFalse(arrow_util.is_list_like(data_type))