Example #1
 def _recursion_helper(
     feature_path: types.FeaturePath, array: pa.Array,
     weights: Optional[np.ndarray]
 ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
     """Recursion helper."""
     array_type = array.type
     if is_list_like(array_type) and pa.types.is_struct(
             array_type.value_type):
         if not enumerate_leaves_only:
             yield (feature_path, array, weights)
         flat_struct_array = array.flatten()
         flat_weights = None
         if weights is not None:
             flat_weights = weights[
                 array_util.GetFlattenedArrayParentIndices(
                     array).to_numpy()]
         for field in flat_struct_array.type:
             field_name = field.name
              yield from _recursion_helper(feature_path.child(field_name),
                                           flat_struct_array.field(field_name),
                                           flat_weights)
     else:
         yield (feature_path, array, weights)
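
For context, here is a minimal standalone sketch of the same flatten-and-realign step in plain pyarrow. `array_util.GetFlattenedArrayParentIndices` comes from tfx_bsl, so the parent indices are recomputed here from the list offsets; the data and field name are illustrative.

import numpy as np
import pyarrow as pa

# Two examples; the second contains two structs.
array = pa.array([[{"a": 1.0}], [{"a": 2.0}, {"a": 3.0}]])
weights = np.array([0.5, 2.0])

flat_struct = array.flatten()  # struct<a: double> array of length 3.
# Parent indices map each flattened value back to its example.
offsets = array.offsets.to_numpy()
parent_indices = np.repeat(np.arange(len(array)), np.diff(offsets))
flat_weights = weights[parent_indices]  # [0.5, 2.0, 2.0]
leaf = flat_struct.field("a")  # Leaf values [1.0, 2.0, 3.0].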
Example #2
  def _ListArrayToTensor(
      self, list_array: pa.Array,
      produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
    """Converts a ListArray to a dense tensor."""
    values = list_array.flatten()
    batch_size = len(list_array)
    expected_num_elements = batch_size * self._unbatched_flat_len
    if len(values) != expected_num_elements:
      raise ValueError(
          "Unable to convert a {} to a tensor of type spec {}: size mismatch. "
          "Expected {} elements but got {}. "
          "If your data type is tf.Example, make sure that the feature "
          "is always present, and have the same length in all the examples. "
          "TFX users should make sure there is no data anomaly for the feature."
          .format(
              type(list_array), self.type_spec, expected_num_elements,
              len(values)))
    actual_shape = list(self._shape)
    actual_shape[0] = batch_size
    if self._convert_to_binary_fn is not None:
      values = self._convert_to_binary_fn(values)
    values_np = np.asarray(values).reshape(actual_shape)
    if produce_eager_tensors:
      return tf.convert_to_tensor(values_np)

    return values_np
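
A quick sketch of the happy path of this conversion, assuming an unbatched flat length of 2 per example (standing in for `self._unbatched_flat_len`):

import numpy as np
import pyarrow as pa

list_array = pa.array([[1, 2], [3, 4], [5, 6]])
batch_size = len(list_array)
values = list_array.flatten()
# The size check above guards this reshape.
assert len(values) == batch_size * 2
dense = np.asarray(values).reshape((batch_size, 2))
# dense == [[1, 2], [3, 4], [5, 6]]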
Example #3
    def _recursion_helper(
        query_path: types.FeaturePath, array: pa.Array,
        example_indices: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Recursion helper."""
        if not query_path:
            return array, example_indices
        array_type = array.type
        if (not is_list_like(array_type)
                or not pa.types.is_struct(array_type.value_type)):
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    query_path, array_type))
        flat_struct_array = array.flatten()
        flat_indices = None
        if example_indices is not None:
            flat_indices = example_indices[
                array_util.GetFlattenedArrayParentIndices(array).to_numpy()]

        step = query_path.steps()[0]
        try:
            child_array = flat_struct_array.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        relative_path = types.FeaturePath(query_path.steps()[1:])
        return _recursion_helper(relative_path, child_array, flat_indices)
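
To illustrate one step of that recursion, here is a sketch in plain pyarrow: flatten one level of list<struct<...>> and select the child field named by the query step ("user" is an illustrative field name).

import pyarrow as pa

array = pa.array([[{"user": 1}], [{"user": 2}, {"user": 3}]])
flat_struct = array.flatten()      # struct<user: int64> array of length 3.
child = flat_struct.field("user")  # Next array for query step "user".
# flat_struct.field() raises KeyError for an unknown step, as handled above.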
Example #4
def flatten_nested(
    array: pa.Array, return_parent_indices: bool = False
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Flattens all the list arrays nesting an array.

  If `array` is not list-like, it is returned as-is.

  Args:
    array: pa.Array to flatten.
    return_parent_indices: If True, also returns the parent indices array.

  Returns:
    A tuple. The first term is the flattened array. The second term is None
    if `return_parent_indices` is False; otherwise it's a parent indices array
    parallel to the flattened array: if parent_indices[i] = j, then
    flattened_array[i] belongs to the j-th element of the input array.
  """
  parent_indices = None

  while is_list_like(array.type):
    if return_parent_indices:
      cur_parent_indices = array_util.GetFlattenedArrayParentIndices(
          array).to_numpy()
      if parent_indices is None:
        parent_indices = cur_parent_indices
      else:
        parent_indices = parent_indices[cur_parent_indices]
    array = array.flatten()

  # The array was not nested in the first place.
  if return_parent_indices and parent_indices is None:
    parent_indices = np.arange(len(array))
  return array, parent_indices
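
A short usage sketch of `flatten_nested` above (assuming the `is_list_like` and `array_util` helpers it depends on are importable):

import pyarrow as pa

nested = pa.array([[[1, 2]], [[3], [4, 5]]])  # list<list<int64>>
flat, parent_indices = flatten_nested(nested, return_parent_indices=True)
# flat           -> [1, 2, 3, 4, 5]
# parent_indices -> [0, 0, 1, 1, 1]  (outermost element each value came from)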
Example #5
    def update(self,
               feature_array: pa.Array,
               values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial numeric statistics using the input value."""

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_array:
            return

        flattened_value_array = feature_array.flatten()
        # Note: to_numpy will fail if flattened_value_array is empty.
        if not flattened_value_array:
            return
        values = np.asarray(flattened_value_array)
        nan_mask = np.isnan(values)
        self.num_nan += np.sum(nan_mask)
        non_nan_mask = ~nan_mask
        values_no_nan = values[non_nan_mask]

        # We do this check to avoid failing in np.min/max with empty array.
        if values_no_nan.size == 0:
            return
        # This is to avoid integer overflow when computing sum or sum of squares.
        values_no_nan_as_double = values_no_nan.astype(np.float64)
        self.sum += np.sum(values_no_nan_as_double)
        self.sum_of_squares += np.sum(values_no_nan_as_double *
                                      values_no_nan_as_double)
        # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
        # to numpy 1.16
        curr_min = np.min(values_no_nan)
        curr_max = np.max(values_no_nan)
        self.min = min(self.min, curr_min)
        self.max = max(self.max, curr_max)
        if curr_min == float('-inf') or curr_max == float('inf'):
            finite_values = values_no_nan[np.isfinite(values_no_nan)]
            if finite_values.size > 0:
                self.finite_min = min(self.finite_min, np.min(finite_values))
                self.finite_max = max(self.finite_max, np.max(finite_values))

        self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
        self.quantiles_summary = values_quantiles_combiner.add_input(
            self.quantiles_summary,
            [values_no_nan, np.ones_like(values_no_nan)])
        if weights is not None:
            value_parent_indices = np.asarray(
                array_util.GetFlattenedArrayParentIndices(feature_array))
            flat_weights = weights[value_parent_indices]
            flat_weights_no_nan = flat_weights[non_nan_mask]
            weighted_values = flat_weights_no_nan * values_no_nan
            self.weighted_sum += np.sum(weighted_values)
            self.weighted_sum_of_squares += np.sum(weighted_values *
                                                   values_no_nan)
            self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
                self.weighted_quantiles_summary,
                [values_no_nan, flat_weights_no_nan])
            self.weighted_total_num_values += np.sum(flat_weights_no_nan)
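
The NaN masking and float64 accumulation above can be sketched in isolation like this (values are illustrative):

import numpy as np

values = np.array([1.0, np.nan, 3.0], dtype=np.float32)
nan_mask = np.isnan(values)
values_no_nan = values[~nan_mask]
# Accumulating in float64 avoids overflow / precision loss for large
# int or float32 inputs.
values_as_double = values_no_nan.astype(np.float64)
total = np.sum(values_as_double)                              # 4.0
sum_of_squares = np.sum(values_as_double * values_as_double)  # 10.0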
Example #6
    def add_input(self, accumulator: _PartialImageStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialImageStats:
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidate:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        # If we see a different type, invalidate.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        # Consider using memoryview to avoid copying after upgrading to
        # arrow 0.12. Note that this would involve modifying the subsequent logic
        # to iterate over the values in a loop.
        values = np.asarray(feature_array.flatten())
        accumulator.total_num_values += values.size
        image_formats = self._image_decoder.get_formats(values)
        valid_mask = ~pd.isnull(image_formats)
        valid_formats = image_formats[valid_mask]
        format_counts = np.unique(valid_formats, return_counts=True)
        for (image_format, count) in zip(*format_counts):
            accumulator.counter_by_format[image_format] += count
        unknown_count = image_formats.size - valid_formats.size
        if unknown_count > 0:
            accumulator.counter_by_format[''] += unknown_count

        if self._enable_size_stats:
            # Get image height and width.
            image_sizes = self._image_decoder.get_sizes(values[valid_mask])
            if image_sizes.any():
                max_sizes = np.max(image_sizes, axis=0)
                # Update the max image height/width with all image values.
                accumulator.max_height = max(accumulator.max_height,
                                             max_sizes[0])
                accumulator.max_width = max(accumulator.max_width,
                                            max_sizes[1])

        return accumulator
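
The per-format counting reduces to `np.unique` with `return_counts=True`; a minimal sketch with made-up format strings:

import numpy as np

valid_formats = np.array(["png", "jpeg", "png", "png"])
uniques, counts = np.unique(valid_formats, return_counts=True)
# uniques -> ['jpeg', 'png'], counts -> [1, 3]
for image_format, count in zip(uniques, counts):
    print(image_format, count)  # Folded into counter_by_format above.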
Example #7
  def add_input(self, accumulator: _PartialTimeStats,
                feature_path: types.FeaturePath,
                feature_array: pa.Array) -> _PartialTimeStats:
    """Returns result of folding a batch of inputs into the current accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
    if accumulator.invalidated:
      return accumulator
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    # Ignore null array.
    if feature_type is None:
      return accumulator
    if feature_type == statistics_pb2.FeatureNameStatistics.STRING:

      def _maybe_get_utf8(val):
        return stats_util.maybe_get_utf8(val) if isinstance(val, bytes) else val

      values = np.asarray(feature_array.flatten())
      maybe_utf8 = np.vectorize(_maybe_get_utf8, otypes=[object])(values)
      if not maybe_utf8.all():
        accumulator.invalidated = True
        return accumulator
      accumulator.update(maybe_utf8, feature_type)
    elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
      values = np.asarray(feature_array.flatten())
      accumulator.update(values, feature_type)
    else:
      accumulator.invalidated = True

    return accumulator
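
A sketch of the vectorized UTF-8 check; `_check` below is a hypothetical stand-in for `stats_util.maybe_get_utf8`, which returns None for undecodable bytes:

import numpy as np

def _check(val):  # Hypothetical stand-in for stats_util.maybe_get_utf8.
    if not isinstance(val, bytes):
        return val
    try:
        return val.decode("utf-8")
    except UnicodeDecodeError:
        return None

values = np.array([b"2021-01-01", b"\xff\xfe"], dtype=object)
maybe_utf8 = np.vectorize(_check, otypes=[object])(values)
# maybe_utf8.all() is False here (the second value maps to None), so the
# accumulator would be invalidated, as in the code above.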
Example #8
 def update(self, feature_array: pa.Array) -> None:
   """Update the partial bytes statistics using the input value."""
   if pa.types.is_null(feature_array.type):
     return
   # Iterate through the value array and update the partial stats.
   flattened_values_array = feature_array.flatten()
   if (pa.types.is_floating(flattened_values_array.type) or
       pa.types.is_integer(flattened_values_array.type)):
     raise ValueError('Bytes stats cannot be computed on INT/FLOAT features.')
   if flattened_values_array:
     num_bytes = array_util.GetElementLengths(
         flattened_values_array).to_numpy()
     self.min_num_bytes = min(self.min_num_bytes, np.min(num_bytes))
     self.max_num_bytes = max(self.max_num_bytes, np.max(num_bytes))
     self.total_num_bytes += np.sum(num_bytes)
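
`array_util.GetElementLengths` is part of tfx_bsl; in plain pyarrow, the same per-element byte lengths can be obtained with `pyarrow.compute.binary_length` (a sketch):

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

flattened_values_array = pa.array([b"ab", b"cdef"])
num_bytes = pc.binary_length(flattened_values_array).to_numpy()
print(np.min(num_bytes), np.max(num_bytes), np.sum(num_bytes))  # 2 4 6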
Example #9
  def update(self,
             feature_path: types.FeaturePath,
             feature_array: pa.Array,
             feature_type: types.FeatureNameStatisticsType,
             make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
             weights: Optional[np.ndarray] = None) -> None:
    """Update the partial common statistics using the input value."""
    if self.type is None:
      self.type = feature_type  # pytype: disable=annotation-type-mismatch
    elif feature_type is not None and self.type != feature_type:
      raise TypeError('Cannot determine the type of feature %s. '
                      'Found values of types %s and %s.' %
                      (feature_path, self.type, feature_type))

    nest_level = arrow_util.get_nest_level(feature_array.type)
    if self.presence_and_valency_stats is None:
      self.presence_and_valency_stats = [
          _PresenceAndValencyStats(make_quantiles_sketch_fn)
          for _ in range(nest_level)
      ]
    elif nest_level != len(self.presence_and_valency_stats):
      raise ValueError('Inconsistent nestedness in feature {}: {} vs {}'.format(
          feature_path, nest_level, len(self.presence_and_valency_stats)))

    # There's nothing we can collect from an empty array.
    if not feature_array:
      return

    level = 0
    while arrow_util.is_list_like(feature_array.type):
      presence_mask = ~np.asarray(
          array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(bool)
      num_values = np.asarray(
          array_util.ListLengthsFromListArray(feature_array))
      num_values_not_none = num_values[presence_mask]
      self.presence_and_valency_stats[level].update(feature_array,
                                                    presence_mask, num_values,
                                                    num_values_not_none,
                                                    weights)
      flattened = feature_array.flatten()
      if weights is not None:
        parent_indices = array_util.GetFlattenedArrayParentIndices(
            feature_array).to_numpy()
        weights = weights[parent_indices]
      feature_array = flattened
      level += 1
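
One level of that walk can be sketched with plain pyarrow (note `pc.list_value_length` returns null for null lists, hence the non-zero-copy conversion, which yields floats):

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

feature_array = pa.array([[1, 2], None, [3]])
presence_mask = np.asarray(feature_array.is_valid())  # [True, False, True]
num_values = pc.list_value_length(feature_array).to_numpy(zero_copy_only=False)
num_values_not_none = num_values[presence_mask]       # [2., 1.]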
Example #10
 def update(self, feature_array: pa.Array) -> None:
   """Update the partial string statistics using the input value."""
   if pa.types.is_null(feature_array.type):
     return
   # Iterate through the value array and update the partial stats.
   flattened_values_array = feature_array.flatten()
   if arrow_util.is_binary_like(flattened_values_array.type):
     # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
     # with Python 3). To make sure we do cheaper integer arithmetic in
     # Python 2, we first convert it to int.
     self.total_bytes_length += int(array_util.GetBinaryArrayTotalByteSize(
         flattened_values_array))
   elif flattened_values_array:
     # We can only do flattened_values_array.to_numpy() when it's not empty.
     # This could be computed faster by taking log10 of the integer.
     def _len_after_conv(s):
       return len(str(s))
     self.total_bytes_length += np.sum(
         np.vectorize(_len_after_conv,
                      otypes=[np.int32])(np.asarray(flattened_values_array)))
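
For the numeric fallback, `_len_after_conv` measures the length of each value's decimal string representation; a tiny sketch:

import numpy as np

values = np.array([7, 42, 1234])
lengths = np.vectorize(lambda v: len(str(v)), otypes=[np.int32])(values)
# lengths -> [1, 2, 4]; total_bytes_length increases by 7.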
Example #11
    def add_input(self, accumulator: _PartialNLStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialNLStats:
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidate:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        # If we see a different type, invalidate.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        def _is_non_utf8(value):
            return (isinstance(value, bytes)
                    and stats_util.maybe_get_utf8(value) is None)

        is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[bool])
        classify_vec = np.vectorize(self._classifier.classify,
                                    otypes=[bool])
        values = np.asarray(feature_array.flatten().slice(0, _CROP_AT_VALUES))
        if np.any(is_non_utf_vec(values)):
            accumulator.invalidate = True
            return accumulator
        accumulator.considered += values.size
        accumulator.matched += np.sum(classify_vec(values))
        return accumulator
Example #12
    def _ListArrayToTensor(
            self, list_array: pa.Array,
            produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
        """Converts a ListArray to a dense tensor."""
        values = list_array.flatten()
        batch_size = len(list_array)
        expected_num_elements = batch_size * self._unbatched_flat_len
        if len(values) != expected_num_elements:
            raise ValueError(
                "Unable to convert ListArray {} to {}: size mismatch. expected {} "
                "elements but got {}".format(list_array, self.type_spec,
                                             expected_num_elements,
                                             len(values)))
        actual_shape = list(self._shape)
        actual_shape[0] = batch_size
        if self._convert_to_binary_fn is not None:
            values = self._convert_to_binary_fn(values)
        values_np = np.asarray(values).reshape(actual_shape)
        if produce_eager_tensors:
            return tf.convert_to_tensor(values_np)

        return values_np
Example #13
  def _ListArrayToTensor(
      self, list_array: pa.Array,
      produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
    """Converts a ListArray to a dense tensor."""
    values = list_array.flatten()
    batch_size = len(list_array)
    expected_num_elements = batch_size * self._unbatched_flat_len
    if len(values) != expected_num_elements:
      raise ValueError(
          "Unable to convert ListArray {} to {}: size mismatch. expected {} "
          "elements but got {}".format(
              list_array, self.type_spec, expected_num_elements, len(values)))
    # TODO(zhuo): Cast StringArrays to BinaryArrays before calling np.asarray()
    # to avoid generating unicode objects which are wasteful to feed to
    # TensorFlow, once pyarrow requirement is bumped to >=0.15.
    actual_shape = list(self._shape)
    actual_shape[0] = batch_size
    values_np = np.asarray(values).reshape(actual_shape)
    if produce_eager_tensors:
      return tf.convert_to_tensor(values_np)

    return values_np