Example #1
 def test_value_counts_binary(self):
     binary_array = pa.array(
         [b"abc", b"ghi", b"def", b"ghi", b"ghi", b"def"])
     expected_result = {b"abc": 1, b"ghi": 3, b"def": 2}
     self.assertDictEqual(
         self._value_counts_struct_array_to_dict(
             array_util.ValueCounts(binary_array)), expected_result)
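The test relies on a _value_counts_struct_array_to_dict helper that is not
shown on this page. A minimal sketch, assuming (as the examples below
confirm) that ValueCounts returns a StructArray with 'values' and 'counts'
child fields:

def _value_counts_struct_array_to_dict(value_counts):
  """Sketch: converts a <values, counts> StructArray to a Python dict."""
  return dict(
      zip(value_counts.field('values').to_pylist(),
          value_counts.field('counts').to_pylist()))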
Example #2
def _LargeBinaryCanBeValueCounted() -> bool:
  """Returns True if a large binary array can be value counted."""
  try:
    array_util.ValueCounts(pa.array([], type=pa.large_binary()))
  except:  # pylint:disable=bare-except
    return False
  return True
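A hypothetical sketch of how such a capability check might gate a fallback
cast (neither _LARGE_BINARY_SUPPORTED nor _maybe_downcast is part of the
library):

_LARGE_BINARY_SUPPORTED = _LargeBinaryCanBeValueCounted()

def _maybe_downcast(array: pa.Array) -> pa.Array:
  """Hypothetical fallback: cast large_binary to binary when unsupported."""
  if not _LARGE_BINARY_SUPPORTED and pa.types.is_large_binary(array.type):
    return array.cast(pa.binary())
  return array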
Example #3
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _ValueCounts],
        input_record_batch: pa.RecordBatch
    ) -> Dict[types.FeaturePath, _ValueCounts]:
        for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
                input_record_batch,
                weight_column=self._weight_feature,
                enumerate_leaves_only=True):
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, leaf_array.type)
            if feature_type is None:
                continue
            # If it's not a categorical feature or a string feature, we don't
            # bother with top-k stats.
            if (feature_path in self._categorical_features or
                    feature_type == statistics_pb2.FeatureNameStatistics.STRING):
                flattened_values = leaf_array.flatten()
                unweighted_counts = collections.Counter()
                # Compute unweighted counts.
                value_counts = array_util.ValueCounts(flattened_values)
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in six.moves.zip(values, counts):
                    unweighted_counts[value] = count

                # Compute weighted counts if a weight feature is specified.
                weighted_counts = _WeightedCounter()
                if weights is not None:
                    flattened_values_np = np.asarray(flattened_values)
                    parent_indices = array_util.GetFlattenedArrayParentIndices(
                        leaf_array)
                    weighted_counts.weighted_update(
                        flattened_values_np,
                        weights[np.asarray(parent_indices)])

                if feature_path not in accumulator:
                    accumulator[feature_path] = _ValueCounts(
                        unweighted_counts=unweighted_counts,
                        weighted_counts=weighted_counts)
                else:
                    accumulator[feature_path].unweighted_counts.update(
                        unweighted_counts)
                    accumulator[feature_path].weighted_counts.update(
                        weighted_counts)

        return accumulator
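_WeightedCounter is not defined on this page. A minimal sketch, assuming it
behaves like a collections.Counter whose weighted_update adds each value's
weight instead of 1 (which matches how it is merged via update() above):

class _WeightedCounter(collections.Counter):
  """Hypothetical sketch of the weighted counter used in add_input."""

  def weighted_update(self, values, weights):
    # `values` and `weights` are parallel arrays; accumulate weight per value.
    for value, weight in zip(values, weights):
      self[value] += weight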
Example #4
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    example_weight_map: ExampleWeightMap,
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any], Union[
    int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from the input."""
  slice_key, record_batch = sliced_record_batch

  has_any_weight = bool(example_weight_map.all_weight_features())
  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      record_batch,
      example_weight_map=example_weight_map,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array_type)
    if feature_path in bytes_features:
      continue
    if ((feature_type == statistics_pb2.FeatureNameStatistics.INT and
         feature_path in categorical_features) or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values, parent_indices = arrow_util.flatten_nested(
          feature_array, weights is not None)
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(
            flattened_values_np, weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = array_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        if has_any_weight:
          for value, count in zip(values, counts):
            yield ((slice_key, feature_path.steps(), value), (count, 1))
        else:
          for value, count in zip(values, counts):
            yield ((slice_key, feature_path.steps(), value), count)
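_weighted_unique is likewise not shown here. A sketch under the assumption
that it yields one (value, count, total_weight) triple per distinct value,
given parallel 1-D arrays of values and weights:

def _weighted_unique(values: np.ndarray, weights: np.ndarray):
  """Hypothetical sketch: per-value occurrence counts and weight sums."""
  order = np.argsort(values, kind='stable')
  sorted_values, sorted_weights = values[order], weights[order]
  unique_values, starts, counts = np.unique(
      sorted_values, return_index=True, return_counts=True)
  weight_sums = np.add.reduceat(sorted_weights, starts)
  return zip(unique_values.tolist(), counts.tolist(), weight_sums.tolist())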
Example #5
def _to_topk_tuples(
    sliced_table: Tuple[types.SliceKey, pa.Table],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[
    Tuple[Tuple[types.SliceKey, FeaturePathTuple, Any],
          Union[int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from input tables."""
  slice_key, table = sliced_table

  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      table,
      weight_column=weight_feature,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    if pa.types.is_null(feature_array_type):
      continue
    if feature_path in bytes_features:
      continue
    if (feature_path in categorical_features or
        stats_util.get_feature_type_from_arrow_type(
            feature_path,
            feature_array_type) == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values = feature_array.flatten()
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        parent_indices = (
            np.asarray(
                array_util.GetFlattenedArrayParentIndices(feature_array)))
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(
            flattened_values_np, weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = array_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
          yield ((slice_key, feature_path.steps(), value), count)
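In both versions the unweighted fast path boils down to unpacking the
StructArray that ValueCounts returns. A standalone sketch (the tfx_bsl
import path is an assumption):

import pyarrow as pa
from tfx_bsl.arrow import array_util  # assumed import path

vc = array_util.ValueCounts(pa.array(['a', 'b', 'a']))
print(dict(zip(vc.field('values').to_pylist(),
               vc.field('counts').to_pylist())))
# -> {'a': 2, 'b': 1}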
Example #6
 def test_value_counts_empty(self):
   empty_array = pa.array([])
   expected_result = {}
   self.assertDictEqual(self._value_counts_struct_array_to_dict(
       array_util.ValueCounts(empty_array)), expected_result)
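Note that pa.array([]) carries the null type, which is why Example #5 skips
null-typed arrays before calling ValueCounts:

empty = pa.array([])
assert pa.types.is_null(empty.type)  # ValueCounts on it yields no entries.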
Example #7
 def test_value_counts_integer(self):
   int_array = pa.array([1, 4, 1, 3, 1, 4])
   expected_result = {1: 3, 4: 2, 3: 1}
   self.assertDictEqual(self._value_counts_struct_array_to_dict(
       array_util.ValueCounts(int_array)), expected_result)
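As a cross-check, numpy's unique with return_counts agrees with ValueCounts
on this integer example (ordering aside):

import numpy as np

values, counts = np.unique(np.array([1, 4, 1, 3, 1, 4]), return_counts=True)
print(dict(zip(values.tolist(), counts.tolist())))
# -> {1: 3, 3: 1, 4: 2}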