Example no. 1
def _to_topk_tuples(
    sliced_table: Tuple[Text, pa.Table],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[Text, types.FeaturePathTuple, Any], Union[int, Tuple[
        int, Union[int, float]]]]]:
    """Generates tuples for computing top-k and uniques from input tables."""
    slice_key, table = sliced_table

    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
            table, weight_column=weight_feature, enumerate_leaves_only=True):
        feature_array_type = feature_array.type
        if (feature_path in categorical_features
                or stats_util.get_feature_type_from_arrow_type(
                    feature_path, feature_array_type)
                == statistics_pb2.FeatureNameStatistics.STRING):
            flattened_values = feature_array.flatten()
            if weights is not None and flattened_values:
                # Slow path: weighted uniques.
                flattened_values_np = arrow_util.primitive_array_to_numpy(
                    flattened_values)
                parent_indices = (arrow_util.primitive_array_to_numpy(
                    arrow_util.GetFlattenedArrayParentIndices(feature_array)))
                weights_ndarray = weights[parent_indices]
                for value, count, weight in _weighted_unique(
                        flattened_values_np, weights_ndarray):
                    yield (slice_key, feature_path.steps(), value), (count,
                                                                     weight)
            else:
                value_counts = arrow_util.ValueCounts(flattened_values)
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in six.moves.zip(values, counts):
                    yield ((slice_key, feature_path.steps(), value), count)
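
For reference, the unweighted counting branch above can be reproduced with pyarrow alone. A minimal sketch, assuming a hand-built list array in place of the enumerated feature_array (the values are invented for illustration):

import pyarrow as pa

# A list-typed feature column, as enumerate_arrays would yield it.
feature_array = pa.array([["a", "b"], ["a"], None])

# flatten() drops the list nesting; null lists contribute no values.
flattened_values = feature_array.flatten()  # "a", "b", "a"

# Arrow returns a StructArray with "values" and "counts" fields, matching the
# field() accesses on arrow_util.ValueCounts in the snippet above.
value_counts = flattened_values.value_counts()
values = value_counts.field('values').to_pylist()
counts = value_counts.field('counts').to_pylist()
print(sorted(zip(values, counts)))  # [('a', 2), ('b', 1)]
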
    def add_input(
        self, accumulator: Dict[types.FeaturePath,
                                _PartialBasicStats], examples_table: pa.Table
    ) -> Dict[types.FeaturePath, _PartialBasicStats]:
        for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
                examples_table,
                weight_column=self._weight_feature,
                enumerate_leaves_only=False):
            stats_for_feature = accumulator.get(feature_path)
            if stats_for_feature is None:
                stats_for_feature = _PartialBasicStats(
                    self._weight_feature is not None)
                # Store empty summary.
                stats_for_feature.common_stats.num_values_summary = (
                    self._num_values_quantiles_combiner.create_accumulator())
                stats_for_feature.numeric_stats.quantiles_summary = (
                    self._values_quantiles_combiner.create_accumulator())
                accumulator[feature_path] = stats_for_feature

            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, feature_array.type)
            stats_for_feature.common_stats.update(
                feature_path, feature_array, feature_type,
                self._num_values_quantiles_combiner, weights)
            is_categorical_feature = feature_path in self._categorical_features
            if (is_categorical_feature or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                stats_for_feature.string_stats.update(feature_array)
            elif feature_type in (statistics_pb2.FeatureNameStatistics.INT,
                                  statistics_pb2.FeatureNameStatistics.FLOAT):
                stats_for_feature.numeric_stats.update(
                    feature_array, self._values_quantiles_combiner, weights)

        return accumulator
    def testEnumerateArrays(self):
        for leaves_only, has_weights in itertools.combinations_with_replacement(
            [True, False], 2):
            actual_results = {}
            for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
                    _INPUT_TABLE, "w" if has_weights else None, leaves_only):
                actual_results[feature_path] = (feature_array, weights)

            expected_results = {}
            for p in [["f1"], ["w"], ["f2", "sf1"], ["f2", "sf2", "ssf1"]]:
                feature_path = types.FeaturePath(p)
                expected_results[feature_path] = (
                    _FEATURES_TO_ARRAYS[feature_path][0],
                    _FEATURES_TO_ARRAYS[feature_path][1]
                    if has_weights else None)
            if not leaves_only:
                for p in [["f2"], ["f2", "sf2"]]:
                    feature_path = types.FeaturePath(p)
                    expected_results[feature_path] = (
                        _FEATURES_TO_ARRAYS[feature_path][0],
                        _FEATURES_TO_ARRAYS[feature_path][1]
                        if has_weights else None)

            self.assertLen(actual_results, len(expected_results))
            for k, v in six.iteritems(expected_results):
                self.assertIn(k, actual_results)
                actual = actual_results[k]
                self.assertTrue(
                    actual[0].equals(v[0]), "leaves_only={}; has_weights={}; "
                    "feature={}; expected: {}; actual: {}".format(
                        leaves_only, has_weights, k, v, actual))
                np.testing.assert_array_equal(actual[1], v[1])
Example no. 4
  def add_input(self, wrapper_accumulator: WrapperAccumulator,
                input_record_batch: pa.RecordBatch) -> WrapperAccumulator:
    """Returns result of folding a batch of inputs into wrapper_accumulator.

    Args:
      wrapper_accumulator: The current wrapper accumulator.
      input_record_batch: An Arrow RecordBatch representing a batch of examples,
        which should be added to the accumulator.

    Returns:
      The wrapper_accumulator after updating the statistics for the batch of
      inputs.
    """
    if self._sample_rate is not None and random.random() <= self._sample_rate:
      return wrapper_accumulator

    for feature_path, feature_array, _ in arrow_util.enumerate_arrays(
        input_record_batch,
        weight_column=self._weight_feature,
        enumerate_leaves_only=True):
      for index, generator in enumerate(self._feature_stats_generators):
        self._perhaps_initialize_for_feature_path(wrapper_accumulator,
                                                  feature_path)
        wrapper_accumulator[feature_path][index] = generator.add_input(
            generator.create_accumulator(), feature_path, feature_array)

    return wrapper_accumulator
Example no. 5
 def testInvalidWeightColumnStringValues(self):
     with self.assertRaisesRegex(
             ValueError, 'Weight feature "w" must be of numeric type.*'):
         for _ in arrow_util.enumerate_arrays(pa.Table.from_arrays(
             [pa.array([[1], [2, 3]]),
              pa.array([["two"], ["two"]])], ["v", "w"]),
                                              weight_column="w",
                                              enumerate_leaves_only=False):
             pass
 def testInvalidWeightColumn(self):
     with self.assertRaisesRegex(
             ValueError,
             "weight feature must have exactly one value in each example"):
         for _ in arrow_util.enumerate_arrays(pa.Table.from_arrays(
             [pa.array([[1], [2, 3]]),
              pa.array([[1], []])], ["v", "w"]),
                                              weight_column="w",
                                              enumerate_leaves_only=False):
             pass
Example no. 7
 def testEnumerateArraysStringWeight(self):
   # The arrow type of a string changes between py2 and py3 so we accept either
   with self.assertRaisesRegex(
       ValueError,
       r'Weight column "w" must be of numeric type. Found (string|binary).*'):
     for _ in arrow_util.enumerate_arrays(
         pa.RecordBatch.from_arrays(
             [pa.array([[1], [2, 3]]),
              pa.array([["a"], ["b"]])], ["v", "w"]),
         weight_column="w",
         enumerate_leaves_only=True):
       pass
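
Together with Example no. 5, these tests pin down the weight-column contract: the weight column must be numeric and carry exactly one value per example. Below is a minimal sketch of an equivalent check written directly against pyarrow; _check_weight_column and its error messages are illustrative and are not TFDV's actual implementation.

import pyarrow as pa


def _check_weight_column(record_batch: pa.RecordBatch, weight_column: str) -> None:
  """Illustrative check: the weight column is numeric with one value per example."""
  column = record_batch.column(record_batch.schema.get_field_index(weight_column))
  value_type = column.type.value_type  # leaf type of the list<...> column
  if not (pa.types.is_integer(value_type) or pa.types.is_floating(value_type)):
    raise ValueError('Weight column "{}" must be of numeric type. Found {}.'
                     .format(weight_column, value_type))
  if any(values is None or len(values) != 1 for values in column.to_pylist()):
    raise ValueError(
        "weight feature must have exactly one value in each example")


batch = pa.RecordBatch.from_arrays(
    [pa.array([[1], [2, 3]]), pa.array([[1.0], [2.0]])], ["v", "w"])
_check_weight_column(batch, "w")  # passes; a string-typed or empty "w" would raise
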
Example no. 8
 def add_input(
     self, accumulator: Dict[tfdv_types.FeaturePath, _CombinedSketch],
     input_record_batch: pa.RecordBatch
 ) -> Dict[tfdv_types.FeaturePath, _CombinedSketch]:
     for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
             input_record_batch,
             example_weight_map=self._example_weight_map,
             enumerate_leaves_only=True):
         feature_type = stats_util.get_feature_type_from_arrow_type(
             feature_path, leaf_array.type)
         if self._should_run(feature_path, feature_type):
             self._update_combined_sketch_for_feature(
                 feature_path, leaf_array, weights, accumulator)
     return accumulator
 def testEnumerateArraysWithColumnSelectFn(self, col_fn, expected_features):
     actual = list(
         arrow_util.enumerate_arrays(_INPUT_RECORD_BATCH,
                                     _EXAMPLE_WEIGHT_MAP,
                                     True,
                                     column_select_fn=col_fn))
     expected = list(
         (f, _FEATURES_TO_ARRAYS[f].array, _FEATURES_TO_ARRAYS[f].weights)
         for f in expected_features)
     for (actual_path, actual_col,
          actual_w), (expected_path, expected_col,
                      expected_w) in zip(actual, expected):
         self.assertEqual(expected_path, actual_path)
         self.assertEqual(expected_col, actual_col)
         self.assertEqual(pa.array(expected_w), pa.array(actual_w))
 def testEnumerateMissingPropagatedInFlattenedStruct(
         self, batch, expected_results):
     actual_results = {}
     for feature_path, feature_array, _ in arrow_util.enumerate_arrays(
             batch, example_weight_map=None, enumerate_leaves_only=False):
         actual_results[feature_path] = feature_array
     self.assertLen(actual_results, len(expected_results))
     for k, v in six.iteritems(expected_results):
         assert k in actual_results, (k, list(actual_results.keys()))
         self.assertIn(k, actual_results)
         actual = _Normalize(actual_results[k])
         v = _Normalize(v)
         self.assertTrue(
             actual.equals(v),
             "feature={}; expected: {}; actual: {}; diff: {}".format(
                 k, v, actual, actual.diff(v)))
Example no. 11
 def add_input(
     self, accumulator: Dict[tfdv_types.FeaturePath, _CombinedSketch],
     input_record_batch: pa.RecordBatch
     ) -> Dict[tfdv_types.FeaturePath, _CombinedSketch]:
   for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
       input_record_batch,
       example_weight_map=self._example_weight_map,
       enumerate_leaves_only=True):
     feature_type = stats_util.get_feature_type_from_arrow_type(
         feature_path, leaf_array.type)
     # Only compute top-k and unique stats for categorical and string features.
     if ((feature_type == statistics_pb2.FeatureNameStatistics.INT and
          feature_path in self._categorical_features) or
         feature_type == statistics_pb2.FeatureNameStatistics.STRING):
       self._update_combined_sketch_for_feature(
           feature_path, leaf_array, weights, accumulator)
   return accumulator
Example no. 12
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _ValueCounts],
        input_record_batch: pa.RecordBatch
    ) -> Dict[types.FeaturePath, _ValueCounts]:
        for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
                input_record_batch,
                weight_column=self._weight_feature,
                enumerate_leaves_only=True):
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, leaf_array.type)
            if feature_type is None:
                continue
            # If it's not a categorical feature or a string feature, we don't
            # bother with top-k stats.
            if (feature_path in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                flattened_values = leaf_array.flatten()
                unweighted_counts = collections.Counter()
                # Compute unweighted counts.
                value_counts = array_util.ValueCounts(flattened_values)
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in six.moves.zip(values, counts):
                    unweighted_counts[value] = count

                # Compute weighted counts if a weight feature is specified.
                weighted_counts = _WeightedCounter()
                if weights is not None:
                    flattened_values_np = np.asarray(flattened_values)
                    parent_indices = array_util.GetFlattenedArrayParentIndices(
                        leaf_array)
                    weighted_counts.weighted_update(
                        flattened_values_np,
                        weights[np.asarray(parent_indices)])

                if feature_path not in accumulator:
                    accumulator[feature_path] = _ValueCounts(
                        unweighted_counts=unweighted_counts,
                        weighted_counts=weighted_counts)
                else:
                    accumulator[feature_path].unweighted_counts.update(
                        unweighted_counts)
                    accumulator[feature_path].weighted_counts.update(
                        weighted_counts)

        return accumulator
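
A minimal sketch of the weighted branch above, with a plain dict standing in for the internal _WeightedCounter and a numpy computation standing in for array_util.GetFlattenedArrayParentIndices (names and values are illustrative):

import collections

import numpy as np
import pyarrow as pa

leaf_array = pa.array([["a", "b"], ["a"]])  # values per example
weights = np.array([0.5, 2.0])              # one weight per example

flattened_values = leaf_array.flatten()     # "a", "b", "a"
# Parent index of each flattened value, i.e. which example it came from.
lengths = np.asarray([len(values) for values in leaf_array.to_pylist()])
parent_indices = np.repeat(np.arange(len(leaf_array)), lengths)

weighted_counts = collections.defaultdict(float)
for value, weight in zip(flattened_values.to_pylist(), weights[parent_indices]):
  weighted_counts[value] += weight
print(dict(weighted_counts))                # {'a': 2.5, 'b': 0.5}
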
    def testEnumerateArrays(self):
        for leaves_only, has_weights, wrap_flat_struct_in_list in (
                itertools.product([True, False], [True, False],
                                  [True, False])):
            actual_results = {}
            for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
                    _INPUT_RECORD_BATCH,
                    _EXAMPLE_WEIGHT_MAP if has_weights else None, leaves_only,
                    wrap_flat_struct_in_list):
                actual_results[feature_path] = (feature_array, weights)

            expected_results = {}
            # leaf fields
            for p in [["f1"], ["w"], ["w_override1"], ["w_override2"],
                      ["f2", "sf1"], ["f2", "sf2", "ssf1"], ["f3", "sf1"],
                      ["f3", "sf2"]]:
                feature_path = types.FeaturePath(p)
                expected_results[feature_path] = (
                    _FEATURES_TO_ARRAYS[feature_path].array,
                    _FEATURES_TO_ARRAYS[feature_path].weights
                    if has_weights else None)
            if not leaves_only:
                for p in [["f2"], ["f2", "sf2"], ["f3"]]:
                    feature_path = types.FeaturePath(p)
                    expected_array = _FEATURES_TO_ARRAYS[feature_path][0]
                    if wrap_flat_struct_in_list and pa.types.is_struct(
                            expected_array.type):
                        expected_array = array_util.ToSingletonListArray(
                            expected_array)
                    expected_results[feature_path] = (
                        expected_array,
                        _FEATURES_TO_ARRAYS[feature_path].weights
                        if has_weights else None)

            self.assertLen(actual_results, len(expected_results))
            for k, v in six.iteritems(expected_results):
                self.assertIn(k, actual_results)
                actual = actual_results[k]
                self.assertTrue(
                    actual[0].equals(v[0]), "leaves_only={}; has_weights={}; "
                    "wrap_flat_struct_in_list={} feature={}; expected: {}; actual: {}"
                    .format(leaves_only, has_weights, wrap_flat_struct_in_list,
                            k, v, actual))
                np.testing.assert_array_equal(actual[1], v[1])
Example no. 14
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _ValueCounts],
        input_record_batch: pa.RecordBatch
    ) -> Dict[types.FeaturePath, _ValueCounts]:
        for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
                input_record_batch,
                example_weight_map=self._example_weight_map,
                enumerate_leaves_only=True):
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, leaf_array.type)
            # If it's not a categorical int feature or a string feature, we
            # don't bother with top-k stats.
            if ((feature_type == statistics_pb2.FeatureNameStatistics.INT
                 and feature_path in self._categorical_features)
                    or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                flattened_values, parent_indices = arrow_util.flatten_nested(
                    leaf_array, weights is not None)
                unweighted_counts = collections.Counter()
                # Compute unweighted counts.
                value_counts = flattened_values.value_counts()
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in zip(values, counts):
                    unweighted_counts[value] = count

                # Compute weighted counts if a weight feature is specified.
                weighted_counts = _WeightedCounter()
                if weights is not None:
                    flattened_values_np = np.asarray(flattened_values)
                    weighted_counts.weighted_update(flattened_values_np,
                                                    weights[parent_indices])

                if feature_path not in accumulator:
                    accumulator[feature_path] = _ValueCounts(
                        unweighted_counts=unweighted_counts,
                        weighted_counts=weighted_counts)
                else:
                    accumulator[feature_path].unweighted_counts.update(
                        unweighted_counts)
                    accumulator[feature_path].weighted_counts.update(
                        weighted_counts)

        return accumulator
Example no. 15
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    example_weight_map: ExampleWeightMap,
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any], Union[
        int, Tuple[int, Union[int, float]]]]]:
    """Generates tuples for computing top-k and uniques from the input."""
    slice_key, record_batch = sliced_record_batch

    has_any_weight = bool(example_weight_map.all_weight_features())
    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
            record_batch,
            example_weight_map=example_weight_map,
            enumerate_leaves_only=True):
        feature_array_type = feature_array.type
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array_type)
        if feature_path in bytes_features:
            continue
        if ((feature_type == statistics_pb2.FeatureNameStatistics.INT
             and feature_path in categorical_features) or feature_type
                == statistics_pb2.FeatureNameStatistics.STRING):
            flattened_values, parent_indices = arrow_util.flatten_nested(
                feature_array, weights is not None)
            if weights is not None and flattened_values:
                # Slow path: weighted uniques.
                flattened_values_np = np.asarray(flattened_values)
                weights_ndarray = weights[parent_indices]
                for value, count, weight in _weighted_unique(
                        flattened_values_np, weights_ndarray):
                    yield (slice_key, feature_path.steps(), value), (count,
                                                                     weight)
            else:
                value_counts = flattened_values.value_counts()
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                if has_any_weight:
                    for value, count in zip(values, counts):
                        yield ((slice_key, feature_path.steps(), value),
                               (count, 1))
                else:
                    for value, count in zip(values, counts):
                        yield ((slice_key, feature_path.steps(), value), count)
  def add_input(
      self, accumulator: Dict[types.FeaturePath, _PartialBasicStats],
      examples: pa.RecordBatch
      ) -> Dict[types.FeaturePath, _PartialBasicStats]:
    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
        examples,
        example_weight_map=self._example_weight_map,
        enumerate_leaves_only=False):
      stats_for_feature = accumulator.get(feature_path)
      if stats_for_feature is None:
        stats_for_feature = _PartialBasicStats(
            weights is not None, self._make_quantiles_sketch_fn)
        accumulator[feature_path] = stats_for_feature

      feature_type = stats_util.get_feature_type_from_arrow_type(
          feature_path, feature_array.type)
      stats_for_feature.common_stats.update(feature_path,
                                            feature_array, feature_type,
                                            self._make_quantiles_sketch_fn,
                                            weights)
      # The user may make certain claims about a feature's data type
      # (e.g. _bytes_features imply string data type). However we should not
      # trust those claims because TFDV is also responsible for detecting
      # mismatching types. We collect stats according to the actual type, and
      # only when the actual type matches the claim do we collect the
      # type-specific stats (like for categorical int and bytes features).
      if feature_type == statistics_pb2.FeatureNameStatistics.STRING:
        if feature_path in self._bytes_features:
          stats_for_feature.bytes_stats.update(feature_array)
        else:
          stats_for_feature.string_stats.update(feature_array)
      elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
        if feature_path in self._categorical_features:
          stats_for_feature.string_stats.update(feature_array)
        else:
          stats_for_feature.numeric_stats.update(feature_array, weights)
      elif feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
        stats_for_feature.numeric_stats.update(feature_array, weights)

    return accumulator
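
The comment block in the method above is the key design point: stats are dispatched on the feature's actual Arrow type, and the schema's claims (categorical int, bytes) only select the type-specific stats once the actual type matches. A minimal sketch of such a classification in plain pyarrow; classify_leaf_type is illustrative and is not stats_util.get_feature_type_from_arrow_type itself.

import pyarrow as pa


def classify_leaf_type(arrow_type: pa.DataType) -> str:
  """Illustrative mapping from an Arrow leaf type to a stats feature type."""
  if pa.types.is_integer(arrow_type):
    return "INT"     # may still be treated as categorical by configuration
  if pa.types.is_floating(arrow_type):
    return "FLOAT"
  if pa.types.is_binary(arrow_type) or pa.types.is_string(arrow_type):
    return "STRING"  # bytes features also arrive as binary/string columns
  return "UNKNOWN"


leaf = pa.array([["a"], ["b", "c"]]).flatten()
print(classify_leaf_type(leaf.type))  # STRING
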
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any], Union[
    int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from the input."""
  slice_key, record_batch = sliced_record_batch

  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      record_batch,
      weight_column=weight_feature,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array_type)
    # Skip null columns.
    if feature_type is None:
      continue
    if feature_path in bytes_features:
      continue
    if (feature_path in categorical_features or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values, parent_indices = arrow_util.flatten_nested(
          feature_array, weights is not None)
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(
            flattened_values_np, weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = array_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
          yield ((slice_key, feature_path.steps(), value), count)
    def testEnumerate(self):
        input_table = pa.Table.from_arrays([
            pa.array([[1], [2, 3]]),
            pa.array([[{
                "sf1": [["a", "b"]]
            }], [{
                "sf2": [{
                    "ssf1": [[3], [4]]
                }]
            }]]),
            pa.array([[1.0], [2.0]])
        ], ["f1", "f2", "w"])
        possible_results = {
            types.FeaturePath(["f1"]): (pa.array([[1], [2, 3]]), [1.0, 2.0]),
            types.FeaturePath(["w"]): (pa.array([[1.0], [2.0]]), [1.0, 2.0]),
            types.FeaturePath(["f2"]): (pa.array([[{
                "sf1": [["a", "b"]]
            }], [{
                "sf2": [{
                    "ssf1": [[3], [4]]
                }]
            }]]), [1.0, 2.0]),
            types.FeaturePath(["f2", "sf1"]): (pa.array([[["a", "b"]],
                                                         None]), [1.0, 2.0]),
            types.FeaturePath(["f2", "sf2"]):
            (pa.array([None, [{
                "ssf1": [[3], [4]]
            }]]), [1.0, 2.0]),
            types.FeaturePath(["f2", "sf2", "ssf1"]):
            (pa.array([[[3], [4]]]), [2.0]),
        }
        for leaves_only, has_weights in itertools.combinations_with_replacement(
            [True, False], 2):
            actual_results = {}
            for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
                    input_table, "w" if has_weights else None, leaves_only):
                actual_results[feature_path] = (feature_array, weights)

            expected_results = {}
            for p in [["f1"], ["w"], ["f2", "sf1"], ["f2", "sf2", "ssf1"]]:
                feature_path = types.FeaturePath(p)
                expected_results[feature_path] = (
                    possible_results[feature_path][0],
                    possible_results[feature_path][1] if has_weights else None)
            if not leaves_only:
                for p in [["f2"], ["f2", "sf2"]]:
                    feature_path = types.FeaturePath(p)
                    expected_results[feature_path] = (
                        possible_results[feature_path][0],
                        possible_results[feature_path][1]
                        if has_weights else None)

            self.assertLen(actual_results, len(expected_results))
            for k, v in six.iteritems(expected_results):
                self.assertIn(k, actual_results)
                actual = actual_results[k]
                self.assertTrue(
                    actual[0].equals(v[0]), "leaves_only={}; has_weights={}; "
                    "feature={}; expected: {}; actual: {}".format(
                        leaves_only, has_weights, k, v, actual))
                np.testing.assert_array_equal(actual[1], v[1])
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats],
        input_table: pa.Table
    ) -> Dict[types.FeaturePath, _PartialSparseFeatureStats]:
        """Returns result of folding a batch of inputs into the current accumulator.

    Args:
      accumulator: The current accumulator.
      input_table: An Arrow Table whose columns are features and rows are
        examples.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        feature_value_list_lengths = dict()
        feature_is_missing = dict()
        batch_example_count = input_table.num_rows
        # Do a single pass through the input table to determine the value list
        # lengths and whether the feature is missing for every feature
        # that is an index or value feature in any sparse feature in the schema.
        for feature_path, leaf_array, _ in arrow_util.enumerate_arrays(
                input_table, weight_column=None, enumerate_leaves_only=True):
            if (feature_path in self._all_index_feature_paths
                    or feature_path in self._all_value_feature_paths):
                if pa.types.is_null(leaf_array.type):
                    # If the column is a NullArray, it is missing from the entire batch
                    # (missing features have value list lengths of 0).
                    feature_value_list_lengths[feature_path] = np.full(
                        batch_example_count, 0)
                    feature_is_missing[feature_path] = np.full(
                        batch_example_count, True)
                else:
                    feature_value_list_lengths[feature_path] = np.asarray(
                        array_util.ListLengthsFromListArray(leaf_array))
                    feature_is_missing[feature_path] = np.asarray(
                        array_util.GetArrayNullBitmapAsByteArray(leaf_array))

        # Now create a partial sparse feature stats object for each sparse feature
        # using the value list lengths and feature missing information collected
        # above.
        for feature_path in self._sparse_feature_component_paths:
            value_feature_path = self._sparse_feature_component_paths[
                feature_path].value_feature
            index_feature_paths = self._sparse_feature_component_paths[
                feature_path].index_features

            # Create a filter identifying examples in which the entire sparse feature
            # is missing since those examples should not be included in counting
            # missing counts or length differences.
            component_features_missing = np.array([
                feature_is_missing.get(path, np.full(batch_example_count,
                                                     True)) for path in
                itertools.chain([value_feature_path], index_feature_paths)
            ])
            entire_sparse_feature_missing = np.all(component_features_missing,
                                                   axis=0)
            num_examples_missing_sparse_feature = np.sum(
                entire_sparse_feature_missing)

            # If all examples in the batch are missing the sparse feature, do not
            # update the accumulator with the partial stats for that sparse feature.
            if num_examples_missing_sparse_feature == batch_example_count:
                continue

            is_missing_value_feature = feature_is_missing.get(
                value_feature_path)
            # If this batch does not have the value feature at all,
            # missing_value_count is the number of examples in the batch.
            # Also populate the value list lengths for the value feature with all 0s
            # since a missing feature is considered to have a value list length of 0.
            if is_missing_value_feature is None:
                missing_value_count = batch_example_count
                feature_value_list_lengths[value_feature_path] = np.full(
                    batch_example_count, 0)
            else:
                missing_value_count = np.sum(is_missing_value_feature)
            # Do not include examples that are entirely missing the sparse feature in
            # the missing value count.
            missing_value_count -= num_examples_missing_sparse_feature

            missing_index_counts = collections.Counter()
            min_length_diff = dict()
            max_length_diff = dict()
            for index_feature_path in index_feature_paths:
                is_missing_index_feature = feature_is_missing.get(
                    index_feature_path)
                if is_missing_index_feature is None:
                    # If this batch does not have this index feature at all,
                    # missing_index_count for that index feature is the number of
                    # examples in the batch.
                    missing_index_count = batch_example_count
                    # Populate the value list lengths for the index feature with all 0s
                    # since a missing feature is considered to have a value list length of
                    # 0.
                    feature_value_list_lengths[index_feature_path] = np.full(
                        batch_example_count, 0)
                else:
                    missing_index_count = np.sum(is_missing_index_feature)
                # Do not include examples that are entirely missing the sparse
                # feature in the missing index count.
                missing_index_counts[index_feature_path] = (
                    missing_index_count - num_examples_missing_sparse_feature)

                length_differences = np.subtract(
                    feature_value_list_lengths[index_feature_path],
                    feature_value_list_lengths[value_feature_path])

                # Do not include examples that are entirely missing the sparse feature
                # in determining the min and max length differences.
                filtered_length_differences = length_differences[
                    ~entire_sparse_feature_missing]
                # This generator should not get to this point if the current sparse
                # feature is missing from all examples in the batch (which would cause
                # filtered_length_differences to be empty).
                assert filtered_length_differences.size != 0
                min_length_diff[index_feature_path] = np.min(
                    filtered_length_differences)
                max_length_diff[index_feature_path] = np.max(
                    filtered_length_differences)

            stats_for_feature = _PartialSparseFeatureStats(
                missing_value_count, missing_index_counts, min_length_diff,
                max_length_diff)
            existing_stats_for_feature = accumulator.get(feature_path)
            if existing_stats_for_feature is None:
                accumulator[feature_path] = stats_for_feature
            else:
                accumulator[feature_path] += stats_for_feature
        return accumulator
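
A compact sketch of the single-pass bookkeeping above, with to_pylist-based helpers standing in for array_util.ListLengthsFromListArray and GetArrayNullBitmapAsByteArray; the index/value arrays and helper names are illustrative only.

import numpy as np
import pyarrow as pa

# One index column and one value column of a sparse feature; None marks an
# example in which that component feature is missing.
index_array = pa.array([[0, 3], None, [1]])
value_array = pa.array([[1.0, 2.0], None, [3.0]])


def list_lengths(arr: pa.Array) -> np.ndarray:
  # A missing (null) example counts as value list length 0, as in the generator.
  return np.asarray([0 if values is None else len(values) for values in arr.to_pylist()])


def is_missing(arr: pa.Array) -> np.ndarray:
  return np.asarray([values is None for values in arr.to_pylist()])


length_differences = list_lengths(index_array) - list_lengths(value_array)
entirely_missing = is_missing(index_array) & is_missing(value_array)
# Exclude examples in which the whole sparse feature is absent before min/max.
filtered = length_differences[~entirely_missing]
print(filtered.min(), filtered.max())  # 0 0
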
Example no. 20
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats],
        input_table: pa.Table
    ) -> Dict[types.FeaturePath, _PartialSparseFeatureStats]:
        """Returns result of folding a batch of inputs into the current accumulator.

    Args:
      accumulator: The current accumulator.
      input_table: An Arrow Table whose columns are features and rows are
        examples.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        component_feature_value_list_lengths = dict()
        component_feature_num_missing = dict()
        batch_example_count = input_table.num_rows
        # Do a single pass through the input table to determine the value list
        # lengths and number missing for every feature that is an index or value
        # feature in any sparse feature in the schema.
        for feature_path, leaf_array, _ in arrow_util.enumerate_arrays(
                input_table, weight_column=None, enumerate_leaves_only=True):
            if (feature_path in self._all_index_feature_paths
                    or feature_path in self._all_value_feature_paths):
                # If the column is a NullArray, skip it when populating the
                # component_feature_ dicts. Features that are missing from those dicts
                # are treated as entirely missing for the batch.
                if not pa.types.is_null(leaf_array.type):
                    component_feature_value_list_lengths[
                        feature_path] = arrow_util.primitive_array_to_numpy(
                            arrow_util.ListLengthsFromListArray(leaf_array))
                    component_feature_num_missing[
                        feature_path] = leaf_array.null_count

        # Now create a partial sparse feature stats object for each sparse feature
        # using the value list lengths and numbers missing information collected
        # above.
        for feature_path in self._sparse_feature_component_paths:
            value_feature_path = self._sparse_feature_component_paths[
                feature_path].value_feature
            index_feature_paths = self._sparse_feature_component_paths[
                feature_path].index_features
            missing_value_count = component_feature_num_missing.get(
                value_feature_path)
            # If this batch does not have the value feature at all,
            # missing_value_count is the number of examples in the batch.
            # Also populate the value list lengths for the value feature with all 0s
            # since a missing feature is considered to have a value list length of 0.
            if missing_value_count is None:
                missing_value_count = batch_example_count
                component_feature_value_list_lengths[
                    value_feature_path] = np.full(batch_example_count, 0)
            missing_index_counts = collections.Counter()
            min_length_diff = dict()
            max_length_diff = dict()
            for index_feature_path in index_feature_paths:
                missing_index_count = component_feature_num_missing.get(
                    index_feature_path)
                # If this batch does not have this index feature at all,
                # missing_index_count for that index feature is the number of
                # examples in the batch.
                # Also populate the value list lengths for the index feature with all 0s
                # since a missing feature is considered to have a value list length of
                # 0.
                if missing_index_count is None:
                    missing_index_counts[
                        index_feature_path] = batch_example_count
                    component_feature_value_list_lengths[
                        index_feature_path] = np.full(batch_example_count, 0)
                else:
                    missing_index_counts[
                        index_feature_path] = missing_index_count
                length_differences = np.subtract(
                    component_feature_value_list_lengths[index_feature_path],
                    component_feature_value_list_lengths[value_feature_path])
                min_length_diff[index_feature_path] = np.min(
                    length_differences)
                max_length_diff[index_feature_path] = np.max(
                    length_differences)

            stats_for_feature = _PartialSparseFeatureStats(
                missing_value_count, missing_index_counts, min_length_diff,
                max_length_diff)
            existing_stats_for_feature = accumulator.get(feature_path)
            if existing_stats_for_feature is None:
                accumulator[feature_path] = stats_for_feature
            else:
                accumulator[feature_path] += stats_for_feature
        return accumulator