Esempio n. 1
0
 def test_get_feature_type_get_string(self):
     """Both the 'S' and 'U' numpy dtypes must be reported as STRING."""
     for type_code in ('S', 'U'):
         detected_type = stats_util.get_feature_type(np.dtype(type_code))
         self.assertEqual(detected_type,
                          statistics_pb2.FeatureNameStatistics.STRING)
Esempio n. 2
0
 def test_get_feature_type_get_none(self):
     """A complex64 dtype is expected to yield no feature type (None)."""
     complex_dtype = np.dtype('complex64')
     self.assertIsNone(stats_util.get_feature_type(complex_dtype))
Esempio n. 3
0
    def add_input(self, accumulator, input_batch):
        """Fold one batch of examples into the per-feature numeric stats.

        Args:
            accumulator: Mapping from feature name to its partial numeric
                stats. Entries are created on demand and updated in place.
            input_batch: Mapping from feature name to a sequence holding one
                value per example. Only non-empty numpy arrays whose dtype
                maps to the INT or FLOAT feature type contribute.

        Returns:
            The `accumulator` that was passed in, after the update.
        """
        weight_feature = self._weight_feature
        if weight_feature:
            weights = stats_util.get_weight_feature(input_batch,
                                                    weight_feature)

        numeric_types = (statistics_pb2.FeatureNameStatistics.INT,
                         statistics_pb2.FeatureNameStatistics.FLOAT)

        for feature_name, values in six.iteritems(input_batch):
            # Neither the weight feature nor categorical features get
            # numeric statistics.
            if (feature_name == weight_feature or
                    feature_name in self._categorical_features):
                continue

            # Values and weights gathered from this batch. Collecting them
            # first means the quantiles combiner is invoked only once per
            # feature for the whole batch.
            batch_values_and_weights = [[], []]
            for example_index, value in enumerate(values):
                # Only numpy arrays holding at least one value are used.
                if not (isinstance(value, np.ndarray) and value.size != 0):
                    continue

                # Ignore arrays that are not of a numeric type.
                feature_type = get_feature_type(value.dtype)
                if feature_type not in numeric_types:
                    continue

                # First numeric value seen for this feature: start its
                # partial stats with an empty quantiles summary.
                if feature_name not in accumulator:
                    new_stats = _PartialNumericStats(
                        weight_feature is not None)
                    new_stats.quantiles_summary = (
                        self._quantiles_combiner.create_accumulator())
                    accumulator[feature_name] = new_stats

                example_weight = (
                    weights[example_index][0] if weight_feature else None)
                _update_numeric_stats(
                    accumulator[feature_name], value, feature_name,
                    feature_type, batch_values_and_weights, example_weight)

            batch_values = batch_values_and_weights[0]
            if not batch_values:
                # Nothing numeric was collected for this feature.
                continue

            feature_stats = accumulator[feature_name]
            # The unweighted summary is fed a weight of 1 for every value so
            # that the same weighted quantiles combiner serves both cases.
            feature_stats.quantiles_summary = (
                self._quantiles_combiner.add_input(
                    feature_stats.quantiles_summary,
                    [batch_values, [1] * len(batch_values)]))

            if weight_feature:
                feature_stats.weighted_quantiles_summary = (
                    self._quantiles_combiner.add_input(
                        feature_stats.weighted_quantiles_summary,
                        batch_values_and_weights))

        return accumulator
Esempio n. 4
0
  def add_input(self, accumulator, input_batch):
    """Updates the per-feature partial basic stats with one input batch.

    Args:
      accumulator: Mapping from feature name to partial basic stats. Missing
        entries are created here and all entries are updated in place.
      input_batch: Mapping from feature name to a sequence with one value per
        example, where each value is a numpy array or None (a missing value).

    Returns:
      The `accumulator` that was passed in, after the update.

    Raises:
      TypeError: If a value is neither a numpy array nor None, or if it is a
        numpy array whose dtype has no corresponding feature type.
    """
    weight_feature = self._weight_feature
    if weight_feature:
      # TODO(b/118489848): This method also validates the weight feature.
      # Consider moving these validation checks outside of the generators.
      weights = stats_util.get_weight_feature(input_batch, weight_feature)

    string_type = statistics_pb2.FeatureNameStatistics.STRING

    for feature_name, values in six.iteritems(input_batch):
      if feature_name == weight_feature:
        # The weight feature itself gets no statistics.
        continue

      # First time this feature is seen: start from empty summaries.
      if feature_name not in accumulator:
        new_stats = _PartialBasicStats(weight_feature is not None)
        new_stats.common_stats.num_values_summary = (
            self._num_values_quantiles_combiner.create_accumulator())
        new_stats.numeric_stats.quantiles_summary = (
            self._values_quantiles_combiner.create_accumulator())
        accumulator[feature_name] = new_stats

      feature_stats = accumulator[feature_name]
      categorical = feature_name in self._categorical_features

      # Number of values in each non-missing example; feeds the common stats.
      example_sizes = []

      # Numeric values and their weights collected from this batch, so that
      # the quantiles combiner is invoked only once per feature per batch.
      numeric_values_and_weights = [[], []]

      for example_index, value in enumerate(values):
        # TODO(b/79685042): Currently we infer the type for each example, which
        # is expensive. Consider doing the type inference only once per batch.
        if value is None:
          # Missing value: there is no type to infer.
          feature_type = None
        elif isinstance(value, np.ndarray):
          feature_type = get_feature_type(value.dtype)
          if feature_type is None:
            raise TypeError('Feature {} has value {} which is a numpy array '
                            'of type {}, should be int, float or str '
                            'types.'.format(feature_name, value,
                                            value.dtype.name))
        else:
          raise TypeError('Feature %s has value of type %s, '
                          'should be numpy.ndarray or None' %
                          (feature_name, type(value).__name__))

        example_weight = weights[example_index][0] if weight_feature else None

        # Common stats see every example, including missing ones.
        feature_stats.common_stats.update(
            value, feature_name, feature_type, example_weight)
        if value is None:
          continue

        # Empty arrays count towards the sizes but carry no values.
        example_sizes.append(value.size)
        if value.size == 0:
          continue

        if categorical:
          # Values of a categorical feature are converted to strings first.
          feature_stats.string_stats.update(value.astype(str))
        elif feature_type == string_type:
          feature_stats.string_stats.update(value)
        else:
          # Numeric value: update the stats and extend the batch of values
          # and weights.
          feature_stats.numeric_stats.update(
              value, numeric_values_and_weights, example_weight)

      # Fold this batch's per-example value counts into the summary.
      if example_sizes:
        common_stats = feature_stats.common_stats
        common_stats.num_values_summary = (
            self._num_values_quantiles_combiner.add_input(
                common_stats.num_values_summary, [example_sizes]))

      # Fold this batch's numeric values into the quantiles summaries.
      numeric_values = numeric_values_and_weights[0]
      if numeric_values:
        numeric_stats = feature_stats.numeric_stats
        # The unweighted summary is fed a weight of 1 for every value so that
        # the same weighted quantiles combiner serves both cases.
        unit_weights = [1] * len(numeric_values)
        numeric_stats.quantiles_summary = (
            self._values_quantiles_combiner.add_input(
                numeric_stats.quantiles_summary,
                [numeric_values, unit_weights]))

        if weight_feature:
          numeric_stats.weighted_quantiles_summary = (
              self._values_quantiles_combiner.add_input(
                  numeric_stats.weighted_quantiles_summary,
                  numeric_values_and_weights))

    return accumulator