Ejemplo n.º 1
0
 def test_make_feature_type_float(self):
   """Checks that float16/32/64 numpy dtypes all map to the FLOAT type."""
   expected_type = statistics_pb2.FeatureNameStatistics.FLOAT
   for dtype_name in ('float16', 'float32', 'float64'):
     actual_type = stats_util.make_feature_type(np.dtype(dtype_name))
     self.assertEqual(actual_type, expected_type)
Ejemplo n.º 2
0
 def test_make_feature_type_int(self):
   """Checks that int8/16/32/64 numpy dtypes all map to the INT type."""
   expected_type = statistics_pb2.FeatureNameStatistics.INT
   for dtype_name in ('int8', 'int16', 'int32', 'int64'):
     actual_type = stats_util.make_feature_type(np.dtype(dtype_name))
     self.assertEqual(actual_type, expected_type)
def _update_common_stats(common_stats, value, feature_name):
    """Update the partial common statistics using the input value."""
    # Check if the input value is a numpy array. If so, we have a non-missing
    # value to process.
    if isinstance(value, np.ndarray):
        # Get the number of values for the feature in the example.
        num_values = value.size
        common_stats.num_non_missing += 1
        common_stats.min_num_values = min(common_stats.min_num_values,
                                          num_values)
        common_stats.max_num_values = max(common_stats.max_num_values,
                                          num_values)
        common_stats.total_num_values += num_values

        feature_type = stats_util.make_feature_type(value.dtype)
        if feature_type is None:
            raise TypeError(
                'Feature %s has value which is a numpy array of type %s, '
                'should be int, float or str types.' %
                (feature_name, value.dtype.name))

        if common_stats.type is None:
            common_stats.type = feature_type
        elif common_stats.type != feature_type:
            raise TypeError('Cannot determine the type of feature %s. '
                            'Found values of types %s and %s.' %
                            (feature_name, common_stats.type, feature_type))
    # If the feature is missing, increment num_missing.
    # We represent a missing value by None.
    elif value is None:
        common_stats.num_missing += 1
    else:
        raise TypeError('Feature %s has value of type %s, '
                        'should be numpy.ndarray or None' %
                        (feature_name, type(value).__name__))
Ejemplo n.º 4
0
    def add_input(self, accumulator, input_batch):
        """Fold a batch of examples into the per-feature partial string stats.

        Args:
          accumulator: Dict mapping feature name to its partial string stats.
            It is updated in place and also returned.
          input_batch: Dict mapping feature name to an iterable with one value
            per example.

        Returns:
          The updated accumulator.
        """
        for feature_name, values in six.iteritems(input_batch):
            is_categorical = feature_name in self._categorical_features

            for value in values:
                # Only non-empty numpy arrays contribute to the statistics.
                if not isinstance(value, np.ndarray):
                    continue
                if value.size == 0:
                    continue

                # A non-categorical feature is only relevant when its values
                # are of string type.
                if (not is_categorical and
                        stats_util.make_feature_type(value.dtype) !=
                        statistics_pb2.FeatureNameStatistics.STRING):
                    continue

                # Lazily create the partial stats the first time the feature
                # yields a relevant value.
                if feature_name not in accumulator:
                    accumulator[feature_name] = _PartialStringStats()
                string_stats = accumulator[feature_name]

                # Categorical values are treated as strings.
                if is_categorical:
                    value = value.astype(str)

                for item in value:
                    string_stats.total_bytes_length += len(item)
                string_stats.total_num_values += len(value)

        return accumulator
    def add_input(self, accumulator, input_batch):
        """Fold a batch of examples into the per-feature partial common stats.

        Args:
          accumulator: Dict mapping feature name to its partial common stats.
            It is updated in place and also returned.
          input_batch: Dict mapping feature name to a sequence with one value
            per example (a numpy array, or None for a missing feature).

        Returns:
          The updated accumulator.

        Raises:
          ValueError: If a weight feature is configured and it is absent from
            the batch, missing in an example, of string type, or does not hold
            exactly one value.
        """
        weight_name = self._weight_feature
        if weight_name:
            if weight_name not in input_batch:
                raise ValueError(
                    'Weight feature "{}" not present in the input '
                    'batch.'.format(weight_name))
            weights = input_batch[weight_name]

        for feature_name, values in six.iteritems(input_batch):
            # The weight feature itself gets no common statistics.
            if feature_name == weight_name:
                continue

            # First time we see this feature: start from empty partial stats
            # with an empty quantiles summary.
            if feature_name not in accumulator:
                fresh_stats = _PartialCommonStats(weight_name is not None)
                fresh_stats.num_values_summary = (
                    self._quantiles_combiner.create_accumulator())
                accumulator[feature_name] = fresh_stats
            feature_stats = accumulator[feature_name]

            # Sizes of the non-missing values seen in this batch.
            value_counts = []

            for index, value in enumerate(values):
                example_weight = None
                if weight_name:
                    weight_value = weights[index]
                    # Validate the example's weight before using it.
                    if weight_value is None:
                        raise ValueError('Weight feature "{}" missing in an '
                                         'example.'.format(weight_name))
                    if (stats_util.make_feature_type(weight_value.dtype) ==
                            statistics_pb2.FeatureNameStatistics.STRING):
                        raise ValueError(
                            'Weight feature "{}" must be of numeric type. '
                            'Found {}.'.format(weight_name, weight_value))
                    if weight_value.size != 1:
                        raise ValueError(
                            'Weight feature "{}" must have a single value. '
                            'Found {}.'.format(weight_name, weight_value))
                    example_weight = weight_value[0]

                _update_common_stats(feature_stats, value, feature_name,
                                     example_weight)

                if isinstance(value, np.ndarray):
                    value_counts.append(value.size)

            # Feed this batch's value counts into the quantiles summary that
            # tracks the number of values per example.
            if value_counts:
                feature_stats.num_values_summary = (
                    self._quantiles_combiner.add_input(
                        feature_stats.num_values_summary, [value_counts]))

        return accumulator
  def add_input(self, accumulator, input_batch):
    """Fold a batch of examples into the per-feature partial numeric stats.

    Args:
      accumulator: Dict mapping feature name to its partial numeric stats. It
        is updated in place and also returned.
      input_batch: Dict mapping feature name to an iterable with one value per
        example.

    Returns:
      The updated accumulator.
    """
    numeric_types = (statistics_pb2.FeatureNameStatistics.INT,
                     statistics_pb2.FeatureNameStatistics.FLOAT)

    for feature_name, values in six.iteritems(input_batch):
      # Categorical features get no numeric statistics.
      if feature_name in self._categorical_features:
        continue

      # Collects what _update_numeric_stats appends for this batch.
      batch_values = []
      for value in values:
        # Only non-empty numpy arrays are considered.
        if not isinstance(value, np.ndarray):
          continue
        if value.size == 0:
          continue

        # Skip arrays that are not of a numeric type.
        feature_type = stats_util.make_feature_type(value.dtype)
        if feature_type not in numeric_types:
          continue

        # First numeric value for this feature: start from empty partial
        # stats with empty histogram summaries.
        if feature_name not in accumulator:
          fresh_stats = _PartialNumericStats()
          fresh_stats.std_hist_summary = (
              self._std_hist_combiner.create_accumulator())
          fresh_stats.quantiles_hist_summary = (
              self._quantiles_hist_combiner.create_accumulator())
          accumulator[feature_name] = fresh_stats

        _update_numeric_stats(accumulator[feature_name], value, feature_name,
                              feature_type, batch_values)

      # Update the equi-width histogram and the quantiles histogram summaries
      # for the feature based on the current batch.
      if batch_values:
        numeric_stats = accumulator[feature_name]
        numeric_stats.std_hist_summary = self._std_hist_combiner.add_input(
            numeric_stats.std_hist_summary, [batch_values])
        numeric_stats.quantiles_hist_summary = (
            self._quantiles_hist_combiner.add_input(
                numeric_stats.quantiles_hist_summary, [batch_values]))

    return accumulator
Ejemplo n.º 7
0
 def process(self, element):
     """Update the instance and per-type value counters, then re-emit.

     Args:
       element: Dict whose values are inspected one by one; a value that is
         not a numpy array counts as a single unknown feature value.

     Yields:
       The input element, unchanged.
     """
     self._num_instances.inc(1)
     for value in six.itervalues(element):
         if not isinstance(value, np.ndarray):
             self._unknown_feature_values_count.update(1)
             continue

         # Pick the counter matching the array's feature type; anything that
         # is not INT, FLOAT or STRING falls back to the unknown counter.
         feature_type = stats_util.make_feature_type(value.dtype)
         if feature_type == statistics_pb2.FeatureNameStatistics.INT:
             counter = self._int_feature_values_count
         elif feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
             counter = self._float_feature_values_count
         elif feature_type == statistics_pb2.FeatureNameStatistics.STRING:
             counter = self._string_feature_values_count
         else:
             counter = self._unknown_feature_values_count
         counter.update(len(value))
     yield element
    def _filter_irrelevant_features(self, input_batch):
        """Yield only the categorical or string-typed values of a batch.

        Args:
          input_batch: Dict mapping feature name to an iterable with one value
            per example.

        Yields:
          (feature_name, values) tuples for every non-empty numpy array that
          belongs to a categorical feature or has a string dtype. Values of
          categorical features are converted with `astype(str)`.
        """
        for feature_name, values_batch in six.iteritems(input_batch):
            is_categorical = feature_name in self._categorical_features
            for values in values_batch:
                # Only non-empty numpy arrays are considered.
                if not isinstance(values, np.ndarray) or values.size == 0:
                    continue

                if is_categorical:
                    yield (feature_name, values.astype(str))
                elif (stats_util.make_feature_type(values.dtype) ==
                      statistics_pb2.FeatureNameStatistics.STRING):
                    yield (feature_name, values)
Ejemplo n.º 9
0
 def test_make_feature_type_invalid_dtype(self):
   """Checks that passing the builtin `int` type raises TypeError."""
   self.assertRaises(TypeError, stats_util.make_feature_type, int)
Ejemplo n.º 10
0
 def test_make_feature_type_none(self):
   """Checks that a complex64 dtype yields None as the feature type."""
   complex_dtype = np.dtype('complex64')
   feature_type = stats_util.make_feature_type(complex_dtype)
   self.assertIsNone(feature_type)
Ejemplo n.º 11
0
 def test_make_feature_type_string(self):
   """Checks that the 'S' and 'U' numpy dtypes both map to the STRING type."""
   expected_type = statistics_pb2.FeatureNameStatistics.STRING
   for dtype_code in ('S', 'U'):
     actual_type = stats_util.make_feature_type(np.dtype(dtype_code))
     self.assertEqual(actual_type, expected_type)