def test_get_feature_type_get_string(self):
    """Checks that the 'S' and 'U' numpy dtypes are both reported as STRING."""
    expected_type = statistics_pb2.FeatureNameStatistics.STRING
    for dtype_code in ('S', 'U'):
        inferred_type = stats_util.get_feature_type(np.dtype(dtype_code))
        self.assertEqual(inferred_type, expected_type)
def test_get_feature_type_get_none(self):
    """Checks that a complex64 dtype maps to no feature type (None)."""
    unsupported_dtype = np.dtype('complex64')
    inferred_type = stats_util.get_feature_type(unsupported_dtype)
    self.assertIsNone(inferred_type)
def add_input(self, accumulator, input_batch):
    """Folds one batch of examples into the per-feature numeric statistics.

    Only values that are non-empty numpy arrays whose dtype resolves to the
    INT or FLOAT feature type contribute; everything else is skipped, as are
    the weight feature and any feature listed as categorical.

    Args:
        accumulator: Mapping from feature name to its partial numeric stats.
            An entry is created the first time a numeric value is seen for a
            feature, and the mapping is updated in place.
        input_batch: Mapping from feature name to a sequence holding one
            value per example.

    Returns:
        The same `accumulator` object, updated with this batch.
    """
    weight_feature = self._weight_feature
    if weight_feature:
        # NOTE(review): each entry is indexed as weights[i][0] below, so this
        # is presumably one single-element array per example -- confirm.
        weights = stats_util.get_weight_feature(input_batch, weight_feature)

    numeric_types = (
        statistics_pb2.FeatureNameStatistics.INT,
        statistics_pb2.FeatureNameStatistics.FLOAT,
    )

    for feature_name, values in six.iteritems(input_batch):
        # Neither the weight feature nor categorical features get numeric
        # stats.
        if (feature_name == weight_feature
                or feature_name in self._categorical_features):
            continue

        # Index 0 collects values and index 1 collects weights for the whole
        # batch, so the quantiles combiner runs once per feature rather than
        # once per example.
        batch_values_and_weights = [[], []]
        for example_index, example in enumerate(values):
            # Anything other than a non-empty numpy array is ignored.
            if not (isinstance(example, np.ndarray) and example.size != 0):
                continue

            # Non-numeric arrays are ignored as well.
            feature_type = get_feature_type(example.dtype)
            if feature_type not in numeric_types:
                continue

            # First numeric value for this feature: start its partial stats
            # with an empty quantiles summary.
            if feature_name not in accumulator:
                fresh_stats = _PartialNumericStats(
                    self._weight_feature is not None)
                fresh_stats.quantiles_summary = (
                    self._quantiles_combiner.create_accumulator())
                accumulator[feature_name] = fresh_stats

            example_weight = (
                weights[example_index][0] if weight_feature else None)
            _update_numeric_stats(
                accumulator[feature_name], example, feature_name,
                feature_type, batch_values_and_weights, example_weight)

        # No numeric values were collected for this feature in this batch.
        if not batch_values_and_weights[0]:
            continue

        feature_stats = accumulator[feature_name]
        # The unweighted summary goes through the same weighted quantiles
        # combiner, with every weight explicitly set to 1.
        unit_weights = [1] * len(batch_values_and_weights[0])
        feature_stats.quantiles_summary = self._quantiles_combiner.add_input(
            feature_stats.quantiles_summary,
            [batch_values_and_weights[0], unit_weights])
        if weight_feature:
            feature_stats.weighted_quantiles_summary = (
                self._quantiles_combiner.add_input(
                    feature_stats.weighted_quantiles_summary,
                    batch_values_and_weights))

    return accumulator
def add_input(self, accumulator, input_batch):
    """Folds one batch of examples into the per-feature basic statistics.

    For every feature except the weight feature, updates the common stats
    with each example, then routes non-empty values either to the string
    stats (categorical features or STRING-typed arrays) or to the numeric
    stats. Quantile summaries are updated once per feature per batch.

    Args:
        accumulator: Mapping from feature name to its partial basic stats.
            Missing entries are created, and the mapping is updated in place.
        input_batch: Mapping from feature name to a sequence holding one
            value per example; each value must be a numpy array or None
            (None marks a missing value).

    Returns:
        The same `accumulator` object, updated with this batch.

    Raises:
        TypeError: If a value is neither a numpy array nor None, or is a
            numpy array whose dtype has no corresponding feature type.
    """
    weight_feature = self._weight_feature
    if weight_feature:
        # TODO(b/118489848): This method also validates the weight feature.
        # Consider moving these validation checks outside of the generators.
        # NOTE(review): each entry is indexed as weights[i][0] below, so this
        # is presumably one single-element array per example -- confirm.
        weights = stats_util.get_weight_feature(input_batch, weight_feature)

    string_type = statistics_pb2.FeatureNameStatistics.STRING

    for feature_name, values in six.iteritems(input_batch):
        # The weight feature itself gets no statistics.
        if feature_name == weight_feature:
            continue

        is_categorical = feature_name in self._categorical_features

        # First sighting of this feature: create its partial stats with
        # empty quantile summaries.
        if feature_name not in accumulator:
            fresh_stats = _PartialBasicStats(self._weight_feature is not None)
            fresh_stats.common_stats.num_values_summary = (
                self._num_values_quantiles_combiner.create_accumulator())
            fresh_stats.numeric_stats.quantiles_summary = (
                self._values_quantiles_combiner.create_accumulator())
            accumulator[feature_name] = fresh_stats
        feature_stats = accumulator[feature_name]

        # Number of values in each non-missing example of the batch.
        value_counts = []
        # Index 0 collects numeric values and index 1 their weights for the
        # whole batch, so the quantiles combiner runs once per feature.
        numeric_values_and_weights = [[], []]

        for example_index, example in enumerate(values):
            # TODO(b/79685042): Currently we infer the type for each example,
            # which is expensive. Consider doing the type inference only once
            # per batch.
            if example is None:
                # Missing value.
                feature_type = None
            elif isinstance(example, np.ndarray):
                feature_type = get_feature_type(example.dtype)
                if feature_type is None:
                    raise TypeError(
                        'Feature {} has value {} which is a numpy array '
                        'of type {}, should be int, float or str '
                        'types.'.format(
                            feature_name, example, example.dtype.name))
            else:
                raise TypeError(
                    'Feature %s has value of type %s, '
                    'should be numpy.ndarray or None' %
                    (feature_name, type(example).__name__))

            example_weight = (
                weights[example_index][0] if weight_feature else None)
            feature_stats.common_stats.update(
                example, feature_name, feature_type, example_weight)

            if example is None:
                continue

            # Empty arrays still count towards the num-values summary.
            value_counts.append(example.size)
            if example.size == 0:
                continue

            goes_to_string_stats = (
                is_categorical or feature_type == string_type)
            if not goes_to_string_stats:
                # Numeric value: update the partial numeric stats and extend
                # the batch-level values and weights.
                feature_stats.numeric_stats.update(
                    example, numeric_values_and_weights, example_weight)
                continue

            # Categorical features are converted to strings before updating
            # the string stats.
            if is_categorical:
                example = example.astype(str)
            feature_stats.string_stats.update(example)

        # Update the num-values summary from this batch.
        if value_counts:
            common_stats = feature_stats.common_stats
            common_stats.num_values_summary = (
                self._num_values_quantiles_combiner.add_input(
                    common_stats.num_values_summary, [value_counts]))

        # No numeric values were collected for this feature in this batch.
        if not numeric_values_and_weights[0]:
            continue

        numeric_stats = feature_stats.numeric_stats
        # The unweighted summary goes through the same weighted quantiles
        # combiner, with every weight explicitly set to 1.
        unit_weights = [1] * len(numeric_values_and_weights[0])
        numeric_stats.quantiles_summary = (
            self._values_quantiles_combiner.add_input(
                numeric_stats.quantiles_summary,
                [numeric_values_and_weights[0], unit_weights]))
        if weight_feature:
            numeric_stats.weighted_quantiles_summary = (
                self._values_quantiles_combiner.add_input(
                    numeric_stats.weighted_quantiles_summary,
                    numeric_values_and_weights))

    return accumulator