def update(self, value_list):
    """Folds the values of one example into the partial time statistics.

    Falsy values are skipped. A bytes value that is not valid UTF-8 sets
    `self.invalidated` and stops processing the remainder of the list. Every
    other value increments `self.considered` and bumps the
    `self.matching_formats` counter of each entry in `_TIME_RE_LIST` whose
    regex matches it.

    Args:
      value_list: A list of the values in an example.
    """
    for candidate in value_list:
        # Nothing to learn from empty / missing values.
        if not candidate:
            continue

        undecodable = (isinstance(candidate, bytes)
                       and not stats_util.is_valid_utf8(candidate))
        if undecodable:
            # Give up on the whole list as soon as one value is not text.
            self.invalidated = True
            return

        self.considered += 1
        # A single value may match several formats; count each of them.
        for fmt, pattern in _TIME_RE_LIST:
            if pattern.match(candidate):
                self.matching_formats[fmt] += 1
def add_input(self, accumulator, input_batch):
    """Folds a batch of inputs into the accumulator and returns it.

    Args:
      accumulator: The current accumulator.
      input_batch: A list representing a batch of feature value_lists (one per
        example) which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
    # Once invalidated, the accumulator is returned untouched.
    if accumulator.invalidate:
        return accumulator

    for value_list in input_batch:
        # Missing or empty value lists contribute nothing.
        if value_list is None or value_list.size == 0:
            continue

        # Only STRING-typed arrays are eligible; anything else invalidates
        # the stats.
        feature_type = stats_util.get_feature_type(value_list.dtype)
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        # Run the classifier heuristic on every non-empty value.
        for item in value_list:
            if not item:
                continue
            undecodable = (isinstance(item, bytes)
                           and not stats_util.is_valid_utf8(item))
            if undecodable:
                accumulator.invalidate = True
                return accumulator
            accumulator.considered += 1
            accumulator.matched += self._classifier.classify(item)

    return accumulator
def make_feature_stats_proto_with_topk_stats(feature_name,
                                             top_k_value_count_list,
                                             is_categorical, is_weighted_stats,
                                             num_top_values,
                                             frequency_threshold,
                                             num_rank_histogram_buckets):
    """Builds a FeatureNameStatistics proto holding the top-k stats.

    Note that `top_k_value_count_list` is sorted in place as a side effect.

    Args:
      feature_name: The feature name.
      top_k_value_count_list: A list of FeatureValueCount tuples, i.e.
        (feature value, count) pairs.
      is_categorical: Whether the feature is categorical.
      is_weighted_stats: Whether top_k_value_count_list incorporates weights.
      num_top_values: The number of most frequent feature values to keep for
        string features.
      frequency_threshold: The minimum number of examples (possibly weighted)
        the most frequent values must be present in.
      num_rank_histogram_buckets: The number of buckets in the rank histogram
        for string features.

    Returns:
      A FeatureNameStatistics proto containing the top-k stats.
    """
    # Descending by count; ties are broken by treating the 'larger' feature
    # value as the larger entry.
    top_k_value_count_list.sort(
        key=lambda entry: (entry[1], entry[0]), reverse=True)

    result = statistics_pb2.FeatureNameStatistics()
    result.name = feature_name
    # Categorical features keep their original INT type.
    if is_categorical:
        result.type = statistics_pb2.FeatureNameStatistics.INT
    else:
        result.type = statistics_pb2.FeatureNameStatistics.STRING

    # Weighted stats live in a nested message of the string stats.
    string_stats = result.string_stats
    if is_weighted_stats:
        string_stats = string_stats.weighted_string_stats

    for rank, (feature_value, count) in enumerate(top_k_value_count_list):
        # The list is sorted, so every later entry is below the threshold too.
        if count < frequency_threshold:
            break

        # Bytes that are not valid UTF-8 are replaced by a default invalid
        # string value.
        if isinstance(feature_value, bytes) and not is_valid_utf8(
                feature_value):
            logging.warning(
                'Feature "%s" has bytes value "%s" which cannot be '
                'decoded as a UTF-8 string.', feature_name, feature_value)
            feature_value = _INVALID_STRING

        if rank < num_top_values:
            top_entry = string_stats.top_values.add()
            top_entry.value = feature_value
            top_entry.frequency = count

        if rank < num_rank_histogram_buckets:
            # Each bucket covers exactly one rank.
            bucket = string_stats.rank_histogram.buckets.add()
            bucket.low_rank = rank
            bucket.high_rank = rank
            bucket.sample_count = count
            bucket.label = feature_value

    return result
def test_is_valid_utf8(self):
    """Checks is_valid_utf8 on one accepted and one rejected bytes value."""
    accepted = b'This is valid'
    # A lone 0xF0 byte is expected to be rejected.
    rejected = b'\xF0'

    self.assertTrue(stats_util.is_valid_utf8(accepted))
    self.assertFalse(stats_util.is_valid_utf8(rejected))