def _get_unicode_value(value: Union[Text, bytes],
                       path: types.FeaturePath) -> Text:
  """Returns the feature value decoded as UTF-8, or a placeholder.

  Args:
    value: The feature value, possibly raw bytes.
    path: The path of the feature, used only in the warning message.

  Returns:
    The UTF-8 decoded value, or constants.NON_UTF8_PLACEHOLDER when the
    bytes cannot be decoded.
  """
  decoded_value = stats_util.maybe_get_utf8(value)
  # Check if we have a valid utf-8 string. If not, assign a placeholder.
  if decoded_value is None:
    # Log the original undecodable bytes. (Previously the decode result —
    # always None at this point — was logged instead of the bytes.)
    logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                    'decoded as a UTF-8 string.', path, value)
    decoded_value = constants.NON_UTF8_PLACEHOLDER
  return decoded_value
def _get_unicode_value(value: Union[Text, bytes]) -> Text:
  """Get feature value decoded as utf-8."""
  result = stats_util.maybe_get_utf8(value)
  if result is not None:
    return result
  # Not valid utf-8: count the occurrence and substitute the placeholder.
  _NON_UTF8_VALUES_COUNTER.inc()
  return constants.NON_UTF8_PLACEHOLDER
def _make_feature_stats_proto_topk(
    feature_path: types.FeaturePath,
    top_k_values_pairs: List[FeatureValueCount],
    is_categorical: bool,
    is_weighted_stats: bool,
    num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int) -> statistics_pb2.FeatureNameStatistics:
  """Makes a FeatureNameStatistics proto containing the top-k stats."""
  # Work on a sorted copy: descending by count, with ties broken by treating
  # the 'larger' feature value as larger.
  ordered_pairs = sorted(
      top_k_values_pairs,
      key=lambda pair: (pair.count, pair.feature_value),
      reverse=True)
  result = statistics_pb2.FeatureNameStatistics()
  result.path.CopyFrom(feature_path.to_proto())
  # Categorical features keep their original INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT
                 if is_categorical
                 else statistics_pb2.FeatureNameStatistics.STRING)
  string_stats = (result.string_stats.weighted_string_stats
                  if is_weighted_stats else result.string_stats)
  for rank, (value, count) in enumerate(ordered_pairs):
    if count < frequency_threshold:
      break
    # Normalize the value to valid unicode text: substitute a placeholder
    # for undecodable bytes and stringify non-text values.
    if isinstance(value, six.binary_type):
      decoded = stats_util.maybe_get_utf8(value)
      if decoded is None:
        logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                        'decoded as a UTF-8 string.', feature_path, value)
        value = constants.NON_UTF8_PLACEHOLDER
      else:
        value = decoded
    elif not isinstance(value, six.text_type):
      value = str(value)
    if rank < num_top_values:
      top_value = string_stats.top_values.add()
      top_value.value = value
      top_value.frequency = count
    if rank < num_rank_histogram_buckets:
      bucket = string_stats.rank_histogram.buckets.add()
      bucket.low_rank = rank
      bucket.high_rank = rank
      bucket.sample_count = count
      bucket.label = value
  return result
def _to_slice_key(feature_value: Any):
  """Decode slice key as UTF-8."""
  # The slice name becomes the stats proto's dataset name, a string field
  # that only accepts valid unicode — so undecodable bytes are an error
  # rather than something we can silently replace.
  if not isinstance(feature_value, six.binary_type):
    return str(feature_value)
  decoded = stats_util.maybe_get_utf8(feature_value)
  if decoded is None:
    raise ValueError('Feature names and slicing feature values must be valid'
                     ' UTF-8. Found value {}.'.format(feature_value))
  return decoded
def update(self, value_list):
  """Updates the partial Time statistics using the value list.

  Args:
    value_list: A list of the values in an example.
  """
  for raw_value in value_list:
    if not raw_value:
      continue
    value = raw_value
    if isinstance(value, bytes):
      decoded = stats_util.maybe_get_utf8(value)
      # A single undecodable value invalidates the whole statistic.
      if decoded is None:
        self.invalidated = True
        return
      value = decoded
    self.considered += 1
    # Tally every time format whose regex matches this value.
    for strptime_format, time_regex in _TIME_RE_LIST:
      if time_regex.match(value):
        self.matching_formats[strptime_format] += 1
def add_input(self, accumulator, input_batch):
  """Return result of folding a batch of inputs into accumulator.

  Args:
    accumulator: The current accumulator.
    input_batch: A list representing a batch of feature value_lists (one per
      example) which should be added to the accumulator.

  Returns:
    The accumulator after updating the statistics for the batch of inputs.
  """
  if accumulator.invalidate:
    return accumulator
  for value_list in input_batch:
    # Skip missing or empty value lists.
    if value_list is None or value_list.size == 0:
      continue
    # Only STRING (bytes) typed arrays are eligible; any other dtype
    # invalidates the stats for this feature.
    feature_type = stats_util.get_feature_type(value_list.dtype)
    if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
      accumulator.invalidate = True
      return accumulator
    # Run the heuristic classifier over each non-empty value.
    for value in value_list:
      if not value:
        continue
      if (isinstance(value, bytes) and
          stats_util.maybe_get_utf8(value) is None):
        accumulator.invalidate = True
        return accumulator
      accumulator.considered += 1
      accumulator.matched += self._classifier.classify(value)
  return accumulator
def _maybe_get_utf8(val):
  """Decodes bytes as utf-8 when possible; passes other values through."""
  if isinstance(val, bytes):
    return stats_util.maybe_get_utf8(val)
  return val
def test_get_utf8(self):
  # Valid bytes decode to the equivalent unicode string.
  decoded = stats_util.maybe_get_utf8(b'This is valid.')
  self.assertEqual(u'This is valid.', decoded)
  # Undecodable bytes yield None.
  self.assertIsNone(stats_util.maybe_get_utf8(b'\xF0'))
def make_feature_stats_proto_with_topk_stats(feature_name,
                                             top_k_value_count_list,
                                             is_categorical,
                                             is_weighted_stats,
                                             num_top_values,
                                             frequency_threshold,
                                             num_rank_histogram_buckets):
  """Makes a FeatureNameStatistics proto containing the top-k stats.

  Args:
    feature_name: The feature name.
    top_k_value_count_list: A list of FeatureValueCount tuples.
    is_categorical: Whether the feature is categorical.
    is_weighted_stats: Whether top_k_value_count_list incorporates weights.
    num_top_values: The number of most frequent feature values to keep for
      string features.
    frequency_threshold: The minimum number of examples (possibly weighted)
      the most frequent values must be present in.
    num_rank_histogram_buckets: The number of buckets in the rank histogram
      for string features.

  Returns:
    A FeatureNameStatistics proto containing the top-k stats.
  """
  # Sort (a copy of) the top_k_value_count_list in descending order by count.
  # Where multiple feature values have the same count, consider the feature
  # with the 'larger' feature value to be larger for purposes of breaking the
  # tie. Sorting a copy avoids mutating the caller's list in place.
  top_k_value_count_list = sorted(
      top_k_value_count_list,
      key=lambda counts: (counts[1], counts[0]),
      reverse=True)
  result = statistics_pb2.FeatureNameStatistics()
  result.name = feature_name
  # If we have a categorical feature, we preserve the type to be the original
  # INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT
                 if is_categorical
                 else statistics_pb2.FeatureNameStatistics.STRING)
  if is_weighted_stats:
    string_stats = result.string_stats.weighted_string_stats
  else:
    string_stats = result.string_stats
  for i in range(len(top_k_value_count_list)):
    value, count = top_k_value_count_list[i]
    if count < frequency_threshold:
      break
    # Convert to string if integer.
    if isinstance(value, numbers.Integral):
      value = str(value)
    elif isinstance(value, bytes):
      decoded_value = maybe_get_utf8(value)
      if decoded_value is None:
        # Not a valid utf-8 string: assign a default invalid string value.
        logging.warning(
            'Feature "%s" has bytes value "%s" which cannot be '
            'decoded as a UTF-8 string.', feature_name, value)
        value = _INVALID_STRING
      else:
        # Decode valid bytes so the proto string fields below accept them
        # (assigning raw bytes to a string field fails on Python 3).
        value = decoded_value
    if i < num_top_values:
      freq_and_value = string_stats.top_values.add()
      freq_and_value.value = value
      freq_and_value.frequency = count
    if i < num_rank_histogram_buckets:
      bucket = string_stats.rank_histogram.buckets.add()
      bucket.low_rank = i
      bucket.high_rank = i
      bucket.sample_count = count
      bucket.label = value
  return result
def _is_non_utf8(value):
  """Returns True iff value is a bytes object that is not valid UTF-8."""
  if not isinstance(value, bytes):
    return False
  return stats_util.maybe_get_utf8(value) is None
def make_feature_stats_proto_with_topk_stats(
    feature_path: types.FeaturePath,
    top_k_value_count_list: List[FeatureValueCount],
    is_categorical: bool,
    is_weighted_stats: bool,
    num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int) -> statistics_pb2.FeatureNameStatistics:
  """Makes a FeatureNameStatistics proto containing the top-k stats.

  Args:
    feature_path: The path of the feature.
    top_k_value_count_list: A list of FeatureValueCount tuples.
    is_categorical: Whether the feature is categorical.
    is_weighted_stats: Whether top_k_value_count_list incorporates weights.
    num_top_values: The number of most frequent feature values to keep for
      string features.
    frequency_threshold: The minimum number of examples (possibly weighted)
      the most frequent values must be present in.
    num_rank_histogram_buckets: The number of buckets in the rank histogram
      for string features.

  Returns:
    A FeatureNameStatistics proto containing the top-k stats.
  """
  # Sort (a copy of) the top_k_value_count_list in descending order by count.
  # Where multiple feature values have the same count, consider the feature
  # with the 'larger' feature value to be larger for purposes of breaking the
  # tie.
  top_k_value_count_list = sorted(
      top_k_value_count_list,
      key=lambda counts: (counts[1], counts[0]),
      reverse=True)
  result = statistics_pb2.FeatureNameStatistics()
  result.path.CopyFrom(feature_path.to_proto())
  # If we have a categorical feature, we preserve the type to be the original
  # INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT
                 if is_categorical
                 else statistics_pb2.FeatureNameStatistics.STRING)
  if is_weighted_stats:
    string_stats = result.string_stats.weighted_string_stats
  else:
    string_stats = result.string_stats
  for i in range(len(top_k_value_count_list)):
    value, count = top_k_value_count_list[i]
    if count < frequency_threshold:
      break
    # Check if we have a valid utf-8 string. If not, assign a default invalid
    # string value.
    if isinstance(value, six.binary_type):
      # Decode into a separate variable so the warning can log the original
      # bytes. (Previously the bytes were clobbered by the decode result, so
      # the warning always printed None.)
      decoded_value = stats_util.maybe_get_utf8(value)
      if decoded_value is None:
        logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                        'decoded as a UTF-8 string.', feature_path, value)
        value = constants.NON_UTF8_PLACEHOLDER
      else:
        value = decoded_value
    elif not isinstance(value, six.text_type):
      value = str(value)
    if i < num_top_values:
      freq_and_value = string_stats.top_values.add()
      freq_and_value.value = value
      freq_and_value.frequency = count
    if i < num_rank_histogram_buckets:
      bucket = string_stats.rank_histogram.buckets.add()
      bucket.low_rank = i
      bucket.high_rank = i
      bucket.sample_count = count
      bucket.label = value
  return result