def test_basic_stats_generator_categorical_feature(self):
  batches = [
      pa.Table.from_arrays([pa.array([[1, 5, 10], [0]])], ['c']),
      pa.Table.from_arrays([pa.array([[1, 1, 1, 5, 15], [-1]])], ['c']),
  ]
  expected_result = {
      types.FeaturePath(['c']): text_format.Parse(
          """
          path { step: 'c' }
          string_stats {
            common_stats {
              num_non_missing: 4
              min_num_values: 1
              max_num_values: 5
              avg_num_values: 2.5
              num_values_histogram {
                buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.3333333 }
                buckets { low_value: 1.0 high_value: 3.0 sample_count: 1.3333333 }
                buckets { low_value: 3.0 high_value: 5.0 sample_count: 1.3333333 }
                type: QUANTILES
              }
              tot_num_values: 10
            }
            avg_length: 1.29999995232
          }
          """, statistics_pb2.FeatureNameStatistics())
  }
  schema = text_format.Parse(
      """
      feature {
        name: "c"
        type: INT
        int_domain { is_categorical: true }
      }
      """, schema_pb2.Schema())
  generator = basic_stats_generator.BasicStatsGenerator(
      schema=schema,
      num_values_histogram_buckets=3,
      num_histogram_buckets=3,
      num_quantiles_histogram_buckets=4)
  self.assertCombinerOutputEqual(batches, generator, expected_result)

def _merge_dataset_feature_stats_protos(
    stats_protos: Iterable[statistics_pb2.DatasetFeatureStatistics]
) -> statistics_pb2.DatasetFeatureStatistics:
  """Merges together a list of DatasetFeatureStatistics protos.

  Args:
    stats_protos: A list of DatasetFeatureStatistics protos to merge.

  Returns:
    The merged DatasetFeatureStatistics proto.
  """
  stats_per_feature = {}
  # Create a new DatasetFeatureStatistics proto.
  result = statistics_pb2.DatasetFeatureStatistics()
  # Iterate over each DatasetFeatureStatistics proto, merge the
  # FeatureNameStatistics protos per feature, and add the cross feature stats.
  for stats_proto in stats_protos:
    if stats_proto.cross_features:
      result.cross_features.extend(stats_proto.cross_features)
    for feature_stats_proto in stats_proto.features:
      feature_path = types.FeaturePath.from_proto(feature_stats_proto.path)
      if feature_path not in stats_per_feature:
        # Make a copy for the "cache" since we are modifying it in 'else'
        # below.
        new_feature_stats_proto = statistics_pb2.FeatureNameStatistics()
        new_feature_stats_proto.CopyFrom(feature_stats_proto)
        stats_per_feature[feature_path] = new_feature_stats_proto
      else:
        stats_for_feature = stats_per_feature[feature_path]
        # MergeFrom would concatenate repeated fields, which is not what we
        # want for path.step, so clear it first.
        del stats_for_feature.path.step[:]
        stats_for_feature.MergeFrom(feature_stats_proto)

  num_examples = None
  for feature_stats_proto in six.itervalues(stats_per_feature):
    # Add the merged FeatureNameStatistics proto for the feature into the
    # DatasetFeatureStatistics proto.
    new_feature_stats_proto = result.features.add()
    new_feature_stats_proto.CopyFrom(feature_stats_proto)

    # Get the number of examples from one of the features that has common
    # stats.
    if num_examples is None:
      stats_type = feature_stats_proto.WhichOneof('stats')
      stats_proto = None
      if stats_type == 'num_stats':
        stats_proto = feature_stats_proto.num_stats
      else:
        stats_proto = feature_stats_proto.string_stats

      if stats_proto.HasField('common_stats'):
        num_examples = (stats_proto.common_stats.num_non_missing +
                        stats_proto.common_stats.num_missing)

  # Set the num_examples field.
  if num_examples is not None:
    result.num_examples = num_examples
  return result

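# A minimal usage sketch for _merge_dataset_feature_stats_protos, not from the
# original source: it assumes only text_format, statistics_pb2, and the
# function above are in scope. Two shards carrying stats for the same feature
# 'f' are merged into one proto, and num_examples is derived from the shard
# that has common stats (8 non-missing + 2 missing = 10).
shard_a = text_format.Parse(
    """
    features {
      path { step: 'f' }
      num_stats { common_stats { num_non_missing: 8 num_missing: 2 } }
    }
    """, statistics_pb2.DatasetFeatureStatistics())
shard_b = text_format.Parse(
    """
    features {
      path { step: 'f' }
      num_stats { mean: 3.5 }
    }
    """, statistics_pb2.DatasetFeatureStatistics())
merged = _merge_dataset_feature_stats_protos([shard_a, shard_b])
assert merged.num_examples == 10
assert merged.features[0].num_stats.mean == 3.5
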
def _get_test_stats_with_mi(feature_paths):
  """Get stats proto for MI test."""
  result = statistics_pb2.DatasetFeatureStatistics()
  for feature_path in feature_paths:
    feature_proto = text_format.Parse(
        """
        custom_stats { name: "max_sklearn_adjusted_mutual_information" num: 0.0 }
        custom_stats { name: "max_sklearn_mutual_information" num: 0.0 }
        custom_stats { name: "mean_sklearn_adjusted_mutual_information" num: 0.0 }
        custom_stats { name: "mean_sklearn_mutual_information" num: 0.0 }
        custom_stats { name: "median_sklearn_adjusted_mutual_information" num: 0.0 }
        custom_stats { name: "median_sklearn_mutual_information" num: 0.0 }
        custom_stats { name: "min_sklearn_adjusted_mutual_information" num: 0.0 }
        custom_stats { name: "min_sklearn_mutual_information" num: 0.0 }
        custom_stats { name: "num_partitions_sklearn_adjusted_mutual_information" num: 2.0 }
        custom_stats { name: "num_partitions_sklearn_mutual_information" num: 2.0 }
        custom_stats { name: "std_dev_sklearn_adjusted_mutual_information" num: 0.0 }
        custom_stats { name: "std_dev_sklearn_mutual_information" num: 0.0 }
        """, statistics_pb2.FeatureNameStatistics())
    feature_proto.path.CopyFrom(feature_path.to_proto())
    result.features.add().CopyFrom(feature_proto)
  return result

def make_feature_stats_proto_with_topk_stats(feature_name,
                                             top_k_value_count_list,
                                             is_categorical,
                                             is_weighted_stats,
                                             num_top_values,
                                             num_rank_histogram_buckets):
  """Makes a FeatureNameStatistics proto containing the top-k stats.

  Args:
    feature_name: The feature name.
    top_k_value_count_list: A list of FeatureValueCount tuples.
    is_categorical: Whether the feature is categorical.
    is_weighted_stats: Whether top_k_value_count_list incorporates weights.
    num_top_values: The number of most frequent feature values to keep for
      string features.
    num_rank_histogram_buckets: The number of buckets in the rank histogram
      for string features.

  Returns:
    A FeatureNameStatistics proto containing the top-k stats.
  """
  # Sort the top_k_value_count_list in descending order by count. Where
  # multiple feature values have the same count, consider the feature with
  # the 'larger' feature value to be larger for purposes of breaking the tie.
  top_k_value_count_list.sort(
      key=lambda counts: (counts[1], counts[0]), reverse=True)

  result = statistics_pb2.FeatureNameStatistics()
  result.name = feature_name
  # If we have a categorical feature, we preserve the type to be the original
  # INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT
                 if is_categorical else
                 statistics_pb2.FeatureNameStatistics.STRING)

  if is_weighted_stats:
    string_stats = result.string_stats.weighted_string_stats
  else:
    string_stats = result.string_stats

  for i in range(len(top_k_value_count_list)):
    value, count = top_k_value_count_list[i]
    # Check if we have a valid utf-8 string. If not, assign a default invalid
    # string value.
    if isinstance(value, bytes) and not _is_valid_utf8(value):
      logging.warning(
          'Feature "%s" has bytes value "%s" which cannot be '
          'decoded as a UTF-8 string.', feature_name, value)
      value = _INVALID_STRING

    if i < num_top_values:
      freq_and_value = string_stats.top_values.add()
      freq_and_value.value = value
      freq_and_value.frequency = count
    if i < num_rank_histogram_buckets:
      bucket = string_stats.rank_histogram.buckets.add()
      bucket.low_rank = i
      bucket.high_rank = i
      bucket.sample_count = count
      bucket.label = value
  return result

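# A small illustrative sketch, not from the original source, of the
# tie-breaking rule documented above: 'd' and 'b' both have count 2, and the
# descending (count, value) sort keeps the lexically larger value 'd'.
tied_proto = make_feature_stats_proto_with_topk_stats(
    feature_name='fa',
    top_k_value_count_list=[('a', 4), ('b', 2), ('c', 3), ('d', 2)],
    is_categorical=False,
    is_weighted_stats=False,
    num_top_values=3,
    num_rank_histogram_buckets=2)
# Expected order after sorting: ('a', 4), ('c', 3), ('d', 2), ('b', 2).
assert [v.value for v in tied_proto.string_stats.top_values] == ['a', 'c', 'd']
assert [b.label
        for b in tied_proto.string_stats.rank_histogram.buckets] == ['a', 'c']
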
def test_cases_with_no_image_stats(self, batches):
  """Test cases that should not generate image statistics."""
  image_decoder = FakeImageDecoder()
  generator = image_stats_generator.ImageStatsGenerator(
      image_decoder=image_decoder, values_threshold=1,
      enable_size_stats=True)
  self.assertCombinerOutputEqual(batches, generator,
                                 statistics_pb2.FeatureNameStatistics())

def extract_output(
    self, accumulator: _PartialNLStats
) -> statistics_pb2.FeatureNameStatistics:
  """Return result of converting accumulator into the output value.

  Args:
    accumulator: The final accumulator value.

  Returns:
    A proto representing the result of this stats generator.
  """
  result = statistics_pb2.FeatureNameStatistics()
  if accumulator.invalidate:
    return result

  nls = statistics_pb2.NaturalLanguageStatistics()
  if accumulator.total_num_tokens:
    nls.feature_coverage = (
        float(accumulator.num_in_vocab_tokens) / accumulator.total_num_tokens)
    result.custom_stats.add(
        name='nl_feature_coverage', num=nls.feature_coverage)
  if accumulator.num_in_vocab_tokens:
    nls.avg_token_length = (
        float(accumulator.sum_in_vocab_token_lengths) /
        accumulator.num_in_vocab_tokens)
    result.custom_stats.add(
        name='nl_avg_token_length', num=nls.avg_token_length)
  if self._num_quantiles_histogram_buckets:
    _populate_token_length_histogram(
        nls, accumulator, self._num_quantiles_histogram_buckets)
    if nls.token_length_histogram.buckets:
      result.custom_stats.add(
          name='nl_token_length_histogram',
          histogram=nls.token_length_histogram)
  if self._num_rank_histogram_buckets:
    _populate_token_rank_histogram(
        nls, accumulator, self._num_rank_histogram_buckets)
    if nls.rank_histogram.buckets:
      result.custom_stats.add(
          name='nl_rank_tokens', rank_histogram=nls.rank_histogram)
  if accumulator.token_statistics:
    for name, stats in accumulator.token_statistics.items():
      _populate_token_statistics(
          name, self._num_histogram_buckets, accumulator.num_examples,
          nls.token_statistics.add(), stats, result)

  for r in (accumulator.reported_sequences_coverage +
            accumulator.reported_sequences_avg_token_length):
    str_seq = str(r[0])
    nls.reported_sequences.append(str_seq)
  if nls.reported_sequences:
    reported_sequences = '\n'.join(nls.reported_sequences)
    result.custom_stats.add(
        name='nl_reported_sequences', str=reported_sequences)

  # Any.Pack() fills in the Any in place and returns None, so pack first and
  # then pass the packed proto (passing Pack's return value would leave the
  # 'any' field unset).
  my_proto = any_pb2.Any()
  my_proto.Pack(nls)
  result.custom_stats.add(name='nl_statistics', any=my_proto)
  return result

def assert_on_unequal_feature_protos(self):
  expected = text_format.Parse(
      """
      name: 'a'
      custom_stats { name: 'MI' num: 2.5 }
      """, statistics_pb2.FeatureNameStatistics())
  actual = text_format.Parse(
      """
      name: 'a'
      custom_stats { name: 'MI' num: 2.0 }
      """, statistics_pb2.FeatureNameStatistics())
  test_util.assert_feature_proto_equal(self, actual, expected)

def test_common_stats_generator_with_weight_feature(self):
  # Input with two batches, each containing two examples. In the second
  # batch, feature 'a' is missing in one example.
  batches = [{
      'a': np.array([np.array([1.0, 2.0]), np.array([3.0, 4.0, 5.0])]),
      'w': np.array([np.array([1.0]), np.array([2.0])])
  }, {
      'a': np.array([np.array([1.0]), None]),
      'w': np.array([np.array([3.0]), np.array([2.0])])
  }]
  expected_result = {
      'a': text_format.Parse(
          """
          name: 'a'
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 1
              min_num_values: 1
              max_num_values: 3
              avg_num_values: 2.0
              tot_num_values: 6
              num_values_histogram {
                buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.75 }
                buckets { low_value: 1.0 high_value: 2.0 sample_count: 0.75 }
                buckets { low_value: 2.0 high_value: 3.0 sample_count: 0.75 }
                buckets { low_value: 3.0 high_value: 3.0 sample_count: 0.75 }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6.0
                num_missing: 2.0
                avg_num_values: 1.83333333
                tot_num_values: 11.0
              }
            }
          }
          """, statistics_pb2.FeatureNameStatistics())
  }
  generator = common_stats_generator.CommonStatsGenerator(
      weight_feature='w', num_values_histogram_buckets=4)
  self.assertCombinerOutputEqual(batches, generator, expected_result)

def test_common_stats_generator_categorical_feature(self):
  batches = [{
      'c': np.array([np.array([1, 5, 10]), np.array([0])])
  }, {
      'c': np.array([np.array([1, 1, 1, 5, 15])])
  }]
  expected_result = {
      'c': text_format.Parse(
          """
          name: 'c'
          type: INT
          string_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 0
              min_num_values: 1
              max_num_values: 5
              avg_num_values: 3.0
              tot_num_values: 9
              num_values_histogram {
                buckets { low_value: 1.0 high_value: 3.0 sample_count: 1.0 }
                buckets { low_value: 3.0 high_value: 5.0 sample_count: 1.0 }
                buckets { low_value: 5.0 high_value: 5.0 sample_count: 1.0 }
                type: QUANTILES
              }
            }
          }
          """, statistics_pb2.FeatureNameStatistics())
  }
  schema = text_format.Parse(
      """
      feature {
        name: "c"
        type: INT
        int_domain { is_categorical: true }
      }
      """, schema_pb2.Schema())
  generator = common_stats_generator.CommonStatsGenerator(
      schema=schema, num_values_histogram_buckets=3)
  self.assertCombinerOutputEqual(batches, generator, expected_result)

def _make_feature_stats_proto_topk(
    feature_path: types.FeaturePath,
    top_k_values_pairs: List[FeatureValueCount],
    is_categorical: bool,
    is_weighted_stats: bool,
    num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int) -> statistics_pb2.FeatureNameStatistics:
  """Makes a FeatureNameStatistics proto containing the top-k stats."""
  # Sort (a copy of) top_k_values_pairs in descending order by count. Where
  # multiple feature values have the same count, consider the feature with
  # the 'larger' feature value to be larger for purposes of breaking the tie.
  top_k_values_pairs = sorted(
      top_k_values_pairs,
      key=lambda pair: (pair.count, pair.feature_value),
      reverse=True)

  result = statistics_pb2.FeatureNameStatistics()
  result.path.CopyFrom(feature_path.to_proto())
  # If we have a categorical feature, we preserve the type to be the original
  # INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT
                 if is_categorical else
                 statistics_pb2.FeatureNameStatistics.STRING)

  if is_weighted_stats:
    string_stats = result.string_stats.weighted_string_stats
  else:
    string_stats = result.string_stats

  for i in range(len(top_k_values_pairs)):
    value, count = top_k_values_pairs[i]
    if count < frequency_threshold:
      break
    # Check if we have a valid utf-8 string. If not, assign a default invalid
    # string value.
    if isinstance(value, six.binary_type):
      decoded_value = stats_util.maybe_get_utf8(value)
      if decoded_value is None:
        logging.warning(
            'Feature "%s" has bytes value "%s" which cannot be '
            'decoded as a UTF-8 string.', feature_path, value)
        value = constants.NON_UTF8_PLACEHOLDER
      else:
        value = decoded_value
    elif not isinstance(value, six.text_type):
      value = str(value)

    if i < num_top_values:
      freq_and_value = string_stats.top_values.add()
      freq_and_value.value = value
      freq_and_value.frequency = count
    if i < num_rank_histogram_buckets:
      bucket = string_stats.rank_histogram.buckets.add()
      bucket.low_rank = i
      bucket.high_rank = i
      bucket.sample_count = count
      bucket.label = value
  return result

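# A hedged usage sketch, not from the original source, showing the
# frequency_threshold cutoff in _make_feature_stats_proto_topk: with a
# threshold of 3, the ('d', 2) and ('b', 2) pairs are dropped entirely even
# though num_top_values would otherwise admit one of them. FeatureValueCount
# is constructed positionally as (value, count), matching its use elsewhere
# in this file.
pairs = [
    FeatureValueCount('a', 4),
    FeatureValueCount('c', 3),
    FeatureValueCount('d', 2),
    FeatureValueCount('b', 2),
]
thresholded = _make_feature_stats_proto_topk(
    types.FeaturePath(['fa']),
    pairs,
    is_categorical=False,
    is_weighted_stats=False,
    num_top_values=3,
    frequency_threshold=3,
    num_rank_histogram_buckets=2)
assert [v.value for v in thresholded.string_stats.top_values] == ['a', 'c']
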
def test_nl_generator_avg_word_heuristic_non_match(self):
  """Tests generator with avg word length heuristic."""
  generator = nlsg.NLDomainInferringStatsGenerator(values_threshold=2)
  input_batches = [
      pa.array([['abc' * 10, 'xxxxxxxxx'], ['xosuhddsofuhg123fdgosh']]),
      pa.array([['Only one valid text?']]),
  ]
  self.assertCombinerOutputEqual(input_batches, generator,
                                 statistics_pb2.FeatureNameStatistics())

def _make_feature_stats_proto(feature_name, count, is_categorical):
  """Makes a FeatureNameStatistics proto containing the uniques stats."""
  result = statistics_pb2.FeatureNameStatistics()
  result.name = feature_name
  # If we have a categorical feature, we preserve the type to be the original
  # INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT
                 if is_categorical else
                 statistics_pb2.FeatureNameStatistics.STRING)
  result.string_stats.unique = count
  return result

def test_get_custom_stats_numeric(self):
  stats = text_format.Parse(
      """
      name: 'feature'
      custom_stats { name: 'abc' num: 100.0 }
      """, statistics_pb2.FeatureNameStatistics())
  self.assertEqual(stats_util.get_custom_stats(stats, 'abc'), 100.0)

def test_time_stats_generator_non_time_integers(self):
  """Tests that the generator handles integers that are not times."""
  # None of these numbers are valid times.
  input_batches = [
      pa.array([[1, 2]]),
  ]
  generator = time_stats_generator.TimeStatsGenerator(
      match_ratio=0.1, values_threshold=1)
  self.assertCombinerOutputEqual(input_batches, generator,
                                 statistics_pb2.FeatureNameStatistics())

def assert_on_two_protos_within_valid_error_but_different_name(self):
  expected = text_format.Parse(
      """
      name: 'a'
      custom_stats { name: 'MI' num: 2.5 }
      """, statistics_pb2.FeatureNameStatistics())
  actual = text_format.Parse(
      """
      name: 'b'
      custom_stats { name: 'MI' num: 2.45 }
      """, statistics_pb2.FeatureNameStatistics())
  test_util.assert_feature_proto_equal_with_error_on_custom_stats(
      self, actual, expected)

def test_get_custom_stats_string(self):
  stats = text_format.Parse(
      """
      name: 'feature'
      custom_stats { name: 'abc' str: 'xyz' }
      """, statistics_pb2.FeatureNameStatistics())
  self.assertEqual(stats_util.get_custom_stats(stats, 'abc'), 'xyz')

def _make_feature_stats_proto(string_stats, feature_name, is_categorical):
  """Convert the partial string statistics into FeatureNameStatistics proto."""
  result = statistics_pb2.FeatureNameStatistics()
  result.name = feature_name
  # If we have a categorical feature, we preserve the type to be the original
  # INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT
                 if is_categorical else
                 statistics_pb2.FeatureNameStatistics.STRING)
  result.string_stats.avg_length = (
      string_stats.total_bytes_length / string_stats.total_num_values)
  return result

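# A minimal worked sketch, not from the original source, of the avg_length
# computation above. _FakePartialStringStats is a hypothetical stand-in for
# the partial string statistics accumulator; only the two fields the function
# reads are modeled, and float inputs are used so the division is exact under
# either Python 2 or 3 semantics.
import collections

_FakePartialStringStats = collections.namedtuple(
    '_FakePartialStringStats', ['total_bytes_length', 'total_num_values'])

# 13 bytes across 10 values gives avg_length == 1.3, which round-trips through
# the float32 proto field as the 1.29999995232 seen in the categorical test
# above.
avg_proto = _make_feature_stats_proto(
    _FakePartialStringStats(total_bytes_length=13.0, total_num_values=10),
    feature_name='c', is_categorical=True)
assert abs(avg_proto.string_stats.avg_length - 1.3) < 1e-6
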
def test_get_custom_stats_not_found(self):
  stats = text_format.Parse(
      """
      name: 'feature'
      custom_stats { name: 'abc' num: 100.0 }
      """, statistics_pb2.FeatureNameStatistics())
  with self.assertRaisesRegexp(ValueError, 'Custom statistics.*not found'):
    stats_util.get_custom_stats(stats, 'xyz')

def test_basic_stats_generator_no_value_in_batch(self):
  batches = [
      pa.Table.from_arrays(
          [pa.array([[], [], []], type=pa.list_(pa.int64()))], ['a'])
  ]
  expected_result = {
      types.FeaturePath(['a']): text_format.Parse(
          """
          path { step: 'a' }
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                buckets { sample_count: 0.3 }
                type: QUANTILES
              }
            }
          }""", statistics_pb2.FeatureNameStatistics())
  }
  generator = basic_stats_generator.BasicStatsGenerator()
  self.assertCombinerOutputEqual(batches, generator, expected_result)

def _make_feature_stats_proto(numeric_stats, feature_name, quantiles_combiner,
                              num_histogram_buckets,
                              num_quantiles_histogram_buckets):
  """Convert the partial numeric statistics into FeatureNameStatistics proto."""
  numeric_stats_proto = statistics_pb2.NumericStatistics()

  # Set the stats in the proto only if we have at least one value for the
  # feature.
  if numeric_stats.total_num_values > 0:
    mean = numeric_stats.sum / numeric_stats.total_num_values
    variance = max(
        0, (numeric_stats.sum_of_squares / numeric_stats.total_num_values) -
        mean * mean)
    numeric_stats_proto.mean = float(mean)
    numeric_stats_proto.std_dev = math.sqrt(variance)
    numeric_stats_proto.num_zeros = numeric_stats.num_zeros
    numeric_stats_proto.min = float(numeric_stats.min)
    numeric_stats_proto.max = float(numeric_stats.max)

    # Extract the quantiles from the summary.
    quantiles = quantiles_combiner.extract_output(
        numeric_stats.quantiles_summary)

    # Find the median from the quantiles and update the numeric stats proto.
    numeric_stats_proto.median = float(quantiles_util.find_median(quantiles))

    # Construct the equi-width histogram from the quantiles and add it to the
    # numeric stats proto.
    std_histogram = quantiles_util.generate_equi_width_histogram(
        quantiles, numeric_stats.min, numeric_stats.max,
        numeric_stats.total_num_values, num_histogram_buckets)
    std_histogram.num_nan = numeric_stats.num_nan
    new_std_histogram = numeric_stats_proto.histograms.add()
    new_std_histogram.CopyFrom(std_histogram)

    # Construct the quantiles histogram from the quantiles and add it to the
    # numeric stats proto.
    q_histogram = quantiles_util.generate_quantiles_histogram(
        quantiles, numeric_stats.min, numeric_stats.max,
        numeric_stats.total_num_values, num_quantiles_histogram_buckets)
    q_histogram.num_nan = numeric_stats.num_nan
    new_q_histogram = numeric_stats_proto.histograms.add()
    new_q_histogram.CopyFrom(q_histogram)

  # Create a new FeatureNameStatistics proto.
  result = statistics_pb2.FeatureNameStatistics()
  result.name = feature_name
  result.type = numeric_stats.type
  result.num_stats.CopyFrom(numeric_stats_proto)
  return result

def _make_feature_stats_proto_uniques(
    feature_path: types.FeaturePath, num_uniques: int,
    is_categorical: bool) -> statistics_pb2.FeatureNameStatistics:
  """Makes a FeatureNameStatistics proto containing the uniques stats."""
  result = statistics_pb2.FeatureNameStatistics()
  result.path.CopyFrom(feature_path.to_proto())
  # If we have a categorical feature, we preserve the type to be the original
  # INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT
                 if is_categorical else
                 statistics_pb2.FeatureNameStatistics.STRING)
  result.string_stats.unique = num_uniques
  return result

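# A brief usage sketch, not from the original source: a categorical INT
# feature keeps its INT type even though the uniques count is recorded under
# string_stats.
uniques_proto = _make_feature_stats_proto_uniques(
    types.FeaturePath(['c']), num_uniques=6, is_categorical=True)
assert uniques_proto.type == statistics_pb2.FeatureNameStatistics.INT
assert uniques_proto.string_stats.unique == 6
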
def test_time_stats_generator_no_valid_formats(self):
  """Tests that the generator handles batches that contain no valid values."""
  # None of these values is in a valid time format.
  input_batches = [
      pa.array([['', '2018-Nov-30', '20183011']]),
      pa.array([['all/invalid', '2018-11-30invalid']]),
      pa.array([['invalid2018-11-30', 'invalid\n2018-11-30']])
  ]
  generator = time_stats_generator.TimeStatsGenerator(
      match_ratio=0.1, values_threshold=1)
  self.assertCombinerOutputEqual(input_batches, generator,
                                 statistics_pb2.FeatureNameStatistics())

def test_make_feature_stats_proto_topk_uniques_unordered(self):
  expected_result = text_format.Parse(
      """
      path { step: 'fa' }
      type: INT
      string_stats {
        unique: 4
        top_values { value: 'a' frequency: 4 }
        top_values { value: 'c' frequency: 3 }
        top_values { value: 'd' frequency: 2 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }
          buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }
        }
      }""", statistics_pb2.FeatureNameStatistics())
  value_counts = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
  top_k_value_count_list = [
      top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                 value_count[1])
      for value_count in value_counts
  ]
  result = top_k_uniques_stats_util.make_feature_stats_proto_topk_uniques(
      types.FeaturePath(['fa']),
      is_categorical=True,
      num_top_values=3,
      frequency_threshold=1,
      num_rank_histogram_buckets=2,
      num_unique=4,
      value_count_list=top_k_value_count_list)
  test_util.assert_feature_proto_equal(self, result, expected_result)

def test_nl_generator_invalidation_check(self):
  """Tests generator invalidation with fake heuristic."""
  # Expected to give 6 matches.
  input_batches = [
      pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
      pa.array([['MATCH', 'MATCH']]),
      # The incorrect type invalidates the accumulator.
      pa.array([[42]]),
  ]
  # No domain_info is generated because the incorrectly typed value 42
  # invalidated the stats.
  generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=1)
  self.assertCombinerOutputEqual(input_batches, generator,
                                 statistics_pb2.FeatureNameStatistics())

def test_nl_generator_utf8_check(self):
  """Tests generator utf8 check with fake heuristic."""
  # Expected to give 6 matches.
  input_batches = [
      pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
      pa.array([['MATCH', 'MATCH']]),
      # The non-UTF-8 string invalidates the accumulator.
      pa.array([[b'\xF0']]),
  ]
  # values_threshold=1 would be satisfied here; no stats are generated only
  # because of the non-UTF-8 value.
  generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=1)
  self.assertCombinerOutputEqual(input_batches, generator,
                                 statistics_pb2.FeatureNameStatistics())

def test_basic_stats_generator_only_nan(self):
  b1 = pa.Table.from_arrays(
      [pa.array([[np.NaN]], type=pa.list_(pa.float32()))], ['a'])
  batches = [b1]
  expected_result = {
      types.FeaturePath(['a']): text_format.Parse(
          """
          path { step: 'a' }
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 1
              min_num_values: 1
              max_num_values: 1
              avg_num_values: 1.0
              tot_num_values: 1
              num_values_histogram {
                buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                type: QUANTILES
              }
            }
            histograms { num_nan: 1 type: STANDARD }
            histograms { num_nan: 1 type: QUANTILES }
          }
          """, statistics_pb2.FeatureNameStatistics())
  }
  generator = basic_stats_generator.BasicStatsGenerator(
      num_values_histogram_buckets=2,
      num_histogram_buckets=3,
      num_quantiles_histogram_buckets=4)
  self.assertCombinerOutputEqual(batches, generator, expected_result)

def test_make_feature_stats_proto_with_topk_stats_weighted(self):
  expected_result = text_format.Parse(
      """
      path { step: 'fa' }
      type: STRING
      string_stats {
        weighted_string_stats {
          top_values { value: 'a' frequency: 4 }
          top_values { value: 'c' frequency: 3 }
          top_values { value: 'd' frequency: 2 }
          rank_histogram {
            buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }
            buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }
          }
        }
      }""", statistics_pb2.FeatureNameStatistics())
  value_counts = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
  top_k_value_count_list = [
      top_k_uniques_stats_generator.FeatureValueCount(value_count[0],
                                                      value_count[1])
      for value_count in value_counts
  ]
  result = (
      top_k_uniques_stats_generator.make_feature_stats_proto_with_topk_stats(
          types.FeaturePath(['fa']),
          top_k_value_count_list,
          False,  # is_categorical
          True,  # is_weighted_stats
          3,  # num_top_values
          1,  # frequency_threshold
          2))  # num_rank_histogram_buckets
  compare.assertProtoEqual(self, result, expected_result)

def test_nl_generator_values_threshold_check(self):
  """Tests generator values threshold with fake heuristic."""
  # Expected to give 6 matches.
  input_batches = [
      pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
      pa.array([['MATCH', 'MATCH']]),
      # Nones should be ignored.
      pa.array([None, None]),
  ]
  # Try generators with values_threshold=7 (should not create stats) and
  # values_threshold=6 (should create stats).
  generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=7)
  self.assertCombinerOutputEqual(input_batches, generator,
                                 statistics_pb2.FeatureNameStatistics())

  generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=6)
  self.assertCombinerOutputEqual(
      input_batches, generator,
      statistics_pb2.FeatureNameStatistics(custom_stats=[
          statistics_pb2.CustomStatistic(
              name='domain_info', str='natural_language_domain {}'),
          statistics_pb2.CustomStatistic(
              name='natural_language_match_rate', num=1.0)
      ]))

def extract_output(
    self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats]
) -> statistics_pb2.DatasetFeatureStatistics:
  result = statistics_pb2.DatasetFeatureStatistics()
  for feature_path, partial_stats in accumulator.items():
    feature_result = statistics_pb2.FeatureNameStatistics()
    feature_result.path.CopyFrom(feature_path.to_proto())
    feature_result.custom_stats.add(
        name='missing_value', num=partial_stats.value_feature_num_missing)
    index_features_num_missing_histogram = statistics_pb2.RankHistogram()
    max_length_diff_histogram = statistics_pb2.RankHistogram()
    min_length_diff_histogram = statistics_pb2.RankHistogram()
    # Sort to get deterministic ordering of the buckets in the custom stat.
    for index_feature in sorted(partial_stats.index_features_num_missing):
      # The label is the last step in the feature path (and shares the parent
      # with the sparse feature).
      label = index_feature.steps()[-1]
      missing_bucket = index_features_num_missing_histogram.buckets.add()
      missing_bucket.label = label
      missing_bucket.sample_count = (
          partial_stats.index_features_num_missing[index_feature])
      max_length_bucket = max_length_diff_histogram.buckets.add()
      max_length_bucket.label = label
      max_length_bucket.sample_count = (
          partial_stats.index_features_max_length_diff[index_feature])
      min_length_bucket = min_length_diff_histogram.buckets.add()
      min_length_bucket.label = label
      min_length_bucket.sample_count = (
          partial_stats.index_features_min_length_diff[index_feature])
    feature_result.custom_stats.add(
        name='missing_index',
        rank_histogram=index_features_num_missing_histogram)
    feature_result.custom_stats.add(
        name='max_length_diff', rank_histogram=max_length_diff_histogram)
    feature_result.custom_stats.add(
        name='min_length_diff', rank_histogram=min_length_diff_histogram)
    new_feature_stats_proto = result.features.add()
    new_feature_stats_proto.CopyFrom(feature_result)
  return result

def test_time_stats_generator_inconsistent_type_invalidation_check(self):
  """Tests that generator invalidates stats if inconsistent types are used."""
  # Absent invalidation, this is expected to give 6 matches.
  input_batches = [
      pa.array([['2018-11-30', '2018-11-30', '2018-11-30'], ['2018-11-30']]),
      pa.array([['2018-11-30', '2018-11-30']]),
      pa.array([[1.0]]),
  ]
  # No domain_info should be generated, as the incorrect type of the 1.0
  # value should invalidate the stats. Absent this type issue, these examples
  # would satisfy the specified match_ratio and values_threshold.
  generator = time_stats_generator.TimeStatsGenerator(
      match_ratio=0.5, values_threshold=1)
  self.assertCombinerOutputEqual(input_batches, generator,
                                 statistics_pb2.FeatureNameStatistics())
