def extract_output(self, accumulator):
  """Converts accumulated per-feature value counts into a statistics proto.

  Args:
    accumulator: Mapping from feature path to an object exposing
      `unweighted_counts` and `weighted_counts` dicts of value -> count.

  Returns:
    A DatasetFeatureStatistics proto holding top-k / uniques stats, built by
    `_make_dataset_feature_stats_proto_with_multiple_features`.
  """
  def _to_feature_value_counts(counts):
    # Shared conversion used by both the unweighted and weighted branches
    # (previously duplicated inline).
    return [
        top_k_uniques_stats_generator.FeatureValueCount(value, count)
        for value, count in counts.items()
    ]

  feature_paths_to_value_counts = {}
  feature_paths_to_weighted_value_counts = {}
  for feature_path, value_counts in accumulator.items():
    # Empty count dicts are skipped so features with no observed values do
    # not appear in the output proto.
    if value_counts.unweighted_counts:
      feature_paths_to_value_counts[feature_path] = _to_feature_value_counts(
          value_counts.unweighted_counts)
    if value_counts.weighted_counts:
      feature_paths_to_weighted_value_counts[feature_path] = (
          _to_feature_value_counts(value_counts.weighted_counts))
  return _make_dataset_feature_stats_proto_with_multiple_features(
      feature_paths_to_value_counts, feature_paths_to_weighted_value_counts,
      self._categorical_features, self._num_top_values,
      self._frequency_threshold, self._weighted_frequency_threshold,
      self._num_rank_histogram_buckets)
def test_make_feature_stats_proto_with_topk_stats_weighted(self):
  """Checks that weighted top-k value counts produce the expected proto."""
  expected_result = text_format.Parse(
      """ path { step: 'fa' } type: STRING string_stats { weighted_string_stats { top_values { value: 'a' frequency: 4 } top_values { value: 'c' frequency: 3 } top_values { value: 'd' frequency: 2 } rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 } buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 } } } }""",
      statistics_pb2.FeatureNameStatistics())
  # Build the input value counts; 'b' ties with 'd' but falls outside the
  # requested top 3 values.
  raw_counts = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
  top_k_value_count_list = []
  for value, count in raw_counts:
    top_k_value_count_list.append(
        top_k_uniques_stats_generator.FeatureValueCount(value, count))
  result = (
      top_k_uniques_stats_generator
      .make_feature_stats_proto_with_topk_stats(
          types.FeaturePath(['fa']), top_k_value_count_list, False, True,
          3, 1, 2))
  compare.assertProtoEqual(self, result, expected_result)