Example #1
0
    def extract_output(self, accumulator):
        """Turn the accumulated per-feature value counts into the final result.

        For every feature in `accumulator`, each non-empty (truthy) counts
        mapping is converted into a list of
        `top_k_stats_generator.FeatureValueCount` entries. Features whose
        mapping is empty are left out of the corresponding dict.

        Args:
            accumulator: Mapping from feature name to an object exposing
                `unweighted_counts` and `weighted_counts`, each a mapping
                whose `items()` yields (value, count) pairs.

        Returns:
            The result of
            `_make_dataset_feature_stats_proto_with_multiple_features` called
            with the collected counts and this instance's
            `_categorical_features`, `_num_top_values` and
            `_num_rank_histogram_buckets`.
        """
        unweighted_by_feature = {}
        weighted_by_feature = {}

        for name, counts in accumulator.items():
            unweighted = counts.unweighted_counts
            if unweighted:
                unweighted_by_feature[name] = [
                    top_k_stats_generator.FeatureValueCount(value, count)
                    for value, count in unweighted.items()
                ]

            weighted = counts.weighted_counts
            if weighted:
                weighted_by_feature[name] = [
                    top_k_stats_generator.FeatureValueCount(value, count)
                    for value, count in weighted.items()
                ]

        return _make_dataset_feature_stats_proto_with_multiple_features(
            unweighted_by_feature,
            weighted_by_feature,
            self._categorical_features,
            self._num_top_values,
            self._num_rank_histogram_buckets)
Example #2
0
 def test_make_feature_stats_proto_with_topk_stats_weighted(self):
   """Checks the weighted string stats proto built for feature 'fa'.

   Four (value, count) pairs are passed in, together with the trailing
   arguments 3 and 2. The expected proto holds three `top_values` entries
   and two `rank_histogram` buckets under `weighted_string_stats`.
   """
   # NOTE(review): the positional arguments False, True, 3, 2 presumably mean
   # is_categorical, is_weighted, number of top values and number of
   # rank-histogram buckets -- confirm against the generator's signature.
   expected_text = """
       name: 'fa'
       type: STRING
       string_stats {
         weighted_string_stats {
           top_values {
             value: 'a'
             frequency: 4
           }
           top_values {
             value: 'c'
             frequency: 3
           }
           top_values {
             value: 'd'
             frequency: 2
           }
           rank_histogram {
             buckets {
               low_rank: 0
               high_rank: 0
               label: "a"
               sample_count: 4.0
             }
             buckets {
               low_rank: 1
               high_rank: 1
               label: "c"
               sample_count: 3.0
             }
           }
         }
   }"""
   expected_proto = text_format.Parse(
       expected_text, statistics_pb2.FeatureNameStatistics())

   # 'd' and 'b' both have count 2; the expected proto lists 'd' (supplied
   # first) as the third top value and omits 'b'.
   raw_counts = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
   feature_value_counts = [
       top_k_stats_generator.FeatureValueCount(value, count)
       for value, count in raw_counts
   ]

   actual_proto = (
       top_k_stats_generator.make_feature_stats_proto_with_topk_stats(
           'fa', feature_value_counts, False, True, 3, 2))
   compare.assertProtoEqual(self, actual_proto, expected_proto)