Beispiel #1
0
    def extract_output(self, accumulator):
        """Turn the accumulated per-feature counts into the output proto.

        Args:
          accumulator: Mapping whose `.items()` yields
            `(feature_path, value_counts)` pairs. Each `value_counts`
            exposes `unweighted_counts` and `weighted_counts`, both
            mappings (presumably feature value -> count; confirm against
            the accumulator definition).

        Returns:
          The result of
          `_make_dataset_feature_stats_proto_with_multiple_features` called
          with the collected counts and this generator's settings.
        """

        def _to_feature_value_counts(counts):
            # Wrap every (key, value) entry in a FeatureValueCount.
            return [
                top_k_uniques_stats_generator.FeatureValueCount(k, v)
                for k, v in counts.items()
            ]

        # A feature path is only included in a table when its corresponding
        # counts mapping is non-empty (truthy).
        unweighted_by_path = {
            path: _to_feature_value_counts(counts.unweighted_counts)
            for path, counts in accumulator.items()
            if counts.unweighted_counts
        }
        weighted_by_path = {
            path: _to_feature_value_counts(counts.weighted_counts)
            for path, counts in accumulator.items()
            if counts.weighted_counts
        }

        return _make_dataset_feature_stats_proto_with_multiple_features(
            unweighted_by_path,
            weighted_by_path,
            self._categorical_features,
            self._num_top_values,
            self._frequency_threshold,
            self._weighted_frequency_threshold,
            self._num_rank_histogram_buckets)
 def test_make_feature_stats_proto_with_topk_stats_weighted(self):
   """Checks the weighted top-k proto built from a list of value counts.

   Four (value, count) pairs go in; the expected proto keeps three
   `top_values` and two `rank_histogram` buckets under
   `weighted_string_stats`.
   """
   expected_text = """
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          weighted_string_stats {
            top_values {
              value: 'a'
              frequency: 4
            }
            top_values {
              value: 'c'
              frequency: 3
            }
            top_values {
              value: 'd'
              frequency: 2
            }
            rank_histogram {
              buckets {
                low_rank: 0
                high_rank: 0
                label: "a"
                sample_count: 4.0
              }
              buckets {
                low_rank: 1
                high_rank: 1
                label: "c"
                sample_count: 3.0
              }
            }
          }
   }"""
   expected_result = text_format.Parse(
       expected_text, statistics_pb2.FeatureNameStatistics())

   # 'd' and 'b' share the same count; only 'd' appears in the expected
   # top values above.
   raw_counts = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
   feature_value_counts = [
       top_k_uniques_stats_generator.FeatureValueCount(value, count)
       for value, count in raw_counts
   ]

   make_stats = (
       top_k_uniques_stats_generator.make_feature_stats_proto_with_topk_stats)
   feature_path = types.FeaturePath(['fa'])
   # NOTE(review): the trailing positional arguments are presumably
   # is_categorical, is_weighted_stats, num_top_values,
   # frequency_threshold and num_rank_histogram_buckets -- confirm against
   # the function signature.
   result = make_stats(
       feature_path, feature_value_counts, False, True, 3, 1, 2)

   compare.assertProtoEqual(self, result, expected_result)