コード例 #1
0
def _make_feature_stats_proto(feature_name, value_count_list,
                              weighted_value_count_list, is_categorical,
                              num_top_values, num_rank_histogram_buckets):
    """Builds a FeatureNameStatistics proto with top-k and unique-count stats.

    Args:
        feature_name: Forwarded to the top-k proto builder.
        value_count_list: Unweighted value counts. Used for the unweighted
            top-k stats; its length is recorded as the number of uniques.
        weighted_value_count_list: Weighted value counts. When falsy (e.g.
            None or empty), no weighted stats are added.
        is_categorical: Forwarded to the top-k proto builder.
        num_top_values: Forwarded to the top-k proto builder.
        num_rank_histogram_buckets: Forwarded to the top-k proto builder.

    Returns:
        The proto produced by the top-k builder for the unweighted counts,
        augmented with weighted string stats (when weights are available) and
        with `string_stats.unique` set.
    """
    build_topk_proto = (
        top_k_stats_generator.make_feature_stats_proto_with_topk_stats)

    # Start from the unweighted top-k stats (fourth argument False).
    stats_proto = build_topk_proto(
        feature_name, value_count_list, is_categorical, False,
        num_top_values, num_rank_histogram_buckets)

    # With weights present, build a second proto from the weighted counts
    # (fourth argument True) and graft only its weighted string stats onto
    # the proto being returned.
    if weighted_value_count_list:
        weighted_proto = build_topk_proto(
            feature_name, weighted_value_count_list, is_categorical, True,
            num_top_values, num_rank_histogram_buckets)
        stats_proto.string_stats.weighted_string_stats.CopyFrom(
            weighted_proto.string_stats.weighted_string_stats)

    # The unique count is the number of entries in the unweighted list.
    stats_proto.string_stats.unique = len(value_count_list)

    return stats_proto
コード例 #2
0
 def test_make_feature_stats_proto_with_topk_stats_weighted(self):
   """Verifies the weighted top-k stats proto built for a string feature.

   Four (value, count) pairs are supplied while the call asks for 3 and 2 in
   its last two arguments; the expected proto accordingly holds three
   top_values entries and two rank_histogram buckets, all nested under
   weighted_string_stats.
   """
   # Expected proto, in text format.
   expected_text = """
       name: 'fa'
       type: STRING
       string_stats {
         weighted_string_stats {
           top_values {
             value: 'a'
             frequency: 4
           }
           top_values {
             value: 'c'
             frequency: 3
           }
           top_values {
             value: 'd'
             frequency: 2
           }
           rank_histogram {
             buckets {
               low_rank: 0
               high_rank: 0
               label: "a"
               sample_count: 4.0
             }
             buckets {
               low_rank: 1
               high_rank: 1
               label: "c"
               sample_count: 3.0
             }
           }
         }
   }"""
   expected_proto = text_format.Parse(
       expected_text, statistics_pb2.FeatureNameStatistics())

   # Wrap each raw (value, count) pair in a FeatureValueCount.
   raw_pairs = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
   feature_value_counts = []
   for value, count in raw_pairs:
     feature_value_counts.append(
         top_k_stats_generator.FeatureValueCount(value, count))

   # Fourth argument True requests the weighted variant of the stats.
   actual_proto = (
       top_k_stats_generator.make_feature_stats_proto_with_topk_stats(
           'fa', feature_value_counts, False, True, 3, 2))

   compare.assertProtoEqual(self, actual_proto, expected_proto)