def _make_dataset_feature_stats_proto_with_uniques_for_single_feature(
        feature_path_to_value_count, categorical_features):
    """Builds a serialized single-feature proto holding uniques stats.

    Args:
      feature_path_to_value_count: A pair shaped like
        ((slice_key, feature_path_tuple), count).
      categorical_features: Container tested for membership of the feature
        path; the result is forwarded to the per-feature proto builder.

    Returns:
      A (slice_key, serialized DatasetFeatureStatistics) tuple.
    """
    key, num_uniques = feature_path_to_value_count
    slice_key, path_tuple = key
    path = types.FeaturePath(path_tuple)
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    is_categorical = path in categorical_features
    feature_stats = _make_feature_stats_proto_with_uniques_stats(
        path, num_uniques, is_categorical)
    dataset_stats.features.add().CopyFrom(feature_stats)
    return slice_key, dataset_stats.SerializeToString()
def make_dataset_feature_stats_proto_unique_single(
    feature_path_tuple: types.FeaturePathTuple,
    num_uniques: int,
) -> statistics_pb2.DatasetFeatureStatistics:
    """Wraps the uniques stats of one feature in a dataset-level proto.

    Args:
      feature_path_tuple: Tuple form of the feature path; it is converted to
        a types.FeaturePath before use.
      num_uniques: Unique-value count forwarded to the per-feature builder.

    Returns:
      A DatasetFeatureStatistics proto containing exactly one feature entry.
    """
    path = types.FeaturePath(feature_path_tuple)
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    single_feature = dataset_stats.features.add()
    single_feature.CopyFrom(_make_feature_stats_proto_uniques(path, num_uniques))
    return dataset_stats
def _add_slice_key(stats_proto_per_slice, is_slicing_enabled):
    """Returns a copy of the stats proto, named by its slice key if slicing.

    Args:
      stats_proto_per_slice: Indexable pair; element 0 is the slice key and
        element 1 is the DatasetFeatureStatistics proto.
      is_slicing_enabled: When truthy, the copy's `name` is set to the slice
        key; otherwise `name` is left untouched.

    Returns:
      A new DatasetFeatureStatistics proto (the input proto is not modified).
    """
    keyed_stats = statistics_pb2.DatasetFeatureStatistics()
    keyed_stats.CopyFrom(stats_proto_per_slice[1])
    if not is_slicing_enabled:
        return keyed_stats
    keyed_stats.name = stats_proto_per_slice[0]
    return keyed_stats
def _add_slice_key(
    stats_proto_per_slice: Tuple[types.SliceKey,
                                 statistics_pb2.DatasetFeatureStatistics],
    is_slicing_enabled: bool
) -> statistics_pb2.DatasetFeatureStatistics:
    """Copies a per-slice stats proto and tags it with the slice key.

    Args:
      stats_proto_per_slice: (slice key, stats proto) pair.
      is_slicing_enabled: Whether to write the slice key into the `name`
        field of the returned copy.

    Returns:
      A fresh DatasetFeatureStatistics proto copied from the input proto.
    """
    tagged = statistics_pb2.DatasetFeatureStatistics()
    tagged.CopyFrom(stats_proto_per_slice[1])
    if is_slicing_enabled:
        # Only read the slice key when it is actually needed.
        tagged.name = stats_proto_per_slice[0]
    return tagged
def test_topk_with_single_string_feature(self):
    """Checks top-k values and rank histogram for one string feature."""
    # Frequencies of 'fa' across the examples: a=4, c=3, b=2, d=2, e=1.
    first = {'fa': np.array(['a', 'b', 'c', 'e'])}
    second = {'fa': np.array(['a', 'c', 'd', 'a'])}
    third = {'fa': np.array(['a', 'b', 'c', 'd'])}
    examples = [first, second, third]
    # When two values tie on frequency, the expectation lists the
    # lexicographically larger value first ('d' ahead of 'b').
    expected_text = (
        " features { name: 'fa' type: STRING string_stats {"
        " top_values { value: 'a' frequency: 4 }"
        " top_values { value: 'c' frequency: 3 }"
        " top_values { value: 'd' frequency: 2 }"
        " top_values { value: 'b' frequency: 2 }"
        " rank_histogram {"
        ' buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }'
        ' buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }'
        ' buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }'
        " } } }")
    expected_result = text_format.Parse(
        expected_text, statistics_pb2.DatasetFeatureStatistics())
    generator = top_k_stats_generator.TopKStatsGenerator(
        num_top_values=4, num_rank_histogram_buckets=3)
    self.assertTransformOutputEqual(examples, generator, [expected_result])
def test_make_dataset_feature_stats_proto_topk_single(self):
    """Checks the single-feature top-k proto built from value counts."""
    expected_text = (
        ' features { string_stats {'
        ' top_values { value: "e" frequency: 20.0 }'
        ' top_values { value: "d" frequency: 20.0 }'
        ' top_values { value: "a" frequency: 15.0 }'
        ' rank_histogram {'
        ' buckets { label: "e" sample_count: 20.0 }'
        ' buckets { low_rank: 1 high_rank: 1 label: "d" sample_count: 20.0 }'
        ' } }'
        ' path { step: "fa" } }')
    expected_result = text_format.Parse(
        expected_text, statistics_pb2.DatasetFeatureStatistics())
    # (value, count) pairs, already in descending count order.
    value_counts = [('e', 20), ('d', 20), ('a', 15), ('c', 10), ('b', 5)]
    value_count_list = [
        top_k_uniques_stats_util.FeatureValueCount(value, count)
        for value, count in value_counts
    ]
    categorical = frozenset(
        [types.FeaturePath(['fa']), types.FeaturePath(['fb'])])
    result = top_k_uniques_stats_util.make_dataset_feature_stats_proto_topk_single(
        types.FeaturePath(['fa']).steps(),
        value_count_list=value_count_list,
        categorical_features=categorical,
        is_weighted_stats=False,
        num_top_values=3,
        frequency_threshold=1,
        num_rank_histogram_buckets=2)
    test_util.assert_dataset_feature_stats_proto_equal(
        self, result, expected_result)
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, _LiftSeries],
    y_path: types.FeaturePath,
    y_boundaries: Optional[np.ndarray]
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
    """Builds the cross-feature lift stats proto for one (slice, x_path) key.

    Args:
      lifts: A (key, lift_series_list) pair. `key` supplies `x_path` and
        `slice_key`; each lift series supplies `y`, `y_count` and
        `lift_values`, and each lift value supplies `x`, `lift`, `xy_count`
        and `x_count`. Overall shape:
        (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
      y_path: The path used as Y in the lift expression:
        lift = P(Y=y|X=x) / P(Y=y).
      y_boundaries: Optional bin boundaries. When given, each y is reported
        as a bucket (low/high value) instead of a raw string or int.

    Returns:
      A (slice_key, DatasetFeatureStatistics) tuple with a single
      cross_features entry for (x_path, y_path).
    """
    sliced_key, series_for_key = lifts
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    cross_feature = dataset_stats.cross_features.add(
        path_x=sliced_key.x_path.to_proto(), path_y=y_path.to_proto())

    for series in sorted(series_for_key):
        series_proto = cross_feature.categorical_cross_stats.lift_series.add(
            y_count=series.y_count)
        y_value = series.y
        if y_boundaries is None:
            if isinstance(y_value, six.string_types):
                series_proto.y_string = y_value
            else:
                series_proto.y_int = y_value
        else:
            low, high = bin_util.get_boundaries(y_value, y_boundaries)
            series_proto.y_bucket.low_value = low
            series_proto.y_bucket.high_value = high

        # Top-k and bottom-k x values may overlap; keep one entry per x
        # (the last one seen wins).
        unique_by_x = {}
        for candidate in series.lift_values:
            unique_by_x[candidate.x] = candidate

        # Order by lift descending, breaking ties by x ascending.
        ordered = list(unique_by_x.values())
        ordered.sort(key=lambda item: (-item.lift, item.x))

        for item in ordered:
            value_proto = series_proto.lift_values.add(
                lift=item.lift,
                x_count=item.x_count,
                x_and_y_count=item.xy_count)
            x_value = item.x
            if isinstance(x_value, six.string_types):
                value_proto.x_string = x_value
            else:
                value_proto.x_int = x_value

    return sliced_key.slice_key, dataset_stats
def assert_on_two_protos_with_same_features_in_different_order(self):
    """Compares two protos whose features are identical but reordered.

    NOTE(review): the method name does not start with `test_`, so it is
    presumably not picked up by unittest discovery -- confirm intent.
    """
    fa_text = " features { path { step: 'fa' } type: STRING string_stats { unique: 4 } }"
    fb_text = " features { path { step: 'fb' } type: STRING string_stats { unique: 5 } }"
    # Same two features, listed in opposite order.
    expected = text_format.Parse(
        fb_text + fa_text, statistics_pb2.DatasetFeatureStatistics())
    actual = text_format.Parse(
        fa_text + fb_text, statistics_pb2.DatasetFeatureStatistics())
    test_util.assert_dataset_feature_stats_proto_equal(self, actual, expected)
def _merge_dataset_feature_stats_protos(
    stats_protos: Iterable[statistics_pb2.DatasetFeatureStatistics]
) -> statistics_pb2.DatasetFeatureStatistics:
    """Merges together a list of DatasetFeatureStatistics protos.

    FeatureNameStatistics entries that share a feature path are merged with
    `MergeFrom`. Merging is done on private copies, so none of the input
    protos is modified.

    Args:
      stats_protos: DatasetFeatureStatistics protos to merge.

    Returns:
      The merged DatasetFeatureStatistics proto. `num_examples` is taken from
      the first merged feature whose num_stats/string_stats has common_stats
      (num_non_missing + num_missing); it is left unset if none has.
    """
    merged_per_feature = {}
    for stats_proto in stats_protos:
        for feature_stats_proto in stats_proto.features:
            feature_path = types.FeaturePath.from_proto(
                feature_stats_proto.path)
            if feature_path not in merged_per_feature:
                # Copy instead of aliasing the caller's proto: later merges
                # would otherwise mutate the input in place.
                merged = statistics_pb2.FeatureNameStatistics()
                merged.CopyFrom(feature_stats_proto)
                merged_per_feature[feature_path] = merged
            else:
                merged = merged_per_feature[feature_path]
                # MergeFrom would concatenate repeated fields, which is not
                # what we want for path.step, so clear it first.
                del merged.path.step[:]
                merged.MergeFrom(feature_stats_proto)

    result = statistics_pb2.DatasetFeatureStatistics()
    num_examples = None
    for merged in six.itervalues(merged_per_feature):
        result.features.add().CopyFrom(merged)
        if num_examples is not None:
            continue
        # Derive the example count from the first feature that carries
        # common stats.
        if merged.WhichOneof('stats') == 'num_stats':
            typed_stats = merged.num_stats
        else:
            typed_stats = merged.string_stats
        if typed_stats.HasField('common_stats'):
            common_stats = typed_stats.common_stats
            num_examples = (common_stats.num_non_missing +
                            common_stats.num_missing)

    if num_examples is not None:
        result.num_examples = num_examples
    return result
def extract_output(self, accumulator: List[float]
                   ) -> statistics_pb2.DatasetFeatureStatistics:
    """Emits the accumulated counts as custom stats on a dummy feature.

    Also increments the 'num_instances' Beam counter by `accumulator[0]`.

    Args:
      accumulator: Indexable; element 0 is stored under _NUM_EXAMPLES_KEY and
        element 1 under _WEIGHTED_NUM_EXAMPLES_KEY.

    Returns:
      A DatasetFeatureStatistics proto with one feature at
      _DUMMY_FEATURE_PATH carrying the two custom stats.
    """
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    placeholder = dataset_stats.features.add()
    placeholder.path.CopyFrom(_DUMMY_FEATURE_PATH.to_proto())
    for stat_name, position in ((_NUM_EXAMPLES_KEY, 0),
                                (_WEIGHTED_NUM_EXAMPLES_KEY, 1)):
        placeholder.custom_stats.add(name=stat_name, num=accumulator[position])
    instance_counter = beam.metrics.Metrics.counter(
        constants.METRICS_NAMESPACE, 'num_instances')
    instance_counter.inc(accumulator[0])
    return dataset_stats
def test_topk_with_single_unicode_feature(self):
    """Checks top-k values and rank histogram for a unicode-typed feature."""
    # Frequencies of 'fa' across both batches: a=4, c=3, b=2, d=2, e=1.
    first_batch = {
        'fa': np.array(
            [np.array(['a', 'b', 'c', 'e']), np.array(['a', 'c', 'd', 'a'])],
            dtype=np.unicode_)
    }
    second_batch = {
        'fa': np.array([np.array(['a', 'b', 'c', 'd'])], dtype=np.unicode_)
    }
    batches = [first_batch, second_batch]
    expected_text = (
        " features { name: 'fa' type: STRING string_stats {"
        " top_values { value: 'a' frequency: 4 }"
        " top_values { value: 'c' frequency: 3 }"
        " top_values { value: 'd' frequency: 2 }"
        " top_values { value: 'b' frequency: 2 }"
        " rank_histogram {"
        ' buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }'
        ' buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }'
        ' buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }'
        " } } }")
    expected_result = text_format.Parse(
        expected_text, statistics_pb2.DatasetFeatureStatistics())
    generator = top_k_stats_generator.TopKStatsGenerator(
        num_top_values=4, num_rank_histogram_buckets=3)
    self.assertTransformOutputEqual(batches, generator, [expected_result])
def extract_output(self, accumulator):
    """Emits the accumulated counts as custom stats on a dummy feature.

    Also increments the 'num_instances' Beam counter by `accumulator[0]`.

    Args:
      accumulator: Indexable; element 0 is stored under _NUM_EXAMPLES_KEY and
        element 1 under _WEIGHTED_NUM_EXAMPLES_KEY.

    Returns:
      A DatasetFeatureStatistics proto with one feature named
      _DUMMY_FEATURE_NAME carrying the two custom stats.
    """
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    placeholder = dataset_stats.features.add()
    placeholder.name = _DUMMY_FEATURE_NAME
    for stat_name, position in ((_NUM_EXAMPLES_KEY, 0),
                                (_WEIGHTED_NUM_EXAMPLES_KEY, 1)):
        placeholder.custom_stats.add(name=stat_name, num=accumulator[position])
    instance_counter = beam.metrics.Metrics.counter(
        constants.METRICS_NAMESPACE, 'num_instances')
    instance_counter.inc(accumulator[0])
    return dataset_stats
def test_topk_with_categorical_feature(self):
    """Checks top-k output for an INT feature marked categorical in the schema."""
    # Frequencies of 'fa' across both batches: 12 -> 4; 23, 34, 45 -> 2 each.
    batches = [
        {
            # The two rows have different lengths. NumPy >= 1.24 refuses to
            # build such a ragged array implicitly, so request an object
            # array explicitly (older NumPy produced exactly that anyway).
            'fa': np.array(
                [np.array([12, 23, 34, 12]), np.array([45, 23])],
                dtype=object)
        },
        {'fa': np.array([np.array([12, 12, 34, 45])])},
    ]
    expected_result_fa = text_format.Parse(
        " features { name: 'fa' type: INT string_stats {"
        " top_values { value: '12' frequency: 4 }"
        " top_values { value: '45' frequency: 2 }"
        " rank_histogram {"
        ' buckets { low_rank: 0 high_rank: 0 label: "12" sample_count: 4.0 }'
        ' buckets { low_rank: 1 high_rank: 1 label: "45" sample_count: 2.0 }'
        ' buckets { low_rank: 2 high_rank: 2 label: "34" sample_count: 2.0 }'
        " } } }",
        statistics_pb2.DatasetFeatureStatistics())
    schema = text_format.Parse(
        ' feature { name: "fa" type: INT int_domain { is_categorical: true } } ',
        schema_pb2.Schema())
    generator = top_k_stats_generator.TopKStatsGenerator(
        schema=schema, num_top_values=2, num_rank_histogram_buckets=3)
    self.assertTransformOutputEqual(batches, generator, [expected_result_fa])
def extract_output(
    self, accumulator: List[float]
) -> statistics_pb2.DatasetFeatureStatistics:
    """Emits the accumulated counts as custom stats on a dummy feature.

    Args:
      accumulator: Indexable; element 0 is stored under _NUM_EXAMPLES_KEY and
        element 1 under _WEIGHTED_NUM_EXAMPLES_KEY.

    Returns:
      A DatasetFeatureStatistics proto with one feature at
      _DUMMY_FEATURE_PATH carrying the two custom stats.
    """
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    placeholder = dataset_stats.features.add()
    placeholder.path.CopyFrom(_DUMMY_FEATURE_PATH.to_proto())
    for stat_name, position in ((_NUM_EXAMPLES_KEY, 0),
                                (_WEIGHTED_NUM_EXAMPLES_KEY, 1)):
        placeholder.custom_stats.add(name=stat_name, num=accumulator[position])
    return dataset_stats
def test_mi_with_imputed_categorical_feature(self):
    """Checks MI stats for a categorical feature that has missing rows."""
    labels = pa.array([[0], [2], [0], [1], [2], [1], [1]])
    # Rows 2 and 3 of the feature are missing (None).
    feature_values = pa.array(
        [["Red"], ["Blue"], None, None, ["Blue"], ["Green"], ["Green"]])
    batch = pa.RecordBatch.from_arrays(
        [labels, feature_values], ["label_key", "fa"])

    schema_text = (
        ' feature { name: "label_key" type: INT shape { dim { size: 1 } }'
        ' int_domain { is_categorical: true } }'
        ' feature { name: "fa" type: BYTES shape { dim { size: 1 } } } ')
    schema = text_format.Parse(schema_text, schema_pb2.Schema())

    expected_text = (
        ' features { path { step: "fa" }'
        " custom_stats { name: 'sklearn_adjusted_mutual_information' num: 0.3960841 }"
        " custom_stats { name: 'sklearn_mutual_information' num: 0.8809502 }"
        ' custom_stats { name: "sklearn_normalized_adjusted_mutual_information" num: 0.4568877 }'
        ' }')
    expected = text_format.Parse(
        expected_text, statistics_pb2.DatasetFeatureStatistics())

    label_path = types.FeaturePath(["label_key"])
    self._assert_mi_output_equal(batch, expected, schema, label_path)
def _make_dataset_feature_stats_proto_with_single_feature(
        feature_name_to_value_count_list, categorical_features,
        num_top_values, num_rank_histogram_buckets):
    """Wraps the stats of one feature in a DatasetFeatureStatistics proto.

    Args:
      feature_name_to_value_count_list: Indexable pair; element 0 is the
        feature name and element 1 is forwarded as its value-count list.
      categorical_features: Container tested for membership of the name.
      num_top_values: Forwarded to the per-feature proto builder.
      num_rank_histogram_buckets: Forwarded to the per-feature proto builder.

    Returns:
      A DatasetFeatureStatistics proto with a single feature entry.
    """
    name = feature_name_to_value_count_list[0]
    counts = feature_name_to_value_count_list[1]
    is_categorical = name in categorical_features
    feature_stats = _make_feature_stats_proto(
        name, counts, is_categorical, num_top_values,
        num_rank_histogram_buckets)
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    dataset_stats.features.add().CopyFrom(feature_stats)
    return dataset_stats
def test_mi_with_imputed_numerical_label(self):
    """Checks MI stats when the numeric label has a NaN and a missing row."""
    # Row 5 of the label is NaN and row 6 is missing (None). Use `np.nan`:
    # the `np.NaN` alias was removed in NumPy 2.0.
    label_array = pa.array([
        [0.1], [0.2], [0.8], [0.7], [0.2], [np.nan], None,
        [0.1], [0.2], [0.8], [0.7], [0.2], [0.2], [0.3]])
    feat_array = pa.array([
        [0.1], [0.2], [0.8], [0.7], [0.2], [0.2], [0.3],
        [0.1], [0.2], [0.8], [0.7], [0.2], [0.2], [0.3]])
    batch = pa.RecordBatch.from_arrays(
        [label_array, feat_array], ["label_key", "fa"])

    schema = text_format.Parse(
        ' feature { name: "fa" type: FLOAT shape { dim { size: 1 } } }'
        ' feature { name: "label_key" type: FLOAT shape { dim { size: 1 } } } ',
        schema_pb2.Schema())

    expected = text_format.Parse(
        ' features { path { step: "fa" }'
        ' custom_stats { name: "sklearn_adjusted_mutual_information" num: 0.2640041 }'
        ' custom_stats { name: "sklearn_mutual_information" num: 0.3825569 }'
        ' custom_stats { name: "sklearn_normalized_adjusted_mutual_information" num: 0.244306 }'
        ' }',
        statistics_pb2.DatasetFeatureStatistics())

    self._assert_mi_output_equal(
        batch, expected, schema, types.FeaturePath(["label_key"]))
def _make_dataset_feature_stats_proto_with_single_feature(
        feature_name_to_value_count, categorical_features):
    """Generates a DatasetFeatureStatistics proto containing one feature.

    Args:
      feature_name_to_value_count: Indexable pair; element 0 is the feature
        name and element 1 is forwarded as its value count.
      categorical_features: Container tested for membership of the name.

    Returns:
      A DatasetFeatureStatistics proto with a single feature entry.
    """
    name = feature_name_to_value_count[0]
    count = feature_name_to_value_count[1]
    feature_stats = _make_feature_stats_proto(
        name, count, name in categorical_features)
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    dataset_stats.features.add().CopyFrom(feature_stats)
    return dataset_stats
def test_all_string_features(self):
    """Checks unique counts when every feature is a string feature."""
    # Value frequencies in the examples below:
    #   fa: a=4, b=2, c=3, d=2, e=1  -> 5 distinct values
    #   fb: a=2, b=1, c=5            -> 3 distinct values
    fa_values = [
        np.array(['a', 'b', 'c', 'e']),
        None,
        np.array(['a', 'c', 'd']),
        np.array(['a', 'a', 'b', 'c', 'd']),
        None,
    ]
    fb_values = [
        np.array(['a', 'c', 'c']),
        np.array(['a', 'c', 'c']),
        None,
        None,
        np.array(['b', 'c']),
    ]
    examples = [{'fa': fa, 'fb': fb} for fa, fb in zip(fa_values, fb_values)]

    expected_result_fa = text_format.Parse(
        " features { name: 'fa' type: STRING string_stats { unique: 5 } }",
        statistics_pb2.DatasetFeatureStatistics())
    expected_result_fb = text_format.Parse(
        " features { name: 'fb' type: STRING string_stats { unique: 3 } }",
        statistics_pb2.DatasetFeatureStatistics())

    generator = uniques_stats_generator.UniquesStatsGenerator()
    self.assertTransformOutputEqual(
        examples, generator, [expected_result_fa, expected_result_fb])
def test_mi_classif_with_int_label_and_categorical_feature(self):
    """Checks MI stats for a categorical feature paired with an int label."""
    labels = pa.array(
        [[0], [2], [0], [1], [2], [1], [1], [0], [2], [1], [0]])
    # Each colour corresponds to exactly one label value
    # (Red -> 0, Green -> 1, Blue -> 2).
    colours = pa.array([
        ["Red"], ["Blue"], ["Red"], ["Green"], ["Blue"], ["Green"],
        ["Green"], ["Red"], ["Blue"], ["Green"], ["Red"]])
    batch = pa.RecordBatch.from_arrays(
        [labels, colours], ["label_key", "perfect_feature"])

    schema_text = (
        ' feature { name: "label_key" type: INT'
        ' int_domain { is_categorical: true } shape { dim { size: 1 } } }'
        ' feature { name: "perfect_feature" type: BYTES'
        ' shape { dim { size: 1 } } } ')
    schema = text_format.Parse(schema_text, schema_pb2.Schema())

    expected_text = (
        ' features { path { step: "perfect_feature" }'
        " custom_stats { name: 'sklearn_adjusted_mutual_information' num: 0.9297553 }"
        " custom_stats { name: 'sklearn_mutual_information' num: 1.0900597 }"
        ' }')
    expected = text_format.Parse(
        expected_text, statistics_pb2.DatasetFeatureStatistics())

    label_path = types.FeaturePath(["label_key"])
    self._assert_mi_output_equal(batch, expected, schema, label_path)
def _make_dataset_feature_stats_proto_with_single_feature(
        feature_name_to_value_count_list, categorical_features,
        is_weighted_stats, num_top_values, num_rank_histogram_buckets):
    """Builds a serialized single-feature top-k stats proto for one slice.

    Args:
      feature_name_to_value_count_list: A pair shaped like
        ((slice_key, feature_name), value_count_list).
      categorical_features: Container tested for membership of the name.
      is_weighted_stats: Forwarded to the per-feature proto builder.
      num_top_values: Forwarded to the per-feature proto builder.
      num_rank_histogram_buckets: Forwarded to the per-feature proto builder.

    Returns:
      A (slice_key, serialized DatasetFeatureStatistics) tuple.
    """
    key, counts = feature_name_to_value_count_list
    slice_key, name = key
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    feature_stats = make_feature_stats_proto_with_topk_stats(
        name, counts, name in categorical_features, is_weighted_stats,
        num_top_values, num_rank_histogram_buckets)
    dataset_stats.features.add().CopyFrom(feature_stats)
    return slice_key, dataset_stats.SerializeToString()
def _make_dataset_feature_stats_proto_with_uniques_for_single_feature(
    feature_path_to_value_count: Tuple[Tuple[types.SliceKey,
                                             FeaturePathTuple], int],
    categorical_features: Set[types.FeaturePath]
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
    """Builds a single-feature proto holding uniques stats for one slice.

    Args:
      feature_path_to_value_count: ((slice_key, feature_path_tuple), count).
      categorical_features: Feature paths considered categorical.

    Returns:
      A (slice_key, DatasetFeatureStatistics) tuple; the proto contains one
      feature entry.
    """
    key, num_uniques = feature_path_to_value_count
    slice_key, path_tuple = key
    path = types.FeaturePath(path_tuple)
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    is_categorical = path in categorical_features
    feature_stats = _make_feature_stats_proto_with_uniques_stats(
        path, num_uniques, is_categorical)
    dataset_stats.features.add().CopyFrom(feature_stats)
    return slice_key, dataset_stats
def test_mi_classif_categorical_label_small_sample(self):
    """Checks that a single-row batch yields all-zero MI stats."""
    labels = pa.array([[0]])
    colours = pa.array([["Red"]])
    batch = pa.RecordBatch.from_arrays(
        [labels, colours], ["label_key", "feature"])

    schema_text = (
        ' feature { name: "label_key" type: INT'
        ' int_domain { is_categorical: true } shape { dim { size: 1 } } }'
        ' feature { name: "feature" type: BYTES shape { dim { size: 1 } } } ')
    schema = text_format.Parse(schema_text, schema_pb2.Schema())

    expected_text = (
        ' features { path { step: "feature" }'
        " custom_stats { name: 'sklearn_adjusted_mutual_information' num: 0 }"
        " custom_stats { name: 'sklearn_mutual_information' num: 0 }"
        ' custom_stats { name: "sklearn_normalized_adjusted_mutual_information" num: 0 }'
        ' }')
    expected = text_format.Parse(
        expected_text, statistics_pb2.DatasetFeatureStatistics())

    label_path = types.FeaturePath(["label_key"])
    self._assert_mi_output_equal(batch, expected, schema, label_path)
def test_topk_with_numeric_feature(self):
    """Checks that only the string feature is expected in top-k output."""
    # Frequencies of 'fa': a=4, c=3, b=2, d=2, e=1. 'fb' holds floats and
    # does not appear in the expected output.
    fa_values = [
        np.array(['a', 'b', 'c', 'e']),
        None,
        np.array(['a', 'c', 'd']),
        np.array(['a', 'a', 'b', 'c', 'd']),
    ]
    fb_values = [
        np.array([1.0, 2.0, 3.0]),
        np.array([4.0, 5.0]),
        None,
        None,
    ]
    examples = [{'fa': fa, 'fb': fb} for fa, fb in zip(fa_values, fb_values)]

    expected_text = (
        " features { name: 'fa' type: STRING string_stats {"
        " top_values { value: 'a' frequency: 4 }"
        " top_values { value: 'c' frequency: 3 }"
        " rank_histogram {"
        ' buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }'
        ' buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }'
        ' buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }'
        " } } }")
    expected_result_fa = text_format.Parse(
        expected_text, statistics_pb2.DatasetFeatureStatistics())
    generator = top_k_stats_generator.TopKStatsGenerator(
        num_top_values=2, num_rank_histogram_buckets=3)
    self.assertTransformOutputEqual(examples, generator, [expected_result_fa])
def test_merge_dataset_feature_stats_protos_single_proto(self):
    """Checks that merging a single proto returns an equal proto."""
    # The input and the expected output share the same text form; each is
    # parsed into its own message object.
    proto_text = (
        " num_examples: 7"
        " features: { name: 'feature1' type: STRING"
        " string_stats: { common_stats: {"
        " num_missing: 3 num_non_missing: 4 min_num_values: 1 max_num_values: 1"
        " } } } ")
    proto1 = text_format.Parse(
        proto_text, statistics_pb2.DatasetFeatureStatistics())
    expected = text_format.Parse(
        proto_text, statistics_pb2.DatasetFeatureStatistics())

    actual = stats_impl._merge_dataset_feature_stats_protos([proto1])
    self.assertEqual(actual, expected)
def _merge_dataset_feature_stats_protos(stats_protos):
    """Merges together a list of DatasetFeatureStatistics protos.

    FeatureNameStatistics entries that share a feature name are merged with
    `MergeFrom`. Merging is done on private copies, so none of the input
    protos is modified.

    Args:
      stats_protos: A list of DatasetFeatureStatistics protos to merge.

    Returns:
      The merged DatasetFeatureStatistics proto. `num_examples` is taken from
      the first merged feature whose num_stats/string_stats has common_stats
      (num_non_missing + num_missing); it is left unset if none has.
    """
    merged_per_feature = {}
    for stats_proto in stats_protos:
        for feature_stats_proto in stats_proto.features:
            feature_name = feature_stats_proto.name
            if feature_name not in merged_per_feature:
                # Copy instead of aliasing the caller's proto: later merges
                # would otherwise mutate the input in place.
                merged = statistics_pb2.FeatureNameStatistics()
                merged.CopyFrom(feature_stats_proto)
                merged_per_feature[feature_name] = merged
            else:
                merged_per_feature[feature_name].MergeFrom(feature_stats_proto)

    result = statistics_pb2.DatasetFeatureStatistics()
    num_examples = None
    for merged in merged_per_feature.values():
        result.features.add().CopyFrom(merged)
        if num_examples is not None:
            continue
        # Derive the example count from the first feature that carries
        # common stats.
        if merged.WhichOneof('stats') == 'num_stats':
            typed_stats = merged.num_stats
        else:
            typed_stats = merged.string_stats
        if typed_stats.HasField('common_stats'):
            common_stats = typed_stats.common_stats
            num_examples = (common_stats.num_non_missing +
                            common_stats.num_missing)

    if num_examples is not None:
        result.num_examples = num_examples
    return result
def make_dataset_feature_stats_proto_topk_single(
    feature_path_tuple: types.FeaturePathTuple,
    value_count_list: List[FeatureValueCount],
    is_weighted_stats: bool,
    num_top_values: int,
    frequency_threshold: Union[int, float],
    num_rank_histogram_buckets: int
) -> statistics_pb2.DatasetFeatureStatistics:
    """Wraps the top-k stats of one feature in a dataset-level proto.

    Args:
      feature_path_tuple: Tuple form of the feature path; it is converted to
        a types.FeaturePath before use.
      value_count_list: Forwarded to the per-feature top-k proto builder.
      is_weighted_stats: Forwarded to the per-feature top-k proto builder.
      num_top_values: Forwarded to the per-feature top-k proto builder.
      frequency_threshold: Forwarded to the per-feature top-k proto builder.
      num_rank_histogram_buckets: Forwarded to the per-feature top-k proto
        builder.

    Returns:
      A DatasetFeatureStatistics proto containing exactly one feature entry.
    """
    path = types.FeaturePath(feature_path_tuple)
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    feature_stats = _make_feature_stats_proto_topk(
        path, value_count_list, is_weighted_stats, num_top_values,
        frequency_threshold, num_rank_histogram_buckets)
    dataset_stats.features.add().CopyFrom(feature_stats)
    return dataset_stats
def _make_dataset_feature_stats_proto_with_topk_for_single_feature(
        feature_path_to_value_count_list, categorical_features,
        is_weighted_stats, num_top_values, frequency_threshold,
        num_rank_histogram_buckets):
    """Builds a serialized single-feature top-k stats proto for one slice.

    Args:
      feature_path_to_value_count_list: A pair shaped like
        ((slice_key, feature_path_tuple), value_count_list).
      categorical_features: Container tested for membership of the path.
      is_weighted_stats: Forwarded to the per-feature proto builder.
      num_top_values: Forwarded to the per-feature proto builder.
      frequency_threshold: Forwarded to the per-feature proto builder.
      num_rank_histogram_buckets: Forwarded to the per-feature proto builder.

    Returns:
      A (slice_key, serialized DatasetFeatureStatistics) tuple.
    """
    key, counts = feature_path_to_value_count_list
    slice_key, path_tuple = key
    path = types.FeaturePath(path_tuple)
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    feature_stats = make_feature_stats_proto_with_topk_stats(
        path, counts, path in categorical_features, is_weighted_stats,
        num_top_values, frequency_threshold, num_rank_histogram_buckets)
    dataset_stats.features.add().CopyFrom(feature_stats)
    return slice_key, dataset_stats.SerializeToString()
def assert_on_two_protos_with_different_num_examples(self):
    """Compares two protos that differ only in `num_examples`.

    NOTE(review): the method name does not start with `test_`, so it is
    presumably not picked up by unittest discovery -- confirm intent.
    NOTE(review): the protos differ (num_examples 1 vs 2) yet equality is
    asserted without an enclosing assertRaises -- confirm whether a failure
    is the expected outcome here.
    """
    features_text = " features { name: 'fa' type: STRING string_stats { unique: 4 } }"
    expected = text_format.Parse(
        " num_examples: 1" + features_text + " ",
        statistics_pb2.DatasetFeatureStatistics())
    actual = text_format.Parse(
        " num_examples: 2" + features_text,
        statistics_pb2.DatasetFeatureStatistics())
    test_util.assert_dataset_feature_stats_proto_equal(self, actual, expected)
def extract_output(self, accumulator):
    """Converts the accumulated per-feature common stats into one proto.

    Args:
      accumulator: Mapping from feature name to that feature's partial
        common stats.

    Returns:
      A DatasetFeatureStatistics proto with one feature entry per item of
      the accumulator, in the accumulator's iteration order.
    """
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    for name, partial_stats in accumulator.items():
        is_categorical = name in self._categorical_features
        # Build the per-feature proto, then copy it into the dataset proto.
        feature_stats = _make_feature_stats_proto(
            partial_stats, name, self._quantiles_combiner, is_categorical)
        dataset_stats.features.add().CopyFrom(feature_stats)
    return dataset_stats