def compute(
    self, examples_record_batch: pa.RecordBatch
) -> statistics_pb2.DatasetFeatureStatistics:
  """Computes MI and AMI between all valid features and labels.

  Args:
    examples_record_batch: Arrow record_batch containing a batch of examples.

  Returns:
    DatasetFeatureStatistics proto containing AMI and MI for each feature.

  Raises:
    ValueError: If label_feature does not exist in examples.
  """
  # A label with a single unique value carries no information: emit 0.0 for
  # every non-label feature without running the estimator at all.
  if self._label_feature_is_unique(examples_record_batch):
    zero_stats = {
        types.FeaturePath([name]): {self._custom_stats_key: 0.0}
        for name in examples_record_batch.schema.names
        if types.FeaturePath([name]) != self._label_feature
    }
    return stats_util.make_dataset_feature_stats_proto(zero_stats)

  encoded = _encode_examples(examples_record_batch,
                             self._multivalent_features,
                             self._categorical_features,
                             self._features_to_ignore,
                             self._max_encoding_length)
  if self._normalize_by_max:
    # Keep the label among the encoded features so that its self-MI is
    # computed and can serve as the normalization ceiling below.
    labels = encoded[self._label_feature]
  else:
    labels = encoded.pop(self._label_feature)
  result = self._calculate_mi(encoded, labels, self._seed)
  if self._normalize_by_max:
    result = self._normalize_mi_values(result)
  return stats_util.make_dataset_feature_stats_proto(result)
def extract_output(self, accumulator):
  """Returns meta-statistics as a DatasetFeatureStatistics proto."""
  # Drop partitioned statistics that were not present in enough partitions
  # before summarizing them into the output proto.
  valid = get_valid_statistics(accumulator.statistics,
                               self._min_partitions_stat_presence)
  return stats_util.make_dataset_feature_stats_proto(
      _get_partitioned_statistics_summary(valid))
def compute(self, examples_table: pa.Table
           ) -> statistics_pb2.DatasetFeatureStatistics:
  """Computes MI and AMI between all valid features and labels.

  Args:
    examples_table: Arrow table containing a batch of examples.

  Returns:
    DatasetFeatureStatistics proto containing AMI and MI for each valid
    feature in the dataset. Some features may be filtered out by
    _remove_unsupported_feature_columns if they are invalid. In that case,
    AMI and MI will not be calculated for the invalid feature.

  Raises:
    ValueError: If label_feature contains unsupported data.
  """
  examples_table = _remove_unsupported_feature_columns(examples_table,
                                                       self._schema)
  flattened = _flatten_and_impute(examples_table, self._categorical_features)
  # The label column itself may have been dropped above if its data is
  # unsupported; that is an error, not a silently skipped feature.
  if self._label_feature not in flattened:
    raise ValueError("Label column contains unsupported data.")
  labels = flattened.pop(self._label_feature)
  df = pd.DataFrame(flattened)
  # Boolean mask telling sk-learn which features to treat as discrete
  # during MI estimation.
  discrete_mask = self._convert_categorical_features_to_numeric(df)
  return stats_util.make_dataset_feature_stats_proto(
      self._calculate_mi(df, labels, discrete_mask, seed=self._seed))
def test_make_dataset_feature_stats_proto(self):
  """Round-trips FeaturePath-keyed custom stats into a stats proto."""
  stats = {
      types.FeaturePath(['feature_1']): {
          'Mutual Information': 0.5,
          'Correlation': 0.1
      },
      types.FeaturePath(['feature_2']): {
          'Mutual Information': 0.8,
          'Correlation': 0.6
      }
  }
  expected = {
      types.FeaturePath(['feature_1']): text_format.Parse(
          """
          path {
            step: 'feature_1'
          }
          custom_stats {
            name: 'Correlation'
            num: 0.1
          }
          custom_stats {
            name: 'Mutual Information'
            num: 0.5
          }
          """, statistics_pb2.FeatureNameStatistics()),
      types.FeaturePath(['feature_2']): text_format.Parse(
          """
          path {
            step: 'feature_2'
          }
          custom_stats {
            name: 'Correlation'
            num: 0.6
          }
          custom_stats {
            name: 'Mutual Information'
            num: 0.8
          }
          """, statistics_pb2.FeatureNameStatistics())
  }
  actual = stats_util.make_dataset_feature_stats_proto(stats)
  self.assertEqual(len(expected), len(actual.features))
  for feature_stats in actual.features:
    key = types.FeaturePath.from_proto(feature_stats.path)
    compare.assertProtoEqual(
        self, feature_stats, expected[key], normalize_numbers=True)
def test_make_dataset_feature_stats_proto(self):
  """Round-trips name-keyed custom stats into a stats proto."""
  stats = {
      'feature_1': {
          'Mutual Information': 0.5,
          'Correlation': 0.1
      },
      'feature_2': {
          'Mutual Information': 0.8,
          'Correlation': 0.6
      }
  }
  expected = {
      'feature_1': text_format.Parse(
          """
          name: 'feature_1'
          custom_stats {
            name: 'Correlation'
            num: 0.1
          }
          custom_stats {
            name: 'Mutual Information'
            num: 0.5
          }
          """, statistics_pb2.FeatureNameStatistics()),
      'feature_2': text_format.Parse(
          """
          name: 'feature_2'
          custom_stats {
            name: 'Correlation'
            num: 0.6
          }
          custom_stats {
            name: 'Mutual Information'
            num: 0.8
          }
          """, statistics_pb2.FeatureNameStatistics())
  }
  actual = stats_util.make_dataset_feature_stats_proto(stats)
  self.assertEqual(len(expected), len(actual.features))
  for feature_stats in actual.features:
    compare.assertProtoEqual(
        self, feature_stats, expected[feature_stats.name],
        normalize_numbers=True)
def compute(self, examples):
  """Computes MI and AMI between all valid features and labels.

  Args:
    examples: ExampleBatch containing the feature values for each feature.

  Returns:
    DatasetFeatureStatistics proto containing AMI and MI for each valid
    feature in the dataset. Some features may be filtered out by
    _remove_unsupported_feature_columns if they are invalid. In that case,
    AMI and MI will not be calculated for the invalid feature.

  Raises:
    ValueError: If label_feature contains unsupported data.
  """
  if self._label_feature not in examples:
    raise ValueError("Label column does not exist.")
  _remove_unsupported_feature_columns(examples, self._schema)
  # The removal above can delete the label column when its data is
  # unsupported, hence the second membership check.
  if self._label_feature not in examples:
    raise ValueError("Label column contains unsupported data.")
  flattened = _flatten_examples(examples)
  # TODO(b/119414212): Use Ranklab struct feature to handle null values for MI
  imputed = self._impute(flattened)
  labels = imputed.pop(self._label_feature)
  df = pd.DataFrame(imputed)
  # Boolean mask telling sk-learn which features to treat as discrete
  # during MI estimation.
  discrete_mask = self._convert_categorical_features_to_numeric(df)
  return stats_util.make_dataset_feature_stats_proto(
      self._calculate_mi(df, labels, discrete_mask, seed=self._seed))