def _update_example_and_missing_count(stats):
  """Updates example count of the dataset and missing count for all features.

  The (weighted) number of examples is read from the custom stats of the dummy
  feature looked up via `_DUMMY_FEATURE_NAME`. The dummy feature is then removed
  from `stats.features`, and `stats` is updated in place.

  Args:
    stats: Statistics proto whose `features` include the dummy feature.

  Raises:
    AssertionError: If the number of examples is smaller than the number of
      non-missing examples recorded for some feature.
  """
  dummy_feature = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_NAME)
  num_examples = stats_util.get_custom_stats(dummy_feature, _NUM_EXAMPLES_KEY)
  weighted_num_examples = stats_util.get_custom_stats(
      dummy_feature, _WEIGHTED_NUM_EXAMPLES_KEY)
  stats.features.remove(dummy_feature)

  for feature_stats in stats.features:
    which_oneof_stats = feature_stats.WhichOneof('stats')
    if which_oneof_stats is None:
      # No member of the 'stats' oneof is populated. Writing through an unset
      # sub-message would create it, so leave this feature untouched.
      continue
    # Update the member that is actually populated. Writing through a
    # different member (e.g. always falling back to string_stats) would switch
    # the oneof and discard the existing statistics.
    # NOTE(review): assumes every member of the 'stats' oneof exposes
    # `common_stats` -- confirm against the proto definition.
    common_stats = getattr(feature_stats, which_oneof_stats).common_stats
    assert num_examples >= common_stats.num_non_missing, (
        'Total number of examples: {} is less than number of non missing '
        'examples: {} for feature {}.'.format(num_examples,
                                              common_stats.num_non_missing,
                                              feature_stats.name))
    common_stats.num_missing = int(num_examples - common_stats.num_non_missing)
    # A weighted example count of zero means there is nothing weighted to
    # update.
    if weighted_num_examples != 0:
      common_stats.weighted_common_stats.num_missing = (
          weighted_num_examples -
          common_stats.weighted_common_stats.num_non_missing)

  stats.num_examples = int(num_examples)
def test_get_custom_stats_not_found(self):
  """Checks that requesting an absent custom statistic raises ValueError."""
  stats = text_format.Parse(
      """ name: 'feature' custom_stats { name: 'abc' num: 100.0 } """,
      statistics_pb2.FeatureNameStatistics())
  # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
  # assertRaisesRegex is the supported spelling.
  with self.assertRaisesRegex(ValueError, 'Custom statistics.*not found'):
    stats_util.get_custom_stats(stats, 'xyz')
def _update_example_and_missing_count(
    stats: statistics_pb2.DatasetFeatureStatistics) -> None:
  """Fills in the dataset example counts and per-feature missing counts.

  The (weighted) number of examples is read from the custom stats of the dummy
  feature found at `_DUMMY_FEATURE_PATH`; that feature is then removed from
  `stats.features`. `stats` is updated in place, and nothing happens when it
  has no features at all.

  Args:
    stats: The dataset statistics proto to update.

  Raises:
    AssertionError: If the number of examples is smaller than the number of
      non-missing examples recorded for a top-level feature.
  """
  if not stats.features:
    return

  placeholder = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_PATH)
  total_examples = stats_util.get_custom_stats(placeholder, _NUM_EXAMPLES_KEY)
  weighted_total_examples = stats_util.get_custom_stats(
      placeholder, _WEIGHTED_NUM_EXAMPLES_KEY)
  stats.features.remove(placeholder)

  for feature in stats.features:
    path_steps = feature.path.step
    # Skip features nested under a STRUCT feature: their num_missing is
    # relative to the parent's value count and is therefore computed in the
    # basic stats generator instead.
    if len(path_steps) > 1:
      continue

    stats_kind = feature.WhichOneof('stats')
    if stats_kind is None:
      # The feature carries no common_stats (this can happen when only
      # custom_stats are generated for a sparse or weighted feature), so there
      # is nothing to modify.
      continue

    common = getattr(feature, stats_kind).common_stats
    non_missing = common.num_non_missing
    assert total_examples >= non_missing, (
        'Total number of examples: {} is less than number of non missing '
        'examples: {} for feature {}.'.format(
            total_examples, non_missing, '.'.join(path_steps)))

    missing = int(total_examples - non_missing)
    common.num_missing = missing
    presence = common.presence_and_valency_stats
    if presence:
      # Only the first entry is kept in sync with the top-level missing count.
      presence[0].num_missing = missing

    if weighted_total_examples != 0:
      weighted = common.weighted_common_stats
      weighted_missing = weighted_total_examples - weighted.num_non_missing
      weighted.num_missing = weighted_missing
      weighted_presence = common.weighted_presence_and_valency_stats
      if weighted_presence:
        weighted_presence[0].num_missing = weighted_missing

  stats.num_examples = int(total_examples)
  stats.weighted_num_examples = weighted_total_examples
def test_get_custom_stats_string(self):
  """A custom statistic stored in `str` is returned as that string."""
  feature_pbtxt = (
      """ name: 'feature' custom_stats { name: 'abc' str: 'xyz' } """)
  feature_stats = text_format.Parse(
      feature_pbtxt, statistics_pb2.FeatureNameStatistics())

  actual = stats_util.get_custom_stats(feature_stats, 'abc')

  self.assertEqual(actual, 'xyz')
def test_get_custom_stats_numeric(self):
  """A custom statistic stored in `num` is returned as that number."""
  feature_pbtxt = (
      """ name: 'feature' custom_stats { name: 'abc' num: 100.0 } """)
  feature_stats = text_format.Parse(
      feature_pbtxt, statistics_pb2.FeatureNameStatistics())

  actual = stats_util.get_custom_stats(feature_stats, 'abc')

  self.assertEqual(actual, 100.0)
def _update_example_and_missing_count(stats):
  """Updates example count of the dataset and missing count for all features.

  The (weighted) number of examples is read from the custom stats of the dummy
  feature looked up via `_DUMMY_FEATURE_NAME`. The dummy feature is then removed
  from `stats.features`, and `stats` is updated in place.

  Args:
    stats: Statistics proto whose `features` include the dummy feature.
  """
  dummy_feature = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_NAME)
  num_examples = stats_util.get_custom_stats(dummy_feature, _NUM_EXAMPLES_KEY)
  weighted_num_examples = stats_util.get_custom_stats(
      dummy_feature, _WEIGHTED_NUM_EXAMPLES_KEY)
  stats.features.remove(dummy_feature)

  for feature_stats in stats.features:
    which_oneof_stats = feature_stats.WhichOneof('stats')
    if which_oneof_stats is None:
      # No member of the 'stats' oneof is populated. Writing through an unset
      # sub-message would create it, so leave this feature untouched.
      continue
    # Update the member that is actually populated. Writing through a
    # different member (e.g. always falling back to string_stats) would switch
    # the oneof and discard the existing statistics.
    # NOTE(review): assumes every member of the 'stats' oneof exposes
    # `common_stats` -- confirm against the proto definition.
    common_stats = getattr(feature_stats, which_oneof_stats).common_stats
    common_stats.num_missing = int(num_examples - common_stats.num_non_missing)
    # A weighted example count of zero means there is nothing weighted to
    # update.
    if weighted_num_examples != 0:
      common_stats.weighted_common_stats.num_missing = (
          weighted_num_examples -
          common_stats.weighted_common_stats.num_non_missing)

  stats.num_examples = int(num_examples)
def _update_example_and_missing_count(
    stats: statistics_pb2.DatasetFeatureStatistics) -> None:
  """Updates example count of the dataset and missing count for all features.

  The (weighted) number of examples is read from the custom stats of the dummy
  feature found at `_DUMMY_FEATURE_PATH`. The dummy feature is then removed
  from `stats.features`, and `stats` is updated in place. Nothing is done when
  `stats` has no features.

  Args:
    stats: The dataset statistics proto to update.

  Raises:
    AssertionError: If the number of examples is smaller than the number of
      non-missing examples recorded for a top-level feature.
  """
  if not stats.features:
    return
  dummy_feature = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_PATH)
  num_examples = stats_util.get_custom_stats(dummy_feature, _NUM_EXAMPLES_KEY)
  weighted_num_examples = stats_util.get_custom_stats(
      dummy_feature, _WEIGHTED_NUM_EXAMPLES_KEY)
  stats.features.remove(dummy_feature)

  for feature_stats in stats.features:
    # For features nested under a STRUCT feature, their num_missing is computed
    # in the basic stats generator (because their num_missing is relative to
    # their parent's value count).
    if len(feature_stats.path.step) > 1:
      continue
    which_oneof_stats = feature_stats.WhichOneof('stats')
    if which_oneof_stats is None:
      # No member of the 'stats' oneof is populated. Writing through an unset
      # sub-message would create it, so leave this feature untouched.
      continue
    # Update the member that is actually populated. Falling back to
    # string_stats for every kind that is not explicitly listed would switch
    # the oneof and discard the existing statistics.
    # NOTE(review): assumes every member of the 'stats' oneof exposes
    # `common_stats` -- confirm against the proto definition.
    common_stats = getattr(feature_stats, which_oneof_stats).common_stats
    assert num_examples >= common_stats.num_non_missing, (
        'Total number of examples: {} is less than number of non missing '
        'examples: {} for feature {}.'.format(
            num_examples, common_stats.num_non_missing,
            '.'.join(feature_stats.path.step)))
    common_stats.num_missing = int(num_examples - common_stats.num_non_missing)
    # A weighted example count of zero means there is nothing weighted to
    # update.
    if weighted_num_examples != 0:
      common_stats.weighted_common_stats.num_missing = (
          weighted_num_examples -
          common_stats.weighted_common_stats.num_non_missing)

  stats.num_examples = int(num_examples)