Beispiel #1
0
def _update_example_and_missing_count(stats):
    """Updates example count of the dataset and missing count for all features.

    The example counts are read from the custom stats of the dummy feature
    (looked up by `_DUMMY_FEATURE_NAME`), which is then removed from
    `stats.features`. Every remaining feature gets `num_missing` set to the
    number of examples minus its `num_non_missing`. The weighted missing count
    is filled in only when the weighted example count is non-zero.

    Args:
      stats: Dataset-level statistics exposing `features` and `num_examples`
        (presumably a DatasetFeatureStatistics proto). Modified in place.

    Raises:
      AssertionError: If a feature reports more non-missing examples than the
        total number of examples.
    """
    dummy_feature = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_NAME)
    num_examples = stats_util.get_custom_stats(dummy_feature,
                                               _NUM_EXAMPLES_KEY)
    weighted_num_examples = stats_util.get_custom_stats(
        dummy_feature, _WEIGHTED_NUM_EXAMPLES_KEY)
    stats.features.remove(dummy_feature)
    for feature_stats in stats.features:
        # Go through whichever member of the 'stats' oneof is actually
        # populated. Assuming string_stats for everything that is not
        # num_stats would read a zero num_non_missing from an empty message,
        # and writing num_missing through it would switch the oneof over to
        # string_stats, dropping the feature's real statistics. When no member
        # is set, fall back to string_stats as this function always has.
        which_oneof_stats = feature_stats.WhichOneof('stats') or 'string_stats'
        common_stats = getattr(feature_stats, which_oneof_stats).common_stats
        assert num_examples >= common_stats.num_non_missing, (
            'Total number of examples: {} is less than number of non missing '
            'examples: {} for feature {}.'.format(num_examples,
                                                  common_stats.num_non_missing,
                                                  feature_stats.name))
        common_stats.num_missing = int(num_examples -
                                       common_stats.num_non_missing)
        # A weighted example count of zero means there is nothing weighted to
        # update, so the weighted common stats are left untouched.
        if weighted_num_examples != 0:
            common_stats.weighted_common_stats.num_missing = (
                weighted_num_examples -
                common_stats.weighted_common_stats.num_non_missing)
    stats.num_examples = int(num_examples)
 def test_get_custom_stats_not_found(self):
   """Checks that looking up an absent custom statistic raises ValueError."""
   stats = text_format.Parse(
       """
           name: 'feature'
           custom_stats {
             name: 'abc'
             num: 100.0
           }
       """, statistics_pb2.FeatureNameStatistics())
   # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
   # assertRaisesRegex is the supported spelling of the same check.
   with self.assertRaisesRegex(ValueError, 'Custom statistics.*not found'):
     stats_util.get_custom_stats(stats, 'xyz')
def _update_example_and_missing_count(
        stats: statistics_pb2.DatasetFeatureStatistics) -> None:
    """Fills in the dataset example counts and per-feature missing counts.

    Does nothing when `stats` has no features. Otherwise the example counts
    are taken from the custom stats of the dummy feature (looked up by
    `_DUMMY_FEATURE_PATH`), which is removed from `stats.features` before the
    remaining features are updated in place.

    Args:
      stats: Dataset statistics to update in place.

    Raises:
      AssertionError: If a top-level feature reports more non-missing examples
        than the total number of examples.
    """
    if not stats.features:
        return

    placeholder = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_PATH)
    total_count = stats_util.get_custom_stats(placeholder, _NUM_EXAMPLES_KEY)
    weighted_total = stats_util.get_custom_stats(placeholder,
                                                 _WEIGHTED_NUM_EXAMPLES_KEY)
    stats.features.remove(placeholder)

    for feature in stats.features:
        # Features nested under a STRUCT feature are skipped: their
        # num_missing is relative to the parent's value count and is computed
        # in the basic stats generator.
        if len(feature.path.step) > 1:
            continue

        populated = feature.WhichOneof('stats')
        if populated is None:
            # No common_stats exist for this feature (possible when only
            # custom_stats are generated for a sparse or weighted feature), so
            # there is nothing to update.
            continue

        common = getattr(feature, populated).common_stats
        assert total_count >= common.num_non_missing, (
            'Total number of examples: {} is less than number of non missing '
            'examples: {} for feature {}.'.format(
                total_count, common.num_non_missing,
                '.'.join(feature.path.step)))

        missing = int(total_count - common.num_non_missing)
        common.num_missing = missing
        if common.presence_and_valency_stats:
            common.presence_and_valency_stats[0].num_missing = missing

        # Without a weighted example count there is no weighted missing count
        # to record.
        if weighted_total == 0:
            continue
        weighted_common = common.weighted_common_stats
        weighted_missing = weighted_total - weighted_common.num_non_missing
        weighted_common.num_missing = weighted_missing
        if common.weighted_presence_and_valency_stats:
            common.weighted_presence_and_valency_stats[
                0].num_missing = weighted_missing

    stats.num_examples = int(total_count)
    stats.weighted_num_examples = weighted_total
Beispiel #4
0
 def test_get_custom_stats_string(self):
     """Checks that a string-valued custom statistic is returned as a string."""
     feature_text = """
         name: 'feature'
         custom_stats {
           name: 'abc'
           str: 'xyz'
         }
     """
     feature_stats = text_format.Parse(
         feature_text, statistics_pb2.FeatureNameStatistics())
     actual = stats_util.get_custom_stats(feature_stats, 'abc')
     self.assertEqual(actual, 'xyz')
Beispiel #5
0
 def test_get_custom_stats_numeric(self):
     """Checks that a numeric custom statistic is returned as its number."""
     feature_text = """
         name: 'feature'
         custom_stats {
           name: 'abc'
           num: 100.0
         }
     """
     feature_stats = text_format.Parse(
         feature_text, statistics_pb2.FeatureNameStatistics())
     actual = stats_util.get_custom_stats(feature_stats, 'abc')
     self.assertEqual(actual, 100.0)
Beispiel #6
0
def _update_example_and_missing_count(stats):
    """Updates example count of the dataset and missing count for all features.

    The example counts are read from the custom stats of the dummy feature
    (looked up by `_DUMMY_FEATURE_NAME`), which is then removed from
    `stats.features`. Every remaining feature gets `num_missing` set to the
    number of examples minus its `num_non_missing`. The weighted missing count
    is filled in only when the weighted example count is non-zero.

    Args:
      stats: Dataset-level statistics exposing `features` and `num_examples`
        (presumably a DatasetFeatureStatistics proto). Modified in place.
    """
    dummy_feature = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_NAME)
    num_examples = stats_util.get_custom_stats(dummy_feature,
                                               _NUM_EXAMPLES_KEY)
    weighted_num_examples = stats_util.get_custom_stats(
        dummy_feature, _WEIGHTED_NUM_EXAMPLES_KEY)
    stats.features.remove(dummy_feature)
    for feature_stats in stats.features:
        # Go through whichever member of the 'stats' oneof is actually
        # populated. Assuming string_stats for everything that is not
        # num_stats would read a zero num_non_missing from an empty message,
        # and writing num_missing through it would switch the oneof over to
        # string_stats, dropping the feature's real statistics. When no member
        # is set, fall back to string_stats as this function always has.
        which_oneof_stats = feature_stats.WhichOneof('stats') or 'string_stats'
        common_stats = getattr(feature_stats, which_oneof_stats).common_stats
        common_stats.num_missing = int(num_examples -
                                       common_stats.num_non_missing)
        # A weighted example count of zero means there is nothing weighted to
        # update, so the weighted common stats are left untouched.
        if weighted_num_examples != 0:
            common_stats.weighted_common_stats.num_missing = (
                weighted_num_examples -
                common_stats.weighted_common_stats.num_non_missing)
    stats.num_examples = int(num_examples)
def _update_example_and_missing_count(
        stats: statistics_pb2.DatasetFeatureStatistics) -> None:
    """Updates example count of the dataset and missing count for all features.

    Does nothing when `stats` has no features. Otherwise the example counts
    are read from the custom stats of the dummy feature (looked up by
    `_DUMMY_FEATURE_PATH`), which is then removed from `stats.features`. Every
    remaining top-level feature gets `num_missing` set to the number of
    examples minus its `num_non_missing`. The weighted missing count is filled
    in only when the weighted example count is non-zero.

    Args:
      stats: Dataset statistics to update in place.

    Raises:
      AssertionError: If a top-level feature reports more non-missing examples
        than the total number of examples.
    """
    if not stats.features:
        return
    dummy_feature = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_PATH)
    num_examples = stats_util.get_custom_stats(dummy_feature,
                                               _NUM_EXAMPLES_KEY)
    weighted_num_examples = stats_util.get_custom_stats(
        dummy_feature, _WEIGHTED_NUM_EXAMPLES_KEY)
    stats.features.remove(dummy_feature)
    for feature_stats in stats.features:
        # For features nested under a STRUCT feature, their num_missing is computed
        # in the basic stats generator (because their num_missing is relative to
        # their parent's value count).
        if len(feature_stats.path.step) > 1:
            continue
        # Go through whichever member of the 'stats' oneof is actually
        # populated. Sending every member other than num_stats and
        # struct_stats to string_stats would read a zero num_non_missing from
        # an empty message, and writing num_missing through it would switch
        # the oneof over to string_stats, dropping the feature's real
        # statistics. When no member is set, fall back to string_stats as this
        # function always has.
        which_oneof_stats = feature_stats.WhichOneof('stats') or 'string_stats'
        common_stats = getattr(feature_stats, which_oneof_stats).common_stats
        assert num_examples >= common_stats.num_non_missing, (
            'Total number of examples: {} is less than number of non missing '
            'examples: {} for feature {}.'.format(
                num_examples, common_stats.num_non_missing,
                '.'.join(feature_stats.path.step)))
        common_stats.num_missing = int(num_examples -
                                       common_stats.num_non_missing)
        # A weighted example count of zero means there is nothing weighted to
        # update, so the weighted common stats are left untouched.
        if weighted_num_examples != 0:
            common_stats.weighted_common_stats.num_missing = (
                weighted_num_examples -
                common_stats.weighted_common_stats.num_non_missing)
    stats.num_examples = int(num_examples)