def _update_example_and_missing_count(stats):
    """Updates example count of the dataset and missing count for all features.

    The example counts are read from the custom stats stored on the dummy
    feature. The dummy feature is then removed from `stats`, and `num_missing`
    (plus the weighted `num_missing` when the weighted example count is
    non-zero) is filled in for every remaining feature. `stats` is modified in
    place.

    Args:
      stats: Dataset feature statistics message to update in place.

    Raises:
      AssertionError: If a feature reports more non-missing examples than the
        total number of examples.
    """
    if not stats.features:
        # No features at all means there is no dummy feature to read the
        # example counts from, so there is nothing to update.
        return
    dummy_feature = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_NAME)
    num_examples = stats_util.get_custom_stats(dummy_feature,
                                               _NUM_EXAMPLES_KEY)
    weighted_num_examples = stats_util.get_custom_stats(
        dummy_feature, _WEIGHTED_NUM_EXAMPLES_KEY)
    # The dummy feature only carries the counts; it must not appear in the
    # final statistics.
    stats.features.remove(dummy_feature)
    for feature_stats in stats.features:
        which_oneof_stats = feature_stats.WhichOneof('stats')
        if which_oneof_stats is None:
            # No type-specific stats are set for this feature. Writing through
            # any member of the oneof would create it, so leave the feature
            # untouched instead of fabricating stats for it.
            continue
        # Resolve the populated oneof member by name. Writing through a member
        # that is not the one currently set would switch the oneof to it and
        # discard the stats that were actually computed.
        common_stats = getattr(feature_stats, which_oneof_stats).common_stats
        assert num_examples >= common_stats.num_non_missing, (
            'Total number of examples: {} is less than number of non missing '
            'examples: {} for feature {}.'.format(num_examples,
                                                  common_stats.num_non_missing,
                                                  feature_stats.name))
        common_stats.num_missing = int(num_examples -
                                       common_stats.num_non_missing)
        if weighted_num_examples != 0:
            common_stats.weighted_common_stats.num_missing = (
                weighted_num_examples -
                common_stats.weighted_common_stats.num_non_missing)
    stats.num_examples = int(num_examples)
def _update_example_and_missing_count(
        stats: statistics_pb2.DatasetFeatureStatistics) -> None:
    """Fills in dataset example counts and per-feature missing counts in place.

    The total and weighted example counts are read from the custom stats of the
    dummy feature, which is then dropped from `stats`. Every remaining
    top-level feature that has type-specific stats gets its `num_missing`
    (and, when the weighted example count is non-zero, its weighted
    `num_missing`) derived from those totals. The first presence-and-valency
    entry, when present, is kept in sync with the same value.

    Args:
      stats: Dataset feature statistics message; modified in place. Nothing is
        done when it has no features.

    Raises:
      AssertionError: If a feature reports more non-missing examples than the
        total number of examples.
    """
    if len(stats.features) == 0:
        return

    placeholder = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_PATH)
    total = stats_util.get_custom_stats(placeholder, _NUM_EXAMPLES_KEY)
    weighted_total = stats_util.get_custom_stats(placeholder,
                                                 _WEIGHTED_NUM_EXAMPLES_KEY)
    stats.features.remove(placeholder)

    # Weighted counts are only touched when a weighted total was recorded.
    has_weights = weighted_total != 0

    for feature in stats.features:
        stats_kind = feature.WhichOneof('stats')
        # Two kinds of features are left alone:
        #  * features nested under a STRUCT feature, whose num_missing is
        #    computed in the basic stats generator relative to their parent's
        #    value count;
        #  * features without any common stats (e.g. when only custom stats
        #    were generated for a sparse or weighted feature).
        if len(feature.path.step) > 1 or stats_kind is None:
            continue

        common = getattr(feature, stats_kind).common_stats
        assert total >= common.num_non_missing, (
            'Total number of examples: {} is less than number of non missing '
            'examples: {} for feature {}.'.format(
                total, common.num_non_missing,
                '.'.join(feature.path.step)))

        missing = int(total - common.num_non_missing)
        common.num_missing = missing
        if common.presence_and_valency_stats:
            common.presence_and_valency_stats[0].num_missing = missing

        if has_weights:
            weighted_common = common.weighted_common_stats
            weighted_missing = weighted_total - weighted_common.num_non_missing
            weighted_common.num_missing = weighted_missing
            if common.weighted_presence_and_valency_stats:
                first_weighted = common.weighted_presence_and_valency_stats[0]
                first_weighted.num_missing = weighted_missing

    stats.num_examples = int(total)
    stats.weighted_num_examples = weighted_total
Exemple #3
0
def get_num_feature_stats_as_dataframe(
        stats_list: statistics_pb2.DatasetFeatureStatisticsList,
        feature_path: FeaturePath) -> pd.DataFrame:
    """Returns the numeric statistics of a feature as a tidy dataframe.

    One row is produced per dataset in `stats_list`, except for the dataset
    named 'All Examples', which is skipped. Each row holds the dict form of the
    feature's `num_stats` without the nested `commonStats` and `histograms`
    entries, plus a `slice` column holding the dataset name.

    Args:
      stats_list: Statistics list whose datasets are scanned.
      feature_path: Path of the feature to look up in each dataset.

    Returns:
      A dataframe with one row per processed dataset.

    Raises:
      ValueError: If the feature has no `num_stats` in a processed dataset.
    """
    feature_stats_list = []
    for dataset in stats_list.datasets:
        if dataset.name == 'All Examples':
            continue
        feature_stats = get_feature_stats(dataset, feature_path)
        if not feature_stats.HasField('num_stats'):
            raise ValueError('This is not a numeric feature')
        stats_dict = MessageToDict(feature_stats.num_stats)
        # MessageToDict leaves out fields that are unset or empty, so these
        # keys are not guaranteed to exist; drop them only when present
        # instead of failing with a KeyError.
        stats_dict.pop('commonStats', None)
        stats_dict.pop('histograms', None)
        stats_dict['slice'] = dataset.name
        feature_stats_list.append(stats_dict)

    return pd.DataFrame(feature_stats_list)
Exemple #4
0
def get_histograms_as_dataframe(
        stats_list: statistics_pb2.DatasetFeatureStatisticsList,
        feature_path: FeaturePath):
    """Collects the standard histogram buckets of a numeric feature.

    Every dataset in `stats_list` other than the one named 'All Examples' is
    scanned. For each of its STANDARD-type histograms, every bucket becomes one
    row: the dict form of the bucket plus a `slice` column holding the dataset
    name.

    Args:
      stats_list: Statistics list whose datasets are scanned.
      feature_path: Path of the feature to look up in each dataset.

    Returns:
      A dataframe with one row per collected bucket.

    Raises:
      ValueError: If the feature has no `num_stats` in a scanned dataset.
    """
    rows = []
    for dataset in stats_list.datasets:
        slice_name = dataset.name
        if slice_name == 'All Examples':
            continue

        feature_stats = get_feature_stats(dataset, feature_path)
        if not feature_stats.HasField('num_stats'):
            raise ValueError('This is not a numeric feature')

        for histogram in feature_stats.num_stats.histograms:
            # Only standard histograms are reported; other types are skipped.
            if histogram.type != statistics_pb2.Histogram.HistogramType.STANDARD:
                continue
            rows.extend(
                dict(MessageToDict(bucket), slice=slice_name)
                for bucket in histogram.buckets)

    return pd.DataFrame(rows)
Exemple #5
0
def _update_example_and_missing_count(stats):
    """Updates example count of the dataset and missing count for all features.

    The example counts are read from the custom stats stored on the dummy
    feature. The dummy feature is then removed from `stats`, and `num_missing`
    (plus the weighted `num_missing` when the weighted example count is
    non-zero) is filled in for every remaining feature. `stats` is modified in
    place.

    Args:
      stats: Dataset feature statistics message to update in place.
    """
    if not stats.features:
        # No features at all means there is no dummy feature to read the
        # example counts from, so there is nothing to update.
        return
    dummy_feature = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_NAME)
    num_examples = stats_util.get_custom_stats(dummy_feature,
                                               _NUM_EXAMPLES_KEY)
    weighted_num_examples = stats_util.get_custom_stats(
        dummy_feature, _WEIGHTED_NUM_EXAMPLES_KEY)
    # The dummy feature only carries the counts; it must not appear in the
    # final statistics.
    stats.features.remove(dummy_feature)
    for feature_stats in stats.features:
        which_oneof_stats = feature_stats.WhichOneof('stats')
        if which_oneof_stats is None:
            # No type-specific stats are set for this feature. Writing through
            # any member of the oneof would create it, so leave the feature
            # untouched instead of fabricating stats for it.
            continue
        # Resolve the populated oneof member by name. Writing through a member
        # that is not the one currently set would switch the oneof to it and
        # discard the stats that were actually computed.
        common_stats = getattr(feature_stats, which_oneof_stats).common_stats
        common_stats.num_missing = int(num_examples -
                                       common_stats.num_non_missing)
        if weighted_num_examples != 0:
            common_stats.weighted_common_stats.num_missing = (
                weighted_num_examples -
                common_stats.weighted_common_stats.num_non_missing)
    stats.num_examples = int(num_examples)
def _update_example_and_missing_count(
        stats: statistics_pb2.DatasetFeatureStatistics) -> None:
    """Updates example count of the dataset and missing count for all features.

    The example counts are read from the custom stats stored on the dummy
    feature. The dummy feature is then removed from `stats`, and `num_missing`
    (plus the weighted `num_missing` when the weighted example count is
    non-zero) is filled in for every remaining top-level feature that has
    type-specific stats. `stats` is modified in place.

    Args:
      stats: Dataset feature statistics message to update in place. Nothing is
        done when it has no features.

    Raises:
      AssertionError: If a feature reports more non-missing examples than the
        total number of examples.
    """
    if not stats.features:
        return
    dummy_feature = stats_util.get_feature_stats(stats, _DUMMY_FEATURE_PATH)
    num_examples = stats_util.get_custom_stats(dummy_feature,
                                               _NUM_EXAMPLES_KEY)
    weighted_num_examples = stats_util.get_custom_stats(
        dummy_feature, _WEIGHTED_NUM_EXAMPLES_KEY)
    # The dummy feature only carries the counts; it must not appear in the
    # final statistics.
    stats.features.remove(dummy_feature)
    for feature_stats in stats.features:
        # For features nested under a STRUCT feature, their num_missing is computed
        # in the basic stats generator (because their num_missing is relative to
        # their parent's value count).
        if len(feature_stats.path.step) > 1:
            continue
        which_oneof_stats = feature_stats.WhichOneof('stats')
        if which_oneof_stats is None:
            # No type-specific stats are set for this feature. Writing through
            # any member of the oneof would create it, so leave the feature
            # untouched instead of fabricating stats for it.
            continue
        # Resolve the populated oneof member by name rather than falling back
        # to string_stats. Writing through a member that is not the one
        # currently set would switch the oneof to it and discard the stats
        # that were actually computed.
        common_stats = getattr(feature_stats, which_oneof_stats).common_stats
        assert num_examples >= common_stats.num_non_missing, (
            'Total number of examples: {} is less than number of non missing '
            'examples: {} for feature {}.'.format(
                num_examples, common_stats.num_non_missing,
                '.'.join(feature_stats.path.step)))
        common_stats.num_missing = int(num_examples -
                                       common_stats.num_non_missing)
        if weighted_num_examples != 0:
            common_stats.weighted_common_stats.num_missing = (
                weighted_num_examples -
                common_stats.weighted_common_stats.num_non_missing)
    stats.num_examples = int(num_examples)