def _make_string_stats_proto(string_stats, total_num_values):
  """Build a StringStatistics proto from partial string statistics.

  Args:
    string_stats: The partial string stats. Only its `total_bytes_length`
        attribute is read, and only when `total_num_values` is positive.
    total_num_values: The divisor used to compute the average length.

  Returns:
    A statistics_pb2.StringStatistics proto. Its `avg_length` is
    `total_bytes_length / total_num_values` when `total_num_values > 0`;
    otherwise the proto is returned with no field assigned.
  """
  string_stats_proto = statistics_pb2.StringStatistics()

  # Guard clause: without a positive count there is nothing to average over
  # (this also avoids a division by zero).
  if not total_num_values > 0:
    return string_stats_proto

  string_stats_proto.avg_length = (
      string_stats.total_bytes_length / total_num_values)
  return string_stats_proto
Example #2
0
def _make_feature_stats_proto(common_stats, feature_name, q_combiner,
                              num_values_histogram_buckets, is_categorical,
                              has_weights):
    """Build a FeatureNameStatistics proto from partial common stats.

    The common statistics (optionally including weighted common statistics)
    are assembled first and then wrapped in either string or numeric
    statistics, depending on the resolved feature type.

    Args:
        common_stats: The partial common stats associated with a feature.
        feature_name: The name of the feature.
        q_combiner: The quantiles combiner; its `extract_output` is applied
            to `common_stats.num_values_summary` to obtain the quantiles
            used for the number-of-values histogram.
        num_values_histogram_buckets: Number of buckets in the quantiles
            histogram for the number of values per feature.
        is_categorical: A boolean indicating whether the feature is
            categorical.
        has_weights: A boolean indicating whether a weight feature is
            specified.

    Returns:
        A statistics_pb2.FeatureNameStatistics proto.
    """
    feature_stats_cls = statistics_pb2.FeatureNameStatistics

    # --- Common statistics -------------------------------------------------
    common_proto = statistics_pb2.CommonStatistics()
    non_missing_count = common_stats.num_non_missing
    common_proto.num_non_missing = non_missing_count
    common_proto.num_missing = common_stats.num_missing
    common_proto.tot_num_values = common_stats.total_num_values

    # Min/max/avg and the histogram are only populated when at least one
    # example actually has the feature.
    if non_missing_count > 0:
        smallest = common_stats.min_num_values
        largest = common_stats.max_num_values
        common_proto.min_num_values = smallest
        common_proto.max_num_values = largest
        common_proto.avg_num_values = (
            common_stats.total_num_values / non_missing_count)

        # Quantiles over the per-example value counts feed the histogram.
        quantiles = q_combiner.extract_output(common_stats.num_values_summary)
        common_proto.num_values_histogram.CopyFrom(
            quantiles_util.generate_quantiles_histogram(
                quantiles, smallest, largest, non_missing_count,
                num_values_histogram_buckets))

    # --- Weighted common statistics (only when a weight feature exists) ----
    if has_weights:
        weighted_non_missing = common_stats.weighted_num_non_missing
        weighted_total = common_stats.weighted_total_num_values
        weighted_proto = statistics_pb2.WeightedCommonStatistics(
            num_non_missing=weighted_non_missing,
            num_missing=common_stats.weighted_num_missing,
            tot_num_values=weighted_total)

        if weighted_non_missing > 0:
            weighted_proto.avg_num_values = (
                weighted_total / weighted_non_missing)

        common_proto.weighted_common_stats.CopyFrom(weighted_proto)

    # --- Feature-level proto -----------------------------------------------
    result = feature_stats_cls()
    result.name = feature_name

    # Resolve the feature type:
    #   * categorical features keep the INT type;
    #   * a feature whose type is None (per the original code comment, a
    #     completely missing feature) is assumed to be STRING;
    #   * otherwise the type recorded in the partial stats is used as is.
    if is_categorical:
        result.type = feature_stats_cls.INT
    else:
        recorded_type = common_stats.type
        result.type = (feature_stats_cls.STRING
                       if recorded_type is None else recorded_type)

    # Categorical and STRING features carry their common stats inside string
    # stats; every other type carries them inside numeric stats.
    if is_categorical or result.type == feature_stats_cls.STRING:
        wrapper = statistics_pb2.StringStatistics()
        destination = result.string_stats
    else:
        wrapper = statistics_pb2.NumericStatistics()
        destination = result.num_stats
    wrapper.common_stats.CopyFrom(common_proto)
    destination.CopyFrom(wrapper)

    return result