def _make_weighted_presence_and_valency_stats_protos(
    parent_presence_and_valency: Optional[_PresenceAndValencyStats],
    presence_and_valency: List[_PresenceAndValencyStats]
) -> List[statistics_pb2.WeightedCommonStatistics]:
    """Builds one WeightedCommonStatistics proto per nest level.

    Args:
        parent_presence_and_valency: Stats of the level above the first entry
            of `presence_and_valency`, or None when no such level is given.
        presence_and_valency: Per-level stats; every entry after the first
            uses the entry before it as its parent level.

    Returns:
        A list with one proto per entry of `presence_and_valency`, in the
        same order.
    """
    # Pair each level with the level above it: the first level with the
    # (possibly absent) parent, every later level with its predecessor.
    levels_above = itertools.chain(
        (parent_presence_and_valency,), presence_and_valency)

    protos = []
    for above, current in zip(levels_above, presence_and_valency):
        weighted_present = current.weighted_num_non_missing
        weighted_total = current.weighted_total_num_values
        proto = statistics_pb2.WeightedCommonStatistics(
            num_non_missing=weighted_present,
            tot_num_values=weighted_total)
        # The weighted missing count of a level is
        # (level_above.weighted_total_num_values -
        #  this_level.weighted_num_non_missing).
        # When there is no level above, it would be
        # weighted_num_examples - weighted_num_non_missing, which is computed
        # outside BasicStatsGenerator because num_examples is not available
        # here; the field is left unset in that case.
        if above is not None:
            proto.num_missing = (
                above.weighted_total_num_values - weighted_present)
        # Guard the average against division by zero.
        if weighted_present > 0:
            proto.avg_num_values = weighted_total / weighted_present
        protos.append(proto)
    return protos
def _make_common_stats_proto(
    common_stats: _PartialCommonStats,
    parent_common_stats: Optional[_PartialCommonStats],
    q_combiner: quantiles_util.QuantilesCombiner,
    num_values_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.CommonStatistics:
    """Builds a CommonStatistics proto from partial common stats.

    Args:
        common_stats: Partial common stats of the feature being converted.
        parent_common_stats: Partial common stats of the parent, or None.
            When None, `num_missing` is not populated (neither the plain nor
            the weighted one).
        q_combiner: Combiner whose `extract_output` is applied to
            `common_stats.num_values_summary` to obtain the quantiles for the
            num-values histogram.
        num_values_histogram_buckets: Bucket count passed to
            `quantiles_util.generate_quantiles_histogram`.
        has_weights: Whether to also populate `weighted_common_stats`.

    Returns:
        The populated statistics_pb2.CommonStatistics proto.
    """
    non_missing = common_stats.num_non_missing
    total_values = common_stats.total_num_values
    has_parent = parent_common_stats is not None

    proto = statistics_pb2.CommonStatistics(
        num_non_missing=non_missing,
        tot_num_values=total_values)
    if has_parent:
        proto.num_missing = parent_common_stats.total_num_values - non_missing

    # TODO(b/79685042): Need to decide on what is the expected values for
    # statistics like min_num_values, max_num_values, avg_num_values, when
    # all the values for the feature are missing.
    if non_missing > 0:
        proto.min_num_values = common_stats.min_num_values
        proto.max_num_values = common_stats.max_num_values
        proto.avg_num_values = total_values / non_missing

        # Attach the histogram of the number of values.
        quantiles = q_combiner.extract_output(common_stats.num_values_summary)
        proto.num_values_histogram.CopyFrom(
            quantiles_util.generate_quantiles_histogram(
                quantiles, non_missing, num_values_histogram_buckets))

    if not has_weights:
        return proto

    # Weighted counterpart of the counts above.
    weighted_non_missing = common_stats.weighted_num_non_missing
    weighted_total = common_stats.weighted_total_num_values
    weighted_proto = statistics_pb2.WeightedCommonStatistics(
        num_non_missing=weighted_non_missing,
        tot_num_values=weighted_total)
    if has_parent:
        weighted_proto.num_missing = (
            parent_common_stats.weighted_total_num_values -
            weighted_non_missing)
    # Guard the average against division by zero.
    if weighted_non_missing > 0:
        weighted_proto.avg_num_values = weighted_total / weighted_non_missing
    proto.weighted_common_stats.CopyFrom(weighted_proto)
    return proto
def _make_feature_stats_proto(common_stats, feature_name, q_combiner,
                              num_values_histogram_buckets, is_categorical,
                              has_weights):
    """Builds a FeatureNameStatistics proto from partial common stats.

    Args:
        common_stats: The partial common stats associated with a feature.
        feature_name: The name of the feature.
        q_combiner: The quantiles combiner whose `extract_output` yields the
            quantiles used for the histogram of the number of values.
        num_values_histogram_buckets: Number of buckets in the quantiles
            histogram for the number of values per feature.
        is_categorical: A boolean indicating whether the feature is
            categorical.
        has_weights: A boolean indicating whether a weight feature is
            specified.

    Returns:
        A statistics_pb2.FeatureNameStatistics proto.
    """
    non_missing = common_stats.num_non_missing
    common_proto = statistics_pb2.CommonStatistics(
        num_non_missing=non_missing,
        num_missing=common_stats.num_missing,
        tot_num_values=common_stats.total_num_values)

    # Min/max/avg and the histogram are only filled in when at least one
    # non-missing entry exists (this also avoids dividing by zero).
    if non_missing > 0:
        lowest = common_stats.min_num_values
        highest = common_stats.max_num_values
        common_proto.min_num_values = lowest
        common_proto.max_num_values = highest
        common_proto.avg_num_values = (
            common_stats.total_num_values / non_missing)

        # Attach the histogram of the number of values.
        quantiles = q_combiner.extract_output(common_stats.num_values_summary)
        common_proto.num_values_histogram.CopyFrom(
            quantiles_util.generate_quantiles_histogram(
                quantiles, lowest, highest, non_missing,
                num_values_histogram_buckets))

    # Weighted counterpart of the counts above.
    if has_weights:
        weighted_non_missing = common_stats.weighted_num_non_missing
        weighted_total = common_stats.weighted_total_num_values
        weighted_proto = statistics_pb2.WeightedCommonStatistics(
            num_non_missing=weighted_non_missing,
            num_missing=common_stats.weighted_num_missing,
            tot_num_values=weighted_total)
        if weighted_non_missing > 0:
            weighted_proto.avg_num_values = (
                weighted_total / weighted_non_missing)
        common_proto.weighted_common_stats.CopyFrom(weighted_proto)

    feature_proto_cls = statistics_pb2.FeatureNameStatistics
    result = feature_proto_cls()
    result.name = feature_name

    # Type selection: a categorical feature keeps the INT type; a feature
    # whose type is None (per the original note, a completely missing
    # feature) is reported as STRING; otherwise the recorded type is used.
    if is_categorical:
        result.type = feature_proto_cls.INT
    elif common_stats.type is None:
        result.type = feature_proto_cls.STRING
    else:
        result.type = common_stats.type

    # STRING and categorical features carry the common stats inside string
    # stats; every other feature carries them inside numeric stats.
    use_string_stats = (
        is_categorical or result.type == feature_proto_cls.STRING)
    if use_string_stats:
        wrapper = statistics_pb2.StringStatistics()
    else:
        wrapper = statistics_pb2.NumericStatistics()
    wrapper.common_stats.CopyFrom(common_proto)

    destination = result.string_stats if use_string_stats else result.num_stats
    destination.CopyFrom(wrapper)
    return result
def _make_common_stats_proto(
    common_stats: _PartialCommonStats,
    parent_common_stats: Optional[_PartialCommonStats],
    make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
    num_values_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.CommonStatistics:
    """Builds a CommonStatistics proto from partial common stats.

    Args:
        common_stats: Partial common stats of the feature being converted.
            Its `presence_and_valency_stats` is either None or a list of
            per-level stats whose first entry is the top level.
        parent_common_stats: Partial common stats of the parent, or None.
            When None, `num_missing` is not populated (neither the plain nor
            the weighted one).
        make_quantiles_sketch_fn: Factory handed to `_PresenceAndValencyStats`
            when a stand-in stats object has to be constructed because the
            corresponding `presence_and_valency_stats` is None.
        num_values_histogram_buckets: Bucket count used both for the quantile
            query and for `quantiles_util.generate_quantiles_histogram`.
        has_weights: Whether to also populate the weighted fields.

    Returns:
        The populated statistics_pb2.CommonStatistics proto.
    """
    proto = statistics_pb2.CommonStatistics()

    # Resolve the level directly above this feature: the last level of the
    # parent, a freshly constructed stand-in when the parent has no levels,
    # or nothing at all when there is no parent.
    if parent_common_stats is None:
        parent_level = None
    elif parent_common_stats.presence_and_valency_stats is None:
        parent_level = _PresenceAndValencyStats(make_quantiles_sketch_fn)
    else:
        parent_level = parent_common_stats.presence_and_valency_stats[-1]

    levels = common_stats.presence_and_valency_stats

    # Per-level protos are only emitted for more than one level; for a
    # 1-nested feature the CommonStatistics itself already contains the
    # presence and valency.
    if levels is not None and len(levels) > 1:
        proto.presence_and_valency_stats.extend(
            _make_presence_and_valency_stats_protos(parent_level, levels))
        if has_weights:
            proto.weighted_presence_and_valency_stats.extend(
                _make_weighted_presence_and_valency_stats_protos(
                    parent_level, levels))

    if levels is None:
        top_level = _PresenceAndValencyStats(make_quantiles_sketch_fn)
    else:
        top_level = levels[0]

    non_missing = top_level.num_non_missing
    total_values = top_level.total_num_values

    proto.num_non_missing = non_missing
    if parent_level is not None:
        proto.num_missing = parent_level.total_num_values - non_missing
    proto.tot_num_values = total_values

    # TODO(b/79685042): Need to decide on what is the expected values for
    # statistics like min_num_values, max_num_values, avg_num_values, when
    # all the values for the feature are missing.
    if non_missing > 0:
        proto.min_num_values = top_level.min_num_values
        proto.max_num_values = top_level.max_num_values
        proto.avg_num_values = total_values / non_missing

        sketch = top_level.num_values_summary
        if sketch is not None:
            # Attach the histogram of the number of values.
            quantiles = sketch.GetQuantiles(
                num_values_histogram_buckets).flatten().to_pylist()
            proto.num_values_histogram.CopyFrom(
                quantiles_util.generate_quantiles_histogram(
                    quantiles, non_missing, num_values_histogram_buckets))

    if not has_weights:
        return proto

    # Weighted counterpart of the top-level counts above.
    weighted_non_missing = top_level.weighted_num_non_missing
    weighted_total = top_level.weighted_total_num_values
    weighted_proto = statistics_pb2.WeightedCommonStatistics(
        num_non_missing=weighted_non_missing,
        tot_num_values=weighted_total)
    if parent_level is not None:
        weighted_proto.num_missing = (
            parent_level.weighted_total_num_values - weighted_non_missing)
    # Guard the average against division by zero.
    if weighted_non_missing > 0:
        weighted_proto.avg_num_values = weighted_total / weighted_non_missing
    proto.weighted_common_stats.CopyFrom(weighted_proto)
    return proto