def _make_numeric_stats_proto(
    numeric_stats,
    total_num_values,
    quantiles_combiner,
    num_histogram_buckets,
    num_quantiles_histogram_buckets,
    has_weights
):
    """Build a NumericStatistics proto from partial numeric statistics.

    Args:
      numeric_stats: The partial numeric stats; read for its sums, min/max,
        zero and NaN counts, and (weighted) quantiles summaries.
      total_num_values: The number of values of the feature. The NaN count
        is subtracted from it before any statistic is computed.
      quantiles_combiner: Object whose `extract_output` converts a quantiles
        summary into quantiles.
      num_histogram_buckets: Bucket count passed to the equi-width histogram
        generator.
      num_quantiles_histogram_buckets: Bucket count passed to the quantiles
        histogram generator.
      has_weights: Whether weighted statistics should be added as well.

    Returns:
      A statistics_pb2.NumericStatistics proto. It is returned empty when no
      values remain after discounting NaNs.
    """
    stats_proto = statistics_pb2.NumericStatistics()

    # NaNs do not count towards the number of values used below.
    nan_count = numeric_stats.num_nan
    if nan_count > 0:
        total_num_values -= nan_count

    # Nothing to report unless at least one value is left.
    if total_num_values == 0:
        return stats_proto

    mean = numeric_stats.sum / total_num_values
    mean_of_squares = numeric_stats.sum_of_squares / total_num_values
    # Clamped at zero so that the square root below is always defined.
    variance = max(0, mean_of_squares - mean * mean)

    stats_proto.mean = float(mean)
    stats_proto.std_dev = math.sqrt(variance)
    stats_proto.num_zeros = numeric_stats.num_zeros
    stats_proto.min = float(numeric_stats.min)
    stats_proto.max = float(numeric_stats.max)

    # The quantiles drive the median and both histograms.
    quantiles = quantiles_combiner.extract_output(
        numeric_stats.quantiles_summary)
    stats_proto.median = float(quantiles_util.find_median(quantiles))

    # First histogram: equi-width buckets.
    equi_width_histogram = quantiles_util.generate_equi_width_histogram(
        quantiles,
        numeric_stats.min,
        numeric_stats.max,
        total_num_values,
        num_histogram_buckets)
    equi_width_histogram.num_nan = nan_count
    stats_proto.histograms.add().CopyFrom(equi_width_histogram)

    # Second histogram: quantile buckets.
    quantiles_histogram = quantiles_util.generate_quantiles_histogram(
        quantiles,
        numeric_stats.min,
        numeric_stats.max,
        total_num_values,
        num_quantiles_histogram_buckets)
    quantiles_histogram.num_nan = nan_count
    stats_proto.histograms.add().CopyFrom(quantiles_histogram)

    if not has_weights:
        return stats_proto

    # Weighted counterpart of everything above.
    weighted_proto = statistics_pb2.WeightedNumericStatistics()
    weighted_total = numeric_stats.weighted_total_num_values
    if weighted_total != 0:
        weighted_mean = numeric_stats.weighted_sum / weighted_total
        weighted_variance = max(
            0,
            (numeric_stats.weighted_sum_of_squares / weighted_total)
            - weighted_mean**2)
    else:
        # A zero total weight would divide by zero; report zeros instead.
        weighted_mean = 0
        weighted_variance = 0
    weighted_proto.mean = weighted_mean
    weighted_proto.std_dev = math.sqrt(weighted_variance)

    weighted_quantiles = quantiles_combiner.extract_output(
        numeric_stats.weighted_quantiles_summary)
    weighted_proto.median = float(
        quantiles_util.find_median(weighted_quantiles))

    # Weighted equi-width histogram.
    weighted_equi_width = quantiles_util.generate_equi_width_histogram(
        weighted_quantiles,
        numeric_stats.min,
        numeric_stats.max,
        weighted_total,
        num_histogram_buckets)
    weighted_equi_width.num_nan = nan_count
    weighted_proto.histograms.extend([weighted_equi_width])

    # Weighted quantiles histogram.
    weighted_quantiles_histogram = (
        quantiles_util.generate_quantiles_histogram(
            weighted_quantiles,
            numeric_stats.min,
            numeric_stats.max,
            weighted_total,
            num_quantiles_histogram_buckets))
    weighted_quantiles_histogram.num_nan = nan_count
    weighted_proto.histograms.extend([weighted_quantiles_histogram])

    stats_proto.weighted_numeric_stats.CopyFrom(weighted_proto)
    return stats_proto
def _make_numeric_stats_proto(
    numeric_stats: _PartialNumericStats,
    total_num_values: int,
    num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.NumericStatistics:
    """Build a NumericStatistics proto from partial numeric statistics.

    Args:
      numeric_stats: The partial numeric stats; read for its sums, min/max,
        finite min/max, zero and NaN counts, and (weighted) quantiles
        summaries.
      total_num_values: The number of values of the feature. The NaN count
        is subtracted from it before any statistic is computed.
      num_histogram_buckets: Bucket count of the equi-width histogram.
      num_quantiles_histogram_buckets: Bucket count of the quantiles
        histogram.
      has_weights: Whether weighted statistics should be added as well.

    Returns:
      The populated proto. When no values remain after discounting NaNs, it
      carries at most two histograms that hold only the NaN count.
    """
    stats_proto = statistics_pb2.NumericStatistics()

    # NaNs do not count towards the number of values used below.
    nan_count = numeric_stats.num_nan
    if nan_count > 0:
        total_num_values -= nan_count

    if total_num_values == 0:
        # Only NaNs were seen (if anything): record just the NaN count, once
        # per histogram type.
        if nan_count > 0:
            for histogram_type in (statistics_pb2.Histogram.STANDARD,
                                   statistics_pb2.Histogram.QUANTILES):
                stats_proto.histograms.add(
                    type=histogram_type).num_nan = nan_count
        return stats_proto

    mean = numeric_stats.sum / total_num_values
    mean_of_squares = numeric_stats.sum_of_squares / total_num_values
    # Clamped at zero so that the square root below is always defined.
    variance = max(0, mean_of_squares - mean * mean)

    stats_proto.mean = float(mean)
    stats_proto.std_dev = math.sqrt(variance)
    stats_proto.num_zeros = numeric_stats.num_zeros
    stats_proto.min = float(numeric_stats.min)
    stats_proto.max = float(numeric_stats.max)

    assert numeric_stats.quantiles_summary is not None
    # One request must serve both histograms, so ask for the larger of the
    # two quantile counts they need.
    num_quantiles = max(
        num_quantiles_histogram_buckets,
        _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM * num_histogram_buckets)
    quantiles_array = numeric_stats.quantiles_summary.GetQuantiles(
        num_quantiles)
    quantiles = quantiles_array.flatten().to_pylist()

    stats_proto.median = float(quantiles_util.find_median(quantiles))

    # First histogram: equi-width buckets over the finite value range.
    equi_width_histogram = quantiles_util.generate_equi_width_histogram(
        quantiles,
        numeric_stats.finite_min,
        numeric_stats.finite_max,
        total_num_values,
        num_histogram_buckets)
    equi_width_histogram.num_nan = nan_count
    stats_proto.histograms.add().CopyFrom(equi_width_histogram)

    # Second histogram: quantile buckets.
    quantiles_histogram = quantiles_util.generate_quantiles_histogram(
        quantiles, total_num_values, num_quantiles_histogram_buckets)
    quantiles_histogram.num_nan = nan_count
    stats_proto.histograms.add().CopyFrom(quantiles_histogram)

    if not has_weights:
        return stats_proto

    # Weighted counterpart of everything above.
    weighted_proto = statistics_pb2.WeightedNumericStatistics()
    weighted_total = numeric_stats.weighted_total_num_values
    if weighted_total != 0:
        weighted_mean = numeric_stats.weighted_sum / weighted_total
        weighted_variance = max(
            0,
            (numeric_stats.weighted_sum_of_squares / weighted_total)
            - weighted_mean**2)
    else:
        # A zero total weight would divide by zero; report zeros instead.
        weighted_mean = 0
        weighted_variance = 0
    weighted_proto.mean = weighted_mean
    weighted_proto.std_dev = math.sqrt(weighted_variance)

    assert numeric_stats.weighted_quantiles_summary is not None
    weighted_quantiles_array = (
        numeric_stats.weighted_quantiles_summary.GetQuantiles(num_quantiles))
    weighted_quantiles = weighted_quantiles_array.flatten().to_pylist()

    weighted_proto.median = float(
        quantiles_util.find_median(weighted_quantiles))

    # Weighted equi-width histogram.
    weighted_equi_width = quantiles_util.generate_equi_width_histogram(
        weighted_quantiles,
        numeric_stats.finite_min,
        numeric_stats.finite_max,
        weighted_total,
        num_histogram_buckets)
    weighted_equi_width.num_nan = nan_count
    weighted_proto.histograms.extend([weighted_equi_width])

    # Weighted quantiles histogram.
    weighted_quantiles_histogram = (
        quantiles_util.generate_quantiles_histogram(
            weighted_quantiles,
            weighted_total,
            num_quantiles_histogram_buckets))
    weighted_quantiles_histogram.num_nan = nan_count
    weighted_proto.histograms.extend([weighted_quantiles_histogram])

    stats_proto.weighted_numeric_stats.CopyFrom(weighted_proto)
    return stats_proto
def _make_feature_stats_proto(
    common_stats,
    feature_name,
    q_combiner,
    num_values_histogram_buckets,
    is_categorical,
    has_weights
):
    """Build a FeatureNameStatistics proto from partial common statistics.

    Args:
      common_stats: The partial common stats associated with a feature.
      feature_name: The name of the feature.
      q_combiner: The quantiles combiner whose `extract_output` yields the
        quantiles of the number of values per feature.
      num_values_histogram_buckets: Number of buckets in the quantiles
        histogram for the number of values per feature.
      is_categorical: A boolean indicating whether the feature is
        categorical.
      has_weights: A boolean indicating whether a weight feature is
        specified.

    Returns:
      A statistics_pb2.FeatureNameStatistics proto whose common stats are
      wrapped in string stats (STRING or categorical features) or in
      numeric stats (all other features).
    """
    common_proto = statistics_pb2.CommonStatistics()
    non_missing = common_stats.num_non_missing
    common_proto.num_non_missing = non_missing
    common_proto.num_missing = common_stats.num_missing
    common_proto.tot_num_values = common_stats.total_num_values

    # Per-example value counts are only set when the feature was present
    # at least once.
    if non_missing > 0:
        common_proto.min_num_values = common_stats.min_num_values
        common_proto.max_num_values = common_stats.max_num_values
        common_proto.avg_num_values = (
            common_stats.total_num_values / non_missing)

        # Quantiles histogram of the number of values.
        value_count_quantiles = q_combiner.extract_output(
            common_stats.num_values_summary)
        value_count_histogram = quantiles_util.generate_quantiles_histogram(
            value_count_quantiles,
            common_stats.min_num_values,
            common_stats.max_num_values,
            non_missing,
            num_values_histogram_buckets)
        common_proto.num_values_histogram.CopyFrom(value_count_histogram)

    # Weighted common stats.
    if has_weights:
        weighted_non_missing = common_stats.weighted_num_non_missing
        weighted_proto = statistics_pb2.WeightedCommonStatistics(
            num_non_missing=weighted_non_missing,
            num_missing=common_stats.weighted_num_missing,
            tot_num_values=common_stats.weighted_total_num_values)
        if weighted_non_missing > 0:
            weighted_proto.avg_num_values = (
                common_stats.weighted_total_num_values /
                weighted_non_missing)
        common_proto.weighted_common_stats.CopyFrom(weighted_proto)

    feature_stats_cls = statistics_pb2.FeatureNameStatistics
    feature_proto = feature_stats_cls()
    feature_proto.name = feature_name

    # Type selection: categorical features are reported as INT; a feature
    # whose type was never inferred (`common_stats.type` is None) is
    # reported as STRING; otherwise the inferred type is used.
    if is_categorical:
        feature_proto.type = feature_stats_cls.INT
    elif common_stats.type is not None:
        feature_proto.type = common_stats.type
    else:
        feature_proto.type = feature_stats_cls.STRING

    # Wrap the common stats in the stats message matching the feature kind.
    if is_categorical or feature_proto.type == feature_stats_cls.STRING:
        string_stats = statistics_pb2.StringStatistics()
        string_stats.common_stats.CopyFrom(common_proto)
        feature_proto.string_stats.CopyFrom(string_stats)
    else:
        numeric_stats = statistics_pb2.NumericStatistics()
        numeric_stats.common_stats.CopyFrom(common_proto)
        feature_proto.num_stats.CopyFrom(numeric_stats)

    return feature_proto
def _make_common_stats_proto(
    common_stats: _PartialCommonStats,
    parent_common_stats: Optional[_PartialCommonStats],
    make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
    num_values_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.CommonStatistics:
    """Build a CommonStatistics proto from partial common statistics.

    Args:
      common_stats: The partial common stats of the feature.
      parent_common_stats: The partial common stats of the parent feature,
        or None. When given, it is used to derive the missing counts.
      make_quantiles_sketch_fn: Factory passed to `_PresenceAndValencyStats`
        when a placeholder has to be created because presence and valency
        stats are None.
      num_values_histogram_buckets: Number of buckets in the quantiles
        histogram for the number of values.
      has_weights: Whether weighted statistics should be added as well.

    Returns:
      The populated statistics_pb2.CommonStatistics proto.
    """
    stats_proto = statistics_pb2.CommonStatistics()

    # Innermost level of the parent, or a fresh placeholder when the parent
    # has no presence and valency stats.
    if parent_common_stats is None:
        parent_level = None
    elif parent_common_stats.presence_and_valency_stats is None:
        parent_level = _PresenceAndValencyStats(make_quantiles_sketch_fn)
    else:
        parent_level = parent_common_stats.presence_and_valency_stats[-1]

    all_levels = common_stats.presence_and_valency_stats
    # A single level is already fully described by the top-level fields set
    # below, so per-level stats are emitted only when there are several.
    if all_levels is not None and len(all_levels) > 1:
        stats_proto.presence_and_valency_stats.extend(
            _make_presence_and_valency_stats_protos(
                parent_level, all_levels))
        if has_weights:
            stats_proto.weighted_presence_and_valency_stats.extend(
                _make_weighted_presence_and_valency_stats_protos(
                    parent_level, all_levels))

    # Outermost level of this feature, or a fresh placeholder.
    if all_levels is None:
        top_level = _PresenceAndValencyStats(make_quantiles_sketch_fn)
    else:
        top_level = all_levels[0]

    non_missing = top_level.num_non_missing
    stats_proto.num_non_missing = non_missing
    if parent_level is not None:
        stats_proto.num_missing = parent_level.total_num_values - non_missing
    stats_proto.tot_num_values = top_level.total_num_values

    # TODO(b/79685042): Need to decide on what is the expected values for
    # statistics like min_num_values, max_num_values, avg_num_values, when
    # all the values for the feature are missing.
    if non_missing > 0:
        stats_proto.min_num_values = top_level.min_num_values
        stats_proto.max_num_values = top_level.max_num_values
        stats_proto.avg_num_values = top_level.total_num_values / non_missing

        summary = top_level.num_values_summary
        if summary is not None:
            # Quantiles histogram of the number of values.
            quantiles_array = summary.GetQuantiles(
                num_values_histogram_buckets)
            value_count_quantiles = quantiles_array.flatten().to_pylist()
            value_count_histogram = (
                quantiles_util.generate_quantiles_histogram(
                    value_count_quantiles,
                    non_missing,
                    num_values_histogram_buckets))
            stats_proto.num_values_histogram.CopyFrom(value_count_histogram)

    # Weighted common stats.
    if has_weights:
        weighted_non_missing = top_level.weighted_num_non_missing
        weighted_proto = statistics_pb2.WeightedCommonStatistics(
            num_non_missing=weighted_non_missing,
            tot_num_values=top_level.weighted_total_num_values)
        if parent_level is not None:
            weighted_proto.num_missing = (
                parent_level.weighted_total_num_values -
                weighted_non_missing)
        if weighted_non_missing > 0:
            weighted_proto.avg_num_values = (
                top_level.weighted_total_num_values / weighted_non_missing)
        stats_proto.weighted_common_stats.CopyFrom(weighted_proto)

    return stats_proto