def test_generate_equi_width_histogram(self):
    """Check the three-bucket equi-width histogram built from fixed quantiles."""
    # Seven quantile boundaries covering the finite range [0, 24].
    boundaries = np.array([0, 1, 5, 10, 15, 20, 24], dtype=np.float32)
    actual_histogram = quantiles_util.generate_equi_width_histogram(
        quantiles=boundaries,
        finite_min=0,
        finite_max=24,
        total_count=18,
        num_buckets=3)

    # Three buckets of width 8; the expected sample counts add up to the
    # total_count of 18 passed above.
    expected_histogram = text_format.Parse(
        " buckets { low_value: 0 high_value: 8.0 sample_count: 7.8 }"
        " buckets { low_value: 8.0 high_value: 16.0 sample_count: 4.8 }"
        " buckets { low_value: 16.0 high_value: 24.0 sample_count: 5.4 }"
        " type: STANDARD ",
        statistics_pb2.Histogram())

    self.assertEqual(actual_histogram, expected_histogram)
def _make_feature_stats_proto(numeric_stats, feature_name, quantiles_combiner,
                              num_histogram_buckets,
                              num_quantiles_histogram_buckets):
    """Build a FeatureNameStatistics proto from partial numeric statistics.

    Args:
      numeric_stats: Accumulated statistics; this function reads its `sum`,
        `sum_of_squares`, `total_num_values`, `num_zeros`, `num_nan`, `min`,
        `max`, `type` and `quantiles_summary` attributes.
      feature_name: Value stored in the `name` field of the result.
      quantiles_combiner: Object whose `extract_output` converts
        `numeric_stats.quantiles_summary` into quantiles.
      num_histogram_buckets: Bucket count passed to the equi-width histogram.
      num_quantiles_histogram_buckets: Bucket count passed to the quantiles
        histogram.

    Returns:
      A `statistics_pb2.FeatureNameStatistics` whose `num_stats` is populated
      only when the feature has at least one value; otherwise `num_stats` is
      copied from an empty `NumericStatistics`.
    """
    num_stats = statistics_pb2.NumericStatistics()
    value_count = numeric_stats.total_num_values

    # With no values there is nothing to summarize, so the numeric stats
    # message is left at its defaults.
    if value_count > 0:
        mean = numeric_stats.sum / value_count
        # Clamp at zero so a slightly negative difference never reaches sqrt.
        variance = max(
            0, (numeric_stats.sum_of_squares / value_count) - mean * mean)

        num_stats.mean = float(mean)
        num_stats.std_dev = math.sqrt(variance)
        num_stats.num_zeros = numeric_stats.num_zeros
        num_stats.min = float(numeric_stats.min)
        num_stats.max = float(numeric_stats.max)

        # The median and both histograms are derived from the same quantiles.
        quantiles = quantiles_combiner.extract_output(
            numeric_stats.quantiles_summary)
        num_stats.median = float(quantiles_util.find_median(quantiles))

        # Equi-width histogram goes first in the repeated field.
        equi_width = quantiles_util.generate_equi_width_histogram(
            quantiles, numeric_stats.min, numeric_stats.max, value_count,
            num_histogram_buckets)
        equi_width.num_nan = numeric_stats.num_nan
        num_stats.histograms.add().CopyFrom(equi_width)

        # Quantiles histogram goes second.
        by_quantile = quantiles_util.generate_quantiles_histogram(
            quantiles, numeric_stats.min, numeric_stats.max, value_count,
            num_quantiles_histogram_buckets)
        by_quantile.num_nan = numeric_stats.num_nan
        num_stats.histograms.add().CopyFrom(by_quantile)

    # Wrap the numeric statistics in the per-feature message.
    feature_stats = statistics_pb2.FeatureNameStatistics()
    feature_stats.name = feature_name
    feature_stats.type = numeric_stats.type
    feature_stats.num_stats.CopyFrom(num_stats)
    return feature_stats
def _make_numeric_stats_proto(
        numeric_stats, total_num_values, quantiles_combiner,
        num_histogram_buckets, num_quantiles_histogram_buckets, has_weights
):
    """Build a NumericStatistics proto from partial numeric statistics.

    Args:
      numeric_stats: Accumulated statistics; this function reads its `sum`,
        `sum_of_squares`, `num_zeros`, `num_nan`, `min`, `max` and
        `quantiles_summary` attributes, plus the `weighted_*` counterparts
        when `has_weights` is true.
      total_num_values: Number of values seen; the NaN count is subtracted
        from it before any statistic is computed.
      quantiles_combiner: Object whose `extract_output` converts a quantiles
        summary into quantiles.
      num_histogram_buckets: Bucket count for the equi-width histograms.
      num_quantiles_histogram_buckets: Bucket count for the quantiles
        histograms.
      has_weights: Whether to also fill in `weighted_numeric_stats`.

    Returns:
      A `statistics_pb2.NumericStatistics`. It is returned empty when no
      values remain after the NaN count has been subtracted.
    """
    num_stats = statistics_pb2.NumericStatistics()

    # NaN entries are removed from the count used as the divisor below.
    nan_count = numeric_stats.num_nan
    non_nan_count = total_num_values
    if nan_count > 0:
        non_nan_count -= nan_count

    # Nothing to summarize: hand back the default message (this also skips
    # the weighted statistics).
    if non_nan_count == 0:
        return num_stats

    mean = numeric_stats.sum / non_nan_count
    # Clamp at zero so a slightly negative difference never reaches sqrt.
    variance = max(
        0, (numeric_stats.sum_of_squares / non_nan_count) - mean * mean)

    num_stats.mean = float(mean)
    num_stats.std_dev = math.sqrt(variance)
    num_stats.num_zeros = numeric_stats.num_zeros
    num_stats.min = float(numeric_stats.min)
    num_stats.max = float(numeric_stats.max)

    # The median and both histograms are derived from the same quantiles.
    quantiles = quantiles_combiner.extract_output(
        numeric_stats.quantiles_summary)
    num_stats.median = float(quantiles_util.find_median(quantiles))

    # Equi-width histogram goes first in the repeated field.
    equi_width = quantiles_util.generate_equi_width_histogram(
        quantiles, numeric_stats.min, numeric_stats.max, non_nan_count,
        num_histogram_buckets)
    equi_width.num_nan = nan_count
    num_stats.histograms.add().CopyFrom(equi_width)

    # Quantiles histogram goes second.
    by_quantile = quantiles_util.generate_quantiles_histogram(
        quantiles, numeric_stats.min, numeric_stats.max, non_nan_count,
        num_quantiles_histogram_buckets)
    by_quantile.num_nan = nan_count
    num_stats.histograms.add().CopyFrom(by_quantile)

    if has_weights:
        weighted_stats = statistics_pb2.WeightedNumericStatistics()
        weighted_count = numeric_stats.weighted_total_num_values

        # A zero total weight would divide by zero, so both moments fall
        # back to 0 in that case.
        if weighted_count == 0:
            weighted_mean = 0
            weighted_variance = 0
        else:
            weighted_mean = numeric_stats.weighted_sum / weighted_count
            weighted_variance = max(
                0,
                (numeric_stats.weighted_sum_of_squares / weighted_count)
                - weighted_mean**2)

        weighted_stats.mean = weighted_mean
        weighted_stats.std_dev = math.sqrt(weighted_variance)

        # Weighted median and histograms come from the weighted summary.
        weighted_quantiles = quantiles_combiner.extract_output(
            numeric_stats.weighted_quantiles_summary)
        weighted_stats.median = float(
            quantiles_util.find_median(weighted_quantiles))

        # Same ordering as the unweighted case: equi-width, then quantiles.
        weighted_equi_width = quantiles_util.generate_equi_width_histogram(
            weighted_quantiles, numeric_stats.min, numeric_stats.max,
            weighted_count, num_histogram_buckets)
        weighted_equi_width.num_nan = nan_count
        weighted_stats.histograms.add().CopyFrom(weighted_equi_width)

        weighted_by_quantile = quantiles_util.generate_quantiles_histogram(
            weighted_quantiles, numeric_stats.min, numeric_stats.max,
            weighted_count, num_quantiles_histogram_buckets)
        weighted_by_quantile.num_nan = nan_count
        weighted_stats.histograms.add().CopyFrom(weighted_by_quantile)

        num_stats.weighted_numeric_stats.CopyFrom(weighted_stats)

    return num_stats
def _make_numeric_stats_proto(
        numeric_stats: _PartialNumericStats,
        total_num_values: int,
        num_histogram_buckets: int,
        num_quantiles_histogram_buckets: int,
        has_weights: bool
) -> statistics_pb2.NumericStatistics:
    """Build a NumericStatistics proto from partial numeric statistics.

    Args:
      numeric_stats: Accumulated statistics; this function reads its `sum`,
        `sum_of_squares`, `num_zeros`, `num_nan`, `min`, `max`, `finite_min`,
        `finite_max` and `quantiles_summary` attributes, plus the `weighted_*`
        counterparts when `has_weights` is true.
      total_num_values: Number of values seen; the NaN count is subtracted
        from it before any statistic is computed.
      num_histogram_buckets: Bucket count for the equi-width histograms.
      num_quantiles_histogram_buckets: Bucket count for the quantiles
        histograms.
      has_weights: Whether to also fill in `weighted_numeric_stats`.

    Returns:
      A `statistics_pb2.NumericStatistics`. When no values remain after the
      NaN count has been subtracted, it carries at most two histograms that
      record only `num_nan`.
    """
    num_stats = statistics_pb2.NumericStatistics()

    # NaN entries are removed from the count used as the divisor below.
    nan_count = numeric_stats.num_nan
    non_nan_count = total_num_values
    if nan_count > 0:
        non_nan_count -= nan_count

    if non_nan_count == 0:
        # Only NaN values (or no values at all): record num_nan on one
        # histogram of each type and leave everything else unset.
        if nan_count > 0:
            for histogram_type in (statistics_pb2.Histogram.STANDARD,
                                   statistics_pb2.Histogram.QUANTILES):
                num_stats.histograms.add(type=histogram_type).num_nan = (
                    nan_count)
        return num_stats

    mean = numeric_stats.sum / non_nan_count
    # Clamp at zero so a slightly negative difference never reaches sqrt.
    variance = max(
        0, (numeric_stats.sum_of_squares / non_nan_count) - mean * mean)

    num_stats.mean = float(mean)
    num_stats.std_dev = math.sqrt(variance)
    num_stats.num_zeros = numeric_stats.num_zeros
    num_stats.min = float(numeric_stats.min)
    num_stats.max = float(numeric_stats.max)

    # Request enough quantiles to serve both histogram kinds: the larger of
    # the quantiles-histogram bucket count and a multiple of the standard
    # histogram bucket count.
    assert numeric_stats.quantiles_summary is not None
    num_quantiles = max(
        num_quantiles_histogram_buckets,
        _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM * num_histogram_buckets)
    quantiles_array = numeric_stats.quantiles_summary.GetQuantiles(
        num_quantiles)
    quantiles = quantiles_array.flatten().to_pylist()

    num_stats.median = float(quantiles_util.find_median(quantiles))

    # Equi-width histogram goes first in the repeated field; it is built
    # over the finite min/max rather than min/max.
    equi_width = quantiles_util.generate_equi_width_histogram(
        quantiles, numeric_stats.finite_min, numeric_stats.finite_max,
        non_nan_count, num_histogram_buckets)
    equi_width.num_nan = nan_count
    num_stats.histograms.add().CopyFrom(equi_width)

    # Quantiles histogram goes second.
    by_quantile = quantiles_util.generate_quantiles_histogram(
        quantiles, non_nan_count, num_quantiles_histogram_buckets)
    by_quantile.num_nan = nan_count
    num_stats.histograms.add().CopyFrom(by_quantile)

    if has_weights:
        weighted_stats = statistics_pb2.WeightedNumericStatistics()
        weighted_count = numeric_stats.weighted_total_num_values

        # A zero total weight would divide by zero, so both moments fall
        # back to 0 in that case.
        if weighted_count == 0:
            weighted_mean = 0
            weighted_variance = 0
        else:
            weighted_mean = numeric_stats.weighted_sum / weighted_count
            weighted_variance = max(
                0,
                (numeric_stats.weighted_sum_of_squares / weighted_count)
                - weighted_mean**2)

        weighted_stats.mean = weighted_mean
        weighted_stats.std_dev = math.sqrt(weighted_variance)

        # Weighted median and histograms come from the weighted summary,
        # using the same number of quantiles as the unweighted case.
        assert numeric_stats.weighted_quantiles_summary is not None
        weighted_quantiles_array = (
            numeric_stats.weighted_quantiles_summary.GetQuantiles(
                num_quantiles))
        weighted_quantiles = weighted_quantiles_array.flatten().to_pylist()

        weighted_stats.median = float(
            quantiles_util.find_median(weighted_quantiles))

        # Same ordering as the unweighted case: equi-width, then quantiles.
        weighted_equi_width = quantiles_util.generate_equi_width_histogram(
            weighted_quantiles, numeric_stats.finite_min,
            numeric_stats.finite_max, weighted_count, num_histogram_buckets)
        weighted_equi_width.num_nan = nan_count
        weighted_stats.histograms.add().CopyFrom(weighted_equi_width)

        weighted_by_quantile = quantiles_util.generate_quantiles_histogram(
            weighted_quantiles, weighted_count,
            num_quantiles_histogram_buckets)
        weighted_by_quantile.num_nan = nan_count
        weighted_stats.histograms.add().CopyFrom(weighted_by_quantile)

        num_stats.weighted_numeric_stats.CopyFrom(weighted_stats)

    return num_stats