Ejemplo n.º 1
0
def _run_quantiles_combiner_test(test: absltest.TestCase,
                                 q_combiner: quantiles_util.QuantilesCombiner,
                                 batches: List[List[np.ndarray]],
                                 expected_result: np.ndarray):
  """Runs `q_combiner` over `batches` and verifies the extracted output.

  Every batch is added to its own freshly created accumulator. The per-batch
  accumulators are then merged, and the output extracted from the merged
  accumulator is compared against `expected_result`.

  Args:
    test: Test case whose assertion methods perform the checks.
    q_combiner: Combiner under test; its `create_accumulator`, `add_input`,
      `merge_accumulators` and `extract_output` methods are exercised.
    batches: Input batches; one accumulator is built per batch.
    expected_result: Expected output. The dtype and size must match exactly,
      and the elements are compared with `assertAlmostEqual`.
  """
  accumulators = []
  for batch in batches:
    fresh_accumulator = q_combiner.create_accumulator()
    accumulators.append(q_combiner.add_input(fresh_accumulator, batch))

  merged = q_combiner.merge_accumulators(accumulators)
  actual = q_combiner.extract_output(merged)

  test.assertEqual(actual.dtype, expected_result.dtype)
  test.assertEqual(actual.size, expected_result.size)
  # Compare position by position, indexing both arrays with the same index.
  for idx in range(expected_result.size):
    test.assertAlmostEqual(actual[idx], expected_result[idx])
Ejemplo n.º 2
0
def _make_common_stats_proto(
    common_stats: _PartialCommonStats,
    parent_common_stats: Optional[_PartialCommonStats],
    q_combiner: quantiles_util.QuantilesCombiner,
    num_values_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.CommonStatistics:
  """Builds a CommonStatistics proto from partially computed common stats.

  Args:
    common_stats: Partial common stats to convert.
    parent_common_stats: Partial common stats of the parent, or None. When
      given, the missing counts are computed as the parent's total number of
      values minus this feature's number of non-missing entries.
    q_combiner: Combiner used to extract quantiles from
      `common_stats.num_values_summary`.
    num_values_histogram_buckets: Bucket count passed to the quantiles
      histogram generator for the num-values histogram.
    has_weights: Whether to also populate `weighted_common_stats`.

  Returns:
    The populated CommonStatistics proto.
  """
  non_missing = common_stats.num_non_missing

  proto = statistics_pb2.CommonStatistics()
  proto.num_non_missing = non_missing
  if parent_common_stats is not None:
    proto.num_missing = parent_common_stats.total_num_values - non_missing
  total_values = common_stats.total_num_values
  proto.tot_num_values = total_values

  # TODO(b/79685042): Need to decide on what is the expected values for
  # statistics like min_num_values, max_num_values, avg_num_values, when
  # all the values for the feature are missing.
  if non_missing > 0:
    proto.min_num_values = common_stats.min_num_values
    proto.max_num_values = common_stats.max_num_values
    proto.avg_num_values = total_values / non_missing

    # The num-values histogram is derived from the quantiles summary.
    proto.num_values_histogram.CopyFrom(
        quantiles_util.generate_quantiles_histogram(
            q_combiner.extract_output(common_stats.num_values_summary),
            non_missing, num_values_histogram_buckets))

  if not has_weights:
    return proto

  # Weighted counterpart of the stats above.
  weighted_non_missing = common_stats.weighted_num_non_missing
  weighted_total = common_stats.weighted_total_num_values
  weighted_proto = statistics_pb2.WeightedCommonStatistics(
      num_non_missing=weighted_non_missing,
      tot_num_values=weighted_total)
  if parent_common_stats is not None:
    weighted_proto.num_missing = (
        parent_common_stats.weighted_total_num_values - weighted_non_missing)
  # The average is only set when there is a positive weighted count.
  if weighted_non_missing > 0:
    weighted_proto.avg_num_values = weighted_total / weighted_non_missing

  proto.weighted_common_stats.CopyFrom(weighted_proto)
  return proto
def _make_numeric_stats_proto(
        numeric_stats: _PartialNumericStats, total_num_values: int,
        quantiles_combiner: quantiles_util.QuantilesCombiner,
        num_histogram_buckets: int, num_quantiles_histogram_buckets: int,
        has_weights: bool) -> statistics_pb2.NumericStatistics:
    """Builds a NumericStatistics proto from partial numeric statistics.

    NaN values are subtracted from `total_num_values` before any statistic is
    computed. If nothing remains, the returned proto carries at most two
    histograms (STANDARD and QUANTILES) holding only `num_nan`.

    Args:
        numeric_stats: Partial numeric stats to convert.
        total_num_values: Total number of values, including NaNs.
        quantiles_combiner: Combiner used to extract quantiles from the
            (weighted) quantiles summaries.
        num_histogram_buckets: Bucket count for the equi-width histograms.
        num_quantiles_histogram_buckets: Bucket count for the quantiles
            histograms.
        has_weights: Whether to also populate `weighted_numeric_stats`.

    Returns:
        The populated NumericStatistics proto.
    """
    proto = statistics_pb2.NumericStatistics()
    nan_count = numeric_stats.num_nan

    if nan_count > 0:
        total_num_values -= nan_count

    if total_num_values == 0:
        # Only NaNs (or nothing at all): record num_nan and stop.
        if nan_count > 0:
            for histogram_type in (statistics_pb2.Histogram.STANDARD,
                                   statistics_pb2.Histogram.QUANTILES):
                proto.histograms.add(type=histogram_type).num_nan = nan_count
        return proto

    mean = numeric_stats.sum / total_num_values
    mean_of_squares = numeric_stats.sum_of_squares / total_num_values
    # Clamp at zero so the square root below never sees a negative value.
    variance = max(0, mean_of_squares - mean * mean)
    proto.mean = float(mean)
    proto.std_dev = math.sqrt(variance)
    proto.num_zeros = numeric_stats.num_zeros
    lowest = numeric_stats.min
    proto.min = float(lowest)
    highest = numeric_stats.max
    proto.max = float(highest)

    # The median and both histograms are derived from the same quantiles.
    quantiles = quantiles_combiner.extract_output(
        numeric_stats.quantiles_summary)
    proto.median = float(quantiles_util.find_median(quantiles))

    # Equi-width histogram.
    equi_width = quantiles_util.generate_equi_width_histogram(
        quantiles, lowest, highest, total_num_values, num_histogram_buckets)
    equi_width.num_nan = nan_count
    proto.histograms.add().CopyFrom(equi_width)

    # Quantiles histogram.
    by_quantile = quantiles_util.generate_quantiles_histogram(
        quantiles, lowest, highest, total_num_values,
        num_quantiles_histogram_buckets)
    by_quantile.num_nan = nan_count
    proto.histograms.add().CopyFrom(by_quantile)

    if not has_weights:
        return proto

    # Weighted counterpart of the stats above.
    weighted_proto = statistics_pb2.WeightedNumericStatistics()
    weighted_total = numeric_stats.weighted_total_num_values

    if weighted_total == 0:
        weighted_mean = 0
        weighted_variance = 0
    else:
        weighted_mean = numeric_stats.weighted_sum / weighted_total
        weighted_variance = max(
            0,
            numeric_stats.weighted_sum_of_squares / weighted_total -
            weighted_mean**2)
    weighted_proto.mean = weighted_mean
    weighted_proto.std_dev = math.sqrt(weighted_variance)

    weighted_quantiles = quantiles_combiner.extract_output(
        numeric_stats.weighted_quantiles_summary)
    weighted_proto.median = float(
        quantiles_util.find_median(weighted_quantiles))

    # Weighted equi-width histogram; uses the same min/max as the unweighted
    # one and also carries the NaN count.
    weighted_equi_width = quantiles_util.generate_equi_width_histogram(
        weighted_quantiles, lowest, highest, weighted_total,
        num_histogram_buckets)
    weighted_equi_width.num_nan = nan_count
    weighted_proto.histograms.extend([weighted_equi_width])

    # Weighted quantiles histogram.
    weighted_by_quantile = quantiles_util.generate_quantiles_histogram(
        weighted_quantiles, lowest, highest, weighted_total,
        num_quantiles_histogram_buckets)
    weighted_by_quantile.num_nan = nan_count
    weighted_proto.histograms.extend([weighted_by_quantile])

    proto.weighted_numeric_stats.CopyFrom(weighted_proto)
    return proto