Beispiel #1
0
def _make_numeric_stats_proto(
    numeric_stats,
    total_num_values,
    quantiles_combiner,
    num_histogram_buckets,
    num_quantiles_histogram_buckets,
    has_weights
    ):
  """Convert the partial numeric statistics into NumericStatistics proto.

  Args:
    numeric_stats: The partial numeric stats of a feature. This function
        reads its num_nan, sum, sum_of_squares, num_zeros, min, max and
        quantiles_summary attributes, plus the weighted_* counterparts when
        has_weights is true.
    total_num_values: Number of values seen for the feature. The NaN count
        is subtracted from it below, so it is expected to include NaNs.
    quantiles_combiner: Object whose extract_output() turns a quantiles
        summary into the quantiles used for the median and the histograms.
    num_histogram_buckets: Number of buckets requested for the equi-width
        histograms.
    num_quantiles_histogram_buckets: Number of buckets requested for the
        quantiles histograms.
    has_weights: A boolean indicating whether weighted statistics should be
        added to the result.

  Returns:
    A statistics_pb2.NumericStatistics proto. If the feature has no values
    at all the proto is empty; if it only has NaN values the proto contains
    a standard and a quantiles histogram that carry nothing but num_nan.
  """
  result = statistics_pb2.NumericStatistics()

  # NaNs do not take part in the moments or the histograms.
  if numeric_stats.num_nan > 0:
    total_num_values -= numeric_stats.num_nan

  # Set the stats in the proto only if we have at least one non-NaN value for
  # the feature.
  if total_num_values == 0:
    # If we only have NaN values, still report how many there were. The
    # regular path below records num_nan on both histograms, so do the same
    # here instead of silently dropping the count.
    if numeric_stats.num_nan > 0:
      result.histograms.add(type=statistics_pb2.Histogram.STANDARD).num_nan = (
          numeric_stats.num_nan)
      result.histograms.add(type=statistics_pb2.Histogram.QUANTILES).num_nan = (
          numeric_stats.num_nan)
    return result

  mean = numeric_stats.sum / total_num_values
  # Clamp at zero: rounding can make E[x^2] - E[x]^2 slightly negative.
  variance = max(
      0, (numeric_stats.sum_of_squares / total_num_values) -
      mean * mean)
  result.mean = float(mean)
  result.std_dev = math.sqrt(variance)
  result.num_zeros = numeric_stats.num_zeros
  result.min = float(numeric_stats.min)
  result.max = float(numeric_stats.max)

  # Extract the quantiles from the summary.
  quantiles = quantiles_combiner.extract_output(
      numeric_stats.quantiles_summary)

  # Find the median from the quantiles and update the numeric stats proto.
  result.median = float(quantiles_util.find_median(quantiles))

  # Construct the equi-width histogram from the quantiles and add it to the
  # numeric stats proto.
  std_histogram = quantiles_util.generate_equi_width_histogram(
      quantiles, numeric_stats.min, numeric_stats.max,
      total_num_values, num_histogram_buckets)
  std_histogram.num_nan = numeric_stats.num_nan
  new_std_histogram = result.histograms.add()
  new_std_histogram.CopyFrom(std_histogram)

  # Construct the quantiles histogram from the quantiles and add it to the
  # numeric stats proto.
  q_histogram = quantiles_util.generate_quantiles_histogram(
      quantiles, numeric_stats.min, numeric_stats.max,
      total_num_values, num_quantiles_histogram_buckets)
  q_histogram.num_nan = numeric_stats.num_nan
  new_q_histogram = result.histograms.add()
  new_q_histogram.CopyFrom(q_histogram)

  # Add weighted numeric stats to the proto.
  if has_weights:
    weighted_numeric_stats_proto = statistics_pb2.WeightedNumericStatistics()

    # Avoid dividing by zero when the total weight is zero.
    if numeric_stats.weighted_total_num_values == 0:
      weighted_mean = 0
      weighted_variance = 0
    else:
      weighted_mean = (numeric_stats.weighted_sum /
                       numeric_stats.weighted_total_num_values)
      weighted_variance = max(0, (numeric_stats.weighted_sum_of_squares /
                                  numeric_stats.weighted_total_num_values)
                              - weighted_mean**2)
    weighted_numeric_stats_proto.mean = weighted_mean
    weighted_numeric_stats_proto.std_dev = math.sqrt(weighted_variance)

    # Extract the weighted quantiles from the summary.
    weighted_quantiles = quantiles_combiner.extract_output(
        numeric_stats.weighted_quantiles_summary)

    # Find the weighted median from the quantiles and update the proto.
    weighted_numeric_stats_proto.median = float(
        quantiles_util.find_median(weighted_quantiles))

    # Construct the weighted equi-width histogram from the quantiles and
    # add it to the numeric stats proto.
    weighted_std_histogram = quantiles_util.generate_equi_width_histogram(
        weighted_quantiles, numeric_stats.min, numeric_stats.max,
        numeric_stats.weighted_total_num_values, num_histogram_buckets)
    weighted_std_histogram.num_nan = numeric_stats.num_nan
    weighted_numeric_stats_proto.histograms.extend([weighted_std_histogram])

    # Construct the weighted quantiles histogram from the quantiles and
    # add it to the numeric stats proto.
    weighted_q_histogram = quantiles_util.generate_quantiles_histogram(
        weighted_quantiles, numeric_stats.min, numeric_stats.max,
        numeric_stats.weighted_total_num_values,
        num_quantiles_histogram_buckets)
    weighted_q_histogram.num_nan = numeric_stats.num_nan
    weighted_numeric_stats_proto.histograms.extend([weighted_q_histogram])

    result.weighted_numeric_stats.CopyFrom(
        weighted_numeric_stats_proto)
  return result
def _make_numeric_stats_proto(
    numeric_stats: _PartialNumericStats,
    total_num_values: int,
    num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int,
    has_weights: bool
    ) -> statistics_pb2.NumericStatistics:
  """Builds a NumericStatistics proto from the partial numeric statistics.

  Args:
    numeric_stats: The partial numeric stats of a feature.
    total_num_values: Number of values seen for the feature. The NaN count
      is subtracted from it below, so it is expected to include NaNs.
    num_histogram_buckets: Number of buckets requested for the equi-width
      histograms.
    num_quantiles_histogram_buckets: Number of buckets requested for the
      quantiles histograms.
    has_weights: Whether weighted statistics should be added to the result.

  Returns:
    The populated proto. With no values at all it is empty; with only NaN
    values it holds a standard and a quantiles histogram carrying num_nan.
  """
  stats_proto = statistics_pb2.NumericStatistics()
  nan_count = numeric_stats.num_nan

  # NaNs take no part in the moments or the histogram counts.
  non_nan_count = total_num_values
  if nan_count > 0:
    non_nan_count -= nan_count

  if non_nan_count == 0:
    # Nothing but NaNs (or nothing at all): only the NaN count is reported.
    if nan_count > 0:
      for histogram_type in (statistics_pb2.Histogram.STANDARD,
                             statistics_pb2.Histogram.QUANTILES):
        stats_proto.histograms.add(type=histogram_type).num_nan = nan_count
    return stats_proto

  mean = numeric_stats.sum / non_nan_count
  mean_of_squares = numeric_stats.sum_of_squares / non_nan_count
  # Clamp at zero: rounding can push E[x^2] - E[x]^2 slightly below zero.
  variance = max(0, mean_of_squares - mean * mean)
  stats_proto.mean = float(mean)
  stats_proto.std_dev = math.sqrt(variance)
  stats_proto.num_zeros = numeric_stats.num_zeros
  stats_proto.min = float(numeric_stats.min)
  stats_proto.max = float(numeric_stats.max)

  # One set of quantiles feeds the median and both histograms, so request
  # enough of them to serve whichever consumer needs more.
  assert numeric_stats.quantiles_summary is not None
  num_quantiles = max(
      num_quantiles_histogram_buckets,
      _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM * num_histogram_buckets)
  quantiles = numeric_stats.quantiles_summary.GetQuantiles(
      num_quantiles).flatten().to_pylist()

  stats_proto.median = float(quantiles_util.find_median(quantiles))

  # Equi-width histogram, bounded by the finite min/max.
  equi_width = quantiles_util.generate_equi_width_histogram(
      quantiles, numeric_stats.finite_min, numeric_stats.finite_max,
      non_nan_count, num_histogram_buckets)
  equi_width.num_nan = nan_count
  stats_proto.histograms.add().CopyFrom(equi_width)

  # Quantiles histogram.
  by_quantile = quantiles_util.generate_quantiles_histogram(
      quantiles, non_nan_count, num_quantiles_histogram_buckets)
  by_quantile.num_nan = nan_count
  stats_proto.histograms.add().CopyFrom(by_quantile)

  if not has_weights:
    return stats_proto

  # Weighted counterpart of everything above.
  weighted_proto = statistics_pb2.WeightedNumericStatistics()
  total_weight = numeric_stats.weighted_total_num_values

  # With zero total weight both moments stay at zero (no division).
  weighted_mean = 0
  weighted_variance = 0
  if total_weight != 0:
    weighted_mean = numeric_stats.weighted_sum / total_weight
    weighted_variance = max(
        0,
        (numeric_stats.weighted_sum_of_squares / total_weight)
        - weighted_mean**2)
  weighted_proto.mean = weighted_mean
  weighted_proto.std_dev = math.sqrt(weighted_variance)

  assert numeric_stats.weighted_quantiles_summary is not None
  weighted_quantiles = numeric_stats.weighted_quantiles_summary.GetQuantiles(
      num_quantiles).flatten().to_pylist()

  weighted_proto.median = float(
      quantiles_util.find_median(weighted_quantiles))

  weighted_equi_width = quantiles_util.generate_equi_width_histogram(
      weighted_quantiles, numeric_stats.finite_min, numeric_stats.finite_max,
      total_weight, num_histogram_buckets)
  weighted_equi_width.num_nan = nan_count
  weighted_proto.histograms.add().CopyFrom(weighted_equi_width)

  weighted_by_quantile = quantiles_util.generate_quantiles_histogram(
      weighted_quantiles, total_weight, num_quantiles_histogram_buckets)
  weighted_by_quantile.num_nan = nan_count
  weighted_proto.histograms.add().CopyFrom(weighted_by_quantile)

  stats_proto.weighted_numeric_stats.CopyFrom(weighted_proto)
  return stats_proto
Beispiel #3
0
def _make_feature_stats_proto(
    common_stats, feature_name,
    q_combiner,
    num_values_histogram_buckets,
    is_categorical, has_weights
):
  """Builds a FeatureNameStatistics proto out of the partial common stats.

  Args:
    common_stats: The partial common stats associated with a feature.
    feature_name: The name of the feature.
    q_combiner: The quantiles combiner whose extract_output() yields the
        quantiles for the number-of-values histogram.
    num_values_histogram_buckets: Number of buckets in the quantiles
        histogram for the number of values per feature.
    is_categorical: A boolean indicating whether the feature is categorical.
    has_weights: A boolean indicating whether a weight feature is specified.

  Returns:
    A statistics_pb2.FeatureNameStatistics proto whose common stats are
    wrapped in string stats (STRING or categorical features) or in numeric
    stats (everything else).
  """
  feature_stats_cls = statistics_pb2.FeatureNameStatistics
  non_missing = common_stats.num_non_missing

  common_proto = statistics_pb2.CommonStatistics()
  common_proto.num_non_missing = non_missing
  common_proto.num_missing = common_stats.num_missing
  common_proto.tot_num_values = common_stats.total_num_values

  # Per-example value counts are only meaningful when the feature was present
  # in at least one example.
  if non_missing > 0:
    common_proto.min_num_values = common_stats.min_num_values
    common_proto.max_num_values = common_stats.max_num_values
    common_proto.avg_num_values = (
        common_stats.total_num_values / common_stats.num_non_missing)

    # Quantiles histogram over the number of values per example.
    quantiles = q_combiner.extract_output(common_stats.num_values_summary)
    common_proto.num_values_histogram.CopyFrom(
        quantiles_util.generate_quantiles_histogram(
            quantiles, common_stats.min_num_values,
            common_stats.max_num_values, common_stats.num_non_missing,
            num_values_histogram_buckets))

  if has_weights:
    # Weighted counterpart of the common stats.
    weighted_proto = statistics_pb2.WeightedCommonStatistics(
        num_non_missing=common_stats.weighted_num_non_missing,
        num_missing=common_stats.weighted_num_missing,
        tot_num_values=common_stats.weighted_total_num_values)
    if common_stats.weighted_num_non_missing > 0:
      weighted_proto.avg_num_values = (
          common_stats.weighted_total_num_values /
          common_stats.weighted_num_non_missing)
    common_proto.weighted_common_stats.CopyFrom(weighted_proto)

  # Pick the feature type: categorical features keep the original INT type,
  # a feature with no inferred type (e.g. completely missing) is treated as
  # STRING, and any other feature keeps its inferred type.
  if is_categorical:
    feature_type = feature_stats_cls.INT
  elif common_stats.type is None:
    feature_type = feature_stats_cls.STRING
  else:
    feature_type = common_stats.type

  feature_proto = feature_stats_cls()
  feature_proto.name = feature_name
  feature_proto.type = feature_type

  # STRING and categorical features carry their common stats inside string
  # stats; all remaining features carry them inside numeric stats.
  if is_categorical or feature_proto.type == feature_stats_cls.STRING:
    string_wrapper = statistics_pb2.StringStatistics()
    string_wrapper.common_stats.CopyFrom(common_proto)
    feature_proto.string_stats.CopyFrom(string_wrapper)
    return feature_proto

  numeric_wrapper = statistics_pb2.NumericStatistics()
  numeric_wrapper.common_stats.CopyFrom(common_proto)
  feature_proto.num_stats.CopyFrom(numeric_wrapper)
  return feature_proto
def _make_common_stats_proto(
    common_stats: _PartialCommonStats,
    parent_common_stats: Optional[_PartialCommonStats],
    make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
    num_values_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.CommonStatistics:
  """Builds a CommonStatistics proto from the partial common stats.

  Args:
    common_stats: The partial common stats of the feature.
    parent_common_stats: The partial common stats of the parent, or None.
      When given, its last presence-and-valency level is used to derive
      num_missing.
    make_quantiles_sketch_fn: Factory passed to _PresenceAndValencyStats when
      a placeholder has to be created because no stats were recorded.
    num_values_histogram_buckets: Number of buckets in the quantiles
      histogram for the number of values.
    has_weights: Whether weighted statistics should be added to the result.

  Returns:
    The populated statistics_pb2.CommonStatistics proto.
  """
  levels = common_stats.presence_and_valency_stats

  # Innermost level of the parent; a fresh placeholder if the parent has no
  # recorded stats, None if there is no parent.
  parent_level = None
  if parent_common_stats is not None:
    parent_levels = parent_common_stats.presence_and_valency_stats
    if parent_levels is None:
      parent_level = _PresenceAndValencyStats(make_quantiles_sketch_fn)
    else:
      parent_level = parent_levels[-1]

  proto = statistics_pb2.CommonStatistics()

  # The per-level protos are only emitted for features nested more than one
  # level deep; for a 1-nested feature the CommonStatistics fields below
  # already hold the presence and valency.
  if levels is not None and len(levels) > 1:
    proto.presence_and_valency_stats.extend(
        _make_presence_and_valency_stats_protos(parent_level, levels))
    if has_weights:
      proto.weighted_presence_and_valency_stats.extend(
          _make_weighted_presence_and_valency_stats_protos(
              parent_level, levels))

  if levels is None:
    outer_level = _PresenceAndValencyStats(make_quantiles_sketch_fn)
  else:
    outer_level = levels[0]

  proto.num_non_missing = outer_level.num_non_missing
  if parent_level is not None:
    proto.num_missing = (
        parent_level.total_num_values - outer_level.num_non_missing)
  proto.tot_num_values = outer_level.total_num_values

  # TODO(b/79685042): Need to decide on what is the expected values for
  # statistics like min_num_values, max_num_values, avg_num_values, when
  # all the values for the feature are missing.
  if outer_level.num_non_missing > 0:
    proto.min_num_values = outer_level.min_num_values
    proto.max_num_values = outer_level.max_num_values
    proto.avg_num_values = (
        outer_level.total_num_values / outer_level.num_non_missing)

    summary = outer_level.num_values_summary
    if summary is not None:
      # Quantiles histogram over the number of values.
      quantiles = summary.GetQuantiles(
          num_values_histogram_buckets).flatten().to_pylist()
      proto.num_values_histogram.CopyFrom(
          quantiles_util.generate_quantiles_histogram(
              quantiles, outer_level.num_non_missing,
              num_values_histogram_buckets))

  if not has_weights:
    return proto

  # Weighted counterpart of the common stats.
  weighted_proto = statistics_pb2.WeightedCommonStatistics(
      num_non_missing=outer_level.weighted_num_non_missing,
      tot_num_values=outer_level.weighted_total_num_values)
  if parent_level is not None:
    weighted_proto.num_missing = (
        parent_level.weighted_total_num_values -
        outer_level.weighted_num_non_missing)
  if outer_level.weighted_num_non_missing > 0:
    weighted_proto.avg_num_values = (
        outer_level.weighted_total_num_values /
        outer_level.weighted_num_non_missing)
  proto.weighted_common_stats.CopyFrom(weighted_proto)
  return proto