Beispiel #1
0
def _make_common_stats_proto(
    common_stats: _PartialCommonStats,
    parent_common_stats: Optional[_PartialCommonStats],
    q_combiner: quantiles_util.QuantilesCombiner,
    num_values_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.CommonStatistics:
  """Builds a CommonStatistics proto out of accumulated partial stats.

  Args:
    common_stats: Partial common stats of the feature being converted.
    parent_common_stats: Partial common stats of the parent, or None. When
      given, the missing counts are derived as the parent's total number of
      values minus this feature's non-missing count; when None, the missing
      counts are not set.
    q_combiner: Combiner whose `extract_output` turns
      `common_stats.num_values_summary` into quantiles.
    num_values_histogram_buckets: Bucket count passed to
      `quantiles_util.generate_quantiles_histogram`.
    has_weights: Whether the weighted common stats should be populated too.

  Returns:
    The populated statistics_pb2.CommonStatistics proto.
  """
  has_parent = parent_common_stats is not None
  non_missing = common_stats.num_non_missing
  total_values = common_stats.total_num_values

  proto = statistics_pb2.CommonStatistics()
  proto.num_non_missing = non_missing
  if has_parent:
    proto.num_missing = parent_common_stats.total_num_values - non_missing
  proto.tot_num_values = total_values

  # TODO(b/79685042): Need to decide on what is the expected values for
  # statistics like min_num_values, max_num_values, avg_num_values, when
  # all the values for the feature are missing.
  if non_missing > 0:
    proto.min_num_values = common_stats.min_num_values
    proto.max_num_values = common_stats.max_num_values
    proto.avg_num_values = total_values / non_missing

    # The histogram over the number of values is derived from the quantiles
    # summary accumulated alongside the counts.
    quantiles = q_combiner.extract_output(common_stats.num_values_summary)
    proto.num_values_histogram.CopyFrom(
        quantiles_util.generate_quantiles_histogram(
            quantiles, non_missing, num_values_histogram_buckets))

  # Nothing more to do when no weight feature is in play.
  if not has_weights:
    return proto

  weighted_non_missing = common_stats.weighted_num_non_missing
  weighted_total = common_stats.weighted_total_num_values
  weighted_proto = statistics_pb2.WeightedCommonStatistics(
      num_non_missing=weighted_non_missing,
      tot_num_values=weighted_total)
  if has_parent:
    weighted_proto.num_missing = (
        parent_common_stats.weighted_total_num_values - weighted_non_missing)
  if weighted_non_missing > 0:
    weighted_proto.avg_num_values = weighted_total / weighted_non_missing

  proto.weighted_common_stats.CopyFrom(weighted_proto)
  return proto
Beispiel #2
0
def get_dataset_feature_statistics(builder, split):
  """Calculate statistics for the specified split.

  Makes a single pass over the examples of the split, recording for every
  top-level feature its shape, its dtype, the number of examples it appears
  in and, for numeric features, the global min and max. The result is then
  turned into a statistics proto and a best-effort schema proto.

  Args:
    builder: Object providing `as_dataset(split=...)`. Each yielded example
      must be a dict whose values expose `.shape`, `.dtype` and `.numpy()`
      (presumably eager `tf.Tensor`s -- confirm against the caller).
    split: The split forwarded to `builder.as_dataset`.

  Returns:
    A `(statistics, schema)` tuple holding a
    `statistics_pb2.DatasetFeatureStatistics` and a `schema_pb2.Schema`.

  Raises:
    AssertionError: If an example is not a dict, or a feature changes shape
      or dtype between examples.
  """
  statistics = statistics_pb2.DatasetFeatureStatistics()

  # Make this to the best of our abilities.
  schema = schema_pb2.Schema()

  dataset = builder.as_dataset(split=split)

  # Just computing the number of examples for now.
  statistics.num_examples = 0

  # Feature dictionaries.
  feature_to_shape = {}
  feature_to_dtype = {}
  feature_to_num_examples = collections.defaultdict(int)
  feature_to_min = {}
  feature_to_max = {}

  for example in dataset:
    statistics.num_examples += 1

    assert isinstance(example, dict)

    for feature_name, feature_value in example.items():
      # Update the number of examples this feature appears in.
      feature_to_num_examples[feature_name] += 1

      feature_shape = feature_value.shape
      feature_dtype = feature_value.dtype

      # TODO(afrozm): What if shapes don't match? Populate ValueCount? Add
      # logic for that.

      # Set the shape, or assert shapes match.
      if feature_name not in feature_to_shape:
        feature_to_shape[feature_name] = feature_shape
      else:
        assert feature_to_shape[feature_name] == feature_shape, (
            "Shape of feature %r changed between examples." % feature_name)

      # Set the dtype, or assert dtypes match.
      if feature_name not in feature_to_dtype:
        feature_to_dtype[feature_name] = feature_dtype
      else:
        assert feature_to_dtype[feature_name] == feature_dtype, (
            "Dtype of feature %r changed between examples." % feature_name)

      is_numeric = (
          feature_dtype.is_floating or feature_dtype.is_integer or
          feature_dtype.is_bool)
      if not is_numeric:
        continue

      # Only numeric features need their values materialized.
      feature_np = feature_value.numpy()
      # np.min / np.max raise ValueError on zero-size input, so a feature
      # holding no values in this example contributes nothing to min/max.
      if np.size(feature_np) == 0:
        continue
      feature_min = np.min(feature_np)
      feature_max = np.max(feature_np)

      # Set or update the min, max.
      if ((feature_name not in feature_to_min) or
          (feature_to_min[feature_name] > feature_min)):
        feature_to_min[feature_name] = feature_min

      if ((feature_name not in feature_to_max) or
          (feature_to_max[feature_name] < feature_max)):
        feature_to_max[feature_name] = feature_max

  # Start here, we've processed all examples.

  # Assert that the keys match up.
  assert feature_to_shape.keys() == feature_to_dtype.keys()
  assert feature_to_shape.keys() == feature_to_num_examples.keys()

  for feature_name in feature_to_shape:
    # Try to fill in the schema.
    feature = schema.feature.add()
    feature.name = feature_name

    # TODO(afrozm): What do we do for non fixed size shapes?
    # What to do for scalars?
    for dim in feature_to_shape[feature_name].as_list():
      feature.shape.dim.add().size = dim
    feature_type = feature_to_dtype[feature_name]
    # Dtypes without an entry in the map fall back to BYTES.
    feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

    common_statistics = statistics_pb2.CommonStatistics()
    common_statistics.num_non_missing = feature_to_num_examples[feature_name]
    common_statistics.num_missing = (
        statistics.num_examples - common_statistics.num_non_missing)

    feature_name_statistics = statistics.features.add()
    feature_name_statistics.name = feature_name

    # TODO(afrozm): This can be skipped, since type information was added to
    # the Schema.
    feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
        feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

    if feature.type == schema_pb2.INT or feature.type == schema_pb2.FLOAT:
      numeric_statistics = statistics_pb2.NumericStatistics()
      # min and max are always recorded together. They are absent when the
      # feature never held a value (or was not classified as numeric while
      # iterating); in that case the proto fields are left untouched instead
      # of failing with a KeyError.
      if feature_name in feature_to_min:
        numeric_statistics.min = feature_to_min[feature_name]
        numeric_statistics.max = feature_to_max[feature_name]
      numeric_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
    else:
      # Let's shove it into BytesStatistics for now.
      bytes_statistics = statistics_pb2.BytesStatistics()
      bytes_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

  return statistics, schema
Beispiel #3
0
def get_dataset_feature_statistics(builder, split):
    """Calculate statistics for the specified split.

    Makes a single pass over the numpy view of the split, counting in how
    many examples each top-level feature appears and tracking the global min
    and max of numeric features. Shapes and types for the schema are taken
    from the dataset's declared output shapes / types rather than from the
    observed values.

    Args:
        builder: Object providing `as_dataset(split=...)`; the returned
            dataset must yield dict examples.
        split: The split forwarded to `builder.as_dataset`.

    Returns:
        A `(statistics, schema)` tuple holding a
        `statistics_pb2.DatasetFeatureStatistics` and a `schema_pb2.Schema`.
        Features whose declared shape is not a `tf.TensorShape` (nested
        structures) get a name-only schema entry and no statistics entry.

    Raises:
        AssertionError: If an example is not a dict.
    """
    statistics = statistics_pb2.DatasetFeatureStatistics()

    # Make this to the best of our abilities.
    schema = schema_pb2.Schema()

    dataset = builder.as_dataset(split=split)

    # Just computing the number of examples for now.
    statistics.num_examples = 0

    # Feature dictionaries.
    feature_to_num_examples = collections.defaultdict(int)
    feature_to_min = {}
    feature_to_max = {}

    np_dataset = dataset_utils.dataset_as_numpy(dataset)
    for example in tqdm.tqdm(np_dataset, unit=" examples"):
        statistics.num_examples += 1

        assert isinstance(example, dict)

        for feature_name in sorted(example):
            # Update the number of examples this feature appears in.
            feature_to_num_examples[feature_name] += 1

            feature_np = example[feature_name]

            # For compatibility in graph and eager mode, we can get PODs here
            # and everything may not be neatly wrapped up in numpy's ndarray.
            if isinstance(feature_np, np.ndarray):
                feature_dtype = feature_np.dtype.type
            else:
                feature_dtype = type(feature_np)

            is_numeric = (np.issubdtype(feature_dtype, np.number)
                          or feature_dtype == np.bool_)

            # TODO(afrozm): What if shapes don't match? Populate ValueCount?
            # Add logic for that.

            # np.min / np.max raise ValueError on zero-size input, so an
            # example in which the feature holds no values contributes
            # nothing to min/max.
            if not is_numeric or np.size(feature_np) == 0:
                continue

            feature_min = np.min(feature_np)
            feature_max = np.max(feature_np)

            # Set or update the min, max.
            if ((feature_name not in feature_to_min)
                    or (feature_to_min[feature_name] > feature_min)):
                feature_to_min[feature_name] = feature_min

            if ((feature_name not in feature_to_max)
                    or (feature_to_max[feature_name] < feature_max)):
                feature_to_max[feature_name] = feature_max

    # Start here, we've processed all examples.

    # NOTE(review): `output_shapes` / `output_types` look like the TF1-style
    # dataset attributes -- confirm they exist on the TensorFlow version in
    # use.
    output_shapes_dict = dataset.output_shapes
    output_types_dict = dataset.output_types

    for feature_name in sorted(feature_to_num_examples):
        # Try to fill in the schema.
        feature = schema.feature.add()
        feature.name = feature_name

        # TODO(afrozm): Make this work with nested structures, currently the
        # Schema proto has no support for it.
        maybe_feature_shape = output_shapes_dict[feature_name]
        if not isinstance(maybe_feature_shape, tf.TensorShape):
            logging.error(
                "Statistics generation doesn't work for nested structures yet")
            continue

        for dim in maybe_feature_shape.as_list():
            # We denote `None`s as -1 in the shape proto. Test against None
            # explicitly so that a genuine dimension of size 0 is kept as 0.
            feature.shape.dim.add().size = -1 if dim is None else dim
        feature_type = output_types_dict[feature_name]
        # Dtypes without an entry in the map fall back to BYTES.
        feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

        common_statistics = statistics_pb2.CommonStatistics()
        common_statistics.num_non_missing = feature_to_num_examples[
            feature_name]
        common_statistics.num_missing = (statistics.num_examples -
                                         common_statistics.num_non_missing)

        feature_name_statistics = statistics.features.add()
        feature_name_statistics.name = feature_name

        # TODO(afrozm): This can be skipped, since type information was added
        # to the Schema.
        feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
            feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

        if feature.type == schema_pb2.INT or feature.type == schema_pb2.FLOAT:
            numeric_statistics = statistics_pb2.NumericStatistics()
            # min and max are always recorded together. They are absent when
            # the feature never held a value (or was not classified as
            # numeric while iterating); in that case the proto fields are
            # left untouched instead of failing with a KeyError.
            if feature_name in feature_to_min:
                numeric_statistics.min = feature_to_min[feature_name]
                numeric_statistics.max = feature_to_max[feature_name]
            numeric_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
        else:
            # Let's shove it into BytesStatistics for now.
            bytes_statistics = statistics_pb2.BytesStatistics()
            bytes_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

    return statistics, schema
Beispiel #4
0
def _make_feature_stats_proto(common_stats, feature_name, q_combiner,
                              num_values_histogram_buckets, is_categorical,
                              has_weights):
    """Build a FeatureNameStatistics proto from partial common stats.

    Args:
        common_stats: The partial common stats associated with a feature.
        feature_name: The name of the feature.
        q_combiner: The quantiles combiner used to construct the quantiles
            histogram for the number of values in the feature.
        num_values_histogram_buckets: Number of buckets in the quantiles
            histogram for the number of values per feature.
        is_categorical: A boolean indicating whether the feature is
            categorical.
        has_weights: A boolean indicating whether a weight feature is
            specified.

    Returns:
        A statistics_pb2.FeatureNameStatistics proto. The common stats are
        wrapped in string stats for categorical or STRING features and in
        numeric stats otherwise.
    """
    non_missing = common_stats.num_non_missing

    common_proto = statistics_pb2.CommonStatistics()
    common_proto.num_non_missing = non_missing
    common_proto.num_missing = common_stats.num_missing
    common_proto.tot_num_values = common_stats.total_num_values

    # The per-example value-count stats are only filled in when at least one
    # example actually carries the feature.
    if non_missing > 0:
        fewest_values = common_stats.min_num_values
        most_values = common_stats.max_num_values
        common_proto.min_num_values = fewest_values
        common_proto.max_num_values = most_values
        common_proto.avg_num_values = (
            common_stats.total_num_values / non_missing)

        # Histogram over the number of values, derived from the quantiles.
        quantiles = q_combiner.extract_output(common_stats.num_values_summary)
        common_proto.num_values_histogram.CopyFrom(
            quantiles_util.generate_quantiles_histogram(
                quantiles, fewest_values, most_values, non_missing,
                num_values_histogram_buckets))

    # Weighted counterpart of the common stats.
    if has_weights:
        weighted_non_missing = common_stats.weighted_num_non_missing
        weighted_total = common_stats.weighted_total_num_values
        weighted_proto = statistics_pb2.WeightedCommonStatistics(
            num_non_missing=weighted_non_missing,
            num_missing=common_stats.weighted_num_missing,
            tot_num_values=weighted_total)
        if weighted_non_missing > 0:
            weighted_proto.avg_num_values = (
                weighted_total / weighted_non_missing)
        common_proto.weighted_common_stats.CopyFrom(weighted_proto)

    feature_types = statistics_pb2.FeatureNameStatistics

    result = statistics_pb2.FeatureNameStatistics()
    result.name = feature_name

    # Resolve the feature type:
    #   * categorical features are reported with type INT;
    #   * a feature whose type could not be inferred (common_stats.type is
    #     None, e.g. it is completely missing) is assumed to be STRING;
    #   * anything else keeps the inferred type.
    if is_categorical:
        result.type = feature_types.INT
    elif common_stats.type is None:
        result.type = feature_types.STRING
    else:
        result.type = common_stats.type

    # Categorical and STRING features carry their common stats inside string
    # stats; every other type uses numeric stats.
    if is_categorical or result.type == feature_types.STRING:
        wrapper = statistics_pb2.StringStatistics()
        wrapper.common_stats.CopyFrom(common_proto)
        result.string_stats.CopyFrom(wrapper)
    else:
        wrapper = statistics_pb2.NumericStatistics()
        wrapper.common_stats.CopyFrom(common_proto)
        result.num_stats.CopyFrom(wrapper)

    return result
def _make_common_stats_proto(
    common_stats: _PartialCommonStats,
    parent_common_stats: Optional[_PartialCommonStats],
    make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
    num_values_histogram_buckets: int,
    has_weights: bool
) -> statistics_pb2.CommonStatistics:
  """Builds a CommonStatistics proto out of accumulated partial stats.

  Args:
    common_stats: Partial common stats of the feature being converted. Its
      `presence_and_valency_stats` is either None or a sequence whose first
      entry is used as the top-level stats.
    parent_common_stats: Partial common stats of the parent, or None. When
      given, the last entry of its `presence_and_valency_stats` (or a fresh
      `_PresenceAndValencyStats` if that attribute is None) is used to
      derive the missing counts.
    make_quantiles_sketch_fn: Factory handed to `_PresenceAndValencyStats`
      whenever a placeholder instance has to be created.
    num_values_histogram_buckets: Number of quantiles requested from the
      num-values summary and bucket count of the resulting histogram.
    has_weights: Whether the weighted stats should be populated too.

  Returns:
    The populated statistics_pb2.CommonStatistics proto.
  """
  proto = statistics_pb2.CommonStatistics()

  # Presence/valency stats of the parent, if there is a parent at all.
  parent_level = None
  if parent_common_stats is not None:
    parent_levels = parent_common_stats.presence_and_valency_stats
    if parent_levels is None:
      parent_level = _PresenceAndValencyStats(make_quantiles_sketch_fn)
    else:
      parent_level = parent_levels[-1]

  levels = common_stats.presence_and_valency_stats
  # The flat CommonStatistics fields already describe a 1-nested feature, so
  # the repeated per-level fields are only emitted for more than one entry.
  if levels is not None and len(levels) > 1:
    proto.presence_and_valency_stats.extend(
        _make_presence_and_valency_stats_protos(parent_level, levels))
    if has_weights:
      proto.weighted_presence_and_valency_stats.extend(
          _make_weighted_presence_and_valency_stats_protos(
              parent_level, levels))

  if levels is None:
    top_level = _PresenceAndValencyStats(make_quantiles_sketch_fn)
  else:
    top_level = levels[0]

  non_missing = top_level.num_non_missing
  proto.num_non_missing = non_missing
  if parent_level is not None:
    proto.num_missing = parent_level.total_num_values - non_missing
  proto.tot_num_values = top_level.total_num_values

  # TODO(b/79685042): Need to decide on what is the expected values for
  # statistics like min_num_values, max_num_values, avg_num_values, when
  # all the values for the feature are missing.
  if non_missing > 0:
    proto.min_num_values = top_level.min_num_values
    proto.max_num_values = top_level.max_num_values
    proto.avg_num_values = top_level.total_num_values / non_missing

    summary = top_level.num_values_summary
    if summary is not None:
      # Turn the sketch into quantiles, then into the num-values histogram.
      quantiles = summary.GetQuantiles(
          num_values_histogram_buckets).flatten().to_pylist()
      proto.num_values_histogram.CopyFrom(
          quantiles_util.generate_quantiles_histogram(
              quantiles, non_missing, num_values_histogram_buckets))

  # Nothing more to do when no weight feature is in play.
  if not has_weights:
    return proto

  weighted_non_missing = top_level.weighted_num_non_missing
  weighted_total = top_level.weighted_total_num_values
  weighted_proto = statistics_pb2.WeightedCommonStatistics(
      num_non_missing=weighted_non_missing,
      tot_num_values=weighted_total)
  if parent_level is not None:
    weighted_proto.num_missing = (
        parent_level.weighted_total_num_values - weighted_non_missing)
  if weighted_non_missing > 0:
    weighted_proto.avg_num_values = weighted_total / weighted_non_missing

  proto.weighted_common_stats.CopyFrom(weighted_proto)
  return proto