def __init__(
      self,  # pylint: disable=useless-super-delegation
      name='NumericStatsGenerator',
      schema=None,
      num_histogram_buckets=10,
      num_quantiles_histogram_buckets=10,
      epsilon=0.01):
    """Sets up the state of a numeric statistics generator.

    Two quantiles combiners are created: one feeding the standard
    (equal-width) histogram and one feeding the quantiles histogram.

    Args:
      name: Optional unique name for this statistics generator; forwarded to
          the parent class initializer.
      schema: Optional dataset schema. When given, it is used to look up the
          categorical numeric features; otherwise that set is empty.
      num_histogram_buckets: Optional bucket count for the standard
          NumericStatistics.histogram with equal-width buckets.
      num_quantiles_histogram_buckets: Optional bucket count for the
          quantiles NumericStatistics.histogram.
      epsilon: Optional error tolerance for quantile computation, shared by
          both combiners. Typically a small fraction close to zero
          (e.g. 0.01). Larger values loosen the quantile approximation, which
          yields more unequal buckets but could improve performance and
          resource consumption.
    """
    super(NumericStatsGenerator, self).__init__(name)

    # Without a schema there is nothing to look up, so the set stays empty.
    if schema:
        categorical = stats_util.get_categorical_numeric_features(schema)
    else:
        categorical = []
    self._categorical_features = set(categorical)

    # Combiner backing the equi-width histogram: it is asked for a multiple
    # of the requested bucket count.
    std_hist_num_quantiles = (
        _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM * num_histogram_buckets)
    self._std_hist_combiner = quantiles_util.QuantilesCombiner(
        std_hist_num_quantiles, epsilon)

    # Combiner backing the quantiles histogram.
    self._quantiles_hist_combiner = quantiles_util.QuantilesCombiner(
        num_quantiles_histogram_buckets, epsilon)

    self._num_histogram_buckets = num_histogram_buckets
Example #2
0
    def __init__(
            self,  # pylint: disable=useless-super-delegation
            name: Text = 'BasicStatsGenerator',
            schema: Optional[schema_pb2.Schema] = None,
            weight_feature: Optional[types.FeatureName] = None,
            num_values_histogram_buckets: Optional[int] = 10,
            num_histogram_buckets: Optional[int] = 10,
            num_quantiles_histogram_buckets: Optional[int] = 10,
            epsilon: Optional[float] = 0.01) -> None:
        """Initializes basic statistics generator.

        Args:
          name: An optional unique name associated with the statistics
              generator.
          schema: An optional schema for the dataset.
          weight_feature: An optional feature name whose numeric value
              represents the weight of an example.
          num_values_histogram_buckets: An optional number of buckets in a
              quantiles histogram for the number of values per Feature, which
              is stored in CommonStatistics.num_values_histogram.
          num_histogram_buckets: An optional number of buckets in a standard
              NumericStatistics.histogram with equal-width buckets.
          num_quantiles_histogram_buckets: An optional number of buckets in a
              quantiles NumericStatistics.histogram.
          epsilon: An optional error tolerance for the computation of
              quantiles, typically a small fraction close to zero (e.g. 0.01).
              Higher values of epsilon increase the quantile approximation,
              and hence result in more unequal buckets, but could improve
              performance, and resource consumption.

        Raises:
          ValueError: If the number of quantiles computed over feature values
              (the larger of num_quantiles_histogram_buckets and
              _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM * num_histogram_buckets)
              is not a multiple of num_quantiles_histogram_buckets.
        """
        super(BasicStatsGenerator, self).__init__(name, schema)

        self._bytes_features = set(
            schema_util.get_bytes_features(schema) if schema else [])
        self._categorical_features = set(
            schema_util.get_categorical_numeric_features(schema
                                                         ) if schema else [])
        self._weight_feature = weight_feature
        self._num_values_histogram_buckets = num_values_histogram_buckets
        # Initialize quantiles combiner for histogram over number of values.
        self._num_values_quantiles_combiner = quantiles_util.QuantilesCombiner(
            self._num_values_histogram_buckets, epsilon)

        self._num_histogram_buckets = num_histogram_buckets
        self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets
        # A single combiner over feature values is sized to cover both the
        # quantiles histogram and the standard histogram.
        num_buckets = max(
            self._num_quantiles_histogram_buckets,
            _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM *
            self._num_histogram_buckets)
        # Validate with an explicit exception rather than `assert`: asserts
        # are removed under `python -O`, which would let an incompatible
        # bucket configuration through unnoticed.
        # NOTE(review): presumably the divisibility lets the quantiles
        # histogram boundaries be picked as an evenly spaced subset of the
        # computed quantiles -- confirm against the code that consumes them.
        if num_buckets % self._num_quantiles_histogram_buckets != 0:
            raise ValueError(
                'The number of quantiles computed over feature values (%s) '
                'must be a multiple of num_quantiles_histogram_buckets (%s).'
                % (num_buckets, self._num_quantiles_histogram_buckets))
        # Initialize quantiles combiner for histogram over feature values.
        self._values_quantiles_combiner = quantiles_util.QuantilesCombiner(
            num_buckets, epsilon, has_weights=True)
Example #3
0
    def __init__(
            self,  # pylint: disable=useless-super-delegation
            name='CommonStatsGenerator',
            schema=None,
            weight_feature=None,
            num_values_histogram_buckets=10,
            epsilon=0.01):
        """Sets up the state of a common statistics generator.

        Args:
          name: Optional unique name for this statistics generator; forwarded
              to the parent class initializer together with `schema`.
          schema: Optional dataset schema. When given, it is used to look up
              the categorical numeric features; otherwise that set is empty.
          weight_feature: Optional name of the feature whose numeric value
              represents the weight of an example.
          num_values_histogram_buckets: Optional bucket count for the
              quantiles histogram over the number of values per Feature,
              which is stored in CommonStatistics.num_values_histogram.
          epsilon: Optional error tolerance for quantile computation,
              typically a small fraction close to zero (e.g. 0.01). Larger
              values loosen the quantile approximation, which yields more
              unequal buckets but could improve performance and resource
              consumption.
        """
        super(CommonStatsGenerator, self).__init__(name, schema)

        # Without a schema there is nothing to look up, so the set stays
        # empty.
        if schema:
            categorical = schema_util.get_categorical_numeric_features(schema)
        else:
            categorical = []
        self._categorical_features = set(categorical)

        self._weight_feature = weight_feature
        self._num_values_histogram_buckets = num_values_histogram_buckets

        # Quantiles combiner sized by the num-values histogram bucket count.
        self._quantiles_combiner = quantiles_util.QuantilesCombiner(
            self._num_values_histogram_buckets, epsilon)
Example #4
0
 def test_quantiles_combiner(self):
     """Runs an unweighted QuantilesCombiner over three 100-element batches.

     The batches hold the values 1..100, 101..200 and 201..300; the shared
     test helper is given 61, 121, 181 and 241 as the expected result.
     """
     # Each batch is a single-column list holding 100 consecutive values.
     batch_starts = (1, 101, 201)
     batches = [[np.linspace(lo, lo + 99, 100)] for lo in batch_starts]

     q_combiner = quantiles_util.QuantilesCombiner(5, 0.00001)
     expected_result = np.array(
         [61.0, 121.0, 181.0, 241.0], dtype=np.float32)

     _run_quantiles_combiner_test(
         self, q_combiner, batches, expected_result)
Example #5
0
 def test_quantiles_combiner_with_weights(self):
   """Runs a weighted QuantilesCombiner over three 100-element batches.

   The batches hold the values 1..100, 101..200 and 201..300, with every
   value in a batch weighted 1, 2 and 3 respectively. The shared test helper
   is given 1, 111, 171, 221, 261 and 300 as the expected result.
   """
   # (weight, first value) for each batch; each batch pairs its 100 values
   # with a same-length list of constant weights.
   batch_specs = ((1, 1), (2, 101), (3, 201))
   batches = [
       [np.linspace(lo, lo + 99, 100), [weight] * 100]
       for weight, lo in batch_specs
   ]

   q_combiner = quantiles_util.QuantilesCombiner(5, 0.00001, has_weights=True)
   expected_result = np.array(
       [1.0, 111.0, 171.0, 221.0, 261.0, 300.0], dtype=np.float32)

   _run_quantiles_combiner_test(self, q_combiner, batches, expected_result)