def test_accuracy_after_compact(self, max_num_elements, eps, num_quantiles):
  """Quantiles stay within the eps bound after Compact() and Merge() calls."""
  sketch_a = sketches.QuantilesSketch(eps, max_num_elements, 1)
  sketch_b = sketches.QuantilesSketch(eps, max_num_elements, 1)
  sketch_c = sketches.QuantilesSketch(eps, max_num_elements, 1)
  # Descending values paired with ascending weights form a nonuniform
  # weighted distribution over [0, max_num_elements).
  values = pa.array(reversed(range(max_num_elements)))
  weights = pa.array(range(max_num_elements))
  total_weight = (max_num_elements - 1) * max_num_elements / 2

  def cdf(x):
    # Total weight of all elements <= x under the weighting above.
    left_weight = (2 * (max_num_elements - 1) - x) * (x + 1) / 2
    return left_weight / total_weight

  # Split the input across three sketches so merging is exercised.
  tenth = max_num_elements // 10
  third = max_num_elements // 3
  _add_values(sketch_a, values[:tenth], weights[:tenth])
  _add_values(sketch_b, values[tenth:third], weights[tenth:third])
  _add_values(sketch_c, values[third:], weights[third:])
  # Interleave compactions with merges to verify accuracy is preserved.
  sketch_b.Compact()
  sketch_c.Compact()
  sketch_b.Merge(sketch_c)
  sketch_b.Compact()
  sketch_a.Compact()
  sketch_a.Merge(sketch_b)
  sketch_a.Compact()
  quantiles = sketch_a.GetQuantiles(num_quantiles - 1).to_pylist()[0]
  self.assert_quantiles_accuracy(quantiles, cdf, eps)
def __init__(self,
             invalidate=False,
             num_in_vocab_tokens: int = 0,
             total_num_tokens: int = 0,
             sum_in_vocab_token_lengths: int = 0,
             num_examples: int = 0) -> None:
  """Initializes per-feature accumulator state for NL statistics.

  Args:
    invalidate: True only if this feature should never be considered, e.g.
      some value_lists have inconsistent types or the feature doesn't have
      an NL domain.
    num_in_vocab_tokens: Running count of in-vocabulary tokens.
    total_num_tokens: Running count of all tokens.
    sum_in_vocab_token_lengths: Sum of lengths of in-vocabulary tokens.
    num_examples: Number of examples accumulated so far.
  """
  self.invalidate = invalidate
  self.num_in_vocab_tokens = num_in_vocab_tokens
  self.total_num_tokens = total_num_tokens
  self.sum_in_vocab_token_lengths = sum_in_vocab_token_lengths
  self.num_examples = num_examples

  def _new_quantiles_sketch():
    # Both quantile sketches below share identical configuration.
    return sketches.QuantilesSketch(_QUANTILES_SKETCH_ERROR,
                                    _QUANTILES_SKETCH_NUM_ELEMENTS,
                                    _QUANTILES_SKETCH_NUM_STREAMS)

  self.vocab_token_length_quantiles = _new_quantiles_sketch()
  # Sequence-length extrema start unset; filled in as batches arrive.
  self.min_sequence_length = None
  self.max_sequence_length = None
  self.sequence_length_quantiles = _new_quantiles_sketch()
  self.token_occurrence_counts = sketches.MisraGriesSketch(
      _NUM_MISRAGRIES_SKETCH_BUCKETS)
  # Maps a token to its _TokenStats, created lazily on first access.
  self.token_statistics = collections.defaultdict(_TokenStats)
  self.reported_sequences_coverage = []
  self.reported_sequences_avg_token_length = []
def test_quantiles_sketch_init(self):
  """Constructor rejects invalid arguments and accepts valid ones."""
  # Each tuple is (constructor args, expected error-message regex).
  error_cases = (
      ((0, 1 << 32, 1), "eps must be positive"),
      ((0.0001, 0, 1), "max_num_elements must be >= 1."),
      ((0.0001, 1 << 32, 0), "num_streams must be >= 1."),
  )
  for args, message in error_cases:
    with self.assertRaisesRegex(RuntimeError, message):
      _ = sketches.QuantilesSketch(*args)
  # A fully valid configuration must construct without raising.
  _ = sketches.QuantilesSketch(0.0001, 1 << 32, 1)
def test_merge(self, values, expected, num_streams, weights=None):
  """Merging two pickle-roundtripped sketches yields the expected quantiles."""
  if weights is None:
    weights = [None] * len(values)
  values_mid = len(values) // 2
  weights_mid = len(weights) // 2
  # Feed the first half of the batches into one sketch...
  first = sketches.QuantilesSketch(0.00001, 1 << 32, num_streams)
  for batch, batch_weights in zip(values[:values_mid], weights[:weights_mid]):
    _add_values(first, batch, batch_weights)
  # ...and the second half into another.
  second = sketches.QuantilesSketch(0.00001, 1 << 32, num_streams)
  for batch, batch_weights in zip(values[values_mid:], weights[weights_mid:]):
    _add_values(second, batch, batch_weights)
  # Merge must work on sketches that have been serialized and restored.
  first = _pickle_roundtrip(first)
  second = _pickle_roundtrip(second)
  first.Merge(second)
  result = first.GetQuantiles(len(expected[0]) - 1).to_pylist()
  np.testing.assert_almost_equal(expected, result)
def test_quantiles(self, values, expected, num_streams, weights=None):
  """Quantiles computed from added batches match the expected values."""
  sketch = sketches.QuantilesSketch(0.00001, 1 << 32, num_streams)
  if weights is None:
    # Unweighted case: pass None through for every batch.
    weights = [None] * len(values)
  for batch, batch_weights in zip(values, weights):
    _add_values(sketch, batch, batch_weights)
  num_boundaries = len(expected[0]) - 1
  result = sketch.GetQuantiles(num_boundaries).to_pylist()
  np.testing.assert_almost_equal(expected, result)
def test_pickle(self, values, expected, num_streams, weights=None):
  """A sketch survives a pickle round trip with its state intact."""
  sketch = sketches.QuantilesSketch(0.00001, 1 << 32, num_streams)
  if weights is None:
    weights = [None] * len(values)
  for batch, batch_weights in zip(values, weights):
    _add_values(sketch, batch, batch_weights)
  serialized = pickle.dumps(sketch)
  self.assertIsInstance(serialized, bytes)
  restored = pickle.loads(serialized)
  self.assertIsInstance(restored, sketches.QuantilesSketch)
  # The deserialized sketch must produce the same quantiles.
  result = restored.GetQuantiles(len(expected[0]) - 1).to_pylist()
  np.testing.assert_almost_equal(expected, result)
def test_accuracy(self, max_num_elements, eps, num_quantiles):
  """Quantile estimates are within the eps error bound of the true CDF."""
  sketch = sketches.QuantilesSketch(eps, max_num_elements, 1)
  # Descending values paired with ascending weights form a nonuniform
  # weighted distribution over [0, max_num_elements).
  values = pa.array(reversed(range(max_num_elements)))
  weights = pa.array(range(max_num_elements))
  total_weight = (max_num_elements - 1) * max_num_elements / 2

  def cdf(x):
    # Total weight of all elements <= x under the weighting above.
    left_weight = (2 * (max_num_elements - 1) - x) * (x + 1) / 2
    return left_weight / total_weight

  _add_values(sketch, values, weights)
  quantiles = sketch.GetQuantiles(num_quantiles - 1).to_pylist()[0]
  self.assert_quantiles_accuracy(quantiles, cdf, eps)
def test_compact(self, values, expected, num_streams, weights=None):
  """Compact() between and after additions does not change the quantiles."""
  sketch = sketches.QuantilesSketch(0.00001, 1 << 32, num_streams)
  half = len(values) // 2
  if weights is None:
    weights = [None] * len(values)
  # Add the first half, compact, then add the rest and compact again.
  for batch, batch_weights in zip(values[:half], weights[:half]):
    _add_values(sketch, batch, batch_weights)
  sketch.Compact()
  for batch, batch_weights in zip(values[half:], weights[half:]):
    _add_values(sketch, batch, batch_weights)
  sketch.Compact()
  result = sketch.GetQuantiles(len(expected[0]) - 1).to_pylist()
  np.testing.assert_almost_equal(expected, result)
def __init__(
    self,  # pylint: disable=useless-super-delegation
    name: Text = 'BasicStatsGenerator',
    schema: Optional[schema_pb2.Schema] = None,
    example_weight_map: ExampleWeightMap = ExampleWeightMap(),
    num_values_histogram_buckets: Optional[int] = 10,
    num_histogram_buckets: Optional[int] = 10,
    num_quantiles_histogram_buckets: Optional[int] = 10,
    epsilon: Optional[float] = 0.01) -> None:
  """Initializes basic statistics generator.

  Args:
    name: An optional unique name associated with the statistics generator.
    schema: An optional schema for the dataset.
    example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column.
    num_values_histogram_buckets: An optional number of buckets in a quantiles
      histogram for the number of values per Feature, which is stored in
      CommonStatistics.num_values_histogram.
    num_histogram_buckets: An optional number of buckets in a standard
      NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: An optional number of buckets in a
      quantiles NumericStatistics.histogram.
    epsilon: An optional error tolerance for the computation of quantiles,
      typically a small fraction close to zero (e.g. 0.01). Higher values of
      epsilon increase the quantile approximation, and hence result in more
      unequal buckets, but could improve performance, and resource
      consumption.
  """
  super(BasicStatsGenerator, self).__init__(name, schema)
  # Feature sets are empty when no schema is provided.
  if schema:
    self._bytes_features = set(schema_util.get_bytes_features(schema))
    self._categorical_features = set(
        schema_util.get_categorical_numeric_features(schema))
  else:
    self._bytes_features = set()
    self._categorical_features = set()
  self._example_weight_map = example_weight_map
  self._num_values_histogram_buckets = num_values_histogram_buckets
  self._num_histogram_buckets = num_histogram_buckets
  self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets

  def _make_sketch():
    # Fresh sketch per call; epsilon is captured from this constructor.
    return sketches.QuantilesSketch(
        eps=epsilon, max_num_elements=1 << 32, num_streams=1)

  self._make_quantiles_sketch_fn = _make_sketch