def test_hll_example(self):
    """End-to-end walkthrough of the HLL sketch API: updates, bounds,
    union, compact-serialization round trip, and reset."""
    k = 12        # lg_k: the table holds 2^12 = 4096 rows
    n = 1 << 18   # roughly 256k unique values

    # Build two sketches whose value ranges overlap by 1/4 of n.
    hll = hll_sketch(k, tgt_hll_type.HLL_8)
    hll2 = hll_sketch(k, tgt_hll_type.HLL_6)

    # Cast to int: hashing is done on the raw bits, not an abstract
    # numeric value, so update(1) and update(1.0) give different results.
    offset = int(3 * n / 4)

    for i in range(n):
        hll.update(i)
        hll2.update(i + offset)

    # Both get_composite_estimate() and get_estimate() exist, but
    # get_estimate() always reports the best available estimate and is
    # the recommended call. The bounds must bracket the estimate
    # without our needing to know the exact true count.
    self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
    self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())

    # Unioning uses a separate class; the union object can either be
    # queried directly or asked for a result sketch.
    union = hll_union(k)
    union.update(hll)
    union.update(hll2)
    result = union.get_result()
    self.assertEqual(result.get_estimate(), union.get_estimate())

    # The whole pipeline (including the post-union HLL) is
    # deterministic, and the exact answer (7n/4 distinct values) is
    # known to lie within one standard deviation of the estimate.
    self.assertLessEqual(union.get_lower_bound(1), 7 * n / 4)
    self.assertGreaterEqual(union.get_upper_bound(1), 7 * n / 4)

    # Serialize for storage, then reconstruct from the bytes.
    sk_bytes = result.serialize_compact()
    self.assertEqual(len(sk_bytes), result.get_compact_serialization_bytes())
    new_hll = hll_sketch.deserialize(sk_bytes)

    # The rebuilt sketch self-reports its configuration and status.
    self.assertEqual(new_hll.lg_config_k, k)
    self.assertEqual(new_hll.tgt_type, tgt_hll_type.HLL_4)
    self.assertFalse(new_hll.is_empty())

    # reset() reclaims some object overhead and empties the sketch.
    new_hll.reset()
    self.assertTrue(new_hll.is_empty())
def calculate_sketch_statistics(data):
    """Summarize every numeric column of *data* with streaming sketches.

    For each int32/int64/float64 column, computes approximate quantiles
    (0.05, Q1, median, Q3, 0.95) with a KLL sketch and an approximate
    distinct count with an HLL sketch.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; non-numeric columns are skipped.

    Returns
    -------
    dict
        Maps column name -> list of ``[stat_name, stat_value]`` pairs.
    """
    stats_dict = {}
    # Iterate names together with dtypes; `dtype` avoids shadowing the
    # builtin `type` (which the previous version clobbered).
    for column, dtype in zip(data.columns, data.dtypes):
        if dtype not in (np.int32, np.int64, np.float64):
            continue  # only numeric columns are summarized
        data_col = data[column].to_numpy()
        # Pick the sketch flavor from the dtype we already have instead
        # of re-reading data[column].dtype a second time.
        if dtype in (np.int32, np.int64):
            kll = kll_ints_sketch(2048)
        else:  # np.float64
            kll = kll_floats_sketch(2048)
        kll.update(data_col)
        stat_values = kll.get_quantiles([0.05, 0.25, 0.5, 0.75, 0.95])
        stat_names = ["0.05", "Q1", "Median", "Q3", "0.95"]
        # Approximate distinct count via HLL. Bulk array update works
        # with the local fork (np.array extension) — TODO confirm the
        # upstream package accepts numpy arrays directly.
        hll = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
        hll.update(data_col)
        stat_values.append(round(hll.get_estimate()))
        stat_names.append("Distinct Count")
        stats_dict[column] = [list(pair) for pair in zip(stat_names, stat_values)]
    return stats_dict
def __init__(self, lg_k=None, sketch=None):
    """Wrap a datasketches HLL sketch.

    Parameters
    ----------
    lg_k : int, optional
        log2 of the sketch table size; defaults to ``DEFAULT_LG_K``
        when a new sketch is created.
    sketch : datasketches.hll_sketch, optional
        Existing sketch to wrap; a fresh one is built when omitted.
    """
    if sketch is None:
        if lg_k is None:
            lg_k = DEFAULT_LG_K
        sketch = datasketches.hll_sketch(lg_k)
    elif lg_k is None:
        # Fix: the original left self.lg_k as None when wrapping an
        # existing sketch; recover the configuration from the sketch
        # itself so self.lg_k is always a valid int.
        lg_k = sketch.lg_config_k
    assert isinstance(sketch, datasketches.hll_sketch)
    self.sketch = sketch
    self.lg_k = lg_k
def compute_metrics(self, properties: Set[Property], repo: MetadataRepository):
    """Compute metrics for *properties*.

    Quantile and ApproxDistinctness properties are handled here with
    KLL / HLL sketches (registering their serialized state in *repo*);
    everything else is delegated to ``self.engine``.
    """
    quantile_props = [p for p in properties if isinstance(p, Quantile)]
    quantile_metrics: Dict[Property, Metric] = {}
    for prop in quantile_props:
        col = self.data[prop.column].to_numpy()
        dtype = self.data[prop.column].dtype
        if dtype == np.int64:
            kll = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
            sketch_type = "ints"
        elif dtype == np.float64:
            kll = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
            sketch_type = "floats"
        else:
            raise NotImplementedError(
                f"Data Type {dtype} is not supported for sketches!"
            )
        kll.update(col)
        quantile = kll.get_quantiles([prop.quantile])[0]
        # Persist the sketch state hex-encoded (decode with bytes.fromhex()).
        quantile_state = QuantileState(
            prop.property_identifier(),
            kll.serialize().hex(),
            quantile,
            sketch_type,
        )
        repo.register_state(quantile_state)
        quantile_metrics[prop] = metric_from_value(
            quantile, prop.name, prop.instance, prop.entity
        )

    approx_props = [p for p in properties if isinstance(p, ApproxDistinctness)]
    approx_metrics: Dict[Property, Metric] = {}
    for prop in approx_props:
        col = self.data[prop.column].to_numpy()
        hll = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
        # Bulk update over the whole numpy array; per-element update is
        # much slower (works with local fork's np.array extension).
        hll.update(col)
        approx_count = hll.get_estimate()
        num_rows = len(col)
        # Persist the updatable sketch state hex-encoded.
        approx_state = ApproxDistinctState(
            prop.property_identifier(),
            hll.serialize_updatable().hex(),
            approx_count,
            num_rows,
        )
        repo.register_state(approx_state)
        # Distinctness is capped at 1.0 (estimate can exceed row count).
        approx_metrics[prop] = metric_from_value(
            min(approx_count / num_rows, 1.00),
            prop.name,
            prop.instance,
            prop.entity,
        )

    remaining = [
        p for p in properties
        if not isinstance(p, (Quantile, ApproxDistinctness))
    ]
    metrics = self.engine.compute_metrics(remaining, repo)
    metrics.update(quantile_metrics)
    metrics.update(approx_metrics)
    return metrics
def generate_sketch(self, n, k, sk_type=tgt_hll_type.HLL_4, st_idx=0):
    """Return an hll_sketch with lg_k=k fed n consecutive integers
    starting at st_idx."""
    sketch = hll_sketch(k, sk_type)
    for value in range(st_idx, st_idx + n):
        sketch.update(value)
    return sketch