Beispiel #1
0
    def test_hll_example(self):
        """Exercise the HLL sketch workflow: update, bounds, union, serialize, reset."""
        lg_k = 12             # 2^lg_k = 4096 rows in the table
        num_values = 1 << 18  # ~256k unique values

        # Two sketches with different target types.  The second stream is
        # shifted by 3/4 of the range, so 1/4 of the values overlap.
        first = hll_sketch(lg_k, tgt_hll_type.HLL_8)
        second = hll_sketch(lg_k, tgt_hll_type.HLL_6)
        shift = 3 * num_values // 4  # floor division keeps the offset an int

        # Hashing is done on the bits rather than on an abstract numeric
        # value, so update(1) and update(1.0) give different results; only
        # ints are fed here.
        for value in range(num_values):
            first.update(value)
            second.update(value + shift)

        # get_composite_estimate() also exists, but get_estimate() always
        # gives the best available estimate and is the recommended call.
        # The bounds must bracket the estimate regardless of the exact value.
        estimate = first.get_estimate()
        self.assertLessEqual(first.get_lower_bound(1), estimate)
        self.assertGreaterEqual(first.get_upper_bound(1), estimate)

        # Unions live in a separate class; the union object can be queried
        # directly or asked for a result sketch, and both must agree.
        merger = hll_union(lg_k)
        for sketch in (first, second):
            merger.update(sketch)
        merged = merger.get_result()
        self.assertEqual(merged.get_estimate(), merger.get_estimate())

        # The whole process (including the post-union HLL) is deterministic,
        # and the exact answer has been checked to lie within one standard
        # deviation of the estimate.
        exact_count = 7 * num_values / 4
        self.assertLessEqual(merger.get_lower_bound(1), exact_count)
        self.assertGreaterEqual(merger.get_upper_bound(1), exact_count)

        # Round-trip through the compact serialized form.
        payload = merged.serialize_compact()
        self.assertEqual(len(payload), merged.get_compact_serialization_bytes())
        restored = hll_sketch.deserialize(payload)

        # The sketch can self-report its configuration and status.
        self.assertEqual(restored.lg_config_k, lg_k)
        self.assertEqual(restored.tgt_type, tgt_hll_type.HLL_4)
        self.assertFalse(restored.is_empty())

        # Resetting reuses the object instead of allocating a new sketch.
        restored.reset()
        self.assertTrue(restored.is_empty())
Beispiel #2
0
def calculate_sketch_statistics(data):
    """Compute sketch-based summary statistics for the numeric columns of `data`.

    Only columns whose dtype is int32, int64 or float64 are processed; every
    other column is skipped.  For each processed column a KLL sketch supplies
    five quantiles and an HLL sketch supplies an approximate distinct count.

    Returns:
        dict mapping column name to a list of ``[stat_name, value]`` pairs,
        in the order 0.05, Q1, Median, Q3, 0.95, Distinct Count.
    """
    integer_dtypes = (np.int32, np.int64)
    quantile_labels = ["0.05", "Q1", "Median", "Q3", "0.95"]
    quantile_ranks = [0.05, 0.25, 0.5, 0.75, 0.95]

    stats_dict = {}
    for column, col_dtype in zip(data.columns, data.dtypes):
        # Guard clause: unsupported dtypes contribute nothing to the result.
        if col_dtype not in (np.int32, np.int64, np.float64):
            continue

        series = data[column]
        values = series.to_numpy()

        # The KLL sketch flavour follows the column's dtype.
        if series.dtype in integer_dtypes:
            kll = kll_ints_sketch(2048)
        elif series.dtype == np.float64:
            kll = kll_floats_sketch(2048)
        kll.update(values)
        stat_values = kll.get_quantiles(quantile_ranks)
        stat_names = list(quantile_labels)

        # Bulk update with an ndarray; the original author notes this relies
        # on a local fork of the library (np.array extension).
        hll = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
        hll.update(values)
        stat_values.append(round(hll.get_estimate()))
        stat_names.append("Distinct Count")

        stats_dict[column] = [
            [name, value] for name, value in zip(stat_names, stat_values)
        ]

    return stats_dict
Beispiel #3
0
 def __init__(self, lg_k=None, sketch=None):
     """Wrap an HLL sketch, creating a fresh one when none is supplied.

     Args:
         lg_k: Size parameter handed to ``datasketches.hll_sketch`` when a
             new sketch has to be created.  Falls back to ``DEFAULT_LG_K``
             when no sketch is given, or to the supplied sketch's own
             ``lg_config_k`` when a sketch is given.
         sketch: An existing ``datasketches.hll_sketch`` to wrap.  When
             omitted, an empty sketch is created with ``lg_k``.

     Raises:
         AssertionError: if ``sketch`` is not a ``datasketches.hll_sketch``.
     """
     if sketch is None:
         if lg_k is None:
             lg_k = DEFAULT_LG_K
         sketch = datasketches.hll_sketch(lg_k)
     assert isinstance(sketch, datasketches.hll_sketch)
     if lg_k is None:
         # A caller-supplied sketch already carries its configuration; read
         # it back so self.lg_k is never left as None next to a live sketch.
         lg_k = sketch.lg_config_k
     self.sketch = sketch
     self.lg_k = lg_k
Beispiel #4
0
    def compute_metrics(self, properties: Set[Property],
                        repo: MetadataRepository):
        """Compute metrics for `properties`, handling sketch-based ones locally.

        Quantile properties are answered with a KLL sketch and
        ApproxDistinctness properties with an HLL sketch.  For each of them
        the serialized sketch state is registered with `repo`.  All remaining
        properties are delegated to ``self.engine.compute_metrics``, and the
        locally computed metrics are merged into that result.

        Raises:
            NotImplementedError: if a Quantile property targets a column
                whose dtype is neither int64 nor float64.
        """
        # Pass 1: quantiles via KLL sketches.
        quantile_metrics: Dict[Property, Metric] = {}
        for prop in properties:
            if not isinstance(prop, Quantile):
                continue
            series = self.data[prop.column]
            values = series.to_numpy()
            dtype = series.dtype
            if dtype == np.int64:
                kll, sketch_type = kll_ints_sketch(DEFAULT_SKETCH_SIZE), "ints"
            elif dtype == np.float64:
                kll, sketch_type = kll_floats_sketch(DEFAULT_SKETCH_SIZE), "floats"
            else:
                raise NotImplementedError(
                    f"Data Type {dtype} is not supported for sketches!"
                )
            kll.update(values)
            quantile_value = kll.get_quantiles([prop.quantile])[0]
            # Stored as a hex string; bytes.fromhex() reverses it.
            kll_hex = kll.serialize().hex()
            repo.register_state(
                QuantileState(prop.property_identifier(), kll_hex,
                              quantile_value, sketch_type))
            quantile_metrics[prop] = metric_from_value(
                quantile_value, prop.name, prop.instance, prop.entity)

        # Pass 2: approximate distinctness via HLL sketches.
        approx_distinct_metrics: Dict[Property, Metric] = {}
        for prop in properties:
            if not isinstance(prop, ApproxDistinctness):
                continue
            values = self.data[prop.column].to_numpy()
            hll = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
            # Bulk ndarray update: per-value updates were noted as slow, and
            # this form relies on a local fork (np.array extension).
            hll.update(values)
            distinct_estimate = hll.get_estimate()
            row_count = len(values)
            # Stored as a hex string; bytes.fromhex() reverses it.
            hll_hex = hll.serialize_updatable().hex()
            repo.register_state(
                ApproxDistinctState(prop.property_identifier(), hll_hex,
                                    distinct_estimate, row_count))
            # The estimate may exceed the row count, so the ratio is capped.
            distinctness = min(distinct_estimate / row_count, 1.00)
            approx_distinct_metrics[prop] = metric_from_value(
                distinctness, prop.name, prop.instance, prop.entity)

        # Pass 3: everything else goes to the wrapped engine.
        remaining = [
            prop for prop in properties
            if not isinstance(prop, (Quantile, ApproxDistinctness))
        ]
        metrics = self.engine.compute_metrics(remaining, repo)
        metrics.update(quantile_metrics)
        metrics.update(approx_distinct_metrics)
        return metrics
Beispiel #5
0
 def generate_sketch(self, n, k, sk_type=tgt_hll_type.HLL_4, st_idx=0):
     """Return an HLL sketch updated with the `n` consecutive ints from `st_idx`.

     `k` and `sk_type` are forwarded unchanged to the `hll_sketch` constructor.
     """
     sketch = hll_sketch(k, sk_type)
     values = range(st_idx, st_idx + n)
     for value in values:
         sketch.update(value)
     return sketch