Example #1
0
 def __init__(
     self,
     variance: VarianceTracker = None,
     floats: FloatTracker = None,
     ints: IntTracker = None,
     theta_sketch: ThetaSketch = None,
     histogram: datasketches.kll_floats_sketch = None,
     frequent_numbers: dsketch.FrequentNumbersSketch = None,
 ):
     """Store the given trackers/sketches, creating a default for each one left as None.

     Every argument is optional. The default histogram is a
     ``kll_floats_sketch`` built with ``DEFAULT_HIST_K``; every other
     default is the corresponding class called with no arguments.
     """
     # Our own trackers
     self.variance = VarianceTracker() if variance is None else variance
     self.floats = FloatTracker() if floats is None else floats
     self.ints = IntTracker() if ints is None else ints
     # Sketch-backed state
     self.theta_sketch = ThetaSketch() if theta_sketch is None else theta_sketch
     self.histogram = (
         datasketches.kll_floats_sketch(DEFAULT_HIST_K)
         if histogram is None
         else histogram
     )
     self.frequent_numbers = (
         dsketch.FrequentNumbersSketch()
         if frequent_numbers is None
         else frequent_numbers
     )
Example #2
0
def calculate_sketch_statistics(data):
    """Compute sketch-based statistics for the numeric columns of *data*.

    Columns whose dtype is int32, int64 or float64 are summarised; all
    other columns are skipped. For each summarised column a KLL sketch
    (k=2048) provides the 0.05 / 0.25 / 0.5 / 0.75 / 0.95 quantiles and an
    HLL sketch provides a rounded distinct-count estimate.

    Returns a dict mapping each summarised column name to a list of
    ``[statistic_name, value]`` pairs.
    """
    integer_dtypes = [np.int32, np.int64]
    quantile_fractions = [0.05, 0.25, 0.5, 0.75, 0.95]

    stats_dict = {}
    for column, dtype in zip(list(data.columns), list(data.dtypes)):
        if dtype not in [np.int32, np.int64, np.float64]:
            continue

        series = data[column]
        data_col = series.to_numpy()
        # Integer columns get the integer sketch, float64 the float sketch.
        if series.dtype in integer_dtypes:
            kll = kll_ints_sketch(2048)
        elif series.dtype == np.float64:
            kll = kll_floats_sketch(2048)
        kll.update(data_col)
        stat_names = ["0.05", "Q1", "Median", "Q3", "0.95"]
        stat_values = kll.get_quantiles(quantile_fractions)

        hll = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
        hll.update(data_col)  # works with local fork (np.array extension)
        stat_values.append(round(hll.get_estimate()))
        stat_names.append("Distinct Count")

        stats_dict[column] = [
            [name, value] for name, value in zip(stat_names, stat_values)
        ]

    return stats_dict
def test_histogram_summary():
    """A sketch fed nine distinct values converts to a summary with more than one count."""
    vals = [float(v) for v in range(1, 10)]
    hist = datasketches.kll_floats_sketch(256)
    for sample in vals:
        hist.update(sample)

    summary = summaryconverters.histogram_from_sketch(hist)
    _hist_summary_check(summary, vals)
    assert len(summary.counts) > 1
def test_single_value_histogram_summary():
    """A sketch fed thirty copies of one value converts to a summary with exactly one count."""
    vals = [1.0] * 30
    hist = datasketches.kll_floats_sketch(256)
    for sample in vals:
        hist.update(sample)

    summary = summaryconverters.histogram_from_sketch(hist)
    _hist_summary_check(summary, vals)
    assert len(summary.counts) == 1
Example #5
0
    def test_kll_example(self):
        """Exercise the main kll_floats_sketch operations on ~1M N(0,1) samples."""
        from numpy.random import randn
        k = 160
        n = 2**20

        # fill a fresh sketch with n draws from a standard normal
        sketch = kll_floats_sketch(k)
        for _ in range(n):
            sketch.update(randn())

        # rank of 0 should be close to one half ...
        self.assertAlmostEqual(0.5, sketch.get_rank(0.0), delta=0.02)

        # ... and conversely the 0.5 quantile should be close to 0
        self.assertAlmostEqual(0.0, sketch.get_quantile(0.5), delta=0.02)

        # min and max are queried separately from the quantiles and must
        # bracket them
        self.assertLessEqual(sketch.get_min_value(), sketch.get_quantile(0.01))
        self.assertLessEqual(0.0, sketch.get_rank(sketch.get_min_value()))
        self.assertGreaterEqual(sketch.get_max_value(),
                                sketch.get_quantile(0.99))
        self.assertGreaterEqual(1.0, sketch.get_rank(sketch.get_max_value()))

        # several quantiles can be requested in one call; these ranks should
        # land near [-2, -1, 0, 1, 2]. Feeding the points back into get_cdf
        # should approximately recover the ranks, with one extra trailing
        # entry that accounts for all probability mass.
        ranks = [0.0228, 0.1587, 0.5, 0.8413, 0.9772]
        points = sketch.get_quantiles(ranks)
        cumulative = sketch.get_cdf(points)
        self.assertEqual(len(cumulative), len(points) + 1)

        # the instance error bound must agree with the class-level one for k
        expected_err = kll_floats_sketch.get_normalized_rank_error(k, False)
        self.assertEqual(sketch.normalized_rank_error(False), expected_err)

        # basic state queries
        self.assertFalse(sketch.is_empty())
        self.assertTrue(sketch.is_estimation_mode())
        self.assertEqual(sketch.get_n(), n)
        self.assertLess(sketch.get_num_retained(), n)

        # merging the sketch into itself doubles the item count
        sketch.merge(sketch)
        self.assertEqual(sketch.get_n(), 2 * n)

        # a serialize/deserialize round trip must preserve the query results
        payload = sketch.serialize()
        restored = sketch.deserialize(payload)
        self.assertEqual(sketch.get_num_retained(),
                         restored.get_num_retained())
        self.assertEqual(sketch.get_min_value(), restored.get_min_value())
        self.assertEqual(sketch.get_max_value(), restored.get_max_value())
        self.assertEqual(sketch.get_quantile(0.7), restored.get_quantile(0.7))
        self.assertEqual(sketch.get_rank(0.0), restored.get_rank(0.0))
Example #6
0
 def run_step(self, run_number: int, step_size: int,
              howlong: float) -> ReturnRunStep:
     """Fold the next batch of created rows into the running sketch and publish results.

     Up to ``step_size`` newly created rows of ``self.column`` are sketched
     and merged into ``self._kll``. The result dict is then refreshed with
     the sketch's min and max, plus the quantiles and/or the splits and pmf
     requested through ``self.params``. If no new rows are available the
     step returns in the blocked state with zero steps run.

     ``run_number`` and ``howlong`` are not used by this body.
     """
     assert self.context
     with self.context as ctx:
         new_rows = ctx.table.created.next(step_size)  # returns a slice
         n_rows = indices_len(new_rows)
         if not n_rows:
             return self._return_run_step(self.state_blocked, steps_run=0)

         # Extract the freshly created values of the tracked column.
         chunk = ctx.table.data()[self.column].loc[fix_loc(new_rows)]

         # Sketch the batch on its own, then merge it into the running sketch.
         running = self._kll
         batch_sketch = kll_floats_sketch(self._k)
         batch_sketch.update(chunk)
         assert running
         running.merge(batch_sketch)

         max_ = running.get_max_value()
         min_ = running.get_min_value()
         quantiles: Floats = []
         splits: Floats = []
         pmf: Floats = []
         if self.params.quantiles:
             quantiles = running.get_quantiles(self.params.quantiles)

         binning = self.params.binning
         if binning:
             if isinstance(binning, integer_types):
                 # An integer asks for that many evenly spaced points over
                 # the observed range.
                 splits = np.linspace(min_, max_, binning)
                 pmf = running.get_pmf(splits[:-1])
             elif isinstance(binning, Sequence):
                 # Explicit split points are used as given.
                 splits = binning
                 pmf = running.get_pmf(splits)
             elif isinstance(binning, dict):
                 # A dict gives the range and the number of points.
                 splits = np.linspace(binning["lower"], binning["upper"],
                                      binning["n_splits"])
                 pmf = running.get_pmf(splits[:-1])

         res = {
             "max": max_,
             "min": min_,
             "quantiles": quantiles,
             "splits": splits,
             "pmf": pmf,
         }
         if self.result is None:
             self.result = PsDict(res)
         else:
             self.psdict.update(res)
         return self._return_run_step(self.next_state(ctx.table), n_rows)
Example #7
0
 def test_kll2(self):
     """Quantiles published by KLLSketch match a reference sketch over the same column."""
     np.random.seed(42)
     sched = self.scheduler()
     table = RandomTable(3, rows=10_000, scheduler=sched)
     module = KLLSketch(column="_1", scheduler=sched)
     module.params.quantiles = QUANTILES
     module.input[0] = table.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = module.output.result
     aio.run(sched.start())

     # Reference: sketch the whole column in one go.
     reference = kll_floats_sketch(K)
     reference.update(table.result["_1"].value)
     expected = reference.get_quantiles(QUANTILES)
     self.compare(module.result["quantiles"], expected)
Example #8
0
 def test_kll4(self):
     """With explicit split points, the published pmf matches a reference sketch."""
     np.random.seed(42)
     sched = self.scheduler()
     table = RandomTable(3, rows=10_000, scheduler=sched)
     module = KLLSketch(column="_1", scheduler=sched)
     module.params.binning = SPLITS_SEQ
     module.input[0] = table.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = module.output.result
     aio.run(sched.start())

     # Reference: sketch the whole column in one go, same split points.
     reference = kll_floats_sketch(K)
     reference.update(table.result["_1"].value)
     expected_pmf = reference.get_pmf(SPLITS_SEQ)
     self.compare(module.result["pmf"], expected_pmf)
Example #9
0
 def test_kll(self):
     """Without quantile or binning params only min and max are populated."""
     np.random.seed(42)
     sched = self.scheduler()
     table = RandomTable(3, rows=10_000, scheduler=sched)
     module = KLLSketch(column="_1", scheduler=sched)
     module.input[0] = table.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = module.output.result
     aio.run(sched.start())

     # Reference: sketch the whole column in one go.
     reference = kll_floats_sketch(K)
     reference.update(table.result["_1"].value)
     self.assertAlmostEqual(module.result["max"], reference.get_max_value())
     self.assertAlmostEqual(module.result["min"], reference.get_min_value())
     # Nothing was requested, so the optional outputs stay empty.
     for key in ("quantiles", "splits", "pmf"):
         self.assertEqual(module.result[key], [])
Example #10
0
def calculate_sketch_statistics_np(np_arr):
    """Compute KLL quantile statistics for the numeric entries of *np_arr*.

    *np_arr* must provide ``keys()`` and item access returning objects with
    a ``dtype``. Entries whose dtype is int32, int64 or float64 are
    summarised with a KLL sketch (k=2048); all others are skipped.

    Returns a dict mapping each summarised key to a list of
    ``[quantile_name, value]`` pairs for the 0.05 / 0.25 / 0.5 / 0.75 /
    0.95 quantiles.
    """
    integer_dtypes = (np.int32, np.int64)
    quantile_names = ("0.05", "Q1", "Median", "Q3", "0.95")

    stats_dict = {}
    for column in np_arr.keys():
        dtype = np_arr[column].dtype
        if dtype not in [np.int32, np.int64, np.float64]:
            continue

        data_col = np_arr[column]
        # Past the guard, anything that is not an integer dtype is float64.
        make_sketch = (kll_ints_sketch
                       if dtype in integer_dtypes else kll_floats_sketch)
        kll = make_sketch(2048)
        kll.update(data_col)
        quantiles = kll.get_quantiles([0.05, 0.25, 0.5, 0.75, 0.95])
        stats_dict[column] = [
            [name, value] for name, value in zip(quantile_names, quantiles)
        ]

    return stats_dict
Example #11
0
 def test_kll5(self):
     """With a dict binning spec, the published pmf matches a reference sketch."""
     np.random.seed(42)
     sched = self.scheduler()
     table = RandomTable(3, rows=10_000, scheduler=sched)
     module = KLLSketch(column="_1", scheduler=sched)
     module.params.binning = SPLITS_DICT
     module.input[0] = table.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = module.output.result
     aio.run(sched.start())

     # Reference: sketch the whole column in one go.
     reference = kll_floats_sketch(K)
     reference.update(table.result["_1"].value)
     # Rebuild the split points from the dict; the last point is dropped
     # before asking for the pmf.
     edges = np.linspace(SPLITS_DICT["lower"], SPLITS_DICT["upper"],
                         SPLITS_DICT["n_splits"])
     expected_pmf = reference.get_pmf(edges[:-1])
     self.compare(module.result["pmf"], expected_pmf)
Example #12
0
 def test_kll3(self):
     """With an integer binning spec, the published pmf matches a reference sketch."""
     np.random.seed(42)
     sched = self.scheduler()
     table = RandomTable(3, rows=10_000, scheduler=sched)
     module = KLLSketch(column="_1", scheduler=sched)
     module.params.binning = BINS
     module.input[0] = table.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = module.output.result
     aio.run(sched.start())

     # Reference: sketch the whole column in one go.
     reference = kll_floats_sketch(K)
     reference.update(table.result["_1"].value)
     # BINS evenly spaced points over the observed range; the last point is
     # dropped before asking for the pmf.
     highest = reference.get_max_value()
     lowest = reference.get_min_value()
     edges = np.linspace(lowest, highest, BINS)
     expected_pmf = reference.get_pmf(edges[:-1])
     self.compare(module.result["pmf"], expected_pmf)
Example #13
0
    def metrics_from_states(
        self, properties_and_states: Dict[Property,
                                          State]) -> Dict[Property, Metric]:
        """Turn previously stored states back into metrics, one per property.

        Quantile properties are recomputed from the hex-serialized KLL
        sketch held in the state. ApproxDistinctness is the stored distinct
        count divided by the stored row count, capped at 1.0. Schema
        properties use the stored schema as the value. Any other property is
        delegated to the operator from ``SQLOperatorFactory``.
        """
        metrics: Dict[Property, Metric] = {}
        for prop, state in properties_and_states.items():
            if isinstance(prop, Quantile):
                # sketch_type selects the class used to deserialize the sketch.
                if state.sketch_type == "floats":
                    prototype = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
                else:
                    prototype = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
                restored = prototype.deserialize(
                    bytes.fromhex(state.serializedKll))
                value = restored.get_quantiles([prop.quantile])[0]
            elif isinstance(prop, ApproxDistinctness):
                value = min(state.approx_distinct_count / state.num_rows,
                            1.00)
            elif isinstance(prop, Schema):
                value = state.schema
            else:
                # No special handling here: let the SQL operator build the metric.
                sql_operator = SQLOperatorFactory.create_operator(prop)
                metrics[prop] = sql_operator.get_metric(state)
                continue

            metrics[prop] = metric_from_value(value, prop.name,
                                              prop.instance, prop.entity)

        return metrics
Example #14
0
 def __init__(self, column: str, k: int = 200, **kwds: Any) -> None:
     """Create the module for *column* with an empty KLL float sketch of parameter *k*.

     Remaining keyword arguments are forwarded to the parent constructor.
     """
     super().__init__(**kwds)
     self.default_step_size: int = 10_000
     self.column: str = column
     self._k: int = k
     # Running sketch that accumulates every processed batch.
     self._kll: kll_floats_sketch = kll_floats_sketch(self._k)
Example #15
0
 def test_kll_floats_sketch(self):
     # The ints variant is already tested and the class is templatized, so
     # only check that the floats variant instantiates properly.
     k = 75
     sketch = kll_floats_sketch(k)
     self.assertTrue(sketch.is_empty())
Example #16
0
    def compute_metrics(self, properties: Set[Property],
                        repo: MetadataRepository):
        """Compute metrics for *properties*, handling sketch-based ones locally.

        Quantile properties are answered with a KLL sketch over the column
        (int64 or float64 only; any other dtype raises NotImplementedError).
        ApproxDistinctness properties are answered with an HLL sketch. For
        both, the serialized sketch state is registered with *repo*. Every
        remaining property is delegated to ``self.engine.compute_metrics``,
        and the sketch-based metrics are merged into its result.
        """
        # --- quantiles via KLL -------------------------------------------
        quantile_properties = [
            prop for prop in properties if isinstance(prop, Quantile)
        ]
        quantile_metrics: Dict[Property, Metric] = {}
        for quantile_property in quantile_properties:
            column = self.data[quantile_property.column]
            data_col = column.to_numpy()
            if column.dtype == np.int64:
                kll, sketch_type = kll_ints_sketch(DEFAULT_SKETCH_SIZE), "ints"
            elif column.dtype == np.float64:
                kll, sketch_type = (kll_floats_sketch(DEFAULT_SKETCH_SIZE),
                                    "floats")
            else:
                raise NotImplementedError(
                    f"Data Type {self.data[quantile_property.column].dtype} is not supported for sketches!"
                )
            kll.update(data_col)
            quantile = kll.get_quantiles([quantile_property.quantile])[0]
            # Stored as hex text; readers decode it with bytes.fromhex().
            serialized_kll = kll.serialize().hex()
            repo.register_state(
                QuantileState(quantile_property.property_identifier(),
                              serialized_kll, quantile, sketch_type))
            quantile_metrics[quantile_property] = metric_from_value(
                quantile, quantile_property.name, quantile_property.instance,
                quantile_property.entity)

        # --- approximate distinctness via HLL ----------------------------
        approx_distinct_properties = [
            prop for prop in properties
            if isinstance(prop, ApproxDistinctness)
        ]
        approx_distinct_metrics: Dict[Property, Metric] = {}
        for approx_distinct_property in approx_distinct_properties:
            data_col = self.data[approx_distinct_property.column].to_numpy()
            hll = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
            # Whole-array update works with local fork (np.array extension);
            # updating value by value is slow.
            hll.update(data_col)
            approx_distinct_count = hll.get_estimate()
            num_rows = len(data_col)
            serialized_hll = hll.serialize_updatable().hex()
            repo.register_state(
                ApproxDistinctState(
                    approx_distinct_property.property_identifier(),
                    serialized_hll, approx_distinct_count, num_rows))
            # The estimate can exceed the row count, so cap the ratio at 1.
            approx_distinctness = min(approx_distinct_count / num_rows, 1.00)
            approx_distinct_metrics[
                approx_distinct_property] = metric_from_value(
                    approx_distinctness, approx_distinct_property.name,
                    approx_distinct_property.instance,
                    approx_distinct_property.entity)

        # --- everything else goes to the wrapped engine ------------------
        other_properties = [
            prop for prop in properties
            if not isinstance(prop, (Quantile, ApproxDistinctness))
        ]
        metrics = self.engine.compute_metrics(other_properties, repo)
        metrics.update(quantile_metrics)
        metrics.update(approx_distinct_metrics)
        return metrics
Example #17
0
    def __merge_states(self, states: Sequence[State]) -> State:
        """Merge several partial states of the same kind into a single state.

        The kind of merge is chosen from the type of ``states[0]``; all
        states are expected to be of that same type. The merged state keeps
        the id of the first state.

        Args:
            states: Non-empty sequence of states to combine.

        Returns:
            The merged state, or ``None`` when the state type is not one of
            those handled here.

        Raises:
            IndexError: If ``states`` is empty.
            NotImplementedError: For ``FrequenciesAndNumRows`` states.
        """
        first_state = states[0]
        result_state = None
        if isinstance(first_state, SchemaState):
            # Schemas are not combined; the first one is returned as is.
            result_state = first_state
        elif isinstance(first_state, MaxState):
            result_state = MaxState(
                first_state.id, max(state.max_value for state in states))
        elif isinstance(first_state, MeanState):
            total = sum(state.total for state in states)
            count = sum(state.count for state in states)
            result_state = MeanState(first_state.id, total, count)
        elif isinstance(first_state, MinState):
            result_state = MinState(
                first_state.id, min(state.min_value for state in states))
        elif isinstance(first_state, NumMatches):
            num_matches = sum(state.num_matches for state in states)
            result_state = NumMatches(first_state.id, num_matches)
        elif isinstance(first_state, NumMatchesAndCount):
            num_matches = sum(state.num_matches for state in states)
            count = sum(state.count for state in states)
            result_state = NumMatchesAndCount(first_state.id, num_matches,
                                              count)
        elif isinstance(first_state, QuantileState):
            # sketch_type selects the class used to deserialize the sketches.
            if first_state.sketch_type == "floats":
                kll_ser = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
            else:
                kll_ser = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
            main_kll = kll_ser.deserialize(
                bytes.fromhex(first_state.serializedKll))
            for state in states[1:]:
                main_kll.merge(
                    kll_ser.deserialize(bytes.fromhex(state.serializedKll)))

            result_state = QuantileState(first_state.id,
                                         main_kll.serialize().hex(),
                                         first_state.quantile,
                                         first_state.sketch_type)
        elif isinstance(first_state, ApproxDistinctState):
            main_hll = hll_sketch.deserialize(
                bytes.fromhex(first_state.serializedHll))
            num_rows = first_state.num_rows
            for state in states[1:]:
                num_rows = num_rows + state.num_rows
                new_hll = hll_sketch.deserialize(
                    bytes.fromhex(state.serializedHll))
                # NOTE(review): update() is given a whole sketch here;
                # presumably this relies on the project's local datasketches
                # fork -- confirm it actually merges the two sketches.
                main_hll.update(new_hll)
            approx_distinct_count = main_hll.get_estimate()
            serialized_hll = main_hll.serialize_updatable().hex()
            result_state = ApproxDistinctState(first_state.id, serialized_hll,
                                               approx_distinct_count, num_rows)
        elif isinstance(first_state, StandardDeviationState):
            n: float = first_state.n
            avg: float = first_state.avg
            m2: float = first_state.m2
            stddev: float = first_state.stddev
            for state in states[1:]:
                # Pairwise combination of (n, mean, M2). The weights and the
                # delta must use the values from BEFORE this state is folded
                # in, so the combined count is kept in its own variable.
                combined_n = n + state.n
                delta = state.avg - avg
                if combined_n:
                    avg = (n * avg + state.n * state.avg) / combined_n
                    m2 = (m2 + state.m2 +
                          delta * delta * n * state.n / combined_n)
                else:
                    # Both sides are empty; there is nothing to weight.
                    m2 = m2 + state.m2
                n = combined_n
                # NOTE(review): this is M2 / (n - 1), i.e. a sample variance
                # with no square root. Kept as in the original; confirm it
                # matches how StandardDeviationState.stddev is produced
                # elsewhere.
                stddev = (m2 / (n - 1)) if n > 1 else 0
            result_state = StandardDeviationState(first_state.id, n, avg, m2,
                                                  stddev)
        elif isinstance(first_state, SumState):
            sum_value = sum(state.sum_value for state in states)
            result_state = SumState(first_state.id, sum_value)
        elif isinstance(first_state, FrequenciesAndNumRows):
            raise NotImplementedError(
                "Merging of FrequenciesAndNumRows states not implemented, yet")

        return result_state
Example #18
0
 def reset(self) -> None:
     """Clear any published result and start over with an empty sketch."""
     already_published = self.result is not None
     if already_published:
         self.psdict.clear()
     # Replace the running sketch with a fresh one of the same parameter k.
     self._kll = kll_floats_sketch(self._k)