Exemple #1
0
def calculate_sketch_statistics(data):
    """Summarise the numeric columns of ``data`` with approximate sketches.

    Only columns whose dtype is ``np.int32``, ``np.int64`` or ``np.float64``
    are processed; every other column is skipped. For each processed column
    a KLL sketch (k=2048) supplies five quantiles and an HLL sketch supplies
    a rounded distinct-count estimate.

    Returns a dict mapping column name to a list of ``[label, value]`` pairs
    in the order "0.05", "Q1", "Median", "Q3", "0.95", "Distinct Count".
    """
    supported_dtypes = (np.int32, np.int64, np.float64)
    integer_dtypes = (np.int32, np.int64)

    summary = {}
    for name, dtype in zip(list(data.columns), list(data.dtypes)):
        if dtype not in supported_dtypes:
            continue

        series = data[name]
        values = series.to_numpy()

        # Integer and float columns need different KLL sketch classes.
        if series.dtype in integer_dtypes:
            quantile_sketch = kll_ints_sketch(2048)
        elif series.dtype == np.float64:
            quantile_sketch = kll_floats_sketch(2048)
        quantile_sketch.update(values)
        results = quantile_sketch.get_quantiles([0.05, 0.25, 0.5, 0.75, 0.95])
        labels = ["0.05", "Q1", "Median", "Q3", "0.95"]

        distinct_sketch = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
        # Passing a whole array relies on a local fork (np.array extension).
        distinct_sketch.update(values)
        estimate = distinct_sketch.get_estimate()
        results.append(round(estimate))
        labels.append("Distinct Count")

        summary[name] = [[label, value]
                         for label, value in zip(labels, results)]

    return summary
Exemple #2
0
def deserialize_kll_floats_sketch(x: bytes, kind: str = "float"):
    """
    Deserialize a KLL sketch.  Compatible with whylogs-java

    whylogs histograms are serialized as kll floats sketches

    Parameters
    ----------
    x : bytes
        Serialized sketch
    kind : str, optional
        Specify type of sketch: 'float' or 'int'

    Returns
    -------
    sketch : `kll_floats_sketch`, `kll_ints_sketch`, or None
        If `x` is empty or holds an empty sketch, return None, else return
        the deserialized sketch.

    Raises
    ------
    ValueError
        If `x` is non-empty and `kind` is neither 'float' nor 'int'.
    """
    # Empty payloads are treated as "no sketch" regardless of `kind`.
    if len(x) < 1:
        return None
    if kind == "float":
        h = datasketches.kll_floats_sketch.deserialize(x)
    elif kind == "int":
        # `deserialize` is the decoding entry point; calling the class itself
        # would try to construct a new sketch from the raw bytes.
        h = datasketches.kll_ints_sketch.deserialize(x)
    else:
        raise ValueError(
            f"Unknown sketch kind {kind!r}; expected 'float' or 'int'")
    if h.get_n() < 1:
        return None
    return h
Exemple #3
0
    def test_kll_ints_sketch(self):
        """Exercise the kll_ints_sketch API on the integers 0..9."""
        sketch_k = 100
        count = 10
        midpoint = round(count / 2)

        sketch = kll_ints_sketch(sketch_k)
        for value in range(count):
            sketch.update(value)

        # Basic bookkeeping: extremes, item count, and mode flags.
        self.assertEqual(sketch.get_min_value(), 0)
        self.assertEqual(sketch.get_max_value(), count - 1)
        self.assertEqual(sketch.get_n(), count)
        self.assertFalse(sketch.is_empty())
        self.assertFalse(sketch.is_estimation_mode())  # count < sketch_k

        # One split point yields two buckets for both PMF and CDF.
        mass = sketch.get_pmf([midpoint])
        self.assertIsNotNone(mass)
        self.assertEqual(len(mass), 2)

        cumulative = sketch.get_cdf([midpoint])
        self.assertIsNotNone(cumulative)
        self.assertEqual(len(cumulative), 2)

        self.assertEqual(sketch.get_quantile(0.5), midpoint)
        quartiles = sketch.get_quantiles([0.25, 0.5, 0.75])
        self.assertIsNotNone(quartiles)
        self.assertEqual(len(quartiles), 3)

        self.assertEqual(sketch.get_rank(midpoint), 0.5)

        # Merging the sketch into itself doubles the item count.
        sketch.merge(sketch)
        self.assertEqual(sketch.get_n(), 2 * count)

        # A serialized sketch deserializes back to the same class.
        payload = sketch.serialize()
        restored = kll_ints_sketch.deserialize(payload)
        self.assertTrue(isinstance(restored, kll_ints_sketch))
Exemple #4
0
def calculate_sketch_statistics_np(np_arr):
    """Compute approximate quantiles for each numeric array in ``np_arr``.

    ``np_arr`` is accessed through ``keys()`` and item lookup; each value
    must expose a ``dtype``. Entries whose dtype is ``np.int32``,
    ``np.int64`` or ``np.float64`` are fed into a KLL sketch (k=2048);
    all other entries are skipped.

    Returns a dict mapping key to a list of ``[label, value]`` pairs in the
    order "0.05", "Q1", "Median", "Q3", "0.95".
    """
    supported_dtypes = (np.int32, np.int64, np.float64)
    integer_dtypes = (np.int32, np.int64)

    summary = {}
    for name in np_arr.keys():
        values = np_arr[name]
        dtype = values.dtype
        if dtype not in supported_dtypes:
            continue

        # Integer and float arrays need different KLL sketch classes.
        if dtype in integer_dtypes:
            sketch = kll_ints_sketch(2048)
        elif dtype == np.float64:
            sketch = kll_floats_sketch(2048)
        sketch.update(values)

        results = sketch.get_quantiles([0.05, 0.25, 0.5, 0.75, 0.95])
        labels = ["0.05", "Q1", "Median", "Q3", "0.95"]
        summary[name] = [[label, value]
                         for label, value in zip(labels, results)]

    return summary
Exemple #5
0
    def metrics_from_states(
        self, properties_and_states: Dict[Property,
                                          State]) -> Dict[Property, Metric]:
        """Turn each (property, state) pair into a metric.

        Quantile, ApproxDistinctness and Schema properties are evaluated
        here from their stored state; every other property is delegated to
        the operator produced by ``SQLOperatorFactory.create_operator``.

        Returns a dict mapping each property to its metric.
        """
        metrics_by_property: Dict[Property, Metric] = {}
        for prop, state in properties_and_states.items():
            if isinstance(prop, Quantile):
                # The state carries a hex-encoded KLL sketch; rebuild it with
                # the sketch class that matches the recorded type.
                if state.sketch_type == "floats":
                    sketch_class = kll_floats_sketch
                else:
                    sketch_class = kll_ints_sketch
                decoder = sketch_class(DEFAULT_SKETCH_SIZE)
                sketch = decoder.deserialize(
                    bytes.fromhex(state.serializedKll))
                value = sketch.get_quantiles([prop.quantile])[0]
            elif isinstance(prop, ApproxDistinctness):
                # Ratio of estimated distinct values to rows, capped at 1.
                ratio = state.approx_distinct_count / state.num_rows
                value = min(ratio, 1.00)
            elif isinstance(prop, Schema):
                value = state.schema
            else:
                sql_operator = SQLOperatorFactory.create_operator(prop)
                metrics_by_property[prop] = sql_operator.get_metric(state)
                continue

            metrics_by_property[prop] = metric_from_value(
                value, prop.name, prop.instance, prop.entity)

        return metrics_by_property
Exemple #6
0
    def compute_metrics(self, properties: Set[Property],
                        repo: MetadataRepository):
        """Compute metrics for ``properties``, handling sketch metrics here.

        Quantile properties are answered with a KLL sketch and
        ApproxDistinctness properties with an HLL sketch, both built from
        ``self.data``; the resulting states are registered on ``repo``.
        All remaining properties are forwarded to
        ``self.engine.compute_metrics``.

        Returns the engine's metrics dict updated with the sketch metrics.

        Raises NotImplementedError when a quantile column's dtype is neither
        ``np.int64`` nor ``np.float64``.
        """
        # Pass 1: quantiles via KLL sketches.
        kll_metrics: Dict[Property, Metric] = {}
        for prop in properties:
            if not isinstance(prop, Quantile):
                continue
            series = self.data[prop.column]
            values = series.to_numpy()
            if series.dtype == np.int64:
                sketch, sketch_kind = kll_ints_sketch(
                    DEFAULT_SKETCH_SIZE), "ints"
            elif series.dtype == np.float64:
                sketch, sketch_kind = kll_floats_sketch(
                    DEFAULT_SKETCH_SIZE), "floats"
            else:
                raise NotImplementedError(
                    f"Data Type {series.dtype} is not supported for sketches!"
                )
            sketch.update(values)
            value = sketch.get_quantiles([prop.quantile])[0]
            # Stored hex-encoded; decode with bytes.fromhex().
            encoded_sketch = sketch.serialize().hex()
            repo.register_state(
                QuantileState(prop.property_identifier(), encoded_sketch,
                              value, sketch_kind))
            kll_metrics[prop] = metric_from_value(value, prop.name,
                                                  prop.instance, prop.entity)

        # Pass 2: approximate distinctness via HLL sketches.
        hll_metrics: Dict[Property, Metric] = {}
        for prop in properties:
            if not isinstance(prop, ApproxDistinctness):
                continue
            values = self.data[prop.column].to_numpy()
            sketch = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
            # Whole-array update relies on a local fork (np.array extension);
            # updating value by value is slow.
            sketch.update(values)
            estimate = sketch.get_estimate()
            row_count = len(values)
            # Stored hex-encoded; decode with bytes.fromhex().
            encoded_sketch = sketch.serialize_updatable().hex()
            repo.register_state(
                ApproxDistinctState(prop.property_identifier(),
                                    encoded_sketch, estimate, row_count))
            # NOTE(review): row_count is 0 for an empty column, so this
            # division would fail -- confirm callers never pass empty data.
            distinctness = min(estimate / row_count, 1.00)
            hll_metrics[prop] = metric_from_value(distinctness, prop.name,
                                                  prop.instance, prop.entity)

        # Pass 3: everything else goes to the underlying engine.
        remaining = [
            prop for prop in properties
            if not isinstance(prop, (Quantile, ApproxDistinctness))
        ]
        combined = self.engine.compute_metrics(remaining, repo)
        combined.update(kll_metrics)
        combined.update(hll_metrics)
        return combined
Exemple #7
0
    def __merge_states(self, states: Sequence[State]) -> State:
        """Combine partial states of one kind into a single state.

        The type of ``states[0]`` selects the merge rule and its ``id`` is
        reused for the result; all entries are expected to be of that same
        type. ``states`` must be non-empty, because ``states[0]`` is read
        unconditionally.

        Returns the merged state, or ``None`` when the type of the first
        state is not handled by any branch.

        Raises NotImplementedError for FrequenciesAndNumRows states.
        """
        first_state = states[0]
        result_state = None
        if isinstance(first_state, SchemaState):
            # Schema states are not combined; the first one is returned.
            result_state = first_state
        elif isinstance(first_state, MaxState):
            max_value: float = first_state.max_value
            for state in states:
                max_value = max(max_value, state.max_value)
            result_state = MaxState(first_state.id, max_value)
        elif isinstance(first_state, MeanState):
            total: float = 0
            count: int = 0
            for state in states:
                total = total + state.total
                count = count + state.count
            result_state = MeanState(first_state.id, total, count)
        elif isinstance(first_state, MinState):
            min_value: float = first_state.min_value
            for state in states:
                min_value = min(min_value, state.min_value)
            result_state = MinState(first_state.id, min_value)
        elif isinstance(first_state, NumMatches):
            num_matches: int = 0
            for state in states:
                num_matches = num_matches + state.num_matches
            result_state = NumMatches(first_state.id, num_matches)
        elif isinstance(first_state, NumMatchesAndCount):
            num_matches: int = 0
            count: int = 0
            for state in states:
                num_matches = num_matches + state.num_matches
                count = count + state.count
            result_state = NumMatchesAndCount(first_state.id, num_matches,
                                              count)
        elif isinstance(first_state, QuantileState):
            # Sketches are stored hex-encoded; decode with the class that
            # matches the recorded sketch type, then merge into the first.
            if first_state.sketch_type == "floats":
                kll_ser = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
            else:
                kll_ser = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
            main_kll = kll_ser.deserialize(
                bytes.fromhex(first_state.serializedKll))
            for state in states[1:]:
                new_kll = kll_ser.deserialize(
                    bytes.fromhex(state.serializedKll))
                main_kll.merge(new_kll)

            result_state = QuantileState(first_state.id,
                                         main_kll.serialize().hex(),
                                         first_state.quantile,
                                         first_state.sketch_type)
        elif isinstance(first_state, ApproxDistinctState):
            main_hll = hll_sketch.deserialize(
                bytes.fromhex(first_state.serializedHll))
            num_rows = first_state.num_rows
            for state in states[1:]:
                num_rows = num_rows + state.num_rows
                new_hll = hll_sketch.deserialize(
                    bytes.fromhex(state.serializedHll))
                # NOTE(review): this passes a sketch to update(); presumably
                # it depends on the same local fork mentioned elsewhere in
                # this file -- confirm, or switch to an hll_union.
                main_hll.update(new_hll)
            approx_distinct_count = main_hll.get_estimate()
            serialized_hll = main_hll.serialize_updatable().hex()
            result_state = ApproxDistinctState(first_state.id, serialized_hll,
                                               approx_distinct_count, num_rows)
        elif isinstance(first_state, StandardDeviationState):
            n: float = first_state.n
            avg: float = first_state.avg
            m2: float = first_state.m2
            stddev: float = first_state.stddev
            for state in states[1:]:
                merged_n = n + state.n
                if merged_n == 0:
                    # Both sides are empty: nothing to fold in, and the
                    # divisions below would fail.
                    continue
                # Pairwise combination of (n, mean, M2): delta must use the
                # mean *before* this update, and both weights must use the
                # counts *before* they are added together.
                delta = state.avg - avg
                m2 = m2 + state.m2 + delta * delta * n * state.n / merged_n
                avg = avg + delta * state.n / merged_n
                n = merged_n
                # NOTE(review): m2 / (n - 1) has the form of a sample
                # variance; confirm whether a square root is intended here.
                stddev = (m2 / (n - 1)) if n > 1 else 0
            result_state = StandardDeviationState(first_state.id, n, avg, m2,
                                                  stddev)
        elif isinstance(first_state, SumState):
            sum_value: float = 0
            for state in states:
                sum_value = sum_value + state.sum_value
            result_state = SumState(first_state.id, sum_value)
        elif isinstance(first_state, FrequenciesAndNumRows):
            raise NotImplementedError(
                "Merging of FrequenciesAndNumRows states not implemented, yet")

        return result_state