Beispiel #1
0
    def metrics_from_states(
        self, properties_and_states: Dict[Property,
                                          State]) -> Dict[Property, Metric]:
        """Reconstruct a metric for every property from its persisted state.

        Quantile, ApproxDistinctness and Schema states are handled inline
        (deserializing sketches / reading stored values); every other
        property is delegated to the SQL operator created for it.

        :param properties_and_states: mapping of each property to the state
            previously registered for it.
        :return: mapping of each property to its recomputed metric.
        """
        property_metric_map: Dict[Property, Metric] = {}
        for prop, state in properties_and_states.items():
            if isinstance(prop, Quantile):
                # Rebuild the KLL sketch from its hex-serialized form, then
                # query the requested quantile from it. The sketch flavor
                # (ints vs floats) was recorded when the state was created.
                if state.sketch_type == "floats":
                    empty_sketch = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
                else:
                    empty_sketch = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
                kll = empty_sketch.deserialize(
                    bytes.fromhex(state.serializedKll))
                quantile = kll.get_quantiles([prop.quantile])[0]
                property_metric_map[prop] = metric_from_value(
                    quantile, prop.name, prop.instance, prop.entity)
            elif isinstance(prop, ApproxDistinctness):
                # Guard against empty relations: with zero rows the ratio is
                # undefined, so report 0.0 instead of raising
                # ZeroDivisionError.
                if state.num_rows:
                    approx_distinctness = min(
                        state.approx_distinct_count / state.num_rows, 1.00)
                else:
                    approx_distinctness = 0.0
                property_metric_map[prop] = metric_from_value(
                    approx_distinctness, prop.name, prop.instance,
                    prop.entity)
            elif isinstance(prop, Schema):
                # Schema state simply stores the schema value itself.
                property_metric_map[prop] = metric_from_value(
                    state.schema, prop.name, prop.instance, prop.entity)
            else:
                # Any remaining property type is resolved via its operator.
                operator = SQLOperatorFactory.create_operator(prop)
                property_metric_map[prop] = operator.get_metric(state)

        return property_metric_map
Beispiel #2
0
    def compute_metrics(self, properties: Set[Property],
                        repo: MetadataRepository):
        """Compute metrics for *properties* over the in-memory data.

        Sketch-based properties (Quantile, ApproxDistinctness) are computed
        locally with KLL/HLL sketches and their states registered with
        *repo*; all remaining properties are delegated to ``self.engine``.

        :param properties: the set of properties to compute.
        :param repo: repository the sketch states are registered with.
        :return: mapping of each property to its metric.
        :raises NotImplementedError: for quantile columns whose dtype is
            neither int64 nor float64.
        """
        quantile_metrics: Dict[Property, Metric] = {}
        for prop in (p for p in properties if isinstance(p, Quantile)):
            column = self.data[prop.column]
            data_col = column.to_numpy()
            # Pick the sketch implementation matching the column dtype.
            if column.dtype == np.int64:
                kll = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
                sketch_type = "ints"
            elif column.dtype == np.float64:
                kll = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
                sketch_type = "floats"
            else:
                raise NotImplementedError(
                    f"Data Type {column.dtype} is not supported for sketches!"
                )
            kll.update(data_col)
            quantile = kll.get_quantiles([prop.quantile])[0]
            # Persist as hex so the state survives text-based storage.
            serialized_kll = kll.serialize().hex()
            repo.register_state(
                QuantileState(prop.property_identifier(), serialized_kll,
                              quantile, sketch_type))
            quantile_metrics[prop] = metric_from_value(
                prop_value := quantile, prop.name, prop.instance, prop.entity)

        approx_distinct_metrics: Dict[Property, Metric] = {}
        for prop in (p for p in properties
                     if isinstance(p, ApproxDistinctness)):
            data_col = self.data[prop.column].to_numpy()
            hll = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
            # Bulk update; requires the np.array extension of the sketch lib
            # (element-wise updates were too slow).
            hll.update(data_col)
            approx_distinct_count = hll.get_estimate()
            num_rows = len(data_col)
            serialized_hll = hll.serialize_updatable().hex()
            repo.register_state(
                ApproxDistinctState(prop.property_identifier(),
                                    serialized_hll, approx_distinct_count,
                                    num_rows))
            # Empty columns would divide by zero; define distinctness as 0.
            approx_distinctness = (min(approx_distinct_count / num_rows, 1.00)
                                   if num_rows else 0.0)
            approx_distinct_metrics[prop] = metric_from_value(
                approx_distinctness, prop.name, prop.instance, prop.entity)

        other_properties = [
            p for p in properties
            if not isinstance(p, (Quantile, ApproxDistinctness))
        ]
        metrics = self.engine.compute_metrics(other_properties, repo)
        metrics.update(quantile_metrics)
        metrics.update(approx_distinct_metrics)
        return metrics
Beispiel #3
0
 def get_metric(self, state: StandardDeviationState) -> DoubleMetric:
     """Wrap the state's standard deviation in a DoubleMetric."""
     prop = self.property
     return metric_from_value(state.stddev, prop.name, prop.instance,
                              prop.entity)
Beispiel #4
0
 def get_metric(self, state: NumMatchesAndCount) -> DoubleMetric:
     """Report the match ratio (num_matches / count) as a DoubleMetric."""
     ratio = state.num_matches / state.count
     prop = self.property
     return metric_from_value(ratio, prop.name, prop.instance, prop.entity)
Beispiel #5
0
 def get_metric(self, state: SumState) -> DoubleMetric:
     """Wrap the state's accumulated sum in a DoubleMetric."""
     prop = self.property
     return metric_from_value(state.sum_value, prop.name, prop.instance,
                              prop.entity)
Beispiel #6
0
 def get_metric(self, state: MeanState) -> DoubleMetric:
     """Report the mean (total / count) as a DoubleMetric."""
     mean = state.total / state.count
     prop = self.property
     return metric_from_value(mean, prop.name, prop.instance, prop.entity)
Beispiel #7
0
 def extract_metric(self, result: DataFrame, num_rows: int) -> DoubleMetric:
     """Ratio of unique values to distinct values from the scan result."""
     distinct_total = int(result[self.distinct_count][0])
     unique_total = int(result[self.unique_count][0])
     prop = self.property
     return metric_from_value(unique_total / distinct_total, prop.name,
                              prop.instance, prop.entity)
Beispiel #8
0
 def extract_metric(self, result: DataFrame, num_rows: int) -> DoubleMetric:
     """Share of rows whose value occurs exactly once."""
     unique_total = int(result[self.unique_count][0])
     prop = self.property
     return metric_from_value(unique_total / num_rows, prop.name,
                              prop.instance, prop.entity)
Beispiel #9
0
    def compute_metrics(self, properties: Set[Property], repo: MetadataRepository) -> Dict[Property, Metric]:
        """Compute all requested metrics with as few table scans as possible.

        Scan-shareable operators are folded into one SELECT (unless
        ``self.no_sharing``); grouping operators are bucketed by their
        grouping key and evaluated on a per-bucket frequency table. Schema
        needs no scan and is handled separately. Unsupported properties are
        reported as failure metrics instead of aborting the run.

        :param properties: properties to compute.
        :param repo: repository that states are registered with.
        :return: mapping of each property to its metric.
        :raises UnknownOperatorTypeException: if an operator is neither a
            scan nor a grouping operator.
        """
        schema_property: Schema = None
        scanning_operators: List[ScanShareableOperator] = []
        grouping_operator_groups: Dict[Grouping, List[GroupingShareableOperator]] = {}
        metrics: Dict[Property, Metric] = {}

        for property in properties:
            if isinstance(property, Schema):
                schema_property = property
                continue

            try:
                operator = SQLOperatorFactory.create_operator(property)
            except UnsupportedPropertyException as ex:
                # Record the failure as a metric instead of aborting.
                metrics[property] = metric_from_failure(ex, property)
                continue

            if isinstance(operator, ScanShareableOperator):
                scanning_operators.append(operator)
            elif isinstance(operator, GroupingShareableOperator):
                # Bucket operators that share the same grouping so their
                # aggregations can run on one frequency table.
                key = Grouping(operator.get_groupings(), operator.get_num_rows(), operator.filter)
                grouping_operator_groups.setdefault(key, []).append(operator)
            else:
                raise UnknownOperatorTypeException(f"Operator '{operator.__class__.__name__}' "
                                                   f"is neither Scan nor Grouping operator.")

        if schema_property is not None:
            schema = self.get_schema()
            schema_state = SchemaState(schema_property.property_identifier(), schema)
            repo.register_state(schema_state)
            metrics[schema_property] = metric_from_value(
                schema, schema_property.name, schema_property.instance,
                schema_property.entity)

        # Base row-count aggregations needed later by every grouping bucket.
        aggregations = set()
        for grouping in grouping_operator_groups:
            aggregations.add(grouping.get_num_rows_aggregation())

        scanning_result = None
        if self.no_sharing:
            for operator in scanning_operators:
                # BUGFIX: use a loop-local name here — the original clobbered
                # `aggregations`, losing the grouping row-count aggregations
                # collected above.
                operator_aggregations = operator.get_aggregations()
                if len(operator_aggregations) > 0:
                    scanning_result = self.execute_and_fetch(f"SELECT {', '.join(operator_aggregations)} FROM {self.table}")
                    state = operator.extract_state(scanning_result)
                    state = repo.register_state(state)
                    metrics[operator.get_property()] = operator.get_metric(state)
            # BUGFIX: the grouping buckets still need their base counts; the
            # original left `scanning_result` stale (or unbound) for the
            # grouping pass below when no_sharing was enabled.
            if len(aggregations) > 0:
                scanning_result = self.execute_and_fetch(f"SELECT {', '.join(aggregations)} FROM {self.table}")
        else:
            for operator in scanning_operators:
                aggregations = aggregations.union(operator.get_aggregations())

            if len(aggregations) > 0:
                scanning_result = self.execute_and_fetch(f"SELECT {', '.join(aggregations)} FROM {self.table}")

            for operator in scanning_operators:
                state = operator.extract_state(scanning_result)
                state = repo.register_state(state)
                metrics[operator.get_property()] = operator.get_metric(state)

        for grouping, grouping_operators in grouping_operator_groups.items():
            grouping_columns = list(grouping.grouping_cols)
            num_rows = grouping.extract_num_rows(scanning_result)
            select_cols = ', '.join(grouping_columns + ['COUNT(*) as count'])
            group_by = ', '.join(grouping_columns)
            if grouping.filter is None:
                query = f"SELECT {select_cols} FROM {self.table} GROUP BY {group_by}"
            else:
                query = f"SELECT {select_cols} FROM {self.table} WHERE {grouping.filter} GROUP BY {group_by}"

            frequencies_table = repo.get_frequency_table_name(grouping.identifier())
            if self.is_local_state_handler(repo):
                # Local handler can reference the stored table directly.
                self.execute_and_store(query, frequencies_table)
                state = FrequenciesAndNumRows(grouping.identifier(), frequencies_table, grouping_columns, num_rows)
            else:
                # Remote handler: materialize temporarily and ship the frame.
                self.execute_and_store(query, frequencies_table, temp=True)
                grouping_df = self.execute_and_fetch(f"SELECT * FROM {frequencies_table}")
                state = FrequenciesAndNumRows(grouping.identifier(), frequencies_table, grouping_columns, num_rows)
                state.set_df(grouping_df)

            # Loop-local name so the base `aggregations` above stays intact.
            group_aggregations = set()
            for operator in grouping_operators:
                group_aggregations = group_aggregations.union(operator.get_aggregations())

            grouping_result = self.execute_and_fetch(f"SELECT {', '.join(group_aggregations)} FROM {frequencies_table}")
            for operator in grouping_operators:
                metrics[operator.get_property()] = operator.extract_metric(grouping_result, num_rows)
                # Register one copy of the shared state per property id.
                state.id = operator.get_property().property_identifier()
                state = repo.register_state(state)

        return metrics