Example #1
 def convert_result_to_serializable(self, data, **kwargs):
     data = deepcopy(data)
     if isinstance(data, ExpectationValidationResult):
         data.result = convert_to_json_serializable(data.result)
     elif isinstance(data, dict):
         data["result"] = convert_to_json_serializable(data.get("result"))
     return data
 def to_dict(self) -> dict:
     """
     Returns: This DataAssistantResult as a JSON-serializable dictionary.
     """
     domain: Domain
     parameter_values_for_fully_qualified_parameter_names: Dict[str, ParameterNode]
     expectation_configuration: ExpectationConfiguration
     return {
         "profiler_config": self.profiler_config.to_json_dict(),
         "metrics_by_domain": [
             {
                 "domain_id": domain.id,
                 "domain": domain.to_json_dict(),
                 "parameter_values_for_fully_qualified_parameter_names": convert_to_json_serializable(
                     data=parameter_values_for_fully_qualified_parameter_names
                 ),
             }
             for domain, parameter_values_for_fully_qualified_parameter_names in self.metrics_by_domain.items()
         ],
         "expectation_configurations": [
             expectation_configuration.to_json_dict()
             for expectation_configuration in self.expectation_configurations
         ],
         "execution_time": convert_to_json_serializable(data=self.execution_time),
     }
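
Every example in this listing funnels into the same converter. For orientation, here is a minimal sketch of what a converter like convert_to_json_serializable has to do; this is an illustrative reduction, not Great Expectations' actual implementation, which handles many more types:

import datetime
from decimal import Decimal

import numpy as np

def to_json_serializable_sketch(data):
    # Recurse into containers, coerce common non-JSON-native scalars.
    if isinstance(data, dict):
        return {str(k): to_json_serializable_sketch(v) for k, v in data.items()}
    if isinstance(data, (list, tuple, set)):
        return [to_json_serializable_sketch(v) for v in data]
    if isinstance(data, np.integer):
        return int(data)
    if isinstance(data, (np.floating, Decimal)):
        return float(data)  # lossy for high-precision Decimal values
    if isinstance(data, (datetime.datetime, datetime.date)):
        return data.isoformat()
    return data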
Example #3
def test_resolve_config_using_acceptable_arguments(checkpoint):

    checkpoint_run_anonymizer = CheckpointRunAnonymizer(salt=DATA_CONTEXT_ID)

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    batch_request_param = {
        "runtime_parameters": {
            "batch_data": df
        },
        "batch_identifiers": {
            "default_identifier_name": "my_simple_df"
        },
    }

    result_format_param = {"result_format": "SUMMARY"}

    kwargs = {
        "batch_request": batch_request_param,
        "result_format": result_format_param,
    }

    # Matching how this is called in usage_statistics.py (parameter style)
    resolved_runtime_kwargs: dict = (
        checkpoint_run_anonymizer.resolve_config_using_acceptable_arguments(
            *(checkpoint, ), **kwargs))

    # Assertions about important bits of the substituted_runtime_config

    expected_top_level_batch_request = {
        "runtime_parameters": {
            "batch_data": df
        },
        "batch_identifiers": {
            "default_identifier_name": "my_simple_df"
        },
    }
    expected_top_level_batch_request = convert_to_json_serializable(
        data=expected_top_level_batch_request)
    actual_top_level_batch_request = convert_to_json_serializable(
        data=resolved_runtime_kwargs["batch_request"])
    assert actual_top_level_batch_request == expected_top_level_batch_request

    validation_level_batch_request = resolved_runtime_kwargs["validations"][0][
        "batch_request"]

    assert validation_level_batch_request == RuntimeBatchRequest(
        **{
            "datasource_name": "example_datasource",
            "data_connector_name": "default_runtime_data_connector_name",
            "data_asset_name": "my_data_asset",
            "batch_identifiers": {
                "default_identifier_name": "my_simple_df"
            },
            "runtime_parameters": {
                "batch_data": df
            },
        })
    assert (resolved_runtime_kwargs["validations"][0]["expectation_suite_name"]
            == "test_suite")
def test_lossy_serialization_warning(caplog):
    caplog.set_level(logging.WARNING, logger="great_expectations.core")

    d = (Decimal(7091.17117297555159893818)**Decimal(2) +
         Decimal(7118.70008070942458289210)**Decimal(2) +
         (Decimal(-1513.67274389594149397453))**Decimal(2))**Decimal(1.5)
    f_1 = (7091.17117297555159893818**2 + 7118.70008070942458289210**2 +
           (-1513.67274389594149397453)**2)**1.5
    f_2 = float(d)
    assert not (-1e-55 < Decimal.from_float(f_1) - d < 1e-55)
    assert not (-1e-55 < Decimal.from_float(f_2) - d < 1e-55)

    convert_to_json_serializable(d)
    assert len(caplog.messages) == 1
    assert caplog.messages[0].startswith("Using lossy conversion for decimal")

    caplog.clear()
    d = Decimal(0.1)
    f_1 = 0.1
    f_2 = float(d)

    assert -1e-55 < Decimal.from_float(f_1) - d < 1e-55
    assert -1e-55 < Decimal.from_float(f_2) - d < 1e-55
    convert_to_json_serializable(d)
    assert len(caplog.messages) == 0
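
The warning boundary above comes down to whether converting the Decimal to float loses information. A standalone illustration of the two constructions used in this test (plain Python, independent of Great Expectations):

from decimal import Decimal

# Decimal(0.1) snapshots the binary double exactly, so float() round-trips:
d = Decimal(0.1)
assert Decimal(float(d)) == d

# A Decimal carrying more digits than a double can hold must lose precision:
d = Decimal("12345.678901234567890123456789")
assert Decimal(float(d)) != d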
Example #5
    def to_json_dict(self) -> dict:
        """
        # TODO: <Alex>2/4/2022</Alex>
        This implementation of "SerializableDictDot.to_json_dict()" occurs frequently and should ideally serve as the
        reference implementation in the "SerializableDictDot" class itself.  However, the circular import dependencies,
        due to the location of the "great_expectations/types/__init__.py" and "great_expectations/core/util.py" modules
        make this refactoring infeasible at the present time.
        """

        # if batch_data appears in BatchRequest, temporarily replace it with
        # str placeholder before calling convert_to_json_serializable so that
        # batch_data is not serialized
        if batch_request_contains_batch_data(batch_request=self):
            batch_data: Union[BatchRequestBase, dict] = self.runtime_parameters["batch_data"]
            self.runtime_parameters["batch_data"]: str = str(type(batch_data))
            serializeable_dict: dict = convert_to_json_serializable(data=self.to_dict())
            # after getting the serializable dict, restore the original batch_data
            self.runtime_parameters["batch_data"]: Union[BatchRequestBase, dict] = batch_data
        else:
            serializeable_dict: dict = convert_to_json_serializable(data=self.to_dict())

        return serializeable_dict
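
A quick illustration of the placeholder behavior, assuming a RuntimeBatchRequest built with an in-memory pandas DataFrame (the datasource and connector names are borrowed from Example #3; this is an illustrative sketch, not a test from the codebase):

import pandas as pd
from great_expectations.core.batch import RuntimeBatchRequest

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
batch_request = RuntimeBatchRequest(
    datasource_name="example_datasource",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="my_data_asset",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "my_simple_df"},
)
json_dict = batch_request.to_json_dict()
# batch_data is stringified, not serialized:
assert json_dict["runtime_parameters"]["batch_data"] == str(type(df))
# ...and the original DataFrame is restored on the request afterwards:
assert batch_request.runtime_parameters["batch_data"] is df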
Example #6
    def to_json_dict(self) -> dict:
        details: dict = {}

        key: str
        value: Any
        for key, value in self["details"].items():
            if value:
                if key == INFERRED_SEMANTIC_TYPE_KEY:
                    column_name: str
                    semantic_type: Union[str, SemanticDomainTypes]
                    value = {
                        column_name: SemanticDomainTypes(semantic_type.lower()).value
                        if isinstance(semantic_type, str)
                        else semantic_type.value
                        for column_name, semantic_type in value.items()
                    }

            details[key] = convert_to_json_serializable(data=value)

        json_dict: dict = {
            "domain_type": self["domain_type"].value,
            "domain_kwargs": self["domain_kwargs"].to_json_dict(),
            "details": details,
            "rule_name": self["rule_name"],
        }
        json_dict = convert_to_json_serializable(data=json_dict)

        return deep_filter_properties_iterable(properties=json_dict, clean_falsy=True)
Example #7
 def prepare_dump(self, data, **kwargs):
     data = deepcopy(data)
     if isinstance(data, ExpectationSuiteValidationResult):
         data.meta = convert_to_json_serializable(data.meta)
     elif isinstance(data, dict):
         data["meta"] = convert_to_json_serializable(data.get("meta"))
     return data
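
Hooks like prepare_dump and convert_result_to_serializable are typically registered as marshmallow @pre_dump methods on the corresponding schema, so the conversion runs automatically inside every dump() call. A minimal sketch of that wiring, with an illustrative schema rather than the real one:

from copy import deepcopy

from great_expectations.core.util import convert_to_json_serializable
from marshmallow import Schema, fields, pre_dump

class ResultSchema(Schema):  # illustrative stand-in, not the actual GE schema
    meta = fields.Dict()

    @pre_dump
    def prepare_dump(self, data, **kwargs):
        # coerce non-JSON-native values in meta before marshmallow dumps them
        data = deepcopy(data)
        data["meta"] = convert_to_json_serializable(data.get("meta"))
        return data

# ResultSchema().dump({"meta": {...}}) now serializes meta safely.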
Example #8
def test_serialization_of_spark_df(spark_session):
    df = pd.DataFrame({"a": [1, 2, 3]})
    sdf = spark_session.createDataFrame(df)
    assert convert_to_json_serializable(sdf) == {"a": [1, 2, 3]}

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    sdf = spark_session.createDataFrame(df)
    assert convert_to_json_serializable(sdf) == {"a": [1, 2, 3], "b": [4, 5, 6]}
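
The expected value here is exactly pandas' column-oriented dict form; whether the converter reaches it via toPandas() is not visible from this test, but the shape can be checked directly:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
assert df.to_dict("list") == {"a": [1, 2, 3], "b": [4, 5, 6]}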
Example #9
 def to_json_dict(self):
     myself = deepcopy(self)
     # NOTE - JPC - 20191031: migrate to expectation-specific schemas that subclass result with properly-typed
     # schemas to get serialization all-the-way down via dump
     myself["evaluation_parameters"] = convert_to_json_serializable(
         myself["evaluation_parameters"])
     myself["statistics"] = convert_to_json_serializable(
         myself["statistics"])
     myself["meta"] = convert_to_json_serializable(myself["meta"])
     myself = expectationSuiteValidationResultSchema.dump(myself)
     return myself
Example #10
 def to_json_dict(self):
     myself = expectationValidationResultSchema.dump(self)
     # NOTE - JPC - 20191031: migrate to expectation-specific schemas that subclass result with properly-typed
     # schemas to get serialization all-the-way down via dump
     if "result" in myself:
         myself["result"] = convert_to_json_serializable(myself["result"])
     if "meta" in myself:
         myself["meta"] = convert_to_json_serializable(myself["meta"])
     if "exception_info" in myself:
         myself["exception_info"] = convert_to_json_serializable(
             myself["exception_info"])
     return myself
 def to_json_dict(self):
     myself = expectationSuiteSchema.dump(self)
     # NOTE - JPC - 20191031: migrate to expectation-specific schemas that subclass result with properly-typed
     # schemas to get serialization all-the-way down via dump
     myself["expectations"] = convert_to_json_serializable(
         myself["expectations"])
     try:
         myself["evaluation_parameters"] = convert_to_json_serializable(
             myself["evaluation_parameters"])
     except KeyError:
         pass  # Allow evaluation parameters to be missing if empty
     myself["meta"] = convert_to_json_serializable(myself["meta"])
     return myself
def test_lossy_serialization_warning(caplog):
    caplog.set_level(logging.WARNING, logger="great_expectations.core")

    d = Decimal("12345.678901234567890123456789")

    convert_to_json_serializable(d)
    assert len(caplog.messages) == 1
    assert caplog.messages[0].startswith(
        "Using lossy conversion for decimal 12345.678901234567890123456789")

    caplog.clear()
    d = Decimal("0.1")
    convert_to_json_serializable(d)
    print(caplog.messages)
    assert len(caplog.messages) == 0
Example #13
 def to_json_dict(self) -> dict:
     fields_dict: dict = {
         "exception_traceback": self.exception_traceback,
         "exception_message": self.exception_message,
         "raised_exception": self.raised_exception,
     }
     return convert_to_json_serializable(fields_dict)
Example #14
 def __eq__(self, other):
     """ExpectationConfiguration equality does include meta, but ignores instance identity."""
     if not isinstance(other, self.__class__):
         # Delegate comparison to the other instance's __eq__.
         return NotImplemented
     this_kwargs: dict = convert_to_json_serializable(self.kwargs)
     other_kwargs: dict = convert_to_json_serializable(other.kwargs)
     this_meta: dict = convert_to_json_serializable(self.meta)
     other_meta: dict = convert_to_json_serializable(other.meta)
     return all(
         (
             self.expectation_type == other.expectation_type,
             this_kwargs == other_kwargs,
             this_meta == other_meta,
         )
     )
 def to_json_dict(self) -> dict:
     result = convert_to_json_serializable(data=asdict(self))
     result["execution_engines_list"] = sorted([
         engine for engine, _bool in result["execution_engines"].items()
         if _bool is True
     ])
     return result
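
Since this method starts from dataclasses.asdict(self), the receiver is a dataclass whose execution_engines field maps engine names to booleans. A hypothetical minimal shape that would satisfy it (the class name and field default are assumptions):

from dataclasses import dataclass, field
from typing import Dict

@dataclass
class EngineDiagnostics:  # hypothetical stand-in for the real dataclass
    execution_engines: Dict[str, bool] = field(default_factory=dict)

diag = EngineDiagnostics(
    execution_engines={
        "PandasExecutionEngine": True,
        "SqlAlchemyExecutionEngine": False,
    }
)
# to_json_dict() would then report:
#   result["execution_engines_list"] == ["PandasExecutionEngine"]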
Example #16
 def to_json_dict(self) -> dict:
     """
     :returns a JSON-serialiable dict containing the project configuration
     """
     commented_map: CommentedMap = self.get_schema_validated_updated_commented_map(
     )
     return convert_to_json_serializable(data=commented_map)
Example #17
 def to_json_dict(self) -> dict:
     return convert_to_json_serializable({
         "name": self.name,
         "max_proportion_unique": self.max_unique_values,
         "metric_name_defining_limit": self.metric_name_defining_limit,
     })
Example #18
 def to_json_dict(self) -> dict:
     return convert_to_json_serializable(
         {
             "datasource_name": self.datasource_name,
             "data_connector_name": self.data_connector_name,
             "data_asset_name": self.data_asset_name,
             "batch_identifiers": self.batch_identifiers,
         }
     )
 def to_dict(self) -> dict:
     """
     Returns: This RuleBasedProfilerResult as a JSON-serializable dictionary.
     """
     domain: Domain
     fully_qualified_parameter_names: List[str]
     parameter_values_for_fully_qualified_parameter_names: Dict[str, ParameterNode]
     expectation_configuration: ExpectationConfiguration
     return {
         "fully_qualified_parameter_names_by_domain": [
             {
                 "domain_id": domain.id,
                 "domain": domain.to_json_dict(),
                 "fully_qualified_parameter_names": convert_to_json_serializable(
                     data=fully_qualified_parameter_names
                 ),
             }
             for domain, fully_qualified_parameter_names in self.fully_qualified_parameter_names_by_domain.items()
         ],
         "parameter_values_for_fully_qualified_parameter_names_by_domain": [
             {
                 "domain_id": domain.id,
                 "domain": domain.to_json_dict(),
                 "parameter_values_for_fully_qualified_parameter_names": convert_to_json_serializable(
                     data=parameter_values_for_fully_qualified_parameter_names
                 ),
             }
             for domain, parameter_values_for_fully_qualified_parameter_names in self.parameter_values_for_fully_qualified_parameter_names_by_domain.items()
         ],
         "expectation_configurations": [
             expectation_configuration.to_json_dict()
             for expectation_configuration in self.expectation_configurations
         ],
         "citation": self.citation,
         "execution_time": self.execution_time,
     }
Example #20
 def to_json_dict(self) -> dict:
     """
     # TODO: <Alex>2/4/2022</Alex>
     This implementation of "SerializableDictDot.to_json_dict()" occurs frequently and should ideally serve as the
     reference implementation in the "SerializableDictDot" class itself.  However, the circular import dependencies,
     due to the location of the "great_expectations/types/__init__.py" and "great_expectations/core/util.py" modules
     make this refactoring infeasible at the present time.
     """
     dict_obj: dict = self.to_dict()
     serializeable_dict: dict = convert_to_json_serializable(data=dict_obj)
     return serializeable_dict
Example #21
 def to_json_dict(self) -> dict:
     """
     # TODO: <Alex>2/4/2022</Alex>
     This implementation of "SerializableDictDot.to_json_dict()" occurs frequently and should ideally serve as the
     reference implementation in the "SerializableDictDot" class itself.  However, the circular import dependencies,
     due to the location of the "great_expectations/types/__init__.py" and "great_expectations/core/util.py" modules
     make this refactoring infeasible at the present time.
     """
     serializeable_dict: dict = {
         "run_id": self.run_id.to_json_dict(),
         "run_results": convert_to_json_serializable(
             data=recursively_convert_to_json_serializable(test_obj=self.run_results)
         ),
         "checkpoint_config": self.checkpoint_config.to_json_dict(),
         "success": convert_to_json_serializable(data=self.success),
     }
     serializeable_dict = recursively_convert_to_json_serializable(
         test_obj=serializeable_dict
     )
     return serializeable_dict
Example #22
def get_column_unique_count_patch(self, column):
    if self.engine.dialect.name.lower() == "redshift":
        element_values = self.engine.execute(
            sa.select(
                [sa.text(f"APPROXIMATE count(distinct {column})")]  # type:ignore
            ).select_from(self._table)
        )
        return convert_to_json_serializable(element_values.fetchone()[0])
    elif self.engine.dialect.name.lower() == "bigquery":
        element_values = self.engine.execute(
            sa.select(
                [sa.text(f"APPROX_COUNT_DISTINCT ({column})")]  # type:ignore
            ).select_from(self._table)
        )
        return convert_to_json_serializable(element_values.fetchone()[0])
    elif self.engine.dialect.name.lower() == "snowflake":
        element_values = self.engine.execute(
            sa.select(
                [sa.text(f"APPROX_COUNT_DISTINCT({column})")]  # type:ignore
            ).select_from(self._table)
        )
        return convert_to_json_serializable(element_values.fetchone()[0])
    return convert_to_json_serializable(
        self.engine.execute(
            sa.select(
                [sa.func.count(sa.func.distinct(sa.column(column)))]
            ).select_from(self._table)
        ).scalar()
    )
def test_pandas_unexpected_rows_basic_result_format(
        dataframe_for_unexpected_rows):
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "b",
            "mostly": 0.9,
            "value_set": ["cat", "fish", "dog", "giraffe"],
            "result_format": {
                "result_format": "BASIC",
                "include_unexpected_rows": True,
            },
        },
    )

    expectation = ExpectColumnValuesToBeInSet(expectationConfiguration)
    batch: Batch = Batch(data=dataframe_for_unexpected_rows)
    engine = PandasExecutionEngine()
    validator = Validator(
        execution_engine=engine,
        batches=[
            batch,
        ],
    )
    result = expectation.validate(validator)

    assert convert_to_json_serializable(result.result) == {
        "element_count": 6,
        "unexpected_count": 2,
        "unexpected_percent": 33.33333333333333,
        "partial_unexpected_list": ["lion", "zebra"],
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_percent_total": 33.33333333333333,
        "unexpected_percent_nonmissing": 33.33333333333333,
        "unexpected_rows": [{
            "a": 5,
            "b": "lion"
        }, {
            "a": 10,
            "b": "zebra"
        }],
    }
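
For reference, a fixture consistent with these assertions would be a six-row frame whose last two rows hold the unexpected values. This is a reconstruction from the expected result, not the actual fixture; the first four a values are arbitrary:

import pandas as pd

dataframe_for_unexpected_rows = pd.DataFrame({
    "a": [1, 2, 3, 4, 5, 10],
    "b": ["cat", "fish", "dog", "giraffe", "lion", "zebra"],
})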
Example #24
    def resolve_metric_bundle(
        self,
        metric_fn_bundle: Iterable[Tuple[MetricConfiguration, Any, dict, dict]],
    ) -> Dict[Tuple[str, str, str], Any]:
        """For every metric in a set of Metrics to resolve, obtains necessary metric keyword arguments and builds
        bundles of the metrics into one large query dictionary so that they are all executed simultaneously. Will fail
        if bundling the metrics together is not possible.

            Args:
                metric_fn_bundle (Iterable[Tuple[MetricConfiguration, Callable, dict]): \
                    A Dictionary containing a MetricProvider's MetricConfiguration (its unique identifier), its metric provider function
                    (the function that actually executes the metric), and the arguments to pass to the metric provider function.
                    A dictionary of metrics defined in the registry and corresponding arguments

            Returns:
                A dictionary of metric names and their corresponding now-queried values.
        """
        resolved_metrics = {}

        # We need a different query for each domain (where clause).
        queries: Dict[Tuple, dict] = {}
        for (
            metric_to_resolve,
            engine_fn,
            compute_domain_kwargs,
            accessor_domain_kwargs,
            metric_provider_kwargs,
        ) in metric_fn_bundle:
            if not isinstance(compute_domain_kwargs, IDDict):
                compute_domain_kwargs = IDDict(compute_domain_kwargs)
            domain_id = compute_domain_kwargs.to_id()
            if domain_id not in queries:
                queries[domain_id] = {
                    "select": [],
                    "ids": [],
                    "domain_kwargs": compute_domain_kwargs,
                }
            if self.engine.dialect.name == "clickhouse":
                queries[domain_id]["select"].append(
                    engine_fn.label(
                        metric_to_resolve.metric_name.join(
                            random.choices(string.ascii_lowercase, k=2)
                        )
                    )
                )
            else:
                queries[domain_id]["select"].append(
                    engine_fn.label(metric_to_resolve.metric_name)
                )
            queries[domain_id]["ids"].append(metric_to_resolve.id)
        for query in queries.values():
            domain_kwargs = query["domain_kwargs"]
            selectable = self.get_domain_records(
                domain_kwargs=domain_kwargs,
            )
            assert len(query["select"]) == len(query["ids"])
            try:
                """
                If a custom query is passed, selectable will be TextClause and not formatted
                as a subquery wrapped in "(subquery) alias". TextClause must first be converted
                to TextualSelect using sa.columns() before it can be converted to type Subquery
                """
                if TextClause and isinstance(selectable, TextClause):
                    res = self.engine.execute(
                        sa.select(query["select"]).select_from(
                            selectable.columns().subquery()
                        )
                    ).fetchall()
                else:
                    res = self.engine.execute(
                        sa.select(query["select"]).select_from(selectable)
                    ).fetchall()
                logger.debug(
                    f"SqlAlchemyExecutionEngine computed {len(res[0])} metrics on domain_id {IDDict(domain_kwargs).to_id()}"
                )
            except OperationalError as oe:
                exception_message: str = "An SQL execution Exception occurred.  "
                exception_traceback: str = traceback.format_exc()
                exception_message += f'{type(oe).__name__}: "{str(oe)}".  Traceback: "{exception_traceback}".'
                logger.error(exception_message)
                raise ExecutionEngineError(message=exception_message)
            assert (
                len(res) == 1
            ), "all bundle-computed metrics must be single-value statistics"
            assert len(query["ids"]) == len(
                res[0]
            ), "unexpected number of metrics returned"
            for idx, id in enumerate(query["ids"]):
                resolved_metrics[id] = convert_to_json_serializable(res[0][idx])

        return resolved_metrics
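
The bundling reduces to one SELECT with several labeled aggregate columns per domain, fetched as a single row. A self-contained sketch of that idea in SQLAlchemy 1.x (matching the engine.execute style above; the table and metrics are hypothetical):

import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
engine.execute(sa.text("CREATE TABLE events (a INTEGER, b INTEGER)"))
engine.execute(sa.text("INSERT INTO events VALUES (1, 4), (2, 5), (3, 6)"))

# One query carries every bundled metric for the domain:
query = sa.select([
    sa.func.count().label("row_count"),
    sa.func.max(sa.column("a")).label("a_max"),
    sa.func.min(sa.column("b")).label("b_min"),
]).select_from(sa.table("events"))
row = engine.execute(query).fetchone()
assert tuple(row) == (3, 3, 4)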
Example #25
 def prepare_dump(self, data, **kwargs):
     data = deepcopy(data)
     data.meta = convert_to_json_serializable(data.meta)
     return data
Example #26
    def resolve_metric_bundle(
        self, metric_fn_bundle: Iterable[Tuple[MetricConfiguration, Any, dict, dict]],
    ) -> dict:
        """For every metrics in a set of Metrics to resolve, obtains necessary metric keyword arguments and builds a
        bundles the metrics into one large query dictionary so that they are all executed simultaneously. Will fail if
        bundling the metrics together is not possible.

            Args:
                metric_fn_bundle (Iterable[Tuple[MetricConfiguration, Callable, dict]): \
                    A Dictionary containing a MetricProvider's MetricConfiguration (its unique identifier), its metric provider function
                    (the function that actually executes the metric), and the arguments to pass to the metric provider function.
                metrics (Dict[Tuple, Any]): \
                    A dictionary of metrics defined in the registry and corresponding arguments

            Returns:
                A dictionary of metric names and their corresponding now-queried values.
        """
        resolved_metrics = dict()

        # We need a different query for each domain (where clause).
        queries: Dict[Tuple, dict] = dict()
        for (
            metric_to_resolve,
            engine_fn,
            compute_domain_kwargs,
            accessor_domain_kwargs,
            metric_provider_kwargs,
        ) in metric_fn_bundle:
            if not isinstance(compute_domain_kwargs, IDDict):
                compute_domain_kwargs = IDDict(compute_domain_kwargs)
            domain_id = compute_domain_kwargs.to_id()
            if domain_id not in queries:
                queries[domain_id] = {
                    "select": [],
                    "ids": [],
                    "domain_kwargs": compute_domain_kwargs,
                }
            queries[domain_id]["select"].append(
                engine_fn.label(metric_to_resolve.metric_name)
            )
            queries[domain_id]["ids"].append(metric_to_resolve.id)
        for query in queries.values():
            selectable, compute_domain_kwargs, _ = self.get_compute_domain(
                query["domain_kwargs"], domain_type="identity"
            )
            assert len(query["select"]) == len(query["ids"])
            res = self.engine.execute(
                sa.select(query["select"]).select_from(selectable)
            ).fetchall()
            logger.debug(
                f"SqlAlchemyExecutionEngine computed {len(res[0])} metrics on domain_id {IDDict(compute_domain_kwargs).to_id()}"
            )
            assert (
                len(res) == 1
            ), "all bundle-computed metrics must be single-value statistics"
            assert len(query["ids"]) == len(
                res[0]
            ), "unexpected number of metrics returned"
            for idx, id in enumerate(query["ids"]):
                resolved_metrics[id] = convert_to_json_serializable(res[0][idx])

        # Convert metrics to be serializable
        return resolved_metrics
Example #27
 def to_json_dict(self):
     myself = expectationConfigurationSchema.dump(self)
     # NOTE - JPC - 20191031: migrate to expectation-specific schemas that subclass result with properly-typed
     # schemas to get serialization all-the-way down via dump
     myself["kwargs"] = convert_to_json_serializable(myself["kwargs"])
     return myself
Example #28
 def to_json_dict(self) -> dict:
     return convert_to_json_serializable(data=dict(self))
Example #29
 def to_json_dict(self) -> dict:
     """
     Returns JSON dictionary equivalent of this object.
     """
     return convert_to_json_serializable(data=self.to_dict())
Example #30
 def convert_result_to_serializable(self, data, **kwargs):
     data = deepcopy(data)
     data.result = convert_to_json_serializable(data.result)
     return data