Example #1
def test_parse_validation_graph_with_bad_metrics_args():
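    """Passing bogus metrics args to _parse_validation_graph should not break parsing:
    the graph still splits into 2 ready and 9 needed metrics."""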
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    validator = Validator(execution_engine=engine)
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than")
        validation_dependencies = expectation_impl(
            configuration).get_validation_dependencies(
                configuration,
                execution_engine=engine,
            )

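        # add each metric this expectation depends on to the validation graph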
        for metric_configuration in validation_dependencies["metrics"].values():
            validator.build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    ready_metrics, needed_metrics = validator._parse_validation_graph(
        validation_graph=graph, metrics=("nonexistent", "NONE"))
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9
Example #2
    def get_runtime_kwargs(self, runtime_configuration=None):
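        """Resolve this expectation's runtime kwargs: look up the runtime keys and
        defaults (from the registered implementation or the legacy kwarg_lookup_dict),
        apply any runtime_configuration overrides, normalize result_format, and merge
        in the success kwargs."""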
        expectation_kwargs_dict = self.kwarg_lookup_dict.get(
            self.expectation_type, None
        )
        if expectation_kwargs_dict is None:
            impl = get_expectation_impl(self.expectation_type)
            if impl is not None:
                runtime_keys = impl.runtime_keys
                default_kwarg_values = impl.default_kwarg_values
            else:
                expectation_kwargs_dict = self._get_default_custom_kwargs()
                default_kwarg_values = expectation_kwargs_dict.get(
                    "default_kwarg_values", dict()
                )
                runtime_keys = self.runtime_kwargs
        else:
            default_kwarg_values = expectation_kwargs_dict.get(
                "default_kwarg_values", dict()
            )
            runtime_keys = self.runtime_kwargs

        success_kwargs = self.get_success_kwargs()
        lookup_kwargs = deepcopy(self.kwargs)
        if runtime_configuration:
            lookup_kwargs.update(runtime_configuration)
        runtime_kwargs = {
            key: lookup_kwargs.get(key, default_kwarg_values.get(key))
            for key in runtime_keys
        }
        runtime_kwargs["result_format"] = parse_result_format(
            runtime_kwargs["result_format"]
        )
        runtime_kwargs.update(success_kwargs)
        return runtime_kwargs
Example #3
def test_parse_validation_graph():
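    """The dependency graph for expect_column_value_z_scores_to_be_less_than parses
    into 2 ready and 9 needed metrics when no metrics have been resolved yet."""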
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(
        expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than")
        validation_dependencies = expectation_impl(
            configuration).get_validation_dependencies(configuration, engine)

        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    ready_metrics, needed_metrics = Validator(engine)._parse_validation_graph(
        validation_graph=graph, metrics=dict())
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9
Example #4
    def get_success_kwargs(self):
        """Return the kwargs that determine whether this expectation succeeds: the
        configured success keys (from the registered implementation or the legacy
        kwarg_lookup_dict), filled in with defaults and merged with the domain kwargs."""
        expectation_kwargs_dict = self.kwarg_lookup_dict.get(
            self.expectation_type, None
        )
        if expectation_kwargs_dict is None:
            impl = get_expectation_impl(self.expectation_type)
            if impl is not None:
                success_keys = impl.success_keys
                default_kwarg_values = impl.default_kwarg_values
            else:
                expectation_kwargs_dict = self._get_default_custom_kwargs()
                default_kwarg_values = expectation_kwargs_dict.get(
                    "default_kwarg_values", dict()
                )
                success_keys = expectation_kwargs_dict["success_kwargs"]
        else:
            default_kwarg_values = expectation_kwargs_dict.get(
                "default_kwarg_values", dict()
            )
            success_keys = expectation_kwargs_dict["success_kwargs"]
        domain_kwargs = self.get_domain_kwargs()
        success_kwargs = {
            key: self.kwargs.get(key, default_kwarg_values.get(key))
            for key in success_keys
        }
        success_kwargs.update(domain_kwargs)
        return success_kwargs
Example #5
    def get_domain_kwargs(self):
        """Return the kwargs that identify the domain this expectation applies to,
        resolved from the registered implementation or the legacy kwarg_lookup_dict
        and filled in with defaults; raise InvalidExpectationKwargsError if any
        domain kwargs are missing."""
        expectation_kwargs_dict = self.kwarg_lookup_dict.get(
            self.expectation_type, None
        )
        if expectation_kwargs_dict is None:
            impl = get_expectation_impl(self.expectation_type)
            if impl is not None:
                domain_keys = impl.domain_keys
                default_kwarg_values = impl.default_kwarg_values
            else:
                expectation_kwargs_dict = self._get_default_custom_kwargs()
                default_kwarg_values = expectation_kwargs_dict.get(
                    "default_kwarg_values", dict()
                )
                domain_keys = expectation_kwargs_dict["domain_kwargs"]
        else:
            default_kwarg_values = expectation_kwargs_dict.get(
                "default_kwarg_values", dict()
            )
            domain_keys = expectation_kwargs_dict["domain_kwargs"]
        domain_kwargs = {
            key: self.kwargs.get(key, default_kwarg_values.get(key))
            for key in domain_keys
        }
        missing_kwargs = set(domain_keys) - set(domain_kwargs.keys())
        if missing_kwargs:
            raise InvalidExpectationKwargsError(
                f"Missing domain kwargs: {list(missing_kwargs)}"
            )
        return domain_kwargs
Example #6
def test_populate_dependencies():
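    """Building the metric dependency graph for
    expect_column_value_z_scores_to_be_less_than should produce 10 edges."""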
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(
        expectationConfiguration)
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectationConfiguration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than")
        validation_dependencies = expectation_impl(
            configuration).get_validation_dependencies(
                configuration,
                engine,
            )

        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph,
                metric_configuration,
                configuration,
                execution_engine=engine)
    assert len(graph.edges) == 10
Example #7
def test_populate_dependencies_with_incorrect_metric_name():
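    """Requesting a dependency graph for a metric name that is not registered should
    raise MetricProviderError."""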
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=MetricConfiguration(
                    "column_values.not_a_metric", IDDict()
                ),
                configuration=configuration,
            )
        except ge_exceptions.MetricProviderError as e:
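            # capture the expected error so the assertion below can verify it was raised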
            graph = e

    assert isinstance(graph, ge_exceptions.MetricProviderError)
Example #8
def test_resolve_validation_graph_with_bad_config_catch_exceptions_true(
    basic_datasource,
):
    """With catch_exceptions=True, a metric computed against a missing column should
    not raise: the failure is retried up to MAX_METRIC_COMPUTATION_RETRIES times and
    reported in aborted_metrics_info with its exception message."""
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        RuntimeBatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "runtime_parameters": {
                    "batch_data": df,
                },
                "batch_identifiers": {
                    "pipeline_stage_name": 0,
                    "airflow_run_id": 0,
                    "custom_key_0": 0,
                },
            }))

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={
            "column": "not_in_table",
            "min_value": 1,
            "max_value": 29
        },
    )

    runtime_configuration = {
        "catch_exceptions": True,
        "result_format": {
            "result_format": "BASIC"
        },
    }

    execution_engine = PandasExecutionEngine()

    validator = Validator(execution_engine=execution_engine, batches=[batch])

    expectation_impl = get_expectation_impl(
        expectation_configuration.expectation_type
    )
    validation_dependencies = expectation_impl().get_validation_dependencies(
        expectation_configuration, execution_engine, runtime_configuration
    )["metrics"]

    graph = ValidationGraph()

    for metric_configuration in validation_dependencies.values():
        validator.build_metric_dependency_graph(
            graph=graph,
            execution_engine=execution_engine,
            metric_configuration=metric_configuration,
            configuration=expectation_configuration,
            runtime_configuration=runtime_configuration,
        )

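    # with catch_exceptions=True, failed metric computations are collected in
    # aborted_metrics_info instead of being raised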
    metrics: Dict[Tuple[str, str, str], Any] = {}
    aborted_metrics_info: Dict[
        Tuple[str, str, str],
        Dict[str, Union[MetricConfiguration, Set[ExceptionInfo], int]],
    ] = validator.resolve_validation_graph(
        graph=graph,
        metrics=metrics,
        runtime_configuration=runtime_configuration,
    )

    assert len(aborted_metrics_info) == 1

    aborted_metric_info_item = list(aborted_metrics_info.values())[0]
    assert aborted_metric_info_item["num_failures"] == MAX_METRIC_COMPUTATION_RETRIES

    assert len(aborted_metric_info_item["exception_info"]) == 1

    exception_info = next(iter(aborted_metric_info_item["exception_info"]))
    assert (exception_info["exception_message"] ==
            'Error: The column "not_in_table" in BatchData does not exist.')
Example #9
    def _check_linting(expectation_instance) -> ExpectationDiagnosticCheckMessage:
        """Check if linting checks pass for Expectation"""
        sub_messages: List[dict] = []
        message: str = "Passes all linting checks"
        passed: bool = False
        black_ok: bool = False
        isort_ok: bool = False
        file_and_class_names_ok: bool = False
        rx_expectation_instance_repr = re.compile(
            r"<.*\.([^\.]*) object at .*")

        try:
            expectation_camel_name = rx_expectation_instance_repr.match(
                repr(expectation_instance)).group(1)
        except AttributeError:
            sub_messages.append({
                "message":
                "Arg passed to _check_linting was not an instance of an Expectation, so cannot check linting",
                "passed": False,
            })
            return ExpectationDiagnosticCheckMessage(
                message=message,
                passed=passed,
                sub_messages=sub_messages,
            )

        impl = get_expectation_impl(camel_to_snake(expectation_camel_name))
        try:
            source_file_path = inspect.getfile(impl)
        except TypeError:
            sub_messages.append({
                "message":
                "inspect.getfile(impl) raised a TypeError (impl is a built-in class)",
                "passed": False,
            })
            return ExpectationDiagnosticCheckMessage(
                message=message,
                passed=passed,
                sub_messages=sub_messages,
            )

        snaked_impl_name = camel_to_snake(impl.__name__)
        source_file_base_no_ext = os.path.basename(source_file_path).rsplit(
            ".", 1)[0]
        with open(source_file_path) as fp:
            code = fp.read()

        if snaked_impl_name != source_file_base_no_ext:
            sub_messages.append({
                "message":
                f"The snake_case of {impl.__name__} ({snaked_impl_name}) does not match filename part ({source_file_base_no_ext})",
                "passed": False,
            })
        else:
            file_and_class_names_ok = True

        if black is None:
            sub_messages.append({
                "message": "Could not find 'black', so cannot check linting",
                "passed": False,
            })

        if isort is None:
            sub_messages.append({
                "message": "Could not find 'isort', so cannot check linting",
                "passed": False,
            })

        if black and isort:
            blacked_code = lint_code(code)
            if code != blacked_code:
                sub_messages.append({
                    "message": "Your code would be reformatted with black",
                    "passed": False,
                })
            else:
                black_ok = True
            isort_ok = isort.check_code(
                code,
                **isort.profiles.black,
                ignore_whitespace=True,
                known_local_folder=["great_expectations"],
            )
            if not isort_ok:
                sub_messages.append({
                    "message": "Your code would be reformatted with isort",
                    "passed": False,
                })

        passed = black_ok and isort_ok and file_and_class_names_ok
        return ExpectationDiagnosticCheckMessage(
            message=message,
            passed=passed,
            sub_messages=sub_messages,
        )
Example #10
    def _get_expectation_impl(self):
        return get_expectation_impl(self.expectation_type)
Example #11
def test_registry_basics():
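    """get_expectation_impl returns the Expectation class registered under the given
    snake_case name."""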
    expectation = get_expectation_impl("expect_column_values_to_be_in_set")
    assert expectation == ExpectColumnValuesToBeInSet