Example 1
def exp1():
    return ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "a",
            "value_set": [1, 2, 3],
            "result_format": "BASIC"
        },
        meta={"notes": "This is an expectation."},
    )
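For reference, a configuration like this is normally registered on an ExpectationSuite. A minimal usage sketch, assuming the suite API shown in the later examples (import paths can vary between Great Expectations versions):

from great_expectations.core.expectation_suite import ExpectationSuite

# Sketch: build a suite and add the configuration returned by exp1() above.
suite = ExpectationSuite(expectation_suite_name="example_suite")
suite.add_expectation(exp1())
assert len(suite.expectations) == 1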
Example 2
    def _create_string_length_expectation(
        self, key: str, details: dict
    ) -> Optional[ExpectationConfiguration]:
        """https://json-schema.org/understanding-json-schema/reference/string.html#length"""
        object_types = self._get_object_types(details=details)

        if JsonSchemaTypes.STRING.value not in object_types:
            return None

        type_ = details.get("type", None)
        any_of = details.get("anyOf", None)

        if not type_ and not any_of:
            return None

        # Default to None so the bounds check below is safe even if anyOf
        # contains no string entry.
        minimum = None
        maximum = None
        if type_:
            minimum = details.get("minLength", None)
            maximum = details.get("maxLength", None)
        elif any_of:
            for item in any_of:
                item_type = item.get("type", None)
                if item_type == JsonSchemaTypes.STRING.value:
                    minimum = item.get("minLength", None)
                    maximum = item.get("maxLength", None)
                    break

        if minimum is None and maximum is None:
            return None

        kwargs = {
            "column": key,
        }
        if minimum == maximum:
            kwargs["value"] = minimum
            return ExpectationConfiguration(
                "expect_column_value_lengths_to_equal", kwargs)
        if minimum is not None:
            kwargs["min_value"] = minimum
        if maximum is not None:
            kwargs["max_value"] = maximum

        return ExpectationConfiguration(
            "expect_column_value_lengths_to_be_between", kwargs)
Example 3
def column_pair_expectation():
    return ExpectationConfiguration(
        expectation_type="expect_column_pair_values_to_be_in_set",
        kwargs={
            "column_A": "1",
            "column_B": "b",
            "value_set": [(1, 1), (2, 2)],
            "result_format": "BASIC",
        },
    )
Example 4
def test_autoinspect_columns_exist(test_backend):
    df = get_dataset(test_backend, {"a": [1, 2, 3]},
                     profiler=ge.profile.ColumnsExistProfiler)
    suite = df.get_expectation_suite()

    assert len(suite.expectations) == 1
    assert suite.expectations == [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "a"})
    ]
Example 5
    @staticmethod
    def _check_input_validation(
        expectation_instance,
        examples: List[ExpectationTestDataCases],
    ) -> ExpectationDiagnosticCheckMessage:
        """Check that the validate_configuration exists and doesn't raise a config error"""
        passed = False
        sub_messages = []
        rx = re.compile(r"^[\s]+assert", re.MULTILINE)
        try:
            first_test = examples[0]["tests"][0]
        except IndexError:
            sub_messages.append({
                "message":
                "No example found to get kwargs for ExpectationConfiguration",
                "passed": passed,
            })
        else:
            if "validate_configuration" not in expectation_instance.__class__.__dict__:
                sub_messages.append({
                    "message":
                    "No validate_configuration method defined on subclass",
                    "passed": passed,
                })
            else:
                expectation_config = ExpectationConfiguration(
                    expectation_type=expectation_instance.expectation_type,
                    kwargs=first_test.input,
                )
                validate_configuration_source = inspect.getsource(
                    expectation_instance.__class__.validate_configuration)
                if rx.search(validate_configuration_source):
                    sub_messages.append({
                        "message":
                        "Custom 'assert' statements in validate_configuration",
                        "passed": True,
                    })
                else:
                    sub_messages.append({
                        "message":
                        "Using default validate_configuration from template",
                        "passed": False,
                    })
                try:
                    expectation_instance.validate_configuration(
                        expectation_config)
                except InvalidExpectationConfigurationError:
                    pass
                else:
                    passed = True

        return ExpectationDiagnosticCheckMessage(
            message="Has basic input validation and type checking",
            passed=passed,
            sub_messages=sub_messages,
        )
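For context, this check passes only when the Expectation subclass defines its own validate_configuration containing assert statements and the method accepts the example-derived configuration without raising InvalidExpectationConfigurationError. A minimal sketch of such an override on a hypothetical custom expectation (the class name is illustrative, not taken from the source above):

from great_expectations.exceptions import InvalidExpectationConfigurationError
from great_expectations.expectations.expectation import ColumnMapExpectation


class ExpectColumnValuesToBePositive(ColumnMapExpectation):
    """Hypothetical custom expectation, shown only to illustrate the pattern."""

    def validate_configuration(self, configuration=None) -> None:
        super().validate_configuration(configuration)
        configuration = configuration or self.configuration
        try:
            # Custom assertions are what the regex in _check_input_validation detects.
            assert "column" in configuration.kwargs, "'column' is a required kwarg"
        except AssertionError as e:
            raise InvalidExpectationConfigurationError(str(e))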
Example 6
def test_graph_validate(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "batch_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "b",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    result = Validator(execution_engine=PandasExecutionEngine(),
                       batches=[batch]).graph_validate(
                           configurations=[expectation_configuration])
    assert result == [
        ExpectationValidationResult(
            success=True,
            expectation_config=None,
            meta={},
            result={
                "element_count": 6,
                "unexpected_count": 0,
                "unexpected_percent": 0.0,
                "partial_unexpected_list": [],
                "missing_count": 1,
                "missing_percent": 16.666666666666664,
                "unexpected_percent_nonmissing": 0.0,
            },
            exception_info=None,
        )
    ]
Example 7
    def _create_null_or_not_null_column_expectation(
        self, key: str, details: dict
    ) -> Optional[ExpectationConfiguration]:
        """https://json-schema.org/understanding-json-schema/reference/null.html"""
        object_types = self._get_object_types(details=details)
        enum_list = self._get_enum_list(details=details)
        kwargs = {"column": key}

        if enum_list:
            object_types = set(enum_list).union(set(object_types))

        if JsonSchemaTypes.NULL.value not in object_types:
            return ExpectationConfiguration(
                "expect_column_values_to_not_be_null", kwargs
            )

        if len(object_types) == 1:
            return ExpectationConfiguration("expect_column_values_to_be_null", kwargs)

        return None
Example 8
def test__find_evr_by_type(titanic_profiled_evrs_1):
    # TODO: _find_all_evrs_by_type should accept a ValidationResultSuite, not ValidationResultSuite.results
    found_evr = Renderer()._find_evr_by_type(titanic_profiled_evrs_1.results,
                                             "expect_column_to_exist")
    print(found_evr)
    assert found_evr is None

    # TODO: _find_all_evrs_by_type should accept a ValidationResultSuite, not ValidationResultSuite.results
    found_evr = Renderer()._find_evr_by_type(
        titanic_profiled_evrs_1.results,
        "expect_column_distinct_values_to_be_in_set")
    print(found_evr)
    assert found_evr == ExpectationValidationResult(
        success=True,
        result={
            "observed_value": ["*", "1st", "2nd", "3rd"],
            "element_count": 1313,
            "missing_count": 0,
            "missing_percent": 0.0,
            "details": {
                "value_counts": [
                    {
                        "value": "*",
                        "count": 1
                    },
                    {
                        "value": "1st",
                        "count": 322
                    },
                    {
                        "value": "2nd",
                        "count": 279
                    },
                    {
                        "value": "3rd",
                        "count": 711
                    },
                ]
            },
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_distinct_values_to_be_in_set",
            kwargs={
                "column": "PClass",
                "value_set": None,
                "result_format": "SUMMARY"
            },
        ),
    )
Example 9
def test_expectation_configuration_get_evaluation_parameter_dependencies():
    # Getting evaluation parameter dependencies relies on pyparsing, but the expectation
    # configuration is responsible for ensuring that it only returns one copy of required metrics.

    # If different expectations rely on the same upstream dependency, then it is possible for duplicates
    # to be present nonetheless.
    ec = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "norm",
            "min_value": {
                "$PARAMETER":
                "(-3 * urn:great_expectations:validations:profile:expect_column_stdev_to_be_between"
                ".result.observed_value:column=norm) + "
                "urn:great_expectations:validations:profile:expect_column_mean_to_be_between.result.observed_value"
                ":column=norm"
            },
            "max_value": {
                "$PARAMETER":
                "(3 * urn:great_expectations:validations:profile:expect_column_stdev_to_be_between"
                ".result.observed_value:column=norm) + "
                "urn:great_expectations:validations:profile:expect_column_mean_to_be_between.result.observed_value"
                ":column=norm"
            },
        },
    )

    dependencies = ec.get_evaluation_parameter_dependencies()
    dependencies["profile"][0]["metric_kwargs_id"]["column=norm"] = set(
        dependencies["profile"][0]["metric_kwargs_id"]["column=norm"])

    assert {
        "profile": [{
            "metric_kwargs_id": {
                "column=norm": {
                    "expect_column_stdev_to_be_between.result.observed_value",
                    "expect_column_mean_to_be_between.result.observed_value",
                }
            }
        }]
    } == dependencies
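As a point of comparison, a configuration with a single upstream dependency uses the same $PARAMETER URN syntax; the suite name "profile" and column "norm" below mirror the test above, and the sketch is illustrative rather than taken from the source:

from great_expectations.core.expectation_configuration import ExpectationConfiguration

ec_single = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs={
        "column": "norm",
        "max_value": {
            "$PARAMETER": (
                "urn:great_expectations:validations:profile:"
                "expect_column_mean_to_be_between.result.observed_value:column=norm"
            )
        },
    },
)
# Expected to report a single "profile" dependency on the mean metric.
dependencies_single = ec_single.get_evaluation_parameter_dependencies()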
Example 10
    def _create_boolean_expectation(
        self, key: str, details: dict
    ) -> Optional[ExpectationConfiguration]:
        """https://json-schema.org/understanding-json-schema/reference/boolean.html"""
        object_types = self._get_object_types(details=details)

        if JsonSchemaTypes.BOOLEAN.value not in object_types:
            return None

        # TODO map JSONSchema types to which type backend? Pandas? Should this value set be parameterized per back end?
        kwargs = {"column": key, "value_set": [True, False]}
        return ExpectationConfiguration("expect_column_values_to_be_in_set", kwargs)
Example 11
def test_expectation_string_renderer_styling():
    renderer = ExpectationStringRenderer()
    result = renderer.render(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_unique",
            kwargs={"column": "Name"},
        ))
    assert len(result) == 1
    assert result[0].string_template[
        "template"] == "$column values must be unique."

    result = renderer.render(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_unique",
            kwargs={
                "column": "Name",
                "mostly": 0.3
            },
        ))
    assert len(result) == 1
    template = result[0].string_template
    assert (
        template["template"] ==
        "$column values must be unique, at least $mostly_pct % of the time.")
    assert template["params"]["mostly_pct"] == "30"

    result = renderer.render(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_unique",
            kwargs={
                "column": "Name",
                "mostly": 0.32345
            },
        ))
    assert len(result) == 1
    template = result[0].string_template
    assert (
        template["template"] ==
        "$column values must be unique, at least $mostly_pct % of the time.")
    assert template["params"]["mostly_pct"] == "32.345"
Example 12
    def _create_existence_expectation(
        self, key: str, details: dict
    ) -> ExpectationConfiguration:
        kwargs = {"column": key}
        description = details.get("description", None)
        meta = None
        if description:
            meta = {
                "notes": {
                    "format": "markdown",
                    "content": [f"### Description:\n{description}"],
                }
            }
        return ExpectationConfiguration("expect_column_to_exist", kwargs, meta=meta)
Example 13
    def _create_set_expectation(
        self, key: str, details: dict
    ) -> Optional[ExpectationConfiguration]:
        """https://json-schema.org/understanding-json-schema/reference/generic.html#enumerated-values"""
        enum_list = self._get_enum_list(details=details)

        if not enum_list:
            return None

        # Use equality, not identity, when filtering out the JSON Schema "null" sentinel.
        enum_list = list(
            filter(lambda item: item != JsonSchemaTypes.NULL.value, enum_list)
        )

        kwargs = {"column": key, "value_set": enum_list}
        return ExpectationConfiguration("expect_column_values_to_be_in_set", kwargs)
Example 14
def test_atomic_diagnostic_observed_value_without_result(
        snapshot, get_diagnostic_rendered_content):
    # Please note that the vast majority of Expectations are calling `Expectation._atomic_diagnostic_observed_value()`
    # As such, the specific expectation_type used here is irrelevant and is simply used to trigger the parent class.
    expectation_config = {
        "expectation_type": "expect_table_row_count_to_equal",
        "kwargs": {},
    }
    update_dict = {
        "expectation_config": ExpectationConfiguration(**expectation_config),
    }
    rendered_content = get_diagnostic_rendered_content(update_dict)

    res = rendered_content.to_json_dict()
    pprint(res)
    snapshot.assert_match(res)
Example 15
def test_expect_column_value_z_scores_to_be_less_than_impl():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(
        expectationConfiguration)
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(success=True)
Example 16
def test_replace_expectation_finds_too_many_matches(ge_cloud_suite, ge_cloud_id):
    new_expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={"column": "b", "value_set": [4, 5, 6], "result_format": "BASIC"},
        meta={"notes": "This is a new expectation."},
    )
    with pytest.raises(ValueError) as err:
        ge_cloud_suite.replace_expectation(
            new_expectation_configuration=new_expectation_configuration,
            existing_expectation_configuration=None,
            ge_cloud_id=ge_cloud_id,
        )
    assert (
        str(err.value)
        == "More than one matching expectation was found. Please be more specific with your search criteria"
    )
Example 17
def test_add_expectation_with_ge_cloud_id(
    mock_emit,
    single_expectation_suite_with_expectation_ge_cloud_id,
):
    """
    This test ensures that an expectation does not lose its ge_cloud_id attribute when updated
    """
    expectation_ge_cloud_id = (
        single_expectation_suite_with_expectation_ge_cloud_id.expectations[
            0
        ].ge_cloud_id
    )
    # updated expectation does not have ge_cloud_id
    updated_expectation = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "a",
            "value_set": [11, 22, 33, 44, 55],
            "result_format": "BASIC",
        },
        meta={"notes": "This is an expectation."},
    )
    single_expectation_suite_with_expectation_ge_cloud_id.add_expectation(
        updated_expectation, overwrite_existing=True
    )
    assert (
        single_expectation_suite_with_expectation_ge_cloud_id.expectations[
            0
        ].ge_cloud_id
        == expectation_ge_cloud_id
    )
    # make sure expectation config was actually updated
    assert single_expectation_suite_with_expectation_ge_cloud_id.expectations[0].kwargs[
        "value_set"
    ] == [11, 22, 33, 44, 55]

    # ensure usage statistics are being emitted correctly
    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [
        mock.call(
            {
                "event": "expectation_suite.add_expectation",
                "event_payload": {},
                "success": True,
            }
        )
    ]
Example 18
    def _get_prescriptive_rendered_content(
        update_dict: Dict[str, Union[str, dict]],
    ) -> RenderedAtomicContent:
        # Overwrite any fields passed in from test and instantiate ExpectationConfiguration
        expectation_configuration_kwargs.update(update_dict)
        config = ExpectationConfiguration(**expectation_configuration_kwargs)
        expectation_type = expectation_configuration_kwargs["expectation_type"]

        # Programmatically determine the renderer implementations
        renderer_impl = get_renderer_impl(
            object_name=expectation_type,
            renderer_type="atomic.prescriptive.summary",
        )[1]

        # Determine RenderedAtomicContent output
        source_obj = {"configuration": config}
        res = renderer_impl(**source_obj)
        return res
Example 19
def test_atomic_diagnostic_observed_value_expect_column_kl_divergence_to_be_less_than(
    snapshot, get_diagnostic_rendered_content
):
    # Please note that the vast majority of Expectations are calling `Expectation._atomic_diagnostic_observed_value()`
    # As such, the specific expectation_type used here is irrelevant and is simply used to trigger the parent class.
    expectation_config = {
        "expectation_type": "expect_column_kl_divergence_to_be_less_than",
        "kwargs": {
            "column": "min_event_time",
            "partition_object": {
                "bins": [0, 5, 10, 30, 50],
                "weights": [0.2, 0.3, 0.1, 0.4],
            },
            "threshold": 0.1,
        },
        "meta": {},
        "ge_cloud_id": "4b53c4d5-90ba-467a-b7a7-379640bbd729",
    }
    update_dict = {
        "expectation_config": ExpectationConfiguration(**expectation_config),
        "result": {
            "observed_value": 0.0,
            "details": {
                "observed_partition": {
                    "values": [1, 2, 4],
                    "weights": [0.3754, 0.615, 0.0096],
                },
                "expected_partition": {
                    "values": [1, 2, 4],
                    "weights": [0.3754, 0.615, 0.0096],
                },
            },
        },
    }
    rendered_content = get_diagnostic_rendered_content(update_dict)

    res = rendered_content.to_json_dict()
    pprint(res)

    # replace version of vega-lite in res to match snapshot test
    res["value"]["graph"]["$schema"] = re.sub(
        r"v\d*\.\d*\.\d*", "v4.8.1", res["value"]["graph"]["$schema"]
    )
    snapshot.assert_match(res)
Example 20
    @classmethod
    def _get_column_type_with_caching(cls, dataset, column_name, cache):
        column_cache_entry = cache.get(column_name)
        if not column_cache_entry:
            column_cache_entry = {}
            cache[column_name] = column_cache_entry
        column_type = column_cache_entry.get("type")
        if not column_type:
            column_type = cls._get_column_type(dataset, column_name)
            column_cache_entry["type"] = column_type
            # remove the expectation
            # Does this change with different config format?
            dataset.remove_expectation(
                ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_in_type_list",
                    kwargs={"column": column_name},
                ))
            dataset.set_config_value("interactive_evaluation", True)

        return column_type
Example 21
def test_atomic_diagnostic_observed_value_expect_column_quantile_values_to_be_between(
        snapshot, get_diagnostic_rendered_content):
    # Please note that the vast majority of Expectations are calling `Expectation._atomic_diagnostic_observed_value()`
    # As such, the specific expectation_type used here is irrelevant and is simply used to trigger the parent class.
    expectation_config = {
        "expectation_type": "expect_column_quantile_values_to_be_between",
        "kwargs": {
            "column": "Unnamed: 0",
            "quantile_ranges": {
                "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
                "value_ranges": [
                    [66, 68],
                    [328, 330],
                    [656, 658],
                    [984, 986],
                    [1246, 1248],
                ],
            },
            "allow_relative_error": False,
        },
        "meta": {},
        "ge_cloud_id": "cd6b4f19-8167-4984-b495-54bffcb070da",
    }
    update_dict = {
        "expectation_config": ExpectationConfiguration(**expectation_config),
        "result": {
            "observed_value": {
                "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
                "values": [67, 329, 657, 985, 1247],
            },
            "element_count": 1313,
            "missing_count": None,
            "missing_percent": None,
            "details": {
                "success_details": [True, True, True, True, True]
            },
        },
    }
    rendered_content = get_diagnostic_rendered_content(update_dict)

    res = rendered_content.to_json_dict()
    pprint(res)
    snapshot.assert_match(res)
Example 22
def test_graph_validate_with_bad_config_catch_exceptions_false(
        basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        RuntimeBatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "runtime_parameters": {
                    "batch_data": df,
                },
                "batch_identifiers": {
                    "pipeline_stage_name": 0,
                    "airflow_run_id": 0,
                    "custom_key_0": 0,
                },
            }))

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={
            "column": "not_in_table",
            "min_value": 1,
            "max_value": 29
        },
    )
    with pytest.raises(ge_exceptions.MetricResolutionError) as eee:
        # noinspection PyUnusedLocal
        result = Validator(execution_engine=PandasExecutionEngine(),
                           batches=[batch]).graph_validate(
                               configurations=[expectation_configuration],
                               runtime_configuration={
                                   "catch_exceptions": False,
                                   "result_format": {
                                       "result_format": "BASIC"
                                   },
                               },
                           )
    assert (str(eee.value) ==
            'Error: The column "not_in_table" in BatchData does not exist.')
Example 23
def test_graph_validate_with_exception(basic_datasource):
    def mock_error(*args, **kwargs):
        raise Exception("Mock Error")

    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        RuntimeBatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "runtime_parameters": {
                    "batch_data": df,
                },
                "batch_identifiers": {
                    "pipeline_stage_name": 0,
                    "airflow_run_id": 0,
                    "custom_key_0": 0,
                },
            }))

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "b",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )

    validator = Validator(execution_engine=PandasExecutionEngine(),
                          batches=[batch])
    validator.build_metric_dependency_graph = mock_error

    result = validator.graph_validate(
        configurations=[expectation_configuration])

    assert len(result) == 1
    assert result[0].expectation_config is not None
Example 24
def test_populate_dependencies_with_incorrect_metric_name():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=MetricConfiguration(
                    "column_values.not_a_metric", IDDict()
                ),
                configuration=configuration,
            )
        except ge_exceptions.MetricProviderError as e:
            graph = e

    assert isinstance(graph, ge_exceptions.MetricProviderError)
Example 25
def test_sa_expect_column_value_z_scores_to_be_less_than_impl(
        postgresql_engine):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    df.to_sql("z_score_test_data", postgresql_engine, if_exists="replace")
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(
        expectationConfiguration)
    batch_data = SqlAlchemyBatchData(engine=postgresql_engine,
                                     table_name="z_score_test_data")
    engine = SqlAlchemyExecutionEngine(engine=postgresql_engine,
                                       batch_data_dict={"my_id": batch_data})
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(success=True)
Example 26
def test_graph_validate_with_bad_config(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "partition_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={
            "column": "not_in_table",
            "min_value": 1,
            "max_value": 29
        },
    )
    try:
        result = Validator(execution_engine=PandasExecutionEngine(),
                           batches=[batch]).graph_validate(
                               configurations=[expectation_configuration])
    except KeyError as e:
        result = e
    assert isinstance(result, KeyError)
Example 27
def test_catch_exceptions_with_bad_expectation_type():
    # We want to catch degenerate cases where an expectation suite is incompatible with the data asset being validated
    my_df = PandasDataset({"x": range(10)})
    my_df._expectation_suite.append_expectation(
        ExpectationConfiguration(expectation_type="foobar", kwargs={}))
    result = my_df.validate(catch_exceptions=True)

    # Find the foobar result
    idx = 0
    for idx, val_result in enumerate(result.results):
        if val_result.expectation_config.expectation_type == "foobar":
            break

    assert result.results[idx].success is False
    assert result.results[idx].expectation_config.expectation_type == "foobar"
    assert result.results[idx].expectation_config.kwargs == {}
    assert result.results[idx].exception_info["raised_exception"] is True
    assert ("AttributeError: 'PandasDataset' object has no attribute 'foobar'"
            in result.results[idx].exception_info["exception_traceback"])

    with pytest.raises(AttributeError):
        result = my_df.validate(catch_exceptions=False)
Example 28
    def __init__(
        self,
        expectation_suite_name,
        data_context=None,
        expectations=None,
        evaluation_parameters=None,
        data_asset_type=None,
        execution_engine_type=None,
        meta=None,
        ge_cloud_id=None,
    ) -> None:
        self.expectation_suite_name = expectation_suite_name
        self.ge_cloud_id = ge_cloud_id
        self._data_context = data_context

        if expectations is None:
            expectations = []
        self.expectations = [
            ExpectationConfiguration(**expectation)
            if isinstance(expectation, dict)
            else expectation
            for expectation in expectations
        ]
        if evaluation_parameters is None:
            evaluation_parameters = {}
        self.evaluation_parameters = evaluation_parameters
        self.data_asset_type = data_asset_type
        self.execution_engine_type = execution_engine_type
        if meta is None:
            meta = {"great_expectations_version": ge_version}
        if (
            "great_expectations.__version__" not in meta.keys()
            and "great_expectations_version" not in meta.keys()
        ):
            meta["great_expectations_version"] = ge_version
        # We require meta information to be serializable, but do not convert until necessary
        ensure_json_serializable(meta)
        self.meta = meta
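As the constructor above shows, expectations can be supplied either as plain dicts or as ExpectationConfiguration instances; a brief sketch (the suite and column names are illustrative, and import paths can vary between Great Expectations versions):

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.core.expectation_suite import ExpectationSuite

suite = ExpectationSuite(
    expectation_suite_name="mixed_inputs",
    expectations=[
        {"expectation_type": "expect_column_to_exist", "kwargs": {"column": "a"}},
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={"column": "a"},
        ),
    ],
)
# Both entries are normalized to ExpectationConfiguration objects in suite.expectations.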
Example 29
    def _create_type_expectation(
        self, key: str, details: dict
    ) -> Optional[ExpectationConfiguration]:
        object_types = self._get_object_types(details=details)
        object_types = list(
            filter(
                lambda object_type: object_type not in [JsonSchemaTypes.NULL.value],
                object_types,
            )
        )

        if len(object_types) == 0:
            return None

        type_list = []

        for type_ in object_types:
            type_list.extend(self.PROFILER_TYPE_LIST_BY_JSON_SCHEMA_TYPE[type_])

        kwargs = {"column": key, "type_list": type_list}
        return ExpectationConfiguration(
            "expect_column_values_to_be_in_type_list", kwargs
        )
Example 30
def multi_batch_taxi_validator_ge_cloud_mode(
    yellow_trip_pandas_data_context,
) -> Validator:
    context: DataContext = yellow_trip_pandas_data_context
    context._ge_cloud_mode = True

    suite: ExpectationSuite = ExpectationSuite(
        expectation_suite_name="validating_taxi_data",
        expectations=[
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_between",
                kwargs={
                    "column": "passenger_count",
                    "min_value": 0,
                    "max_value": 99,
                    "result_format": "BASIC",
                },
                meta={"notes": "This is an expectation."},
                ge_cloud_id=UUID("0faf94a9-f53a-41fb-8e94-32f218d4a774"),
            )
        ],
        data_context=context,
        meta={"notes": "This is an expectation suite."},
    )

    multi_batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_pandas",
        data_connector_name="monthly",
        data_asset_name="my_reports",
        data_connector_query={"batch_filter_parameters": {"year": "2019"}},
    )

    validator_multi_batch: Validator = context.get_validator(
        batch_request=multi_batch_request, expectation_suite=suite)

    return validator_multi_batch
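Given a Validator like the one returned above, downstream code would exercise it the same way as the earlier graph_validate examples; a hedged sketch, where the variable name validator stands in for the fixture's return value and the configuration mirrors the one in the suite:

from great_expectations.core.expectation_configuration import ExpectationConfiguration

# Sketch: run a single configuration through the Validator's graph validation.
results = validator.graph_validate(
    configurations=[
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={"column": "passenger_count", "min_value": 0, "max_value": 99},
        )
    ]
)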