Example #1
    def __init__(self, expectation_suite_identifier, run_id, batch_identifier):
        """Constructs a ValidationResultIdentifier

        Args:
            expectation_suite_identifier (ExpectationSuiteIdentifier, list, tuple, or dict):
                identifying information for the fully qualified expectation suite used to validate
            run_id (RunIdentifier): The run_id for which validation occurred
            batch_identifier: The identifier of the batch that was validated
        """
        super().__init__()
        self._expectation_suite_identifier = expectation_suite_identifier
        if isinstance(run_id, str):
            warnings.warn(
                "String run_ids will be deprecated in the future. Please provide a run_id of type "
                "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
                "and run_time (both optional).",
                DeprecationWarning,
            )
            try:
                run_time = parse(run_id)
            except (ValueError, TypeError):
                run_time = None
            run_id = RunIdentifier(run_name=run_id, run_time=run_time)
        elif isinstance(run_id, dict):
            run_id = RunIdentifier(**run_id)
        elif run_id is None:
            run_id = RunIdentifier()
        elif not isinstance(run_id, RunIdentifier):
            run_id = RunIdentifier(run_name=str(run_id))

        self._run_id = run_id
        self._batch_identifier = batch_identifier
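
A minimal usage sketch (not part of the original example; the suite name, run name, and batch id are illustrative) showing the run_id forms this constructor accepts:

# Illustrative only: run_id may be a RunIdentifier, a dict, None, or a (deprecated) string.
vri = ValidationResultIdentifier(
    expectation_suite_identifier=ExpectationSuiteIdentifier("asset.default"),
    run_id=RunIdentifier(run_name="nightly"),
    batch_identifier="1234",
)
vri_from_dict = ValidationResultIdentifier(
    expectation_suite_identifier=ExpectationSuiteIdentifier("asset.default"),
    run_id={"run_name": "nightly", "run_time": None},  # expanded via RunIdentifier(**run_id)
    batch_identifier="1234",
)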
Example #2
 def __init__(
     self,
     run_id,
     data_asset_name,
     expectation_suite_identifier,
     metric_name,
     metric_kwargs,
     metric_value,
 ):
     super().__init__(metric_name, metric_kwargs, metric_value)
     if not isinstance(expectation_suite_identifier,
                       ExpectationSuiteIdentifier):
         expectation_suite_identifier = ExpectationSuiteIdentifier(
             expectation_suite_name=expectation_suite_identifier)
     if isinstance(run_id, str):
         warnings.warn(
             "String run_ids will be deprecated in the future. Please provide a run_id of type "
             "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
             "and run_time (both optional).",
             DeprecationWarning,
         )
         try:
             run_time = parse(run_id)
         except (ValueError, TypeError):
             run_time = None
         run_id = RunIdentifier(run_name=run_id, run_time=run_time)
     elif isinstance(run_id, dict):
         run_id = RunIdentifier(**run_id)
     elif run_id is None:
         run_id = RunIdentifier()
     elif not isinstance(run_id, RunIdentifier):
         run_id = RunIdentifier(run_name=str(run_id))
     self._run_id = run_id
     self._data_asset_name = data_asset_name
     self._expectation_suite_identifier = expectation_suite_identifier
Example #3
    def profile(
        cls,
        data_asset,
        run_id=None,
        profiler_configuration=None,
        run_name=None,
        run_time=None,
    ):
        assert not (run_id and run_name) and not (
            run_id and run_time
        ), "Please provide either a run_id or run_name and/or run_time."
        if isinstance(run_id, str) and not run_name:
            warnings.warn(
                "String run_ids will be deprecated in the future. Please provide a run_id of type "
                "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
                "and run_time (both optional). Instead of providing a run_id, you may also provide"
                "run_name and run_time separately.",
                DeprecationWarning,
            )
            try:
                run_time = parse(run_id)
            except (ValueError, TypeError):
                pass
            run_id = RunIdentifier(run_name=run_id, run_time=run_time)
        elif isinstance(run_id, dict):
            run_id = RunIdentifier(**run_id)
        elif not isinstance(run_id, RunIdentifier):
            run_name = run_name or "profiling"
            run_id = RunIdentifier(run_name=run_name, run_time=run_time)

        if not cls.validate(data_asset):
            raise GreatExpectationsError(
                "Invalid data_asset for profiler; aborting")

        expectation_suite = cls._profile(data_asset,
                                         configuration=profiler_configuration)

        batch_kwargs = data_asset.batch_kwargs
        expectation_suite = cls.add_meta(expectation_suite, batch_kwargs)
        validation_results = data_asset.validate(expectation_suite,
                                                 run_id=run_id,
                                                 result_format="SUMMARY")
        expectation_suite.add_citation(
            comment=str(cls.__name__) +
            " added a citation based on the current batch.",
            batch_kwargs=data_asset.batch_kwargs,
            batch_markers=data_asset.batch_markers,
            batch_parameters=data_asset.batch_parameters,
        )
        return expectation_suite, validation_results
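
A hedged usage sketch for the classmethod above; BasicDatasetProfiler appears elsewhere on this page, and my_batch is assumed to be an already-loaded data asset:

# Illustrative only: run_name/run_time are mutually exclusive with run_id.
suite, validation_results = BasicDatasetProfiler.profile(
    my_batch,
    run_name="profiling",  # becomes RunIdentifier(run_name="profiling", run_time=None)
)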
Example #4
def validation_result_suite_id():
    return ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            "asset.default"),
        run_id=RunIdentifier(run_name="test_100"),
        batch_identifier="1234",
    )
Example #5
def test_TupleGCSStoreBackend_base_public_path():
    """
    What does this test and why?

    the base_public_path parameter allows users to point to a custom DNS when hosting Data docs.

    This test will exercise the get_url_for_key method twice to see that we are getting the expected url,
    with or without base_public_path
    """
    bucket = "leakybucket"
    prefix = "this_is_a_test_prefix"
    project = "dummy-project"
    base_public_path = "http://www.test.com/"

    my_store_with_base_public_path = TupleGCSStoreBackend(
        filepath_template=None,
        bucket=bucket,
        prefix=prefix,
        project=project,
        base_public_path=base_public_path,
    )

    with patch("google.cloud.storage.Client",
               autospec=True) as mock_gcs_client:
        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.blob.return_value

        my_store_with_base_public_path.set(("BBB", ),
                                           b"bbb",
                                           content_encoding=None,
                                           content_type="image/png")

    run_id = RunIdentifier("my_run_id", datetime.datetime.utcnow())
    key = ValidationResultIdentifier(
        ExpectationSuiteIdentifier(expectation_suite_name="my_suite_name"),
        run_id,
        "my_batch_id",
    )
    run_time_string = run_id.to_tuple()[1]

    url = my_store_with_base_public_path.get_public_url_for_key(key.to_tuple())
    assert (
        url == "http://www.test.com/leakybucket" +
        f"/this_is_a_test_prefix/my_suite_name/my_run_id/{run_time_string}/my_batch_id"
    )
Example #6
def validation_result_suite_extended_id():
    return ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            "asset.default"),
        run_id=RunIdentifier(run_name="test_100",
                             run_time="Tue May 08 15:14:45 +0800 2012"),
        batch_identifier=BatchIdentifier(batch_identifier="1234",
                                         data_asset_name="asset"),
    )
Example #7
def test_StoreAction():
    fake_in_memory_store = ValidationsStore(
        store_backend={"class_name": "InMemoryStoreBackend",}
    )
    stores = {"fake_in_memory_store": fake_in_memory_store}

    class Object:
        pass

    data_context = Object()
    data_context.stores = stores

    action = StoreValidationResultAction(
        data_context=data_context, target_store_name="fake_in_memory_store",
    )
    assert fake_in_memory_store.list_keys() == []

    action.run(
        validation_result_suite_identifier=ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                expectation_suite_name="default_expectations"
            ),
            run_id="prod_20190801",
            batch_identifier="1234",
        ),
        validation_result_suite=ExpectationSuiteValidationResult(
            success=False, results=[]
        ),
        data_asset=None,
    )

    expected_run_id = RunIdentifier(
        run_name="prod_20190801", run_time="20190926T134241.000000Z"
    )

    assert len(fake_in_memory_store.list_keys()) == 1
    stored_identifier = fake_in_memory_store.list_keys()[0]
    assert stored_identifier.batch_identifier == "1234"
    assert (
        stored_identifier.expectation_suite_identifier.expectation_suite_name
        == "default_expectations"
    )
    assert stored_identifier.run_id == expected_run_id

    assert fake_in_memory_store.get(
        ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                expectation_suite_name="default_expectations"
            ),
            run_id=expected_run_id,
            batch_identifier="1234",
        )
    ) == ExpectationSuiteValidationResult(success=False, results=[])
Example #8
def test_database_evaluation_parameter_store_get_bind_params(param_store):
    # Bind params must be expressed as a string-keyed dictionary.
    # Verify that the param_store supports that
    run_id = RunIdentifier(run_name=datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ"))
    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset.warning",
        metric_name=
        "expect_column_values_to_match_regex.result.unexpected_percent",
        metric_kwargs_id="column=mycol",
    )

    metric_value = 12.3456789
    param_store.set(metric_identifier, metric_value)

    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset.warning",
        metric_name=
        "expect_table_row_count_to_be_between.result.observed_value",
        metric_kwargs_id=None,
    )

    metric_value = 512
    param_store.set(metric_identifier, metric_value)

    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset2.warning",
        metric_name=
        "expect_column_values_to_match_regex.result.unexpected_percent",
        metric_kwargs_id="column=mycol",
    )

    metric_value = 12.3456789
    param_store.set(metric_identifier, metric_value)

    params = param_store.get_bind_params(run_id)
    assert params == {
        "urn:great_expectations:validations:asset.warning:"
        "expect_column_values_to_match_regex.result.unexpected_percent:column=mycol":
        12.3456789,
        "urn:great_expectations:validations:asset.warning:"
        "expect_table_row_count_to_be_between.result.observed_value":
        512,
        "urn:great_expectations:validations:asset2.warning:"
        "expect_column_values_to_match_regex.result.unexpected_percent:column=mycol":
        12.3456789,
    }
Example #9
def test_database_evaluation_parameter_store_basics(param_store):
    run_id = RunIdentifier(run_name=datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ"))
    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset.warning",
        metric_name=
        "expect_column_values_to_match_regex.result.unexpected_percent",
        metric_kwargs_id="column=mycol",
    )
    metric_value = 12.3456789

    param_store.set(metric_identifier, metric_value)
    value = param_store.get(metric_identifier)
    assert value == metric_value
Example #10
 def from_tuple(cls, tuple_):
     if len(tuple_) < 6:
         raise GreatExpectationsError(
             "ValidationMetricIdentifier tuple must have at least six components."
         )
     if tuple_[2] == "__":
         tuple_data_asset_name = None
     else:
         tuple_data_asset_name = tuple_[2]
     metric_id = MetricIdentifier.from_tuple(tuple_[-2:])
     return cls(
         run_id=RunIdentifier.from_tuple((tuple_[0], tuple_[1])),
         data_asset_name=tuple_data_asset_name,
         expectation_suite_identifier=ExpectationSuiteIdentifier.from_tuple(
             tuple_[3:-2]),
         metric_name=metric_id.metric_name,
         metric_kwargs_id=metric_id.metric_kwargs_id,
     )
Example #11
 def from_fixed_length_tuple(cls, tuple_):
     if len(tuple_) != 6:
         raise GreatExpectationsError(
             "ValidationMetricIdentifier fixed length tuple must have exactly six "
             "components.")
     if tuple_[2] == "__":
         tuple_data_asset_name = None
     else:
         tuple_data_asset_name = tuple_[2]
     metric_id = MetricIdentifier.from_tuple(tuple_[-2:])
     return cls(
         run_id=RunIdentifier.from_fixed_length_tuple(
             (tuple_[0], tuple_[1])),
         data_asset_name=tuple_data_asset_name,
         expectation_suite_identifier=ExpectationSuiteIdentifier.
         from_fixed_length_tuple(tuple((tuple_[3], ))),
         metric_name=metric_id.metric_name,
         metric_kwargs_id=metric_id.metric_kwargs_id,
     )
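
A hedged sketch of the fixed-length tuple this classmethod expects; the values are borrowed from other examples on this page, and the run_time string is assumed to be in the format RunIdentifier serializes to:

# Illustrative layout only; "__" is the placeholder for a missing data_asset_name.
example_tuple = (
    "my_run_id",                 # tuple_[0] -> run_name
    "20190926T134241.000000Z",   # tuple_[1] -> run_time
    "__",                        # tuple_[2] -> data_asset_name (None)
    "asset.warning",             # tuple_[3] -> expectation suite name
    "expect_column_values_to_match_regex.result.unexpected_percent",  # metric_name
    "column=mycol",              # metric_kwargs_id
)
metric_id = ValidationMetricIdentifier.from_fixed_length_tuple(example_tuple)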
Example #12
def test_StoreMetricsAction_column_metric(
    basic_in_memory_data_context_for_validation_operator, ):
    action = StoreMetricsAction(
        data_context=basic_in_memory_data_context_for_validation_operator,
        requested_metrics={
            "*": [
                {
                    "column": {
                        "provider_id": [
                            "expect_column_values_to_be_unique.result.unexpected_count"
                        ]
                    }
                },
                "statistics.evaluated_expectations",
                "statistics.successful_expectations",
            ]
        },
        target_store_name="metrics_store",
    )

    run_id = RunIdentifier(run_name="bar")

    validation_result = ExpectationSuiteValidationResult(
        success=False,
        meta={
            "expectation_suite_name": "foo",
            "run_id": run_id
        },
        results=[
            ExpectationValidationResult(
                meta={},
                result={
                    "element_count": 10,
                    "missing_count": 0,
                    "missing_percent": 0.0,
                    "unexpected_count": 7,
                    "unexpected_percent": 0.0,
                    "unexpected_percent_nonmissing": 0.0,
                    "partial_unexpected_list": [],
                },
                success=True,
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_unique",
                    kwargs={
                        "column": "provider_id",
                        "result_format": "BASIC"
                    },
                ),
                exception_info=None,
            )
        ],
        statistics={
            "evaluated_expectations": 5,
            "successful_expectations": 3
        },
    )

    action.run(
        validation_result,
        ValidationResultIdentifier.from_object(validation_result),
        data_asset=None,
    )

    assert (
        basic_in_memory_data_context_for_validation_operator.
        stores["metrics_store"].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo"),
                metric_name=
                "expect_column_values_to_be_unique.result.unexpected_count",
                metric_kwargs_id="column=provider_id",
            )) == 7)
Example #13
    def run(
        self,
        template_name: Optional[str] = None,
        run_name_template: Optional[str] = None,
        expectation_suite_name: Optional[str] = None,
        batch_request: Optional[Union[BatchRequest, dict]] = None,
        action_list: Optional[List[dict]] = None,
        evaluation_parameters: Optional[dict] = None,
        runtime_configuration: Optional[dict] = None,
        validations: Optional[List[dict]] = None,
        profilers: Optional[List[dict]] = None,
        run_id=None,
        run_name=None,
        run_time=None,
        result_format=None,
        **kwargs,
    ) -> CheckpointResult:
        assert not (run_id and run_name) and not (
            run_id and run_time
        ), "Please provide either a run_id or run_name and/or run_time."

        run_time = run_time or datetime.now()
        runtime_configuration: dict = runtime_configuration or {}
        result_format: Optional[
            dict] = result_format or runtime_configuration.get("result_format")
        if result_format is None:
            result_format = {"result_format": "SUMMARY"}

        runtime_kwargs = {
            "template_name": template_name,
            "run_name_template": run_name_template,
            "expectation_suite_name": expectation_suite_name,
            "batch_request": batch_request,
            "action_list": action_list,
            "evaluation_parameters": evaluation_parameters,
            "runtime_configuration": runtime_configuration,
            "validations": validations,
            "profilers": profilers,
        }
        substituted_runtime_config: CheckpointConfig = self.get_substituted_config(
            runtime_kwargs=runtime_kwargs)
        run_name_template: Optional[
            str] = substituted_runtime_config.run_name_template
        validations: list = substituted_runtime_config.validations
        run_results = {}

        if run_name is None and run_name_template is not None:
            run_name: str = get_datetime_string_from_strftime_format(
                format_str=run_name_template, datetime_obj=run_time)

        run_id = run_id or RunIdentifier(run_name=run_name, run_time=run_time)

        for idx, validation_dict in enumerate(validations):
            try:
                substituted_validation_dict: dict = get_substituted_validation_dict(
                    substituted_runtime_config=substituted_runtime_config,
                    validation_dict=validation_dict,
                )
                batch_request: BatchRequest = substituted_validation_dict.get(
                    "batch_request")
                expectation_suite_name: str = substituted_validation_dict.get(
                    "expectation_suite_name")
                action_list: list = substituted_validation_dict.get(
                    "action_list")

                validator: Validator = self.data_context.get_validator(
                    batch_request=batch_request,
                    expectation_suite_name=expectation_suite_name,
                )
                action_list_validation_operator: ActionListValidationOperator = (
                    ActionListValidationOperator(
                        data_context=self.data_context,
                        action_list=action_list,
                        result_format=result_format,
                        name=f"{self.name}-checkpoint-validation[{idx}]",
                    ))
                val_op_run_result: ValidationOperatorResult = (
                    action_list_validation_operator.run(
                        assets_to_validate=[validator],
                        run_id=run_id,
                        evaluation_parameters=substituted_validation_dict.get(
                            "evaluation_parameters"),
                        result_format=result_format,
                    ))
                run_results.update(val_op_run_result.run_results)
            except CheckpointError as e:
                raise CheckpointError(
                    f"Exception occurred while running validation[{idx}] of checkpoint '{self.name}': {e.message}"
                )
        return CheckpointResult(run_id=run_id,
                                run_results=run_results,
                                checkpoint_config=self.config)
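
A hedged sketch of how a caller might invoke the run method above; my_checkpoint is assumed to be an already-configured checkpoint instance:

# Illustrative only: run_name/run_time may not be combined with run_id;
# run_time defaults to "now" inside run().
result = my_checkpoint.run(
    run_name="nightly_run",
    result_format={"result_format": "SUMMARY"},
)
assert isinstance(result, CheckpointResult)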
Example #14
def test_configuration_driven_site_builder(
    site_builder_data_context_with_html_store_titanic_random, ):
    context = site_builder_data_context_with_html_store_titanic_random

    context.add_validation_operator(
        "validate_and_store",
        {
            "class_name":
            "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                        "target_store_name": "validations_store",
                    },
                },
                {
                    "name": "extract_and_store_eval_parameters",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction",
                        "target_store_name": "evaluation_parameter_store",
                    },
                },
            ],
        },
    )

    # profiling the Titanic datasource will generate one expectation suite and one validation
    # that is a profiling result
    datasource_name = "titanic"
    data_asset_name = "Titanic"
    profiler_name = "BasicDatasetProfiler"
    generator_name = "subdir_reader"
    context.profile_datasource(datasource_name)

    # creating another validation result using the profiler's suite (no need to use a new expectation suite
    # for this test). having two validation results - one with run id "profiling" - allows us to test
    # the logic of run_name_filter that helps filter which validation results are included in
    # the profiling and the validation sections.
    batch_kwargs = context.build_batch_kwargs(
        datasource=datasource_name,
        batch_kwargs_generator=generator_name,
        name=data_asset_name,
    )

    expectation_suite_name = "{}.{}.{}.{}".format(datasource_name,
                                                  generator_name,
                                                  data_asset_name,
                                                  profiler_name)

    batch = context.get_batch(
        batch_kwargs=batch_kwargs,
        expectation_suite_name=expectation_suite_name,
    )
    run_id = RunIdentifier(run_name="test_run_id_12345")
    context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    data_docs_config = context._project_config.data_docs_sites
    local_site_config = data_docs_config["local_site"]

    validations_set = set(context.stores["validations_store"].list_keys())
    assert len(validations_set) == 6
    assert (ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            expectation_suite_name=expectation_suite_name),
        run_id="test_run_id_12345",
        batch_identifier=batch.batch_id,
    ) in validations_set)
    assert (ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            expectation_suite_name=expectation_suite_name),
        run_id="profiling",
        batch_identifier=batch.batch_id,
    ) in validations_set)
    assert (ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            expectation_suite_name=expectation_suite_name),
        run_id="profiling",
        batch_identifier=batch.batch_id,
    ) in validations_set)
    assert (ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            expectation_suite_name=expectation_suite_name),
        run_id="profiling",
        batch_identifier=batch.batch_id,
    ) in validations_set)

    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config)
    res = site_builder.build()

    index_page_locator_info = res[0]
    index_links_dict = res[1]

    # assert that how-to buttons and related elements are rendered (default behavior)
    assert_how_to_buttons(context, index_page_locator_info, index_links_dict)
    # print(json.dumps(index_page_locator_info, indent=2))
    assert (index_page_locator_info == "file://" + context.root_directory +
            "/uncommitted/data_docs/local_site/index.html")

    # print(json.dumps(index_links_dict, indent=2))

    assert "site_name" in index_links_dict

    assert "expectations_links" in index_links_dict
    assert len(index_links_dict["expectations_links"]) == 5

    assert "validations_links" in index_links_dict
    assert (len(index_links_dict["validations_links"]) == 1), """
    The only rendered validation should be the one not generated by the profiler
    """

    assert "profiling_links" in index_links_dict
    assert len(index_links_dict["profiling_links"]) == 5

    # save documentation locally
    os.makedirs("./tests/render/output", exist_ok=True)
    os.makedirs("./tests/render/output/documentation", exist_ok=True)

    if os.path.isdir("./tests/render/output/documentation"):
        shutil.rmtree("./tests/render/output/documentation")
    shutil.copytree(
        os.path.join(
            site_builder_data_context_with_html_store_titanic_random.
            root_directory,
            "uncommitted/data_docs/",
        ),
        "./tests/render/output/documentation",
    )

    # let's create another validation result and run the site builder to add it
    # to the data docs
    # the operator does not have an StoreValidationResultAction action configured, so the site
    # will not be updated without our call to site builder

    expectation_suite_path_component = expectation_suite_name.replace(".", "/")
    validation_result_page_path = os.path.join(
        site_builder.site_index_builder.target_store.
        store_backends[ValidationResultIdentifier].full_base_directory,
        "validations",
        expectation_suite_path_component,
        run_id.run_name,
        run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ"),
        batch.batch_id + ".html",
    )

    ts_last_mod_0 = os.path.getmtime(validation_result_page_path)

    run_id = RunIdentifier(run_name="test_run_id_12346")
    operator_result = context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    validation_result_id = operator_result.list_validation_result_identifiers(
    )[0]
    res = site_builder.build(resource_identifiers=[validation_result_id])

    index_links_dict = res[1]

    # verify that an additional validation result HTML file was generated
    assert len(index_links_dict["validations_links"]) == 2

    site_builder.site_index_builder.target_store.store_backends[
        ValidationResultIdentifier].full_base_directory

    # verify that the validation result HTML file rendered in the previous run was NOT updated
    ts_last_mod_1 = os.path.getmtime(validation_result_page_path)

    assert ts_last_mod_0 == ts_last_mod_1

    # verify that the new site builder method returns the URL of the HTML file that renders
    # a resource

    new_validation_result_page_path = os.path.join(
        site_builder.site_index_builder.target_store.
        store_backends[ValidationResultIdentifier].full_base_directory,
        "validations",
        expectation_suite_path_component,
        run_id.run_name,
        run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ"),
        batch.batch_id + ".html",
    )

    html_url = site_builder.get_resource_url(
        resource_identifier=validation_result_id)
    assert "file://" + new_validation_result_page_path == html_url

    html_url = site_builder.get_resource_url()
    assert ("file://" + os.path.join(
        site_builder.site_index_builder.target_store.
        store_backends[ValidationResultIdentifier].full_base_directory,
        "index.html",
    ) == html_url)

    team_site_config = data_docs_config["team_site"]
    team_site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **team_site_config)
    team_site_builder.clean_site()
    obs = [
        url_dict
        for url_dict in context.get_docs_sites_urls(site_name="team_site")
        if url_dict.get("site_url")
    ]
    assert len(obs) == 0

    # exercise clean_site
    site_builder.clean_site()
    obs = [
        url_dict for url_dict in context.get_docs_sites_urls()
        if url_dict.get("site_url")
    ]
    assert len(obs) == 0

    # restore site
    context = site_builder_data_context_with_html_store_titanic_random
    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config)
    res = site_builder.build()
Example #15
def test_TupleGCSStoreBackend():
    # pytest.importorskip("google-cloud-storage")
    """
    What does this test test and why?

    Since no package like moto exists for GCP services, we mock the GCS client
    and assert that the store backend makes the right calls for set, get, and list.

    TODO : One option may be to have a GCS Store in Docker, which can be use to "actually" run these tests.
    """

    bucket = "leakybucket"
    prefix = "this_is_a_test_prefix"
    project = "dummy-project"

    my_store = TupleGCSStoreBackend(
        filepath_template="my_file_{0}", bucket=bucket, prefix=prefix, project=project
    )

    my_store_with_no_filepath_template = TupleGCSStoreBackend(
        filepath_template=None, bucket=bucket, prefix=prefix, project=project
    )

    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:

        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.blob.return_value

        my_store.set(("AAA",), "aaa", content_type="text/html")

        mock_gcs_client.assert_called_once_with("dummy-project")
        mock_client.get_bucket.assert_called_once_with("leakybucket")
        mock_bucket.blob.assert_called_once_with("this_is_a_test_prefix/my_file_AAA")
        mock_blob.upload_from_string.assert_called_once_with(
            b"aaa", content_type="text/html"
        )

    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:
        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.blob.return_value

        my_store_with_no_filepath_template.set(
            ("AAA",), b"aaa", content_encoding=None, content_type="image/png"
        )

        mock_gcs_client.assert_called_once_with("dummy-project")
        mock_client.get_bucket.assert_called_once_with("leakybucket")
        mock_bucket.blob.assert_called_once_with("this_is_a_test_prefix/AAA")
        mock_blob.upload_from_string.assert_called_once_with(
            b"aaa", content_type="image/png"
        )

    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:

        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.get_blob.return_value
        mock_str = mock_blob.download_as_string.return_value

        my_store.get(("BBB",))

        mock_gcs_client.assert_called_once_with("dummy-project")
        mock_client.get_bucket.assert_called_once_with("leakybucket")
        mock_bucket.get_blob.assert_called_once_with(
            "this_is_a_test_prefix/my_file_BBB"
        )
        mock_blob.download_as_string.assert_called_once()
        mock_str.decode.assert_called_once_with("utf-8")

    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:

        mock_client = mock_gcs_client.return_value

        my_store.list_keys()

        mock_client.list_blobs.assert_called_once_with(
            "leakybucket", prefix="this_is_a_test_prefix"
        )

        my_store.remove_key("leakybucket")

        from google.cloud.exceptions import NotFound

        try:
            mock_client.get_bucket.assert_called_once_with("leakybucket")
        except NotFound:
            pass

    with patch("google.cloud.storage.Client", autospec=True) as mock_gcs_client:
        mock_gcs_client.side_effect = InvalidKeyError("Hi I am an InvalidKeyError")
        with pytest.raises(InvalidKeyError):
            my_store.get(("non_existent_key",))

    run_id = RunIdentifier("my_run_id", datetime.datetime.utcnow())
    key = ValidationResultIdentifier(
        ExpectationSuiteIdentifier(expectation_suite_name="my_suite_name"),
        run_id,
        "my_batch_id",
    )
    run_time_string = run_id.to_tuple()[1]

    url = my_store_with_no_filepath_template.get_url_for_key(key.to_tuple())
    assert (
        url
        == "https://storage.googleapis.com/leakybucket"
        + f"/this_is_a_test_prefix/my_suite_name/my_run_id/{run_time_string}/my_batch_id"
    )
Example #16
def validation_operator_run(name, run_name, validation_config_file, suite,
                            directory):
    # Note though the long lines here aren't pythonic, they look best if Click does the line wraps.
    """
    Run a validation operator against some data.

    There are two modes to run this command:

    1. Interactive (good for development):

        Specify the name of the validation operator using the --name argument
        and the name of the expectation suite using the --suite argument.

        The cli will help you specify the batch of data that you want to
        validate interactively.


    2. Non-interactive (good for production):

        Use the `--validation_config_file` argument to specify the path of the validation configuration JSON file. This file can be used to instruct a validation operator to validate multiple batches of data and use multiple expectation suites to validate each batch.

        Learn how to create a validation config file here:
        https://great-expectations.readthedocs.io/en/latest/command_line.html#great-expectations-validation-operator-run-validation-config-file-validation-config-file-path

        This command exits with 0 if the validation operator ran and the "success" attribute in its return object is True. Otherwise, the command exits with 1.

    To learn more about validation operators, go here:
    https://great-expectations.readthedocs.io/en/latest/features/validation.html#validation-operators
    """

    try:
        context = DataContext(directory)
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message("Failed to process <red>{}</red>".format(err.message))
        sys.exit(1)

    try:
        if validation_config_file is not None:
            try:
                with open(validation_config_file) as f:
                    validation_config = json.load(f)
            except (IOError, json_parse_exception) as e:
                cli_message(
                    f"Failed to process the --validation_config_file argument: <red>{e}</red>"
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            validation_config_error_message = _validate_valdiation_config(
                validation_config)
            if validation_config_error_message is not None:
                cli_message(
                    "<red>The validation config in {0:s} is misconfigured: {1:s}</red>"
                    .format(validation_config_file,
                            validation_config_error_message))
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

        else:
            if suite is None:
                cli_message("""
Please use --suite argument to specify the name of the expectation suite.
Call `great_expectation suite list` command to list the expectation suites in your project.
""")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(0)

            suite = toolkit.load_expectation_suite(
                context, suite, "cli.validation_operator.run")

            if name is None:
                cli_message("""
Please use --name argument to specify the name of the validation operator.
Call `great_expectation validation-operator list` command to list the operators in your project.
""")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)
            else:
                if name not in context.list_validation_operator_names():
                    cli_message(f"""
Could not find a validation operator {name}.
Call `great_expectation validation-operator list` command to list the operators in your project.
""")
                    send_usage_message(
                        data_context=context,
                        event="cli.validation_operator.run",
                        success=False,
                    )
                    sys.exit(1)

            batch_kwargs = None

            cli_message("""
Let us help you specify the batch of data you want the validation operator to validate."""
                        )

            try:
                data_source = toolkit.select_datasource(context)
            except ValueError as ve:
                cli_message("<red>{}</red>".format(ve))
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            if not data_source:
                cli_message("<red>No datasources found in the context.</red>")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            if batch_kwargs is None:
                (
                    datasource_name,
                    batch_kwargs_generator,
                    data_asset,
                    batch_kwargs,
                ) = get_batch_kwargs(
                    context,
                    datasource_name=data_source.name,
                    batch_kwargs_generator_name=None,
                    data_asset_name=None,
                    additional_batch_kwargs=None,
                )

            validation_config = {
                "validation_operator_name":
                name,
                "batches": [{
                    "batch_kwargs":
                    batch_kwargs,
                    "expectation_suite_names": [suite.expectation_suite_name],
                }],
            }

        try:
            validation_operator_name = validation_config[
                "validation_operator_name"]
            batches_to_validate = []
            for entry in validation_config["batches"]:
                for expectation_suite_name in entry["expectation_suite_names"]:
                    batch = context.get_batch(entry["batch_kwargs"],
                                              expectation_suite_name)
                    batches_to_validate.append(batch)

            if run_name is None:
                run_name = datetime.datetime.now(
                    datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")

            run_id = RunIdentifier(run_name=run_name)

            if suite is None:
                results = context.run_validation_operator(
                    validation_operator_name,
                    assets_to_validate=batches_to_validate,
                    run_id=run_id,
                )
            else:
                if suite.evaluation_parameters is None:
                    results = context.run_validation_operator(
                        validation_operator_name,
                        assets_to_validate=batches_to_validate,
                        run_id=run_id,
                    )
                else:
                    results = context.run_validation_operator(
                        validation_operator_name,
                        assets_to_validate=batches_to_validate,
                        run_id=run_id,
                        evaluation_parameters=suite.evaluation_parameters,
                    )
        except (
                ge_exceptions.DataContextError,
                IOError,
                SQLAlchemyError,
        ) as e:
            cli_message("<red>{}</red>".format(e))
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=False)
            sys.exit(1)

        if not results["success"]:
            cli_message("Validation Failed!")
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=True)
            sys.exit(1)
        else:
            cli_message("Validation Succeeded!")
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=True)
            sys.exit(0)
    except Exception as e:
        send_usage_message(data_context=context,
                           event="cli.validation_operator.run",
                           success=False)
        raise e
Example #17
def test_configuration_driven_site_builder_skip_and_clean_missing(
    site_builder_data_context_with_html_store_titanic_random, ):
    # tests auto-cleaning functionality of DefaultSiteIndexBuilder
    # when index page is built, if an HTML page is present without corresponding suite or validation result,
    # the HTML page should be removed and not appear on index page
    context = site_builder_data_context_with_html_store_titanic_random

    context.add_validation_operator(
        "validate_and_store",
        {
            "class_name":
            "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                        "target_store_name": "validations_store",
                    },
                },
                {
                    "name": "extract_and_store_eval_parameters",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction",
                        "target_store_name": "evaluation_parameter_store",
                    },
                },
            ],
        },
    )

    # profiling the Titanic datasource will generate one expectation suite and one validation
    # that is a profiling result
    datasource_name = "titanic"
    data_asset_name = "Titanic"
    profiler_name = "BasicDatasetProfiler"
    generator_name = "subdir_reader"
    context.profile_datasource(datasource_name)

    # creating another validation result using the profiler's suite (no need to use a new expectation suite
    # for this test). having two validation results - one with run id "profiling" - allows us to test
    # the logic of run_name_filter that helps filter which validation results are included in
    # the profiling and the validation sections.
    batch_kwargs = context.build_batch_kwargs(
        datasource=datasource_name,
        batch_kwargs_generator=generator_name,
        data_asset_name=data_asset_name,
    )

    expectation_suite_name = "{}.{}.{}.{}".format(datasource_name,
                                                  generator_name,
                                                  data_asset_name,
                                                  profiler_name)

    batch = context.get_batch(
        batch_kwargs=batch_kwargs,
        expectation_suite_name=expectation_suite_name,
    )
    run_id = RunIdentifier(run_name="test_run_id_12345")
    context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    data_docs_config = context._project_config.data_docs_sites
    local_site_config = data_docs_config["local_site"]

    validations_set = set(context.stores["validations_store"].list_keys())
    assert len(validations_set) == 6

    expectation_suite_set = set(
        context.stores["expectations_store"].list_keys())
    assert len(expectation_suite_set) == 5

    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config)
    site_builder.build()

    # test expectation suite pages
    expectation_suite_html_pages = {
        ExpectationSuiteIdentifier.from_tuple(suite_tuple)
        for suite_tuple in site_builder.target_store.
        store_backends[ExpectationSuiteIdentifier].list_keys()
    }
    # suites in expectations store should match html pages
    assert expectation_suite_set == expectation_suite_html_pages

    # remove suites from expectations store
    for i in range(2):
        context.stores["expectations_store"].remove_key(
            list(expectation_suite_set)[i])

    # re-build data docs, which should remove suite HTML pages that no longer have corresponding suite in
    # expectations store
    site_builder.build()

    expectation_suite_set = set(
        context.stores["expectations_store"].list_keys())
    expectation_suite_html_pages = {
        ExpectationSuiteIdentifier.from_tuple(suite_tuple)
        for suite_tuple in site_builder.target_store.
        store_backends[ExpectationSuiteIdentifier].list_keys()
    }
    assert expectation_suite_set == expectation_suite_html_pages

    # test validation result pages
    validation_html_pages = {
        ValidationResultIdentifier.from_tuple(result_tuple)
        for result_tuple in site_builder.target_store.
        store_backends[ValidationResultIdentifier].list_keys()
    }
    # validations in store should match html pages
    assert validations_set == validation_html_pages

    # remove validations from store
    for i in range(2):
        context.stores["validations_store"].store_backend.remove_key(
            list(validations_set)[i])

    # re-build data docs, which should remove validation HTML pages that no longer have corresponding validation in
    # validations store
    site_builder.build()

    validations_set = set(context.stores["validations_store"].list_keys())
    validation_html_pages = {
        ValidationResultIdentifier.from_tuple(result_tuple)
        for result_tuple in site_builder.target_store.
        store_backends[ValidationResultIdentifier].list_keys()
    }
    assert validations_set == validation_html_pages
Example #18
def test_resource_key_passes_run_name_filter():
    resource_key = ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier("test_suite"),
        run_id=RunIdentifier(run_name="foofooprofilingfoo"),
        batch_identifier="f14c3d2f6e8028c2db0c25edabdb0d61",
    )

    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"equals": "profiling"}
        )
        is False
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"equals": "foofooprofilingfoo"}
        )
        is True
    )

    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"not_equals": "profiling"}
        )
        is True
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"not_equals": "foofooprofilingfoo"}
        )
        is False
    )

    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"includes": "profiling"}
        )
        is True
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"includes": "foobar"}
        )
        is False
    )

    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"not_includes": "foobar"}
        )
        is True
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"not_includes": "profiling"}
        )
        is False
    )

    assert (
        resource_key_passes_run_name_filter(
            resource_key,
            run_name_filter={"matches_regex": "(foo){2}profiling(" "foo)+"},
        )
        is True
    )
    assert (
        resource_key_passes_run_name_filter(
            resource_key,
            run_name_filter={"matches_regex": "(foo){3}profiling(" "foo)+"},
        )
        is False
    )
    with pytest.warns(DeprecationWarning):
        assert (
            resource_key_passes_run_name_filter(
                resource_key, run_name_filter={"eq": "profiling"}
            )
            is False
        )
        assert (
            resource_key_passes_run_name_filter(
                resource_key, run_name_filter={"eq": "foofooprofilingfoo"}
            )
            is True
        )
    with pytest.warns(DeprecationWarning):
        assert (
            resource_key_passes_run_name_filter(
                resource_key, run_name_filter={"ne": "profiling"}
            )
            is True
        )
        assert (
            resource_key_passes_run_name_filter(
                resource_key, run_name_filter={"ne": "foofooprofilingfoo"}
            )
            is False
        )
Example #19
 def from_fixed_length_tuple(cls, tuple_):
     return cls(
         ExpectationSuiteIdentifier(tuple_[0]),
         RunIdentifier.from_tuple((tuple_[1], tuple_[2])),
         tuple_[3],
     )
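
A hedged sketch of the four-element tuple this classmethod expects; the values are borrowed from other examples on this page:

# Illustrative layout only, matching the positional arguments above.
vri = ValidationResultIdentifier.from_fixed_length_tuple((
    "asset.default",            # tuple_[0] -> expectation suite name
    "test_run_id_12345",        # tuple_[1] -> run_name
    "20190926T134241.000000Z",  # tuple_[2] -> run_time
    "1234",                     # tuple_[3] -> batch_identifier
))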
Example #20
    def run(
        self,
        assets_to_validate,
        run_id=None,
        base_expectation_suite_name=None,
        evaluation_parameters=None,
        run_name=None,
        run_time=None,
        result_format=None,
    ):
        assert not (run_id and run_name) and not (
            run_id and run_time
        ), "Please provide either a run_id or run_name and/or run_time."
        if isinstance(run_id, str) and not run_name:
            warnings.warn(
                "String run_ids will be deprecated in the future. Please provide a run_id of type "
                "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
                "and run_time (both optional). Instead of providing a run_id, you may also provide"
                "run_name and run_time separately.",
                DeprecationWarning,
            )
            try:
                run_time = parse(run_id)
            except (ParserError, TypeError):
                pass
            run_id = RunIdentifier(run_name=run_id, run_time=run_time)
        elif isinstance(run_id, dict):
            run_id = RunIdentifier(**run_id)
        elif not isinstance(run_id, RunIdentifier):
            run_id = RunIdentifier(run_name=run_name, run_time=run_time)

        if base_expectation_suite_name is None:
            if self.base_expectation_suite_name is None:
                raise ValueError(
                    "base_expectation_suite_name must be configured in the validation operator or passed at runtime"
                )
            base_expectation_suite_name = self.base_expectation_suite_name

        run_results = {}

        for item in assets_to_validate:
            batch = self._build_batch_from_item(item)

            batch_id = batch.batch_id

            assert batch_id is not None
            assert run_id is not None

            failure_expectation_suite_identifier = ExpectationSuiteIdentifier(
                expectation_suite_name=base_expectation_suite_name
                + self.expectation_suite_name_suffixes[0]
            )

            failure_validation_result_id = ValidationResultIdentifier(
                expectation_suite_identifier=failure_expectation_suite_identifier,
                run_id=run_id,
                batch_identifier=batch_id,
            )

            failure_expectation_suite = None
            try:
                failure_expectation_suite = self.data_context.stores[
                    self.data_context.expectations_store_name
                ].get(failure_expectation_suite_identifier)

            # NOTE : Abe 2019/09/17 : I'm concerned that this may be too permissive, since
            # it will catch any error in the Store, not just KeyErrors. In the longer term, a better
            # solution will be to have the Stores catch other known errors and raise KeyErrors,
            # so that methods like this can catch and handle a single error type.
            except Exception:
                logger.debug(
                    "Failure expectation suite not found: {}".format(
                        failure_expectation_suite_identifier
                    )
                )

            if failure_expectation_suite:
                failure_run_result_obj = {"expectation_suite_severity_level": "failure"}
                failure_validation_result = batch.validate(
                    failure_expectation_suite,
                    result_format=result_format
                    if result_format
                    else self.result_format,
                    evaluation_parameters=evaluation_parameters,
                )
                failure_run_result_obj["validation_result"] = failure_validation_result
                failure_actions_results = self._run_actions(
                    batch,
                    failure_expectation_suite_identifier,
                    failure_expectation_suite,
                    failure_validation_result,
                    run_id,
                )
                failure_run_result_obj["actions_results"] = failure_actions_results
                run_results[failure_validation_result_id] = failure_run_result_obj

                if not failure_validation_result.success and self.stop_on_first_error:
                    break

            warning_expectation_suite_identifier = ExpectationSuiteIdentifier(
                expectation_suite_name=base_expectation_suite_name
                + self.expectation_suite_name_suffixes[1]
            )

            warning_validation_result_id = ValidationResultIdentifier(
                expectation_suite_identifier=warning_expectation_suite_identifier,
                run_id=run_id,
                batch_identifier=batch.batch_id,
            )

            warning_expectation_suite = None
            try:
                warning_expectation_suite = self.data_context.stores[
                    self.data_context.expectations_store_name
                ].get(warning_expectation_suite_identifier)
            except Exception:
                logger.debug(
                    "Warning expectation suite not found: {}".format(
                        warning_expectation_suite_identifier
                    )
                )

            if warning_expectation_suite:
                warning_run_result_obj = {"expectation_suite_severity_level": "warning"}
                warning_validation_result = batch.validate(
                    warning_expectation_suite,
                    result_format=result_format
                    if result_format
                    else self.result_format,
                    evaluation_parameters=evaluation_parameters,
                )
                warning_run_result_obj["validation_result"] = warning_validation_result
                warning_actions_results = self._run_actions(
                    batch,
                    warning_expectation_suite_identifier,
                    warning_expectation_suite,
                    warning_validation_result,
                    run_id,
                )
                warning_run_result_obj["actions_results"] = warning_actions_results
                run_results[warning_validation_result_id] = warning_run_result_obj

        validation_operator_result = ValidationOperatorResult(
            run_id=run_id,
            run_results=run_results,
            validation_operator_config=self.validation_operator_config,
            evaluation_parameters=evaluation_parameters,
            success=all(
                [
                    run_result_obj["validation_result"].success
                    for run_result_obj in run_results.values()
                ]
            ),
        )

        if self.slack_webhook:
            if (
                self.notify_on == "all"
                or self.notify_on == "success"
                and validation_operator_result.success
                or self.notify_on == "failure"
                and not validation_operator_result.success
            ):
                slack_query = self._build_slack_query(
                    validation_operator_result=validation_operator_result
                )
                send_slack_notification(
                    query=slack_query, slack_webhook=self.slack_webhook
                )

        return validation_operator_result
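The notify_on branch above relies on Python's operator precedence (`and` binds tighter than `or`). A minimal sketch with the grouping made explicit, using illustrative names only:

def should_notify(notify_on: str, success: bool) -> bool:
    # Equivalent grouping of the condition used before sending the Slack query.
    return (
        notify_on == "all"
        or (notify_on == "success" and success)
        or (notify_on == "failure" and not success)
    )

assert should_notify("all", False)
assert should_notify("success", True) and not should_notify("success", False)
assert should_notify("failure", False) and not should_notify("failure", True)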
Example No. 21
    def run(
        self,
        template_name: Optional[str] = None,
        run_name_template: Optional[str] = None,
        expectation_suite_name: Optional[str] = None,
        batch_request: Optional[Union[BatchRequestBase, dict]] = None,
        action_list: Optional[List[dict]] = None,
        evaluation_parameters: Optional[dict] = None,
        runtime_configuration: Optional[dict] = None,
        validations: Optional[List[dict]] = None,
        profilers: Optional[List[dict]] = None,
        run_id: Optional[Union[str, RunIdentifier]] = None,
        run_name: Optional[str] = None,
        run_time: Optional[Union[str, datetime.datetime]] = None,
        result_format: Optional[Union[str, dict]] = None,
        expectation_suite_ge_cloud_id: Optional[str] = None,
    ) -> CheckpointResult:
        assert not (run_id and run_name) and not (
            run_id and run_time
        ), "Please provide either a run_id or run_name and/or run_time."

        run_time = run_time or datetime.datetime.now()
        runtime_configuration = runtime_configuration or {}
        result_format = result_format or runtime_configuration.get(
            "result_format")

        batch_request = get_batch_request_as_dict(batch_request=batch_request)
        validations = get_validations_with_batch_request_as_dict(
            validations=validations)

        runtime_kwargs: dict = {
            "template_name": template_name,
            "run_name_template": run_name_template,
            "expectation_suite_name": expectation_suite_name,
            "batch_request": batch_request or {},
            "action_list": action_list or [],
            "evaluation_parameters": evaluation_parameters or {},
            "runtime_configuration": runtime_configuration or {},
            "validations": validations or [],
            "profilers": profilers or [],
            "expectation_suite_ge_cloud_id": expectation_suite_ge_cloud_id,
        }

        substituted_runtime_config: dict = self.get_substituted_config(
            runtime_kwargs=runtime_kwargs)

        run_name_template = substituted_runtime_config.get("run_name_template")

        batch_request = substituted_runtime_config.get("batch_request")
        validations = substituted_runtime_config.get("validations") or []

        if len(validations) == 0 and not batch_request:
            raise ge_exceptions.CheckpointError(
                f'Checkpoint "{self.name}" must contain either a batch_request or validations.'
            )

        if run_name is None and run_name_template is not None:
            run_name = get_datetime_string_from_strftime_format(
                format_str=run_name_template, datetime_obj=run_time)

        run_id = run_id or RunIdentifier(run_name=run_name, run_time=run_time)

        # Use AsyncExecutor to speed up I/O bound validations by running them in parallel with multithreading (if
        # concurrency is enabled in the data context configuration) -- please see the below arguments used to initialize
        # AsyncExecutor and the corresponding AsyncExecutor docstring for more details on when multiple threads are
        # used.
        with AsyncExecutor(self.data_context.concurrency,
                           max_workers=len(validations)) as async_executor:
            # noinspection PyUnresolvedReferences
            async_validation_operator_results: List[
                AsyncResult[ValidationOperatorResult]] = []
            if len(validations) > 0:
                for idx, validation_dict in enumerate(validations):
                    self._run_validation(
                        substituted_runtime_config=substituted_runtime_config,
                        async_validation_operator_results=async_validation_operator_results,
                        async_executor=async_executor,
                        result_format=result_format,
                        run_id=run_id,
                        idx=idx,
                        validation_dict=validation_dict,
                    )
            else:
                self._run_validation(
                    substituted_runtime_config=substituted_runtime_config,
                    async_validation_operator_results=async_validation_operator_results,
                    async_executor=async_executor,
                    result_format=result_format,
                    run_id=run_id,
                )

            run_results: dict = {}
            for async_validation_operator_result in async_validation_operator_results:
                run_results.update(
                    async_validation_operator_result.result().run_results)

        return CheckpointResult(
            run_id=run_id,
            run_results=run_results,
            checkpoint_config=self.config,
        )
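The comment above describes a fan-out/collect pattern for I/O-bound validations. A rough sketch of the same idea using the standard library's ThreadPoolExecutor (illustrative only, not the Great Expectations AsyncExecutor implementation):

from concurrent.futures import ThreadPoolExecutor

def run_validations_concurrently(validation_fns, max_workers):
    """Run callables that each return a run_results dict and merge the results."""
    run_results = {}
    # ThreadPoolExecutor requires at least one worker.
    with ThreadPoolExecutor(max_workers=max(max_workers, 1)) as executor:
        futures = [executor.submit(fn) for fn in validation_fns]
        for future in futures:
            # result() blocks until the validation finishes and re-raises worker errors.
            run_results.update(future.result())
    return run_results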
def test_run_identifier_parses_datetime_run_name():
    time = datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
    run_id = RunIdentifier(run_name=time)
    assert run_id.run_name == run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ")
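A short sketch of the behavior this test exercises, assuming RunIdentifier is imported as in the surrounding snippets: a run_name that parses as a datetime also becomes the run_time when none is supplied explicitly.

import datetime

run_id = RunIdentifier(run_name="20210101T000000.000000Z")
assert run_id.run_name == run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ")

# run_name and run_time may also be supplied independently.
named_run = RunIdentifier(
    run_name="nightly-load",
    run_time=datetime.datetime.now(datetime.timezone.utc),
)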
Example No. 23
    def run(
        self,
        checkpoint_name: str = None,
        ge_checkpoint: Checkpoint = None,
        checkpoint_kwargs: dict = None,
        context: ge.DataContext = None,
        assets_to_validate: list = None,
        batch_kwargs: dict = None,
        expectation_suite_name: str = None,
        context_root_dir: str = None,
        runtime_environment: Optional[dict] = None,
        run_name: str = None,
        run_info_at_end: bool = True,
        disable_markdown_artifact: bool = False,
        validation_operator: str = "action_list_operator",
        evaluation_parameters: Optional[dict] = None,
    ):
        """
        Task run method.

        Args:
            - checkpoint_name (str, optional): the name of a pre-configured checkpoint; should match the
                filename of the checkpoint without the extension. Either checkpoint_name or
                ge_checkpoint is required when using the Great Expectations v3 API.
            - ge_checkpoint (Checkpoint, optional): an in-memory GE `Checkpoint` object used to perform
                validation. If not provided then `checkpoint_name` will be used to load the specified
                checkpoint.
            - checkpoint_kwargs (Dict, optional): A dictionary whose keys match the parameters of
                `CheckpointConfig` which can be used to update and populate the task's Checkpoint at
                runtime.
            - context (DataContext, optional): an in-memory GE `DataContext` object. e.g.
                `ge.data_context.DataContext()` If not provided then `context_root_dir` will be used to
                look for one.
            - assets_to_validate (list, optional): A list of assets to validate when running the
                validation operator. Only used in the Great Expectations v2 API
            - batch_kwargs (dict, optional): a dictionary of batch kwargs to be used when validating
                assets. Only used in the Great Expectations v2 API
            - expectation_suite_name (str, optional): the name of an expectation suite to be used when
                validating assets. Only used in the Great Expectations v2 API
            - context_root_dir (str, optional): the absolute or relative path to the directory holding
                your `great_expectations.yml`
            - runtime_environment (dict, optional): a dictionary of great expectation config key-value
                pairs to overwrite your config in `great_expectations.yml`
            - run_name (str, optional): the name of this Great Expectations validation run; defaults to
                the task slug
            - run_info_at_end (bool, optional): add run info to the end of the artifact generated by this
                task. Defaults to `True`.
            - disable_markdown_artifact (bool, optional): toggle the posting of a markdown artifact from
                this task. Defaults to `False`.
            - evaluation_parameters (Optional[dict], optional): the evaluation parameters to use when
                running validation. For more information, see
                [example](https://docs.prefect.io/api/latest/tasks/great_expectations.html#rungreatexpectationsvalidation)
                and
                [docs](https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html).
            - validation_operator (str, optional): configure the actions to be executed after running
                validation. Defaults to `action_list_operator`.

        Raises:
            - 'signals.FAIL' if the validation was not a success

        Returns:
            - result
                ('great_expectations.validation_operators.types.validation_operator_result.ValidationOperatorResult'):
                The Great Expectations metadata returned from the validation if the v2 (batch_kwargs) API
                is used.

                ('great_expectations.checkpoint.checkpoint.CheckpointResult'):
                The Great Expectations metadata returned from running the provided checkpoint if a
                checkpoint name is provided.

        """

        if version.parse(ge.__version__) < version.parse("0.13.8"):
            self.logger.warning(
                f"You are using great_expectations version {ge.__version__} which may cause"
                "errors in this task. Please upgrade great_expections to 0.13.8 or later."
            )

        runtime_environment = runtime_environment or dict()
        checkpoint_kwargs = checkpoint_kwargs or dict()

        # Load context if not provided directly
        if not context:
            context = ge.DataContext(
                context_root_dir=context_root_dir,
                runtime_environment=runtime_environment,
            )

        # Check that the parameters are mutually exclusive
        if (sum(
                bool(x) for x in [
                    (expectation_suite_name and batch_kwargs),
                    assets_to_validate,
                    checkpoint_name,
                    ge_checkpoint,
                ]) != 1):
            raise ValueError(
                "Exactly one of expectation_suite_name + batch_kwargs, assets_to_validate, "
                "checkpoint_name, or ge_checkpoint is required to run validation."
            )

        results = None
        # If there is a checkpoint or checkpoint name provided, run the checkpoint.
        # Checkpoints are the preferred deployment of validation configuration.
        if ge_checkpoint or checkpoint_name:
            ge_checkpoint = ge_checkpoint or context.get_checkpoint(
                checkpoint_name)
            results = ge_checkpoint.run(
                evaluation_parameters=evaluation_parameters,
                run_id=RunIdentifier(run_name
                                     or prefect.context.get("task_slug")),
                **checkpoint_kwargs,
            )
        else:
            # If assets are not provided directly through `assets_to_validate`, they need to be
            #   loaded here by getting a batch from `batch_kwargs` and `expectation_suite_name`.
            if not assets_to_validate:
                assets_to_validate = [
                    context.get_batch(batch_kwargs, expectation_suite_name)
                ]

            # Run validation operator
            results = context.run_validation_operator(
                validation_operator,
                assets_to_validate=assets_to_validate,
                run_id=RunIdentifier(run_name
                                     or prefect.context.get("task_slug")),
                evaluation_parameters=evaluation_parameters,
            )

        # Generate artifact markdown
        if not disable_markdown_artifact:
            validation_results_page_renderer = (
                ge.render.renderer.ValidationResultsPageRenderer(
                    run_info_at_end=run_info_at_end))
            rendered_content_list = validation_results_page_renderer.render_validation_operator_result(
                # This also works with a CheckpointResult because of duck typing.
                # The passed in object needs a list_validation_results method that
                # returns a list of ExpectationSuiteValidationResult.
                validation_operator_result=results)
            markdown_artifact = " ".join(
                ge.render.view.DefaultMarkdownPageView().render(
                    rendered_content_list))

            create_markdown_artifact(markdown_artifact)

        if results.success is False:
            raise signals.FAIL(result=results)

        return results
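The docstring above matches Prefect's RunGreatExpectationsValidation task. Assuming that is the class this run method belongs to, a minimal flow might look like the following sketch (the checkpoint name and directory are placeholders):

from prefect import Flow
from prefect.tasks.great_expectations import RunGreatExpectationsValidation

validate = RunGreatExpectationsValidation()

with Flow("ge-validation") as flow:
    # v3 API path: run a pre-configured checkpoint by name.
    validate(
        checkpoint_name="my_checkpoint",        # placeholder checkpoint name
        context_root_dir="great_expectations",  # directory containing great_expectations.yml
    )

# flow.run() would execute the task and raise signals.FAIL if validation fails.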
Example No. 24
def test_StoreMetricsAction(
        basic_in_memory_data_context_for_validation_operator):
    action = StoreMetricsAction(
        data_context=basic_in_memory_data_context_for_validation_operator,
        requested_metrics={
            "*": [
                "statistics.evaluated_expectations",
                "statistics.successful_expectations",
            ]
        },
        target_store_name="metrics_store",
    )

    run_id = RunIdentifier(run_name="bar")

    validation_result = ExpectationSuiteValidationResult(
        success=False,
        meta={
            "expectation_suite_name": "foo",
            "run_id": run_id
        },
        statistics={
            "evaluated_expectations": 5,
            "successful_expectations": 3
        },
    )

    # Run the action and store our metrics
    action.run(
        validation_result,
        ValidationResultIdentifier.from_object(validation_result),
        data_asset=None,
    )

    validation_result = ExpectationSuiteValidationResult(
        success=False,
        meta={
            "expectation_suite_name": "foo.warning",
            "run_id": run_id
        },
        statistics={
            "evaluated_expectations": 8,
            "successful_expectations": 4
        },
    )

    action.run(
        validation_result,
        ValidationResultIdentifier.from_object(validation_result),
        data_asset=None,
    )

    assert (
        basic_in_memory_data_context_for_validation_operator.
        stores["metrics_store"].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo"),
                metric_name="statistics.evaluated_expectations",
                metric_kwargs_id=None,
            )) == 5)

    assert (
        basic_in_memory_data_context_for_validation_operator.
        stores["metrics_store"].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo"),
                metric_name="statistics.successful_expectations",
                metric_kwargs_id=None,
            )) == 3)

    assert (basic_in_memory_data_context_for_validation_operator.
            stores["metrics_store"].get(
                ValidationMetricIdentifier(
                    run_id=run_id,
                    data_asset_name=None,
                    expectation_suite_identifier=ExpectationSuiteIdentifier(
                        "foo.warning"),
                    metric_name="statistics.evaluated_expectations",
                    metric_kwargs_id=None,
                )) == 8)

    assert (basic_in_memory_data_context_for_validation_operator.
            stores["metrics_store"].get(
                ValidationMetricIdentifier(
                    run_id=run_id,
                    data_asset_name=None,
                    expectation_suite_identifier=ExpectationSuiteIdentifier(
                        "foo.warning"),
                    metric_name="statistics.successful_expectations",
                    metric_kwargs_id=None,
                )) == 4)
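The requested_metrics mapping in this test uses the "*" wildcard to apply to every suite; keying on a specific expectation suite name should scope the metrics to that suite alone. A hedged variant of the fixture above (the suite-scoped key is an assumption based on the wildcard behaviour shown, not verified here):

suite_scoped_action = StoreMetricsAction(
    data_context=basic_in_memory_data_context_for_validation_operator,
    requested_metrics={
        # Hypothetical suite-scoped configuration; "*" above applies to all suites.
        "foo.warning": ["statistics.successful_expectations"],
    },
    target_store_name="metrics_store",
)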
Example No. 25
def test_evaluation_parameter_store_methods(
    data_context_parameterized_expectation_suite, ):
    run_id = RunIdentifier(run_name="20191125T000000.000000Z")
    source_patient_data_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "source_patient_data.default",
            "run_id": run_id,
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_table_row_count_to_equal",
                    kwargs={
                        "value": 1024,
                    },
                ),
                success=True,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "observed_value": 1024,
                    "element_count": 1024,
                    "missing_percent": 0.0,
                    "missing_count": 0,
                },
            )
        ],
        success=True,
    )

    data_context_parameterized_expectation_suite.store_evaluation_parameters(
        source_patient_data_results)

    bound_parameters = data_context_parameterized_expectation_suite.evaluation_parameter_store.get_bind_params(
        run_id)
    assert bound_parameters == {
        "urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result"
        ".observed_value":
        1024
    }
    source_diabetes_data_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "source_diabetes_data.default",
            "run_id": run_id,
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_column_unique_value_count_to_be_between",
                    kwargs={
                        "column": "patient_nbr",
                        "min": 2048,
                        "max": 2048
                    },
                ),
                success=True,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "observed_value": 2048,
                    "element_count": 5000,
                    "missing_percent": 0.0,
                    "missing_count": 0,
                },
            )
        ],
        success=True,
    )

    data_context_parameterized_expectation_suite.store_evaluation_parameters(
        source_diabetes_data_results)
    bound_parameters = data_context_parameterized_expectation_suite.evaluation_parameter_store.get_bind_params(
        run_id)
    assert bound_parameters == {
        "urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result"
        ".observed_value":
        1024,
        "urn:great_expectations:validations:source_diabetes_data.default"
        ":expect_column_unique_value_count_to_be_between.result.observed_value:column=patient_nbr":
        2048,
    }
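As a hedged follow-on, the URNs asserted above are what downstream expectations can reference through the $PARAMETER syntax, letting one suite's observed values feed another suite's thresholds:

observed_row_count_urn = (
    "urn:great_expectations:validations:source_patient_data.default:"
    "expect_table_row_count_to_equal.result.observed_value"
)
# Illustrative usage on a hypothetical downstream batch:
# downstream_batch.expect_table_row_count_to_equal(
#     value={"$PARAMETER": observed_row_count_urn}
# )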
    def run(
        self,
        assets_to_validate,
        run_id=None,
        evaluation_parameters=None,
        run_name=None,
        run_time=None,
        result_format=None,
    ):
        assert not (run_id and run_name) and not (
            run_id and run_time
        ), "Please provide either a run_id or run_name and/or run_time."
        if isinstance(run_id, str) and not run_name:
            warnings.warn(
                "String run_ids will be deprecated in the future. Please provide a run_id of type "
                "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
                "and run_time (both optional). Instead of providing a run_id, you may also provide"
                "run_name and run_time separately.",
                DeprecationWarning,
            )
            try:
                run_time = parse(run_id)
            except (ParserError, TypeError):
                pass
            run_id = RunIdentifier(run_name=run_id, run_time=run_time)
        elif isinstance(run_id, dict):
            run_id = RunIdentifier(**run_id)
        elif not isinstance(run_id, RunIdentifier):
            run_id = RunIdentifier(run_name=run_name, run_time=run_time)

        run_results = {}

        for item in assets_to_validate:
            run_result_obj = {}
            batch = self._build_batch_from_item(item)
            expectation_suite_identifier = ExpectationSuiteIdentifier(
                expectation_suite_name=batch._expectation_suite.expectation_suite_name
            )
            validation_result_id = ValidationResultIdentifier(
                batch_identifier=batch.batch_id,
                expectation_suite_identifier=expectation_suite_identifier,
                run_id=run_id,
            )
            batch_validation_result = batch.validate(
                run_id=run_id,
                result_format=result_format if result_format else self.result_format,
                evaluation_parameters=evaluation_parameters,
            )
            run_result_obj["validation_result"] = batch_validation_result
            batch_actions_results = self._run_actions(
                batch,
                expectation_suite_identifier,
                batch._expectation_suite,
                batch_validation_result,
                run_id,
            )
            run_result_obj["actions_results"] = batch_actions_results
            run_results[validation_result_id] = run_result_obj

        return ValidationOperatorResult(
            run_id=run_id,
            run_results=run_results,
            validation_operator_config=self.validation_operator_config,
            evaluation_parameters=evaluation_parameters,
        )
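Given the normalization at the top of this run method, callers can identify a run in several forms; a sketch with placeholder names (operator and assets are assumed to exist):

# All of these end up as a RunIdentifier internally.
operator.run(assets_to_validate=assets, run_id=RunIdentifier(run_name="nightly"))
operator.run(assets_to_validate=assets, run_id={"run_name": "nightly", "run_time": None})
operator.run(assets_to_validate=assets, run_name="nightly")  # run_id built from run_name/run_time
operator.run(assets_to_validate=assets, run_id="20210101T000000.000000Z")  # deprecated string form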
    def resolve_config_using_acceptable_arguments(
        checkpoint: "Checkpoint",  # noqa: F821
        template_name: Optional[str] = None,
        run_name_template: Optional[str] = None,
        expectation_suite_name: Optional[str] = None,
        batch_request: Optional[Union[BatchRequest, RuntimeBatchRequest,
                                      dict]] = None,
        action_list: Optional[List[dict]] = None,
        evaluation_parameters: Optional[dict] = None,
        runtime_configuration: Optional[dict] = None,
        validations: Optional[List[dict]] = None,
        profilers: Optional[List[dict]] = None,
        run_id: Optional[Union[str, RunIdentifier]] = None,
        run_name: Optional[str] = None,
        run_time: Optional[Union[str, datetime.datetime]] = None,
        result_format: Optional[Union[str, dict]] = None,
        expectation_suite_ge_cloud_id: Optional[str] = None,
    ) -> dict:
        """
        This method reconciles the Checkpoint configuration (e.g., obtained from the Checkpoint store) with
        dynamically supplied arguments in order to obtain the Checkpoint specification that is ready for running
        validation. This procedure is necessitated by the fact that the Checkpoint configuration is hierarchical,
        a form established to make the specification of different Checkpoint capabilities easy. In particular,
        entities such as BatchRequest, expectation_suite_name, and action_list can be specified at the top
        Checkpoint level, with suitable overrides provided at lower levels (e.g., in the validations section).
        Reconciling and normalizing the Checkpoint configuration is essential for usage statistics, because the
        exact values of the entities in their formally validated form (e.g., BatchRequest) are the required level
        of detail.
        """
        assert not (run_id and run_name) and not (
            run_id and run_time
        ), "Please provide either a run_id or run_name and/or run_time."

        run_time = run_time or datetime.datetime.now()
        runtime_configuration = runtime_configuration or {}

        batch_request = get_batch_request_as_dict(batch_request=batch_request)
        validations = get_validations_with_batch_request_as_dict(
            validations=validations)

        runtime_kwargs: dict = {
            "template_name": template_name,
            "run_name_template": run_name_template,
            "expectation_suite_name": expectation_suite_name,
            "batch_request": batch_request,
            "action_list": action_list,
            "evaluation_parameters": evaluation_parameters,
            "runtime_configuration": runtime_configuration,
            "validations": validations,
            "profilers": profilers,
            "expectation_suite_ge_cloud_id": expectation_suite_ge_cloud_id,
        }
        substituted_runtime_config: dict = checkpoint.get_substituted_config(
            runtime_kwargs=runtime_kwargs)
        run_name_template = substituted_runtime_config.get("run_name_template")
        validations = substituted_runtime_config.get("validations") or []
        batch_request = substituted_runtime_config.get("batch_request")
        if len(validations) == 0 and not batch_request:
            raise ge_exceptions.CheckpointError(
                f'Checkpoint "{checkpoint.name}" must contain either a batch_request or validations.'
            )

        if run_name is None and run_name_template is not None:
            run_name = get_datetime_string_from_strftime_format(
                format_str=run_name_template, datetime_obj=run_time)

        run_id = run_id or RunIdentifier(run_name=run_name, run_time=run_time)

        validation_dict: dict

        for validation_dict in validations:
            substituted_validation_dict: dict = get_substituted_validation_dict(
                substituted_runtime_config=substituted_runtime_config,
                validation_dict=validation_dict,
            )
            validation_batch_request: Union[
                BatchRequest,
                RuntimeBatchRequest] = substituted_validation_dict.get(
                    "batch_request")
            validation_dict["batch_request"] = validation_batch_request
            validation_expectation_suite_name: str = substituted_validation_dict.get(
                "expectation_suite_name")
            validation_dict[
                "expectation_suite_name"] = validation_expectation_suite_name
            validation_expectation_suite_ge_cloud_id: str = (
                substituted_validation_dict.get(
                    "expectation_suite_ge_cloud_id"))
            validation_dict[
                "expectation_suite_ge_cloud_id"] = validation_expectation_suite_ge_cloud_id
            validation_action_list: list = substituted_validation_dict.get(
                "action_list")
            validation_dict["action_list"] = validation_action_list

        return substituted_runtime_config
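Conceptually, the reconciliation described in the docstring is a layered override: run-time keyword arguments win over the stored Checkpoint configuration, and each entry in validations can in turn override the already-substituted top-level values. A plain-dict sketch of that idea (not the actual implementation):

stored_config = {"expectation_suite_name": "users.default", "action_list": []}
runtime_kwargs = {"expectation_suite_name": "users.warning"}       # run-time override
validation_override = {"expectation_suite_name": "users.failure"}  # per-validation override

# Only non-empty run-time values shadow the stored configuration.
effective_top_level = {**stored_config, **{k: v for k, v in runtime_kwargs.items() if v}}
effective_for_validation = {**effective_top_level, **validation_override}
assert effective_for_validation["expectation_suite_name"] == "users.failure"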
def test_MicrosoftTeams_validation_results_with_datadocs():
    validation_result_suite = ExpectationSuiteValidationResult(
        results=[],
        success=True,
        statistics={
            "evaluated_expectations": 0,
            "successful_expectations": 0,
            "unsuccessful_expectations": 0,
            "success_percent": None,
        },
        meta={
            "great_expectations_version": "v0.8.0__develop",
            "expectation_suite_name": "asset.default",
            "run_id": "test_100",
        },
    )

    validation_result_suite_identifier = ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            "asset.default"),
        run_id=RunIdentifier(run_name="test_100",
                             run_time="Tue May 08 15:14:45 +0800 2012"),
        batch_identifier=BatchIdentifier(batch_identifier="1234",
                                         data_asset_name="asset"),
    )

    data_docs_pages = {"local_site": "file:///localsite/index.html"}

    rendered_output = MicrosoftTeamsRenderer().render(
        validation_result_suite, validation_result_suite_identifier,
        data_docs_pages)

    expected_output = {
        "attachments": [{
            "content": {
                "$schema":
                "http://adaptivecards.io/schemas/adaptive-card.json",
                "actions": [{
                    "title": "Open data docs",
                    "type": "Action.OpenUrl",
                    "url": "file:///localsite/index.html",
                }],
                "body": [
                    {
                        "height":
                        "auto",
                        "items": [{
                            "columns": [{
                                "items": [
                                    {
                                        "size": "large",
                                        "text": "Validation "
                                        "results",
                                        "type": "TextBlock",
                                        "weight": "bolder",
                                        "wrap": True,
                                    },
                                    {
                                        "isSubtle": True,
                                        "spacing": "none",
                                        "text": "May "
                                        "08 "
                                        "2012 "
                                        "07:14:45",
                                        "type": "TextBlock",
                                        "wrap": True,
                                    },
                                ],
                                "type":
                                "Column",
                                "width":
                                "stretch",
                            }],
                            "type":
                            "ColumnSet",
                        }],
                        "separator":
                        True,
                        "type":
                        "Container",
                    },
                    {
                        "height":
                        "auto",
                        "items": [
                            {
                                "color": "good",
                                "horizontalAlignment": "left",
                                "text": "**Batch validation "
                                "status:** Success "
                                "!!!",
                                "type": "TextBlock",
                            },
                            {
                                "horizontalAlignment": "left",
                                "text": "**Data asset "
                                "name:** asset",
                                "type": "TextBlock",
                            },
                            {
                                "horizontalAlignment": "left",
                                "text": "**Expectation "
                                "suite name:** "
                                "asset.default",
                                "type": "TextBlock",
                            },
                            {
                                "horizontalAlignment": "left",
                                "text": "**Run name:** "
                                "test_100",
                                "type": "TextBlock",
                            },
                            {
                                "horizontalAlignment": "left",
                                "text": "**Batch ID:** 1234",
                                "type": "TextBlock",
                            },
                            {
                                "horizontalAlignment":
                                "left",
                                "text":
                                "**Summary:** *0* "
                                "of *0* "
                                "expectations were "
                                "met",
                                "type":
                                "TextBlock",
                            },
                        ],
                        "separator":
                        True,
                        "type":
                        "Container",
                    },
                ],
                "type":
                "AdaptiveCard",
                "version":
                "1.0",
            },
            "contentType":
            "application/vnd.microsoft.card.adaptive",
        }],
        "type":
        "message",
    }

    assert rendered_output == expected_output
Example No. 29
    @classmethod
    def from_tuple(cls, tuple_):
        return cls(
            ExpectationSuiteIdentifier.from_tuple(tuple_[0:-3]),
            RunIdentifier.from_tuple((tuple_[-3], tuple_[-2])),
            tuple_[-1],
        )
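Based on the slicing above, the serialized tuple is the dot-delimited suite-name parts followed by run_name, run_time, and the batch identifier. A hedged illustration with placeholder values:

key = ValidationResultIdentifier.from_tuple(
    ("asset", "default", "test_run", "20210101T000000.000000Z", "batch-1234")
)
# ("asset", "default")                     -> ExpectationSuiteIdentifier("asset.default")
# ("test_run", "20210101T000000.000000Z")  -> RunIdentifier
# "batch-1234"                             -> batch identifier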