Example #1
0
    def run(
        self,
        template_name: Optional[str] = None,
        run_name_template: Optional[str] = None,
        expectation_suite_name: Optional[str] = None,
        batch_request: Optional[Union[BatchRequestBase, dict]] = None,
        action_list: Optional[List[dict]] = None,
        evaluation_parameters: Optional[dict] = None,
        runtime_configuration: Optional[dict] = None,
        validations: Optional[List[dict]] = None,
        profilers: Optional[List[dict]] = None,
        run_id: Optional[Union[str, RunIdentifier]] = None,
        run_name: Optional[str] = None,
        run_time: Optional[Union[str, datetime.datetime]] = None,
        result_format: Optional[Union[str, dict]] = None,
        expectation_suite_ge_cloud_id: Optional[str] = None,
    ) -> CheckpointResult:
        """Run the checkpoint's validations and gather their results.

        The call-time arguments are bundled into a ``runtime_kwargs`` dict and
        handed to ``self.get_substituted_config``; the returned config supplies
        the effective ``run_name_template``, ``batch_request`` and
        ``validations``. Each validation (or a single implicit one when only a
        batch request is present) is dispatched through
        ``self._run_validation`` on an ``AsyncExecutor``.

        Args:
            run_id: Identifier of the run; mutually exclusive with
                ``run_name`` and ``run_time``.
            run_name: Name of the run. When ``None`` and a run name template
                is configured, the name is rendered from ``run_time``.
            run_time: Time of the run; defaults to ``datetime.datetime.now()``.
            result_format: Falls back to ``runtime_configuration["result_format"]``
                when not given.
            (remaining arguments are forwarded unchanged, with empty
            containers substituted for missing collection arguments, to
            ``self.get_substituted_config``)

        Returns:
            A ``CheckpointResult`` holding the run id, the merged
            ``run_results`` of every validation operator result, and
            ``self.config``.

        Raises:
            AssertionError: If ``run_id`` is combined with ``run_name`` or
                ``run_time``.
            ge_exceptions.CheckpointError: If the substituted config has
                neither validations nor a batch request.
        """
        # A run_id already pins the run, so it cannot be mixed with the
        # pieces (run_name / run_time) it would otherwise be built from.
        assert not (
            run_id and (run_name or run_time)
        ), "Please provide either a run_id or run_name and/or run_time."

        if not run_time:
            run_time = datetime.datetime.now()
        if not runtime_configuration:
            runtime_configuration = {}
        if not result_format:
            result_format = runtime_configuration.get("result_format")

        batch_request_dict = get_batch_request_as_dict(
            batch_request=batch_request)
        validation_dicts = get_validations_with_batch_request_as_dict(
            validations=validations)

        runtime_kwargs: dict = dict(
            template_name=template_name,
            run_name_template=run_name_template,
            expectation_suite_name=expectation_suite_name,
            batch_request=batch_request_dict or {},
            action_list=action_list or [],
            evaluation_parameters=evaluation_parameters or {},
            runtime_configuration=runtime_configuration,
            validations=validation_dicts or [],
            profilers=profilers or [],
            expectation_suite_ge_cloud_id=expectation_suite_ge_cloud_id,
        )

        substituted_runtime_config: dict = self.get_substituted_config(
            runtime_kwargs=runtime_kwargs)

        # From here on only the substituted values are authoritative.
        effective_name_template = substituted_runtime_config.get(
            "run_name_template")
        effective_batch_request = substituted_runtime_config.get(
            "batch_request")
        effective_validations = (
            substituted_runtime_config.get("validations") or [])
        validation_count = len(effective_validations)

        if validation_count == 0 and not effective_batch_request:
            raise ge_exceptions.CheckpointError(
                f'Checkpoint "{self.name}" must contain either a batch_request or validations.'
            )

        if run_name is None and effective_name_template is not None:
            run_name = get_datetime_string_from_strftime_format(
                format_str=effective_name_template, datetime_obj=run_time)

        if not run_id:
            run_id = RunIdentifier(run_name=run_name, run_time=run_time)

        # AsyncExecutor may run I/O bound validations in parallel threads when
        # concurrency is enabled in the data context configuration; see the
        # AsyncExecutor docstring for when multiple threads are actually used.
        with AsyncExecutor(self.data_context.concurrency,
                           max_workers=validation_count) as async_executor:
            # noinspection PyUnresolvedReferences
            pending_results: List[AsyncResult[ValidationOperatorResult]] = []

            # Keyword arguments common to every _run_validation call.
            shared_kwargs = dict(
                substituted_runtime_config=substituted_runtime_config,
                async_validation_operator_results=pending_results,
                async_executor=async_executor,
                result_format=result_format,
                run_id=run_id,
            )

            if validation_count > 0:
                for idx, validation_dict in enumerate(effective_validations):
                    self._run_validation(
                        idx=idx,
                        validation_dict=validation_dict,
                        **shared_kwargs,
                    )
            else:
                # No explicit validations: run once with the defaults.
                self._run_validation(**shared_kwargs)

            run_results: dict = {}
            for pending in pending_results:
                run_results.update(pending.result().run_results)

        return CheckpointResult(
            run_id=run_id,
            run_results=run_results,
            checkpoint_config=self.config,
        )
    def run(
        self,
        assets_to_validate,
        run_id=None,
        evaluation_parameters=None,
        run_name=None,
        run_time=None,
        catch_exceptions=None,
        result_format=None,
        checkpoint_identifier=None,
    ):
        """Validate every asset and run the configured actions on each result.

        Each item of ``assets_to_validate`` is turned into a batch with
        ``self._build_batch_from_item`` and its ``validate`` call is submitted
        to an ``AsyncExecutor``. Once all validations are submitted, the
        results are collected in submission order and passed to
        ``self._run_actions``.

        Args:
            assets_to_validate: Sized iterable of items to build batches from.
            run_id: A ``RunIdentifier``, a dict of ``RunIdentifier`` keyword
                arguments, or (deprecated) a string used as the run name.
                Mutually exclusive with ``run_name`` and ``run_time``.
            evaluation_parameters: Forwarded to ``batch.validate`` and to the
                returned result.
            run_name: Run name used when no ``run_id`` is given.
            run_time: Run time used when no ``run_id`` is given.
            catch_exceptions: Forwarded to ``batch.validate`` only when not
                ``None``.
            result_format: Forwarded to ``batch.validate``; defaults to
                ``self.result_format``.
            checkpoint_identifier: Forwarded to ``self._run_actions``.

        Returns:
            A ``ValidationOperatorResult`` whose ``run_results`` maps each
            validation result identifier to a dict with the keys
            ``"validation_result"`` and ``"actions_results"``.

        Raises:
            AssertionError: If ``run_id`` is combined with ``run_name`` or
                ``run_time``.
        """
        assert not (run_id and run_name) and not (
            run_id and run_time
        ), "Please provide either a run_id or run_name and/or run_time."
        if isinstance(run_id, str) and not run_name:
            warnings.warn(
                "String run_ids will be deprecated in the future. Please provide a run_id of type "
                "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
                "and run_time (both optional). Instead of providing a run_id, you may also provide "
                "run_name and run_time separately.",
                DeprecationWarning,
            )
            # Best effort: a string run_id that happens to be a timestamp also
            # supplies the run time; any other string is used as the name only.
            try:
                run_time = parse(run_id)
            except (ValueError, TypeError):
                pass
            run_id = RunIdentifier(run_name=run_id, run_time=run_time)
        elif isinstance(run_id, dict):
            run_id = RunIdentifier(**run_id)
        elif not isinstance(run_id, RunIdentifier):
            run_id = RunIdentifier(run_name=run_name, run_time=run_time)

        ###
        # NOTE: 20211010 - jdimatteo: This method is called by both Checkpoint.run and LegacyCheckpoint.run and below
        # usage of AsyncExecutor may speed up I/O bound validations by running them in parallel with multithreading
        # (if concurrency is enabled in the data context configuration).
        #
        # When this method is called by LegacyCheckpoint.run, len(assets_to_validate) may be greater than 1. If
        # concurrency is enabled in the configuration AND len(assets_to_validate) > 1, then execution is run in multiple
        # threads with AsyncExecutor -- otherwise AsyncExecutor only uses the current single thread to execute the work.
        # Please see the below arguments used to initialize AsyncExecutor and the corresponding AsyncExecutor docstring
        # for more details on when multiple threads are used.
        #
        # When this method is called by Checkpoint.run, len(assets_to_validate) may be 1 even if there are multiple
        # validations, because Checkpoint.run calls this method in a loop for each validation. AsyncExecutor is also
        # used in the Checkpoint.run loop to optionally run each validation in parallel with multithreading, so this
        # method's AsyncExecutor is nested within the Checkpoint.run AsyncExecutor. The AsyncExecutor logic to only use
        # multithreading when max_workers > 1 ensures that no nested multithreading is ever used when
        # len(assets_to_validate) is equal to 1. So no unnecessary multithreading is ever used here even though it may
        # be nested inside another AsyncExecutor (and this is a good thing because it avoids extra overhead associated
        # with each thread and minimizes the total number of threads to simplify debugging).
        with AsyncExecutor(
                self.data_context.concurrency,
                max_workers=len(assets_to_validate)) as async_executor:
            # One (batch, batch_identifier, async_result) entry per asset. The
            # identifier is stored alongside its batch so the collection loop
            # below labels every result with its own batch rather than with
            # whichever batch happened to be submitted last.
            submitted_validations = []
            for item in assets_to_validate:
                batch = self._build_batch_from_item(item)

                if hasattr(batch, "active_batch_id"):
                    batch_identifier = batch.active_batch_id
                else:
                    batch_identifier = batch.batch_id

                if result_format is None:
                    result_format = self.result_format

                batch_validate_arguments = {
                    "run_id": run_id,
                    "result_format": result_format,
                    "evaluation_parameters": evaluation_parameters,
                }

                # Only forward catch_exceptions when the caller set it, so
                # batch.validate keeps its own default otherwise.
                if catch_exceptions is not None:
                    batch_validate_arguments[
                        "catch_exceptions"] = catch_exceptions

                submitted_validations.append((
                    batch,
                    batch_identifier,
                    async_executor.submit(
                        batch.validate,
                        **batch_validate_arguments,
                    ),
                ))

            run_results = {}
            for (batch, batch_identifier,
                 async_batch_validation_result) in submitted_validations:
                if self.data_context.ge_cloud_mode:
                    expectation_suite_identifier = GeCloudIdentifier(
                        resource_type="expectation_suite",
                        ge_cloud_id=batch._expectation_suite.ge_cloud_id,
                    )
                    validation_result_id = GeCloudIdentifier(
                        resource_type="suite_validation_result")
                else:
                    expectation_suite_identifier = ExpectationSuiteIdentifier(
                        expectation_suite_name=batch._expectation_suite.
                        expectation_suite_name)
                    validation_result_id = ValidationResultIdentifier(
                        batch_identifier=batch_identifier,
                        expectation_suite_identifier=
                        expectation_suite_identifier,
                        run_id=run_id,
                    )

                # Wait for this validation once and reuse the value for both
                # the actions and the stored run result.
                batch_validation_result = (
                    async_batch_validation_result.result())

                batch_actions_results = self._run_actions(
                    batch=batch,
                    expectation_suite_identifier=expectation_suite_identifier,
                    expectation_suite=batch._expectation_suite,
                    batch_validation_result=batch_validation_result,
                    run_id=run_id,
                    validation_result_id=validation_result_id,
                    checkpoint_identifier=checkpoint_identifier,
                )

                run_results[validation_result_id] = {
                    "validation_result": batch_validation_result,
                    "actions_results": batch_actions_results,
                }

        return ValidationOperatorResult(
            run_id=run_id,
            run_results=run_results,
            validation_operator_config=self.validation_operator_config,
            evaluation_parameters=evaluation_parameters,
        )
Example #3
0
    def _run_validation(
        self,
        substituted_runtime_config: dict,
        async_validation_operator_results: List[AsyncResult],
        async_executor: AsyncExecutor,
        result_format: Optional[dict],
        run_id: Optional[Union[str, RunIdentifier]],
        idx: Optional[int] = 0,
        validation_dict: Optional[dict] = None,
    ) -> None:
        """Submit one validation of this checkpoint to the executor.

        Resolves the validation's settings with
        ``get_substituted_validation_dict``, obtains a validator from the data
        context, builds an ``ActionListValidationOperator`` and submits its
        ``run`` to ``async_executor``. The pending result is appended to
        ``async_validation_operator_results``; nothing is returned.

        Args:
            substituted_runtime_config: Checkpoint config after substitution.
            async_validation_operator_results: List that receives the pending
                result of the submitted operator run (mutated in place).
            async_executor: Executor used to submit the operator run.
            result_format: Call-level result format. It takes precedence over
                the validation's own ``runtime_configuration`` value; when
                neither is set, ``{"result_format": "SUMMARY"}`` is used.
            run_id: Run identifier forwarded to the operator run.
            idx: Position of the validation, used in the operator name and in
                error messages.
            validation_dict: The validation entry; an empty dict when ``None``.

        Raises:
            ge_exceptions.CheckpointError: Wraps any ``CheckpointError``,
                ``ExecutionEngineError`` or ``MetricError`` raised while
                preparing or submitting the validation; the original exception
                is attached as the cause.
        """
        if validation_dict is None:
            validation_dict = {}

        try:
            substituted_validation_dict: dict = get_substituted_validation_dict(
                substituted_runtime_config=substituted_runtime_config,
                validation_dict=validation_dict,
            )
            batch_request: Union[
                BatchRequest,
                RuntimeBatchRequest] = substituted_validation_dict.get(
                    "batch_request")
            expectation_suite_name: str = substituted_validation_dict.get(
                "expectation_suite_name")
            expectation_suite_ge_cloud_id: str = substituted_validation_dict.get(
                "expectation_suite_ge_cloud_id")
            include_rendered_content: bool = substituted_validation_dict.get(
                "include_rendered_content", False)

            # The suite is addressed by cloud id in cloud mode and by name
            # otherwise; the unused selector is passed as None.
            validator: Validator = self.data_context.get_validator(
                batch_request=batch_request,
                expectation_suite_name=(expectation_suite_name
                                        if not self.data_context.ge_cloud_mode
                                        else None),
                expectation_suite_ge_cloud_id=(expectation_suite_ge_cloud_id if
                                               self.data_context.ge_cloud_mode
                                               else None),
                include_rendered_content=include_rendered_content,
            )

            action_list: list = substituted_validation_dict.get("action_list")
            # "or {}" also covers a key that is present but set to None, which
            # a plain .get(key, {}) default would not.
            runtime_configuration_validation = (
                substituted_validation_dict.get("runtime_configuration")
                or {})
            catch_exceptions_validation = runtime_configuration_validation.get(
                "catch_exceptions")
            result_format_validation = runtime_configuration_validation.get(
                "result_format")
            result_format = result_format or result_format_validation

            if result_format is None:
                result_format = {"result_format": "SUMMARY"}

            action_list_validation_operator: ActionListValidationOperator = (
                ActionListValidationOperator(
                    data_context=self.data_context,
                    action_list=action_list,
                    result_format=result_format,
                    name=f"{self.name}-checkpoint-validation[{idx}]",
                ))
            checkpoint_identifier = None
            if self.data_context.ge_cloud_mode:
                checkpoint_identifier = GeCloudIdentifier(
                    resource_type=GeCloudRESTResource.CONTRACT,
                    ge_cloud_id=str(self.ge_cloud_id),
                )

            operator_run_kwargs = {}

            # Only forward catch_exceptions when the validation configured it.
            if catch_exceptions_validation is not None:
                operator_run_kwargs[
                    "catch_exceptions"] = catch_exceptions_validation

            # NOTE(review): this relies on the operator's run() accepting a
            # checkpoint_name keyword -- confirm against its signature.
            async_validation_operator_results.append(
                async_executor.submit(
                    action_list_validation_operator.run,
                    assets_to_validate=[validator],
                    run_id=run_id,
                    evaluation_parameters=substituted_validation_dict.get(
                        "evaluation_parameters"),
                    result_format=result_format,
                    checkpoint_identifier=checkpoint_identifier,
                    checkpoint_name=self.name,
                    **operator_run_kwargs,
                ))
        except (
                ge_exceptions.CheckpointError,
                ge_exceptions.ExecutionEngineError,
                ge_exceptions.MetricError,
        ) as e:
            # Chain explicitly so the original failure is reported as the
            # direct cause of the checkpoint error.
            raise ge_exceptions.CheckpointError(
                f"Exception occurred while running validation[{idx}] of Checkpoint '{self.name}': {e.message}."
            ) from e
Example #4
0
def test_async_executor_does_execute_concurrently_when_concurrency_enabled_with_multiple_max_workers():
    """With concurrency enabled and max_workers=100, the executor reports concurrent execution."""
    enabled_config = ConcurrencyConfig(enabled=True)
    with AsyncExecutor(enabled_config, max_workers=100) as async_executor:
        assert async_executor.execute_concurrently