def load_batch(context: DataContext, suite: Union[str, ExpectationSuite], batch_kwargs: Union[dict, BatchKwargs]) -> DataAsset:
    """Fetch a batch of data from *context* and sanity-check the result.

    Args:
        context: the project DataContext to load the batch from.
        suite: an expectation suite (or its name) to attach to the batch.
        batch_kwargs: the batch specification the context understands.

    Returns:
        The loaded batch as a DataAsset.

    Raises:
        AssertionError: if the context returned something other than a DataAsset.
    """
    loaded_batch: DataAsset = context.get_batch(batch_kwargs, suite)
    assert isinstance(
        loaded_batch, DataAsset
    ), "Batch failed to load. Please check your batch_kwargs"
    return loaded_batch
def validation_operator_run(name, run_name, validation_config_file, suite, directory):
    # Note though the long lines here aren't pythonic, they look best if Click does the line wraps.
    """
    Run a validation operator against some data.

    There are two modes to run this command:

    1. Interactive (good for development): Specify the name of the validation operator using the --name argument and the name of the expectation suite using the --suite argument.

    The cli will help you specify the batch of data that you want to validate interactively.

    2. Non-interactive (good for production): Use the `--validation_config_file` argument to specify the path of the validation configuration JSON file. This file can be used to instruct a validation operator to validate multiple batches of data and use multiple expectation suites to validate each batch.

    Learn how to create a validation config file here: https://great-expectations.readthedocs.io/en/latest/command_line.html#great-expectations-validation-operator-run-validation-config-file-validation-config-file-path

    This command exits with 0 if the validation operator ran and the "success" attribute in its return object is True. Otherwise, the command exits with 1.

    To learn more about validation operators, go here: https://great-expectations.readthedocs.io/en/latest/features/validation.html#validation-operators
    """
    # A usable DataContext is a precondition for everything below, including
    # sending usage telemetry, so bail out before the telemetry-wrapped try.
    try:
        context = DataContext(directory)
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message("Failed to process <red>{}</red>".format(err.message))
        sys.exit(1)

    # Everything below is wrapped so that any unexpected failure still emits a
    # usage event (see the final `except Exception` handler).
    try:
        if validation_config_file is not None:
            # --- Non-interactive mode: load the validation config from a JSON file.
            try:
                with open(validation_config_file) as f:
                    validation_config = json.load(f)
            except (OSError, json_parse_exception) as e:
                cli_message(
                    f"Failed to process the --validation_config_file argument: <red>{e}</red>"
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            # NOTE(review): the helper's name really is misspelled
            # ("valdiation") — it must match the definition elsewhere in this file.
            validation_config_error_message = _validate_valdiation_config(
                validation_config)
            if validation_config_error_message is not None:
                cli_message(
                    "<red>The validation config in {:s} is misconfigured: {:s}</red>"
                    .format(validation_config_file, validation_config_error_message))
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)
        else:
            # --- Interactive mode: build an equivalent validation_config from
            # the --suite / --name arguments plus interactive batch selection.
            if suite is None:
                cli_message("""
Please use --suite argument to specify the name of the expectation suite.
Call `great_expectation suite list` command to list the expectation suites in your project.
""")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                # NOTE(review): this path exits 0 even though the command could
                # not run — confirm whether a non-zero code was intended.
                sys.exit(0)
            suite = toolkit.load_expectation_suite(
                context, suite, "cli.validation_operator.run")

            if name is None:
                cli_message("""
Please use --name argument to specify the name of the validation operator.
Call `great_expectation validation-operator list` command to list the operators in your project.
""")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)
            else:
                # Validate the operator name against the context's configuration.
                if name not in context.list_validation_operator_names():
                    cli_message(f"""
Could not find a validation operator {name}.
Call `great_expectation validation-operator list` command to list the operators in your project.
""")
                    send_usage_message(
                        data_context=context,
                        event="cli.validation_operator.run",
                        success=False,
                    )
                    sys.exit(1)

            batch_kwargs = None

            cli_message("""
Let us help you specify the batch of data your want the validation operator to validate.""")

            try:
                data_source = toolkit.select_datasource(context)
            except ValueError as ve:
                cli_message("<red>{}</red>".format(ve))
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            if not data_source:
                cli_message("<red>No datasources found in the context.</red>")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            # batch_kwargs is always None here; interactively assemble them.
            if batch_kwargs is None:
                (
                    datasource_name,
                    batch_kwargs_generator,
                    data_asset,
                    batch_kwargs,
                ) = get_batch_kwargs(
                    context,
                    datasource_name=data_source.name,
                    batch_kwargs_generator_name=None,
                    data_asset_name=None,
                    additional_batch_kwargs=None,
                )

            # Mirror the shape of a --validation_config_file payload so the
            # execution code below can treat both modes identically.
            validation_config = {
                "validation_operator_name": name,
                "batches": [{
                    "batch_kwargs": batch_kwargs,
                    "expectation_suite_names": [suite.expectation_suite_name],
                }],
            }

        try:
            validation_operator_name = validation_config[
                "validation_operator_name"]
            batches_to_validate = []
            # Load every (batch_kwargs, suite) combination listed in the config.
            for entry in validation_config["batches"]:
                for expectation_suite_name in entry["expectation_suite_names"]:
                    batch = context.get_batch(entry["batch_kwargs"],
                                              expectation_suite_name)
                    batches_to_validate.append(batch)

            # Default run name: UTC timestamp, e.g. 20200131T235959.123456Z.
            if run_name is None:
                run_name = datetime.datetime.now(
                    datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")

            run_id = RunIdentifier(run_name=run_name)

            # `suite` is only populated in interactive mode; in file mode we run
            # the operator without suite-level evaluation parameters.
            if suite is None:
                results = context.run_validation_operator(
                    validation_operator_name,
                    assets_to_validate=batches_to_validate,
                    run_id=run_id,
                )
            else:
                if suite.evaluation_parameters is None:
                    results = context.run_validation_operator(
                        validation_operator_name,
                        assets_to_validate=batches_to_validate,
                        run_id=run_id,
                    )
                else:
                    results = context.run_validation_operator(
                        validation_operator_name,
                        assets_to_validate=batches_to_validate,
                        run_id=run_id,
                        evaluation_parameters=suite.evaluation_parameters,
                    )
        except (ge_exceptions.DataContextError, OSError, SQLAlchemyError) as e:
            cli_message("<red>{}</red>".format(e))
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=False)
            sys.exit(1)

        if not results["success"]:
            cli_message("Validation failed!")
            # success=True here means the *command* ran successfully, even
            # though the validation itself failed.
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=True)
            sys.exit(1)
        else:
            cli_message("Validation succeeded!")
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=True)
            sys.exit(0)
    except Exception as e:
        # Record the failure in usage stats, then let the error propagate.
        send_usage_message(data_context=context,
                           event="cli.validation_operator.run",
                           success=False)
        raise e
saves validation results to your results store and then updates Data Docs. This makes viewing validation results easy for you and your team. Usage: - Run this file: `python {0}`. - This can be run manually or via a scheduler such as cron. - If your pipeline runner supports python snippets you can paste this into your pipeline. """ import sys from great_expectations import DataContext # checkpoint configuration context = DataContext("{1}") suite = context.get_expectation_suite("{2}") # You can modify your BatchKwargs to select different data batch_kwargs = {3} # checkpoint validation process batch = context.get_batch(batch_kwargs, suite) results = context.run_validation_operator("action_list_operator", [batch]) if not results["success"]: print("Validation Failed!") sys.exit(1) print("Validation Succeeded!") sys.exit(0)
def run(
    self,
    checkpoint_name: str = None,
    ge_checkpoint: Checkpoint = None,
    checkpoint_kwargs: dict = None,
    context: ge.DataContext = None,
    assets_to_validate: list = None,
    batch_kwargs: dict = None,
    expectation_suite_name: str = None,
    context_root_dir: str = None,
    runtime_environment: Optional[dict] = None,
    run_name: str = None,
    run_info_at_end: bool = True,
    disable_markdown_artifact: bool = False,
    validation_operator: str = "action_list_operator",
    evaluation_parameters: Optional[dict] = None,
):
    """
    Task run method.

    Args:
        - checkpoint_name (str, optional): the name of a pre-configured checkpoint; should match the
            filename of the checkpoint without the extension. Either checkpoint_name or
            checkpoint_config is required when using the Great Expectations v3 API.
        - ge_checkpoint (Checkpoint, optional): an in-memory GE `Checkpoint` object used to perform
            validation. If not provided then `checkpoint_name` will be used to load the specified
            checkpoint.
        - checkpoint_kwargs (Dict, optional): A dictionary whose keys match the parameters of
            `CheckpointConfig` which can be used to update and populate the task's Checkpoint at
            runtime.
        - context (DataContext, optional): an in-memory GE `DataContext` object. e.g.
            `ge.data_context.DataContext()` If not provided then `context_root_dir` will be used to
            look for one.
        - assets_to_validate (list, optional): A list of assets to validate when running the
            validation operator. Only used in the Great Expectations v2 API
        - batch_kwargs (dict, optional): a dictionary of batch kwargs to be used when validating
            assets. Only used in the Great Expectations v2 API
        - expectation_suite_name (str, optional): the name of an expectation suite to be used when
            validating assets. Only used in the Great Expectations v2 API
        - context_root_dir (str, optional): the absolute or relative path to the directory holding
            your `great_expectations.yml`
        - runtime_environment (dict, optional): a dictionary of great expectation config key-value
            pairs to overwrite your config in `great_expectations.yml`
        - run_name (str, optional): the name of this Great Expectation validation run; defaults to
            the task slug
        - run_info_at_end (bool, optional): add run info to the end of the artifact generated by this
            task. Defaults to `True`.
        - disable_markdown_artifact (bool, optional): toggle the posting of a markdown artifact from
            this tasks. Defaults to `False`.
        - evaluation_parameters (Optional[dict], optional): the evaluation parameters to use when
            running validation. For more information, see
            [example](https://docs.prefect.io/api/latest/tasks/great_expectations.html#rungreatexpectationsvalidation)
            and
            [docs](https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html).
        - validation_operator (str, optional): configure the actions to be executed after running
            validation. Defaults to `action_list_operator`.

    Raises:
        - 'signals.FAIL' if the validation was not a success

    Returns:
        - result
            ('great_expectations.validation_operators.types.validation_operator_result.ValidationOperatorResult'):
            The Great Expectations metadata returned from the validation if the v2 (batch_kwargs)
            API is used.

            ('great_expectations.checkpoint.checkpoint.CheckpointResult'):
            The Great Expectations metadata returned from running the provided checkpoint if a
            checkpoint name is provided.
    """
    if version.parse(ge.__version__) < version.parse("0.13.8"):
        # Fixed: the original implicit string concatenation read "may causeerrors"
        # (missing space) and misspelled "great_expections".
        self.logger.warning(
            f"You are using great_expectations version {ge.__version__} which may cause "
            "errors in this task. Please upgrade great_expectations to 0.13.8 or later."
        )

    runtime_environment = runtime_environment or dict()
    checkpoint_kwargs = checkpoint_kwargs or dict()

    # Load context if not provided directly
    if not context:
        context = ge.DataContext(
            context_root_dir=context_root_dir,
            runtime_environment=runtime_environment,
        )

    # Check that the parameters are mutually exclusive: exactly one way of
    # specifying what to validate must have been supplied.
    if (sum(
            bool(x) for x in [
                (expectation_suite_name and batch_kwargs),
                assets_to_validate,
                checkpoint_name,
                ge_checkpoint,
            ]) != 1):
        raise ValueError(
            "Exactly one of expectation_suite_name + batch_kwargs, assets_to_validate, "
            "checkpoint_name, or ge_checkpoint is required to run validation."
        )

    results = None
    # If there is a checkpoint or checkpoint name provided, run the checkpoint.
    # Checkpoints are the preferred deployment of validation configuration.
    if ge_checkpoint or checkpoint_name:
        ge_checkpoint = ge_checkpoint or context.get_checkpoint(
            checkpoint_name)
        results = ge_checkpoint.run(
            evaluation_parameters=evaluation_parameters,
            # Default the run name to this task's slug.
            run_id={
                "run_name": run_name or prefect.context.get("task_slug")
            },
            **checkpoint_kwargs,
        )
    else:
        # If assets are not provided directly through `assets_to_validate` then they need be loaded
        # get batch from `batch_kwargs` and `expectation_suite_name`
        if not assets_to_validate:
            assets_to_validate = [
                context.get_batch(batch_kwargs, expectation_suite_name)
            ]

        # Run validation operator (Great Expectations v2 API path).
        results = context.run_validation_operator(
            validation_operator,
            assets_to_validate=assets_to_validate,
            run_id={
                "run_name": run_name or prefect.context.get("task_slug")
            },
            evaluation_parameters=evaluation_parameters,
        )

    # Generate artifact markdown
    if not disable_markdown_artifact:
        validation_results_page_renderer = (
            ge.render.renderer.ValidationResultsPageRenderer(
                run_info_at_end=run_info_at_end))
        rendered_content_list = validation_results_page_renderer.render_validation_operator_result(
            # This also works with a CheckpointResult because of duck typing.
            # The passed in object needs a list_validation_results method that
            # returns a list of ExpectationSuiteValidationResult.
            validation_operator_result=results)
        markdown_artifact = " ".join(
            ge.render.view.DefaultMarkdownPageView().render(
                rendered_content_list))

        create_markdown_artifact(markdown_artifact)

    if results.success is False:
        raise signals.FAIL(result=results)

    return results
def suite_edit(suite, datasource, directory, jupyter, batch_kwargs):
    """
    Generate a Jupyter notebook for editing an existing expectation suite.

    The SUITE argument is required. This is the name you gave to the suite
    when you created it.

    A batch of data is required to edit the suite, which is used as a sample.

    The edit command will help you specify a batch interactively. Or you can
    specify them manually by providing --batch-kwargs in valid JSON format.

    Read more about specifying batches of data in the documentation: https://docs.greatexpectations.io/
    """
    try:
        context = DataContext(directory)
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message("<red>{}</red>".format(err.message))
        return
    except ge_exceptions.ZeroDotSevenConfigVersionError as err:
        # NOTE(review): `context` is never bound when DataContext(directory)
        # raises, so `context.root_directory` below looks like a NameError —
        # confirm and fix (perhaps derive the root from `directory` instead).
        _offer_to_install_new_template(err, context.root_directory)
        return

    suite = _load_suite(context, suite)

    if batch_kwargs:
        try:
            # --batch-kwargs arrives as a JSON string on the command line.
            batch_kwargs = json.loads(batch_kwargs)
            if datasource:
                batch_kwargs["datasource"] = datasource
            # Smoke-test that the kwargs actually load a batch before we
            # render a notebook around them.
            _batch = context.get_batch(batch_kwargs, suite.expectation_suite_name)
            assert isinstance(_batch, DataAsset)
        except json_parse_exception as je:
            cli_message("<red>Please check that your batch_kwargs are valid JSON.\n{}</red>".format(je))
            sys.exit(1)
        except ge_exceptions.DataContextError:
            cli_message("<red>Please check that your batch_kwargs are able to load a batch.</red>")
            sys.exit(1)
        except ValueError as ve:
            cli_message("<red>Please check that your batch_kwargs are able to load a batch.\n{}</red>".format(ve))
            sys.exit(1)
    else:
        # No kwargs supplied: walk the user through choosing a batch.
        cli_message("""
A batch of data is required to edit the suite - let's help you to specify it."""
        )

        additional_batch_kwargs = None
        try:
            data_source = select_datasource(context, datasource_name=datasource)
        except ValueError as ve:
            cli_message("<red>{}</red>".format(ve))
            sys.exit(1)

        if not data_source:
            cli_message("<red>No datasources found in the context.</red>")
            sys.exit(1)

        if batch_kwargs is None:
            datasource_name, batch_kwarg_generator, data_asset, batch_kwargs = get_batch_kwargs(
                context,
                datasource_name=data_source.name,
                generator_name=None,
                generator_asset=None,
                additional_batch_kwargs=additional_batch_kwargs
            )

    # Render the edit notebook into the context's notebook directory.
    notebook_name = "{}.ipynb".format(suite.expectation_suite_name)
    notebook_path = os.path.join(context.root_directory, context.GE_EDIT_NOTEBOOK_DIR, notebook_name)
    NotebookRenderer().render_to_disk(suite, batch_kwargs, notebook_path)

    cli_message(
        "To continue editing this suite, run <green>jupyter notebook {}</green>".format(
            notebook_path
        )
    )
    if jupyter:
        subprocess.call(["jupyter", "notebook", notebook_path])
def _suite_edit(suite, datasource, directory, jupyter, batch_kwargs):
    """Render (and optionally open) the edit notebook for an existing suite.

    Resolution order for the sample batch: explicit --batch-kwargs JSON,
    then the suite's most recent citation, then interactive selection.
    """
    # Keep the raw CLI JSON string; `batch_kwargs` is reused below for the
    # parsed dict (or a citation's stored kwargs).
    batch_kwargs_json = batch_kwargs
    batch_kwargs = None
    try:
        context = DataContext(directory)
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message("<red>{}</red>".format(err.message))
        return

    suite = _load_suite(context, suite)
    # Citations that carry batch_kwargs let us reuse the batch the suite was
    # last edited/created against.
    citations = suite.get_citations(sort=True, require_batch_kwargs=True)

    if batch_kwargs_json:
        try:
            batch_kwargs = json.loads(batch_kwargs_json)
            if datasource:
                batch_kwargs["datasource"] = datasource
            # Smoke-test that the kwargs actually load a batch before we
            # render a notebook around them.
            _batch = context.get_batch(batch_kwargs, suite.expectation_suite_name)
            assert isinstance(_batch, DataAsset)
        except json_parse_exception as je:
            cli_message(
                "<red>Please check that your batch_kwargs are valid JSON.\n{}</red>"
                .format(je))
            sys.exit(1)
        except ge_exceptions.DataContextError:
            cli_message(
                "<red>Please check that your batch_kwargs are able to load a batch.</red>"
            )
            sys.exit(1)
        except ValueError as ve:
            cli_message(
                "<red>Please check that your batch_kwargs are able to load a batch.\n{}</red>"
                .format(ve))
            sys.exit(1)
    elif citations:
        # Fall back to the most recent citation's stored batch_kwargs.
        citation = citations[-1]
        batch_kwargs = citation.get("batch_kwargs")

    if not batch_kwargs:
        # Neither CLI kwargs nor a usable citation: select interactively.
        cli_message("""
A batch of data is required to edit the suite - let's help you to specify it."""
        )

        additional_batch_kwargs = None
        try:
            data_source = select_datasource(context, datasource_name=datasource)
        except ValueError as ve:
            cli_message("<red>{}</red>".format(ve))
            sys.exit(1)

        if not data_source:
            cli_message("<red>No datasources found in the context.</red>")
            sys.exit(1)

        if batch_kwargs is None:
            (
                datasource_name,
                batch_kwarg_generator,
                data_asset,
                batch_kwargs,
            ) = get_batch_kwargs(
                context,
                datasource_name=data_source.name,
                generator_name=None,
                generator_asset=None,
                additional_batch_kwargs=additional_batch_kwargs,
            )

    # Render the edit notebook into the context's notebook directory.
    notebook_name = "{}.ipynb".format(suite.expectation_suite_name)
    notebook_path = os.path.join(context.root_directory, context.GE_EDIT_NOTEBOOK_DIR, notebook_name)
    # NOTE(review): argument order here is (suite, notebook_path, batch_kwargs)
    # while suite_edit elsewhere in this file passes (suite, batch_kwargs,
    # notebook_path) — confirm against NotebookRenderer.render_to_disk.
    NotebookRenderer().render_to_disk(suite, notebook_path, batch_kwargs)

    if not jupyter:
        cli_message("To continue editing this suite, run <green>jupyter "
                    f"notebook {notebook_path}</green>")
    if jupyter:
        subprocess.call(["jupyter", "notebook", notebook_path])