def __init__(
    self,
    *,
    run_name=None,
    data_context_root_dir=None,
    data_context=None,
    expectation_suite_name=None,
    batch_kwargs=None,
    assets_to_validate=None,
    checkpoint_name=None,
    fail_task_on_validation_failure=True,
    validation_operator_name="action_list_operator",
    **kwargs
):
    """
    Store the validation settings and resolve the DataContext to validate against.

    Args:
        run_name: Optional run_name to identify the validation run (defaults to timestamp if not specified)
        data_context_root_dir: Path of the great_expectations directory
        data_context: A great_expectations DataContext object
        expectation_suite_name: The name of the Expectation Suite to use for validation
        batch_kwargs: The batch_kwargs to use for validation
        assets_to_validate: A list of dictionaries of batch_kwargs + expectation suites to use for validation
        checkpoint_name: A Checkpoint name to use for validation
        fail_task_on_validation_failure: Fail the Airflow task if the Great Expectation validation fails
        validation_operator_name: Optional name of a Great Expectations validation operator, defaults to
            action_list_operator
        **kwargs: Optional kwargs, forwarded unchanged to the parent class constructor

    Raises:
        ValueError: if both `data_context_root_dir` and `data_context` are given, or if not exactly one of
            (`expectation_suite_name` + `batch_kwargs`), `assets_to_validate`, `checkpoint_name` is given.
    """
    super().__init__(**kwargs)

    self.run_name = run_name

    # Check that only one of the arguments is passed to set a data context (or none)
    if data_context_root_dir and data_context:
        raise ValueError("Only one of data_context_root_dir or data_context can be specified.")

    if data_context:
        self.data_context = data_context
    elif data_context_root_dir:
        self.data_context = ge.DataContext(data_context_root_dir)
    else:
        # No location given: let DataContext() use its own default lookup.
        self.data_context = ge.DataContext()

    # Check that only the correct args to validate are passed.
    # This doesn't cover the case where only one of expectation_suite_name or batch_kwargs is
    # specified along with one of the others; in that case the complete option simply wins.
    validation_sources = [
        expectation_suite_name and batch_kwargs,
        assets_to_validate,
        checkpoint_name,
    ]
    if sum(bool(source) for source in validation_sources) != 1:
        # Implicit string concatenation instead of a backslash continuation inside the literal,
        # which would embed the source indentation in the message.
        raise ValueError(
            "Exactly one of expectation_suite_name + batch_kwargs, assets_to_validate, "
            "or checkpoint_name is required to run validation."
        )

    self.expectation_suite_name = expectation_suite_name
    self.batch_kwargs = batch_kwargs
    self.assets_to_validate = assets_to_validate
    self.checkpoint_name = checkpoint_name

    self.fail_task_on_validation_failure = fail_task_on_validation_failure

    self.validation_operator_name = validation_operator_name
def run(
    self,
    checkpoint_name: str = None,
    context_root_dir: str = None,
    runtime_environment: Optional[dict] = None,
    run_name: str = None,
    **kwargs
):
    """
    Task run method.

    Loads the named checkpoint from a DataContext, builds one batch per
    (batch_kwargs, expectation suite) pair it lists, and runs the checkpoint's
    validation operator over those batches.

    Args:
        - checkpoint_name (str): the name of the checkpoint; should match the filename of
            the checkpoint without .py
        - context_root_dir (str): the absolute or relative path to the directory holding
            your `great_expectations.yml`
        - runtime_environment (dict): a dictionary of great expectation config key-value
            pairs to overwrite your config in `great_expectations.yml`
        - run_name (str): the name of this Great Expectation validation run; defaults to
            the task slug
        - **kwargs (dict, optional): accepted for call compatibility; not used by this method

    Raises:
        - ValueError: if `checkpoint_name` is not provided
        - 'signals.VALIDATIONFAIL' if the validation was not a success

    Returns:
        - result
            ('great_expectations.validation_operators.types.validation_operator_result.ValidationOperatorResult'):
            The Great Expectations metadata returned from the validation
    """
    if checkpoint_name is None:
        raise ValueError("You must provide the checkpoint name.")

    runtime_environment = runtime_environment or dict()

    context = ge.DataContext(
        context_root_dir=context_root_dir, runtime_environment=runtime_environment
    )
    checkpoint = context.get_checkpoint(checkpoint_name)

    batches_to_validate = []
    # A distinct name for the checkpoint entry keeps it from being rebound to the loaded batch.
    for batch_config in checkpoint["batches"]:
        batch_kwargs = batch_config["batch_kwargs"]
        for suite_name in batch_config["expectation_suite_names"]:
            suite = context.get_expectation_suite(suite_name)
            batches_to_validate.append(context.get_batch(batch_kwargs, suite))

    results = context.run_validation_operator(
        checkpoint["validation_operator_name"],
        assets_to_validate=batches_to_validate,
        # Honor an explicit run_name; only fall back to the task slug when none is given.
        run_id={"run_name": run_name or prefect.context.get("task_slug")},
    )

    if results.success is False:
        raise signals.VALIDATIONFAIL(result=results)

    return results
def main():
    """Build Data Docs for a temporary GitHub Actions site and report where it was written.

    The site configuration is injected into the in-memory DataContext only; the
    resulting directory is printed, emitted as a workflow output command and
    written to a small text file in the working directory.
    """
    print("Loading project")
    context = ge.DataContext("great_expectations")

    action_site_name = build_site_name()
    project_config = context.get_config()
    site_config = build_site_config(action_site_name)
    project_config["data_docs_sites"][action_site_name] = site_config
    # Note we mangle the in memory DataContext and do not persist this config
    context._project_config = project_config

    print(f"Building docs for site: {action_site_name}")

    store_backend = context.stores["validations_GCS_store"].store_backend
    uses_filesystem_backend = isinstance(store_backend, TupleFilesystemStoreBackend)
    if not uses_filesystem_backend:
        # TODO the action will likely need to run entirely in python so an ephemeral
        # validation store can be used if desired.
        print("WARNING an external validation store exists and was likely polluted.")

    # Build only the GitHub Actions temporary site
    context.build_data_docs(site_names=[action_site_name])

    gh_site_dir = f"{context.root_directory}/{action_site_name}"
    print(f"Site built in directory: {gh_site_dir}")
    print(f'::set-output name=ACTION_DOCS_LOCATION::{gh_site_dir}')

    with open('_temp_greatexpectations_action_docs_location_dir.txt', 'w') as location_file:
        location_file.write(gh_site_dir)

    # For local debugging, this is handy to verify docs built
    if os.getenv('DEBUG_OPEN_DOCS'):
        context.open_data_docs(site_name=action_site_name)
def test_assets_to_validate(self):
    """Running the task with an explicit batch returns a ValidationOperatorResult."""
    context = ge.DataContext(context_root_dir=str(V2_API_PATH))

    # Batch built from the sample CSV against the "taxi.demo" suite.
    taxi_batch_kwargs = {
        "path": "tests/tasks/great_expectations/data/yellow_tripdata_sample_2019-01.csv",
        "datasource": "data__dir",
        "data_asset_name": "yellow_tripdata_sample_2019-01",
    }
    taxi_batch = context.get_batch(taxi_batch_kwargs, "taxi.demo")

    task = RunGreatExpectationsValidation(
        context=context,
        assets_to_validate=[taxi_batch],
    )
    results = task.run()

    assert type(results) is ValidationOperatorResult
def test_assets_to_validate(self):
    """Running the task with an explicit batch returns a ValidationOperatorResult
    whose repr exposes the expected top-level fields."""
    context = ge.DataContext(context_root_dir=str(V2_API_PATH))

    # Batch built from the sample CSV against the "taxi.demo" suite.
    taxi_batch_kwargs = {
        "path": "tests/tasks/great_expectations/data/yellow_tripdata_sample_2019-01.csv",
        "datasource": "data__dir",
        "data_asset_name": "yellow_tripdata_sample_2019-01",
    }
    taxi_batch = context.get_batch(taxi_batch_kwargs, "taxi.demo")

    task = RunGreatExpectationsValidation(
        context=context,
        assets_to_validate=[taxi_batch],
    )
    results = task.run()

    assert type(results) is ValidationOperatorResult

    rendered = repr(results)
    expected_fields = (
        "run_id",
        "run_results",
        "validation_operator_config",
        "evaluation_parameters",
        "success",
    )
    for field_name in expected_fields:
        assert field_name in rendered
configured by default. The default configuration of this Validation Operator saves validation results to your results store and then updates Data Docs. This makes viewing validation results easy for you and your team. Usage: - Run this file: `python {}`. - This can be run manually or via a scheduler such as cron. """ import sys import great_expectations as ge # tap configuration context = ge.DataContext( "/private/var/folders/_t/psczkmjd69vf9jz0bblzlzww0000gn/T/pytest-of-taylor/pytest-1812/empty_data_context0/great_expectations" ) suite = context.get_expectation_suite("sweet_suite") batch_kwargs = { "path": "/private/var/folders/_t/psczkmjd69vf9jz0bblzlzww0000gn/T/pytest-of-taylor/pytest-1812/filesystem_csv0/f1.csv", "datasource": "1_datasource", } # tap validation process batch = context.get_batch(batch_kwargs, suite) results = context.run_validation_operator("action_list_operator", [batch]) if not results["success"]: print("Validation Failed!") sys.exit(1)
def in_memory_checkpoint():
    """Return the "my_checkpoint_pass" checkpoint from the DataContext rooted at V3_API_PATH."""
    context = ge.DataContext(context_root_dir=str(V3_API_PATH))
    checkpoint = context.get_checkpoint("my_checkpoint_pass")
    return checkpoint
def run(
    self,
    checkpoint_name: str = None,
    ge_checkpoint: Checkpoint = None,
    checkpoint_kwargs: dict = None,
    context: ge.DataContext = None,
    assets_to_validate: list = None,
    batch_kwargs: dict = None,
    expectation_suite_name: str = None,
    context_root_dir: str = None,
    runtime_environment: Optional[dict] = None,
    run_name: str = None,
    run_info_at_end: bool = True,
    disable_markdown_artifact: bool = False,
    validation_operator: str = "action_list_operator",
    evaluation_parameters: Optional[dict] = None,
):
    """
    Task run method.

    Runs either a checkpoint (given as an object or by name) or a validation
    operator over explicit assets, optionally posts a markdown artifact of the
    results, and fails the task if validation was not successful.

    Args:
        - checkpoint_name (str, optional): the name of a pre-configured checkpoint; should match the
            filename of the checkpoint without the extension. Either `checkpoint_name` or
            `ge_checkpoint` is required when using the Great Expectations v3 API.
        - ge_checkpoint (Checkpoint, optional): an in-memory GE `Checkpoint` object used to perform
            validation. If not provided then `checkpoint_name` will be used to load the specified
            checkpoint.
        - checkpoint_kwargs (Dict, optional): A dictionary whose keys match the parameters of
            `CheckpointConfig` which can be used to update and populate the task's Checkpoint at
            runtime.
        - context (DataContext, optional): an in-memory GE `DataContext` object.
            e.g. `ge.data_context.DataContext()` If not provided then `context_root_dir` will be
            used to look for one.
        - assets_to_validate (list, optional): A list of assets to validate when running the
            validation operator. Only used in the Great Expectations v2 API
        - batch_kwargs (dict, optional): a dictionary of batch kwargs to be used when validating
            assets. Only used in the Great Expectations v2 API
        - expectation_suite_name (str, optional): the name of an expectation suite to be used when
            validating assets. Only used in the Great Expectations v2 API
        - context_root_dir (str, optional): the absolute or relative path to the directory holding
            your `great_expectations.yml`
        - runtime_environment (dict, optional): a dictionary of great expectation config key-value
            pairs to overwrite your config in `great_expectations.yml`
        - run_name (str, optional): the name of this Great Expectation validation run; defaults to
            the task slug
        - run_info_at_end (bool, optional): add run info to the end of the artifact generated by
            this task. Defaults to `True`.
        - disable_markdown_artifact (bool, optional): toggle the posting of a markdown artifact from
            this tasks. Defaults to `False`.
        - validation_operator (str, optional): configure the actions to be executed after running
            validation. Defaults to `action_list_operator`.
        - evaluation_parameters (Optional[dict], optional): the evaluation parameters to use when
            running validation. For more information, see
            [example](https://docs.prefect.io/api/latest/tasks/great_expectations.html#rungreatexpectationsvalidation)
            and
            [docs](https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html).

    Raises:
        - ValueError: if not exactly one of (`expectation_suite_name` + `batch_kwargs`),
            `assets_to_validate`, `checkpoint_name`, `ge_checkpoint` is provided
        - 'signals.FAIL' if the validation was not a success

    Returns:
        - result
            ('great_expectations.validation_operators.types.validation_operator_result.ValidationOperatorResult'):
            The Great Expectations metadata returned from the validation if the v2 (batch_kwargs)
            API is used.

            ('great_expectations.checkpoint.checkpoint.CheckpointResult'):
            The Great Expectations metadata returned from running the provided checkpoint if a
            checkpoint name is provided.
    """
    if version.parse(ge.__version__) < version.parse("0.13.8"):
        # Adjacent literals are concatenated verbatim, so the separating space must be explicit.
        self.logger.warning(
            f"You are using great_expectations version {ge.__version__} which may cause "
            "errors in this task. Please upgrade great_expectations to 0.13.8 or later."
        )

    runtime_environment = runtime_environment or dict()
    checkpoint_kwargs = checkpoint_kwargs or dict()

    # Load context if not provided directly
    if not context:
        context = ge.DataContext(
            context_root_dir=context_root_dir,
            runtime_environment=runtime_environment,
        )

    # Check that the parameters are mutually exclusive
    validation_sources = [
        expectation_suite_name and batch_kwargs,
        assets_to_validate,
        checkpoint_name,
        ge_checkpoint,
    ]
    if sum(bool(source) for source in validation_sources) != 1:
        raise ValueError(
            "Exactly one of expectation_suite_name + batch_kwargs, assets_to_validate, "
            "checkpoint_name, or ge_checkpoint is required to run validation."
        )

    # Both execution paths identify the run the same way.
    run_id = {"run_name": run_name or prefect.context.get("task_slug")}

    results = None
    # If there is a checkpoint or checkpoint name provided, run the checkpoint.
    # Checkpoints are the preferred deployment of validation configuration.
    if ge_checkpoint or checkpoint_name:
        ge_checkpoint = ge_checkpoint or context.get_checkpoint(checkpoint_name)
        results = ge_checkpoint.run(
            evaluation_parameters=evaluation_parameters,
            run_id=run_id,
            **checkpoint_kwargs,
        )
    else:
        # If assets are not provided directly through `assets_to_validate` then they need be loaded
        # get batch from `batch_kwargs` and `expectation_suite_name`
        if not assets_to_validate:
            assets_to_validate = [
                context.get_batch(batch_kwargs, expectation_suite_name)
            ]

        # Run validation operator
        results = context.run_validation_operator(
            validation_operator,
            assets_to_validate=assets_to_validate,
            run_id=run_id,
            evaluation_parameters=evaluation_parameters,
        )

    # Generate artifact markdown (done before the failure check so failed runs are reported too)
    if not disable_markdown_artifact:
        validation_results_page_renderer = (
            ge.render.renderer.ValidationResultsPageRenderer(
                run_info_at_end=run_info_at_end
            )
        )
        rendered_content_list = validation_results_page_renderer.render_validation_operator_result(
            # This also works with a CheckpointResult because of duck typing.
            # The passed in object needs a list_validation_results method that
            # returns a list of ExpectationSuiteValidationResult.
            validation_operator_result=results
        )
        markdown_artifact = " ".join(
            ge.render.view.DefaultMarkdownPageView().render(rendered_content_list)
        )

        create_markdown_artifact(markdown_artifact)

    if results.success is False:
        raise signals.FAIL(result=results)

    return results
def run(
    self,
    checkpoint_name: str = None,
    context: "ge.DataContext" = None,
    assets_to_validate: list = None,
    batch_kwargs: dict = None,
    expectation_suite_name: str = None,
    context_root_dir: str = None,
    runtime_environment: Optional[dict] = None,
    run_name: str = None,
    run_info_at_end: bool = True,
    disable_markdown_artifact: bool = False,
    validation_operator: str = "action_list_operator",
    evaluation_parameters: Optional[dict] = None,
):
    """
    Task run method.

    Resolves the assets to validate (given directly, listed by a checkpoint, or
    built from `batch_kwargs` + `expectation_suite_name`), runs a validation
    operator over them, optionally posts a markdown artifact of the results,
    and fails the task if validation was not successful.

    Args:
        - checkpoint_name (str, optional): the name of the checkpoint; should match the filename of
            the checkpoint without .py
        - context (DataContext, optional): an in-memory GE DataContext object.
            e.g. `ge.data_context.DataContext()` If not provided then `context_root_dir` will be
            used to look for one.
        - assets_to_validate (list, optional): A list of assets to validate when running the
            validation operator.
        - batch_kwargs (dict, optional): a dictionary of batch kwargs to be used when validating
            assets.
        - expectation_suite_name (str, optional): the name of an expectation suite to be used when
            validating assets.
        - context_root_dir (str, optional): the absolute or relative path to the directory holding
            your `great_expectations.yml`
        - runtime_environment (dict, optional): a dictionary of great expectation config key-value
            pairs to overwrite your config in `great_expectations.yml`
        - run_name (str, optional): the name of this Great Expectation validation run; defaults to
            the task slug
        - run_info_at_end (bool, optional): add run info to the end of the artifact generated by
            this task. Defaults to `True`.
        - disable_markdown_artifact (bool, optional): toggle the posting of a markdown artifact from
            this tasks. Defaults to `False`.
        - validation_operator (str, optional): configure the actions to be executed after running
            validation. Defaults to `action_list_operator`. When `checkpoint_name` is used, the
            operator named by the checkpoint takes its place.
        - evaluation_parameters (Optional[dict], optional): the evaluation parameters to use when
            running validation. For more information, see
            [example](https://docs.prefect.io/api/latest/tasks/great_expectations.html#rungreatexpectationsvalidation)
            and
            [docs](https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html).

    Raises:
        - ValueError: if not exactly one of (`expectation_suite_name` + `batch_kwargs`),
            `assets_to_validate`, `checkpoint_name` is provided
        - 'signals.FAIL' if the validation was not a success

    Returns:
        - result
            ('great_expectations.validation_operators.types.validation_operator_result.ValidationOperatorResult'):
            The Great Expectations metadata returned from the validation
    """
    runtime_environment = runtime_environment or dict()

    # Load context if not provided directly
    if not context:
        context = ge.DataContext(
            context_root_dir=context_root_dir,
            runtime_environment=runtime_environment,
        )

    # Check that the parameters are mutually exclusive
    validation_sources = [
        expectation_suite_name and batch_kwargs,
        assets_to_validate,
        checkpoint_name,
    ]
    if sum(bool(source) for source in validation_sources) != 1:
        raise ValueError(
            "Exactly one of expectation_suite_name + batch_kwargs, assets_to_validate, or "
            "checkpoint_name is required to run validation."
        )

    # If assets are not provided directly through `assets_to_validate` then they need be loaded
    #   if a checkpoint_name is supplied, then load suite and batch_kwargs from there
    #   otherwise get batch from `batch_kwargs` and `expectation_suite_name`
    if not assets_to_validate:
        assets_to_validate = []
        if checkpoint_name:
            ge_checkpoint = context.get_checkpoint(checkpoint_name)

            # Distinct local names so neither the `batch_kwargs` parameter nor the
            # loop variable is rebound while iterating.
            for batch_config in ge_checkpoint["batches"]:
                checkpoint_batch_kwargs = batch_config["batch_kwargs"]
                for suite_name in batch_config["expectation_suite_names"]:
                    suite = context.get_expectation_suite(suite_name)
                    assets_to_validate.append(
                        context.get_batch(checkpoint_batch_kwargs, suite)
                    )
            validation_operator = ge_checkpoint["validation_operator_name"]
        else:
            assets_to_validate.append(
                context.get_batch(batch_kwargs, expectation_suite_name)
            )

    # Run validation operator
    results = context.run_validation_operator(
        validation_operator,
        assets_to_validate=assets_to_validate,
        run_id={"run_name": run_name or prefect.context.get("task_slug")},
        evaluation_parameters=evaluation_parameters,
    )

    # Generate artifact markdown; `run_info_at_end` is passed through as the caller set it.
    if not disable_markdown_artifact:
        validation_results_page_renderer = (
            ge.render.renderer.ValidationResultsPageRenderer(
                run_info_at_end=run_info_at_end
            )
        )
        rendered_document_content_list = (
            validation_results_page_renderer.render_validation_operator_result(
                validation_operator_result=results
            )
        )
        markdown_artifact = " ".join(
            ge.render.view.DefaultMarkdownPageView().render(
                rendered_document_content_list
            )
        )

        create_markdown(markdown_artifact)

    if results.success is False:
        raise signals.FAIL(result=results)

    return results
import great_expectations as ge

# Load the project DataContext, then fetch one batch of the "yellow" asset from
# the "taxi" datasource via its "monthly" data connector and preview it.
context = ge.DataContext()

february_2019 = {"year": "2019", "month": "02"}
batch_request = ge.core.batch.BatchRequest(
    datasource_name="taxi",
    data_connector_name="monthly",
    data_asset_name="yellow",
    data_connector_query={"batch_filter_parameters": february_2019},
)
batch = context.get_batch(batch_request=batch_request)

print(batch.head())
def test_save_expectation_suite_with_datetime_objects(
    data_context_parameterized_expectation_suite,
):
    """Datetime evaluation parameters survive saving, validating, building docs and reloading.

    Builds a throwaway project with an in-memory pandas datasource, adds an
    expectation driven by a datetime `$PARAMETER`, then checks the suite can be
    saved, run through `action_list_operator`, rendered to Data Docs, and
    validated again after being reloaded.
    """
    # create datetime evaluation parameters
    evaluation_params = {
        "now": datetime.datetime.now(),
        "now_minus_48h": datetime.datetime.now() - datetime.timedelta(days=2),
    }
    refresh_times = [
        datetime.datetime.now(),
        datetime.datetime.now() - datetime.timedelta(days=1),
    ]
    test_df = pd.DataFrame({"data_refresh": refresh_times})

    dataset_name = "test_pandas_source"
    operator_name = "action_list_operator"

    with TemporaryDirectory() as tempdir:
        ge_path = os.path.join(tempdir, "great_expectations")
        ge.DataContext.create(tempdir, usage_statistics_enabled=False)
        context = ge.DataContext(ge_path)

        context.add_datasource(dataset_name, class_name="PandasDatasource")

        batch_kwargs = {
            "dataset": test_df,
            "datasource": dataset_name,
            "PandasInMemoryDF": True,
            "ge_batch_id": "test_id",
        }

        empty_suite = context.create_expectation_suite("test_suite")

        batch = context.get_batch(
            batch_kwargs=batch_kwargs, expectation_suite_name=empty_suite
        )
        for param_name, param_value in evaluation_params.items():
            batch.set_evaluation_parameter(param_name, param_value)

        # Add expectation that will succeed using the datetime in a $PARAMETER
        batch.expect_column_max_to_be_between(
            column="data_refresh", min_value={"$PARAMETER": "now_minus_48h"}
        )
        validation = batch.validate()
        assert validation.success
        batch.save_expectation_suite()
        assert isinstance(batch, PandasDataset)

        # Check that we can load the saved expectation suite
        reloaded_expectation_suite = context.get_expectation_suite("test_suite")
        assert isinstance(reloaded_expectation_suite, ExpectationSuite)

        # Run validation via the action_list_operator
        first_run_id = {
            "run_name": f"{dataset_name}_{datetime.datetime.now()}",
            "run_time": datetime.datetime.now(),
        }
        first_results = context.run_validation_operator(
            operator_name,
            assets_to_validate=[batch],
            run_id=first_run_id,
            evaluation_parameters=evaluation_params,
        )
        assert first_results.success

        # Check that we can build Data Docs
        index_page_locator_infos = context.build_data_docs()
        expected_index_page = (
            f"file://{ge_path}/uncommitted/data_docs/local_site/index.html"
        )
        assert index_page_locator_infos["local_site"] == expected_index_page

        # Check that we can reload the expectation suite and validate
        reloaded_batch = context.get_batch(
            batch_kwargs=batch_kwargs,
            expectation_suite_name=reloaded_expectation_suite,
        )

        second_run_id = {
            "run_name": f"reloaded_{dataset_name}_{datetime.datetime.now()}",
            "run_time": datetime.datetime.now(),
        }
        reloaded_results = context.run_validation_operator(
            operator_name,
            assets_to_validate=[reloaded_batch],
            run_id=second_run_id,
        )

        assert reloaded_results.success
def run(
    self,
    checkpoint_name: str = None,
    context: "ge.DataContext" = None,
    assets_to_validate: list = None,
    batch_kwargs: dict = None,
    expectation_suite_name: str = None,
    get_checkpoint_from_context: bool = False,
    context_root_dir: str = None,
    runtime_environment: Optional[dict] = None,
    run_name: str = None,
    run_info_at_end: bool = True,
    disable_markdown_artifact: bool = False,
):
    """
    Task run method.

    Resolves the assets to validate, runs a validation operator over them,
    optionally posts a markdown artifact of the results, and fails the task if
    validation was not successful.

    Args:
        - checkpoint_name (str, optional): the name of the checkpoint; should match the filename of
            the checkpoint without .py. Required by this method. When the checkpoint is not loaded
            from the context, this name is also used as the validation operator name.
        - context (DataContext, optional): an in-memory GE DataContext object.
            e.g. `ge.data_context.DataContext()` If not provided then `context_root_dir` will be
            used to look for one.
        - assets_to_validate (list, optional): A list of assets to validate when running the
            validation operator. If not provided then `batch_kwargs` and `expectation_suite_name`
            will be used if context is provided. Also, if not provided and
            `get_checkpoint_from_context` is True then the assets will be loaded from that context.
        - batch_kwargs (dict, optional): a dictionary of batch kwargs to be used when validating
            assets.
        - expectation_suite_name (str, optional): the name of an expectation suite to be used when
            validating assets.
        - get_checkpoint_from_context (bool, optional): get the checkpoint from context. Defaults to
            `False`
        - context_root_dir (str, optional): the absolute or relative path to the directory holding
            your `great_expectations.yml`
        - runtime_environment (dict, optional): a dictionary of great expectation config key-value
            pairs to overwrite your config in `great_expectations.yml`
        - run_name (str, optional): the name of this Great Expectation validation run; defaults to
            the task slug
        - run_info_at_end (bool, optional): add run info to the end of the artifact generated by
            this task. Defaults to `True`.
        - disable_markdown_artifact (bool, optional): toggle the posting of a markdown artifact from
            this tasks. Defaults to `False`.

    Raises:
        - ValueError: if `checkpoint_name` is not provided
        - 'signals.FAIL' if the validation was not a success

    Returns:
        - result
            ('great_expectations.validation_operators.types.validation_operator_result.ValidationOperatorResult'):
            The Great Expectations metadata returned from the validation
    """
    if checkpoint_name is None:
        raise ValueError("You must provide the checkpoint name.")

    runtime_environment = runtime_environment or dict()

    # Load context if not provided directly
    if not context:
        context = ge.DataContext(
            context_root_dir=context_root_dir,
            runtime_environment=runtime_environment,
        )

    # Stays None unless the checkpoint is actually loaded from the context below.
    ge_checkpoint = None

    # if assets are not provided directly through `assets_to_validate` then they need be loaded
    #   if the checkpoint is being loaded from the context then load suite and batch from there
    #   otherwise get batch from `batch_kwargs` and `expectation_suite_name`
    if not assets_to_validate:
        assets_to_validate = []
        if get_checkpoint_from_context:
            ge_checkpoint = context.get_checkpoint(checkpoint_name)

            # Distinct local names so neither the `batch_kwargs` parameter nor the
            # loop variable is rebound while iterating.
            for batch_config in ge_checkpoint["batches"]:
                checkpoint_batch_kwargs = batch_config["batch_kwargs"]
                for suite_name in batch_config["expectation_suite_names"]:
                    suite = context.get_expectation_suite(suite_name)
                    assets_to_validate.append(
                        context.get_batch(checkpoint_batch_kwargs, suite)
                    )
        else:
            assets_to_validate.append(
                context.get_batch(batch_kwargs, expectation_suite_name)
            )

    # Prefer the operator declared by the loaded checkpoint. `checkpoint_name` is never
    # None here, so putting it first in an `or` would always shadow the checkpoint's
    # operator; it is kept only as the fallback when no checkpoint was loaded.
    if ge_checkpoint is not None:
        validation_operator_name = ge_checkpoint["validation_operator_name"]
    else:
        validation_operator_name = checkpoint_name

    # Run validation operator
    results = context.run_validation_operator(
        validation_operator_name,
        assets_to_validate=assets_to_validate,
        run_id={"run_name": run_name or prefect.context.get("task_slug")},
    )

    # Generate artifact markdown before the failure check so that failed runs are
    # reported too; `run_info_at_end` is passed through as the caller set it.
    if not disable_markdown_artifact:
        validation_results_page_renderer = (
            ge.render.renderer.ValidationResultsPageRenderer(
                run_info_at_end=run_info_at_end
            )
        )
        rendered_document_content_list = (
            validation_results_page_renderer.render_validation_operator_result(
                validation_operator_result=results
            )
        )
        markdown_artifact = " ".join(
            ge.render.view.DefaultMarkdownPageView().render(
                rendered_document_content_list
            )
        )

        create_markdown(markdown_artifact)

    if results.success is False:
        raise signals.FAIL(result=results)

    return results