def __init__(self, conn, report):
    """Keep the connection and report, then build an in-memory data context.

    The context is configured with a single ``SqlAlchemyDatasource`` stored
    under ``self.datasource_name`` and is constructed while
    ``_properly_init_datasource(self.conn)`` is active.
    """
    self.conn = conn
    self.report = report

    # The URL isn't actually used since we pass the connection directly,
    # but GE parses it to change some of its behavior so it's useful
    # to emulate that here.
    sql_datasource = DatasourceConfig(
        class_name="SqlAlchemyDatasource",
        credentials={"url": self.conn.engine.url},
    )

    # "data_context_id" is intentionally left unset.
    usage_statistics = {"enabled": False}

    project_config = DataContextConfig(
        datasources={self.datasource_name: sql_datasource},
        store_backend_defaults=InMemoryStoreBackendDefaults(),
        anonymous_usage_statistics=usage_statistics,
    )

    with _properly_init_datasource(self.conn):
        self.data_context = BaseDataContext(project_config=project_config)
def _ge_context(self) -> Iterator[GEContext]:
    """Yield a ``GEContext`` bound to a fresh connection from ``self.base_engine``.

    A new in-memory data context is created, and a uniquely named
    ``SqlAlchemyDatasource`` is registered on it while
    ``_inject_connection_into_datasource(conn)`` is active. The connection
    stays open for as long as the caller holds the yielded value.

    Yields:
        GEContext: The data context together with the generated datasource name.

    Raises:
        AssertionError: If the datasource could not be added or retrieved.
    """
    with self.base_engine.connect() as conn:
        data_context = BaseDataContext(
            project_config=DataContextConfig(
                # The datasource will be added via add_datasource().
                datasources={},
                store_backend_defaults=InMemoryStoreBackendDefaults(),
                anonymous_usage_statistics={
                    "enabled": False,
                    # "data_context_id": <not set>,
                },
            )
        )

        datasource_name = f"{self._datasource_name_base}-{uuid.uuid4()}"
        datasource_config = DatasourceConfig(
            class_name="SqlAlchemyDatasource",
            credentials={
                # This isn't actually used since we pass the connection directly,
                # but GE parses it to change some of its behavior so it's useful
                # to emulate that here.
                "url": conn.engine.url,
            },
        )

        with _inject_connection_into_datasource(conn):
            # Using the add_datasource method ensures that the datasource is added to
            # GE-internal cache, which avoids problems when calling GE methods later on.
            #
            # The call is made as a plain statement rather than inside `assert`:
            # assert expressions are removed under `python -O`, which would
            # silently skip registering the datasource.
            datasource = data_context.add_datasource(
                datasource_name,
                initialize=True,
                **dict(datasourceConfigSchema.dump(datasource_config)),
            )
        assert datasource
        assert data_context.get_datasource(datasource_name)

        yield GEContext(data_context, datasource_name)
def test_in_memory_data_context_configuration(
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """Rebuild a context from an edited config dict and validate the Titanic asset."""
    source_context = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled

    config_dict: dict = source_context.get_config(mode=ConfigOutputModes.DICT)
    config_dict["plugins_directory"] = None

    # (action name, action class name) pairs for the validation operator.
    actions = [
        ("store_validation_result", "StoreValidationResultAction"),
        ("store_evaluation_params", "StoreEvaluationParametersAction"),
        ("update_data_docs", "UpdateDataDocsAction"),
    ]
    config_dict["validation_operators"] = {
        "action_list_operator": {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {"name": action_name, "action": {"class_name": action_class}}
                for action_name, action_class in actions
            ],
        }
    }

    # Roundtrip through schema validation to remove any illegal fields
    # and/or restore any missing fields.
    config_dict = dataContextConfigSchema.dump(config_dict)
    config_dict = dataContextConfigSchema.load(config_dict)

    rebuilt_config: DataContextConfig = DataContextConfig(**config_dict)
    data_context = BaseDataContext(
        project_config=rebuilt_config,
        context_root_dir=source_context.root_directory,
    )

    validator: Validator = data_context.get_validator(
        datasource_name="my_datasource",
        data_connector_name="my_basic_data_connector",
        data_asset_name="Titanic_1912",
        create_expectation_suite_with_name="my_test_titanic_expectation_suite",
    )
    row_count_result = validator.expect_table_row_count_to_equal(1313)
    assert row_count_result["success"]
    column_count_result = validator.expect_table_column_count_to_equal(7)
    assert column_count_result["success"]
def test__normalize_absolute_or_relative_path(tmp_path_factory, basic_data_context_config):
    """Relative paths are resolved under the context root; absolute paths are kept."""
    root_dir = str(
        tmp_path_factory.mktemp("test__normalize_absolute_or_relative_path__dir")
    )
    context = BaseDataContext(basic_data_context_config, root_dir)

    expected_fragment = os.path.join(
        "test__normalize_absolute_or_relative_path__dir0", "yikes"
    )
    assert expected_fragment in context._normalize_absolute_or_relative_path("yikes")

    assert (
        "test__normalize_absolute_or_relative_path__dir"
        not in context._normalize_absolute_or_relative_path("/yikes")
    )
    assert "/yikes" == context._normalize_absolute_or_relative_path("/yikes")
def test_in_memory_data_context_configuration(
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store,
):
    """Rebuild a context from an edited config dict and validate the Titanic asset."""
    source_context = (
        titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store
    )

    config_dict: dict = source_context.get_config(mode="dict")
    config_dict["plugins_directory"] = None

    # (action name, action class name) pairs for the validation operator.
    actions = [
        ("store_validation_result", "StoreValidationResultAction"),
        ("store_evaluation_params", "StoreEvaluationParametersAction"),
        ("update_data_docs", "UpdateDataDocsAction"),
    ]
    config_dict["validation_operators"] = {
        "action_list_operator": {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {"name": action_name, "action": {"class_name": action_class}}
                for action_name, action_class in actions
            ],
        }
    }

    rebuilt_config: DataContextConfig = DataContextConfig(**config_dict)
    data_context = BaseDataContext(
        project_config=rebuilt_config,
        context_root_dir=source_context.root_directory,
    )

    validator: Validator = data_context.get_validator(
        datasource_name="my_datasource",
        data_connector_name="my_basic_data_connector",
        data_asset_name="Titanic_1912",
        create_expectation_suite_with_name="my_test_titanic_expectation_suite",
    )
    row_count_result = validator.expect_table_row_count_to_equal(1313)
    assert row_count_result["success"]
    column_count_result = validator.expect_table_column_count_to_equal(7)
    assert column_count_result["success"]
def build_context(self):
    """
    Purpose:
        Build a data context with a pandas datasource and store it on the object.
    Returns:
        Nothing; the resulting context is saved to ``self.context``.
    """
    self.context = ge.get_context()

    def _runtime_connectors():
        # Each caller gets its own dict so the two configurations share no state.
        return {
            "default_runtime_data_connector_name": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["default_identifier_name"],
            },
        }

    # Datasource registered on the context after it has been built.
    datasource_config = {
        "name": "example_datasource",
        "class_name": "Datasource",
        "module_name": "great_expectations.datasource",
        "execution_engine": {
            "module_name": "great_expectations.execution_engine",
            "class_name": "PandasExecutionEngine",
        },
        "data_connectors": _runtime_connectors(),
    }

    # Project configuration: one "pandas" datasource and filesystem-backed
    # stores rooted at ./great_expectations under the current directory.
    stores_root = os.path.join(os.getcwd(), 'great_expectations')
    pandas_datasource = DatasourceConfig(
        class_name="Datasource",
        execution_engine={"class_name": "PandasExecutionEngine"},
        data_connectors=_runtime_connectors(),
    )
    data_context_config = DataContextConfig(
        datasources={"pandas": pandas_datasource},
        store_backend_defaults=FilesystemStoreBackendDefaults(root_directory=stores_root),
    )

    # Replace the context obtained above and add the datasource to it.
    self.context = BaseDataContext(project_config=data_context_config)
    self.context.add_datasource(**datasource_config)
def test_usage_statistics_handler_build_envelope(
    in_memory_data_context_config_usage_stats_enabled, sample_partial_message
):
    """Happy-path check of build_envelope; any exception raised there fails the test."""
    expected_context_id = "00000000-0000-0000-0000-000000000001"

    context: BaseDataContext = BaseDataContext(
        in_memory_data_context_config_usage_stats_enabled
    )
    # Read the statistics settings only after the context has been built.
    stats_config = (
        in_memory_data_context_config_usage_stats_enabled.anonymous_usage_statistics
    )
    handler = UsageStatisticsHandler(
        data_context=context,
        data_context_id=stats_config.data_context_id,
        usage_statistics_url=stats_config.usage_statistics_url,
    )
    assert handler._data_context_id == expected_context_id

    envelope = handler.build_envelope(sample_partial_message)
    for required_key in (
        "event",
        "event_payload",
        "version",
        "ge_version",
        "data_context_id",
        "data_context_instance_id",
        "event_time",
    ):
        assert required_key in envelope.keys()

    assert envelope["version"] == "1.0.0"
    assert envelope["data_context_id"] == expected_context_id
def test_opt_out_env_var_overrides_home_folder(
    in_memory_data_context_config_usage_stats_enabled, tmp_path_factory, monkeypatch
):
    """GE_USAGE_STATS=False wins over a home-folder config that enables statistics."""
    # Undo the project-wide test default
    monkeypatch.delenv("GE_USAGE_STATS", raising=False)

    home_dir = str(tmp_path_factory.mktemp("home_dir"))
    etc_dir = str(tmp_path_factory.mktemp("etc"))
    global_config_paths = [
        os.path.join(directory, "great_expectations.conf")
        for directory in (home_dir, etc_dir)
    ]

    # The home-folder config opts in to usage statistics.
    home_config = configparser.ConfigParser()
    home_config["anonymous_usage_statistics"] = {"enabled": "True"}
    home_config_path = os.path.join(home_dir, "great_expectations.conf")
    with open(home_config_path, "w") as configfile:
        home_config.write(configfile)

    monkeypatch.setenv("GE_USAGE_STATS", "False")

    with mock.patch(
        "great_expectations.data_context.AbstractDataContext.GLOBAL_CONFIG_PATHS",
        global_config_paths,
    ):
        stats = in_memory_data_context_config_usage_stats_enabled.anonymous_usage_statistics
        assert stats.enabled is True

        context = BaseDataContext(in_memory_data_context_config_usage_stats_enabled)
        resolved_config = context._project_config
        assert resolved_config.anonymous_usage_statistics.enabled is False
def test_get_site_names_with_three_sites(tmpdir, basic_data_context_config):
    """A context configured with three data docs sites reports all three names."""
    sites = {}
    basic_data_context_config.data_docs_sites = sites
    for index in range(3):
        site_name = f"site-{index}"
        sites[site_name] = {
            "class_name": "SiteBuilder",
            "store_backend": {
                "class_name": "TupleFilesystemStoreBackend",
                "base_directory": f"uncommitted/data_docs/site-{index}/",
            },
            "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
        }
        # Re-assign so the config holds the populated mapping whether or not
        # the attribute assignment above stored `sites` by reference.
        basic_data_context_config.data_docs_sites = sites

    context = BaseDataContext(basic_data_context_config, context_root_dir=tmpdir)
    expected_names = ["site-0", "site-1", "site-2"]
    assert context.get_site_names() == expected_names
def test_usage_statistics_handler_validate_message_success(
    caplog, in_memory_data_context_config_usage_stats_enabled, sample_partial_message
):
    """A built envelope validates against the usage statistics record schema."""
    # caplog default is WARNING and above, we want to see DEBUG level messages for this test
    caplog.set_level(
        level=logging.DEBUG,
        logger="great_expectations.core.usage_statistics.usage_statistics",
    )

    context: BaseDataContext = BaseDataContext(
        in_memory_data_context_config_usage_stats_enabled
    )
    # Read the statistics settings only after the context has been built.
    stats_config = (
        in_memory_data_context_config_usage_stats_enabled.anonymous_usage_statistics
    )
    handler = UsageStatisticsHandler(
        data_context=context,
        data_context_id=stats_config.data_context_id,
        usage_statistics_url=stats_config.usage_statistics_url,
    )
    assert handler._data_context_id == "00000000-0000-0000-0000-000000000001"

    envelope = handler.build_envelope(sample_partial_message)
    validated_message = handler.validate_message(
        envelope, anonymized_usage_statistics_record_schema
    )
    assert validated_message
    assert not usage_stats_invalid_messages_exist(caplog.messages)
def test_ConfigOnlyDataContext_v013__initialization(
    tmp_path_factory, basic_data_context_v013_config
):
    """The plugins directory ends with '<temp dir>/plugins/' for a config-only context."""
    root_dir = str(
        tmp_path_factory.mktemp("test_ConfigOnlyDataContext__initialization__dir")
    )
    context = BaseDataContext(basic_data_context_v013_config, root_dir)

    trailing_parts = context.plugins_directory.split("/")[-3:]
    assert len(trailing_parts) == 3
    # A trailing slash leaves an empty final component.
    assert "" in trailing_parts

    # Exactly one of the trailing components is the (numbered) temp directory.
    pattern = re.compile(r"test_ConfigOnlyDataContext__initialization__dir\d*")
    match_flags = [pattern.match(part) is not None for part in trailing_parts]
    assert match_flags.count(True) == 1
def test_opt_out_etc(in_memory_data_context_config_usage_stats_enabled, tmp_path_factory):
    """A global config under 'etc' that disables statistics overrides the project config."""
    home_dir = str(tmp_path_factory.mktemp("home_dir"))
    etc_dir = str(tmp_path_factory.mktemp("etc"))
    global_config_paths = [
        os.path.join(directory, "great_expectations.conf")
        for directory in (home_dir, etc_dir)
    ]

    # Built but never written to disk, as in the original test.
    opt_in_config = configparser.ConfigParser()
    opt_in_config["anonymous_usage_statistics"] = {"enabled": True}

    opt_out_config = configparser.ConfigParser()
    opt_out_config["anonymous_usage_statistics"] = {"enabled": False}
    etc_config_path = os.path.join(etc_dir, "great_expectations.conf")
    with open(etc_config_path, 'w') as configfile:
        opt_out_config.write(configfile)

    with mock.patch(
        "great_expectations.data_context.BaseDataContext.GLOBAL_CONFIG_PATHS",
        global_config_paths,
    ):
        stats = in_memory_data_context_config_usage_stats_enabled.anonymous_usage_statistics
        assert stats.enabled is True

        context = BaseDataContext(in_memory_data_context_config_usage_stats_enabled)
        resolved_config = context._project_config
        assert resolved_config.anonymous_usage_statistics.enabled is False
def test_opt_out_environment_variable(in_memory_data_context_config_usage_stats_enabled, monkeypatch):
    """Set the env variable GE_USAGE_STATS value to any of the following: FALSE, False, false, 0"""
    monkeypatch.setenv("GE_USAGE_STATS", "False")

    # The supplied configuration itself still has statistics enabled.
    supplied_stats = (
        in_memory_data_context_config_usage_stats_enabled.anonymous_usage_statistics
    )
    assert supplied_stats.enabled is True

    context = BaseDataContext(in_memory_data_context_config_usage_stats_enabled)
    resolved_config = context._project_config
    assert resolved_config.anonymous_usage_statistics.enabled is False
def test_opt_out_etc(in_memory_data_context_config_usage_stats_enabled, tmp_path_factory, monkeypatch):
    """Every accepted spelling of 'false' in the 'etc' global config disables statistics."""
    # Undo the project-wide test default
    monkeypatch.delenv("GE_USAGE_STATS", raising=False)

    home_dir = str(tmp_path_factory.mktemp("home_dir"))
    etc_dir = str(tmp_path_factory.mktemp("etc"))
    global_config_paths = [
        os.path.join(directory, "great_expectations.conf")
        for directory in (home_dir, etc_dir)
    ]
    etc_config_path = os.path.join(etc_dir, "great_expectations.conf")

    for false_string in ["False", "false", "f", "FALSE"]:
        # Rewrite the global config with this spelling.
        opt_out_config = configparser.ConfigParser()
        opt_out_config["anonymous_usage_statistics"] = {"enabled": false_string}
        with open(etc_config_path, "w") as configfile:
            opt_out_config.write(configfile)

        with mock.patch(
            "great_expectations.data_context.BaseDataContext.GLOBAL_CONFIG_PATHS",
            global_config_paths,
        ):
            supplied_stats = (
                in_memory_data_context_config_usage_stats_enabled.anonymous_usage_statistics
            )
            assert supplied_stats.enabled is True

            # A copy is handed over so the shared fixture is not the object
            # the context works on.
            context = BaseDataContext(
                deepcopy(in_memory_data_context_config_usage_stats_enabled)
            )
            resolved_config = context._project_config
            assert resolved_config.anonymous_usage_statistics.enabled is False
def _add_checkpoint( context: BaseDataContext, backend_api: str, datasource_name: str, data_connector_name: str, checkpoint_name: str, suite_and_asset_names=[], ) -> SimpleCheckpoint: if backend_api == "V3": validations = [ { "expectation_suite_name": suite_and_asset_name, "batch_request": { "datasource_name": datasource_name, "data_connector_name": data_connector_name, "data_asset_name": suite_and_asset_name, "batch_spec_passthrough": {"create_temp_table": False}, }, } for suite_and_asset_name in suite_and_asset_names ] return context.add_checkpoint( name=checkpoint_name, class_name="SimpleCheckpoint", validations=validations, run_name_template="my_run_name", ) elif backend_api == "V2": batches = [ { "expectation_suite_names": [suite_and_asset_name], "batch_kwargs": { "datasource": datasource_name, "data_asset_name": suite_and_asset_name, "table": suite_and_asset_name, "batch_spec_passthrough": {"create_temp_table": False}, }, } for suite_and_asset_name in suite_and_asset_names ] return context.add_checkpoint( name=checkpoint_name, class_name="LegacyCheckpoint", batches=batches, ) else: raise ValueError(f"Unsupported backend_api {backend_api}")
def context_with_two_sites(titanic_data_context_stats_enabled_config_version_3):
    """Return a saved DataContext that has a 'team_site' next to 'local_site'."""
    base_context = titanic_data_context_stats_enabled_config_version_3
    root_directory = base_context.root_directory

    # Add the second site to a substituted copy of the project configuration.
    config = base_context.get_config_with_variables_substituted()
    team_site = {
        "class_name": "SiteBuilder",
        "store_backend": {
            "class_name": "TupleFilesystemStoreBackend",
            "base_directory": "uncommitted/data_docs/team_site/",
        },
        "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
    }
    config.data_docs_sites["team_site"] = team_site

    # Pass the edited config through a temporary context, then save it via a
    # DataContext rooted at the same directory.
    temp_context = BaseDataContext(config, context_root_dir=root_directory)
    new_context = DataContext(root_directory)
    new_context.set_config(temp_context.get_config_with_variables_substituted())
    new_context._save_project_config()

    assert new_context.get_site_names() == ["local_site", "team_site"]
    return new_context
def test_data_context_concurrency_property():
    """The context exposes the concurrency setting given in its project config."""
    project_config = DataContextConfig(
        concurrency=ConcurrencyConfig(enabled=True),
        store_backend_defaults=InMemoryStoreBackendDefaults(),
    )
    data_context = BaseDataContext(project_config=project_config)

    assert data_context.concurrency.enabled
def test_consistent_name_anonymization(in_memory_data_context_config_usage_stats_enabled, monkeypatch):
    """Operator names anonymize to a fixed value for a fixed data_context_id."""
    # Undo the project-wide test default
    monkeypatch.delenv("GE_USAGE_STATS", raising=False)

    context = BaseDataContext(in_memory_data_context_config_usage_stats_enabled)
    assert context.data_context_id == "00000000-0000-0000-0000-000000000001"

    fake_asset = (
        {"__fake_batch_kwargs": "mydatasource"},
        "__fake_expectation_suite_name",
    )
    payload = run_validation_operator_usage_statistics(
        context,
        "action_list_operator",
        assets_to_validate=[fake_asset],
        run_id="foo",
    )

    # For a *specific* data_context_id, all names will be consistently anonymized
    expected_anonymized_name = "e079c942d946b823312054118b3b6ef4"
    assert payload["anonymized_operator_name"] == expected_anonymized_name
def test_inaccessible_active_bucket_warning_messages(caplog):
    """
    What does this test do and why?

    Creating a data context whose ACTIVE stores are unreachable should log a
    warning message exactly once per store, e.g.:
        Invalid store configuration: Please check the configuration of your
        TupleS3StoreBackend named expectations_S3_store

    Active stores are those named in:
    "expectations_store_name", "validations_store_name", "evaluation_parameter_store_name"
    """
    bucket = "leakybucket"
    expectations_store_prefix = "expectations_store_prefix"
    validations_store_prefix = "validations_store_prefix"
    data_docs_store_prefix = "data_docs_store_prefix"

    # Create a bucket in Moto's mock AWS environment
    s3 = boto3.resource("s3", region_name="us-east-1")
    s3.create_bucket(Bucket=bucket)

    # The S3-backed stores point at a bucket that was never created.
    inactive_bucket = "inactive_leakybucket"

    def _s3_store(store_class_name, prefix):
        return {
            "class_name": store_class_name,
            "store_backend": {
                "class_name": "TupleS3StoreBackend",
                "bucket": inactive_bucket,
                "prefix": prefix,
            },
        }

    stores = {
        "expectations_S3_store": _s3_store("ExpectationsStore", expectations_store_prefix),
        "validations_S3_store": _s3_store("ValidationsStore", validations_store_prefix),
        "evaluation_parameter_store": {"class_name": "EvaluationParameterStore"},
    }

    # Create a DataContext
    project_config = build_in_code_data_context_project_config(
        bucket=bucket,
        expectations_store_prefix=expectations_store_prefix,
        validations_store_prefix=validations_store_prefix,
        data_docs_store_prefix=data_docs_store_prefix,
        stores=stores,
    )
    _ = BaseDataContext(project_config=project_config)

    expected_warnings = (
        "Invalid store configuration: Please check the configuration of your TupleS3StoreBackend named expectations_S3_store",
        "Invalid store configuration: Please check the configuration of your TupleS3StoreBackend named validations_S3_store",
    )
    for warning in expected_warnings:
        assert caplog.messages.count(warning) == 1
def test_ConfigOnlyDataContext__initialization(tmp_path_factory, basic_data_context_config):
    """Root and plugins directories are derived from the supplied context root."""
    temp_dir_name = "test_ConfigOnlyDataContext__initialization__dir0"
    root_dir = str(
        tmp_path_factory.mktemp("test_ConfigOnlyDataContext__initialization__dir")
    )
    context = BaseDataContext(basic_data_context_config, root_dir)

    root_parts = context.root_directory.split("/")
    assert root_parts[-1] == temp_dir_name

    # The trailing slash on the plugins directory yields an empty last component.
    plugins_parts = context.plugins_directory.split("/")
    assert plugins_parts[-3:] == [temp_dir_name, "plugins", ""]
def test_dataset_from_pandas_source(tmpdir):
    """Datasets extracted from a pandas source carry the expected facets."""
    data_file = tmpdir + '/data.json'
    records = [
        {"name": "my name", "birthdate": "2020-10-01", "address": "1234 Main st", "size": 12},
        {"name": "your name", "birthdate": "2020-06-01", "address": "1313 Mockingbird Ln", "size": 12},
    ]
    with open(data_file, mode='w') as handle:
        json.dump(records, handle)

    # Point the module-level project config at filesystem stores under tmpdir.
    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name
    ctx = BaseDataContext(project_config=project_config)

    pd_dataset = PandasDataset(
        pandas.read_json(data_file),
        batch_kwargs={'path': 'gcs://my_bucket/path/to/my/data'},
        data_context=ctx,
    )
    action = OpenLineageValidationAction(
        ctx,
        openlineage_host='http://localhost:5000',
        openlineage_namespace='test_ns',
        job_name='test_job',
    )
    datasets = action._fetch_datasets_from_pandas_source(
        pd_dataset, validation_result_suite=result_suite
    )
    assert len(datasets) == 1
    input_ds = datasets[0]

    # Identity and data source facet.
    assert input_ds.name == '/path/to/my/data'
    assert input_ds.namespace == "gcs://my_bucket"
    assert "dataSource" in input_ds.facets
    data_source_facet = input_ds.facets["dataSource"]
    assert data_source_facet.name == "gcs://my_bucket"
    assert data_source_facet.uri == 'gcs://my_bucket'

    # Schema facet.
    assert 'schema' in input_ds.facets
    schema_fields = input_ds.facets['schema'].fields
    assert len(schema_fields) == 4
    expected_fields = [
        SchemaField('name', 'object'),
        SchemaField('birthdate', 'object'),
        SchemaField('address', 'object'),
        SchemaField('size', 'int64'),
    ]
    assert all(field in schema_fields for field in expected_fields)

    # Input facets.
    assert len(input_ds.inputFacets) == 3
    expected_facet_keys = ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics']
    assert all(key in input_ds.inputFacets for key in expected_facet_keys)
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assertions = input_ds.inputFacets['greatExpectations_assertions'].assertions
    assert len(assertions) == 2
    expected_assertions = [
        GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
        GreatExpectationsAssertion('expect_column_sum_to_be_between', True, 'size'),
    ]
    assert all(expected in assertions for expected in expected_assertions)
def test__normalize_absolute_or_relative_path(tmp_path_factory, basic_data_context_v013_config):
    """Relative paths are resolved under the context root; absolute paths are kept."""
    root_dir = str(
        tmp_path_factory.mktemp("test__normalize_absolute_or_relative_path__dir")
    )
    context = BaseDataContext(basic_data_context_v013_config, root_dir)

    # The temp directory name may carry a numeric suffix, hence the regex.
    relative_pattern = re.compile(
        os.path.join("^.*test__normalize_absolute_or_relative_path__dir\\d*", "yikes$")
    )
    normalized_relative = context._normalize_absolute_or_relative_path("yikes")
    assert relative_pattern.match(normalized_relative) is not None

    assert (
        "test__normalize_absolute_or_relative_path__dir"
        not in context._normalize_absolute_or_relative_path("/yikes")
    )
    assert "/yikes" == context._normalize_absolute_or_relative_path("/yikes")
def test_load_config_variables_file(basic_data_context_config, tmp_path_factory):
    """The config variables file path is resolved through an environment variable."""
    # Setup:
    base_path = str(tmp_path_factory.mktemp('test_load_config_variables_file'))
    uncommitted_dir = os.path.join(base_path, "uncommitted")
    safe_mmkdir(uncommitted_dir)

    dev_file = os.path.join(base_path, "uncommitted", "dev_variables.yml")
    with open(dev_file, "w") as outfile:
        yaml.dump({'env': 'dev'}, outfile)

    prod_file = os.path.join(base_path, "uncommitted", "prod_variables.yml")
    with open(prod_file, "w") as outfile:
        yaml.dump({'env': 'prod'}, outfile)

    basic_data_context_config["config_variables_file_path"] = (
        "uncommitted/${TEST_CONFIG_FILE_ENV}_variables.yml"
    )

    try:
        # We should be able to load different files based on an environment variable
        for environment in ("dev", "prod"):
            os.environ["TEST_CONFIG_FILE_ENV"] = environment
            context = BaseDataContext(
                basic_data_context_config, context_root_dir=base_path
            )
            config_vars = context._load_config_variables_file()
            assert config_vars['env'] == environment
    finally:
        # Make sure we unset the environment variable we're using
        del os.environ["TEST_CONFIG_FILE_ENV"]
def test_load_config_variables_file(basic_data_context_v013_config, tmp_path_factory, monkeypatch):
    """The config variables file path is resolved through an environment variable."""
    # Setup:
    base_path = str(tmp_path_factory.mktemp("test_load_config_variables_file"))
    uncommitted_dir = os.path.join(base_path, "uncommitted")
    os.makedirs(uncommitted_dir, exist_ok=True)

    dev_file = os.path.join(base_path, "uncommitted", "dev_variables.yml")
    with open(dev_file, "w") as outfile:
        yaml.dump({"env": "dev"}, outfile)

    prod_file = os.path.join(base_path, "uncommitted", "prod_variables.yml")
    with open(prod_file, "w") as outfile:
        yaml.dump({"env": "prod"}, outfile)

    basic_data_context_v013_config["config_variables_file_path"] = (
        "uncommitted/${TEST_CONFIG_FILE_ENV}_variables.yml"
    )

    try:
        # We should be able to load different files based on an environment variable
        for environment in ("dev", "prod"):
            monkeypatch.setenv("TEST_CONFIG_FILE_ENV", environment)
            context = BaseDataContext(
                basic_data_context_v013_config, context_root_dir=base_path
            )
            config_vars = context._load_config_variables_file()
            assert config_vars["env"] == environment
    finally:
        # Make sure we unset the environment variable we're using
        monkeypatch.delenv("TEST_CONFIG_FILE_ENV")
def test_DataContext_construct_data_context_id_uses_id_stored_in_DataContextConfig_if_no_configured_expectations_store(
    monkeypatch,
):
    """
    What does this test and why?

    A DataContext should have an id. This ID should come from either:
    1. configured expectations store store_backend_id
    2. great_expectations.yml
    3. new generated id from DataContextConfig

    This test verifies that DataContext._construct_data_context_id uses the
    data_context_id from DataContextConfig when there is no configured
    expectations store at instantiation time, and that the same id is used to
    configure the expectations_store.store_backend_id.
    """
    # Undo the project-wide test default
    monkeypatch.delenv("GE_USAGE_STATS", raising=False)

    bucket = "leakybucket"
    expectations_store_prefix = "expectations_store_prefix"
    validations_store_prefix = "validations_store_prefix"
    data_docs_store_prefix = "data_docs_store_prefix"
    manually_created_uuid = "00000000-0000-0000-0000-000000000eee"

    # Create a bucket in Moto's mock AWS environment
    s3 = boto3.resource("s3", region_name="us-east-1")
    s3.create_bucket(Bucket=bucket)

    # Create a DataContext (note NO existing expectations store already set up)
    project_config = build_in_code_data_context_project_config(
        bucket="leakybucket",
        expectations_store_prefix=expectations_store_prefix,
        validations_store_prefix=validations_store_prefix,
        data_docs_store_prefix=data_docs_store_prefix,
    )

    # Manually set the data_context_id in the project_config
    project_config.anonymous_usage_statistics.data_context_id = manually_created_uuid

    context = BaseDataContext(project_config=project_config)

    # Make sure the manually set data_context_id is propagated to all the appropriate places
    expectations_store = context.stores[context.expectations_store_name]
    assert (
        manually_created_uuid
        == context.data_context_id
        == expectations_store.store_backend_id
    )
def build_in_memory_runtime_context():
    """Return an in-memory context with pandas and spark runtime datasources."""

    def _runtime_datasource(engine_class_name):
        # Both datasources differ only in their execution engine class.
        return {
            "execution_engine": {
                "class_name": engine_class_name,
                "module_name": "great_expectations.execution_engine",
            },
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "data_connectors": {
                "runtime_data_connector": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": [
                        "id_key_0",
                        "id_key_1",
                    ],
                }
            },
        }

    datasources = {
        "pandas_datasource": _runtime_datasource("PandasExecutionEngine"),
        "spark_datasource": _runtime_datasource("SparkDFExecutionEngine"),
    }

    data_context_config: DataContextConfig = DataContextConfig(
        datasources=datasources,
        expectations_store_name="expectations_store",
        validations_store_name="validations_store",
        evaluation_parameter_store_name="evaluation_parameter_store",
        checkpoint_store_name="checkpoint_store",
        store_backend_defaults=InMemoryStoreBackendDefaults(),
    )

    return BaseDataContext(project_config=data_context_config)
def test_consistent_name_anonymization(
    in_memory_data_context_config_usage_stats_enabled
):
    """Operator names anonymize to a fixed value for a fixed data_context_id."""
    context = BaseDataContext(in_memory_data_context_config_usage_stats_enabled)
    assert context.data_context_id == "6a52bdfa-e182-455b-a825-e69f076e67d6"

    fake_asset = (
        {"__fake_batch_kwargs": "mydatasource"},
        "__fake_expectation_suite_name",
    )
    payload = run_validation_operator_usage_statistics(
        context,
        "action_list_operator",
        assets_to_validate=[fake_asset],
        run_id="foo",
    )

    # For a *specific* data_context_id, all names will be consistently anonymized
    expected_anonymized_name = "5bb011891aa7d41401e57759d5f5cb01"
    assert payload["anonymized_operator_name"] == expected_anonymized_name
def test_dataset_from_sql_source(test_db_file, tmpdir):
    """Datasets extracted from a SQL source carry the expected facets."""
    connection_url = f'sqlite:///{test_db_file}'
    engine = create_engine(connection_url)
    ds = SqlAlchemyDataset(table_name=TABLE_NAME, engine=engine)

    # Point the module-level project config at filesystem stores under tmpdir.
    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name
    ctx = BaseDataContext(project_config=project_config)

    action = OpenLineageValidationAction(
        ctx,
        openlineage_host='http://localhost:5000',
        openlineage_namespace='test_ns',
        job_name='test_job',
    )
    datasets = action._fetch_datasets_from_sql_source(ds, result_suite)
    assert datasets is not None
    assert len(datasets) == 1
    input_ds = datasets[0]

    # Identity and data source facet.
    assert input_ds.name == TABLE_NAME
    assert input_ds.namespace == "sqlite"
    assert "dataSource" in input_ds.facets
    data_source_facet = input_ds.facets["dataSource"]
    assert data_source_facet.name == "sqlite"
    assert data_source_facet.uri == "sqlite:/" + test_db_file

    # Schema facet.
    assert 'schema' in input_ds.facets
    schema_fields = input_ds.facets['schema'].fields
    assert len(schema_fields) == 4
    expected_fields = [
        SchemaField('name', 'TEXT'),
        SchemaField('birthdate', 'TEXT'),
        SchemaField('address', 'TEXT'),
        SchemaField('size', 'INTEGER'),
    ]
    assert all(field in schema_fields for field in expected_fields)

    # Input facets.
    assert len(input_ds.inputFacets) == 3
    expected_facet_keys = ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics']
    assert all(key in input_ds.inputFacets for key in expected_facet_keys)
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assertions = input_ds.inputFacets['greatExpectations_assertions'].assertions
    assert len(assertions) == 2
    expected_assertions = [
        GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
        GreatExpectationsAssertion('expect_column_sum_to_be_between', True, 'size'),
    ]
    assert all(expected in assertions for expected in expected_assertions)
def __init__(self, *, gcp_project, expectation_suite_name, gcs_bucket, gcs_expectations_prefix,
             gcs_validations_prefix, gcs_datadocs_prefix, query=None, table=None, bq_dataset_name,
             email_to, datadocs_domain='none', send_alert_email=True, datadocs_link_in_email=False,
             fail_task_on_validation_failure=True, bigquery_conn_id='bigquery_default', **kwargs):
    """Store the operator settings and initialise the parent with a fresh data context.

    All keyword-only arguments are saved on the instance under the same
    names, except ``expectation_suite_name``, which is only forwarded to the
    parent constructor. Remaining ``**kwargs`` are forwarded unchanged.
    """
    # Data location.
    self.query = query
    self.table = table
    self.bigquery_conn_id = bigquery_conn_id
    self.bq_dataset_name = bq_dataset_name

    # Alerting.
    self.email_to = email_to

    # GCP project and GCS locations.
    self.gcp_project = gcp_project
    self.gcs_bucket = gcs_bucket
    self.gcs_expectations_prefix = gcs_expectations_prefix
    self.gcs_validations_prefix = gcs_validations_prefix
    self.gcs_datadocs_prefix = gcs_datadocs_prefix

    # Data docs and failure handling.
    self.datadocs_domain = datadocs_domain
    self.send_alert_email = send_alert_email
    self.datadocs_link_in_email = datadocs_link_in_email
    self.fail_task_on_validation_failure = fail_task_on_validation_failure

    # Create a data context and batch_kwargs that will then be handed off to
    # the base operator to do the data validation against Expectations.
    context = BaseDataContext(project_config=self.create_data_context_config())
    batch_kwargs = self.get_batch_kwargs()

    # Call the parent constructor but override its default alerting behavior
    # by hard coding fail_task_on_validation_failure=False. We alert a little
    # differently than the parent class: an email is sent to the user and then
    # an Airflow exception is thrown whenever data doesn't match Expectations.
    super().__init__(
        data_context=context,
        batch_kwargs=batch_kwargs,
        expectation_suite_name=expectation_suite_name,
        fail_task_on_validation_failure=False,
        **kwargs,
    )
def validate_with_great_expectations(
    self,
    dataframe: TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
    expectation_suite: TypeVar("ge.core.ExpectationSuite"),  # noqa: F821
    ge_validate_kwargs: Optional[dict] = None,
):
    """Validate a Spark dataframe against an expectation suite.

    A throwaway in-memory data context is created, a Spark runtime
    datasource is added to it, the suite is saved on that context and a
    validator is built for the dataframe.

    Args:
        dataframe: Passed as ``batch_data`` in the runtime batch request.
        expectation_suite: Suite saved to the context and used by the validator.
        ge_validate_kwargs: Extra keyword arguments for ``validator.validate``;
            ``None`` means no extra arguments.

    Returns:
        The value returned by ``validator.validate``.
    """
    # NOTE: InMemoryStoreBackendDefaults SHOULD NOT BE USED in normal settings. You
    # may experience data loss as it persists nothing. It is used here for testing.
    # Please refer to docs to learn how to instantiate your DataContext.
    store_backend_defaults = InMemoryStoreBackendDefaults()
    data_context_config = DataContextConfig(
        store_backend_defaults=store_backend_defaults,
        checkpoint_store_name=store_backend_defaults.checkpoint_store_name,
    )
    context = BaseDataContext(project_config=data_context_config)

    datasource = {
        "name": "my_spark_dataframe",
        "class_name": "Datasource",
        "execution_engine": {
            "class_name": "SparkDFExecutionEngine",
            "force_reuse_spark_context": True,
        },
        "data_connectors": {
            "default_runtime_data_connector_name": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["batch_id"],
            }
        },
    }
    context.add_datasource(**datasource)

    # Here is a RuntimeBatchRequest using a dataframe
    batch_request = RuntimeBatchRequest(
        datasource_name="my_spark_dataframe",
        data_connector_name="default_runtime_data_connector_name",
        # This can be anything that identifies this data_asset for you
        data_asset_name="<YOUR_MEANGINGFUL_NAME>",
        batch_identifiers={"batch_id": "default_identifier"},
        runtime_parameters={"batch_data": dataframe},  # Your dataframe goes here
    )

    context.save_expectation_suite(expectation_suite)
    validator = context.get_validator(
        batch_request=batch_request,
        expectation_suite_name=expectation_suite.expectation_suite_name,
    )

    # The parameter is Optional, so guard against None: `**None` raises TypeError.
    validate_kwargs = ge_validate_kwargs if ge_validate_kwargs is not None else {}
    report = validator.validate(**validate_kwargs)
    return report