def _ge_context(self) -> Iterator[GEContext]:
    """Context manager yielding a :class:`GEContext` bound to a live connection.

    Opens a connection from ``self.base_engine``, builds an in-memory GE data
    context (no persistent stores, usage statistics disabled), registers a
    uniquely named SqlAlchemy datasource on it, and yields the pair. The
    connection is closed when the caller exits the ``with`` block around this
    generator.
    """
    with self.base_engine.connect() as conn:
        data_context = BaseDataContext(project_config=DataContextConfig(
            # The datasource will be added via add_datasource().
            datasources={},
            store_backend_defaults=InMemoryStoreBackendDefaults(),
            anonymous_usage_statistics={
                "enabled": False,
                # "data_context_id": <not set>,
            },
        ))
        # Unique suffix so repeated calls never collide on datasource name.
        datasource_name = f"{self._datasource_name_base}-{uuid.uuid4()}"
        datasource_config = DatasourceConfig(
            class_name="SqlAlchemyDatasource",
            credentials={
                # This isn't actually used since we pass the connection directly,
                # but GE parses it to change some of its behavior so it's useful
                # to emulate that here.
                "url": conn.engine.url,
            },
        )
        with _inject_connection_into_datasource(conn):
            # Using the add_datasource method ensures that the datasource is added to
            # GE-internal cache, which avoids problems when calling GE methods later on.
            assert data_context.add_datasource(
                datasource_name,
                initialize=True,
                **dict(datasourceConfigSchema.dump(datasource_config)),
            )
            assert data_context.get_datasource(datasource_name)
            yield GEContext(data_context, datasource_name)
def test_DataContextConfig_with_BaseStoreBackendDefaults_and_simple_defaults(
    construct_data_context_config, default_pandas_datasource_config
):
    """
    What does this test and why?
    Ensure that a very simple DataContextConfig setup with many defaults is created accurately
    and produces a valid DataContextConfig
    """
    data_context_config = DataContextConfig(
        datasources={
            "my_pandas_datasource": DatasourceConfig(
                class_name="PandasDatasource",
                batch_kwargs_generators={
                    "subdir_reader": {
                        "class_name": "SubdirReaderBatchKwargsGenerator",
                        "base_directory": "../data/",
                    }
                },
            )
        },
        store_backend_defaults=BaseStoreBackendDefaults(),
    )

    # The data_context_id is generated at construction time, so the expected
    # config is parameterized with it rather than a fixed value.
    desired_config = construct_data_context_config(
        data_context_id=data_context_config.anonymous_usage_statistics.data_context_id,
        datasources=default_pandas_datasource_config,
    )

    data_context_config_schema = DataContextConfigSchema()
    # The serialized form must match the expected config exactly, and the
    # config must also pass GE's own validation.
    assert data_context_config_schema.dump(data_context_config) == desired_config
    assert DataContext.validate_config(project_config=data_context_config)
def __init__(self, conn, report):
    """Wrap an existing SQLAlchemy connection in an in-memory GE data context.

    Args:
        conn: live SQLAlchemy connection; its engine URL is used to seed the
            datasource credentials (GE parses the URL even though the
            connection itself is injected directly).
        report: report object stored on the instance for later use.
    """
    self.conn = conn
    self.report = report

    data_context_config = DataContextConfig(
        datasources={
            self.datasource_name: DatasourceConfig(
                class_name="SqlAlchemyDatasource",
                credentials={
                    # This isn't actually used since we pass the connection directly,
                    # but GE parses it to change some of its behavior so it's useful
                    # to emulate that here.
                    "url": self.conn.engine.url,
                },
            )
        },
        store_backend_defaults=InMemoryStoreBackendDefaults(),
        anonymous_usage_statistics={
            "enabled": False,
            # "data_context_id": <not set>,
        },
    )

    # The helper patches GE datasource initialization so it reuses `conn`
    # instead of opening a new connection from the credentials URL.
    with _properly_init_datasource(self.conn):
        self.data_context = BaseDataContext(
            project_config=data_context_config)
def test_DataContextConfig_with_FilesystemStoreBackendDefaults_and_simple_defaults_no_root_directory(
    construct_data_context_config, default_pandas_datasource_config
):
    """
    What does this test and why?
    Ensure that a very simple DataContextConfig setup using FilesystemStoreBackendDefaults is created accurately
    This test does not set the optional root_directory parameter
    """
    data_context_config = DataContextConfig(
        datasources={
            "my_pandas_datasource": DatasourceConfig(
                class_name="PandasDatasource",
                batch_kwargs_generators={
                    "subdir_reader": {
                        "class_name": "SubdirReaderBatchKwargsGenerator",
                        "base_directory": "../data/",
                    }
                },
            )
        },
        store_backend_defaults=FilesystemStoreBackendDefaults(),
    )

    # Create desired config
    # Without root_directory, the filesystem defaults match the generic
    # expected config; only the generated data_context_id varies per run.
    data_context_id = data_context_config.anonymous_usage_statistics.data_context_id
    desired_config = construct_data_context_config(
        data_context_id=data_context_id, datasources=default_pandas_datasource_config
    )

    data_context_config_schema = DataContextConfigSchema()
    assert data_context_config_schema.dump(data_context_config) == desired_config
    assert DataContext.validate_config(project_config=data_context_config)
def _upgrade_configuration_automatically(self):
    """Apply the automatic checkpoint-store upgrade to the project config.

    No-op when the upgrade log says the checkpoint-store upgrade was skipped.
    Otherwise merges the "automatic" checklist entries (store configs and
    top-level store-name keys) into the commented config map, persists the
    result, and records what was added in ``self.upgrade_log``.
    """
    if not self.upgrade_log["skipped_checkpoint_store_upgrade"]:
        config_commented_map: CommentedMap = (
            self.data_context.get_config().commented_map)
        # Add/overwrite store configurations scheduled for automatic upgrade.
        for key, config in self.upgrade_checklist["automatic"][
                "stores"].items():
            config_commented_map["stores"][key] = config
        # Set top-level *_store_name keys (e.g. checkpoint_store_name).
        for key, value in self.upgrade_checklist["automatic"][
                "store_names"].items():
            config_commented_map[key] = value
        data_context_config: DataContextConfig = (
            DataContextConfig.from_commented_map(
                commented_map=config_commented_map))
        self.data_context.set_config(project_config=data_context_config)
        # Persist the updated configuration back to great_expectations.yml.
        self.data_context._save_project_config()
        # Record exactly what was added so the upgrade can be reported.
        checkpoint_log_entry = {
            "stores": {
                DataContextConfigDefaults.DEFAULT_CHECKPOINT_STORE_NAME.value:
                data_context_config.stores[
                    DataContextConfigDefaults.
                    DEFAULT_CHECKPOINT_STORE_NAME.value],
            },
            "checkpoint_store_name":
            data_context_config.checkpoint_store_name,
        }
        self.upgrade_log["added_checkpoint_store"].update(
            checkpoint_log_entry)
def test_substituted_config_variables_not_written_to_file(tmp_path_factory):
    """Re-saving a config full of substitution variables must write the raw
    ``${VAR}`` references back to disk, not the substituted values."""
    # this test uses a great_expectations.yml with almost all values replaced
    # with substitution variables
    project_path = str(tmp_path_factory.mktemp("data_context"))
    context_path = os.path.join(project_path, "great_expectations")
    asset_config_path = os.path.join(context_path, "expectations")
    create_data_context_files(
        context_path,
        asset_config_path,
        ge_config_fixture_filename=
        "great_expectations_basic_with_exhaustive_variables.yml",
        config_variables_fixture_filename="config_variables_exhaustive.yml",
    )

    # load ge config fixture for expected
    path_to_yml = (
        "../test_fixtures/great_expectations_basic_with_exhaustive_variables.yml"
    )
    path_to_yml = file_relative_path(__file__, path_to_yml)
    with open(path_to_yml) as data:
        config_dict = yaml.load(data)
    expected_config = DataContextConfig.from_commented_map(config_dict)
    expected_config_dict = dataContextConfigSchema.dump(expected_config)
    # data_context_id is generated per-context, so exclude usage statistics
    # from the comparison on both sides.
    expected_config_dict.pop("anonymous_usage_statistics")

    # instantiate data_context twice to go through cycle of loading config from file then saving
    context = ge.data_context.DataContext(context_path)
    context._save_project_config()
    context_config_dict = dataContextConfigSchema.dump(
        ge.data_context.DataContext(context_path)._project_config)
    context_config_dict.pop("anonymous_usage_statistics")

    assert context_config_dict == expected_config_dict
def _retrieve_data_context_config_from_ge_cloud(self) -> DataContextConfig:
    """
    Utilizes the GeCloudConfig instantiated in the constructor to create a request to the Cloud API.
    Given proper authorization, the request retrieves a data context config that is pre-populated with
    GE objects specific to the user's Cloud environment (datasources, data connectors, etc).

    Please note that substitution for ${VAR} variables is performed in GE Cloud before being sent
    over the wire.

    :return: the configuration object retrieved from the Cloud API
    :raises ge_exceptions.GeCloudError: on any non-200 response from GE Cloud.
    """
    ge_cloud_url = (
        f"{self.ge_cloud_config.base_url}/organizations/"
        f"{self.ge_cloud_config.organization_id}/data-context-configuration"
    )
    auth_headers = {
        "Content-Type": "application/vnd.api+json",
        "Authorization": f"Bearer {self.ge_cloud_config.access_token}",
    }

    # An explicit timeout is required: requests.get() without one can block
    # indefinitely if the Cloud endpoint hangs.
    response = requests.get(ge_cloud_url, headers=auth_headers, timeout=60)
    if response.status_code != 200:
        raise ge_exceptions.GeCloudError(
            f"Bad request made to GE Cloud; {response.text}")

    config = response.json()
    return DataContextConfig(**config)
def basic_data_context_config():
    """Fixture: a minimal config_version-1 DataContextConfig with
    filesystem-backed expectations, an in-memory-style evaluation parameter
    store, and a default (empty) action-list validation operator."""
    store_configs = {
        "expectations_store": {
            "class_name": "ExpectationsStore",
            "store_backend": {
                "class_name": "TupleFilesystemStoreBackend",
                "base_directory": "expectations/",
            },
        },
        "evaluation_parameter_store": {
            "module_name": "great_expectations.data_context.store",
            "class_name": "EvaluationParameterStore",
        },
    }
    operator_configs = {
        "default": {
            "class_name": "ActionListValidationOperator",
            "action_list": [],
        }
    }
    return DataContextConfig(
        commented_map={},
        config_version=1,
        plugins_directory="plugins/",
        evaluation_parameter_store_name="evaluation_parameter_store",
        validations_store_name="does_not_have_to_be_real",
        expectations_store_name="expectations_store",
        config_variables_file_path="uncommitted/config_variables.yml",
        datasources={},
        stores=store_configs,
        data_docs_sites={},
        validation_operators=operator_configs,
    )
def in_memory_data_context_config_usage_stats_enabled():
    """Fixture: a config_version-1 DataContextConfig using purely in-memory
    stores, with anonymous usage statistics enabled and pinned to a fixed
    data_context_id and the QA statistics endpoint."""
    return DataContextConfig(
        **{
            "commented_map": {},
            "config_version": 1,
            "plugins_directory": None,
            "evaluation_parameter_store_name": "evaluation_parameter_store",
            "validations_store_name": "validations_store",
            "expectations_store_name": "expectations_store",
            "config_variables_file_path": None,
            "datasources": {},
            "stores": {
                "expectations_store": {
                    "class_name": "ExpectationsStore",
                },
                "validations_store": {
                    "class_name": "ValidationsStore",
                },
                "evaluation_parameter_store": {
                    "class_name": "EvaluationParameterStore",
                },
            },
            "data_docs_sites": {},
            "validation_operators": {
                "default": {
                    "class_name": "ActionListValidationOperator",
                    "action_list": []
                }
            },
            "anonymous_usage_statistics": {
                "enabled": True,
                "data_context_id": "6a52bdfa-e182-455b-a825-e69f076e67d6",
                "usage_statistics_url": USAGE_STATISTICS_QA_URL
            }
        })
def create_data_context_config(self):
    """Build a DataContextConfig for a BigQuery datasource backed by GCS stores.

    The BigQuery credentials key path is read from the Airflow connection
    identified by ``self.bigquery_conn_id``; validation/expectation/data-docs
    stores are placed in the GCS bucket configured on the instance.

    Returns:
        DataContextConfig: the assembled Great Expectations project config.
    """
    # Get the credentials information for the BigQuery data source from the
    # BigQuery Airflow connection.
    conn = BaseHook.get_connection(self.bigquery_conn_id)
    connection_json = conn.extra_dejson
    credentials_path = connection_json[
        'extra__google_cloud_platform__key_path']

    # An f-string is clearer and less error-prone than the chained `+`
    # concatenation this replaced; the resulting URL is identical.
    bigquery_url = (
        f"bigquery://{self.gcp_project}/{self.bq_dataset_name}"
        f"?credentials_path={credentials_path}")

    data_context_config = DataContextConfig(
        config_version=2,
        datasources={
            "bq_datasource": {
                "credentials": {
                    "url": bigquery_url
                },
                "class_name": "SqlAlchemyDatasource",
                "module_name": "great_expectations.datasource",
                "data_asset_type": {
                    "module_name": "great_expectations.dataset",
                    "class_name": "SqlAlchemyDataset"
                }
            }
        },
        store_backend_defaults=GCSStoreBackendDefaults(
            default_bucket_name=self.gcs_bucket,
            default_project_name=self.gcp_project,
            validations_store_prefix=self.gcs_validations_prefix,
            expectations_store_prefix=self.gcs_expectations_prefix,
            data_docs_prefix=self.gcs_datadocs_prefix,
        ),
    )
    return data_context_config
def basic_data_context_config_for_validation_operator():
    """Fixture: a config_version-1 DataContextConfig with simple stores and
    two validation operators — the standard action-list operator and the
    warning/failure expectation-suite operator — both of which store
    validation results and extract evaluation parameters."""
    return DataContextConfig(
        config_version=1,
        plugins_directory=None,
        evaluation_parameter_store_name="evaluation_parameter_store",
        expectations_store_name="expectations_store",
        datasources={},
        stores={
            "expectations_store": {
                "class_name": "ExpectationsStore"
            },
            "evaluation_parameter_store": {
                "class_name": "EvaluationParameterStore"
            },
            "validation_result_store": {
                "class_name": "ValidationsStore"
            },
            "metrics_store": {
                "class_name": "MetricStore"
            }
        },
        validations_store_name="validation_result_store",
        data_docs_sites={},
        validation_operators={
            "store_val_res_and_extract_eval_params": {
                "class_name": "ActionListValidationOperator",
                "action_list": [{
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                        "target_store_name": "validation_result_store",
                    }
                }, {
                    "name": "extract_and_store_eval_parameters",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction",
                        "target_store_name": "evaluation_parameter_store",
                    }
                }]
            },
            "errors_and_warnings_validation_operator": {
                "class_name": "WarningAndFailureExpectationSuitesValidationOperator",
                "action_list": [{
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                        "target_store_name": "validation_result_store",
                    }
                }, {
                    "name": "extract_and_store_eval_parameters",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction",
                        "target_store_name": "evaluation_parameter_store",
                    }
                }]
            }
        })
def test_data_context_concurrency_property():
    """A `concurrency` setting supplied via the project config must be
    exposed on the resulting data context."""
    project_config = DataContextConfig(
        concurrency=ConcurrencyConfig(enabled=True),
        store_backend_defaults=InMemoryStoreBackendDefaults(),
    )
    context = BaseDataContext(project_config=project_config)
    assert context.concurrency.enabled
def test_in_memory_data_context_configuration(
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """An in-memory BaseDataContext rebuilt from another context's dumped
    config must be able to fetch the Titanic asset and pass basic table
    expectations."""
    project_config_dict: dict = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled.get_config(
        mode=ConfigOutputModes.DICT)

    # The fixture's plugins directory is not available here; also install a
    # standard action-list validation operator for this context.
    project_config_dict["plugins_directory"] = None
    project_config_dict["validation_operators"] = {
        "action_list_operator": {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction"
                    },
                },
                {
                    "name": "store_evaluation_params",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction"
                    },
                },
                {
                    "name": "update_data_docs",
                    "action": {
                        "class_name": "UpdateDataDocsAction"
                    },
                },
            ],
        }
    }

    # Roundtrip through schema validation to remove any illegal fields add/or restore any missing fields.
    project_config_dict = dataContextConfigSchema.dump(project_config_dict)
    project_config_dict = dataContextConfigSchema.load(project_config_dict)

    project_config: DataContextConfig = DataContextConfig(
        **project_config_dict)
    data_context = BaseDataContext(
        project_config=project_config,
        context_root_dir=
        titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled
        .root_directory,
    )

    my_validator: Validator = data_context.get_validator(
        datasource_name="my_datasource",
        data_connector_name="my_basic_data_connector",
        data_asset_name="Titanic_1912",
        create_expectation_suite_with_name="my_test_titanic_expectation_suite",
    )

    # Known row/column counts of the Titanic_1912 fixture dataset.
    assert my_validator.expect_table_row_count_to_equal(1313)["success"]
    assert my_validator.expect_table_column_count_to_equal(7)["success"]
def test_DataContextConfig_with_FilesystemStoreBackendDefaults_and_simple_defaults(
    construct_data_context_config, default_pandas_datasource_config
):
    """
    What does this test and why?
    Ensure that a very simple DataContextConfig setup using FilesystemStoreBackendDefaults is created accurately
    This test sets the root_dir parameter
    """

    test_root_directory = "test_root_dir"

    store_backend_defaults = FilesystemStoreBackendDefaults(
        root_directory=test_root_directory
    )
    data_context_config = DataContextConfig(
        datasources={
            "my_pandas_datasource": DatasourceConfig(
                class_name="PandasDatasource",
                batch_kwargs_generators={
                    "subdir_reader": {
                        "class_name": "SubdirReaderBatchKwargsGenerator",
                        "base_directory": "../data/",
                    }
                },
            )
        },
        store_backend_defaults=store_backend_defaults,
    )

    # Create desired config
    data_context_id = data_context_config.anonymous_usage_statistics.data_context_id
    desired_config = construct_data_context_config(
        data_context_id=data_context_id, datasources=default_pandas_datasource_config
    )

    # Add root_directory to stores and data_docs
    # With root_directory set, every filesystem store backend (and the local
    # data docs site) is expected to carry that root_directory entry.
    desired_config["stores"][desired_config["expectations_store_name"]][
        "store_backend"
    ]["root_directory"] = test_root_directory
    desired_config["stores"][desired_config["validations_store_name"]]["store_backend"][
        "root_directory"
    ] = test_root_directory
    desired_config["stores"][desired_config["checkpoint_store_name"]]["store_backend"][
        "root_directory"
    ] = test_root_directory
    desired_config["data_docs_sites"]["local_site"]["store_backend"][
        "root_directory"
    ] = test_root_directory

    data_context_config_schema = DataContextConfigSchema()
    # filter_properties_dict drops null/empty entries so both sides compare
    # on meaningful keys only.
    assert filter_properties_dict(
        properties=data_context_config_schema.dump(data_context_config)
    ) == filter_properties_dict(properties=desired_config)
    assert DataContext.validate_config(project_config=data_context_config)
def test_in_memory_data_context_configuration(
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store,
):
    """An in-memory BaseDataContext rebuilt from another context's dumped
    config must be able to fetch the Titanic asset and pass basic table
    expectations."""
    project_config_dict: dict = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store.get_config(
        mode="dict")

    # The fixture's plugins directory is not available here; also install a
    # standard action-list validation operator for this context.
    project_config_dict["plugins_directory"] = None
    project_config_dict["validation_operators"] = {
        "action_list_operator": {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction"
                    },
                },
                {
                    "name": "store_evaluation_params",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction"
                    },
                },
                {
                    "name": "update_data_docs",
                    "action": {
                        "class_name": "UpdateDataDocsAction"
                    },
                },
            ],
        }
    }

    project_config: DataContextConfig = DataContextConfig(
        **project_config_dict)
    data_context = BaseDataContext(
        project_config=project_config,
        context_root_dir=
        titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store
        .root_directory,
    )

    my_validator: Validator = data_context.get_validator(
        datasource_name="my_datasource",
        data_connector_name="my_basic_data_connector",
        data_asset_name="Titanic_1912",
        create_expectation_suite_with_name="my_test_titanic_expectation_suite",
    )

    # Known row/column counts of the Titanic_1912 fixture dataset.
    assert my_validator.expect_table_row_count_to_equal(1313)["success"]
    assert my_validator.expect_table_column_count_to_equal(7)["success"]
def validate_with_great_expectations(
    self,
    dataframe: TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
    expectation_suite: TypeVar("ge.core.ExpectationSuite"),  # noqa: F821
    ge_validate_kwargs: Optional[dict],
):
    """Validate a Spark dataframe against a Great Expectations suite.

    Builds a throwaway in-memory GE context with a Spark runtime datasource,
    feeds `dataframe` in via a RuntimeBatchRequest, and runs validation.

    Args:
        dataframe: the pyspark DataFrame to validate.
        expectation_suite: the GE expectation suite to validate against.
        ge_validate_kwargs: extra keyword arguments forwarded to
            ``validator.validate()``.

    Returns:
        The GE validation report returned by ``validator.validate``.
    """
    # NOTE: InMemoryStoreBackendDefaults SHOULD NOT BE USED in normal settings. You
    # may experience data loss as it persists nothing. It is used here for testing.
    # Please refer to docs to learn how to instantiate your DataContext.
    store_backend_defaults = InMemoryStoreBackendDefaults()
    data_context_config = DataContextConfig(
        store_backend_defaults=store_backend_defaults,
        checkpoint_store_name=store_backend_defaults.checkpoint_store_name,
    )
    context = BaseDataContext(project_config=data_context_config)
    datasource = {
        "name": "my_spark_dataframe",
        "class_name": "Datasource",
        "execution_engine": {
            "class_name": "SparkDFExecutionEngine",
            # Reuse the already-running Spark session instead of starting one.
            "force_reuse_spark_context": True,
        },
        "data_connectors": {
            "default_runtime_data_connector_name": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["batch_id"],
            }
        },
    }
    context.add_datasource(**datasource)
    # Here is a RuntimeBatchRequest using a dataframe
    batch_request = RuntimeBatchRequest(
        datasource_name="my_spark_dataframe",
        data_connector_name="default_runtime_data_connector_name",
        data_asset_name=
        "<YOUR_MEANGINGFUL_NAME>",  # This can be anything that identifies this data_asset for you
        batch_identifiers={"batch_id": "default_identifier"},
        runtime_parameters={"batch_data": dataframe},  # Your dataframe goes here
    )
    context.save_expectation_suite(expectation_suite)
    validator = context.get_validator(
        batch_request=batch_request,
        expectation_suite_name=expectation_suite.expectation_suite_name,
    )
    report = validator.validate(**ge_validate_kwargs)
    return report
def build_context(self):
    """
    Purpose: Create a dataContext and datasource and add to object
    Returns: saves dataContext and datasource to self
    """
    self.context = ge.get_context()

    # create datasource configuration
    datasource_config = {
        "name": "example_datasource",
        "class_name": "Datasource",
        "module_name": "great_expectations.datasource",
        "execution_engine": {
            "module_name": "great_expectations.execution_engine",
            "class_name": "PandasExecutionEngine",
        },
        "data_connectors": {
            "default_runtime_data_connector_name": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["default_identifier_name"],
            },
        },
    }

    # create data context configuration
    data_context_config = DataContextConfig(
        datasources={
            "pandas": DatasourceConfig(
                class_name="Datasource",
                execution_engine={
                    "class_name": "PandasExecutionEngine"
                },
                data_connectors={
                    "default_runtime_data_connector_name": {
                        "class_name": "RuntimeDataConnector",
                        "batch_identifiers": ["default_identifier_name"],
                    }
                },
            )
        },
        store_backend_defaults=FilesystemStoreBackendDefaults(
            root_directory=os.path.join(os.getcwd(), 'great_expectations')),
    )

    # build context and add data source
    # NOTE(review): this overwrites the context obtained from ge.get_context()
    # above — the earlier assignment appears redundant; confirm intent.
    self.context = BaseDataContext(project_config=data_context_config)
    # self.context.test_yaml_config(yaml.dump(datasource_config))
    self.context.add_datasource(**datasource_config)
def build_in_memory_runtime_context():
    """Create an in-memory BaseDataContext preconfigured with a pandas and a
    Spark datasource, each exposing a RuntimeDataConnector keyed on
    id_key_0/id_key_1 batch identifiers."""

    def _runtime_datasource(engine_class_name):
        # Both datasources are identical except for the execution engine.
        return {
            "execution_engine": {
                "class_name": engine_class_name,
                "module_name": "great_expectations.execution_engine",
            },
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "data_connectors": {
                "runtime_data_connector": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": [
                        "id_key_0",
                        "id_key_1",
                    ],
                }
            },
        }

    data_context_config: DataContextConfig = DataContextConfig(
        datasources={
            "pandas_datasource": _runtime_datasource("PandasExecutionEngine"),
            "spark_datasource": _runtime_datasource("SparkDFExecutionEngine"),
        },
        expectations_store_name="expectations_store",
        validations_store_name="validations_store",
        evaluation_parameter_store_name="evaluation_parameter_store",
        checkpoint_store_name="checkpoint_store",
        store_backend_defaults=InMemoryStoreBackendDefaults(),
    )

    context: BaseDataContext = BaseDataContext(
        project_config=data_context_config)
    return context
def test_data_context_variables_get_with_substitutions(
    data_context_config_dict: dict,
) -> None:
    """A config value written as `$ENV_VAR` must resolve through the
    variables layer to the value supplied in `substitutions`."""
    env_var_name: str = "MY_CONFIG_VERSION"
    resolved_value: float = 7.0

    # Store a $VAR reference in place of a literal config_version.
    data_context_config_dict[DataContextVariableSchema.CONFIG_VERSION] = (
        f"${env_var_name}")
    config: DataContextConfig = DataContextConfig(**data_context_config_dict)

    variables: DataContextVariables = EphemeralDataContextVariables(
        config=config,
        substitutions={env_var_name: resolved_value},
    )

    assert variables.config_version == resolved_value
def _upgrade_configuration(self):
    """Merge the upgrade checklist into the project config and persist it.

    Returns immediately when the upgrade log marks the upgrade as skipped;
    otherwise updates the commented config map in place, rebuilds the
    DataContextConfig, saves it, and records the upgrade in the log.
    """
    if self.upgrade_log["skipped_upgrade"]:
        return
    config_commented_map: CommentedMap = (
        self.data_context.get_config().commented_map)
    for name, value in self.upgrade_checklist.items():
        if isinstance(value, dict):
            # Nested section (e.g. "stores"): merge keys individually so
            # existing entries not in the checklist survive.
            for key, config in value.items():
                config_commented_map[name][key] = config
        else:
            config_commented_map[name] = value
    data_context_config: DataContextConfig = DataContextConfig.from_commented_map(
        commented_map=config_commented_map)
    self.data_context.set_config(project_config=data_context_config)
    # noinspection PyProtectedMember
    self.data_context._save_project_config()
    self._update_upgrade_log()
def _get_ge_context_local(ge_project_path: str) -> BaseDataContext:
    """
    This is configured to work with an in-memory pandas DataFrame. This setup allows us to run validations
    before (perhaps unnecessarily) writing any data to disk, as well as at any other stage.

    Currently using local storage.

    Args:
        ge_project_path (str): The path to the Great Expectations project,
        eg. `/home/viadot/my_flow`. Expectation suites need to be placed inside the
        `expectations` folder, eg. `/home/viadot/my_flow/expectations/failure.json`.

    Returns:
        BaseDataContext: The GE context (ie. config) required to run the validations.
    """
    data_context_config = DataContextConfig(
        datasources={
            "pandas": DatasourceConfig(
                class_name="PandasDatasource",
                batch_kwargs_generators={},  # override the CSV default
            )
        },
        # Stores and data docs live under ge_project_path on local disk.
        store_backend_defaults=FilesystemStoreBackendDefaults(ge_project_path),
        # Standard operator: store results, extract eval params, refresh docs.
        validation_operators={
            "action_list_operator": {
                "class_name": "ActionListValidationOperator",
                "action_list": [
                    {
                        "name": "store_validation_result",
                        "action": {"class_name": "StoreValidationResultAction"},
                    },
                    {
                        "name": "store_evaluation_params",
                        "action": {"class_name": "StoreEvaluationParametersAction"},
                    },
                    {
                        "name": "update_data_docs",
                        "action": {"class_name": "UpdateDataDocsAction"},
                    },
                ],
            }
        },
    )
    context = BaseDataContext(project_config=data_context_config)
    return context
def _set(self, key: Tuple[str, ...], value: Any, **kwargs: dict) -> None:
    """Write `value` into the data context config at the location named by `key`.

    The key is decomposed into a resource type and an optional resource name.
    Three cases are handled:
      * the whole config (ALL_VARIABLES): `value` is YAML text that replaces
        the entire DataContextConfig;
      * a named resource (e.g. a specific datasource): the entry is set
        inside its section;
      * a whole section: the section is replaced.
    Changes are persisted via `_save_changes()`.
    """
    (
        resource_type,
        resource_name,
    ) = InlineStoreBackend._determine_resource_type_and_name_from_key(key)

    project_config: DataContextConfig = self._data_context.config

    if resource_type is DataContextVariableSchema.ALL_VARIABLES:
        # `value` arrives as YAML text in this case; parse it into a full
        # config object before swapping it in.
        config_commented_map_from_yaml = yaml.load(value)
        value = DataContextConfig.from_commented_map(
            commented_map=config_commented_map_from_yaml)
        self._data_context.set_config(value)
    elif resource_name is not None:
        project_config[resource_type][resource_name] = value
    else:
        project_config[resource_type] = value

    self._save_changes()
def test_file_data_context_variables_e2e( monkeypatch, file_data_context: FileDataContext, progress_bars: ProgressBarsConfig) -> None: """ What does this test do and why? Tests the E2E workflow with a FileDataContextVariables instance. 1. User updates certain values and sets them as attributes. 2. User persists changes utilizing the save_config call defined by the Variables API. 3. Upon reading the result config from disk, we can confirm that changes were appropriately persisted. It is also important to note that in the case of $VARS syntax, we NEVER want to persist the underlying value in order to preserve sensitive information. """ # Prepare updated progress bars to set and serialize to disk updated_progress_bars: ProgressBarsConfig = copy.deepcopy(progress_bars) updated_progress_bars.globally = False updated_progress_bars.profilers = True # Prepare updated plugins directory to set and serialize to disk (ensuring we hide the true value behind $VARS syntax) env_var_name: str = "MY_PLUGINS_DIRECTORY" value_associated_with_env_var: str = "foo/bar/baz" monkeypatch.setenv(env_var_name, value_associated_with_env_var) # Set attributes defined above file_data_context.variables.progress_bars = updated_progress_bars file_data_context.variables.plugins_directory = f"${env_var_name}" file_data_context.variables.save_config() # Review great_expectations.yml where values were written and confirm changes config_filepath = pathlib.Path(file_data_context.root_directory).joinpath( file_data_context.GE_YML) with open(config_filepath) as f: contents: dict = yaml.load(f) config_saved_to_disk: DataContextConfig = DataContextConfig(**contents) assert config_saved_to_disk.progress_bars == updated_progress_bars.to_dict( ) assert (file_data_context.variables.plugins_directory == value_associated_with_env_var) assert config_saved_to_disk.plugins_directory == f"${env_var_name}"
def _load_project_config(self):
    """
    Reads the project configuration from the project configuration file.
    The file may contain ${SOME_VARIABLE} variables - see self.project_config_with_variables_substituted
    for how these are substituted.

    For Data Contexts in GE Cloud mode, a user-specific template is retrieved from the Cloud API
    - see self._retrieve_data_context_config_from_ge_cloud for more details.

    :return: the configuration object read from the file or template
    :raises ge_exceptions.InvalidConfigurationYamlError: on malformed YAML or
        duplicate keys in the project file.
    :raises ge_exceptions.ConfigNotFoundError: when the file cannot be opened.
    """
    if self.ge_cloud_mode:
        config = self._retrieve_data_context_config_from_ge_cloud()
        return config

    path_to_yml = os.path.join(self._context_root_directory, self.GE_YML)
    try:
        with open(path_to_yml) as data:
            config_commented_map_from_yaml = yaml.load(data)

    except YAMLError as err:
        raise ge_exceptions.InvalidConfigurationYamlError(
            "Your configuration file is not a valid yml file likely due to a yml syntax error:\n\n{}".format(
                err
            )
        )
    except DuplicateKeyError:
        raise ge_exceptions.InvalidConfigurationYamlError(
            "Error: duplicate key found in project YAML file."
        )
    except OSError:
        raise ge_exceptions.ConfigNotFoundError()

    try:
        return DataContextConfig.from_commented_map(
            commented_map=config_commented_map_from_yaml
        )
    except ge_exceptions.InvalidDataContextConfigError:
        # Just to be explicit about what we intended to catch
        raise
def basic_data_context_v013_config():
    """Fixture: a config_version-3 DataContextConfig with filesystem-backed
    expectations and checkpoint stores, usage statistics enabled against the
    QA endpoint, and a fixed data_context_id."""
    return DataContextConfig(
        **{
            "commented_map": {},
            "config_version": 3,
            "plugins_directory": "plugins/",
            "evaluation_parameter_store_name": "evaluation_parameter_store",
            "validations_store_name": "does_not_have_to_be_real",
            "expectations_store_name": "expectations_store",
            "checkpoint_store_name": "checkpoint_store",
            "config_variables_file_path": "uncommitted/config_variables.yml",
            "datasources": {},
            "stores": {
                "expectations_store": {
                    "class_name": "ExpectationsStore",
                    "store_backend": {
                        "class_name": "TupleFilesystemStoreBackend",
                        "base_directory": "expectations/",
                    },
                },
                "evaluation_parameter_store": {
                    "module_name": "great_expectations.data_context.store",
                    "class_name": "EvaluationParameterStore",
                },
                "checkpoint_store": {
                    "class_name": "CheckpointStore",
                    "store_backend": {
                        "class_name": "TupleFilesystemStoreBackend",
                        "base_directory": "checkpoints/",
                    },
                },
            },
            "data_docs_sites": {},
            "anonymous_usage_statistics": {
                "enabled": True,
                "data_context_id": "6a52bdfa-e182-455b-a825-e69f076e67d6",
                "usage_statistics_url": USAGE_STATISTICS_QA_URL,
            },
        }
    )
def in_memory_data_context_config_usage_stats_enabled():
    """Fixture: a config_version-3 DataContextConfig using purely in-memory
    stores (including a checkpoint store), with anonymous usage statistics
    enabled against the module-level DATA_CONTEXT_ID / USAGE_STATISTICS_URL."""
    return DataContextConfig(
        **{
            "commented_map": {},
            "config_version": 3,
            "plugins_directory": None,
            "evaluation_parameter_store_name": "evaluation_parameter_store",
            "validations_store_name": "validations_store",
            "expectations_store_name": "expectations_store",
            "checkpoint_store_name": "checkpoints_store",
            "config_variables_file_path": None,
            "datasources": {},
            "stores": {
                "expectations_store": {
                    "class_name": "ExpectationsStore",
                },
                "validations_store": {
                    "class_name": "ValidationsStore",
                },
                "checkpoints_store": {
                    "class_name": "CheckpointStore",
                },
                "evaluation_parameter_store": {
                    "class_name": "EvaluationParameterStore",
                },
            },
            "data_docs_sites": {},
            "validation_operators": {
                "default": {
                    "class_name": "ActionListValidationOperator",
                    "action_list": [],
                }
            },
            "anonymous_usage_statistics": {
                "enabled": True,
                "data_context_id": DATA_CONTEXT_ID,
                "usage_statistics_url": USAGE_STATISTICS_URL,
            },
        })
def basic_in_memory_data_context_config_just_stores():
    """Fixture: a config_version-3 DataContextConfig that defines only the
    three core stores — no datasources, data docs sites, or validation
    operators."""
    store_configs = {
        "expectations_store": {
            "class_name": "ExpectationsStore"
        },
        "evaluation_parameter_store": {
            "class_name": "EvaluationParameterStore"
        },
        "validation_result_store": {
            "class_name": "ValidationsStore"
        },
    }
    return DataContextConfig(
        config_version=3.0,
        plugins_directory=None,
        evaluation_parameter_store_name="evaluation_parameter_store",
        expectations_store_name="expectations_store",
        datasources={},
        stores=store_configs,
        validations_store_name="validation_result_store",
        data_docs_sites={},
        validation_operators={},
    )
def get_config_with_variables_substituted(
        self,
        config: Optional[DataContextConfig] = None) -> DataContextConfig:
    """
    Substitute vars in config of form ${var} or $(var) with values found in the following places,
    in order of precedence: ge_cloud_config (for Data Contexts in GE Cloud mode), runtime_environment,
    environment variables, config_variables, or ge_cloud_config_variable_defaults (allows certain variables to
    be optional in GE Cloud mode).
    """
    if not config:
        config = self.config

    substitutions: dict = self._determine_substitutions()

    # Lowest-precedence fallbacks: applied only when no other source
    # provided a value for these GE Cloud variables.
    ge_cloud_config_variable_defaults = {
        "plugins_directory": self._normalize_absolute_or_relative_path(
            path=DataContextConfigDefaults.DEFAULT_PLUGINS_DIRECTORY.value
        ),
        "usage_statistics_url": DEFAULT_USAGE_STATISTICS_URL,
    }
    for config_variable, value in ge_cloud_config_variable_defaults.items(
    ):
        if substitutions.get(config_variable) is None:
            logger.info(
                f'Config variable "{config_variable}" was not found in environment or global config ('
                f'{self.GLOBAL_CONFIG_PATHS}). Using default value "{value}" instead. If you would '
                f"like to "
                f"use a different value, please specify it in an environment variable or in a "
                f"great_expectations.conf file located at one of the above paths, in a section named "
                f'"ge_cloud_config".')
            substitutions[config_variable] = value

    return DataContextConfig(**substitute_all_config_variables(
        config, substitutions, self.DOLLAR_SIGN_ESCAPE_STRING))
def test_override_general_defaults(
    construct_data_context_config,
    default_pandas_datasource_config,
    default_spark_datasource_config,
):
    """
    What does this test and why?
    A DataContextConfig should be able to be created by passing items into the
    constructor that override any defaults.
    It should also be able to handle multiple datasources, even if they are
    configured with a dictionary or a DatasourceConfig.
    """
    # Build a config that overrides essentially every default: version,
    # directories, datasources (one plain dict, one DatasourceConfig),
    # stores, store names, data-docs sites, and validation operators.
    data_context_config = DataContextConfig(
        config_version=999,
        plugins_directory="custom_plugins_directory",
        config_variables_file_path="custom_config_variables_file_path",
        datasources={
            # Datasource supplied as a raw dictionary.
            "my_spark_datasource": {
                "data_asset_type": {
                    "class_name": "SparkDFDataset",
                    "module_name": "great_expectations.dataset",
                },
                "class_name": "SparkDFDatasource",
                "module_name": "great_expectations.datasource",
                "batch_kwargs_generators": {},
            },
            # Datasource supplied as a DatasourceConfig object.
            "my_pandas_datasource": DatasourceConfig(
                class_name="PandasDatasource",
                batch_kwargs_generators={
                    "subdir_reader": {
                        "class_name": "SubdirReaderBatchKwargsGenerator",
                        "base_directory": "../data/",
                    }
                },
            ),
        },
        stores={
            "expectations_S3_store": {
                "class_name": "ExpectationsStore",
                "store_backend": {
                    "class_name": "TupleS3StoreBackend",
                    "bucket": "REPLACE_ME",
                    "prefix": "REPLACE_ME",
                },
            },
            "expectations_S3_store2": {
                "class_name": "ExpectationsStore",
                "store_backend": {
                    "class_name": "TupleS3StoreBackend",
                    "bucket": "REPLACE_ME",
                    "prefix": "REPLACE_ME",
                },
            },
            "validations_S3_store": {
                "class_name": "ValidationsStore",
                "store_backend": {
                    "class_name": "TupleS3StoreBackend",
                    "bucket": "REPLACE_ME",
                    "prefix": "REPLACE_ME",
                },
            },
            "validations_S3_store2": {
                "class_name": "ValidationsStore",
                "store_backend": {
                    "class_name": "TupleS3StoreBackend",
                    "bucket": "REPLACE_ME",
                    "prefix": "REPLACE_ME",
                },
            },
            "custom_evaluation_parameter_store": {
                "class_name": "EvaluationParameterStore"
            },
        },
        expectations_store_name="custom_expectations_store_name",
        validations_store_name="custom_validations_store_name",
        evaluation_parameter_store_name="custom_evaluation_parameter_store_name",
        data_docs_sites={
            "s3_site": {
                "class_name": "SiteBuilder",
                "store_backend": {
                    "class_name": "TupleS3StoreBackend",
                    "bucket": "REPLACE_ME",
                },
                "site_index_builder": {
                    "class_name": "DefaultSiteIndexBuilder",
                    "show_cta_footer": True,
                },
            },
            "local_site": {
                "class_name": "SiteBuilder",
                "show_how_to_buttons": True,
                "site_index_builder": {
                    "class_name": "DefaultSiteIndexBuilder",
                    "show_cta_footer": True,
                },
                "store_backend": {
                    "base_directory": "uncommitted/data_docs/local_site/",
                    "class_name": "TupleFilesystemStoreBackend",
                },
            },
        },
        validation_operators={
            "custom_action_list_operator": {
                "class_name": "ActionListValidationOperator",
                "action_list": [
                    {
                        "name": "custom_store_validation_result",
                        "action": {"class_name": "CustomStoreValidationResultAction"},
                    },
                    {
                        "name": "store_evaluation_params",
                        "action": {"class_name": "StoreEvaluationParametersAction"},
                    },
                    {
                        "name": "update_data_docs",
                        "action": {"class_name": "UpdateDataDocsAction"},
                    },
                ],
            }
        },
        anonymous_usage_statistics={"enabled": True},
    )

    # Expected serialized form of the stores passed above (key order differs;
    # dict equality below is order-insensitive).
    desired_stores = {
        "custom_evaluation_parameter_store": {"class_name": "EvaluationParameterStore"},
        "expectations_S3_store": {
            "class_name": "ExpectationsStore",
            "store_backend": {
                "bucket": "REPLACE_ME",
                "class_name": "TupleS3StoreBackend",
                "prefix": "REPLACE_ME",
            },
        },
        "expectations_S3_store2": {
            "class_name": "ExpectationsStore",
            "store_backend": {
                "bucket": "REPLACE_ME",
                "class_name": "TupleS3StoreBackend",
                "prefix": "REPLACE_ME",
            },
        },
        "validations_S3_store": {
            "class_name": "ValidationsStore",
            "store_backend": {
                "bucket": "REPLACE_ME",
                "class_name": "TupleS3StoreBackend",
                "prefix": "REPLACE_ME",
            },
        },
        "validations_S3_store2": {
            "class_name": "ValidationsStore",
            "store_backend": {
                "bucket": "REPLACE_ME",
                "class_name": "TupleS3StoreBackend",
                "prefix": "REPLACE_ME",
            },
        },
    }

    # Expected serialized form of the data-docs sites passed above.
    desired_data_docs_sites_config = {
        "local_site": {
            "class_name": "SiteBuilder",
            "show_how_to_buttons": True,
            "site_index_builder": {
                "class_name": "DefaultSiteIndexBuilder",
                "show_cta_footer": True,
            },
            "store_backend": {
                "base_directory": "uncommitted/data_docs/local_site/",
                "class_name": "TupleFilesystemStoreBackend",
            },
        },
        "s3_site": {
            "class_name": "SiteBuilder",
            "site_index_builder": {
                "class_name": "DefaultSiteIndexBuilder",
                "show_cta_footer": True,
            },
            "store_backend": {
                "bucket": "REPLACE_ME",
                "class_name": "TupleS3StoreBackend",
            },
        },
    }

    # Expected validation operators — should round-trip unchanged.
    desired_validation_operators = {
        "custom_action_list_operator": {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "custom_store_validation_result",
                    "action": {"class_name": "CustomStoreValidationResultAction"},
                },
                {
                    "name": "store_evaluation_params",
                    "action": {"class_name": "StoreEvaluationParametersAction"},
                },
                {
                    "name": "update_data_docs",
                    "action": {"class_name": "UpdateDataDocsAction"},
                },
            ],
        }
    }

    # Assemble the full expected config via the fixture; note config_version
    # is expected to serialize as a float (999 -> 999.0).
    desired_config = construct_data_context_config(
        data_context_id=data_context_config.anonymous_usage_statistics.data_context_id,
        datasources={
            **default_pandas_datasource_config,
            **default_spark_datasource_config,
        },
        config_version=999.0,
        expectations_store_name="custom_expectations_store_name",
        validations_store_name="custom_validations_store_name",
        evaluation_parameter_store_name="custom_evaluation_parameter_store_name",
        stores=desired_stores,
        validation_operators=desired_validation_operators,
        data_docs_sites=desired_data_docs_sites_config,
        plugins_directory="custom_plugins_directory",
    )
    # The fixture does not take config_variables_file_path, so patch it in.
    desired_config["config_variables_file_path"] = "custom_config_variables_file_path"

    data_context_config_schema = DataContextConfigSchema()
    # Serialized config must match expectation exactly, and must also pass
    # DataContext's own validation.
    assert data_context_config_schema.dump(data_context_config) == desired_config
    assert DataContext.validate_config(project_config=data_context_config)
def test_DataContextConfig_with_DatabaseStoreBackendDefaults_using_all_parameters(
    construct_data_context_config, default_pandas_datasource_config
):
    """
    What does this test and why?
    Make sure that DatabaseStoreBackendDefaults parameters are handled
    appropriately.
    E.g. Make sure that default_credentials is ignored if individual store
    credentials are passed.
    """
    data_context_config = DataContextConfig(
        datasources={
            "my_pandas_datasource": DatasourceConfig(
                class_name="PandasDatasource",
                module_name="great_expectations.datasource",
                data_asset_type={
                    "module_name": "great_expectations.dataset",
                    "class_name": "PandasDataset",
                },
                batch_kwargs_generators={
                    "subdir_reader": {
                        "class_name": "SubdirReaderBatchKwargsGenerator",
                        "base_directory": "../data/",
                    }
                },
            )
        },
        store_backend_defaults=DatabaseStoreBackendDefaults(
            # Fallback credentials — expected to be overridden by the
            # per-store credentials below, so they should not appear in the
            # serialized config.
            default_credentials={
                "drivername": "postgresql",
                "host": "localhost",
                "port": "65432",
                "username": "******",
                "password": "******",
                "database": "ge_tutorials",
            },
            # Store-specific credentials take precedence over the defaults.
            expectations_store_credentials={
                "drivername": "custom_expectations_store_drivername",
                "host": "custom_expectations_store_host",
                "port": "custom_expectations_store_port",
                "username": "******",
                "password": "******",
                "database": "custom_expectations_store_database",
            },
            validations_store_credentials={
                "drivername": "custom_validations_store_drivername",
                "host": "custom_validations_store_host",
                "port": "custom_validations_store_port",
                "username": "******",
                "password": "******",
                "database": "custom_validations_store_database",
            },
            expectations_store_name="custom_expectations_database_store_name",
            validations_store_name="custom_validations_database_store_name",
            evaluation_parameter_store_name="custom_evaluation_parameter_store_name",
        ),
    )

    # Create desired config
    # Expected stores: the expectations/validations stores must carry the
    # store-specific credentials, not default_credentials.
    desired_stores_config = {
        "custom_evaluation_parameter_store_name": {
            "class_name": "EvaluationParameterStore"
        },
        "custom_expectations_database_store_name": {
            "class_name": "ExpectationsStore",
            "store_backend": {
                "class_name": "DatabaseStoreBackend",
                "credentials": {
                    "database": "custom_expectations_store_database",
                    "drivername": "custom_expectations_store_drivername",
                    "host": "custom_expectations_store_host",
                    "password": "******",
                    "port": "custom_expectations_store_port",
                    "username": "******",
                },
            },
        },
        "custom_validations_database_store_name": {
            "class_name": "ValidationsStore",
            "store_backend": {
                "class_name": "DatabaseStoreBackend",
                "credentials": {
                    "database": "custom_validations_store_database",
                    "drivername": "custom_validations_store_drivername",
                    "host": "custom_validations_store_host",
                    "password": "******",
                    "port": "custom_validations_store_port",
                    "username": "******",
                },
            },
        },
    }
    # Data docs keep the filesystem-backed local site default.
    desired_data_docs_sites_config = {
        "local_site": {
            "class_name": "SiteBuilder",
            "show_how_to_buttons": True,
            "site_index_builder": {
                "class_name": "DefaultSiteIndexBuilder",
                "show_cta_footer": True,
            },
            "store_backend": {
                "base_directory": "uncommitted/data_docs/local_site/",
                "class_name": "TupleFilesystemStoreBackend",
            },
        }
    }

    desired_config = construct_data_context_config(
        data_context_id=data_context_config.anonymous_usage_statistics.data_context_id,
        datasources=default_pandas_datasource_config,
        expectations_store_name="custom_expectations_database_store_name",
        validations_store_name="custom_validations_database_store_name",
        evaluation_parameter_store_name="custom_evaluation_parameter_store_name",
        stores=desired_stores_config,
        data_docs_sites=desired_data_docs_sites_config,
    )

    data_context_config_schema = DataContextConfigSchema()
    # Serialized config must match expectation exactly, and must also pass
    # DataContext's own validation.
    assert data_context_config_schema.dump(data_context_config) == desired_config
    assert DataContext.validate_config(project_config=data_context_config)