def test_golden_path_sql_datasource_configuration(
    sa, empty_data_context, test_connectable_postgresql_db
):
    """Tests the golden path for setting up a StreamlinedSQLDatasource using test_yaml_config.

    The test switches the working directory to the root of ``empty_data_context``,
    obtains a context through ``ge.get_context()``, validates a
    ``SimpleSqlalchemyDatasource`` YAML config with ``test_yaml_config`` and then
    requests batches and a validator from the datasource name used in that call.

    ``sa`` and ``test_connectable_postgresql_db`` are requested as fixtures but are
    never referenced in the body (presumably they guard on SQLAlchemy / Postgres
    availability -- confirm against the fixture definitions).
    """
    context = empty_data_context

    os.chdir(context.root_directory)

    # Everything below this line (except for asserts) is what we expect users to run as part of the golden path.
    import great_expectations as ge

    context = ge.get_context()

    db_hostname = os.getenv("GE_TEST_LOCAL_DB_HOSTNAME", "localhost")
    # The YAML must keep its block structure (one key per line, nested by
    # indentation); flattened onto one line it is not a valid mapping.
    yaml_config = f"""
class_name: SimpleSqlalchemyDatasource
credentials:
    drivername: postgresql
    username: postgres
    password: ""
    host: {db_hostname}
    port: 5432
    database: test_ci

introspection:
    whole_table_with_limits:
        sampling_method: _sample_using_limit
        sampling_kwargs:
            n: 10
"""
    report_object = context.test_yaml_config(
        name="my_datasource",
        yaml_config=yaml_config,
        return_mode="report_object",
    )
    print(json.dumps(report_object, indent=2))
    print(context.datasources)

    my_batch = context.get_batch(
        "my_datasource",
        "whole_table_with_limits",
        "test_df",
    )
    # assert len(my_batch.data.fetchall()) == 10

    # The call is expected to raise, so its result is deliberately not bound.
    with pytest.raises(KeyError):
        context.get_batch(
            "my_datasource",
            "whole_table_with_limits",
            "DOES_NOT_EXIST",
        )

    my_validator = context.get_validator(
        datasource_name="my_datasource",
        data_connector_name="whole_table_with_limits",
        data_asset_name="test_df",
        expectation_suite=ExpectationSuite("my_expectation_suite"),
    )
    # The result is only printed; no assertion is made on it.
    my_evr = my_validator.expect_table_columns_to_match_set(column_set=[])
    print(my_evr)
def test_ge():
    """Smoke-test building a Validator from a SQL query via a RuntimeBatchRequest.

    Reads the database connection string from the ``DB_URL`` environment
    variable, registers a ``Datasource`` that uses ``SqlAlchemyExecutionEngine``
    with a runtime and an inferred SQL data connector, creates the
    ``test_suite`` expectation suite and requests a validator for the query.
    """
    connection_string = os.environ.get("DB_URL")
    # Without this check a missing variable is passed on as ``None`` and only
    # surfaces later as an engine configuration error.
    assert connection_string, "The DB_URL environment variable must be set."

    context = ge.get_context()

    # The connection string is set once here; no later override is needed.
    datasource_config = {
        "name": "my_datasource",
        "class_name": "Datasource",
        "execution_engine": {
            "class_name": "SqlAlchemyExecutionEngine",
            "connection_string": connection_string,
            "create_temp_table": False,
        },
        "data_connectors": {
            "default_runtime_data_connector_name": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["default_identifier_name"],
            },
            "default_inferred_data_connector_name": {
                "class_name": "InferredAssetSqlDataConnector",
                "include_schema_name": True,
            },
        },
    }

    context.test_yaml_config(yaml.dump(datasource_config))
    context.add_datasource(**datasource_config)

    # First test for RuntimeBatchRequest using a query
    batch_request = RuntimeBatchRequest(
        datasource_name="my_datasource",
        data_connector_name="default_runtime_data_connector_name",
        data_asset_name="default_name",  # this can be anything that identifies this data
        runtime_parameters={"query": "SELECT TOP 10 * from dbo.taxi_data"},
        batch_identifiers={"default_identifier_name": "default_identifier"},
        batch_spec_passthrough={"create_temp_table": False},
    )

    context.create_expectation_suite(
        expectation_suite_name="test_suite", overwrite_existing=True
    )
    validator = context.get_validator(
        batch_request=batch_request, expectation_suite_name="test_suite"
    )
    print(validator.head())

    # NOTE: The following code is only for testing and can be ignored by users.
    assert isinstance(validator, ge.validator.validator.Validator)
def build_context(self):
    """
    Purpose:
        Build a Data Context from an in-code configuration and register the
        "example_datasource" Pandas runtime datasource on it.
    Returns:
        None. The context is stored on self.context; the datasource object
        returned by add_datasource is not kept.
    """
    # Datasource that is added to the context after it has been created.
    datasource_config = {
        "name": "example_datasource",
        "class_name": "Datasource",
        "module_name": "great_expectations.datasource",
        "execution_engine": {
            "module_name": "great_expectations.execution_engine",
            "class_name": "PandasExecutionEngine",
        },
        "data_connectors": {
            "default_runtime_data_connector_name": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["default_identifier_name"],
            },
        },
    }

    # Context configuration: one built-in "pandas" datasource, with the stores
    # rooted at <current working directory>/great_expectations.
    data_context_config = DataContextConfig(
        datasources={
            "pandas": DatasourceConfig(
                class_name="Datasource",
                execution_engine={"class_name": "PandasExecutionEngine"},
                data_connectors={
                    "default_runtime_data_connector_name": {
                        "class_name": "RuntimeDataConnector",
                        "batch_identifiers": ["default_identifier_name"],
                    }
                },
            )
        },
        store_backend_defaults=FilesystemStoreBackendDefaults(
            root_directory=os.path.join(os.getcwd(), "great_expectations")
        ),
    )

    # Build the context directly from the in-code configuration. No context
    # is loaded from disk first: it would be replaced here before any use.
    self.context = BaseDataContext(project_config=data_context_config)
    self.context.add_datasource(**datasource_config)
import os from typing import List from ruamel import yaml import great_expectations as ge from great_expectations.core.batch import Batch, BatchRequest CREDENTIAL = os.getenv("AZURE_ACCESS_KEY", "") context = ge.get_context() datasource_yaml = rf""" name: my_azure_datasource class_name: Datasource execution_engine: class_name: SparkDFExecutionEngine azure_options: account_url: <YOUR_ACCOUNT_URL> # or `conn_str` credential: <YOUR_CREDENTIAL> # if using a protected container data_connectors: default_runtime_data_connector_name: class_name: RuntimeDataConnector batch_identifiers: - default_identifier_name default_inferred_data_connector_name: class_name: InferredAssetAzureDataConnector azure_options: account_url: <YOUR_ACCOUNT_URL> # or `conn_str` credential: <YOUR_CREDENTIAL> # if using a protected container container: <YOUR_AZURE_CONTAINER_HERE>
def test_golden_path_configured_asset_pandas_datasource_configuration(
    empty_data_context, test_df, tmp_path_factory
):
    """
    Tests the golden path for ConfiguredAssetFilesystemDataConnector with PandasExecutionEngine using test_yaml_config

    Files holding the CSV form of ``test_df`` are written under a temporary
    directory, a datasource with assets A-D is validated through
    ``test_yaml_config``, and batches / a validator are then requested with
    hash-based sampling on the ``date`` column.
    """
    base_directory = str(
        tmp_path_factory.mktemp("test_golden_path_pandas_datasource_configuration")
    )

    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_foxtrot/A/A-1.csv",
            "test_dir_foxtrot/A/A-2.csv",
            "test_dir_foxtrot/A/A-3.csv",
            "test_dir_foxtrot/B/B-1.txt",
            "test_dir_foxtrot/B/B-2.txt",
            "test_dir_foxtrot/B/B-3.txt",
            "test_dir_foxtrot/C/C-2017.csv",
            "test_dir_foxtrot/C/C-2018.csv",
            "test_dir_foxtrot/C/C-2019.csv",
            "test_dir_foxtrot/D/D-aaa.csv",
            "test_dir_foxtrot/D/D-bbb.csv",
            "test_dir_foxtrot/D/D-ccc.csv",
            "test_dir_foxtrot/D/D-ddd.csv",
            "test_dir_foxtrot/D/D-eee.csv",
        ],
        file_content_fn=lambda: test_df.to_csv(header=True, index=False),
    )

    context = empty_data_context

    os.chdir(context.root_directory)
    import great_expectations as ge

    context = ge.get_context()

    # The YAML must keep its block structure; in particular the embedded
    # "# glob_directive" comment has to sit on a line of its own, otherwise it
    # comments out everything that follows it.
    yaml_config = f"""
class_name: Datasource

execution_engine:
    class_name: PandasExecutionEngine

data_connectors:
    my_filesystem_data_connector:
        class_name: ConfiguredAssetFilesystemDataConnector
        base_directory: {base_directory}
        # glob_directive: "*"

        default_regex:
            pattern: (.+)\\.csv
            group_names:
                - alphanumeric

        assets:
            A:
                base_directory: {base_directory}/test_dir_foxtrot/A
                pattern: (.+)-(\\d+)\\.csv
                group_names:
                    - letter
                    - number
            B:
                base_directory: {base_directory}/test_dir_foxtrot/B
                pattern: (.+)-(\\d+)\\.csv
                group_names:
                    - letter
                    - number
            C:
                base_directory: {base_directory}/test_dir_foxtrot/C
                pattern: (.+)-(\\d+)\\.csv
                group_names:
                    - letter
                    - year
            D:
                base_directory: {base_directory}/test_dir_foxtrot/D
                pattern: (.+)-(\\d+)\\.csv
                group_names:
                    - letter
                    - checksum
"""

    # The report itself is not inspected. The batch requests below refer to
    # the datasource by the name given here (presumably this call is what makes
    # it available on the context -- no add_datasource call is made).
    context.test_yaml_config(
        name="my_directory_datasource",
        yaml_config=yaml_config,
        return_mode="report_object",
    )

    my_batch = context.get_batch(
        datasource_name="my_directory_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="A",
        batch_identifiers={
            "number": "2",
        },
        batch_spec_passthrough={
            "sampling_method": "_sample_using_hash",
            "sampling_kwargs": {
                "column_name": "date",
                "hash_function_name": "md5",
                "hash_value": "f",
            },
        },
    )
    assert my_batch.batch_definition["data_asset_name"] == "A"

    # Smoke check only: the returned preview is not inspected.
    my_batch.head()

    df_data = my_batch.data.dataframe
    assert df_data.shape == (10, 10)
    # The CSV round trip yields date strings; convert them back to
    # datetime.date so the frame can be compared with test_df.
    df_data["date"] = df_data["date"].apply(
        lambda value: datetime.datetime.strptime(value, "%Y-%m-%d").date()
    )
    expected_rows = test_df[
        (test_df["date"] == datetime.date(2020, 1, 15))
        | (test_df["date"] == datetime.date(2020, 1, 29))
    ]
    assert expected_rows.drop("timestamp", axis=1).equals(
        df_data.drop("timestamp", axis=1)
    )

    # The call is expected to raise, so its result is deliberately not bound.
    with pytest.raises(ValueError):
        context.get_batch(
            datasource_name="my_directory_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="DOES_NOT_EXIST",
        )

    my_validator = context.get_validator(
        datasource_name="my_directory_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="C",
        data_connector_query={"batch_filter_parameters": {"year": "2019"}},
        create_expectation_suite_with_name="my_expectations",
        batch_spec_passthrough={
            "sampling_method": "_sample_using_hash",
            "sampling_kwargs": {
                "column_name": "date",
                "hash_function_name": "md5",
                "hash_value": "f",
            },
        },
    )
    my_evr = my_validator.expect_column_values_to_be_between(
        column="d", min_value=1, max_value=31
    )
    assert my_evr.success
taxi_test_data: TaxiTestData = TaxiTestData( test_df, test_column_name="pickup_datetime") taxi_splitting_test_cases: TaxiSplittingTestCases = TaxiSplittingTestCases( taxi_test_data) test_cases: List[ TaxiSplittingTestCase] = taxi_splitting_test_cases.test_cases() for test_case in test_cases: print("Testing splitter method:", test_case.splitter_method_name) # 1. Setup context: DataContext = ge.get_context() datasource_name: str = "test_datasource" data_connector_name: str = "test_data_connector" data_asset_name: str = table_name # Read from generated table name column_name: str = taxi_splitting_test_cases.test_column_name # 2. Set splitter in DataConnector config data_connector_config: dict = { "class_name": "ConfiguredAssetSqlDataConnector", "assets": { data_asset_name: { "splitter_method": test_case.splitter_method_name, "splitter_kwargs": test_case.splitter_kwargs, } },
tests and will be updated. These statements can be ignored by users. Comments with the tags `<snippet>` and `</snippet>` are used to ensure that if this script is updated the snippets that are specified for use in documentation are maintained. These comments can be ignored by users. --documentation-- https://docs.greatexpectations.io/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant """ import great_expectations as ge from great_expectations.checkpoint import SimpleCheckpoint from great_expectations.core.batch import BatchRequest from great_expectations.core.yaml_handler import YAMLHandler yaml = YAMLHandler() context: ge.DataContext = ge.get_context() # Configure your datasource (if you aren't using one that already exists) # <snippet> datasource_config = { "name": "taxi_multi_batch_datasource", "class_name": "Datasource", "module_name": "great_expectations.datasource", "execution_engine": { "module_name": "great_expectations.execution_engine", "class_name": "PandasExecutionEngine", }, "data_connectors": { "inferred_data_connector_all_years": { "class_name": "InferredAssetFilesystemDataConnector",
def test_golden_path_inferred_asset_pandas_datasource_configuration(
    mock_emit, empty_data_context_stats_enabled, test_df, tmp_path_factory
):
    """
    Tests the golden path for InferredAssetFilesystemDataConnector with PandasExecutionEngine using test_yaml_config

    Besides exercising batch and validator retrieval, the test checks that
    ``test_yaml_config`` results in exactly one ``mock_emit`` call carrying the
    expected ``data_context.test_yaml_config`` event, and that no further
    calls are recorded by the end of the test.
    """
    base_directory = str(
        tmp_path_factory.mktemp("test_golden_path_pandas_datasource_configuration")
    )

    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_charlie/A/A-1.csv",
            "test_dir_charlie/A/A-2.csv",
            "test_dir_charlie/A/A-3.csv",
            "test_dir_charlie/B/B-1.csv",
            "test_dir_charlie/B/B-2.csv",
            "test_dir_charlie/B/B-3.csv",
            "test_dir_charlie/C/C-1.csv",
            "test_dir_charlie/C/C-2.csv",
            "test_dir_charlie/C/C-3.csv",
            "test_dir_charlie/D/D-1.csv",
            "test_dir_charlie/D/D-2.csv",
            "test_dir_charlie/D/D-3.csv",
        ],
        file_content_fn=lambda: test_df.to_csv(header=True, index=False),
    )

    context: DataContext = empty_data_context_stats_enabled

    os.chdir(context.root_directory)
    import great_expectations as ge

    context = ge.get_context()
    mock_emit.reset_mock()  # Remove data_context.__init__ call

    # The YAML must keep its block structure (one key per line, nested by
    # indentation); flattened onto one line it is not a valid mapping.
    yaml_config = f"""
class_name: Datasource

execution_engine:
    class_name: PandasExecutionEngine

data_connectors:
    my_filesystem_data_connector:
        class_name: InferredAssetFilesystemDataConnector
        base_directory: {base_directory}/test_dir_charlie
        glob_directive: "*/*.csv"

        default_regex:
            pattern: (.+)/(.+)-(\\d+)\\.csv
            group_names:
                - subdirectory
                - data_asset_name
                - number
"""

    # The report itself is not inspected; only the emitted event is checked.
    context.test_yaml_config(
        name="my_directory_datasource",
        yaml_config=yaml_config,
        return_mode="report_object",
    )

    assert mock_emit.call_count == 1
    # Substitute anonymized names since it changes for each run
    event_payload = mock_emit.call_args_list[0][0][0]["event_payload"]
    anonymized_datasource_name = event_payload["anonymized_name"]
    anonymized_execution_engine_name = event_payload["anonymized_execution_engine"][
        "anonymized_name"
    ]
    anonymized_data_connector_name = event_payload["anonymized_data_connectors"][0][
        "anonymized_name"
    ]
    expected_call_args_list = [
        mock.call(
            {
                "event": "data_context.test_yaml_config",
                "event_payload": {
                    "anonymized_name": anonymized_datasource_name,
                    "parent_class": "Datasource",
                    "anonymized_execution_engine": {
                        "anonymized_name": anonymized_execution_engine_name,
                        "parent_class": "PandasExecutionEngine",
                    },
                    "anonymized_data_connectors": [
                        {
                            "anonymized_name": anonymized_data_connector_name,
                            "parent_class": "InferredAssetFilesystemDataConnector",
                        }
                    ],
                },
                "success": True,
            }
        ),
    ]
    assert mock_emit.call_args_list == expected_call_args_list

    my_batch = context.get_batch(
        datasource_name="my_directory_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="A",
        batch_identifiers={
            "number": "2",
        },
        batch_spec_passthrough={
            "sampling_method": "_sample_using_hash",
            "sampling_kwargs": {
                "column_name": "date",
                "hash_function_name": "md5",
                "hash_value": "f",
            },
        },
    )
    assert my_batch.batch_definition["data_asset_name"] == "A"

    df_data = my_batch.data.dataframe
    assert df_data.shape == (10, 10)
    # The CSV round trip yields date strings; convert them back to
    # datetime.date so the frame can be compared with test_df.
    df_data["date"] = df_data["date"].apply(
        lambda value: datetime.datetime.strptime(value, "%Y-%m-%d").date()
    )
    expected_rows = test_df[
        (test_df["date"] == datetime.date(2020, 1, 15))
        | (test_df["date"] == datetime.date(2020, 1, 29))
    ]
    assert expected_rows.drop("timestamp", axis=1).equals(
        df_data.drop("timestamp", axis=1)
    )

    # The call is expected to raise, so its result is deliberately not bound.
    with pytest.raises(ValueError):
        context.get_batch(
            datasource_name="my_directory_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="DOES_NOT_EXIST",
        )

    my_validator = context.get_validator(
        datasource_name="my_directory_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="D",
        data_connector_query={"batch_filter_parameters": {"number": "3"}},
        expectation_suite=ExpectationSuite("my_expectation_suite"),
        batch_spec_passthrough={
            "sampling_method": "_sample_using_hash",
            "sampling_kwargs": {
                "column_name": "date",
                "hash_function_name": "md5",
                "hash_value": "f",
            },
        },
    )
    my_evr = my_validator.expect_column_values_to_be_between(
        column="d", min_value=1, max_value=31
    )
    assert my_evr.success

    # TODO: <Alex>ALEX</Alex>
    # my_evr = my_validator.expect_table_columns_to_match_ordered_list(ordered_list=["x", "y", "z"])
    # assert my_evr.success

    # No other usage stats calls detected
    assert mock_emit.call_count == 1
def test_golden_path_sql_datasource_configuration(
    mock_emit, empty_data_context_stats_enabled, sa, test_connectable_postgresql_db
):
    """Tests the golden path for setting up a StreamlinedSQLDatasource using test_yaml_config.

    In addition to requesting batches and a validator, the test checks that
    ``mock_emit`` has recorded exactly two calls after ``test_yaml_config``:
    a ``data_context.__init__`` event followed by a
    ``data_context.test_yaml_config`` event.

    ``sa`` and ``test_connectable_postgresql_db`` are requested as fixtures but are
    never referenced in the body (presumably they guard on SQLAlchemy / Postgres
    availability -- confirm against the fixture definitions).
    """
    context: DataContext = empty_data_context_stats_enabled

    os.chdir(context.root_directory)

    # Everything below this line (except for asserts) is what we expect users to run as part of the golden path.
    import great_expectations as ge

    context = ge.get_context()

    db_hostname = os.getenv("GE_TEST_LOCAL_DB_HOSTNAME", "localhost")
    # The YAML must keep its block structure (one key per line, nested by
    # indentation); flattened onto one line it is not a valid mapping.
    yaml_config = f"""
class_name: SimpleSqlalchemyDatasource
credentials:
    drivername: postgresql
    username: postgres
    password: ""
    host: {db_hostname}
    port: 5432
    database: test_ci

introspection:
    whole_table_with_limits:
        sampling_method: _sample_using_limit
        sampling_kwargs:
            n: 10
"""
    report_object = context.test_yaml_config(
        name="my_datasource",
        yaml_config=yaml_config,
        return_mode="report_object",
    )

    assert mock_emit.call_count == 2
    # Substitute anonymized names since it changes for each run
    event_payload = mock_emit.call_args_list[1][0][0]["event_payload"]
    anonymized_datasource_name = event_payload["anonymized_name"]
    anonymized_data_connector_name = event_payload["anonymized_data_connectors"][0][
        "anonymized_name"
    ]
    expected_call_args_list = [
        mock.call(
            {"event_payload": {}, "event": "data_context.__init__", "success": True}
        ),
        mock.call(
            {
                "event": "data_context.test_yaml_config",
                "event_payload": {
                    "anonymized_name": anonymized_datasource_name,
                    "parent_class": "SimpleSqlalchemyDatasource",
                    "anonymized_execution_engine": {
                        "parent_class": "SqlAlchemyExecutionEngine"
                    },
                    "anonymized_data_connectors": [
                        {
                            "anonymized_name": anonymized_data_connector_name,
                            "parent_class": "InferredAssetSqlDataConnector",
                        }
                    ],
                },
                "success": True,
            }
        ),
    ]
    assert mock_emit.call_args_list == expected_call_args_list

    print(json.dumps(report_object, indent=2))
    print(context.datasources)

    my_batch = context.get_batch(
        "my_datasource",
        "whole_table_with_limits",
        "test_df",
    )
    # assert len(my_batch.data.fetchall()) == 10

    # The call is expected to raise, so its result is deliberately not bound.
    with pytest.raises(KeyError):
        context.get_batch(
            "my_datasource",
            "whole_table_with_limits",
            "DOES_NOT_EXIST",
        )

    my_validator = context.get_validator(
        datasource_name="my_datasource",
        data_connector_name="whole_table_with_limits",
        data_asset_name="test_df",
        expectation_suite=ExpectationSuite("my_expectation_suite"),
    )
    # The result is only printed; no assertion is made on it.
    my_evr = my_validator.expect_table_columns_to_match_set(column_set=[])
    print(my_evr)