def test__get_full_file_path_for_asset_spark(basic_spark_df_execution_engine, fs):
    """
    What does this test and why?
    File paths in DBFS need to use the `dbfs:/` protocol base instead of `/dbfs/` when being read using the
    `spark.read` method in the ExecutionEngine. In the data connector config however, the `/dbfs` version must
    be used. This test verifies that a config using a `/dbfs/` path is translated to `dbfs:/` when preparing the
    PathBatchSpec for the SparkDFExecutionEngine.
    """

    base_directory: str = "/dbfs/great_expectations"
    base_directory_colon: str = "dbfs:/great_expectations"
    fs.create_dir(base_directory)

    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_0/A/B/C/logfile_0.csv",
            "test_dir_0/A/B/C/bigfile_1.csv",
            "test_dir_0/A/filename2.csv",
            "test_dir_0/A/filename3.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
    module_name: great_expectations.datasource.data_connector
    class_name: ConfiguredAssetDBFSDataConnector
    datasource_name: BASE
    base_directory: {base_directory}/test_dir_0/A
    glob_directive: "*"
    default_regex:
        pattern: (.+)\\.csv
        group_names:
            - name
    assets:
        A:
            base_directory: B/C
            glob_directive: "log*.csv"
            pattern: (.+)_(\\d+)\\.csv
            group_names:
                - name
                - number
    """,
    )

    my_data_connector: ConfiguredAssetDBFSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "my_configured_asset_filesystem_data_connector",
            "execution_engine": basic_spark_df_execution_engine,
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )
    my_data_connector.data_context_root_directory = base_directory

    assert (
        my_data_connector._get_full_file_path_for_asset(
            path="bigfile_1.csv", asset=my_data_connector.assets["A"]
        )
        == f"{base_directory_colon}/test_dir_0/A/B/C/bigfile_1.csv"
    )

    self_check_report = my_data_connector.self_check()

    assert self_check_report == {
        "class_name": "ConfiguredAssetDBFSDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["A"],
        "data_assets": {
            "A": {
                "batch_definition_count": 1,
                "example_data_references": ["logfile_0.csv"],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="my_configured_asset_filesystem_data_connector",
        data_asset_name="A",
        data_connector_query=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1

    my_batch_definition = my_batch_definition_list[0]
    batch_spec: BatchSpec = my_data_connector.build_batch_spec(
        batch_definition=my_batch_definition
    )
    assert isinstance(batch_spec, PathBatchSpec)
    assert batch_spec.path == "dbfs:/great_expectations/test_dir_0/A/B/C/logfile_0.csv"
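# Sketch only: the translation rule the test above pins down, shown in isolation.
# `to_spark_dbfs_path` is a hypothetical helper for illustration, not a
# Great Expectations API; the real logic lives inside ConfiguredAssetDBFSDataConnector.
def to_spark_dbfs_path(path: str) -> str:
    # `spark.read` expects the `dbfs:/` scheme rather than the `/dbfs/` FUSE mount.
    if path.startswith("/dbfs/"):
        return "dbfs:/" + path[len("/dbfs/"):]
    return path


assert (
    to_spark_dbfs_path("/dbfs/great_expectations/a.csv")
    == "dbfs:/great_expectations/a.csv"
)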
"account_url"] = "superconductivetesting.blob.core.windows.net" datasource_config["data_connectors"]["configured_data_connector_name"][ "azure_options"]["credential"] = CREDENTIAL datasource_config["data_connectors"]["configured_data_connector_name"][ "container"] = "superconductive-public" datasource_config["data_connectors"]["configured_data_connector_name"][ "name_starts_with"] = "data/taxi_yellow_tripdata_samples/" context.test_yaml_config(yaml.dump(datasource_config)) context.add_datasource(**datasource_config) # Here is a BatchRequest naming a data_asset batch_request = BatchRequest( datasource_name="my_azure_datasource", data_connector_name="configured_data_connector_name", data_asset_name="<YOUR_DATA_ASSET_NAME>", ) # Please note this override is only to provide good UX for docs and tests. # In normal usage you'd set your data asset name directly in the BatchRequest above. batch_request.data_asset_name = "taxi_data" context.create_expectation_suite(expectation_suite_name="test_suite", overwrite_existing=True) validator = context.get_validator(batch_request=batch_request, expectation_suite_name="test_suite") print(validator.head()) # NOTE: The following code is only for testing and can be ignored by users. assert isinstance(validator, ge.validator.validator.Validator)
def test_get_batch(data_context_with_sql_datasource_for_testing_get_batch):
    context = data_context_with_sql_datasource_for_testing_get_batch
    print(
        json.dumps(
            context.datasources["my_sqlite_db"].get_available_data_asset_names(),
            indent=4,
        )
    )

    # Successful specification using a typed BatchRequest
    context.get_batch(
        batch_request=BatchRequest(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            partition_request=PartitionRequest(
                partition_identifiers={"date": "2020-01-15"}
            ),
        )
    )

    # Failed specification using an untyped BatchRequest
    with pytest.raises(TypeError):
        context.get_batch(
            batch_request={
                "datasource_name": "my_sqlite_db",
                "data_connector_name": "daily",
                "data_asset_name": "table_partitioned_by_date_column__A",
                "partition_request": {
                    "partition_identifiers": {"date": "2020-01-15"}
                },
            }
        )

    # Failed specification using an incomplete BatchRequest
    with pytest.raises(ValueError):
        context.get_batch(
            batch_request=BatchRequest(
                datasource_name="my_sqlite_db",
                data_connector_name="daily",
                data_asset_name="table_partitioned_by_date_column__A",
                partition_request=PartitionRequest(partition_identifiers={}),
            )
        )

    # Failed specification using an incomplete BatchRequest
    with pytest.raises(ValueError):
        context.get_batch(
            batch_request=BatchRequest(
                datasource_name="my_sqlite_db",
                data_connector_name="daily",
                data_asset_name="table_partitioned_by_date_column__A",
            )
        )

    # Failed specification using an incomplete BatchRequest
    with pytest.raises(TypeError):
        context.get_batch(
            batch_request=BatchRequest(
                datasource_name="my_sqlite_db",
                data_connector_name="daily",
            )
        )

    # Failed specification using an incomplete BatchRequest
    # with pytest.raises(ValueError):
    with pytest.raises(TypeError):
        context.get_batch(
            batch_request=BatchRequest(
                # datasource_name=MISSING
                data_connector_name="daily",
                data_asset_name="table_partitioned_by_date_column__A",
                partition_request=PartitionRequest(partition_identifiers={}),
            )
        )

    # Successful specification using parameters
    context.get_batch(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
    )

    # Successful specification using parameters without parameter names for the identifying triple
    # This is the thinnest this can plausibly get.
    context.get_batch(
        "my_sqlite_db",
        "daily",
        "table_partitioned_by_date_column__A",
        date="2020-01-15",
    )

    # Successful specification using parameters without parameter names for the identifying triple
    # In the case of a data_asset containing a single Batch, we don't even need parameters
    context.get_batch(
        "my_sqlite_db",
        "whole_table",
        "table_partitioned_by_date_column__A",
    )

    # Successful specification using parameters and partition_request
    context.get_batch(
        "my_sqlite_db",
        "daily",
        "table_partitioned_by_date_column__A",
        partition_request=PartitionRequest(
            {"partition_identifiers": {"date": "2020-01-15"}}
        ),
    )

    # Successful specification using parameters and partition_identifiers
    context.get_batch(
        "my_sqlite_db",
        "daily",
        "table_partitioned_by_date_column__A",
        partition_identifiers={"date": "2020-01-15"},
    )
"<CONTAINER_PATH_TO_DATA>", "data/taxi_yellow_tripdata_samples/") datasource_yaml = datasource_yaml.replace( "<YOUR_ACCOUNT_URL>", "superconductivetesting.blob.core.windows.net") datasource_yaml = datasource_yaml.replace("<YOUR_CREDENTIAL>", CREDENTIAL) context.test_yaml_config(datasource_yaml) context.add_datasource(**yaml.load(datasource_yaml)) # Here is a BatchRequest naming a data_asset batch_request = BatchRequest( datasource_name="my_azure_datasource", data_connector_name="configured_data_connector_name", data_asset_name="<YOUR_DATA_ASSET_NAME>", batch_spec_passthrough={ "reader_method": "csv", "reader_options": { "header": True } }, ) # Please note this override is only to provide good UX for docs and tests. # In normal usage you'd set your data asset name directly in the BatchRequest above. batch_request.data_asset_name = "taxi_data" context.create_expectation_suite(expectation_suite_name="test_suite", overwrite_existing=True) validator = context.get_validator(batch_request=batch_request, expectation_suite_name="test_suite") print(validator.head())
def test_basic_datasource_runtime_data_connector_error_checking(
    basic_datasource_with_runtime_data_connector,
):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # Test for an unknown datasource
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch
        ] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="non_existent_datasource",
                data_connector_name="test_runtime_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch
        ] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name=basic_datasource_with_runtime_data_connector.name,
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # Test for illegal absence of partition_request when batch_data is specified
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch
        ] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name=basic_datasource_with_runtime_data_connector.name,
                data_connector_name="test_runtime_data_connector",
                data_asset_name="my_data_asset",
                batch_data=test_df,
                partition_request=None,
            )
        )

    # Test for illegal nullity of partition_request["partition_identifiers"] when batch_data is specified
    partition_request: dict = {"partition_identifiers": None}
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch
        ] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name=basic_datasource_with_runtime_data_connector.name,
                data_connector_name="test_runtime_data_connector",
                data_asset_name="my_data_asset",
                batch_data=test_df,
                partition_request=partition_request,
            )
        )

    # Test for illegal falsiness of partition_request["partition_identifiers"] when batch_data is specified
    partition_request: dict = {"partition_identifiers": {}}
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch
        ] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name=basic_datasource_with_runtime_data_connector.name,
                data_connector_name="test_runtime_data_connector",
                data_asset_name="my_data_asset",
                batch_data=test_df,
                partition_request=partition_request,
            )
        )
def test_alpha(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_alpha"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_alpha/A.csv",
            "test_dir_alpha/B.csv",
            "test_dir_alpha/C.csv",
            "test_dir_alpha/D.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
    module_name: great_expectations.datasource.data_connector
    class_name: ConfiguredAssetFilesystemDataConnector
    base_directory: {base_directory}/test_dir_alpha
    assets:
        A:
            glob_directive: "*.csv"
    default_regex:
        pattern: (.+)\\.csv
        group_names:
            - part_1
    """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()
    print(json.dumps(self_check_report, indent=2))

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert set(list(self_check_report["data_assets"].keys())) == {"A"}
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # Try to fetch a batch from a nonexistent asset
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="B",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        partition_request=PartitionRequest(
            **{"partition_identifiers": {"part_1": "B"}}
        ),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
def test_relative_default_and_relative_asset_base_directory_paths(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_relative_default_and_relative_asset_base_directory_paths"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_0/A/B/C/logfile_0.csv",
            "test_dir_0/A/B/C/bigfile_1.csv",
            "test_dir_0/A/filename2.csv",
            "test_dir_0/A/filename3.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
    module_name: great_expectations.datasource.data_connector
    class_name: ConfiguredAssetFilesystemDataConnector
    base_directory: test_dir_0/A
    glob_directive: "*"
    default_regex:
        pattern: (.+)\\.csv
        group_names:
            - name
    assets:
        A:
            base_directory: B/C
            glob_directive: "log*.csv"
            pattern: (.+)_(\\d+)\\.csv
            group_names:
                - name
                - number
    """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "my_configured_asset_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )
    my_data_connector.data_context_root_directory = base_directory

    assert my_data_connector.base_directory == f"{base_directory}/test_dir_0/A"
    assert (
        my_data_connector._get_full_file_path_for_asset(
            path="bigfile_1.csv", asset=my_data_connector.assets["A"]
        )
        == f"{base_directory}/test_dir_0/A/B/C/bigfile_1.csv"
    )

    self_check_report = my_data_connector.self_check()
    assert self_check_report == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["A"],
        "data_assets": {
            "A": {
                "batch_definition_count": 1,
                "example_data_references": ["logfile_0.csv"],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        "example_data_reference": {},
    }

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="my_configured_asset_filesystem_data_connector",
        data_asset_name="A",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
            - default_identifier_name
    default_inferred_data_connector_name:
        class_name: InferredAssetFilesystemDataConnector
        base_directory: ../data/
        default_regex:
            group_names:
                - data_asset_name
            pattern: (.*)
"""
context.test_yaml_config(datasource_yaml)
context.add_datasource(**yaml.load(datasource_yaml))

# Get Validator by creating ExpectationSuite and passing in BatchRequest
batch_request = BatchRequest(
    datasource_name="data__dir",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="yellow_trip_data_sample_2019-01.csv",
    limit=1000,
)
context.create_expectation_suite(expectation_suite_name="taxi.demo")
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="taxi.demo",
)

# NOTE: The following assertion is only for testing and can be ignored by users.
assert isinstance(validator, Validator)

# Profile the data with the UserConfigurableProfiler and save resulting ExpectationSuite
ignored_columns = [
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
def test_more_complex_instantiation_of_InferredAssetSqlDataConnector(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    my_data_connector = instantiate_class_from_config(
        config={
            "class_name": "InferredAssetSqlDataConnector",
            "name": "whole_table",
            "data_asset_name_suffix": "__whole",
            "include_schema_name": True,
        },
        runtime_environment={
            "execution_engine": test_cases_for_sql_data_connector_sqlite_execution_engine,
            "datasource_name": "my_test_datasource",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    report_object = my_data_connector.self_check()
    assert report_object == {
        "class_name": "InferredAssetSqlDataConnector",
        "data_asset_count": 21,
        "data_assets": {
            "main.table_containing_id_spacers_for_D__whole": {
                "batch_definition_count": 1,
                "example_data_references": [{}],
            },
            "main.table_full__I__whole": {
                "batch_definition_count": 1,
                "example_data_references": [{}],
            },
            "main.table_partitioned_by_date_column__A__whole": {
                "batch_definition_count": 1,
                "example_data_references": [{}],
            },
        },
        "example_data_asset_names": [
            "main.table_containing_id_spacers_for_D__whole",
            "main.table_full__I__whole",
            "main.table_partitioned_by_date_column__A__whole",
        ],
        "example_data_reference": {
            "batch_spec": {
                "partition_definition": {},
                "table_name": "main.table_containing_id_spacers_for_D",
            },
            "n_rows": 30,
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
    }

    assert my_data_connector.get_available_data_asset_names() == [
        "main.table_containing_id_spacers_for_D__whole",
        "main.table_full__I__whole",
        "main.table_partitioned_by_date_column__A__whole",
        "main.table_partitioned_by_foreign_key__F__whole",
        "main.table_partitioned_by_incrementing_batch_id__E__whole",
        "main.table_partitioned_by_irregularly_spaced_incrementing_id_with_spacing_in_a_second_table__D__whole",
        "main.table_partitioned_by_multiple_columns__G__whole",
        "main.table_partitioned_by_regularly_spaced_incrementing_id_column__C__whole",
        "main.table_partitioned_by_timestamp_column__B__whole",
        "main.table_that_should_be_partitioned_by_random_hash__H__whole",
        "main.table_with_fk_reference_from_F__whole",
        "main.view_by_date_column__A__whole",
        "main.view_by_incrementing_batch_id__E__whole",
        "main.view_by_irregularly_spaced_incrementing_id_with_spacing_in_a_second_table__D__whole",
        "main.view_by_multiple_columns__G__whole",
        "main.view_by_regularly_spaced_incrementing_id_column__C__whole",
        "main.view_by_timestamp_column__B__whole",
        "main.view_containing_id_spacers_for_D__whole",
        "main.view_partitioned_by_foreign_key__F__whole",
        "main.view_that_should_be_partitioned_by_random_hash__H__whole",
        "main.view_with_fk_reference_from_F__whole",
    ]

    batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="my_test_datasource",
                data_connector_name="whole_table",
                data_asset_name="main.table_that_should_be_partitioned_by_random_hash__H__whole",
            )
        )
    )
    assert len(batch_definition_list) == 1
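# Sketch only: an inference from the config above (not library code) of how the
# inferred asset names appear to be composed when `include_schema_name: True`
# and `data_asset_name_suffix: "__whole"` are set.
def inferred_asset_name(schema: str, table: str, suffix: str = "__whole") -> str:
    return f"{schema}.{table}{suffix}"


assert inferred_asset_name("main", "table_full__I") == "main.table_full__I__whole"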
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)

# Here is a BatchRequest naming a data_asset
batch_request = BatchRequest(
    datasource_name="my_gcs_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
    batch_spec_passthrough={
        "reader_method": "csv",
        "reader_options": {"header": True},
    },
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = (
    "data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_2019-01"
)

context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
    },
    batch_identifiers={"default_identifier_name": "something_something"},
)
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)

# Here is a BatchRequest naming a table
batch_request = BatchRequest(
    datasource_name="my_sqlite_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="yellow_tripdata_sample_2019_01",  # this is the name of the table you want to retrieve
)
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)
assert [ds["name"] for ds in context.list_datasources()] == ["my_sqlite_datasource"]
assert "yellow_tripdata_sample_2019_01" in set(
    context.get_available_data_asset_names()["my_sqlite_datasource"][
        "default_inferred_data_connector_name"
    ]
)
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)

# Second test for BatchRequest naming a table
batch_request = BatchRequest(
    datasource_name="my_redshift_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="taxi_data",  # this is the name of the table you want to retrieve
)
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)
assert [ds["name"] for ds in context.list_datasources()] == ["my_redshift_datasource"]
assert "taxi_data" in set(
    context.get_available_data_asset_names()["my_redshift_datasource"][
        "default_inferred_data_connector_name"
    ]
)
batch_identifiers={"default_identifier_name": "default_identifier"}, ) context.create_expectation_suite(expectation_suite_name="test_suite", overwrite_existing=True) validator = context.get_validator(batch_request=batch_request, expectation_suite_name="test_suite") print(validator.head()) # NOTE: The following code is only for testing and can be ignored by users. assert isinstance(validator, ge.validator.validator.Validator) # Second test for BatchRequest naming a table batch_request = BatchRequest( datasource_name="my_snowflake_datasource", data_connector_name="default_inferred_data_connector_name", data_asset_name= f"{sfSchema.lower()}.taxi_data", # this is the name of the table you want to retrieve ) context.create_expectation_suite(expectation_suite_name="test_suite", overwrite_existing=True) validator = context.get_validator(batch_request=batch_request, expectation_suite_name="test_suite") print(validator.head()) # NOTE: The following code is only for testing and can be ignored by users. assert isinstance(validator, ge.validator.validator.Validator) assert [ds["name"] for ds in context.list_datasources()] == ["my_snowflake_datasource"] assert f"{sfSchema.lower()}.taxi_data" in set( context.get_available_data_asset_names()["my_snowflake_datasource"] ["default_inferred_data_connector_name"])
def test__get_full_file_path_for_asset_pandas(fs):
    """
    What does this test and why?
    File paths in DBFS need to use the `dbfs:/` protocol base instead of `/dbfs/` when being read using the
    `spark.read` method in the ExecutionEngine. HOWEVER, when using a PandasExecutionEngine, the file-semantic
    `/dbfs/` version must be used instead. This test verifies that a config using a `/dbfs/` path is NOT
    translated to `dbfs:/` when preparing the PathBatchSpec for the PandasExecutionEngine.
    """

    # Copy boto modules into the fake filesystem
    # (see https://github.com/spulec/moto/issues/1682#issuecomment-645016188)
    for module in [boto3, botocore]:
        module_dir = pathlib.Path(module.__file__).parent
        fs.add_real_directory(module_dir, lazy_read=False)

    # Copy google credentials into the fake filesystem if they exist on your filesystem
    google_cred_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if google_cred_file:
        fs.add_real_file(google_cred_file)

    base_directory: str = "/dbfs/great_expectations"
    fs.create_dir(base_directory)

    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_0/A/B/C/logfile_0.csv",
            "test_dir_0/A/B/C/bigfile_1.csv",
            "test_dir_0/A/filename2.csv",
            "test_dir_0/A/filename3.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
    module_name: great_expectations.datasource.data_connector
    class_name: ConfiguredAssetDBFSDataConnector
    datasource_name: BASE
    base_directory: {base_directory}/test_dir_0/A
    glob_directive: "*"
    default_regex:
        pattern: (.+)\\.csv
        group_names:
            - name
    assets:
        A:
            base_directory: B/C
            glob_directive: "log*.csv"
            pattern: (.+)_(\\d+)\\.csv
            group_names:
                - name
                - number
    """,
    )

    my_data_connector: ConfiguredAssetDBFSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "my_configured_asset_filesystem_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )
    my_data_connector.data_context_root_directory = base_directory

    assert (
        my_data_connector._get_full_file_path_for_asset(
            path="bigfile_1.csv", asset=my_data_connector.assets["A"]
        )
        == f"{base_directory}/test_dir_0/A/B/C/bigfile_1.csv"
    )

    self_check_report = my_data_connector.self_check()

    assert self_check_report == {
        "class_name": "ConfiguredAssetDBFSDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["A"],
        "data_assets": {
            "A": {
                "batch_definition_count": 1,
                "example_data_references": ["logfile_0.csv"],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="my_configured_asset_filesystem_data_connector",
        data_asset_name="A",
        data_connector_query=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1

    my_batch_definition = my_batch_definition_list[0]
    batch_spec: BatchSpec = my_data_connector.build_batch_spec(
        batch_definition=my_batch_definition
    )
    assert isinstance(batch_spec, PathBatchSpec)
    assert batch_spec.path == f"{base_directory}/test_dir_0/A/B/C/logfile_0.csv"
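# Sketch only: the complementary rule to the Spark test earlier in this file.
# Pandas reads through the DBFS FUSE mount, so the configured `/dbfs/` path must
# survive unchanged; only Spark needs the `dbfs:/` scheme. Illustration, not GE code.
config_path = "/dbfs/great_expectations/test_dir_0/A/B/C/logfile_0.csv"
pandas_path = config_path  # PandasExecutionEngine: no translation step
assert pandas_path.startswith("/dbfs/") and not pandas_path.startswith("dbfs:/")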
def test_return_all_batch_definitions_unsorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_unsorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
    class_name: ConfiguredAssetFilesystemDataConnector
    datasource_name: test_environment
    #execution_engine:
    #    class_name: PandasExecutionEngine
    base_directory: {base_directory}
    glob_directive: "*.csv"
    assets:
        TestFiles:
    default_regex:
        pattern: (.+)_(.+)_(.+)\\.csv
        group_names:
            - name
            - timestamp
            - price
    """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )
    )
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    assert expected == unsorted_batch_definition_list
datasource_yaml = datasource_yaml.replace("<PATH_TO_YOUR_DATA_HERE>", data_dir_path) context.test_yaml_config(datasource_yaml) context.add_datasource(**yaml.load(datasource_yaml)) available_data_asset_names = context.datasources[ "taxi_datasource"].get_available_data_asset_names( data_connector_names="default_inferred_data_connector_name" )["default_inferred_data_connector_name"] assert len(available_data_asset_names) == 36 # Here is a BatchRequest naming an inferred data_asset. batch_request = BatchRequest( datasource_name="taxi_datasource", data_connector_name="default_inferred_data_connector_name", data_asset_name="<YOUR_DATA_ASSET_NAME>", ) # Please note this override is only to provide good UX for docs and tests. # In normal usage you'd set your data asset name directly in the BatchRequest above. batch_request.data_asset_name = "yellow_tripdata_sample_2019-01.csv" context.create_expectation_suite(expectation_suite_name="test_suite", overwrite_existing=True) validator = context.get_validator(batch_request=batch_request, expectation_suite_name="test_suite") print(validator.head(n_rows=10)) batch_list = context.get_batch_list(batch_request=batch_request) assert len(batch_list) == 1
def test_return_all_batch_definitions_sorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_sorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
    class_name: ConfiguredAssetFilesystemDataConnector
    datasource_name: test_environment
    #execution_engine:
    #    class_name: PandasExecutionEngine
    base_directory: {base_directory}
    glob_directive: "*.csv"
    assets:
        TestFiles:
    default_regex:
        pattern: (.+)_(.+)_(.+)\\.csv
        group_names:
            - name
            - timestamp
            - price
    sorters:
        - orderby: asc
          class_name: LexicographicSorter
          name: name
        - datetime_format: "%Y%m%d"
          orderby: desc
          class_name: DateTimeSorter
          name: timestamp
        - orderby: desc
          class_name: NumericSorter
          name: price
    """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()
    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=PartitionRequest(
            **{
                "partition_identifiers": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # TEST 2: Should only return the specified partition
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition(
            **{
                "name": "james",
                "timestamp": "20200713",
                "price": "1567",
            }
        ),
    )
    assert my_batch_definition == expected_batch_definition

    # TEST 3: Without a partition request, should return all 10
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 10
def test_get_available_data_asset_names_with_single_partition_file_data_connector(
    sample_datasource_v013_with_single_partition_file_data_connector,
):
    datasource: Datasource = (
        sample_datasource_v013_with_single_partition_file_data_connector
    )
    data_connector_names: Optional[Union[List, str]] = None

    # Call "get_batch_list_from_batch_request()" to fill up the caches
    data_connector_name: str = "test_runtime_data_connector"
    data_asset_name: str = "IN_MEMORY_DATA_ASSET"
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    batch_request: dict = {
        "datasource_name": datasource.name,
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
        "batch_data": test_df,
        "partition_request": {
            "partition_identifiers": {
                "airflow_run_id": 1234567890,
            },
            "limit": None,
        },
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)
    # noinspection PyUnusedLocal
    batch_list: List[Batch] = datasource.get_batch_list_from_batch_request(
        batch_request=batch_request
    )

    expected_data_asset_names: dict = {
        "test_runtime_data_connector": [data_asset_name],
        "my_filesystem_data_connector": ["DEFAULT_ASSET_NAME"],
    }
    available_data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names=data_connector_names
    )
    assert set(available_data_asset_names.keys()) == set(
        expected_data_asset_names.keys()
    )
    for connector_name, asset_list in available_data_asset_names.items():
        assert set(asset_list) == set(expected_data_asset_names[connector_name])

    data_connector_names = [
        "my_filesystem_data_connector",
        "test_runtime_data_connector",
    ]
    expected_data_asset_names: dict = {
        "test_runtime_data_connector": [data_asset_name],
        "my_filesystem_data_connector": ["DEFAULT_ASSET_NAME"],
    }
    available_data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names=data_connector_names
    )
    assert set(available_data_asset_names.keys()) == set(
        expected_data_asset_names.keys()
    )
    for connector_name, asset_list in available_data_asset_names.items():
        assert set(asset_list) == set(expected_data_asset_names[connector_name])

    data_connector_names = ["my_filesystem_data_connector"]
    expected_data_asset_names: dict = {
        "my_filesystem_data_connector": ["DEFAULT_ASSET_NAME"]
    }
    available_data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names=data_connector_names
    )
    assert set(available_data_asset_names.keys()) == set(
        expected_data_asset_names.keys()
    )
    for connector_name, asset_list in available_data_asset_names.items():
        assert set(asset_list) == set(expected_data_asset_names[connector_name])

    data_connector_names = "my_filesystem_data_connector"
    expected_data_asset_names: dict = {
        "my_filesystem_data_connector": ["DEFAULT_ASSET_NAME"]
    }
    available_data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names=data_connector_names
    )
    assert set(available_data_asset_names.keys()) == set(
        expected_data_asset_names.keys()
    )
    for connector_name, asset_list in available_data_asset_names.items():
        assert set(asset_list) == set(expected_data_asset_names[connector_name])

    data_connector_names = ["my_filesystem_data_connector"]
    expected_data_asset_names: dict = {
        "my_filesystem_data_connector": ["DEFAULT_ASSET_NAME"]
    }
    available_data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names=data_connector_names
    )
    assert set(available_data_asset_names.keys()) == set(
        expected_data_asset_names.keys()
    )
    for connector_name, asset_list in available_data_asset_names.items():
        assert set(asset_list) == set(expected_data_asset_names[connector_name])
def test_foxtrot(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_foxtrot"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_foxtrot/A/A-1.csv",
            "test_dir_foxtrot/A/A-2.csv",
            "test_dir_foxtrot/A/A-3.csv",
            "test_dir_foxtrot/B/B-1.txt",
            "test_dir_foxtrot/B/B-2.txt",
            "test_dir_foxtrot/B/B-3.txt",
            "test_dir_foxtrot/C/C-2017.csv",
            "test_dir_foxtrot/C/C-2018.csv",
            "test_dir_foxtrot/C/C-2019.csv",
            "test_dir_foxtrot/D/D-aaa.csv",
            "test_dir_foxtrot/D/D-bbb.csv",
            "test_dir_foxtrot/D/D-ccc.csv",
            "test_dir_foxtrot/D/D-ddd.csv",
            "test_dir_foxtrot/D/D-eee.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
    module_name: great_expectations.datasource.data_connector
    class_name: ConfiguredAssetFilesystemDataConnector
    base_directory: {base_directory}/test_dir_foxtrot
    assets:
        A:
            base_directory: A/
        B:
            base_directory: B/
            pattern: (.*)-(.*)\\.txt
            group_names:
                - part_1
                - part_2
        C:
            glob_directive: "*"
            base_directory: C/
        D:
            glob_directive: "*"
            base_directory: D/
    default_regex:
        pattern: (.*)-(.*)\\.csv
        group_names:
            - part_1
            - part_2
    """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()
    assert self_check_report == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 4,
        "example_data_asset_names": ["A", "B", "C"],
        "data_assets": {
            "A": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "A-1.csv",
                    "A-2.csv",
                    "A-3.csv",
                ],
            },
            "B": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "B-1.txt",
                    "B-2.txt",
                    "B-3.txt",
                ],
            },
            "C": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "C-2017.csv",
                    "C-2018.csv",
                    "C-2019.csv",
                ],
            },
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        "example_data_reference": {},
    }

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 3
def test_get_batch_definitions_and_get_batch_basics(basic_pandas_datasource_v013):
    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        basic_pandas_datasource_v013.data_connectors["my_filesystem_data_connector"]
    )
    create_files_in_directory(
        my_data_connector.base_directory,
        ["A_1.csv", "A_2.csv", "A_3.csv", "B_1.csv", "B_2.csv", "B_3.csv"],
    )

    assert (
        len(
            basic_pandas_datasource_v013.get_available_batch_definitions(
                batch_request=BatchRequest(
                    datasource_name="my_datasource",
                    data_connector_name="my_filesystem_data_connector",
                    data_asset_name="Titanic",
                )
            )
        )
        == 6
    )

    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            batch_identifiers=IDDict(
                {
                    "letter": "B",
                    "number": "1",
                }
            ),
        )
    )

    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.batch_definition == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="B1",
        batch_identifiers=IDDict(
            {
                "letter": "B",
                "number": "1",
            }
        ),
    )

    batch_list: List[
        Batch
    ] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            data_connector_query={
                "batch_filter_parameters": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 0

    batch_list: List[
        Batch
    ] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="Titanic",
            data_connector_query={
                "batch_filter_parameters": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 1
    assert isinstance(batch_list[0].data.dataframe, pd.DataFrame)

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            batch_identifiers=IDDict({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
def test_batches_are_accessible(
    multibatch_generic_csv_generator,
    multibatch_generic_csv_generator_context,
):
    """
    What does this test and why?
    Batches created in the multibatch_generic_csv_generator fixture should be available using the
    multibatch_generic_csv_generator_context.
    This test most likely duplicates tests elsewhere, but it is more of a test of the configurable fixture.
    """

    context: DataContext = multibatch_generic_csv_generator_context
    data_relative_path = "../data"
    data_path = os.path.join(context.root_directory, data_relative_path)
    datasource_name = "generic_csv_generator"
    data_connector_name = "daily_data_connector"
    asset_name = "daily_data_asset"

    datasource = context.datasources[datasource_name]
    data_connector = datasource.data_connectors[data_connector_name]

    total_batches: int = 20
    file_list = multibatch_generic_csv_generator(
        data_path=data_path, num_event_batches=total_batches
    )

    assert (
        data_connector._get_data_reference_list_from_cache_by_data_asset_name(
            data_asset_name=asset_name
        )
        == file_list
    )

    batch_request_1 = BatchRequest(
        datasource_name="generic_csv_generator",
        data_connector_name="daily_data_connector",
        data_asset_name="daily_data_asset",
        data_connector_query={
            "index": -1,
        },
    )
    # Should give most recent batch
    validator_1 = context.get_validator(
        batch_request=batch_request_1,
        create_expectation_suite_with_name="my_expectation_suite_name_1",
    )
    metric_max = validator_1.get_metric(
        MetricConfiguration(
            "column.max", metric_domain_kwargs={"column": "batch_num"}
        )
    )
    assert metric_max == total_batches
    metric_value_set = validator_1.get_metric(
        MetricConfiguration(
            "column.distinct_values",
            metric_domain_kwargs={"column": "string_cardinality_3"},
        )
    )
    assert metric_value_set == {"category0", "category1", "category2"}

    batch_request_2 = BatchRequest(
        datasource_name="generic_csv_generator",
        data_connector_name="daily_data_connector",
        data_asset_name="daily_data_asset",
        data_connector_query={
            "index": -2,
        },
    )
    validator_2 = context.get_validator(
        batch_request=batch_request_2,
        create_expectation_suite_with_name="my_expectation_suite_name_2",
    )
    metric_max = validator_2.get_metric(
        MetricConfiguration(
            "column.max", metric_domain_kwargs={"column": "batch_num"}
        )
    )
    assert metric_max == total_batches - 1
    metric_value_set = validator_2.get_metric(
        MetricConfiguration(
            "column.distinct_values",
            metric_domain_kwargs={"column": "string_cardinality_3"},
        )
    )
    assert metric_value_set == {"category0", "category1", "category2"}

    for batch_num in range(1, total_batches + 1):
        batch_request = BatchRequest(
            datasource_name="generic_csv_generator",
            data_connector_name="daily_data_connector",
            data_asset_name="daily_data_asset",
            data_connector_query={
                "index": -batch_num,
            },
        )
        validator = context.get_validator(
            batch_request=batch_request,
            create_expectation_suite_with_name=f"my_expectation_suite_name__{batch_num}",
        )
        metric_max = validator.get_metric(
            MetricConfiguration(
                "column.max", metric_domain_kwargs={"column": "batch_num"}
            )
        )
        assert metric_max == (total_batches + 1) - batch_num
        metric_value_set = validator.get_metric(
            MetricConfiguration(
                "column.distinct_values",
                metric_domain_kwargs={"column": "string_cardinality_3"},
            )
        )
        assert metric_value_set == {"category0", "category1", "category2"}
def test_complex_regex_example_with_implicit_data_asset_names(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_complex_regex_example_with_implicit_data_asset_names"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "2020/01/alpha-1001.csv",
            "2020/01/beta-1002.csv",
            "2020/02/alpha-1003.csv",
            "2020/02/beta-1004.csv",
            "2020/03/alpha-1005.csv",
            "2020/03/beta-1006.csv",
            "2020/04/beta-1007.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            default_regex={
                "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
                "group_names": ["year_dir", "month_dir", "data_asset_name"],
            },
            glob_directive="*/*/*.csv",
            base_directory=base_directory,
        )
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    # Test for an unknown datasource
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="non_existent_datasource",
                data_connector_name="my_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )

    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_request={
                "partition_identifiers": {"year_dir": "2020", "month_dir": "03"}
            },
        )
    ) == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_definition=PartitionDefinition(
                year_dir="2020", month_dir="03"
            ),
        )
    ]
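# Sketch only: the regex wiring above, demonstrated with the standard library.
# With `data_asset_name` as a capture group, the asset name is inferred from the
# path itself rather than configured explicitly.
import re

pattern = r"(\d{4})/(\d{2})/(.+)-\d+\.csv"
match = re.match(pattern, "2020/03/alpha-1005.csv")
assert match is not None
assert match.groups() == ("2020", "03", "alpha")  # year_dir, month_dir, data_asset_name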
# In normal usage you'd set your path directly in the yaml above.
datasource_yaml = datasource_yaml.replace("<YOUR_GCS_BUCKET_HERE>", "test_docs_data")
datasource_yaml = datasource_yaml.replace(
    "<BUCKET_PATH_TO_DATA>", "data/taxi_yellow_tripdata_samples/"
)

context.test_yaml_config(datasource_yaml)

# <snippet>
context.add_datasource(**yaml.load(datasource_yaml))
# </snippet>

# batch_request with data_asset_name
# <snippet>
batch_request = BatchRequest(
    datasource_name="my_gcs_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)
# </snippet>

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = (
    "data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_2019-01"
)

# <snippet>
context.create_expectation_suite(
    expectation_suite_name="test_gcs_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_gcs_suite"
)
def test_redundant_information_in_naming_convention_bucket_sorted(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/2021/01/01/log_file-20210101.txt.gz",
            "some_bucket/2021/01/02/log_file-20210102.txt.gz",
            "some_bucket/2021/01/03/log_file-20210103.txt.gz",
            "some_bucket/2021/01/04/log_file-20210104.txt.gz",
            "some_bucket/2021/01/05/log_file-20210105.txt.gz",
            "some_bucket/2021/01/06/log_file-20210106.txt.gz",
            "some_bucket/2021/01/07/log_file-20210107.txt.gz",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
        module_name: great_expectations.datasource.data_connector
        class_name: InferredAssetFilesystemDataConnector
        datasource_name: test_environment
        name: my_inferred_asset_filesystem_data_connector
        base_directory: {base_directory}/
        glob_directive: "*/*/*/*/*.txt.gz"
        default_regex:
            group_names:
                - data_asset_name
                - year
                - month
                - day
                - full_date
            pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-(.*)\\.txt\\.gz
        sorters:
            - orderby: desc
              class_name: DateTimeSorter
              name: full_date
        """,
    )

    my_data_connector: InferredAssetFilesystemDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "my_inferred_asset_filesystem_data_connector",
            "datasource_name": "test_environment",
            "execution_engine": "BASE_ENGINE",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="my_inferred_asset_filesystem_data_connector",
                data_asset_name="some_bucket",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "07", "full_date": "20210107"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "06", "full_date": "20210106"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "05", "full_date": "20210105"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "04", "full_date": "20210104"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "03", "full_date": "20210103"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "02", "full_date": "20210102"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "01", "full_date": "20210101"}
            ),
        ),
    ]

    assert expected == sorted_batch_definition_list
data_dir_path = "data" CONNECTION_STRING = f"sqlite:///{data_dir_path}/yellow_tripdata.db" datasource_yaml = datasource_yaml.replace("<CONNECTION_STRING>", CONNECTION_STRING) context.test_yaml_config(datasource_yaml) context.add_datasource(**yaml.load(datasource_yaml)) available_data_asset_names = context.datasources[ "taxi_datasource" ].get_available_data_asset_names(data_connector_names="whole_table")["whole_table"] assert len(available_data_asset_names) == 2 # Here is a BatchRequest referring to an un-partitioned inferred data_asset. batch_request = BatchRequest( datasource_name="taxi_datasource", data_connector_name="whole_table", data_asset_name="<YOUR_DATA_ASSET_NAME>", ) # Please note this override is only to provide good UX for docs and tests. # In normal usage you'd set your data asset name directly in the BatchRequest above. batch_request.data_asset_name: str = "main.yellow_tripdata_sample_2019_01" context.create_expectation_suite( expectation_suite_name="test_suite", overwrite_existing=True ) validator = context.get_validator( batch_request=batch_request, expectation_suite_name="test_suite" ) print(validator.head(n_rows=10))
batch_request.runtime_parameters["path"] = "data/yellow_tripdata_sample_2019-01.csv"

context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)

# Here is a BatchRequest naming a data_asset
batch_request = BatchRequest(
    datasource_name="my_filesystem_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "yellow_tripdata_sample_2019-01"

context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)
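# --- Illustration (not part of the original snippet): the path-based request mutated
# above can also be constructed in a single step. This is a minimal sketch assuming the
# v3 (Batch Request) API's RuntimeBatchRequest; the connector name and batch_identifiers
# values below are illustrative placeholders, not values from the original document.
from great_expectations.core.batch import RuntimeBatchRequest

runtime_batch_request = RuntimeBatchRequest(
    datasource_name="my_filesystem_datasource",
    data_connector_name="default_runtime_data_connector_name",  # assumed connector name
    data_asset_name="yellow_tripdata_sample_2019-01",  # any label works for runtime data
    runtime_parameters={"path": "data/yellow_tripdata_sample_2019-01.csv"},
    batch_identifiers={"default_identifier_name": "default_identifier"},
)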
            - data_asset_name
        pattern: (.*)
"""

# Note: this override is for internal GE purposes, and is intended to help us better understand how the
# Getting Started Guide is being used. It can be ignored by users.
datasource_yaml = datasource_yaml.replace(
    "getting_started_datasource", GETTING_STARTED_DATASOURCE_NAME
)

context.test_yaml_config(datasource_yaml)
context.add_datasource(**yaml.load(datasource_yaml))

# Get Validator by creating ExpectationSuite and passing in BatchRequest
batch_request = BatchRequest(
    datasource_name="getting_started_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="yellow_tripdata_sample_2019-01.csv",
    limit=1000,
)

# Note: this override is for internal GE purposes, and is intended to help us better understand how the
# Getting Started Guide is being used. It can be ignored by users.
batch_request = BatchRequest(
    datasource_name=GETTING_STARTED_DATASOURCE_NAME,
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="yellow_tripdata_sample_2019-01.csv",
    limit=1000,
)

expectation_suite_name = "getting_started_expectation_suite_taxi.demo"

# Note: this override is for internal GE purposes, and is intended to help us better understand how the
# Getting Started Guide is being used. It can be ignored by users.
def test_example_with_explicit_data_asset_names(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_example_with_explicit_data_asset_names")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "my_base_directory/alpha/files/go/here/alpha-202001.csv",
            "my_base_directory/alpha/files/go/here/alpha-202002.csv",
            "my_base_directory/alpha/files/go/here/alpha-202003.csv",
            "my_base_directory/beta_here/beta-202001.txt",
            "my_base_directory/beta_here/beta-202002.txt",
            "my_base_directory/beta_here/beta-202003.txt",
            "my_base_directory/beta_here/beta-202004.txt",
            "my_base_directory/gamma-202001.csv",
            "my_base_directory/gamma-202002.csv",
            "my_base_directory/gamma-202003.csv",
            "my_base_directory/gamma-202004.csv",
            "my_base_directory/gamma-202005.csv",
        ],
    )

    yaml_string = f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE_NAME
base_directory: {base_directory}/my_base_directory/
default_regex:
    pattern: ^(.+)-(\\d{{4}})(\\d{{2}})\\.(csv|txt)$
    group_names:
        - data_asset_name
        - year_dir
        - month_dir
assets:
    alpha:
        base_directory: {base_directory}/my_base_directory/alpha/files/go/here/
        glob_directive: "*.csv"
    beta:
        base_directory: {base_directory}/my_base_directory/beta_here/
        glob_directive: "*.txt"
    gamma:
        glob_directive: "*.csv"
"""
    config = yaml.load(yaml_string)
    my_data_connector = instantiate_class_from_config(
        config,
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
        runtime_environment={"name": "my_data_connector"},
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert len(my_data_connector.get_unmatched_data_references()) == 0
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="gamma",
                )
            )
        )
        == 5
    )
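# --- Illustration (not part of the original test): a standalone sketch of why the gamma
# asset yields five batches. The first three groups of the default_regex above map to
# (data_asset_name, year_dir, month_dir); the trailing extension group is unnamed. Only
# the standard-library `re` module is assumed; filenames mirror the test fixture.
import re

pattern = r"^(.+)-(\d{4})(\d{2})\.(csv|txt)$"
names = [f"gamma-20200{i}.csv" for i in range(1, 6)]

for name in names:
    match = re.match(pattern, name)
    # e.g. "gamma-202001.csv" -> ("gamma", "2020", "01", "csv"):
    # asset "gamma", year_dir "2020", month_dir "01".
    assert match is not None and match.group(1) == "gamma"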
def test_get_validator_expectation_suite_options(
    data_context_with_sql_datasource_for_testing_get_batch,
):
    context = data_context_with_sql_datasource_for_testing_get_batch
    context.create_expectation_suite("some_expectations")

    # Successful specification with an existing expectation_suite_name
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
        expectation_suite_name="some_expectations",
    )

    # Successful specification with a fetched ExpectationSuite object
    some_expectations = context.get_expectation_suite("some_expectations")
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
        expectation_suite=some_expectations,
    )

    # Successful specification with a fresh ExpectationSuite object
    some_more_expectations = context.create_expectation_suite(
        expectation_suite_name="some_more_expectations"
    )
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
        expectation_suite=some_more_expectations,
    )

    # Successful specification using overwrite_existing_expectation_suite
    context.get_validator(
        batch_request=BatchRequest(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            partition_request=PartitionRequest(
                partition_identifiers={"date": "2020-01-15"}
            ),
        ),
        create_expectation_suite_with_name="yet_more_expectations",
        # TODO: readd
        # overwrite_existing_expectation_suite=True,
    )

    # Failed specification: incorrectly typed expectation suite
    with pytest.raises(TypeError):
        context.get_validator(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            date="2020-01-15",
            expectation_suite={
                "im": "a",
                "dictionary": "not a",
                "ExpectationSuite": False,
            },
        )
def test_redundant_information_in_naming_convention_bucket_sorted(
    mock_gcs_conn, mock_list_keys, mock_emit
):
    my_data_connector_yaml = yaml.load(
        """
        module_name: great_expectations.datasource.data_connector
        class_name: InferredAssetGCSDataConnector
        datasource_name: test_environment
        name: my_inferred_asset_filesystem_data_connector
        bucket_or_name: test_bucket
        prefix: ""
        default_regex:
            group_names:
                - data_asset_name
                - year
                - month
                - day
                - full_date
            pattern: (\\w{11})/(\\d{4})/(\\d{2})/(\\d{2})/log_file-(.*)\\.txt\\.gz
        sorters:
            - orderby: desc
              class_name: DateTimeSorter
              name: full_date
        """,
    )

    my_data_connector: InferredAssetGCSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "my_inferred_asset_filesystem_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="my_inferred_asset_filesystem_data_connector",
                data_asset_name="some_bucket",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "07", "full_date": "20210107"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "06", "full_date": "20210106"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "05", "full_date": "20210105"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "04", "full_date": "20210104"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "03", "full_date": "20210103"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "02", "full_date": "20210102"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "01", "full_date": "20210101"}
            ),
        ),
    ]

    assert expected == sorted_batch_definition_list