def test_for_self_check_using_InferredAssetFilesystemDataConnector_SparkDFExecutionEngine(
    spark_session, tmp_path_factory
):
    """self_check() with a Spark execution engine reports asset and row counts.

    Three files, each with a distinct name prefix, yield three data assets;
    the sampled example data reference is expected to contain three rows.
    """
    base_dir = str(
        tmp_path_factory.mktemp(
            "basic_data_connector_inferred_asset_filesystem_data_connector"
        )
    )
    create_files_in_directory(
        directory=base_dir,
        file_name_list=[
            "alex_20201010_1000.csv",
            "abe_202011111_2000.csv",
            "will_20201212_3000.csv",
        ],
    )

    connector = InferredAssetFilesystemDataConnector(
        name="my_data_connector",
        base_directory=base_dir,
        glob_directive="*.csv",
        datasource_name="FAKE_DATASOURCE",
        execution_engine=SparkDFExecutionEngine(),
        default_regex={
            # First capture group becomes the data asset name.
            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
            "group_names": ["data_asset_name", "timestamp", "size"],
        },
    )

    report = connector.self_check()
    assert report["data_asset_count"] == 3
    assert report["example_data_reference"]["n_rows"] == 3
def test_simple_regex_example_with_implicit_data_asset_names_self_check(
    tmp_path_factory,
):
    """self_check() groups matching files into assets and reports unmatched ones.

    Files "A-*.csv" / "B-*.csv" match the regex and become assets "A" and "B";
    "CCC.csv" does not match and shows up as an unmatched data reference.
    """
    base_dir = str(
        tmp_path_factory.mktemp(
            "test_simple_regex_example_with_implicit_data_asset_names"
        )
    )
    create_files_in_directory(
        directory=base_dir,
        file_name_list=[
            "A-100.csv",
            "A-101.csv",
            "B-1.csv",
            "B-2.csv",
            "CCC.csv",
        ],
    )

    data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            execution_engine=PandasExecutionEngine(),
            default_regex={
                "pattern": r"(.+)-(\d+)\.csv",
                "group_names": [
                    "data_asset_name",
                    "number",
                ],
            },
            glob_directive="*",
            base_directory=base_dir,
        )
    )
    # noinspection PyProtectedMember
    data_connector._refresh_data_references_cache()

    expected_report = {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": ["CCC.csv"],
        "unmatched_data_reference_count": 1,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
    assert data_connector.self_check() == expected_report
def test_self_check(tmp_path_factory):
    """self_check() on a fully-matching directory reports two assets, no misses.

    All four files match the default regex, so the report lists assets "A"
    and "B" with two batch definitions each and zero unmatched references.
    """
    base_dir = str(tmp_path_factory.mktemp("test_self_check"))
    create_files_in_directory(
        directory=base_dir,
        file_name_list=[
            "A-100.csv",
            "A-101.csv",
            "B-1.csv",
            "B-2.csv",
        ],
    )

    data_connector: InferredAssetFilesystemDataConnector = InferredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": r"(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "number"],
        },
        glob_directive="*",
        base_directory=base_dir,
    )
    # noinspection PyProtectedMember
    data_connector._refresh_data_references_cache()

    expected_report = {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        "example_data_reference": {},
    }
    assert data_connector.self_check() == expected_report
def test_basic_instantiation(tmp_path_factory):
    """Connector discovers nested files and rejects mismatched datasource names.

    Four files in two subdirectories all match the "*/*.csv" glob and the
    default regex; a BatchRequest naming an unknown datasource raises
    ValueError.
    """
    base_dir = str(tmp_path_factory.mktemp("test_basic_instantiation"))
    create_files_in_directory(
        directory=base_dir,
        file_name_list=[
            "path/A-100.csv",
            "path/A-101.csv",
            "directory/B-1.csv",
            "directory/B-2.csv",
        ],
    )

    data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            execution_engine=PandasExecutionEngine(),
            default_regex={
                # Directory name becomes the data asset name.
                "pattern": r"(.+)/(.+)-(\d+)\.csv",
                "group_names": ["data_asset_name", "letter", "number"],
            },
            glob_directive="*/*.csv",
            base_directory=base_dir,
        )
    )
    # noinspection PyProtectedMember
    data_connector._refresh_data_references_cache()

    assert data_connector.get_data_reference_list_count() == 4
    assert data_connector.get_unmatched_data_references() == []

    # Illegal execution environment name
    with pytest.raises(ValueError):
        bad_request = BatchRequest(
            datasource_name="something",
            data_connector_name="my_data_connector",
            data_asset_name="something",
        )
        print(
            data_connector.get_batch_definition_list_from_batch_request(
                batch_request=bad_request
            )
        )
def test_complex_regex_example_with_implicit_data_asset_names(tmp_path_factory):
    """Batch requests resolve against year/month-partitioned assets.

    The regex pulls the asset name from the file name and exposes the
    "year_dir"/"month_dir" path segments as batch identifiers. Unknown
    datasource or data connector names raise ValueError; filtering on the
    identifiers narrows the result to a single batch definition.
    """
    base_dir = str(
        tmp_path_factory.mktemp(
            "test_complex_regex_example_with_implicit_data_asset_names"
        )
    )
    create_files_in_directory(
        directory=base_dir,
        file_name_list=[
            "2020/01/alpha-1001.csv",
            "2020/01/beta-1002.csv",
            "2020/02/alpha-1003.csv",
            "2020/02/beta-1004.csv",
            "2020/03/alpha-1005.csv",
            "2020/03/beta-1006.csv",
            "2020/04/beta-1007.csv",
        ],
    )

    connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            default_regex={
                "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
                "group_names": ["year_dir", "month_dir", "data_asset_name"],
            },
            glob_directive="*/*/*.csv",
            base_directory=base_dir,
        )
    )
    # noinspection PyProtectedMember
    connector._refresh_data_references_cache()

    # Test for an unknown execution environment
    with pytest.raises(ValueError):
        connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="non_existent_datasource",
                data_connector_name="my_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    alpha_batches = connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
        )
    )
    assert len(alpha_batches) == 3

    beta_batches = connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="beta",
        )
    )
    assert len(beta_batches) == 4

    # Filtering on the regex-derived identifiers narrows to one definition.
    filtered = connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            data_connector_query={
                "batch_filter_parameters": {
                    "year_dir": "2020",
                    "month_dir": "03",
                }
            },
        )
    )
    assert filtered == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            batch_identifiers=IDDict(
                year_dir="2020",
                month_dir="03",
            ),
        )
    ]