def test_partition_request_partition_request_partition_identifiers_1_key(
    create_files_and_instantiate_data_connector,
):
    """Filtering on a single partition identifier ("timestamp") returns every matching batch."""
    my_data_connector = create_files_and_instantiate_data_connector

    # No limit is applied, so all four files stamped 20200809 come back.
    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                partition_request={
                    "partition_identifiers": {"timestamp": "20200809"},
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 4

    # Build the expected definitions from the (name, price) pairs that share the timestamp.
    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": name, "timestamp": "20200809", "price": price}
            ),
        )
        for name, price in [
            ("abe", "1040"),
            ("alex", "1000"),
            ("eugene", "1500"),
            ("will", "1002"),
        ]
    ]
    assert returned_batch_definition_list == expected
def test_custom_list(periodic_table_of_elements):
    """CustomListSorter orders batches by their position in a reference list."""

    def element_batch(ds, dc, asset, element):
        # Small local factory: only the identifying strings vary between batches.
        return BatchDefinition(
            datasource_name=ds,
            data_connector_name=dc,
            data_asset_name=asset,
            partition_definition=PartitionDefinition({"element": element}),
        )

    Hydrogen = element_batch("A", "a", "aaa", "Hydrogen")
    Helium = element_batch("B", "b", "bbb", "Helium")
    Lithium = element_batch("C", "c", "ccc", "Lithium")
    batch_list = [Hydrogen, Helium, Lithium]

    # Descending: latest element in the reference list comes first.
    my_sorter = CustomListSorter(
        name="element", orderby="desc", reference_list=periodic_table_of_elements
    )
    assert my_sorter.get_sorted_batch_definitions(batch_list) == [
        Lithium,
        Helium,
        Hydrogen,
    ]

    # Ascending: natural reference-list order.
    my_sorter = CustomListSorter(
        name="element", orderby="asc", reference_list=periodic_table_of_elements
    )
    assert my_sorter.get_sorted_batch_definitions(batch_list) == [
        Hydrogen,
        Helium,
        Lithium,
    ]
def test_create_three_batch_definitions_sort_lexicographically():
    """LexicographicSorter orders batch definitions alphabetically by a string key."""
    a, b, c = [
        BatchDefinition(
            datasource_name=letter,
            data_connector_name=letter.lower(),
            data_asset_name=letter.lower() * 3,
            partition_definition=PartitionDefinition({"id": letter}),
        )
        for letter in ("A", "B", "C")
    ]
    batch_list = [a, b, c]

    # Descending: reverse alphabetical by "id".
    my_sorter = LexicographicSorter(name="id", orderby="desc")
    assert my_sorter.get_sorted_batch_definitions(batch_list) == [c, b, a]

    # Ascending: alphabetical by "id".
    my_sorter = LexicographicSorter(name="id", orderby="asc")
    assert my_sorter.get_sorted_batch_definitions(batch_list) == [a, b, c]
def test_batch_definition_id():
    """Distinct BatchDefinitions derive distinct ids."""
    batch_a = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    print(batch_a.id)
    batch_b = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))
    print(batch_b.id)
    # Different identifying fields must not collide on .id.
    assert batch_a.id != batch_b.id
def test_map_data_reference_string_to_batch_definition_list_using_regex():
    """Exercise regex mapping: no match, default asset name, and explicit asset name."""

    def build_expected(asset_name):
        # Expected single-element result for the "alex" reference under asset_name.
        return [
            BatchDefinition(
                datasource_name="test_datasource",
                data_connector_name="test_data_connector",
                data_asset_name=asset_name,
                partition_definition=PartitionDefinition(
                    {"name": "alex", "timestamp": "20200809", "price": "1000"}
                ),
            )
        ]

    group_names = ["name", "timestamp", "price"]

    # A pattern that cannot match the reference yields None.
    result = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name=None,
        data_reference="alex_20200809_1000.csv",
        regex_pattern=r"^(.+)_____________\.csv$",
        group_names=group_names,
    )
    assert result is None

    # With no data_asset_name configured, the DEFAULT_ASSET_NAME placeholder is used.
    result = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name=None,
        data_reference="alex_20200809_1000.csv",
        regex_pattern=r"^(.+)_(\d+)_(\d+)\.csv$",
        group_names=group_names,
    )
    assert result == build_expected("DEFAULT_ASSET_NAME")

    # An explicitly configured data_asset_name is carried through unchanged.
    result = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name="test_data_asset",
        data_reference="alex_20200809_1000.csv",
        regex_pattern=r"^(.+)_(\d+)_(\d+)\.csv$",
        group_names=group_names,
    )
    assert result == build_expected("test_data_asset")
def test_convert_partition_definition_to_data_reference_string_using_regex():
    """Partition values are substituted back into the regex; uncaptured groups become '*'."""
    partition_definition = PartitionDefinition(
        {"name": "alex", "timestamp": "20200809", "price": "1000"}
    )

    # All three groups captured: a fully concrete data reference.
    converted = convert_partition_definition_to_data_reference_string_using_regex(
        partition_definition=partition_definition,
        regex_pattern=r"^(.+)_(\d+)_(\d+)\.csv$",
        group_names=["name", "timestamp", "price"],
    )
    assert converted == "alex_20200809_1000.csv"

    # "price" is not a capturing group, so its slot is wildcarded.
    converted = convert_partition_definition_to_data_reference_string_using_regex(
        partition_definition=partition_definition,
        regex_pattern=r"^(.+)_(\d+)_\d+\.csv$",
        group_names=["name", "timestamp"],
    )
    assert converted == "alex_20200809_*.csv"

    # "name" is not a capturing group, so its slot is wildcarded.
    converted = convert_partition_definition_to_data_reference_string_using_regex(
        partition_definition=partition_definition,
        regex_pattern=r"^.+_(\d+)_(\d+)\.csv$",
        group_names=["timestamp", "price"],
    )
    assert converted == "*_20200809_1000.csv"
def test_batch_definition_equality():
    """Equality is value-based: identical fields compare equal, differing fields do not."""
    first = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    second = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))
    assert first != second

    # A structurally identical definition compares equal to the original instance.
    first_clone = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    assert first == first_clone
def test_batch_definition_id():
    """The derived .id differs whenever the identifying fields differ."""
    # Build both definitions from their key letter; insertion order fixes print order.
    definitions = {
        key: BatchDefinition(
            key, key.lower(), key.lower() * 3, PartitionDefinition({"id": key})
        )
        for key in ("A", "B")
    }
    for definition in definitions.values():
        print(definition.id)
    assert definitions["A"].id != definitions["B"].id
def test_convert_data_reference_string_to_partition_definition_using_regex():
    """References matching the pattern map to (asset, PartitionDefinition); misses map to None."""
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]

    # Fully matching references yield the default asset name plus all captured groups.
    matching_cases = [
        (
            "alex_20200809_1000.csv",
            {"name": "alex", "timestamp": "20200809", "price": "1000"},
        ),
        (
            "eugene_20200810_1500.csv",
            {"name": "eugene", "timestamp": "20200810", "price": "1500"},
        ),
    ]
    for data_reference, expected_groups in matching_cases:
        assert convert_data_reference_string_to_partition_definition_using_regex(
            data_reference=data_reference,
            regex_pattern=pattern,
            group_names=group_names,
        ) == ("DEFAULT_ASSET_NAME", PartitionDefinition(expected_groups))

    # References that fail to match (wholly or in part) yield None.
    for data_reference in [
        "DOESNT_MATCH_CAPTURING_GROUPS.csv",
        "eugene_DOESNT_MATCH_ALL_CAPTURING_GROUPS_1500.csv",
    ]:
        assert (
            convert_data_reference_string_to_partition_definition_using_regex(
                data_reference=data_reference,
                regex_pattern=pattern,
                group_names=group_names,
            )
            is None
        )
def test_map_batch_definition_to_data_reference_string_using_regex():
    """Round-trip a BatchDefinition back into its file name; invalid inputs raise."""
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"

    # Passing anything other than a BatchDefinition is a TypeError.
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker
        map_batch_definition_to_data_reference_string_using_regex(
            batch_definition="I_am_a_string",
            regex_pattern=regex_pattern,
            group_names=["name", "timestamp", "price"],
        )

    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition(
            {"name": "eugene", "timestamp": "20200809", "price": "1500"}
        ),
    )

    # Group names absent from the partition definition raise a KeyError.
    with pytest.raises(KeyError):
        map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=["i", "wont", "match"],
        )

    # Matching group names reproduce the original data reference string.
    my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
        batch_definition=my_batch_definition,
        regex_pattern=regex_pattern,
        group_names=["name", "timestamp", "price"],
    )
    assert my_data_reference == "eugene_20200809_1500.csv"
def test_date_time():
    """DateTimeSorter parses the key with the given format and sorts chronologically."""

    def dated_batch(ds, dc, asset, date):
        # Local factory: only the identifying strings and the date vary.
        return BatchDefinition(
            datasource_name=ds,
            data_connector_name=dc,
            data_asset_name=asset,
            partition_definition=PartitionDefinition({"date": date}),
        )

    first = dated_batch("A", "a", "aaa", "20210101")
    second = dated_batch("B", "b", "bbb", "20210102")
    third = dated_batch("C", "c", "ccc", "20210103")
    batch_list = [first, second, third]

    # Descending: newest date first.
    my_sorter = DateTimeSorter(name="date", datetime_format="%Y%m%d", orderby="desc")
    assert my_sorter.get_sorted_batch_definitions(batch_list) == [third, second, first]

    # Ascending: oldest date first.
    my_sorter = DateTimeSorter(name="date", datetime_format="%Y%m%d", orderby="asc")
    assert my_sorter.get_sorted_batch_definitions(batch_list) == [first, second, third]

    # A non-string (numeric) datetime_format is rejected at construction time.
    with pytest.raises(ge_exceptions.SorterError):
        DateTimeSorter(name="date", datetime_format=12345, orderby="desc")

    # A non-string date value is rejected during sorting.
    my_date_is_not_a_string = dated_batch("C", "c", "ccc", 20210103)
    my_sorter = DateTimeSorter(name="date", datetime_format="%Y%m%d", orderby="desc")
    with pytest.raises(ge_exceptions.SorterError):
        my_sorter.get_sorted_batch_definitions(
            [first, second, third, my_date_is_not_a_string]
        )
def test_batch_definition_equality():
    """__eq__ compares field values, not object identity."""
    args_a = ("A", "a", "aaa")
    args_b = ("B", "b", "bbb")
    # noinspection PyPep8Naming
    A = BatchDefinition(*args_a, PartitionDefinition({"id": "A"}))
    # noinspection PyPep8Naming
    B = BatchDefinition(*args_b, PartitionDefinition({"id": "B"}))
    assert A != B
    # A second instance built from the same values compares equal to the first.
    # noinspection PyPep8Naming
    A2 = BatchDefinition(*args_a, PartitionDefinition({"id": "A"}))
    assert A == A2
def test_partition_request_sorted_filtered_by_custom_filter_with_slice_as_list(
    create_files_and_instantiate_data_connector,
):
    """A custom filter combined with a list-of-indices slice returns just those positions."""
    # <TODO> is this behavior correct?
    my_data_connector = create_files_and_instantiate_data_connector

    # Note that both a function and a lambda Callable types are acceptable as
    # the definition of a custom filter.
    def my_custom_partition_selector(partition_definition: dict) -> bool:
        # Keep only the listed names, and only timestamps after 2020-07-15.
        if partition_definition["name"] not in ["abe", "james", "eugene"]:
            return False
        parsed = datetime.datetime.strptime(
            partition_definition["timestamp"], "%Y%m%d"
        ).date()
        return parsed > datetime.datetime(2020, 7, 15).date()

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                partition_request={
                    "custom_filter_function": my_custom_partition_selector,
                    "index": [1, 3],
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 2

    # Positions 1 and 3 of the filtered, sorted list are both eugene's batches.
    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": timestamp, "price": price}
            ),
        )
        for timestamp, price in [("20201129", "1900"), ("20200809", "1500")]
    ]
    assert returned_batch_definition_list == expected
def test_partition_request_partition_request_partition_identifiers_1_key_and_index(
    create_files_and_instantiate_data_connector,
):
    """A partition-identifier filter plus an integer index selects exactly one batch."""
    my_data_connector = create_files_and_instantiate_data_connector

    # No limit; "index": 0 picks the first of james' matching batches.
    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                partition_request={
                    "partition_identifiers": {"name": "james"},
                    "index": 0,
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 1

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
    ]
    assert returned_batch_definition_list == expected
def get_batch_definition_list_from_batch_request(
    self, batch_request: BatchRequest
) -> List[BatchDefinition]:
    """Return all cached batch definitions matching the given batch request.

    Args:
        batch_request: the request to match; its data_asset_name must be known
            to this data connector.

    Returns:
        The matching BatchDefinition objects (possibly an empty list).

    Raises:
        KeyError: if batch_request.data_asset_name is not in the cache.
    """
    self._validate_batch_request(batch_request=batch_request)

    # Lazily populate the data-references cache on first use.
    if self._data_references_cache is None:
        self._refresh_data_references_cache()

    batch_definition_list: List[BatchDefinition] = []
    try:
        sub_cache = self._data_references_cache[batch_request.data_asset_name]
    except KeyError as e:
        # Re-raise with a clearer message, explicitly chaining the original
        # exception (the original code captured `e` but never used it).
        raise KeyError(
            f"data_asset_name {batch_request.data_asset_name} is not recognized."
        ) from e
    for partition_definition in sub_cache:
        batch_definition: BatchDefinition = BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=batch_request.data_asset_name,
            partition_definition=PartitionDefinition(partition_definition),
        )
        # Keep only definitions consistent with every constraint in the request.
        if batch_definition_matches_batch_request(batch_definition, batch_request):
            batch_definition_list.append(batch_definition)
    return batch_definition_list
def test_batch_definition_instantiation():
    """A plain dict is rejected for partition_definition; PartitionDefinition is required."""
    with pytest.raises(TypeError):
        BatchDefinition("A", "a", "aaa", {"id": "A"})
    # The properly-typed constructor call succeeds and yields a usable id.
    batch = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    print(batch.id)
def test_batch_definition_instantiation():
    """Constructing with a raw dict raises TypeError; a PartitionDefinition succeeds."""
    bad_args = ("A", "a", "aaa", {"id": "A"})
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker
        BatchDefinition(*bad_args)
    # noinspection PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    print(A.id)
def test_create_three_batch_definitions_sort_numerically():
    """NumericSorter orders by an integer key and rejects non-numeric values."""

    def numbered_batch(ds, dc, asset, number):
        # Local factory: only the identifying strings and the id vary.
        return BatchDefinition(
            datasource_name=ds,
            data_connector_name=dc,
            data_asset_name=asset,
            partition_definition=PartitionDefinition({"id": number}),
        )

    one = numbered_batch("A", "a", "aaa", 1)
    two = numbered_batch("B", "b", "bbb", 2)
    three = numbered_batch("C", "c", "ccc", 3)
    batch_list = [one, two, three]

    # Descending: largest id first.
    my_sorter = NumericSorter(name="id", orderby="desc")
    assert my_sorter.get_sorted_batch_definitions(batch_list) == [three, two, one]

    # Ascending: smallest id first.
    my_sorter = NumericSorter(name="id", orderby="asc")
    assert my_sorter.get_sorted_batch_definitions(batch_list) == [one, two, three]

    # A non-numeric id cannot be sorted numerically and raises SorterError.
    i_should_not_work = numbered_batch("C", "c", "ccc", "aaa")
    with pytest.raises(ge_exceptions.SorterError):
        my_sorter.get_sorted_batch_definitions(
            [one, two, three, i_should_not_work]
        )
def _map_data_reference_to_batch_definition_list(
    self, data_reference, data_asset_name: Optional[str] = None
) -> Optional[List[BatchDefinition]]:
    """Wrap a single data reference in a one-element list of batch definitions.

    Note: This is a bit hacky, but it works. In sql_data_connectors, data
    references *are* dictionaries, allowing us to invoke
    `PartitionDefinition(data_reference)`.
    """
    batch_definition = BatchDefinition(
        datasource_name=self.datasource_name,
        data_connector_name=self.name,
        data_asset_name=data_asset_name,
        partition_definition=PartitionDefinition(data_reference),
    )
    return [batch_definition]
def test_batch__str__method():
    """Verify that Batch.__str__ renders every component of the batch verbatim."""
    batch = Batch(
        data=None,
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
        ),
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
            partition_definition=PartitionDefinition({}),
        ),
        batch_spec=BatchSpec(path="/some/path/some.file"),
        batch_markers=BatchMarkers(ge_load_time="FAKE_LOAD_TIME"),
    )
    print(batch.__str__())
    # NOTE(review): the expected literal below is preserved exactly as found in the
    # source; its internal line breaks appear to have been collapsed to spaces by
    # formatting — confirm against the actual __str__ output before relying on it.
    assert (
        batch.__str__()
        == """{ "data": "None", "batch_request": { "datasource_name": "my_datasource", "data_connector_name": "my_data_connector", "data_asset_name": "my_data_asset_name", "partition_request": null }, "batch_definition": { "datasource_name": "my_datasource", "data_connector_name": "my_data_connector", "data_asset_name": "my_data_asset_name", "partition_definition": {} }, "batch_spec": "{'path': '/some/path/some.file'}", "batch_markers": "{'ge_load_time': 'FAKE_LOAD_TIME'}" }"""
    )
def test_get_batch_definitions_and_get_batch_basics(
    basic_datasource_with_runtime_data_connector,
):
    """Runtime connector basics: one definition per request; direct batch fetch works."""
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # Build the request with keyword arguments instead of an unpacked dict.
    batch_request: BatchRequest = BatchRequest(
        datasource_name=basic_datasource_with_runtime_data_connector.name,
        data_connector_name="test_runtime_data_connector",
        data_asset_name="test_asset_1",
        batch_data=test_df,
        partition_request={
            "batch_identifiers": {
                "airflow_run_id": 1234567890,
            }
        },
        limit=None,
    )
    available = (
        basic_datasource_with_runtime_data_connector.get_available_batch_definitions(
            batch_request=batch_request
        )
    )
    assert len(available) == 1

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = (
        basic_datasource_with_runtime_data_connector.get_batch_from_batch_definition(
            batch_definition=BatchDefinition(
                "my_datasource",
                "_pipeline",
                "_pipeline",
                partition_definition=PartitionDefinition({"some_random_id": 1}),
            ),
            batch_data=my_df,
        )
    )
    # A batch built directly from a definition carries an empty batch_request.
    assert batch.batch_request == {}
def test_get_batch_definitions_and_get_batch_basics(basic_pandas_datasource_v013):
    """Filesystem connector basics: available-definition counts, direct batch fetch,
    and batch-request lookups against configured vs. unconfigured asset names.
    """
    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        basic_pandas_datasource_v013.data_connectors["my_filesystem_data_connector"]
    )
    # Six files following the (letter, number) naming convention.
    create_files_in_directory(
        my_data_connector.base_directory,
        ["A_1.csv", "A_2.csv", "A_3.csv", "B_1.csv", "B_2.csv", "B_3.csv"],
    )
    # All six files are visible under the configured "Titanic" asset.
    assert (
        len(
            basic_pandas_datasource_v013.get_available_batch_definitions(
                batch_request=BatchRequest(
                    datasource_name="my_datasource",
                    data_connector_name="my_filesystem_data_connector",
                    data_asset_name="Titanic",
                )
            )
        )
        == 6
    )
    # Fetch a single batch directly from an explicit batch definition.
    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            partition_definition=PartitionDefinition(
                {
                    "letter": "B",
                    "number": "1",
                }
            ),
        )
    )
    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.batch_definition == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="B1",
        partition_definition=PartitionDefinition(
            {
                "letter": "B",
                "number": "1",
            }
        ),
    )
    # "B1" is not a configured asset name, so the request matches nothing.
    batch_list: List[
        Batch
    ] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            partition_request={
                "batch_identifiers": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 0
    # The same identifiers under the configured "Titanic" asset match one batch.
    batch_list: List[
        Batch
    ] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="Titanic",
            partition_request={
                "batch_identifiers": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 1
    assert isinstance(batch_list[0].data.dataframe, pd.DataFrame)
    # A batch may also be built directly from in-memory data.
    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            partition_definition=PartitionDefinition({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
def test_complex_regex_example_with_implicit_data_asset_names(tmp_path_factory):
    """Asset names inferred from a regex group: validation errors, per-asset counts,
    and narrowing by batch identifiers.
    """
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_complex_regex_example_with_implicit_data_asset_names"
        )
    )
    # Two implicit assets ("alpha": 3 files, "beta": 4 files) across year/month dirs.
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "2020/01/alpha-1001.csv",
            "2020/01/beta-1002.csv",
            "2020/02/alpha-1003.csv",
            "2020/02/beta-1004.csv",
            "2020/03/alpha-1005.csv",
            "2020/03/beta-1006.csv",
            "2020/04/beta-1007.csv",
        ],
    )
    # The third regex group supplies the data asset name implicitly.
    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            default_regex={
                "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
                "group_names": ["year_dir", "month_dir", "data_asset_name"],
            },
            glob_directive="*/*/*.csv",
            base_directory=base_directory,
        )
    )
    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()
    # Test for an unknown execution environment
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="non_existent_datasource",
                data_connector_name="my_data_connector",
                data_asset_name="my_data_asset",
            )
        )
    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )
    # Three "alpha" files were created above.
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )
    # Four "beta" files were created above.
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )
    # Narrowing by batch identifiers returns exactly the 2020/03 alpha file.
    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_request={
                "batch_identifiers": {
                    "year_dir": "2020",
                    "month_dir": "03",
                }
            },
        )
    ) == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_definition=PartitionDefinition(
                year_dir="2020",
                month_dir="03",
            ),
        )
    ]
def test_complex_regex_example_with_implicit_data_asset_names():
    """S3 variant: data asset names inferred from a regex group on object keys."""
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    # Two implicit assets ("alpha": 3 keys, "beta": 4 keys) across year/month prefixes.
    keys: List[str] = [
        "2020/01/alpha-1001.csv",
        "2020/01/beta-1002.csv",
        "2020/02/alpha-1003.csv",
        "2020/02/beta-1004.csv",
        "2020/03/alpha-1005.csv",
        "2020/03/beta-1006.csv",
        "2020/04/beta-1007.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )
    # The third regex group supplies the data asset name implicitly.
    my_data_connector: InferredAssetS3DataConnector = InferredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        bucket=bucket,
        prefix="",
    )
    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()
    # Test for an unknown execution environment
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="non_existent_datasource",
                data_connector_name="my_data_connector",
                data_asset_name="my_data_asset",
            )
        )
    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )
    # Three "alpha" keys exist in the bucket.
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )
    # NOTE(review): the next two BatchRequest calls omit datasource_name — this looks
    # like a duplicated/garbled fragment; confirm whether they should include
    # datasource_name="FAKE_DATASOURCE_NAME" like the assertion above.
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )
    # Narrowing by partition identifiers returns exactly the 2020/03 alpha key.
    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_request={
                "partition_identifiers": {
                    "year_dir": "2020",
                    "month_dir": "03",
                }
            },
        )
    ) == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_definition=PartitionDefinition(
                year_dir="2020",
                month_dir="03",
            ),
        )
    ]
def test_redundant_information_in_naming_convention_bucket_sorted():
    """Keys that embed the date several ways sort descending by the full_date group."""
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    # One log file per day, 2021-01-01 through 2021-01-07, under a single prefix.
    keys: List[str] = [
        "some_bucket/2021/01/01/log_file-20210101.txt.gz",
        "some_bucket/2021/01/02/log_file-20210102.txt.gz",
        "some_bucket/2021/01/03/log_file-20210103.txt.gz",
        "some_bucket/2021/01/04/log_file-20210104.txt.gz",
        "some_bucket/2021/01/05/log_file-20210105.txt.gz",
        "some_bucket/2021/01/06/log_file-20210106.txt.gz",
        "some_bucket/2021/01/07/log_file-20210107.txt.gz",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )
    # NOTE(review): the line breaks/indentation of this YAML literal were
    # reconstructed from a collapsed source — verify against the original file.
    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: InferredAssetS3DataConnector
datasource_name: test_environment
name: my_inferred_asset_filesystem_data_connector
bucket: {bucket}
prefix: ""
default_regex:
    group_names:
        - data_asset_name
        - year
        - month
        - day
        - full_date
    pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-(.*)\\.txt\\.gz
sorters:
    - orderby: desc
      class_name: DateTimeSorter
      name: full_date
""",
    )
    my_data_connector: InferredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "my_inferred_asset_filesystem_data_connector",
            "datasource_name": "test_environment",
            "execution_engine": "BASE_ENGINE",
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )
    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="my_inferred_asset_filesystem_data_connector",
                data_asset_name="some_bucket",
            )
        )
    )
    # Expected order: newest full_date (20210107) down to oldest (20210101).
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "07", "full_date": "20210107"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "06", "full_date": "20210106"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "05", "full_date": "20210105"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "04", "full_date": "20210104"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "03", "full_date": "20210103"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "02", "full_date": "20210102"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "01", "full_date": "20210101"}
            ),
        ),
    ]
    assert expected == sorted_batch_definition_list
def test_return_all_batch_definitions_sorted(tmp_path_factory):
    """ConfiguredAssetFilesystemDataConnector with three chained sorters
    (name asc, then timestamp desc, then price desc).

    TEST 1: the full listing comes back in sorted order.
    TEST 2: a partition_request pins exactly one batch definition.
    TEST 3: partition_request=None returns all 10 batch definitions.
    """
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_sorted")
    )

    # (name, timestamp, price) triples; creation order is deliberately unsorted.
    source_triples = [
        ("alex", "20200809", "1000"),
        ("eugene", "20200809", "1500"),
        ("james", "20200811", "1009"),
        ("abe", "20200809", "1040"),
        ("will", "20200809", "1002"),
        ("james", "20200713", "1567"),
        ("eugene", "20201129", "1900"),
        ("will", "20200810", "1001"),
        ("james", "20200810", "1003"),
        ("alex", "20200819", "1300"),
    ]
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            f"{name}_{timestamp}_{price}.csv"
            for name, timestamp, price in source_triples
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: test_environment
#execution_engine:
#    class_name: PandasExecutionEngine
base_directory: {base_directory}
glob_directive: "*.csv"
assets:
    TestFiles:
default_regex:
    pattern: (.+)_(.+)_(.+)\\.csv
    group_names:
        - name
        - timestamp
        - price
sorters:
    - orderby: asc
      class_name: LexicographicSorter
      name: name
    - datetime_format: "%Y%m%d"
      orderby: desc
      class_name: DateTimeSorter
      name: timestamp
    - orderby: desc
      class_name: NumericSorter
      name: price
        """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()
    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )

    def _definition(name, timestamp, price):
        # One fully-specified BatchDefinition for this connector/asset pair.
        return BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": name, "timestamp": timestamp, "price": price}
            ),
        )

    # Expected order: name asc, then timestamp desc, then price desc.
    expected = [
        _definition(*triple)
        for triple in [
            ("abe", "20200809", "1040"),
            ("alex", "20200819", "1300"),
            ("alex", "20200809", "1000"),
            ("eugene", "20201129", "1900"),
            ("eugene", "20200809", "1500"),
            ("james", "20200811", "1009"),
            ("james", "20200810", "1003"),
            ("james", "20200713", "1567"),
            ("will", "20200810", "1001"),
            ("will", "20200809", "1002"),
        ]
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    # TEST 2: Should only return the specified partition
    # NOTE(review): this uses the "batch_identifiers" key while the S3 twin of
    # this test uses "partition_identifiers" — confirm both spellings are
    # accepted by PartitionRequest in this version.
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=PartitionRequest(
            **{
                "batch_identifiers": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
    assert my_batch_definition_list[0] == _definition("james", "20200713", "1567")

    # TEST 3: Without partition request, should return all 10
    my_batch_request = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=None,
    )
    # should return 10
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 10
def test_return_all_batch_definitions_unsorted(tmp_path_factory):
    """ConfiguredAssetFilesystemDataConnector with no sorters configured.

    Checks that: calling the public API with no request, or with a BatchRequest
    whose data_asset_name is None, raises TypeError; the private
    ``_get_batch_definition_list_from_batch_request`` accepts a BatchRequestBase
    without a data_asset_name; and a named request returns the same natural
    (unsorted) listing.
    """
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_unsorted")
    )

    # (name, timestamp, price) triples; creation order is deliberately unsorted.
    source_triples = [
        ("alex", "20200809", "1000"),
        ("eugene", "20200809", "1500"),
        ("james", "20200811", "1009"),
        ("abe", "20200809", "1040"),
        ("will", "20200809", "1002"),
        ("james", "20200713", "1567"),
        ("eugene", "20201129", "1900"),
        ("will", "20200810", "1001"),
        ("james", "20200810", "1003"),
        ("alex", "20200819", "1300"),
    ]
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            f"{name}_{timestamp}_{price}.csv"
            for name, timestamp, price in source_triples
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: test_environment
#execution_engine:
#    class_name: PandasExecutionEngine
base_directory: {base_directory}
glob_directive: "*.csv"
assets:
    TestFiles:
default_regex:
    pattern: (.+)_(.+)_(.+)\\.csv
    group_names:
        - name
        - timestamp
        - price
        """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )

    # with unnamed data_asset_name — BatchRequestBase bypasses the validation
    # that made the public call above raise.
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )
    )

    def _definition(name, timestamp, price):
        # One fully-specified BatchDefinition for this connector/asset pair.
        return BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": name, "timestamp": timestamp, "price": price}
            ),
        )

    # Natural (lexicographic data-reference) order, no sorters applied.
    expected = [
        _definition(*triple)
        for triple in [
            ("abe", "20200809", "1040"),
            ("alex", "20200809", "1000"),
            ("alex", "20200819", "1300"),
            ("eugene", "20200809", "1500"),
            ("eugene", "20201129", "1900"),
            ("james", "20200713", "1567"),
            ("james", "20200810", "1003"),
            ("james", "20200811", "1009"),
            ("will", "20200809", "1002"),
            ("will", "20200810", "1001"),
        ]
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    assert expected == unsorted_batch_definition_list
def test_return_all_batch_definitions_sorted():
    """ConfiguredAssetS3DataConnector with three chained sorters
    (name asc, then timestamp desc, then price desc) over mocked S3 keys.

    TEST 1: the full listing comes back in sorted order.
    TEST 2: a partition_request pins exactly one batch definition.
    TEST 3: partition_request=None returns all 10 batch definitions.

    NOTE(review): an earlier test in this chunk has this exact function name
    (filesystem variant) — if both live in the same module, this definition
    shadows it and pytest never runs the earlier one; confirm and rename.
    """
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    csv_body = test_df.to_csv(index=False).encode("utf-8")

    # (name, timestamp, price) triples; upload order is deliberately unsorted.
    source_triples = [
        ("alex", "20200809", "1000"),
        ("eugene", "20200809", "1500"),
        ("james", "20200811", "1009"),
        ("abe", "20200809", "1040"),
        ("will", "20200809", "1002"),
        ("james", "20200713", "1567"),
        ("eugene", "20201129", "1900"),
        ("will", "20200810", "1001"),
        ("james", "20200810", "1003"),
        ("alex", "20200819", "1300"),
    ]
    for name, timestamp, price in source_triples:
        client.put_object(
            Bucket=bucket,
            Body=csv_body,
            Key=f"{name}_{timestamp}_{price}.csv",
        )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetS3DataConnector
datasource_name: test_environment
#execution_engine:
#    class_name: PandasExecutionEngine
bucket: {bucket}
prefix: ""
assets:
    TestFiles:
default_regex:
    pattern: (.+)_(.+)_(.+)\\.csv
    group_names:
        - name
        - timestamp
        - price
sorters:
    - orderby: asc
      class_name: LexicographicSorter
      name: name
    - datetime_format: "%Y%m%d"
      orderby: desc
      class_name: DateTimeSorter
      name: timestamp
    - orderby: desc
      class_name: NumericSorter
      name: price
        """,
    )

    my_data_connector: ConfiguredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_s3_data_connector",
            "datasource_name": "test_environment",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    self_check_report = my_data_connector.self_check()
    assert self_check_report["class_name"] == "ConfiguredAssetS3DataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_s3_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )

    def _definition(name, timestamp, price):
        # One fully-specified BatchDefinition for this connector/asset pair.
        return BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": name, "timestamp": timestamp, "price": price}
            ),
        )

    # Expected order: name asc, then timestamp desc, then price desc.
    expected = [
        _definition(*triple)
        for triple in [
            ("abe", "20200809", "1040"),
            ("alex", "20200819", "1300"),
            ("alex", "20200809", "1000"),
            ("eugene", "20201129", "1900"),
            ("eugene", "20200809", "1500"),
            ("james", "20200811", "1009"),
            ("james", "20200810", "1003"),
            ("james", "20200713", "1567"),
            ("will", "20200810", "1001"),
            ("will", "20200809", "1002"),
        ]
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    # TEST 2: Should only return the specified partition
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_s3_data_connector",
        data_asset_name="TestFiles",
        partition_request=PartitionRequest(
            **{
                "partition_identifiers": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
    assert my_batch_definition_list[0] == _definition("james", "20200713", "1567")

    # TEST 3: Without partition request, should return all 10
    my_batch_request = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_s3_data_connector",
        data_asset_name="TestFiles",
        partition_request=None,
    )
    # should return 10
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 10
def test_return_all_batch_definitions_unsorted():
    """ConfiguredAssetS3DataConnector with no sorters configured.

    Checks that: calling the public API with no arguments raises TypeError; a
    BatchRequest with data_asset_name=None and a named request both return the
    same natural (unsorted) listing.

    NOTE(review): an earlier test in this chunk has this exact function name
    (filesystem variant) — if both live in the same module, this definition
    shadows it and pytest never runs the earlier one; confirm and rename.
    Also, the filesystem twin expects TypeError for data_asset_name=None while
    this test expects success — confirm the asymmetry is intentional.
    """
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    csv_body = test_df.to_csv(index=False).encode("utf-8")

    # (name, timestamp, price) triples; upload order is deliberately unsorted.
    source_triples = [
        ("alex", "20200809", "1000"),
        ("eugene", "20200809", "1500"),
        ("james", "20200811", "1009"),
        ("abe", "20200809", "1040"),
        ("will", "20200809", "1002"),
        ("james", "20200713", "1567"),
        ("eugene", "20201129", "1900"),
        ("will", "20200810", "1001"),
        ("james", "20200810", "1003"),
        ("alex", "20200819", "1300"),
    ]
    for name, timestamp, price in source_triples:
        client.put_object(
            Bucket=bucket,
            Body=csv_body,
            Key=f"{name}_{timestamp}_{price}.csv",
        )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetS3DataConnector
datasource_name: test_environment
#execution_engine:
#    class_name: PandasExecutionEngine
bucket: {bucket}
prefix: ""
assets:
    TestFiles:
default_regex:
    pattern: (.+)_(.+)_(.+)\\.csv
    group_names:
        - name
        - timestamp
        - price
        """,
    )

    my_data_connector: ConfiguredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_s3_data_connector",
            "datasource_name": "test_environment",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_s3_data_connector",
                data_asset_name=None,
            )
        )
    )

    def _definition(name, timestamp, price):
        # One fully-specified BatchDefinition for this connector/asset pair.
        return BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": name, "timestamp": timestamp, "price": price}
            ),
        )

    # Natural (lexicographic data-reference) order, no sorters applied.
    expected = [
        _definition(*triple)
        for triple in [
            ("abe", "20200809", "1040"),
            ("alex", "20200809", "1000"),
            ("alex", "20200819", "1300"),
            ("eugene", "20200809", "1500"),
            ("eugene", "20201129", "1900"),
            ("james", "20200713", "1567"),
            ("james", "20200810", "1003"),
            ("james", "20200811", "1009"),
            ("will", "20200809", "1002"),
            ("will", "20200810", "1001"),
        ]
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_s3_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    assert expected == unsorted_batch_definition_list
def test_redundant_information_in_naming_convention_bucket_sorted(tmp_path_factory):
    """InferredAssetFilesystemDataConnector over a directory tree whose paths
    embed year/month/day plus a redundant full date: a single DateTimeSorter on
    ``full_date`` (desc) must return batch definitions newest-first.

    NOTE(review): an earlier test in this chunk has this exact function name
    (S3 variant) — if both live in the same module, this later definition
    shadows the earlier one; confirm and rename one of them.
    """
    base_directory = str(tmp_path_factory.mktemp("logs"))

    # Seven daily log files: 2021-01-01 through 2021-01-07.
    days = [f"{day_number:02d}" for day_number in range(1, 8)]
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            f"some_bucket/2021/01/{day}/log_file-202101{day}.txt.gz"
            for day in days
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: InferredAssetFilesystemDataConnector
datasource_name: test_environment
name: my_inferred_asset_filesystem_data_connector
base_directory: {base_directory}/
glob_directive: "*/*/*/*/*.txt.gz"
default_regex:
    group_names:
        - data_asset_name
        - year
        - month
        - day
        - full_date
    pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-(.*)\\.txt\\.gz
sorters:
    - orderby: desc
      class_name: DateTimeSorter
      name: full_date
        """,
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "my_inferred_asset_filesystem_data_connector",
                "datasource_name": "test_environment",
                "execution_engine": "BASE_ENGINE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="my_inferred_asset_filesystem_data_connector",
                data_asset_name="some_bucket",
            )
        )
    )

    # Descending sorter: day 07 first, day 01 last.
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {
                    "year": "2021",
                    "month": "01",
                    "day": day,
                    "full_date": f"202101{day}",
                }
            ),
        )
        for day in reversed(days)
    ]
    assert expected == sorted_batch_definition_list