def test_map_batch_definition_to_data_reference_string_using_regex():
    """Check the three outcomes of mapping a batch definition to a data reference.

    Scenarios covered, in order:
      1. The argument is not a ``BatchDefinition`` -> ``TypeError``.
      2. The group names are not keys of the batch identifiers -> ``KeyError``.
      3. Matching group names -> the expected file name is produced.
    """
    csv_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"

    def build_batch_definition():
        # A fresh instance per scenario keeps the cases independent of each other.
        return BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        )

    # not BatchDefinition
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker
        map_batch_definition_to_data_reference_string_using_regex(
            batch_definition="I_am_a_string",
            regex_pattern=csv_pattern,
            group_names=["name", "timestamp", "price"],
        )

    # group names do not match
    mismatched_definition = build_batch_definition()
    with pytest.raises(KeyError):
        map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=mismatched_definition,
            regex_pattern=csv_pattern,
            group_names=["i", "wont", "match"],
        )

    # success
    data_reference = map_batch_definition_to_data_reference_string_using_regex(
        batch_definition=build_batch_definition(),
        regex_pattern=csv_pattern,
        group_names=["name", "timestamp", "price"],
    )
    assert data_reference == "eugene_20200809_1500.csv"
def _get_data_reference_list_from_cache_by_data_asset_name(
    self, data_asset_name: str
) -> List[str]:
    """
    Fetch data_references corresponding to data_asset_name from the cache.

    The regex configuration for the asset supplies the pattern and group
    names; every batch definition returned for the asset is mapped to its
    data reference string with them.

    Args:
        data_asset_name: name of the data asset whose references are wanted.

    Returns:
        The data reference strings, sorted with the default string ordering.
    """
    asset_regex: dict = self._get_regex_config(data_asset_name=data_asset_name)
    regex_pattern: str = asset_regex["pattern"]
    regex_groups: List[str] = asset_regex["group_names"]

    asset_request = BatchRequestBase(
        datasource_name=self.datasource_name,
        data_connector_name=self.name,
        data_asset_name=data_asset_name,
    )
    definitions = self._get_batch_definition_list_from_batch_request(
        batch_request=asset_request
    )

    # TODO: Sort with a real sorter here
    return sorted(
        map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=definition,
            regex_pattern=regex_pattern,
            group_names=regex_groups,
        )
        for definition in definitions
    )
def _map_batch_definition_to_data_reference(
    self, batch_definition: BatchDefinition
) -> str:
    """
    Build the data reference string for a single batch definition.

    The regex configuration is looked up by the batch definition's
    data_asset_name, and its pattern and group names are handed to
    map_batch_definition_to_data_reference_string_using_regex.

    Args:
        batch_definition: the batch definition to convert.

    Returns:
        Whatever the regex-based mapping helper produces for this definition.
    """
    asset_regex: dict = self._get_regex_config(
        data_asset_name=batch_definition.data_asset_name
    )
    regex_pattern: str = asset_regex["pattern"]
    regex_groups: List[str] = asset_regex["group_names"]

    data_reference = map_batch_definition_to_data_reference_string_using_regex(
        batch_definition=batch_definition,
        regex_pattern=regex_pattern,
        group_names=regex_groups,
    )
    return data_reference