Ejemplo n.º 1
0
def test_map_batch_definition_to_data_reference_string_using_regex():
    # not BatchDefinition
    my_batch_definition = "I_am_a_string"
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(TypeError):
        # noinspection PyUnusedLocal,PyTypeChecker
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # group names do not match
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict({
            "name": "eugene",
            "timestamp": "20200809",
            "price": "1500"
        }),
    )
    group_names = ["i", "wont", "match"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(KeyError):
        # noinspection PyUnusedLocal
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # success
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict({
            "name": "eugene",
            "timestamp": "20200809",
            "price": "1500"
        }),
    )
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"

    my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
        batch_definition=my_batch_definition,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert my_data_reference == "eugene_20200809_1500.csv"
    def _get_data_reference_list_from_cache_by_data_asset_name(
            self, data_asset_name: str) -> List[str]:
        """
        Fetch data_references corresponding to data_asset_name from the cache.
        """
        regex_config: dict = self._get_regex_config(
            data_asset_name=data_asset_name)
        pattern: str = regex_config["pattern"]
        group_names: List[str] = regex_config["group_names"]

        batch_definition_list = self._get_batch_definition_list_from_batch_request(
            batch_request=BatchRequestBase(
                datasource_name=self.datasource_name,
                data_connector_name=self.name,
                data_asset_name=data_asset_name,
            ))

        path_list: List[str] = [
            map_batch_definition_to_data_reference_string_using_regex(
                batch_definition=batch_definition,
                regex_pattern=pattern,
                group_names=group_names,
            ) for batch_definition in batch_definition_list
        ]

        # TODO: Sort with a real sorter here
        path_list.sort()

        return path_list
 def _map_batch_definition_to_data_reference(
         self, batch_definition: BatchDefinition) -> str:
     data_asset_name: str = batch_definition.data_asset_name
     regex_config: dict = self._get_regex_config(
         data_asset_name=data_asset_name)
     pattern: str = regex_config["pattern"]
     group_names: List[str] = regex_config["group_names"]
     return map_batch_definition_to_data_reference_string_using_regex(
         batch_definition=batch_definition,
         regex_pattern=pattern,
         group_names=group_names,
     )