Ejemplo n.º 1
0
 def test_config_missing_type(self, sane_config):
     """Removing ``type`` from the ``boats`` entry must raise ``DataSetError``.

     The error message is matched against a regex that names the data set.
     """
     boats_entry = sane_config["catalog"]["boats"]
     del boats_entry["type"]
     expected_error = (
         r"`type` is missing from DataSet \'boats\' "
         r"catalog configuration"
     )
     with pytest.raises(DataSetError, match=expected_error):
         DataCatalog.from_config(**sane_config)
Ejemplo n.º 2
0
 def test_from_sane_config_load_versions_warn(self, sane_config):
     """A ``load_versions`` key that is not in the catalog must emit a
     ``UserWarning`` naming the unknown key."""
     sane_config["catalog"]["boats"]["versioned"] = True
     unknown_versions = {"non-boart": generate_timestamp()}
     expected_warning = (
         r"\`load_versions\` keys \[non-boart\] are not found in the catalog\."
     )
     with pytest.warns(UserWarning, match=expected_warning):
         DataCatalog.from_config(**sane_config, load_versions=unknown_versions)
Ejemplo n.º 3
0
 def test_config_invalid_arguments(self, sane_config):
     """An argument the data set constructor does not accept must raise
     ``DataSetError``."""
     boats_entry = sane_config["catalog"]["boats"]
     boats_entry["save_and_load_args"] = False
     expected_error = (
         r"DataSet 'boats' must only contain arguments valid for "
         r"the constructor of `.*CSVDataSet`"
     )
     with pytest.raises(DataSetError, match=expected_error):
         DataCatalog.from_config(**sane_config)
Ejemplo n.º 4
0
 def test_config_invalid_data_set(self, sane_config):
     """A ``type`` that resolves to a class which is not a data set must
     raise ``DataSetError``."""
     boats_entry = sane_config["catalog"]["boats"]
     boats_entry["type"] = "DataCatalog"
     expected_error = (
         r"DataSet 'boats' type `.*DataCatalog` is invalid: all "
         r"data set types must extend `AbstractDataSet`"
     )
     with pytest.raises(DataSetError, match=expected_error):
         DataCatalog.from_config(**sane_config)
Ejemplo n.º 5
0
    def test_config_relative_import(self, sane_config):
        """A ``type`` given as a relative path must raise ``DataSetError``."""
        boats_entry = sane_config["catalog"]["boats"]
        boats_entry["type"] = ".CSVDataSetInvalid"

        # The message is matched literally, so it is escaped before use as a regex.
        expected_error = re.escape(
            "`type` class path does not support relative paths"
        )
        with pytest.raises(DataSetError, match=expected_error):
            DataCatalog.from_config(**sane_config)
Ejemplo n.º 6
0
    def test_config_invalid_module(self, sane_config):
        """A ``type`` pointing into a nonexistent module must raise
        ``DataSetError``."""
        boats_entry = sane_config["catalog"]["boats"]
        boats_entry["type"] = "kedro.invalid_module_name.io.CSVDataSet"

        # The message is matched literally, so it is escaped before use as a regex.
        expected_error = re.escape(
            "Class `kedro.invalid_module_name.io.CSVDataSet` not found"
        )
        with pytest.raises(DataSetError, match=expected_error):
            DataCatalog.from_config(**sane_config)
Ejemplo n.º 7
0
 def test_config_missing_type(self, sane_config):
     """Removing ``type`` from the ``boats`` entry must raise ``DataSetError``.

     The full two-line message is matched literally.
     """
     boats_entry = sane_config["catalog"]["boats"]
     del boats_entry["type"]
     expected_error = re.escape(
         "An exception occurred when parsing config for DataSet `boats`:\n"
         "`type` is missing from DataSet catalog configuration"
     )
     with pytest.raises(DataSetError, match=expected_error):
         DataCatalog.from_config(**sane_config)
Ejemplo n.º 8
0
    def test_config_missing_class(self, sane_config):
        """A ``type`` naming a nonexistent class must raise ``DataSetError``.

        The full two-line message is matched literally.
        """
        boats_entry = sane_config["catalog"]["boats"]
        boats_entry["type"] = "kedro.io.CSVDataSetInvalid"

        expected_error = re.escape(
            "An exception occurred when parsing config for DataSet `boats`:\n"
            "Class `kedro.io.CSVDataSetInvalid` not found"
        )
        with pytest.raises(DataSetError, match=expected_error):
            DataCatalog.from_config(**sane_config)
Ejemplo n.º 9
0
 def test_config_invalid_data_set(self, sane_config):
     """A ``type`` that resolves to a class which is not a data set must
     raise ``DataSetError``; the full message is matched literally."""
     boats_entry = sane_config["catalog"]["boats"]
     boats_entry["type"] = "DataCatalog"
     expected_error = re.escape(
         "An exception occurred when parsing config for DataSet `boats`:\n"
         "DataSet type `kedro.io.data_catalog.DataCatalog` is invalid: "
         "all data set types must extend `AbstractDataSet`"
     )
     with pytest.raises(DataSetError, match=expected_error):
         DataCatalog.from_config(**sane_config)
Ejemplo n.º 10
0
 def test_config_missing_class(self, sane_config):
     """A ``type`` naming a nonexistent class must raise ``DataSetError``."""
     boats_entry = sane_config["catalog"]["boats"]
     boats_entry["type"] = "kedro.io.CSVLocalDataSetInvalid"
     # Used as a regex as-is (not escaped), so each "." matches any character.
     expected_error = (
         r"Class `kedro.io.CSVLocalDataSetInvalid` for DataSet `boats` not found."
     )
     with pytest.raises(DataSetError, match=expected_error):
         DataCatalog.from_config(**sane_config)
Ejemplo n.º 11
0
    def test_link_credentials(self, sane_config, mocker):
        """Check that credentials are linked to the relevant data set.

        ``fsspec`` is patched in the pandas CSV data set module and the
        ``boats`` entry is dropped from a copy of the config. The patched
        ``filesystem`` must then have been called with ``"s3"`` and the
        ``s3_credentials`` values as keyword arguments.
        """
        fsspec_mock = mocker.patch("kedro.extras.datasets.pandas.csv_dataset.fsspec")
        trimmed_config = deepcopy(sane_config)
        trimmed_config["catalog"].pop("boats")

        DataCatalog.from_config(**trimmed_config)

        s3_credentials = sane_config["credentials"]["s3_credentials"]
        fsspec_mock.filesystem.assert_called_with("s3", **s3_credentials)
Ejemplo n.º 12
0
    def test_missing_dependency(self, sane_config, mocker):
        """An ``ImportError`` raised by the patched ``load_obj`` must surface
        as a ``DataSetError`` carrying the same message."""
        message = "dependency issue"

        # ``name`` is set explicitly because it cannot be None here.
        failing_import = ImportError(message, name=message)

        mocker.patch("kedro.io.core.load_obj", side_effect=failing_import)
        with pytest.raises(DataSetError, match=message):
            DataCatalog.from_config(**sane_config)
Ejemplo n.º 13
0
 def test_from_sane_config_versioned_warn(self, caplog, sane_config, versioned):
     """Check the warning logged when a ``version`` attribute is present in
     the data set config.

     The first captured log record must be a WARNING whose message contains
     the expected explanation.
     """
     boats_entry = sane_config["catalog"]["boats"]
     boats_entry["versioned"] = versioned
     boats_entry["version"] = True
     DataCatalog.from_config(**sane_config)

     first_record = caplog.records[0]
     expected_text = (
         "`version` attribute removed from `boats` data set "
         "configuration since it is a reserved word and cannot be "
         "directly specified"
     )
     assert first_record.levelname == "WARNING"
     assert expected_text in first_record.message
Ejemplo n.º 14
0
    def test_nested_credentials(self, sane_config_with_nested_creds, mocker):
        """The patched ``S3FileSystem`` must be called exactly once with the
        expected nested ``client_kwargs``."""
        s3fs_mock = mocker.patch("kedro.io.csv_s3.S3FileSystem")
        DataCatalog.from_config(**sane_config_with_nested_creds)

        inner_credentials = {
            "aws_access_key_id": "OTHER_FAKE_ACCESS_KEY",
            "aws_secret_access_key": "OTHER_FAKE_SECRET_KEY",
        }
        expected_client_kwargs = {
            "nested": {"credentials": inner_credentials},
            "key": "secret",
        }
        s3fs_mock.assert_called_once_with(client_kwargs=expected_client_kwargs)
Ejemplo n.º 15
0
    def test_missing_dependency(self, sane_config, mocker):
        """An ``AttributeError`` raised while loading the CSV data set class
        must surface as a ``DataSetError`` carrying the same message."""
        message = "dependency issue"
        dataset_path = "kedro.extras.datasets.pandas.CSVDataSet"
        exports_path = "kedro.extras.datasets.pandas.__all__"

        # pylint: disable=unused-argument
        def dummy_load(obj_path, *args, **kwargs):
            """Stand-in for ``load_obj``: fail for the data set class, list it
            for the package ``__all__``, and return None for anything else."""
            if obj_path == dataset_path:
                raise AttributeError(message)
            return ["CSVDataSet"] if obj_path == exports_path else None

        mocker.patch("kedro.io.core.load_obj", side_effect=dummy_load)
        with pytest.raises(DataSetError, match=message):
            DataCatalog.from_config(**sane_config)
Ejemplo n.º 16
0
    def test_link_credentials(self, sane_config, mocker):
        """Check that credentials are linked to the relevant data set.

        The patched ``S3FileSystem`` must be called exactly once with the two
        AWS keys taken from ``s3_credentials`` as ``client_kwargs``.
        """
        s3fs_mock = mocker.patch("kedro.io.csv_s3.S3FileSystem")

        DataCatalog.from_config(**sane_config)

        s3_credentials = sane_config["credentials"]["s3_credentials"]
        credential_keys = ("aws_access_key_id", "aws_secret_access_key")
        expected_client_kwargs = {key: s3_credentials[key] for key in credential_keys}
        s3fs_mock.assert_called_once_with(client_kwargs=expected_client_kwargs)
Ejemplo n.º 17
0
def kedro_catalog():
    """Build a ``DataCatalog`` from the catalog config found under
    ``conf/base`` and ``conf/local``.

    Returns:
        The ``DataCatalog`` created from the loaded catalog configuration.
    """
    from kedro.config import ConfigLoader
    from kedro.io import DataCatalog

    loader = ConfigLoader(['conf/base', 'conf/local'])
    catalog_config = loader.get('catalog*', 'catalog*/**')
    return DataCatalog.from_config(catalog_config)
Ejemplo n.º 18
0
    def _create_catalog(  # pylint: disable=no-self-use,too-many-arguments
        self,
        conf_catalog: Dict[str, Any],
        conf_creds: Dict[str, Any],
        save_version: str = None,
        journal: Journal = None,
        load_versions: Dict[str, str] = None,
    ) -> DataCatalog:
        """A factory method for the DataCatalog instantiation.

        The ``register_catalog`` hook is consulted first; when it yields a
        falsy result the catalog is built directly with
        ``DataCatalog.from_config``.

        Args:
            conf_catalog: Catalog configuration.
            conf_creds: Credentials configuration.
            save_version: Forwarded unchanged as the save version.
            journal: Forwarded unchanged as the journal.
            load_versions: Forwarded unchanged as the load versions.

        Returns:
            DataCatalog defined in `catalog.yml`.

        """
        registered = get_hook_manager().hook.register_catalog(  # pylint: disable=no-member
            catalog=conf_catalog,
            credentials=conf_creds,
            load_versions=load_versions,
            save_version=save_version,
            journal=journal,
        )
        if registered:
            return registered

        # for backwards compatibility
        return DataCatalog.from_config(
            conf_catalog, conf_creds, load_versions, save_version, journal
        )
Ejemplo n.º 19
0
def create_catalog(config: ConfigLoader, **kwargs) -> DataCatalog:
    """Loads Kedro's ``DataCatalog``.

    Logging is configured once from the ``logging*`` config. The catalog is
    then built from the ``catalog*`` and ``credentials*`` configs, and the
    ``parameters*`` config is added to it under the ``"parameters"`` key.

    Args:
        config: ConfigLoader which can be queried to access the project config.
        kwargs: Ignore any additional arguments added in the future.

    Returns:
        DataCatalog defined in `catalog.yml`.

    """
    # Configure logging exactly once; applying the same dict a second time
    # later in the function was redundant.
    conf_logging = config.get("logging*", "logging*/**")
    logging.config.dictConfig(conf_logging)
    conf_catalog = config.get("catalog*", "catalog*/**")

    try:
        conf_creds = config.get("credentials*", "credentials*/**")
    except MissingConfigException:
        # Credentials are optional: warn and continue without them.
        warn("Your Kedro project is missing a credentials file!")
        conf_creds = None

    conf_params = config.get("parameters*", "parameters*/**")
    catalog = DataCatalog.from_config(conf_catalog, conf_creds)
    catalog.add_feed_dict({"parameters": conf_params})
    return catalog
Ejemplo n.º 20
0
 def test_config_bad_version(self):
     """Building a catalog from ``YML_CONFIG_VERSIONED_BAD`` with
     ``load_versions`` must raise ``DataSetError`` with the expected message."""
     config = yaml.safe_load(StringIO(YML_CONFIG_VERSIONED_BAD))
     expected_error = (
         r"Cached datasets should specify that they are "
         r"versioned in the `CachedDataSet`, not in the "
         r"wrapped dataset"
     )
     with pytest.raises(DataSetError, match=expected_error):
         DataCatalog.from_config(config, load_versions={"test_ds": "42"})
Ejemplo n.º 21
0
    def test_nested_credentials(self, sane_config_with_nested_creds, mocker):
        """The patched ``fsspec.filesystem`` must be called exactly once with
        ``"s3"`` and the expected nested credentials as keyword arguments.

        The ``boats`` entry is dropped from a copy of the config first.
        """
        fsspec_mock = mocker.patch("kedro.extras.datasets.pandas.csv_dataset.fsspec")
        trimmed_config = deepcopy(sane_config_with_nested_creds)
        trimmed_config["catalog"].pop("boats")
        DataCatalog.from_config(**trimmed_config)

        aws_keys = {
            "aws_access_key_id": "OTHER_FAKE_ACCESS_KEY",
            "aws_secret_access_key": "OTHER_FAKE_SECRET_KEY",
        }
        expected_kwargs = {
            "client_kwargs": {"credentials": {"client_kwargs": aws_keys}},
            "key": "secret",
        }
        fsspec_mock.filesystem.assert_called_once_with("s3", **expected_kwargs)
 def register_catalog(
     self,
     catalog: Optional[Dict[str, Dict[str, Any]]],
     credentials: Dict[str, Dict[str, Any]],
     load_versions: Dict[str, str],
     save_version: str,
 ) -> DataCatalog:
     """Create a ``DataCatalog`` by passing all four arguments, in order, to
     ``DataCatalog.from_config``."""
     from_config_args = (catalog, credentials, load_versions, save_version)
     return DataCatalog.from_config(*from_config_args)
Ejemplo n.º 23
0
def test_from_sane_config_default(sane_config, dummy_dataframe, tmpdir):
    """Save and reload a dataframe through a ``DataCatalogWithDefault``.

    The wrapped catalog is built from the sane config. The data set name used
    is a file path under ``tmpdir``, and the reloaded frame must equal the
    saved one.
    """
    base_catalog = DataCatalog.from_config(
        sane_config["catalog"], sane_config["credentials"]
    )
    fallback_catalog = DataCatalogWithDefault.from_data_catalog(
        base_catalog, default_csv
    )
    csv_path = str(tmpdir.mkdir("sub").join("missing.csv"))

    fallback_catalog.save(csv_path, dummy_dataframe)
    assert dummy_dataframe.equals(fallback_catalog.load(csv_path))
Ejemplo n.º 24
0
    def _create_catalog(  # pylint: disable=no-self-use
        self,
        conf_catalog: Dict[str, Any],
        conf_creds: Dict[str, Any],
    ) -> DataCatalog:
        """A hook for changing the creation of the DataCatalog instance.

        Args:
            conf_catalog: Catalog configuration.
            conf_creds: Credentials configuration.

        Returns:
            DataCatalog defined in `catalog.yml`.

        """
        catalog = DataCatalog.from_config(conf_catalog, conf_creds)
        return catalog
Ejemplo n.º 25
0
 def _create_catalog(  # pylint: disable=no-self-use,too-many-arguments
     self,
     conf_catalog: Dict[str, Any],
     conf_creds: Dict[str, Any],
     save_version: str = None,
     journal: Journal = None,
     load_versions: Dict[str, str] = None,
 ) -> DataCatalog:
     """Build the ``DataCatalog`` straight from the given configuration.

     The arguments are passed to ``DataCatalog.from_config`` positionally, in
     the order catalog, credentials, load versions, save version, journal.
     """
     from_config_args = (
         conf_catalog,
         conf_creds,
         load_versions,
         save_version,
         journal,
     )
     return DataCatalog.from_config(*from_config_args)
Ejemplo n.º 26
0
    def test_LV3_to_decimalWSG84_2(self):
        """Convert the ``x_LV03``/``y_LV03`` columns of ``foehn_stations`` and
        compare the result with the stored ``longitude``/``latitude`` columns."""
        catalog_config = ConfigLoader(['conf/base']).get('catalog*', 'catalog/**')
        stations = DataCatalog.from_config(catalog_config).load("foehn_stations")
        lon_fun, lat_fun = LV3_to_decimalWSG84(
            x=stations["x_LV03"], y=stations["y_LV03"]
        )

        # The MeteoSwiss data deviates slightly, hence the higher atol.
        for computed, column in ((lon_fun, "longitude"), (lat_fun, "latitude")):
            assert_allclose(computed, stations[column], atol=0.01)
Ejemplo n.º 27
0
    def test_fill_missing_coordinates(self):
        """``fire_data_cleansed`` must have no null values in its coordinate,
        longitude/latitude and municipality columns."""
        catalog_config = ConfigLoader(['conf/base']).get('catalog*', 'catalog/**')
        fire_data = DataCatalog.from_config(catalog_config).load("fire_data_cleansed")

        required_columns = [
            "coordinates_x",
            "coordinates_y",
            "longitude",
            "latitude",
            "municipality",
        ]
        null_total = fire_data[required_columns].isnull().sum().sum()
        assert null_total == 0
Ejemplo n.º 28
0
    def test_load_version(self, sane_config, dummy_dataframe, mocker):
        """Test loading versioned data sets from config.

        Two frames are saved through two freshly built catalogs while
        ``generate_timestamp`` is patched to return ``"first"`` then
        ``"second"``. Each version must load its own frame, and a load without
        a version must return the second frame.
        """
        zeros_frame = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
        sane_config["catalog"]["boats"]["versioned"] = True
        mocker.patch(
            "kedro.io.data_catalog.generate_timestamp",
            side_effect=["first", "second"],
        )

        # Save the first, then the second version, each via a new catalog.
        for frame in (dummy_dataframe, zeros_frame):
            catalog = DataCatalog.from_config(**sane_config)
            catalog.save("boats", frame)

        saved_versions = (("first", dummy_dataframe), ("second", zeros_frame))
        for version, frame in saved_versions:
            assert_frame_equal(catalog.load("boats", version=version), frame)
        assert_frame_equal(catalog.load("boats"), zeros_frame)
Ejemplo n.º 29
0
    def test_load_version_on_unversioned_dataset(
        self, sane_config, dummy_dataframe, mocker
    ):
        """Loading ``boats`` with an explicit ``version`` must raise
        ``DataSetError`` when the config is used unmodified."""
        mocker.patch(
            "kedro.io.data_catalog.generate_timestamp", return_value="first"
        )

        unversioned_catalog = DataCatalog.from_config(**sane_config)
        unversioned_catalog.save("boats", dummy_dataframe)

        with pytest.raises(DataSetError):
            unversioned_catalog.load("boats", version="first")
Ejemplo n.º 30
0
    def _create_catalog(self, conf_catalog, conf_creds) -> DataCatalog:
        """Build the ``DataCatalog`` with a generated save version.

        The save version is the value from ``generate_current_version()``.
        When ``pai.current_run_uuid()`` returns a truthy run id, ``"-"`` and
        that id are appended.
        """
        base_version = generate_current_version()
        run_id = pai.current_run_uuid()
        save_version = base_version + "-" + run_id if run_id else base_version

        return DataCatalog.from_config(
            conf_catalog, conf_creds, save_version=save_version
        )