Code example #1
File: test_catalog.py Project: zeta1999/kedro
    def test_no_param_datasets_in_respose(self, fake_cli_invoke,
                                          fake_load_context, mocker):
        yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
        mocked_context = fake_load_context.return_value
        catalog_data_sets = {
            "iris_data": CSVDataSet("test.csv"),
            "parameters": MemoryDataSet(),
            "params:data_ratio": MemoryDataSet(),
            "intermediate": MemoryDataSet(),
            "not_used": CSVDataSet("test2.csv"),
        }

        pl_obj_data_sets = catalog_data_sets.keys() - {"not_used"}
        mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
        mocked_context.pipelines.keys.return_value = (self.PIPELINE_NAME, )
        mocked_pl_obj = mocked_context.pipelines.get.return_value
        mocked_pl_obj.data_sets.return_value = pl_obj_data_sets

        result = fake_cli_invoke(["catalog", "list"])

        assert not result.exit_code
        # 'parameters' and 'params:data_ratio' should not appear in the response
        expected_dict = {
            "DataSets in 'pipeline' pipeline": {
                "Datasets mentioned in pipeline": {
                    "CSVDataSet": ["iris_data"],
                    "MemoryDataSet": ["intermediate"],
                },
                "Datasets not mentioned in pipeline": {
                    "CSVDataSet": ["not_used"]
                },
            }
        }
        yaml_dump_mock.assert_called_once_with(expected_dict)
Code example #2
File: test_catalog.py Project: zeta1999/kedro
    def test_no_missing_datasets(self, fake_cli_invoke, fake_load_context,
                                 fake_repo_path):
        mocked_context = fake_load_context.return_value

        catalog_data_sets = {
            "input_data": CSVDataSet("test.csv"),
            "output_data": CSVDataSet("test2.csv"),
        }
        mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
        mocked_context.pipelines = {
            self.PIPELINE_NAME:
            Pipeline([node(identity, "input_data", "output_data")])
        }

        mocked_context.project_path = fake_repo_path
        mocked_context.CONF_ROOT = "conf"

        data_catalog_file = (fake_repo_path / "conf" / "base" / "catalog" /
                             f"{self.PIPELINE_NAME}.yml")

        result = fake_cli_invoke(
            ["catalog", "create", "--pipeline", self.PIPELINE_NAME])

        assert not result.exit_code
        assert not data_catalog_file.exists()
Code example #3
    def test_no_missing_datasets(
        self,
        fake_project_cli,
        fake_metadata,
        fake_load_context,
        fake_repo_path,
        mock_pipelines,
    ):
        mocked_context = fake_load_context.return_value

        catalog_data_sets = {
            "input_data": CSVDataSet("test.csv"),
            "output_data": CSVDataSet("test2.csv"),
        }
        mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
        mocked_context.project_path = fake_repo_path
        mock_pipelines[self.PIPELINE_NAME] = Pipeline(
            [node(identity, "input_data", "output_data")])

        data_catalog_file = (fake_repo_path / "conf" / "base" / "catalog" /
                             f"{self.PIPELINE_NAME}.yml")

        result = CliRunner().invoke(
            fake_project_cli,
            ["catalog", "create", "--pipeline", self.PIPELINE_NAME],
            obj=fake_metadata,
        )

        assert not result.exit_code
        assert not data_catalog_file.exists()
Code example #4
    def test_save_options_csv(self, tmp_path, sample_spark_df):
        # To cross check the correct Spark save operation we save to
        # a single spark partition with csv format and retrieve it with Kedro
        # CSVDataSet
        temp_dir = Path(str(tmp_path / "test_data"))
        spark_data_set = SparkDataSet(
            filepath=str(temp_dir),
            file_format="csv",
            save_args={
                "sep": "|",
                "header": True
            },
        )
        spark_df = sample_spark_df.coalesce(1)
        spark_data_set.save(spark_df)

        single_csv_file = [
            f for f in temp_dir.iterdir() if f.is_file() and f.suffix == ".csv"
        ][0]

        csv_local_data_set = CSVDataSet(filepath=str(single_csv_file),
                                        load_args={"sep": "|"})
        pandas_df = csv_local_data_set.load()

        assert pandas_df[pandas_df["name"] == "Alex"]["age"][0] == 31
Code example #5
def catalog_with_encoder():
    return DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "encoder": CSVDataSet("fake/path/to/encoder.csv"),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
Code example #6
    def test_catalog_release(self, mocker):
        fs_mock = mocker.patch("fsspec.filesystem").return_value
        filepath = "test.csv"
        data_set = CSVDataSet(filepath=filepath)
        assert data_set._version_cache.currsize == 0  # no cache if unversioned
        data_set.release()
        fs_mock.invalidate_cache.assert_called_once_with(filepath)
        assert data_set._version_cache.currsize == 0
Code example #7
    def test_load_options_csv(self, tmp_path, sample_pandas_df):
        filepath = str(tmp_path / "data")
        local_csv_data_set = CSVDataSet(filepath=filepath)
        local_csv_data_set.save(sample_pandas_df)
        spark_data_set = SparkDataSet(
            filepath=filepath, file_format="csv", load_args={"header": True}
        )
        spark_df = spark_data_set.load()
        assert spark_df.filter(col("Name") == "Alex").count() == 1
Code example #8
def catalog_with_stopwords():
    catalog_with_stopwords = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "stopwords_from_nltk": CSVDataSet("fake/path/to/stopwords.csv"),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
    return catalog_with_stopwords
Code example #9
File: test_context.py Project: vermashivam679/kedro
    def test_run_load_versions(self, tmp_path, dummy_context, dummy_dataframe, mocker):
        class DummyContext(KedroContext):
            project_name = "bob"
            package_name = "bob"
            project_version = kedro_version

            def _get_pipelines(self) -> Dict[str, Pipeline]:
                return {"__default__": Pipeline([node(identity, "cars", "boats")])}

        mocker.patch("logging.config.dictConfig")
        dummy_context = DummyContext(str(tmp_path))
        filepath = str(dummy_context.project_path / "cars.csv")

        old_save_version = generate_timestamp()
        old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
        old_csv_data_set = CSVDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, old_save_version),
        )
        old_csv_data_set.save(old_df)

        new_save_version = generate_timestamp()
        new_csv_data_set = CSVDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, new_save_version),
        )
        new_csv_data_set.save(dummy_dataframe)

        load_versions = {"cars": old_save_version}
        dummy_context.run(load_versions=load_versions)
        assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
        assert dummy_context.catalog.load("boats").equals(old_df)
Code example #10
File: test_context.py Project: periwinkleFTW/kedro
    def test_run_load_versions(self, dummy_context, dummy_dataframe):
        filepath = (dummy_context.project_path / "cars.csv").as_posix()

        old_save_version = generate_timestamp()
        old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
        old_csv_data_set = CSVDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, old_save_version),
        )
        old_csv_data_set.save(old_df)

        sleep(0.5)
        new_save_version = generate_timestamp()
        new_csv_data_set = CSVDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, new_save_version),
        )
        new_csv_data_set.save(dummy_dataframe)

        load_versions = {"cars": old_save_version}
        dummy_context.run(load_versions=load_versions, pipeline_name="simple")
        assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
        assert dummy_context.catalog.load("boats").equals(old_df)
Code example #11
def dummy_catalog():
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
    return dummy_catalog
Code example #12
    def test_http_filesystem_no_versioning(self):
        pattern = r"HTTP\(s\) DataSet doesn't support versioning\."

        with pytest.raises(DataSetError, match=pattern):
            CSVDataSet(
                filepath="https://example.com/file.csv", version=Version(None, None)
            )
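For contrast, a minimal sketch of the unversioned case that the error message above implies is allowed: an http(s) filepath is accepted as long as no versioning is requested. The URL is only a placeholder, and the import path assumes the kedro.extras.datasets layout used by these tests.

from kedro.extras.datasets.pandas import CSVDataSet

# Sketch: an https filepath without a Version does not raise on construction.
# The URL is a placeholder; load() only succeeds if it is actually reachable,
# and saving back over http(s) is typically not supported.
remote_csv = CSVDataSet(filepath="https://example.com/file.csv")
df = remote_csv.load()  # read via fsspec's HTTP filesystem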
Code example #13
    def test_default_dataset(self, fake_project_cli, fake_metadata,
                             fake_load_context, mocker, mock_pipelines):
        """Test that datasets that are found in `Pipeline.data_sets()`,
        but not in the catalog, are listed under the key "DefaultDataSet".
        """
        yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
        mocked_context = fake_load_context.return_value
        catalog_data_sets = {"some_dataset": CSVDataSet("test.csv")}
        mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
        mocker.patch.object(
            mock_pipelines[PIPELINE_NAME],
            "data_sets",
            return_value=catalog_data_sets.keys() | {"intermediate"},
        )

        result = CliRunner().invoke(
            fake_project_cli,
            ["catalog", "list"],
            obj=fake_metadata,
        )

        assert not result.exit_code
        expected_dict = {
            f"DataSets in '{PIPELINE_NAME}' pipeline": {
                "Datasets mentioned in pipeline": {
                    "CSVDataSet": ["some_dataset"],
                    "DefaultDataSet": ["intermediate"],
                }
            }
        }
        key = f"DataSets in '{PIPELINE_NAME}' pipeline"
        assert yaml_dump_mock.call_count == 1
        assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key]
Code example #14
    def test_protocol_usage(self, filepath, instance_type, credentials):
        data_set = CSVDataSet(filepath=filepath, credentials=credentials)
        assert isinstance(data_set._fs, instance_type)

        path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]

        assert str(data_set._filepath) == path
        assert isinstance(data_set._filepath, PurePosixPath)
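The parametrised fixtures behind this test (filepath, instance_type, credentials) are not shown above; as a rough illustration only, with a made-up bucket name and credentials, the same constructor accepts fsspec-style paths for different protocols:

from kedro.extras.datasets.pandas import CSVDataSet

# Local path: resolved by fsspec's local filesystem
local_ds = CSVDataSet(filepath="/tmp/test.csv")

# S3 path: resolved by s3fs; bucket and credentials below are placeholders
s3_ds = CSVDataSet(
    filepath="s3://my-bucket/test.csv",
    credentials={"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"},
)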
Code example #15
    def test_no_param_datasets_in_respose(self, fake_project_cli,
                                          fake_metadata, fake_load_context,
                                          mocker, mock_pipelines):
        yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
        mocked_context = fake_load_context.return_value
        catalog_data_sets = {
            "iris_data": CSVDataSet("test.csv"),
            "intermediate": MemoryDataSet(),
            "parameters": MemoryDataSet(),
            "params:data_ratio": MemoryDataSet(),
            "not_used": CSVDataSet("test2.csv"),
        }

        mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
        mocker.patch.object(
            mock_pipelines[PIPELINE_NAME],
            "data_sets",
            return_value=catalog_data_sets.keys() - {"not_used"},
        )

        result = CliRunner().invoke(
            fake_project_cli,
            ["catalog", "list"],
            obj=fake_metadata,
        )

        assert not result.exit_code
        # 'parameters' and 'params:data_ratio' should not appear in the response
        expected_dict = {
            f"DataSets in '{PIPELINE_NAME}' pipeline": {
                "Datasets mentioned in pipeline": {
                    "CSVDataSet": ["iris_data"],
                    "MemoryDataSet": ["intermediate"],
                },
                "Datasets not mentioned in pipeline": {
                    "CSVDataSet": ["not_used"]
                },
            }
        }
        key = f"DataSets in '{PIPELINE_NAME}' pipeline"
        assert yaml_dump_mock.call_count == 1
        assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key]
Code example #16
    def test_multiple_loads(self, versioned_csv_data_set, dummy_dataframe,
                            filepath_csv):
        """Test that if a new version is created mid-run, by an
        external system, it won't be loaded in the current run."""
        versioned_csv_data_set.save(dummy_dataframe)
        versioned_csv_data_set.load()
        v1 = versioned_csv_data_set.resolve_load_version()

        # force-drop a newer version into the same location
        v_new = generate_timestamp()
        CSVDataSet(filepath=filepath_csv,
                   version=Version(v_new, v_new)).save(dummy_dataframe)

        versioned_csv_data_set.load()
        v2 = versioned_csv_data_set.resolve_load_version()

        assert v2 == v1  # v2 should not be v_new!
        ds_new = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
        # the new version is discoverable by a new instance
        assert ds_new.resolve_load_version() == v_new
Code example #17
    def test_version_str_repr(self, load_version, save_version):
        """Test that version is in string representation of the class instance
        when applicable."""
        filepath = "test.csv"
        ds = CSVDataSet(filepath=filepath)
        ds_versioned = CSVDataSet(filepath=filepath,
                                  version=Version(load_version, save_version))
        assert filepath in str(ds)
        assert "version" not in str(ds)

        assert filepath in str(ds_versioned)
        ver_str = f"version=Version(load={load_version}, save='{save_version}')"
        assert ver_str in str(ds_versioned)
        assert "CSVDataSet" in str(ds_versioned)
        assert "CSVDataSet" in str(ds)
        assert "protocol" in str(ds_versioned)
        assert "protocol" in str(ds)
        # Default save_args
        assert "save_args={'index': False}" in str(ds)
        assert "save_args={'index': False}" in str(ds_versioned)
Code example #18
File: test_data_catalog.py Project: zeta1999/kedro
def multi_catalog(mocker):
    csv = CSVDataSet(filepath="abc.csv")
    parq = ParquetDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    layers = {"raw": {"abc.csv"}, "model": {"xyz.parq"}}
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal, layers=layers)
Code example #19
    def test_protocol_usage(self, filepath, instance_type):
        data_set = CSVDataSet(filepath=filepath)
        assert isinstance(data_set._fs, instance_type)

        # _strip_protocol() doesn't strip http(s) protocol
        if data_set._protocol == "https":
            path = filepath.split("://")[-1]
        else:
            path = data_set._fs._strip_protocol(filepath)

        assert str(data_set._filepath) == path
        assert isinstance(data_set._filepath, PurePosixPath)
Code example #20
    def test_replacing_non_alphanumeric_characters(self):
        """Test replacing non alphanumeric characters in datasets names"""
        csv = CSVDataSet(filepath="abc.csv")
        datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv}

        catalog = DataCatalog(data_sets=datasets)
        assert "ds1@spark" not in catalog.datasets.__dict__
        assert "ds3.csv" not in catalog.datasets.__dict__

        assert "ds2_spark" in catalog.datasets.__dict__
        assert "ds1__spark" in catalog.datasets.__dict__
        assert "ds3__csv" in catalog.datasets.__dict__
Code example #21
    def test_replacing_nonword_characters(self):
        """Test replacing non-word characters in dataset names"""
        csv = CSVDataSet(filepath="abc.csv")
        datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv, "jalapeño": csv}

        catalog = DataCatalog(data_sets=datasets)
        assert "ds1@spark" not in catalog.datasets.__dict__
        assert "ds2__spark" not in catalog.datasets.__dict__
        assert "ds3.csv" not in catalog.datasets.__dict__
        assert "jalape__o" not in catalog.datasets.__dict__

        assert "ds1__spark" in catalog.datasets.__dict__
        assert "ds2_spark" in catalog.datasets.__dict__
        assert "ds3__csv" in catalog.datasets.__dict__
        assert "jalapeño" in catalog.datasets.__dict__
Code example #22
def catalog_with_parameters():
    catalog_with_parameters = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
        "params:penalty": MemoryDataSet(0.1),
        "model": CSVDataSet("fake/path/to/model.csv"),
        "params:threshold": MemoryDataSet(0.5),
    })
    return catalog_with_parameters
Code example #23
    def before_pipeline_run(self, run_params: Dict[str, Any], pipeline,
                            catalog):
        """A hook implementation to add a catalog entry
        based on the filename passed to the command line, e.g.:
            kedro run --params=input:iris_1.csv
            kedro run --params=input:iris_2.csv
            kedro run --params=input:iris_3.csv
        """
        filename = run_params["extra_params"]["input"]

        # add input dataset
        input_dataset_name = "example_iris_data"
        input_dataset = CSVDataSet(filepath=f"data/01_raw/{filename}")
        catalog.add(input_dataset_name, input_dataset)

        # add output dataset
        output_dataset_name = "example_reporting_data"
        output_dataset = TextDataSet(filepath=f"data/08_reporting/{filename}")
        catalog.add(output_dataset_name, output_dataset)
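The method above is an excerpt from a hooks class; a minimal sketch of how such a hook might be wired up follows. The class name is an assumption, and registration is shown kedro>=0.17-style via settings.py.

from typing import Any, Dict

from kedro.framework.hooks import hook_impl
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.extras.datasets.text import TextDataSet


class DynamicCatalogHooks:  # hypothetical class name
    @hook_impl
    def before_pipeline_run(self, run_params: Dict[str, Any], pipeline, catalog):
        # the filename comes from `kedro run --params=input:<filename>`
        filename = run_params["extra_params"]["input"]
        catalog.add("example_iris_data",
                    CSVDataSet(filepath=f"data/01_raw/{filename}"))
        catalog.add("example_reporting_data",
                    TextDataSet(filepath=f"data/08_reporting/{filename}"))


# In src/<package_name>/settings.py (kedro>=0.17):
# HOOKS = (DynamicCatalogHooks(),)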
Code example #24
    def test_datasets_on_add(self, data_catalog_from_config):
        """Check datasets are updated correctly after adding"""
        data_catalog_from_config.add("new_dataset", CSVDataSet("some_path"))
        assert isinstance(data_catalog_from_config.datasets.new_dataset, CSVDataSet)
        assert isinstance(data_catalog_from_config.datasets.boats, CSVDataSet)
Code example #25
def multi_catalog(mocker):
    csv = CSVDataSet(filepath="abc.csv")
    parq = ParquetDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal)
Code example #26
def data_set(filepath):
    return CSVDataSet(filepath=filepath, save_args={"index": False})
Code example #27
def versioned_csv_data_set(filepath_csv, load_version, save_version):
    return CSVDataSet(filepath=filepath_csv,
                      version=Version(load_version, save_version))
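Background for the versioned fixtures, as a sketch (the timestamp is illustrative, import paths assume the same kedro era as these tests): when a Version is supplied, the dataset resolves filepath to a timestamped sub-path, so every save lands in its own directory and a load can pin an exact copy.

from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io.core import Version, generate_timestamp

# Illustrative only: with versioning enabled, data is written to
# <filepath>/<save_version>/<basename>, e.g. test.csv/2021-01-01T00.00.00.000Z/test.csv
save_version = generate_timestamp()
versioned_ds = CSVDataSet(filepath="test.csv", version=Version(None, save_version))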
Code example #28
def csv_data_set(filepath_csv, load_args, save_args):
    return CSVDataSet(filepath=filepath_csv,
                      load_args=load_args,
                      save_args=save_args)
Code example #29
    def test_catalog_release(self, mocker):
        fs_mock = mocker.patch("fsspec.filesystem").return_value
        filepath = "test.csv"
        data_set = CSVDataSet(filepath=filepath)
        data_set.release()
        fs_mock.invalidate_cache.assert_called_once_with(filepath)
Code example #30
    def test_release_instance_cache(self, dummy_dataframe, filepath_csv):
        """Test that cache invalidation does not affect other instances"""
        ds_a = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
        assert ds_a._version_cache.currsize == 0
        ds_a.save(dummy_dataframe)  # create a version
        assert ds_a._version_cache.currsize == 2

        ds_b = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
        assert ds_b._version_cache.currsize == 0
        ds_b.resolve_save_version()
        assert ds_b._version_cache.currsize == 1
        ds_b.resolve_load_version()
        assert ds_b._version_cache.currsize == 2

        ds_a.release()

        # dataset A cache is cleared
        assert ds_a._version_cache.currsize == 0

        # dataset B cache is unaffected
        assert ds_b._version_cache.currsize == 2