Example 1
    def test_load_exact(self, mocker):
        ts = generate_timestamp()
        versioned_hdfs = SparkDataSet(
            filepath="hdfs://{}".format(HDFS_PREFIX), version=Version(ts, None)
        )
        get_spark = mocker.patch.object(versioned_hdfs, "_get_spark")

        versioned_hdfs.load()

        get_spark.return_value.read.load.assert_called_once_with(
            "hdfs://{fn}/{f}/{v}/{f}".format(fn=FOLDER_NAME, f=FILENAME, v=ts),
            "parquet",
        )
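
The expected call above encodes the on-disk layout Kedro uses for versioned
datasets: each version of a file lives under <filepath>/<version>/<filename>.
A minimal sketch of that layout rule, with build_versioned_path as a
hypothetical helper (Kedro computes this path internally):

    def build_versioned_path(filepath: str, version: str) -> str:
        # Versioned datasets store each version as
        # <filepath>/<version>/<filename>, where <filename> is the last
        # component of the original filepath.
        filename = filepath.rstrip("/").rsplit("/", 1)[-1]
        return "{fp}/{v}/{fn}".format(fp=filepath, v=version, fn=filename)

    # Mirrors the expected call in the test above (values are illustrative).
    assert build_versioned_path(
        "hdfs://folder/data.parquet", "2019-01-01T23.59.59.999Z"
    ) == "hdfs://folder/data.parquet/2019-01-01T23.59.59.999Z/data.parquet"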
Example 2
    def test_run_load_versions(self, tmp_path, dummy_context, dummy_dataframe,
                               mocker):
        class DummyContext(KedroContext):
            project_name = "bob"
            package_name = "bob"
            project_version = kedro_version

            def _get_pipelines(self) -> Dict[str, Pipeline]:
                return {
                    "__default__": Pipeline([node(identity, "cars", "boats")])
                }

        mocker.patch("logging.config.dictConfig")
        dummy_context = DummyContext(str(tmp_path))
        filepath = (dummy_context.project_path / "cars.csv").as_posix()

        old_save_version = generate_timestamp()
        old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
        old_csv_data_set = CSVDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, old_save_version),
        )
        old_csv_data_set.save(old_df)

        sleep(0.5)
        new_save_version = generate_timestamp()
        new_csv_data_set = CSVDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, new_save_version),
        )
        new_csv_data_set.save(dummy_dataframe)

        load_versions = {"cars": old_save_version}
        dummy_context.run(load_versions=load_versions)
        assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
        assert dummy_context.catalog.load("boats").equals(old_df)
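
Outside a test, the same pinning works through the catalog directly: the
timestamp returned by generate_timestamp() doubles as the version identifier.
A minimal sketch, assuming a Kedro installation with pandas.CSVDataSet
available and an existing save under the pinned timestamp:

    from kedro.extras.datasets.pandas import CSVDataSet
    from kedro.io import DataCatalog, Version

    pinned = "2019-01-01T23.59.59.999Z"  # illustrative timestamp of an old save
    catalog = DataCatalog(
        {"cars": CSVDataSet(filepath="data/cars.csv", version=Version(pinned, None))}
    )
    df = catalog.load("cars")  # always reads the pinned version, never a newer one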
Example 3
    def test_load_exact(self, mocker):
        ts = generate_timestamp()
        ds_s3 = SparkDataSet(
            filepath="s3a://{}/{}".format(BUCKET_NAME, FILENAME),
            version=Version(ts, None),
            credentials=AWS_CREDENTIALS,
        )
        get_spark = mocker.patch.object(ds_s3, "_get_spark")

        ds_s3.load()

        get_spark.return_value.read.load.assert_called_once_with(
            "s3a://{b}/{f}/{v}/{f}".format(b=BUCKET_NAME, f=FILENAME, v=ts),
            "parquet")
Example 4
    def test_from_sane_config_versioned(self, sane_config, dummy_dataframe):
        """Test load and save of versioned data sets from config"""
        sane_config["catalog"]["boats"]["versioned"] = True
        version = generate_timestamp()
        journal = Journal({"run_id": "fake-id", "project_path": "fake-path"})
        catalog = DataCatalog.from_config(**sane_config,
                                          load_versions={"boats": version},
                                          save_version=version,
                                          journal=journal)

        assert catalog._journal == journal  # pylint: disable=protected-access

        catalog.save("boats", dummy_dataframe)
        path = Path(sane_config["catalog"]["boats"]["filepath"])
        path = path / version / path.name
        assert path.is_file()
        reloaded_df = catalog.load("boats")
        assert_frame_equal(reloaded_df, dummy_dataframe)
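
For reference, flipping "versioned" on as the test does corresponds to a
catalog configuration entry of this shape (the filepath is illustrative):

    sane_config = {
        "catalog": {
            "boats": {
                "type": "pandas.CSVDataSet",
                "filepath": "data/boats.csv",
                # saves land in data/boats.csv/<version>/boats.csv,
                # matching the `path / version / path.name` check above
                "versioned": True,
            }
        }
    }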
Example 5
    def test_multiple_loads(self, versioned_image_dataset, image_object,
                            filepath_png):
        """Test that if a new version is created mid-run, by an
        external system, it won't be loaded in the current run."""
        versioned_image_dataset.save(image_object)
        v1 = versioned_image_dataset.resolve_load_version()

        # force-drop a newer version into the same location
        v_new = generate_timestamp()
        ImageDataSet(filepath=filepath_png,
                     version=Version(v_new, v_new)).save(image_object)

        v2 = versioned_image_dataset.resolve_load_version()

        assert v2 == v1  # v2 should not be v_new!
        ds_new = ImageDataSet(filepath=filepath_png,
                              version=Version(None, None))
        # new version is discoverable by a new instance
        assert ds_new.resolve_load_version() == v_new
Example 6
    def test_multiple_loads(self, versioned_csv_data_set, dummy_dataframe,
                            filepath_csv):
        """Test that if a new version is created mid-run, by an
        external system, it won't be loaded in the current run."""
        versioned_csv_data_set.save(dummy_dataframe)
        versioned_csv_data_set.load()
        v1 = versioned_csv_data_set.resolve_load_version()

        # force-drop a newer version into the same location
        v_new = generate_timestamp()
        CSVDataSet(filepath=filepath_csv,
                   version=Version(v_new, v_new)).save(dummy_dataframe)

        versioned_csv_data_set.load()
        v2 = versioned_csv_data_set.resolve_load_version()

        assert v2 == v1  # v2 should not be v_new!
        ds_new = CSVDataSet(filepath=filepath_csv, version=Version(None, None))
        # new version is discoverable by a new instance
        assert ds_new.resolve_load_version() == v_new
Example 7
    def create(
        cls,
        project_path: Union[Path, str] = None,
        save_on_close: bool = True,
        env: str = None,
    ) -> "KedroSession":
        """Create a new instance of ``KedroSession``.

        Args:
            project_path: Path to the project root directory.
            save_on_close: Whether or not to save the session when it's closed.
            env: Environment for the KedroContext.

        Returns:
            A new ``KedroSession`` instance.
        """
        # pylint: disable=protected-access
        session = cls(
            project_path=project_path,
            session_id=generate_timestamp(),
            save_on_close=save_on_close,
        )

        session_data = get_static_project_data(session._project_path)
        session_data["project_path"] = session._project_path
        session_data["session_id"] = session.session_id
        session_data.update(_describe_git(session._project_path))

        ctx = click.get_current_context(silent=True)
        if ctx:
            session_data["cli"] = _jsonify_cli_context(ctx)

        if env:
            session_data["env"] = env

        session._store.update(session_data)
        return session
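
A usage sketch, assuming a Kedro version exposing the signature above and a
valid project at the given path; the session id defaults to a fresh
generate_timestamp() value:

    from kedro.framework.session import KedroSession

    session = KedroSession.create(project_path="/path/to/project", env="local")
    print(session.session_id)  # e.g. "2019-01-01T23.59.59.999Z"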
Example 8
    def from_config(
        cls: Type,
        catalog: Optional[Dict[str, Dict[str, Any]]],
        credentials: Dict[str, Dict[str, Any]] = None,
        load_versions: Dict[str, str] = None,
        save_version: str = None,
        journal: Journal = None,
    ) -> "DataCatalog":
        """Create a ``DataCatalog`` instance from configuration. This is a
        factory method used to provide developers with a way to instantiate
        ``DataCatalog`` with configuration parsed from configuration files.

        Args:
            catalog: A dictionary whose keys are the data set names and
                the values are dictionaries with the constructor arguments
                for classes implementing ``AbstractDataSet``. The data set
                class to be loaded is specified with the key ``type`` as its
                fully qualified class name. Any ``kedro.io`` data set can be
                specified by its class name only, i.e. its module name
                can be omitted.
            credentials: A dictionary containing credentials for different
                data sets. Use the ``credentials`` key in an ``AbstractDataSet``
                to refer to the appropriate credentials as shown in the example
                below.
            load_versions: A mapping from dataset names to the versions
                to load. Has no effect on data sets without versioning enabled.
            save_version: Version string to be used for ``save`` operations
                by all data sets with versioning enabled. It must: a) be a
                case-insensitive string that conforms to operating system
                filename limitations, and b) sort lexicographically in
                chronological order, so the latest version is always the
                greatest string.
            journal: Instance of Journal.

        Returns:
            An instantiated ``DataCatalog`` containing all specified
            data sets, created and ready to use.

        Raises:
            DataSetError: When the method fails to create any of the data
                sets from their config.
            DataSetNotFoundError: When `load_versions` refers to a dataset that doesn't
                exist in the catalog.

        Example:
        ::

            >>> config = {
            >>>     "cars": {
            >>>         "type": "pandas.CSVDataSet",
            >>>         "filepath": "cars.csv",
            >>>         "save_args": {
            >>>             "index": False
            >>>         }
            >>>     },
            >>>     "boats": {
            >>>         "type": "pandas.CSVDataSet",
            >>>         "filepath": "s3://aws-bucket-name/boats.csv",
            >>>         "credentials": "boats_credentials",
            >>>         "save_args": {
            >>>             "index": False
            >>>         }
            >>>     }
            >>> }
            >>>
            >>> credentials = {
            >>>     "boats_credentials": {
            >>>         "client_kwargs": {
            >>>             "aws_access_key_id": "<your key id>",
            >>>             "aws_secret_access_key": "<your secret>"
            >>>         }
            >>>      }
            >>> }
            >>>
            >>> catalog = DataCatalog.from_config(config, credentials)
            >>>
            >>> df = catalog.load("cars")
            >>> catalog.save("boats", df)
        """
        data_sets = {}
        catalog = copy.deepcopy(catalog) or {}
        credentials = copy.deepcopy(credentials) or {}
        run_id = journal.run_id if journal else None
        save_version = save_version or run_id or generate_timestamp()
        load_versions = copy.deepcopy(load_versions) or {}

        missing_keys = load_versions.keys() - catalog.keys()
        if missing_keys:
            raise DataSetNotFoundError(
                f"`load_versions` keys [{', '.join(sorted(missing_keys))}] "
                f"are not found in the catalog.")

        layers = defaultdict(set)  # type: Dict[str, Set[str]]
        for ds_name, ds_config in catalog.items():
            ds_layer = ds_config.pop("layer", None)
            if ds_layer is not None:
                layers[ds_layer].add(ds_name)

            ds_config = _resolve_credentials(ds_config, credentials)
            data_sets[ds_name] = AbstractDataSet.from_config(
                ds_name, ds_config, load_versions.get(ds_name), save_version)

        dataset_layers = layers or None
        return cls(data_sets=data_sets, journal=journal, layers=dataset_layers)
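
A save_version produced by generate_timestamp() satisfies both requirements in
the docstring above: it is filename-safe (periods instead of colons) and sorts
lexicographically in chronological order. A re-implementation sketch (Kedro's
own function may differ in detail):

    from datetime import datetime, timezone

    def generate_timestamp_sketch() -> str:
        ts = datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H.%M.%S.%fZ")
        return ts[:-4] + ts[-1]  # trim microseconds to milliseconds, keep the "Z"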
Example 9
def save_version(request):
    return request.param or generate_timestamp()
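
The decorator has presumably been elided from this excerpt; the use of
request.param implies an indirectly parametrized pytest fixture, plausibly of
this shape:

    import pytest

    from kedro.io.core import generate_timestamp

    @pytest.fixture(params=[None])  # tests may override the param indirectly
    def save_version(request):
        return request.param or generate_timestamp()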
Example 10
    def run(  # pylint: disable=too-many-arguments,too-many-locals
        self,
        tags: Iterable[str] = None,
        runner: AbstractRunner = None,
        node_names: Iterable[str] = None,
        from_nodes: Iterable[str] = None,
        to_nodes: Iterable[str] = None,
        from_inputs: Iterable[str] = None,
        load_versions: Dict[str, str] = None,
        pipeline_name: str = None,
    ) -> Dict[str, Any]:
        """Runs the pipeline with a specified runner.

        Args:
            tags: An optional list of node tags which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                containing *any* of these tags will be run.
            runner: An optional parameter specifying the runner that you want to run
                the pipeline with.
            node_names: An optional list of node names which should be used to
                filter the nodes of the ``Pipeline``. If specified, only the nodes
                with these names will be run.
            from_nodes: An optional list of node names which should be used as a
                starting point of the new ``Pipeline``.
            to_nodes: An optional list of node names which should be used as an
                end point of the new ``Pipeline``.
            from_inputs: An optional list of input datasets which should be used as a
                starting point of the new ``Pipeline``.
            load_versions: An optional mapping from dataset names to the
                specific version timestamp each should load.
            pipeline_name: Name of the ``Pipeline`` to execute.
                Defaults to "__default__".
        Raises:
            KedroContextError: If the resulting ``Pipeline`` is empty
                or incorrect tags are provided.
        Returns:
            Any node outputs that cannot be processed by the ``DataCatalog``.
            These are returned in a dictionary, where the keys are defined
            by the node outputs.
        """
        # Report project name
        logging.info("** Kedro project %s", self.project_path.name)

        try:
            pipeline = self._get_pipeline(name=pipeline_name)
        except NotImplementedError:
            common_migration_message = (
                "`ProjectContext._get_pipeline(self, name)` method is expected. "
                "Please refer to the 'Modular Pipelines' section of the documentation."
            )
            if pipeline_name:
                raise KedroContextError(
                    "The project is not fully migrated to use multiple pipelines. "
                    + common_migration_message
                )

            warn(
                "You are using the deprecated pipeline construction mechanism. "
                + common_migration_message,
                DeprecationWarning,
            )
            pipeline = self.pipeline

        filtered_pipeline = self._filter_pipeline(
            pipeline=pipeline,
            tags=tags,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            node_names=node_names,
            from_inputs=from_inputs,
        )

        run_id = generate_timestamp()

        record_data = {
            "run_id": run_id,
            "project_path": str(self.project_path),
            "env": self.env,
            "kedro_version": self.project_version,
            "tags": tags,
            "from_nodes": from_nodes,
            "to_nodes": to_nodes,
            "node_names": node_names,
            "from_inputs": from_inputs,
            "load_versions": load_versions,
            "pipeline_name": pipeline_name,
        }
        journal = Journal(record_data)

        catalog = self._get_catalog(
            save_version=run_id, journal=journal, load_versions=load_versions
        )

        # Run the runner
        runner = runner or SequentialRunner()
        return runner.run(filtered_pipeline, catalog)
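
A usage sketch of the method above, assuming a Kedro 0.16-style project where
load_context is available; it pins a single dataset to an older version for
this run only:

    from kedro.framework.context import load_context

    context = load_context("/path/to/project")  # illustrative project path
    context.run(load_versions={"cars": "2019-01-01T23.59.59.999Z"})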
Example 11
    def create(  # pylint: disable=too-many-arguments
        cls,
        package_name: str = None,
        project_path: Union[Path, str] = None,
        save_on_close: bool = True,
        env: str = None,
        extra_params: Dict[str, Any] = None,
    ) -> "KedroSession":
        """Create a new instance of ``KedroSession`` with the session data.

        Args:
            package_name: Package name for the Kedro project the session is
                created for.
            project_path: Path to the project root directory. Default is
                current working directory Path.cwd().
            save_on_close: Whether or not to save the session when it's closed.
            env: Environment for the KedroContext.
            extra_params: Optional dictionary containing extra project parameters
                for underlying KedroContext. If specified, will update (and therefore
                take precedence over) the parameters retrieved from the project
                configuration.

        Returns:
            A new ``KedroSession`` instance.
        """

        # This is to make sure that the project is still properly configured
        # for workflows that create a session manually, without going through
        # one of our known entry points (e.g. plugins like kedro-airflow).
        # It exists for backward compatibility and should be removed in 0.18.
        if package_name is not None:
            configure_project(package_name)

        session = cls(
            package_name=package_name,
            project_path=project_path,
            session_id=generate_timestamp(),
            save_on_close=save_on_close,
        )

        # have to explicitly type session_data otherwise mypy will complain
        # possibly related to this: https://github.com/python/mypy/issues/1430
        session_data: Dict[str, Any] = {
            "package_name": session._package_name,
            "project_path": session._project_path,
            "session_id": session.session_id,
            **_describe_git(session._project_path),
        }

        ctx = click.get_current_context(silent=True)
        if ctx:
            session_data["cli"] = _jsonify_cli_context(ctx)

        env = env or os.getenv("KEDRO_ENV")
        if env:
            session_data["env"] = env

        if extra_params:
            session_data["extra_params"] = extra_params

        session._store.update(session_data)

        # we need a ConfigLoader registered in order to be able to set up logging
        session._setup_logging()
        return session
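
A usage sketch for this newer signature, assuming a packaged Kedro project;
the package name and extra_params values are illustrative:

    from kedro.framework.session import KedroSession

    session = KedroSession.create(
        package_name="my_project",
        project_path="/path/to/project",
        env="local",
        extra_params={"model_version": 3},  # overrides matching project parameters
    )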
Example 12
def version():
    load_version = None  # use latest
    save_version = generate_timestamp()  # freeze save version
    return Version(load_version, save_version)
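
The semantics this fixture relies on: Version is a (load, save) pair where
load=None means "resolve the latest available version at load time", while a
frozen save timestamp keeps every save within one test under a single version
directory. For example:

    from kedro.io import Version
    from kedro.io.core import generate_timestamp

    v = Version(load=None, save=generate_timestamp())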
Example 13
    def from_config(
        cls: Type,
        catalog: Optional[Dict[str, Dict[str, Any]]],
        credentials: Dict[str, Dict[str, Any]] = None,
        load_versions: Dict[str, str] = None,
        save_version: str = None,
        journal: Journal = None,
    ) -> "DataCatalog":
        """Create a ``DataCatalog`` instance from configuration. This is a
        factory method used to provide developers with a way to instantiate
        ``DataCatalog`` with configuration parsed from configuration files.

        Args:
            catalog: A dictionary whose keys are the data set names and
                the values are dictionaries with the constructor arguments
                for classes implementing ``AbstractDataSet``. The data set
                class to be loaded is specified with the key ``type`` as its
                fully qualified class name. Any ``kedro.io`` data set can be
                specified by its class name only, i.e. its module name
                can be omitted.
            credentials: A dictionary containing credentials for different
                data sets. Use the ``credentials`` key in an ``AbstractDataSet``
                to refer to the appropriate credentials as shown in the example
                below.
            load_versions: A mapping from dataset names to the versions
                to load. Has no effect on data sets without versioning enabled.
            save_version: Version string to be used for ``save`` operations
                by all data sets with versioning enabled. It must: a) be a
                case-insensitive string that conforms to operating system
                filename limitations, and b) sort lexicographically in
                chronological order, so the latest version is always the
                greatest string.
            journal: Instance of Journal.

        Returns:
            An instantiated ``DataCatalog`` containing all specified
            data sets, created and ready to use.

        Raises:
            DataSetError: When the method fails to create any of the data
                sets from their config.

        Example:
        ::

            >>> config = {
            >>>     "cars": {
            >>>         "type": "CSVLocalDataSet",
            >>>         "filepath": "cars.csv",
            >>>         "save_args": {
            >>>             "index": False
            >>>         }
            >>>     },
            >>>     "boats": {
            >>>         "type": "CSVS3DataSet",
            >>>         "filepath": "boats.csv",
            >>>         "bucket_name": "mck-147789798-bucket",
            >>>         "credentials": "boats_credentials",
            >>>         "save_args": {
            >>>             "index": False
            >>>         }
            >>>     }
            >>> }
            >>>
            >>> credentials = {
            >>>     "boats_credentials": {
            >>>         "aws_access_key_id": "<your key id>",
            >>>         "aws_secret_access_key": "<your secret>"
            >>>      }
            >>> }
            >>>
            >>> catalog = DataCatalog.from_config(config, credentials)
            >>>
            >>> df = catalog.load("cars")
            >>> catalog.save("boats", df)
        """
        data_sets = {}
        catalog = copy.deepcopy(catalog) or {}
        credentials = copy.deepcopy(credentials) or {}
        run_id = journal.run_id if journal else None
        save_version = save_version or run_id or generate_timestamp()
        load_versions = copy.deepcopy(load_versions) or {}

        missing_keys = load_versions.keys() - catalog.keys()
        if missing_keys:
            warn("`load_versions` keys [{}] are not found in the catalog.".
                 format(", ".join(sorted(missing_keys))))

        for ds_name, ds_config in catalog.items():
            if "type" not in ds_config:
                raise DataSetError("`type` is missing from DataSet '{}' "
                                   "catalog configuration".format(ds_name))
            if CREDENTIALS_KEY in ds_config:
                ds_config[CREDENTIALS_KEY] = _get_credentials(
                    ds_config.pop(CREDENTIALS_KEY),
                    credentials  # credentials name
                )
            data_sets[ds_name] = AbstractDataSet.from_config(
                ds_name, ds_config, load_versions.get(ds_name), save_version)
        return cls(data_sets=data_sets, journal=journal)
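
Note the contrast with Example 8: this older variant merely warns when
load_versions names a dataset missing from the catalog and resolves
credentials inline via _get_credentials, whereas the newer version raises
DataSetNotFoundError, delegates credential resolution to _resolve_credentials,
and additionally collects optional "layer" metadata.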