Example #1
def initialize_model(
    args,  # type: EasyDict
    model_key="uplift_model_params",  # type: str
    default_estimator="sklearn.linear_model.LogisticRegression",  # type: str
):
    # type: (...) -> sklearn.base.BaseEstimator

    if not isinstance(args[model_key], dict):
        model = args[model_key]
        return model

    model_params = args[model_key].copy()
    if not model_params.get("estimator"):
        model_params["estimator"] = default_estimator
    estimator_str = model_params.pop("estimator")
    estimator_obj = load_obj(estimator_str)

    const_params = (
        (model_params.pop("const_params") or dict())
        if "const_params" in model_params
        else dict()
    )

    if not model_params.get("search_cv"):
        const_params.update(model_params)
        model = estimator_obj(**const_params)
        return model

    search_cv_str = model_params.pop("search_cv")
    search_cv_obj = load_obj(search_cv_str)
    model_params["estimator"] = estimator_obj(**const_params)
    model = search_cv_obj(**model_params)
    return model
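A configuration sketch exercising the search-CV branch (the keys mirror the function above; the values, and the use of a plain dict in place of ``EasyDict``, are illustrative):

args = {
    "uplift_model_params": {
        "estimator": "sklearn.linear_model.LogisticRegression",
        "const_params": {"max_iter": 200},  # fixed kwargs for the estimator
        "search_cv": "sklearn.model_selection.GridSearchCV",
        "param_grid": {"C": [0.1, 1.0, 10.0]},  # forwarded to GridSearchCV
        "cv": 3,
    }
}
model = initialize_model(args)
# model is a GridSearchCV wrapping LogisticRegression(max_iter=200);
# omitting "search_cv" would return the bare estimator instead.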
Example #2
def _load_obj(class_path: str) -> Optional[object]:
    mod_path, _, class_name = class_path.rpartition(".")
    try:
        available_classes = load_obj(f"{mod_path}.__all__")
    # ModuleNotFoundError: raised when `load_obj` can't find `mod_path`
    #                      (e.g. `kedro.io.pandas`); this happens because we try
    #                      a combination of all prefixes.
    # AttributeError: raised when `load_obj` manages to load `mod_path` but it
    #                 doesn't have an `__all__` attribute -- either because it's
    #                 a custom dataset or a `kedro.io` dataset.
    except (ModuleNotFoundError, AttributeError, ValueError):
        available_classes = None

    try:
        class_obj = load_obj(class_path)
    except (ModuleNotFoundError, ValueError):
        return None
    except AttributeError as exc:
        if available_classes and class_name in available_classes:
            raise DataSetError(
                f"{exc} Please see the documentation on how to "
                f"install relevant dependencies for {class_path}:\n"
                f"https://kedro.readthedocs.io/en/stable/"
                f"04_kedro_project_setup/01_dependencies.html"
            ) from exc
        return None

    return class_obj
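For orientation, the split performed on the first line (the path is illustrative):

class_path = "kedro.extras.datasets.pandas.CSVDataSet"
mod_path, _, class_name = class_path.rpartition(".")
# mod_path == "kedro.extras.datasets.pandas", class_name == "CSVDataSet";
# `__all__` of mod_path is then used to distinguish "class exists but its
# dependencies are missing" from "class does not exist at all".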
Example #3
def evaluate_model(
    regressor: LinearRegression,
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    metrics: List[str],
) -> pd.DataFrame:
    """Calculate the given metrics on train and test data.

    Args:
        regressor: Trained model.
        X_train: Training data of independent features.
        y_train: Training data for price.
        X_test: Testing data of independent features.
        y_test: Testing data for price.
        metrics: Fully qualified names of metric functions,
            e.g. "sklearn.metrics.r2_score".

    Returns:
        A dataframe with one row per metric and "Train"/"Test" columns.
    """
    y_train_pred = regressor.predict(X_train)
    y_test_pred = regressor.predict(X_test)
    results = []
    for metric in metrics:
        results.append({
            "Metric": metric.rpartition(".")[2],
            "Train": load_obj(metric)(y_train, y_train_pred),
            "Test": load_obj(metric)(y_test, y_test_pred),
        })
    return pd.DataFrame(results).set_index("Metric")
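A usage sketch, assuming `load_obj` resolves dotted paths to callables (the metric names are examples, not part of the original):

metrics = ["sklearn.metrics.r2_score", "sklearn.metrics.mean_absolute_error"]
scores = evaluate_model(regressor, X_train, y_train, X_test, y_test, metrics)
# scores.loc["r2_score", "Test"] is the R^2 on the held-out data.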
Example #4
def _load_callables(func, default_module):
    # Default: return the first positional argument, else the first keyword
    # argument value, else None.
    func = func or (lambda *args, **kwargs:
                    args[0] if args
                    else list(kwargs.values())[0] if kwargs
                    else None)
    funcs = func if isinstance(func, list) else [func]

    for f in funcs:
        if isinstance(f, str):
            f_list = f.rsplit(".", 1)
            obj = f_list[-1]
            module = f_list[0] if len(f_list) == 2 else None
            assert module or default_module, (
                "The module to which '{}' belongs is unknown. ".format(obj) +
                "Specify the module (e.g. foo.bar) using the name format"
                " (e.g. 'foo.bar.{}') ".format(obj) +
                "or default_module argument.")
        else:
            assert callable(f), "{} should be callable or str.".format(f)

    funcs = [
        f if callable(f)
        else load_obj(f, default_obj_path=default_module) if isinstance(f, str)
        else None
        for f in funcs
    ]

    return funcs
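A call sketch (assuming `load_obj` is kedro's dotted-path loader; the inputs are illustrative):

# A dotted path plus an inline callable; strings are resolved via load_obj.
funcs = _load_callables(["json.dumps", len], default_module=None)
# funcs == [json.dumps, len]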
Example #5
def load_package_context(project_path: Path, package_name: str,
                         **kwargs) -> KedroContext:
    """Loads the KedroContext object of a Kedro project package,
    as output by `kedro package` and installed via `pip`.
    This function is only intended to be used in a project's `run.py`.
    If you are looking to load KedroContext object for any other workflow,
    you might want to use ``load_context`` instead.

    Args:
        project_path: Path to the Kedro project, i.e. where `conf/` resides.
        package_name: Name of the installed Kedro project package.
        kwargs: Optional kwargs for ``ProjectContext`` class in `run.py`.

    Returns:
        Instance of ``KedroContext`` class defined in Kedro project.

    Raises:
        KedroContextError: When the ``ProjectContext`` class cannot be
            loaded for the given package.
    """
    context_path = f"{package_name}.run.ProjectContext"
    try:
        context_class = load_obj(context_path)
    except ModuleNotFoundError:
        raise KedroContextError(
            f"Cannot load context object from {context_path} for package {package_name}."
        )

    # Update kwargs with env from the environment variable (defaults to None if not set).
    # This is needed because some CLI commands (e.g. `kedro run`) default to passing in `env=None`.
    kwargs["env"] = kwargs.get("env") or os.getenv("KEDRO_ENV")

    # Instantiate the context class resolved from the package's run.py.
    context = context_class(project_path=project_path, **kwargs)
    return context
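A usage sketch from a packaged project's `run.py` (the package name is hypothetical):

context = load_package_context(Path.cwd(), package_name="my_project", env="local")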
Example #6
def run(
        from_inputs: Iterable[str],
        to_outputs: Iterable[str],
        from_nodes: Iterable[str],
        to_nodes: Iterable[str],
        node_names: Iterable[str],
        tag: Iterable[str],
        pipeline: str,
        runner: str,
        is_async: bool,
        env: str,
        params: Dict[str, Any],
        config: click.Path,  # pylint: disable=unused-argument
) -> None:
    """Run the pipeline."""
    runner_class = load_obj(obj_path=runner, default_obj_path="kedro.runner")
    tag = _get_values_as_tuple(values=tag) if tag else tag
    node_names = _get_values_as_tuple(
        values=node_names) if node_names else node_names
    package_name = str(Path(__file__).resolve().parent.name)
    with KedroSession.create(package_name=package_name,
                             env=env,
                             extra_params=params) as session:
        session.run(
            tags=tag,
            runner=runner_class(is_async=is_async),
            node_names=node_names,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            from_inputs=from_inputs,
            to_outputs=to_outputs,
            pipeline_name=pipeline,
        )
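The runner resolution above means a bare class name is looked up in `kedro.runner`, while a dotted path is imported as-is; a sketch (the custom path is hypothetical):

load_obj(obj_path="ParallelRunner", default_obj_path="kedro.runner")
# -> kedro.runner.ParallelRunner
load_obj(obj_path="my_pkg.runners.CustomRunner", default_obj_path="kedro.runner")
# -> my_pkg.runners.CustomRunner (an explicit dotted path takes precedence)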
Example #7
def run(
    tag,
    env,
    parallel,
    runner,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    load_version,
    pipeline,
    config,
    params,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner.")
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner,
                            "kedro.runner") if runner else SequentialRunner

    context = load_context(Path.cwd(), env=env, extra_params=params)
    context.run(
        tags=tag,
        runner=runner_class(),
        node_names=node_names,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        from_inputs=from_inputs,
        load_versions=load_version,
        pipeline_name=pipeline,
    )
Example #8
def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext:
    """Loads the KedroContext object of a Kedro Project based on the path specified
    in `.kedro.yml`.
    This function will change the current working directory to the project path.

    Args:
        project_path: Path to the Kedro project.
        kwargs: Optional kwargs for ``ProjectContext`` class in `run.py`.

    Returns:
        Instance of ``KedroContext`` class defined in Kedro project.

    Raises:
        KedroContextError: Either '.kedro.yml' was not found
            or loaded context has package conflict.

    """
    project_path = Path(project_path).expanduser().resolve()
    src_path = str(project_path / "src")

    if src_path not in sys.path:
        sys.path.insert(0, src_path)

    if "PYTHONPATH" not in os.environ:
        os.environ["PYTHONPATH"] = src_path

    kedro_yaml = project_path / ".kedro.yml"

    try:
        with kedro_yaml.open("r") as kedro_yml:
            kedro_yaml_content = yaml.safe_load(kedro_yml)
    except FileNotFoundError:
        raise KedroContextError(
            "Could not find '.kedro.yml' in {}. If you have created your project "
            "with Kedro version <0.15.0, make sure to update your project template. "
            "See https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md "
            "for how to migrate your Kedro project.".format(str(project_path))
        )
    except Exception:
        raise KedroContextError("Failed to parse '.kedro.yml' file")

    try:
        context_path = kedro_yaml_content["context_path"]
    except (KeyError, TypeError):
        raise KedroContextError(
            "'.kedro.yml' doesn't have a required `context_path` field. "
            "Please refer to the documentation."
        )

    context_class = load_obj(context_path)

    if os.getcwd() != str(project_path):
        logging.getLogger(__name__).warning(
            "Changing the current working directory to %s", str(project_path)
        )
        os.chdir(str(project_path))  # Move to project root

    # Instantiate the context after changing the cwd for logging to be properly configured.
    context = context_class(project_path, **kwargs)
    return context
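For reference, the minimal `.kedro.yml` this function expects (the package name is hypothetical):

# Contents of <project_path>/.kedro.yml:
#   context_path: my_project.run.ProjectContext
context = load_context("path/to/project", env="local")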
Example #9
def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext:
    """Loads the KedroContext object of a Kedro Project.
    This is the default way to load the KedroContext object for normal workflows such as
    CLI, Jupyter Notebook, Plugins, etc. It assumes the following project structure
    under the given project_path::

       <project_path>
           |__ <src_dir>
           |__ .kedro.yml
           |__ kedro_cli.py
           |__ pyproject.toml

    The name of the <src_dir> is `src` by default. Either `.kedro.yml` or
    `pyproject.toml` can be used for configuration. If `.kedro.yml` exists, it is
    used; otherwise, `pyproject.toml` is treated as the configuration file (Kedro
    configuration should live under the `[tool.kedro]` section).

    Args:
        project_path: Path to the Kedro project.
        kwargs: Optional kwargs for ``ProjectContext`` class in `run.py`.

    Returns:
        Instance of ``KedroContext`` class defined in Kedro project.

    Raises:
        KedroContextError: Neither '.kedro.yml' nor `pyproject.toml` was found
            or `[tool.kedro]` section is missing in `pyproject.toml`, or loaded context
            has package conflict.

    """
    project_path = Path(project_path).expanduser().resolve()
    static_data = get_static_project_data(project_path)

    source_dir = static_data["source_dir"]
    validate_source_path(source_dir, project_path)

    if "context_path" not in static_data:
        conf_file = static_data["config_file"].name
        raise KedroContextError(
            f"'{conf_file}' doesn't have a required `context_path` field. "
            f"Please refer to the documentation."
        )

    if str(source_dir) not in sys.path:
        sys.path.insert(0, str(source_dir))

    if "PYTHONPATH" not in os.environ:
        os.environ["PYTHONPATH"] = str(source_dir)

    context_class = load_obj(static_data["context_path"])

    # Update kwargs with env from the environment variable
    # (defaults to None if not set). This is needed because some CLI
    # commands (e.g. `kedro run`) default to passing in `env=None`.
    kwargs["env"] = kwargs.get("env") or os.getenv("KEDRO_ENV")

    context = context_class(project_path=project_path, **kwargs)
    return context
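For reference, a `[tool.kedro]` section equivalent to `.kedro.yml`, read when `.kedro.yml` is absent (the package name and field values are hypothetical):

# Fragment of <project_path>/pyproject.toml:
#   [tool.kedro]
#   context_path = "my_project.run.ProjectContext"
#   source_dir = "src"
context = load_context("path/to/project")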
Example #10
def parse_dataset_definition(
    config: Dict[str, Any], load_version: str = None, save_version: str = None
) -> Tuple[Type[AbstractDataSet], Dict]:
    """Parse and instantiate a dataset class using the configuration provided.

    Args:
        config: Data set config dictionary. It *must* contain the `type` key
            with fully qualified class name.
        load_version: Version string to be used for ``load`` operation if
            the data set is versioned. Has no effect on the data set
            if versioning was not enabled.
        save_version: Version string to be used for ``save`` operation if
            the data set is versioned. Has no effect on the data set
            if versioning was not enabled.

    Raises:
        DataSetError: If the function fails to parse the configuration provided.

    Returns:
        2-tuple: (Dataset class object, configuration dictionary)
    """
    save_version = save_version or generate_timestamp()
    config = copy.deepcopy(config)

    if "type" not in config:
        raise DataSetError("`type` is missing from DataSet catalog configuration")

    class_obj = config.pop("type")

    if isinstance(class_obj, str):
        try:
            class_obj = load_obj(class_obj, "kedro.io")
        except ImportError:
            raise DataSetError(
                "Cannot import module when trying to load type `{}`.".format(class_obj)
            )
        except AttributeError:
            raise DataSetError("Class `{}` not found.".format(class_obj))
    if not issubclass(class_obj, AbstractDataSet):
        raise DataSetError(
            "DataSet type `{}.{}` is invalid: all data set types must extend "
            "`AbstractDataSet`.".format(class_obj.__module__, class_obj.__qualname__)
        )

    if VERSION_KEY in config:
        # remove "version" key so that it's not passed
        # to the "unversioned" data set constructor
        message = (
            "`%s` attribute removed from data set configuration since it is a "
            "reserved word and cannot be directly specified"
        )
        logging.getLogger(__name__).warning(message, VERSION_KEY)
        del config[VERSION_KEY]
    if config.pop(VERSIONED_FLAG_KEY, False):  # data set is versioned
        config[VERSION_KEY] = Version(load_version, save_version)

    return class_obj, config
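A usage sketch with a minimal catalog entry (the dataset type and filepath are illustrative):

config = {
    "type": "kedro.extras.datasets.pandas.CSVDataSet",
    "filepath": "data/01_raw/example.csv",
    "versioned": True,
}
class_obj, parsed_config = parse_dataset_definition(config)
# class_obj is the CSVDataSet class; parsed_config has "versioned" replaced
# by a Version(load_version, save_version) under the "version" key, and
# "filepath" is forwarded untouched to the constructor.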
Example #11
def run(
    tag,
    env,
    parallel,
    streaming,
    runner,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    load_version,
    pipeline,
    config,
    params,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner.")
    runner = runner or "SequentialRunner"
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner")
    if streaming:
        runner_class = load_obj("src.runner.StreamingRunner")

    tag = _get_values_as_tuple(tag) if tag else tag
    node_names = _get_values_as_tuple(node_names) if node_names else node_names

    context = load_context(Path.cwd(), env=env, extra_params=params)
    context.run(
        tags=tag,
        runner=runner_class(),
        node_names=node_names,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        from_inputs=from_inputs,
        load_versions=load_version,
        pipeline_name=pipeline,
    )
Example #12
def _load_obj(class_path: str) -> Optional[object]:
    try:
        class_obj = load_obj(class_path)
    except ImportError as error:
        if error.name in class_path:
            return None
        # The module exists, but importing it failed because one of its
        # dependencies is missing.
        raise DataSetError("{} for {}".format(error, class_path))
    except (AttributeError, ValueError):
        return None

    return class_obj
Example #13
def run(
    tag,
    env,
    parallel,
    runner,
    is_async,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    to_outputs,
    load_version,
    pipeline,
    config,
    params,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    runner = runner or "SequentialRunner"
    if parallel:
        deprecation_message = (
            "DeprecationWarning: The behaviour of --parallel and -p flags will change. "
            "In Kedro 0.18.0, `-p` will be an alias for `--pipeline` and the "
            "`--parallel` flag will no longer exist. Instead, the parallel runner "
            "should be used by specifying `--runner=ParallelRunner` (or "
            "`-r ParallelRunner`)."
        )
        click.secho(deprecation_message, fg="red")
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner")

    tag = _get_values_as_tuple(tag) if tag else tag
    node_names = _get_values_as_tuple(node_names) if node_names else node_names

    with KedroSession.create(env=env, extra_params=params) as session:
        session.run(
            tags=tag,
            runner=runner_class(is_async=is_async),
            node_names=node_names,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            from_inputs=from_inputs,
            to_outputs=to_outputs,
            load_versions=load_version,
            pipeline_name=pipeline,
        )
Example #14
def main(
    tags: Iterable[str] = None,
    env: str = None,
    runner: str = None,
):
    """Application main entry point.

    Args:
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be added to the ``Pipeline``.
        env: An optional parameter specifying the environment in which
            the ``Pipeline`` should be run. If not specified defaults to "local".
        runner: An optional parameter specifying the runner that you want to run
            the pipeline with.

    Raises:
        KedroCliError: If the resulting ``Pipeline`` is empty.

    """
    # Report project name
    logging.info("** Kedro project {}".format(Path.cwd().name))

    # Load Catalog
    conf = get_config(project_path=str(Path.cwd()), env=env)
    catalog = create_catalog(config=conf)

    # Load the pipeline
    pipeline = create_pipeline()
    pipeline = pipeline.only_nodes_with_tags(*tags) if tags else pipeline
    if not pipeline.nodes:
        if tags:
            raise KedroCliError("Pipeline contains no nodes with tags: " +
                                str(tags))
        raise KedroCliError("Pipeline contains no nodes")

    # Load the runner: resolve the runner name against kedro.runner,
    # falling back to SequentialRunner
    runner = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    # Initialise SparkSession
    spark = init_spark_session()

    # Run the pipeline with the resolved runner
    runner().run(pipeline, catalog)
Example #15
def run(tag, env, parallel, runner, node_names, to_nodes, from_nodes):
    """Run the pipeline."""
    from new_kedro_project.run import main
    from_nodes = [n for n in from_nodes.split(",") if n]
    to_nodes = [n for n in to_nodes.split(",") if n]

    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    main(
        tags=tag,
        env=env,
        runner=runner_class(),
        node_names=node_names,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
    )
Example #16
def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext:
    """Loads the KedroContext object of a Kedro Project as defined in `src/<package-name>/run.py`.
    This function will change the current working directory to the project path.

    Args:
        project_path: Path to the Kedro project.
        kwargs: Optional kwargs for ``ProjectContext`` class in `run.py`.

    Returns:
        Instance of ``KedroContext`` class defined in Kedro project.

    Raises:
        KedroContextError: Either '.kedro.yml' was not found
            or loaded context has package conflict.

    """
    project_path = Path(project_path).expanduser().resolve()
    src_path = str(project_path / "src")

    if src_path not in sys.path:
        sys.path.insert(0, src_path)

    if "PYTHONPATH" not in os.environ:
        os.environ["PYTHONPATH"] = src_path

    kedro_yaml = project_path / ".kedro.yml"
    try:
        with kedro_yaml.open("r") as kedro_yml:
            context_path = yaml.safe_load(kedro_yml)["context_path"]
    except Exception:
        raise KedroContextError(
            "Could not retrieve 'context_path' from '.kedro.yml' in {}. If you have created "
            "your project with Kedro version <0.15.0, make sure to update your project template. "
            "See https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md "
            "for how to migrate your Kedro project.".format(str(project_path)))

    context_class = load_obj(context_path)

    if os.getcwd() != str(project_path):
        logging.getLogger(__name__).warning(
            "Changing the current working directory to %s", str(project_path))
        os.chdir(str(project_path))  # Move to project root

    # Instantiate the context after changing the cwd for logging to be properly configured.
    context = context_class(project_path, **kwargs)
    return context
Example #17
def run(
    tag,
    env,
    parallel,
    runner,
    is_async,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    load_version,
    pipeline,
    config,
    params,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner.")
    runner = runner or "SequentialRunner"
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner")

    tag = _get_values_as_tuple(tag) if tag else tag
    node_names = _get_values_as_tuple(node_names) if node_names else node_names

    package_name = str(Path(__file__).resolve().parent.name)
    with KedroSession.create(package_name, env=env,
                             extra_params=params) as session:
        session.run(
            tags=tag,
            runner=runner_class(is_async=is_async),
            node_names=node_names,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            from_inputs=from_inputs,
            load_versions=load_version,
            pipeline_name=pipeline,
        )

        # Logging parameters for some e2e tests
        params_to_log = session.load_context().params
        logging.info("Parameters: %s", json.dumps(params_to_log,
                                                  sort_keys=True))
Example #18
def run(
    tag,
    env,
    parallel,
    runner,
    is_async,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    load_version,
    pipeline,
    config,
    params,
    run_mode,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    runner = runner or "SequentialRunner"
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner")

    project_hooks.set_mode(run_mode)

    tag = _get_values_as_tuple(tag) if tag else tag
    node_names = _get_values_as_tuple(node_names) if node_names else node_names

    package_name = str(Path(__file__).resolve().parent.name)
    with KedroSession.create(package_name, env=env, extra_params=params) as session:
        session.run(
            tags=tag,
            runner=runner_class(is_async=is_async),
            node_names=node_names,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            from_inputs=from_inputs,
            load_versions=load_version,
            pipeline_name=pipeline,
        )
Example #19
def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext:
    """Loads the KedroContext object of a Kedro Project as defined in `src/<package-name>/run.py`.
    This function will change the current working directory to the project path.

    Args:
        project_path: Path to the Kedro project.
        kwargs: Optional custom arguments defined by users, which will be passed
            to the ``ProjectContext`` class in `run.py`. These need to be passed
            explicitly to the constructor of ``ProjectContext``.

    Returns:
        Instance of KedroContext class defined in Kedro project.

    Raises:
        KedroContextError: Either '.kedro.yml' was not found
            or loaded context has package conflict.

    """
    project_path = Path(project_path).expanduser().resolve()
    if str(project_path) not in sys.path:
        sys.path.append(str(project_path))

    kedro_yaml = project_path / ".kedro.yml"
    try:
        with kedro_yaml.open("r") as kedro_yml:
            context_path = yaml.safe_load(kedro_yml)["context_path"]
    except Exception:
        raise KedroContextError(
            "Could not retrieve 'context_path' from '.kedro.yml' in {}. If you have created "
            "your project with Kedro version <0.15.0, make sure to update your project template. "
            "See https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md "
            "for how to migrate your Kedro project.".format(str(project_path)))

    context_class = load_obj(context_path)
    context = context_class(project_path, **kwargs)

    if os.getcwd() != str(project_path):
        warn("Changing the current working directory to {}".format(
            str(project_path)))
        os.chdir(str(project_path))  # Move to project root
    return context
Example #20
    def _register_hooks(self, auto: bool = False) -> None:
        """Register all hooks as specified in ``hooks`` with the global ``hook_manager``,
        and, optionally, from installed plugins.

        Args:
            auto: An optional flag to enable auto-discovery and registration of plugin hooks.
        """
        hook_manager = get_hook_manager()

        # Enrich with hooks specified in .kedro.yml (or in pyproject.toml
        # if .kedro.yml doesn't exist).
        hooks_locations = self.static_data.get("hooks", [])
        configured_hooks = tuple(load_obj(hook) for hook in hooks_locations)

        all_hooks = self.hooks + configured_hooks
        for hooks_collection in all_hooks:
            # Sometimes users might create more than one context instance, in which case
            # hooks have already been registered, so we perform a simple check here
            # to avoid an error being raised and break user's workflow.
            if not hook_manager.is_registered(hooks_collection):
                hook_manager.register(hooks_collection)

        if auto:
            self._register_hooks_setuptools()
Example #21
    def from_config(cls, config: Dict[str, Any]) -> "BaseSessionStore":
        """Create a session store instance using the configuration provided.

        Args:
            config: Session store config dictionary.

        Raises:
            ValueError: When the function fails to create the session store
                from its config.

        Returns:
            An instance of a ``BaseSessionStore`` subclass.
        """
        config = deepcopy(config)

        class_obj = config.pop("type", BaseSessionStore)
        if isinstance(class_obj, str):
            class_obj = load_obj(class_obj, BaseSessionStore.__module__)

        classpath = f"{class_obj.__module__}.{class_obj.__qualname__}"

        if not issubclass(class_obj, BaseSessionStore):
            raise ValueError(f"Store type `{classpath}` is invalid: "
                             f"it must extend `BaseSessionStore`.")

        try:
            store = class_obj(**config)
        except TypeError as err:
            raise ValueError(
                f"\n{err}.\nStore config must only contain arguments valid "
                f"for the constructor of `{classpath}`.") from err
        except Exception as err:
            raise ValueError(
                f"\n{err}.\nFailed to instantiate session store of type `{classpath}`."
            ) from err
        return store
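A configuration sketch (the store type and constructor arguments are illustrative; `ShelveStore` is assumed to be a `BaseSessionStore` subclass shipped alongside it):

store = BaseSessionStore.from_config({
    "type": "kedro.framework.session.store.ShelveStore",
    "path": "sessions",           # forwarded to the store constructor
    "session_id": "session-001",  # forwarded to the store constructor
})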
Example #22
    def __init__(
        self,
        path: str,
        dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]],
        checkpoint: Union[str, Dict[str, Any]] = None,
        filepath_arg: str = "filepath",
        filename_suffix: str = "",
        credentials: Dict[str, Any] = None,
        load_args: Dict[str, Any] = None,
        fs_args: Dict[str, Any] = None,
    ):
        """Creates a new instance of ``IncrementalDataSet``.

        Args:
            path: Path to the folder containing partitioned data.
                If path starts with the protocol (e.g., ``s3://``) then the
                corresponding ``fsspec`` concrete filesystem implementation will
                be used. If protocol is not specified,
                ``fsspec.implementations.local.LocalFileSystem`` will be used.
                **Note:** Some concrete implementations are bundled with ``fsspec``,
                while others (like ``s3`` or ``gcs``) must be installed separately
                prior to usage of the ``PartitionedDataSet``.
            dataset: Underlying dataset definition. This is used to instantiate
                the dataset for each file located inside the ``path``.
                Accepted formats are:
                a) object of a class that inherits from ``AbstractDataSet``
                b) a string representing a fully qualified class name to such class
                c) a dictionary with ``type`` key pointing to a string from b),
                other keys are passed to the Dataset initializer.
                Credentials for the dataset can be explicitly specified in
                this configuration.
            checkpoint: Optional checkpoint configuration. Accepts a dictionary
                with the corresponding dataset definition including ``filepath``
                (unlike ``dataset`` argument). Checkpoint configuration is
                described here:
                https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#checkpoint-configuration
                Credentials for the checkpoint can be explicitly specified
                in this configuration.
            filepath_arg: Underlying dataset initializer argument that will
                contain a path to each corresponding partition file.
                If unspecified, defaults to "filepath".
            filename_suffix: If specified, only partitions that end with this
                string will be processed.
            credentials: Protocol-specific options that will be passed to
                ``fsspec.filesystem``
                https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem,
                the dataset initializer and the checkpoint. If
                the dataset or the checkpoint configuration contains explicit
                credentials spec, then such spec will take precedence.
                All possible credentials management scenarios are documented here:
                https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials
            load_args: Keyword arguments to be passed into ``find()`` method of
                the filesystem implementation.
            fs_args: Extra arguments to pass into underlying filesystem class constructor
                (e.g. `{"project": "my-project"}` for ``GCSFileSystem``).

        Raises:
            DataSetError: If versioning is enabled for the underlying dataset.
        """

        super().__init__(
            path=path,
            dataset=dataset,
            filepath_arg=filepath_arg,
            filename_suffix=filename_suffix,
            credentials=credentials,
            load_args=load_args,
            fs_args=fs_args,
        )

        self._checkpoint_config = self._parse_checkpoint_config(checkpoint)
        self._force_checkpoint = self._checkpoint_config.pop(
            "force_checkpoint", None)

        comparison_func = self._checkpoint_config.pop("comparison_func",
                                                      operator.gt)
        if isinstance(comparison_func, str):
            comparison_func = load_obj(comparison_func)
        self._comparison_func = comparison_func
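The `comparison_func` hook accepts a dotted path resolved via `load_obj`; a sketch of a custom comparator matching the default `operator.gt` semantics (the module path is hypothetical):

# Referenced from the catalog as e.g.
# `comparison_func: my_project.utils.compare_partitions`
def compare_partitions(partition_id: str, checkpoint: str) -> bool:
    # Process a partition only if it sorts strictly after the checkpoint.
    return partition_id > checkpoint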
Example #23
    def test_load_obj_invalid_module(self):
        with pytest.raises(ImportError,
                           match=r"No module named 'missing_path'"):
            load_obj("InvalidClass", "missing_path")
Example #24
    def test_load_obj_invalid_attribute(self):
        with pytest.raises(AttributeError,
                           match=r"Object `InvalidClass` cannot be loaded"):
            load_obj("InvalidClass", "tests.test_utils")
Example #25
    def test_load_obj_default_path(self):
        extracted_obj = load_obj("DummyClass", "tests.test_utils")
        assert extracted_obj is DummyClass
Example #26
    def test_load_obj(self):
        extracted_obj = load_obj("tests.test_utils.DummyClass")
        assert extracted_obj is DummyClass
Example #27
def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext:
    """Loads the KedroContext object of a Kedro Project.
    This is the default way to load the KedroContext object for normal workflows such as
    CLI, Jupyter Notebook, Plugins, etc. It assumes the following project structure
    under the given project_path::

       <project_path>
           |__ <src_dir>
           |__ .kedro.yml
           |__ kedro_cli.py

    The name of the <src_dir> is `src` by default and configurable in `.kedro.yml`.

    Args:
        project_path: Path to the Kedro project.
        kwargs: Optional kwargs for ``ProjectContext`` class in `run.py`.

    Returns:
        Instance of ``KedroContext`` class defined in Kedro project.

    Raises:
        KedroContextError: Either '.kedro.yml' was not found
            or loaded context has package conflict.

    """
    project_path = Path(project_path).expanduser().resolve()
    kedro_yaml = project_path / ".kedro.yml"

    try:
        with kedro_yaml.open("r") as kedro_yml:
            kedro_yaml_content = yaml.safe_load(kedro_yml)
    except FileNotFoundError:
        raise KedroContextError(
            "Could not find '.kedro.yml' in {}. If you have created your project "
            "with Kedro version <0.15.0, make sure to update your project template. "
            "See https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md "
            "for how to migrate your Kedro project.".format(str(project_path)))
    except Exception:
        raise KedroContextError("Failed to parse '.kedro.yml' file")

    src_prefix = Path(kedro_yaml_content.get("source_dir", "src")).expanduser()
    src_path = (project_path / src_prefix).resolve()
    validate_source_path(src_path, project_path)

    if str(src_path) not in sys.path:
        sys.path.insert(0, str(src_path))

    if "PYTHONPATH" not in os.environ:
        os.environ["PYTHONPATH"] = str(src_path)

    try:
        context_path = kedro_yaml_content["context_path"]
    except (KeyError, TypeError):
        raise KedroContextError(
            "'.kedro.yml' doesn't have a required `context_path` field. "
            "Please refer to the documentation.")

    context_class = load_obj(context_path)

    # Update kwargs with env from the environment variable (defaults to None if not set).
    # This is needed because some CLI commands (e.g. `kedro run`) default to passing in `env=None`.
    kwargs["env"] = kwargs.get("env") or os.getenv("KEDRO_ENV")

    # Instantiate the context with the resolved project path.
    context = context_class(project_path=project_path, **kwargs)
    return context
Example #28
def _load_obj(class_path: str) -> Optional[object]:
    try:
        class_obj = load_obj(class_path)
    except (ImportError, AttributeError, ValueError):
        return None
    return class_obj
Example #29
    def from_config(
        cls: Type,
        name: str,
        config: Dict[str, Any],
        load_version: str = None,
        save_version: str = None,
    ) -> "AbstractDataSet":
        """Create a data set instance using the configuration provided.

        Args:
            name: Data set name.
            config: Data set config dictionary.
            load_version: Version string to be used for ``load`` operation if
                the data set is versioned. Has no effect on the data set
                if versioning was not enabled.
            save_version: Version string to be used for ``save`` operation if
                the data set is versioned. Has no effect on the data set
                if versioning was not enabled.

        Returns:
            An instance of an ``AbstractDataSet`` subclass.

        Raises:
            DataSetError: When the function fails to create the data set
                from its config.

        """
        config = copy.deepcopy(config)
        save_version = save_version or generate_current_version()

        if VERSION_KEY in config:
            # remove "version" key so that it's not passed
            # to the 'unversioned' data set constructor
            message = (
                "`%s` attribute removed from `%s` data set "
                "configuration since it is a reserved word and cannot "
                "be directly specified",
                VERSION_KEY,
                name,
            )
            logging.getLogger(__name__).warning(*message)
            del config[VERSION_KEY]
        if config.pop(VERSIONED_FLAG_KEY, False):  # data set is versioned
            config[VERSION_KEY] = Version(load_version, save_version)

        dataset_class_path = config.pop("type")
        try:
            class_obj = load_obj(dataset_class_path, "kedro.io")
        except ImportError:
            raise DataSetError("Cannot import module when trying to load type "
                               "`{}` for DataSet `{}`.".format(
                                   dataset_class_path, name))
        except AttributeError:
            raise DataSetError("Class `{}` for DataSet `{}` not found.".format(
                dataset_class_path, name))

        if not issubclass(class_obj, AbstractDataSet):
            raise DataSetError("DataSet '{}' type `{}.{}` is invalid: "
                               "all data set types must extend "
                               "`AbstractDataSet`.".format(
                                   name, class_obj.__module__,
                                   class_obj.__qualname__))
        try:
            data_set = class_obj(**config)
        except TypeError as err:
            raise DataSetError("\n{}.\nDataSet '{}' must only contain "
                               "arguments valid for the constructor "
                               "of `{}.{}`.".format(str(err), name,
                                                    class_obj.__module__,
                                                    class_obj.__qualname__))
        except Exception as err:
            raise DataSetError("\n{}.\nFailed to instantiate DataSet "
                               "'{}' of type `{}.{}`.".format(
                                   str(err), name, class_obj.__module__,
                                   class_obj.__qualname__))
        return data_set
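A call sketch mirroring how a data catalog would use this method (`CSVLocalDataSet` shipped under `kedro.io` in the early releases this snippet appears to date from; the entry itself is illustrative):

data_set = AbstractDataSet.from_config(
    "cars",
    {"type": "CSVLocalDataSet", "filepath": "data/01_raw/cars.csv",
     "versioned": True},
)
# Resolves kedro.io.CSVLocalDataSet via load_obj and instantiates it with
# the filepath plus a Version(load_version, save_version).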