def initialize_model(
    args,  # type: Type[EasyDict]
    model_key="uplift_model_params",  # type: str
    default_estimator="sklearn.linear_model.LogisticRegression",  # type: str
):
    # type: (...) -> Type[sklearn.base.BaseEstimator]
    if not isinstance(args[model_key], dict):
        model = args[model_key]
        return model

    model_params = args[model_key].copy()

    if not model_params.get("estimator"):
        model_params["estimator"] = default_estimator
    estimator_str = model_params.pop("estimator")
    estimator_obj = load_obj(estimator_str)

    const_params = (
        (model_params.pop("const_params") or dict())
        if "const_params" in model_params
        else dict()
    )

    if not model_params.get("search_cv"):
        const_params.update(model_params)
        model = estimator_obj(**const_params)
        return model

    search_cv_str = model_params.pop("search_cv")
    search_cv_obj = load_obj(search_cv_str)

    model_params["estimator"] = estimator_obj(**const_params)
    model = search_cv_obj(**model_params)
    return model
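A minimal usage sketch for the config-driven instantiation above, assuming Kedro's ``load_obj`` is importable and scikit-learn is installed; the parameter dictionary is illustrative only, not taken from the source.

# Illustrative parameters: wrap LogisticRegression in GridSearchCV.
# `const_params` go to the estimator constructor; the remaining keys go to the search CV.
params = {
    "uplift_model_params": {
        "estimator": "sklearn.linear_model.LogisticRegression",
        "const_params": {"max_iter": 500},
        "search_cv": "sklearn.model_selection.GridSearchCV",
        "param_grid": {"C": [0.1, 1.0, 10.0]},
        "cv": 3,
    }
}
model = initialize_model(params)  # -> GridSearchCV wrapping LogisticRegression(max_iter=500)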
def _load_obj(class_path: str) -> Optional[object]:
    mod_path, _, class_name = class_path.rpartition(".")
    try:
        available_classes = load_obj(f"{mod_path}.__all__")
    # ModuleNotFoundError: When `load_obj` can't find `mod_path` (e.g `kedro.io.pandas`)
    #     this is because we try a combination of all prefixes.
    # AttributeError: When `load_obj` manages to load `mod_path` but it doesn't have an
    #     `__all__` attribute -- either because it's a custom or a kedro.io dataset
    except (ModuleNotFoundError, AttributeError, ValueError):
        available_classes = None

    try:
        class_obj = load_obj(class_path)
    except (ModuleNotFoundError, ValueError):
        return None
    except AttributeError as exc:
        if available_classes and class_name in available_classes:
            raise DataSetError(
                f"{exc} Please see the documentation on how to "
                f"install relevant dependencies for {class_path}:\n"
                f"https://kedro.readthedocs.io/en/stable/"
                f"04_kedro_project_setup/01_dependencies.html"
            ) from exc
        return None

    return class_obj
def evaluate_model(
    regressor: LinearRegression,
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    metrics: List[str],
):
    """Calculate the requested metrics on the train and test sets.

    Args:
        regressor: Trained model.
        X_train: Training data of independent features.
        y_train: Training data for price.
        X_test: Testing data of independent features.
        y_test: Testing data for price.
        metrics: Fully qualified names of metric functions, resolved via ``load_obj``
            (e.g. "sklearn.metrics.r2_score").

    Returns:
        A dataframe indexed by metric name with `Train` and `Test` columns.
    """
    y_train_pred = regressor.predict(X_train)
    y_test_pred = regressor.predict(X_test)

    results = []
    for metric in metrics:
        results.append(
            {
                "Metric": metric.rpartition(".")[2],
                "Train": load_obj(metric)(y_train, y_train_pred),
                "Test": load_obj(metric)(y_test, y_test_pred),
            }
        )
    return pd.DataFrame(results).set_index("Metric")
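A runnable usage sketch, assuming ``evaluate_model`` and ``load_obj`` are importable and scikit-learn is installed; the synthetic data and metric choices are illustrative.

import numpy as np
from sklearn.linear_model import LinearRegression

# Synthetic regression data, purely for illustration.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)
X_train, X_test, y_train, y_test = X[:80], X[80:], y[:80], y[80:]

model = LinearRegression().fit(X_train, y_train)
print(
    evaluate_model(
        model,
        X_train,
        y_train,
        X_test,
        y_test,
        metrics=["sklearn.metrics.r2_score", "sklearn.metrics.mean_absolute_error"],
    )
)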
def _load_callables(func, default_module):
    func = func or (
        lambda *args, **kwargs: (
            args[0] if args else list(kwargs.values())[0] if kwargs else None
        )
    )
    funcs = func if isinstance(func, list) else [func]

    for f in funcs:
        if isinstance(f, str):
            f_list = f.rsplit(".", 1)
            obj = f_list[-1]
            module = f_list[0] if len(f_list) == 2 else None
            assert module or default_module, (
                "The module to which '{}' belongs is unknown. ".format(obj)
                + "Specify the module (e.g. foo.bar) using the name format"
                " (e.g. 'foo.bar.{}') ".format(obj)
                + "or default_module argument."
            )
        else:
            assert callable(f), "{} should be callable or str.".format(f)

    funcs = [
        f
        if callable(f)
        else load_obj(f, default_obj_path=default_module)
        if isinstance(f, str)
        else None
        for f in funcs
    ]
    return funcs
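A small usage sketch of the normalisation above, assuming ``load_obj`` is importable; the NumPy functions are just examples of dotted and bare names.

# A mixed list of a dotted path and a plain callable becomes a list of callables.
mean_func, abs_func = _load_callables(["numpy.mean", abs], default_module=None)
assert mean_func([1, 2, 3]) == 2.0 and abs_func(-5) == 5

# A bare name is resolved against `default_module`.
(median_func,) = _load_callables("median", default_module="numpy")
assert median_func([1, 2, 3]) == 2.0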
def load_package_context(
    project_path: Path, package_name: str, **kwargs
) -> KedroContext:
    """Loads the KedroContext object of a Kedro project package,
    as output by `kedro package` and installed via `pip`.
    This function is only intended to be used in a project's `run.py`.
    If you are looking to load KedroContext object for any other workflow,
    you might want to use ``load_context`` instead.

    Args:
        project_path: Path to the Kedro project, i.e. where `conf/` resides.
        package_name: Name of the installed Kedro project package.
        kwargs: Optional kwargs for ``ProjectContext`` class in `run.py`.

    Returns:
        Instance of ``KedroContext`` class defined in Kedro project.

    Raises:
        KedroContextError: Either '.kedro.yml' was not found or loaded context
            has package conflict.

    """
    context_path = f"{package_name}.run.ProjectContext"
    try:
        context_class = load_obj(context_path)
    except ModuleNotFoundError:
        raise KedroContextError(
            f"Cannot load context object from {context_path} for package {package_name}."
        )

    # update kwargs with env from the environment variable (defaults to None if not set)
    # need to do this because some CLI command (e.g `kedro run`) defaults to passing
    # in `env=None`
    kwargs["env"] = kwargs.get("env") or os.getenv("KEDRO_ENV")

    # Instantiate the context after changing the cwd for logging to be properly configured.
    context = context_class(project_path=project_path, **kwargs)
    return context
def run(
    from_inputs: Iterable[str],
    to_outputs: Iterable[str],
    from_nodes: Iterable[str],
    to_nodes: Iterable[str],
    node_names: Iterable[str],
    tag: Iterable[str],
    pipeline: str,
    runner: str,
    is_async: bool,
    env: str,
    params: Dict[str, Any],
    config: click.Path,  # pylint: disable=unused-argument
) -> None:
    """Run the pipeline."""
    runner_class = load_obj(obj_path=runner, default_obj_path="kedro.runner")

    tag = _get_values_as_tuple(values=tag) if tag else tag
    node_names = _get_values_as_tuple(values=node_names) if node_names else node_names

    package_name = str(Path(__file__).resolve().parent.name)
    with KedroSession.create(
        package_name=package_name, env=env, extra_params=params
    ) as session:
        session.run(
            tags=tag,
            runner=runner_class(is_async=is_async),
            node_names=node_names,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            from_inputs=from_inputs,
            to_outputs=to_outputs,
            pipeline_name=pipeline,
        )
def run(
    tag,
    env,
    parallel,
    runner,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    load_version,
    pipeline,
    config,
    params,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    context = load_context(Path.cwd(), env=env, extra_params=params)
    context.run(
        tags=tag,
        runner=runner_class(),
        node_names=node_names,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        from_inputs=from_inputs,
        load_versions=load_version,
        pipeline_name=pipeline,
    )
def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext: """Loads the KedroContext object of a Kedro Project based on the path specified in `.kedro.yml`. This function will change the current working directory to the project path. Args: project_path: Path to the Kedro project. kwargs: Optional kwargs for ``ProjectContext`` class in `run.py`. Returns: Instance of ``KedroContext`` class defined in Kedro project. Raises: KedroContextError: Either '.kedro.yml' was not found or loaded context has package conflict. """ project_path = Path(project_path).expanduser().resolve() src_path = str(project_path / "src") if src_path not in sys.path: sys.path.insert(0, src_path) if "PYTHONPATH" not in os.environ: os.environ["PYTHONPATH"] = src_path kedro_yaml = project_path / ".kedro.yml" try: with kedro_yaml.open("r") as kedro_yml: kedro_yaml_content = yaml.safe_load(kedro_yml) except FileNotFoundError: raise KedroContextError( "Could not find '.kedro.yml' in {}. If you have created your project " "with Kedro version <0.15.0, make sure to update your project template. " "See https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md " "for how to migrate your Kedro project.".format(str(project_path)) ) except Exception: raise KedroContextError("Failed to parse '.kedro.yml' file") try: context_path = kedro_yaml_content["context_path"] except (KeyError, TypeError): raise KedroContextError( "'.kedro.yml' doesn't have a required `context_path` field. " "Please refer to the documentation." ) context_class = load_obj(context_path) if os.getcwd() != str(project_path): logging.getLogger(__name__).warning( "Changing the current working directory to %s", str(project_path) ) os.chdir(str(project_path)) # Move to project root # Instantiate the context after changing the cwd for logging to be properly configured. context = context_class(project_path, **kwargs) return context
def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext: """Loads the KedroContext object of a Kedro Project. This is the default way to load the KedroContext object for normal workflows such as CLI, Jupyter Notebook, Plugins, etc. It assumes the following project structure under the given project_path:: <project_path> |__ <src_dir> |__ .kedro.yml |__ kedro_cli.py |__ pyproject.toml The name of the <scr_dir> is `src` by default. The `.kedro.yml` or `pyproject.toml` can be used for configuration. If `.kedro.yml` exists, it will be used otherwise, `pyproject.toml` will be treated as the configuration file (Kedro configuration should be under `[tool.kedro]` section). Args: project_path: Path to the Kedro project. kwargs: Optional kwargs for ``ProjectContext`` class in `run.py`. Returns: Instance of ``KedroContext`` class defined in Kedro project. Raises: KedroContextError: Neither '.kedro.yml' nor `pyproject.toml` was found or `[tool.kedro]` section is missing in `pyproject.toml`, or loaded context has package conflict. """ project_path = Path(project_path).expanduser().resolve() static_data = get_static_project_data(project_path) source_dir = static_data["source_dir"] validate_source_path(source_dir, project_path) if "context_path" not in static_data: conf_file = static_data["config_file"].name raise KedroContextError( f"'{conf_file}' doesn't have a required `context_path` field. " f"Please refer to the documentation." ) if str(source_dir) not in sys.path: sys.path.insert(0, str(source_dir)) if "PYTHONPATH" not in os.environ: os.environ["PYTHONPATH"] = str(source_dir) context_class = load_obj(static_data["context_path"]) # update kwargs with env from the environment variable # (defaults to None if not set) # need to do this because some CLI command (e.g `kedro run`) defaults to # passing in `env=None` kwargs["env"] = kwargs.get("env") or os.getenv("KEDRO_ENV") context = context_class(project_path=project_path, **kwargs) return context
def parse_dataset_definition(
    config: Dict[str, Any], load_version: str = None, save_version: str = None
) -> Tuple[Type[AbstractDataSet], Dict]:
    """Parse and instantiate a dataset class using the configuration provided.

    Args:
        config: Data set config dictionary. It *must* contain the `type` key
            with fully qualified class name.
        load_version: Version string to be used for ``load`` operation if
            the data set is versioned. Has no effect on the data set
            if versioning was not enabled.
        save_version: Version string to be used for ``save`` operation if
            the data set is versioned. Has no effect on the data set
            if versioning was not enabled.

    Raises:
        DataSetError: If the function fails to parse the configuration provided.

    Returns:
        2-tuple: (Dataset class object, configuration dictionary)
    """
    save_version = save_version or generate_timestamp()
    config = copy.deepcopy(config)

    if "type" not in config:
        raise DataSetError("`type` is missing from DataSet catalog configuration")

    class_obj = config.pop("type")
    if isinstance(class_obj, str):
        try:
            class_obj = load_obj(class_obj, "kedro.io")
        except ImportError:
            raise DataSetError(
                "Cannot import module when trying to load type `{}`.".format(class_obj)
            )
        except AttributeError:
            raise DataSetError("Class `{}` not found.".format(class_obj))

    if not issubclass(class_obj, AbstractDataSet):
        raise DataSetError(
            "DataSet type `{}.{}` is invalid: all data set types must extend "
            "`AbstractDataSet`.".format(class_obj.__module__, class_obj.__qualname__)
        )

    if VERSION_KEY in config:
        # remove "version" key so that it's not passed
        # to the "unversioned" data set constructor
        message = (
            "`%s` attribute removed from data set configuration since it is a "
            "reserved word and cannot be directly specified"
        )
        logging.getLogger(__name__).warning(message, VERSION_KEY)
        del config[VERSION_KEY]

    if config.pop(VERSIONED_FLAG_KEY, False):  # data set is versioned
        config[VERSION_KEY] = Version(load_version, save_version)

    return class_obj, config
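A usage sketch, assuming a Kedro version where ``kedro.extras.datasets.pandas.CSVDataSet`` is available; the catalog-style dictionary and file path are illustrative.

# `versioned` is popped and replaced by a `Version` object before the class is returned.
config = {
    "type": "kedro.extras.datasets.pandas.CSVDataSet",
    "filepath": "data/01_raw/example.csv",
    "versioned": True,
}
dataset_class, parsed_config = parse_dataset_definition(config, load_version=None)
data_set = dataset_class(**parsed_config)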
def run(
    tag,
    env,
    parallel,
    streaming,
    runner,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    load_version,
    pipeline,
    config,
    params,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    runner = runner or "SequentialRunner"
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner")

    if streaming:
        runner_class = load_obj("src.runner.StreamingRunner")

    tag = _get_values_as_tuple(tag) if tag else tag
    node_names = _get_values_as_tuple(node_names) if node_names else node_names

    context = load_context(Path.cwd(), env=env, extra_params=params)
    context.run(
        tags=tag,
        runner=runner_class(),
        node_names=node_names,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        from_inputs=from_inputs,
        load_versions=load_version,
        pipeline_name=pipeline,
    )
def _load_obj(class_path: str) -> Optional[object]:
    try:
        class_obj = load_obj(class_path)
    except ImportError as error:
        if error.name in class_path:
            return None
        # class_obj was successfully loaded, but some dependencies are missing.
        raise DataSetError("{} for {}".format(error, class_path))
    except (AttributeError, ValueError):
        return None
    return class_obj
def run(
    tag,
    env,
    parallel,
    runner,
    is_async,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    to_outputs,
    load_version,
    pipeline,
    config,
    params,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    runner = runner or "SequentialRunner"
    if parallel:
        deprecation_message = (
            "DeprecationWarning: The behaviour of --parallel and -p flags will change. "
            "In Kedro 0.18.0, `-p` will be an alias for `--pipeline` and the "
            "`--parallel` flag will no longer exist. Instead, the parallel runner "
            "should be used by specifying `--runner=ParallelRunner` (or "
            "`-r ParallelRunner`)."
        )
        click.secho(deprecation_message, fg="red")
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner")

    tag = _get_values_as_tuple(tag) if tag else tag
    node_names = _get_values_as_tuple(node_names) if node_names else node_names

    with KedroSession.create(env=env, extra_params=params) as session:
        session.run(
            tags=tag,
            runner=runner_class(is_async=is_async),
            node_names=node_names,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            from_inputs=from_inputs,
            to_outputs=to_outputs,
            load_versions=load_version,
            pipeline_name=pipeline,
        )
def main(
    tags: Iterable[str] = None,
    env: str = None,
    runner: str = None,
):
    """Application main entry point.

    Args:
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be added to the ``Pipeline``.
        env: An optional parameter specifying the environment in which the
            ``Pipeline`` should be run. If not specified defaults to "local".
        runner: An optional parameter specifying the runner that you want to run
            the pipeline with.

    Raises:
        KedroCliError: If the resulting ``Pipeline`` is empty.

    """
    # Report project name
    logging.info("** Kedro project {}".format(Path.cwd().name))

    # Load Catalog
    conf = get_config(project_path=str(Path.cwd()), env=env)
    catalog = create_catalog(config=conf)

    # Load the pipeline
    pipeline = create_pipeline()
    pipeline = pipeline.only_nodes_with_tags(*tags) if tags else pipeline
    if not pipeline.nodes:
        if tags:
            raise KedroCliError("Pipeline contains no nodes with tags: " + str(tags))
        raise KedroCliError("Pipeline contains no nodes")

    # Load the runner
    # When either --parallel or --runner is used, class_obj is assigned to runner
    runner = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    # Initialise SparkSession
    spark = init_spark_session()

    # Run the runner
    # runner().run(pipeline, catalog)

    # Run the pipeline
    # io.add_feed_dict({'parameters': parameters}, replace=True)
    SequentialRunner().run(pipeline, catalog)
def run(tag, env, parallel, runner, node_names, to_nodes, from_nodes):
    """Run the pipeline."""
    from new_kedro_project.run import main

    from_nodes = [n for n in from_nodes.split(",") if n]
    to_nodes = [n for n in to_nodes.split(",") if n]

    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    main(
        tags=tag,
        env=env,
        runner=runner_class(),
        node_names=node_names,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
    )
def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext: """Loads the KedroContext object of a Kedro Project as defined in `src/<package-name>/run.py`. This function will change the current working directory to the project path. Args: project_path: Path to the Kedro project. kwargs: Optional kwargs for ``ProjectContext`` class in `run.py`. Returns: Instance of ``KedroContext`` class defined in Kedro project. Raises: KedroContextError: Either '.kedro.yml' was not found or loaded context has package conflict. """ project_path = Path(project_path).expanduser().resolve() src_path = str(project_path / "src") if src_path not in sys.path: sys.path.insert(0, src_path) if "PYTHONPATH" not in os.environ: os.environ["PYTHONPATH"] = src_path kedro_yaml = project_path / ".kedro.yml" try: with kedro_yaml.open("r") as kedro_yml: context_path = yaml.safe_load(kedro_yml)["context_path"] except Exception: raise KedroContextError( "Could not retrive 'context_path' from '.kedro.yml' in {}. If you have created " "your project with Kedro version <0.15.0, make sure to update your project template. " "See https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md " "for how to migrate your Kedro project.".format(str(project_path))) context_class = load_obj(context_path) if os.getcwd() != str(project_path): logging.getLogger(__name__).warning( "Changing the current working directory to %s", str(project_path)) os.chdir(str(project_path)) # Move to project root # Instantiate the context after changing the cwd for logging to be properly configured. context = context_class(project_path, **kwargs) return context
def run( tag, env, parallel, runner, is_async, node_names, to_nodes, from_nodes, from_inputs, load_version, pipeline, config, params, ): """Run the pipeline.""" if parallel and runner: raise KedroCliError( "Both --parallel and --runner options cannot be used together. " "Please use either --parallel or --runner.") runner = runner or "SequentialRunner" if parallel: runner = "ParallelRunner" runner_class = load_obj(runner, "kedro.runner") tag = _get_values_as_tuple(tag) if tag else tag node_names = _get_values_as_tuple(node_names) if node_names else node_names package_name = str(Path(__file__).resolve().parent.name) with KedroSession.create(package_name, env=env, extra_params=params) as session: session.run( tags=tag, runner=runner_class(is_async=is_async), node_names=node_names, from_nodes=from_nodes, to_nodes=to_nodes, from_inputs=from_inputs, load_versions=load_version, pipeline_name=pipeline, ) # Logging parameters for some e2e tests params_to_log = session.load_context().params logging.info("Parameters: %s", json.dumps(params_to_log, sort_keys=True))
def run(
    tag,
    env,
    parallel,
    runner,
    is_async,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    load_version,
    pipeline,
    config,
    params,
    run_mode,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    runner = runner or "SequentialRunner"
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner")

    project_hooks.set_mode(run_mode)

    tag = _get_values_as_tuple(tag) if tag else tag
    node_names = _get_values_as_tuple(node_names) if node_names else node_names

    package_name = str(Path(__file__).resolve().parent.name)
    with KedroSession.create(package_name, env=env, extra_params=params) as session:
        session.run(
            tags=tag,
            runner=runner_class(is_async=is_async),
            node_names=node_names,
            from_nodes=from_nodes,
            to_nodes=to_nodes,
            from_inputs=from_inputs,
            load_versions=load_version,
            pipeline_name=pipeline,
        )
def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext: """Loads the KedroContext object of a Kedro Project as defined in `src/<package-name>/run.py`. This function will change the current working directory to the project path. Args: project_path: Path to the Kedro project. kwargs: Optional custom arguments defined by users, which will be passed to ProjectContext class in `run.py`. kwargs will need to be passed explicitly to the constructor of ProjectContext. Returns: Instance of KedroContext class defined in Kedro project. Raises: KedroContextError: Either '.kedro.yml' was not found or loaded context has package conflict. """ project_path = Path(project_path).expanduser().resolve() if str(project_path) not in sys.path: sys.path.append(str(project_path)) kedro_yaml = project_path / ".kedro.yml" try: with kedro_yaml.open("r") as kedro_yml: context_path = yaml.safe_load(kedro_yml)["context_path"] except Exception: raise KedroContextError( "Could not retrive 'context_path' from '.kedro.yml' in {}. If you have created " "your project with Kedro version <0.15.0, make sure to update your project template. " "See https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md " "for how to migrate your Kedro project.".format(str(project_path))) context_class = load_obj(context_path) context = context_class(project_path, **kwargs) if os.getcwd() != str(project_path): warn("Changing the current working directory to {}".format( str(project_path))) os.chdir(str(project_path)) # Move to project root return context
def _register_hooks(self, auto: bool = False) -> None:
    """Register all hooks as specified in ``hooks`` with the global ``hook_manager``,
    and, optionally, from installed plugins.

    Args:
        auto: An optional flag to enable auto-discovery and registration of plugin hooks.
    """
    hook_manager = get_hook_manager()

    # enrich with hooks specified in .kedro.yml or pyproject.toml if .kedro.yml doesn't exist
    hooks_locations = self.static_data.get("hooks", [])
    configured_hooks = tuple(load_obj(hook) for hook in hooks_locations)

    all_hooks = self.hooks + configured_hooks
    for hooks_collection in all_hooks:
        # Sometimes users might create more than one context instance, in which case
        # hooks have already been registered, so we perform a simple check here
        # to avoid an error being raised and breaking the user's workflow.
        if not hook_manager.is_registered(hooks_collection):
            hook_manager.register(hooks_collection)

    if auto:
        self._register_hooks_setuptools()
def from_config(cls, config: Dict[str, Any]) -> "BaseSessionStore": """Create a session store instance using the configuration provided. Args: config: Session store config dictionary. Raises: ValueError: When the function fails to create the session store from its config. Returns: An instance of an ``BaseSessionStore`` subclass. """ config = deepcopy(config) class_obj = config.pop("type", BaseSessionStore) if isinstance(class_obj, str): class_obj = load_obj(class_obj, BaseSessionStore.__module__) classpath = f"{class_obj.__module__}.{class_obj.__qualname__}" if not issubclass(class_obj, BaseSessionStore): raise ValueError(f"Store type `{classpath}` is invalid: " f"it must extend `BaseSessionStore`.") try: store = class_obj(**config) except TypeError as err: raise ValueError( f"\n{err}.\nStore config must only contain arguments valid " f"for the constructor of `{classpath}`.") from err except Exception as err: raise ValueError( f"\n{err}.\nFailed to instantiate session store of type `{classpath}`." ) from err return store
def __init__(
    self,
    path: str,
    dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]],
    checkpoint: Union[str, Dict[str, Any]] = None,
    filepath_arg: str = "filepath",
    filename_suffix: str = "",
    credentials: Dict[str, Any] = None,
    load_args: Dict[str, Any] = None,
    fs_args: Dict[str, Any] = None,
):
    """Creates a new instance of ``IncrementalDataSet``.

    Args:
        path: Path to the folder containing partitioned data. If path starts with
            the protocol (e.g., ``s3://``) then the corresponding ``fsspec`` concrete
            filesystem implementation will be used. If protocol is not specified,
            ``fsspec.implementations.local.LocalFileSystem`` will be used.
            **Note:** Some concrete implementations are bundled with ``fsspec``,
            while others (like ``s3`` or ``gcs``) must be installed separately
            prior to usage of the ``PartitionedDataSet``.
        dataset: Underlying dataset definition. This is used to instantiate
            the dataset for each file located inside the ``path``.
            Accepted formats are:
            a) object of a class that inherits from ``AbstractDataSet``
            b) a string representing a fully qualified class name to such class
            c) a dictionary with ``type`` key pointing to a string from b),
            other keys are passed to the Dataset initializer.
            Credentials for the dataset can be explicitly specified in
            this configuration.
        checkpoint: Optional checkpoint configuration. Accepts a dictionary
            with the corresponding dataset definition including ``filepath``
            (unlike ``dataset`` argument). Checkpoint configuration is
            described here:
            https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#checkpoint-configuration
            Credentials for the checkpoint can be explicitly specified
            in this configuration.
        filepath_arg: Underlying dataset initializer argument that will
            contain a path to each corresponding partition file.
            If unspecified, defaults to "filepath".
        filename_suffix: If specified, only partitions that end with this
            string will be processed.
        credentials: Protocol-specific options that will be passed to
            ``fsspec.filesystem``
            https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem,
            the dataset initializer and the checkpoint. If the dataset or
            the checkpoint configuration contains explicit credentials spec,
            then such spec will take precedence.
            All possible credentials management scenarios are documented here:
            https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials
        load_args: Keyword arguments to be passed into ``find()`` method of
            the filesystem implementation.
        fs_args: Extra arguments to pass into underlying filesystem class constructor
            (e.g. `{"project": "my-project"}` for ``GCSFileSystem``).

    Raises:
        DataSetError: If versioning is enabled for the underlying dataset.
    """
    super().__init__(
        path=path,
        dataset=dataset,
        filepath_arg=filepath_arg,
        filename_suffix=filename_suffix,
        credentials=credentials,
        load_args=load_args,
        fs_args=fs_args,
    )

    self._checkpoint_config = self._parse_checkpoint_config(checkpoint)
    self._force_checkpoint = self._checkpoint_config.pop("force_checkpoint", None)

    comparison_func = self._checkpoint_config.pop("comparison_func", operator.gt)
    if isinstance(comparison_func, str):
        comparison_func = load_obj(comparison_func)
    self._comparison_func = comparison_func
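A construction sketch showing the ``comparison_func`` string being resolved through ``load_obj``. The dataset type and paths are illustrative, and the short ``pandas.CSVDataSet`` form assumes a Kedro version that resolves such prefixes for partitioned datasets.

data_set = IncrementalDataSet(
    path="data/01_raw/partitions",
    dataset="pandas.CSVDataSet",
    # A dotted path is accepted and turned into a callable via load_obj;
    # the default comparison function is operator.gt.
    checkpoint={"comparison_func": "operator.ge"},
)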
def test_load_obj_invalid_module(self):
    with pytest.raises(ImportError, match=r"No module named 'missing_path'"):
        load_obj("InvalidClass", "missing_path")
def test_load_obj_invalid_attribute(self):
    with pytest.raises(
        AttributeError, match=r"Object `InvalidClass` cannot be loaded"
    ):
        load_obj("InvalidClass", "tests.test_utils")
def test_load_obj_default_path(self):
    extracted_obj = load_obj("DummyClass", "tests.test_utils")
    assert extracted_obj is DummyClass
def test_load_obj(self):
    extracted_obj = load_obj("tests.test_utils.DummyClass")
    assert extracted_obj is DummyClass
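The four tests above pin down the contract of ``load_obj``: a dotted object path, an optional default module path, an ``ImportError`` for a missing module, and an ``AttributeError`` for a missing attribute. A minimal sketch consistent with that contract (an assumption, not necessarily the library's actual implementation):

import importlib
from typing import Any


def load_obj(obj_path: str, default_obj_path: str = "") -> Any:
    """Extract an object from a dotted path, e.g. ``tests.test_utils.DummyClass``."""
    obj_path_list = obj_path.rsplit(".", 1)
    # If no module is given in the path, fall back to the default module path.
    obj_path = obj_path_list.pop(0) if len(obj_path_list) > 1 else default_obj_path
    obj_name = obj_path_list[0]
    module_obj = importlib.import_module(obj_path)
    if not hasattr(module_obj, obj_name):
        raise AttributeError(
            f"Object `{obj_name}` cannot be loaded from `{obj_path}`."
        )
    return getattr(module_obj, obj_name)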
def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext: """Loads the KedroContext object of a Kedro Project. This is the default way to load the KedroContext object for normal workflows such as CLI, Jupyter Notebook, Plugins, etc. It assumes the following project structure under the given project_path:: <project_path> |__ <src_dir> |__ .kedro.yml |__ kedro_cli.py The name of the <scr_dir> is `src` by default and configurable in `.kedro.yml`. Args: project_path: Path to the Kedro project. kwargs: Optional kwargs for ``ProjectContext`` class in `run.py`. Returns: Instance of ``KedroContext`` class defined in Kedro project. Raises: KedroContextError: Either '.kedro.yml' was not found or loaded context has package conflict. """ project_path = Path(project_path).expanduser().resolve() kedro_yaml = project_path / ".kedro.yml" try: with kedro_yaml.open("r") as kedro_yml: kedro_yaml_content = yaml.safe_load(kedro_yml) except FileNotFoundError: raise KedroContextError( "Could not find '.kedro.yml' in {}. If you have created your project " "with Kedro version <0.15.0, make sure to update your project template. " "See https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md " "for how to migrate your Kedro project.".format(str(project_path))) except Exception: raise KedroContextError("Failed to parse '.kedro.yml' file") src_prefix = Path(kedro_yaml_content.get("source_dir", "src")).expanduser() src_path = (project_path / src_prefix).resolve() validate_source_path(src_path, project_path) if str(src_path) not in sys.path: sys.path.insert(0, str(src_path)) if "PYTHONPATH" not in os.environ: os.environ["PYTHONPATH"] = str(src_path) try: context_path = kedro_yaml_content["context_path"] except (KeyError, TypeError): raise KedroContextError( "'.kedro.yml' doesn't have a required `context_path` field. " "Please refer to the documentation.") context_class = load_obj(context_path) # update kwargs with env from the environment variable (defaults to None if not set) # need to do this because some CLI command (e.g `kedro run`) defaults to passing in `env=None` kwargs["env"] = kwargs.get("env") or os.getenv("KEDRO_ENV") # Instantiate the context after changing the cwd for logging to be properly configured. context = context_class(project_path=project_path, **kwargs) return context
def _load_obj(class_path: str) -> Optional[object]:
    try:
        class_obj = load_obj(class_path)
    except (ImportError, AttributeError, ValueError):
        return None
    return class_obj
def from_config(
    cls: Type,
    name: str,
    config: Dict[str, Any],
    load_version: str = None,
    save_version: str = None,
) -> "AbstractDataSet":
    """Create a data set instance using the configuration provided.

    Args:
        name: Data set name.
        config: Data set config dictionary.
        load_version: Version string to be used for ``load`` operation if
            the data set is versioned. Has no effect on the data set
            if versioning was not enabled.
        save_version: Version string to be used for ``save`` operation if
            the data set is versioned. Has no effect on the data set
            if versioning was not enabled.

    Returns:
        An instance of an ``AbstractDataSet`` subclass.

    Raises:
        DataSetError: When the function fails to create the data set
            from its config.

    """
    config = copy.deepcopy(config)
    save_version = save_version or generate_current_version()

    if VERSION_KEY in config:
        # remove "version" key so that it's not passed
        # to the 'unversioned' data set constructor
        message = (
            "`%s` attribute removed from `%s` data set "
            "configuration since it is a reserved word and cannot "
            "be directly specified",
            VERSION_KEY,
            name,
        )
        logging.getLogger(__name__).warning(*message)
        del config[VERSION_KEY]

    if config.pop(VERSIONED_FLAG_KEY, False):  # data set is versioned
        config[VERSION_KEY] = Version(load_version, save_version)

    dataset_class_path = config.pop("type")
    try:
        class_obj = load_obj(dataset_class_path, "kedro.io")
    except ImportError:
        raise DataSetError(
            "Cannot import module when trying to load type "
            "`{}` for DataSet `{}`.".format(dataset_class_path, name)
        )
    except AttributeError:
        raise DataSetError(
            "Class `{}` for DataSet `{}` not found.".format(dataset_class_path, name)
        )

    if not issubclass(class_obj, AbstractDataSet):
        raise DataSetError(
            "DataSet '{}' type `{}.{}` is invalid: all data set types must "
            "extend `AbstractDataSet`.".format(
                name, class_obj.__module__, class_obj.__qualname__
            )
        )

    try:
        data_set = class_obj(**config)
    except TypeError as err:
        raise DataSetError(
            "\n{}.\nDataSet '{}' must only contain arguments valid for the "
            "constructor of `{}.{}`.".format(
                str(err), name, class_obj.__module__, class_obj.__qualname__
            )
        )
    except Exception as err:
        raise DataSetError(
            "\n{}.\nFailed to instantiate DataSet '{}' of type `{}.{}`.".format(
                str(err), name, class_obj.__module__, class_obj.__qualname__
            )
        )
    return data_set