def lint(files):
    """Run isort, autoflake, mypy and vulture, plus (on Python >=3.6) black."""
    # pylint: disable=unused-import
    if not files:
        files = ("src/tests", "src/hintech")
    try:
        import isort
        import autoflake
        import mypy
        import vulture
    except ImportError as exc:
        raise KedroCliError(NO_DEPENDENCY_MESSAGE.format(exc.name))

    python_call("isort", ("-q", "-y", "-rc", "-sl") + files)
    python_call(
        "autoflake",
        (
            "--remove-all-unused-imports",
            "--recursive",
            "--remove-unused-variables",
            "--in-place",
            "--exclude=__init__.py",
        )
        + files,
    )

    if sys.version_info[:2] >= (3, 6):
        try:
            import black
        except ImportError:
            raise KedroCliError(NO_DEPENDENCY_MESSAGE.format("black"))
        python_call("black", ("-l", "120") + files)
        # Re-sort imports with black-compatible settings after reformatting.
        python_call(
            "isort",
            ("-q", "-y", "-ca", "-rc", "-tc", "-up", "-fgw=0", "-m=3", "-w=120")
            + files,
        )

    python_call("mypy", ("src/hintech",))
    python_call("vulture", ("--min-confidence=70",) + files)
def _check_config_ok(config_path: str, config: Dict[str, Any]) -> Dict[str, Any]:
    """Check that the configuration file contains all needed variables.

    Args:
        config_path: The path of the config file.
        config: The config as a dictionary.

    Returns:
        Config dictionary.

    Raises:
        KedroCliError: If the config file is empty or does not contain all
            keys from template/cookiecutter.json and output_dir.

    """
    if config is None:
        _show_example_config()
        raise KedroCliError(config_path + " is empty")

    required_in_config = _get_default_config().keys()

    for var in required_in_config:
        if var not in config:
            click.echo("\n" + config_path + ":")
            click.echo(yaml.dump(config, default_flow_style=False))
            _show_example_config()
            raise KedroCliError("[" + var + "] not found in " + config_path)

    config["output_dir"] = _fix_user_path(config["output_dir"])
    _assert_output_dir_ok(config["output_dir"])
    _assert_repo_name_ok(config["repo_name"])
    _assert_pkg_name_ok(config["python_package"])
    _assert_include_example_ok(config["include_example"])
    return config
def _check_config_ok(config_path: str, config: Dict[str, Any]) -> Dict[str, Any]:
    """Check that the configuration file contains all needed variables.

    Args:
        config_path: The path of the config file.
        config: The config as a dictionary.

    Returns:
        Config dictionary.

    Raises:
        KedroCliError: If the config file is empty or does not contain all
            keys from template/cookiecutter.json and output_dir.

    """
    if config is None:
        _show_example_config()
        raise KedroCliError(config_path + " is empty")

    missing_keys = _get_default_config().keys() - config.keys()
    if missing_keys:
        click.echo(f"\n{config_path}:")
        click.echo(yaml.dump(config, default_flow_style=False))
        _show_example_config()

        missing_keys_str = ", ".join(str(k) for k in missing_keys)
        raise KedroCliError(f"[{missing_keys_str}] not found in {config_path}")

    config["output_dir"] = _fix_user_path(config["output_dir"])
    _assert_output_dir_ok(config["output_dir"])
    _assert_repo_name_ok(config["repo_name"])
    _assert_pkg_name_ok(config["python_package"])
    _assert_include_example_ok(config["include_example"])
    return config
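# The variant above leans on the fact that dict key views behave as sets,
# so all missing keys are reported at once instead of one per run. A minimal,
# self-contained sketch of that idiom (the key names below are illustrative,
# not the real template keys):
default = {"output_dir": ".", "repo_name": "", "python_package": ""}
config = {"output_dir": "."}

missing = default.keys() - config.keys()  # view difference returns a set
print(sorted(missing))  # ['python_package', 'repo_name']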
def activate_nbstripout():
    """Install the nbstripout git hook to automatically clean notebooks."""
    secho(
        (
            "Notebook output cells will be automatically cleared before committing"
            " to git."
        ),
        fg="yellow",
    )

    try:
        import nbstripout  # pylint: disable=unused-import
    except ImportError:
        raise KedroCliError(NO_NBSTRIPOUT_MESSAGE)

    try:
        res = subprocess.run(
            ["git", "rev-parse", "--git-dir"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if res.returncode:
            raise KedroCliError("Not a git repository. Run `git init` first.")
    except FileNotFoundError:
        raise KedroCliError("Git executable not found. Install Git first.")

    call(["nbstripout", "--install"])
def docker_group():
    """Dockerize your Kedro project."""
    # Check that Docker is installed and the daemon is running.
    try:
        res = subprocess.run(
            ["docker", "version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        ).returncode
    except FileNotFoundError:
        raise KedroCliError(NO_DOCKER_MESSAGE)

    if res:
        raise KedroCliError(NO_DOCKER_MESSAGE)
def main(
    tags: Iterable[str] = None,
    env: str = None,
    runner: str = None,
):
    """Application main entry point.

    Args:
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be added to the ``Pipeline``.
        env: An optional parameter specifying the environment in which
            the ``Pipeline`` should be run. If not specified defaults to "local".
        runner: An optional parameter specifying the runner that you want to run
            the pipeline with.

    Raises:
        KedroCliError: If the resulting ``Pipeline`` is empty.

    """
    # Report project name
    logging.info("** Kedro project {}".format(Path.cwd().name))

    # Load catalog
    conf = get_config(project_path=str(Path.cwd()), env=env)
    catalog = create_catalog(config=conf)

    # Load the pipeline
    pipeline = create_pipeline()
    pipeline = pipeline.only_nodes_with_tags(*tags) if tags else pipeline
    if not pipeline.nodes:
        if tags:
            raise KedroCliError("Pipeline contains no nodes with tags: " + str(tags))
        raise KedroCliError("Pipeline contains no nodes")

    # Load the runner: when --parallel or --runner is used, the requested
    # class is loaded; otherwise fall back to SequentialRunner.
    runner = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    # Initialise SparkSession
    spark = init_spark_session()

    # Run the pipeline with the selected runner
    runner().run(pipeline, catalog)
def _assert_include_example_ok(include_example):
    if not isinstance(include_example, bool):
        message = (
            "`{}` value for `include_example` is invalid. It must be a boolean "
            "value True or False.".format(include_example)
        )
        raise KedroCliError(message)
def _port_callback(ctx, param, value):  # pylint: disable=unused-argument
    if is_port_in_use(value):
        raise KedroCliError(
            "Port {} is already in use on the host. "
            "Please specify an alternative port number.".format(value)
        )
    return value
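# `is_port_in_use` is referenced above but not defined in this section.
# A hedged sketch of a common implementation (an assumption, not necessarily
# the project's actual helper): attempt a TCP connect and treat success as
# "port in use".
import socket


def is_port_in_use(port: int, host: str = "127.0.0.1") -> bool:
    """Return True if a TCP connection to host:port succeeds."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        return sock.connect_ex((host, port)) == 0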
def run(
    tag,
    env,
    parallel,
    runner,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    load_version,
    pipeline,
    config,
    params,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    context = load_context(Path.cwd(), env=env, extra_params=params)
    context.run(
        tags=tag,
        runner=runner_class(),
        node_names=node_names,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        from_inputs=from_inputs,
        load_versions=load_version,
        pipeline_name=pipeline,
    )
def convert_notebook(all_flag, overwrite_flag, filepath):
    """Convert selected or all notebooks found in a Kedro project
    to Kedro code, by exporting code from the appropriately-tagged cells:
    Cells tagged as `node` will be copied over to a Python file matching
    the name of the notebook, under `src/<package_name>/nodes`.
    *Note*: Make sure your notebooks have unique names!

    FILEPATH: Path(s) to exact notebook file(s) to be converted. Both
    relative and absolute paths are accepted.
    Should not be provided if --all flag is already present.
    """
    context = load_context(Path.cwd())

    if not filepath and not all_flag:
        secho(
            "Please specify a notebook filepath "
            "or add '--all' to convert all notebooks."
        )
        sys.exit(1)

    kedro_project_path = context.project_path
    kedro_package_name = "kedro_demo_feb2020"

    if all_flag:
        # pathlib glob does not ignore hidden directories,
        # whereas Python glob does, which is more useful in
        # ensuring checkpoints will not be included
        pattern = kedro_project_path / "**" / "*.ipynb"
        notebooks = sorted(Path(p) for p in iglob(str(pattern), recursive=True))
    else:
        notebooks = [Path(f) for f in filepath]

    counter = Counter(n.stem for n in notebooks)
    non_unique_names = [name for name, counts in counter.items() if counts > 1]
    if non_unique_names:
        raise KedroCliError(
            "Found non-unique notebook names! "
            "Please rename the following: {}".format(", ".join(non_unique_names))
        )

    for notebook in notebooks:
        secho("Converting notebook '{}'...".format(str(notebook)))
        output_path = (
            kedro_project_path
            / "src"
            / kedro_package_name
            / "nodes"
            / "{}.py".format(notebook.stem)
        )

        if output_path.is_file():
            overwrite = overwrite_flag or click.confirm(
                "Output file {} already exists. Overwrite?".format(str(output_path)),
                default=False,
            )
            if overwrite:
                export_nodes(notebook, output_path)
        else:
            export_nodes(notebook, output_path)

    secho("Done!")
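# `convert_notebook` only exports cells carrying the `node` tag. A hedged
# sketch of adding that tag programmatically with nbformat; the notebook
# path and cell index are hypothetical.
import nbformat

nb = nbformat.read("analysis.ipynb", as_version=4)
nb.cells[0].metadata.setdefault("tags", []).append("node")  # tag first cell
nbformat.write(nb, "analysis.ipynb")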
def test(args):
    """Run the test suite."""
    try:
        import pytest  # pylint: disable=unused-import
    except ImportError:
        raise KedroCliError(NO_DEPENDENCY_MESSAGE.format("pytest"))
    else:
        python_call("pytest", args)
def test(args):
    """Run the test suite."""
    try:
        import pytest  # pylint: disable=unused-import
    except ImportError:
        raise KedroCliError(NO_PYTEST_MESSAGE)
    else:
        python_call("pytest", args)
def _load_from_file(load_file: str) -> dict:
    global data  # pylint: disable=global-statement,invalid-name
    data = json.loads(Path(load_file).read_text())
    for key in ["nodes", "edges", "tags"]:
        if key not in data:
            raise KedroCliError(
                "Invalid file, top level key '{}' not found.".format(key)
            )
    return data
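# A minimal sketch of the JSON shape `_load_from_file` accepts: all three
# top-level keys must be present, even if empty. The file name is
# illustrative.
import json
from pathlib import Path

Path("pipeline.json").write_text(
    json.dumps({"nodes": [], "edges": [], "tags": []})
)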
def _assert_repo_name_ok(repo_name):
    if not re.match(r"^\w+(-*\w+)*$", repo_name):
        message = (
            "`{}` is not a valid repository name. It must contain "
            "only word symbols and/or hyphens, and must start and "
            "end with an alphanumeric symbol.".format(repo_name)
        )
        raise KedroCliError(message)
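# A quick replay of what the repository-name pattern above accepts and
# rejects (the names are illustrative):
import re

for name in ["my-repo", "repo_1", "-repo", "repo-"]:
    print(name, bool(re.match(r"^\w+(-*\w+)*$", name)))
# my-repo True, repo_1 True, -repo False, repo- False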
def viz(host, port, browser, load_file, save_file, pipeline, env):
    """Visualize the pipeline using Kedro-Viz."""
    try:
        _call_viz(host, port, browser, load_file, save_file, pipeline, env)
    except KedroCliError:
        raise
    except Exception as ex:
        raise KedroCliError(str(ex))
def _pytest_module(script_filename, args):
    # TODO: improve this function
    logging.info(f'Testing Python script "{script_filename}" with pytest...')
    rtn = pytest.main([script_filename, *args])
    if rtn != 0:
        raise KedroCliError(
            f"Python script pytest returned non-zero exit code: {rtn}"
        )
    logging.info("Testing done.")
def _get_pipeline_from_context(context, pipeline_name):
    if match(kedro.__version__, ">=0.15.2"):
        return context._get_pipeline(  # pylint: disable=protected-access
            name=pipeline_name
        )
    # Kedro 0.15.0 or 0.15.1
    if pipeline_name:
        raise KedroCliError(ERROR_PIPELINE_FLAG_NOT_SUPPORTED)
    return context.pipeline
def run(tag, env, parallel, runner):
    """Run the pipeline."""
    from predictive_maintenance.run import main

    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    if parallel:
        runner = "ParallelRunner"
    main(tags=tag, env=env, runner=runner)
def _get_pipeline_catalog_from_kedro14(env):
    try:
        pipeline = get_project_context("create_pipeline")()

        get_config = get_project_context("get_config")
        conf = get_config(str(Path.cwd()), env)

        create_catalog = get_project_context("create_catalog")
        catalog = create_catalog(config=conf)

        return pipeline, catalog
    except (ImportError, KeyError):
        raise KedroCliError(ERROR_PROJECT_ROOT)
def run(tag, env, parallel, runner):
    """Run the pipeline."""
    from {{cookiecutter.python_package}}.run import main

    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    if parallel:
        runner = "ParallelRunner"
    main(tags=tag, env=env, runner=runner)
def lint(files):
    """Run flake8, isort and (on Python >=3.6) black."""
    # pylint: disable=unused-import
    if not files:
        files = ("src/tests", "src/kedro_code_forensics")
    try:
        import flake8
        import isort
    except ImportError as exc:
        raise KedroCliError(NO_DEPENDENCY_MESSAGE.format(exc.name))

    python_call("isort", ("-rc", "-tc", "-up", "-fgw=0", "-m=3", "-w=88") + files)

    if sys.version_info[:2] >= (3, 6):
        try:
            import black
        except ImportError:
            raise KedroCliError(NO_DEPENDENCY_MESSAGE.format("black"))
        python_call("black", files)

    python_call("flake8", ("--max-line-length=88",) + files)
def _assert_pkg_name_ok(pkg_name: str):
    """Check that python package name is in line with PEP8 requirements.

    Args:
        pkg_name: Candidate Python package name.

    Raises:
        KedroCliError: If package name violates the requirements.
    """
    base_message = "`{}` is not a valid Python package name.".format(pkg_name)
    if not re.match(r"^[a-zA-Z_]", pkg_name):
        message = base_message + " It must start with a letter or underscore."
        raise KedroCliError(message)
    if len(pkg_name) < 2:
        message = base_message + " It must be at least 2 characters long."
        raise KedroCliError(message)
    if not re.match(r"^\w+$", pkg_name[1:]):
        message = (
            base_message + " It must contain only letters, digits, and/or underscores."
        )
        raise KedroCliError(message)
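# The three package-name rules above, replayed on illustrative candidates:
import re

for name in ["my_package", "_pkg", "1pkg", "a", "my-package"]:
    ok = (
        bool(re.match(r"^[a-zA-Z_]", name))  # starts with letter/underscore
        and len(name) >= 2  # at least 2 characters long
        and bool(re.match(r"^\w+$", name[1:]))  # rest is word characters only
    )
    print(name, "valid" if ok else "invalid")
# my_package and _pkg are valid; 1pkg, a and my-package are not.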
def _call_viz(
    host=None,
    port=None,
    browser=None,
    load_file=None,
    save_file=None,
    pipeline_name=None,
    env=None,
):
    global data  # pylint: disable=global-statement,invalid-name

    if load_file:
        data = _load_from_file(load_file)
    else:
        if match(kedro.__version__, ">=0.15.0"):
            from kedro.context import KedroContextError

            try:
                context = get_project_context("context", env=env)
                pipeline = _get_pipeline_from_context(context, pipeline_name)
            except KedroContextError:
                raise KedroCliError(ERROR_PROJECT_ROOT)
            catalog = context.catalog
        else:  # Kedro 0.14.*
            if pipeline_name:
                raise KedroCliError(ERROR_PIPELINE_FLAG_NOT_SUPPORTED)
            pipeline, catalog = _get_pipeline_catalog_from_kedro14(env)

        data = format_pipeline_data(pipeline, catalog)

    if save_file:
        Path(save_file).write_text(json.dumps(data, indent=4, sort_keys=True))
    else:
        if browser:
            webbrowser.open_new("http://127.0.0.1:{:d}/".format(port))
        app.run(host=host, port=port)
def _handle_exception(msg, end=True):
    """Pretty print the current exception then exit."""
    if _KEDRO_CONTEXT["verbose"]:
        click.secho(traceback.format_exc(), nl=False, fg="yellow")
    else:
        etype, value, _ = sys.exc_info()
        # format_exception_only returns a list of strings; join it directly
        # (unpacking it with * would pass multiple arguments to join).
        click.secho(
            "".join(traceback.format_exception_only(etype, value))
            + "Run with --verbose to see the full exception",
            fg="yellow",
        )
    if end:
        raise KedroCliError(msg)
    click.secho("Error: " + msg, fg="red")  # pragma: no cover
def _assert_output_dir_ok(output_dir: str):
    """Check that output directory exists.

    Args:
        output_dir: Output directory path.

    Raises:
        KedroCliError: If the output directory does not exist.
    """
    if not os.path.exists(output_dir):
        message = (
            "`{}` is not a valid output directory. "
            "It must be a relative or absolute path "
            "to an existing directory.".format(output_dir)
        )
        raise KedroCliError(message)
def run(tag, env, parallel, runner, node_names, to_nodes, from_nodes):
    """Run the pipeline."""
    from new_kedro_project.run import main

    from_nodes = [n for n in from_nodes.split(",") if n]
    to_nodes = [n for n in to_nodes.split(",") if n]

    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner") if runner else SequentialRunner
    main(
        tags=tag,
        env=env,
        runner=runner_class(),
        node_names=node_names,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
    )
def check_docker_image_exists(image: str):
    """
    Check that the specified Docker image exists locally.

    Args:
        image: Docker image name.

    Raises:
        KedroCliError: If specified Docker image was not found.

    """
    command = ["docker", "images", "-q", image]
    res = subprocess.run(command, stdout=PIPE, stderr=DEVNULL, check=False)
    if not res.stdout:
        cmd = "kedro docker build --image {0}".format(image)
        raise KedroCliError(
            "Unable to find image `{0}` locally. Please build it first by running:\n"
            "{1}".format(image, cmd)
        )
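# `docker images -q <image>` prints the image id, or nothing when the image
# is absent; the check above keys off that empty stdout. A standalone
# demonstration (the image name is illustrative):
import subprocess

res = subprocess.run(
    ["docker", "images", "-q", "alpine:latest"],
    stdout=subprocess.PIPE,
    stderr=subprocess.DEVNULL,
    check=False,
)
print("exists locally:", bool(res.stdout.strip()))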
def list_datasets(pipeline, env):
    """Show datasets per type."""
    title = "DataSets in '{}' pipeline"
    not_mentioned = "Datasets not mentioned in pipeline"
    mentioned = "Datasets mentioned in pipeline"

    context = load_context(Path.cwd(), env=env)
    datasets_meta = context.catalog._data_sets  # pylint: disable=protected-access
    catalog_ds = set(context.catalog.list())

    pipelines = pipeline or context.pipelines.keys()

    result = {}
    for pipeline in pipelines:
        pl_obj = context.pipelines.get(pipeline)
        if pl_obj:
            pipeline_ds = pl_obj.data_sets()
        else:
            existing_pls = ", ".join(sorted(context.pipelines.keys()))
            raise KedroCliError(
                "{} pipeline not found! Existing pipelines: {}".format(
                    pipeline, existing_pls
                )
            )

        unused_ds = catalog_ds - pipeline_ds
        default_ds = pipeline_ds - catalog_ds
        used_ds = catalog_ds - unused_ds

        unused_by_type = _map_type_to_datasets(unused_ds, datasets_meta)
        used_by_type = _map_type_to_datasets(used_ds, datasets_meta)

        if default_ds:
            used_by_type["DefaultDataSet"].extend(default_ds)

        data = (
            (not_mentioned, dict(unused_by_type)),
            (mentioned, dict(used_by_type)),
        )
        result[title.format(pipeline)] = {key: value for key, value in data if value}

    secho(yaml.dump(result))
def lint(files, check_only):
    """Run flake8, isort and black."""
    files = files or (
        str(SOURCE_PATH / "tests"),
        str(SOURCE_PATH / KEDRO_PACKAGE_NAME),
    )
    try:
        import flake8
        import isort
        import black
    except ImportError as exc:
        raise KedroCliError(
            NO_DEPENDENCY_MESSAGE.format(module=exc.name, src=str(SOURCE_PATH))
        )

    python_call("black", ("--check",) + files if check_only else files)
    python_call("flake8", ("--max-line-length=88",) + files)

    check_flag = ("-c",) if check_only else ()
    python_call(
        "isort", (*check_flag, "-rc", "-tc", "-up", "-fgw=0", "-m=3", "-w=88") + files
    )
def compose_docker_run_args(
    host_root: str = None,
    container_root: str = None,
    mount_volumes: Sequence[str] = None,
    required_args: Sequence[Tuple[str, Union[str, None]]] = None,
    optional_args: Sequence[Tuple[str, Union[str, None]]] = None,
    user_args: Sequence[str] = None,
) -> List[str]:
    """
    Make a list of arguments for the docker command.

    Args:
        host_root: Path to the project root on the host. It must be
            provided if `mount_volumes` are specified, optional otherwise.
        container_root: Path to the project root in the container
            (e.g. `/home/kedro/<repo_name>`). It must be provided
            if `mount_volumes` are specified, optional otherwise.
        mount_volumes: List of volumes to be mounted.
        required_args: List of required arguments.
        optional_args: List of optional arguments; these will be added
            only if not already present in the `user_args` list.
        user_args: List of arguments already specified by the user.

    Raises:
        KedroCliError: If `mount_volumes` are provided but either
            `host_root` or `container_root` is missing.

    Returns:
        List of arguments for the docker command.

    """
    mount_volumes = mount_volumes or []
    required_args = required_args or []
    optional_args = optional_args or []
    user_args = user_args or []
    split_user_args = {ua.split("=", 1)[0] for ua in user_args}

    def _add_args(name_: str, value_: str = None, force_: bool = False) -> List[str]:
        """
        Add extra args to the existing list of CLI args.

        Args:
            name_: Arg name to add.
            value_: Arg value to add, skipped if None.
            force_: Add the argument even if it's present in the current
                list of args.

        Returns:
            List containing the new arg and (optionally) its value,
            or an empty list if nothing is to be added.

        """
        if not force_ and name_ in split_user_args:
            return []
        return [name_] if value_ is None else [name_, value_]

    if mount_volumes:
        if not (host_root and container_root):
            raise KedroCliError(
                "Both `host_root` and `container_root` must be specified "
                "in `compose_docker_run_args` call if `mount_volumes` "
                "are provided."
            )
        vol_gen = _list_docker_volumes(host_root, container_root, mount_volumes)
        combined_args = list(chain.from_iterable(vol_gen))
    else:
        combined_args = []

    for arg_name, arg_value in required_args:
        combined_args += _add_args(arg_name, arg_value, True)
    for arg_name, arg_value in optional_args:
        combined_args += _add_args(arg_name, arg_value)
    return combined_args + user_args
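# A self-contained sketch of the optional-argument merging rule above:
# an optional arg is dropped when the user already supplied it, matching on
# the part before "=". The values here are illustrative.
user_args = ["--memory=2g"]
split_user_args = {ua.split("=", 1)[0] for ua in user_args}

combined = []
for name, value in [("--memory", "1g"), ("--cpus", "2")]:
    if name in split_user_args:
        continue  # keep the user's value instead of the default
    combined += [name] if value is None else [name, value]

print(combined + user_args)  # ['--cpus', '2', '--memory=2g']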