Example #1
def lint(files):
    """Run flake8, isort and (on Python >=3.6) black."""
    # pylint: disable=unused-import
    if not files:
        files = ("src/tests", "src/hintech")

    try:
        import isort
        import autoflake
        import mypy
        import vulture
    except ImportError as exc:
        raise KedroCliError(NO_DEPENDENCY_MESSAGE.format(exc.name))

    python_call("isort", ("-q", "-y", "-rc", "-sl") + files)
    python_call(
        "autoflake",
        ("--remove-all-unused-imports", "--recursive",
         "--remove-unused-variables", "--in-place", "--exclude=__init__.py") +
        files)

    if sys.version_info[:2] >= (3, 6):
        try:
            import black
        except ImportError:
            raise KedroCliError(NO_DEPENDENCY_MESSAGE.format("black"))
        python_call("black", ("-l 120", ) + files)

    python_call(
        "isort",
        ("-q", "-y", "-ca", "-rc", "-tc", "-up", "-fgw=0", "-m=3", "-w=120") +
        files)
    python_call("mypy", ("src/hintech", ))
    python_call("vulture", ("--min-confidence=70", ) + files)
Example #2
def _check_config_ok(config_path: str, config: Dict[str, Any]) -> Dict[str, Any]:
    """Check that the configuration file contains all needed variables.

    Args:
        config_path: The path of the config file.
        config: The config as a dictionary.

    Returns:
        Config dictionary.

    Raises:
        KedroCliError: If the config file is empty or does not contain all
            keys from template/cookiecutter.json and output_dir.

    """
    if config is None:
        _show_example_config()
        raise KedroCliError(config_path + " is empty")

    required_in_config = _get_default_config().keys()

    for var in required_in_config:
        if var not in config:
            click.echo("\n" + config_path + ":")
            click.echo(yaml.dump(config, default_flow_style=False))
            _show_example_config()

            raise KedroCliError("[" + var + "] not found in " + config_path)

    config["output_dir"] = _fix_user_path(config["output_dir"])
    _assert_output_dir_ok(config["output_dir"])
    _assert_repo_name_ok(config["repo_name"])
    _assert_pkg_name_ok(config["python_package"])
    _assert_include_example_ok(config["include_example"])
    return config
Example #3
def _check_config_ok(config_path: str, config: Dict[str, Any]) -> Dict[str, Any]:
    """Check that the configuration file contains all needed variables.

    Args:
        config_path: The path of the config file.
        config: The config as a dictionary.

    Returns:
        Config dictionary.

    Raises:
        KedroCliError: If the config file is empty or does not contain all
            keys from template/cookiecutter.json and output_dir.

    """
    if config is None:
        _show_example_config()
        raise KedroCliError(config_path + " is empty")

    missing_keys = _get_default_config().keys() - config.keys()

    if missing_keys:
        click.echo(f"\n{config_path}:")
        click.echo(yaml.dump(config, default_flow_style=False))
        _show_example_config()

        missing_keys_str = ", ".join(str(k) for k in missing_keys)
        raise KedroCliError(f"[{missing_keys_str}] not found in {config_path}")

    config["output_dir"] = _fix_user_path(config["output_dir"])
    _assert_output_dir_ok(config["output_dir"])
    _assert_repo_name_ok(config["repo_name"])
    _assert_pkg_name_ok(config["python_package"])
    _assert_include_example_ok(config["include_example"])
    return config
Example #4
def activate_nbstripout():
    """Install the nbstripout git hook to automatically clean notebooks."""
    secho(
        ("Notebook output cells will be automatically cleared before committing"
         " to git."),
        fg="yellow",
    )

    try:
        import nbstripout  # pylint: disable=unused-import
    except ImportError:
        raise KedroCliError(NO_NBSTRIPOUT_MESSAGE)

    try:
        res = subprocess.run(
            ["git", "rev-parse", "--git-dir"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if res.returncode:
            raise KedroCliError("Not a git repository. Run `git init` first.")
    except FileNotFoundError:
        raise KedroCliError("Git executable not found. Install Git first.")

    call(["nbstripout", "--install"])
Example #5
def docker_group():
    """Dockerize your Kedro project."""
    # check that docker is running
    try:
        res = subprocess.run(
            ["docker", "version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        ).returncode
    except FileNotFoundError:
        raise KedroCliError(NO_DOCKER_MESSAGE)
    if res:
        raise KedroCliError(NO_DOCKER_MESSAGE)
Example #6
def main(
    tags: Iterable[str] = None,
    env: str = None,
    runner: str = None,
):
    """Application main entry point.

    Args:
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the nodes
            containing *any* of these tags will be added to the ``Pipeline``.
        env: An optional parameter specifying the environment in which
            the ``Pipeline`` should be run. If not specified defaults to "local".
        runner: An optional parameter specifying the runner that you want to run
            the pipeline with.

    Raises:
        KedroCliError: If the resulting ``Pipeline`` is empty.

    """
    # Report project name
    logging.info("** Kedro project {}".format(Path.cwd().name))

    # Load Catalog
    conf = get_config(project_path=str(Path.cwd()), env=env)
    catalog = create_catalog(config=conf)

    # Load the pipeline
    pipeline = create_pipeline()
    pipeline = pipeline.only_nodes_with_tags(*tags) if tags else pipeline
    if not pipeline.nodes:
        if tags:
            raise KedroCliError("Pipeline contains no nodes with tags: " +
                                str(tags))
        raise KedroCliError("Pipeline contains no nodes")

    # Load the runner: resolve --runner to a runner class, defaulting to
    # SequentialRunner when none is given
    runner = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    # Initialise SparkSession
    spark = init_spark_session()

    # Run the pipeline with the selected runner
    runner().run(pipeline, catalog)
Example #7
def _assert_include_example_ok(include_example):
    if not isinstance(include_example, bool):
        message = (
            "`{}` value for `include_example` is invalid. It must be a boolean value "
            "True or False.".format(include_example)
        )
        raise KedroCliError(message)
Example #8
def _port_callback(ctx, param, value):  # pylint: disable=unused-argument
    if is_port_in_use(value):
        raise KedroCliError(
            "Port {} is already in use on the host. "
            "Please specify an alternative port number.".format(value)
        )
    return value
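Click passes `(ctx, param, value)` to an option callback and uses its return value as the option's final value, so `_port_callback` can be attached directly to a port option. A hypothetical wiring (option name, default and help text are assumptions, not from the source):

import click


@click.command()
@click.option(
    "--port",
    default=4141,
    type=int,
    callback=_port_callback,  # validates the port before the command body runs
    help="TCP port that the server will listen on.",
)
def viz(port):
    ...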
Example #9
def run(
    tag,
    env,
    parallel,
    runner,
    node_names,
    to_nodes,
    from_nodes,
    from_inputs,
    load_version,
    pipeline,
    config,
    params,
):
    """Run the pipeline."""
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner.")
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner,
                            "kedro.runner") if runner else SequentialRunner

    context = load_context(Path.cwd(), env=env, extra_params=params)
    context.run(
        tags=tag,
        runner=runner_class(),
        node_names=node_names,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        from_inputs=from_inputs,
        load_versions=load_version,
        pipeline_name=pipeline,
    )
Example #10
def convert_notebook(all_flag, overwrite_flag, filepath):
    """Convert selected or all notebooks found in a Kedro project
    to Kedro code, by exporting code from the appropriately-tagged cells:
    Cells tagged as `node` will be copied over to a Python file matching
    the name of the notebook, under `src/<package_name>/nodes`.
    *Note*: Make sure your notebooks have unique names!
    FILEPATH: Path(s) to exact notebook file(s) to be converted. Both
    relative and absolute paths are accepted.
    Should not be provided if --all flag is already present.
    """
    context = load_context(Path.cwd())

    if not filepath and not all_flag:
        secho(
            "Please specify a notebook filepath "
            "or add '--all' to convert all notebooks."
        )
        sys.exit(1)

    kedro_project_path = context.project_path
    kedro_package_name = "kedro_demo_feb2020"

    if all_flag:
        # pathlib glob does not ignore hidden directories,
        # whereas Python glob does, which is more useful in
        # ensuring checkpoints will not be included
        pattern = kedro_project_path / "**" / "*.ipynb"
        notebooks = sorted(Path(p) for p in iglob(str(pattern), recursive=True))
    else:
        notebooks = [Path(f) for f in filepath]

    counter = Counter(n.stem for n in notebooks)
    non_unique_names = [name for name, counts in counter.items() if counts > 1]
    if non_unique_names:
        raise KedroCliError(
            "Found non-unique notebook names! "
            "Please rename the following: {}".format(", ".join(non_unique_names))
        )

    for notebook in notebooks:
        secho("Converting notebook '{}'...".format(str(notebook)))
        output_path = (
            kedro_project_path
            / "src"
            / kedro_package_name
            / "nodes"
            / "{}.py".format(notebook.stem)
        )

        if output_path.is_file():
            overwrite = overwrite_flag or click.confirm(
                "Output file {} already exists. Overwrite?".format(str(output_path)),
                default=False,
            )
            if overwrite:
                export_nodes(notebook, output_path)
        else:
            export_nodes(notebook, output_path)

    secho("Done!")
Example #11
def test(args):
    """Run the test suite."""
    try:
        import pytest  # pylint: disable=unused-import
    except ImportError:
        raise KedroCliError(NO_DEPENDENCY_MESSAGE.format("pytest"))
    else:
        python_call("pytest", args)
Example #12
def test(args):
    """Run the test suite."""
    try:
        import pytest  # pylint: disable=unused-import
    except ImportError:
        raise KedroCliError(NO_PYTEST_MESSAGE)
    else:
        python_call("pytest", args)
Example #13
def _load_from_file(load_file: str) -> dict:
    global data  # pylint: disable=global-statement,invalid-name
    data = json.loads(Path(load_file).read_text())
    for key in ["nodes", "edges", "tags"]:
        if key not in data:
            raise KedroCliError(
                "Invalid file, top level key '{}' not found.".format(key))
    return data
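Since only the three top-level keys are validated, a minimal file accepted by `_load_from_file` looks like this (the file name is illustrative):

import json
from pathlib import Path

minimal = {"nodes": [], "edges": [], "tags": []}
Path("pipeline.json").write_text(json.dumps(minimal))

data = _load_from_file("pipeline.json")  # passes the top-level key check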
Example #14
def _assert_repo_name_ok(repo_name):
    if not re.match(r"^\w+(-*\w+)*$", repo_name):
        message = (
            "`{}` is not a valid repository name. It must contain "
            "only word symbols and/or hyphens, must also start and "
            "end with alphanumeric symbol.".format(repo_name)
        )
        raise KedroCliError(message)
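The pattern allows hyphen-separated runs of word characters but rejects leading or trailing hyphens. A few illustrative checks against the same regex:

import re

PATTERN = r"^\w+(-*\w+)*$"

assert re.match(PATTERN, "my-repo")      # hyphen between word chunks: ok
assert re.match(PATTERN, "repo_1")       # underscores and digits: ok
assert not re.match(PATTERN, "-repo")    # leading hyphen: rejected
assert not re.match(PATTERN, "repo-")    # trailing hyphen: rejected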
Example #15
def viz(host, port, browser, load_file, save_file, pipeline, env):
    """Visualize the pipeline using kedroviz."""
    try:
        _call_viz(host, port, browser, load_file, save_file, pipeline, env)
    except KedroCliError:
        raise
    except Exception as ex:
        raise KedroCliError(str(ex))
Example #16
def _pytest_module(script_filename, args):
    # TODO: improve this function
    logging.info(f'Testing Python script "{script_filename}" with pytest...')
    rtn = pytest.main([script_filename, *args])
    if rtn != 0:
        raise KedroCliError(
            f'Python script pytest returned non-zero exit code: {rtn}')
    logging.info('Testing done.')
Example #17
def _get_pipeline_from_context(context, pipeline_name):
    if match(kedro.__version__, ">=0.15.2"):
        return context._get_pipeline(  # pylint: disable=protected-access
            name=pipeline_name)
    # Kedro 0.15.0 or 0.15.1
    if pipeline_name:
        raise KedroCliError(ERROR_PIPELINE_FLAG_NOT_SUPPORTED)
    return context.pipeline
Example #18
def run(tag, env, parallel, runner):
    """Run the pipeline."""
    from predictive_maintenance.run import main
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner.")
    if parallel:
        runner = "ParallelRunner"
    main(tags=tag, env=env, runner=runner)
Example #19
def _get_pipeline_catalog_from_kedro14(env):
    try:
        pipeline = get_project_context("create_pipeline")()
        get_config = get_project_context("get_config")
        conf = get_config(str(Path.cwd()), env)
        create_catalog = get_project_context("create_catalog")
        catalog = create_catalog(config=conf)
        return pipeline, catalog
    except (ImportError, KeyError):
        raise KedroCliError(ERROR_PROJECT_ROOT)
Example #20
def run(tag, env, parallel, runner):
    """Run the pipeline."""
    from {{cookiecutter.python_package}}.run import main
    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    if parallel:
        runner = "ParallelRunner"
    main(tags=tag, env=env, runner=runner)
Example #21
def lint(files):
    """Run flake8, isort and (on Python >=3.6) black."""
    # pylint: disable=unused-import
    if not files:
        files = ("src/tests", "src/kedro_code_forensics")

    try:
        import flake8
        import isort
    except ImportError as exc:
        raise KedroCliError(NO_DEPENDENCY_MESSAGE.format(exc.name))

    python_call("isort",
                ("-rc", "-tc", "-up", "-fgw=0", "-m=3", "-w=88") + files)
    if sys.version_info[:2] >= (3, 6):
        try:
            import black
        except ImportError:
            raise KedroCliError(NO_DEPENDENCY_MESSAGE.format("black"))
        python_call("black", files)
    python_call("flake8", ("--max-line-length=88", ) + files)
Example #22
def _assert_pkg_name_ok(pkg_name: str):
    """Check that python package name is in line with PEP8 requirements.

    Args:
        pkg_name: Candidate Python package name.

    Raises:
        KedroCliError: If package name violates the requirements.
    """

    base_message = "`{}` is not a valid Python package name.".format(pkg_name)
    if not re.match(r"^[a-zA-Z_]", pkg_name):
        message = base_message + " It must start with a letter or underscore."
        raise KedroCliError(message)
    if len(pkg_name) < 2:
        message = base_message + " It must be at least 2 characters long."
        raise KedroCliError(message)
    if not re.match(r"^\w+$", pkg_name[1:]):
        message = (base_message + " It must contain only letters, "
                   "digits, and/or underscores.")
        raise KedroCliError(message)
Example #23
def _call_viz(
    host=None,
    port=None,
    browser=None,
    load_file=None,
    save_file=None,
    pipeline_name=None,
    env=None,
):
    global data  # pylint: disable=global-statement,invalid-name

    if load_file:
        data = _load_from_file(load_file)
    else:
        if match(kedro.__version__, ">=0.15.0"):
            from kedro.context import KedroContextError

            try:
                context = get_project_context("context", env=env)
                pipeline = _get_pipeline_from_context(context, pipeline_name)
            except KedroContextError:
                raise KedroCliError(ERROR_PROJECT_ROOT)
            catalog = context.catalog

        else:
            # Kedro 0.14.*
            if pipeline_name:
                raise KedroCliError(ERROR_PIPELINE_FLAG_NOT_SUPPORTED)
            pipeline, catalog = _get_pipeline_catalog_from_kedro14(env)

        data = format_pipeline_data(pipeline, catalog)

    if save_file:
        Path(save_file).write_text(json.dumps(data, indent=4, sort_keys=True))
    else:
        if browser:
            webbrowser.open_new("http://127.0.0.1:{:d}/".format(port))
        app.run(host=host, port=port)
Example #24
def _handle_exception(msg, end=True):
    """Pretty print the current exception then exit."""
    if _KEDRO_CONTEXT["verbose"]:
        click.secho(traceback.format_exc(), nl=False, fg="yellow")
    else:
        etype, value, _ = sys.exc_info()
        click.secho(
            "".join(*traceback.format_exception_only(etype, value)) +
            "Run with --verbose to see the full exception",
            fg="yellow",
        )
    if end:
        raise KedroCliError(msg)
    click.secho("Error: " + msg, fg="red")  # pragma: no cover
Example #25
def _assert_output_dir_ok(output_dir: str):
    """Check that output directory exists.

    Args:
        output_dir: Output directory path.

    Raises:
        KedroCliError: If the output directory does not exist.

    """
    if not os.path.exists(output_dir):
        message = ("`{}` is not a valid output directory. "
                   "It must be a relative or absolute path "
                   "to an existing directory.".format(output_dir))
        raise KedroCliError(message)
Example #26
def run(tag, env, parallel, runner, node_names, to_nodes, from_nodes):
    """Run the pipeline."""
    from new_kedro_project.run import main
    from_nodes = [n for n in from_nodes.split(",") if n]
    to_nodes = [n for n in to_nodes.split(",") if n]

    if parallel and runner:
        raise KedroCliError(
            "Both --parallel and --runner options cannot be used together. "
            "Please use either --parallel or --runner."
        )
    if parallel:
        runner = "ParallelRunner"
    runner_class = load_obj(runner, "kedro.runner") if runner else SequentialRunner

    main(
        tags=tag,
        env=env,
        runner=runner_class(),
        node_names=node_names,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
    )
Example #27
def check_docker_image_exists(image: str):
    """
    Check that the specified Docker image exists locally.

    Args:
        image: Docker image name.

    Raises:
        KedroCliError: If specified Docker image was not found.

    """
    command = ["docker", "images", "-q", image]
    res = subprocess.run(command, stdout=PIPE, stderr=DEVNULL, check=False)
    if not res.stdout:
        cmd = "kedro docker build --image {0}".format(image)
        raise KedroCliError(
            "Unable to find image `{0}` locally. Please build it first by running:\n"
            "{1}".format(image, cmd))
Example #28
def list_datasets(pipeline, env):
    """Show datasets per type."""
    title = "DataSets in '{}' pipeline"
    not_mentioned = "Datasets not mentioned in pipeline"
    mentioned = "Datasets mentioned in pipeline"

    context = load_context(Path.cwd(), env=env)
    datasets_meta = context.catalog._data_sets
    catalog_ds = set(context.catalog.list())

    pipelines = pipeline or context.pipelines.keys()

    result = {}
    for pipeline in pipelines:
        pl_obj = context.pipelines.get(pipeline)
        if pl_obj:
            pipeline_ds = pl_obj.data_sets()
        else:
            existing_pls = ", ".join(sorted(context.pipelines.keys()))
            raise KedroCliError(
                "{} pipeline not found! Existing pipelines: {}".format(
                    pipeline, existing_pls))

        unused_ds = catalog_ds - pipeline_ds
        default_ds = pipeline_ds - catalog_ds
        used_ds = catalog_ds - unused_ds

        unused_by_type = _map_type_to_datasets(unused_ds, datasets_meta)
        used_by_type = _map_type_to_datasets(used_ds, datasets_meta)

        if default_ds:
            used_by_type["DefaultDataSet"].extend(default_ds)

        data = ((not_mentioned, dict(unused_by_type)), (mentioned,
                                                        dict(used_by_type)))
        result[title.format(pipeline)] = {
            key: value
            for key, value in data if value
        }

    secho(yaml.dump(result))
Example #29
def lint(files, check_only):
    """Run flake8, isort and (on Python >=3.6) black."""
    files = files or (str(
        SOURCE_PATH / "tests"), str(SOURCE_PATH / KEDRO_PACKAGE_NAME))

    try:
        import flake8
        import isort
        import black
    except ImportError as exc:
        raise KedroCliError(
            NO_DEPENDENCY_MESSAGE.format(module=exc.name,
                                         src=str(SOURCE_PATH)))

    python_call("black", ("--check", ) + files if check_only else files)
    python_call("flake8", ("--max-line-length=88", ) + files)

    check_flag = ("-c", ) if check_only else ()
    python_call("isort",
                (*check_flag, "-rc", "-tc", "-up", "-fgw=0", "-m=3", "-w=88") +
                files)
Example #30
def compose_docker_run_args(
    host_root: str = None,
    container_root: str = None,
    mount_volumes: Sequence[str] = None,
    required_args: Sequence[Tuple[str, Union[str, None]]] = None,
    optional_args: Sequence[Tuple[str, Union[str, None]]] = None,
    user_args: Sequence[str] = None,
) -> List[str]:
    """
    Make a list of arguments for the docker command.

    Args:
        host_root: Path to the project root on the host. It must be provided
            if `mount_volumes` are specified, optional otherwise.
        container_root: Path to project root in the container
            (e.g., `/home/kedro/<repo_name>`). It must be
            provided if `mount_volumes` are specified, optional otherwise.
        mount_volumes: List of volumes to be mounted.
        required_args: List of required arguments.
        optional_args: List of optional arguments; these will be added only
            if not present in the `user_args` list.
        user_args: List of arguments already specified by the user.

    Raises:
        KedroCliError: If `mount_volumes` are provided but either `host_root`
            or `container_root` are missing.

    Returns:
        List of arguments for the docker command.
    """

    mount_volumes = mount_volumes or []
    required_args = required_args or []
    optional_args = optional_args or []
    user_args = user_args or []
    split_user_args = {ua.split("=", 1)[0] for ua in user_args}

    def _add_args(name_: str,
                  value_: str = None,
                  force_: bool = False) -> List[str]:
        """
        Add extra args to existing list of CLI args.
        Args:
            name_: Arg name to add.
            value_: Arg value to add, skipped if None.
            force_: Add the argument even if it's present in the current list of args.

        Returns:
            List containing the new arg and (optionally) its value, or an
                empty list if nothing is to be added.
        """
        if not force_ and name_ in split_user_args:
            return []
        return [name_] if value_ is None else [name_, value_]

    if mount_volumes:
        if not (host_root and container_root):
            raise KedroCliError("Both `host_root` and `container_root` must "
                                "be specified in `compose_docker_run_args` "
                                "call if `mount_volumes` are provided.")
        vol_gen = _list_docker_volumes(host_root, container_root,
                                       mount_volumes)
        combined_args = list(chain.from_iterable(vol_gen))
    else:
        combined_args = []
    for arg_name, arg_value in required_args:
        combined_args += _add_args(arg_name, arg_value, True)
    for arg_name, arg_value in optional_args:
        combined_args += _add_args(arg_name, arg_value)
    return combined_args + user_args
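A small usage example (values are illustrative) showing how user-supplied args suppress matching optional defaults while required args are always kept:

args = compose_docker_run_args(
    required_args=[("-v", "/data:/data")],                # always added
    optional_args=[("--rm", None), ("--name", "kedro")],  # defaults
    user_args=["--name=myrun"],                           # overrides --name
)
assert args == ["-v", "/data:/data", "--rm", "--name=myrun"]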