Example #1
def describe_pipeline(metadata: ProjectMetadata, name, **kwargs):  # pylint: disable=unused-argument, protected-access
    """Describe a pipeline by providing a pipeline name.
    Defaults to the __default__ pipeline. (DEPRECATED)
    """
    deprecation_message = (
        "DeprecationWarning: Command `kedro pipeline describe` is deprecated. "
        "Please use `kedro registry describe` instead.")
    click.secho(deprecation_message, fg="red")

    pipeline_obj = pipelines.get(name)
    if not pipeline_obj:
        all_pipeline_names = pipelines.keys()
        existing_pipelines = ", ".join(sorted(all_pipeline_names))
        raise KedroCliError(
            f"`{name}` pipeline not found. Existing pipelines: [{existing_pipelines}]"
        )

    nodes = []
    for node in pipeline_obj.nodes:
        namespace = f"{node.namespace}." if node.namespace else ""
        nodes.append(
            f"{namespace}{node._name or node._func_name} ({node._func_name})")
    result = {"Nodes": nodes}

    click.echo(yaml.dump(result))
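
The label built for each node combines the node's namespace, its explicit name (if any), and the name of the wrapped function. A minimal sketch of that formatting, using a hypothetical stand-in instead of Kedro's real Node class:

from dataclasses import dataclass
from typing import Optional

import yaml


@dataclass
class FakeNode:
    """Hypothetical stand-in for a Kedro node (illustration only)."""
    namespace: Optional[str]
    _name: Optional[str]
    _func_name: str


nodes = []
for node in [
    FakeNode("data_science", "train_model_node", "train_model"),
    FakeNode(None, None, "split_data"),
]:
    namespace = f"{node.namespace}." if node.namespace else ""
    nodes.append(f"{namespace}{node._name or node._func_name} ({node._func_name})")

print(yaml.dump({"Nodes": nodes}), end="")
# Nodes:
# - data_science.train_model_node (train_model)
# - split_data (split_data)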
Example #2
def create(
    metadata: ProjectMetadata, pipeline_name, env, target_path
):  # pylint: disable=too-many-locals
    """Create an Airflow DAG for a project"""
    loader = jinja2.FileSystemLoader(str(Path(__file__).parent))
    jinja_env = jinja2.Environment(autoescape=True, loader=loader, lstrip_blocks=True)
    jinja_env.filters["slugify"] = slugify
    template = jinja_env.get_template("airflow_dag_template.j2")

    package_name = metadata.package_name
    dag_filename = f"{package_name}_dag.py"

    target_path = Path(target_path)
    target_path = target_path / dag_filename

    target_path.parent.mkdir(parents=True, exist_ok=True)

    pipeline = pipelines.get(pipeline_name)

    dependencies = defaultdict(list)
    for node, parent_nodes in pipeline.node_dependencies.items():
        for parent in parent_nodes:  # pragma: no cover
            dependencies[parent].append(node)

    template.stream(
        dag_name=package_name,
        dependencies=dependencies,
        env=env,
        pipeline_name=pipeline_name,
        package_name=package_name,
        pipeline=pipeline,
    ).dump(str(target_path))

    secho("")
    secho("An Airflow DAG has been generated in:", fg="green")
    secho(str(target_path))
    secho("This file should be copied to your Airflow DAG folder.", fg="yellow")
    secho(
        "The Airflow configuration can be customized by editing this file.", fg="green"
    )
    secho("")
    secho(
        "This file also contains the path to the config directory, this directory will need to "
        "be available to Airflow and any workers.",
        fg="yellow",
    )
    secho("")
    secho(
        "Additionally all data sets must have an entry in the data catalog.",
        fg="yellow",
    )
    secho(
        "And all local paths in both the data catalog and log config must be absolute paths.",
        fg="yellow",
    )
    secho("")
Example #3
def describe_pipeline(metadata: ProjectMetadata, name, **kwargs):  # pylint: disable=unused-argument
    """Describe a pipeline by providing a pipeline name."""
    pipeline_obj = pipelines.get(name)
    if not pipeline_obj:
        all_pipeline_names = pipelines.keys()
        existing_pipelines = ", ".join(sorted(all_pipeline_names))
        raise KedroCliError(
            f"`{name}` pipeline not found. Existing pipelines: [{existing_pipelines}]"
        )

    result = {
        "Nodes": [
            f"{node.short_name} ({node._func_name})"  # pylint: disable=protected-access
            for node in pipeline_obj.nodes
        ]
    }

    click.echo(yaml.dump(result))
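
With a couple of placeholder node names, the final click.echo(yaml.dump(result)) prints output along these lines (names are illustrative, not from a real project):

import yaml

result = {"Nodes": ["split_data (split_data)", "train_model_node (train_model)"]}
print(yaml.dump(result), end="")
# Nodes:
# - split_data (split_data)
# - train_model_node (train_model)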
Example #4
def create_catalog(metadata: ProjectMetadata, pipeline_name, env):
    """Create Data Catalog YAML configuration with missing datasets.

    Add `MemoryDataSet` datasets to Data Catalog YAML configuration file
    for each dataset in a registered pipeline if it is missing from
    the `DataCatalog`.

    The catalog configuration will be saved to
    `<conf_root>/<env>/catalog/<pipeline_name>.yml` file.
    """
    env = env or "base"
    session = _create_session(metadata.package_name, env=env)
    context = session.load_context()

    pipeline = pipelines.get(pipeline_name)

    if not pipeline:
        existing_pipelines = ", ".join(sorted(pipelines.keys()))
        raise KedroCliError(
            f"`{pipeline_name}` pipeline not found! Existing pipelines: {existing_pipelines}"
        )

    pipe_datasets = {
        ds_name
        for ds_name in pipeline.data_sets()
        if not ds_name.startswith("params:") and ds_name != "parameters"
    }

    catalog_datasets = {
        ds_name
        for ds_name in context.catalog._data_sets.keys()  # pylint: disable=protected-access
        if not ds_name.startswith("params:") and ds_name != "parameters"
    }

    # Datasets that are missing in Data Catalog
    missing_ds = sorted(pipe_datasets - catalog_datasets)
    if missing_ds:
        catalog_path = (
            context.project_path / settings.CONF_ROOT / env / "catalog" / f"{pipeline_name}.yml"
        )
        _add_missing_datasets_to_catalog(missing_ds, catalog_path)
        click.echo(f"Data Catalog YAML configuration was created: {catalog_path}")
    else:
        click.echo("All datasets are already configured.")
Example #5
def list_datasets(metadata: ProjectMetadata, pipeline, env):
    """Show datasets per type."""
    title = "DataSets in '{}' pipeline"
    not_mentioned = "Datasets not mentioned in pipeline"
    mentioned = "Datasets mentioned in pipeline"

    session = _create_session(metadata.package_name, env=env)
    context = session.load_context()
    datasets_meta = context.catalog._data_sets  # pylint: disable=protected-access
    catalog_ds = set(context.catalog.list())

    target_pipelines = pipeline or pipelines.keys()

    result = {}
    for pipe in target_pipelines:
        pl_obj = pipelines.get(pipe)
        if pl_obj:
            pipeline_ds = pl_obj.data_sets()
        else:
            existing_pls = ", ".join(sorted(pipelines.keys()))
            raise KedroCliError(
                f"`{pipe}` pipeline not found! Existing pipelines: {existing_pls}"
            )

        unused_ds = catalog_ds - pipeline_ds
        default_ds = pipeline_ds - catalog_ds
        used_ds = catalog_ds - unused_ds

        unused_by_type = _map_type_to_datasets(unused_ds, datasets_meta)
        used_by_type = _map_type_to_datasets(used_ds, datasets_meta)

        if default_ds:
            used_by_type["DefaultDataSet"].extend(default_ds)

        data = (
            (not_mentioned, dict(unused_by_type)),
            (mentioned, dict(used_by_type)),
        )
        result[title.format(pipe)] = {key: value for key, value in data if value}

    secho(yaml.dump(result))
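
_map_type_to_datasets is referenced but not shown. A minimal sketch of what such a grouping helper might do, assuming it buckets dataset names by the class name of the dataset object registered for them in the catalog (an assumption based on how the result is used above):

from collections import defaultdict


def _map_type_to_datasets(datasets, datasets_meta):
    # Hypothetical sketch: group dataset names by the type of their catalog entry.
    mapping = defaultdict(list)
    for ds_name in datasets:
        ds_type = datasets_meta[ds_name].__class__.__name__
        if ds_name not in mapping[ds_type]:
            mapping[ds_type].append(ds_name)
    return mapping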
Example #6
def describe_registered_pipeline(metadata: ProjectMetadata, name, **kwargs):  # pylint: disable=unused-argument, protected-access
    """Describe a registered pipeline by providing a pipeline name.
    Defaults to the `__default__` pipeline.
    """
    pipeline_obj = pipelines.get(name)
    if not pipeline_obj:
        all_pipeline_names = pipelines.keys()
        existing_pipelines = ", ".join(sorted(all_pipeline_names))
        raise KedroCliError(
            f"`{name}` pipeline not found. Existing pipelines: [{existing_pipelines}]"
        )

    nodes = []
    for node in pipeline_obj.nodes:
        namespace = f"{node.namespace}." if node.namespace else ""
        nodes.append(
            f"{namespace}{node._name or node._func_name} ({node._func_name})")
    result = {"Nodes": nodes}

    click.echo(yaml.dump(result))
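
None of these snippets include the click decorators that turn the functions into CLI commands. A hedged sketch of how a describe-style command is typically wired up with click (group and argument names and defaults below are illustrative, not copied from Kedro's source):

import click


@click.group(name="registry")
def registry():
    """Commands for working with registered pipelines."""


@registry.command("describe")
@click.argument("name", default="__default__")
@click.pass_obj  # click injects the context object (here: the project metadata)
def describe_registered_pipeline(metadata, name):
    """Describe a registered pipeline by providing a pipeline name."""
    click.echo(f"Would describe pipeline `{name}` here.")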