# Imports assumed by all snippets in this listing; module paths follow
# kedro's layout around 0.17.x, plus the kedro-airflow plugin's
# dependencies for `create`:
from collections import defaultdict
from pathlib import Path

import click
import jinja2
import yaml
from click import secho
from slugify import slugify

from kedro.framework.cli.utils import KedroCliError
from kedro.framework.project import pipelines, settings
from kedro.framework.session import KedroSession
from kedro.framework.startup import ProjectMetadata


def describe_pipeline(
    metadata: ProjectMetadata, name, **kwargs
):  # pylint: disable=unused-argument, protected-access
    """Describe a pipeline by providing a pipeline name.
    Defaults to the __default__ pipeline. (DEPRECATED)
    """
    deprecation_message = (
        "DeprecationWarning: Command `kedro pipeline describe` is deprecated. "
        "Please use `kedro registry describe` instead."
    )
    click.secho(deprecation_message, fg="red")

    pipeline_obj = pipelines.get(name)
    if not pipeline_obj:
        all_pipeline_names = pipelines.keys()
        existing_pipelines = ", ".join(sorted(all_pipeline_names))
        raise KedroCliError(
            f"`{name}` pipeline not found. Existing pipelines: [{existing_pipelines}]"
        )

    nodes = []
    for node in pipeline_obj.nodes:
        namespace = f"{node.namespace}." if node.namespace else ""
        nodes.append(f"{namespace}{node._name or node._func_name} ({node._func_name})")

    result = {"Nodes": nodes}
    click.echo(yaml.dump(result))
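
# Illustrative session for the deprecated command; the pipeline and node
# names are hypothetical, but the output format follows the code above:
#
#   $ kedro pipeline describe __default__
#   DeprecationWarning: Command `kedro pipeline describe` is deprecated. Please use `kedro registry describe` instead.
#   Nodes:
#   - preprocess_companies_node (preprocess_companies)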

def create(
    metadata: ProjectMetadata, pipeline_name, env, target_path
):  # pylint: disable=too-many-locals
    """Create an Airflow DAG for a project."""
    loader = jinja2.FileSystemLoader(str(Path(__file__).parent))
    jinja_env = jinja2.Environment(autoescape=True, loader=loader, lstrip_blocks=True)
    jinja_env.filters["slugify"] = slugify
    template = jinja_env.get_template("airflow_dag_template.j2")

    package_name = metadata.package_name
    dag_filename = f"{package_name}_dag.py"

    target_path = Path(target_path)
    target_path = target_path / dag_filename
    target_path.parent.mkdir(parents=True, exist_ok=True)

    pipeline = pipelines.get(pipeline_name)

    dependencies = defaultdict(list)
    for node, parent_nodes in pipeline.node_dependencies.items():
        for parent in parent_nodes:  # pragma: no cover
            dependencies[parent].append(node)

    template.stream(
        dag_name=package_name,
        dependencies=dependencies,
        env=env,
        pipeline_name=pipeline_name,
        package_name=package_name,
        pipeline=pipeline,
    ).dump(str(target_path))

    secho("")
    secho("An Airflow DAG has been generated in:", fg="green")
    secho(str(target_path))
    secho("This file should be copied to your Airflow DAG folder.", fg="yellow")
    secho(
        "The Airflow configuration can be customized by editing this file.",
        fg="green",
    )
    secho("")
    secho(
        "This file also contains the path to the config directory, this directory will need to "
        "be available to Airflow and any workers.",
        fg="yellow",
    )
    secho("")
    secho(
        "Additionally all data sets must have an entry in the data catalog.",
        fg="yellow",
    )
    secho(
        "And all local paths in both the data catalog and log config must be absolute paths.",
        fg="yellow",
    )
    secho("")
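
# For context: the template receives `dependencies`, a mapping of each parent
# node to its child nodes. Inside `airflow_dag_template.j2` this is typically
# turned into Airflow task ordering, roughly like the sketch below (not the
# verbatim template); it also shows why the `slugify` filter is registered,
# since node names must become valid Airflow task ids:
#
#   {% for parent, children in dependencies.items() %}
#   {% for child in children %}
#   tasks["{{ parent.name | slugify }}"] >> tasks["{{ child.name | slugify }}"]
#   {% endfor %}
#   {% endfor %}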

def describe_pipeline(
    metadata: ProjectMetadata, name, **kwargs
):  # pylint: disable=unused-argument
    """Describe a pipeline by providing a pipeline name."""
    pipeline_obj = pipelines.get(name)
    if not pipeline_obj:
        all_pipeline_names = pipelines.keys()
        existing_pipelines = ", ".join(sorted(all_pipeline_names))
        raise KedroCliError(
            f"`{name}` pipeline not found. Existing pipelines: [{existing_pipelines}]"
        )

    result = {
        "Nodes": [
            f"{node.short_name} ({node._func_name})"  # pylint: disable=protected-access
            for node in pipeline_obj.nodes
        ]
    }
    click.echo(yaml.dump(result))

def create_catalog(metadata: ProjectMetadata, pipeline_name, env):
    """Create Data Catalog YAML configuration with missing datasets.

    Add `MemoryDataSet` datasets to Data Catalog YAML configuration file
    for each dataset in a registered pipeline if it is missing from
    the `DataCatalog`.

    The catalog configuration will be saved to
    `<conf_root>/<env>/catalog/<pipeline_name>.yml` file.
    """
    env = env or "base"
    session = _create_session(metadata.package_name, env=env)
    context = session.load_context()

    pipeline = pipelines.get(pipeline_name)
    if not pipeline:
        existing_pipelines = ", ".join(sorted(pipelines.keys()))
        raise KedroCliError(
            f"`{pipeline_name}` pipeline not found! Existing pipelines: {existing_pipelines}"
        )

    pipe_datasets = {
        ds_name
        for ds_name in pipeline.data_sets()
        if not ds_name.startswith("params:") and ds_name != "parameters"
    }

    catalog_datasets = {
        ds_name
        for ds_name in context.catalog._data_sets.keys()  # pylint: disable=protected-access
        if not ds_name.startswith("params:") and ds_name != "parameters"
    }

    # Datasets that are missing in Data Catalog
    missing_ds = sorted(pipe_datasets - catalog_datasets)
    if missing_ds:
        catalog_path = (
            context.project_path
            / settings.CONF_ROOT
            / env
            / "catalog"
            / f"{pipeline_name}.yml"
        )
        _add_missing_datasets_to_catalog(missing_ds, catalog_path)
        click.echo(f"Data Catalog YAML configuration was created: {catalog_path}")
    else:
        click.echo("All datasets are already configured.")
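
# Two helpers used above are not shown in this listing. Minimal sketches,
# assuming `_create_session` wraps `KedroSession.create` and
# `_add_missing_datasets_to_catalog` writes a `MemoryDataSet` entry for each
# missing dataset (hypothetical reconstructions, not verbatim kedro code):
def _create_session(package_name: str, **kwargs):
    kwargs.setdefault("save_on_close", False)
    return KedroSession.create(package_name, **kwargs)


def _add_missing_datasets_to_catalog(missing_ds, catalog_path):
    # Start from the pipeline's existing catalog file, if there is one.
    if catalog_path.is_file():
        catalog_config = yaml.safe_load(catalog_path.read_text()) or {}
    else:
        catalog_config = {}

    for ds_name in missing_ds:
        catalog_config[ds_name] = {"type": "MemoryDataSet"}

    # Create the `catalog` folder for the environment if it does not exist yet.
    catalog_path.parent.mkdir(exist_ok=True)
    with catalog_path.open(mode="w") as catalog_file:
        yaml.safe_dump(catalog_config, catalog_file, default_flow_style=False)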

def list_datasets(metadata: ProjectMetadata, pipeline, env):
    """Show datasets per type."""
    title = "DataSets in '{}' pipeline"
    not_mentioned = "Datasets not mentioned in pipeline"
    mentioned = "Datasets mentioned in pipeline"

    session = _create_session(metadata.package_name, env=env)
    context = session.load_context()
    datasets_meta = context.catalog._data_sets  # pylint: disable=protected-access
    catalog_ds = set(context.catalog.list())

    target_pipelines = pipeline or pipelines.keys()

    result = {}
    for pipe in target_pipelines:
        pl_obj = pipelines.get(pipe)
        if pl_obj:
            pipeline_ds = pl_obj.data_sets()
        else:
            existing_pls = ", ".join(sorted(pipelines.keys()))
            raise KedroCliError(
                f"`{pipe}` pipeline not found! Existing pipelines: {existing_pls}"
            )

        unused_ds = catalog_ds - pipeline_ds
        default_ds = pipeline_ds - catalog_ds
        used_ds = catalog_ds - unused_ds

        unused_by_type = _map_type_to_datasets(unused_ds, datasets_meta)
        used_by_type = _map_type_to_datasets(used_ds, datasets_meta)

        if default_ds:
            used_by_type["DefaultDataSet"].extend(default_ds)

        data = (
            (not_mentioned, dict(unused_by_type)),
            (mentioned, dict(used_by_type)),
        )
        result[title.format(pipe)] = {key: value for key, value in data if value}

    secho(yaml.dump(result))
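
# `_map_type_to_datasets` is referenced above but not shown. A minimal sketch,
# assuming it groups dataset names under their dataset class name
# (hypothetical reconstruction; `defaultdict` is imported at the top of this
# listing):
def _map_type_to_datasets(datasets, datasets_meta):
    """Build a mapping of dataset class name -> list of dataset names."""
    mapping = defaultdict(list)
    for dataset in datasets:
        is_param = dataset.startswith("params:") or dataset == "parameters"
        if not is_param:
            ds_type = datasets_meta[dataset].__class__.__name__
            if dataset not in mapping[ds_type]:
                mapping[ds_type].append(dataset)
    return mapping


# Returning the defaultdict rather than a plain dict matters here:
# `list_datasets` extends `used_by_type["DefaultDataSet"]` without first
# checking that the key exists.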

def describe_registered_pipeline(
    metadata: ProjectMetadata, name, **kwargs
):  # pylint: disable=unused-argument, protected-access
    """Describe a registered pipeline by providing a pipeline name.
    Defaults to the `__default__` pipeline.
    """
    pipeline_obj = pipelines.get(name)
    if not pipeline_obj:
        all_pipeline_names = pipelines.keys()
        existing_pipelines = ", ".join(sorted(all_pipeline_names))
        raise KedroCliError(
            f"`{name}` pipeline not found. Existing pipelines: [{existing_pipelines}]"
        )

    nodes = []
    for node in pipeline_obj.nodes:
        namespace = f"{node.namespace}." if node.namespace else ""
        nodes.append(f"{namespace}{node._name or node._func_name} ({node._func_name})")

    result = {"Nodes": nodes}
    click.echo(yaml.dump(result))
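
# Illustrative output for the replacement command, showing how a namespaced
# node is rendered by the f-string above (pipeline and node names
# hypothetical):
#
#   $ kedro registry describe data_processing
#   Nodes:
#   - data_processing.split_data_node (split_data)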