# Assumed imports for this example: fit_pca, fit_tsne, plot_pca and
# plot_tsne are presumed to live in this package's `nodes` module.
from kedro.pipeline import Pipeline, node

from .nodes import fit_pca, fit_tsne, plot_pca, plot_tsne


def data_science_pipeline() -> Pipeline:
    """Create the data science pipeline."""
    return Pipeline(
        nodes=[
            node(
                func=fit_pca,
                inputs={
                    "x": "primary_classified_x",
                    "kwargs": "params:fit_pca",
                },
                outputs={
                    "x": "model_output_pca_x",
                    "variance": "model_output_pca_variance",
                },
                name="fit-pca",
                tags="pca",
            ),
            node(
                func=fit_tsne,
                inputs={
                    "x": "primary_classified_x",
                    "kwargs": "params:fit_tsne",
                },
                outputs="model_output_tsne_x",
                name="fit-tsne",
                tags="tsne",
            ),
        ]
    )


def data_visualization_pipeline() -> Pipeline:
    """Create the data visualization pipeline."""
    return Pipeline(nodes=[
        node(
            func=plot_pca,
            inputs={
                "x": "model_output_pca_x",
                "y": "primary_classified_y",
                "variance": "model_output_pca_variance",
                "metadata": "params:metadata",
                "kwargs": "params:plot_pca",
            },
            outputs="reporting_pca",
            name="plot-pca",
            tags="pca",
        ),
        node(
            func=plot_tsne,
            inputs={
                "x": "model_output_tsne_x",
                "y": "primary_classified_y",
                "metadata": "params:metadata",
                "kwargs": "params:plot_tsne",
            },
            outputs="reporting_tsne",
            name="plot-tsne",
            tags="tsne",
        ),
    ])
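
# A small usage sketch: every node above is tagged, so a single technique
# can be isolated across both pipelines with Pipeline.only_nodes_with_tags,
# e.g. to run just the PCA fit and its plot.
pca_path = (data_science_pipeline()
            + data_visualization_pipeline()).only_nodes_with_tags("pca")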

# Example #3
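# The helpers below mirror kedro's internal modular pipeline module; the
# imports and the two small definitions the snippet relies on are
# reconstructed here as a best-guess sketch so the example is self-contained.
import copy
from typing import AbstractSet, Dict, List, Union

from kedro.pipeline.node import Node
from kedro.pipeline.pipeline import (
    TRANSCODING_SEPARATOR,
    Pipeline,
    _strip_transcoding,
    _transcode_split,
)


class ModularPipelineError(Exception):
    """Raised when a modular pipeline cannot be mapped as requested."""


def _is_parameter(name: str) -> bool:
    # A dataset name refers to parameters if it is `parameters` itself
    # or carries the `params:` prefix.
    return name.startswith("params:") or name == "parameters"
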
def _validate_datasets_exist(
    inputs: AbstractSet[str],
    outputs: AbstractSet[str],
    parameters: AbstractSet[str],
    pipe: Pipeline,
) -> None:
    """Check that all mapped inputs, outputs and parameters exist on the pipeline."""
    inputs = {_strip_transcoding(k) for k in inputs}
    outputs = {_strip_transcoding(k) for k in outputs}

    existing = {_strip_transcoding(ds) for ds in pipe.data_sets()}
    non_existent = (inputs | outputs | parameters) - existing
    if non_existent:
        raise ModularPipelineError(
            "Failed to map datasets and/or parameters: {}".format(", ".join(
                sorted(non_existent))))


def _validate_inputs_outputs(
    inputs: AbstractSet[str], outputs: AbstractSet[str], pipe: Pipeline
) -> None:
    """Safeguards to ensure that:
    - parameters are not specified under inputs
    - inputs are only free inputs
    - outputs do not contain free inputs
    """
    inputs = {_strip_transcoding(k) for k in inputs}
    outputs = {_strip_transcoding(k) for k in outputs}

    if any(_is_parameter(i) for i in inputs):
        raise ModularPipelineError(
            "Parameters should be specified in the `parameters` argument"
        )

    free_inputs = {_strip_transcoding(i) for i in pipe.inputs()}

    if not inputs <= free_inputs:
        raise ModularPipelineError("Inputs should be free inputs to the pipeline")

    if outputs & free_inputs:
        raise ModularPipelineError("Outputs can't contain free inputs to the pipeline")

# Example #5
def pipeline(
    pipe: Pipeline,
    *,
    inputs: Dict[str, str] = None,
    outputs: Dict[str, str] = None,
    parameters: Dict[str, str] = None,
    namespace: str = None,
) -> Pipeline:
    """Create a copy of the pipeline and its nodes,
    with some dataset names and node names modified.

    Args:
        pipe: Original modular pipeline to integrate
        inputs: A map of the existing input name to the new one.
            Must only refer to the pipeline's free inputs.
        outputs: A map of the existing output name to the new one.
            Can refer to both the pipeline's free outputs, as well
            as intermediate results that need to be exposed.
        parameters: A map of an existing parameter name to the new one.
        namespace: A prefix to give to all dataset names,
            except those explicitly named with the `inputs`/`outputs`
            arguments, and parameter references (`params:` and `parameters`).

    Raises:
        ModularPipelineError: When inputs, outputs or parameters are incorrectly
            specified, or they do not exist on the original pipeline.
        ValueError: When the underlying pipeline nodes' inputs/outputs are
            not of the expected types (str, dict, list, or None).

    Returns:
        A new ``Pipeline`` object with the new nodes, modified as requested.
    """
    # pylint: disable=protected-access
    inputs = copy.deepcopy(inputs) or {}
    outputs = copy.deepcopy(outputs) or {}
    parameters = copy.deepcopy(parameters) or {}

    _validate_datasets_exist(inputs.keys(), outputs.keys(), parameters.keys(),
                             pipe)
    _validate_inputs_outputs(inputs.keys(), outputs.keys(), pipe)

    mapping = {**inputs, **outputs, **parameters}

    def _prefix(name: str) -> str:
        return f"{namespace}.{name}" if namespace else name

    def _is_transcode_base_in_mapping(name: str) -> bool:
        base_name, _ = _transcode_split(name)
        return base_name in mapping

    def _map_transcode_base(name: str):
        base_name, transcode_suffix = _transcode_split(name)
        return TRANSCODING_SEPARATOR.join(
            (mapping[base_name], transcode_suffix))

    def _rename(name: str):
        rules = [
            # if name mapped to new name, update with new name
            (lambda n: n in mapping, lambda n: mapping[n]),
            # if it's a parameter, leave as is (don't namespace)
            (_is_parameter, lambda n: n),
            # if transcode base is mapped to a new name, update with new base
            (_is_transcode_base_in_mapping, _map_transcode_base),
            # if namespace given, prefix name using that namespace
            (lambda n: bool(namespace), _prefix),
        ]

        for predicate, processor in rules:
            if predicate(name):
                return processor(name)

        # leave name as is
        return name

    def _process_dataset_names(
        datasets: Union[None, str, List[str], Dict[str, str]]
    ) -> Union[None, str, List[str], Dict[str, str]]:
        if datasets is None:
            return None
        if isinstance(datasets, str):
            return _rename(datasets)
        if isinstance(datasets, list):
            return [_rename(name) for name in datasets]
        if isinstance(datasets, dict):
            return {key: _rename(value) for key, value in datasets.items()}

        raise ValueError(  # pragma: no cover
            f"Unexpected input {datasets} of type {type(datasets)}")

    def _copy_node(node: Node) -> Node:
        new_namespace = node.namespace
        if namespace:
            new_namespace = (f"{namespace}.{node.namespace}"
                             if node.namespace else namespace)

        return node._copy(
            inputs=_process_dataset_names(node._inputs),
            outputs=_process_dataset_names(node._outputs),
            namespace=new_namespace,
        )

    new_nodes = [_copy_node(n) for n in pipe.nodes]

    return Pipeline(new_nodes)
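
# A minimal usage sketch (the namespace and remapped names are
# illustrative): reuse the data science pipeline under a namespace while
# keeping its free input shared across instances and exposing the t-SNE
# output under a stable, un-namespaced name. Note that, per _is_parameter
# above, parameter keys keep their `params:` prefix.
experiment = pipeline(
    data_science_pipeline(),
    inputs={"primary_classified_x": "primary_classified_x"},  # identity map: stays shared
    outputs={"model_output_tsne_x": "experiment_tsne_x"},
    parameters={"params:fit_pca": "params:experiment.fit_pca"},
    namespace="experiment",
)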


# Assumed imports for this example: extract, scale, separate and split are
# presumed to live in this package's `nodes` module.
from .nodes import extract, scale, separate, split


def data_engineering_pipeline() -> Pipeline:
    """Create the data engineering pipeline."""
    return Pipeline(nodes=[
        node(
            func=extract,
            inputs="raw_matlab_image",
            outputs="intermediate_image",
            name="extract-image",
            tags=["pca", "tsne", "tcn"],
        ),
        node(
            func=extract,
            inputs="raw_matlab_ground_truth",
            outputs="intermediate_ground_truth",
            name="extract-ground-truth",
            tags=["pca", "tsne", "tcn"],
        ),
        node(
            func=scale,
            inputs={
                "image": "intermediate_image",
                "kwargs": "params:scale"
            },
            outputs="scale_image",
            name="scale-image",
            tags=["pca", "tsne", "tcn"],
        ),
        node(
            func=separate,
            inputs={
                "image": "scale_image",
                "ground_truth": "intermediate_ground_truth",
            },
            outputs={
                "classified_x": "primary_classified_x",
                "unclassified_x": "primary_unclassified_x",
                "classified_y": "primary_classified_y",
                "unclassified_y": "primary_unclassified_y",
            },
            name="separate-classified-and-unclassified-samples",
            tags=["pca", "tsne", "tcn"],
        ),
        node(
            func=split,
            inputs={
                "x": "primary_classified_x",
                "y": "primary_classified_y",
                "kwargs": "params:split",
            },
            outputs={
                "x_train": "model_input_classified_x_train",
                "x_test": "model_input_classified_x_test",
                "x_valid": "model_input_classified_x_valid",
                "y_train": "model_input_classified_y_train",
                "y_test": "model_input_classified_y_test",
                "y_valid": "model_input_classified_y_valid",
            },
            name="split-dataset",
            tags="tcn",
        ),
    ])
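
# A registry-style sketch (hypothetical, mirroring kedro's usual
# register_pipelines convention): the engineering outputs feed the data
# science inputs, whose outputs feed the visualization nodes, so the three
# pipelines compose with `+`.
def register_pipelines() -> Dict[str, Pipeline]:
    de = data_engineering_pipeline()
    ds = data_science_pipeline()
    viz = data_visualization_pipeline()
    return {
        "de": de,
        "ds": ds,
        "viz": viz,
        "__default__": de + ds + viz,
    }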