def register_pipelines() -> Dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from a pipeline name to a ``Pipeline`` object.

    """
    data_processing_pipeline = dp.create_pipeline()
    data_science_pipeline = ds.create_pipeline()

    unfiltered_ds_pipeline = pipeline(data_science_pipeline,
                                      namespace="unfiltered",
                                      inputs={"model_input_table"})

    data_filtering_pipeline = df.create_pipeline()

    filtered_ds_pipeline = (pipeline(
        data_filtering_pipeline,
        inputs={"input_table": "model_input_table"},
        outputs={"output_table": "filtered.model_input_table"},
        namespace="filtered",
    ) + pipeline(data_science_pipeline, namespace="filtered"))

    return {
        "__default__": (data_processing_pipeline + unfiltered_ds_pipeline +
                        filtered_ds_pipeline),
        "dp":
        data_processing_pipeline,
        "ds":
        unfiltered_ds_pipeline + filtered_ds_pipeline,
        "filtered_pipeline":
        data_processing_pipeline + filtered_ds_pipeline,
        "unfiltered_pipeline":
        data_processing_pipeline + unfiltered_ds_pipeline,
    }
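A note on the set literal above: `pipeline()` accepts `inputs` as a string, a set, or a dict. The string and set forms only exempt the named free inputs from namespacing, while the dict form also renames them. A minimal sketch, with illustrative dataset names:

from kedro.pipeline import Pipeline, node, pipeline


def identity(x):
    return x


base = Pipeline([node(identity, "model_input_table", "model", name="fit")])

# Set form: keep "model_input_table" un-namespaced, as in the example above.
as_set = pipeline(base, namespace="unfiltered", inputs={"model_input_table"})

# String form: shorthand for a one-element set.
as_str = pipeline(base, namespace="unfiltered", inputs="model_input_table")

# Dict form: additionally rename the free input.
as_dict = pipeline(base, namespace="filtered",
                   inputs={"model_input_table": "filtered.model_input_table"})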
Example #2
def register_pipelines() -> Dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from a pipeline name to a ``Pipeline`` object.

    """
    data_processing_pipeline = dp.create_pipeline()
    data_science_pipeline = ds.create_pipeline()

    # TODO 1: give data_science_pipeline the namespace "unfiltered" and make sure it
    #  links up correctly to the model_input_table input.
    unfiltered_ds_pipeline = data_science_pipeline

    # TODO 2: create the data_filtering_pipeline using the create_pipeline method.
    # TODO 2: alter the appropriate parameters file to filter by engine_type == "Quantum".
    data_filtering_pipeline = Pipeline([])

    # TODO 3: give data_science_pipeline the namespace "filtered". Connect the
    #  data_filtering_pipeline onto the model_input_table as input and
    #  the data science pipeline as output using `inputs`, `outputs` and `namespace`.
    #  Add the pipeline to the __default__ and ds registered pipelines.
    filtered_ds_pipeline = pipeline(data_filtering_pipeline) + pipeline(
        data_science_pipeline)

    return {
        # TODO 4: update the pipeline registry to register two new pipelines
        #  "unfiltered_pipeline" and "filtered_pipeline" that run the data processing
        #  and appropriate data science pipelines.
        "__default__": (data_processing_pipeline + unfiltered_ds_pipeline),
        "dp": data_processing_pipeline,
        "ds": unfiltered_ds_pipeline,
    }
Example #3
    def test_connect_existing_pipelines(self):
        """
        Two pipelines exist, but the dataset names do not match.
        We wrap them with ``pipeline()`` to make them work together.
        """
        cook_pipeline = Pipeline([
            node(defrost, "frozen_meat", "meat"),
            node(grill, "meat", "grilled_meat")
        ])

        lunch_pipeline = Pipeline([node(eat, "food", "output")])

        pipeline1 = (
            pipeline(cook_pipeline, outputs={"grilled_meat": "food"}) +
            lunch_pipeline)

        pipeline2 = cook_pipeline + pipeline(lunch_pipeline,
                                             inputs={"food": "grilled_meat"})
        pipeline3 = pipeline(cook_pipeline,
                             outputs={"grilled_meat": "NEW_NAME"}) + pipeline(
                                 lunch_pipeline, inputs={"food": "NEW_NAME"})

        for pipe in [pipeline1, pipeline2, pipeline3]:
            catalog = DataCatalog(
                {}, feed_dict={"frozen_meat": "frozen_meat_data"})
            result = SequentialRunner().run(pipe, catalog)
            assert result == {
                "output": "frozen_meat_data_defrosted_grilled_done"
            }
Example #4
    def test_reuse_same_pipeline(self):
        """
        The same pipeline needs to be used twice in the same big pipeline.
        Normally dataset and node names would conflict,
        so we wrap the pipelines with ``pipeline()``.
        """
        cook_pipeline = Pipeline([
            node(defrost, "frozen_meat", "meat", name="defrost_node"),
            node(grill, "meat", "grilled_meat", name="grill_node"),
        ])
        breakfast_pipeline = Pipeline(
            [node(eat, "breakfast_food", "breakfast_output")])
        lunch_pipeline = Pipeline([node(eat, "lunch_food", "lunch_output")])

        # We are using two different mechanisms here for breakfast and lunch,
        # renaming and prefixing pipelines differently.
        pipe = (
            pipeline(
                cook_pipeline,
                outputs={"grilled_meat": "breakfast_food"},
                namespace="breakfast",
            )
            + breakfast_pipeline
            + pipeline(cook_pipeline, namespace="lunch")
            + pipeline(lunch_pipeline, inputs={"lunch_food": "lunch.grilled_meat"})
        )
        catalog = DataCatalog(
            {},
            feed_dict={
                "breakfast.frozen_meat": "breakfast_frozen_meat",
                "lunch.frozen_meat": "lunch_frozen_meat",
            },
        )
        result = SequentialRunner().run(pipe, catalog)
        assert result == {
            "breakfast_output": "breakfast_frozen_meat_defrosted_grilled_done",
            "lunch_output": "lunch_frozen_meat_defrosted_grilled_done",
        }
Example #5
    def register_pipelines(self) -> Dict[str, Pipeline]:
        """Register the project's pipeline.

        Returns:
            A mapping from a pipeline name to a ``Pipeline`` object.

        """
        feature_pipeline = feature.create_pipeline()
        target_pipeline = target.create_pipeline()

        return {
            "__default__":
            Pipeline([
                pipeline(
                    feature_pipeline,
                    inputs={
                        "raw_data": "raw_train",
                        "raw_patient_profile": "raw_patient_profile",
                        "raw_health_camp_detail": "raw_health_camp_detail",
                    },
                    namespace="train",
                ),
                pipeline(
                    feature_pipeline,
                    inputs={
                        "raw_data": "raw_test",
                        "raw_patient_profile": "raw_patient_profile",
                        "raw_health_camp_detail": "raw_health_camp_detail",
                    },
                    namespace="test",
                ),
                target_pipeline,
            ])
        }
Example #6
    def test_bad_outputs_mapping(self):
        raw_pipeline = Pipeline([
            node(biconcat, ["A", "params:alpha"], "AA", name="node1"),
            node(biconcat, ["AA", "parameters"], "BB", name="node2"),
        ])

        pattern = "Outputs can't contain free inputs to the pipeline"
        with pytest.raises(ModularPipelineError, match=pattern):
            pipeline(raw_pipeline, outputs={"A": "C"})
Example #7
    def test_missing_dataset_name(
        self, func, inputs, outputs, inputs_map, outputs_map, expected_missing
    ):  # pylint: disable=too-many-arguments
        raw_pipeline = Pipeline([node(func, inputs, outputs)])

        with pytest.raises(ModularPipelineError, match=r"Failed to map datasets") as e:
            pipeline(
                raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map
            )
        assert ", ".join(expected_missing) in str(e.value)
Example #8
    def test_parameters_specified_under_inputs(self):
        raw_pipeline = Pipeline([
            node(biconcat, ["A", "params:alpha"], "AA", name="node1"),
            node(biconcat, ["AA", "parameters"], "BB", name="node2"),
        ])

        pattern = r"Parameters should be specified in the `parameters` argument"
        with pytest.raises(ModularPipelineError, match=pattern):
            pipeline(raw_pipeline, inputs={"params:alpha": "params:beta"})

        with pytest.raises(ModularPipelineError, match=pattern):
            pipeline(raw_pipeline, inputs={"parameters": "some_yaml_dataset"})
Example #9
    def test_non_existent_parameters_mapped(self):
        raw_pipeline = Pipeline([
            node(biconcat, ["A", "params:alpha"], "AA", name="node1"),
            node(biconcat, ["AA", "CC"], "BB", name="node2"),
        ])

        pattern = r"Failed to map datasets and/or parameters: params:beta"
        with pytest.raises(ModularPipelineError, match=pattern):
            pipeline(raw_pipeline, parameters={"params:beta": "params:gamma"})

        pattern = r"Failed to map datasets and/or parameters: parameters"
        with pytest.raises(ModularPipelineError, match=pattern):
            pipeline(raw_pipeline,
                     parameters={"parameters": "some_yaml_dataset"})
Example #10
    def test_default_node_name_is_namespaced(self):
        """Check that auto-generated node names are also namespaced"""
        raw_pipeline = Pipeline([node(identity, "A", "B")])
        first_layer_nested_pipe = pipeline(raw_pipeline, namespace="PREFIX")
        resulting_node = first_layer_nested_pipe.nodes[0]

        assert resulting_node.name.startswith("PREFIX.")
        assert resulting_node.namespace == "PREFIX"

        second_layer_nested_pipe = pipeline(first_layer_nested_pipe, namespace="PRE")
        resulting_node = second_layer_nested_pipe.nodes[0]

        assert resulting_node.name.startswith("PRE.")
        assert resulting_node.namespace == "PRE.PREFIX"
Example #11
    def test_transform_dataset_names(self):
        """
        Rename some datasets, test string, list and dict formats.
        """
        raw_pipeline = Pipeline(
            [
                node(identity, "A", "B", name="node1"),
                node(biconcat, ["C", "D"], ["E", "F"], name="node2"),
                node(
                    biconcat, {"input1": "H", "input2": "J"}, {"K": "L"}, name="node3"
                ),
            ]
        )

        resulting_pipeline = pipeline(
            raw_pipeline,
            inputs={"A": "A_new", "D": "D_new", "H": "H_new"},
            outputs={"B": "B_new", "E": "E_new", "L": "L_new"},
        )

        # make sure the order is correct
        nodes = sorted(resulting_pipeline.nodes)
        assert nodes[0]._inputs == "A_new"
        assert nodes[0]._outputs == "B_new"

        assert nodes[1]._inputs == ["C", "D_new"]
        assert nodes[1]._outputs == ["E_new", "F"]

        assert nodes[2]._inputs == {"input1": "H_new", "input2": "J"}
        assert nodes[2]._outputs == {"K": "L_new"}
Example #12
    def test_transform_params_prefix_and_parameters(self):
        """
        Test that the wrapper skips `params:` and `parameters`: str, list and dict.
        """
        raw_pipeline = Pipeline(
            [
                node(identity, "parameters", "params:B", name="node1"),
                node(biconcat, ["params:C", "D"], ["parameters", "F"], name="node2"),
                node(
                    biconcat,
                    {"input1": "params:H", "input2": "parameters"},
                    {"K": "L"},
                    name="node3",
                ),
            ]
        )
        resulting_pipeline = pipeline(raw_pipeline, namespace="PREFIX")
        nodes = sorted(resulting_pipeline.nodes)
        assert nodes[0]._inputs == "parameters"
        assert nodes[0]._outputs == "params:B"

        assert nodes[1]._inputs == ["params:C", "PREFIX.D"]
        assert nodes[1]._outputs == ["parameters", "PREFIX.F"]

        assert nodes[2]._inputs == {"input1": "params:H", "input2": "parameters"}
        assert nodes[2]._outputs == {"K": "PREFIX.L"}
        assert nodes[2].name == "PREFIX.node3"
Example #13
def register_pipelines() -> Dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from a pipeline name to a ``Pipeline`` object.
    """
    feature_engineering_train_pipe = (
        _fe.create_pipeline().only_nodes_with_tags("training")
    )
    feature_engineering_inference_pipe = pipeline(
        _fe.create_pipeline().only_nodes_with_tags("inference"),
        inputs={
            "iris_X_test": "iris_X_test",
            "normalizer": "normalizer"
        },
        outputs={"iris_X_test_normalized": "iris_X_test_normalized"},
        namespace="inference",
    )

    modelling_train_pipe = _modelling.create_pipeline().only_nodes_with_tags(
        "training")
    return {
        "__default__": Pipeline([]),
        "fe_train": feature_engineering_train_pipe,
        "fe_inference": feature_engineering_inference_pipe,
        "model_train": modelling_train_pipe,
    }
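For context on the `only_nodes_with_tags` calls above, a minimal sketch (node and dataset names are illustrative) of tagging nodes at creation so the filter has something to select:

from kedro.pipeline import Pipeline, node


def identity(x):
    return x


pipe = Pipeline([
    node(identity, "raw", "features", name="featurize",
         tags=["training", "inference"]),
    node(identity, "features", "model", name="fit", tags=["training"]),
])

train_only = pipe.only_nodes_with_tags("training")
assert {n.name for n in train_only.nodes} == {"featurize", "fit"}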
Example #14
    def test_expose_intermediate_output(self):
        """Check that we don't namespace an intermediary dataset, anywhere it
        is used - either input or output"""
        raw_pipeline = Pipeline(
            [
                node(identity, "A", "B", name="node1"),
                node(identity, "B", "C", name="node2"),
                node(identity, "C", "D", name="node3"),
                node(biconcat, ["D", "params:x"], "X", name="node4"),
            ]
        )
        resulting_pipeline = pipeline(
            raw_pipeline, outputs={"B": "B_new"}, namespace="ACTUAL"
        )
        actual_nodes = resulting_pipeline.nodes

        assert actual_nodes[0]._outputs == "B_new"
        assert actual_nodes[1]._inputs == "B_new"

        assert actual_nodes[0]._inputs == "ACTUAL.A"
        assert actual_nodes[1]._outputs == "ACTUAL.C"
        assert actual_nodes[2]._inputs == "ACTUAL.C"
        assert actual_nodes[2]._outputs == "ACTUAL.D"

        assert actual_nodes[3]._inputs == ["ACTUAL.D", "params:x"]
        assert actual_nodes[3]._outputs == "ACTUAL.X"
Example #15
    def test_empty_output(self):
        raw_pipeline = Pipeline([node(biconcat, ["A", "B"], None)])

        resulting_pipeline = pipeline(
            raw_pipeline, namespace="PREFIX", inputs={"A": "A_new"}
        )
        assert resulting_pipeline.nodes[0]._inputs == ["A_new", "PREFIX.B"]
        assert resulting_pipeline.nodes[0]._outputs is None
Example #16
    def test_empty_input(self):
        raw_pipeline = Pipeline([node(constant_output, None, ["A", "B"])])

        resulting_pipeline = pipeline(
            raw_pipeline, namespace="PREFIX", outputs={"A": "A_new"}
        )
        assert resulting_pipeline.nodes[0]._inputs is None
        assert resulting_pipeline.nodes[0]._outputs == ["A_new", "PREFIX.B"]
Example #17
    def test_dataset_transcoding_mapping_base_name(self):
        raw_pipeline = Pipeline([node(biconcat, ["C@pandas", "D"], ["E@spark", "F"])])
        resulting_pipeline = pipeline(
            raw_pipeline, namespace="PREFIX", inputs={"C": "C_new"}
        )

        assert resulting_pipeline.nodes[0]._inputs == ["C_new@pandas", "PREFIX.D"]
        assert resulting_pipeline.nodes[0]._outputs == ["PREFIX.E@spark", "PREFIX.F"]
Example #18
    def test_pipeline_tags(self):
        tagged_pipeline = pipeline(
            [
                node(constant_output, None, "A"),
                node(constant_output, None, "B")
            ],
            tags="tag",
        )

        assert all(n.tags == {"tag"} for n in tagged_pipeline.nodes)
Example #19
    def test_node_properties_preserved(self):
        """
        Check that we don't lose any valuable properties on node cloning.
        Also an explicitly defined name should get prefixed.
        """
        raw_pipeline = Pipeline([node(identity, "A", "B", name="node1", tags=["tag1"])])
        raw_pipeline = raw_pipeline.decorate(lambda: None)
        resulting_pipeline = pipeline(raw_pipeline, namespace="PREFIX")

        assert resulting_pipeline.nodes[0].name == "PREFIX.node1"
        assert resulting_pipeline.nodes[0].tags == {"tag1"}
        assert len(resulting_pipeline.nodes[0]._decorators) == 1
Example #20
def create_pipeline(**kwargs):
    return pipeline([
        node(
            split_data,
            ["example_iris_data", "params:example_test_data_ratio"],
            dict(
                train_x="example_train_x",
                train_y="example_train_y",
                test_x="example_test_x",
                test_y="example_test_y",
            ),
        )
    ])
Example #21
    def test_prefixing_and_renaming(self):
        """
        Prefixing and renaming at the same time.
        Explicitly renamed datasets should not be prefixed anymore.
        """
        raw_pipeline = Pipeline([node(biconcat, ["C", "D"], ["E", "F"])])
        resulting_pipeline = pipeline(
            raw_pipeline,
            namespace="PREFIX",
            inputs={"C": "C_new"},
            outputs={"E": "E_new"},
        )
        assert resulting_pipeline.nodes[0]._inputs == ["C_new", "PREFIX.D"]
        assert resulting_pipeline.nodes[0]._outputs == ["E_new", "PREFIX.F"]
Example #22
    def register_pipelines(self) -> Dict[str, Pipeline]:
        """Register the project's pipeline.

        Returns:
            A mapping from a pipeline name to a ``Pipeline`` object.

        """
        data_engineering_pipeline = pipeline(
            de.create_pipeline(), outputs={"master_table": "ds_main_table"})
        data_science_pipeline = ds.create_pipeline()

        return {
            "__default__": data_engineering_pipeline + data_science_pipeline,
            "de": data_engineering_pipeline,
            "ds": data_science_pipeline,
        }
Example #23
def create_pipeline(**kwargs):
    return pipeline(
        [
            node(
                train_model,
                ["example_train_x", "example_train_y", "parameters"],
                "example_model",
            ),
            node(
                predict,
                dict(model="example_model", test_x="example_test_x"),
                "example_predictions",
            ),
            node(report_accuracy, ["example_predictions", "example_test_y"], None),
        ]
    )
Example #24
    def test_dataset_transcoding_mapping_full_dataset(self):
        raw_pipeline = Pipeline(
            [
                node(biconcat, ["A@pandas", "B"], "C"),
                node(biconcat, ["A@spark", "C"], "CC"),
            ]
        )
        resulting_pipeline = pipeline(
            raw_pipeline, inputs={"A@pandas": "Alpha"}, namespace="PREFIX"
        )

        assert resulting_pipeline.nodes[0]._inputs == ["Alpha", "PREFIX.B"]
        assert resulting_pipeline.nodes[0]._outputs == "PREFIX.C"

        assert resulting_pipeline.nodes[1]._inputs == ["PREFIX.A@spark", "PREFIX.C"]
        assert resulting_pipeline.nodes[1]._outputs == "PREFIX.CC"
Example #25
def register_pipelines() -> Dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from a pipeline name to a ``Pipeline`` object.
    """
    data_engineering_pipeline = de.create_pipeline()
    data_processing_pipeline = pipeline(
        de.create_pipeline(), namespace="data_processing"
    )
    data_science_pipeline = ds.create_pipeline()

    return {
        "de": data_engineering_pipeline,
        "ds": data_science_pipeline,
        "dp": data_processing_pipeline,
        "__default__": data_engineering_pipeline + data_science_pipeline,
    }
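A sketch of what `namespace="data_processing"` implies downstream (dataset names here are illustrative): every dataset not remapped via `inputs`/`outputs` gets the prefix, so the catalog, or a `feed_dict`, must supply the prefixed names.

from kedro.io import DataCatalog
from kedro.pipeline import Pipeline, node, pipeline
from kedro.runner import SequentialRunner


def identity(x):
    return x


dp_pipe = pipeline(Pipeline([node(identity, "raw", "clean", name="clean_node")]),
                   namespace="data_processing")

catalog = DataCatalog({}, feed_dict={"data_processing.raw": 42})
result = SequentialRunner().run(dp_pipe, catalog)
assert result == {"data_processing.clean": 42}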
Example #26
    def test_prefix_exclude_free_inputs(self, inputs, outputs):
        raw_pipeline = Pipeline([
            node(identity, "A", "B", name="node1"),
            node(identity, "B", "C", name="node2"),
            node(identity, "C", "D", name="node3"),
        ])
        resulting_pipeline = pipeline(raw_pipeline,
                                      inputs=inputs,
                                      outputs=outputs,
                                      namespace="PREFIX")
        nodes = sorted(resulting_pipeline.nodes)
        assert nodes[0]._inputs == "A"
        assert nodes[0]._outputs == "PREFIX.B"

        assert nodes[1]._inputs == "PREFIX.B"
        assert nodes[1]._outputs == "PREFIX.C"

        assert nodes[2]._inputs == "PREFIX.C"
        assert nodes[2]._outputs == "D"
Example #27
def create_pipeline(**kwargs) -> Pipeline:
    return pipeline([
        node(
            func=split_data,
            inputs=["example_iris_data", "parameters"],
            outputs=["X_train", "X_test", "y_train", "y_test"],
            name="split",
        ),
        node(
            func=make_predictions,
            inputs=["X_train", "X_test", "y_train"],
            outputs="y_pred",
            name="make_predictions",
        ),
        node(
            func=report_accuracy,
            inputs=["y_pred", "y_test"],
            outputs=None,
            name="report_accuracy",
        ),
    ])
Example #28
    def test_parameters_updated(self):
        raw_pipeline = Pipeline([
            node(biconcat, ["A", "params:x"], "AA", name="node1"),
            node(biconcat, ["AA", "params:y"], "B", name="node2"),
            node(biconcat, ["B", "params:x"], "BB", name="node3"),
        ])
        resulting_pipeline = pipeline(
            raw_pipeline,
            outputs={"B": "B_new"},
            parameters={"params:x": "params:y"},
            namespace="ACTUAL",
        )
        actual_nodes = resulting_pipeline.nodes

        assert actual_nodes[0]._inputs == ["ACTUAL.A", "params:y"]
        assert actual_nodes[0]._outputs == "ACTUAL.AA"

        assert actual_nodes[1]._inputs == ["ACTUAL.AA", "params:y"]
        assert actual_nodes[1]._outputs == "B_new"

        assert actual_nodes[2]._inputs == ["B_new", "params:y"]
        assert actual_nodes[2]._outputs == "ACTUAL.BB"
Example #29
    def test_prefix_dataset_names(self):
        """
        Simple prefixing for datasets of all formats: str, list and dict
        """
        raw_pipeline = Pipeline(
            [
                node(identity, "A", "B", name="node1"),
                node(biconcat, ["C", "D"], ["E", "F"], name="node2"),
                node(
                    biconcat, {"input1": "H", "input2": "J"}, {"K": "L"}, name="node3"
                ),
            ]
        )
        resulting_pipeline = pipeline(raw_pipeline, namespace="PREFIX")
        nodes = sorted(resulting_pipeline.nodes)
        assert nodes[0]._inputs == "PREFIX.A"
        assert nodes[0]._outputs == "PREFIX.B"

        assert nodes[1]._inputs == ["PREFIX.C", "PREFIX.D"]
        assert nodes[1]._outputs == ["PREFIX.E", "PREFIX.F"]

        assert nodes[2]._inputs == {"input1": "PREFIX.H", "input2": "PREFIX.J"}
        assert nodes[2]._outputs == {"K": "PREFIX.L"}
Example #30
ranking_pipeline = Pipeline([
    # ... earlier ranking nodes truncated in the source ...
    node(
        combine_data_horizontal,
        [
            "percent_mean_data",
            "percent_typical_data",
            "percent_median_data",
            "positional_val_data",
        ],
        "final_score_data",
        name="final_scoring_node",
    ),
])
# Each of the following pipelines are here to do the ranking for each
#  scoring type
full_ppr_pipeline = pipeline(
    ranking_pipeline,
    inputs={"scored_data": "scored_ppr_data"},
    outputs={"final_score_data": "scoring.ppr"},
    namespace="ppr",
)

full_half_ppr_pipeline = pipeline(
    ranking_pipeline,
    inputs={"scored_data": "scored_half_ppr_data"},
    outputs={"final_score_data": "scoring.half_ppr"},
    namespace="hppr",
)

full_standard_pipeline = pipeline(
    ranking_pipeline,
    inputs={"scored_data": "scored_standard_data"},
    outputs={"final_score_data": "scoring.standard"},
    namespace="std",