def register_pipelines() -> Dict[str, Pipeline]: """Register the project's pipelines. Returns: A mapping from a pipeline name to a ``Pipeline`` object. """ data_processing_pipeline = dp.create_pipeline() data_science_pipeline = ds.create_pipeline() unfiltered_ds_pipeline = pipeline(data_science_pipeline, namespace="unfiltered", inputs={"model_input_table"}) data_filtering_pipeline = df.create_pipeline() filtered_ds_pipeline = (pipeline( data_filtering_pipeline, inputs={"input_table": "model_input_table"}, outputs={"output_table": "filtered.model_input_table"}, namespace="filtered", ) + pipeline(data_science_pipeline, namespace="filtered")) return { "__default__": (data_processing_pipeline + unfiltered_ds_pipeline + filtered_ds_pipeline), "dp": data_processing_pipeline, "ds": unfiltered_ds_pipeline + filtered_ds_pipeline, "filtered_pipeline": data_processing_pipeline + filtered_ds_pipeline, "unfiltered_pipeline": data_processing_pipeline + unfiltered_ds_pipeline, }
def register_pipelines() -> Dict[str, Pipeline]: """Register the project's pipelines. Returns: A mapping from a pipeline name to a ``Pipeline`` object. """ data_processing_pipeline = dp.create_pipeline() data_science_pipeline = ds.create_pipeline() # TODO 1: give data_science_pipeline the namespace "unfiltered" and make sure it # links up correctly to the model_input_table input. unfiltered_ds_pipeline = data_science_pipeline # TODO 2: create the data_filtering_pipeline using the create_pipeline method. # TODO 2: alter the appropriate parameters file to filter by engine_type == "Quantum". data_filtering_pipeline = Pipeline([]) # TODO 3: give data_science_pipeline the namespace "filtered". Connect the # data_filtering_pipeline onto the model_input_table as input and # the data science pipeline as output using `inputs`, `outputs` and `namespace`. # Add the pipeline to the __default__ and ds registered pipelines. filtered_ds_pipeline = pipeline(data_filtering_pipeline) + pipeline( data_science_pipeline) return { # TODO 4: update the pipeline registry to register two new pipelines # "unfiltered_pipeline" and "filtered_pipeline" that run the data processing # and appropriate data science pipelines. "__default__": (data_processing_pipeline + unfiltered_ds_pipeline), "dp": data_processing_pipeline, "ds": unfiltered_ds_pipeline, }
def test_connect_existing_pipelines(self):
    """
    Two pipelines exist, but their dataset names do not match.
    We `transform` them to work together.
    """
    cook_pipeline = Pipeline(
        [
            node(defrost, "frozen_meat", "meat"),
            node(grill, "meat", "grilled_meat"),
        ]
    )
    lunch_pipeline = Pipeline([node(eat, "food", "output")])

    pipeline1 = (
        pipeline(cook_pipeline, outputs={"grilled_meat": "food"}) + lunch_pipeline
    )
    pipeline2 = cook_pipeline + pipeline(
        lunch_pipeline, inputs={"food": "grilled_meat"}
    )
    pipeline3 = pipeline(
        cook_pipeline, outputs={"grilled_meat": "NEW_NAME"}
    ) + pipeline(lunch_pipeline, inputs={"food": "NEW_NAME"})

    for pipe in [pipeline1, pipeline2, pipeline3]:
        catalog = DataCatalog({}, feed_dict={"frozen_meat": "frozen_meat_data"})
        result = SequentialRunner().run(pipe, catalog)
        assert result == {"output": "frozen_meat_data_defrosted_grilled_done"}
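# The defrost/grill/eat helpers are not shown in these tests. A minimal sketch
# whose bodies are inferred from the asserted result strings
# ("..._defrosted_grilled_done"), not taken from the source:
def defrost(meat):
    # Each step appends a suffix, so "frozen_meat_data" ends up as
    # "frozen_meat_data_defrosted_grilled_done" after the full pipeline.
    return meat + "_defrosted"


def grill(meat):
    return meat + "_grilled"


def eat(food):
    return food + "_done"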
def test_reuse_same_pipeline(self):
    """
    The same pipeline needs to be used twice in the same big pipeline.
    Normally dataset and node names would conflict,
    so we need to `transform` the pipelines.
    """
    cook_pipeline = Pipeline(
        [
            node(defrost, "frozen_meat", "meat", name="defrost_node"),
            node(grill, "meat", "grilled_meat", name="grill_node"),
        ]
    )
    breakfast_pipeline = Pipeline([node(eat, "breakfast_food", "breakfast_output")])
    lunch_pipeline = Pipeline([node(eat, "lunch_food", "lunch_output")])

    # We are using two different mechanisms here for breakfast and lunch,
    # renaming and prefixing the pipelines differently.
    pipe = (
        pipeline(
            cook_pipeline,
            outputs={"grilled_meat": "breakfast_food"},
            namespace="breakfast",
        )
        + breakfast_pipeline
        + pipeline(cook_pipeline, namespace="lunch")
        + pipeline(lunch_pipeline, inputs={"lunch_food": "lunch.grilled_meat"})
    )

    catalog = DataCatalog(
        {},
        feed_dict={
            "breakfast.frozen_meat": "breakfast_frozen_meat",
            "lunch.frozen_meat": "lunch_frozen_meat",
        },
    )
    result = SequentialRunner().run(pipe, catalog)
    assert result == {
        "breakfast_output": "breakfast_frozen_meat_defrosted_grilled_done",
        "lunch_output": "lunch_frozen_meat_defrosted_grilled_done",
    }
def register_pipelines(self) -> Dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from a pipeline name to a ``Pipeline`` object.
    """
    feature_pipeline = feature.create_pipeline()
    target_pipeline = target.create_pipeline()

    return {
        "__default__": Pipeline(
            [
                pipeline(
                    feature_pipeline,
                    inputs={
                        "raw_data": "raw_train",
                        "raw_patient_profile": "raw_patient_profile",
                        "raw_health_camp_detail": "raw_health_camp_detail",
                    },
                    namespace="train",
                ),
                pipeline(
                    feature_pipeline,
                    inputs={
                        "raw_data": "raw_test",
                        "raw_patient_profile": "raw_patient_profile",
                        "raw_health_camp_detail": "raw_health_camp_detail",
                    },
                    namespace="test",
                ),
                target_pipeline,
            ]
        )
    }
def test_bad_outputs_mapping(self):
    raw_pipeline = Pipeline(
        [
            node(biconcat, ["A", "params:alpha"], "AA", name="node1"),
            node(biconcat, ["AA", "parameters"], "BB", name="node2"),
        ]
    )
    pattern = "Outputs can't contain free inputs to the pipeline"
    with pytest.raises(ModularPipelineError, match=pattern):
        pipeline(raw_pipeline, outputs={"A": "C"})
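# The modular-pipeline tests in this section rely on shared scaffolding that
# the snippets do not show. A plausible minimal version follows: the helper
# bodies are assumptions consistent with how they are called, and the
# ModularPipelineError import path matches Kedro 0.17/0.18-era layouts.
import pytest

from kedro.io import DataCatalog
from kedro.pipeline import Pipeline, node, pipeline
from kedro.pipeline.modular_pipeline import ModularPipelineError
from kedro.runner import SequentialRunner


def identity(input1):
    # Pass a single dataset through unchanged.
    return input1


def biconcat(input1, input2):
    # Concatenate two inputs (strings or lists).
    return input1 + input2


def constant_output():
    # Produce a fixed value for nodes that take no inputs.
    return "output"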
# Parametrized test: the @pytest.mark.parametrize decorator that supplies
# func, inputs, outputs, the mappings and expected_missing is not shown here.
def test_missing_dataset_name(
    self, func, inputs, outputs, inputs_map, outputs_map, expected_missing
):  # pylint: disable=too-many-arguments
    raw_pipeline = Pipeline([node(func, inputs, outputs)])

    with pytest.raises(ModularPipelineError, match=r"Failed to map datasets") as e:
        pipeline(
            raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map
        )
    assert ", ".join(expected_missing) in str(e.value)
def test_parameters_specified_under_inputs(self):
    raw_pipeline = Pipeline(
        [
            node(biconcat, ["A", "params:alpha"], "AA", name="node1"),
            node(biconcat, ["AA", "parameters"], "BB", name="node2"),
        ]
    )

    pattern = r"Parameters should be specified in the `parameters` argument"
    with pytest.raises(ModularPipelineError, match=pattern):
        pipeline(raw_pipeline, inputs={"params:alpha": "params:beta"})

    with pytest.raises(ModularPipelineError, match=pattern):
        pipeline(raw_pipeline, inputs={"parameters": "some_yaml_dataset"})
def test_non_existent_parameters_mapped(self):
    raw_pipeline = Pipeline(
        [
            node(biconcat, ["A", "params:alpha"], "AA", name="node1"),
            node(biconcat, ["AA", "CC"], "BB", name="node2"),
        ]
    )

    pattern = r"Failed to map datasets and/or parameters: params:beta"
    with pytest.raises(ModularPipelineError, match=pattern):
        pipeline(raw_pipeline, parameters={"params:beta": "params:gamma"})

    pattern = r"Failed to map datasets and/or parameters: parameters"
    with pytest.raises(ModularPipelineError, match=pattern):
        pipeline(raw_pipeline, parameters={"parameters": "some_yaml_dataset"})
def test_default_node_name_is_namespaced(self):
    """Check that auto-generated node names are also namespaced."""
    raw_pipeline = Pipeline([node(identity, "A", "B")])
    first_layer_nested_pipe = pipeline(raw_pipeline, namespace="PREFIX")
    resulting_node = first_layer_nested_pipe.nodes[0]

    assert resulting_node.name.startswith("PREFIX.")
    assert resulting_node.namespace == "PREFIX"

    second_layer_nested_pipe = pipeline(first_layer_nested_pipe, namespace="PRE")
    resulting_node = second_layer_nested_pipe.nodes[0]

    assert resulting_node.name.startswith("PRE.")
    assert resulting_node.namespace == "PRE.PREFIX"
def test_transform_dataset_names(self):
    """
    Rename some datasets, test str, list and dict formats.
    """
    raw_pipeline = Pipeline(
        [
            node(identity, "A", "B", name="node1"),
            node(biconcat, ["C", "D"], ["E", "F"], name="node2"),
            node(biconcat, {"input1": "H", "input2": "J"}, {"K": "L"}, name="node3"),
        ]
    )
    resulting_pipeline = pipeline(
        raw_pipeline,
        inputs={"A": "A_new", "D": "D_new", "H": "H_new"},
        outputs={"B": "B_new", "E": "E_new", "L": "L_new"},
    )

    # make sure the order is correct
    nodes = sorted(resulting_pipeline.nodes)

    assert nodes[0]._inputs == "A_new"
    assert nodes[0]._outputs == "B_new"
    assert nodes[1]._inputs == ["C", "D_new"]
    assert nodes[1]._outputs == ["E_new", "F"]
    assert nodes[2]._inputs == {"input1": "H_new", "input2": "J"}
    assert nodes[2]._outputs == {"K": "L_new"}
def test_transform_params_prefix_and_parameters(self):
    """
    Test that the transform skips `params:` and `parameters` entries in
    str, list and dict formats.
    """
    raw_pipeline = Pipeline(
        [
            node(identity, "parameters", "params:B", name="node1"),
            node(biconcat, ["params:C", "D"], ["parameters", "F"], name="node2"),
            node(
                biconcat,
                {"input1": "params:H", "input2": "parameters"},
                {"K": "L"},
                name="node3",
            ),
        ]
    )
    resulting_pipeline = pipeline(raw_pipeline, namespace="PREFIX")
    nodes = sorted(resulting_pipeline.nodes)

    assert nodes[0]._inputs == "parameters"
    assert nodes[0]._outputs == "params:B"
    assert nodes[1]._inputs == ["params:C", "PREFIX.D"]
    assert nodes[1]._outputs == ["parameters", "PREFIX.F"]
    assert nodes[2]._inputs == {"input1": "params:H", "input2": "parameters"}
    assert nodes[2]._outputs == {"K": "PREFIX.L"}
    assert nodes[2].name == "PREFIX.node3"
def register_pipelines() -> Dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from a pipeline name to a ``Pipeline`` object.
    """
    feature_engineering_train_pipe = _fe.create_pipeline().only_nodes_with_tags(
        "training"
    )
    feature_engineering_inference_pipe = pipeline(
        _fe.create_pipeline().only_nodes_with_tags("inference"),
        inputs={"iris_X_test": "iris_X_test", "normalizer": "normalizer"},
        outputs={"iris_X_test_normalized": "iris_X_test_normalized"},
        namespace="inference",
    )
    modelling_train_pipe = _modelling.create_pipeline().only_nodes_with_tags(
        "training"
    )

    return {
        "__default__": Pipeline([]),
        "fe_train": feature_engineering_train_pipe,
        "fe_inference": feature_engineering_inference_pipe,
        "model_train": modelling_train_pipe,
    }
def test_expose_intermediate_output(self):
    """Check that we don't namespace an intermediary dataset anywhere it
    is used, whether as an input or as an output."""
    raw_pipeline = Pipeline(
        [
            node(identity, "A", "B", name="node1"),
            node(identity, "B", "C", name="node2"),
            node(identity, "C", "D", name="node3"),
            node(biconcat, ["D", "params:x"], "X", name="node4"),
        ]
    )
    resulting_pipeline = pipeline(
        raw_pipeline, outputs={"B": "B_new"}, namespace="ACTUAL"
    )
    actual_nodes = resulting_pipeline.nodes

    assert actual_nodes[0]._outputs == "B_new"
    assert actual_nodes[1]._inputs == "B_new"
    assert actual_nodes[0]._inputs == "ACTUAL.A"
    assert actual_nodes[1]._outputs == "ACTUAL.C"
    assert actual_nodes[2]._inputs == "ACTUAL.C"
    assert actual_nodes[2]._outputs == "ACTUAL.D"
    assert actual_nodes[3]._inputs == ["ACTUAL.D", "params:x"]
    assert actual_nodes[3]._outputs == "ACTUAL.X"
def test_empty_output(self):
    raw_pipeline = Pipeline([node(biconcat, ["A", "B"], None)])

    resulting_pipeline = pipeline(
        raw_pipeline, namespace="PREFIX", inputs={"A": "A_new"}
    )
    assert resulting_pipeline.nodes[0]._inputs == ["A_new", "PREFIX.B"]
    assert resulting_pipeline.nodes[0]._outputs is None
def test_empty_input(self):
    raw_pipeline = Pipeline([node(constant_output, None, ["A", "B"])])

    resulting_pipeline = pipeline(
        raw_pipeline, namespace="PREFIX", outputs={"A": "A_new"}
    )
    assert resulting_pipeline.nodes[0]._inputs is None
    assert resulting_pipeline.nodes[0]._outputs == ["A_new", "PREFIX.B"]
def test_dataset_transcoding_mapping_base_name(self):
    raw_pipeline = Pipeline([node(biconcat, ["C@pandas", "D"], ["E@spark", "F"])])
    resulting_pipeline = pipeline(
        raw_pipeline, namespace="PREFIX", inputs={"C": "C_new"}
    )

    assert resulting_pipeline.nodes[0]._inputs == ["C_new@pandas", "PREFIX.D"]
    assert resulting_pipeline.nodes[0]._outputs == ["PREFIX.E@spark", "PREFIX.F"]
def test_pipeline_tags(self):
    tagged_pipeline = pipeline(
        [node(constant_output, None, "A"), node(constant_output, None, "B")],
        tags="tag",
    )

    assert all(n.tags == {"tag"} for n in tagged_pipeline.nodes)
def test_node_properties_preserved(self):
    """
    Check that we don't lose any valuable properties on node cloning.
    An explicitly defined name should also get prefixed.
    """
    raw_pipeline = Pipeline([node(identity, "A", "B", name="node1", tags=["tag1"])])
    raw_pipeline = raw_pipeline.decorate(lambda: None)
    resulting_pipeline = pipeline(raw_pipeline, namespace="PREFIX")

    assert resulting_pipeline.nodes[0].name == "PREFIX.node1"
    assert resulting_pipeline.nodes[0].tags == {"tag1"}
    assert len(resulting_pipeline.nodes[0]._decorators) == 1
def create_pipeline(**kwargs):
    return pipeline(
        [
            node(
                split_data,
                ["example_iris_data", "params:example_test_data_ratio"],
                dict(
                    train_x="example_train_x",
                    train_y="example_train_y",
                    test_x="example_test_x",
                    test_y="example_test_y",
                ),
            )
        ]
    )
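# The node above maps the keys of split_data's returned dict onto named
# catalog entries. A sketch of the function shape this assumes; the body is
# illustrative, not the project's actual implementation.
from typing import Any, Dict

import pandas as pd


def split_data(data: pd.DataFrame, example_test_data_ratio: float) -> Dict[str, Any]:
    # Hold out the last rows as the test set; the rest is training data.
    # Assumption: the last column holds the target variable.
    n_test = int(len(data) * example_test_data_ratio)
    split = len(data) - n_test
    train, test = data.iloc[:split], data.iloc[split:]
    target_col = data.columns[-1]
    return dict(
        train_x=train.drop(columns=[target_col]),
        train_y=train[target_col],
        test_x=test.drop(columns=[target_col]),
        test_y=test[target_col],
    )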
def test_prefixing_and_renaming(self):
    """
    Prefixing and renaming at the same time: explicitly renamed datasets
    are no longer prefixed.
    """
    raw_pipeline = Pipeline([node(biconcat, ["C", "D"], ["E", "F"])])
    resulting_pipeline = pipeline(
        raw_pipeline,
        namespace="PREFIX",
        inputs={"C": "C_new"},
        outputs={"E": "E_new"},
    )
    assert resulting_pipeline.nodes[0]._inputs == ["C_new", "PREFIX.D"]
    assert resulting_pipeline.nodes[0]._outputs == ["E_new", "PREFIX.F"]
def register_pipelines(self) -> Dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from a pipeline name to a ``Pipeline`` object.
    """
    data_engineering_pipeline = pipeline(
        de.create_pipeline(), outputs={"master_table": "ds_main_table"}
    )
    data_science_pipeline = ds.create_pipeline()

    return {
        "__default__": data_engineering_pipeline + data_science_pipeline,
        "de": data_engineering_pipeline,
        "ds": data_science_pipeline,
    }
def create_pipeline(**kwargs):
    return pipeline(
        [
            node(
                train_model,
                ["example_train_x", "example_train_y", "parameters"],
                "example_model",
            ),
            node(
                predict,
                dict(model="example_model", test_x="example_test_x"),
                "example_predictions",
            ),
            node(report_accuracy, ["example_predictions", "example_test_y"], None),
        ]
    )
def test_dataset_transcoding_mapping_full_dataset(self):
    raw_pipeline = Pipeline(
        [
            node(biconcat, ["A@pandas", "B"], "C"),
            node(biconcat, ["A@spark", "C"], "CC"),
        ]
    )
    resulting_pipeline = pipeline(
        raw_pipeline, inputs={"A@pandas": "Alpha"}, namespace="PREFIX"
    )

    assert resulting_pipeline.nodes[0]._inputs == ["Alpha", "PREFIX.B"]
    assert resulting_pipeline.nodes[0]._outputs == "PREFIX.C"
    assert resulting_pipeline.nodes[1]._inputs == ["PREFIX.A@spark", "PREFIX.C"]
    assert resulting_pipeline.nodes[1]._outputs == "PREFIX.CC"
def register_pipelines() -> Dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from a pipeline name to a ``Pipeline`` object.
    """
    data_engineering_pipeline = de.create_pipeline()
    data_processing_pipeline = pipeline(
        de.create_pipeline(), namespace="data_processing"
    )
    data_science_pipeline = ds.create_pipeline()

    return {
        "de": data_engineering_pipeline,
        "ds": data_science_pipeline,
        "dp": data_processing_pipeline,
        "__default__": data_engineering_pipeline + data_science_pipeline,
    }
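# One way to run a registered pipeline by name from Python, rather than via
# the `kedro run --pipeline <name>` CLI. A sketch only: the session API
# differs between Kedro versions; this assumes a 0.18-style project layout.
from pathlib import Path

from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project

bootstrap_project(Path.cwd())
with KedroSession.create() as session:
    session.run(pipeline_name="dp")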
# Representative parametrisation (assumed; the original decorator is not
# shown): identity mappings in any accepted format leave the free input "A"
# and the free output "D" unprefixed.
@pytest.mark.parametrize(
    "inputs,outputs",
    [("A", "D"), ({"A"}, {"D"}), ({"A": "A"}, {"D": "D"})],
)
def test_prefix_exclude_free_inputs(self, inputs, outputs):
    raw_pipeline = Pipeline(
        [
            node(identity, "A", "B", name="node1"),
            node(identity, "B", "C", name="node2"),
            node(identity, "C", "D", name="node3"),
        ]
    )
    resulting_pipeline = pipeline(
        raw_pipeline, inputs=inputs, outputs=outputs, namespace="PREFIX"
    )
    nodes = sorted(resulting_pipeline.nodes)

    assert nodes[0]._inputs == "A"
    assert nodes[0]._outputs == "PREFIX.B"
    assert nodes[1]._inputs == "PREFIX.B"
    assert nodes[1]._outputs == "PREFIX.C"
    assert nodes[2]._inputs == "PREFIX.C"
    assert nodes[2]._outputs == "D"
def create_pipeline(**kwargs) -> Pipeline:
    return pipeline(
        [
            node(
                func=split_data,
                inputs=["example_iris_data", "parameters"],
                outputs=["X_train", "X_test", "y_train", "y_test"],
                name="split",
            ),
            node(
                func=make_predictions,
                inputs=["X_train", "X_test", "y_train"],
                outputs="y_pred",
                name="make_predictions",
            ),
            node(
                func=report_accuracy,
                inputs=["y_pred", "y_test"],
                outputs=None,
                name="report_accuracy",
            ),
        ]
    )
def test_parameters_updated(self):
    raw_pipeline = Pipeline(
        [
            node(biconcat, ["A", "params:x"], "AA", name="node1"),
            node(biconcat, ["AA", "params:y"], "B", name="node2"),
            node(biconcat, ["B", "params:x"], "BB", name="node3"),
        ]
    )
    resulting_pipeline = pipeline(
        raw_pipeline,
        outputs={"B": "B_new"},
        parameters={"params:x": "params:y"},
        namespace="ACTUAL",
    )
    actual_nodes = resulting_pipeline.nodes

    assert actual_nodes[0]._inputs == ["ACTUAL.A", "params:y"]
    assert actual_nodes[0]._outputs == "ACTUAL.AA"
    assert actual_nodes[1]._inputs == ["ACTUAL.AA", "params:y"]
    assert actual_nodes[1]._outputs == "B_new"
    assert actual_nodes[2]._inputs == ["B_new", "params:y"]
    assert actual_nodes[2]._outputs == "ACTUAL.BB"
def test_prefix_dataset_names(self):
    """
    Simple prefixing for datasets of all formats: str, list and dict.
    """
    raw_pipeline = Pipeline(
        [
            node(identity, "A", "B", name="node1"),
            node(biconcat, ["C", "D"], ["E", "F"], name="node2"),
            node(biconcat, {"input1": "H", "input2": "J"}, {"K": "L"}, name="node3"),
        ]
    )
    resulting_pipeline = pipeline(raw_pipeline, namespace="PREFIX")
    nodes = sorted(resulting_pipeline.nodes)

    assert nodes[0]._inputs == "PREFIX.A"
    assert nodes[0]._outputs == "PREFIX.B"
    assert nodes[1]._inputs == ["PREFIX.C", "PREFIX.D"]
    assert nodes[1]._outputs == ["PREFIX.E", "PREFIX.F"]
    assert nodes[2]._inputs == {"input1": "PREFIX.H", "input2": "PREFIX.J"}
    assert nodes[2]._outputs == {"K": "PREFIX.L"}
        combine_data_horizontal,
        [
            "percent_mean_data",
            "percent_typical_data",
            "percent_median_data",
            "positional_val_data",
        ],
        "final_score_data",
        name="final_scoring_node",
    ),
])

# Each of the following pipelines is here to do the ranking for one
# scoring type.
full_ppr_pipeline = pipeline(
    ranking_pipeline,
    inputs={"scored_data": "scored_ppr_data"},
    outputs={"final_score_data": "scoring.ppr"},
    namespace="ppr",
)
full_half_ppr_pipeline = pipeline(
    ranking_pipeline,
    inputs={"scored_data": "scored_half_ppr_data"},
    outputs={"final_score_data": "scoring.half_ppr"},
    namespace="hppr",
)
full_standard_pipeline = pipeline(
    ranking_pipeline,
    inputs={"scored_data": "scored_standard_data"},
    outputs={"final_score_data": "scoring.standard"},
    namespace="std",
)