def pipeline_ml_with_inputs_artifacts():
    """Build a ``PipelineML`` whose shared preprocessing node consumes an
    extra artifact (``stopwords_from_nltk``) that is produced outside the
    training pipeline and must therefore be logged for inference.

    Returns:
        A ``PipelineML`` created by ``pipeline_ml_factory`` with ``data``
        as the inference input dataset.
    """
    nodes = [
        # Shared by both sub-pipelines: tagged "training" and "inference".
        node(
            func=remove_stopwords,
            inputs={"data": "data", "stopwords": "stopwords_from_nltk"},
            outputs="cleaned_data",
            tags=["training", "inference"],
        ),
        node(
            func=train_fun,
            inputs="cleaned_data",
            outputs="model",
            tags=["training"],
        ),
        node(
            func=predict_fun,
            inputs=["model", "cleaned_data"],
            outputs="predictions",
            tags=["inference"],
        ),
    ]
    complete_pipeline = Pipeline(nodes)
    # Split the full pipeline into its training / inference halves by tag.
    return pipeline_ml_factory(
        training=complete_pipeline.only_nodes_with_tags("training"),
        inference=complete_pipeline.only_nodes_with_tags("inference"),
        input_name="data",
    )
# Example #2
def pipeline_ml_obj():
    """Build a minimal ``PipelineML`` from three toy node functions.

    The node functions are defined inline so the example is fully
    self-contained; ``raw_data`` is the inference input dataset.

    Returns:
        A ``PipelineML`` created by ``pipeline_ml_factory``.
    """

    # Toy node implementations — just enough to wire the pipeline together.
    def preprocess_fun(data):
        return data

    def fit_fun(data):
        return 2

    def predict_fun(model, data):
        return data * model

    preprocessing_node = node(
        func=preprocess_fun,
        inputs="raw_data",
        outputs="data",
        tags=["inference", "training"],
    )
    training_node = node(
        func=fit_fun, inputs="data", outputs="model", tags=["training"]
    )
    inference_node = node(
        func=predict_fun,
        inputs=["data", "model"],
        outputs="predictions",
        tags=["inference"],
    )
    complete = Pipeline([preprocessing_node, training_node, inference_node])

    # Tag-based split into the training / inference sub-pipelines.
    return pipeline_ml_factory(
        training=complete.only_nodes_with_tags("training"),
        inference=complete.only_nodes_with_tags("inference"),
        input_name="raw_data",
    )
def pipeline_ml_with_parameters():
    """Build a ``PipelineML`` that exercises the three parameter cases:

    - a parameter shared by training and inference (``params:stopwords``),
    - a training-only parameter (``params:penalty``) that must not be
      persisted with the model,
    - an inference-only parameter (``params:threshold``).

    Returns:
        A ``PipelineML`` created by ``pipeline_ml_factory`` with a custom
        conda environment passed through ``log_model_kwargs``.
    """

    # Toy node implementations — just enough to wire the pipeline together.
    def remove_stopwords(data, stopwords):
        return data

    def train_fun_hyperparam(data, hyperparam):
        return 2

    def predict_fun(model, data):
        return data * model

    def convert_probs_to_pred(data, threshold):
        return (data > threshold) * 1

    complete = Pipeline([
        # Shared parameter between training and inference: the stopwords
        # now come from the parameters file instead of a dataset.
        node(
            func=remove_stopwords,
            inputs={"data": "data", "stopwords": "params:stopwords"},
            outputs="cleaned_data",
            tags=["training", "inference"],
        ),
        # Training-only parameter: should not be persisted with the model.
        node(
            func=train_fun_hyperparam,
            inputs=["cleaned_data", "params:penalty"],
            outputs="model",
            tags=["training"],
        ),
        node(
            func=predict_fun,
            inputs=["model", "cleaned_data"],
            outputs="predicted_probs",
            tags=["inference"],
        ),
        # Inference-only parameter: the decision threshold.
        node(
            func=convert_probs_to_pred,
            inputs=["predicted_probs", "params:threshold"],
            outputs="predictions",
            tags=["inference"],
        ),
    ])

    return pipeline_ml_factory(
        training=complete.only_nodes_with_tags("training"),
        inference=complete.only_nodes_with_tags("inference"),
        input_name="data",
        log_model_kwargs={
            "conda_env": {
                "python": "3.7.0",
                "dependencies": ["kedro==0.16.5"]
            },
        },
    )
# Example #4
    def _filter_pipeline(
        self,
        pipeline: Pipeline,
        tags: Iterable[str] = None,
        from_nodes: Iterable[str] = None,
        to_nodes: Iterable[str] = None,
        node_names: Iterable[str] = None,
        from_inputs: Iterable[str] = None,
    ) -> Pipeline:
        """Filter the pipeline as the intersection of all conditions."""
        # Every filter is computed against the ORIGINAL pipeline and then
        # intersected, because applying them sequentially would make the
        # result depend on the order of operations. E.g. with nodes 1,2,3,
        # "from 1" then "only 1 and 3" gives 1 & 3, but the reverse order
        # would give just 1.
        filtered = pipeline

        if tags:
            filtered &= pipeline.only_nodes_with_tags(*tags)
            # Fail fast with a tag-specific message when the tag filter
            # alone eliminates everything.
            if not filtered.nodes:
                raise KedroContextError(
                    "Pipeline contains no nodes with tags: {}".format(
                        str(tags)))

        # Remaining filters share the same "intersect if provided" shape.
        for selection, subset_of in (
            (from_nodes, pipeline.from_nodes),
            (to_nodes, pipeline.to_nodes),
            (node_names, pipeline.only_nodes),
            (from_inputs, pipeline.from_inputs),
        ):
            if selection:
                filtered &= subset_of(*selection)

        if not filtered.nodes:
            raise KedroContextError("Pipeline contains no nodes")
        return filtered
def pipeline_ml_with_intermediary_artifacts():
    """Build a ``PipelineML`` where the inference pipeline depends on an
    intermediary artifact (``encoder``) produced by the training pipeline.

    Returns:
        A ``PipelineML`` with ``data`` as the inference input dataset.
    """
    full_pipeline = Pipeline([
        node(
            func=preprocess_fun,
            inputs="raw_data",
            outputs="data",
            tags=["training"],
        ),
        # The encoder is fitted during training only...
        node(
            func=fit_encoder_fun,
            inputs="data",
            outputs="encoder",
            tags=["training"],
        ),
        # ...but applied in both training and inference, so "encoder" is an
        # intermediary artifact the inference pipeline needs at serving time.
        node(
            func=apply_encoder_fun,
            inputs=["encoder", "data"],
            outputs="encoded_data",
            tags=["training", "inference"],
        ),
        node(
            func=train_fun,
            inputs="encoded_data",
            outputs="model",
            tags=["training"],
        ),
        node(
            func=predict_fun,
            inputs=["model", "encoded_data"],
            outputs="predictions",
            tags=["inference"],
        ),
    ])
    # Use pipeline_ml_factory (same keyword signature) for consistency with
    # the other factories in this file; `pipeline_ml` is the deprecated alias.
    pipeline_ml_with_tag = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="data",
    )
    return pipeline_ml_with_tag
def pipeline_ml_with_parameters():
    """Build a ``PipelineML`` whose nodes consume parameters in three ways:
    shared between training and inference (``params:stopwords``),
    training-only (``params:penalty``), and inference-only
    (``params:threshold``).

    Returns:
        A ``PipelineML`` with ``data`` as the inference input dataset.
    """
    nodes = [
        # Shared parameter between training and inference: stopwords come
        # from the parameters file instead of a dataset.
        node(
            func=remove_stopwords,
            inputs={"data": "data", "stopwords": "params:stopwords"},
            outputs="cleaned_data",
            tags=["training", "inference"],
        ),
        # Training-only parameter: should not be persisted with the model.
        node(
            func=train_fun_hyperparam,
            inputs=["cleaned_data", "params:penalty"],
            outputs="model",
            tags=["training"],
        ),
        node(
            func=predict_fun,
            inputs=["model", "cleaned_data"],
            outputs="predicted_probs",
            tags=["inference"],
        ),
        # Inference-only parameter: the decision threshold.
        node(
            func=convert_probs_to_pred,
            inputs=["predicted_probs", "params:threshold"],
            outputs="predictions",
            tags=["inference"],
        ),
    ]
    complete = Pipeline(nodes)
    return pipeline_ml_factory(
        training=complete.only_nodes_with_tags("training"),
        inference=complete.only_nodes_with_tags("inference"),
        input_name="data",
    )
# Example #7
def create_pipelines(*tags: str):
    """Create the project's pipelines, optionally restricted by tag.

    Args:
        *tags: Tag names; when given, only nodes carrying at least one of
            these tags are kept.

    Returns:
        A mapping with the (possibly filtered) pipeline under
        ``"__default__"``.

    Raises:
        ValueError: If none of the requested tags matches any node.
    """
    base_pipeline = Pipeline(
        [
            node(
                lambda x: x,
                "A",
                "B",
                name="node_1",
                tags=[
                    "apple",
                    "orange",
                    "banana",
                    "lemon",
                    "grape",
                    "coconut",
                    "fresh strawberries!",
                ],
            ),
            node(
                sum_dfs,
                ["B", "C"],
                "D",
                name="node_2",
                tags=["apple", "orange", "lemon"],
            ),
            node(
                identity,
                "D",
                "E",
                name="node_3",
                tags=["apple", "orange", "banana", "cherry"],
            ),
            node(identity, "D", "F", name="node_4", tags=["apple", "cherry"]),
        ]
    )

    if not tags:
        return {"__default__": base_pipeline}

    # Union of the per-tag sub-pipelines (Pipeline addition de-duplicates).
    selected = Pipeline([])
    for tag in tags:
        selected += base_pipeline.only_nodes_with_tags(tag)
    if not selected.nodes:
        raise ValueError(
            "Not found any nodes having any of the following "
            "tags attached: {}".format(", ".join(tags))
        )
    return {"__default__": selected}