def test_release_transcoded(self, is_async):
        """Check that both transcoded entries of one data set are released.

        ``ds@save`` is written by ``source`` and ``ds@load`` is read by
        ``sink``; the shared log must show a release for each of them.

        Args:
            is_async: Forwarded to ``ParallelRunner`` so the test exercises
                the requested loading/saving mode.
        """
        runner = ParallelRunner(is_async=is_async)
        log = runner._manager.list()

        pipeline = Pipeline(
            [node(source, None, "ds@save"), node(sink, "ds@load", None)]
        )
        catalog = DataCatalog(
            {
                "ds@save": LoggingDataSet(log, "save"),
                "ds@load": LoggingDataSet(log, "load"),
            }
        )

        # Run with the runner configured above; a fresh ``ParallelRunner()``
        # would silently ignore ``is_async``.
        runner.run(pipeline, catalog)

        # we want to see both datasets being released
        assert list(log) == [("release", "save"), ("load", "load"), ("release", "load")]
Exemple #2
0
def create_pipelines(**kwargs) -> Dict[str, Pipeline]:
    """Build the project's pipeline registry.

    Args:
        kwargs: Accepted and ignored so that arguments added in the future
            do not break this function.

    Returns:
        A mapping from a pipeline name to a ``Pipeline`` object. Only the
        ``"__default__"`` entry is defined here.

    """
    # Both nodes produce no outputs; each one calls a single method on its
    # input data set.
    describe_pandas = node(
        lambda frame: frame.describe(),
        inputs='pandas_iris_data',
        outputs=None,
    )
    show_spark = node(
        lambda frame: frame.show(),
        inputs='spark_iris_data',
        outputs=None,
    )

    default_pipeline = Pipeline([describe_pandas, show_spark])
    return {"__default__": default_pipeline}
Exemple #3
0
    def test_count_multiple_loads(self):
        """Check that a data set read by two nodes is released only once.

        Two ``sink`` nodes consume ``"dataset"``; the log must contain both
        loads followed by a single release.
        """
        runner = ThreadRunner()
        events = []

        producer = node(source, None, "dataset")
        consumers = [
            node(sink, "dataset", None, name=consumer_name)
            for consumer_name in ("bob", "fred")
        ]
        pipeline = Pipeline([producer, *consumers])
        catalog = DataCatalog({"dataset": LoggingDataSet(events, "dataset")})

        runner.run(pipeline, catalog)

        # we want to see the release only after both of the loads
        expected_events = [
            ("load", "dataset"),
            ("load", "dataset"),
            ("release", "dataset"),
        ]
        assert list(events) == expected_events
def perm_importance_pipeline(**kwargs):
    """Build a two-node pipeline around ``fit_model`` and
    ``evaluate_perm_importance``.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: ``fit_model_node`` produces ``"regressor"``, which
        ``evaluate_perm_importance_node`` consumes together with the data
        sets listed below. The second node has no outputs.
    """
    fit_step = node(
        func=fit_model,
        inputs=["y", "X"],
        outputs="regressor",
        name="fit_model_node",
    )

    importance_inputs = [
        "regressor",
        "y",
        "X",
        "w",
        "y_true",
        "features_mask",
        "parameters",
    ]
    importance_step = node(
        func=evaluate_perm_importance,
        inputs=importance_inputs,
        outputs=None,
        name="evaluate_perm_importance_node",
    )

    return Pipeline([fit_step, importance_step])
def create_pipeline(**kwargs):
    """Assemble the example pipeline shipped with the project template.

    The pipeline wires ``split_data``, ``train_model``, ``predict`` and
    ``report_accuracy`` together through the ``example_*`` data sets.

    Args:
        kwargs: Ignore any additional arguments added in the future.

    Returns:
        Pipeline: The resulting pipeline.

    """

    ###########################################################################
    # Example pipeline with 4 nodes.
    #
    # PLEASE DELETE THIS PIPELINE ONCE YOU START WORKING ON YOUR OWN PROJECT AS
    # WELL AS THE FILE nodes/example.py
    # -------------------------------------------------------------------------

    # ``split_data`` returns a mapping; each key is routed to its own data set.
    split_outputs = {
        "train_x": "example_train_x",
        "train_y": "example_train_y",
        "test_x": "example_test_x",
        "test_y": "example_test_y",
    }
    split_step = node(
        split_data,
        ["example_iris_data", "parameters"],
        split_outputs,
    )

    train_step = node(
        train_model,
        ["example_train_x", "example_train_y", "parameters"],
        "example_model",
    )

    # ``predict`` receives its inputs as keyword arguments.
    predict_step = node(
        predict,
        {"model": "example_model", "test_x": "example_test_x"},
        "example_predictions",
    )

    report_step = node(
        report_accuracy,
        ["example_predictions", "example_test_y"],
        None,
    )
    ###########################################################################

    return Pipeline([split_step, train_step, predict_step, report_step])
Exemple #6
0
def create_pipeline(**kwargs):
    """Build the cryptocurrency preparation pipeline, tagged ``cd_tag``.

    Five nodes form a chain starting from ``"cryptocurrency"``; a sixth,
    independent node formats ``"total-bitcoins"``.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: The six nodes below, all carrying the ``cd_tag`` tag.
    """
    # (function, input data set, output data set, node name)
    stages = [
        (
            preprocess_cryptocurrencies,
            "cryptocurrency",
            "processed_cryptocurrencies",
            "preprocessing_currencies",
        ),
        (
            filter_cryptocurrencies,
            "processed_cryptocurrencies",
            "filtered_cryptocurrencies",
            "filtering_cryptocurrencies",
        ),
        (
            format_cryptocurrencies,
            "filtered_cryptocurrencies",
            "formated_cyptocurrencies",
            "formating_cryptocurrencies",
        ),
        (
            yearvalid_cryptocurrencies,
            "formated_cyptocurrencies",
            "validyear_cryptocurrencies",
            "validatingyear_cryptocurrencies",
        ),
        (
            inverse_cryptocurrencies,
            "validyear_cryptocurrencies",
            "reversed_cryptocurrencies",
            "reversing_cryptocurrencies",
        ),
        (
            format_totalbitcoins,
            "total-bitcoins",
            "formated_total_bitcoins",
            "formating_total_bitcoins",
        ),
    ]

    pipeline_nodes = [
        node(stage_func, inputs=source_name, outputs=target_name, name=node_name)
        for stage_func, source_name, target_name, node_name in stages
    ]
    return Pipeline(pipeline_nodes, tags=["cd_tag"])
Exemple #7
0
def create_pipeline(**kwargs):
    """Build the fire-data cleansing pipeline.

    The nodes run ``cleanse_fire_data``, ``transform_datetime``,
    ``fill_missing_coordinates`` and ``calculate_closest_station`` in a
    chain that starts at ``"raw_fire_data"`` and ends at
    ``"fire_data_cleansed_stations"``.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: The four named nodes.
    """
    cleanse_step = node(
        cleanse_fire_data,
        "raw_fire_data",
        "fire_data_basic_cleansed",
        name="cleanse_fire_data",
    )
    datetime_step = node(
        transform_datetime,
        "fire_data_basic_cleansed",
        "fire_data_with_date_info",
        name="transform_datetime",
    )
    coordinates_step = node(
        fill_missing_coordinates,
        "fire_data_with_date_info",
        "fire_data_cleansed",
        name="fill_missing_coordinates",
    )
    # The last step additionally needs the station data and the parameters.
    station_step = node(
        calculate_closest_station,
        ["fire_data_cleansed", "foehn_stations", "parameters"],
        "fire_data_cleansed_stations",
        name="calculate_closest_station",
    )

    return Pipeline([cleanse_step, datetime_step, coordinates_step, station_step])
Exemple #8
0
def create_annual_projections_pipeline(**kwargs):
    """
    Combine all the annual projection sources into a single annual projection.
    This only works for the current year, which is assumed.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: the sub-pipelines from ``create_fp_proj_pipeline`` and
        ``create_cbs_proj_pipeline`` followed by a node that feeds
        ``LOCAL_PROJECTIONS`` to ``average_stats_by_player`` and writes
        ``"projections.annual"``.
    """
    # Build the source sub-pipelines first, in the same order in which they
    # appear in the final pipeline.
    source_pipelines = [
        create_fp_proj_pipeline(),
        create_cbs_proj_pipeline(),
    ]
    averaging_node = node(
        func=average_stats_by_player,
        inputs=LOCAL_PROJECTIONS,
        outputs="projections.annual",
    )

    return Pipeline([*source_pipelines, averaging_node])
Exemple #9
0
def create_pipeline(**kwargs):
    """Build the modeling pipeline, tagged ``ds_tag``.

    The nodes call ``modeling.split_data``, ``modeling.train_model`` and
    ``modeling.evaluate_model``; the evaluation node has no outputs.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: The three nodes, all carrying the ``ds_tag`` tag.
    """
    split_step = node(
        func=modeling.split_data,
        inputs=["train_prep", "parameters"],
        outputs=["X_train", "X_test", "y_train", "y_test"],
    )
    train_step = node(
        func=modeling.train_model,
        inputs=["X_train", "y_train"],
        outputs="clf",
    )
    evaluate_step = node(
        func=modeling.evaluate_model,
        inputs=["clf", "X_test", "y_test"],
        outputs=None,
    )

    steps = [split_step, train_step, evaluate_step]
    return Pipeline(steps, tags=["ds_tag"])
Exemple #10
0
def create_pipeline(**kwargs):
    """Build the test-set feature pipeline.

    Four fixed nodes are followed by one ``encode_test`` node per entry of
    ``category_cols``. The join node consumes every
    ``<col>_test_enc_feature`` output produced by those per-column nodes.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: The fixed nodes followed by the per-column encoding nodes.
    """
    encoded_feature_names = [
        f'{col}_test_enc_feature' for col in category_cols
    ]

    nodes = [
        _node(start_test, ['test_df', 'item_df'], 'test_start_df'),
        _node(pricing_map, ['train_enc_df', 'test_start_df'], 'price_df'),
        _node(
            join_test,
            ['test_start_df', 'price_df'] + encoded_feature_names,
            'test_join_df',
        ),
        _node(lag_test, ['test_lag_df', 'test_join_df'], 'x_test'),
    ]

    # One encoding node per categorical column, appended after the fixed ones.
    nodes.extend(
        _node(
            _partial(encode_test, col),
            ['train_enc_df', 'test_start_df'],
            f'{col}_test_enc_feature',
            f'_{col}',
        )
        for col in category_cols
    )
    return Pipeline(nodes)
Exemple #11
0
def test_node_hook_logging_above_limit_truncate_strategy(
        kedro_project, dummy_run_params, param_length):
    """Check the ``truncate`` strategy for long node parameters.

    With ``long_parameters_strategy="truncate"`` configured in
    ``conf/local/mlflow.yml``, the parameter recorded in MLflow must equal
    the first ``MAX_PARAM_VAL_LENGTH`` characters of the original value.
    """
    mlflow_config = {"hooks": {"node": {"long_parameters_strategy": "truncate"}}}
    _write_yaml(
        kedro_project / "conf" / "local" / "mlflow.yml",
        mlflow_config,
    )

    tracking_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(tracking_uri)

    hook = MlflowNodeHook()

    # A value made of ``param_length`` repetitions of a single character.
    param_value = "a" * param_length
    node_inputs = {"params:my_param": param_value}

    metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(metadata.source_dir, kedro_project)
    configure_project(metadata.package_name)

    session = KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project,
    )
    with session:
        with mlflow.start_run():
            hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )
            hook.before_node_run(
                node=node(func=lambda x: x, inputs={"x": "a"}, outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=node_inputs,
                is_async=False,
                run_id="132",
            )
            # Remember the run id while the run is still active.
            run_id = mlflow.active_run().info.run_id

        logged_run = MlflowClient(tracking_uri).get_run(run_id)
        expected_params = {"my_param": param_value[:MAX_PARAM_VAL_LENGTH]}
        assert logged_run.data.params == expected_params
def create_pipeline(**kwargs):
    """Build the pipeline around ``train_model_sagemaker``.

    ``split_data`` writes ``"X_train@pickle"`` while the training node
    reads ``"X_train@path"``. ``untar_model`` turns ``"model_path"`` into
    ``"regressor"``, which ``evaluate_model`` then consumes.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: The four nodes described above.
    """
    split_step = node(
        func=split_data,
        inputs=["master_table", "parameters"],
        outputs=["X_train@pickle", "X_test", "y_train", "y_test"],
    )
    train_step = node(
        func=train_model_sagemaker,
        inputs=["X_train@path", "params:sklearn_estimator_kwargs"],
        outputs="model_path",
    )
    untar_step = node(untar_model, inputs="model_path", outputs="regressor")
    evaluate_step = node(
        func=evaluate_model,
        inputs=["regressor", "X_test", "y_test"],
        outputs=None,
    )

    return Pipeline([split_step, train_step, untar_step, evaluate_step])
Exemple #13
0
def create_pipeline(**kwargs):
    """Build the fastText training and reporting pipeline.

    For each of the four tasks below, a ``train_fasttext_model`` node is
    followed by a ``report_accuracy_fasttext`` node that reads the model
    path just produced together with the task's test data set.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: Eight nodes, one train/report pair per task, in task order.
    """
    tasks = (
        "classification",
        "favour_against",
        "opinion_towards",
        "sentiment",
    )

    steps = []
    for task in tasks:
        model_path = f"fasttext_{task}_model_path"
        steps.append(
            node(
                train_fasttext_model,
                [f"{task}_train", f"params:model_{task}_name", "parameters"],
                model_path,
            )
        )
        steps.append(
            node(
                report_accuracy_fasttext,
                [model_path, f"{task}_test"],
                None,
            )
        )

    return Pipeline(steps)
Exemple #14
0
def community_prep(**kwargs):
    """community_prep prepares the text files.

    One ``format_textfiles`` node is created for each of coal, oil and gas.
    Every node reads the ``flow_<fuel>_nx_nodes`` and ``flow_<fuel>_nx_edges``
    data sets, declares no outputs, and is tagged with ``community-prep``
    plus ``community-prep_<fuel>``.
    """
    common_tags = ["community-prep"]

    nodes = [
        node(
            format_textfiles,
            [f"flow_{fuel}_nx_nodes", f"flow_{fuel}_nx_edges"],
            [],
            tags=common_tags + [f"community-prep_{fuel}"],
        )
        for fuel in ("coal", "oil", "gas")
    ]

    return Pipeline(nodes)
Exemple #15
0
def create_pipeline(**kwargs):
    """Build the stroke-data modeling pipeline.

    The nodes call ``split_data``, ``over_sample_data``, ``train_model`` and
    ``evaluate_model``. Training uses the ``*_res`` outputs of the
    over-sampling node, while evaluation uses the untouched test split.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: The four named nodes.
    """
    split_step = node(
        func=split_data,
        inputs=['preprocessed_stroke_data', 'parameters'],
        outputs=['X_train', 'X_test', 'y_train', 'y_test'],
        name='split_data_node',
    )
    over_sample_step = node(
        func=over_sample_data,
        inputs=['X_train', 'y_train', 'parameters'],
        outputs=['X_train_res', 'y_train_res'],
        name='over_sample_node',
    )
    train_step = node(
        func=train_model,
        inputs=['X_train_res', 'y_train_res'],
        outputs='lgbm',
        name='train_model_node',
    )
    evaluate_step = node(
        func=evaluate_model,
        inputs=['lgbm', 'X_test', 'y_test'],
        outputs=None,
        name='evaluate_model_node',
    )

    return Pipeline([split_step, over_sample_step, train_step, evaluate_step])
Exemple #16
0
    def test_dont_release_inputs_and_outputs(self):
        """Check that only the intermediate data set is released.

        ``"in"`` is a free input and ``"out"`` a final output of the
        pipeline; the log must contain a release for ``"middle"`` only.
        """
        manager = ParallelRunnerManager()
        manager.start()
        events = manager.list()

        first_hop = node(identity, "in", "middle")
        second_hop = node(identity, "middle", "out")
        pipeline = Pipeline([first_hop, second_hop])

        data_sets = {
            "in": manager.LoggingDataSet(events, "in", "stuff"),
            "middle": manager.LoggingDataSet(events, "middle"),
            "out": manager.LoggingDataSet(events, "out"),
        }
        catalog = DataCatalog(data_sets)
        ParallelRunner().run(pipeline, catalog)

        # we don't want to see release in or out in here
        expected_events = [
            ("load", "in"),
            ("load", "middle"),
            ("release", "middle"),
        ]
        assert list(events) == expected_events
Exemple #17
0
    def register_pipelines(self) -> Dict[str, Pipeline]:
        """Register the project's pipeline.

        The default pipeline builds ``range(10)``, squares each element,
        adds one to each square, builds ``range(100, 110)`` and finally
        concatenates the two resulting sequences into ``"join"``.

        Returns:
            A mapping from a pipeline name to a ``Pipeline`` object.

        """
        range_one = node(
            lambda: range(10),
            inputs=None,
            outputs="range_one",
            name="create_range_one",
        )
        square_range_one = node(
            lambda values: [value**2 for value in values],
            inputs="range_one",
            outputs="square_range_one",
            name="create_square_range_one",
        )
        add_one = node(
            lambda values: [value + 1 for value in values],
            inputs="square_range_one",
            outputs="add_one",
            name="create_add_one",
        )
        range_two = node(
            lambda: range(100, 110),
            inputs=None,
            outputs="range_two",
            name="create_range_two",
        )
        join = node(
            lambda left, right: [*left, *right],
            inputs=["add_one", "range_two"],
            outputs="join",
            name="create_join",
        )

        default_pipeline = Pipeline(
            [range_one, square_range_one, add_one, range_two, join]
        )
        return {"__default__": default_pipeline}
    def test_count_multiple_loads(self, is_async):
        """Check that a data set read by two nodes is released only once.

        Args:
            is_async: Forwarded to ``ParallelRunner``.
        """
        runner = ParallelRunner(is_async=is_async)
        # The log and the data set are both created through the runner's
        # manager.
        events = runner._manager.list()

        producer = node(source, None, "dataset")
        consumers = [
            node(sink, "dataset", None, name=consumer_name)
            for consumer_name in ("bob", "fred")
        ]
        pipeline = Pipeline([producer, *consumers])

        logging_data_set = runner._manager.LoggingDataSet(events, "dataset")
        catalog = DataCatalog({"dataset": logging_data_set})
        runner.run(pipeline, catalog)

        # we want to see the release only after both of the loads
        expected_events = [
            ("load", "dataset"),
            ("load", "dataset"),
            ("release", "dataset"),
        ]
        assert list(events) == expected_events
Exemple #19
0
    def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline):
        """Ensure that all data sets are serializable and that we do not have
        any non proxied memory data sets being used as outputs as their content
        will not be synchronized across processes.

        Args:
            catalog: The ``DataCatalog`` whose registered data sets are checked.
            pipeline: The ``Pipeline`` whose outputs are checked against the
                catalog's memory data sets.

        Raises:
            AttributeError: If a data set is flagged ``_SINGLE_PROCESS`` or
                cannot be pickled with ``ForkingPickler``, or if a pipeline
                output is a ``MemoryDataSet`` that is not a ``BaseProxy``.
        """

        data_sets = catalog._data_sets  # pylint: disable=protected-access

        unserializable = []
        for name, data_set in data_sets.items():
            # Data sets that declare themselves single-process are rejected
            # without attempting to pickle them.
            if getattr(data_set, "_SINGLE_PROCESS", False):  # SKIP_IF_NO_SPARK
                unserializable.append(name)
                continue
            try:
                ForkingPickler.dumps(data_set)
            except (AttributeError, PicklingError):
                unserializable.append(name)

        if unserializable:
            raise AttributeError(
                "The following data sets cannot be used with multiprocessing: "
                "{}\nIn order to utilize multiprocessing you need to make sure "
                "all data sets are serializable, i.e. data sets should not make "
                "use of lambda functions, nested functions, closures etc.\nIf you "
                "are using custom decorators ensure they are correctly using "
                "functools.wraps().".format(sorted(unserializable))
            )

        # Compute the pipeline outputs once instead of once per data set.
        pipeline_outputs = pipeline.all_outputs()
        memory_data_sets = [
            name
            for name, data_set in data_sets.items()
            if name in pipeline_outputs
            and isinstance(data_set, MemoryDataSet)
            and not isinstance(data_set, BaseProxy)
        ]

        if memory_data_sets:
            raise AttributeError(
                "The following data sets are memory data sets: {}\n"
                "ParallelRunner does not support output to externally created "
                "MemoryDataSets".format(sorted(memory_data_sets))
            )
Exemple #20
0
def create_pipeline():
    """Build the temperature-data download pipeline.

    ``generate_date_range`` produces ``'dates_to_download'``;
    ``parallel_get_temp_data`` consumes it together with
    ``'already_downloaded_dates'``; ``parallel_choose_station`` then turns
    ``'downloaded_dates'`` into ``'downloaded_station_data'``.

    Returns:
        Pipeline: The three nodes described above.
    """
    date_range_step = node(
        generate_date_range,
        inputs=['params:start_date', 'params:end_date'],
        outputs='dates_to_download',
    )
    # Note the trailing "!" in the second output name: it is a different
    # data set name from the 'already_downloaded_dates' input.
    download_step = node(
        parallel_get_temp_data,
        inputs=['dates_to_download', 'already_downloaded_dates'],
        outputs=['downloaded_dates', 'already_downloaded_dates!'],
    )
    station_step = node(
        parallel_choose_station,
        inputs=['downloaded_dates', 'params:station_id'],
        outputs='downloaded_station_data',
    )

    return Pipeline([date_range_step, download_step, station_step])
Exemple #21
0
    def test_dont_release_inputs_and_outputs(self, is_async):
        """Check that only the intermediate data set is released.

        ``"in"`` is a free input and ``"out"`` a final output of the
        pipeline; the log must contain a release for ``"middle"`` only.

        Args:
            is_async: Forwarded to ``ParallelRunner`` so the test exercises
                the requested loading/saving mode.
        """
        runner = ParallelRunner(is_async=is_async)
        log = runner._manager.list()

        pipeline = Pipeline(
            [node(identity, "in", "middle"), node(identity, "middle", "out")]
        )
        # pylint: disable=no-member
        catalog = DataCatalog(
            {
                "in": runner._manager.LoggingDataSet(log, "in", "stuff"),
                "middle": runner._manager.LoggingDataSet(log, "middle"),
                "out": runner._manager.LoggingDataSet(log, "out"),
            }
        )
        # Run with the runner configured above; a fresh ``ParallelRunner()``
        # would silently ignore ``is_async``.
        runner.run(pipeline, catalog)

        # we don't want to see release in or out in here
        assert list(log) == [("load", "in"), ("load", "middle"), ("release", "middle")]
Exemple #22
0
    def test_prefix_exclude_free_inputs(self, inputs, outputs):
        """Check which data set names receive the ``PREFIX`` namespace.

        After wrapping a three-node chain A -> B -> C -> D with
        ``namespace="PREFIX"``, the intermediate names ``B`` and ``C`` must be
        prefixed while ``A`` and ``D`` must stay untouched.

        Args:
            inputs: Forwarded as ``inputs`` to ``pipeline``.
            outputs: Forwarded as ``outputs`` to ``pipeline``.
        """
        chain = [
            node(identity, "A", "B", name="node1"),
            node(identity, "B", "C", name="node2"),
            node(identity, "C", "D", name="node3"),
        ]
        raw_pipeline = Pipeline(chain)

        resulting_pipeline = pipeline(
            raw_pipeline,
            inputs=inputs,
            outputs=outputs,
            namespace="PREFIX",
        )
        nodes = sorted(resulting_pipeline.nodes)

        # Expected (input, output) names for each node in sorted order.
        expected_io = [
            ("A", "PREFIX.B"),
            ("PREFIX.B", "PREFIX.C"),
            ("PREFIX.C", "D"),
        ]
        for position, (expected_input, expected_output) in enumerate(expected_io):
            assert nodes[position]._inputs == expected_input
            assert nodes[position]._outputs == expected_output
Exemple #23
0
def sjoin_pipeline(**kwargs):
    """The sjoin pipeline performs spatial joins between the datasets.

    One ``spatialjoin`` node is created per pair in ``SJOIN_PAIRS`` and one
    ``null_forward`` node per sector in ``ALL_SECTORS``. Every node carries
    the ``sjoin`` tag in addition to its own tags.
    """
    base_tags = ["sjoin"]

    all_nodes = []

    for sector1, sector2 in SJOIN_PAIRS:
        pair_tags = base_tags + ['sjoin_mp', f'sjoin_{sector1}_{sector2}']
        all_nodes.append(
            node(
                spatialjoin,
                [f'prp_{sector1}_data', f'prp_{sector2}_data'],
                f'sjoin_edges_{sector1}_{sector2}',
                tags=pair_tags,
            )
        )

    for sector in ALL_SECTORS:
        sector_tags = base_tags + ['sjoin_null', f'sjoin_null_{sector}']
        all_nodes.append(
            node(
                null_forward,
                f'prp_{sector}_data',
                f'sjoin_{sector}_data',
                tags=sector_tags,
            )
        )

    return Pipeline(all_nodes)
def create_pipeline(**kwargs):
    """Build the house-price training and prediction pipeline.

    ``training`` produces ``"best_model"``, which ``predict_lightgbm_model``
    uses to produce ``"submit_predictions"``. Both functions receive their
    inputs as keyword arguments.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: The two nodes described above.
    """
    training_inputs = {
        "train_x": "categorical_house_price_train_x",
        "train_y": "house_price_train_y",
        "parameters": "parameters",
    }
    training_step = node(training, training_inputs, "best_model")

    prediction_inputs = {
        "model": "best_model",
        "test_x": "categorical_house_price_test_x",
        "test_id": "house_price_test_id",
    }
    prediction_step = node(
        predict_lightgbm_model,
        prediction_inputs,
        "submit_predictions",
    )

    return Pipeline([training_step, prediction_step])
Exemple #25
0
    def test_apply(self):
        """Check that ``decorate`` wraps every node with both decorators.

        The input ``1`` passes through three identity nodes; the expected
        result shows ``apply_f`` and ``apply_g`` applied once per node. The
        decorated pipeline must keep the same three node names.
        """
        nodes = [
            node(identity, "number", "output1", name="identity1"),
            node(identity, "output1", "output2", name="biconcat"),
            node(identity, "output2", "output", name="identity3"),
        ]
        nodes.sort(key=lambda item: item.name)

        pipeline = Pipeline(nodes).decorate(apply_f, apply_g)
        catalog = DataCatalog({}, {"number": 1})
        result = SequentialRunner().run(pipeline, catalog)
        decorated_nodes = sorted(pipeline.nodes, key=lambda item: item.name)

        assert result["output"] == "g(f(g(f(g(f(1))))))"
        assert len(pipeline.nodes) == 3
        # Names must match pairwise once both lists are sorted by name.
        for original, decorated in zip(nodes, decorated_nodes):
            assert original.name == decorated.name
Exemple #26
0
def create_pipeline(**kwargs):
    """Build the pipeline that derives the health-camp outcome target.

    Three per-camp target nodes feed ``utils.join_all``; the joined result
    then has ``sum(axis=1)`` and ``to_frame(name=...)`` called on it via
    ``utils.methodcaller``.

    Args:
        kwargs: Accepted but not used.

    Returns:
        Pipeline: The six nodes described above.
    """
    # Note that the first two camps use different column spellings
    # ("Health_Score" vs "Health Score").
    first_camp_target = node(
        utils.make_partial(
            create_health_score_target,
            health_score_column="Health_Score",
        ),
        ["raw_train", "raw_first_health_camp_attended"],
        "tgt_first_health_camp_outcome_favorable",
    )
    second_camp_target = node(
        utils.make_partial(
            create_health_score_target,
            health_score_column="Health Score",
        ),
        ["raw_train", "raw_second_health_camp_attended"],
        "tgt_second_health_camp_outcome_favorable",
    )
    third_camp_target = node(
        utils.make_partial(
            create_stall_visited_target,
            stall_visited_column="Number_of_stall_visited",
        ),
        ["raw_train", "raw_third_health_camp_attended"],
        "tgt_third_health_camp_outcome_favorable",
    )

    per_camp_targets = [
        "tgt_first_health_camp_outcome_favorable",
        "tgt_second_health_camp_outcome_favorable",
        "tgt_third_health_camp_outcome_favorable",
    ]
    join_targets = node(utils.join_all, per_camp_targets, "tgt_joined")

    sum_targets = node(
        utils.methodcaller("sum", axis=1),
        "tgt_joined",
        "tgt_combined",
    )
    frame_target = node(
        utils.methodcaller(
            "to_frame",
            name="tgt_health_camp_outcome_favorable",
        ),
        "tgt_combined",
        "tgt_health_camp_outcome_favorable",
    )

    return Pipeline([
        first_camp_target,
        second_camp_target,
        third_camp_target,
        join_targets,
        sum_targets,
        frame_target,
    ])
    def test_connected_pipeline(self, disjoint_pipeline):
        """Connect two separate pipelines.

        The sub-pipeline alone has two inputs and two outputs. Adding a node
        from ``"C"`` to ``"D"`` must leave one input and one output, and the
        pipeline names must show up as tags on the relevant nodes.
        """
        nodes = disjoint_pipeline["nodes"]
        subpipeline = Pipeline(nodes, name="subpipeline")

        assert len(subpipeline.inputs()) == 2
        assert len(subpipeline.outputs()) == 2

        connector = node(identity, "C", "D", name="connecting_node")
        pipeline = Pipeline([connector, subpipeline], name="main")

        assert len(pipeline.nodes) == len(nodes) + 1
        assert len(pipeline.inputs()) == 1
        assert len(pipeline.outputs()) == 1

        # Every node is tagged with the outer pipeline's name.
        for member in pipeline.nodes:
            assert pipeline.name in member.tags
        # Every node except the connector is also tagged with the
        # sub-pipeline's name.
        for member in pipeline.nodes:
            if member.name == "connecting_node":
                continue
            assert subpipeline.name in member.tags
def create_pipeline():
    """Build a two-node pipeline from two local pass-through functions.

    Returns:
        Pipeline: One unnamed, untagged node with plain data set names, and
        one node named ``"my_node"``, tagged ``"bob"``, whose data set names
        use the ``@pandas`` suffix.
    """
    def func1(a, b):  # pylint: disable=unused-argument
        return a

    def func2(a, b):  # pylint: disable=unused-argument
        return a

    # unnamed node with no tags and basic io
    plain_node = node(func1, ["bob_in", "parameters"], ["bob_out"])

    # named node with tags and transcoding
    transcoded_node = node(
        func2,
        ["fred_in@pandas", "parameters"],
        ["fred_out@pandas"],
        name="my_node",
        tags=["bob"],
    )

    return Pipeline([plain_node, transcoded_node])
    def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None:
        """Translate the pipeline into Airflow operators attached to the DAG.

        One ``PythonOperator`` is created per pipeline node, then every
        operator is wired to the operators of the nodes it depends on.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.

        Raises:
            ValueError: if the Pipeline is not compatible with Airflow, i.e.
                one of its outputs is registered in the catalog as a
                ``MemoryDataSet``.
        """

        data_sets = catalog._data_sets  # pylint: disable=protected-access
        # Compute the pipeline outputs once instead of once per data set.
        pipeline_outputs = pipeline.all_outputs()
        memory_data_sets = [
            name
            for name, data_set in data_sets.items()
            if name in pipeline_outputs and isinstance(data_set, MemoryDataSet)
        ]

        if memory_data_sets:
            raise ValueError(
                "The following output data sets are memory data sets: {}\n"
                "AirflowRunner does not support output to MemoryDataSets".
                format(", ".join("'{}'".format(ds)
                                 for ds in memory_data_sets)))

        node_dependencies = pipeline.node_dependencies

        # First pass: one operator per node. The loop variable is not called
        # ``node`` so that it cannot shadow a module-level ``node`` name.
        operators_by_node = {}
        for pipeline_node in node_dependencies:
            task_id = slugify(pipeline_node.name)
            operators_by_node[pipeline_node] = PythonOperator(
                task_id=task_id,
                provide_context=True,
                python_callable=self.create_task(pipeline_node, catalog),
                dag=self._dag,
                **self._operator_arguments(task_id))

        # Second pass: every operator now exists, so dependencies can be set.
        for pipeline_node, dependencies in node_dependencies.items():
            for dependency in dependencies:
                operators_by_node[pipeline_node].set_upstream(
                    operators_by_node[dependency])
Exemple #30
0
def create_weekly_results_pipeline(start_date=None, end_date=None, **kwargs):
    """
    Gather the weekly results from the local and remote into a single source.

    Load the unprocessed partitions and concatenate them together;
        this takes care of setting the year column in the data on load
    Process the data
        - set the data source
        - remove unwanted columns
        - remap the names on the columns we do want
    Fix up the player_names
    Add into the existing annual.results file for downstream processing

    Args:
        start_date: Accepted but not used in this function.
        end_date: Accepted but not used in this function.
        kwargs: Accepted but not used.

    Returns:
        Pipeline: a four-node chain from ``"results.weekly.raw"`` to
        ``"results.weekly"``.
    """
    concat_step = node(
        concat_partitions,
        inputs="results.weekly.raw",
        outputs="combined_weekly_results",
    )
    names_step = node(
        fixup_player_names,
        inputs="combined_weekly_results",
        outputs="combined_weekly_results_b",
    )
    positions_step = node(
        consolidate_player_positions,
        inputs="combined_weekly_results_b",
        outputs="combined_weekly_results_c",
    )
    ordering_step = node(
        func=preferred_column_order,
        inputs="combined_weekly_results_c",
        outputs="results.weekly",
    )
    # Disabled alternative final step, kept for reference:
    # node(
    #     split_year_from_week,
    #     inputs="combined_weekly_results_b",
    #     outputs="results.weekly",
    # ),

    return Pipeline([concat_step, names_step, positions_step, ordering_step])