def test_release_transcoded(self, is_async):
    """Check that both sides of a transcoded dataset get released.

    ``ds@save`` and ``ds@load`` are registered as two separate logging
    data sets that write into one shared log. The pipeline is executed
    by the runner that was built from ``is_async`` so that the
    parametrized mode is the one actually under test.

    Args:
        is_async: Forwarded to ``ParallelRunner(is_async=...)``.
    """
    runner = ParallelRunner(is_async=is_async)
    # Shared list obtained from the runner's manager; both data sets
    # receive it as their log.
    log = runner._manager.list()

    pipeline = Pipeline(
        [node(source, None, "ds@save"), node(sink, "ds@load", None)]
    )
    catalog = DataCatalog(
        {
            "ds@save": LoggingDataSet(log, "save"),
            "ds@load": LoggingDataSet(log, "load"),
        }
    )

    # Use the configured runner: a fresh ``ParallelRunner()`` would
    # silently ignore ``is_async``.
    runner.run(pipeline, catalog)

    # we want to see both datasets being released
    assert list(log) == [("release", "save"), ("load", "load"), ("release", "load")]
def create_pipelines(**kwargs) -> Dict[str, Pipeline]:
    """Build the mapping of pipelines registered for this project.

    Args:
        kwargs: Accepted and ignored, so that arguments added in the
            future do not break callers.

    Returns:
        A dict with a single ``"__default__"`` entry. Its pipeline holds
        two nodes without outputs: one reading ``pandas_iris_data`` and
        one reading ``spark_iris_data``.
    """
    describe_pandas = node(
        lambda x: x.describe(),
        inputs="pandas_iris_data",
        outputs=None,
    )
    show_spark = node(
        lambda x: x.show(),
        inputs="spark_iris_data",
        outputs=None,
    )
    default_pipeline = Pipeline([describe_pandas, show_spark])
    return {"__default__": default_pipeline}
def test_count_multiple_loads(self):
    """A dataset read by two nodes is released once, after both loads."""
    runner = ThreadRunner()
    events = []

    producer = node(source, None, "dataset")
    consumers = [
        node(sink, "dataset", None, name=consumer_name)
        for consumer_name in ("bob", "fred")
    ]
    pipeline = Pipeline([producer, *consumers])
    catalog = DataCatalog({"dataset": LoggingDataSet(events, "dataset")})

    runner.run(pipeline, catalog)

    # the release must only show up after both of the loads
    expected = [("load", "dataset")] * 2 + [("release", "dataset")]
    assert list(events) == expected
def perm_importance_pipeline(**kwargs):
    """Assemble the two-node permutation-importance pipeline.

    The first node calls ``fit_model`` on ``y`` and ``X`` and publishes
    the result as ``regressor``. The second node passes that regressor,
    the data sets and ``parameters`` to ``evaluate_perm_importance`` and
    declares no output.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The assembled pipeline.
    """
    fit_step = node(
        func=fit_model,
        inputs=["y", "X"],
        outputs="regressor",
        name="fit_model_node",
    )

    evaluation_inputs = [
        "regressor",
        "y",
        "X",
        "w",
        "y_true",
        "features_mask",
        "parameters",
    ]
    evaluation_step = node(
        func=evaluate_perm_importance,
        inputs=evaluation_inputs,
        outputs=None,
        name="evaluate_perm_importance_node",
    )

    return Pipeline([fit_step, evaluation_step])
def create_pipeline(**kwargs):
    """Build the example pipeline that ships with the project template.

    Args:
        kwargs: Ignored; present so that additional arguments added in
            the future do not break this function.

    Returns:
        Pipeline: A pipeline of four nodes wiring ``split_data``,
        ``train_model``, ``predict`` and ``report_accuracy`` together.
    """
    ###########################################################################
    # Example pipeline with 4 nodes.
    #
    # PLEASE DELETE THIS PIPELINE ONCE YOU START WORKING ON YOUR OWN PROJECT AS
    # WELL AS THE FILE nodes/example.py
    # -------------------------------------------------------------------------
    split_outputs = {
        "train_x": "example_train_x",
        "train_y": "example_train_y",
        "test_x": "example_test_x",
        "test_y": "example_test_y",
    }
    predict_inputs = {"model": "example_model", "test_x": "example_test_x"}

    example_nodes = [
        node(split_data, ["example_iris_data", "parameters"], split_outputs),
        node(
            train_model,
            ["example_train_x", "example_train_y", "parameters"],
            "example_model",
        ),
        node(predict, predict_inputs, "example_predictions"),
        node(report_accuracy, ["example_predictions", "example_test_y"], None),
    ]
    ###########################################################################

    return Pipeline(example_nodes)
def create_pipeline(**kwargs):
    """Build the cryptocurrency data-preparation pipeline.

    Each row of the table below becomes one node; all nodes are tagged
    with ``cd_tag``.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The tagged pipeline with six nodes.
    """
    # (function, input dataset, output dataset, node name)
    steps = [
        (
            preprocess_cryptocurrencies,
            "cryptocurrency",
            "processed_cryptocurrencies",
            "preprocessing_currencies",
        ),
        (
            filter_cryptocurrencies,
            "processed_cryptocurrencies",
            "filtered_cryptocurrencies",
            "filtering_cryptocurrencies",
        ),
        (
            format_cryptocurrencies,
            "filtered_cryptocurrencies",
            "formated_cyptocurrencies",
            "formating_cryptocurrencies",
        ),
        (
            yearvalid_cryptocurrencies,
            "formated_cyptocurrencies",
            "validyear_cryptocurrencies",
            "validatingyear_cryptocurrencies",
        ),
        (
            inverse_cryptocurrencies,
            "validyear_cryptocurrencies",
            "reversed_cryptocurrencies",
            "reversing_cryptocurrencies",
        ),
        (
            format_totalbitcoins,
            "total-bitcoins",
            "formated_total_bitcoins",
            "formating_total_bitcoins",
        ),
    ]
    pipeline_nodes = [
        node(step_func, inputs=source_name, outputs=target_name, name=label)
        for step_func, source_name, target_name, label in steps
    ]
    return Pipeline(pipeline_nodes, tags=["cd_tag"])
def create_pipeline(**kwargs):
    """Build the fire-data cleansing pipeline.

    The nodes form a chain from ``raw_fire_data`` to
    ``fire_data_cleansed_stations``; the last node additionally reads
    ``foehn_stations`` and ``parameters``.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The four-node pipeline.
    """
    cleanse = node(
        cleanse_fire_data,
        "raw_fire_data",
        "fire_data_basic_cleansed",
        name="cleanse_fire_data",
    )
    add_date_info = node(
        transform_datetime,
        "fire_data_basic_cleansed",
        "fire_data_with_date_info",
        name="transform_datetime",
    )
    fill_coordinates = node(
        fill_missing_coordinates,
        "fire_data_with_date_info",
        "fire_data_cleansed",
        name="fill_missing_coordinates",
    )
    closest_station = node(
        calculate_closest_station,
        ["fire_data_cleansed", "foehn_stations", "parameters"],
        "fire_data_cleansed_stations",
        name="calculate_closest_station",
    )
    return Pipeline([cleanse, add_date_info, fill_coordinates, closest_station])
def create_annual_projections_pipeline(**kwargs):
    """Combine all annual projection sources into one annual projection.

    The result is made of the pipelines returned by
    ``create_fp_proj_pipeline`` and ``create_cbs_proj_pipeline`` plus a
    node that applies ``average_stats_by_player`` to
    ``LOCAL_PROJECTIONS`` and writes ``projections.annual``.

    Note: this only works for the current year, which is assumed.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The combined pipeline.
    """
    fp_projections = create_fp_proj_pipeline()
    cbs_projections = create_cbs_proj_pipeline()
    averaging_node = node(
        func=average_stats_by_player,
        inputs=LOCAL_PROJECTIONS,
        outputs="projections.annual",
    )
    return Pipeline([fp_projections, cbs_projections, averaging_node])
def create_pipeline(**kwargs):
    """Build the modelling pipeline, tagged with ``ds_tag``.

    Three nodes are wired in sequence: ``modeling.split_data``,
    ``modeling.train_model`` and ``modeling.evaluate_model``. The last
    one declares no output.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The tagged three-node pipeline.
    """
    split_step = node(
        func=modeling.split_data,
        inputs=["train_prep", "parameters"],
        outputs=["X_train", "X_test", "y_train", "y_test"],
    )
    train_step = node(
        func=modeling.train_model,
        inputs=["X_train", "y_train"],
        outputs="clf",
    )
    evaluate_step = node(
        func=modeling.evaluate_model,
        inputs=["clf", "X_test", "y_test"],
        outputs=None,
    )
    return Pipeline([split_step, train_step, evaluate_step], tags=["ds_tag"])
def create_pipeline(**kwargs):
    """Build the test-set preparation pipeline.

    Four fixed nodes come first, followed by one ``encode_test`` node per
    entry of ``category_cols``. The ``join_test`` node consumes the
    ``<col>_test_enc_feature`` output of every one of those encoders.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The fixed nodes followed by the per-column encoders.
    """
    encoded_feature_names = [
        f'{col}_test_enc_feature' for col in category_cols
    ]

    fixed_nodes = [
        _node(start_test, ['test_df', 'item_df'], 'test_start_df'),
        _node(pricing_map, ['train_enc_df', 'test_start_df'], 'price_df'),
        _node(
            join_test,
            ['test_start_df', 'price_df'] + encoded_feature_names,
            'test_join_df',
        ),
        _node(lag_test, ['test_lag_df', 'test_join_df'], 'x_test'),
    ]

    # One encoder per categorical column, appended after the fixed nodes.
    encoder_nodes = [
        _node(
            _partial(encode_test, col),
            ['train_enc_df', 'test_start_df'],
            f'{col}_test_enc_feature',
            f'_{col}',
        )
        for col in category_cols
    ]

    return Pipeline(fixed_nodes + encoder_nodes)
def test_node_hook_logging_above_limit_truncate_strategy(
    kedro_project, dummy_run_params, param_length
):
    """With the ``truncate`` strategy a long parameter is logged cut short.

    The project's ``mlflow.yml`` is written with
    ``long_parameters_strategy="truncate"``, the node hook is driven by
    hand inside an mlflow run, and the parameter stored for that run must
    equal the first ``MAX_PARAM_VAL_LENGTH`` characters of the value.
    """
    mlflow_config = {
        "hooks": {"node": {"long_parameters_strategy": "truncate"}},
    }
    _write_yaml(kedro_project / "conf" / "local" / "mlflow.yml", mlflow_config)

    tracking_uri = (kedro_project / "mlruns").as_uri()
    mlflow.set_tracking_uri(tracking_uri)

    hook = MlflowNodeHook()

    long_value = "a" * param_length
    hook_inputs = {"params:my_param": long_value}

    metadata = _get_project_metadata(kedro_project)
    _add_src_to_path(metadata.source_dir, kedro_project)
    configure_project(metadata.package_name)

    with KedroSession.create(
        package_name=metadata.package_name,
        project_path=kedro_project,
    ):
        with mlflow.start_run():
            hook.before_pipeline_run(
                run_params=dummy_run_params,
                pipeline=Pipeline([]),
                catalog=DataCatalog(),
            )
            hook.before_node_run(
                node=node(func=lambda x: x, inputs={"x": "a"}, outputs=None),
                catalog=DataCatalog(),  # can be empty
                inputs=hook_inputs,
                is_async=False,
                run_id="132",
            )
            # Must be read while the run is still the active one.
            logged_run_id = mlflow.active_run().info.run_id

        client = MlflowClient(tracking_uri)
        logged_run = client.get_run(logged_run_id)
        expected_params = {"my_param": long_value[:MAX_PARAM_VAL_LENGTH]}
        assert logged_run.data.params == expected_params
def create_pipeline(**kwargs):
    """Build the split / train / untar / evaluate pipeline.

    The training split is written as ``X_train@pickle`` and read back by
    the training node as ``X_train@path``.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The four-node pipeline.
    """
    split_step = node(
        func=split_data,
        inputs=["master_table", "parameters"],
        outputs=["X_train@pickle", "X_test", "y_train", "y_test"],
    )
    train_step = node(
        func=train_model_sagemaker,
        inputs=["X_train@path", "params:sklearn_estimator_kwargs"],
        outputs="model_path",
    )
    untar_step = node(untar_model, inputs="model_path", outputs="regressor")
    evaluate_step = node(
        func=evaluate_model,
        inputs=["regressor", "X_test", "y_test"],
        outputs=None,
    )
    return Pipeline([split_step, train_step, untar_step, evaluate_step])
def create_pipeline(**kwargs):
    """Build the fastText pipeline for the four labelling tasks.

    For every task a ``train_fasttext_model`` node is followed by a
    ``report_accuracy_fasttext`` node. Dataset and parameter names are
    derived from the task name.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: Eight nodes, a train/report pair per task, in task order.
    """
    tasks = ("classification", "favour_against", "opinion_towards", "sentiment")

    pipeline_nodes = []
    for task in tasks:
        model_path = f"fasttext_{task}_model_path"
        pipeline_nodes.append(
            node(
                train_fasttext_model,
                [f"{task}_train", f"params:model_{task}_name", "parameters"],
                model_path,
            )
        )
        pipeline_nodes.append(
            node(report_accuracy_fasttext, [model_path, f"{task}_test"], None)
        )
    return Pipeline(pipeline_nodes)
def community_prep(**kwargs):
    """community_prep prepares the text files.

    One ``format_textfiles`` node is created per carrier (coal, oil,
    gas). Each node reads that carrier's ``nx_nodes`` and ``nx_edges``
    datasets, declares no outputs, and carries the common
    ``community-prep`` tag plus a carrier-specific tag.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The three-node pipeline.
    """
    tags = ["community-prep"]
    prep_nodes = [
        node(
            format_textfiles,
            [f"flow_{carrier}_nx_nodes", f"flow_{carrier}_nx_edges"],
            [],
            tags=tags + [f"community-prep_{carrier}"],
        )
        for carrier in ("coal", "oil", "gas")
    ]
    return Pipeline(prep_nodes)
def create_pipeline(**kwargs):
    """Build the stroke-model pipeline: split, over-sample, train, evaluate.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The four named nodes in the order listed above.
    """
    split_node = node(
        func=split_data,
        inputs=["preprocessed_stroke_data", "parameters"],
        outputs=["X_train", "X_test", "y_train", "y_test"],
        name="split_data_node",
    )
    over_sample_node = node(
        func=over_sample_data,
        inputs=["X_train", "y_train", "parameters"],
        outputs=["X_train_res", "y_train_res"],
        name="over_sample_node",
    )
    train_node = node(
        func=train_model,
        inputs=["X_train_res", "y_train_res"],
        outputs="lgbm",
        name="train_model_node",
    )
    evaluate_node = node(
        func=evaluate_model,
        inputs=["lgbm", "X_test", "y_test"],
        outputs=None,
        name="evaluate_model_node",
    )
    return Pipeline([split_node, over_sample_node, train_node, evaluate_node])
def test_dont_release_inputs_and_outputs(self):
    """Only the intermediate dataset of the pipeline gets released.

    ``in`` is a free input and ``out`` a free output of the pipeline; the
    expected log contains a release for ``middle`` alone.
    """
    manager = ParallelRunnerManager()
    manager.start()
    events = manager.list()

    steps = [node(identity, "in", "middle"), node(identity, "middle", "out")]
    pipeline = Pipeline(steps)

    make_dataset = manager.LoggingDataSet
    registered = {
        "in": make_dataset(events, "in", "stuff"),
        "middle": make_dataset(events, "middle"),
        "out": make_dataset(events, "out"),
    }
    catalog = DataCatalog(registered)

    ParallelRunner().run(pipeline, catalog)

    # neither "in" nor "out" may show up as released
    expected = [("load", "in"), ("load", "middle"), ("release", "middle")]
    assert list(events) == expected
def register_pipelines(self) -> Dict[str, Pipeline]:
    """Register the project's pipeline.

    Returns:
        A mapping with a single ``"__default__"`` entry. Its pipeline
        builds two ranges, squares and increments the first one, and
        joins the result with the second.
    """
    range_one = node(
        lambda: range(10),
        inputs=None,
        outputs="range_one",
        name="create_range_one",
    )
    squared = node(
        lambda _range: [i**2 for i in _range],
        inputs="range_one",
        outputs="square_range_one",
        name="create_square_range_one",
    )
    incremented = node(
        lambda _range: [i + 1 for i in _range],
        inputs="square_range_one",
        outputs="add_one",
        name="create_add_one",
    )
    range_two = node(
        lambda: range(100, 110),
        inputs=None,
        outputs="range_two",
        name="create_range_two",
    )
    joined = node(
        lambda one, two: [*one, *two],
        inputs=["add_one", "range_two"],
        outputs="join",
        name="create_join",
    )

    default_pipeline = Pipeline(
        [range_one, squared, incremented, range_two, joined]
    )
    return {"__default__": default_pipeline}
def test_count_multiple_loads(self, is_async):
    """A dataset read by two nodes is released once, after both loads.

    Args:
        is_async: Forwarded to ``ParallelRunner(is_async=...)``.
    """
    runner = ParallelRunner(is_async=is_async)
    manager = runner._manager
    events = manager.list()

    producer = node(source, None, "dataset")
    consumers = [
        node(sink, "dataset", None, name=consumer_name)
        for consumer_name in ("bob", "fred")
    ]
    pipeline = Pipeline([producer, *consumers])
    catalog = DataCatalog({"dataset": manager.LoggingDataSet(events, "dataset")})

    runner.run(pipeline, catalog)

    # the release must only show up after both of the loads
    expected = [("load", "dataset")] * 2 + [("release", "dataset")]
    assert list(events) == expected
def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline):
    """Check that the catalog can be used for a multiprocessing run.

    Two conditions are enforced, in this order:

    1. Every data set registered in the catalog is serializable: it must
       not carry a truthy ``_SINGLE_PROCESS`` attribute and
       ``ForkingPickler.dumps`` must succeed on it.
    2. No pipeline output is a ``MemoryDataSet`` unless it is also a
       ``BaseProxy``.

    Args:
        catalog: The catalog whose registered data sets are inspected.
        pipeline: The pipeline whose outputs are checked against the
            catalog.

    Raises:
        AttributeError: If at least one data set is not serializable, or
            if at least one pipeline output is a non-proxied
            ``MemoryDataSet``. The message lists the offending names in
            sorted order.
    """
    data_sets = catalog._data_sets  # pylint: disable=protected-access

    unserializable = []
    for name, data_set in data_sets.items():
        # Data sets flagged as single-process are rejected without even
        # attempting to pickle them.
        if getattr(data_set, "_SINGLE_PROCESS", False):  # SKIP_IF_NO_SPARK
            unserializable.append(name)
            continue
        try:
            ForkingPickler.dumps(data_set)
        except (AttributeError, PicklingError):
            unserializable.append(name)

    if unserializable:
        raise AttributeError(
            "The following data sets cannot be used with multiprocessing: "
            "{}\nIn order to utilize multiprocessing you need to make sure "
            "all data sets are serializable, i.e. data sets should not make "
            "use of lambda functions, nested functions, closures etc.\nIf you "
            "are using custom decorators ensure they are correctly using "
            "functools.wraps().".format(sorted(unserializable))
        )

    # The set of outputs does not depend on the catalog entry being
    # examined, so compute it once instead of once per data set.
    pipeline_outputs = pipeline.all_outputs()
    memory_data_sets = [
        name
        for name, data_set in data_sets.items()
        if name in pipeline_outputs
        and isinstance(data_set, MemoryDataSet)
        and not isinstance(data_set, BaseProxy)
    ]

    if memory_data_sets:
        raise AttributeError(
            "The following data sets are memory data sets: {}\n"
            "ParallelRunner does not support output to externally created "
            "MemoryDataSets".format(sorted(memory_data_sets))
        )
def create_pipeline():
    """Build the pipeline that downloads data and selects one station.

    Three nodes are wired in sequence: ``generate_date_range``,
    ``parallel_get_temp_data`` and ``parallel_choose_station``.

    Returns:
        Pipeline: The three-node pipeline.
    """
    date_range_node = node(
        generate_date_range,
        inputs=["params:start_date", "params:end_date"],
        outputs="dates_to_download",
    )
    download_node = node(
        parallel_get_temp_data,
        inputs=["dates_to_download", "already_downloaded_dates"],
        outputs=["downloaded_dates", "already_downloaded_dates!"],
    )
    station_node = node(
        parallel_choose_station,
        inputs=["downloaded_dates", "params:station_id"],
        outputs="downloaded_station_data",
    )
    return Pipeline([date_range_node, download_node, station_node])
def test_dont_release_inputs_and_outputs(self, is_async):
    """Only the intermediate dataset of the pipeline gets released.

    ``in`` is a free input and ``out`` a free output of the pipeline; the
    expected log contains a release for ``middle`` alone. The pipeline is
    executed by the runner that was built from ``is_async`` so that the
    parametrized mode is the one actually under test.

    Args:
        is_async: Forwarded to ``ParallelRunner(is_async=...)``.
    """
    runner = ParallelRunner(is_async=is_async)
    log = runner._manager.list()

    pipeline = Pipeline(
        [node(identity, "in", "middle"), node(identity, "middle", "out")]
    )
    # pylint: disable=no-member
    catalog = DataCatalog(
        {
            "in": runner._manager.LoggingDataSet(log, "in", "stuff"),
            "middle": runner._manager.LoggingDataSet(log, "middle"),
            "out": runner._manager.LoggingDataSet(log, "out"),
        }
    )

    # Use the configured runner: a fresh ``ParallelRunner()`` would
    # silently ignore ``is_async``.
    runner.run(pipeline, catalog)

    # we don't want to see release in or out in here
    assert list(log) == [("load", "in"), ("load", "middle"), ("release", "middle")]
def test_prefix_exclude_free_inputs(self, inputs, outputs):
    """Names passed as inputs/outputs are not prefixed by the namespace.

    A three-node chain ``A -> B -> C -> D`` is wrapped with namespace
    ``PREFIX``; the intermediate names gain the prefix while ``A`` and
    ``D`` are expected to stay untouched.
    """
    chain = [("A", "B", "node1"), ("B", "C", "node2"), ("C", "D", "node3")]
    raw_pipeline = Pipeline(
        [node(identity, src, dst, name=label) for src, dst, label in chain]
    )

    resulting_pipeline = pipeline(
        raw_pipeline, inputs=inputs, outputs=outputs, namespace="PREFIX"
    )
    ordered_nodes = sorted(resulting_pipeline.nodes)

    expected_io = [
        ("A", "PREFIX.B"),
        ("PREFIX.B", "PREFIX.C"),
        ("PREFIX.C", "D"),
    ]
    for position, (expected_input, expected_output) in enumerate(expected_io):
        assert ordered_nodes[position]._inputs == expected_input
        assert ordered_nodes[position]._outputs == expected_output
def sjoin_pipeline(**kwargs):
    """The sjoin pipeline performs spatial joins between the datasets.

    One ``spatialjoin`` node is created per pair in ``SJOIN_PAIRS``,
    followed by one ``null_forward`` node per sector in ``ALL_SECTORS``.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The join nodes followed by the forwarding nodes.
    """
    tags = ["sjoin"]
    pipeline_nodes = []

    for sector1, sector2 in SJOIN_PAIRS:
        pair_tags = tags + ["sjoin_mp", f"sjoin_{sector1}_{sector2}"]
        pipeline_nodes.append(
            node(
                spatialjoin,
                [f"prp_{sector1}_data", f"prp_{sector2}_data"],
                f"sjoin_edges_{sector1}_{sector2}",
                tags=pair_tags,
            )
        )

    for sector in ALL_SECTORS:
        sector_tags = tags + ["sjoin_null", f"sjoin_null_{sector}"]
        pipeline_nodes.append(
            node(
                null_forward,
                f"prp_{sector}_data",
                f"sjoin_{sector}_data",
                tags=sector_tags,
            )
        )

    return Pipeline(pipeline_nodes)
def create_pipeline(**kwargs):
    """Build the house-price training and prediction pipeline.

    ``training`` produces ``best_model``, which ``predict_lightgbm_model``
    then uses to produce ``submit_predictions``. Both nodes receive their
    inputs as keyword mappings.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The two-node pipeline.
    """
    training_inputs = {
        "train_x": "categorical_house_price_train_x",
        "train_y": "house_price_train_y",
        "parameters": "parameters",
    }
    prediction_inputs = {
        "model": "best_model",
        "test_x": "categorical_house_price_test_x",
        "test_id": "house_price_test_id",
    }
    return Pipeline(
        [
            node(training, training_inputs, "best_model"),
            node(predict_lightgbm_model, prediction_inputs, "submit_predictions"),
        ]
    )
def test_apply(self):
    """Decorating a pipeline wraps every node and keeps the node names."""

    def _by_name(item):
        return item.name

    nodes = [
        node(identity, "number", "output1", name="identity1"),
        node(identity, "output1", "output2", name="biconcat"),
        node(identity, "output2", "output", name="identity3"),
    ]
    nodes = sorted(nodes, key=_by_name)

    pipeline = Pipeline(nodes).decorate(apply_f, apply_g)
    catalog = DataCatalog({}, {"number": 1})
    result = SequentialRunner().run(pipeline, catalog)
    decorated_nodes = sorted(pipeline.nodes, key=_by_name)

    # each of the three nodes applies f and then g to its input
    assert result["output"] == "g(f(g(f(g(f(1))))))"
    assert len(pipeline.nodes) == 3
    assert all(
        original.name == decorated.name
        for original, decorated in zip(nodes, decorated_nodes)
    )
def create_pipeline(**kwargs):
    """Build the pipeline that derives the combined health-camp target.

    Three per-camp targets are created, joined with ``utils.join_all``,
    summed along ``axis=1`` and finally converted with ``to_frame`` under
    the name ``tgt_health_camp_outcome_favorable``.

    Args:
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The six-node pipeline.
    """
    # NOTE: the health score column name differs between the first camp
    # ("Health_Score") and the second camp ("Health Score").
    first_camp_target = node(
        utils.make_partial(
            create_health_score_target, health_score_column="Health_Score"
        ),
        ["raw_train", "raw_first_health_camp_attended"],
        "tgt_first_health_camp_outcome_favorable",
    )
    second_camp_target = node(
        utils.make_partial(
            create_health_score_target, health_score_column="Health Score"
        ),
        ["raw_train", "raw_second_health_camp_attended"],
        "tgt_second_health_camp_outcome_favorable",
    )
    third_camp_target = node(
        utils.make_partial(
            create_stall_visited_target,
            stall_visited_column="Number_of_stall_visited",
        ),
        ["raw_train", "raw_third_health_camp_attended"],
        "tgt_third_health_camp_outcome_favorable",
    )

    per_camp_targets = [
        "tgt_first_health_camp_outcome_favorable",
        "tgt_second_health_camp_outcome_favorable",
        "tgt_third_health_camp_outcome_favorable",
    ]
    join_targets = node(utils.join_all, per_camp_targets, "tgt_joined")
    sum_targets = node(
        utils.methodcaller("sum", axis=1),
        "tgt_joined",
        "tgt_combined",
    )
    frame_target = node(
        utils.methodcaller("to_frame", name="tgt_health_camp_outcome_favorable"),
        "tgt_combined",
        "tgt_health_camp_outcome_favorable",
    )

    return Pipeline(
        [
            first_camp_target,
            second_camp_target,
            third_camp_target,
            join_targets,
            sum_targets,
            frame_target,
        ]
    )
def test_connected_pipeline(self, disjoint_pipeline):
    """Connect two separate pipelines."""
    fixture_nodes = disjoint_pipeline["nodes"]
    subpipeline = Pipeline(fixture_nodes, name="subpipeline")

    # on its own the sub-pipeline has two free inputs and two outputs
    assert len(subpipeline.inputs()) == 2
    assert len(subpipeline.outputs()) == 2

    connector = node(identity, "C", "D", name="connecting_node")
    pipeline = Pipeline([connector, subpipeline], name="main")

    assert len(pipeline.nodes) == 1 + len(fixture_nodes)
    assert len(pipeline.inputs()) == 1
    assert len(pipeline.outputs()) == 1

    # every node is tagged with the outer pipeline's name ...
    for member in pipeline.nodes:
        assert pipeline.name in member.tags
    # ... and all but the connecting node with the sub-pipeline's name
    for member in pipeline.nodes:
        if member.name != "connecting_node":
            assert subpipeline.name in member.tags
def create_pipeline():
    """Build a two-node pipeline around two local pass-through functions.

    Returns:
        Pipeline: One unnamed node with plain inputs/outputs and one named,
        tagged node whose datasets use transcoding (``@pandas``).
    """

    def func1(a, b):  # pylint: disable=unused-argument
        return a

    def func2(a, b):  # pylint: disable=unused-argument
        return a

    # unnamed node with no tags and basic io
    plain_node = node(func1, ["bob_in", "parameters"], ["bob_out"])

    # named node with tags and transcoding
    transcoded_node = node(
        func2,
        ["fred_in@pandas", "parameters"],
        ["fred_out@pandas"],
        name="my_node",
        tags=["bob"],
    )

    return Pipeline([plain_node, transcoded_node])
def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None:
    """Translate the pipeline into Airflow operators attached to ``self._dag``.

    One ``PythonOperator`` is created per pipeline node, using the
    slugified node name as ``task_id``. Each operator is then wired
    upstream of the operators of the nodes that depend on it, following
    ``pipeline.node_dependencies``.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: If any pipeline output is registered in the catalog
            as a ``MemoryDataSet``.
    """
    # NOTE(review): looks like a leftover debug trace; kept so that the
    # console output of existing callers is unchanged. Consider replacing
    # it with a logger call.
    print("_run")

    data_sets = catalog._data_sets  # pylint: disable=protected-access

    # The set of outputs does not depend on the catalog entry being
    # examined, so compute it once instead of once per data set.
    pipeline_outputs = pipeline.all_outputs()
    memory_data_sets = [
        name
        for name, data_set in data_sets.items()
        if name in pipeline_outputs and isinstance(data_set, MemoryDataSet)
    ]

    if memory_data_sets:
        raise ValueError(
            "The following output data sets are memory data sets: {}\n"
            "AirflowRunner does not support output to MemoryDataSets".format(
                ", ".join("'{}'".format(ds) for ds in memory_data_sets)
            )
        )

    node_dependencies = pipeline.node_dependencies

    # First pass: one operator per pipeline node. The loop variable is
    # deliberately not called ``node`` to avoid shadowing any
    # module-level name.
    operators_by_node = {}
    for task_node in node_dependencies:
        task_id = slugify(task_node.name)
        operators_by_node[task_node] = PythonOperator(
            task_id=task_id,
            provide_context=True,
            python_callable=self.create_task(task_node, catalog),
            dag=self._dag,
            **self._operator_arguments(task_id)
        )

    # Second pass: every dependency becomes an upstream task.
    for task_node, dependencies in node_dependencies.items():
        for dependency in dependencies:
            operators_by_node[task_node].set_upstream(
                operators_by_node[dependency]
            )
def create_weekly_results_pipeline(start_date=None, end_date=None, **kwargs):
    """Gather the weekly results from local and remote into a single source.

    Steps, as wired below:

    - Load the unprocessed partitions and concatenate them; this takes
      care of setting the year column in the data on load.
    - Fix up the player names.
    - Consolidate the player positions.
    - Put the columns into the preferred order and publish the result as
      ``results.weekly`` for downstream processing.

    Args:
        start_date: Not used by this function.
        end_date: Not used by this function.
        kwargs: Unused; accepted for call-site compatibility.

    Returns:
        Pipeline: The four-node pipeline.
    """
    concat_step = node(
        concat_partitions,
        inputs="results.weekly.raw",
        outputs="combined_weekly_results",
    )
    names_step = node(
        fixup_player_names,
        inputs="combined_weekly_results",
        outputs="combined_weekly_results_b",
    )
    positions_step = node(
        consolidate_player_positions,
        inputs="combined_weekly_results_b",
        outputs="combined_weekly_results_c",
    )
    ordering_step = node(
        func=preferred_column_order,
        inputs="combined_weekly_results_c",
        outputs="results.weekly",
    )
    return Pipeline([concat_step, names_step, positions_step, ordering_step])