def test_valid(self, simple_tuple_node_list):
    """Every tuple spec in the fixture can be expanded into a node."""
    built = []
    for spec in simple_tuple_node_list:
        built.append(node(*spec))
    assert len(built) == len(simple_tuple_node_list)
def test_bad_node(func, expected):
    """Constructing a node from a bad spec raises ValueError matching `expected`."""
    bad_args = func()
    with pytest.raises(ValueError, match=expected):
        node(*bad_args)
def test_tag_nodes(self):
    """tag() adds to the tags supplied at construction time."""
    tagged = node(identity, "input", "output", tags=["hello"]).tag(["world"])
    for expected_tag in ("hello", "world"):
        assert expected_tag in tagged.tags
    assert len(tagged.tags) == 2
def test_node_less_than(self):
    """Nodes order by name: 'A' sorts before 'B'."""
    node_a = node(identity, "input1", "output1", name="A")
    node_b = node(identity, "input1", "output1", name="B")
    assert node_a < node_b
    assert node_a is not node_b
def test_different_input_list_order_not_equal(self):
    """Input list order is significant for node equality."""
    forward = node(biconcat, ["input1", "input2"], "output1", name="A")
    swapped = node(biconcat, ["input2", "input1"], "output1", name="A")
    assert forward != swapped
def test_inputs_none(self):
    """A node built with inputs=None exposes an empty inputs list."""
    no_input_node = node(constant_output, None, "output")
    assert no_input_node.inputs == []
def test_outputs_str(self):
    """A single string output is normalised to a one-element list."""
    single_output_node = node(identity, "input1", "output1")
    assert single_output_node.outputs == ["output1"]
def saving_none_pipeline():
    """Linear pipeline A -> B -> C whose middle step passes through `null`."""
    steps = [
        node(random, None, "A"),
        node(null, "A", "B"),
        node(identity, "B", "C"),
    ]
    return Pipeline(steps)
def branchless_pipeline():
    """Two-node linear pipeline: ds1 -> ds2 -> ds3, no branches."""
    first = node(identity, "ds1", "ds2", name="node1")
    second = node(identity, "ds2", "ds3", name="node2")
    return Pipeline([first, second])
def create_pipeline(**kwargs):
    """Build the HKTVmall scraping pipeline.

    Stage order (datasets connect the stages):
        1. start_conn   -- build request headers for hktvmall.com
        2. get_category -- fetch and clean the category listing
        3. gen_url_list -- generate product-listing URLs per browse method / category
        4. req_raw_df   -- multi-threaded requests for the raw product data
        5. etl_on_df    -- normalise the raw responses into dataframes
        6. df_to_csv    -- persist each dataframe via df_to_kedro_csvdataset

    Note: a large block of commented-out code that duplicated the
    full-site nodes already present above has been removed.
    """
    # conn start
    start_conn = [
        node(
            hktvmall_conn_node,
            inputs="params:hktvmall_home_url",
            outputs="HktvmallHeader",
            tags="Preparation",
        ),
    ]
    # get HKTV mall categories
    get_category = [
        # NOTE(review): 'catagory' spelling comes from the function/param names
        # themselves and must stay as-is.
        node(
            request_hktvmall_catagory_code,
            inputs=["HktvmallHeader", "params:hktvmall_category_diction_url"],
            outputs="category_raw_req",
            tags="Preparation",
        ),
        node(
            categories_df_etl,
            inputs="category_raw_req",
            outputs="category_df",
            tags="Preparation",
        ),
    ]
    # generate urls by type for requests
    gen_url_list = [
        node(
            gen_hktvmall_product_by_method_and_cat_links,
            inputs=[
                "params:hktvmall_catagory_code",
                "params:hktvmall_browse_method",
                "params:product_by_method_catcode_url",
            ],
            outputs=dict(
                method1="promotiondiff_url_list",
                method2="hotpickorder_url_list",
            ),
            tags="Preparation",
        ),
        node(
            gen_hktvmall_full_site_links,
            inputs=["category_df", "params:hktvmall_cat_product_url"],
            outputs="fullsite_url_list",
            tags="Preparation",
        ),
    ]
    # multi threading requests for raw data
    req_raw_df = [
        node(
            multi_threading_req,
            inputs=["HktvmallHeader", "promotiondiff_url_list"],
            outputs="promotiondiff_raw_list",
            tags="Requests",
        ),
        node(
            multi_threading_req,
            inputs=["HktvmallHeader", "hotpickorder_url_list"],
            outputs="hotpickorder_raw_list",
            tags="Requests",
        ),
        node(
            multi_threading_req,
            inputs=["HktvmallHeader", "fullsite_url_list"],
            outputs="fullsite_raw_list",
            tags="Requests",
        ),
    ]
    # ETL on df columns for proper columns
    etl_on_df = [
        node(raw_etl, inputs="promotiondiff_raw_list", outputs="promotiondiff_raw_df", tags="ETL"),
        node(raw_etl, inputs="hotpickorder_raw_list", outputs="hotpickorder_raw_df", tags="ETL"),
        node(raw_etl, inputs="fullsite_raw_list", outputs="fullsite_raw_df", tags="ETL"),
    ]
    # turn df to CSVDataSet
    df_to_csv = [
        node(
            df_to_kedro_csvdataset,
            inputs=["category_df", "params:category_path"],
            outputs="category_raw",
            tags="Saving Data",
        ),
        node(
            df_to_kedro_csvdataset,
            inputs=["promotiondiff_raw_df", "params:promotiondiff_path"],
            outputs="promotiondiff_raw",
            tags="Saving Data",
        ),
        node(
            df_to_kedro_csvdataset,
            inputs=["hotpickorder_raw_df", "params:hotpickorder_path"],
            outputs="hotpickorder_raw",
            tags="Saving Data",
        ),
        node(
            df_to_kedro_csvdataset,
            inputs=["fullsite_raw_df", "params:fullsite_path"],
            outputs="fullsite_raw",
            tags="Saving Data",
        ),
    ]
    pipe = start_conn + get_category + gen_url_list + req_raw_df + etl_on_df + df_to_csv
    return Pipeline(pipe)
def create_pipeline(**kwargs):
    """Evaluation pipeline: model predictions plus a CV quality metric."""
    prediction = node(make_prediction, ["test_x", "rf_model"], "predict")
    quality = node(rmse_cv, ["test_x", "test_y", "rf_model"], "quality")
    return Pipeline([prediction, quality])
def _create_pipelines():
    """Fixture pipelines: one healthy default plus two deliberately failing variants."""
    datasets = ["cars", "boats", "trains", "ships", "planes"]

    def _chain(funcs, names, pipeline_tags):
        # Four-node linear chain over `datasets`; only the first node carries tag1.
        first = node(funcs[0], datasets[0], datasets[1], name=names[0], tags=["tag1"])
        rest = [
            node(fn, datasets[pos], datasets[pos + 1], name=nm)
            for pos, (fn, nm) in enumerate(zip(funcs[1:], names[1:]), start=1)
        ]
        return Pipeline([first] + rest, tags=pipeline_tags)

    # Third node fails -> nodes after it never run.
    bad_pipeline_middle = _chain(
        [identity, identity, bad_node, identity],
        ["node1", "node2", "nodes3", "node4"],
        "bad_pipeline",
    )
    # First node fails -> nothing runs.
    bad_pipeline_head = _chain(
        [bad_node, identity, identity, identity],
        ["node1", "node2", "nodes3", "node4"],
        "bad_pipeline",
    )
    # All-identity chain; note the name here is "node3", not "nodes3".
    default_pipeline = _chain(
        [identity, identity, identity, identity],
        ["node1", "node2", "node3", "node4"],
        "pipeline",
    )
    return {
        "__default__": default_pipeline,
        "empty": Pipeline([]),
        "simple": Pipeline([node(identity, "cars", "boats")]),
        "bad_pipeline_middle": bad_pipeline_middle,
        "bad_pipeline_head": bad_pipeline_head,
    }
def test_task_exception(self, fan_out_fan_in, catalog):
    """An exception raised inside a node propagates out of ThreadRunner.run."""
    catalog.add_feed_dict(feed_dict=dict(A=42))
    failing = node(exception_fn, "Z", "X")
    pipeline = Pipeline([fan_out_fan_in, failing])
    with pytest.raises(Exception, match="test exception"):
        ThreadRunner().run(pipeline, catalog)
def test_labelled(self):
    """A named node's string form is prefixed with its label."""
    labelled = node(lambda x: None, "input1", "output1", name="labeled_node")
    assert "labeled_node: <lambda>([input1]) -> [output1]" in str(labelled)
def test_no_input(self):
    """A node without inputs renders 'None' on the input side of str()."""
    no_input_node = node(constant_output, None, "output1")
    assert "constant_output(None) -> [output1]" in str(no_input_node)
def saving_result_pipeline():
    """Single identity node copying ds into dsX."""
    only_node = node(identity, "ds", "dsX")
    return Pipeline([only_node])
def test_no_output(self):
    """A node without outputs renders 'None' on the output side of str()."""
    no_output_node = node(lambda x: None, "input1", None)
    assert "<lambda>([input1]) -> None" in str(no_output_node)
@pytest.fixture
def dummy_dataframe():
    """Small three-column dataframe used as shared test data."""
    return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})


def identity(input1: str):
    """Return the input unchanged (trivial node function)."""
    return input1  # pragma: no cover


def bad_node(x):
    """Node function that always fails, to exercise error handling."""
    raise ValueError("Oh no!")


# Pipeline whose third node (nodes3) raises, leaving the last two nodes unrun.
bad_pipeline_middle = Pipeline(
    [
        node(identity, "cars", "boats", name="node1", tags=["tag1"]),
        node(identity, "boats", "trains", name="node2"),
        node(bad_node, "trains", "ships", name="nodes3"),
        node(identity, "ships", "planes", name="node4"),
    ],
    tags="bad_pipeline",
)

# Resume-suggestion message expected after bad_pipeline_middle fails at nodes3.
expected_message_middle = (
    "There are 2 nodes that have not run.\n"
    "You can resume the pipeline run by adding the following "
    "argument to your previous command:\n"
    ' --from-nodes "nodes3"')

# NOTE(review): definition continues beyond this chunk of the file.
bad_pipeline_head = Pipeline(
    [
def test_outputs_none(self):
    """A node built with outputs=None exposes an empty outputs list."""
    no_output_node = node(identity, "input", None)
    assert no_output_node.outputs == []
def _get_pipelines(self) -> Dict[str, Pipeline]:
    """Expose a single default pipeline containing one identity node."""
    default = Pipeline([node(identity, "cars", "boats")])
    return {"__default__": default}
def test_node_equals(self):
    """Identically-defined nodes compare equal yet remain distinct objects."""
    left = node(identity, "input1", "output1", name="a node")
    right = node(identity, "input1", "output1", name="a node")
    assert left == right
    assert left is not right
def test_spark_pickle(self, is_async, data_catalog):
    """SparkDataSet(load) -> node -> PickleDataSet (save)"""
    single_node_pipeline = Pipeline([node(identity, "spark_in", "pickle_ds")])
    runner = SequentialRunner(is_async=is_async)
    with pytest.raises(DataSetError, match=".* was not serialized due to.*"):
        runner.run(single_node_pipeline, data_catalog)
def test_node_invalid_equals(self):
    """A node never compares equal to an object of another type."""
    some_node = node(identity, "input1", "output1", name="a node")
    assert some_node != "hello"
def create_pipeline(**kwargs):
    """Report-text extraction pipeline.

    Parses the raw filtered text ("filt_list") into a tidy "data" dataset,
    fans out into many single-field extraction nodes (complainant, accused,
    episode, conclusions, final report) and finally collects every extracted
    field into one Excel dataset ("excel_ovd_data").
    """
    return Pipeline([
        # Base parsing: raw filtered list -> tidy "data".
        node(func=tidy_data, inputs="filt_list", outputs="data",),
        node(func=buscar_numero_legajo, inputs="data", outputs="numero_legajo",),
        node(func=buscar_violencia_genero, inputs="data", outputs="violencia_de_genero",),
        node(func=buscar_violencia_fisica, inputs="data", outputs="violencia_fisica",),
        # Section extractors: split "data" into complainant / accused blocks.
        node(func=buscar_info_denunciante, inputs="data", outputs="info_denunciante",),
        node(func=buscar_info_acusado, inputs="data", outputs="info_acusado",),
        # Complainant (denunciante) fields.
        node(func=buscar_genero_den, inputs="info_denunciante", outputs="genero_denunciante",),
        node(func=buscar_nacionalidad_denunciante, inputs="info_denunciante", outputs="nacionalidad_denunciante",),
        node(func=buscar_est_civil_denunciente, inputs="info_denunciante", outputs="est_civil_denunciante",),
        node(func=buscar_edad_denunciante, inputs="info_denunciante", outputs="edad_denunciante",),
        node(func=buscar_est_denunciante, inputs="info_denunciante", outputs="est_denunciante",),
        node(func=buscar_domic_denunciante, inputs="info_denunciante", outputs="domic_denunciante",),
        node(func=buscar_villa_denunciante, inputs="info_denunciante", outputs="villa_denunciante",),
        node(func=buscar_ocupac_denunciante, inputs="info_denunciante", outputs="ocupac_denunciante",),
        # Accused (acusado) fields.
        node(func=buscar_genero_acusado, inputs="info_acusado", outputs="genero_acusado",),
        node(func=buscar_nacionalidad_acusado, inputs="info_acusado", outputs="nacionalida_acusado",),
        node(func=buscar_est_civil_acusado, inputs="info_acusado", outputs="est_civil_acusado",),
        node(func=buscar_edad_acusado, inputs="info_acusado", outputs="edad_acusado",),
        node(func=buscar_instruccion_acusado, inputs="info_acusado", outputs="instruccion_acusado",),
        node(func=buscar_domicilio_acusado, inputs="info_acusado", outputs="domicilio_acusado",),
        node(func=buscar_ocupacion_acusado, inputs="info_acusado", outputs="ocupacion_acusado",),
        node(func=buscar_relacion, inputs="info_acusado", outputs="relacion",),
        node(func=chequear_conv, inputs="info_acusado", outputs="convivencia",),
        # Episode details.
        node(func=buscar_info_episodio, inputs="data", outputs="info_episodio",),
        node(func=buscar_denuncia_anterior, inputs="info_episodio", outputs="denuncia_anterior",),
        node(func=buscar_medidas_prot, inputs="info_episodio", outputs="medidas_prot",),
        node(func=buscar_dia_hecho, inputs="data", outputs="dia_hecho",),
        # Conclusions / risk assessment.
        node(func=buscar_conclusiones, inputs="data", outputs="conclusiones",),
        node(func=buscar_riesgo, inputs="conclusiones", outputs="riesgo",),
        # Final report and the violence types recorded in it.
        node(func=buscar_informe_final, inputs="data", outputs="informe_final",),
        node(func=buscar_violencia_amb, inputs="informe_final", outputs="violencia_amb",),
        node(func=buscar_violencia_econ, inputs="informe_final", outputs="violencia_econ",),
        # NOTE(review): buscar_violencia_genero also runs above on "data";
        # here it re-runs on "informe_final" under a different output name.
        node(func=buscar_violencia_genero, inputs="informe_final", outputs="violencia_genero",),
        node(func=buscar_violencia_psico, inputs="informe_final", outputs="violencia_psico",),
        node(func=buscar_violencia_sex, inputs="informe_final", outputs="violencia_sex",),
        node(func=buscar_violencia_simb, inputs="informe_final", outputs="violencia_simb",),
        node(func=buscar_violencia_soc, inputs="informe_final", outputs="violencia_soc",),
        node(func=buscar_hijos, inputs="informe_final", outputs="hijos",),
        node(func=buscar_hijos_en_comun, inputs="hijos", outputs="hijos_en_comun",),
        node(func=buscar_frecuencia, inputs="informe_final", outputs="frecuencia",),
        # Quoted / unquoted phrases extracted from the text.
        node(func=buscar_dijo, inputs="data", outputs="frases_agresion",),
        node(func=buscar_dijo_sin_comillas, inputs="data", outputs="frases_sin_comillas",),
        node(func=buscar_comillas, inputs="data", outputs="frases_comillas",),
        # Dates / times. NOTE(review): fecha_del_hecho consumes fecha_denuncia,
        # which is produced by a node listed after it — presumably kedro resolves
        # execution order from data dependencies rather than list position.
        node(func=buscar_fecha_del_hecho, inputs=["info_episodio", "fecha_denuncia"], outputs="fecha_del_hecho",),
        node(func=buscar_horario_hecho, inputs="info_episodio", outputs="horario_hecho",),
        node(func=buscar_fecha_denuncia, inputs="data", outputs="fecha_denuncia",),
        # Collect every extracted field into a single Excel dataset.
        node(
            func=to_excel,
            inputs=[
                "violencia_de_genero", "violencia_fisica", "genero_denunciante",
                "nacionalidad_denunciante", "est_civil_denunciante", "edad_denunciante",
                "est_denunciante", "domic_denunciante", "villa_denunciante",
                "ocupac_denunciante", "genero_acusado", "nacionalida_acusado",
                "est_civil_acusado", "edad_acusado", "instruccion_acusado",
                "domicilio_acusado", "ocupacion_acusado", "relacion", "convivencia",
                "denuncia_anterior", "medidas_prot", "dia_hecho", "riesgo",
                "violencia_psico", "violencia_econ", "violencia_sex", "violencia_soc",
                "violencia_amb", "violencia_simb", "hijos_en_comun", "frecuencia",
                "frases_sin_comillas", "frases_agresion", "frases_comillas",
                "fecha_del_hecho", "fecha_denuncia", "horario_hecho", "numero_legajo",
            ],
            outputs="excel_ovd_data",
        ),
    ])
def test_different_output_list_order_not_equal(self):
    """Output list order is significant for node equality."""
    forward = node(identity, "input1", ["output1", "output2"], name="A")
    swapped = node(identity, "input1", ["output2", "output1"], name="A")
    assert forward != swapped
"""Contents of hello_kedro.py"""
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node, Pipeline
from kedro.runner import SequentialRunner

# Catalog with one in-memory dataset.
data_catalog = DataCatalog({"example_data": MemoryDataSet()})


# Prepare second node
def join_statements(greeting):
    """Append the project name to the greeting."""
    return f"{greeting} Kedro!"


join_statements_node = node(
    join_statements, inputs="my_salutation", outputs="my_message"
)


# Prepare first node
def return_greeting():
    """Produce the salutation consumed by the second node."""
    return "Bonjourno"


return_greeting_node = node(return_greeting, inputs=None, outputs="my_salutation")

# Assemble nodes into a pipeline
pipeline = Pipeline([join_statements_node, return_greeting_node])
def test_bad_input(func, expected):
    """Constructing a node from an invalid spec raises TypeError matching `expected`."""
    bad_args = func()
    with pytest.raises(TypeError, match=expected):
        node(*bad_args)
def create_pipeline(**kwargs):
    """Active-learning benchmark pipeline.

    Truncates the training set, precomputes a Gaussian kernel, then runs the
    active / lambda / passive / batch-descent / batch-ascent sampling analyses.
    (A commented-out split_train_pool node that duplicated dead configuration
    has been removed.)
    """
    return Pipeline(
        [
            # Keep only the first SIZE_ANALYSIS samples of the training data.
            node(
                func=truncate_dataset,
                inputs=dict(
                    X_train_full="X_train_full",
                    y_train_full="y_train_full",
                    size="params:SIZE_ANALYSIS",
                ),
                outputs=["X_train_trunc", "y_train_trunc"],
                tags=["pre_sampling"],
            ),
            # Precompute the kernel once so every analysis below reuses K_FIXE.
            node(
                func=compute_gaussian_kernel,
                inputs=dict(X="X_train_trunc"),
                outputs="K_FIXE",
                tags=["pre_sampling"],
            ),
            node(
                func=al_performances,
                inputs=dict(
                    bs="params:BATCH_SEQ",
                    budget="params:BUDGET",
                    n_simu="params:N_SIMULATIONS",
                    X_train_full="X_train_trunc",
                    y_train_full="y_train_trunc",
                    X_test="X_test",
                    y_test="y_test",
                    K_FIXE="K_FIXE",
                    n_init="params:N_INIT",
                ),
                outputs="al_perfs",
                tags=["sampling", "active_sampling"],
            ),
            node(
                func=lambda_analysis,
                inputs=dict(
                    b="params:BATCH_SIZE",
                    budget="params:BUDGET",
                    n_simu="params:N_SIMULATIONS",
                    X_train_full="X_train_trunc",
                    y_train_full="y_train_trunc",
                    X_test="X_test",
                    y_test="y_test",
                    n_init="params:N_INIT",
                    K_FIXE="K_FIXE",
                ),
                outputs="al_lam_perfs",
                tags=["sampling", "active_lambda_sampling"],
            ),
            node(
                func=pl_performances,
                inputs=dict(
                    bs="params:BATCH_SEQ",
                    budget="params:BUDGET",
                    n_simu="params:N_SIMULATIONS",
                    X_train_full="X_train_trunc",
                    y_train_full="y_train_trunc",
                    X_test="X_test",
                    y_test="y_test",
                    n_init="params:N_INIT",
                ),
                outputs="pl_perfs",
                tags=["sampling", "passive_sampling"],
            ),
            node(
                func=b_descent_analysis,
                inputs=dict(
                    budget="params:BUDGET",
                    n_simu="params:N_SIMULATIONS",
                    X_train_full="X_train_trunc",
                    y_train_full="y_train_trunc",
                    X_test="X_test",
                    y_test="y_test",
                    n_init="params:N_INIT",
                    b_descent_size="params:BATCH_DESCENT_SIZE",
                    K_FIXE="K_FIXE",
                ),
                outputs="b_descent_perfs",
                tags=["sampling", "active_descent_sampling"],
            ),
            # NOTE(review): b_ascent reuses params:BATCH_DESCENT_SIZE and the
            # "active_descent_sampling" tag — looks copy-pasted from the descent
            # node above; confirm whether an ascent-specific param/tag exists.
            node(
                func=b_ascent_analysis,
                inputs=dict(
                    budget="params:BUDGET",
                    n_simu="params:N_SIMULATIONS",
                    X_train_full="X_train_trunc",
                    y_train_full="y_train_trunc",
                    X_test="X_test",
                    y_test="y_test",
                    n_init="params:N_INIT",
                    b_ascent_size="params:BATCH_DESCENT_SIZE",
                    K_FIXE="K_FIXE",
                ),
                outputs="b_ascent_perfs",
                tags=["sampling", "active_descent_sampling"],
            ),
        ]
    )
def test_tag_nodes_single_tag(self):
    """tag() accepts a bare string, just like the tags= constructor argument."""
    tagged = node(identity, "input", "output", tags="hello").tag("world")
    for expected_tag in ("hello", "world"):
        assert expected_tag in tagged.tags
    assert len(tagged.tags) == 2
def test_updated_partial(self):
    """An update_wrapper-ed partial takes its name from the wrapped function."""
    wrapped = update_wrapper(partial(identity), identity)
    created = node(wrapped, ["in"], ["out"])
    expected = "identity([in]) -> [out]"
    assert str(created) == expected
    assert created.name == expected
    assert created.short_name == "Identity"