def test_run_asha(tmpdir):
    class M(Monitor):
        def on_report(self, report: TrialReport) -> None:
            print(report.jsondict)

    def assert_metric(df: Iterable[Dict[str, Any]], metric: float, ct: int) -> None:
        n = 0
        for row in df:
            assert row[TUNE_REPORT_METRIC] == metric
            n += 1
        assert n == ct

    space = Space(a=Grid(0, 1, 2, 3))
    dag = FugueWorkflow()
    dataset = TuneDatasetBuilder(space, str(tmpdir)).build(dag, shuffle=False)
    obj = F()
    res = optimize_by_continuous_asha(
        obj,
        dataset,
        plan=[[1.0, 3], [1.0, 2], [1.0, 1], [1.0, 1]],
        checkpoint_path=str(tmpdir),
    )
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))
    res = optimize_by_continuous_asha(
        obj,
        dataset,
        plan=[[2.0, 2], [1.0, 1], [1.0, 1]],
        checkpoint_path=str(tmpdir),
        monitor=M(),
    )
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))
    dag.run()

def test_compile_conf():
    def assert_conf(e: ExecutionEngine, **kwargs) -> pd.DataFrame:
        for k, v in kwargs.items():
            assert e.compile_conf[k] == v
        return pd.DataFrame([[0]], columns=["a"])

    dag = FugueWorkflow(conf={"a": 1})
    dag.create(assert_conf, params=dict(a=1))
    dag.run()
    with raises(KeyError):
        # a non-compile-time param doesn't carry over to a new engine
        dag.run(NativeExecutionEngine())

    dag = FugueWorkflow(conf={FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "abc"})
    dag.create(assert_conf, params=dict({FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "abc"}))
    dag.run()
    # a compile-time param is kept when running on a new engine
    dag.run(NativeExecutionEngine())
    # a compile-time param can't be changed by a new engine:
    # the new engine's compile conf will be overwritten
    dag.run(NativeExecutionEngine({FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "def"}))

def test_yield(tmpdir):
    df = pd.DataFrame([[0, 0]], columns=["a", "b"])

    # schema: *
    def t(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(b=df.b + 1)

    dag = FugueWorkflow()
    dag.df(df).transform(t).yield_dataframe_as("x")
    result = dag.run()["x"]
    assert [[0, 1]] == result.as_array()

    dag1 = FugueWorkflow()
    dag1.df(df).transform(t).yield_file_as("x")
    dag1.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    dag2 = FugueWorkflow()
    dag2.df(dag1.yields["x"]).transform(t).yield_dataframe_as("y")
    result = dag2.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})["y"]
    assert [[0, 2]] == result.as_array()

    dag3 = FugueWorkflow()
    dag3.df(dag2.yields["y"]).transform(t).yield_dataframe_as("z")
    result = dag3.run()["z"]
    assert [[0, 3]] == result.as_array()

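# A minimal sketch of the yield mechanism exercised above, reusing only the
# FugueWorkflow API shown in test_yield: one workflow yields a dataframe by
# name, and a later workflow consumes it through dag.yields. The function
# name is hypothetical.
def _yield_roundtrip_sketch():
    dag_a = FugueWorkflow()
    dag_a.df(pd.DataFrame([[1]], columns=["a"])).yield_dataframe_as("stage1")
    dag_a.run()

    dag_b = FugueWorkflow()
    dag_b.df(dag_a.yields["stage1"]).yield_dataframe_as("stage2")
    assert [[1]] == dag_b.run()["stage2"].as_array()
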
def test_process_stack_space(tmpdir):
    space1 = ss(LinearRegression, normalize=Grid(True, False))
    space2 = ss(LinearRegression, fit_intercept=Grid(True, False))
    dag = FugueWorkflow()
    result0 = build_sk_cv(
        space1,
        dag.df(_create_mock_data()),
        scoring="neg_mean_absolute_error",
        cv=2,
        label_col="l",
        feature_prefix="f_",
    ).tune(distributable=False, serialize_path=str(tmpdir))
    res0 = result0.process(_process_stack_space, params=dict(keys=[], space=space2))
    res0.show()

    result1 = build_sk_cv(
        space1,
        dag.df(_create_mock_data()).partition(by=["p"]),
        scoring="neg_mean_absolute_error",
        cv=2,
        label_col="l",
        feature_prefix="f_",
    ).tune(distributable=False, serialize_path=str(tmpdir))
    res1 = result1.process(_process_stack_space, params=dict(keys=["p"], space=space2))
    dag.run()

    assert 2 == len(res0.result.as_array())
    assert 8 == len(res1.result.as_array())

def test_hyperband(tmpdir):
    def assert_metric(df: Iterable[Dict[str, Any]], metric: float, ct: int) -> None:
        n = 0
        for row in df:
            if metric > 0:
                assert row[TUNE_REPORT_METRIC] == metric
            n += 1
        assert n == ct

    space = Space(a=Grid(0, 1, 2, 3))
    dag = FugueWorkflow()
    dataset = TuneDatasetBuilder(space, str(tmpdir)).build(dag)
    obj = F()
    res = optimize_by_hyperband(
        obj,
        dataset,
        plans=[
            [[1.0, 3], [1.0, 2], [1.0, 1], [1.0, 1]],
            [[2.0, 2], [1.0, 1], [1.0, 1]],
        ],
        checkpoint_path=str(tmpdir),
    )
    res.result().output(assert_metric, dict(metric=0.0, ct=2))
    res.result(1).output(assert_metric, dict(metric=1.0, ct=1))
    dag.run()

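# A hedged note on the plan format, inferred from the
# List[List[Tuple[float, int]]] typing of suggest_by_hyperband later in this
# section and from the assertions above: each inner pair appears to be
# (budget for the rung, number of trials kept after the rung), and each outer
# list is one successive-halving bracket. The name below is illustrative only.
example_plans = [
    [(1.0, 3), (1.0, 2), (1.0, 1)],  # small per-rung budget, gradual halving
    [(2.0, 2), (1.0, 1)],  # larger initial budget, fewer rungs
]
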
def test_study(tmpdir):
    space = Space(a=Grid(-2, 0, 1))
    input_df = pd.DataFrame([[0, 1], [1, 1], [0, 2]], columns=["a", "b"])
    dag = FugueWorkflow()
    monitor = M()

    # no data partition
    builder = TuneDatasetBuilder(space, str(tmpdir)).add_df("b", dag.df(input_df))
    dataset = builder.build(dag, 1)
    for distributed in [True, False, None]:
        # min_better = True
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective),
            dataset=dataset,
            distributed=distributed,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[3.0, 4.0, 7.0])
        )
        result.result(2)[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[3.0, 4.0])
        )

        # min_better = False
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective, min_better=False),
            dataset=dataset,
            distributed=distributed,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[-7.0, -4.0, -3.0])
        )
        result.result(2)[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[-7.0, -4.0])
        )

    # with data partition
    builder = TuneDatasetBuilder(space, str(tmpdir)).add_df(
        "b", dag.df(input_df).partition_by("a")
    )
    dataset = builder.build(dag, 1)
    for distributed in [True, False, None]:
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective),
            dataset=dataset,
            distributed=distributed,
            monitor=monitor,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[2.0, 3.0, 6.0, 1.0, 2.0, 5.0])
        )
        result.result(1)[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[1.0, 2.0])
        )

    dag.run()
    # 3 distributed modes x 3 grid values x 2 partitions report to the monitor
    assert 3 * 3 * 2 == len(monitor._reports)

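# M is defined elsewhere in this module; a minimal sketch of what such a
# monitor could look like, based on the Monitor.on_report hook shown in
# test_run_asha and the monitor._reports count asserted above. The class name
# and implementation are assumptions.
class RecordingMonitor(Monitor):
    def __init__(self):
        super().__init__()
        self._reports = []

    def on_report(self, report: TrialReport) -> None:
        # collect every report so a test can count how many trials reported
        self._reports.append(report)
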
def test_build_sk_cv(tmpdir):
    space = sum([
        ss(LinearRegression, fit_intercept=Grid(True, False)),
        ss(LinearRegression, normalize=Grid(True, False)),
    ])
    dag = FugueWorkflow()
    build_sk_cv(
        space,
        dag.df(_create_mock_data()),
        scoring="neg_mean_absolute_error",
        cv=4,
        label_col="l",
        feature_prefix="f_",
        save_path=str(tmpdir),
    ).tune(distributable=False, serialize_path=str(tmpdir)).show()
    dag.run()

def test_workflow():
    builder = FugueWorkflow()

    a = builder.create_data([[0], [0], [1]], "a:int")
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()

    raises(FugueWorkflowCompileError, lambda: builder.df(123))

    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()

    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int")
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")

def test_run_ibis_duck(self):
    def _test1(con: ibis.BaseBackend) -> ibis.Expr:
        tb = con.table("a")
        return tb

    def _test2(con: ibis.BaseBackend) -> ibis.Expr:
        tb = con.table("a")
        return tb.mutate(c=tb.a + tb.b)

    dag = FugueWorkflow()
    df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
    res = run_ibis(_test1, ibis_engine="duck", a=df)
    res.assert_eq(df)
    df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
    res = run_ibis(_test2, ibis_engine="duckdb", a=df)
    df2 = dag.df([[0, 1, 1], [2, 3, 5]], "a:long,b:long,c:long")
    res.assert_eq(df2)
    dag.run(NativeExecutionEngine())

def test_out_transform(tmpdir):
    pdf = pd.DataFrame([[1, 10], [0, 0], [1, 1], [0, 20]], columns=["a", "b"])

    class T:
        def __init__(self):
            self.n = 0

        def f(self, df: Iterable[Dict[str, Any]]) -> None:
            self.n += 1

    t = T()
    out_transform(pdf, t.f)
    assert 1 == t.n  # without partitioning, f runs once

    t = T()
    out_transform(pdf, t.f, partition=dict(by=["a"]))
    assert 2 == t.n  # one run per distinct value of "a"

    dag = FugueWorkflow()
    dag.df(pdf).yield_dataframe_as("x1")
    dag.df(pdf).yield_dataframe_as("x2")
    dag.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    t = T()
    out_transform(dag.yields["x1"], t.f)
    assert 1 == t.n

    t = T()
    out_transform(
        dag.yields["x2"],
        t.f,
        partition=dict(by=["a"]),
        engine_conf={FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)},
    )
    assert 2 == t.n

    # schema: *
    def f3(df: pd.DataFrame, called: callable) -> pd.DataFrame:
        called()
        return df

    cb = Callback()
    out_transform(pdf, f3, callback=cb.called)  # out_transform returns nothing
    assert 1 == cb.ct

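# Callback is defined elsewhere in this module; a minimal sketch of what it
# could look like given the cb.called / cb.ct usage above. The implementation
# is an assumption; only the attribute names come from the test.
class Callback:
    def __init__(self):
        self.ct = 0

    def called(self) -> None:
        # invoked from inside the transformer through the callback parameter
        self.ct += 1
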
def suggest_sk_model(
    space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
) -> List[Dict[str, Any]]:
    e = make_execution_engine(execution_engine)
    model_path = serialize_path if save_model else ""
    dag = FugueWorkflow()
    df = dag.df(train_df)
    if len(partition_keys) > 0:
        df = df.partition(by=partition_keys)
    skcv = build_sk_cv(
        space=space,
        train_df=df,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
        save_path=model_path,
    )
    result = skcv.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    ).persist()
    best = select_best(result, top=top_n) if top_n > 0 else result
    visualize_top(result, top=visualize_top_n)
    dag.run(e)
    return list(best.result.as_dict_iterable())

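# A hedged usage sketch of suggest_sk_model, reusing the ss/Grid space
# construction from the tests above; the training frame, column values, and
# function name are illustrative.
def _suggest_sk_model_usage_sketch(tmpdir):
    space = ss(LinearRegression, fit_intercept=Grid(True, False))
    train = pd.DataFrame({"f_x": [0.0, 1.0, 2.0, 3.0], "label": [0.1, 0.9, 2.1, 2.9]})
    # returns the top configuration(s) as a list of dicts
    return suggest_sk_model(
        space,
        train,
        scoring="neg_mean_absolute_error",
        serialize_path=str(tmpdir),
        cv=2,
        feature_prefix="f_",
    )
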
def suggest_sk_models_by_cv(
    space: Space,
    train_df: Any,
    scoring: str,
    cv: int = 5,
    temp_path: str = "",
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: Optional[List[str]] = None,
    top_n: int = 1,
    local_optimizer: Optional[NonIterativeObjectiveLocalOptimizer] = None,
    monitor: Any = None,
    stopper: Any = None,
    stop_check_interval: Any = None,
    distributed: Optional[bool] = None,
    execution_engine: Any = None,
    execution_engine_conf: Any = None,
) -> List[TrialReport]:
    dag = FugueWorkflow()
    dataset = TUNE_OBJECT_FACTORY.make_dataset(
        dag,
        space,
        df=train_df,
        partition_keys=partition_keys,
        temp_path=temp_path,
    )
    objective = SKCVObjective(
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
        checkpoint_path=temp_path if save_model else None,
    )
    study = optimize_noniterative(
        objective=objective,
        dataset=dataset,
        optimizer=local_optimizer,
        distributed=distributed,
        monitor=monitor,
        stopper=stopper,
        stop_check_interval=stop_check_interval,
    )
    study.result(top_n).yield_dataframe_as("result")

    rows = list(
        dag.run(
            execution_engine,
            conf=execution_engine_conf,
        )["result"].as_dict_iterable()
    )
    return [
        from_base64(r[TUNE_REPORT])
        for r in sorted(rows, key=lambda r: r[TUNE_REPORT_METRIC])
    ]

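# A hedged usage sketch of suggest_sk_models_by_cv. Whether the ss helper
# from the tests above builds a space compatible with SKCVObjective is an
# assumption; data, paths, and the function name are illustrative.
def _suggest_by_cv_usage_sketch(tmpdir):
    space = ss(LinearRegression, fit_intercept=Grid(True, False))
    train = pd.DataFrame({"f_x": [0.0, 1.0, 2.0, 3.0], "label": [0.1, 0.9, 2.1, 2.9]})
    reports = suggest_sk_models_by_cv(
        space,
        train,
        scoring="neg_mean_absolute_error",
        cv=2,
        temp_path=str(tmpdir),
        feature_prefix="f_",
        top_n=1,
    )
    return reports  # TrialReport objects, lowest metric first
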
def test_transform_from_yield(tmpdir):
    # schema: *,x:int
    def f(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(x=1)

    dag = FugueWorkflow()
    dag.df([[0]], "a:int").yield_dataframe_as("x1")
    dag.df([[1]], "b:int").yield_dataframe_as("x2")
    dag.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    result = transform(dag.yields["x1"], f)
    assert isinstance(result, DataFrame)
    assert result.as_array(type_safe=True) == [[0, 1]]

    result = transform(
        dag.yields["x2"],
        f,
        engine_conf={FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)},
    )
    assert isinstance(result, DataFrame)
    assert result.as_array(type_safe=True) == [[1, 1]]

def test_runtime_exception():
    if sys.version_info < (3, 7):
        return

    def tr(df: pd.DataFrame) -> pd.DataFrame:
        raise Exception

    def show(df):
        df.show()

    dag = FugueWorkflow()
    df = dag.df([[0]], "a:int")
    df = df.transform(tr, schema="*")
    show(df)

    try:
        dag.run()
    except Exception:
        # by default, framework frames are pruned from the traceback
        assert len(traceback.extract_tb(sys.exc_info()[2])) < 10
    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: False})
    except Exception:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10
    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: ""})
    except Exception:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10

def test_dataset(tmpdir):
    space = Space(a=Grid(0, 1, 2, 3, 4), b=Grid(5, 6, 7, 8, 9))
    builder = TuneDatasetBuilder(space, str(tmpdir))
    dag = FugueWorkflow()
    dataset = builder.build(dag)
    ds = dataset.split([4, 1], 0)
    assert 2 == len(ds)
    ds[0].data.yield_dataframe_as("a")
    ds[1].data.yield_dataframe_as("b")
    res = dag.run()
    # the 5 x 5 grid yields 25 trials, split roughly 4:1 between the datasets
    assert 25 == len(res["a"].as_array()) + len(res["b"].as_array())
    assert len(res["b"].as_array()) < 10

def _run(
    dag: FugueWorkflow, execution_engine: Any, execution_engine_conf: Any
) -> List[TrialReport]:
    try:
        rows = list(
            dag.run(
                execution_engine,
                conf=execution_engine_conf,
            )["result"].as_dict_iterable()
        )
        return [
            from_base64(r[TUNE_REPORT])
            for r in sorted(rows, key=lambda r: r[TUNE_REPORT_METRIC])
        ]
    except FugueDataFrameError as e:
        # unwrap the framework wrapper so callers see the original error
        raise e.__cause__ or e.__context__ or e

def test_modified_exception():
    if sys.version_info < (3, 7):
        return

    def tr(df: pd.DataFrame) -> pd.DataFrame:
        raise Exception

    def show(df):
        df.show()

    def tt(df):
        # this special local is picked up by Fugue and, as asserted below,
        # becomes the __cause__ of the error raised when this step fails
        __modified_exception__ = NotImplementedError()
        return df.transform(tr, schema="*")

    dag = FugueWorkflow()
    df = dag.df([[0]], "a:int")
    df = tt(df)
    show(df)

    try:
        dag.run()
    except Exception as ex:
        assert isinstance(ex.__cause__, NotImplementedError)

def suggest_by_hyperband(
    objective: Any,
    space: Space,
    plans: List[List[Tuple[float, int]]],
    train_df: Any = None,
    temp_path: str = "",
    partition_keys: Optional[List[str]] = None,
    top_n: int = 1,
    monitor: Any = None,
    distributed: Optional[bool] = None,
    execution_engine: Any = None,
    execution_engine_conf: Any = None,
) -> List[TrialReport]:
    assert_or_throw(
        not space.has_random_parameter,
        TuneCompileError(
            "space can't contain random parameters, "
            "use sample method before calling this function"
        ),
    )
    dag = FugueWorkflow()
    dataset = TUNE_OBJECT_FACTORY.make_dataset(
        dag,
        space,
        df=train_df,
        partition_keys=partition_keys,
        temp_path=temp_path,
    )
    study = optimize_by_hyperband(
        objective=objective,
        dataset=dataset,
        plans=plans,
        checkpoint_path=temp_path,
        distributed=distributed,
        monitor=monitor,
    )
    study.result(top_n).yield_dataframe_as("result")

    rows = list(
        dag.run(
            execution_engine,
            conf=execution_engine_conf,
        )["result"].as_dict_iterable()
    )
    return [
        TrialReport.from_jsondict(json.loads(r[TUNE_REPORT]))
        for r in sorted(rows, key=lambda r: r[TUNE_REPORT_METRIC])
    ]

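# A hedged usage sketch of suggest_by_hyperband; the objective argument
# stands in for an iterative objective like the F() used in the tests above,
# and the plans follow the (budget, trials kept) reading noted earlier. The
# function name is illustrative.
def _suggest_by_hyperband_usage_sketch(iterative_objective, tmpdir):
    space = Space(a=Grid(0, 1, 2, 3))
    reports = suggest_by_hyperband(
        iterative_objective,
        space,
        plans=[
            [(1.0, 3), (1.0, 2), (1.0, 1)],
            [(2.0, 2), (1.0, 1)],
        ],
        temp_path=str(tmpdir),
        top_n=1,
    )
    return reports  # best TrialReport first
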
def suggest_sk_stacking_model(
    space: Space,
    stack_space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
    stack_cv: int = 2,
    stack_method: str = "auto",
    stack_passthrough: bool = False,
) -> List[Dict[str, Any]]:
    e = make_execution_engine(execution_engine)
    model_path = serialize_path if save_model else ""
    dag = FugueWorkflow()
    df = dag.df(train_df)
    if len(partition_keys) > 0:
        df = df.partition(by=partition_keys)
    skcv = build_sk_cv(
        space=space,
        train_df=df,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
    )
    result = skcv.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    ).persist()
    best_models = select_best(result.transform(_extract_model), top=1)
    if top_n > 0:
        best_models = select_best(best_models.drop(["_sk__model"]), top=top_n)
    kwargs = Space(
        _sk__scoring=scoring,
        _sk__cv=cv,
        _sk__feature_prefix=feature_prefix,
        _sk__label_col=label_col,
        _sk__save_path=model_path,
        _sk__stack_cv=stack_cv,
        _sk__method=stack_method,
        _sk__passthrough=stack_passthrough,
    )
    space_df = best_models.process(
        _process_stack_space,
        params=dict(keys=partition_keys, space=stack_space * kwargs),
    )
    data = serialize_df(df, name="_sk__train_df", path=serialize_path)
    if len(partition_keys) > 0:
        data = data.inner_join(space_df.broadcast())
    else:
        data = data.cross_join(space_df.broadcast())
    result = tune(
        data,
        tunable=tunable(_sk_stack_cv),
        distributable=distributable,
        objective_runner=objective_runner,
    )
    best = select_best(result, top=1)
    visualize_top(result, top=visualize_top_n)
    dag.run(e)
    return list(best.result.as_dict_iterable())

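# A hedged usage sketch of suggest_sk_stacking_model, mirroring the ss/Grid
# usage in the tests above: the base model spaces are tuned first, then the
# stacking ensemble is tuned over stack_space. Data, paths, and the function
# name are illustrative.
def _suggest_stacking_usage_sketch(tmpdir):
    space = sum([
        ss(LinearRegression, fit_intercept=Grid(True, False)),
        ss(LinearRegression, normalize=Grid(True, False)),
    ])
    stack_space = ss(LinearRegression, fit_intercept=Grid(True, False))
    train = pd.DataFrame({"f_x": [0.0, 1.0, 2.0, 3.0], "label": [0.1, 0.9, 2.1, 2.9]})
    return suggest_sk_stacking_model(
        space,
        stack_space,
        train,
        scoring="neg_mean_absolute_error",
        serialize_path=str(tmpdir),
        cv=2,
        feature_prefix="f_",
    )
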