def test_yield(tmpdir):
    """Yielded results must be consumable by later workflows.

    Chains three workflows: the first yields a dataframe and a file,
    each subsequent workflow consumes the previous yield and applies
    the same +1 transformer, so column ``b`` increments per hop.
    """
    source = pd.DataFrame([[0, 0]], columns=["a", "b"])

    # schema: *
    def t(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(b=df.b + 1)

    # yield_dataframe_as: result retrievable directly from run()
    wf = FugueWorkflow()
    wf.df(source).transform(t).yield_dataframe_as("x")
    out = wf.run()["x"]
    assert [[0, 1]] == out.as_array()

    # yield_file_as requires a checkpoint path to write the file to
    wf1 = FugueWorkflow()
    wf1.df(source).transform(t).yield_file_as("x")
    wf1.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    # a second workflow can load the file-yield of the first
    wf2 = FugueWorkflow()
    wf2.df(wf1.yields["x"]).transform(t).yield_dataframe_as("y")
    out = wf2.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})["y"]
    assert [[0, 2]] == out.as_array()

    # a dataframe-yield needs no checkpoint path downstream
    wf3 = FugueWorkflow()
    wf3.df(wf2.yields["y"]).transform(t).yield_dataframe_as("z")
    out = wf3.run()["z"]
    assert [[0, 3]] == out.as_array()
def test_workflow():
    """End-to-end checks of the FugueWorkflow builder API.

    Verifies task immutability, compile-time validation, transform /
    partition / persist / broadcast chaining, and that results agree
    across the different ``run`` / ``compute`` engine spellings.
    """
    wf = FugueWorkflow()
    src = wf.create_data([[0], [0], [1]], "a:int")

    # workflow tasks must not be copyable in any form
    raises(InvalidOperationError, lambda: src._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(src._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(src._task))

    src.show()
    src.show()

    # an invalid dataframe argument fails at compile time
    raises(FugueWorkflowCompileError, lambda: wf.df(123))

    res = src.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    res.show()
    wf.create_data([[0], [1]], "b:int").show()
    extra = ArrayDataFrame([[100]], "a:int")
    wf.show(src, res, extra)

    # rebind res to a partitioned transform with persist + broadcast
    res = src.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    res.show()

    wf.run()
    df_eq(src.result, [[0], [0], [1]], "a:int")

    # a bogus engine spec raises; valid specs all produce the same data
    raises(TypeError, lambda: wf.run("abc"))
    wf.run(FugueWorkflowContext())
    df_eq(src.result, [[0], [0], [1]], "a:int")
    wf.run("NativeExecutionEngine")
    df_eq(res.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(res.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(res.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
def test_iterative_study(tmpdir):
    """IterativeStudy over a small search space reaches the metric bound.

    Builds a five-point (a, b) search space, optimizes it with budgets
    ``J([1, 2, 3, 4])``, and asserts every reported best-result metric
    is below -2.8.
    """

    def assert_metric(df: Iterable[Dict[str, Any]], metric: float) -> None:
        # every surviving row must beat the given metric threshold
        for row in df:
            assert row[TUNE_REPORT_METRIC] < metric

    study = IterativeStudy(F(), str(tmpdir))

    candidates = [(1.1, 0.2), (0.8, -0.2), (1.2, -0.1), (0.7, 0.3), (1.0, 1.5)]
    space = sum(Space(a=a, b=b) for a, b in candidates)

    dag = FugueWorkflow()
    dataset = TuneDatasetBuilder(space, str(tmpdir)).build(dag)
    result = study.optimize(dataset, J([1, 2, 3, 4]))

    best = result.result(1)
    best.show()
    best.output(assert_metric, params=dict(metric=-2.8))

    dag.run()
def test_session_as_engine(self):
    """A live Spark session can be passed directly as the execution engine.

    Runs an even-partitioned transform over 100 single-row groups and
    executes the workflow with ``self.spark_session`` as the engine.
    """
    wf = FugueWorkflow()
    rows = [[i, 0] for i in range(100)]
    df = wf.df(rows, "a:int,b:int")
    df.partition(algo="even", by=["a"]).transform(AssertMaxNTransform).persist()
    wf.run(self.spark_session)