def test_worflow_dataframes():
    """Verify WorkflowDataFrames membership rules, and that partition()
    returns a new dataframe handle without mutating the original spec."""
    # NOTE(review): "worflow" in the test name looks like a typo for
    # "workflow"; left unchanged because pytest collects tests by name.
    wf_a = FugueWorkflow()
    a1 = wf_a.df([[0]], "a:int")
    a2 = wf_a.df([[0]], "b:int")
    wf_b = FugueWorkflow()
    b1 = wf_b.df([[0]], "a:int")

    named = WorkflowDataFrames(a=a1, b=a2)
    assert named["a"] is a1
    assert named["b"] is a2

    # an existing WorkflowDataFrames can be extended with more entries
    merged = WorkflowDataFrames(named, aa=a1, bb=a2)
    assert len(merged) == 4

    # dataframes belonging to two different workflows cannot be mixed
    with raises(ValueError):
        WorkflowDataFrames(a=a1, b=b1)
    # plain (non-workflow) dataframes are rejected as well
    with raises(ValueError):
        WorkflowDataFrames(a=a1, b=ArrayDataFrame([[0]], "a:int"))

    wf = FugueWorkflow()
    base = wf.df([[0], [1]], "a:int")
    assert base.partition_spec.empty
    parted = base.partition(by=["a"])
    # partition() must not touch the original; only the new handle carries the spec
    assert base.partition_spec.empty
    assert parted.partition_spec == PartitionSpec(by=["a"])
def test_yield(tmpdir):
    """Yielded dataframes and yielded files can feed later workflows."""
    src = pd.DataFrame([[0, 0]], columns=["a", "b"])

    # schema: *
    def t(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(b=df.b + 1)

    # yield an in-memory dataframe and read it back from run()'s result
    first = FugueWorkflow()
    first.df(src).transform(t).yield_dataframe_as("x")
    out = first.run()["x"]
    assert out.as_array() == [[0, 1]]

    # yield to a file; this requires a checkpoint path in the run config
    file_dag = FugueWorkflow()
    file_dag.df(src).transform(t).yield_file_as("x")
    file_dag.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    # a later workflow can consume the yielded file via .yields
    second = FugueWorkflow()
    second.df(file_dag.yields["x"]).transform(t).yield_dataframe_as("y")
    out = second.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})["y"]
    assert out.as_array() == [[0, 2]]

    # yielded dataframes chain across workflows without a checkpoint path
    third = FugueWorkflow()
    third.df(second.yields["y"]).transform(t).yield_dataframe_as("z")
    out = third.run()["z"]
    assert out.as_array() == [[0, 3]]
def test_workflow():
    """End-to-end workflow construction, compile-time errors, and execution
    on multiple engines.

    Fix: the original discarded every ``df_eq(...)`` return value; ``df_eq``
    returns a bool (it does not raise by default), so those comparisons could
    never fail the test. Each call is now wrapped in ``assert``.
    """
    builder = FugueWorkflow()
    a = builder.create_data([[0], [0], [1]], "a:int")
    # workflow tasks are single-use graph nodes: copying is forbidden
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()
    # an int is not a valid dataframe source -> compile-time error
    raises(FugueWorkflowCompileError, lambda: builder.df(123))
    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()
    builder.run()
    assert df_eq(a.result, [[0], [0], [1]], "a:int")
    # run() accepts an engine/context, not an arbitrary string object
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    assert df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    assert df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    assert df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    assert df_eq(
        b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]], "a:int,b:int"
    )
def test_session_as_engine(self):
    """A SparkSession object can be passed directly to run() as the engine."""
    dag = FugueWorkflow()
    rows = [[i, 0] for i in range(100)]
    src = dag.df(rows, "a:int,b:int")
    # even partitioning by "a", then a transform that asserts partition sizes
    src.partition(algo="even", by=["a"]).transform(AssertMaxNTransform).persist()
    dag.run(self.spark_session)