def test_workflow_determinism_5():
    """Changing only the arguments of ``show`` must not change dataframe ids.

    Two workflows are built with identical ``create_data`` and ``transform``
    steps. The second one calls ``show(rows=22)`` instead of ``show()``.
    The test expects the dataframe spec uuids to match while the workflow
    spec uuids differ.
    """
    schema = "a:int32"
    out_schema = "*,b:int"

    # Reference workflow: plain ``show()``.
    first_dag = FugueWorkflow()
    first_src = first_dag.create_data([[0], [0], [1]], schema)
    first_out = first_src.transform(
        mock_tf1, out_schema, pre_partition={"by": ["a"], "num": 2}
    )
    first_src.show()

    # Same steps, but ``show`` receives a non-default ``rows`` argument.
    second_dag = FugueWorkflow()
    second_src = second_dag.create_data([[0], [0], [1]], schema)
    second_out = second_src.transform(
        mock_tf1, out_schema, pre_partition={"by": ["a"], "num": 2}
    )
    second_src.show(rows=22)  # <--- the only difference between the two dags

    # The dataframes themselves are expected to be unaffected ...
    for left, right in ((first_src, second_src), (first_out, second_out)):
        assert left.spec_uuid() == right.spec_uuid()
    # ... while the workflows as a whole are expected to differ.
    assert first_dag.spec_uuid() != second_dag.spec_uuid()
def test_auto_persist():
    """Auto persist and weak checkpoints are expected to keep the same spec uuid.

    Three scenarios are compared, each across a plain workflow, a workflow
    whose engine conf enables ``fugue.workflow.auto_persist`` and a workflow
    that calls ``weak_checkpoint`` explicitly.

    NOTE(review): another ``test_auto_persist`` is defined later in this
    file. If both live in the same module the later one shadows this one, so
    pytest would not collect it. Confirm which one is intended.
    """
    auto_conf = {"fugue.workflow.auto_persist": True}
    auto_conf_with_value = {
        "fugue.workflow.auto_persist": True,
        "fugue.workflow.auto_persist_value": "abc",
    }

    # --- Scenario 1: dataframe consumed twice, default persist value -------
    plain = FugueWorkflow(NativeExecutionEngine())
    frame = plain.df([[0]], "a:int")
    for _ in range(2):
        frame.show()
    plain_id = plain.spec_uuid()

    auto = FugueWorkflow(NativeExecutionEngine(auto_conf))
    frame = auto.df([[0]], "a:int")
    for _ in range(2):
        frame.show()
    auto_id = auto.spec_uuid()

    explicit = FugueWorkflow(NativeExecutionEngine())
    frame = explicit.df([[0]], "a:int").weak_checkpoint(level=None)
    for _ in range(2):
        frame.show()
    explicit_id = explicit.spec_uuid()

    assert plain_id == auto_id
    assert auto_id == explicit_id

    # --- Scenario 2: dataframe consumed twice, persist value "abc" ---------
    auto = FugueWorkflow(NativeExecutionEngine(auto_conf_with_value))
    frame = auto.df([[0]], "a:int")
    for _ in range(2):
        frame.show()
    auto_id = auto.spec_uuid()

    explicit = FugueWorkflow(NativeExecutionEngine())
    frame = explicit.df([[0]], "a:int").weak_checkpoint(level="abc")
    for _ in range(2):
        frame.show()
    explicit_id = explicit.spec_uuid()

    assert auto_id == explicit_id

    # --- Scenario 3: dataframe consumed only once ---------------------------
    plain = FugueWorkflow(NativeExecutionEngine())
    frame = plain.df([[0]], "a:int")
    frame.show()
    plain_id = plain.spec_uuid()

    auto = FugueWorkflow(NativeExecutionEngine(auto_conf))
    frame = auto.df([[0]], "a:int")
    frame.show()  # auto persist will not trigger
    auto_id = auto.spec_uuid()

    explicit = FugueWorkflow(NativeExecutionEngine())
    frame = explicit.df([[0]], "a:int").weak_checkpoint(level=None)
    frame.show()
    explicit_id = explicit.spec_uuid()

    # checkpoint, including auto_persist doesn't change determinism
    assert plain_id == auto_id
    assert auto_id == explicit_id
def test_auto_persist():
    """Auto persist is expected to yield the same spec uuid as ``persist()``.

    Three scenarios are compared, each across a plain workflow, a workflow
    whose engine conf enables ``fugue.workflow.auto_persist`` and a workflow
    that calls ``persist`` explicitly.

    NOTE(review): an earlier ``test_auto_persist`` is defined in this file
    with contradicting expectations (it asserts ``id1 == id2`` for the first
    scenario). If both live in the same module this definition shadows the
    earlier one. Confirm which one is intended.
    """
    auto_conf = {"fugue.workflow.auto_persist": True}
    auto_conf_with_value = {
        "fugue.workflow.auto_persist": True,
        "fugue.workflow.auto_persist_value": "abc",
    }

    # --- Scenario 1: dataframe consumed twice, default persist value -------
    plain = FugueWorkflow(NativeExecutionEngine())
    frame = plain.df([[0]], "a:int")
    for _ in range(2):
        frame.show()
    plain_id = plain.spec_uuid()

    auto = FugueWorkflow(NativeExecutionEngine(auto_conf))
    frame = auto.df([[0]], "a:int")
    for _ in range(2):
        frame.show()
    auto_id = auto.spec_uuid()

    explicit = FugueWorkflow(NativeExecutionEngine())
    frame = explicit.df([[0]], "a:int").persist()
    for _ in range(2):
        frame.show()
    explicit_id = explicit.spec_uuid()

    # Auto persist is expected to change the spec and to match persist().
    assert plain_id != auto_id
    assert auto_id == explicit_id

    # --- Scenario 2: dataframe consumed twice, persist value "abc" ---------
    auto = FugueWorkflow(NativeExecutionEngine(auto_conf_with_value))
    frame = auto.df([[0]], "a:int")
    for _ in range(2):
        frame.show()
    auto_id = auto.spec_uuid()

    explicit = FugueWorkflow(NativeExecutionEngine())
    frame = explicit.df([[0]], "a:int").persist("abc")
    for _ in range(2):
        frame.show()
    explicit_id = explicit.spec_uuid()

    assert auto_id == explicit_id

    # --- Scenario 3: dataframe consumed only once ---------------------------
    plain = FugueWorkflow(NativeExecutionEngine())
    frame = plain.df([[0]], "a:int")
    frame.show()
    plain_id = plain.spec_uuid()

    auto = FugueWorkflow(NativeExecutionEngine(auto_conf))
    frame = auto.df([[0]], "a:int")
    frame.show()  # auto persist will not trigger
    auto_id = auto.spec_uuid()

    explicit = FugueWorkflow(NativeExecutionEngine())
    frame = explicit.df([[0]], "a:int").persist()
    frame.show()
    explicit_id = explicit.spec_uuid()

    # With a single consumer the auto-persist dag is expected to equal the
    # plain dag, while the explicit persist() still differs.
    assert plain_id == auto_id
    assert auto_id != explicit_id
def test_workflow_determinism_6():
    """An extra, unconsumed node changes the workflow id but not dataframe ids.

    ``dag1`` holds one additional ``create_data`` node that nothing depends
    on. ``dag2`` holds only the three nodes under comparison. The
    corresponding dataframes are expected to share spec uuids across the two
    workflows, while the workflows themselves are expected to differ.
    """
    dag1 = FugueWorkflow()
    # Extra node with no consumers: the only difference between the two dags.
    dag1.create_data([[0], [0], [1]], "a:int32")
    a1 = dag1.create_data([[0], [0], [1]], "a:int32")
    b1 = dag1.create_data([[1], [0], [1]], "a:int32")
    c1 = a1.union(b1)

    dag2 = FugueWorkflow()
    # Build the second graph on dag2 itself. Creating these nodes on dag1
    # would leave dag2 empty and make the final inequality trivially true.
    a2 = dag2.create_data([[0], [0], [1]], "a:int32")
    b2 = dag2.create_data([[1], [0], [1]], "a:int32")
    c2 = a2.union(b2)

    assert a1.spec_uuid() == a2.spec_uuid()
    assert b1.spec_uuid() == b2.spec_uuid()
    assert c1.spec_uuid() == c2.spec_uuid()
    assert dag1.spec_uuid() != dag2.spec_uuid()
def test_workflow_determinism_3():
    """Different input data with ``data_determiner=to_uuid`` changes every id.

    Both workflows run the same steps, but ``create_data`` receives different
    rows and is told to use ``to_uuid`` as the data determiner. The test
    expects the source dataframe, the transformed dataframe and the workflow
    to all have different spec uuids.
    """
    schema = "a:int32"

    first_rows = [[0], [0], [1]]
    first_dag = FugueWorkflow()
    first_src = first_dag.create_data(first_rows, schema, data_determiner=to_uuid)
    first_out = first_src.transform(
        mock_tf1, "*,b:int", pre_partition={"by": ["a"], "num": 2}
    )
    first_src.show()

    second_rows = [[1], [10], [20]]
    second_dag = FugueWorkflow()
    # <--- same steps as above, only the rows differ
    second_src = second_dag.create_data(
        second_rows, schema, data_determiner=to_uuid
    )
    second_out = second_src.transform(
        mock_tf1, "*,b:int", pre_partition={"by": ["a"], "num": 2}
    )
    second_src.show()

    pairs = (
        (first_src, second_src),
        (first_out, second_out),
        (first_dag, second_dag),
    )
    for left, right in pairs:
        assert left.spec_uuid() != right.spec_uuid()
def assert_eq(expr, expected: FugueWorkflow):
    """Assert that Fugue SQL ``expr`` builds a workflow equal to ``expected``.

    ``expr`` is parsed with ``FugueSQL``, visited by ``_Extensions`` against a
    fresh ``FugueWorkflow``, and the resulting workflow's spec uuid is
    compared with ``expected.spec_uuid()``.

    :param expr: Fugue SQL text, parsed with the ``fugueLanguage`` rule
    :param expected: workflow whose spec uuid the compiled workflow must match
    :raises AssertionError: if the two spec uuids differ
    """
    # Keep this as a direct call in this function: the helper presumably
    # resolves variables relative to the call stack -- TODO confirm.
    global_vars, local_vars = get_caller_global_local_vars()

    parsed = FugueSQL(expr, "fugueLanguage", ignore_case=True, simple_assign=True)
    workflow = FugueWorkflow()
    visitor = _Extensions(
        parsed,
        FugueSQLHooks(),
        workflow,
        global_vars=global_vars,
        local_vars=local_vars,
    )
    # Only the visitor's workflow is inspected; the visit result is not needed.
    visitor.visit(parsed.tree)

    assert expected.spec_uuid() == visitor.workflow.spec_uuid()
def test_workflow_determinism_7():
    """``out_transform`` affects the workflow id but not the source dataframe id.

    The first two workflows are identical (``create_data``, ``out_transform``,
    ``show``) and are expected to match completely. The third one omits
    ``out_transform``: its dataframe is expected to keep the same spec uuid
    while the workflow spec uuid is expected to differ.
    """
    rows_schema = "a:int32"

    with_out_1 = FugueWorkflow()
    src_1 = with_out_1.create_data([[0], [0], [1]], rows_schema)
    src_1.out_transform(mock_tf1)
    src_1.show()

    with_out_2 = FugueWorkflow()
    src_2 = with_out_2.create_data([[0], [0], [1]], rows_schema)
    src_2.out_transform(mock_tf1)
    src_2.show()

    # Same source and ``show``, but no ``out_transform`` step.
    without_out = FugueWorkflow()
    src_3 = without_out.create_data([[0], [0], [1]], rows_schema)
    src_3.show()

    # Identical workflows: everything matches.
    assert src_1.spec_uuid() == src_2.spec_uuid()
    assert with_out_1.spec_uuid() == with_out_2.spec_uuid()

    # Missing ``out_transform``: dataframe matches, workflow does not.
    assert src_1.spec_uuid() == src_3.spec_uuid()
    assert with_out_1.spec_uuid() != without_out.spec_uuid()
def test_workflow_determinism_8():
    """``select`` arguments affect the workflow id but not the source dataframe id.

    Four workflows share the same ``create_data`` and ``show`` steps and differ
    only in the ``select`` call made on the source dataframe (whose result is
    discarded):

    * dag 1 and dag 2: ``select("a", "b")`` -- expected to be fully equal
    * dag 3: ``select("b", "a")`` -- workflow id expected to differ
    * dag 4: ``select("a", "b", distinct=True)`` -- workflow id expected to differ
    """
    schema = "a:int32"

    base_dag = FugueWorkflow()
    base_src = base_dag.create_data([[0], [0], [1]], schema)
    base_src.select("a", "b")
    base_src.show()

    same_dag = FugueWorkflow()
    same_src = same_dag.create_data([[0], [0], [1]], schema)
    same_src.select("a", "b")
    same_src.show()

    reordered_dag = FugueWorkflow()
    reordered_src = reordered_dag.create_data([[0], [0], [1]], schema)
    reordered_src.select("b", "a")
    reordered_src.show()

    distinct_dag = FugueWorkflow()
    distinct_src = distinct_dag.create_data([[0], [0], [1]], schema)
    distinct_src.select("a", "b", distinct=True)
    distinct_src.show()

    # Identical select: both the dataframe and the workflow match.
    assert base_src.spec_uuid() == same_src.spec_uuid()
    assert base_dag.spec_uuid() == same_dag.spec_uuid()

    # Changed select: the source dataframe matches, the workflow does not.
    for other_src, other_dag in (
        (reordered_src, reordered_dag),
        (distinct_src, distinct_dag),
    ):
        assert base_src.spec_uuid() == other_src.spec_uuid()
        assert base_dag.spec_uuid() != other_dag.spec_uuid()
def test_workflow_determinism():
    """Check spec uuids for equivalent and non-equivalent workflow definitions.

    ``builder1`` and ``builder2`` describe the same workflow with different
    spellings (schema string vs ``Schema`` object, ``num=2`` vs ``num="2"``,
    different keyword order) and are expected to produce equal spec uuids.
    ``builder3`` is then built twice, with a different output schema and
    with a different partition number. In both cases the source dataframe
    is expected to match, while the transformed dataframe and the workflow
    are expected to differ.
    """
    # TODO: need more thorough test, separate this to small ones and remove it
    builder1 = FugueWorkflow()
    a1 = builder1.create_data([[0], [0], [1]], "a:int32")
    b1 = a1.transform("mock_tf1", "*,b:int", pre_partition=dict(by=["a"], num=2))
    a1.show()

    builder2 = FugueWorkflow()
    a2 = builder2.create_data([[0], [0], [1]], Schema("a:int"))
    b2 = a2.transform("mock_tf1", "*,b:int", pre_partition=dict(num="2", by=["a"]))
    a2.show()

    # Calling spec_uuid twice on the same workflow must give the same value.
    assert builder1.spec_uuid() == builder1.spec_uuid()
    assert a1.spec_uuid() == a2.spec_uuid()
    assert b1.spec_uuid() == b2.spec_uuid()
    assert builder1.spec_uuid() == builder2.spec_uuid()

    # Different transformer output schema. The nodes are built on builder3
    # itself so the workflow comparison below is not against an empty dag.
    builder3 = FugueWorkflow()
    a3 = builder3.create_data([[0], [0], [1]], Schema("a:int"))
    b3 = a3.transform("mock_tf1", "*,b:str", pre_partition=dict(num="2", by=["a"]))
    a3.show()

    assert a1.spec_uuid() == a3.spec_uuid()
    assert b1.spec_uuid() != b3.spec_uuid()
    assert builder1.spec_uuid() != builder3.spec_uuid()

    # Different partition number, again built on a fresh builder3.
    builder3 = FugueWorkflow()
    a3 = builder3.create_data([[0], [0], [1]], Schema("a:int"))
    b3 = a3.transform("mock_tf1", "*,b:int", pre_partition=dict(num="200", by=["a"]))
    a3.show()

    assert a1.spec_uuid() == a3.spec_uuid()
    assert b1.spec_uuid() != b3.spec_uuid()
    assert builder1.spec_uuid() != builder3.spec_uuid()
def test_yield():
    """Yielding a dataframe is expected to leave the workflow spec uuid unchanged.

    A baseline workflow (``df`` + ``show``) is compared with three variants
    that additionally yield the dataframe: ``yield_file_as`` directly, and
    ``yield_file_as`` / ``yield_dataframe_as`` after
    ``deterministic_checkpoint``.
    """
    # Baseline: no yield at all.
    baseline = FugueWorkflow()
    baseline.df([[0]], "a:int32").show()
    baseline_id = baseline.spec_uuid()

    # Variant 1: yield as file.
    frame = FugueWorkflow().df([[0]], "a:int32")
    frame.yield_file_as("x")
    frame.show()
    file_id = frame.workflow.spec_uuid()

    # Variant 2: deterministic checkpoint, then yield as file.
    frame = FugueWorkflow().df([[0]], "a:int32")
    checkpointed = frame.deterministic_checkpoint()
    checkpointed.yield_file_as("y")
    frame.show()
    checkpoint_file_id = frame.workflow.spec_uuid()

    # Variant 3: deterministic checkpoint, then yield as dataframe.
    frame = FugueWorkflow().df([[0]], "a:int32")
    checkpointed = frame.deterministic_checkpoint()
    checkpointed.yield_dataframe_as("z")
    frame.show()
    checkpoint_df_id = frame.workflow.spec_uuid()

    # yield doesn't change determinism
    for variant_id in (file_id, checkpoint_file_id, checkpoint_df_id):
        assert baseline_id == variant_id