Exemple #1
0
def test_workflow_determinism_5():
    """Changing only the ``show`` arguments alters the workflow id alone.

    Two workflows are built identically except that the second one calls
    ``show(rows=22)``. The created and transformed dataframes must keep the
    same spec ids, while the workflow spec ids must differ.
    """

    def build(**show_args):
        # Returns (workflow, created dataframe, transformed dataframe).
        workflow = FugueWorkflow()
        source = workflow.create_data([[0], [0], [1]], "a:int32")
        transformed = source.transform(
            mock_tf1, "*,b:int", pre_partition=dict(by=["a"], num=2)
        )
        source.show(**show_args)
        return workflow, source, transformed

    dag1, a1, b1 = build()
    dag2, a2, b2 = build(rows=22)  # <--- the only difference

    for left, right in ((a1, a2), (b1, b2)):
        assert left.spec_uuid() == right.spec_uuid()
    assert dag1.spec_uuid() != dag2.spec_uuid()
def test_auto_persist():
    """Checkpoints, including auto persist, do not change workflow spec ids.

    Each scenario builds three workflows over the same single-row dataframe:
    a plain one, one whose engine enables ``fugue.workflow.auto_persist``, and
    one with an explicit ``weak_checkpoint``. The workflow spec ids are then
    compared.
    """

    def spec_id(engine, show_count, prepare=None):
        # Build a workflow on ``engine``, optionally post-process the
        # dataframe, call ``show`` ``show_count`` times, and return the
        # workflow spec id.
        workflow = FugueWorkflow(engine)
        frame = workflow.df([[0]], "a:int")
        if prepare is not None:
            frame = prepare(frame)
        for _ in range(show_count):
            frame.show()
        return workflow.spec_uuid()

    # Scenario 1: dataframe shown twice, default auto persist value.
    plain = spec_id(NativeExecutionEngine(), 2)
    auto = spec_id(
        NativeExecutionEngine({"fugue.workflow.auto_persist": True}), 2
    )
    explicit = spec_id(
        NativeExecutionEngine(), 2, lambda f: f.weak_checkpoint(level=None)
    )

    assert plain == auto
    assert auto == explicit

    # Scenario 2: dataframe shown twice, custom auto persist value.
    auto = spec_id(
        NativeExecutionEngine(
            {
                "fugue.workflow.auto_persist": True,
                "fugue.workflow.auto_persist_value": "abc",
            }
        ),
        2,
    )
    explicit = spec_id(
        NativeExecutionEngine(), 2, lambda f: f.weak_checkpoint(level="abc")
    )

    assert auto == explicit

    # Scenario 3: dataframe shown once, so auto persist will not trigger.
    plain = spec_id(NativeExecutionEngine(), 1)
    auto = spec_id(
        NativeExecutionEngine({"fugue.workflow.auto_persist": True}), 1
    )
    explicit = spec_id(
        NativeExecutionEngine(), 1, lambda f: f.weak_checkpoint(level=None)
    )

    assert plain == auto
    # checkpoint, including auto_persist doesn't change determinism
    assert auto == explicit
Exemple #3
0
def test_auto_persist():
    """Compare workflow spec ids for plain, auto-persist and ``persist`` runs.

    Three groups of workflows are built over the same single-row dataframe.
    With two ``show`` calls, the auto-persist workflow is asserted to differ
    from the plain one and to match an explicit ``persist`` (also with the
    custom value ``"abc"``). With a single ``show`` call, the auto-persist
    workflow is asserted to match the plain one and to differ from the
    explicit ``persist``.
    """

    def workflow_id(conf, times, tweak=None):
        # ``conf`` is None for a default engine, otherwise the engine config.
        if conf is None:
            engine = NativeExecutionEngine()
        else:
            engine = NativeExecutionEngine(conf)
        dag = FugueWorkflow(engine)
        frame = dag.df([[0]], "a:int")
        if tweak is not None:
            frame = tweak(frame)
        for _ in range(times):
            frame.show()
        return dag.spec_uuid()

    auto_on = {"fugue.workflow.auto_persist": True}

    # Two shows, default persist value.
    id1 = workflow_id(None, 2)
    id2 = workflow_id(auto_on, 2)
    id3 = workflow_id(None, 2, lambda f: f.persist())

    assert id1 != id2
    assert id2 == id3

    # Two shows, custom persist value.
    id2 = workflow_id(
        {
            "fugue.workflow.auto_persist": True,
            "fugue.workflow.auto_persist_value": "abc",
        },
        2,
    )
    id3 = workflow_id(None, 2, lambda f: f.persist("abc"))

    assert id2 == id3

    # One show: auto persist will not trigger.
    id1 = workflow_id(None, 1)
    id2 = workflow_id(auto_on, 1)
    id3 = workflow_id(None, 1, lambda f: f.persist())

    assert id1 == id2
    assert id2 != id3
Exemple #4
0
def test_workflow_determinism_6():
    """An extra, unused ``create_data`` node changes only the workflow id.

    ``dag1`` contains one more ``create_data`` call than ``dag2``. The
    matching dataframes and their unions must have equal spec ids, while the
    two workflow spec ids must differ.
    """
    dag1 = FugueWorkflow()
    dag1.create_data([[0], [0], [1]], "a:int32")  # extra node, absent in dag2
    a1 = dag1.create_data([[0], [0], [1]], "a:int32")
    b1 = dag1.create_data([[1], [0], [1]], "a:int32")
    c1 = a1.union(b1)

    # Build the second graph on dag2 itself; previously these nodes were
    # added to dag1, leaving dag2 empty and the final assertion trivial.
    dag2 = FugueWorkflow()
    a2 = dag2.create_data([[0], [0], [1]], "a:int32")
    b2 = dag2.create_data([[1], [0], [1]], "a:int32")
    c2 = a2.union(b2)

    assert a1.spec_uuid() == a2.spec_uuid()
    assert b1.spec_uuid() == b2.spec_uuid()
    assert c1.spec_uuid() == c2.spec_uuid()
    assert dag1.spec_uuid() != dag2.spec_uuid()
Exemple #5
0
def test_workflow_determinism_3():
    """With ``data_determiner=to_uuid``, different data gives different ids.

    Two workflows are built identically except for the rows passed to
    ``create_data``. The created dataframes, the transformed dataframes and
    the workflows are all asserted to have different spec ids.
    """

    def build(rows):
        # Returns (workflow, created dataframe, transformed dataframe).
        workflow = FugueWorkflow()
        source = workflow.create_data(rows, "a:int32", data_determiner=to_uuid)
        transformed = source.transform(
            mock_tf1, "*,b:int", pre_partition=dict(by=["a"], num=2)
        )
        source.show()
        return workflow, source, transformed

    first = build([[0], [0], [1]])
    second = build([[1], [10], [20]])  # <--- only the data differs

    # Order within each tuple: workflow, created dataframe, transformed one.
    dag1, a1, b1 = first
    dag2, a2, b2 = second

    assert a1.spec_uuid() != a2.spec_uuid()
    assert b1.spec_uuid() != b2.spec_uuid()
    assert dag1.spec_uuid() != dag2.spec_uuid()
Exemple #6
0
def assert_eq(expr, expected: FugueWorkflow):
    """Assert that ``expr`` builds a workflow with the same spec id as ``expected``.

    ``expr`` is parsed with ``FugueSQL`` using the ``"fugueLanguage"`` rule
    (case-insensitive, simple assignment enabled) and visited by
    ``_Extensions`` against a fresh ``FugueWorkflow``. The resulting workflow's
    spec id is then compared to that of ``expected``.

    :param expr: the Fugue SQL text to parse
    :param expected: the workflow whose spec id the parsed one must match
    :raises AssertionError: if the two spec ids differ
    """
    # Must be called directly in this function: the helper inspects the call
    # stack (presumably to expose the caller's variables to the SQL -- verify
    # against its definition).
    global_vars, local_vars = get_caller_global_local_vars()
    sql = FugueSQL(expr, "fugueLanguage", ignore_case=True, simple_assign=True)
    wf = FugueWorkflow()
    v = _Extensions(
        sql, FugueSQLHooks(), wf, global_vars=global_vars, local_vars=local_vars
    )
    # The visit is run for its effect on the visitor's workflow; its return
    # value is not needed.
    v.visit(sql.tree)
    assert expected.spec_uuid() == v.workflow.spec_uuid()
Exemple #7
0
def test_workflow_determinism_7():
    """``out_transform`` affects the workflow id but not the dataframe id.

    Two workflows that both call ``out_transform`` must match each other. A
    third workflow without it must keep the same dataframe spec id but have a
    different workflow spec id.
    """

    def build(with_out_transform):
        # Returns (workflow, created dataframe).
        workflow = FugueWorkflow()
        source = workflow.create_data([[0], [0], [1]], "a:int32")
        if with_out_transform:
            source.out_transform(mock_tf1)
        source.show()
        return workflow, source

    dag1, a1 = build(True)
    dag2, a2 = build(True)
    dag3, a3 = build(False)

    # Identical construction gives identical ids.
    assert a1.spec_uuid() == a2.spec_uuid()
    assert dag1.spec_uuid() == dag2.spec_uuid()

    # Dropping out_transform changes only the workflow id.
    assert a1.spec_uuid() == a3.spec_uuid()
    assert dag1.spec_uuid() != dag3.spec_uuid()
Exemple #8
0
def test_workflow_determinism_8():
    """``select`` arguments affect the workflow id but not the source id.

    Four workflows are built that differ only in the ``select`` call made on
    the created dataframe: identical arguments, reordered columns, and
    ``distinct=True``. The created dataframe keeps the same spec id in all of
    them; the workflow spec id matches only for identical arguments.
    """

    def build(*columns, **options):
        # Returns (workflow, created dataframe); the select result is unused.
        workflow = FugueWorkflow()
        source = workflow.create_data([[0], [0], [1]], "a:int32")
        source.select(*columns, **options)
        source.show()
        return workflow, source

    dag1, a1 = build("a", "b")
    dag2, a2 = build("a", "b")
    dag3, a3 = build("b", "a")
    dag4, a4 = build("a", "b", distinct=True)

    # Same select arguments: everything matches.
    assert a1.spec_uuid() == a2.spec_uuid()
    assert dag1.spec_uuid() == dag2.spec_uuid()

    # Different select arguments: only the workflow id changes.
    for other_dag, other_df in ((dag3, a3), (dag4, a4)):
        assert a1.spec_uuid() == other_df.spec_uuid()
        assert dag1.spec_uuid() != other_dag.spec_uuid()
Exemple #9
0
def test_workflow_determinism():
    """Check spec ids across equivalent and non-equivalent workflows.

    ``builder1`` and ``builder2`` express the same workflow with different
    spellings (schema string vs ``Schema`` object, partition ``num`` as int vs
    str, different keyword order) and must produce equal spec ids. The two
    later workflows change the transform output schema and the partition
    number respectively, and must produce different transform and workflow
    spec ids while the created dataframe's id is unchanged.
    """
    # TODO: need more thorough test, separate this to small ones and remove it
    builder1 = FugueWorkflow()
    a1 = builder1.create_data([[0], [0], [1]], "a:int32")
    b1 = a1.transform("mock_tf1",
                      "*,b:int",
                      pre_partition=dict(by=["a"], num=2))
    a1.show()

    builder2 = FugueWorkflow()
    a2 = builder2.create_data([[0], [0], [1]], Schema("a:int"))
    b2 = a2.transform("mock_tf1",
                      "*,b:int",
                      pre_partition=dict(num="2", by=["a"]))
    a2.show()

    # Asking the same workflow twice must give the same id.
    assert builder1.spec_uuid() == builder1.spec_uuid()
    assert a1.spec_uuid() == a2.spec_uuid()
    assert b1.spec_uuid() == b2.spec_uuid()
    assert builder1.spec_uuid() == builder2.spec_uuid()

    # Different transform output schema. The nodes are built on builder3
    # itself; previously they were added to builder2, which left builder3
    # empty and made the workflow comparison trivial.
    builder3 = FugueWorkflow()
    a3 = builder3.create_data([[0], [0], [1]], Schema("a:int"))
    b3 = a3.transform("mock_tf1",
                      "*,b:str",
                      pre_partition=dict(num="2", by=["a"]))
    a3.show()

    assert a1.spec_uuid() == a3.spec_uuid()
    assert b1.spec_uuid() != b3.spec_uuid()
    assert builder1.spec_uuid() != builder3.spec_uuid()

    # Different partition number, again built on its own fresh workflow.
    builder4 = FugueWorkflow()
    a4 = builder4.create_data([[0], [0], [1]], Schema("a:int"))
    b4 = a4.transform("mock_tf1",
                      "*,b:int",
                      pre_partition=dict(num="200", by=["a"]))
    a4.show()

    assert a1.spec_uuid() == a4.spec_uuid()
    assert b1.spec_uuid() != b4.spec_uuid()
    assert builder1.spec_uuid() != builder4.spec_uuid()
Exemple #10
0
def test_yield():
    """Yielding a dataframe does not change the workflow spec id.

    A baseline workflow that only shows a single-row dataframe is compared
    against three workflows that additionally yield it: as a file, as a file
    after a deterministic checkpoint, and as a dataframe after a deterministic
    checkpoint.
    """

    def yielded_id(apply_yield):
        # Build a fresh workflow, apply the yield variant, show the original
        # dataframe and return the workflow spec id.
        frame = FugueWorkflow().df([[0]], "a:int32")
        apply_yield(frame)
        frame.show()
        return frame.workflow.spec_uuid()

    baseline = FugueWorkflow()
    baseline.df([[0]], "a:int32").show()
    id0 = baseline.spec_uuid()

    variants = (
        lambda f: f.yield_file_as("x"),
        lambda f: f.deterministic_checkpoint().yield_file_as("y"),
        lambda f: f.deterministic_checkpoint().yield_dataframe_as("z"),
    )
    yielded_ids = [yielded_id(variant) for variant in variants]

    # yield doesn't change determinism
    for current in yielded_ids:
        assert id0 == current