Esempio n. 1
0
def test_compile_conf():
    """Run workflows whose creator asserts on ``engine.compile_conf``.

    A workflow configured with the plain key ``"a"`` is run without an engine
    argument and is then expected to raise ``KeyError`` when run on an
    explicit ``NativeExecutionEngine``. A workflow configured with
    ``FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE`` must run on every engine used below.
    """

    def assert_conf(e: ExecutionEngine, **kwargs) -> pd.DataFrame:
        # Each creator param must equal the engine's compile conf entry.
        for key, expected in kwargs.items():
            assert e.compile_conf[key] == expected
        return pd.DataFrame([[0]], columns=["a"])

    # --- plain key "a" ---
    dag = FugueWorkflow(conf={"a": 1})
    dag.create(assert_conf, params={"a": 1})
    dag.run()

    # "a" is not carried over to a newly supplied engine, so the lookup
    # inside assert_conf is expected to fail.
    with raises(KeyError):
        dag.run(NativeExecutionEngine())

    # --- FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE ---
    dag = FugueWorkflow(conf={FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "abc"})
    dag.create(assert_conf, params={FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "abc"})
    dag.run()

    # This entry is kept when a new engine is supplied.
    # NOTE(review): the original comment called it a "non-compile time param";
    # presumably it is a compile-time key -- confirm against the constant.
    dag.run(NativeExecutionEngine())

    # A new engine cannot change the value: the engine's own compile conf
    # entry ("def") will be overwritten.
    conflicting_engine = NativeExecutionEngine(
        {FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "def"}
    )
    dag.run(conflicting_engine)
def test_save_and_use():
    """Chain ``save_and_use`` calls with different formats and modes.

    The resulting workflow is handed to ``assert_eq`` (defined outside this
    view) together with the Fugue SQL script listing the same steps.
    """
    script = """
    a=create using mock_create1(n=1)
    b=create using mock_create1(n=1)
    a=save and use a overwrite parquet "xx"
    save and use b append "xx"
    save and use b to "xx"
    save and use a to single csv "xx.csv"(header=True)
    save and use prepartition by x overwrite "xx"
    save and use (create using mock_create1(n=2)) overwrite "xx"
    """

    dag = FugueWorkflow()
    first = dag.create(mock_create1, params={"n": 1})
    second = dag.create(mock_create1, params={"n": 1})

    # ``first`` is rebound after each save so later steps use the saved frame.
    first = first.save_and_use("xx", fmt="parquet", mode="overwrite")
    second.save_and_use("xx", mode="append")
    second.save_and_use("xx", mode="error")
    first = first.save_and_use(
        "xx.csv", fmt="csv", mode="error", single=True, header=True
    )

    partitioned = first.partition(by=["x"])
    partitioned.save_and_use("xx", mode="overwrite")

    inline = dag.create(mock_create1, params={"n": 2})
    inline.save_and_use("xx", mode="overwrite")

    assert_eq(script, dag)
Esempio n. 3
0
def test_zip():
    """Zip two created dataframes, first with defaults and then with options.

    Each workflow is passed to ``assert_eq`` (defined outside this view) with
    the corresponding Fugue SQL script.
    """
    # Plain zip with no extra arguments.
    plain_script = """
    a=create using mock_create1 params n:1
    zip a,(create using mock_create1 params n:2)
    """
    dag = FugueWorkflow()
    left = dag.create(mock_create1, params={"n": 1})
    right = dag.create(mock_create1, params={"n": 2})
    left.zip(right)
    assert_eq(plain_script, dag)

    # Zip with an explicit join type and a partition spec.
    options_script = """
    a=create using mock_create1 params n:1
    zip a,(create using mock_create1 params n:2) left
        outer by a presort b desc
    """
    dag = FugueWorkflow()
    left = dag.create(mock_create1, params={"n": 1})
    right = dag.create(mock_create1, params={"n": 2})
    partition_spec = {"by": ["a"], "presort": "b DESC"}
    left.zip(right, how="left_outer", partition=partition_spec)
    assert_eq(options_script, dag)
Esempio n. 4
0
def test_create():
    """Create two dataframes, one of them with an explicit schema.

    The workflow is passed to ``assert_eq`` (defined outside this view) with
    the Fugue SQL script that uses both parameter syntaxes.
    """
    script = """
    a=create using mock_create1 params n:1
    b=create using mock_create2(n=1) schema a:int
    """
    dag = FugueWorkflow()
    dag.create(mock_create1, params={"n": 1})
    dag.create(mock_create2, schema="a:int", params={"n": 1})
    assert_eq(script, dag)
Esempio n. 5
0
def test_print():
    """Show one dataframe directly, then two at once with display options.

    The title contains a double quote, which the script side writes escaped.
    The workflow and script go to ``assert_eq`` (defined outside this view).
    """
    dag = FugueWorkflow()

    first = dag.create(mock_create1, params={"n": 1})
    first.show()

    second = dag.create(mock_create1, params={"n": 2})
    dag.show(first, second, rows=5, show_count=True, title='"b   B')

    script = """
    a=create using mock_create1(n=1)
    print
    print 5 rows from a, (create using mock_create1(n=2)) rowcount title "\\"b   B"
    """
    assert_eq(script, dag)
Esempio n. 6
0
def test_output():
    """Send dataframes to ``mock_output``, once partitioned and once as a pair.

    The workflow is passed to ``assert_eq`` (defined outside this view) with
    the corresponding Fugue SQL script.
    """
    dag = FugueWorkflow()

    # Single dataframe, partitioned before the output step.
    first = dag.create(mock_create1, params={"n": 1})
    partitioned = first.partition(num=4)
    partitioned.output(mock_output)

    # Two dataframes into one output step, with params.
    second = dag.create(mock_create1, params={"n": 2})
    dag.output(first, second, using=mock_output, params={"n": 3})

    script = """
    a=create using mock_create1(n=1)
    output prepartition 4 using mock_output
    output a, (create using mock_create1(n=2)) using mock_output(n=3)
    """
    assert_eq(script, dag)
Esempio n. 7
0
def test_general_set_op():
    """Build selects that combine dataframes with UNION / INTERSECT.

    Operands are a mix of named dataframes, freshly created ones and the
    results of process / transform steps. The workflow is passed to
    ``assert_eq`` (defined outside this view) with the Fugue SQL script.
    """
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params={"n": 1})
    b = dag.create(mock_create1, params={"n": 2})

    # Two named dataframes.
    dag.select("select * from", a, "AS a union all select * from", b, "AS b")

    # A freshly created dataframe combined with a named one.
    created = dag.create(mock_create1)
    dag.select("SELECT * FROM", created, "union select * from", b, "AS b")

    # A freshly created dataframe combined with a processed one.
    created = dag.create(mock_create1)
    processed = a.process(mock_processor1)
    dag.select(
        "SELECT * FROM", created, "intersect distinct SELECT * FROM", processed
    )

    created = dag.create(mock_create1)
    processed = a.process(mock_processor1)
    dag.select("select * from", created, "union SELECT * FROM", processed)

    # Both operands derive from the same dataframe ``c``.
    c = dag.create(mock_create1, params={"n": 2})
    transformed = c.transform(mock_transformer2)
    processed = c.process(mock_processor1)
    dag.select("SELECT * FROM", transformed, "union SELECT * FROM", processed)

    script = """
    a=create using mock_create1(n=1)
    b=create using mock_create1(n=2)

    select * from a union all select * from b
    create using mock_create1 union select * from b
    create using mock_create1 intersect distinct process a using mock_processor1
    select * from (create using mock_create1) union process a using mock_processor1

    # operation on omitted dependencies should work as expected
    c=create using mock_create1(n=2)
    transform using mock_transformer2 union process using mock_processor1
    """
    assert_eq(script, dag)
Esempio n. 8
0
def test_select_nested():
    """Build selects that contain sub-queries, aliases and TABLESAMPLE.

    The workflow is passed to ``assert_eq`` (defined outside this view) with
    the corresponding Fugue SQL script.
    """
    script = """
    a=create using mock_create1(n=1)
    b=create using mock_create1(n=2)

    # nested query
    select * from (select * from a.b)
    select * from (create using mock_create1) AS bb
    select * from (create using mock_create1) TABLESAMPLE(5 PERCENT)
    select * from (select * from (create using mock_create1))
    """
    dag = FugueWorkflow()

    # These two frames are only created; no select below references them.
    dag.create(mock_create1, params={"n": 1})
    dag.create(mock_create1, params={"n": 2})

    dag.select("select * from (select * from a.b)")

    aliased_source = dag.create(mock_create1)
    dag.select("select * from", aliased_source, "AS bb")

    sampled_source = dag.create(mock_create1)
    dag.select("select * from", sampled_source, "TABLESAMPLE (5 PERCENT)")

    nested_source = dag.create(mock_create1)
    dag.select("select * from (select * from", nested_source, ")")

    assert_eq(script, dag)
Esempio n. 9
0
def test_sample():
    """Sample a dataframe by fraction and by row count.

    The workflow is passed to ``assert_eq`` (defined outside this view) with
    the corresponding Fugue SQL script.
    """
    script = """
    a=create using mock_create1
    sample 10 percent
    sample replace 5 rows seed 7 from a
    """
    dag = FugueWorkflow()
    source = dag.create(mock_create1)
    # Fraction-based, without replacement and without a seed.
    source.sample(frac=0.1, replace=False, seed=None)
    # Row-count-based, with replacement and a fixed seed.
    source.sample(n=5, replace=True, seed=7)
    assert_eq(script, dag)
Esempio n. 10
0
def test_drop():
    """Drop columns and drop rows with nulls, each in several variants.

    The workflow is passed to ``assert_eq`` (defined outside this view) with
    the corresponding Fugue SQL script.
    """
    dag = FugueWorkflow()

    # Column drops, with and without ``if_exists``.
    column_source = dag.create(mock_create1)
    columns = ["a", "b"]
    column_source.drop(columns)
    column_source.drop(["a", "b"], if_exists=True)

    # Row drops on nulls: any, all, and any within a column subset.
    row_source = dag.create(mock_create1)
    row_source.dropna(how="any")
    row_source.dropna(how="all")
    row_source.dropna(how="any", subset=["a", "c"])

    script = """
    a=create using mock_create1
    drop columns a,b
    drop columns a,b if exists from a

    d=create using mock_create1
    drop rows if any null
    drop rows if all null from d
    drop rows if any nulls on a,c from d
    """
    assert_eq(script, dag)
Esempio n. 11
0
def test_alter_columns():
    """Alter column types of one dataframe twice with different schemas.

    The workflow is passed to ``assert_eq`` (defined outside this view) with
    the corresponding Fugue SQL script.
    """
    script = """
    a=create using mock_create1
    alter columns a:str, b:str
    alter columns a:float, b:double from a
    """
    dag = FugueWorkflow()
    source = dag.create(mock_create1)
    # Applied in this order so the steps line up with the script.
    for schema_expr in ("a:str,b:str", "a:float,b:double"):
        source.alter_columns(Schema(schema_expr))
    assert_eq(script, dag)
Esempio n. 12
0
def test_rename():
    """Rename columns of one dataframe twice with different mappings.

    The workflow is passed to ``assert_eq`` (defined outside this view) with
    the corresponding Fugue SQL script.
    """
    script = """
    a=create using mock_create1
    rename columns a:aa,b:bb
    rename columns a:aaa,b:bbb from a
    """
    dag = FugueWorkflow()
    source = dag.create(mock_create1)
    first_mapping = {"a": "aa", "b": "bb"}
    second_mapping = {"a": "aaa", "b": "bbb"}
    source.rename(first_mapping)
    source.rename(second_mapping)
    assert_eq(script, dag)
Esempio n. 13
0
def test_cotransform():
    """Zip two dataframes and run a cotransformer on the partitioned result.

    The workflow is passed to ``assert_eq`` (defined outside this view) with
    the corresponding Fugue SQL script.
    """
    script = """
    zip
        (create using mock_create1 params n:1),
        (create using mock_create1 params n:2)
    transform prepartition 3 using mock_cotransformer1(n=3)
    """
    dag = FugueWorkflow()
    left = dag.create(mock_create1, params={"n": 1})
    right = dag.create(mock_create1, params={"n": 2})
    zipped = dag.zip(left, right)
    partitioned = zipped.partition(num=3)
    partitioned.transform(mock_cotransformer1, params={"n": 3})
    assert_eq(script, dag)
Esempio n. 14
0
def test_select():
    """Build a series of SELECT statements in several forms.

    Covers raw table names, dataframe objects with aliases and sampling,
    selects chained on a previous result, a join, and persist / checkpoint /
    broadcast. The workflow is passed to ``assert_eq`` (defined outside this
    view) with the Fugue SQL script.
    """
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params={"n": 1})
    b = dag.create(mock_create1, params={"n": 2})

    # Raw SQL text only; ``a.b`` is not a dataframe object in this workflow.
    dag.select("select * from a.b")
    dag.select("select * from a.b TABLESAMPLE (5 PERCENT) AS x")
    dag.select("select * from a.b AS x")

    # Dataframe object as the source; fugue sql adds 'AS a'.
    dag.select("select * from", a, "AS a")
    dag.select("select * from", a, "TABLESAMPLE (5 PERCENT) AS a")
    sampled = dag.select("select * from", a, "TABLESAMPLE (5 PERCENT) AS x")

    # Each select reads the previous result; the script omits FROM for these.
    chained = dag.select("select * FROM", sampled)
    dag.select("select * FROM", chained, "where t = 100")

    # Two dataframe dependencies in one statement.
    dag.select("select a.* from", a, "AS a join", b, "AS b on a.a == b.a")

    # Persist / weak checkpoint, then broadcast and show.
    persisted = dag.select("select * from", a, "AS a").persist()
    persisted.broadcast().show()
    checkpointed = dag.select("select * from", a, "AS a").weak_checkpoint(
        level="a.b.c"
    )
    checkpointed.broadcast().show()

    script = """
    a=create using mock_create1(n=1)
    b=create using mock_create1(n=2)

    # assignment and table not found
    x=select * from a.b

    # sample and alias when table not found
    select * from a.b TABLESAMPLE (5 PERCENT) AS x
    select * from a.b AS x

    # when table is found
    select * from a
    select * from a TABLESAMPLE(5 PERCENT)
    select * from a TABLESAMPLE(5 PERCENT) AS x

    # no from
    select *
    select * where t=100

    # multiple dependencies
    select a.* from a join b on a.a==b.a

    # persist & checkpoint & broadcast
    select * from a persist broadcast print
    select * from a persist (level="a.b.c") broadcast print
    """
    assert_eq(script, dag)
Esempio n. 15
0
def test_yield():
    """Yield created dataframes as dataframes and as files.

    Some yields follow a deterministic checkpoint. The workflow is passed to
    ``assert_eq`` (defined outside this view) with the Fugue SQL script.
    """
    script = """
    a=create using mock_create1 yield dataframe
    create using mock_create1 yield file as aa
    c=create using mock_create1 deterministic checkpoint yield dataframe
    d=create using mock_create1 deterministic checkpoint yield file as bb
    create using mock_create1 deterministic checkpoint yield file as cc
    """
    dag = FugueWorkflow()

    # Yields straight after creation.
    dag.create(mock_create1).yield_dataframe_as("a")
    dag.create(mock_create1).yield_file_as("aa")

    # Yields after a deterministic checkpoint.
    checkpointed = dag.create(mock_create1).deterministic_checkpoint()
    checkpointed.yield_dataframe_as("c")
    for file_name in ("bb", "cc"):
        dag.create(mock_create1).deterministic_checkpoint().yield_file_as(file_name)

    assert_eq(script, dag)
Esempio n. 16
0
def test_process():
    """Exercise ``process`` with named, nested, anonymous and dict inputs.

    Each workflow is passed to ``assert_eq`` (defined outside this view) with
    its Fugue SQL script. One script with no preceding dataframe is expected
    to raise ``FugueSQLError``.
    """
    # --- basic features, nest ---
    basic_script = """
    a=create using mock_create1 params n:1
    b=create using mock_create1 params n:2
    process a,b using mock_processor1(n=3)
    process b,a using mock_processor2(n=4) schema b:int
    process  # nested
        (create using mock_create1(n=5)),
        (create using mock_create1(n=6))
        using mock_processor1(n=7)
    """
    dag = FugueWorkflow()
    first = dag.create(mock_create1, params={"n": 1})
    second = dag.create(mock_create1, params={"n": 2})
    dag.process(first, second, using=mock_processor1, params={"n": 3})
    dag.process(
        second, first, using=mock_processor2, schema="b:int", params={"n": 4}
    )
    nested_left = dag.create(mock_create1, params={"n": 5})
    nested_right = dag.create(mock_create1, params={"n": 6})
    dag.process(nested_left, nested_right, using=mock_processor1, params={"n": 7})
    assert_eq(basic_script, dag)

    # --- anonymous, nested anonymous ---
    anonymous_script = """
    create using mock_create1 params n:1
    process using mock_processor3
    process  # nested
        (process prepartition by a using mock_processor3),
        (process using mock_processor3)
        using mock_processor1
    """
    dag = FugueWorkflow()
    created = dag.create(mock_create1, params={"n": 1})
    base = created.process(mock_processor3)
    by_partition = base.partition(by=["a"]).process(mock_processor3)
    plain = base.process(mock_processor3)
    dag.process(by_partition, plain, using=mock_processor1)
    assert_eq(anonymous_script, dag)

    # --- no last dataframe ---
    orphan_script = """
        process using mock_processor3
        """
    with raises(FugueSQLError):
        assert_eq(orphan_script, None)

    # --- dict like dataframes ---
    dict_script = """
    process
        df1=(create using mock_create1(n=1)),
        df2:(create using mock_create1(n=2))
        using mock_processor1
    """
    dag = FugueWorkflow()
    first = dag.create(mock_create1, params={"n": 1})
    second = dag.create(mock_create1, params={"n": 2})
    named_inputs = {"df1": first, "df2": second}
    dag.process(named_inputs, using=mock_processor1)
    assert_eq(dict_script, dag)
Esempio n. 17
0
def test_persist_checkpoint_broadcast():
    """Apply persist, broadcast and the checkpoint variants to new dataframes.

    Every step starts from a fresh ``dag.create(mock_create1)``. The workflow
    is passed to ``assert_eq`` (defined outside this view) with the Fugue SQL
    script.
    """
    script = """
    create using mock_create1 persist
    a=create using mock_create1 lazy persist (level="a.b")

    create using mock_create1 broadcast
    a=create using mock_create1 persist(level="a.b") broadcast

    create using mock_create1 checkpoint
    a= create using mock_create1 lazy strong checkpoint
    a=create using mock_create1 lazy checkpoint(x="xy z")
    a=create using mock_create1 checkpoint prepartition 5 single (x="xy z") broadcast

    create using mock_create1 deterministic checkpoint
    create using mock_create1 deterministic checkpoint "n"
        prepartition 4 single params x=2
    """
    dag = FugueWorkflow()

    # persist / weak checkpoint
    dag.create(mock_create1).persist()
    dag.create(mock_create1).weak_checkpoint(lazy=True, level="a.b")

    # broadcast, alone and after a weak checkpoint
    dag.create(mock_create1).broadcast()
    weak = dag.create(mock_create1).weak_checkpoint(level="a.b")
    weak.broadcast()

    # checkpoint / strong checkpoint
    dag.create(mock_create1).checkpoint()
    dag.create(mock_create1).strong_checkpoint(lazy=True)
    dag.create(mock_create1).strong_checkpoint(lazy=True, x="xy z")
    strong = dag.create(mock_create1).strong_checkpoint(
        lazy=False,
        partition=PartitionSpec(num=5),
        single=True,
        x="xy z",
    )
    strong.broadcast()

    # deterministic checkpoint, with defaults and with every option set
    dag.create(mock_create1).deterministic_checkpoint()
    dag.create(mock_create1).deterministic_checkpoint(
        lazy=False,
        partition=PartitionSpec(num=4),
        single=True,
        namespace="n",
        x=2,
    )

    assert_eq(script, dag)