Example #1
def test_yield(tmpdir):
    df = pd.DataFrame([[0, 0]], columns=["a", "b"])

    # schema: *
    def t(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(b=df.b + 1)

    dag = FugueWorkflow()
    dag.df(df).transform(t).yield_dataframe_as("x")
    result = dag.run()["x"]
    assert [[0, 1]] == result.as_array()

    dag1 = FugueWorkflow()
    dag1.df(df).transform(t).yield_file_as("x")
    dag1.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    dag2 = FugueWorkflow()
    dag2.df(dag1.yields["x"]).transform(t).yield_dataframe_as("y")
    result = dag2.run("",
                      {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})["y"]
    assert [[0, 2]] == result.as_array()

    dag3 = FugueWorkflow()
    dag3.df(dag2.yields["y"]).transform(t).yield_dataframe_as("z")
    result = dag3.run()["z"]
    assert [[0, 3]] == result.as_array()
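
This test comes from Fugue's test suite and omits its imports (and `tmpdir` is a pytest fixture). A minimal, assumed set of imports that should make the snippet runnable on its own; the module paths reflect current Fugue releases and may differ in older versions:

import pandas as pd

from fugue import FugueWorkflow
from fugue.constants import FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH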
Example #2
def test_workflow_dataframes():
    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    df2 = dag1.df([[0]], "b:int")
    dag2 = FugueWorkflow()
    df3 = dag2.df([[0]], "a:int")

    dfs1 = WorkflowDataFrames(a=df1, b=df2)
    assert dfs1["a"] is df1
    assert dfs1["b"] is df2

    dfs2 = WorkflowDataFrames(dfs1, aa=df1, bb=df2)
    assert 4 == len(dfs2)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=df3)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=ArrayDataFrame([[0]], "a:int"))

    dag = FugueWorkflow()
    df = dag.df([[0], [1]], "a:int")
    assert df.partition_spec.empty
    df2 = df.partition(by=["a"])
    assert df.partition_spec.empty
    assert df2.partition_spec == PartitionSpec(by=["a"])
def test_process():
    # basic features, nesting
    dag = FugueWorkflow()
    a1 = dag.create(mock_create1, params=dict(n=1))
    a2 = dag.create(mock_create1, params=dict(n=2))
    dag.process(a1, a2, using=mock_processor1, params=dict(n=3))
    dag.process(a2, a1, using=mock_processor2, schema="b:int", params=dict(n=4))
    dag.process(
        dag.create(mock_create1, params=dict(n=5)),
        dag.create(mock_create1, params=dict(n=6)),
        using=mock_processor1, params=dict(n=7))
    assert_eq("""
    a=create using mock_create1 params n:1
    b=create using mock_create1 params n:2
    process a,b using mock_processor1(n=3)
    process b,a using mock_processor2(n=4) schema b:int
    process  # nested
        (create using mock_create1(n=5)),
        (create using mock_create1(n=6))
        using mock_processor1(n=7)
    """, dag)

    # anonymous, nested anonymous
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1)).process(mock_processor3)
    b = a.partition(by=["a"]).process(mock_processor3)
    c = a.process(mock_processor3)
    dag.process(b, c, using=mock_processor1)
    assert_eq("""
    create using mock_create1 params n:1
    process using mock_processor3
    process  # nested
        (process prepartition by a using mock_processor3),
        (process using mock_processor3)
        using mock_processor1
    """, dag)

    # no last dataframe
    with raises(FugueSQLError):
        assert_eq("""
        process using mock_processor3
        """, None)

    # dict-like dataframes
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    b = dag.create(mock_create1, params=dict(n=2))
    dag.process(dict(df1=a, df2=b), using=mock_processor1)
    assert_eq("""
    process
        df1=(create using mock_create1(n=1)),
        df2:(create using mock_create1(n=2))
        using mock_processor1
    """, dag)
Example #4
def test_workflow():
    builder = FugueWorkflow()

    a = builder.create_data([[0], [0], [1]], "a:int")
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()

    raises(FugueWorkflowCompileError, lambda: builder.df(123))

    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()

    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int")
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]],
          "a:int,b:int")
Example #5
    def test_workflows(self):
        a = FugueWorkflow().df([[0]], "a:int")
        df_eq(a.compute(self.engine), [[0]], "a:int")

        a = _FugueInteractiveWorkflow(self.engine).df([[0]], "a:int").persist()
        df_eq(a.result, [[0]], "a:int")
def test_create():
    dag = FugueWorkflow()
    dag.create(mock_create1, params=dict(n=1))
    dag.create(mock_create2, schema="a:int", params=dict(n=1))
    assert_eq("""
    a=create using mock_create1 params n:1
    b=create using mock_create2(n=1) schema a:int
    """, dag)
def test_select():
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    b = dag.create(mock_create1, params=dict(n=2))
    dag.select("select * from a.b")
    dag.select("select * from a.b TABLESAMPLE (5 PERCENT) AS x")
    dag.select("select * from a.b AS x")
    dag.select("select * from", a, "AS a")  # fugue sql adds 'AS a'
    dag.select("select * from", a, "TABLESAMPLE (5 PERCENT) AS a")
    x = dag.select("select * from", a, "TABLESAMPLE (5 PERCENT) AS x")
    y = dag.select("select * FROM", x)
    z = dag.select("select * FROM", y, "where t = 100")
    dag.select("select a.* from", a, "AS a join", b, "AS b on a.a == b.a")

    dag.select("select * from (select * from a.b)")
    dag.select("select * from", dag.create(mock_create1), "TABLESAMPLE (5 PERCENT)")
    dag.select("select * from", dag.create(mock_create1), "AS b")
    dag.select("select * from (select * from", dag.create(mock_create1), ")")

    dag.select("select * from", a, "AS a").persist().broadcast().show()
    dag.select("select * from", a, "AS a").persist("a.b.c").broadcast().show()
    assert_eq("""
    a=create using mock_create1(n=1)
    b=create using mock_create1(n=2)
    
    # assignment and table not found
    x=select * from a.b
    
    # sample and alias when table not found
    select * from a.b TABLESAMPLE (5 PERCENT) AS x
    select * from a.b AS x
    
    # when table is found
    select * from a
    select * from a TABLESAMPLE(5 PERCENT)
    select * from a TABLESAMPLE(5 PERCENT) AS x

    # no from
    select *
    select * where t=100

    # multiple dependencies
    select a.* from a join b on a.a==b.a

    # nested query
    select * from (select * from a.b)

    # nested fugue extensions
    select * from (create using mock_create1) TABLESAMPLE(5 PERCENT)
    select * from (create using mock_create1) AS b
    select * from (select * from (create using mock_create1))

    # persist & checkpoint & broadcast
    select * from a persist broadcast print
    select * from a persist a.b.c broadcast print
    """, dag)
def test_zip():
    dag = FugueWorkflow()
    a1 = dag.create(mock_create1, params=dict(n=1))
    a2 = dag.create(mock_create1, params=dict(n=2))
    a1.zip(a2)
    assert_eq("""
    a=create using mock_create1 params n:1
    zip a,(create using mock_create1 params n:2)
    """, dag)

    dag = FugueWorkflow()
    a1 = dag.create(mock_create1, params=dict(n=1))
    a2 = dag.create(mock_create1, params=dict(n=2))
    a1.zip(a2, how="left_outer", partition=dict(by=["a"], presort="b DESC"))
    assert_eq("""
    a=create using mock_create1 params n:1
    zip a,(create using mock_create1 params n:2) left
        outer by a presort b desc
    """, dag)
def test_transform():
    w = (FugueWorkflow().df([[0], [1]], "a:int")
         .transform(mock_transformer, schema=Schema("a:int"), params=dict(n=2))
         )
    assert_eq("""
    create [[0],[1]] schema a:int
    transform using mock_transformer(n=2) schema a:int
    """, w.workflow)

    w = (FugueWorkflow().df([[0], [1]], "a:int")
         .partition(by=["a"], presort="b DESC", num="ROWCOUNT/2")
         .transform(mock_transformer, schema="*", params=dict(n=2))
         )
    assert_eq("""
    create [[0],[1]] schema a:int
    
    transform 
        prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2) schema *
    """, w.workflow)
def test_print():
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    a.show()
    b = dag.create(mock_create1, params=dict(n=2))
    dag.show(a, b, rows=5, show_count=True, title="\"b   B")
    assert_eq("""
    a=create using mock_create1(n=1)
    print
    print a, (create using mock_create1(n=2)) rows 5 rowcount title "\\"b   B"
    """, dag)
def test_output():
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    a.partition(num=4).output(mock_output)
    b = dag.create(mock_create1, params=dict(n=2))
    dag.output(a, b, using=mock_output, params=dict(n=3))
    assert_eq("""
    a=create using mock_create1(n=1)
    output prepartition 4 using mock_output
    output a, (create using mock_create1(n=2)) using mock_output(n=3)
    """, dag)
Example #12
def test_dataset(tmpdir):
    factory = TuneObjectFactory()
    factory.set_temp_path(str(tmpdir))
    df = pd.DataFrame([[0]], columns=["a"])
    dag = FugueWorkflow()
    data = factory.make_dataset(dag, Space(a=1, b=1), df=df)
    assert isinstance(data, TuneDataset)
    assert factory.make_dataset(dag, data) is data
    # TODO: this unit test is incomplete, but the behavior is covered by other tests
    with raises(TuneCompileError):
        factory.make_dataset(dag, 1)
def test_load():
    dag = FugueWorkflow()
    dag.load("xx")
    dag.load("xx", fmt="csv")
    dag.load("xx", columns="a:int,b:str")
    dag.load("xx", columns=["a", "b"], header=True)
    assert_eq("""
    load "xx"
    load csv "xx"
    load "xx" columns a:int, b:str
    load "xx"(header=True) columns a, b
    """, dag)
def test_cotransform():
    dag = FugueWorkflow()
    a1 = dag.create(mock_create1, params=dict(n=1))
    a2 = dag.create(mock_create1, params=dict(n=2))
    z = dag.zip(a1, a2)
    t = z.partition(num=3).transform(mock_cotransformer1, params=dict(n=3))
    assert_eq("""
    zip 
        (create using mock_create1 params n:1),
        (create using mock_create1 params n:2)
    transform prepartition 3 using mock_cotransformer1(n=3)
    """, dag)
def test_save():
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    a.save("xx", fmt="parquet", mode="overwrite")
    a.save("xx", mode="append")
    a.save("xx", mode="error")
    a.save("xx.csv", fmt="csv", mode="error", single=True, header=True)
    a.partition(by=["x"]).save("xx", mode="overwrite")
    b = dag.create(mock_create1, params=dict(n=2)).save("xx", mode="overwrite")
    assert_eq("""
    a=create using mock_create1(n=1)
    save overwrite parquet "xx"
    save a append "xx"
    save a to "xx"
    save to single csv "xx.csv"(header=True)
    save prepartition by x overwrite "xx"
    save (create using mock_create1(n=2)) overwrite "xx"
    """, dag)
Example #16
def test_iterative_study(tmpdir):
    def assert_metric(df: Iterable[Dict[str, Any]], metric: float) -> None:
        for row in df:
            assert row[TUNE_REPORT_METRIC] < metric

    study = IterativeStudy(F(), str(tmpdir))
    space = sum(
        Space(a=a, b=b)
        for a, b in [(1.1, 0.2), (0.8, -0.2), (1.2, -0.1), (0.7, 0.3), (1.0, 1.5)]
    )
    dag = FugueWorkflow()
    dataset = TuneDatasetBuilder(space, str(tmpdir)).build(dag)
    result = study.optimize(
        dataset,
        J([1, 2, 3, 4]),
    )
    result.result(1).show()
    result.result(1).output(assert_metric, params=dict(metric=-2.8))

    dag.run()
def test_persist_checkpoint_broadcast():
    dag = FugueWorkflow()
    dag.create(mock_create1).persist()
    dag.create(mock_create1).persist("a.b")

    dag.create(mock_create1).broadcast()
    dag.create(mock_create1).persist("a.b").broadcast()

    dag.create(mock_create1).checkpoint()
    dag.create(mock_create1).checkpoint()
    dag.create(mock_create1).checkpoint("xy z")
    dag.create(mock_create1).checkpoint("xy z").broadcast()
    assert_eq("""
    create using mock_create1 persist
    a=create using mock_create1 persist a.b

    create using mock_create1 broadcast
    a=create using mock_create1 persist a.b broadcast

    create using mock_create1 checkpoint
    a?? create using mock_create1
    a=create using mock_create1 checkpoint "xy z"
    a??create using mock_create1 checkpoint "xy z" broadcast
    """, dag)
Example #18
    def test_default_init(self):
        a = FugueWorkflow().df([[0]], "a:int")
        df_eq(a.compute(DaskExecutionEngine), [[0]], "a:int")
def test_create_data():
    w = FugueWorkflow().df([[0], [1]], "a:int")
    assert_eq("""
    a=create [[0],[1]] schema a:int
    """, w.workflow)
Example #20
    def dag(self) -> FugueWorkflow:
        return FugueWorkflow(self.engine)

    def test_default_session(self):
        a = FugueWorkflow().df([[0]], "a:int")
        df_eq(a.compute(SparkExecutionEngine), [[0]], "a:int")

    def test_df_init(self):
        sdf = self.spark_session.createDataFrame([[1.1]], "a:double")
        a = FugueWorkflow().df(sdf)
        df_eq(a.compute(SparkExecutionEngine), [[1.1]], "a:double")
def assert_eq(expr, expected: FugueWorkflow):
    sql = FugueSQL(expr, "fugueLanguage", ignore_case=True, simple_assign=True)
    wf = FugueWorkflow()
    v = _Extensions(sql, FugueSQLHooks(), wf)
    v.visit(sql.tree)
    assert expected.spec_uuid() == v.workflow.spec_uuid()
Example #24
    def test_session_as_engine(self):
        dag = FugueWorkflow()
        a = dag.df([[p, 0] for p in range(100)], "a:int,b:int")
        a.partition(algo="even", by=["a"]).transform(AssertMaxNTransform).persist()
        dag.run(self.spark_session)