def test_compile_conf():
    """Compile-time conf set on the workflow persists; run-time-only conf does not
    carry over to a fresh engine unless it is a recognized compile-time key."""

    def assert_conf(engine: ExecutionEngine, **kwargs) -> pd.DataFrame:
        # Check that every expected key/value pair is visible in the
        # engine's compile-time configuration.
        for key, value in kwargs.items():
            assert engine.compile_conf[key] == value
        return pd.DataFrame([[0]], columns=["a"])

    dag = FugueWorkflow(conf={"a": 1})
    dag.create(assert_conf, params=dict(a=1))
    dag.run()
    with raises(KeyError):
        # non-compile time param doesn't keep in new engine
        dag.run(NativeExecutionEngine())

    dag = FugueWorkflow(conf={FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "abc"})
    dag.create(assert_conf, params=dict({FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "abc"}))
    dag.run()
    # non-compile time param is kept
    dag.run(NativeExecutionEngine())
    # non-compile time param can't be changed by new engines;
    # new engine compile conf will be overwritten
    dag.run(NativeExecutionEngine({FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "def"}))
def test_save_and_use():
    """`save_and_use` with various formats/modes/partitioning must parse to the
    same DAG as the equivalent Fugue-SQL `SAVE AND USE` statements."""
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    b = dag.create(mock_create1, params=dict(n=1))
    a = a.save_and_use("xx", fmt="parquet", mode="overwrite")
    b.save_and_use("xx", mode="append")
    b.save_and_use("xx", mode="error")
    a = a.save_and_use("xx.csv", fmt="csv", mode="error", single=True, header=True)
    a = a.partition(by=["x"]).save_and_use("xx", mode="overwrite")
    dag.create(mock_create1, params=dict(n=2)).save_and_use("xx", mode="overwrite")
    assert_eq(
        """
        a=create using mock_create1(n=1)
        b=create using mock_create1(n=1)
        a=save and use a overwrite parquet "xx"
        save and use b append "xx"
        save and use b to "xx"
        save and use a to single csv "xx.csv"(header=True)
        save and use prepartition by x overwrite "xx"
        save and use (create using mock_create1(n=2)) overwrite "xx"
        """,
        dag,
    )
def test_zip():
    """`zip` — both the default form and one with join type, partition keys,
    and presort — must match the Fugue-SQL `ZIP` statement."""
    dag = FugueWorkflow()
    left = dag.create(mock_create1, params=dict(n=1))
    right = dag.create(mock_create1, params=dict(n=2))
    left.zip(right)
    assert_eq(
        """
        a=create using mock_create1 params n:1
        zip a,(create using mock_create1 params n:2)
        """,
        dag,
    )

    dag = FugueWorkflow()
    left = dag.create(mock_create1, params=dict(n=1))
    right = dag.create(mock_create1, params=dict(n=2))
    left.zip(right, how="left_outer", partition=dict(by=["a"], presort="b DESC"))
    assert_eq(
        """
        a=create using mock_create1 params n:1
        zip a,(create using mock_create1 params n:2) left outer by a presort b desc
        """,
        dag,
    )
def test_create():
    """`create` with params only, and with an explicit schema, must match the
    Fugue-SQL `CREATE USING` statements."""
    dag = FugueWorkflow()
    dag.create(mock_create1, params=dict(n=1))
    dag.create(mock_create2, schema="a:int", params=dict(n=1))
    assert_eq(
        """
        a=create using mock_create1 params n:1
        b=create using mock_create2(n=1) schema a:int
        """,
        dag,
    )
def test_print():
    """`show` — plain, and with row limit, row count, and a title containing an
    escaped quote — must match the Fugue-SQL `PRINT` statement."""
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    a.show()
    b = dag.create(mock_create1, params=dict(n=2))
    dag.show(a, b, rows=5, show_count=True, title='"b B')
    assert_eq(
        """
        a=create using mock_create1(n=1)
        print
        print 5 rows from a, (create using mock_create1(n=2)) rowcount title "\\"b B"
        """,
        dag,
    )
def test_output():
    """`output` — with prepartition, and on multiple dataframes with params —
    must match the Fugue-SQL `OUTPUT` statement."""
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    a.partition(num=4).output(mock_output)
    b = dag.create(mock_create1, params=dict(n=2))
    dag.output(a, b, using=mock_output, params=dict(n=3))
    assert_eq(
        """
        a=create using mock_create1(n=1)
        output prepartition 4 using mock_output
        output a, (create using mock_create1(n=2)) using mock_output(n=3)
        """,
        dag,
    )
def test_general_set_op():
    """UNION / UNION ALL / INTERSECT DISTINCT across named, nested, and
    anonymous (omitted-dependency) dataframes must match Fugue-SQL set ops."""
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    b = dag.create(mock_create1, params=dict(n=2))
    dag.select("select * from", a, "AS a union all select * from", b, "AS b")
    dag.select(
        "SELECT * FROM", dag.create(mock_create1), "union select * from", b, "AS b"
    )
    dag.select(
        "SELECT * FROM",
        dag.create(mock_create1),
        "intersect distinct SELECT * FROM",
        a.process(mock_processor1),
    )
    dag.select(
        "select * from",
        dag.create(mock_create1),
        "union SELECT * FROM",
        a.process(mock_processor1),
    )
    c = dag.create(mock_create1, params=dict(n=2))
    dag.select(
        "SELECT * FROM",
        c.transform(mock_transformer2),
        "union SELECT * FROM",
        c.process(mock_processor1),
    )
    assert_eq(
        """
        a=create using mock_create1(n=1)
        b=create using mock_create1(n=2)
        select * from a union all select * from b
        create using mock_create1 union select * from b
        create using mock_create1 intersect distinct process a using mock_processor1
        select * from (create using mock_create1) union process a using mock_processor1
        # operation on omitted dependencies should work as expected
        c=create using mock_create1(n=2)
        transform using mock_transformer2 union process using mock_processor1
        """,
        dag,
    )
def test_select_nested():
    """Nested sub-selects — plain, aliased, and with TABLESAMPLE — must match
    the Fugue-SQL nested-query forms."""
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    b = dag.create(mock_create1, params=dict(n=2))
    dag.select("select * from (select * from a.b)")
    dag.select("select * from", dag.create(mock_create1), "AS bb")
    dag.select("select * from", dag.create(mock_create1), "TABLESAMPLE (5 PERCENT)")
    dag.select("select * from (select * from", dag.create(mock_create1), ")")
    assert_eq(
        """
        a=create using mock_create1(n=1)
        b=create using mock_create1(n=2)
        # nested query
        select * from (select * from a.b)
        select * from (create using mock_create1) AS bb
        select * from (create using mock_create1) TABLESAMPLE(5 PERCENT)
        select * from (select * from (create using mock_create1))
        """,
        dag,
    )
def test_sample():
    """`sample` by fraction and by row count (with replacement and seed) must
    match the Fugue-SQL `SAMPLE` statement."""
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    a.sample(frac=0.1, replace=False, seed=None)
    a.sample(n=5, replace=True, seed=7)
    assert_eq(
        """
        a=create using mock_create1
        sample 10 percent
        sample replace 5 rows seed 7 from a
        """,
        dag,
    )
def test_drop():
    """Column drops (with and without `if exists`) and row drops by null policy
    must match the Fugue-SQL `DROP` statements."""
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    b = a.drop(["a", "b"])
    c = a.drop(["a", "b"], if_exists=True)
    d = dag.create(mock_create1)
    e = d.dropna(how="any")
    f = d.dropna(how="all")
    g = d.dropna(how="any", subset=["a", "c"])
    assert_eq(
        """
        a=create using mock_create1
        drop columns a,b
        drop columns a,b if exists from a
        d=create using mock_create1
        drop rows if any null
        drop rows if all null from d
        drop rows if any nulls on a,c from d
        """,
        dag,
    )
def test_alter_columns():
    """`alter_columns` type changes must match the Fugue-SQL `ALTER COLUMNS`
    statement, both anonymous and with an explicit `from`."""
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    a.alter_columns(Schema("a:str,b:str"))
    a.alter_columns(Schema("a:float,b:double"))
    assert_eq(
        """
        a=create using mock_create1
        alter columns a:str, b:str
        alter columns a:float, b:double from a
        """,
        dag,
    )
def test_rename():
    """`rename` must match the Fugue-SQL `RENAME COLUMNS` statement, both
    anonymous and with an explicit `from`."""
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    b = a.rename({"a": "aa", "b": "bb"})
    c = a.rename({"a": "aaa", "b": "bbb"})
    assert_eq(
        """
        a=create using mock_create1
        rename columns a:aa,b:bb
        rename columns a:aaa,b:bbb from a
        """,
        dag,
    )
def test_cotransform():
    """Zipping two dataframes and transforming with a cotransformer (with
    prepartition) must match the Fugue-SQL `ZIP` + `TRANSFORM` statements."""
    dag = FugueWorkflow()
    df1 = dag.create(mock_create1, params=dict(n=1))
    df2 = dag.create(mock_create1, params=dict(n=2))
    zipped = dag.zip(df1, df2)
    result = zipped.partition(num=3).transform(mock_cotransformer1, params=dict(n=3))
    assert_eq(
        """
        zip (create using mock_create1 params n:1), (create using mock_create1 params n:2)
        transform prepartition 3 using mock_cotransformer1(n=3)
        """,
        dag,
    )
def test_select():
    """`select` — raw SQL, dataframe interpolation, aliasing, TABLESAMPLE,
    chained selects, joins, and persist/checkpoint/broadcast suffixes — must
    match the Fugue-SQL `SELECT` forms."""
    dag = FugueWorkflow()
    a = dag.create(mock_create1, params=dict(n=1))
    b = dag.create(mock_create1, params=dict(n=2))
    dag.select("select * from a.b")
    dag.select("select * from a.b TABLESAMPLE (5 PERCENT) AS x")
    dag.select("select * from a.b AS x")
    dag.select("select * from", a, "AS a")  # fugue sql adds 'AS a'
    dag.select("select * from", a, "TABLESAMPLE (5 PERCENT) AS a")
    x = dag.select("select * from", a, "TABLESAMPLE (5 PERCENT) AS x")
    y = dag.select("select * FROM", x)
    z = dag.select("select * FROM", y, "where t = 100")
    dag.select("select a.* from", a, "AS a join", b, "AS b on a.a == b.a")
    dag.select("select * from", a, "AS a").persist().broadcast().show()
    dag.select("select * from", a, "AS a").weak_checkpoint(
        level="a.b.c"
    ).broadcast().show()
    assert_eq(
        """
        a=create using mock_create1(n=1)
        b=create using mock_create1(n=2)
        # assignment and table not found
        x=select * from a.b
        # sample and alias when table not found
        select * from a.b TABLESAMPLE (5 PERCENT) AS x
        select * from a.b AS x
        # when table is found
        select * from a
        select * from a TABLESAMPLE(5 PERCENT)
        select * from a TABLESAMPLE(5 PERCENT) AS x
        # no from
        select *
        select * where t=100
        # multiple dependencies
        select a.* from a join b on a.a==b.a
        # persist & checkpoint & broadcast
        select * from a persist broadcast print
        select * from a persist (level="a.b.c") broadcast print
        """,
        dag,
    )
def test_yield():
    """`yield_dataframe_as` / `yield_file_as`, with and without a preceding
    deterministic checkpoint, must match the Fugue-SQL `YIELD` clauses."""
    dag = FugueWorkflow()
    dag.create(mock_create1).yield_dataframe_as("a")
    dag.create(mock_create1).yield_file_as("aa")
    dag.create(mock_create1).deterministic_checkpoint().yield_dataframe_as("c")
    dag.create(mock_create1).deterministic_checkpoint().yield_file_as("bb")
    dag.create(mock_create1).deterministic_checkpoint().yield_file_as("cc")
    assert_eq(
        """
        a=create using mock_create1 yield dataframe
        create using mock_create1 yield file as aa
        c=create using mock_create1 deterministic checkpoint yield dataframe
        d=create using mock_create1 deterministic checkpoint yield file as bb
        create using mock_create1 deterministic checkpoint yield file as cc
        """,
        dag,
    )
def test_process():
    """`process` — basic, nested, anonymous (using the previous dataframe),
    missing-dataframe error, and dict-named inputs — must match Fugue-SQL."""
    # basic features, nest
    dag = FugueWorkflow()
    df1 = dag.create(mock_create1, params=dict(n=1))
    df2 = dag.create(mock_create1, params=dict(n=2))
    dag.process(df1, df2, using=mock_processor1, params=dict(n=3))
    dag.process(df2, df1, using=mock_processor2, schema="b:int", params=dict(n=4))
    dag.process(
        dag.create(mock_create1, params=dict(n=5)),
        dag.create(mock_create1, params=dict(n=6)),
        using=mock_processor1,
        params=dict(n=7),
    )
    assert_eq(
        """
        a=create using mock_create1 params n:1
        b=create using mock_create1 params n:2
        process a,b using mock_processor1(n=3)
        process b,a using mock_processor2(n=4) schema b:int
        process # nested
            (create using mock_create1(n=5)),
            (create using mock_create1(n=6))
            using mock_processor1(n=7)
        """,
        dag,
    )

    # anonymous, nested anonymous
    dag = FugueWorkflow()
    base = dag.create(mock_create1, params=dict(n=1)).process(mock_processor3)
    partitioned = base.partition(by=["a"]).process(mock_processor3)
    plain = base.process(mock_processor3)
    dag.process(partitioned, plain, using=mock_processor1)
    assert_eq(
        """
        create using mock_create1 params n:1
        process using mock_processor3
        process # nested
            (process prepartition by a using mock_processor3),
            (process using mock_processor3)
            using mock_processor1
        """,
        dag,
    )

    # no last dataframe
    with raises(FugueSQLError):
        assert_eq(
            """
            process using mock_processor3
            """,
            None,
        )

    # dict like dataframes
    dag = FugueWorkflow()
    left = dag.create(mock_create1, params=dict(n=1))
    right = dag.create(mock_create1, params=dict(n=2))
    dag.process(dict(df1=left, df2=right), using=mock_processor1)
    assert_eq(
        """
        process
            df1=(create using mock_create1(n=1)),
            df2:(create using mock_create1(n=2))
            using mock_processor1
        """,
        dag,
    )
def test_persist_checkpoint_broadcast():
    """persist / weak / strong / deterministic checkpoints and broadcast, with
    laziness, levels, prepartition, single, and extra params, must match the
    Fugue-SQL persist/checkpoint/broadcast clauses."""
    dag = FugueWorkflow()
    dag.create(mock_create1).persist()
    dag.create(mock_create1).weak_checkpoint(lazy=True, level="a.b")
    dag.create(mock_create1).broadcast()
    dag.create(mock_create1).weak_checkpoint(level="a.b").broadcast()
    dag.create(mock_create1).checkpoint()
    dag.create(mock_create1).strong_checkpoint(lazy=True)
    dag.create(mock_create1).strong_checkpoint(lazy=True, x="xy z")
    dag.create(mock_create1).strong_checkpoint(
        lazy=False, partition=PartitionSpec(num=5), single=True, x="xy z"
    ).broadcast()
    dag.create(mock_create1).deterministic_checkpoint()
    dag.create(mock_create1).deterministic_checkpoint(
        lazy=False, partition=PartitionSpec(num=4), single=True, namespace="n", x=2
    )
    assert_eq(
        """
        create using mock_create1 persist
        a=create using mock_create1 lazy persist (level="a.b")
        create using mock_create1 broadcast
        a=create using mock_create1 persist(level="a.b") broadcast
        create using mock_create1 checkpoint
        a= create using mock_create1 lazy strong checkpoint
        a=create using mock_create1 lazy checkpoint(x="xy z")
        a=create using mock_create1 checkpoint prepartition 5 single (x="xy z") broadcast
        create using mock_create1 deterministic checkpoint
        create using mock_create1 deterministic checkpoint "n" prepartition 4 single params x=2
        """,
        dag,
    )