def test_show():
    class _CustomShow(object):
        def __init__(self):
            self.called = False

        def show(self, schema, head_rows, title, rows, count):
            print(schema, head_rows)
            print(title, rows, count)
            self.called = True

    cs = _CustomShow()
    with FugueSQLWorkflow() as dag:
        dag(
            """
            a = CREATE[[0], [1]] SCHEMA a: int
            b = CREATE[[0], [1]] SCHEMA a: int
            PRINT 10 ROWS FROM a, b ROWCOUNT TITLE "abc"
            PRINT a, b
            """
        )
    assert not cs.called

    Show.set_hook(cs.show)
    with FugueSQLWorkflow() as dag:
        dag(
            """
            a = CREATE[[0], [1]] SCHEMA a: int
            b = CREATE[[0], [1]] SCHEMA a: int
            PRINT 10 ROWS FROM a, b ROWCOUNT TITLE "abc"
            PRINT a, b
            """
        )
    assert cs.called

def test_use_special_df(tmpdir):
    # external non-WorkflowDataFrame
    arr = ArrayDataFrame([[0], [1]], "a:int")
    fsql(
        """
        b=CREATE[[0], [1]] SCHEMA a: int
        a = SELECT * FROM a.x
        OUTPUT a, b USING assert_eq
        a = SELECT x.* FROM a.x AS x
        OUTPUT a, b USING assert_eq
        c=CREATE [[0,0],[1,1]] SCHEMA a:int,b:int
        d = SELECT x.*,y.a AS b FROM a.x x INNER JOIN a.x y ON x.a=y.a
        OUTPUT c, d USING assert_eq
        """,
        {"a.x": arr},
    ).run()

    # from yield file
    engine = NativeExecutionEngine(
        conf={"fugue.workflow.checkpoint.path": os.path.join(tmpdir, "ck")}
    )
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD FILE AS b")
        res = dag.yields["b"]

    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
            b=CREATE[[0], [1]] SCHEMA a: int
            a = SELECT * FROM a.x
            OUTPUT a, b USING assert_eq
            """,
            {"a.x": res},
        )

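# The ``assert_eq`` outputter invoked via ``OUTPUT ... USING assert_eq`` throughout
# these tests is defined elsewhere in the suite. A minimal sketch of such an
# outputter, assuming pandas inputs (the real helper may differ), could be:
def assert_eq(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    # order-insensitive row comparison of the two inputs
    assert sorted(map(tuple, df1.values.tolist())) == sorted(
        map(tuple, df2.values.tolist())
    )
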
def test_conf_override():
    with raises(FugueSQLSyntaxError):
        FugueSQLWorkflow()("create [[0]] schema a:int")
    with FugueSQLWorkflow(None, {"fugue.sql.compile.ignore_case": "true"}) as dag:
        a = dag.df([[0], [1]], "a:int")
        dag(
            """
            create [[0],[1]] schema a:int
            b = select *
            output a,b using assert_eq
            """
        )

def test_conf_override():
    with raises(FugueSQLSyntaxError):
        FugueSQLWorkflow()("create [[0]] schema a:int")
    with FugueSQLWorkflow(
        NativeExecutionEngine({"fugue.sql.compile.ignore_case": "true"})
    ) as dag:
        a = dag.df([[0], [1]], "a:int")
        dag(
            """
            b = create [[0],[1]] schema a:int
            output a,b using assert_eq
            """
        )

def test_sql():
    register_execution_engine(
        "da", lambda conf, **kwargs: DaskExecutionEngine(conf=conf)
    )
    df = dd.from_pandas(pd.DataFrame([[0], [1]], columns=["a"]), npartitions=2)
    dag = FugueSQLWorkflow()
    dag(
        """
        SELECT * FROM df WHERE a>0
        PRINT
        """,
        df=df,
    )
    dag.run("da")

def test_sql():
    session = SparkSession.builder.getOrCreate()
    register_execution_engine(
        "s",
        lambda conf, **kwargs: SparkExecutionEngine(conf=conf, spark_session=session),
    )
    df = session.createDataFrame(pd.DataFrame([[0], [1]], columns=["a"]))
    dag = FugueSQLWorkflow()
    dag(
        """
        SELECT * FROM df WHERE a>0
        PRINT
        """,
        df=df,
    )
    dag.run("s")

def test_workflow_conf():
    dag = FugueSQLWorkflow(NativeExecutionEngine({"x": 10}))
    assert 10 == dag.conf.get_or_throw("x", int)
    assert not dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)

    dag = FugueSQLWorkflow(
        NativeExecutionEngine({"x": 10, "fugue.sql.compile.ignore_case": True})
    )
    assert 10 == dag.conf.get_or_throw("x", int)
    assert dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)

    dag = FugueSQLWorkflow(
        NativeExecutionEngine({"x": 10}), {"fugue.sql.compile.ignore_case": "true"}
    )
    assert 10 == dag.conf.get_or_throw("x", int)
    assert dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)

def test_workflow_conf():
    dag = FugueSQLWorkflow(
        NativeExecutionEngine({"x": 10, "fugue.sql.compile.simple_assign": "false"})
    )
    assert 10 == dag.conf.get_or_throw("x", int)
    assert not dag.conf.get_or_throw("fugue.sql.compile.simple_assign", bool)
    assert not dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)

def test_process_module():
    # pylint: disable=no-value-for-parameter

    def process(df: pd.DataFrame, d: int = 1) -> pd.DataFrame:
        df["a"] += d
        return df

    @module
    def p1(wf: FugueSQLWorkflow, df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    @module()
    def p2(wf: FugueWorkflow, dfs: WorkflowDataFrames, d: int) -> WorkflowDataFrames:
        return WorkflowDataFrames(
            {k: v.process(process, params={"d": d}) for k, v in dfs.items()}
        )

    @module(as_method=True, name="p4")
    def p3(df: WorkflowDataFrame) -> WorkflowDataFrame:
        return df.process(process)

    assert p1.has_input
    assert not p1.has_dfs_input
    assert p2.has_dfs_input

    with FugueSQLWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p1(df).assert_eq(dag.df([[1]], "a:int"))
        p1(dag, df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df).assert_eq(dag.df([[1]], "a:int"))
        p1(df=df, wf=dag).assert_eq(dag.df([[1]], "a:int"))

    with FugueWorkflow() as dag:
        dfs = WorkflowDataFrames(
            aa=dag.df([[0]], "a:int"), bb=dag.df([[10]], "a:int")
        )
        r = p2(dag, dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))
        r = p2(dfs, 1)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))
        r = p2(d=1, dfs=dfs, wf=dag)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))
        r = p2(d=1, dfs=dfs)
        r["aa"].assert_eq(dag.df([[1]], "a:int"))
        r["bb"].assert_eq(dag.df([[11]], "a:int"))

    with FugueWorkflow() as dag:
        df = dag.df([[0]], "a:int")
        p3(df).assert_eq(dag.df([[1]], "a:int"))
        p3(df=df).assert_eq(dag.df([[1]], "a:int"))
        df.p4().assert_eq(dag.df([[1]], "a:int"))

def test_multiple_sql_with_reset():
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag("b = CREATE [[0],[1]] SCHEMA a:int")
        a = dag.df([[0], [2]], "a:int")
        b = dag.df([[0], [2]], "a:int")
        dag(
            """
            OUTPUT a, b USING assert_eq
            OUTPUT a, (CREATE[[0], [2]] SCHEMA a: int) USING assert_eq
            """
        )

def test_lazy_use_df():
    df1 = pd.DataFrame([[0]], columns=["a"])
    df2 = pd.DataFrame([[1]], columns=["a"])
    # although df2 is defined as a local variable, it is never used in dag1,
    # so it is never converted into a workflow dataframe
    dag1 = FugueSQLWorkflow()
    dag1("""PRINT df1""")

    dag2 = FugueSQLWorkflow()
    dag2.df(df1).show()
    assert dag1.spec_uuid() == dag2.spec_uuid()

def test_multiple_blocks():
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        c = 1
        dag(
            """
            OUTPUT a, (CREATE[[0], [1]] SCHEMA a: int) USING assert_eq
            """
        )
    # a dataframe can't be passed to another workflow
    with FugueSQLWorkflow() as dag:
        assert "a" in locals()
        with raises(FugueSQLError):
            dag(
                """
                OUTPUT a, (CREATE[[0], [1]] SCHEMA a: int) USING assert_eq
                """
            )
    # other local variables are fine
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag(
            """
            OUTPUT a, (CREATE[[0], [{{c}}]] SCHEMA a: int) USING assert_eq
            """
        )

def test_use_param():
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        x = 0
        dag(
            """
            b=CREATE[[{{x}}], [{{y}}]] SCHEMA a: int
            OUTPUT a, b USING assert_eq
            """,
            y=1,
        )

def fsql_dask(
    sql: str,
    ctx: Optional[Context] = None,
    register: bool = False,
    fugue_conf: Any = None,
) -> Dict[str, dd.DataFrame]:
    """Fugue SQL utility function that can consume a :class:`dask_sql.Context`
    directly. Fugue SQL is a language extending standard SQL, making it capable
    of describing end-to-end workflows. It also enables you to invoke Python
    extensions in the SQL-like language. For more, please read the
    `Fugue SQL Tutorial
    <https://fugue-tutorials.readthedocs.io/en/latest/tutorials/fugue_sql/index.html/>`_

    Args:
        sql (:obj:`str`): Fugue SQL statement
        ctx (:class:`dask_sql.Context`): The context to operate on, defaults to
            None
        register (:obj:`bool`): Whether to register named steps back to the
            context (if provided), defaults to False
        fugue_conf (:obj:`Any`): A dictionary-like object containing Fugue
            specific configs

    Example:
        .. code-block:: python

            # schema: *
            def median(df: pd.DataFrame) -> pd.DataFrame:
                df["y"] = df["y"].median()
                return df.head(1)

            # Create a context with tables df1, df2
            c = Context()
            ...

            result = fsql_dask(
                '''
                j = SELECT df1.*, df2.x
                    FROM df1 INNER JOIN df2 ON df1.key = df2.key
                    PERSIST  # using persist because j will be used twice
                TAKE 5 ROWS PREPARTITION BY x PRESORT key
                PRINT
                TRANSFORM j PREPARTITION BY x USING median
                PRINT
                ''',
                c,
                register=True,
            )
            assert "j" in result
            assert "j" in c.tables
    """
    _global, _local = get_caller_global_local_vars()
    dag = FugueSQLWorkflow()
    dfs = {} if ctx is None else {k: dag.df(v.df) for k, v in ctx.tables.items()}
    result = dag._sql(sql, _global, _local, **dfs)
    dag.run(DaskSQLExecutionEngine(conf=fugue_conf))
    result_dfs = {
        k: v.result.native
        for k, v in result.items()
        if isinstance(v, WorkflowDataFrame)
    }
    if register and ctx is not None:
        for k, v in result_dfs.items():
            ctx.create_table(k, v)
    return result_dfs

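# A minimal usage sketch for ``fsql_dask``, assuming a dask_sql ``Context`` with
# one registered table (``ddf`` and the table/column names are illustrative,
# not part of the API):
#
#   import dask.dataframe as dd
#   import pandas as pd
#   from dask_sql import Context
#
#   ddf = dd.from_pandas(pd.DataFrame({"a": [0, 1]}), npartitions=1)
#   c = Context()
#   c.create_table("t", ddf)
#   result = fsql_dask("x = SELECT * FROM t WHERE a > 0", c)
#   # named steps come back as dask dataframes
#   assert isinstance(result["x"], dd.DataFrame)
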
def test_jinja_keyword_in_sql():
    with FugueSQLWorkflow(("", "sqlite")) as dag:
        dag(
            """{% raw -%}
            CREATE [["{%'{%'"]] SCHEMA a:str
            SELECT * WHERE a LIKE '{%'
            PRINT
            {%- endraw %}"""
        )
        df = dag.df([["b"]], "a:str")
        x = "b"
        dag(
            """
            df2 = SELECT * FROM df WHERE a = "{{x}}"
            OUTPUT df, df2 USING assert_eq
            """
        )

def test_use_df(tmpdir):
    # df generated inside dag
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag(
            """
            b=CREATE[[0], [1]] SCHEMA a: int
            OUTPUT a, b USING assert_eq
            """
        )
        dag.sql_vars["b"].assert_eq(a)

    # external non-WorkflowDataFrame
    arr = ArrayDataFrame([[0], [1]], "a:int")
    with FugueSQLWorkflow() as dag:
        dag(
            """
            b=CREATE[[0], [1]] SCHEMA a: int
            OUTPUT a, b USING assert_eq
            """,
            a=arr,
        )
        dag.sql_vars["b"].assert_eq(dag.df([[0], [1]], "a:int"))

    # from yield file
    engine = NativeExecutionEngine(
        conf={"fugue.workflow.checkpoint.path": os.path.join(tmpdir, "ck")}
    )
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD FILE AS b")
        res = dag.yields["b"]

    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
            b=CREATE[[0], [1]] SCHEMA a: int
            OUTPUT a, b USING assert_eq
            """,
            a=res,
        )

    # from yield dataframe
    engine = NativeExecutionEngine()
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD DATAFRAME AS b")
        res = dag.yields["b"]

    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
            b=CREATE[[0], [1]] SCHEMA a: int
            OUTPUT a, b USING assert_eq
            """,
            a=res,
        )

def test_local_instance_as_extension():
    class _Mock(object):
        # schema: *
        def t(self, df: pd.DataFrame) -> pd.DataFrame:
            return df

        def test(self):
            with FugueSQLWorkflow() as dag:
                dag(
                    """
                    a = CREATE [[0],[1]] SCHEMA a:int
                    b = TRANSFORM USING self.t
                    OUTPUT a,b USING assert_eq
                    """
                )

    m = _Mock()
    m.test()
    with FugueSQLWorkflow() as dag:
        dag(
            """
            a = CREATE [[0],[1]] SCHEMA a:int
            b = TRANSFORM USING m.t
            OUTPUT a,b USING assert_eq
            """
        )

def test_call_back():
    class CB(object):
        def __init__(self):
            self.n = 0

        def incr(self, n):
            self.n += n
            return self.n

    cb = CB()

    # schema: *
    def t(df: pd.DataFrame, incr: callable) -> pd.DataFrame:
        incr(1)
        return df

    with FugueSQLWorkflow() as dag:
        dag(
            """
            a = CREATE [[0],[1],[1]] SCHEMA a:int
            TRANSFORM PREPARTITION BY a USING t CALLBACK cb.incr PERSIST
            OUTTRANSFORM a PREPARTITION BY a USING t CALLBACK cb.incr
            """
        )
    # each statement runs t on 2 partitions (a=0 and a=1), so incr(1) fires 4 times
    assert 4 == cb.n

def dag(self) -> FugueSQLWorkflow:
    return FugueSQLWorkflow(self.engine)

def test_function_calls():
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        _eq(dag, a)

def i1(wf: FugueSQLWorkflow) -> WorkflowDataFrame:
    return wf.df([[0]], "a:int")

def test_multiple_sql():
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag("b = CREATE [[0],[1]] SCHEMA a:int")
        dag("OUTPUT a,b USING assert_eq")