def test_lazy_use_df():
    """Check that only dataframes referenced by the SQL become part of the DAG.

    A workflow built from ``PRINT df1`` must have the same spec uuid as a
    workflow built programmatically from ``df1`` alone, even though another
    dataframe (``df2``) is present in the local scope.
    """
    df1 = pd.DataFrame([[0]], columns=["a"])
    # df2 is intentionally defined and never referenced: the SQL below does not
    # mention it, so it should never be converted. Do not remove it, and do not
    # rename df1 -- the SQL text refers to that local by name.
    df2 = pd.DataFrame([[1]], columns=["a"])

    # Workflow described through Fugue SQL.
    sql_dag = FugueSQLWorkflow()
    sql_dag("""PRINT df1""")

    # Equivalent workflow described through the programmatic API.
    api_dag = FugueSQLWorkflow()
    api_dag.df(df1).show()

    sql_spec_id = sql_dag.spec_uuid()
    api_spec_id = api_dag.spec_uuid()
    assert sql_spec_id == api_spec_id
def fsql_dask(
    sql: str,
    ctx: Optional[Context] = None,
    register: bool = False,
    fugue_conf: Any = None,
) -> Dict[str, dd.DataFrame]:
    """Run a Fugue SQL statement, optionally on top of a ``dask_sql.Context``.

    Fugue SQL is a language extending standard SQL. It makes SQL eligible to
    describe end to end workflows. It also enables you to invoke python
    extensions in the SQL like language.

    For more, please read
    `Fugue SQL Tutorial <https://fugue-tutorials.readthedocs.io/en/latest/tutorials/fugue_sql/index.html/>`_

    Args:
        sql (:obj:`str`): Fugue SQL statement
        ctx (:class:`dask_sql.Context`): The context to operate on; every table
            in ``ctx.tables`` is made available to the statement under its
            table name. Defaults to None
        register (:obj:`bool`): Whether to register named steps back to the
            context (if provided), defaults to False
        fugue_conf (:obj:`Any`): a dictionary like object containing Fugue
            specific configs

    Returns:
        A dictionary keyed by step name holding the native result of every
        entry of the SQL outcome that is a ``WorkflowDataFrame``.

    Example:
        .. code-block:: python

            # schema: *
            def median(df:pd.DataFrame) -> pd.DataFrame:
                df["y"] = df["y"].median()
                return df.head(1)

            # Create a context with tables df1, df2
            c = Context()
            ...
            result = fsql_dask('''
            j = SELECT df1.*, df2.x
                FROM df1 INNER JOIN df2 ON df1.key = df2.key
                PERSIST  # using persist because j will be used twice
            TAKE 5 ROWS PREPARTITION BY x PRESORT key
            PRINT
            TRANSFORM j PREPARTITION BY x USING median
            PRINT
            ''', c, register=True)
            assert "j" in result
            assert "j" in c.tables
    """
    # NOTE(review): this presumably inspects the calling frame, so it is kept
    # as a direct call in this function body -- confirm before moving it into
    # a helper.
    caller_globals, caller_locals = get_caller_global_local_vars()

    workflow = FugueSQLWorkflow()

    # Expose each table of the context to the workflow under its table name.
    context_frames = {}
    if ctx is not None:
        for table_name, table in ctx.tables.items():
            context_frames[table_name] = workflow.df(table.df)

    outcome = workflow._sql(sql, caller_globals, caller_locals, **context_frames)
    workflow.run(DaskSQLExecutionEngine(conf=fugue_conf))

    # Keep only the dataframe entries and unwrap them to their native form.
    native_results = {}
    for step_name, step in outcome.items():
        if not isinstance(step, WorkflowDataFrame):
            continue
        native_results[step_name] = step.result.native

    if register and ctx is not None:
        for step_name, frame in native_results.items():
            ctx.create_table(step_name, frame)

    return native_results
def i1(wf: FugueSQLWorkflow) -> WorkflowDataFrame:
    """Create a dataframe on ``wf`` from the single row ``[0]`` with schema ``a:int``."""
    rows = [[0]]
    schema = "a:int"
    return wf.df(rows, schema)