def _to_outputter(
    obj: Any,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    validation_rules: Optional[Dict[str, Any]] = None,
) -> Outputter:
    """Convert ``obj`` (instance, class, function, or string expression)
    into an :class:`Outputter`.

    :param obj: the object to convert
    :param global_vars: caller global variables, defaults to None
    :param local_vars: caller local variables, defaults to None
    :param validation_rules: input validation rules, defaults to None
    :raises FugueInterfacelessError: if ``obj`` can't be converted
    """
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    rules: Dict[str, Any] = {} if validation_rules is None else validation_rules
    last_error: Optional[Exception] = None
    # First attempt: obj already is (or instantiates to) an Outputter
    try:
        instance = to_instance(
            obj, Outputter, global_vars=global_vars, local_vars=local_vars
        )
        return copy.copy(instance)
    except Exception as e:
        last_error = e
    # Second attempt: resolve obj as a function
    try:
        func = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        if isinstance(func, Outputter):
            # a string expression resolving to an already-decorated function
            return copy.copy(func)
        # a plain function without decorator
        return _FuncAsOutputter.from_func(func, validation_rules=rules)
    except Exception as e:
        last_error = e
    raise FugueInterfacelessError(f"{obj} is not a valid outputter", last_error)
def _to_tunable(
    obj: Any,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    distributable: Optional[bool] = None,
) -> Tunable:
    """Convert ``obj`` into a :class:`Tunable`, checking that a requested
    ``distributable=True`` is actually supported by the result.

    :param obj: a Tunable, a function, or a string expression of either
    :param global_vars: caller global variables, defaults to None
    :param local_vars: caller local variables, defaults to None
    :param distributable: whether distributed execution is required,
        defaults to None (derive from the converted object)
    :raises FugueTuneCompileError: if conversion fails or the result is
        not distributable when required
    """
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)

    def _convert() -> Tunable:
        if isinstance(obj, Tunable):
            return copy.copy(obj)
        error: Optional[Exception] = None
        try:
            func = to_function(obj, global_vars=global_vars, local_vars=local_vars)
            # a string expression resolving to an already-decorated function
            if isinstance(func, Tunable):
                return copy.copy(func)
            # a plain function without decorator
            return _FuncAsTunable.from_func(func, distributable)
        except Exception as e:
            error = e
        raise FugueTuneCompileError(f"{obj} is not a valid tunable function", error)

    result = _convert()
    if distributable is None:
        distributable = result.distributable
    elif distributable:
        # distribution was explicitly requested; the tunable must support it
        assert_or_throw(
            result.distributable,
            FugueTuneCompileError(f"{result} is not distributable"),
        )
    return result
def __call__(self, code: str, *args: Any, **kwargs: Any) -> None:
    """Execute a Fugue SQL snippet against this workflow and capture any
    resulting dataframes that belong to this workflow into ``_sql_vars``.
    """
    global_vars, local_vars = get_caller_global_local_vars()
    produced = self._sql(
        code, self._sql_vars, global_vars, local_vars, *args, **kwargs
    )
    for name, value in produced.items():
        # only keep dataframes owned by this workflow instance
        if isinstance(value, WorkflowDataFrame) and value.workflow is self:
            self._sql_vars[name] = value
def _to_creator(
    obj: Any,
    schema: Any = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
) -> Creator:
    """Convert ``obj`` (instance, class, function, or string expression)
    into a :class:`Creator`.

    :param obj: the object to convert
    :param schema: output schema for function-based creators, defaults to None
    :param global_vars: caller global variables, defaults to None
    :param local_vars: caller local variables, defaults to None
    :raises FugueInterfacelessError: if ``obj`` can't be converted
    """
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    last_error: Optional[Exception] = None
    # First attempt: obj already is (or instantiates to) a Creator
    try:
        instance = to_instance(
            obj, Creator, global_vars=global_vars, local_vars=local_vars
        )
        return copy.copy(instance)
    except Exception as e:
        last_error = e
    # Second attempt: resolve obj as a function
    try:
        func = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        if isinstance(func, Creator):
            # a string expression resolving to an already-decorated function
            return copy.copy(func)
        # a plain function without decorator
        return _FuncAsCreator.from_func(func, schema)
    except Exception as e:
        last_error = e
    raise FugueInterfacelessError(f"{obj} is not a valid creator", last_error)
def _to_processor(
    obj: Any,
    schema: Any = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    validation_rules: Optional[Dict[str, Any]] = None,
) -> Processor:
    """Convert ``obj`` (instance, class, function, or string expression)
    into a :class:`Processor`, resolving registry aliases first.

    :param obj: the object to convert
    :param schema: output schema for function-based processors, defaults to None
    :param global_vars: caller global variables, defaults to None
    :param local_vars: caller local variables, defaults to None
    :param validation_rules: input validation rules, defaults to None
    :raises FugueInterfacelessError: if ``obj`` can't be converted
    """
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    # registry lookup may replace a registered name with the real object
    obj = _PROCESSOR_REGISTRY.get(obj)
    rules: Dict[str, Any] = {} if validation_rules is None else validation_rules
    last_error: Optional[Exception] = None
    # First attempt: obj already is (or instantiates to) a Processor
    try:
        instance = to_instance(
            obj, Processor, global_vars=global_vars, local_vars=local_vars
        )
        return copy.copy(instance)
    except Exception as e:
        last_error = e
    # Second attempt: resolve obj as a function
    try:
        func = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        if isinstance(func, Processor):
            # a string expression resolving to an already-decorated function
            return copy.copy(func)
        # a plain function without decorator
        return _FuncAsProcessor.from_func(func, schema, validation_rules=rules)
    except Exception as e:
        last_error = e
    raise FugueInterfacelessError(f"{obj} is not a valid processor", last_error)
def fsql_dask(
    sql: str,
    ctx: Optional[Context] = None,
    register: bool = False,
    fugue_conf: Any = None,
) -> Dict[str, dd.DataFrame]:
    """Fugue SQL utility function that can consume Context directly. Fugue SQL is
    a language extending standard SQL. It makes SQL eligible to describe end to end
    workflows. It also enables you to invoke python extensions in the SQL like
    language.

    For more, please read
    `Fugue SQL Tutorial <https://fugue-tutorials.readthedocs.io/en/latest/tutorials/fugue_sql/index.html/>`_

    Args:
        sql: (:obj:`str`): Fugue SQL statement
        ctx (:class:`dask_sql.Context`): The context to operate on, defaults to None
        register (:obj:`bool`): Whether to register named steps back to the context
            (if provided), defaults to False
        fugue_conf (:obj:`Any`): a dictionary like object containing Fugue specific
            configs

    Example:
        .. code-block:: python

            # schema: *
            def median(df:pd.DataFrame) -> pd.DataFrame:
                df["y"] = df["y"].median()
                return df.head(1)

            # Create a context with tables df1, df2
            c = Context()
            ...
            result = fsql_dask('''
            j = SELECT df1.*, df2.x
                FROM df1 INNER JOIN df2 ON df1.key = df2.key
                PERSIST  # using persist because j will be used twice
            TAKE 5 ROWS PREPARTITION BY x PRESORT key
            PRINT
            TRANSFORM j PREPARTITION BY x USING median
            PRINT
            ''', c, register=True)
            assert "j" in result
            assert "j" in c.tables
    """
    caller_globals, caller_locals = get_caller_global_local_vars()
    dag = FugueSQLWorkflow()
    # expose every table of the context as a workflow dataframe
    if ctx is None:
        dfs: Dict[str, Any] = {}
    else:
        dfs = {name: dag.df(table.df) for name, table in ctx.tables.items()}
    result = dag._sql(sql, caller_globals, caller_locals, **dfs)
    dag.run(DaskSQLExecutionEngine(conf=fugue_conf))
    # collect the native dask dataframes of all named outputs
    result_dfs = {
        name: value.result.native
        for name, value in result.items()
        if isinstance(value, WorkflowDataFrame)
    }
    if register and ctx is not None:
        for name, value in result_dfs.items():
            ctx.create_table(name, value)
    return result_dfs
def assert_eq(expr, expected: FugueWorkflow):
    """Parse ``expr`` as Fugue SQL, build a workflow from it, and assert
    its spec uuid equals that of ``expected``.
    """
    global_vars, local_vars = get_caller_global_local_vars()
    parsed = FugueSQL(expr, "fugueLanguage", ignore_case=True, simple_assign=True)
    workflow = FugueWorkflow()
    visitor = _Extensions(
        parsed,
        FugueSQLHooks(),
        workflow,
        global_vars=global_vars,
        local_vars=local_vars,
    )
    # visiting the tree populates the workflow as a side effect
    visitor.visit(parsed.tree)
    assert expected.spec_uuid() == visitor.workflow.spec_uuid()
def tune(  # noqa: C901
    params_df: WorkflowDataFrame,
    tunable: Any,
    distributable: Optional[bool] = None,
    objective_runner: Optional[ObjectiveRunner] = None,
) -> WorkflowDataFrame:
    """Run hyperparameter tuning over a dataframe of parameter candidates.

    :param params_df: workflow dataframe whose rows carry serialized
        parameter sets (``__fmin_params__``) and optional parquet paths in
        ``__df_*`` columns
    :param tunable: anything convertible by ``_to_tunable`` (tunable object,
        function, or string expression)
    :param distributable: whether to run distributed; defaults to None
        (derive from the converted tunable)
    :param objective_runner: the runner executing the objective, defaults
        to a fresh ``ObjectiveRunner``
    :return: the input dataframe extended with ``__fmin_value__`` and
        ``__fmin_metadata__`` columns
    """
    t = _to_tunable(  # type: ignore
        tunable, *get_caller_global_local_vars(), distributable
    )
    if distributable is None:
        distributable = t.distributable
    if objective_runner is None:
        objective_runner = ObjectiveRunner()

    # NOTE: the two special comments below are Fugue schema hints consumed by
    # the framework; they must stay immediately above the function definition.
    # input_has: __fmin_params__:str
    # schema: *,__fmin_value__:double,__fmin_metadata__:str
    def compute_transformer(
        df: Iterable[Dict[str, Any]]
    ) -> Iterable[Dict[str, Any]]:
        # Row-wise evaluation: load any referenced parquet dataframes, then
        # run the objective for each encoded parameter set in the row.
        for row in df:
            dfs: Dict[str, Any] = {}
            dfs_keys: Set[str] = set()
            for k, v in row.items():
                if k.startswith("__df_"):
                    key = k[len("__df_"):]
                    if v is not None:
                        dfs[key] = pd.read_parquet(v)
                    dfs_keys.add(key)
            for params in json.loads(row["__fmin_params__"]):
                p = decode(params)
                best = objective_runner.run(  # type: ignore
                    t, dict(**dfs, **p), set(p.keys())
                )
                res = dict(row)
                res["__fmin_params__"] = json.dumps(best["hp"])
                res["__fmin_value__"] = best["error"]
                res["__fmin_metadata__"] = json.dumps(best["metadata"])
                yield res

    # input_has: __fmin_params__:str
    def compute_processor(engine: ExecutionEngine, df: DataFrame) -> DataFrame:
        # Non-distributed path: evaluate all rows on the driver, reusing
        # compute_transformer, and attach the execution engine to the tunable.
        def get_rows() -> Iterable[Any]:
            keys = list(df.schema.names) + ["__fmin_value__", "__fmin_metadata__"]
            for row in compute_transformer(df.as_dict_iterable()):
                yield [row[k] for k in keys]

        # engine must be attached before get_rows is consumed lazily
        t._execution_engine = engine  # type:ignore
        return ArrayDataFrame(
            get_rows(), df.schema + "__fmin_value__:double,__fmin_metadata__:str"
        )

    if not distributable:
        return params_df.process(compute_processor)
    else:
        # even repartition by row count spreads candidates across workers
        return params_df.partition(num="ROWCOUNT", algo="even").transform(
            compute_transformer
        )
def _to_transformer(  # noqa: C901
    obj: Any,
    schema: Any = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    validation_rules: Optional[Dict[str, Any]] = None,
    func_transformer_type: Type = _FuncAsTransformer,
    func_cotransformer_type: Type = _FuncAsCoTransformer,
) -> Union[Transformer, CoTransformer]:
    """Convert ``obj`` into a :class:`Transformer` or :class:`CoTransformer`,
    trying instance conversion first and function wrapping second.

    :param obj: the object to convert
    :param schema: output schema for function-based transformers, defaults to None
    :param global_vars: caller global variables, defaults to None
    :param local_vars: caller local variables, defaults to None
    :param validation_rules: input validation rules, defaults to None
    :param func_transformer_type: wrapper class for plain functions
    :param func_cotransformer_type: wrapper class for plain co-functions
    :raises FugueInterfacelessError: if ``obj`` can't be converted
    """
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    rules: Dict[str, Any] = {} if validation_rules is None else validation_rules
    last_error: Optional[Exception] = None
    # Attempt 1: obj is (or instantiates to) a Transformer
    try:
        return copy.copy(
            to_instance(obj, Transformer, global_vars=global_vars, local_vars=local_vars)
        )
    except Exception as e:
        last_error = e
    # Attempt 2: obj is (or instantiates to) a CoTransformer
    try:
        return copy.copy(
            to_instance(obj, CoTransformer, global_vars=global_vars, local_vars=local_vars)
        )
    except Exception as e:
        last_error = e
    # Attempt 3: obj resolves to a function usable as a Transformer
    try:
        func = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        if isinstance(func, Transformer):
            # a string expression resolving to an already-decorated function
            return copy.copy(func)
        return func_transformer_type.from_func(func, schema, validation_rules=rules)
    except Exception as e:
        last_error = e
    # Attempt 4: obj resolves to a function usable as a CoTransformer
    try:
        func = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        if isinstance(func, CoTransformer):
            # a string expression resolving to an already-decorated function
            return copy.copy(func)
        return func_cotransformer_type.from_func(func, schema, validation_rules=rules)
    except Exception as e:
        last_error = e
    raise FugueInterfacelessError(f"{obj} is not a valid transformer", last_error)
def _to_output_transformer(
    obj: Any,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    validation_rules: Optional[Dict[str, Any]] = None,
) -> Union[Transformer, CoTransformer]:
    """Convert ``obj`` into an *output* transformer: schema-less and wrapped
    by the output-specific function wrappers.

    :param obj: the object to convert
    :param global_vars: caller global variables, defaults to None
    :param local_vars: caller local variables, defaults to None
    :param validation_rules: input validation rules, defaults to None
    """
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    # delegate to the generic converter, forcing output-flavored wrappers
    return _to_transformer(
        obj=obj,
        schema=None,
        global_vars=global_vars,
        local_vars=local_vars,
        validation_rules=validation_rules,
        func_transformer_type=_FuncAsOutputTransformer,
        func_cotransformer_type=_FuncAsOutputCoTransformer,
    )
def _to_module(
    obj: Any,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
) -> "_ModuleFunctionWrapper":
    """Convert ``obj`` (wrapper, function, or string expression) into a
    :class:`_ModuleFunctionWrapper`.

    :raises FugueInterfacelessError: if ``obj`` can't be converted
    """
    if isinstance(obj, _ModuleFunctionWrapper):
        return obj
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    last_error: Optional[Exception] = None
    try:
        func = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        if isinstance(func, _ModuleFunctionWrapper):
            # a string expression resolving to an already-decorated function
            return copy.copy(func)
        # a plain function without decorator
        return _ModuleFunctionWrapper(func)
    except Exception as e:
        last_error = e
    raise FugueInterfacelessError(f"{obj} is not a valid module", last_error)
def _to_transformer(
    obj: Any,
    schema: Any = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    validation_rules: Optional[Dict[str, Any]] = None,
) -> Union[Transformer, CoTransformer]:
    """Convert ``obj`` into a :class:`Transformer` or :class:`CoTransformer`,
    resolving registry aliases and delegating to the general converter.

    :param obj: the object (or registered alias) to convert
    :param schema: output schema for function-based transformers, defaults to None
    :param global_vars: caller global variables, defaults to None
    :param local_vars: caller local variables, defaults to None
    :param validation_rules: input validation rules, defaults to None
    """
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    return _to_general_transformer(
        obj=_TRANSFORMER_REGISTRY.get(obj),  # alias -> real object if registered
        schema=schema,
        global_vars=global_vars,
        local_vars=local_vars,
        validation_rules=validation_rules,
        func_transformer_type=_FuncAsTransformer,
        func_cotransformer_type=_FuncAsCoTransformer,
    )
def __call__(self, code: str, *args: Any, **kwargs: Any):
    """Execute a Fugue SQL snippet, exposing to it only variables that are
    not workflow dataframes of *other* workflows, and capture resulting
    dataframes owned by this workflow into ``_sql_vars``.
    """
    frame = inspect.currentframe()
    global_vars, local_vars = get_caller_global_local_vars()

    def _usable(value: Any) -> bool:
        # hide WorkflowDataFrames that belong to a different workflow
        return not isinstance(value, WorkflowDataFrame) or value.workflow is self

    global_vars = {k: v for k, v in global_vars.items() if _usable(v)}
    local_vars = {k: v for k, v in local_vars.items() if _usable(v)}
    produced = self._sql(
        code, self._sql_vars, global_vars, local_vars, *args, **kwargs
    )
    if frame is not None:
        for name, value in produced.items():
            if isinstance(value, WorkflowDataFrame) and value.workflow is self:
                self._sql_vars[name] = value
def to_noniterative_objective(
    obj: Any,
    min_better: bool = True,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
) -> NonIterativeObjectiveFunc:
    """Convert ``obj`` (objective instance, function, or string expression)
    into a :class:`NonIterativeObjectiveFunc`.

    :param obj: the object to convert
    :param min_better: whether smaller objective values are better,
        defaults to True
    :param global_vars: caller global variables, defaults to None
    :param local_vars: caller local variables, defaults to None
    :raises TuneCompileError: if ``obj`` can't be converted
    """
    if isinstance(obj, NonIterativeObjectiveFunc):
        return copy.copy(obj)
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    last_error: Optional[Exception] = None
    try:
        func = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        if isinstance(func, NonIterativeObjectiveFunc):
            # a string expression resolving to an already-decorated function
            return copy.copy(func)
        # a plain function without decorator
        return _NonIterativeObjectiveFuncWrapper.from_func(func, min_better)
    except Exception as e:
        last_error = e
    raise TuneCompileError(f"{obj} is not a valid tunable function", last_error)
def __init__(
    self,
    sql: FugueSQL,
    hooks: FugueSQLHooks,
    workflow: FugueWorkflow,
    variables: Optional[Dict[str, WorkflowDataFrame]] = None,
    last: Optional[WorkflowDataFrame] = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
):
    """Initialize the SQL visitor state: the workflow being built, the
    known dataframe variables, the last dataframe produced, the hooks,
    and the caller's variable scopes.
    """
    super().__init__(sql)
    self._workflow = workflow
    # own a private copy so callers can't mutate our variable table
    self._variables: Dict[str, WorkflowDataFrame] = (
        dict(variables) if variables is not None else {}
    )
    self._last: Optional[WorkflowDataFrame] = last
    self._hooks = hooks
    self._global_vars, self._local_vars = get_caller_global_local_vars(
        global_vars, local_vars
    )
def fsql(sql: str, *args: Any, **kwargs: Any) -> FugueSQLWorkflow:
    """Translate a Fugue SQL string into a :class:`FugueSQLWorkflow`,
    resolving names against the caller's scope.
    """
    caller_globals, caller_locals = get_caller_global_local_vars()
    workflow = FugueSQLWorkflow()
    workflow._sql(sql, caller_globals, caller_locals, *args, **kwargs)
    return workflow
def fsql(
    sql: str, *args: Any, fsql_ignore_case: bool = False, **kwargs: Any
) -> FugueSQLWorkflow:
    """Fugue SQL functional interface

    :param sql: the Fugue SQL string (can be a jinja template)
    :param args: variables related to the SQL string
    :param fsql_ignore_case: whether to ignore case when parsing the SQL string
        defaults to False.
    :param kwargs: variables related to the SQL string
    :return: the translated Fugue workflow

    .. code-block:: python

        # Basic case
        fsql('''
        CREATE [[0]] SCHEMA a:int
        PRINT
        ''').run()

        # With external data sources
        df = pd.DataFrame([[0],[1]], columns=["a"])
        fsql('''
        SELECT * FROM df WHERE a=0
        PRINT
        ''').run()

        # With external variables
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        SELECT * FROM df WHERE a={{t}}
        PRINT
        ''').run()

        # The following is the explicit way to specify variables and dataframes
        # (recommended)
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        SELECT * FROM df WHERE a={{t}}
        PRINT
        ''', df=df, t=t).run()

        # Using extensions
        def dummy(df:pd.DataFrame) -> pd.DataFrame:
            return df

        fsql('''
        CREATE [[0]] SCHEMA a:int
        TRANSFORM USING dummy SCHEMA *
        PRINT
        ''').run()

        # It's recommended to provide full path of the extension inside
        # Fugue SQL, so the SQL definition and execution can be more
        # independent from the extension definition.

        # Run with different execution engines
        sql = '''
        CREATE [[0]] SCHEMA a:int
        TRANSFORM USING dummy SCHEMA *
        PRINT
        '''
        fsql(sql).run(user_defined_spark_session())
        fsql(sql).run(SparkExecutionEngine, {"spark.executor.instances":10})
        fsql(sql).run(DaskExecutionEngine)

        # Passing dataframes between fsql calls
        result = fsql('''
        CREATE [[0]] SCHEMA a:int
        YIELD DATAFRAME AS x

        CREATE [[1]] SCHEMA a:int
        YIELD DATAFRAME AS y
        ''').run(DaskExecutionEngine)

        fsql('''
        SELECT * FROM x
        UNION
        SELECT * FROM y
        UNION
        SELECT * FROM z
        PRINT
        ''', result, z=pd.DataFrame([[2]], columns=["z"])).run()

        # Get framework native dataframes
        result["x"].native  # Dask dataframe
        result["y"].native  # Dask dataframe
        result["x"].as_pandas()  # Pandas dataframe

        # Use lower case fugue sql
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        select * from df where a={{t}}
        print
        ''', df=df, t=t, fsql_ignore_case=True).run()
    """
    caller_globals, caller_locals = get_caller_global_local_vars()
    workflow = FugueSQLWorkflow(
        None, {FUGUE_CONF_SQL_IGNORE_CASE: fsql_ignore_case}
    )
    try:
        workflow._sql(sql, caller_globals, caller_locals, *args, **kwargs)
    except FugueSQLSyntaxError as ex:
        # strip the deep parser traceback; re-raise a clean syntax error
        raise FugueSQLSyntaxError(str(ex)).with_traceback(None) from None
    return workflow