Example 1
def _to_outputter(
    obj: Any,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    validation_rules: Optional[Dict[str, Any]] = None,
) -> Outputter:
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    exp: Optional[Exception] = None
    if validation_rules is None:
        validation_rules = {}
    try:
        return copy.copy(
            to_instance(obj, Outputter, global_vars=global_vars, local_vars=local_vars)
        )
    except Exception as e:
        exp = e
    try:
        f = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        # this is for string expression of function with decorator
        if isinstance(f, Outputter):
            return copy.copy(f)
        # this is for functions without decorator
        return _FuncAsOutputter.from_func(f, validation_rules=validation_rules)
    except Exception as e:
        exp = e
    raise FugueInterfacelessError(f"{obj} is not a valid outputter", exp)
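The two commented branches above accept either a decorated object (or its string expression) or a plain annotated function. A minimal usage sketch of the plain-function path, assuming Fugue's interfaceless convention that an outputter is any annotated function returning None (the function name save_top is hypothetical):

from typing import Any, Dict, List

# A plain annotated function; no @outputter decorator is needed.
def save_top(df: List[Dict[str, Any]], n: int = 3) -> None:
    for row in df[:n]:
        print(row)

out = _to_outputter(save_top)  # wrapped through _FuncAsOutputter.from_func
assert isinstance(out, Outputter)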
Example 2
def _to_tunable(
    obj: Any,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    distributable: Optional[bool] = None,
) -> Tunable:
    global_vars, local_vars = get_caller_global_local_vars(
        global_vars, local_vars)

    def get_tunable() -> Tunable:
        if isinstance(obj, Tunable):
            return copy.copy(obj)
        try:
            f = to_function(obj,
                            global_vars=global_vars,
                            local_vars=local_vars)
            # this is for string expression of function with decorator
            if isinstance(f, Tunable):
                return copy.copy(f)
            # this is for functions without decorator
            return _FuncAsTunable.from_func(f, distributable)
        except Exception as e:
            exp = e
        raise FugueTuneCompileError(f"{obj} is not a valid tunable function",
                                    exp)

    t = get_tunable()
    if distributable is None:
        distributable = t.distributable
    elif distributable:
        assert_or_throw(t.distributable,
                        FugueTuneCompileError(f"{t} is not distributable"))
    return t
Example 3
    def __call__(self, code: str, *args: Any, **kwargs: Any) -> None:
        global_vars, local_vars = get_caller_global_local_vars()
        variables = self._sql(
            code, self._sql_vars, global_vars, local_vars, *args, **kwargs
        )
        for k, v in variables.items():
            if isinstance(v, WorkflowDataFrame) and v.workflow is self:
                self._sql_vars[k] = v
Example 4
def _to_creator(
    obj: Any,
    schema: Any = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
) -> Creator:
    global_vars, local_vars = get_caller_global_local_vars(
        global_vars, local_vars)
    exp: Optional[Exception] = None
    try:
        return copy.copy(
            to_instance(obj,
                        Creator,
                        global_vars=global_vars,
                        local_vars=local_vars))
    except Exception as e:
        exp = e
    try:
        f = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        # this is for string expression of function with decorator
        if isinstance(f, Creator):
            return copy.copy(f)
        # this is for functions without decorator
        return _FuncAsCreator.from_func(f, schema)
    except Exception as e:
        exp = e
    raise FugueInterfacelessError(f"{obj} is not a valid creator", exp)
Example 5
def _to_processor(
    obj: Any,
    schema: Any = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    validation_rules: Optional[Dict[str, Any]] = None,
) -> Processor:
    global_vars, local_vars = get_caller_global_local_vars(
        global_vars, local_vars)
    obj = _PROCESSOR_REGISTRY.get(obj)
    exp: Optional[Exception] = None
    if validation_rules is None:
        validation_rules = {}
    try:
        return copy.copy(
            to_instance(obj,
                        Processor,
                        global_vars=global_vars,
                        local_vars=local_vars))
    except Exception as e:
        exp = e
    try:
        f = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        # this is for string expression of function with decorator
        if isinstance(f, Processor):
            return copy.copy(f)
        # this is for functions without decorator
        return _FuncAsProcessor.from_func(f,
                                          schema,
                                          validation_rules=validation_rules)
    except Exception as e:
        exp = e
    raise FugueInterfacelessError(f"{obj} is not a valid processor", exp)
Example 6
def fsql_dask(
    sql: str,
    ctx: Optional[Context] = None,
    register: bool = False,
    fugue_conf: Any = None,
) -> Dict[str, dd.DataFrame]:
    """Fugue SQL utility function that can consume Context directly. Fugue SQL is a language
    extending standard SQL. It makes SQL eligible to describe end to end workflows. It also
    enables you to invoke python extensions in the SQL like language.

    For more, please read the
    `Fugue SQL Tutorial <https://fugue-tutorials.readthedocs.io/en/latest/tutorials/fugue_sql/index.html/>`_

    Args:
        sql (:obj:`str`): Fugue SQL statement
        ctx (:class:`dask_sql.Context`): The context to operate on, defaults to None
        register (:obj:`bool`): Whether to register named steps back to the context
          (if provided), defaults to False
        fugue_conf (:obj:`Any`): a dictionary-like object containing Fugue-specific configs

    Example:
        .. code-block:: python

            # schema: *
            def median(df:pd.DataFrame) -> pd.DataFrame:
                df["y"] = df["y"].median()
                return df.head(1)

            # Create a context with tables df1, df2
            c = Context()
            ...
            result = fsql_dask('''
            j = SELECT df1.*, df2.x
                FROM df1 INNER JOIN df2 ON df1.key = df2.key
                PERSIST  # using persist because j will be used twice
            TAKE 5 ROWS PREPARTITION BY x PRESORT key
            PRINT
            TRANSFORM j PREPARTITION BY x USING median
            PRINT
            ''', c, register=True)
            assert "j" in result
            assert "j" in c.tables

    """
    _global, _local = get_caller_global_local_vars()

    dag = FugueSQLWorkflow()
    dfs = {} if ctx is None else {k: dag.df(v.df) for k, v in ctx.tables.items()}
    result = dag._sql(sql, _global, _local, **dfs)
    dag.run(DaskSQLExecutionEngine(conf=fugue_conf))

    result_dfs = {
        k: v.result.native
        for k, v in result.items()
        if isinstance(v, WorkflowDataFrame)
    }
    if register and ctx is not None:
        for k, v in result_dfs.items():
            ctx.create_table(k, v)
    return result_dfs
Example 7
def assert_eq(expr, expected: FugueWorkflow):
    global_vars, local_vars = get_caller_global_local_vars()
    sql = FugueSQL(expr, "fugueLanguage", ignore_case=True, simple_assign=True)
    wf = FugueWorkflow()
    v = _Extensions(
        sql, FugueSQLHooks(), wf, global_vars=global_vars, local_vars=local_vars
    )
    obj = v.visit(sql.tree)
    assert expected.spec_uuid() == v.workflow.spec_uuid()
Example 8
def tune(  # noqa: C901
    params_df: WorkflowDataFrame,
    tunable: Any,
    distributable: Optional[bool] = None,
    objective_runner: Optional[ObjectiveRunner] = None,
) -> WorkflowDataFrame:
    t = _to_tunable(  # type: ignore
        tunable, *get_caller_global_local_vars(), distributable)
    if distributable is None:
        distributable = t.distributable

    if objective_runner is None:
        objective_runner = ObjectiveRunner()

    # input_has: __fmin_params__:str
    # schema: *,__fmin_value__:double,__fmin_metadata__:str
    def compute_transformer(
            df: Iterable[Dict[str, Any]]) -> Iterable[Dict[str, Any]]:
        for row in df:
            dfs: Dict[str, Any] = {}
            dfs_keys: Set[str] = set()
            for k, v in row.items():
                if k.startswith("__df_"):
                    key = k[len("__df_"):]
                    if v is not None:
                        dfs[key] = pd.read_parquet(v)
                    dfs_keys.add(key)
            for params in json.loads(row["__fmin_params__"]):
                p = decode(params)
                best = objective_runner.run(  # type: ignore
                    t, dict(**dfs, **p), set(p.keys()))
                res = dict(row)
                res["__fmin_params__"] = json.dumps(best["hp"])
                res["__fmin_value__"] = best["error"]
                res["__fmin_metadata__"] = json.dumps(best["metadata"])
                yield res

    # input_has: __fmin_params__:str
    def compute_processor(engine: ExecutionEngine, df: DataFrame) -> DataFrame:
        def get_rows() -> Iterable[Any]:
            keys = list(
                df.schema.names) + ["__fmin_value__", "__fmin_metadata__"]
            for row in compute_transformer(df.as_dict_iterable()):
                yield [row[k] for k in keys]

        t._execution_engine = engine  # type:ignore
        return ArrayDataFrame(
            get_rows(),
            df.schema + "__fmin_value__:double,__fmin_metadata__:str")

    if not distributable:
        return params_df.process(compute_processor)
    else:
        return params_df.partition(num="ROWCOUNT",
                                   algo="even").transform(compute_transformer)
Example 9
def _to_transformer(  # noqa: C901
    obj: Any,
    schema: Any = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    validation_rules: Optional[Dict[str, Any]] = None,
    func_transformer_type: Type = _FuncAsTransformer,
    func_cotransformer_type: Type = _FuncAsCoTransformer,
) -> Union[Transformer, CoTransformer]:
    global_vars, local_vars = get_caller_global_local_vars(
        global_vars, local_vars)
    exp: Optional[Exception] = None
    if validation_rules is None:
        validation_rules = {}
    try:
        return copy.copy(
            to_instance(obj,
                        Transformer,
                        global_vars=global_vars,
                        local_vars=local_vars))
    except Exception as e:
        exp = e
    try:
        return copy.copy(
            to_instance(obj,
                        CoTransformer,
                        global_vars=global_vars,
                        local_vars=local_vars))
    except Exception as e:
        exp = e
    try:
        f = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        # this is for string expression of function with decorator
        if isinstance(f, Transformer):
            return copy.copy(f)
        # this is for functions without decorator
        return func_transformer_type.from_func(
            f, schema, validation_rules=validation_rules)
    except Exception as e:
        exp = e
    try:
        f = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        # this is for string expression of function with decorator
        if isinstance(f, CoTransformer):
            return copy.copy(f)
        # this is for functions without decorator
        return func_cotransformer_type.from_func(
            f, schema, validation_rules=validation_rules)
    except Exception as e:
        exp = e
    raise FugueInterfacelessError(f"{obj} is not a valid transformer", exp)
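As the comments note, a plain annotated function is wrapped by func_transformer_type.from_func, with the output schema supplied at conversion time. A minimal sketch of that path, assuming Fugue's interfaceless transformer convention (add_one is a hypothetical function):

import pandas as pd

# A plain annotated function; no @transformer decorator is needed.
def add_one(df: pd.DataFrame) -> pd.DataFrame:
    df["b"] = df["a"] + 1
    return df

t = _to_transformer(add_one, schema="a:int,b:int")  # becomes a _FuncAsTransformer
assert isinstance(t, Transformer)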
Example 10
def _to_output_transformer(
    obj: Any,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    validation_rules: Optional[Dict[str, Any]] = None,
) -> Union[Transformer, CoTransformer]:
    global_vars, local_vars = get_caller_global_local_vars(
        global_vars, local_vars)
    return _to_transformer(
        obj=obj,
        schema=None,
        global_vars=global_vars,
        local_vars=local_vars,
        validation_rules=validation_rules,
        func_transformer_type=_FuncAsOutputTransformer,
        func_cotransformer_type=_FuncAsOutputCoTransformer,
    )
Example 11
def _to_module(
    obj: Any,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
) -> "_ModuleFunctionWrapper":
    if isinstance(obj, _ModuleFunctionWrapper):
        return obj
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    try:
        f = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        # this is for string expression of function with decorator
        if isinstance(f, _ModuleFunctionWrapper):
            return copy.copy(f)
        # this is for functions without decorator
        return _ModuleFunctionWrapper(f)
    except Exception as e:
        exp = e
    raise FugueInterfacelessError(f"{obj} is not a valid module", exp)
Example 12
def _to_transformer(
    obj: Any,
    schema: Any = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
    validation_rules: Optional[Dict[str, Any]] = None,
) -> Union[Transformer, CoTransformer]:
    global_vars, local_vars = get_caller_global_local_vars(
        global_vars, local_vars)
    return _to_general_transformer(
        obj=_TRANSFORMER_REGISTRY.get(obj),
        schema=schema,
        global_vars=global_vars,
        local_vars=local_vars,
        validation_rules=validation_rules,
        func_transformer_type=_FuncAsTransformer,
        func_cotransformer_type=_FuncAsCoTransformer,
    )
Example 13
    def __call__(self, code: str, *args: Any, **kwargs: Any):
        cf = inspect.currentframe()
        global_vars, local_vars = get_caller_global_local_vars()
        global_vars = {
            k: v
            for k, v in global_vars.items()
            if not isinstance(v, WorkflowDataFrame) or v.workflow is self
        }
        local_vars = {
            k: v
            for k, v in local_vars.items()
            if not isinstance(v, WorkflowDataFrame) or v.workflow is self
        }
        variables = self._sql(
            code, self._sql_vars, global_vars, local_vars, *args, **kwargs
        )
        if cf is not None:
            for k, v in variables.items():
                if isinstance(v, WorkflowDataFrame) and v.workflow is self:
                    self._sql_vars[k] = v
Example 14
def to_noniterative_objective(
    obj: Any,
    min_better: bool = True,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
) -> NonIterativeObjectiveFunc:
    if isinstance(obj, NonIterativeObjectiveFunc):
        return copy.copy(obj)
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    try:
        f = to_function(obj, global_vars=global_vars, local_vars=local_vars)
        # this is for string expression of function with decorator
        if isinstance(f, NonIterativeObjectiveFunc):
            return copy.copy(f)
        # this is for functions without decorator
        return _NonIterativeObjectiveFuncWrapper.from_func(f, min_better)
    except Exception as e:
        exp = e
    raise TuneCompileError(f"{obj} is not a valid tunable function", exp)
Example 15
    def __init__(
        self,
        sql: FugueSQL,
        hooks: FugueSQLHooks,
        workflow: FugueWorkflow,
        variables: Optional[Dict[str, WorkflowDataFrame]] = None,
        last: Optional[WorkflowDataFrame] = None,
        global_vars: Optional[Dict[str, Any]] = None,
        local_vars: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(sql)
        self._workflow = workflow
        self._variables: Dict[str, WorkflowDataFrame] = {}
        if variables is not None:
            self._variables.update(variables)
        self._last: Optional[WorkflowDataFrame] = last
        self._hooks = hooks
        self._global_vars, self._local_vars = get_caller_global_local_vars(
            global_vars, local_vars
        )
Example 16
def fsql(sql: str, *args: Any, **kwargs: Any) -> FugueSQLWorkflow:
    global_vars, local_vars = get_caller_global_local_vars()
    dag = FugueSQLWorkflow()
    dag._sql(sql, global_vars, local_vars, *args, **kwargs)
    return dag
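The get_caller_global_local_vars() call is what lets the SQL reference dataframes defined in the caller's scope without passing them in explicitly. A minimal sketch of that behavior (assuming pandas is installed; the variable name df is arbitrary):

import pandas as pd

df = pd.DataFrame([[0], [1]], columns=["a"])

# "df" is resolved from the captured caller variables,
# not passed as an argument to fsql.
fsql("""
SELECT * FROM df WHERE a = 0
PRINT
""").run()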
Example 17
def fsql(sql: str,
         *args: Any,
         fsql_ignore_case: bool = False,
         **kwargs: Any) -> FugueSQLWorkflow:
    """Fugue SQL functional interface

    :param sql: the Fugue SQL string (can be a jinja template)
    :param args: variables related to the SQL string
    :param fsql_ignore_case: whether to ignore case when parsing the SQL string,
        defaults to False.
    :param kwargs: variables related to the SQL string
    :return: the translated Fugue workflow

    .. code-block:: python

        # Basic case
        fsql('''
        CREATE [[0]] SCHEMA a:int
        PRINT
        ''').run()

        # With external data sources
        df = pd.DataFrame([[0],[1]], columns=["a"])
        fsql('''
        SELECT * FROM df WHERE a=0
        PRINT
        ''').run()

        # With external variables
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        SELECT * FROM df WHERE a={{t}}
        PRINT
        ''').run()

        # The following is the explicit way to specify variables and dataframes
        # (recommended)
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        SELECT * FROM df WHERE a={{t}}
        PRINT
        ''', df=df, t=t).run()

        # Using extensions
        def dummy(df:pd.DataFrame) -> pd.DataFrame:
            return df

        fsql('''
        CREATE [[0]] SCHEMA a:int
        TRANSFORM USING dummy SCHEMA *
        PRINT
        ''').run()

        # It's recommended to provide the full path of the extension inside
        # Fugue SQL, so the SQL definition and execution can be more
        # independent of the extension definition.

        # Run with different execution engines
        sql = '''
        CREATE [[0]] SCHEMA a:int
        TRANSFORM USING dummy SCHEMA *
        PRINT
        '''

        fsql(sql).run(user_defined_spark_session())
        fsql(sql).run(SparkExecutionEngine, {"spark.executor.instances":10})
        fsql(sql).run(DaskExecutionEngine)

        # Passing dataframes between fsql calls
        result = fsql('''
        CREATE [[0]] SCHEMA a:int
        YIELD DATAFRAME AS x

        CREATE [[1]] SCHEMA a:int
        YIELD DATAFRAME AS y
        ''').run(DaskExecutionEngine)

        fsql('''
        SELECT * FROM x
        UNION
        SELECT * FROM y
        UNION
        SELECT * FROM z

        PRINT
        ''', result, z=pd.DataFrame([[2]], columns=["z"])).run()

        # Get framework native dataframes
        result["x"].native  # Dask dataframe
        result["y"].native  # Dask dataframe
        result["x"].as_pandas()  # Pandas dataframe

        # Use lower-case Fugue SQL
        df = pd.DataFrame([[0],[1]], columns=["a"])
        t = 1
        fsql('''
        select * from df where a={{t}}
        print
        ''', df=df, t=t, fsql_ignore_case=True).run()
    """
    global_vars, local_vars = get_caller_global_local_vars()
    dag = FugueSQLWorkflow(None,
                           {FUGUE_CONF_SQL_IGNORE_CASE: fsql_ignore_case})
    try:
        dag._sql(sql, global_vars, local_vars, *args, **kwargs)
    except FugueSQLSyntaxError as ex:
        raise FugueSQLSyntaxError(str(ex)).with_traceback(None) from None
    return dag