Example #1
 def __init__(self, conf: Any):
     _conf = ParamDict(conf)
     self._conf = ParamDict({**FUGUE_DEFAULT_CONF, **_conf})
     self._rpc_server = make_rpc_server(self.conf)
     self._engine_start_lock = RLock()
     self._engine_start = 0
     self._sql_engine: Optional[SQLEngine] = None
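Example #1 merges user-supplied settings over FUGUE_DEFAULT_CONF; because the user dict is unpacked last, its keys win. A minimal sketch of that precedence, with hypothetical keys (ParamDict is triad's dict subclass and accepts plain dicts or None):

from triad import ParamDict

defaults = {"my.conf.a": 1, "my.conf.b": "x"}  # stand-in for FUGUE_DEFAULT_CONF
user = ParamDict({"my.conf.a": 2})
merged = ParamDict({**defaults, **user})
assert merged["my.conf.a"] == 2  # the later (user) value overrides the default
assert merged["my.conf.b"] == "x"  # untouched defaults survive the merge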
Example #2
def test_flask_service():
    # fugue.rpc.flask.FlaskRPCServer
    conf = ParamDict({
        "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
        "fugue.rpc.flask_server.host": "127.0.0.1",
        "fugue.rpc.flask_server.port": "1234",
        "fugue.rpc.flask_server.timeout": "2 sec",
    })

    def k(value: str) -> str:
        return value + "x"

    def kk(a: int, b: int) -> int:
        return a + b

    def kkk(f: Callable[[int], int], a: int) -> int:
        return f(a)

    with make_rpc_server(conf).start() as server:
        assert "1234" == server.conf["fugue.rpc.flask_server.port"]
        with server.start():  # starting again inside the context has no effect
            client1 = cloudpickle.loads(
                cloudpickle.dumps(server.make_client(k)))
        assert "dddx" == client1("ddd")
        client2 = cloudpickle.loads(cloudpickle.dumps(server.make_client(kk)))
        assert 3 == client2(1, 2)
        assert "dddx" == client1("ddd")
        client3 = cloudpickle.loads(cloudpickle.dumps(server.make_client(kkk)))
        assert 3 == client3(lambda x: x + 1, 2)
        assert 3 == client2(1, 2)
        server.stop()  # an extra stop at the end has no effect
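The cloudpickle dumps/loads round-trip around each make_client call is the point of this test: clients stay functional after serialization, which is presumably what lets them be shipped to distributed workers while the server itself stays on the driver.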
Example #3
 def __init__(self, conf: Any = None, use_sqlite: bool = False):
     configs = _process_confs(ParamDict(conf))
     super().__init__(configs)
     if not use_sqlite:
         self.set_sql_engine(KaggleSQLEngineWrapper(self, QPDPandasEngine(self)))
     else:  # pragma: no cover
         self.set_sql_engine(KaggleSQLEngineWrapper(self, SqliteEngine(self)))
Example #4
 def __init__(
     self,
     data: Any,
     schema: Any = None,
     metadata: Any = None,
     deterministic: bool = True,
     data_determiner: Optional[Callable[[Any], str]] = None,
     lazy: bool = True,
 ):
     self._validate_data(data, schema, metadata)
     self._data = data
     self._schema = None if schema is None else Schema(schema)
     self._metadata = None if metadata is None else ParamDict(metadata)
     did = "" if data_determiner is None else data_determiner(data)
     super().__init__(
         params=dict(
             schema=self._schema,
             metadata=self._metadata,
             determinism_id=did,
         ),
         input_n=0,
         output_n=1,
         deterministic=deterministic,
         lazy=lazy,
     )
Example #5
    def __init__(self, conf: Any = None):
        configs = _process_confs(
            {FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS: _get_optimal_partition()},
            ParamDict(conf),
        )
        super().__init__(conf=configs)
        try:
            from dask_sql.integrations.fugue import DaskSQLEngine

            self.set_sql_engine(KaggleSQLEngineWrapper(self, DaskSQLEngine(self)))
            print("dask-sql is set as the SQL Engine for Dask")
        except ImportError:
            self.set_sql_engine(KaggleSQLEngineWrapper(self, QPDDaskEngine(self)))
Example #6
def make_rpc_server(conf: Any) -> RPCServer:
    """Make :class:`~.RPCServer` based on configuration.
    If ``fugue.rpc.server`` is set, then the value will be used as
    the server type for the initialization. Otherwise, a
    :class:`~.NativeRPCServer` instance will be returned.

    :param conf: |FugueConfig|
    :return: the RPC server
    """
    conf = ParamDict(conf)
    tp = conf.get_or_none("fugue.rpc.server", str)
    t_server = NativeRPCServer if tp is None else to_type(tp, RPCServer)
    return t_server(conf)  # type: ignore
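Given the docstring above, a minimal usage sketch of both dispatch paths (assuming make_rpc_server is importable from fugue.rpc; the Flask settings are copied from Example #2):

from fugue.rpc import make_rpc_server

server = make_rpc_server({})  # no "fugue.rpc.server" key -> NativeRPCServer

server = make_rpc_server({  # configured type is resolved via to_type
    "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
    "fugue.rpc.flask_server.host": "127.0.0.1",
    "fugue.rpc.flask_server.port": "1234",
    "fugue.rpc.flask_server.timeout": "2 sec",
})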
Example #7
 def __init__(
     self,
     creator: Any,
     schema: Any = None,
     params: Any = None,
     deterministic: bool = True,
     lazy: bool = True,
 ):
     self._creator = _to_creator(creator, schema)
     self._creator._params = ParamDict(params, deep=False)
     super().__init__(
         params=params, input_n=0, output_n=1, deterministic=deterministic, lazy=lazy
     )
Example #8
def _to_trail_row(
    data: Dict[str, Any], metadata: Dict[str, Any]
) -> Dict[str, Any]:
    key_names = sorted(k for k in data.keys() if not k.startswith(TUNE_PREFIX))
    keys = [data[k] for k in key_names]
    trials: Dict[str, Dict[str, Any]] = {}
    for param in pickle.loads(data[TUNE_DATASET_PARAMS_PREFIX]):
        p = ParamDict(sorted(param.items(), key=lambda x: x[0]))
        tid = to_uuid(keys, p)
        trials[tid] = Trial(
            trial_id=tid, params=p, metadata=metadata, keys=keys
        ).jsondict
    data[TUNE_DATASET_TRIALS] = json.dumps(list(trials.values()))
    del data[TUNE_DATASET_PARAMS_PREFIX]
    return data
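Example #8 sorts each param dict before hashing so that the trial ID does not depend on key order. A small sketch of that invariant, assuming triad's to_uuid helper (a deterministic hash of its arguments):

from triad import ParamDict
from triad.utils.hash import to_uuid

a = ParamDict(sorted({"lr": 0.1, "depth": 5}.items()))
b = ParamDict(sorted({"depth": 5, "lr": 0.1}.items()))
assert to_uuid(["k1"], a) == to_uuid(["k1"], b)  # same keys + params -> same id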
Example #9
 def __init__(self, spark_session: Optional[SparkSession] = None, conf: Any = None):
     if spark_session is None:
         spark_session = SparkSession.builder.getOrCreate()
     self._spark_session = spark_session
     cf = dict(FUGUE_SPARK_DEFAULT_CONF)
     cf.update({x[0]: x[1] for x in spark_session.sparkContext.getConf().getAll()})
     cf.update(ParamDict(conf))
     super().__init__(cf)
     self._fs = FileSystem()
     self._log = logging.getLogger()
     self._broadcast_func = RunOnce(
         self._broadcast, lambda *args, **kwargs: id(args[0])
     )
     self._persist_func = RunOnce(self._persist, lambda *args, **kwargs: id(args[0]))
     self._register_func = RunOnce(
         self._register, lambda *args, **kwargs: id(args[0])
     )
     self._io = SparkIO(self.spark_session, self.fs)
Example #10
 def __init__(self, spark_session: Optional[SparkSession] = None, conf: Any = None):
     configs = _process_confs(
         {
             "fugue.spark.use_pandas_udf": True,
             "spark.driver.memory": _get_optimal_mem(),
             "spark.sql.shuffle.partitions": _get_optimal_partition(),
             "spark.sql.execution.arrow.pyspark.fallback.enabled": True,
             "spark.driver.extraJavaOptions": "-Dio.netty.tryReflectionSetAccessible=true",  # noqa: E501
             "spark.executor.extraJavaOptions": "-Dio.netty.tryReflectionSetAccessible=true",  # noqa: E501
         },
         ParamDict(conf),
     )
     builder = SparkSession.builder.master("local[*]")
     for k, v in configs.items():
         builder = builder.config(k, v)
     spark_session = builder.getOrCreate()
     super().__init__(spark_session=spark_session, conf=configs)
     self.set_sql_engine(KaggleSQLEngineWrapper(self, SparkSQLEngine(self)))
Example #11
 def get_engine(self, line: str, lc: Dict[str, Any]) -> ExecutionEngine:
     line = line.strip()
     p = line.find("{")
     if p >= 0:
         engine = line[:p].strip()
         conf = json.loads(line[p:])
     else:
         parts = line.split(" ", 1)
         engine = parts[0]
         conf = ParamDict(None if len(parts) == 1 else lc[parts[1]])
     cf = dict(self._pre_conf)
     cf.update(conf)
     for k, v in self._post_conf.items():
         if k in cf and cf[k] != v:
             raise ValueError(
                 f"{k} must be {v}, but you set to {cf[k]}, you may unset it"
             )
         cf[k] = v
     if "+" in engine:
         return make_execution_engine(tuple(engine.split("+", 1)), cf)
     return make_execution_engine(engine, cf)
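In short, get_engine accepts either inline JSON after the engine name (everything from the first "{" is parsed as the config) or the name of a local variable holding the config, and an engine string containing "+" is split once into an (engine, sql_engine) pair before being passed to make_execution_engine.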
Example #12
 def __init__(
     self,
     input_n: int,
     outputter: Any,
     params: Any,
     pre_partition: Any = None,
     deterministic: bool = True,
     lazy: bool = False,
     input_names: Optional[List[str]] = None,
 ):
     assert_or_throw(input_n > 0, FugueWorkflowError("must have at least one input"))
     self._outputter = _to_outputter(outputter)
     self._outputter._params = ParamDict(params)
     self._outputter._partition_spec = PartitionSpec(pre_partition)
     self._outputter.validate_on_compile()
     super().__init__(
         params=params,
         input_n=input_n,
         output_n=1,
         deterministic=deterministic,
         lazy=lazy,
         input_names=input_names,
     )
Example #13
 def __init__(
     self,
     input_n: int,
     processor: Any,
     schema: Any,
     params: Any,
     pre_partition: Any = None,
     deterministic: bool = True,
     lazy: bool = False,
     input_names: Optional[List[str]] = None,
 ):
     self._processor = _to_processor(processor, schema)
     self._processor._params = ParamDict(params)
     self._processor._partition_spec = PartitionSpec(pre_partition)
     self._processor.validate_on_compile()
     super().__init__(
         params=params,
         input_n=input_n,
         output_n=1,
         deterministic=deterministic,
         lazy=lazy,
         input_names=input_names,
     )
Example #14
    FUGUE_CONF_WORKFLOW_AUTO_PERSIST,
    FUGUE_CONF_WORKFLOW_AUTO_PERSIST_VALUE,
    FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE,
    FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT,
    FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE,
    FUGUE_CONF_SQL_IGNORE_CASE,
])

_FUGUE_GLOBAL_CONF = ParamDict({
    FUGUE_CONF_WORKFLOW_CONCURRENCY: 1,
    FUGUE_CONF_WORKFLOW_AUTO_PERSIST: False,
    FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: (
        "fugue.,six,adagio.,pandas,"
        "fugue_dask.,dask.,fugue_spark.,pyspark.,antlr4,_qpd_antlr,qpd,triad,"
        "fugue_notebook.,ipython.,jupyter.,ipykernel,_pytest,pytest,fugue_ibis."
    ),
    FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 3,
    FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: True,
    FUGUE_CONF_SQL_IGNORE_CASE: False,
})


def register_global_conf(conf: Dict[str, Any],
                         on_dup: int = ParamDict.OVERWRITE) -> None:
    """Register global Fugue configs that can be picked up by any
    Fugue execution engines as the base configs.

    :param conf: the config dictionary
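A minimal usage sketch for register_global_conf, assuming it is importable from fugue.constants (the key below is hypothetical, used only for illustration):

from fugue.constants import register_global_conf

register_global_conf({"my.project.conf": 1})  # hypothetical key
# engines created afterwards pick this up as part of their base configs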
Example #15
 def __init__(self, conf: Any):
     super().__init__()
     self._conf = ParamDict(conf)
     self._handlers: Dict[str, RPCHandler] = {}
Example #16
 def report(self, result: Dict[str, Any]) -> None:
     self._error = float(result["error"])
     self._hp = ParamDict(result.get("hp", None))
     self._metadata = ParamDict(result.get("metadata", None))
Example #17
 def _get_temp_path(p: Optional[str], conf: ParamDict) -> str:
     if p is not None and p != "":
         return p
     return conf.get_or_throw(FUGUE_TUNE_TEMP_PATH, str)  # TODO: remove hard code
Example #18
 def _get_temp_path(p: Optional[str], conf: ParamDict) -> str:
     if p is not None and p != "":
         return p
     return conf.get_or_throw(TUNE_TEMP_PATH, str)
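Examples #16-#18 lean on two ParamDict behaviors: ParamDict(None) yields an empty dict (which is why wrapping result.get(..., None) in Example #16 is safe), and get_or_throw raises when the key is missing instead of returning a default. A short sketch with a hypothetical key:

from triad import ParamDict

conf = ParamDict({"my.temp.path": "/tmp/x"})  # hypothetical key
assert conf.get_or_throw("my.temp.path", str) == "/tmp/x"
assert len(ParamDict(None)) == 0  # None is accepted and treated as empty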