Example #1
 def jsondict(self) -> ParamDict:
     """Get json serializeable dict of the spec"""
     return ParamDict(
         dict(
             num_partitions=self._num_partitions,
             algo=self._algo,
             partition_by=self._partition_by,
             presort=self.presort_expr,
             size_limit=self._size_limit,
             row_limit=self._row_limit,
         )
     )
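A hedged sketch of consuming this jsondict output, assuming "spec" is an instance of the class above (hypothetical instance). Note that Example #4 accesses jsondict as an attribute, which suggests a @property decorator was stripped during extraction; use spec.jsondict() instead if it is a plain method in your version.

import json

# Hypothetical: spec defines the jsondict above
d = spec.jsondict
print(d.to_json())  # ParamDict.to_json, exercised in Example #15
print(json.loads(d.to_json())["algo"])  # round-trips like a plain dict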
Example #2
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        if partition_spec.num_partitions != "0":
            self.log.warning(
                "%s doesn't respect num_partitions %s",
                self,
                partition_spec.num_partitions,
            )
        cursor = partition_spec.get_cursor(df.schema, 0)
        if on_init is not None:
            on_init(0, df)
        if len(partition_spec.partition_by) == 0:  # no partition
            df = to_local_df(df)
            cursor.set(df.peek_array(), 0, 0)
            output_df = map_func(cursor, df)
            if (isinstance(output_df, PandasDataFrame)
                    and output_df.schema != output_schema):
                output_df = PandasDataFrame(output_df.native, output_schema)
            assert_or_throw(
                output_df.schema == output_schema,
                lambda: f"map output {output_df.schema} "
                f"mismatches given {output_schema}",
            )
            output_df._metadata = ParamDict(metadata, deep=True)
            output_df._metadata.set_readonly()
            return self.to_df(output_df)
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)

        def _map(pdf: pd.DataFrame) -> pd.DataFrame:
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(pdf.reset_index(drop=True),
                                       df.schema,
                                       pandas_df_wrapper=True)
            cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        result = self.pl_utils.safe_groupby_apply(df.as_pandas(),
                                                  partition_spec.partition_by,
                                                  _map)
        return PandasDataFrame(result, output_schema, metadata)
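A hedged sketch of a map_func compatible with the Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame] signature above; the column names and schema expressions are illustrative, not from the source.

def count_rows(cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
    # Receives one logical partition; map() above advances
    # cursor.partition_no before each call.
    pdf = df.as_pandas()
    pdf["n"] = len(pdf)  # row count of this partition
    return PandasDataFrame(pdf, "a:int,n:long")  # illustrative output schema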
Example #3
 def __init__(self, schema: Any = None, metadata: Any = None):
     if not callable(schema):
         schema = _input_schema(schema).assert_not_empty()
         schema.set_readonly()
         self._schema: Union[Schema, Callable[[], Schema]] = schema
         self._schema_discovered = True
     else:
         self._schema: Union[Schema,
                             Callable[[], Schema]] = schema  # type: ignore
         self._schema_discovered = False
     self._metadata = (metadata if isinstance(metadata, ParamDict) else
                       ParamDict(metadata, deep=True))
     self._metadata.set_readonly()
     self._lazy_schema_lock = RLock()
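To illustrate the two branches above, a hedged sketch; SomeDataFrame and infer_schema are hypothetical stand-ins for a concrete subclass and a schema-producing callable.

# Eager path: schema is parsed now, set read-only, _schema_discovered = True
df_eager = SomeDataFrame(schema="a:int,b:str")
# Lazy path: a callable defers resolution; _schema_discovered stays False
# until first access, guarded by _lazy_schema_lock
df_lazy = SomeDataFrame(schema=infer_schema)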
Example #4
 def jsondict(self) -> ParamDict:
     res = ParamDict(
         dict(
             configs=[c.jsondict for c in self.configs.values()],
             inputs=[c.jsondict for c in self.inputs.values()],
             outputs=[c.jsondict for c in self.outputs.values()],
             func=get_full_type_path(self.func),
             metadata=self.metadata,
             deterministic=self.deterministic,
             lazy=self.lazy,
         ))
     if self._node_spec is not None:
         res["node_spec"] = self.node_spec.jsondict
     return res
Example #5
 def __init__(
     self,
     creator: Any,
     schema: Any = None,
     params: Any = None,
     deterministic: bool = True,
     lazy: bool = True,
 ):
     self._creator = _to_creator(creator, schema)
     self._creator._params = ParamDict(params)
     super().__init__(params=params,
                      input_n=0,
                      output_n=1,
                      deterministic=deterministic,
                      lazy=lazy)
Example #6
 def __init__(
     self,
     configs: Any,
     inputs: Any,
     outputs: Any,
     func: Any,
     metadata: Any = None,
     deterministic: bool = True,
     lazy: bool = False,
 ):
     self.configs = self._parse_spec_collection(configs, ConfigSpec)
     self.inputs = self._parse_spec_collection(inputs, InputSpec)
     self.outputs = self._parse_spec_collection(outputs, OutputSpec)
     self.metadata = ParamDict(metadata, deep=True)
     self.func = to_function(func)
     self.deterministic = deterministic
     self.lazy = lazy
     self._node_spec: Optional["_NodeSpec"] = None
Example #7
 def __init__(self, *args: Any, **kwargs: Any):  # noqa: C901
     p = ParamDict()
     if (
         len(args) == 1
         and len(kwargs) == 0
         and isinstance(args[0], str)
         and args[0].lower() == "per_row"
     ):
         p["algo"] = "even"
         p["num_partitions"] = "ROWCOUNT"
     else:
         for a in args:
             if a is None:
                 continue
             elif isinstance(a, PartitionSpec):
                 self._update_dict(p, a.jsondict)
             elif isinstance(a, Dict):
                 self._update_dict(p, a)
             elif isinstance(a, str):
                 self._update_dict(p, json.loads(a))
             else:
                 raise TypeError(f"{a} is not supported")
         self._update_dict(p, kwargs)
     self._num_partitions = p.get("num_partitions", "0")
     self._algo = p.get("algo", "").lower()
     if "partition_by" not in p:
         self._partition_by: List[str] = []
     elif isinstance(p["partition_by"], str):
         self._partition_by = [p["partition_by"]]
     elif isinstance(p["partition_by"], (list, tuple)):
         self._partition_by = list(p["partition_by"])
     else:
         raise SyntaxError(p["partition_by"])
     aot(
         len(self._partition_by) == len(set(self._partition_by)),
         SyntaxError(f"{self._partition_by} has duplicated keys"),
     )
     self._presort = parse_presort_exp(p.get_or_none("presort", object))
     if any(x in self._presort for x in self._partition_by):
         raise SyntaxError(
             "partition by overlap with presort: "
             + f"{self._partition_by}, {self._presort}"
         )
     # TODO: currently, size limit not in use
     self._size_limit = to_size(p.get("size_limit", "0"))
     self._row_limit = p.get("row_limit", 0)
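Based on the parsing branches above, a few hedged construction examples (values are illustrative; the presort expression syntax is assumed from parse_presort_exp):

PartitionSpec(num_partitions=4, partition_by=["a"], presort="b desc")
PartitionSpec({"num_partitions": 4, "partition_by": ["a"]})  # Dict branch
PartitionSpec('{"num_partitions":4,"partition_by":["a"]}')   # str branch, json.loads
PartitionSpec("per_row")  # special case: algo="even", num_partitions="ROWCOUNT"
# specs compose: later arguments update earlier ones via _update_dict
PartitionSpec(PartitionSpec(partition_by=["a"]), num_partitions=8)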
Example #8
 def process(self, dfs: DataFrames) -> None:
     df = dfs[0]
     tf = _to_output_transformer(
         self.params.get_or_none("transformer", object))
     tf._workflow_conf = self.execution_engine.conf
     tf._params = self.params.get("params", ParamDict())  # type: ignore
     tf._partition_spec = self.partition_spec  # type: ignore
     rpc_handler = to_rpc_handler(
         self.params.get_or_throw("rpc_handler", object))
     if not isinstance(rpc_handler, EmptyRPCHandler):
         tf._rpc_client = self.execution_engine.rpc_server.make_client(
             rpc_handler)
     ie = self.params.get("ignore_errors", [])
     self._ignore_errors = [to_type(x, Exception) for x in ie]
     tf.validate_on_runtime(df)
     if isinstance(tf, Transformer):
         self.transform(df, tf)
     else:
         self.cotransform(df, tf)
Example #9
def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
    from fastavro import reader

    kw = ParamDict(kwargs)
    process_record = None
    if "process_record" in kw:
        process_record = kw["process_record"]
        del kw["process_record"]

    with FileSystem().openbin(path) as fp:
        # Configure Avro reader
        avro_reader = reader(fp)
        # Load records in memory
        if process_record:
            records = [process_record(r) for r in avro_reader]
        else:
            records = list(avro_reader)

        # Populate pandas.DataFrame with records
        return pd.DataFrame.from_records(records)
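A hedged usage sketch of the loader above; the path and record transform are hypothetical. Note that process_record is popped from kwargs before the file is opened.

df = _load_single_avro(
    "/tmp/users.avro",  # hypothetical path
    process_record=lambda r: {**r, "name": r["name"].lower()},
)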
Example #10
    def __init__(
        self,
        cache: Any = NoOpCache,
        engine: Any = SequentialExecutionEngine,
        hooks: Any = WorkflowHooks,
        logger: Any = None,
        config: Any = None,
    ):
        self._conf: ParamDict = ParamDict(config)
        self._abort_requested: Event = Event()

        self._cache: WorkflowResultCache = self._parse_config(
            cache, WorkflowResultCache, [self])
        self._engine: WorkflowExecutionEngine = self._parse_config(
            engine, WorkflowExecutionEngine, [self])
        self._hooks: WorkflowHooks = self._parse_config(
            hooks, WorkflowHooks, [self])
        if logger is None:
            logger = logging.getLogger()
        self._logger: logging.Logger = self._parse_config(
            logger, logging.Logger, [])
Example #11
 def __init__(
     self,
     input_n: int,
     processor: Any,
     schema: Any,
     params: Any,
     pre_partition: Any = None,
     deterministic: bool = True,
     lazy: bool = False,
     input_names: Optional[List[str]] = None,
 ):
     self._processor = _to_processor(processor, schema)
     self._processor._params = ParamDict(params)
     self._processor._partition_spec = PartitionSpec(pre_partition)
     super().__init__(
         params=params,
         input_n=input_n,
         output_n=1,
         deterministic=deterministic,
         lazy=lazy,
         input_names=input_names,
     )
Example #12
 def __init__(
     self,
     input_n: int,
     outputter: Any,
     params: Any,
     pre_partition: Any = None,
     deterministic: bool = True,
     lazy: bool = False,
     input_names: Optional[List[str]] = None,
 ):
     assert_or_throw(input_n > 0,
                     FugueWorkflowError("must have at least one input"))
     self._outputter = _to_outputter(outputter)
     self._outputter._params = ParamDict(params)
     self._outputter._partition_spec = PartitionSpec(pre_partition)
     super().__init__(
         params=params,
         input_n=input_n,
         output_n=1,
         deterministic=deterministic,
         lazy=lazy,
         input_names=input_names,
     )
Example #13
def test_input():
    t = MockTaskForVar()
    s = OutputSpec("o", dict, False)
    o = _Output(t, s)
    p = ParamDict()
    ii = InputSpec("x", dict, False, False, default_value=p, default_on_timeout=True)
    i = _Input(t, ii)
    i.set_dependency(o)
    raises(ValueError, lambda: o.set(None))
    raises(ValueError, lambda: i.get())

    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    o = _Output(t, s)
    raises(AssertionError, lambda: InputSpec("x", dict, False, False,
                                             timeout="0.1s",
                                             default_value=None,
                                             default_on_timeout=True))

    # Input linked with Output
    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    o = _Output(t, s)
    p = ParamDict()
    p2 = ParamDict()
    ii = InputSpec("x", dict, False, False, timeout="0.1s",
                   default_value=p, default_on_timeout=True)
    i = _Input(t, ii).set_dependency(o)
    assert p is i.get()
    o.set(p2)
    assert p is not i.get()
    assert p2 is i.get()
    # Input linked with Input
    i2 = _Input(t, ii).set_dependency(i)
    assert p is not i2.get()
    assert p2 is i2.get()

    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    o = _Output(t, s)
    p = ParamDict()
    p2 = ParamDict()
    ii = InputSpec("x", dict, False, False, timeout="0.1s",
                   default_value=p, default_on_timeout=False)
    i = _Input(t, ii).set_dependency(o)
    raises(TimeoutError, lambda: i.get())

    # Output skipped, input without default will raise error
    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    o = _Output(t, s)
    p = ParamDict()
    ii = InputSpec("x", dict, False)
    i = _Input(t, ii).set_dependency(o)
    o.skip()
    raises(SkippedError, lambda: i.get())

    # Output skipped, input with default will return default
    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    o = _Output(t, s)
    p = ParamDict()
    ii = InputSpec("x", dict, False, False, p)
    i = _Input(t, ii).set_dependency(o)
    o.skip()
    assert p is i.get()

    # Output -> workflow output -> Input
    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    oo = _Output(t, s)  # task output
    o = _Output(t, s)  # workflow output
    o.set_dependency(oo)
    p = ParamDict()
    ii = InputSpec("x", dict, False)
    i = _Input(t, ii).set_dependency(o)
    oo.set(p)
    assert p is i.get()
Example #14
from triad.collections.dict import ParamDict

DEFAULT_CONFIG = ParamDict({"fugue.dask.dataframe.default.partitions": 16})
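A hedged sketch of reading this config with ParamDict's type-coercing getters (behavior exercised in Example #15 below; the missing key is hypothetical):

n = DEFAULT_CONFIG.get_or_throw("fugue.dask.dataframe.default.partitions", int)
assert n == 16
assert 4 == DEFAULT_CONFIG.get("fugue.some.missing.key", 4)  # default fallback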
Example #15
def test_param_dict():
    d = ParamDict([("a", 1), ("b", 2)])
    assert 1 == d["a"]
    assert 1 == d[0]
    assert 2 == d["b"]
    assert "2" == d.get_or_throw(1, str)
    # if giving index, it should ignore the throw flag and always throw
    raises(IndexError, lambda: d.get(2, "x"))
    raises(IndexError, lambda: d.get_or_none(2, str))

    d = {"a": "b", "b": {"x": 1, "y": "d"}}
    p = ParamDict(d)
    print({"test": p})
    d["b"]["x"] = 2
    assert 1 == p["b"]["x"]
    p = ParamDict(d, deep=False)
    d["b"]["x"] = 3
    assert 3 == p["b"]["x"]
    pp = ParamDict(p, deep=False)
    p["b"]["x"] = 4
    assert 4 == pp["b"]["x"]
    pp = ParamDict(p, deep=True)
    p["b"]["x"] = 5
    assert 4 == pp["b"]["x"]

    assert 2 == len(p)
    assert "a,b" == ",".join([k for k, _ in p.items()])
    del p["a"]
    assert 1 == len(p)
    p["c"] = 1
    assert 2 == len(p)
    assert "c" in p
    assert "a" not in p

    raises(ValueError, lambda: p.get("c", None))
    assert 1 == p.get("c", 2)
    assert "1" == p.get("c", "2")
    assert 1.0 == p.get("c", 2.0)
    raises(TypeError, lambda: p.get("c", ParamDict()))
    assert 2 == p.get("d", 2)
    p["arr"] = [1, 2]
    assert [1, 2] == p.get("arr", [])
    assert [] == p.get("arr2", [])

    assert p.get_or_none("e", int) is None
    assert 1 == p.get_or_none("c", int)
    assert "1" == p.get_or_none("c", str)
    # exists but can't convert type
    raises(TypeError, lambda: p.get_or_none("c", ParamDict))

    raises(KeyError, lambda: p.get_or_throw("e", int))
    assert 1 == p.get_or_throw("c", int)
    assert "1" == p.get_or_throw("c", str)
    # exists but can't convert type
    raises(TypeError, lambda: p.get_or_throw("c", ParamDict))

    p = ParamDict()
    assert 0 == len(p)
    for x in p:
        pass

    raises(TypeError, lambda: ParamDict("abc"))

    a = ParamDict({"a": 1, "b": 2})
    b = ParamDict({"b": 2, "a": 1})
    c = ParamDict({"b": 2})
    assert a == a
    assert a != b
    assert a != c
    assert a == {"b": 2, "a": 1}
    assert a != {"b": 1, "a": 1}
    assert a != None
    assert not (a == None)

    p = ParamDict({
        "a": "True",
        "b": True,
        "c": "true",
        "d": "False",
        "e": False,
        "f": "false",
        "g": "yes",
        "h": "NO",
        "i": 0,
        "j": "1",
        "k": "",
    })
    assert p.get_or_throw("a", bool)
    assert p.get_or_throw("b", bool)
    assert p.get_or_throw("c", bool)
    assert not p.get_or_throw("d", bool)
    assert not p.get_or_throw("e", bool)
    assert not p.get_or_throw("f", bool)
    assert p.get_or_throw("g", bool)
    assert not p.get_or_throw("h", bool)
    assert not p.get_or_throw("i", bool)
    assert p.get_or_throw("j", bool)
    raises(TypeError, lambda: p.get_or_throw("k", bool))

    s = '{"a":false,"b":10,"c":"cd"}'
    p = ParamDict(json.loads(s))
    assert not p.get_or_throw("a", bool)
    assert "10" == p.get_or_throw("b", str)
    assert "cd" == p.get_or_throw("c", str)
    raises(KeyError, lambda: p.get_or_throw("d", str))

    print(p.to_json())
    print(p.to_json(True))

    # update
    p = ParamDict(dict(a=1, b=2))
    p1 = ParamDict(dict(b=3, c=4))
    p.update(p1)
    assert dict(a=1, b=3, c=4) == p

    p = ParamDict(dict(a=1, b=2))
    p.update(p1, ParamDict.OVERWRITE)
    assert dict(a=1, b=3, c=4) == p

    p = ParamDict(dict(a=1, b=2))
    p.update(p1, ParamDict.IGNORE)
    assert dict(a=1, b=2, c=4) == p

    p = ParamDict(dict(a=1, b=2))
    raises(KeyError, lambda: p.update(p1, ParamDict.THROW))

    raises(ValueError, lambda: p.update(p1, 100))

    p.set_readonly()
    raises(InvalidOperationError, lambda: p.update(p1, 100))
Example #16
 def paramdict(self) -> ParamDict:
     return ParamDict((x, self.__dict__[x]) for x in self.attributes)
Example #17
 def __init__(self, *args: Any, **kwargs: Any):
     super().__init__(*args, **kwargs)
     self._sql_vars: Dict[str, WorkflowDataFrame] = {}
     self._sql_conf = ParamDict({**FUGUE_SQL_DEFAULT_CONF, **super().conf})

 def __init__(self, conf: Any = None):
     p = ParamDict(FUGUE_DASK_DEFAULT_CONF)
     p.update(ParamDict(conf))
     super().__init__(p)
     self._fs = FileSystem()
     self._log = logging.getLogger()