@property  # callers read spec.jsondict as an attribute (see PartitionSpec.__init__ below)
def jsondict(self) -> ParamDict:
    """Get json serializable dict of the spec"""
    return ParamDict(
        dict(
            num_partitions=self._num_partitions,
            algo=self._algo,
            partition_by=self._partition_by,
            presort=self.presort_expr,
            size_limit=self._size_limit,
            row_limit=self._row_limit,
        )
    )
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    if partition_spec.num_partitions != "0":
        self.log.warning(
            "%s doesn't respect num_partitions %s",
            self,
            partition_spec.num_partitions,
        )
    cursor = partition_spec.get_cursor(df.schema, 0)
    if on_init is not None:
        on_init(0, df)
    if len(partition_spec.partition_by) == 0:  # no partition keys: map the whole frame
        df = to_local_df(df)
        cursor.set(df.peek_array(), 0, 0)
        output_df = map_func(cursor, df)
        if (
            isinstance(output_df, PandasDataFrame)
            and output_df.schema != output_schema
        ):
            output_df = PandasDataFrame(output_df.native, output_schema)
        assert_or_throw(
            output_df.schema == output_schema,
            lambda: f"map output {output_df.schema} "
            f"mismatches given {output_schema}",
        )
        output_df._metadata = ParamDict(metadata, deep=True)
        output_df._metadata.set_readonly()
        return self.to_df(output_df)
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)

    def _map(pdf: pd.DataFrame) -> pd.DataFrame:
        # presort within each partition before calling map_func
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), df.schema, pandas_df_wrapper=True
        )
        cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    # group by the partition keys and apply _map to each group
    result = self.pl_utils.safe_groupby_apply(
        df.as_pandas(), partition_spec.partition_by, _map
    )
    return PandasDataFrame(result, output_schema, metadata)
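# Hedged usage sketch (not part of the source): a map_func compatible with the
# Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame] signature expected
# by map() above. The function name and the "keep the first row" logic are
# illustrative assumptions; as_pandas() and PandasDataFrame(...) are the same
# calls map() itself uses.
def _take_first(cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
    return PandasDataFrame(df.as_pandas().head(1), df.schema)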
def __init__(self, schema: Any = None, metadata: Any = None):
    if not callable(schema):
        # an explicit schema is validated and frozen right away
        schema = _input_schema(schema).assert_not_empty()
        schema.set_readonly()
        self._schema: Union[Schema, Callable[[], Schema]] = schema
        self._schema_discovered = True
    else:
        # a callable defers schema discovery until it is first needed
        self._schema: Union[Schema, Callable[[], Schema]] = schema  # type: ignore
        self._schema_discovered = False
    self._metadata = (
        metadata
        if isinstance(metadata, ParamDict)
        else ParamDict(metadata, deep=True)
    )
    self._metadata.set_readonly()
    self._lazy_schema_lock = RLock()
@property  # read as an attribute (c.jsondict) in the comprehensions below
def jsondict(self) -> ParamDict:
    res = ParamDict(
        dict(
            configs=[c.jsondict for c in self.configs.values()],
            inputs=[c.jsondict for c in self.inputs.values()],
            outputs=[c.jsondict for c in self.outputs.values()],
            func=get_full_type_path(self.func),
            metadata=self.metadata,
            deterministic=self.deterministic,
            lazy=self.lazy,
        )
    )
    if self._node_spec is not None:
        res["node_spec"] = self.node_spec.jsondict
    return res
def __init__(
    self,
    creator: Any,
    schema: Any = None,
    params: Any = None,
    deterministic: bool = True,
    lazy: bool = True,
):
    self._creator = _to_creator(creator, schema)
    self._creator._params = ParamDict(params)
    super().__init__(
        params=params,
        input_n=0,
        output_n=1,
        deterministic=deterministic,
        lazy=lazy,
    )
def __init__(
    self,
    configs: Any,
    inputs: Any,
    outputs: Any,
    func: Any,
    metadata: Any = None,
    deterministic: bool = True,
    lazy: bool = False,
):
    self.configs = self._parse_spec_collection(configs, ConfigSpec)
    self.inputs = self._parse_spec_collection(inputs, InputSpec)
    self.outputs = self._parse_spec_collection(outputs, OutputSpec)
    self.metadata = ParamDict(metadata, deep=True)
    self.func = to_function(func)
    self.deterministic = deterministic
    self.lazy = lazy
    self._node_spec: Optional["_NodeSpec"] = None
def __init__(self, *args: Any, **kwargs: Any):  # noqa: C901
    p = ParamDict()
    if (
        len(args) == 1
        and len(kwargs) == 0
        and isinstance(args[0], str)
        and args[0].lower() == "per_row"
    ):
        # shortcut: "per_row" means every row becomes its own partition
        p["algo"] = "even"
        p["num_partitions"] = "ROWCOUNT"
    else:
        for a in args:
            if a is None:
                continue
            elif isinstance(a, PartitionSpec):
                self._update_dict(p, a.jsondict)
            elif isinstance(a, Dict):
                self._update_dict(p, a)
            elif isinstance(a, str):
                self._update_dict(p, json.loads(a))
            else:
                raise TypeError(f"{a} is not supported")
        self._update_dict(p, kwargs)
    self._num_partitions = p.get("num_partitions", "0")
    self._algo = p.get("algo", "").lower()
    if "partition_by" not in p:
        self._partition_by: List[str] = []
    elif isinstance(p["partition_by"], str):
        self._partition_by = [p["partition_by"]]
    elif isinstance(p["partition_by"], (list, tuple)):
        self._partition_by = list(p["partition_by"])
    else:
        raise SyntaxError(p["partition_by"])
    aot(
        len(self._partition_by) == len(set(self._partition_by)),
        SyntaxError(f"{self._partition_by} has duplicated keys"),
    )
    self._presort = parse_presort_exp(p.get_or_none("presort", object))
    if any(x in self._presort for x in self._partition_by):
        raise SyntaxError(
            "partition by overlap with presort: "
            + f"{self._partition_by}, {self._presort}"
        )
    # TODO: currently, size limit not in use
    self._size_limit = to_size(p.get("size_limit", "0"))
    self._row_limit = p.get("row_limit", 0)
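# Hedged usage sketch (not part of the source), based only on the keys handled by
# __init__ above: specs can be built from kwargs, dicts, JSON strings, or other
# PartitionSpec objects, all merged via _update_dict.
spec = PartitionSpec(num_partitions="4", partition_by=["a", "b"], presort="c desc")
merged = PartitionSpec(spec, {"algo": "even"})
per_row = PartitionSpec("per_row")  # algo="even", num_partitions="ROWCOUNT"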
def process(self, dfs: DataFrames) -> None:
    df = dfs[0]
    tf = _to_output_transformer(
        self.params.get_or_none("transformer", object),
    )
    tf._workflow_conf = self.execution_engine.conf
    tf._params = self.params.get("params", ParamDict())  # type: ignore
    tf._partition_spec = self.partition_spec  # type: ignore
    rpc_handler = to_rpc_handler(self.params.get_or_throw("rpc_handler", object))
    if not isinstance(rpc_handler, EmptyRPCHandler):
        tf._rpc_client = self.execution_engine.rpc_server.make_client(rpc_handler)
    ie = self.params.get("ignore_errors", [])
    self._ignore_errors = [to_type(x, Exception) for x in ie]
    tf.validate_on_runtime(df)
    if isinstance(tf, Transformer):
        self.transform(df, tf)
    else:
        self.cotransform(df, tf)
def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
    from fastavro import reader

    kw = ParamDict(kwargs)
    process_record = None
    if "process_record" in kw:
        process_record = kw["process_record"]
        del kw["process_record"]
    with FileSystem().openbin(path) as fp:
        # Configure Avro reader
        avro_reader = reader(fp)
        # Load records in memory
        if process_record:
            records = [process_record(r) for r in avro_reader]
        else:
            records = list(avro_reader)
        # Populate pandas.DataFrame with records
        return pd.DataFrame.from_records(records)
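# Hedged usage sketch (not part of the source): process_record lets the caller
# reshape each Avro record before it is collected; the wrapper name and the
# lowercasing transform below are illustrative assumptions.
def _load_avro_lowercased(path: str) -> pd.DataFrame:
    return _load_single_avro(
        path, process_record=lambda r: {k.lower(): v for k, v in r.items()}
    )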
def __init__(
    self,
    cache: Any = NoOpCache,
    engine: Any = SequentialExecutionEngine,
    hooks: Any = WorkflowHooks,
    logger: Any = None,
    config: Any = None,
):
    self._conf: ParamDict = ParamDict(config)
    self._abort_requested: Event = Event()
    self._cache: WorkflowResultCache = self._parse_config(
        cache, WorkflowResultCache, [self]
    )
    self._engine: WorkflowExecutionEngine = self._parse_config(
        engine, WorkflowExecutionEngine, [self]
    )
    self._hooks: WorkflowHooks = self._parse_config(hooks, WorkflowHooks, [self])
    if logger is None:
        logger = logging.getLogger()
    self._logger: logging.Logger = self._parse_config(logger, logging.Logger, [])
def __init__(
    self,
    input_n: int,
    processor: Any,
    schema: Any,
    params: Any,
    pre_partition: Any = None,
    deterministic: bool = True,
    lazy: bool = False,
    input_names: Optional[List[str]] = None,
):
    self._processor = _to_processor(processor, schema)
    self._processor._params = ParamDict(params)
    self._processor._partition_spec = PartitionSpec(pre_partition)
    super().__init__(
        params=params,
        input_n=input_n,
        output_n=1,
        deterministic=deterministic,
        lazy=lazy,
        input_names=input_names,
    )
def __init__(
    self,
    input_n: int,
    outputter: Any,
    params: Any,
    pre_partition: Any = None,
    deterministic: bool = True,
    lazy: bool = False,
    input_names: Optional[List[str]] = None,
):
    assert_or_throw(input_n > 0, FugueWorkflowError("must have at least one input"))
    self._outputter = _to_outputter(outputter)
    self._outputter._params = ParamDict(params)
    self._outputter._partition_spec = PartitionSpec(pre_partition)
    super().__init__(
        params=params,
        input_n=input_n,
        output_n=1,
        deterministic=deterministic,
        lazy=lazy,
        input_names=input_names,
    )
def test_input():
    t = MockTaskForVar()
    s = OutputSpec("o", dict, False)
    o = _Output(t, s)
    p = ParamDict()
    ii = InputSpec("x", dict, False, False, default_value=p, default_on_timeout=True)
    i = _Input(t, ii)
    i.set_dependency(o)
    raises(ValueError, lambda: o.set(None))
    raises(ValueError, lambda: i.get())

    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    o = _Output(t, s)
    raises(
        AssertionError,
        lambda: InputSpec(
            "x",
            dict,
            False,
            False,
            timeout="0.1s",
            default_value=None,
            default_on_timeout=True,
        ),
    )

    # Input linked with Output
    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    o = _Output(t, s)
    p = ParamDict()
    p2 = ParamDict()
    ii = InputSpec(
        "x",
        dict,
        False,
        False,
        timeout="0.1s",
        default_value=p,
        default_on_timeout=True,
    )
    i = _Input(t, ii).set_dependency(o)
    assert p is i.get()
    o.set(p2)
    assert p is not i.get()
    assert p2 is i.get()

    # Input linked with Input
    i2 = _Input(t, ii).set_dependency(i)
    assert p is not i2.get()
    assert p2 is i2.get()

    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    o = _Output(t, s)
    p = ParamDict()
    p2 = ParamDict()
    ii = InputSpec(
        "x",
        dict,
        False,
        False,
        timeout="0.1s",
        default_value=p,
        default_on_timeout=False,
    )
    i = _Input(t, ii).set_dependency(o)
    raises(TimeoutError, lambda: i.get())

    # Output skipped, input without default will raise error
    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    o = _Output(t, s)
    p = ParamDict()
    ii = InputSpec("x", dict, False)
    i = _Input(t, ii).set_dependency(o)
    o.skip()
    raises(SkippedError, lambda: i.get())

    # Output skipped, input with default will return default
    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    o = _Output(t, s)
    p = ParamDict()
    ii = InputSpec("x", dict, False, False, p)
    i = _Input(t, ii).set_dependency(o)
    o.skip()
    assert p is i.get()

    # Output -> workflow output -> Input
    t = MockTaskForVar()
    s = OutputSpec("o", ParamDict, False)
    oo = _Output(t, s)  # task output
    o = _Output(t, s)  # workflow output
    o.set_dependency(oo)
    p = ParamDict()
    ii = InputSpec("x", dict, False)
    i = _Input(t, ii).set_dependency(o)
    oo.set(p)
    assert p is i.get()
from triad.collections.dict import ParamDict

DEFAULT_CONFIG = ParamDict({"fugue.dask.dataframe.default.partitions": 16})
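# Hedged sketch (not part of the source): the dotted name is a plain string key,
# so the default can be read back with ParamDict's typed getters.
assert 16 == DEFAULT_CONFIG.get_or_throw(
    "fugue.dask.dataframe.default.partitions", int
)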
def test_param_dict():
    d = ParamDict([("a", 1), ("b", 2)])
    assert 1 == d["a"]
    assert 1 == d[0]
    assert 2 == d["b"]
    assert "2" == d.get_or_throw(1, str)
    # if giving index, it should ignore the throw flag and always throw
    raises(IndexError, lambda: d.get(2, "x"))
    raises(IndexError, lambda: d.get_or_none(2, str))

    d = {"a": "b", "b": {"x": 1, "y": "d"}}
    p = ParamDict(d)
    print({"test": p})
    d["b"]["x"] = 2
    assert 1 == p["b"]["x"]
    p = ParamDict(d, deep=False)
    d["b"]["x"] = 3
    assert 3 == p["b"]["x"]
    pp = ParamDict(p, deep=False)
    p["b"]["x"] = 4
    assert 4 == pp["b"]["x"]
    pp = ParamDict(p, deep=True)
    p["b"]["x"] = 5
    assert 4 == pp["b"]["x"]

    assert 2 == len(p)
    assert "a,b" == ",".join([k for k, _ in p.items()])
    del p["a"]
    assert 1 == len(p)
    p["c"] = 1
    assert 2 == len(p)
    assert "c" in p
    assert "a" not in p
    raises(ValueError, lambda: p.get("c", None))
    assert 1 == p.get("c", 2)
    assert "1" == p.get("c", "2")
    assert 1.0 == p.get("c", 2.0)
    raises(TypeError, lambda: p.get("c", ParamDict()))
    assert 2 == p.get("d", 2)
    p["arr"] = [1, 2]
    assert [1, 2] == p.get("arr", [])
    assert [] == p.get("arr2", [])
    assert p.get_or_none("e", int) is None
    assert 1 == p.get_or_none("c", int)
    assert "1" == p.get_or_none("c", str)
    # exists but can't convert type
    raises(TypeError, lambda: p.get_or_none("c", ParamDict))
    raises(KeyError, lambda: p.get_or_throw("e", int))
    assert 1 == p.get_or_throw("c", int)
    assert "1" == p.get_or_throw("c", str)
    # exists but can't convert type
    raises(TypeError, lambda: p.get_or_throw("c", ParamDict))

    p = ParamDict()
    assert 0 == len(p)
    for x in p:
        pass
    raises(TypeError, lambda: ParamDict("abc"))

    a = ParamDict({"a": 1, "b": 2})
    b = ParamDict({"b": 2, "a": 1})
    c = ParamDict({"b": 2})
    assert a == a
    assert a != b
    assert a != c
    assert a == {"b": 2, "a": 1}
    assert a != {"b": 1, "a": 1}
    assert a != None
    assert not (a == None)

    p = ParamDict(
        {
            "a": "True",
            "b": True,
            "c": "true",
            "d": "False",
            "e": False,
            "f": "false",
            "g": "yes",
            "h": "NO",
            "i": 0,
            "j": "1",
            "k": "",
        }
    )
    assert p.get_or_throw("a", bool)
    assert p.get_or_throw("b", bool)
    assert p.get_or_throw("c", bool)
    assert not p.get_or_throw("d", bool)
    assert not p.get_or_throw("e", bool)
    assert not p.get_or_throw("f", bool)
    assert p.get_or_throw("g", bool)
    assert not p.get_or_throw("h", bool)
    assert not p.get_or_throw("i", bool)
    assert p.get_or_throw("j", bool)
    raises(TypeError, lambda: p.get_or_throw("k", bool))

    s = '{"a":false,"b":10,"c":"cd"}'
    p = ParamDict(json.loads(s))
    assert not p.get_or_throw("a", bool)
    assert "10" == p.get_or_throw("b", str)
    assert "cd" == p.get_or_throw("c", str)
    raises(KeyError, lambda: p.get_or_throw("d", str))
    print(p.to_json())
    print(p.to_json(True))

    # update
    p = ParamDict(dict(a=1, b=2))
    p1 = ParamDict(dict(b=3, c=4))
    p.update(p1)
    assert dict(a=1, b=3, c=4) == p
    p = ParamDict(dict(a=1, b=2))
    p.update(p1, ParamDict.OVERWRITE)
    assert dict(a=1, b=3, c=4) == p
    p = ParamDict(dict(a=1, b=2))
    p.update(p1, ParamDict.IGNORE)
    assert dict(a=1, b=2, c=4) == p
    p = ParamDict(dict(a=1, b=2))
    raises(KeyError, lambda: p.update(p1, ParamDict.THROW))
    raises(ValueError, lambda: p.update(p1, 100))
    p.set_readonly()
    raises(InvalidOperationError, lambda: p.update(p1, 100))
def paramdict(self) -> ParamDict:
    return ParamDict((x, self.__dict__[x]) for x in self.attributes)
def __init__(self, *args: Any, **kwargs: Any):
    super().__init__(*args, **kwargs)
    self._sql_vars: Dict[str, WorkflowDataFrame] = {}
    self._sql_conf = ParamDict({**FUGUE_SQL_DEFAULT_CONF, **super().conf})
def __init__(self, conf: Any = None):
    p = ParamDict(FUGUE_DASK_DEFAULT_CONF)
    p.update(ParamDict(conf))
    super().__init__(p)
    self._fs = FileSystem()
    self._log = logging.getLogger()
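# Hedged sketch (not part of the source) of the merge order above: the user conf
# wins because ParamDict.update overwrites existing keys by default.
defaults = ParamDict({"fugue.dask.dataframe.default.partitions": 16})
defaults.update(ParamDict({"fugue.dask.dataframe.default.partitions": 32}))
assert 32 == defaults.get_or_throw("fugue.dask.dataframe.default.partitions", int)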