Ejemplo n.º 1
0
 def process(self, dfs: DataFrames) -> DataFrame:
     """Run the execution engine's ``dropna`` on the single input dataframe.

     The ``how`` (default ``"any"``), ``thresh`` and ``subset`` entries of
     ``self.params`` are forwarded to the engine unchanged.

     :param dfs: input dataframes; exactly one is expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
         not exactly one input, or ``how`` is neither ``"any"`` nor ``"all"``
     :return: the result of ``self.execution_engine.dropna``
     """
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     how = self.params.get("how", "any")
     assert_or_throw(
         how in ("any", "all"),
         # The parameter name is now quoted on both sides; the message used
         # to read "how' needs ..." with an unbalanced quote.
         FugueWorkflowError("'how' needs to be either 'any' or 'all'"),
     )
     thresh = self.params.get_or_none("thresh", int)
     subset = self.params.get_or_none("subset", list)
     return self.execution_engine.dropna(
         dfs[0], how=how, thresh=thresh, subset=subset
     )
Ejemplo n.º 2
0
 def process(self, dfs: DataFrames) -> DataFrame:
     """Run the execution engine's ``fillna`` on the single input dataframe.

     The ``value`` parameter must be neither ``None`` nor a list. When it is
     a dict it must be non-empty and none of its values may be ``None``.
     The optional ``subset`` parameter is forwarded to the engine as is.

     :param dfs: input dataframes; exactly one is expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
         not exactly one input or ``value`` violates the rules above
     :return: the result of ``self.execution_engine.fillna``
     """
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     value = self.params.get_or_none("value", object)
     assert_or_throw(
         (not isinstance(value, list)) and (value is not None),
         FugueWorkflowError("fillna value cannot be None or list"),
     )
     if isinstance(value, dict):
         # Emptiness is checked with len() rather than any(): any() tests
         # the truthiness of the values, so legitimate falsy fill values
         # such as 0, "" or False were rejected as if the dict were empty.
         assert_or_throw(
             (None not in value.values()) and len(value) > 0,
             FugueWorkflowError(
                 "fillna dict can't contain None and must have len > 0"),
         )
     subset = self.params.get_or_none("subset", list)
     return self.execution_engine.fillna(dfs[0], value=value, subset=subset)
Ejemplo n.º 3
0
 def process(self, dfs: DataFrames) -> DataFrame:
     """Drop the columns listed in the ``columns`` parameter from the input.

     When the ``if_exists`` parameter is truthy, the requested names are
     first intersected with the keys of the input schema, so only names
     present in the schema are passed to ``drop``.

     :param dfs: input dataframes; exactly one is expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
         not exactly one input
     :return: the result of calling ``drop`` on the input dataframe
     """
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     only_existing = self.params.get("if_exists", False)
     to_drop = self.params.get_or_throw("columns", list)
     frame = dfs[0]
     if only_existing:
         present = frame.schema.keys()
         to_drop = set(to_drop).intersection(present)
     return frame.drop(list(to_drop))
Ejemplo n.º 4
0
 def process(self, dfs: DataFrames) -> DataFrame:
     """Run the execution engine's ``select`` on the single input dataframe.

     ``columns`` is a required parameter; ``where`` and ``having`` are
     optional and are passed as ``None`` when absent from the parameters.

     :param dfs: input dataframes; exactly one is expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
         not exactly one input
     :return: the result of ``self.execution_engine.select``
     """
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     selected = self.params.get_or_throw("columns", ColumnsSelect)
     row_filter = self.params["where"] if "where" in self.params else None
     group_filter = self.params["having"] if "having" in self.params else None
     return self.execution_engine.select(
         df=dfs[0], cols=selected, where=row_filter, having=group_filter
     )
Ejemplo n.º 5
0
 def process(self, dfs: DataFrames) -> DataFrame:
     """Run the execution engine's ``sample`` on the single input dataframe.

     ``n``, ``frac`` and ``seed`` are optional parameters (``None`` when
     missing); ``replace`` defaults to ``False``.

     :param dfs: input dataframes; exactly one is expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
         not exactly one input
     :return: the result of ``self.execution_engine.sample``
     """
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     options = {
         "n": self.params.get_or_none("n", int),
         "frac": self.params.get_or_none("frac", float),
         "replace": self.params.get("replace", False),
         "seed": self.params.get_or_none("seed", int),
     }
     return self.execution_engine.sample(dfs[0], **options)
Ejemplo n.º 6
0
 def run(self, *args: Any, **kwargs: Any) -> None:
     """Run the stored workflow spec on the existing workflow context.

     No arguments are accepted. While holding ``self._lock`` the
     ``_computed`` flag is cleared, the spec is run with an empty dict as
     the second argument, and the flag is set again once the run returns.

     :raises FugueWorkflowError: (via ``assert_or_throw``) when any
         positional or keyword argument is supplied
     """
     called_without_arguments = not args and not kwargs
     assert_or_throw(
         called_without_arguments,
         FugueWorkflowError(
             "can't reset workflow context in _FugueInteractiveWorkflow"
         ),
     )
     with self._lock:
         # Cleared first so that a run which raises leaves the flag False.
         self._computed = False
         self._workflow_ctx.run(self._spec, {})
         self._computed = True
Ejemplo n.º 7
0
 def process(self, dfs: DataFrames) -> DataFrame:
     """Run the execution engine's ``take`` on the single input dataframe.

     ``n`` and ``presort`` are optional parameters (``None`` when missing),
     ``na_position`` defaults to ``"last"``, and the partition spec of this
     object is forwarded as well.

     :param dfs: input dataframes; exactly one is expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
         not exactly one input
     :return: the result of ``self.execution_engine.take``
     """
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     # NOTE: per the original author, all _get_or operations convert float
     # to int.
     row_count = self.params.get_or_none("n", int)
     options = {
         "presort": self.params.get_or_none("presort", str),
         "na_position": self.params.get("na_position", "last"),
         "partition_spec": self.partition_spec,
     }
     return self.execution_engine.take(dfs[0], row_count, **options)
Ejemplo n.º 8
0
    def process(self, dfs: DataFrames) -> None:
        """Save the single input dataframe through the execution engine.

        ``path`` is a required parameter. ``fmt`` (default ``""``), ``mode``
        (default ``"overwrite"``) and ``single`` (default ``False``) are
        optional, and the ``params`` dict (default empty) is expanded into
        extra keyword arguments of ``save_df``.

        :param dfs: input dataframes; exactly one is expected
        :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
            not exactly one input
        """
        assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
        settings = self.params
        extra_options = settings.get("params", {})
        target = settings.get_or_throw("path", str)
        fmt = settings.get("fmt", "")
        write_mode = settings.get("mode", "overwrite")
        spec = self.partition_spec
        single_file = settings.get("single", False)

        self.execution_engine.save_df(
            df=dfs[0],
            path=target,
            format_hint=fmt,
            mode=write_mode,
            partition_spec=spec,
            force_single=single_file,
            **extra_options,
        )
Ejemplo n.º 9
0
    def get_result(self, df: WorkflowDataFrame) -> DataFrame:
        """Fetch the computed result of a dataframe defined in this dag.

        Only valid once the workflow has been computed (after :meth:`~.run`).
        The result is looked up in the workflow context by the ``id`` of the
        task attached to ``df``.

        :param df: a dataframe defined in the dag
        :raises FugueWorkflowError: (via ``assert_or_throw``) when the
            workflow has not been computed
        :return: a calculated dataframe

        :Examples:

        .. code-block:: python

            dag = FugueWorkflow()
            df1 = dag.df([[0]],"a:int")
            dag.run()
            dag.get_result(df1).show()
        """
        assert_or_throw(self._computed, FugueWorkflowError("not computed"))
        task_key = id(df._task)
        return self._workflow_ctx.get_result(task_key)
Ejemplo n.º 10
0
 def __init__(
     self,
     input_n: int,
     outputter: Any,
     params: Any,
     pre_partition: Any = None,
     deterministic: bool = True,
     lazy: bool = False,
     input_names: Optional[List[str]] = None,
 ):
     """Wrap an outputter in a task with at least one input.

     ``outputter`` is converted with ``_to_outputter``; the result is given
     a ``ParamDict`` built from ``params`` and a ``PartitionSpec`` built from
     ``pre_partition``, then ``validate_on_compile`` is called on it before
     the base class is initialised with ``output_n=1``.

     :param input_n: number of inputs, must be positive
     :param outputter: object convertible by ``_to_outputter``
     :param params: parameters for the outputter and the base class
     :param pre_partition: value used to build the ``PartitionSpec``
     :param deterministic: forwarded to the base class
     :param lazy: forwarded to the base class
     :param input_names: forwarded to the base class
     :raises FugueWorkflowError: (via ``assert_or_throw``) when ``input_n``
         is not positive
     """
     has_input = input_n > 0
     assert_or_throw(has_input, FugueWorkflowError("must have at least one input"))
     self._outputter = _to_outputter(outputter)
     # Configure the outputter before the base initialiser runs, keeping the
     # original statement order.
     extension = self._outputter
     extension._params = ParamDict(params)
     extension._partition_spec = PartitionSpec(pre_partition)
     extension.validate_on_compile()
     base_options = {
         "params": params,
         "input_n": input_n,
         "output_n": 1,
         "deterministic": deterministic,
         "lazy": lazy,
         "input_names": input_names,
     }
     super().__init__(**base_options)
Ejemplo n.º 11
0
 def process(self, dfs: DataFrames) -> DataFrame:
     """Index the single input dataframe with the ``columns`` parameter.

     :param dfs: input dataframes; exactly one is expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
         not exactly one input
     :return: the input dataframe indexed by the required ``columns`` list
     """
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     wanted = self.params.get_or_throw("columns", list)
     frame = dfs[0]
     return frame[wanted]
Ejemplo n.º 12
0
 def process(self, dfs: DataFrames) -> DataFrame:
     """Run the execution engine's ``aggregate`` on the single input.

     The required ``columns`` parameter is passed as ``agg_cols`` together
     with the partition spec of this object.

     :param dfs: input dataframes; exactly one is expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
         not exactly one input
     :return: the result of ``self.execution_engine.aggregate``
     """
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     aggregations = self.params.get_or_throw("columns", list)
     engine = self.execution_engine
     return engine.aggregate(
         df=dfs[0],
         partition_spec=self.partition_spec,
         agg_cols=aggregations,
     )
Ejemplo n.º 13
0
 def process(self, dfs: DataFrames) -> DataFrame:
     """Run the execution engine's ``filter`` on the single input dataframe.

     :param dfs: input dataframes; exactly one is expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
         not exactly one input
     :return: the result of ``self.execution_engine.filter`` using the
         required ``condition`` parameter
     """
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     predicate = self.params.get_or_throw("condition", ColumnExpr)
     engine = self.execution_engine
     return engine.filter(df=dfs[0], condition=predicate)
Ejemplo n.º 14
0
 def single_output_expression(self) -> str:
     """Build the ``<name>.<output key>`` expression of the only output.

     :raises FugueWorkflowError: (via ``assert_or_throw``) when this object
         does not have exactly one output
     :return: ``self.name``, a dot, and the key at index 0 of the outputs
     """

     def _not_single() -> FugueWorkflowError:
         # Built lazily, so the message is only formatted when needed.
         return FugueWorkflowError(f"{self.name} does not have single output")

     assert_or_throw(len(self.outputs) == 1, _not_single)
     output_key = self.outputs.get_key_by_index(0)
     return self.name + "." + output_key
Ejemplo n.º 15
0
 def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
     """Always raise: this workflow cannot be used in a ``with`` statement.

     :raises FugueWorkflowError: unconditionally
     """
     reason = "with statement is invalid for _FugueInteractiveWorkflow"
     raise FugueWorkflowError(reason)  # pragma: no cover
Ejemplo n.º 16
0
 def __enter__(self):
     """Always raise: this workflow cannot be used in a ``with`` statement.

     :raises FugueWorkflowError: unconditionally
     """
     reason = "with statement is invalid for _FugueInteractiveWorkflow"
     raise FugueWorkflowError(reason)
Ejemplo n.º 17
0
 def process(self, dfs: DataFrames) -> DataFrame:
     """Run the execution engine's ``distinct`` on the single input dataframe.

     :param dfs: input dataframes; exactly one is expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when there is
         not exactly one input
     :return: the result of ``self.execution_engine.distinct``
     """
     has_single_input = len(dfs) == 1
     assert_or_throw(has_single_input, FugueWorkflowError("not single input"))
     engine = self.execution_engine
     return engine.distinct(dfs[0])
Ejemplo n.º 18
0
 def process(self, dfs: DataFrames) -> None:
     """Fail if any later input is reported equal to the first input.

     The first dataframe is compared with each of the others using
     ``_df_eq`` (called with ``throw=False`` and ``self.params`` expanded as
     keyword arguments).

     :param dfs: input dataframes; at least two are expected
     :raises FugueWorkflowError: (via ``assert_or_throw``) when fewer than
         two inputs are given
     :raises AssertionError: when ``_df_eq`` returns a truthy value for the
         first dataframe and any other one
     """
     assert_or_throw(
         len(dfs) > 1, FugueWorkflowError("can't accept single input"))
     expected = dfs[0]
     for i in range(1, len(dfs)):
         # Raised explicitly instead of using ``assert``: assert statements
         # are removed under ``python -O``, which would silently turn this
         # whole check into a no-op.
         if _df_eq(expected, dfs[i], throw=False, **self.params):
             raise AssertionError(
                 f"dataframe at index {i} equals dataframe at index 0"
             )