Example 1
 def visitFugueModuleTask(self, ctx: fp.FugueModuleTaskContext) -> None:
     data = self.get_dict(ctx, "assign", "dfs", "using", "params")
     sub = _to_module(
         data["using"],
         global_vars=self.global_vars,
         local_vars=self.local_vars,
     )
     varname = data["assign"][0] if "assign" in data else None
     if varname is not None:
         assert_or_throw(
             sub.has_single_output or sub.has_multiple_output,
             FugueSQLSyntaxError(
                 "invalid assignment for module without output"),
         )
     if sub.has_input:
         dfs = data["dfs"] if "dfs" in data else WorkflowDataFrames(
             self.last)
     else:
         dfs = WorkflowDataFrames()
     p = data["params"] if "params" in data else {}
     if sub.has_dfs_input:
         result = sub(dfs, **p)
     elif len(dfs) == 0:
         result = sub(self.workflow, **p)
     elif len(dfs) == 1 or not dfs.has_key:
         result = sub(*list(dfs.values()), **p)
     else:
         result = sub(**dfs, **p)
     if sub.has_single_output or sub.has_multiple_output:
         self.variables[varname] = result
     if sub.has_single_output:
         self._last = result
Example 2
def str_to_type(
    s: str,
    expected_base_type: Optional[type] = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
) -> type:
    """Given a string expression, find the first/last type from all import libraries.
    If the expression contains `.`, it's supposed to be a relative or full path of
    the type including modules.

    :param s: type expression, for example `triad.utils.iter.Slicer` or `str`
    :param expected_base_type: the base class that the resolved type must be
      a subclass of, defaults to None
    :param global_vars: overriding global variables, if None, it will
      use the caller's globals(), defaults to None
    :param local_vars: overriding local variables, if None, it will
      use the caller's locals(), defaults to None

    :raises TypeError: unable to find a matching type

    :return: found type
    """
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    try:
        obj = str_to_object(s, global_vars, local_vars)
    except ValueError:
        raise TypeError(f"{s} is not a type")
    assert_or_throw(isinstance(obj, type), TypeError(f"{obj} is not a type"))
    assert_or_throw(
        expected_base_type is None or issubclass(obj, expected_base_type),
        TypeError(f"{obj} is not a subtype of {expected_base_type}"),
    )
    return obj
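A minimal usage sketch for str_to_type, assuming the function above is in scope (in triad it lives in triad.utils.convert) and that builtin names resolve through the caller's scope:

# Resolve a builtin type by name.
assert str_to_type("str") is str

# expected_base_type constrains the result: int is a subclass of object...
assert str_to_type("int", expected_base_type=object) is int

# ...but not of str, so this raises TypeError.
try:
    str_to_type("int", expected_base_type=str)
except TypeError:
    pass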
Example 3
def to_function(
    func: Any,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
) -> Any:  # noqa: C901
    """For an expression, it tries to find the matching function.

    :params s: a string expression or a callable
    :param global_vars: overriding global variables, if None, it will
      use the caller's globals(), defaults to None
    :param local_vars: overriding local variables, if None, it will
      use the caller's locals(), defaults to None

    :raises AttributeError: if unable to find such a function

    :return: the matching function
    """
    if isinstance(func, str):
        global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
        try:
            func = str_to_object(func, global_vars, local_vars)
        except ValueError:
            raise AttributeError(f"{func} is not a function")
    assert_or_throw(
        callable(func) and not isinstance(func, six.class_types),
        AttributeError(f"{func} is not a function"),
    )
    return func
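A minimal usage sketch for to_function, assuming the function above is in scope (in triad: triad.utils.convert.to_function):

def greet(name: str) -> str:
    return f"hello {name}"

# A callable passes through unchanged.
assert to_function(greet) is greet

# A string expression resolves against the caller's variables.
assert to_function("greet")("world") == "hello world"

# Classes are callable but are rejected by the six.class_types check above.
try:
    to_function(str)
except AttributeError:
    pass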
Example 4
    def from_func(func: Callable, schema: Any,
                  validation_rules: Dict[str, Any]) -> "_FuncAsCoTransformer":
        assert_or_throw(
            len(validation_rules) == 0,
            NotImplementedError(
                "CoTransformer does not support validation rules"),
        )

        if schema is None:
            schema = parse_output_schema_from_comment(func)
        if isinstance(schema, Schema):  # to be less strict on determinism
            schema = str(schema)
        if isinstance(schema, str):
            assert_or_throw(
                "*" not in schema,
                FugueInterfacelessError(
                    "* can't be used on cotransformer output schema"),
            )
        assert_arg_not_none(schema, "schema")
        tr = _FuncAsCoTransformer()
        tr._wrapper = FunctionWrapper(  # type: ignore
            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$")
        tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
        tr._output_schema_arg = schema  # type: ignore
        tr._validation_rules = {}  # type: ignore
        tr._uses_callback = "f" in tr._wrapper.input_code.lower(
        )  # type: ignore
        tr._requires_callback = "F" in tr._wrapper.input_code  # type: ignore
        return tr
Example 5
def _validate_callback(ctx: Any) -> None:
    if ctx._requires_callback:
        assert_or_throw(
            ctx.has_callback,
            FugueInterfacelessError(
                f"Callback is required but not provided: {ctx}"),
        )
Example 6
 def fillna(
     self,
     df: DataFrame,
     value: Any,
     subset: Optional[List[str]] = None,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(
         (not isinstance(value, list)) and (value is not None),
         ValueError("fillna value can not be a list or None"),
     )
     if isinstance(value, dict):
         assert_or_throw(
             (None not in value.values()) and (any(value.values())),
             ValueError(
                 "fillna dict can not contain None and needs at least one value"
             ),
         )
         mapping = value
     else:
         # If subset is none, apply to all columns
         subset = subset or df.schema.names
         mapping = {col: value for col in subset}
     d = self.to_df(df).native.fillna(mapping)
     return self.to_df(d, df.schema, metadata)
Example 7
    def take(
        self,
        df: DataFrame,
        n: int,
        presort: str,
        na_position: str = "last",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        metadata: Any = None,
    ) -> DataFrame:
        assert_or_throw(
            isinstance(n, int),
            ValueError("n needs to be an integer"),
        )
        d = self.to_df(df).native
        nulls_last = bool(na_position == "last")

        if presort:
            presort = parse_presort_exp(presort)
        # Use presort over partition_spec.presort if possible
        _presort: IndexedOrderedDict = presort or partition_spec.presort

        def _presort_to_col(_col: str, _asc: bool) -> Any:
            if nulls_last:
                if _asc:
                    return col(_col).asc_nulls_last()
                else:
                    return col(_col).desc_nulls_last()
            else:
                if _asc:
                    return col(_col).asc_nulls_first()
                else:
                    return col(_col).desc_nulls_first()

        # If no partition
        if len(partition_spec.partition_by) == 0:
            if len(_presort.keys()) > 0:
                d = d.orderBy(
                    [_presort_to_col(_col, _presort[_col]) for _col in _presort.keys()]
                )
            d = d.limit(n)

        # If partition exists
        else:
            w = Window.partitionBy([col(x) for x in partition_spec.partition_by])

            if len(_presort.keys()) > 0:
                w = w.orderBy(
                    [_presort_to_col(_col, _presort[_col]) for _col in _presort.keys()]
                )
            else:
                # row_number() still needs an orderBy
                w = w.orderBy(lit(1))

            d = (
                d.select(col("*"), row_number().over(w).alias("__row_number__"))
                .filter(col("__row_number__") <= n)
                .drop("__row_number__")
            )

        return self.to_df(d, df.schema, metadata)
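The partitioned branch above is the standard "top-n per group" pattern in Spark; a standalone PySpark sketch of just that pattern, with hypothetical data and column names:

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 3), ("a", 1), ("b", 2), ("b", 5)], ["k", "v"])

# Rank rows inside each group, keep the first n, drop the helper column.
w = Window.partitionBy("k").orderBy(col("v").asc_nulls_last())
top1 = (
    df.select(col("*"), row_number().over(w).alias("__row_number__"))
    .filter(col("__row_number__") <= 1)
    .drop("__row_number__")
)
top1.show()  # one row per key: ("a", 1) and ("b", 2)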
Example 8
 def _process_assignable(self, df: WorkflowDataFrame, ctx: Tree):
     data = self.get_dict(ctx, "assign", "persist", "broadcast")
     if "assign" in data:
         varname, sign = data["assign"]
     else:
         varname, sign = None, None
     need_checkpoint = sign == "??"
     if "persist" in data:
         is_checkpoint, value = data["persist"]
         if need_checkpoint or is_checkpoint:
             assert_or_throw(
                 is_checkpoint,
                 FugueSQLSyntaxError(
                     "can't persist when checkpoint is specified"),
             )
             df = df.checkpoint(value)
         else:
             df = df.persist(value)
     elif need_checkpoint:
         df = df.checkpoint()
     if "broadcast" in data:
         df = df.broadcast()
     if varname is not None:
         self.variables[varname] = df
     self._last = df
Example 9
 def _sql(self, code: str, *args: Any,
          **kwargs: Any) -> Dict[str, WorkflowDataFrame]:
     # TODO: move dict construction to triad
     params: Dict[str, Any] = {}
     for a in args:
         assert_or_throw(isinstance(a, Dict),
                         f"args can only have dict: {a}")
         params.update(a)
     params.update(kwargs)
     code = fill_sql_template(code, params)
     sql = FugueSQL(
         code,
         "fugueLanguage",
         ignore_case=self.conf.get_or_throw("fugue.sql.compile.ignore_case",
                                            bool),
         simple_assign=self.conf.get_or_throw(
             "fugue.sql.compile.simple_assign", bool),
     )
     dfs = {
         k: v
         for k, v in params.items() if isinstance(v, WorkflowDataFrame)
     }
     v = _Extensions(sql, FugueSQLHooks(), self, dfs)
     v.visit(sql.tree)
     return v.variables
Example 10
 def _sql(self, code: str, *args: Any,
          **kwargs: Any) -> Dict[str, WorkflowDataFrame]:
     # TODO: move dict construction to triad
     params: Dict[str, Any] = {}
     for a in args:
         assert_or_throw(isinstance(a, Dict),
                         f"args can only have dict: {a}")
         params.update(a)
     params.update(kwargs)
     template_params = dict(params)
     if "self" in template_params:
         del template_params["self"]
     code = fill_sql_template(code, template_params)
     sql = FugueSQL(
         code,
         "fugueLanguage",
         ignore_case=self.conf.get_or_throw(FUGUE_SQL_CONF_IGNORE_CASE,
                                            bool),
         simple_assign=self.conf.get_or_throw(FUGUE_SQL_CONF_SIMPLE_ASSIGN,
                                              bool),
     )
     dfs = {
         k: v
         for k, v in params.items() if isinstance(v, WorkflowDataFrame)
     }
     v = _Extensions(sql, FugueSQLHooks(), self, dfs, local_vars=params)
     v.visit(sql.tree)
     return v.variables
Example 11
 def from_func(func: Callable, schema: Any) -> "_FuncAsCreator":
     # pylint: disable=W0201
     if schema is None:
         schema = parse_output_schema_from_comment(func)
     tr = _FuncAsCreator()
     tr._wrapper = FunctionWrapper(func, "^e?x*z?$",
                                   "^[dlspq]$")  # type: ignore
     tr._need_engine = tr._wrapper.input_code.startswith("e")
     tr._need_output_schema = "s" == tr._wrapper.output_code
     tr._output_schema = Schema(schema)
     if len(tr._output_schema) == 0:
         assert_or_throw(
             not tr._need_output_schema,
             FugueInterfacelessError(
                 f"schema must be provided for return type {tr._wrapper._rt}"
             ),
         )
     else:
         assert_or_throw(
             tr._need_output_schema,
             FugueInterfacelessError(
                 f"schema must not be provided for return type {tr._wrapper._rt}"
             ),
         )
     return tr
Example 12
    def assert_not_empty(self) -> None:
        """Assert this dataframe is not empty

        :raises FugueDataFrameEmptyError: if it is empty
        """

        assert_or_throw(not self.empty, FugueDataFrameEmptyError("dataframe is empty"))
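Every example in this list goes through assert_or_throw; a simplified sketch of its contract (an approximation for readability, not triad's actual implementation):

from typing import Any

def assert_or_throw(bool_exp: bool, exception: Any = None) -> None:
    # Approximation of triad.utils.assertion.assert_or_throw.
    if bool_exp:
        return
    if callable(exception) and not isinstance(exception, Exception):
        exception = exception()  # lazy form: built only on failure
    if isinstance(exception, Exception):
        raise exception
    raise AssertionError() if exception is None else AssertionError(str(exception))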
Example 13
def to_validation_rules(data: Dict[str, Any]) -> Dict[str, Any]:
    res: Dict[str, Any] = {}
    for k, v in data.items():
        if k in ["partitionby_has", "partitionby_is"]:
            if isinstance(v, str):
                v = [x.strip() for x in v.split(",")]
            res[k] = PartitionSpec(by=v).partition_by
        elif k in ["presort_has", "presort_is"]:
            res[k] = list(parse_presort_exp(v).items())
        elif k in ["input_has"]:
            if isinstance(v, str):
                res[k] = v.replace(" ", "").split(",")
            else:
                assert_or_throw(
                    isinstance(v, list),
                    lambda: SyntaxError(f"{v} is neither a string or a list"),
                )
                res[k] = [x.replace(" ", "") for x in v]
        elif k in ["input_is"]:
            try:
                res[k] = str(Schema(v))
            except SyntaxError:
                raise SyntaxError(  # pylint: disable=W0707
                    f"for input_is, the input must be a schema expression {v}")
        else:
            raise NotImplementedError(k)
    return res
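A minimal usage sketch for to_validation_rules, assuming it and its helpers (PartitionSpec, parse_presort_exp, Schema) are in scope:

# Comma strings are whitespace-stripped and split for input_has.
rules = to_validation_rules({"input_has": "a , b"})
assert rules["input_has"] == ["a", "b"]

# partitionby_has is normalized through PartitionSpec into a column list.
print(to_validation_rules({"partitionby_has": "x,y"})["partitionby_has"])

# Unknown keys are rejected.
try:
    to_validation_rules({"bad_key": 1})
except NotImplementedError:
    pass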
Example 14
def to_cast_expression(schema1: Any, schema2: Any,
                       allow_name_mismatch: bool) -> Tuple[bool, List[str]]:
    schema1 = to_spark_schema(schema1)
    schema2 = to_spark_schema(schema2)
    assert_or_throw(
        len(schema1) == len(schema2),
        lambda: ValueError(f"schema mismatch: {schema1}, {schema2}"),
    )
    expr: List[str] = []
    has_cast = False
    for i in range(len(schema1)):
        name_match = schema1[i].name == schema2[i].name
        assert_or_throw(
            name_match or allow_name_mismatch,
            lambda: ValueError(f"schema name mismatch: {schema1}, {schema2}"),
        )
        if schema1[i].dataType != schema2[i].dataType:
            type2 = schema2[i].dataType.simpleString()
            if isinstance(schema1[i].dataType,
                          pt.FractionalType) and isinstance(
                              schema2[i].dataType, pt.StringType):
                expr.append(
                    f"CAST(IF(isnan({schema1[i].name}), NULL, {schema1[i].name})"
                    f" AS {type2}) {schema2[i].name}")
            else:
                expr.append(
                    f"CAST({schema1[i].name} AS {type2}) {schema2[i].name}")
            has_cast = True
        else:
            if schema1[i].name != schema2[i].name:
                expr.append(f"{schema1[i].name} AS {schema2[i].name}")
                has_cast = True
            else:
                expr.append(schema1[i].name)
    return has_cast, expr
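A hedged illustration of the result, assuming to_spark_schema accepts schema expressions such as "a:int,b:str" (the values below follow mechanically from the logic above; LongType's simpleString() is "bigint"):

has_cast, expr = to_cast_expression("a:int,b:str", "a:long,b:str", False)
# has_cast -> True
# expr -> ["CAST(a AS bigint) a", "b"]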
Example 15
def save_df(
    df: DaskDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    mode: str = "overwrite",
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> None:
    assert_or_throw(
        mode in ["overwrite", "error"],
        lambda: NotImplementedError(f"{mode} is not supported"),
    )
    p = FileParser(uri, format_hint).assert_no_glob()
    if fs is None:
        fs = FileSystem()
    if fs.exists(uri):
        assert_or_throw(mode == "overwrite", FileExistsError(uri))
        try:
            fs.remove(uri)
        except Exception:
            try:
                fs.removetree(uri)
            except Exception:  # pragma: no cover
                pass
    _FORMAT_SAVE[p.file_format](df, p, **kwargs)
Example 16
def to_kv_iterable(  # noqa: C901
        data: Any, none_as_empty: bool = True) -> Iterable[Tuple[Any, Any]]:
    """Convert data to iterable of key value pairs

    :param data: input object, it can be a dict or Iterable[Tuple[Any, Any]]
        or Iterable[List[Any]]
    :param none_as_empty: whether to treat None as an empty iterable

    :raises ValueError: if input is None and `none_as_empty==False`
    :raises TypeError or ValueError: if input data type is not acceptable

    :yield: key-value pairs as tuples
    """
    if data is None:
        assert_or_throw(none_as_empty, ValueError("data can't be None"))
    elif isinstance(data, Dict):
        for k, v in data.items():
            yield k, v
    elif isinstance(data, Iterable):
        ei = make_empty_aware(data)
        if not ei.empty:
            first = ei.peek()
            if isinstance(first, tuple):
                for k, v in ei:
                    yield k, v
            elif isinstance(first, List):
                for arr in ei:
                    if len(arr) == 2:
                        yield arr[0], arr[1]
                    else:
                        raise TypeError(f"{arr} is not an acceptable item")
            else:
                raise TypeError(f"{first} is not an acceptable item")
    else:
        raise TypeError(f"{type(data)} is not supported")
Example 17
 def yielded_file(self) -> YieldedFile:
     assert_or_throw(
         self.permanent,
         lambda: FugueWorkflowCompileError(
             f"yield is not allowed for {self}"),
     )
     return self._yielded
Example 18
    def __init__(self, path: str, format_hint: Optional[str] = None):
        last = len(path)
        has_glob = False
        for i in range(len(path)):
            if path[i] in ["/", "\\"]:
                last = i
            if path[i] in ["*", "?"]:
                has_glob = True
                break
        if not has_glob:
            self._uri = urlparse(path)
            self._glob_pattern = ""
            self._path = self._uri.path
        else:
            self._uri = urlparse(path[:last])
            self._glob_pattern = path[last + 1:]
            self._path = pfs.path.join(self._uri.path, self._glob_pattern)

        if format_hint is None or format_hint == "":
            for k, v in _FORMAT_MAP.items():
                if self.suffix.endswith(k):
                    self._format = v
                    return
            raise NotImplementedError(f"{self.suffix} is not supported")
        else:
            assert_or_throw(
                format_hint in _FORMAT_MAP.values(),
                NotImplementedError(f"{format_hint} is not supported"),
            )
            self._format = format_hint
Example 19
 def _sql(
     self, code: str, *args: Any, **kwargs: Any
 ) -> Dict[str, Tuple[WorkflowDataFrame, WorkflowDataFrames,
                      LazyWorkflowDataFrame]]:
     # TODO: move dict construction to triad
     params: Dict[str, Any] = {}
     for a in args:
         assert_or_throw(isinstance(a, Dict),
                         lambda: f"args can only have dict: {a}")
         params.update(a)
     params.update(kwargs)
     params, dfs = self._split_params(params)
     code = fill_sql_template(code, params)
     sql = FugueSQL(
         code,
         "fugueLanguage",
         ignore_case=self.conf.get_or_throw(FUGUE_SQL_CONF_IGNORE_CASE,
                                            bool),
         simple_assign=self.conf.get_or_throw(FUGUE_SQL_CONF_SIMPLE_ASSIGN,
                                              bool),
     )
     v = _Extensions(
         sql,
         FugueSQLHooks(),
         self,
         dfs,
         local_vars=params  # type: ignore
     )
     v.visit(sql.tree)
     return v.variables
Example 20
 def from_func(func: Callable, schema: Any,
               validation_rules: Dict[str, Any]) -> "_FuncAsProcessor":
     if schema is None:
         schema = parse_output_schema_from_comment(func)
     validation_rules.update(parse_validation_rules_from_comment(func))
     tr = _FuncAsProcessor()
     tr._wrapper = FunctionWrapper(func, "^e?(c|[dlspq]+)x*z?$",
                                   "^[dlspq]$")  # type: ignore
     tr._engine_param = (tr._wrapper._params.get_value_by_index(0) if
                         tr._wrapper.input_code.startswith("e") else None)
     tr._use_dfs = "c" in tr._wrapper.input_code
     tr._need_output_schema = tr._wrapper.need_output_schema
     tr._validation_rules = validation_rules
     tr._output_schema = Schema(schema)
     if len(tr._output_schema) == 0:
         assert_or_throw(
             tr._need_output_schema is None or not tr._need_output_schema,
             FugueInterfacelessError(
                 f"schema must be provided for return type {tr._wrapper._rt}"
             ),
         )
     else:
         assert_or_throw(
             tr._need_output_schema is None or tr._need_output_schema,
             FugueInterfacelessError(
                 f"schema must not be provided for return type {tr._wrapper._rt}"
             ),
         )
     return tr
Example 21
 def _parse_function(
     self,
     func: Callable,
     params_re: str = ".*",
     return_re: str = ".*"
 ) -> Tuple[bool, IndexedOrderedDict[str, "_FuncParam"], "_FuncParam"]:
     sig = inspect.signature(func)
     annotations = get_type_hints(func)
     res: IndexedOrderedDict[str, "_FuncParam"] = IndexedOrderedDict()
     class_method = False
     for k, w in sig.parameters.items():
         if k == "self":
             res[k] = _SelfParam(w)
             class_method = True
         else:
             anno = annotations.get(k, w.annotation)
             res[k] = self._parse_param(anno, w)
     anno = annotations.get("return", sig.return_annotation)
     rt = self._parse_param(anno, None, none_as_other=False)
     params_str = "".join(x.code for x in res.values())
     assert_or_throw(re.match(params_re, params_str),
                     TypeError(f"Input types not valid {res}"))
     assert_or_throw(re.match(return_re, rt.code),
                     TypeError(f"Return type not valid {rt}"))
     return class_method, res, rt
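The first half of the method is plain introspection; a standalone sketch of that step with a hypothetical function (no _FuncParam machinery):

import inspect
from typing import get_type_hints

def add(self, a: int, b: str = "x") -> bool:
    return True

sig = inspect.signature(add)
annotations = get_type_hints(add)
for name, param in sig.parameters.items():
    # Prefer resolved type hints, fall back to the raw annotation.
    anno = annotations.get(name, param.annotation)
    print(name, anno)
print("return:", annotations.get("return", sig.return_annotation))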
Example 22
def parse_output_schema_from_comment(func: Callable) -> Optional[str]:
    """Parse schema hint from the comments above the function. It try to find
    comment lines starts with `schema:` from bottom up, and will use the first
    occurrance as the hint.

    :param func: the function
    :return: schema hint string

    :Example:

    .. code-block:: python

        # schema: a:int,b:str
        #schema:a:int,b:int # more comment
        # some comment
        def dummy():
            pass

        assert "a:int,b:int" == parse_output_schema_from_comment(dummy)
    """
    res = parse_comment_annotation(func, _COMMENT_SCHEMA_ANNOTATION)
    if res is None:
        return None
    assert_or_throw(res != "", SyntaxError("incorrect schema annotation"))
    return res.replace(" ", "")
Example 23
 def fillna(
     self,
     df: DataFrame,
     value: Any,
     subset: Optional[List[str]] = None,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(
         (not isinstance(value, list)) and (value is not None),
         ValueError("fillna value can not None or a list"),
     )
     if isinstance(value, dict):
         assert_or_throw(
             (None not in value.values()) and (any(value.values())),
             ValueError(
                 "fillna dict can not contain None and needs at least one value"
             ),
         )
         mapping = value
     else:
         # If subset is none, apply to all columns
         subset = subset or df.schema.names
         mapping = {col: value for col in subset}
     d = df.as_pandas().fillna(mapping, inplace=False)
     return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
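A standalone pandas sketch of the dict-based fillna used above (hypothetical data and column names):

import pandas as pd

df = pd.DataFrame({"a": [1.0, None], "b": ["x", None]})
mapping = {"a": 0.0, "b": ""}  # one fill value per column
out = df.fillna(mapping, inplace=False)
assert out.isna().sum().sum() == 0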
Example 24
 def to_output_df(self, output: LocalDataFrame, schema: Any,
                  ctx: Any) -> DataFrame:
     assert_or_throw(
         schema is None or output.schema == schema,
         lambda: f"Output schema mismatch {output.schema} vs {schema}",
     )
     return output
Example 25
 def validate_on_compile(self):
     n = self.params.get_or_none("n", int)
     frac = self.params.get_or_none("frac", float)
     assert_or_throw(
         (n is None and frac is not None) or (n is not None and frac is None),
         ValueError("one and only one of n and frac should be set"),
     )
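The condition above is an exclusive-or on None-ness; an equivalent, more compact formulation (illustration only):

n, frac = None, 0.5
assert (n is None) != (frac is None)  # exactly one of the two is set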
Example 26
 def process(self, dfs: DataFrames) -> DataFrame:
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     if_exists = self.params.get("if_exists", False)
     columns = self.params.get_or_throw("columns", list)
     if if_exists:
         columns = set(columns).intersection(dfs[0].schema.keys())
     return dfs[0].drop(list(columns))
Example 27
 def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
     data = df.as_array(type_safe=True)
     assert_or_throw(
         len(data) == 1,
         FugueBug("each comap partition can have one and only one row"),
     )
     dfs = DataFrames(list(self._get_dfs(data[0])))
     return self.func(cursor, dfs)
Example 28
 def _append(self, value: Any):
     assert_or_throw(not self.has_key,
                     InvalidOperationError("this DataFrames must have key"))
     assert_or_throw(
         isinstance(value, DataFrame),
         lambda: ValueError(f"{value} is not a DataFrame"),
     )
     super().__setitem__("_" + str(len(self)), value)
Example 29
 def process(self, dfs: DataFrames) -> DataFrame:
     assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
     columns = self.params.get_or_throw("columns", ColumnsSelect)
     where = None if "where" not in self.params else self.params["where"]
     having = None if "having" not in self.params else self.params["having"]
     return self.execution_engine.select(
         df=dfs[0], cols=columns, where=where, having=having
     )
Example 30
 def get_temp_file(self, file_id: str, permanent: bool) -> str:
     path = self._path if permanent else self._temp_path
     assert_or_throw(
         path != "",
         FugueWorkflowRuntimeError(
             f"{FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH} is not set"),
     )
     return os.path.join(path, file_id + ".parquet")