Example #1
def to_spark_schema(obj: Any) -> pt.StructType:
    assert_arg_not_none(obj, "schema")
    if isinstance(obj, pt.StructType):
        return obj
    if isinstance(obj, ps.DataFrame):
        return obj.schema
    return _from_arrow_schema(Schema(obj).pa_schema)
Example #2
def to_schema(obj: Any) -> Schema:
    assert_arg_not_none(obj, "obj")
    if isinstance(obj, pt.StructType):
        return Schema(_to_arrow_schema(obj))
    if isinstance(obj, ps.DataFrame):
        return to_schema(obj.schema)
    return Schema(obj)
Example #3
def to_taskspec(
        obj: Any,
        parent_workflow_spec: Optional[WorkflowSpec] = None) -> TaskSpec:
    assert_arg_not_none(obj, "obj")
    if isinstance(obj, str):
        return to_taskspec(json.loads(obj))
    if isinstance(obj, TaskSpec):
        return obj
    if isinstance(obj, Dict):
        d: Dict[str, Any] = dict(obj)
        node_spec: Optional[_NodeSpec] = None
        if "node_spec" in d:
            aot(
                parent_workflow_spec is not None,
                lambda: InvalidOperationError("parent workflow must be set"),
            )
            node_spec = _NodeSpec(
                workflow=parent_workflow_spec,
                **d["node_spec"]  # type: ignore
            )
            del d["node_spec"]
        if "tasks" in d:
            ts: TaskSpec = WorkflowSpec(**d)
        else:
            ts = TaskSpec(**d)
        if node_spec is not None:
            ts._node_spec = node_spec
        return ts
    raise TypeError(f"can't convert {obj} to TaskSpec")  # pragma: no cover
Example #4
    def from_func(func: Callable, schema: Any,
                  validation_rules: Dict[str, Any]) -> "_FuncAsCoTransformer":
        assert_or_throw(
            len(validation_rules) == 0,
            NotImplementedError(
                "CoTransformer does not support validation rules"),
        )

        if schema is None:
            schema = parse_output_schema_from_comment(func)
        if isinstance(schema, Schema):  # to be less strict on determinism
            schema = str(schema)
        if isinstance(schema, str):
            assert_or_throw(
                "*" not in schema,
                FugueInterfacelessError(
                    "* can't be used on cotransformer output schema"),
            )
        assert_arg_not_none(schema, "schema")
        tr = _FuncAsCoTransformer()
        tr._wrapper = FunctionWrapper(  # type: ignore
            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$")
        tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
        tr._output_schema_arg = schema  # type: ignore
        tr._validation_rules = {}  # type: ignore
        tr._uses_callback = "f" in tr._wrapper.input_code.lower(
        )  # type: ignore
        tr._requires_callback = "F" in tr._wrapper.input_code  # type: ignore
        return tr
Example #5
def to_local_df(df: Any, schema: Any = None, metadata: Any = None) -> LocalDataFrame:
    """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFrame, or
      list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None; it must not be set when
      ``df`` is a :class:`~fugue.dataframe.dataframe.DataFrame`
    :param metadata: dict-like object with string keys, defaults to None
    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
      but you set ``schema`` or ``metadata``
    :raises TypeError: if ``df`` is not compatible
    :return: the dataframe itself if it's
      :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one

    :Examples:

    >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
    >>> assert to_local_df(a) is a
    >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
    """
    assert_arg_not_none(df, "df")
    if isinstance(df, DataFrame):
        aot(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        return df.as_local()
    if isinstance(df, pd.DataFrame):
        return PandasDataFrame(df, schema, metadata)
    if isinstance(df, List):
        return ArrayDataFrame(df, schema, metadata)
    if isinstance(df, Iterable):
        return IterableDataFrame(df, schema, metadata)
    raise TypeError(f"{df} cannot convert to a LocalDataFrame")
Example #6
    def to_df(
        self, df: Any, schema: Any = None, metadata: Any = None
    ) -> SparkDataFrame:
        """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

        :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
          :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
          pandas DataFrame, or list or iterable of arrays
        :param schema: |SchemaLikeObject| or :class:`spark:pyspark.sql.types.StructType`,
          defaults to None
        :param metadata: |ParamsLikeObject|, defaults to None
        :return: engine compatible dataframe

        :Notice:

        * If the input is already a :class:`~fugue_spark.dataframe.SparkDataFrame`,
          it is returned as-is
        * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
          ``schema`` must be specified
        * When ``schema`` is not None, a type cast may happen to make the
          dataframe conform to that schema
        * All other methods in the engine can take arbitrary dataframes and
          call this method to convert them before doing anything else
        """
        if isinstance(df, DataFrame):
            assert_or_throw(
                schema is None and metadata is None,
                ValueError("schema and metadata must be None when df is a DataFrame"),
            )
            if isinstance(df, SparkDataFrame):
                return df
            if any(pa.types.is_struct(t) for t in df.schema.types):
                sdf = self.spark_session.createDataFrame(
                    df.as_array(type_safe=True), to_spark_schema(df.schema)
                )
            else:
                sdf = self.spark_session.createDataFrame(
                    df.as_pandas(), to_spark_schema(df.schema)
                )
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if isinstance(df, ps.DataFrame):
            return SparkDataFrame(
                df, None if schema is None else to_schema(schema), metadata
            )
        if isinstance(df, RDD):
            assert_arg_not_none(schema, "schema")
            sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
            return SparkDataFrame(sdf, to_schema(schema), metadata)
        if isinstance(df, pd.DataFrame):
            sdf = self.spark_session.createDataFrame(df)
            return SparkDataFrame(sdf, schema, metadata)

        # use arrow dataframe here to handle nulls in int cols
        adf = ArrowDataFrame(df, to_schema(schema))
        sdf = self.spark_session.createDataFrame(
            adf.as_array(), to_spark_schema(adf.schema)
        )
        return SparkDataFrame(sdf, adf.schema, metadata)
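
A rough usage sketch of the engine conversion above. ``SparkExecutionEngine`` and the
setup are assumed from ``fugue_spark``; constructor details may differ by version.

from pyspark.sql import SparkSession
from fugue_spark import SparkExecutionEngine  # assumed entry point

spark = SparkSession.builder.master("local[*]").getOrCreate()
engine = SparkExecutionEngine(spark)

# list input needs an explicit schema
sdf = engine.to_df([[0, "a"], [1, "b"]], "a:int,b:str")

# a SparkDataFrame passes through unchanged
assert engine.to_df(sdf) is sdf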
Example #7
    def from_func(func: Callable, schema: Any) -> "_FuncAsTransformer":
        if schema is None:
            schema = parse_output_schema_from_comment(func)
        if isinstance(schema, Schema):  # to be less strict on determinism
            schema = str(schema)
        assert_arg_not_none(schema, "schema")
        tr = _FuncAsTransformer()
        tr._wrapper = FunctionWrapper(func, "^[lsp]x*$",
                                      "^[lsp]$")  # type: ignore
        tr._output_schema_arg = schema  # type: ignore
        return tr
Example #8
def test_assert_arg_not_none():
    assert_arg_not_none(1)
    with raises(NoneArgumentError) as err:
        assert_arg_not_none(None, "a")
    assert "a can't be None" == err.value.args[0]
    with raises(NoneArgumentError) as err:
        assert_arg_not_none(None, "a", "b")
    assert "a can't be None" == err.value.args[0]
    with raises(NoneArgumentError) as err:
        assert_arg_not_none(None, "", msg="b")
    assert "b" == err.value.args[0]
    with raises(NoneArgumentError) as err:
        assert_arg_not_none(None, None, msg="b")
    assert "b" == err.value.args[0]
Example #9
    def get(self, key: Union[int, str], default: Any) -> Any:  # type: ignore
        """Get value by `key`, and the value must be a subtype of the type of `default`
        (which can't be None). If the `key` is not found, return `default`.

        :param key: the key to search
        :param default: the default value to return when `key` is not found; it also
          determines the expected value type and can't be None
        :raises NoneArgumentError: if `default` is None
        :raises TypeError: if the value can't be converted to the type of `default`

        :return: the value by `key`, which must be a subtype of the type of
          `default`; if `key` is not found, `default` is returned
        """
        assert_arg_not_none(default, "default")
        if (isinstance(key, str) and key in self) or isinstance(key, int):
            return as_type(self[key], type(default))
        return default
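
A small illustration of the intended behavior. This is only a sketch, assuming the
method belongs to triad's ``ParamDict``; the import path is an assumption.

from triad.collections.dict import ParamDict  # assumed host class

d = ParamDict({"a": "1"})

# the stored value is converted to the type of the default
assert d.get("a", 0) == 1
# a missing key returns the default
assert d.get("x", 5) == 5
# a None default is rejected by assert_arg_not_none
try:
    d.get("a", None)
except Exception:
    pass  # NoneArgumentError expected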
Example #10
    def from_func(func: Callable, schema: Any,
                  validation_rules: Dict[str, Any]) -> "_FuncAsTransformer":
        if schema is None:
            schema = parse_output_schema_from_comment(func)
        if isinstance(schema, Schema):  # to be less strict on determinism
            schema = str(schema)
        validation_rules.update(parse_validation_rules_from_comment(func))
        assert_arg_not_none(schema, "schema")
        tr = _FuncAsTransformer()
        tr._wrapper = FunctionWrapper(  # type: ignore
            func, "^[lspq][fF]?x*z?$", "^[lspq]$")
        tr._output_schema_arg = schema  # type: ignore
        tr._validation_rules = validation_rules  # type: ignore
        tr._uses_callback = "f" in tr._wrapper.input_code.lower(
        )  # type: ignore
        tr._requires_callback = "F" in tr._wrapper.input_code  # type: ignore
        return tr
Example #11
    def __setitem__(  # type: ignore
            self, name: str, value: Any, *args: List[Any],
            **kwds: Dict[str, Any]) -> None:
        assert_arg_not_none(value, "value")
        if not validate_column_name(name):
            raise SchemaError(f"Invalid column name {name}")
        if name in self:  # update existing value is not allowed
            raise SchemaError(f"{name} already exists in {self}")
        if isinstance(value, pa.Field):
            assert_or_throw(name == value.name,
                            SchemaError(f"{name} doesn't match {value}"))
        elif isinstance(value, pa.DataType):
            value = pa.field(name, value)
        else:
            value = pa.field(name, to_pa_datatype(value))
        assert_or_throw(is_supported(value.type),
                        SchemaError(f"{value} is not supported"))
        super().__setitem__(name, value, *args, **kwds)  # type: ignore
Example #12
    def _apply_schema(
        self, pdf: pd.DataFrame, schema: Optional[Schema], type_safe: bool = True
    ) -> Tuple[pd.DataFrame, Schema]:
        if not type_safe:
            assert_arg_not_none(pdf, "pdf")
            assert_arg_not_none(schema, "schema")
            return pdf, schema
        DASK_UTILS.ensure_compatible(pdf)
        if pdf.columns.dtype == "object":  # pdf has named schema
            pschema = Schema(DASK_UTILS.to_schema(pdf))
            if schema is None or pschema == schema:
                return pdf, pschema.assert_not_empty()
            pdf = pdf[schema.assert_not_empty().names]
        else:  # pdf has no named schema
            schema = _input_schema(schema).assert_not_empty()
            assert_or_throw(
                pdf.shape[1] == len(schema),
                ValueError(f"Pandas dataframe column count doesn't match {schema}"),
            )
            pdf.columns = schema.names
        return DASK_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema
Example #13
def get_join_schemas(df1: DataFrame, df2: DataFrame, how: str,
                     on: Iterable[str]) -> Tuple[Schema, Schema]:
    """Get :class:`~triad:triad.collections.schema.Schema` object after
    joining ``df1`` and ``df2``. If ``on`` is not empty, it's mainly for
    validation purpose.

    :param df1: first dataframe
    :param df2: second dataframe
    :param how: can accept ``semi``, ``left_semi``, ``anti``, ``left_anti``,
      ``inner``, ``left_outer``, ``right_outer``, ``full_outer``, ``cross``
    :param on: it can always be inferred, but if you provide, it will be
      validated agained the inferred keys.
    :return: the pair key schema and schema after join

    .. note::

        In Fugue, joined schema can always be inferred because it always uses the
        input dataframes' common keys as the join keys. So you must make sure to
        :meth:`~fugue.dataframe.dataframe.DataFrame.rename` to input dataframes so
        they follow this rule.
    """
    assert_arg_not_none(how, "how")
    how = how.lower()
    aot(
        how in [
            "semi",
            "left_semi",
            "anti",
            "left_anti",
            "inner",
            "left_outer",
            "right_outer",
            "full_outer",
            "cross",
        ],
        ValueError(f"{how} is not a valid join type"),
    )
    on = list(on)
    aot(len(on) == len(set(on)), f"{on} has duplication")
    if how != "cross" and len(on) == 0:
        on = list(df1.schema.intersect(df2.schema.names).names)
        aot(
            len(on) > 0,
            lambda: SchemaError(
                f"no common columns between {df1.schema} and {df2.schema}"),
        )
    schema2 = df2.schema
    aot(
        how != "outer",
        ValueError(
            "'how' must use left_outer, right_outer, full_outer for outer joins"
        ),
    )
    if how in ["semi", "left_semi", "anti", "left_anti"]:
        schema2 = schema2.extract(on)
    aot(
        on in df1.schema and on in schema2,
        lambda: SchemaError(
            f"{on} is not the intersection of {df1.schema} & {df2.schema}"),
    )
    cm = df1.schema.intersect(on)
    if how == "cross":
        aot(
            len(df1.schema.intersect(schema2)) == 0,
            SchemaError("can't specify on for cross join"),
        )
    else:
        aot(len(on) > 0, SchemaError("on must be specified"))
    return cm, (df1.schema.union(schema2))
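
A minimal sketch of calling ``get_join_schemas``, assuming fugue's ``ArrayDataFrame``
and that the function lives in ``fugue.dataframe.utils``; treat the import paths as
assumptions.

from fugue.dataframe import ArrayDataFrame
from fugue.dataframe.utils import get_join_schemas  # assumed location

df1 = ArrayDataFrame([[0, "x"]], "a:int,b:str")
df2 = ArrayDataFrame([[0, "y"]], "a:int,c:str")

# join keys are inferred from the common column "a"
key_schema, joined_schema = get_join_schemas(df1, df2, how="inner", on=[])
assert key_schema.names == ["a"]
assert joined_schema.names == ["a", "b", "c"]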