def to_spark_schema(obj: Any) -> pt.StructType:
    assert_arg_not_none(obj, "schema")
    if isinstance(obj, pt.StructType):
        return obj
    if isinstance(obj, ps.DataFrame):
        return obj.schema
    return _from_arrow_schema(Schema(obj).pa_schema)
def to_schema(obj: Any) -> Schema:
    assert_arg_not_none(obj, "obj")
    if isinstance(obj, pt.StructType):
        return Schema(_to_arrow_schema(obj))
    if isinstance(obj, ps.DataFrame):
        return to_schema(obj.schema)
    return Schema(obj)
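# Hedged usage sketch (not part of the original module): round-tripping a schema
# expression through the two converters above. It assumes the definitions above
# are in scope, `pt` is pyspark.sql.types, and `Schema` is
# triad.collections.schema.Schema, as used in those functions.
spark_schema = to_spark_schema("a:int,b:str")
assert isinstance(spark_schema, pt.StructType)
assert to_schema(spark_schema) == Schema("a:int,b:str")  # converted back to a triad Schema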
def to_taskspec(
    obj: Any, parent_workflow_spec: Optional[WorkflowSpec] = None
) -> TaskSpec:
    assert_arg_not_none(obj, "obj")
    if isinstance(obj, str):
        return to_taskspec(json.loads(obj))
    if isinstance(obj, TaskSpec):
        return obj
    if isinstance(obj, Dict):
        d: Dict[str, Any] = dict(obj)
        node_spec: Optional[_NodeSpec] = None
        if "node_spec" in d:
            aot(
                parent_workflow_spec is not None,
                lambda: InvalidOperationError("parent workflow must be set"),
            )
            node_spec = _NodeSpec(
                workflow=parent_workflow_spec, **d["node_spec"]  # type: ignore
            )
            del d["node_spec"]
        if "tasks" in d:
            ts: TaskSpec = WorkflowSpec(**d)
        else:
            ts = TaskSpec(**d)
        if node_spec is not None:
            ts._node_spec = node_spec
        return ts
    raise TypeError(f"can't convert {obj} to TaskSpec")  # pragma: no cover
def from_func(
    func: Callable, schema: Any, validation_rules: Dict[str, Any]
) -> "_FuncAsCoTransformer":
    assert_or_throw(
        len(validation_rules) == 0,
        NotImplementedError("CoTransformer does not support validation rules"),
    )
    if schema is None:
        schema = parse_output_schema_from_comment(func)
    if isinstance(schema, Schema):  # to be less strict on determinism
        schema = str(schema)
    if isinstance(schema, str):
        assert_or_throw(
            "*" not in schema,
            FugueInterfacelessError(
                "* can't be used on cotransformer output schema"
            ),
        )
    assert_arg_not_none(schema, "schema")
    tr = _FuncAsCoTransformer()
    tr._wrapper = FunctionWrapper(  # type: ignore
        func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$"
    )
    tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
    tr._output_schema_arg = schema  # type: ignore
    tr._validation_rules = {}  # type: ignore
    tr._uses_callback = "f" in tr._wrapper.input_code.lower()  # type: ignore
    tr._requires_callback = "F" in tr._wrapper.input_code  # type: ignore
    return tr
def to_local_df(df: Any, schema: Any = None, metadata: Any = None) -> LocalDataFrame:
    """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFrame,
      or a list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
      :class:`~fugue.dataframe.dataframe.DataFrame` type
    :param metadata: dict-like object with string keys, defaults to None
    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
      but you set ``schema`` or ``metadata``
    :raises TypeError: if ``df`` is not compatible
    :return: the dataframe itself if it's
      :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one

    :Examples:

    >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
    >>> assert to_local_df(a) is a
    >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
    """
    assert_arg_not_none(df, "df")
    if isinstance(df, DataFrame):
        aot(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        return df.as_local()
    if isinstance(df, pd.DataFrame):
        return PandasDataFrame(df, schema, metadata)
    if isinstance(df, List):
        return ArrayDataFrame(df, schema, metadata)
    if isinstance(df, Iterable):
        return IterableDataFrame(df, schema, metadata)
    raise TypeError(f"{df} cannot convert to a LocalDataFrame")
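# Hedged sketch of to_local_df behavior based on the docstring and checks above:
# list input plus a schema becomes an ArrayDataFrame, an existing LocalDataFrame
# is returned as-is, and passing a schema together with a DataFrame raises.
df = to_local_df([[0, "a"], [1, "b"]], "a:int,b:str")
assert to_local_df(df) is df
try:
    to_local_df(df, schema="a:int,b:str")  # schema must be None for DataFrame input
except ValueError:
    pass  # expected, per the aot(...) check above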
def to_df(
    self, df: Any, schema: Any = None, metadata: Any = None
) -> SparkDataFrame:
    """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
      :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
      pandas DataFrame, or a list or iterable of arrays
    :param schema: |SchemaLikeObject| or
      :class:`spark:pyspark.sql.types.StructType`, defaults to None
    :param metadata: |ParamsLikeObject|, defaults to None
    :return: engine compatible dataframe

    :Notice:

    * if the input is already :class:`~fugue_spark.dataframe.SparkDataFrame`,
      it should return itself
    * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
      ``schema`` must be specified
    * When ``schema`` is not None, a potential type cast may happen to ensure
      the dataframe's schema.
    * all other methods in the engine can take arbitrary dataframes and
      call this method to convert before doing anything
    """
    if isinstance(df, DataFrame):
        assert_or_throw(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        if isinstance(df, SparkDataFrame):
            return df
        if any(pa.types.is_struct(t) for t in df.schema.types):
            sdf = self.spark_session.createDataFrame(
                df.as_array(type_safe=True), to_spark_schema(df.schema)
            )
        else:
            sdf = self.spark_session.createDataFrame(
                df.as_pandas(), to_spark_schema(df.schema)
            )
        return SparkDataFrame(sdf, df.schema, df.metadata)
    if isinstance(df, ps.DataFrame):
        return SparkDataFrame(
            df, None if schema is None else to_schema(schema), metadata
        )
    if isinstance(df, RDD):
        assert_arg_not_none(schema, "schema")
        sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
        return SparkDataFrame(sdf, to_schema(schema), metadata)
    if isinstance(df, pd.DataFrame):
        sdf = self.spark_session.createDataFrame(df)
        return SparkDataFrame(sdf, schema, metadata)
    # use arrow dataframe here to handle nulls in int cols
    adf = ArrowDataFrame(df, to_schema(schema))
    sdf = self.spark_session.createDataFrame(
        adf.as_array(), to_spark_schema(adf.schema)
    )
    return SparkDataFrame(sdf, adf.schema, metadata)
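# Hedged sketch (assumes a Spark-backed execution engine instance named `engine`
# with an active SparkSession): converting a list of arrays into a SparkDataFrame
# goes through the ArrowDataFrame branch at the bottom of to_df above. The
# comparison of a Schema with a string expression is assumed to behave as in triad.
sdf = engine.to_df([[0, "a"], [1, "b"]], "a:int,b:str")
assert isinstance(sdf, SparkDataFrame)
assert sdf.schema == "a:int,b:str"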
def from_func(func: Callable, schema: Any) -> "_FuncAsTransformer":
    if schema is None:
        schema = parse_output_schema_from_comment(func)
    if isinstance(schema, Schema):  # to be less strict on determinism
        schema = str(schema)
    assert_arg_not_none(schema, "schema")
    tr = _FuncAsTransformer()
    tr._wrapper = FunctionWrapper(func, "^[lsp]x*$", "^[lsp]$")  # type: ignore
    tr._output_schema_arg = schema  # type: ignore
    return tr
def test_assert_arg_not_none():
    assert_arg_not_none(1)
    with raises(NoneArgumentError) as err:
        assert_arg_not_none(None, "a")
    assert "a can't be None" == err.value.args[0]
    with raises(NoneArgumentError) as err:
        assert_arg_not_none(None, "a", "b")
    assert "a can't be None" == err.value.args[0]
    with raises(NoneArgumentError) as err:
        assert_arg_not_none(None, "", msg="b")
    assert "b" == err.value.args[0]
    with raises(NoneArgumentError) as err:
        assert_arg_not_none(None, None, msg="b")
    assert "b" == err.value.args[0]
def get(self, key: Union[int, str], default: Any) -> Any:  # type: ignore
    """Get value by `key`, and the value must be a subtype of the type of
    `default` (which can't be None). If the `key` is not found, return `default`.

    :param key: the key to search
    :raises NoneArgumentError: if default is None
    :raises TypeError: if the value can't be converted to the type of `default`

    :return: the value by `key`, and the value must be a subtype of the type of
      `default`. If `key` is not found, return `default`
    """
    assert_arg_not_none(default, "default")
    if (isinstance(key, str) and key in self) or isinstance(key, int):
        return as_type(self[key], type(default))
    return default
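# Hedged sketch of the get semantics above (assumes this method belongs to
# triad's ParamDict): found values are converted to the type of `default`,
# missing keys fall back to `default`, and a None default is rejected.
p = ParamDict({"a": "1"})
assert p.get("a", 0) == 1        # "1" is cast to int, the type of the default
assert p.get("b", "x") == "x"    # missing key returns the default
try:
    p.get("a", None)             # default can't be None
except NoneArgumentError:
    pass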
def from_func(
    func: Callable, schema: Any, validation_rules: Dict[str, Any]
) -> "_FuncAsTransformer":
    if schema is None:
        schema = parse_output_schema_from_comment(func)
    if isinstance(schema, Schema):  # to be less strict on determinism
        schema = str(schema)
    validation_rules.update(parse_validation_rules_from_comment(func))
    assert_arg_not_none(schema, "schema")
    tr = _FuncAsTransformer()
    tr._wrapper = FunctionWrapper(  # type: ignore
        func, "^[lspq][fF]?x*z?$", "^[lspq]$"
    )
    tr._output_schema_arg = schema  # type: ignore
    tr._validation_rules = validation_rules  # type: ignore
    tr._uses_callback = "f" in tr._wrapper.input_code.lower()  # type: ignore
    tr._requires_callback = "F" in tr._wrapper.input_code  # type: ignore
    return tr
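# Hedged sketch (assumes from_func is exposed as a staticmethod of
# _FuncAsTransformer, and that a single pandas DataFrame parameter matches the
# "p" code in the FunctionWrapper expression above). Only the attribute set
# directly in from_func is checked here.
def add_one(df: pd.DataFrame) -> pd.DataFrame:
    df["a"] = df["a"] + 1
    return df

tr = _FuncAsTransformer.from_func(add_one, "a:int", validation_rules={})
assert tr._output_schema_arg == "a:int"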
def __setitem__(  # type: ignore
    self, name: str, value: Any, *args: List[Any], **kwds: Dict[str, Any]
) -> None:
    assert_arg_not_none(value, "value")
    if not validate_column_name(name):
        raise SchemaError(f"Invalid column name {name}")
    if name in self:  # update existing value is not allowed
        raise SchemaError(f"{name} already exists in {self}")
    if isinstance(value, pa.Field):
        assert_or_throw(
            name == value.name, SchemaError(f"{name} doesn't match {value}")
        )
    elif isinstance(value, pa.DataType):
        value = pa.field(name, value)
    else:
        value = pa.field(name, to_pa_datatype(value))
    assert_or_throw(
        is_supported(value.type), SchemaError(f"{value} is not supported")
    )
    super().__setitem__(name, value, *args, **kwds)  # type: ignore
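# Hedged sketch of the __setitem__ rules above (assumes this method belongs to
# triad's Schema): a pa.DataType or a type expression is wrapped into a field,
# and re-assigning an existing column name raises SchemaError.
s = Schema()
s["a"] = pa.int32()       # pa.DataType is wrapped into pa.field("a", pa.int32())
s["b"] = "str"            # other expressions go through to_pa_datatype
try:
    s["a"] = pa.int64()   # updating an existing column is not allowed
except SchemaError:
    pass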
def _apply_schema(
    self, pdf: pd.DataFrame, schema: Optional[Schema], type_safe: bool = True
) -> Tuple[pd.DataFrame, Schema]:
    if not type_safe:
        assert_arg_not_none(pdf, "pdf")
        assert_arg_not_none(schema, "schema")
        return pdf, schema
    DASK_UTILS.ensure_compatible(pdf)
    if pdf.columns.dtype == "object":
        # pdf has named schema
        pschema = Schema(DASK_UTILS.to_schema(pdf))
        if schema is None or pschema == schema:
            return pdf, pschema.assert_not_empty()
        pdf = pdf[schema.assert_not_empty().names]
    else:
        # pdf has no named schema
        schema = _input_schema(schema).assert_not_empty()
        assert_or_throw(
            pdf.shape[1] == len(schema),
            ValueError(f"Pandas dataframe column count doesn't match {schema}"),
        )
        pdf.columns = schema.names
    return DASK_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema
def get_join_schemas(
    df1: DataFrame, df2: DataFrame, how: str, on: Iterable[str]
) -> Tuple[Schema, Schema]:
    """Get :class:`~triad:triad.collections.schema.Schema` object after joining
    ``df1`` and ``df2``. If ``on`` is not empty, it's mainly for validation purpose.

    :param df1: first dataframe
    :param df2: second dataframe
    :param how: can accept ``semi``, ``left_semi``, ``anti``, ``left_anti``,
      ``inner``, ``left_outer``, ``right_outer``, ``full_outer``, ``cross``
    :param on: it can always be inferred, but if you provide it, it will be
      validated against the inferred keys.
    :return: the pair of key schema and the schema after the join

    .. note::

        In Fugue, the joined schema can always be inferred because Fugue always
        uses the input dataframes' common columns as the join keys. So you must
        :meth:`~fugue.dataframe.dataframe.DataFrame.rename` the columns of the
        input dataframes so they follow this rule.
    """
    assert_arg_not_none(how, "how")
    how = how.lower()
    aot(
        how
        in [
            "semi",
            "left_semi",
            "anti",
            "left_anti",
            "inner",
            "left_outer",
            "right_outer",
            "full_outer",
            "cross",
        ],
        ValueError(f"{how} is not a valid join type"),
    )
    on = list(on)
    aot(len(on) == len(set(on)), f"{on} has duplication")
    if how != "cross" and len(on) == 0:
        on = list(df1.schema.intersect(df2.schema.names).names)
        aot(
            len(on) > 0,
            lambda: SchemaError(
                f"no common columns between {df1.schema} and {df2.schema}"
            ),
        )
    schema2 = df2.schema
    aot(
        how != "outer",
        ValueError(
            "'how' must use left_outer, right_outer, full_outer for outer joins"
        ),
    )
    if how in ["semi", "left_semi", "anti", "left_anti"]:
        schema2 = schema2.extract(on)
    aot(
        on in df1.schema and on in schema2,
        lambda: SchemaError(
            f"{on} is not the intersection of {df1.schema} & {df2.schema}"
        ),
    )
    cm = df1.schema.intersect(on)
    if how == "cross":
        aot(
            len(df1.schema.intersect(schema2)) == 0,
            SchemaError("can't specify on for cross join"),
        )
    else:
        aot(len(on) > 0, SchemaError("on must be specified"))
    return cm, (df1.schema.union(schema2))
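# Hedged sketch of get_join_schemas: with two ArrayDataFrames sharing only
# column "a", the join keys are inferred from the common columns and the joined
# schema is the union of both schemas. Comparing a Schema with a string
# expression is assumed to behave as in triad.
df1 = ArrayDataFrame([[0, 1]], "a:int,b:int")
df2 = ArrayDataFrame([[0, 2]], "a:int,c:int")
key_schema, join_schema = get_join_schemas(df1, df2, how="inner", on=[])
assert key_schema == "a:int"
assert join_schema == "a:int,b:int,c:int"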