def append(self, obj: Any) -> "Schema":  # noqa: C901
    """Append a schema-like object to the current schema.
    Only new columns are allowed.

    :raises SchemaError: if a column already exists, is invalid,
      or ``obj`` is not convertible
    :return: the Schema object itself
    """
    try:
        if obj is None:
            return self
        elif isinstance(obj, pa.Field):
            self[obj.name] = obj.type
        elif isinstance(obj, str):
            self._append_pa_schema(expression_to_schema(obj))
        elif isinstance(obj, Dict):
            for k, v in obj.items():
                self[k] = v
        elif isinstance(obj, pa.Schema):
            self._append_pa_schema(obj)
        elif isinstance(obj, pd.DataFrame):
            self._append_pa_schema(PD_UTILS.to_schema(obj))
        elif isinstance(obj, Tuple):  # type: ignore
            self[obj[0]] = obj[1]
        elif isinstance(obj, List):
            for x in obj:
                self.append(x)
        else:
            raise SchemaError(f"Invalid schema to add {obj}")
        return self
    except SchemaError:
        raise
    except Exception as e:
        raise SchemaError(str(e))
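
# A minimal usage sketch of the dispatch above. It assumes fugue's Schema
# class and its schema expression syntax ("name:type,..."); the exact
# construction is an illustration, not confirmed against a specific version.
s = Schema("a:int,b:str")             # built from a schema expression string
s.append(pa.field("c", pa.bool_()))   # pa.Field branch: adds column "c"
s.append({"d": pa.float64()})         # Dict branch: adds column "d"
s.append(("e", pa.int64()))           # Tuple branch: adds column "e"
s.append(None)                        # None branch: no-op, returns self
# s.append("a:long") would raise SchemaError: column "a" already exists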
def to_df(
    self, df: Any, schema: Any = None, metadata: Any = None
) -> SparkDataFrame:
    """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
      :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
      pandas DataFrame, or list or iterable of arrays
    :param schema: |SchemaLikeObject| or
      :class:`spark:pyspark.sql.types.StructType`, defaults to None
    :param metadata: |ParamsLikeObject|, defaults to None
    :return: engine compatible dataframe

    :Notice:

    * if the input is already a :class:`~fugue_spark.dataframe.SparkDataFrame`,
      it returns itself
    * for :class:`~spark:pyspark.RDD`, list or iterable of arrays,
      ``schema`` must be specified
    * when ``schema`` is not None, a type cast may happen to
      ensure the dataframe's schema
    * all other methods in the engine can take arbitrary dataframes and
      call this method to convert before doing anything
    """
    if isinstance(df, DataFrame):
        assert_or_throw(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        if isinstance(df, SparkDataFrame):
            return df
        if isinstance(df, ArrowDataFrame):
            sdf = self.spark_session.createDataFrame(
                df.as_array(), to_spark_schema(df.schema)
            )
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if isinstance(df, (ArrayDataFrame, IterableDataFrame)):
            adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema)
            sdf = self.spark_session.createDataFrame(
                adf.as_array(), to_spark_schema(df.schema)
            )
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if any(pa.types.is_struct(t) for t in df.schema.types):
            sdf = self.spark_session.createDataFrame(
                df.as_array(type_safe=True), to_spark_schema(df.schema)
            )
        else:
            sdf = self.spark_session.createDataFrame(
                df.as_pandas(), to_spark_schema(df.schema)
            )
        return SparkDataFrame(sdf, df.schema, df.metadata)
    if isinstance(df, ps.DataFrame):
        return SparkDataFrame(
            df, None if schema is None else to_schema(schema), metadata
        )
    if isinstance(df, RDD):
        assert_arg_not_none(schema, "schema")
        sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
        return SparkDataFrame(sdf, to_schema(schema), metadata)
    if isinstance(df, pd.DataFrame):
        if PD_UTILS.empty(df):
            temp_schema = to_spark_schema(PD_UTILS.to_schema(df))
            sdf = self.spark_session.createDataFrame([], temp_schema)
        else:
            sdf = self.spark_session.createDataFrame(df)
        return SparkDataFrame(sdf, schema, metadata)

    # use arrow dataframe here to handle nulls in int cols
    assert_or_throw(
        schema is not None, FugueDataFrameInitError("schema can't be None")
    )
    adf = ArrowDataFrame(df, to_schema(schema))
    sdf = self.spark_session.createDataFrame(
        adf.as_array(), to_spark_schema(adf.schema)
    )
    return SparkDataFrame(sdf, adf.schema, metadata)
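
# Usage sketch for the conversion branches above, assuming this method lives
# on fugue_spark's SparkExecutionEngine and that the engine accepts a live
# SparkSession; the construction here is an assumption for illustration.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
engine = SparkExecutionEngine(spark)

# pandas branch: the schema is inferred from the DataFrame, so schema stays None
sdf1 = engine.to_df(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}))

# fallback branch: a raw list of arrays needs an explicit schema; nulls in
# int columns survive because the data goes through ArrowDataFrame first
sdf2 = engine.to_df([[1, "x"], [None, "y"]], schema="a:int,b:str")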
def test_to_schema():
    df = pd.DataFrame([[1.0, 2], [2.0, 3]])
    raises(ValueError, lambda: PD_UTILS.to_schema(df))
    df = pd.DataFrame([[1.0, 2], [2.0, 3]], columns=["x", "y"])
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df = pd.DataFrame([["a", 2], ["b", 3]], columns=["x", "y"])
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df = pd.DataFrame([], columns=["x", "y"])
    df = df.astype(dtype={"x": np.int32, "y": np.dtype("object")})
    assert [pa.field("x", pa.int32()), pa.field("y", pa.string())] == list(
        PD_UTILS.to_schema(df)
    )
    df = pd.DataFrame([[1, "x"], [2, "y"]], columns=["x", "y"])
    df = df.astype(dtype={"x": np.int32, "y": np.dtype("object")})
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df = pd.DataFrame([[1, "x"], [2, "y"]], columns=["x", "y"])
    df = df.astype(dtype={"x": np.int32, "y": np.dtype(str)})
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df = pd.DataFrame([[1, "x"], [2, "y"]], columns=["x", "y"])
    df = df.astype(dtype={"x": np.int32, "y": np.dtype("str")})
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))

    # test index
    df = pd.DataFrame([[3.0, 2], [2.0, 3]], columns=["x", "y"])
    df = df.sort_values(["x"])
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df.index.name = "x"
    raises(ValueError, lambda: PD_UTILS.to_schema(df))
    df = df.reset_index(drop=True)
    assert list(pa.Schema.from_pandas(df)) == list(PD_UTILS.to_schema(df))
    df["p"] = "p"
    df = df.set_index(["p"])
    df.index.name = None
    raises(ValueError, lambda: PD_UTILS.to_schema(df))
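
# The index assertions above imply PD_UTILS.to_schema only accepts a default
# (unnamed, trivial) index; flattening a meaningful index first avoids the
# ValueError. A minimal sketch using the same PD_UTILS as the test:
df = pd.DataFrame([[1, "x"], [2, "y"]], columns=["x", "y"]).set_index("x")
# PD_UTILS.to_schema(df) would raise ValueError here: the index is named
schema = PD_UTILS.to_schema(df.reset_index())  # "x" is restored as a column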