def __init__(  # noqa: C901
    self,
    df: Any = None,
    schema: Any = None,
    metadata: Any = None,
    pandas_df_wrapper: bool = False,
):
    try:
        apply_schema = True
        if df is None:
            # no data: an explicit, non-empty schema is required
            schema = _input_schema(schema).assert_not_empty()
            df = []
        if isinstance(df, PandasDataFrame):
            # TODO: handling this case here is both redundant and wrong
            pdf = df.native
            schema = None
        elif isinstance(df, (pd.DataFrame, pd.Series)):
            if isinstance(df, pd.Series):
                df = df.to_frame()
            pdf = df
            schema = None if schema is None else _input_schema(schema)
            if pandas_df_wrapper and schema is not None:
                # trust the caller: wrap the frame without re-applying the schema
                apply_schema = False
        elif isinstance(df, Iterable):
            # an iterable of arrays always requires an explicit, non-empty schema
            schema = _input_schema(schema).assert_not_empty()
            pdf = pd.DataFrame(df, columns=schema.names)
            pdf = PD_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True)
            if PD_UTILS.empty(pdf):
                # an empty frame can't infer dtypes, so cast columns explicitly
                for k, v in schema.items():
                    pdf[k] = pdf[k].astype(v.type.to_pandas_dtype())
            apply_schema = False
        else:
            raise ValueError(f"{df} is incompatible with PandasDataFrame")
        if apply_schema:
            pdf, schema = self._apply_schema(pdf, schema)
        super().__init__(schema, metadata)
        self._native = pdf
    except Exception as e:
        raise FugueDataFrameInitError from e
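
# A minimal usage sketch for the constructor above, assuming fugue is
# installed; the import path `fugue.dataframe` is an assumption and is not
# part of this excerpt.
import pandas as pd
from fugue.dataframe import PandasDataFrame

# An iterable of arrays requires an explicit, non-empty schema.
df1 = PandasDataFrame([[0, "a"], [1, "b"]], "x:int,y:str")

# A pandas DataFrame needs no schema; one is inferred and applied.
df2 = PandasDataFrame(pd.DataFrame({"x": [0, 1], "y": ["a", "b"]}))

# A pandas Series is converted to a one-column frame internally.
df3 = PandasDataFrame(pd.Series([0, 1], name="x"))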
def to_df(
    self, df: Any, schema: Any = None, metadata: Any = None
) -> SparkDataFrame:
    """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
      :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
      pandas DataFrame, or list or iterable of arrays
    :param schema: |SchemaLikeObject| or
      :class:`spark:pyspark.sql.types.StructType`, defaults to None
    :param metadata: |ParamsLikeObject|, defaults to None
    :return: engine compatible dataframe

    :Notice:

    * If the input is already a
      :class:`~fugue_spark.dataframe.SparkDataFrame`, it is returned as is
    * For :class:`spark:pyspark.RDD` and list or iterable of arrays,
      ``schema`` must be specified
    * When ``schema`` is not None, a type cast may happen to enforce
      the dataframe's schema
    * All other methods in the engine accept arbitrary dataframes and call
      this method to convert them before doing anything else
    """
    if isinstance(df, DataFrame):
        # Fugue DataFrames already carry their own schema and metadata
        assert_or_throw(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        if isinstance(df, SparkDataFrame):
            return df
        if isinstance(df, ArrowDataFrame):
            sdf = self.spark_session.createDataFrame(
                df.as_array(), to_spark_schema(df.schema)
            )
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if isinstance(df, (ArrayDataFrame, IterableDataFrame)):
            adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema)
            sdf = self.spark_session.createDataFrame(
                adf.as_array(), to_spark_schema(df.schema)
            )
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if any(pa.types.is_struct(t) for t in df.schema.types):
            # pandas can't represent struct columns faithfully, so go
            # through the array representation instead
            sdf = self.spark_session.createDataFrame(
                df.as_array(type_safe=True), to_spark_schema(df.schema)
            )
        else:
            sdf = self.spark_session.createDataFrame(
                df.as_pandas(), to_spark_schema(df.schema)
            )
        return SparkDataFrame(sdf, df.schema, df.metadata)
    if isinstance(df, ps.DataFrame):
        return SparkDataFrame(
            df, None if schema is None else to_schema(schema), metadata
        )
    if isinstance(df, RDD):
        assert_arg_not_none(schema, "schema")
        sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
        return SparkDataFrame(sdf, to_schema(schema), metadata)
    if isinstance(df, pd.DataFrame):
        if PD_UTILS.empty(df):
            # createDataFrame can't infer a schema from an empty frame
            temp_schema = to_spark_schema(PD_UTILS.to_schema(df))
            sdf = self.spark_session.createDataFrame([], temp_schema)
        else:
            sdf = self.spark_session.createDataFrame(df)
        return SparkDataFrame(sdf, schema, metadata)

    # use an Arrow dataframe here to handle nulls in int columns
    assert_or_throw(
        schema is not None, FugueDataFrameInitError("schema can't be None")
    )
    adf = ArrowDataFrame(df, to_schema(schema))
    sdf = self.spark_session.createDataFrame(
        adf.as_array(), to_spark_schema(adf.schema)
    )
    return SparkDataFrame(sdf, adf.schema, metadata)
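
# A minimal usage sketch for `to_df`, assuming pyspark and fugue are
# installed; the import path `fugue_spark` for `SparkExecutionEngine` is an
# assumption and is not part of this excerpt.
import pandas as pd
from pyspark.sql import SparkSession
from fugue_spark import SparkExecutionEngine

spark = SparkSession.builder.master("local[*]").getOrCreate()
engine = SparkExecutionEngine(spark)

# A list of arrays needs an explicit schema...
sdf1 = engine.to_df([[0, "a"], [1, "b"]], "x:int,y:str")

# ...while a pandas DataFrame can be converted without one.
sdf2 = engine.to_df(pd.DataFrame({"x": [0, 1], "y": ["a", "b"]}))

# Passing an existing SparkDataFrame returns it unchanged.
assert engine.to_df(sdf1) is sdf1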