def test_as_arrow(self):
    """Verify that special values survive a round trip through ``as_arrow``."""

    def roundtrip(src):
        # Convert to an arrow table and wrap it back into an ArrowDataFrame.
        return ArrowDataFrame(src.as_arrow())

    # pd.NaT in a datetime column should come back as None
    src = self.df([[pd.NaT, 1]], "a:datetime,b:int")
    assert list(roundtrip(src).as_dict_iterable()) == [dict(a=None, b=1)]

    # a pandas Timestamp should come back as a plain datetime
    src = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
    assert list(roundtrip(src).as_dict_iterable()) == [
        dict(a=datetime(2020, 1, 1), b=1)
    ]

    # float nan inside a list column should come back as None
    nested = [[[float("nan"), 2.0]]]
    assert roundtrip(self.df(nested, "a:[float]")).as_array() == [[[None, 2.0]]]

    # a struct (dict) column should be preserved exactly
    nested = [[dict(b="x")]]
    assert roundtrip(self.df(nested, "a:{b:str}")).as_array() == nested

    # a list-of-struct column with a nested list should be preserved exactly
    nested = [[[dict(b=[30, 40])]]]
    assert roundtrip(self.df(nested, "a:[{b:[int]}]")).as_array() == nested
def test_to_df(self):
    """``engine.to_df`` should convert several dataframe flavors to the
    engine's native dataframe while preserving data and schema."""
    engine = self.engine

    def check_double_col(converted):
        # After conversion the double column holds either 1.0 or None per row.
        rows = converted.native.collect()
        assert rows[0][0] == 1.0 or rows[0][0] is None
        assert rows[1][0] == 1.0 or rows[1][0] is None

    # from ArrayDataFrame: must produce a new object with equal content
    src = ArrayDataFrame(
        [[1, 2], [None, 3]],
        "a:double,b:int",
        dict(a=1),
    )
    converted = engine.to_df(src)
    assert converted is not src
    check_double_col(converted)
    df_eq(converted, src, throw=True)

    # from ArrowDataFrame: same expectations
    src = ArrowDataFrame(
        [[1, 2], [None, 3]],
        "a:double,b:int",
        dict(a=1),
    )
    converted = engine.to_df(src)
    assert converted is not src
    check_double_col(converted)

    # from a raw array with an explicit schema, null in an int column
    converted = engine.to_df([[1, None]], "a:int,b:int", dict(a=1))
    df_eq(converted, [[1, None]], "a:int,b:int", dict(a=1), throw=True)

    # from PandasDataFrame with a struct column
    src = PandasDataFrame(
        [[{
            "a": "b"
        }, 2]],
        "a:{a:str},b:int",
        dict(a=1),
    )
    converted = engine.to_df(src)
    assert converted is not src
    rows = converted.as_array(type_safe=True)
    assert rows[0][0] == {"a": "b"}
def test_init():
    """ArrowDataFrame can be built from schema-only, arrays, native arrow
    tables, pandas DataFrames/Series, and rejects invalid input."""
    # schema only -> empty, bounded dataframe
    empty_df = ArrowDataFrame(schema="a:str,b:int")
    assert empty_df.empty
    assert empty_df.schema == "a:str,b:int"
    assert empty_df.is_bounded

    # string data with a string schema is kept as-is
    rows = [["a", "1"], ["b", "2"]]
    str_df = ArrowDataFrame(rows, "a:str,b:str")
    assert str_df.as_array(type_safe=True) == [["a", "1"], ["b", "2"]]

    # int data compares equal to floats; double schema coerces to float
    rows = [["a", 1], ["b", 2]]
    int_df = ArrowDataFrame(rows, "a:str,b:int")
    assert int_df.as_array(type_safe=True) == [["a", 1.0], ["b", 2.0]]
    dbl_df = ArrowDataFrame(rows, "a:str,b:double")
    assert dbl_df.as_array(type_safe=True) == [["a", 1.0], ["b", 2.0]]

    # construct from a native arrow table
    native_df = ArrowDataFrame(dbl_df.native)
    assert native_df.as_array(type_safe=True) == [["a", 1.0], ["b", 2.0]]

    # construct from a pandas DataFrame with an explicit schema
    pandas_df = ArrowDataFrame(dbl_df.as_pandas(), "a:str,b:double")
    assert pandas_df.as_array(type_safe=True) == [["a", 1.0], ["b", 2.0]]
    # construct from a single pandas Series
    series_df = ArrowDataFrame(pandas_df.as_pandas()["b"])
    assert series_df.as_array(type_safe=True) == [[1.0], [2.0]]

    # explicit empty data with schema
    empty2 = ArrowDataFrame([], "x:str,y:double")
    assert empty2.empty
    assert empty2.is_local
    assert empty2.is_bounded

    # an unsupported source type raises
    raises(FugueDataFrameInitError, lambda: ArrowDataFrame(123))
def to_df(self, df: Any, schema: Any = None, metadata: Any = None) -> SparkDataFrame:
    """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

    :param data: :class:`~fugue.dataframe.dataframe.DataFrame`,
      :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
      pandas DataFrame or list or iterable of arrays
    :param schema: |SchemaLikeObject| or :class:`spark:pyspark.sql.types.StructType`
      defaults to None.
    :param metadata: |ParamsLikeObject|, defaults to None
    :return: engine compatible dataframe

    :Notice:

    * if the input is already :class:`~fugue_spark.dataframe.SparkDataFrame`,
      it should return itself
    * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
      ``schema`` must be specified
    * When ``schema`` is not None, a potential type cast may happen to ensure
      the dataframe's schema.
    * all other methods in the engine can take arbitrary dataframes and
      call this method to convert before doing anything
    """
    # Branch order matters: the more specific fugue DataFrame subclasses are
    # handled before the generic fallback at the end.
    if isinstance(df, DataFrame):
        # A fugue DataFrame already carries its own schema/metadata, so the
        # caller must not pass them again.
        assert_or_throw(
            schema is None and metadata is None,
            ValueError(
                "schema and metadata must be None when df is a DataFrame"),
        )
        if isinstance(df, SparkDataFrame):
            # Already engine-native: return as-is (documented contract above).
            return df
        if isinstance(df, ArrowDataFrame):
            # Materialize the arrow table as arrays; the spark schema is
            # derived from the fugue schema.
            sdf = self.spark_session.createDataFrame(
                df.as_array(), to_spark_schema(df.schema))
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if isinstance(df, (ArrayDataFrame, IterableDataFrame)):
            # Go through ArrowDataFrame first — presumably to normalize
            # values (e.g. nulls) before handing rows to Spark.
            # NOTE(review): type_safe=False here relies on the arrow
            # conversion doing the coercion — confirm.
            adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema)
            sdf = self.spark_session.createDataFrame(
                adf.as_array(), to_spark_schema(df.schema))
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if any(pa.types.is_struct(t) for t in df.schema.types):
            # Struct columns: build from type-safe arrays since pandas
            # conversion would not preserve the nested structure.
            sdf = self.spark_session.createDataFrame(
                df.as_array(type_safe=True), to_spark_schema(df.schema))
        else:
            # Flat schemas: the pandas path is used for other DataFrame types.
            sdf = self.spark_session.createDataFrame(
                df.as_pandas(), to_spark_schema(df.schema))
        return SparkDataFrame(sdf, df.schema, df.metadata)
    if isinstance(df, ps.DataFrame):
        # Raw pyspark DataFrame: wrap without conversion; schema is optional.
        return SparkDataFrame(
            df, None if schema is None else to_schema(schema), metadata)
    if isinstance(df, RDD):
        # RDDs carry no schema, so one must be provided by the caller.
        assert_arg_not_none(schema, "schema")
        sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
        return SparkDataFrame(sdf, to_schema(schema), metadata)
    if isinstance(df, pd.DataFrame):
        if PD_UTILS.empty(df):
            # createDataFrame can't infer a schema from zero rows, so derive
            # the spark schema from the pandas dtypes instead.
            temp_schema = to_spark_schema(PD_UTILS.to_schema(df))
            sdf = self.spark_session.createDataFrame([], temp_schema)
        else:
            sdf = self.spark_session.createDataFrame(df)
        return SparkDataFrame(sdf, schema, metadata)
    # Fallback: list/iterable of arrays.
    # use arrow dataframe here to handle nulls in int cols
    assert_or_throw(schema is not None,
                    FugueDataFrameInitError("schema can't be None"))
    adf = ArrowDataFrame(df, to_schema(schema))
    sdf = self.spark_session.createDataFrame(adf.as_array(),
                                             to_spark_schema(adf.schema))
    return SparkDataFrame(sdf, adf.schema, metadata)
def df(self, data: Any = None, schema: Any = None, metadata: Any = None) -> ArrowDataFrame:
    """Build an :class:`ArrowDataFrame` from the given data, schema and metadata."""
    result = ArrowDataFrame(data, schema, metadata)
    return result