Example #1
    def test_as_arrow(self):
        # pd.NaT
        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
        assert [dict(a=None, b=1)] == list(
            ArrowDataFrame(df.as_arrow()).as_dict_iterable())
        # pandas timestamps
        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
        assert [dict(a=datetime(2020, 1, 1), b=1)] == list(
            ArrowDataFrame(df.as_arrow()).as_dict_iterable())
        # float nan in a list
        data = [[[float("nan"), 2.0]]]
        df = self.df(data, "a:[float]")
        assert [[[None, 2.0]]] == ArrowDataFrame(df.as_arrow()).as_array()
        # dict
        data = [[dict(b="x")]]
        df = self.df(data, "a:{b:str}")
        assert data == ArrowDataFrame(df.as_arrow()).as_array()
        # list[dict]
        data = [[[dict(b=[30, 40])]]]
        df = self.df(data, "a:[{b:[int]}]")
        assert data == ArrowDataFrame(df.as_arrow()).as_array()
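
The null normalization asserted above can be reproduced outside the test harness. A minimal sketch, assuming the ArrowDataFrame import path below and a pandas/pyarrow installation:

import pandas as pd
from fugue.dataframe import ArrowDataFrame

# pd.NaT is stored as an Arrow null, so it is read back as None
df = ArrowDataFrame([[pd.NaT, 1]], "a:datetime,b:int")
print(list(df.as_dict_iterable()))  # [{'a': None, 'b': 1}]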
Example #2
    def test_to_df(self):
        e = self.engine
        o = ArrayDataFrame(
            [[1, 2], [None, 3]],
            "a:double,b:int",
            dict(a=1),
        )
        a = e.to_df(o)
        assert a is not o
        res = a.native.collect()
        assert res[0][0] == 1.0 or res[0][0] is None
        assert res[1][0] == 1.0 or res[1][0] is None
        df_eq(a, o, throw=True)

        o = ArrowDataFrame(
            [[1, 2], [None, 3]],
            "a:double,b:int",
            dict(a=1),
        )
        a = e.to_df(o)
        assert a is not o
        res = a.native.collect()
        assert res[0][0] == 1.0 or res[0][0] is None
        assert res[1][0] == 1.0 or res[1][0] is None

        a = e.to_df([[1, None]], "a:int,b:int", dict(a=1))
        df_eq(a, [[1, None]], "a:int,b:int", dict(a=1), throw=True)

        o = PandasDataFrame(
            [[{"a": "b"}, 2]],
            "a:{a:str},b:int",
            dict(a=1),
        )
        a = e.to_df(o)
        assert a is not o
        res = a.as_array(type_safe=True)
        assert res[0][0] == {"a": "b"}
Example #3
def test_init():
    df = ArrowDataFrame(schema="a:str,b:int")
    assert df.empty
    assert df.schema == "a:str,b:int"
    assert df.is_bounded

    data = [["a", "1"], ["b", "2"]]
    df = ArrowDataFrame(data, "a:str,b:str")
    assert [["a", "1"], ["b", "2"]] == df.as_array(type_safe=True)
    data = [["a", 1], ["b", 2]]
    df = ArrowDataFrame(data, "a:str,b:int")
    assert [["a", 1.0], ["b", 2.0]] == df.as_array(type_safe=True)
    df = ArrowDataFrame(data, "a:str,b:double")
    assert [["a", 1.0], ["b", 2.0]] == df.as_array(type_safe=True)

    ddf = ArrowDataFrame(df.native)
    assert [["a", 1.0], ["b", 2.0]] == ddf.as_array(type_safe=True)

    df = ArrowDataFrame(df.as_pandas(), "a:str,b:double")
    assert [["a", 1.0], ["b", 2.0]] == df.as_array(type_safe=True)
    df = ArrowDataFrame(df.as_pandas()["b"])
    assert [[1.0], [2.0]] == df.as_array(type_safe=True)

    df = ArrowDataFrame([], "x:str,y:double")
    assert df.empty
    assert df.is_local
    assert df.is_bounded

    raises(FugueDataFrameInitError, lambda: ArrowDataFrame(123))
Example #4
    def to_df(self,
              df: Any,
              schema: Any = None,
              metadata: Any = None) -> SparkDataFrame:
        """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

        :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
          :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
          pandas DataFrame, or a list or iterable of arrays
        :param schema: |SchemaLikeObject| or
          :class:`spark:pyspark.sql.types.StructType`, defaults to None
        :param metadata: |ParamsLikeObject|, defaults to None
        :return: engine compatible dataframe

        :Notice:

        * If the input is already a
          :class:`~fugue_spark.dataframe.SparkDataFrame`, it should return
          itself
        * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
          ``schema`` must be specified
        * When ``schema`` is not None, a type cast may happen to make the
          dataframe conform to the given schema
        * All other methods in the engine can take arbitrary dataframes and
          call this method to convert them before doing anything
        """
        if isinstance(df, DataFrame):
            assert_or_throw(
                schema is None and metadata is None,
                ValueError(
                    "schema and metadata must be None when df is a DataFrame"),
            )
            if isinstance(df, SparkDataFrame):
                return df
            if isinstance(df, ArrowDataFrame):
                sdf = self.spark_session.createDataFrame(
                    df.as_array(), to_spark_schema(df.schema))
                return SparkDataFrame(sdf, df.schema, df.metadata)
            if isinstance(df, (ArrayDataFrame, IterableDataFrame)):
                adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema)
                sdf = self.spark_session.createDataFrame(
                    adf.as_array(), to_spark_schema(df.schema))
                return SparkDataFrame(sdf, df.schema, df.metadata)
            if any(pa.types.is_struct(t) for t in df.schema.types):
                sdf = self.spark_session.createDataFrame(
                    df.as_array(type_safe=True), to_spark_schema(df.schema))
            else:
                sdf = self.spark_session.createDataFrame(
                    df.as_pandas(), to_spark_schema(df.schema))
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if isinstance(df, ps.DataFrame):
            return SparkDataFrame(
                df, None if schema is None else to_schema(schema), metadata)
        if isinstance(df, RDD):
            assert_arg_not_none(schema, "schema")
            sdf = self.spark_session.createDataFrame(df,
                                                     to_spark_schema(schema))
            return SparkDataFrame(sdf, to_schema(schema), metadata)
        if isinstance(df, pd.DataFrame):
            if PD_UTILS.empty(df):
                temp_schema = to_spark_schema(PD_UTILS.to_schema(df))
                sdf = self.spark_session.createDataFrame([], temp_schema)
            else:
                sdf = self.spark_session.createDataFrame(df)
            return SparkDataFrame(sdf, schema, metadata)

        # use arrow dataframe here to handle nulls in int cols
        assert_or_throw(schema is not None,
                        FugueDataFrameInitError("schema can't be None"))
        adf = ArrowDataFrame(df, to_schema(schema))
        sdf = self.spark_session.createDataFrame(adf.as_array(),
                                                 to_spark_schema(adf.schema))
        return SparkDataFrame(sdf, adf.schema, metadata)
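
A minimal usage sketch of this method, assuming a local SparkSession and that SparkExecutionEngine (the engine class this method belongs to) is importable from fugue_spark:

from pyspark.sql import SparkSession
from fugue_spark import SparkExecutionEngine

spark = SparkSession.builder.master("local[*]").getOrCreate()
engine = SparkExecutionEngine(spark)

# a list of arrays requires an explicit schema (see the Notice above)
sdf = engine.to_df([[1, None], [2, 3]], "a:int,b:int")
print(sdf.as_array())  # [[1, None], [2, 3]]

# a SparkDataFrame passed back in is returned as is
assert engine.to_df(sdf) is sdf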
Example #5
    def df(self,
           data: Any = None,
           schema: Any = None,
           metadata: Any = None) -> ArrowDataFrame:
        return ArrowDataFrame(data, schema, metadata)
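
This is the factory the tests above call as self.df(...): it forwards its arguments straight to the ArrowDataFrame constructor, so every case in Example #1 exercises ArrowDataFrame directly.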