Example #1
 def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
     kw = ParamDict(kwargs)
     infer_schema = kw.get("infer_schema", False)
     if infer_schema:
         kw["inferSchema"] = True
     if "infer_schema" in kw:
         del kw["infer_schema"]
     header = str(kw.get_or_none("header", object)).lower()
     if "header" in kw:
         del kw["header"]
     reader = self._session.read.format("csv")
     reader.options(**kw)
     if header == "true":
         reader.option("header", "true")
         if columns is None:
             return SparkDataFrame(reader.load(p))
         if isinstance(columns, list):  # column names
             return SparkDataFrame(reader.load(p)[columns])
         schema = Schema(columns)
         return SparkDataFrame(reader.load(p)[schema.names], schema)
     if header in ["false", "none"]:
         reader.option("header", "false")
         if columns is None:
             raise InvalidOperationError("columns must be set when loading files without a header")
         if isinstance(columns, list):  # column names
             sdf = reader.load(p)
             inferred = to_schema(sdf)
             renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
             return SparkDataFrame(sdf.selectExpr(*renames))
         schema = Schema(columns)
         sdf = reader.schema(to_spark_schema(schema)).load(p)
         return SparkDataFrame(sdf, schema)
     else:
         raise NotImplementedError(f"{header} is not supported")
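For reference, a minimal sketch of the plain PySpark calls this CSV loader wraps; the path and column names below are placeholders, not part of the original code:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Read a headered CSV and let Spark infer the column types,
# mirroring the header/infer_schema handling above.
sdf = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("data.csv")
)

# Column pruning, equivalent to the `columns` list branch above.
sdf = sdf[["a", "b"]]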
Example #2
    def to_df(
        self, df: Any, schema: Any = None, metadata: Any = None
    ) -> SparkDataFrame:
        """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame`

        :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
          :class:`spark:pyspark.sql.DataFrame`, :class:`spark:pyspark.RDD`,
          pandas DataFrame, or a list or iterable of arrays
        :param schema: |SchemaLikeObject| or :class:`spark:pyspark.sql.types.StructType`,
          defaults to None
        :param metadata: |ParamsLikeObject|, defaults to None
        :return: engine compatible dataframe

        :Notice:

        * If the input is already a :class:`~fugue_spark.dataframe.SparkDataFrame`,
          it is returned as is
        * For :class:`~spark:pyspark.RDD`, list or iterable of arrays,
          ``schema`` must be specified
        * When ``schema`` is not None, a type cast may happen to make the
          dataframe conform to that schema
        * All other methods in the engine can take arbitrary dataframes and
          call this method to convert them before doing anything else
        """
        if isinstance(df, DataFrame):
            assert_or_throw(
                schema is None and metadata is None,
                ValueError("schema and metadata must be None when df is a DataFrame"),
            )
            if isinstance(df, SparkDataFrame):
                return df
            if any(pa.types.is_struct(t) for t in df.schema.types):
                sdf = self.spark_session.createDataFrame(
                    df.as_array(type_safe=True), to_spark_schema(df.schema)
                )
            else:
                sdf = self.spark_session.createDataFrame(
                    df.as_pandas(), to_spark_schema(df.schema)
                )
            return SparkDataFrame(sdf, df.schema, df.metadata)
        if isinstance(df, ps.DataFrame):
            return SparkDataFrame(
                df, None if schema is None else to_schema(schema), metadata
            )
        if isinstance(df, RDD):
            assert_arg_not_none(schema, "schema")
            sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema))
            return SparkDataFrame(sdf, to_schema(schema), metadata)
        if isinstance(df, pd.DataFrame):
            sdf = self.spark_session.createDataFrame(df)
            return SparkDataFrame(sdf, schema, metadata)

        # use arrow dataframe here to handle nulls in int cols
        adf = ArrowDataFrame(df, to_schema(schema))
        sdf = self.spark_session.createDataFrame(
            adf.as_array(), to_spark_schema(adf.schema)
        )
        return SparkDataFrame(sdf, adf.schema, metadata)
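A hedged usage sketch of to_df, assuming the method belongs to fugue's SparkExecutionEngine constructed from an existing SparkSession; the column names and values are made up:

import pandas as pd
from pyspark.sql import SparkSession
from fugue_spark import SparkExecutionEngine

engine = SparkExecutionEngine(SparkSession.builder.getOrCreate())

# pandas input: schema is optional.
df1 = engine.to_df(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}))

# list-of-arrays input: a schema must be given.
df2 = engine.to_df([[1, "x"], [2, "y"]], schema="a:int,b:str")

# an existing SparkDataFrame is returned unchanged.
assert engine.to_df(df1) is df1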
Example #3
 def _load_json(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
     reader = self._session.read.format("json")
     reader.options(**kwargs)
     if columns is None:
         return SparkDataFrame(reader.load(p))
     if isinstance(columns, list):  # column names
         return SparkDataFrame(reader.load(p))[columns]
     schema = Schema(columns)
     return SparkDataFrame(reader.load(p)[schema.names], schema)
Example #4
 def _load_parquet(
     self, p: List[str], columns: Any = None, **kwargs: Any
 ) -> DataFrame:
     sdf = self._session.read.parquet(*p, **kwargs)
     if columns is None:
         return SparkDataFrame(sdf)
     if isinstance(columns, list):  # column names
         return SparkDataFrame(sdf)[columns]
     schema = Schema(columns)
     return SparkDataFrame(sdf[schema.names], schema)
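A short sketch of the direct PySpark equivalent; the paths and column names are placeholders. Note that multiple paths can be passed positionally, which matches the *p unpacking above:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Parquet is self-describing, so no schema or header options are needed.
sdf = spark.read.parquet("/tmp/part1.parquet", "/tmp/part2.parquet")

# Column pruning is pushed down to the Parquet reader.
sdf = sdf.select("a", "b")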
Example #5
 def _load_avro(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
     reader = self._session.read.format(
         "avro"
     )  # avro is an external data source that has built-in support since spark 2.4
     reader.options(**kwargs)
     if columns is None:
         return SparkDataFrame(reader.load(p))
     if isinstance(columns, list):  # column names
         return SparkDataFrame(reader.load(p))[columns]
     schema = Schema(columns)
     return SparkDataFrame(reader.load(p)[schema.names], schema)
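Reading Avro directly in PySpark for comparison. Although the "avro" format is built in since Spark 2.4, the spark-avro module still ships separately; the package coordinates below are illustrative and must match your Spark and Scala versions:

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.1.2")
    .getOrCreate()
)

# "/tmp/data.avro" is a placeholder path.
sdf = spark.read.format("avro").load("/tmp/data.avro")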
Example #6
def _df(data, schema=None, metadata=None):
    session = SparkSession.builder.getOrCreate()
    if schema is not None:
        pdf = PandasDataFrame(data, to_schema(schema), metadata)
        df = session.createDataFrame(pdf.native, to_spark_schema(schema))
    else:
        df = session.createDataFrame(data)
    return SparkDataFrame(df, schema, metadata)
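The helper above is used by test_init further down (Example #13); a short usage sketch, with schema strings in fugue's name:type format:

# Explicit schema: values are coerced to the declared types.
df = _df([["a", 1], ["b", 2]], "a:str,b:str")

# No schema: Spark infers the column types from the data.
df2 = _df([["a", 1], ["b", 2]])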
Example #7
 def _persist(self, df: SparkDataFrame, lazy: bool, level: Any) -> SparkDataFrame:
     if level is None:
         level = StorageLevel.MEMORY_AND_DISK
     if isinstance(level, str) and level in StorageLevel.__dict__:
         level = StorageLevel.__dict__[level]
     if isinstance(level, StorageLevel):
         df.native.persist(level)  # pass the resolved storage level through to Spark
         if not lazy:
             ct = df.count()
             self.log.info("Persist dataframe with %s, count %i", level, ct)
         return df
     raise ValueError(f"{level} is not supported persist type")  # pragma: no cover
Example #8
def _test_as_array_perf():
    s = Schema()
    arr = []
    for i in range(100):
        s.append(f"a{i}:int")
        arr.append(i)
    for i in range(100):
        s.append(f"b{i}:int")
        arr.append(float(i))
    for i in range(100):
        s.append(f"c{i}:str")
        arr.append(str(i))
    data = []
    for i in range(5000):
        data.append(list(arr))
    df = SparkDataFrame(data, s)
    res = df.as_array()
    res = df.as_array(type_safe=True)
    nts, ts = 0.0, 0.0
    for i in range(10):
        t = datetime.now()
        res = df.as_array()
        nts += (datetime.now() - t).total_seconds()
        t = datetime.now()
        res = df.as_array(type_safe=True)
        ts += (datetime.now() - t).total_seconds()
    print(nts, ts)
Example #9
    def _map_by_pandas_udf(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        df = self.to_df(self.repartition(df, partition_spec))
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (
            None
            if on_init is None
            else RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )
        )

        def _udf(
            dfs: Iterable[pd.DataFrame],
        ) -> Iterable[pd.DataFrame]:  # pragma: no cover
            def get_dfs() -> Iterable[LocalDataFrame]:
                for df in dfs:
                    if df.shape[0] > 0:
                        yield PandasDataFrame(
                            df.reset_index(drop=True),
                            input_schema,
                            pandas_df_wrapper=True,
                        )

            input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
            if input_df.empty:
                return PandasDataFrame([], output_schema).as_pandas()
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            if isinstance(output_df, LocalDataFrameIterableDataFrame):
                for res in output_df.native:
                    yield res.as_pandas()
            else:
                yield output_df.as_pandas()

        df = self.to_df(df)
        sdf = df.native.mapInPandas(_udf, schema=to_spark_schema(output_schema))
        return SparkDataFrame(sdf, metadata=metadata)
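The wrapper above ultimately calls mapInPandas (Spark 3.0+); a standalone sketch with made-up column names:

from typing import Iterator

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, 2.0), (2, 3.0)], "id long, v double")

def add_one(batches: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    # Each element is a pandas DataFrame holding one Arrow batch of a partition.
    for pdf in batches:
        pdf["v"] = pdf["v"] + 1.0
        yield pdf

out = sdf.mapInPandas(add_one, schema="id long, v double")
out.show()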
Example #10
 def select(
         self, dfs: DataFrames,
         ibis_func: Callable[[ibis.BaseBackend],
                             ir.TableExpr]) -> DataFrame:
     for k, v in dfs.items():
         self.execution_engine.register(v, k)  # type: ignore
     con = ibis.pyspark.connect(
         self.execution_engine.spark_session)  # type: ignore
     expr = ibis_func(con)
     schema = to_schema(expr.schema())
     result = expr.compile()
     assert_or_throw(
         isinstance(result, PySparkDataFrame),
         lambda: ValueError(
             f"result must be a PySpark DataFrame ({type(result)})"),
     )
     return SparkDataFrame(result, schema=schema)
Example #11
 def df(self,
        data: Any = None,
        schema: Any = None,
        metadata: Any = None) -> SparkDataFrame:
     session = SparkSession.builder.getOrCreate()
     if data is None:
         df = None
     else:
         if schema is not None:
             pdf = PandasDataFrame(data, to_schema(schema), metadata)
             df = session.createDataFrame(pdf.native,
                                          to_spark_schema(schema))
         else:
             try:
                 df = session.createDataFrame(data)
             except Exception:
                 raise FugueDataFrameInitError("schema error")
     return SparkDataFrame(df, schema, metadata)
Example #12
 def save_df(
     self,
     df: SparkDataFrame,
     uri: str,
     format_hint: Optional[str] = None,
     partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
     mode: str = "overwrite",
     force_single: bool = False,
     **kwargs: Any,
 ) -> None:
     if not force_single:
         p = FileParser(uri, format_hint)
         writer = self._get_writer(df.native, partition_spec)
         writer.format(p.file_format).options(**kwargs).mode(mode)
         writer.save(uri)
     else:
         ldf = df.as_local()
         save_df(ldf, uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs)
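For reference, the bare DataFrameWriter chain that the non-single-file branch builds; the path, format, and partition column are placeholders:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, "x"), (2, "y")], "id long, name string")

# Writer methods mutate and return the same DataFrameWriter,
# so the chained form above and step-by-step calls are equivalent.
(
    sdf.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("id")  # analogous to partition_spec.partition_by
    .save("/tmp/out.parquet")
)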
Example #13
def test_init(spark_session):
    sdf = spark_session.createDataFrame([["a", 1]])
    df = SparkDataFrame(sdf, "a:str,b:double")
    assert [["a", 1.0]] == df.as_array()
    assert [["a", 1.0]] == df.as_pandas().values.tolist()
    assert not df.is_local
    assert df.is_bounded
    assert df.num_partitions > 0

    df = _df([["a", 1], ["b", 2]])
    assert [["a", 1], ["b", 2]] == df.as_array()
    df = _df([], "a:str,b:str")
    assert [] == df.as_array()
    assert df.schema == "a:str,b:str"
    df = _df([["a", 1], ["b", 2]], "a:str,b:str")
    assert [["a", "1"], ["b", "2"]] == df.as_array()
    assert df.schema == "a:str,b:str"
Example #14
    def _group_map_by_pandas_udf(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (
            None
            if on_init is None
            else RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )
        )

        def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
            if pdf.shape[0] == 0:
                return PandasDataFrame([], output_schema).as_pandas()
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(
                pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
            )
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        df = self.to_df(df)

        gdf = df.native.groupBy(*partition_spec.partition_by)
        sdf = gdf.applyInPandas(_udf, schema=to_spark_schema(output_schema))
        return SparkDataFrame(sdf, metadata=metadata)
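The grouped-map path relies on applyInPandas (Spark 3.0+); a standalone sketch with hypothetical column names:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, 2.0), (1, 4.0), (2, 6.0)], "id long, v double")

def mean_per_group(pdf: pd.DataFrame) -> pd.DataFrame:
    # pdf holds all rows of one group as a single pandas DataFrame.
    return pd.DataFrame({"id": [pdf["id"].iloc[0]], "v": [pdf["v"].mean()]})

out = sdf.groupBy("id").applyInPandas(mean_per_group, schema="id long, v double")
out.show()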
Example #15
def test_nested(spark_session):
    # data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]]
    # df = SparkDataFrame(data, "a:{a:str,b:[int]}")
    # a = df.as_array(type_safe=True)
    # assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a

    data = [[[10, 20]]]
    sdf = spark_session.createDataFrame(data, to_spark_schema("a:[int]"))
    df = SparkDataFrame(sdf)
    assert data == df.as_array(type_safe=False)
    assert data == df.as_array(type_safe=True)
    assert data == list(df.as_array_iterable(type_safe=False))
    assert data == list(df.as_array_iterable(type_safe=True))

    data = [[dict(b=[30, 40])]]
    sdf = spark_session.createDataFrame(data,
                                        to_spark_schema("a:{a:str,b:[int]}"))
    df = SparkDataFrame(sdf)
    a = df.as_array(type_safe=False)
    assert [[dict(a=None, b=[30, 40])]] == a
    a = df.as_array(type_safe=True)
    assert [[dict(a=None, b=[30, 40])]] == a
    a = list(df.as_array_iterable(type_safe=False))
    assert [[dict(a=None, b=[30, 40])]] == a
    a = list(df.as_array_iterable(type_safe=True))
    assert [[dict(a=None, b=[30, 40])]] == a
Example #16
 def select(self, dfs: DataFrames, statement: str) -> DataFrame:
     for k, v in dfs.items():
         self.execution_engine.register(v, k)  # type: ignore
     return SparkDataFrame(
         self.execution_engine.spark_session.sql(statement)  # type: ignore
     )
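The same effect without the engine wrapper, assuming register exposes each dataframe as a temporary view; the table and column names are made up:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.createDataFrame([(1, "x"), (2, "y")], "a long, b string").createOrReplaceTempView("t")

# spark.sql returns a regular pyspark DataFrame, which can then be wrapped
# into a SparkDataFrame as done above.
result = spark.sql("SELECT a, b FROM t WHERE a > 1")
result.show()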
Example #17
 def _broadcast(self, df: SparkDataFrame) -> SparkDataFrame:
     sdf = broadcast(df.native)
     return SparkDataFrame(sdf, df.schema, df.metadata)
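broadcast here is the standard PySpark join hint; a minimal sketch of what it does in a join:

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

spark = SparkSession.builder.getOrCreate()
large = spark.createDataFrame([(1, "x"), (2, "y")], "k long, v string")
small = spark.createDataFrame([(1, "a")], "k long, w string")

# Hint Spark to ship `small` to every executor instead of shuffling both sides.
joined = large.join(broadcast(small), on="k", how="inner")
joined.show()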