Example #1
def to_schema(obj: Any) -> Schema:
    assert_arg_not_none(obj, "obj")
    if isinstance(obj, pt.StructType):
        return Schema(_to_arrow_schema(obj))
    if isinstance(obj, ps.DataFrame):
        return to_schema(obj.schema)
    return Schema(obj)
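
The fallback branch above simply calls Schema(obj): the Schema used throughout these snippets is triad's Schema, and it does the actual parsing. A minimal standalone sketch of that constructor and the accessors the loaders below rely on, assuming only the triad package is installed; the column names are made up:

from triad.collections.schema import Schema

# build from a schema expression string, the form used in all of these examples
s = Schema("a:int,b:str")
print(len(s))        # 2
print(s.names)       # ['a', 'b']
print(s.pa_schema)   # the underlying pyarrow.Schema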
Example #2
 def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
     kw = ParamDict(kwargs)
     infer_schema = kw.get("infer_schema", False)
     if infer_schema:
         kw["inferSchema"] = True
     if "infer_schema" in kw:
         del kw["infer_schema"]
     header = str(kw.get_or_none("header", object)).lower()
     if "header" in kw:
         del kw["header"]
     reader = self._session.read.format("csv")
     reader.options(**kw)
     if header == "true":
         reader.option("header", "true")
         if columns is None:
             return SparkDataFrame(reader.load(p))
         if isinstance(columns, list):  # column names
             return SparkDataFrame(reader.load(p)[columns])
         schema = Schema(columns)
         return SparkDataFrame(reader.load(p)[schema.names], schema)
     if header in ["false", "none"]:
         reader.option("header", "false")
         if columns is None:
             raise InvalidOperationError("columns must be set if without header")
         if isinstance(columns, list):  # column names
             sdf = reader.load(p)
             inferred = to_schema(sdf)
             renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
             return SparkDataFrame(sdf.selectExpr(*renames))
         schema = Schema(columns)
         sdf = reader.schema(to_spark_schema(schema)).load(p)
         return SparkDataFrame(sdf, schema)
     else:
         raise NotImplementedError(f"{header} is not supported")
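
In the header-less branch with a plain list of column names, the loader lets Spark infer _c0, _c1, ... style names and then renames them positionally through selectExpr. A small illustration of how those rename expressions are built, using hypothetical inferred names and no Spark session:

inferred_names = ["_c0", "_c1", "_c2"]   # hypothetical names inferred for a header-less CSV
wanted = ["id", "name", "score"]         # the caller-provided column list
renames = [f"{k} AS {v}" for k, v in zip(inferred_names, wanted)]
print(renames)   # ['_c0 AS id', '_c1 AS name', '_c2 AS score']
# these strings are what selectExpr(*renames) receives in the snippet above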
Example #3
 def from_func(func: Callable, schema: Any,
               validation_rules: Dict[str, Any]) -> "_FuncAsProcessor":
     if schema is None:
         schema = parse_output_schema_from_comment(func)
     validation_rules.update(parse_validation_rules_from_comment(func))
     tr = _FuncAsProcessor()
     tr._wrapper = FunctionWrapper(func, "^e?(c|[dlspq]+)x*z?$",
                                   "^[dlspq]$")  # type: ignore
     tr._engine_param = (tr._wrapper._params.get_value_by_index(0) if
                         tr._wrapper.input_code.startswith("e") else None)
     tr._use_dfs = "c" in tr._wrapper.input_code
     tr._need_output_schema = tr._wrapper.need_output_schema
     tr._validation_rules = validation_rules
     tr._output_schema = Schema(schema)
     if len(tr._output_schema) == 0:
         assert_or_throw(
             tr._need_output_schema is None or not tr._need_output_schema,
             FugueInterfacelessError(
                 f"schema must be provided for return type {tr._wrapper._rt}"
             ),
         )
     else:
         assert_or_throw(
             tr._need_output_schema is None or tr._need_output_schema,
             FugueInterfacelessError(
                 f"schema must not be provided for return type {tr._wrapper._rt}"
             ),
         )
     return tr
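
When schema is None, parse_output_schema_from_comment reads a schema hint placed next to the function, so an interfaceless processor can carry its own output schema. A hedged sketch of that comment convention; the function itself is made up for illustration:

from typing import Any, Dict, Iterable, List

# schema: a:int,b:int
def add_one(rows: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # the "# schema:" line above is the hint the parser picks up
    return [dict(a=r["a"], b=r["a"] + 1) for r in rows]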
Example #4
 def map(
     self,
     df: DataFrame,
     map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
     output_schema: Any,
     partition_spec: PartitionSpec,
     metadata: Any = None,
     on_init: Optional[Callable[[int, DataFrame], Any]] = None,
 ) -> DataFrame:
     if (self.conf.get_or_throw("fugue.spark.use_pandas_udf", bool)
             and len(partition_spec.partition_by) > 0 and not any(
                 pa.types.is_nested(t)
                 for t in Schema(output_schema).types)):
         return self._map_by_pandas_udf(
             df,
             map_func=map_func,
             output_schema=output_schema,
             partition_spec=partition_spec,
             metadata=metadata,
             on_init=on_init,
         )
     df = self.to_df(self.repartition(df, partition_spec))
     mapper = _Mapper(df, map_func, output_schema, partition_spec, on_init)
     sdf = df.native.rdd.mapPartitionsWithIndex(mapper.run, True)
     return self.to_df(sdf, output_schema, metadata)
Example #5
def to_spark_schema(obj: Any) -> pt.StructType:
    assert_arg_not_none(obj, "schema")
    if isinstance(obj, pt.StructType):
        return obj
    if isinstance(obj, ps.DataFrame):
        return obj.schema
    return _from_arrow_schema(Schema(obj).pa_schema)
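
to_spark_schema and to_schema (Example #1) are inverses that round-trip through Arrow. A sketch of that round trip, assuming pyspark is installed; the import path of the two helpers is an assumption, since these snippets do not show their module:

import pyspark.sql.types as pt
from fugue_spark._utils.convert import to_schema, to_spark_schema  # assumed module path

spark_schema = pt.StructType(
    [pt.StructField("a", pt.LongType()), pt.StructField("b", pt.StringType())]
)
fugue_schema = to_schema(spark_schema)                 # e.g. Schema('a:long,b:str')
assert to_spark_schema(fugue_schema) == spark_schema   # expected to round-trip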
Example #6
 def _serialize_by_partition(
     self,
     df: DataFrame,
     partition_spec: PartitionSpec,
     df_name: str,
     temp_path: Optional[str] = None,
     to_file_threshold: Any = -1,
     has_name: bool = False,
 ) -> DataFrame:
     to_file_threshold = _get_file_threshold(to_file_threshold)
     on = list(filter(lambda k: k in df.schema, partition_spec.partition_by))
     presort = list(
         filter(lambda p: p[0] in df.schema, partition_spec.presort.items())
     )
     col_name = _df_name_to_serialize_col(df_name)
     if len(on) == 0:
         partition_spec = PartitionSpec(
             partition_spec, num=1, by=[], presort=presort
         )
         output_schema = Schema(f"{col_name}:str")
     else:
         partition_spec = PartitionSpec(partition_spec, by=on, presort=presort)
         output_schema = partition_spec.get_key_schema(df.schema) + f"{col_name}:str"
     s = _PartitionSerializer(output_schema, temp_path, to_file_threshold)
     metadata = dict(
         serialized=True,
         serialized_cols={df_name: col_name},
         schemas={df_name: str(df.schema)},
         serialized_has_name=has_name,
     )
     return self.map(df, s.run, output_schema, partition_spec, metadata)
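
The serialized output schema here is the partition-key schema plus one extra string column, composed with Schema's + operator (the same operator used on the output_schema line above). A tiny standalone sketch with made-up names:

from triad.collections.schema import Schema

key_schema = Schema("region:str,day:int")     # hypothetical partition-key schema
col_name = "__serialized_df"                  # hypothetical serialize column name
output_schema = key_schema + f"{col_name}:str"
print(output_schema)   # region:str,day:int,__serialized_df:str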
Example #7
 def from_func(func: Callable, schema: Any) -> "_FuncAsCreator":
     # pylint: disable=W0201
     if schema is None:
         schema = parse_output_schema_from_comment(func)
     tr = _FuncAsCreator()
     tr._wrapper = FunctionWrapper(func, "^e?x*z?$",
                                   "^[dlspq]$")  # type: ignore
     tr._need_engine = tr._wrapper.input_code.startswith("e")
     tr._need_output_schema = "s" == tr._wrapper.output_code
     tr._output_schema = Schema(schema)
     if len(tr._output_schema) == 0:
         assert_or_throw(
             not tr._need_output_schema,
             FugueInterfacelessError(
                 f"schema must be provided for return type {tr._wrapper._rt}"
             ),
         )
     else:
         assert_or_throw(
             tr._need_output_schema,
             FugueInterfacelessError(
                 f"schema must not be provided for return type {tr._wrapper._rt}"
             ),
         )
     return tr
Example #8
    def to_output_df(self, output: EmptyAwareIterable[Dict[str, Any]],
                     schema: Any) -> DataFrame:
        schema = schema if isinstance(schema, Schema) else Schema(schema)

        def get_all() -> Iterable[List[Any]]:
            for row in output:
                yield [row[x] for x in schema.names]

        return IterableDataFrame(get_all(), schema)
Example #9
 def _load_json(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
     reader = self._session.read.format("json")
     reader.options(**kwargs)
     if columns is None:
         return SparkDataFrame(reader.load(p))
     if isinstance(columns, list):  # column names
         return SparkDataFrame(reader.load(p))[columns]
     schema = Schema(columns)
     return SparkDataFrame(reader.load(p)[schema.names], schema)
Example #10
 def _load_parquet(
     self, p: List[str], columns: Any = None, **kwargs: Any
 ) -> DataFrame:
     sdf = self._session.read.parquet(*p, **kwargs)
     if columns is None:
         return SparkDataFrame(sdf)
     if isinstance(columns, list):  # column names
         return SparkDataFrame(sdf)[columns]
     schema = Schema(columns)
     return SparkDataFrame(sdf[schema.names], schema)
Example #11
 def _load_avro(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
     reader = self._session.read.format(
         "avro"
     )  # avro is an external data source that has built-in support since spark 2.4
     reader.options(**kwargs)
     if columns is None:
         return SparkDataFrame(reader.load(p))
     if isinstance(columns, list):  # column names
         return SparkDataFrame(reader.load(p))[columns]
     schema = Schema(columns)
     return SparkDataFrame(reader.load(p)[schema.names], schema)
Example #12
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        if partition_spec.num_partitions != "0":
            self.log.warning(
                "%s doesn't respect num_partitions %s",
                self,
                partition_spec.num_partitions,
            )
        cursor = partition_spec.get_cursor(df.schema, 0)
        if on_init is not None:
            on_init(0, df)
        if len(partition_spec.partition_by) == 0:  # no partition
            df = to_local_df(df)
            cursor.set(df.peek_array(), 0, 0)
            output_df = map_func(cursor, df)
            if (isinstance(output_df, PandasDataFrame)
                    and output_df.schema != output_schema):
                output_df = PandasDataFrame(output_df.native, output_schema)
            assert_or_throw(
                output_df.schema == output_schema,
                lambda: f"map output {output_df.schema} "
                f"mismatches given {output_schema}",
            )
            output_df._metadata = ParamDict(metadata, deep=True)
            output_df._metadata.set_readonly()
            return self.to_df(output_df)
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)

        def _map(pdf: pd.DataFrame) -> pd.DataFrame:
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(pdf.reset_index(drop=True),
                                       df.schema,
                                       pandas_df_wrapper=True)
            cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        result = self.pl_utils.safe_groupby_apply(df.as_pandas(),
                                                  partition_spec.partition_by,
                                                  _map)
        return PandasDataFrame(result, output_schema, metadata)
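
Inside _map, presort is nothing more than pandas sort_values with parallel key and ascending lists, applied before each group reaches map_func. A standalone illustration of that step with made-up data:

import pandas as pd

presort = {"score": False, "name": True}   # hypothetical presort: score desc, then name asc
pdf = pd.DataFrame({"name": ["x", "y", "z"], "score": [1, 3, 3]})
print(pdf.sort_values(list(presort.keys()), ascending=list(presort.values())))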
Example #13
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (
            None
            if on_init is None
            else RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )
        )

        def _map(pdf: Any) -> pd.DataFrame:
            if pdf.shape[0] == 0:
                return PandasDataFrame([], output_schema).as_pandas()
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(
                pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
            )
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        df = self.to_df(df)
        if len(partition_spec.partition_by) == 0:
            pdf = self.repartition(df, partition_spec)
            result = pdf.native.map_partitions(_map, meta=output_schema.pandas_dtype)
        else:
            df = self.repartition(df, PartitionSpec(num=partition_spec.num_partitions))
            result = self.pl_utils.safe_groupby_apply(
                df.native,
                partition_spec.partition_by,
                _map,
                meta=output_schema.pandas_dtype,
            )
        return DaskDataFrame(result, output_schema, metadata)
Example #14
 def __init__(
     self,
     df: DataFrame,
     map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
     output_schema: Any,
     partition_spec: PartitionSpec,
     on_init: Optional[Callable[[int, DataFrame], Any]],
 ):
     super().__init__()
     self.schema = df.schema
     self.output_schema = Schema(output_schema)
     self.metadata = df.metadata
     self.partition_spec = partition_spec
     self.map_func = map_func
     self.on_init = on_init
Example #15
    def create_data(
        self, data: Any, schema: Any = None, metadata: Any = None
    ) -> WorkflowDataFrame:
        """Create dataframe.

        :param data: |DataFrameLikeObject|
        :param schema: |SchemaLikeObject|, defaults to None
        :param metadata: |ParamsLikeObject|, defaults to None
        :return: a dataframe of the current workflow
        """
        if isinstance(data, WorkflowDataFrame):
            assert_or_throw(
                data.workflow is self, f"{data} does not belong to this workflow"
            )
            return data
        schema = None if schema is None else Schema(schema)
        return self.create(
            using=CreateData, params=dict(data=data, schema=schema, metadata=metadata)
        )
Example #16
    def _map_by_pandas_udf(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (None if on_init is None else RunOnce(
            on_init,
            lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))))

        def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
            if pdf.shape[0] == 0:
                return PandasDataFrame([], output_schema).as_pandas()
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(pdf.reset_index(drop=True),
                                       input_schema,
                                       pandas_df_wrapper=True)
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        df = self.to_df(df)
        udf = pandas_udf(_udf, to_spark_schema(output_schema),
                         PandasUDFType.GROUPED_MAP)
        sdf = df.native.groupBy(*partition_spec.partition_by).apply(udf)
        return SparkDataFrame(sdf, metadata=metadata)
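
pandas_udf with PandasUDFType.GROUPED_MAP plus groupBy(...).apply(udf) is the Spark 2.4-era grouped-map API; in current PySpark the same pattern is spelled groupBy(...).applyInPandas(...). A minimal self-contained sketch of that pattern (not Fugue's code), assuming a local Spark session is available:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
sdf = spark.createDataFrame([(1, 2.0), (1, 3.0), (2, 4.0)], "a long, b double")

def mean_b(pdf: pd.DataFrame) -> pd.DataFrame:
    # receives one pandas DataFrame per group of "a", like _udf above
    return pdf.groupby("a", as_index=False).agg({"b": "mean"})

sdf.groupBy("a").applyInPandas(mean_b, schema="a long, b double").show()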