def to_schema(obj: Any) -> Schema:
    assert_arg_not_none(obj, "obj")
    if isinstance(obj, pt.StructType):
        return Schema(_to_arrow_schema(obj))
    if isinstance(obj, ps.DataFrame):
        return to_schema(obj.schema)
    return Schema(obj)

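
# --- Illustrative usage sketch (added example, not part of the original source) ---
# to_schema accepts a Spark StructType, a Spark DataFrame, or anything Schema()
# itself understands (e.g. a "name:type" expression string). The SparkSession and
# the column names below are assumptions made only for this example; pt is the
# module's pyspark.sql.types alias.
def _to_schema_example() -> None:
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    s1 = to_schema("a:long,b:str")  # from a schema expression string
    s2 = to_schema(pt.StructType([pt.StructField("a", pt.LongType())]))  # from a StructType
    s3 = to_schema(spark.createDataFrame([(1, "x")], "a long, b string"))  # from a Spark DataFrame
    assert s1 == Schema("a:long,b:str")
    assert s2 == Schema("a:long")
    assert s3 == Schema("a:long,b:str")
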
def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
    kw = ParamDict(kwargs)
    infer_schema = kw.get("infer_schema", False)
    if infer_schema:
        kw["inferSchema"] = True
    if "infer_schema" in kw:
        del kw["infer_schema"]
    header = str(kw.get_or_none("header", object)).lower()
    if "header" in kw:
        del kw["header"]
    reader = self._session.read.format("csv")
    reader.options(**kw)
    if header == "true":
        reader.option("header", "true")
        if columns is None:
            return SparkDataFrame(reader.load(p))
        if isinstance(columns, list):  # column names
            return SparkDataFrame(reader.load(p)[columns])
        schema = Schema(columns)
        return SparkDataFrame(reader.load(p)[schema.names], schema)
    if header in ["false", "none"]:
        reader.option("header", "false")
        if columns is None:
            raise InvalidOperationError("columns must be set if without header")
        if isinstance(columns, list):  # column names
            sdf = reader.load(p)
            inferred = to_schema(sdf)
            renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
            return SparkDataFrame(sdf.selectExpr(*renames))
        schema = Schema(columns)
        sdf = reader.schema(to_spark_schema(schema)).load(p)
        return SparkDataFrame(sdf, schema)
    else:
        raise NotImplementedError(f"{header} is not supported")

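
# --- Illustrative call sketch (added example; "io" and the file path are assumptions) ---
# With a header row, columns may be omitted, given as a list of names to select, or
# given as a schema expression whose names are selected and whose types are enforced:
#   io._load_csv(["/tmp/data.csv"], header=True)
#   io._load_csv(["/tmp/data.csv"], columns=["a", "b"], header=True)
#   io._load_csv(["/tmp/data.csv"], columns="a:int,b:str", header=True)
# Without a header row, columns are required; a schema expression is applied as the
# read schema, while a plain name list only renames the positionally loaded columns:
#   io._load_csv(["/tmp/data.csv"], columns="a:int,b:str", header=False)
# Pass infer_schema=True to turn on Spark's inferSchema option.
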
def from_func(
    func: Callable, schema: Any, validation_rules: Dict[str, Any]
) -> "_FuncAsProcessor":
    if schema is None:
        schema = parse_output_schema_from_comment(func)
    validation_rules.update(parse_validation_rules_from_comment(func))
    tr = _FuncAsProcessor()
    tr._wrapper = FunctionWrapper(func, "^e?(c|[dlspq]+)x*z?$", "^[dlspq]$")  # type: ignore
    tr._engine_param = (
        tr._wrapper._params.get_value_by_index(0)
        if tr._wrapper.input_code.startswith("e")
        else None
    )
    tr._use_dfs = "c" in tr._wrapper.input_code
    tr._need_output_schema = tr._wrapper.need_output_schema
    tr._validation_rules = validation_rules
    tr._output_schema = Schema(schema)
    if len(tr._output_schema) == 0:
        assert_or_throw(
            tr._need_output_schema is None or not tr._need_output_schema,
            FugueInterfacelessError(
                f"schema must be provided for return type {tr._wrapper._rt}"
            ),
        )
    else:
        assert_or_throw(
            tr._need_output_schema is None or tr._need_output_schema,
            FugueInterfacelessError(
                f"schema must not be provided for return type {tr._wrapper._rt}"
            ),
        )
    return tr

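
# --- Illustrative input sketch (added example; the function below is hypothetical) ---
# When schema is None, from_func falls back to the "# schema:" hint comment on the
# wrapped function, which parse_output_schema_from_comment reads:
#
# # schema: a:int,b:int
# def add_one(df: pd.DataFrame) -> pd.DataFrame:
#     return df.assign(b=df["a"] + 1)
#
# Validation rules can be declared through comments in the same way; they are parsed
# by parse_validation_rules_from_comment and merged into the rules passed in explicitly.
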
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    if (
        self.conf.get_or_throw("fugue.spark.use_pandas_udf", bool)
        and len(partition_spec.partition_by) > 0
        and not any(pa.types.is_nested(t) for t in Schema(output_schema).types)
    ):
        return self._map_by_pandas_udf(
            df,
            map_func=map_func,
            output_schema=output_schema,
            partition_spec=partition_spec,
            metadata=metadata,
            on_init=on_init,
        )
    df = self.to_df(self.repartition(df, partition_spec))
    mapper = _Mapper(df, map_func, output_schema, partition_spec, on_init)
    sdf = df.native.rdd.mapPartitionsWithIndex(mapper.run, True)
    return self.to_df(sdf, output_schema, metadata)

def to_spark_schema(obj: Any) -> pt.StructType:
    assert_arg_not_none(obj, "schema")
    if isinstance(obj, pt.StructType):
        return obj
    if isinstance(obj, ps.DataFrame):
        return obj.schema
    return _from_arrow_schema(Schema(obj).pa_schema)

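
# --- Illustrative usage sketch (added example, not part of the original source) ---
# to_spark_schema is the opposite direction of to_schema: anything Schema() accepts
# is converted to a pyspark StructType through the arrow schema bridge.
def _to_spark_schema_example() -> None:
    st = to_spark_schema("a:long,b:str")  # from a schema expression string
    assert isinstance(st, pt.StructType)
    assert to_spark_schema(st) is st  # a StructType passes through unchanged
    assert to_schema(st) == Schema("a:long,b:str")  # and round-trips back to a Schema
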
def _serialize_by_partition(
    self,
    df: DataFrame,
    partition_spec: PartitionSpec,
    df_name: str,
    temp_path: Optional[str] = None,
    to_file_threshold: Any = -1,
    has_name: bool = False,
) -> DataFrame:
    to_file_threshold = _get_file_threshold(to_file_threshold)
    on = list(filter(lambda k: k in df.schema, partition_spec.partition_by))
    presort = list(
        filter(lambda p: p[0] in df.schema, partition_spec.presort.items())
    )
    col_name = _df_name_to_serialize_col(df_name)
    if len(on) == 0:
        partition_spec = PartitionSpec(
            partition_spec, num=1, by=[], presort=presort
        )
        output_schema = Schema(f"{col_name}:str")
    else:
        partition_spec = PartitionSpec(partition_spec, by=on, presort=presort)
        output_schema = partition_spec.get_key_schema(df.schema) + f"{col_name}:str"
    s = _PartitionSerializer(output_schema, temp_path, to_file_threshold)
    metadata = dict(
        serialized=True,
        serialized_cols={df_name: col_name},
        schemas={df_name: str(df.schema)},
        serialized_has_name=has_name,
    )
    return self.map(df, s.run, output_schema, partition_spec, metadata)

def from_func(func: Callable, schema: Any) -> "_FuncAsCreator":
    # pylint: disable=W0201
    if schema is None:
        schema = parse_output_schema_from_comment(func)
    tr = _FuncAsCreator()
    tr._wrapper = FunctionWrapper(func, "^e?x*z?$", "^[dlspq]$")  # type: ignore
    tr._need_engine = tr._wrapper.input_code.startswith("e")
    tr._need_output_schema = "s" == tr._wrapper.output_code
    tr._output_schema = Schema(schema)
    if len(tr._output_schema) == 0:
        assert_or_throw(
            not tr._need_output_schema,
            FugueInterfacelessError(
                f"schema must be provided for return type {tr._wrapper._rt}"
            ),
        )
    else:
        assert_or_throw(
            tr._need_output_schema,
            FugueInterfacelessError(
                f"schema must not be provided for return type {tr._wrapper._rt}"
            ),
        )
    return tr

def to_output_df(
    self, output: EmptyAwareIterable[Dict[str, Any]], schema: Any
) -> DataFrame:
    schema = schema if isinstance(schema, Schema) else Schema(schema)

    def get_all() -> Iterable[List[Any]]:
        for row in output:
            yield [row[x] for x in schema.names]

    return IterableDataFrame(get_all(), schema)

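
# --- Illustrative behavior sketch (added example; "wrapper" is an assumed instance
# of the enclosing class) ---
# Each incoming dict is reordered into a list that follows schema.names, and the
# generator is wrapped lazily, so nothing is materialized until the IterableDataFrame
# is consumed:
#   rows: an EmptyAwareIterable of dicts such as {"b": "x", "a": 1}
#   wrapper.to_output_df(rows, "a:int,b:str").as_array()  # -> [[1, "x"], ...]
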
def _load_json(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
    reader = self._session.read.format("json")
    reader.options(**kwargs)
    if columns is None:
        return SparkDataFrame(reader.load(p))
    if isinstance(columns, list):  # column names
        return SparkDataFrame(reader.load(p))[columns]
    schema = Schema(columns)
    return SparkDataFrame(reader.load(p)[schema.names], schema)

def _load_parquet(
    self, p: List[str], columns: Any = None, **kwargs: Any
) -> DataFrame:
    sdf = self._session.read.parquet(*p, **kwargs)
    if columns is None:
        return SparkDataFrame(sdf)
    if isinstance(columns, list):  # column names
        return SparkDataFrame(sdf)[columns]
    schema = Schema(columns)
    return SparkDataFrame(sdf[schema.names], schema)

def _load_avro(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
    # Avro is a built-in but external data source module since Spark 2.4
    reader = self._session.read.format("avro")
    reader.options(**kwargs)
    if columns is None:
        return SparkDataFrame(reader.load(p))
    if isinstance(columns, list):  # column names
        return SparkDataFrame(reader.load(p))[columns]
    schema = Schema(columns)
    return SparkDataFrame(reader.load(p)[schema.names], schema)

def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    if partition_spec.num_partitions != "0":
        self.log.warning(
            "%s doesn't respect num_partitions %s",
            self,
            partition_spec.num_partitions,
        )
    cursor = partition_spec.get_cursor(df.schema, 0)
    if on_init is not None:
        on_init(0, df)
    if len(partition_spec.partition_by) == 0:  # no partition
        df = to_local_df(df)
        cursor.set(df.peek_array(), 0, 0)
        output_df = map_func(cursor, df)
        if (
            isinstance(output_df, PandasDataFrame)
            and output_df.schema != output_schema
        ):
            output_df = PandasDataFrame(output_df.native, output_schema)
        assert_or_throw(
            output_df.schema == output_schema,
            lambda: f"map output {output_df.schema} "
            f"mismatches given {output_schema}",
        )
        output_df._metadata = ParamDict(metadata, deep=True)
        output_df._metadata.set_readonly()
        return self.to_df(output_df)
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)

    def _map(pdf: pd.DataFrame) -> pd.DataFrame:
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), df.schema, pandas_df_wrapper=True
        )
        cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    result = self.pl_utils.safe_groupby_apply(
        df.as_pandas(), partition_spec.partition_by, _map
    )
    return PandasDataFrame(result, output_schema, metadata)

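
# --- Illustrative map_func sketch (added example; the column names and schema are
# assumptions for demonstration) ---
# map() expects map_func to take the partition cursor plus the local slice of the
# input and to return a LocalDataFrame that matches output_schema.
def _example_map_func(cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
    pdf = df.as_pandas()
    pdf["n"] = len(pdf)  # e.g. tag each row with the size of its partition
    return PandasDataFrame(pdf, "a:int,n:long")  # must equal the output_schema passed to map()
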
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)
    input_schema = df.schema
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _map(pdf: Any) -> pd.DataFrame:
        if pdf.shape[0] == 0:
            return PandasDataFrame([], output_schema).as_pandas()
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
        )
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    df = self.to_df(df)
    if len(partition_spec.partition_by) == 0:
        pdf = self.repartition(df, partition_spec)
        result = pdf.native.map_partitions(_map, meta=output_schema.pandas_dtype)
    else:
        df = self.repartition(df, PartitionSpec(num=partition_spec.num_partitions))
        result = self.pl_utils.safe_groupby_apply(
            df.native,
            partition_spec.partition_by,
            _map,
            meta=output_schema.pandas_dtype,
        )
    return DaskDataFrame(result, output_schema, metadata)

def __init__(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    on_init: Optional[Callable[[int, DataFrame], Any]],
):
    super().__init__()
    self.schema = df.schema
    self.output_schema = Schema(output_schema)
    self.metadata = df.metadata
    self.partition_spec = partition_spec
    self.map_func = map_func
    self.on_init = on_init

def create_data(
    self, data: Any, schema: Any = None, metadata: Any = None
) -> WorkflowDataFrame:
    """Create dataframe.

    :param data: |DataFrameLikeObject|
    :param schema: |SchemaLikeObject|, defaults to None
    :param metadata: |ParamsLikeObject|, defaults to None
    :return: a dataframe of the current workflow
    """
    if isinstance(data, WorkflowDataFrame):
        assert_or_throw(
            data.workflow is self, f"{data} does not belong to this workflow"
        )
        return data
    schema = None if schema is None else Schema(schema)
    return self.create(
        using=CreateData, params=dict(data=data, schema=schema, metadata=metadata)
    )

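
# --- Illustrative usage sketch (added example; assumes the enclosing class is the
# workflow object, e.g. FugueWorkflow) ---
#   dag = FugueWorkflow()
#   df = dag.create_data([[0, "x"], [1, "y"]], "a:int,b:str")  # raw data + schema
#   same = dag.create_data(df)  # a WorkflowDataFrame of the same workflow passes through
#   dag.run()
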
def _map_by_pandas_udf(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    presort = partition_spec.presort
    presort_keys = list(presort.keys())
    presort_asc = list(presort.values())
    output_schema = Schema(output_schema)
    input_schema = df.schema
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
        if pdf.shape[0] == 0:
            return PandasDataFrame([], output_schema).as_pandas()
        if len(presort_keys) > 0:
            pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
        input_df = PandasDataFrame(
            pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
        )
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        return output_df.as_pandas()

    df = self.to_df(df)
    udf = pandas_udf(_udf, to_spark_schema(output_schema), PandasUDFType.GROUPED_MAP)
    sdf = df.native.groupBy(*partition_spec.partition_by).apply(udf)
    return SparkDataFrame(sdf, metadata=metadata)

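
# --- Illustrative configuration sketch (added example; the engine class name is an
# assumption) ---
# map() above only reaches this method when the engine config enables pandas UDFs,
# there is at least one partition key, and no output column has a nested type:
#
#   engine = SparkExecutionEngine(spark_session, {"fugue.spark.use_pandas_udf": True})
#
# The grouped-map pandas UDF then replaces the RDD mapPartitionsWithIndex path.
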