def test_load_parquet_folder(self):
    e = self.engine
    native = NativeExecutionEngine()
    a = ArrayDataFrame([[6, 1]], "c:int,a:long")
    b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    native.save_df(a, os.path.join(path, "a.parquet"))
    native.save_df(b, os.path.join(path, "b.parquet"))
    FileSystem().touch(os.path.join(path, "_SUCCESS"))
    c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
    df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)
def test_load_avro_folder(self):
    # TODO: switch to c:int,a:long when we can preserve schema to avro
    e = self.engine
    native = NativeExecutionEngine()
    a = ArrayDataFrame([[6, 1]], "c:long,a:long")
    b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    native.save_df(a, os.path.join(path, "a.avro"))
    native.save_df(b, os.path.join(path, "b.avro"))
    FileSystem().touch(os.path.join(path, "_SUCCESS"))
    c = e.load_df(path, format_hint="avro", columns=["a", "c"])
    df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True)
def test_use_df(tmpdir):
    # df generated inside the dag
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag(
            """
            b = CREATE [[0],[1]] SCHEMA a:int
            OUTPUT a, b USING assert_eq
            """
        )
        dag.sql_vars["b"].assert_eq(a)

    # external non-WorkflowDataFrame
    arr = ArrayDataFrame([[0], [1]], "a:int")
    with FugueSQLWorkflow() as dag:
        dag(
            """
            b = CREATE [[0],[1]] SCHEMA a:int
            OUTPUT a, b USING assert_eq
            """,
            a=arr,
        )
        dag.sql_vars["b"].assert_eq(dag.df([[0], [1]], "a:int"))

    # from a yielded file
    engine = NativeExecutionEngine(
        conf={"fugue.workflow.checkpoint.path": os.path.join(tmpdir, "ck")}
    )
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE [[0],[1]] SCHEMA a:int YIELD FILE AS b")
        res = dag.yields["b"]
    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
            b = CREATE [[0],[1]] SCHEMA a:int
            OUTPUT a, b USING assert_eq
            """,
            a=res,
        )

    # from a yielded dataframe
    engine = NativeExecutionEngine()
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE [[0],[1]] SCHEMA a:int YIELD DATAFRAME AS b")
        res = dag.yields["b"]
    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
            b = CREATE [[0],[1]] SCHEMA a:int
            OUTPUT a, b USING assert_eq
            """,
            a=res,
        )
def test_auto_persist():
    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    df1.show()
    id1 = dag1.spec_uuid()

    dag2 = FugueWorkflow(NativeExecutionEngine({"fugue.workflow.auto_persist": True}))
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").weak_checkpoint(level=None)
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()

    assert id1 == id2
    assert id2 == id3

    dag2 = FugueWorkflow(
        NativeExecutionEngine(
            {
                "fugue.workflow.auto_persist": True,
                "fugue.workflow.auto_persist_value": "abc",
            }
        )
    )
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").weak_checkpoint(level="abc")
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()

    assert id2 == id3

    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    id1 = dag1.spec_uuid()

    dag2 = FugueWorkflow(NativeExecutionEngine({"fugue.workflow.auto_persist": True}))
    df1 = dag2.df([[0]], "a:int")
    df1.show()  # auto persist will not trigger
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").weak_checkpoint(level=None)
    df1.show()
    id3 = dag3.spec_uuid()

    assert id1 == id2
    assert id2 == id3  # checkpoints, including auto persist, don't change determinism
def test_auto_persist():
    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    df1.show()
    id1 = dag1.spec_uuid()

    dag2 = FugueWorkflow(NativeExecutionEngine({"fugue.workflow.auto_persist": True}))
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").persist()
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()

    assert id1 != id2
    assert id2 == id3

    dag2 = FugueWorkflow(
        NativeExecutionEngine(
            {
                "fugue.workflow.auto_persist": True,
                "fugue.workflow.auto_persist_value": "abc",
            }
        )
    )
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").persist("abc")
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()

    assert id2 == id3

    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    id1 = dag1.spec_uuid()

    dag2 = FugueWorkflow(NativeExecutionEngine({"fugue.workflow.auto_persist": True}))
    df1 = dag2.df([[0]], "a:int")
    df1.show()  # auto persist will not trigger
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").persist()
    df1.show()
    id3 = dag3.spec_uuid()

    assert id1 == id2
    assert id2 != id3
def test_use_special_df(tmpdir):
    # external non-WorkflowDataFrame
    arr = ArrayDataFrame([[0], [1]], "a:int")
    fsql(
        """
        b = CREATE [[0],[1]] SCHEMA a:int
        a = SELECT * FROM a.x
        OUTPUT a, b USING assert_eq
        a = SELECT x.* FROM a.x AS x
        OUTPUT a, b USING assert_eq
        c = CREATE [[0,0],[1,1]] SCHEMA a:int,b:int
        d = SELECT x.*, y.a AS b FROM a.x x INNER JOIN a.x y ON x.a = y.a
        OUTPUT c, d USING assert_eq
        """,
        {"a.x": arr},
    ).run()

    # from a yielded file
    engine = NativeExecutionEngine(
        conf={"fugue.workflow.checkpoint.path": os.path.join(tmpdir, "ck")}
    )
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE [[0],[1]] SCHEMA a:int YIELD FILE AS b")
        res = dag.yields["b"]
    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
            b = CREATE [[0],[1]] SCHEMA a:int
            a = SELECT * FROM a.x
            OUTPUT a, b USING assert_eq
            """,
            {"a.x": res},
        )
def __init__(
    self,
    execution_engine: Any = None,
    cache: Any = NoOpCache,
    workflow_engine: Any = None,
    hooks: Any = WorkflowHooks,
):
    if execution_engine is None:
        ee: ExecutionEngine = NativeExecutionEngine()
    else:
        ee = to_instance(execution_engine, ExecutionEngine)
    self._fugue_engine = ee
    self._lock = RLock()
    self._results: Dict[Any, DataFrame] = {}
    if workflow_engine is None:
        workflow_engine = ParallelExecutionEngine(
            self.execution_engine.conf.get("fugue.workflow.concurrency", 1),
            self,
        )
    super().__init__(
        cache=cache,
        engine=workflow_engine,
        hooks=hooks,
        logger=ee.log,
        config=ee.conf,
    )
def __init__(
    self,
    execution_engine: Any = None,
    cache: Any = NoOpCache,
    workflow_engine: Any = None,
    hooks: Any = WorkflowHooks,
):
    if execution_engine is None:
        ee: ExecutionEngine = NativeExecutionEngine()
    else:
        ee = to_instance(execution_engine, ExecutionEngine)
    self._fugue_engine = ee
    self._lock = RLock()
    self._results: Dict[Any, DataFrame] = {}
    if workflow_engine is None:
        workflow_engine = ParallelExecutionEngine(
            self.execution_engine.conf.get(
                FUGUE_CONF_WORKFLOW_CONCURRENCY,
                FUGUE_DEFAULT_CONF[FUGUE_CONF_WORKFLOW_CONCURRENCY],
            ),
            self,
        )
    super().__init__(
        cache=cache,
        engine=workflow_engine,
        hooks=hooks,
        logger=ee.log,
        config=ee.conf,
    )
def __init__(self):
    self._funcs: Dict[str, Callable] = {}
    self._sql_funcs: Dict[str, Callable] = {}
    self.register_default(lambda conf, **kwargs: NativeExecutionEngine(conf=conf))
    self.register_default_sql_engine(lambda engine, **kwargs: engine.sql_engine)
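This registry seeds NativeExecutionEngine as the fallback factory. For context, a minimal sketch of how a caller could register and resolve an engine by name, assuming the module-level helpers in fugue.execution.factory (imported in a later snippet) delegate to this registry; the name "my_native" is hypothetical:

# Minimal sketch (assumed wiring); "my_native" is a hypothetical engine name.
from fugue.execution.factory import make_execution_engine, register_execution_engine
from fugue.execution.native_execution_engine import NativeExecutionEngine

register_execution_engine("my_native", lambda conf: NativeExecutionEngine(conf))
engine = make_execution_engine("my_native", {"x": 10})
assert engine.conf.get_or_throw("x", int) == 10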
def test_workflow_conf():
    dag = FugueSQLWorkflow(NativeExecutionEngine({"x": 10}))
    assert 10 == dag.conf.get_or_throw("x", int)
    assert not dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)

    dag = FugueSQLWorkflow(
        NativeExecutionEngine({"x": 10, "fugue.sql.compile.ignore_case": True})
    )
    assert 10 == dag.conf.get_or_throw("x", int)
    assert dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)

    dag = FugueSQLWorkflow(
        NativeExecutionEngine({"x": 10}),
        {"fugue.sql.compile.ignore_case": "true"},
    )
    assert 10 == dag.conf.get_or_throw("x", int)
    assert dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)
def test_workflow_conf():
    dag = FugueSQLWorkflow(
        NativeExecutionEngine({"x": 10, "fugue.sql.compile.simple_assign": "false"})
    )
    assert 10 == dag.conf.get_or_throw("x", int)
    assert not dag.conf.get_or_throw("fugue.sql.compile.simple_assign", bool)
    assert not dag.conf.get_or_throw("fugue.sql.compile.ignore_case", bool)
def test_conf_override():
    with raises(FugueSQLSyntaxError):
        FugueSQLWorkflow()("create [[0]] schema a:int")
    with FugueSQLWorkflow(
        NativeExecutionEngine({"fugue.sql.compile.ignore_case": "true"})
    ) as dag:
        a = dag.df([[0], [1]], "a:int")
        dag(
            """
            b = create [[0],[1]] schema a:int
            output a, b using assert_eq
            """
        )
def test_load_csv_folder(self):
    e = self.engine
    native = NativeExecutionEngine()
    a = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double")
    b = ArrayDataFrame([[2.1, 7.1], [4.1, 8.1]], "c:double,a:double")
    path = os.path.join(self.tmpdir, "a", "b")
    native.save_df(a, os.path.join(path, "a.csv"), format_hint="csv", header=True)
    native.save_df(b, os.path.join(path, "b.csv"), format_hint="csv", header=True)
    FileSystem().touch(os.path.join(path, "_SUCCESS"))
    c = e.load_df(
        path,
        format_hint="csv",
        header=True,
        infer_schema=True,
        columns=["a", "c"],
    )
    df_eq(c, [[1.1, 6.1], [7.1, 2.1], [8.1, 4.1]], "a:double,c:double", throw=True)
from fugue.execution.execution_engine import ExecutionEngine, SQLEngine
from fugue.execution.factory import (
    make_execution_engine,
    make_sql_engine,
    register_default_execution_engine,
    register_default_sql_engine,
    register_execution_engine,
    register_sql_engine,
)
from fugue.execution.native_execution_engine import (
    NativeExecutionEngine,
    QPDPandasEngine,
    SqliteEngine,
)

register_execution_engine(
    "native", lambda conf: NativeExecutionEngine(conf), on_dup="ignore"
)
register_execution_engine(
    "pandas", lambda conf: NativeExecutionEngine(conf), on_dup="ignore"
)
register_sql_engine("sqlite", lambda engine: SqliteEngine(engine), on_dup="ignore")
register_sql_engine(
    "qpdpandas", lambda engine: QPDPandasEngine(engine), on_dup="ignore"
)
register_sql_engine(
    "qpd_pandas", lambda engine: QPDPandasEngine(engine), on_dup="ignore"
)
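With these registrations in place, engines and SQL engines can be built by name. A minimal usage sketch, assuming make_execution_engine(name, conf) and make_sql_engine(name, execution_engine) resolve against the registry populated above:

# Minimal usage sketch of the factory functions registered above.
engine = make_execution_engine("pandas", {"x": 10})  # -> NativeExecutionEngine
sql_engine = make_sql_engine("sqlite", engine)  # -> SqliteEngine bound to the engine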
class DaskExecutionEngine(ExecutionEngine):
    """The execution engine based on `Dask <https://docs.dask.org/>`_.

    Please read |ExecutionEngineTutorial| to understand this important Fugue concept

    :param conf: |ParamsLikeObject|, defaults to None; read |FugueConfig| to
      learn Fugue-specific options

    :Notice:

    You should set up the Dask single-machine or distributed environment in the
    :doc:`common <dask:setup>` way before initializing
    :class:`~.DaskExecutionEngine`
    """

    def __init__(self, conf: Any = None):
        p = ParamDict(FUGUE_DASK_DEFAULT_CONF)
        p.update(ParamDict(conf))
        super().__init__(p)
        self._fs = FileSystem()
        self._log = logging.getLogger()
        self._native = NativeExecutionEngine(conf=conf)

    def __repr__(self) -> str:
        return "DaskExecutionEngine"

    @property
    def log(self) -> logging.Logger:
        return self._log

    @property
    def fs(self) -> FileSystem:
        return self._fs

    @property
    def default_sql_engine(self) -> SQLEngine:
        return QPDDaskEngine(self)

    @property
    def pl_utils(self) -> DaskUtils:
        """Pandas-like dataframe utils"""
        return DaskUtils()

    def to_df(
        self, df: Any, schema: Any = None, metadata: Any = None
    ) -> DaskDataFrame:
        """Convert a data structure to :class:`~fugue_dask.dataframe.DaskDataFrame`

        :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
          :class:`dask:dask.dataframe.DataFrame`, pandas DataFrame,
          or list or iterable of arrays
        :param schema: |SchemaLikeObject|, defaults to None
        :param metadata: |ParamsLikeObject|, defaults to None
        :return: engine compatible dataframe

        :Notice:

        * if the input is already :class:`~fugue_dask.dataframe.DaskDataFrame`,
          it should return itself
        * for list or iterable of arrays, ``schema`` must be specified
        * when ``schema`` is not None, a potential type cast may happen to
          ensure the dataframe's schema
        * all other methods in the engine can take arbitrary dataframes and
          call this method to convert before doing anything
        """
        default_partitions = self.conf.get_or_throw(
            FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, int
        )
        if isinstance(df, DataFrame):
            assert_or_throw(
                schema is None and metadata is None,
                ValueError("schema and metadata must be None when df is a DataFrame"),
            )
            if isinstance(df, DaskDataFrame):
                return df
            if isinstance(df, PandasDataFrame):
                return DaskDataFrame(
                    df.native, df.schema, df.metadata, num_partitions=default_partitions
                )
            return DaskDataFrame(
                df.as_array(type_safe=True),
                df.schema,
                df.metadata,
                num_partitions=default_partitions,
            )
        return DaskDataFrame(df, schema, metadata, num_partitions=default_partitions)

    def repartition(
        self, df: DataFrame, partition_spec: PartitionSpec
    ) -> DaskDataFrame:
        df = self.to_df(df)
        if partition_spec.empty:
            return df
        if len(partition_spec.partition_by) > 0:
            return df
        p = partition_spec.get_num_partitions(
            **{
                KEYWORD_ROWCOUNT: lambda: df.persist().count(),  # type: ignore
                KEYWORD_CORECOUNT: lambda: 2,  # TODO: remove this hard code
            }
        )
        if p > 0:
            return DaskDataFrame(
                df.native.repartition(npartitions=p),
                schema=df.schema,
                metadata=df.metadata,
                type_safe=False,
            )
        return df

    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (
            None
            if on_init is None
            else RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )
        )

        # _map returns a pandas DataFrame (via as_pandas), not a dask one
        def _map(pdf: Any) -> pd.DataFrame:
            if pdf.shape[0] == 0:
                return PandasDataFrame([], output_schema).as_pandas()
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(
                pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
            )
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        df = self.to_df(df)
        if len(partition_spec.partition_by) == 0:
            pdf = self.repartition(df, partition_spec)
            result = pdf.native.map_partitions(_map, meta=output_schema.pandas_dtype)
        else:
            df = self.repartition(df, PartitionSpec(num=partition_spec.num_partitions))
            result = self.pl_utils.safe_groupby_apply(
                df.native,
                partition_spec.partition_by,
                _map,
                meta=output_schema.pandas_dtype,
            )
        return DaskDataFrame(result, output_schema, metadata)

    def broadcast(self, df: DataFrame) -> DataFrame:
        return self.to_df(df)

    def persist(
        self,
        df: DataFrame,
        lazy: bool = False,
        **kwargs: Any,
    ) -> DataFrame:
        return self.to_df(df).persist()

    def join(
        self,
        df1: DataFrame,
        df2: DataFrame,
        how: str,
        on: List[str] = _DEFAULT_JOIN_KEYS,
        metadata: Any = None,
    ) -> DataFrame:
        key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
        d = self.pl_utils.join(
            self.to_df(df1).native,
            self.to_df(df2).native,
            join_type=how,
            on=key_schema.names,
        )
        return DaskDataFrame(d, output_schema, metadata)

    def union(
        self,
        df1: DataFrame,
        df2: DataFrame,
        distinct: bool = True,
        metadata: Any = None,
    ) -> DataFrame:
        assert_or_throw(
            df1.schema == df2.schema,
            lambda: ValueError(f"{df1.schema} != {df2.schema}"),
        )
        d = self.pl_utils.union(
            self.to_df(df1).native, self.to_df(df2).native, unique=distinct
        )
        return DaskDataFrame(d, df1.schema, metadata)

    def subtract(
        self,
        df1: DataFrame,
        df2: DataFrame,
        distinct: bool = True,
        metadata: Any = None,
    ) -> DataFrame:
        assert_or_throw(
            distinct, NotImplementedError("EXCEPT ALL for DaskExecutionEngine")
        )
        assert_or_throw(
            df1.schema == df2.schema,
            lambda: ValueError(f"{df1.schema} != {df2.schema}"),
        )
        d = self.pl_utils.except_df(
            self.to_df(df1).native, self.to_df(df2).native, unique=distinct
        )
        return DaskDataFrame(d, df1.schema, metadata)

    def intersect(
        self,
        df1: DataFrame,
        df2: DataFrame,
        distinct: bool = True,
        metadata: Any = None,
    ) -> DataFrame:
        assert_or_throw(
            distinct, NotImplementedError("INTERSECT ALL for DaskExecutionEngine")
        )
        assert_or_throw(
            df1.schema == df2.schema,
            lambda: ValueError(f"{df1.schema} != {df2.schema}"),
        )
        d = self.pl_utils.intersect(
            self.to_df(df1).native, self.to_df(df2).native, unique=distinct
        )
        return DaskDataFrame(d, df1.schema, metadata)

    def distinct(
        self,
        df: DataFrame,
        metadata: Any = None,
    ) -> DataFrame:
        d = self.pl_utils.drop_duplicates(self.to_df(df).native)
        return DaskDataFrame(d, df.schema, metadata)

    def dropna(
        self,
        df: DataFrame,
        how: str = "any",
        thresh: int = None,
        subset: List[str] = None,
        metadata: Any = None,
    ) -> DataFrame:
        d = self.to_df(df).native.dropna(how=how, thresh=thresh, subset=subset)
        return DaskDataFrame(d, df.schema, metadata)

    def fillna(
        self,
        df: DataFrame,
        value: Any,
        subset: List[str] = None,
        metadata: Any = None,
    ) -> DataFrame:
        assert_or_throw(
            (not isinstance(value, list)) and (value is not None),
            ValueError("fillna value can not be a list or None"),
        )
        if isinstance(value, dict):
            assert_or_throw(
                (None not in value.values()) and (any(value.values())),
                ValueError(
                    "fillna dict can not contain None and needs at least one value"
                ),
            )
            mapping = value
        else:
            # if subset is None, apply to all columns
            subset = subset or df.schema.names
            mapping = {col: value for col in subset}
        d = self.to_df(df).native.fillna(mapping)
        return DaskDataFrame(d, df.schema, metadata)

    def sample(
        self,
        df: DataFrame,
        n: Optional[int] = None,
        frac: Optional[float] = None,
        replace: bool = False,
        seed: Optional[int] = None,
        metadata: Any = None,
    ) -> DataFrame:
        assert_or_throw(
            (n is None and frac is not None) or (n is not None and frac is None),
            ValueError("one and only one of n and frac should be set"),
        )
        # TODO: dask does not support sampling by number of rows
        d = self.to_df(df).native.sample(
            n=n, frac=frac, replace=replace, random_state=seed
        )
        return DaskDataFrame(d, df.schema, metadata)

    def take(
        self,
        df: DataFrame,
        n: int,
        presort: str,
        na_position: str = "last",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        metadata: Any = None,
    ) -> DataFrame:
        assert_or_throw(
            isinstance(n, int),
            ValueError("n needs to be an integer"),
        )
        d = self.to_df(df).native
        meta = [(d[x].name, d[x].dtype) for x in d.columns]

        if presort:
            presort = parse_presort_exp(presort)
        # use presort over partition_spec.presort if possible
        _presort: IndexedOrderedDict = presort or partition_spec.presort

        def _partition_take(partition, n, presort):
            if len(presort.keys()) > 0:
                partition = partition.sort_values(
                    list(presort.keys()),
                    ascending=list(presort.values()),
                    na_position=na_position,
                )
            return partition.head(n)

        if len(partition_spec.partition_by) == 0:
            if len(_presort.keys()) == 0:
                d = d.head(n)
            else:
                # use the default partitioning
                d = (
                    d.map_partitions(_partition_take, n, _presort, meta=meta)
                    .reset_index(drop=True)
                    .compute()
                )
                # compute() brings this to pandas, so we can sort with pandas
                d = d.sort_values(
                    list(_presort.keys()),
                    ascending=list(_presort.values()),
                    na_position=na_position,
                ).head(n)
        else:
            d = (
                d.groupby(partition_spec.partition_by, dropna=False)
                .apply(_partition_take, n=n, presort=_presort, meta=meta)
                .reset_index(drop=True)
            )

        return DaskDataFrame(d, df.schema, metadata)

    def load_df(
        self,
        path: Union[str, List[str]],
        format_hint: Any = None,
        columns: Any = None,
        **kwargs: Any,
    ) -> DaskDataFrame:
        return self.to_df(
            load_df(
                path, format_hint=format_hint, columns=columns, fs=self.fs, **kwargs
            )
        )

    def save_df(
        self,
        df: DataFrame,
        path: str,
        format_hint: Any = None,
        mode: str = "overwrite",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        force_single: bool = False,
        **kwargs: Any,
    ) -> None:
        if force_single:
            self._native.save_df(
                df,
                path=path,
                format_hint=format_hint,
                mode=mode,
                partition_spec=partition_spec,
                **kwargs,
            )
        else:
            if not partition_spec.empty:
                self.log.warning(  # pragma: no cover
                    "partition_spec is not respected in %s.save_df", self
                )
            self.fs.makedirs(os.path.dirname(path), recreate=True)
            df = self.to_df(df)
            save_df(df, path, format_hint=format_hint, mode=mode, fs=self.fs, **kwargs)
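A minimal usage sketch of the engine above; it assumes a local Dask environment is already set up, that FUGUE_DASK_DEFAULT_CONF supplies the default partition count read by to_df, and that the class is exported at the fugue_dask package level:

# Minimal usage sketch (assumes a local Dask setup and package-level export).
from fugue_dask import DaskExecutionEngine

engine = DaskExecutionEngine()
df = engine.to_df([[0, "a"], [1, "b"]], "x:int,y:str")  # list input requires a schema
persisted = engine.persist(df)  # materializes the underlying Dask collection
print(persisted.as_pandas())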