def test_random_walk():
    """Test Fugue func random_walk()"""
    from node2vec.fugue import random_walk

    graph = [
        [0, 2, 0.41], [0, 4, 0.85], [3, 4, 0.36],
        [2, 0, 0.68], [4, 0, 0.1], [4, 3, 0.37],
    ]
    df = ArrayDataFrame(graph, schema="src:int,dst:int,weight:double")
    n2v_params = {"num_walks": 2, "walk_length": 3, "return_param": 0.5}

    # native engine, with the graph as a Fugue DataFrame and as a pandas DataFrame
    res = random_walk(NativeExecutionEngine(), df, n2v_params)
    assert res is not None
    res = random_walk(NativeExecutionEngine(), df.as_pandas(), n2v_params)
    assert res is not None

    # an extra df with a single "id" column is accepted; a df with extra columns raises
    df1 = df.rename({"src": "id"})[["id"]]
    res = random_walk(NativeExecutionEngine(), df.as_pandas(), n2v_params, df1)
    assert res is not None
    pytest.raises(
        ValueError,
        random_walk,
        NativeExecutionEngine(),
        df.as_pandas(),
        n2v_params,
        df,
    )

    # spark engine
    spark = SparkSession.builder.config("spark.executor.cores", 4).getOrCreate()
    r = Row("src", "dst", "weight")
    df = spark.sparkContext.parallelize([r(*x) for x in graph]).toDF()
    res = random_walk(SparkExecutionEngine(spark), SparkDataFrame(df), n2v_params)
    assert res is not None
    pytest.raises(
        ValueError,
        random_walk,
        SparkExecutionEngine(spark),
        SparkDataFrame(df),
        n2v_params,
        SparkDataFrame(df),
    )
def test_to_ibis_engine():
    e = NativeExecutionEngine()
    ie = PandasIbisEngine(e)
    assert isinstance(to_ibis_engine(e, None), PandasIbisEngine)
    assert isinstance(to_ibis_engine(e, ie), PandasIbisEngine)
    with raises(NotImplementedError):
        to_ibis_engine(e, "dummy")
def register_execution_engines(self):
    """Register execution engines with names. This will also try to register
    spark and dask engines if the dependent packages are available and they
    are not registered"""
    register_execution_engine(
        "native",
        lambda conf, **kwargs: NativeExecutionEngine(conf=conf),
        on_dup="ignore",
    )
    try:
        import pyspark  # noqa: F401

        from fugue_spark import SparkExecutionEngine

        register_execution_engine(
            "spark",
            lambda conf, **kwargs: SparkExecutionEngine(conf=conf),
            on_dup="ignore",
        )
    except ImportError:
        pass
    try:
        import dask.dataframe  # noqa: F401

        from fugue_dask import DaskExecutionEngine

        register_execution_engine(
            "dask",
            lambda conf, **kwargs: DaskExecutionEngine(conf=conf),
            on_dup="ignore",
        )
    except ImportError:
        pass
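# Minimal usage sketch (illustrative addition, not part of the original module):
# once an engine is registered under a name, it can be resolved lazily by that
# name; "spark" / "dask" resolve the same way when those packages are installed.
def _sketch_resolve_registered_engine():
    from fugue import make_execution_engine

    # resolves via the "native" lambda registered above; "my.conf" is a made-up key
    engine = make_execution_engine("native", {"my.conf": "x"})
    assert isinstance(engine, NativeExecutionEngine)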
def test_visualize_top_n(tmpdir):
    def t1(a: int, b: int) -> float:
        return a + b

    with FugueWorkflow() as dag:
        df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
        visualize_top_n(tune(df, t1, distributable=False), top=2)

    @tunable()
    def t2(df1: pd.DataFrame, df2: pd.DataFrame, a: int, b: int) -> Dict[str, Any]:
        return {
            "error": float(a + b + df1["y"].sum() + df2["y"].sum()),
            "metadata": {"a": a},
        }

    e = NativeExecutionEngine(conf={FUGUE_TUNE_TEMP_PATH: str(tmpdir)})
    with FugueWorkflow(e) as dag:
        df1 = dag.df([[0, 1], [1, 2], [0, 2]], "x:int,y:int").partition(by=["x"])
        df2 = dag.df([[0, 10], [1, 20]], "x:int,y:int").partition(by=["x"])
        res = t2.space(df1=df1, df2=df2, a=Grid(0, 1), b=Grid(2, 3)).tune()
        visualize_top_n(res, top=2)
def test_trim_index():
    """Test Fugue func trim_index()"""
    from node2vec.fugue import trim_index

    graph = [
        [0, 2, 0.41], [0, 4, 0.85], [3, 4, 0.36],
        [2, 0, 0.68], [4, 0, 0.1], [4, 3, 0.37],
    ]
    df = ArrayDataFrame(graph, schema="src:int,dst:int,weight:double")
    # already indexed: no name-to-id mapping is produced
    df_res, name_id = trim_index(NativeExecutionEngine(), df, indexed=True)
    assert len(df_res.as_pandas()) == 6 and name_id is None
    # capping out-degree at 1 drops the extra outgoing edges of nodes 0 and 4
    df_res, name_id = trim_index(
        NativeExecutionEngine(), df, indexed=True, max_out_deg=1,
    )
    assert len(df_res.as_pandas()) == 4 and name_id is None

    spark = SparkSession.builder.config("spark.executor.cores", 4).getOrCreate()
    dat1 = {
        'src': ['a1', 'a1', 'a1', 'a2', 'b2'],
        'dst': ['a2', 'b1', 'b2', 'b1', 'a2'],
    }
    dat2 = {
        'dst': ['a2', 'b1', 'b2', 'a1'],
        'weight': [0.8, 1.1, 1.0, 0.3],
    }
    df = spark.createDataFrame(pd.DataFrame.from_dict(dat1))
    df_res, name_id = trim_index(
        SparkExecutionEngine(spark), SparkDataFrame(df), indexed=False, max_out_deg=2,
    )
    assert df_res.count() == 4 and name_id.count() == 4
    # input missing the required "src" column is rejected
    df = spark.createDataFrame(pd.DataFrame.from_dict(dat2))
    pytest.raises(
        ValueError, trim_index, SparkExecutionEngine(spark), SparkDataFrame(df), True,
    )

    df = pd.DataFrame.from_dict(dat1)
    df_res, name_id = trim_index(
        NativeExecutionEngine(), PandasDataFrame(df), indexed=False,
    )
    assert len(df_res.as_pandas()) == 5 and len(name_id.as_pandas()) == 4
    df = pd.DataFrame.from_dict(dat2)
    pytest.raises(
        ValueError, trim_index, NativeExecutionEngine(), PandasDataFrame(df), False,
    )
def test_make_execution_engine():
    e = make_execution_engine(None, {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(NativeExecutionEngine, {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(
        NativeExecutionEngine({"ab": "c"}),
        {FUGUE_CONF_SQL_IGNORE_CASE: True},
        de="f",
    )
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)
    assert "c" == e.compile_conf.get_or_throw("ab", str)
    assert "f" == e.compile_conf.get_or_throw("de", str)
    # compile-time conf does not leak into the engine's own conf
    assert "c" == e.conf.get_or_throw("ab", str)
    assert "de" not in e.conf

    e = make_execution_engine("pandas", {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(
        (NativeExecutionEngine, "sqlite"), {FUGUE_CONF_SQL_IGNORE_CASE: True}
    )
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(NativeExecutionEngine({FUGUE_CONF_SQL_IGNORE_CASE: True}))
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(
        (NativeExecutionEngine({FUGUE_CONF_SQL_IGNORE_CASE: True}), "sqlite")
    )
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)
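# Illustrative summary sketch (not part of the original tests): every spec form
# exercised above (None, an engine class, an instance, a registered name, or an
# (engine, sql_engine) tuple) normalizes to an engine instance, so an API can
# accept "anything engine-like" and resolve it once.
def _sketch_normalize_engine(engine_like):
    return make_execution_engine(engine_like)


assert isinstance(_sketch_normalize_engine("pandas"), NativeExecutionEngine)
assert isinstance(
    _sketch_normalize_engine((NativeExecutionEngine, "sqlite")), NativeExecutionEngine
)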
def test_run_ibis_duck(self):
    def _test1(con: ibis.BaseBackend) -> ibis.Expr:
        tb = con.table("a")
        return tb

    def _test2(con: ibis.BaseBackend) -> ibis.Expr:
        tb = con.table("a")
        return tb.mutate(c=tb.a + tb.b)

    dag = FugueWorkflow()
    df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
    res = run_ibis(_test1, ibis_engine="duck", a=df)
    res.assert_eq(df)
    df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
    res = run_ibis(_test2, ibis_engine="duckdb", a=df)
    df2 = dag.df([[0, 1, 1], [2, 3, 5]], "a:long,b:long,c:long")
    res.assert_eq(df2)
    dag.run(NativeExecutionEngine())
def make_engine(self):
    e = NativeExecutionEngine(dict(test=True))
    e.set_sql_engine(QPDPandasEngine(e))
    return e
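# Illustrative sketch (our addition): set_sql_engine swaps only the SQL backend;
# the execution engine and its conf are untouched. The SqliteEngine variant
# below follows the same pattern.
def _sketch_sql_engine_swap():
    e = NativeExecutionEngine(dict(test=True))
    e.set_sql_engine(QPDPandasEngine(e))
    assert e.conf.get_or_throw("test", bool)  # engine conf intact after the swap
    return e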
def test_builder(tmpdir):
    space = Space(a=1, b=2, c=Grid(2, 3))
    builder = TuneDatasetBuilder(space, str(tmpdir))

    def assert_count(df: DataFrame, n: int, schema=None) -> None:
        assert len(df.as_array()) == n
        if schema is not None:
            assert df.schema == schema

    # test to_space
    with FugueWorkflow() as dag:
        df = builder.build(dag).data
        df.show()

    df1 = ArrayDataFrame([[0, 1], [1, 1], [0, 2]], "a:int,b:int")

    # test single df
    with FugueWorkflow() as dag:
        builder.add_dfs(WorkflowDataFrames(x=dag.df(df1)))
        dataset = builder.build(dag)
        assert ["x"] == dataset.dfs
        assert [] == dataset.keys
        df = dataset.data
        df.show()
        df.output(
            assert_count,
            params=dict(n=2, schema=f"__tune_df__x:str,{TUNE_DATASET_TRIALS}:str"),
        )

    space = Space(b=Rand(0, 1), a=1, c=Grid(2, 3), d=Grid("a", "b"))
    df2 = ArrayDataFrame([[0, 1], [1, 1], [3, 2]], "a:int,bb:int")
    df3 = ArrayDataFrame([[10, 1], [11, 1], [10, 2]], "a:int,c:int")
    builder = TuneDatasetBuilder(space)
    engine = NativeExecutionEngine(conf={TUNE_TEMP_PATH: str(tmpdir)})

    # test multiple dfs, batch_size and config
    with FugueWorkflow(engine) as dag:
        dfs = WorkflowDataFrames(
            a=dag.df(df1).partition_by("a"), b=dag.df(df2).partition_by("a")
        )
        dataset = (
            builder.add_dfs(dfs, "inner").add_df("c", dag.df(df3), "cross").build(dag)
        )
        assert ["a"] == dataset.keys
        assert ["a", "b", "c"] == dataset.dfs
        df = dataset.data
        df.show()
        df.output(
            assert_count,
            params=dict(
                n=8,
                schema="a:int,__tune_df__a:str,__tune_df__b:str,"
                f"__tune_df__c:str,{TUNE_DATASET_TRIALS}:str",
            ),
        )
        df = builder.build(dag, batch_size=3).data
        df.show()
        df.output(
            assert_count,
            params=dict(
                n=4,
                schema="a:int,__tune_df__a:str,__tune_df__b:str,"
                f"__tune_df__c:str,{TUNE_DATASET_TRIALS}:str",
            ),
        )
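# Count check (our reasoning from the assertions above): df1 and df2 inner-join
# on key "a" -> common partitions {0, 1}; the space expands to 2 (c) x 2 (d) = 4
# trials, so 2 partitions x 4 trials = 8 rows. With batch_size=3, the 4 trials
# pack into ceil(4 / 3) = 2 rows per partition, giving 2 x 2 = 4 rows.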
def make_engine(self):
    e = NativeExecutionEngine(dict(test=True))
    e.set_sql_engine(SqliteEngine(e))
    return e