Example #1
def test_random_walk():
    """
    test Fugue func random_walk()
    """
    from node2vec.fugue import random_walk

    graph = [[0, 2, 0.41], [0, 4, 0.85], [3, 4, 0.36], [2, 0, 0.68], [4, 0, 0.1],
             [4, 3, 0.37]]
    df = ArrayDataFrame(graph, schema="src:int,dst:int,weight:double")
    n2v_params = {"num_walks": 2, "walk_length": 3, "return_param": 0.5}

    res = random_walk(NativeExecutionEngine(), df, n2v_params)
    assert res is not None
    res = random_walk(NativeExecutionEngine(), df.as_pandas(), n2v_params)
    assert res is not None
    df1 = df.rename({"src": "id"})[["id"]]
    res = random_walk(NativeExecutionEngine(), df.as_pandas(), n2v_params, df1)
    assert res is not None
    pytest.raises(
        ValueError, random_walk, NativeExecutionEngine(), df.as_pandas(),
        n2v_params, df,
    )

    spark = SparkSession.builder.config("spark.executor.cores", 4).getOrCreate()
    r = Row("src", "dst", "weight")
    df = spark.sparkContext.parallelize([r(*x) for x in graph]).toDF()
    res = random_walk(SparkExecutionEngine(spark), SparkDataFrame(df), n2v_params)
    assert res is not None
    pytest.raises(
        ValueError, random_walk, SparkExecutionEngine(spark), SparkDataFrame(df),
        n2v_params, SparkDataFrame(df),
    )
Example #2
def test_to_ibis_engine():
    e = NativeExecutionEngine()
    ie = PandasIbisEngine(e)
    assert isinstance(to_ibis_engine(e, None), PandasIbisEngine)
    assert isinstance(to_ibis_engine(e, ie), PandasIbisEngine)
    with raises(NotImplementedError):
        to_ibis_engine(e, "dummy")
Example #3
    def register_execution_engines(self):
        """Register execution engines with names. This will also try to register
        spark and dask engines if the dependent packages are available and they
        are not registered"""
        register_execution_engine(
            "native",
            lambda conf, **kwargs: NativeExecutionEngine(conf=conf),
            on_dup="ignore",
        )
        try:
            import pyspark  # noqa: F401
            from fugue_spark import SparkExecutionEngine

            register_execution_engine(
                "spark",
                lambda conf, **kwargs: SparkExecutionEngine(conf=conf),
                on_dup="ignore",
            )
        except ImportError:
            pass
        try:
            import dask.dataframe  # noqa: F401
            from fugue_dask import DaskExecutionEngine

            register_execution_engine(
                "dask",
                lambda conf, **kwargs: DaskExecutionEngine(conf=conf),
                on_dup="ignore",
            )
        except ImportError:
            pass
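Once a name is registered, it can be resolved back into an engine instance with make_execution_engine. A minimal sketch of that round trip follows; the alias "my_native" and the conf key "test" are hypothetical and used only for illustration, and the factory signature simply mirrors the lambdas above:

from fugue import NativeExecutionEngine, make_execution_engine, register_execution_engine

register_execution_engine(
    "my_native",  # hypothetical alias, not one of the names registered above
    lambda conf, **kwargs: NativeExecutionEngine(conf=conf),
    on_dup="ignore",
)
engine = make_execution_engine("my_native", {"test": True})
assert isinstance(engine, NativeExecutionEngine)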
Example #4
def test_visualize_top_n(tmpdir):
    def t1(a: int, b: int) -> float:
        return a + b

    with FugueWorkflow() as dag:
        df = space_to_df(dag, Space(a=Grid(0, 1), b=Grid(2, 3)))
        visualize_top_n(tune(df, t1, distributable=False), top=2)

    @tunable()
    def t2(df1: pd.DataFrame, df2: pd.DataFrame, a: int,
           b: int) -> Dict[str, Any]:
        return {
            "error": float(a + b + df1["y"].sum() + df2["y"].sum()),
            "metadata": {
                "a": a
            },
        }

    e = NativeExecutionEngine(conf={FUGUE_TUNE_TEMP_PATH: str(tmpdir)})
    with FugueWorkflow(e) as dag:
        df1 = dag.df([[0, 1], [1, 2], [0, 2]],
                     "x:int,y:int").partition(by=["x"])
        df2 = dag.df([[0, 10], [1, 20]], "x:int,y:int").partition(by=["x"])
        res = t2.space(df1=df1, df2=df2, a=Grid(0, 1), b=Grid(2, 3)).tune()
        visualize_top_n(res, top=2)
Example #5
def test_trim_index():
    """
    test Fugue func trim_index()
    """
    from node2vec.fugue import trim_index

    graph = [[0, 2, 0.41], [0, 4, 0.85], [3, 4, 0.36], [2, 0, 0.68], [4, 0, 0.1],
             [4, 3, 0.37]]
    df = ArrayDataFrame(graph, schema="src:int,dst:int,weight:double")

    df_res, name_id = trim_index(NativeExecutionEngine(), df, indexed=True)
    assert len(df_res.as_pandas()) == 6 and name_id is None
    df_res, name_id = trim_index(
        NativeExecutionEngine(), df, indexed=True, max_out_deg=1,
    )
    assert len(df_res.as_pandas()) == 4 and name_id is None

    spark = SparkSession.builder.config("spark.executor.cores", 4).getOrCreate()
    dat1 = {
        'src': ['a1', 'a1', 'a1', 'a2', 'b2'], 'dst': ['a2', 'b1', 'b2', 'b1', 'a2'],
    }
    dat2 = {
        'dst': ['a2', 'b1', 'b2', 'a1'], 'weight': [0.8, 1.1, 1.0, 0.3]
    }
    df = spark.createDataFrame(pd.DataFrame.from_dict(dat1))
    df_res, name_id = trim_index(
        SparkExecutionEngine(spark), SparkDataFrame(df), indexed=False, max_out_deg=2
    )
    assert df_res.count() == 4 and name_id.count() == 4
    df = spark.createDataFrame(pd.DataFrame.from_dict(dat2))
    pytest.raises(
        ValueError, trim_index, SparkExecutionEngine(spark), SparkDataFrame(df), True,
    )

    df = pd.DataFrame.from_dict(dat1)
    df_res, name_id = trim_index(
        NativeExecutionEngine(), PandasDataFrame(df), indexed=False,
    )
    assert len(df_res.as_pandas()) == 5 and len(name_id.as_pandas()) == 4
    df = pd.DataFrame.from_dict(dat2)
    pytest.raises(
        ValueError, trim_index, NativeExecutionEngine(), PandasDataFrame(df), False,
    )
Example #6
def test_make_execution_engine():
    e = make_execution_engine(None, {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(NativeExecutionEngine,
                              {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(NativeExecutionEngine({"ab": "c"}),
                              {FUGUE_CONF_SQL_IGNORE_CASE: True},
                              de="f")
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)
    assert "c" == e.compile_conf.get_or_throw("ab", str)
    assert "f" == e.compile_conf.get_or_throw("de", str)
    assert "c" == e.conf.get_or_throw("ab", str)
    assert "de" not in e.conf

    e = make_execution_engine("pandas", {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine((NativeExecutionEngine, "sqlite"),
                              {FUGUE_CONF_SQL_IGNORE_CASE: True})
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(
        NativeExecutionEngine({FUGUE_CONF_SQL_IGNORE_CASE: True}))
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)

    e = make_execution_engine(
        (NativeExecutionEngine({FUGUE_CONF_SQL_IGNORE_CASE: True}), "sqlite"))
    assert isinstance(e, NativeExecutionEngine)
    assert e.compile_conf.get_or_throw(FUGUE_CONF_SQL_IGNORE_CASE, bool)
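The same engine specs accepted by make_execution_engine can also be handed to a workflow. Below is a minimal sketch, using only APIs that already appear in these examples, that builds and runs a trivial DAG on the native engine:

from fugue import FugueWorkflow, NativeExecutionEngine

dag = FugueWorkflow()
df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
df.show()
dag.run(NativeExecutionEngine())  # a registered name such as "native" should also work here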
Example #7
    def test_run_ibis_duck(self):
        def _test1(con: ibis.BaseBackend) -> ibis.Expr:
            tb = con.table("a")
            return tb

        def _test2(con: ibis.BaseBackend) -> ibis.Expr:
            tb = con.table("a")
            return tb.mutate(c=tb.a + tb.b)

        dag = FugueWorkflow()
        df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
        res = run_ibis(_test1, ibis_engine="duck", a=df)
        res.assert_eq(df)
        df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
        res = run_ibis(_test2, ibis_engine="duckdb", a=df)
        df2 = dag.df([[0, 1, 1], [2, 3, 5]], "a:long,b:long,c:long")
        res.assert_eq(df2)
        dag.run(NativeExecutionEngine())
Example #8
    def make_engine(self):
        e = NativeExecutionEngine(dict(test=True))
        e.set_sql_engine(QPDPandasEngine(e))
        return e
Example #9
def test_builder(tmpdir):
    space = Space(a=1, b=2, c=Grid(2, 3))
    builder = TuneDatasetBuilder(space, str(tmpdir))

    def assert_count(df: DataFrame, n: int, schema=None) -> None:
        assert len(df.as_array()) == n
        if schema is not None:
            assert df.schema == schema

    # test to_space
    with FugueWorkflow() as dag:
        df = builder.build(dag).data
        df.show()

    df1 = ArrayDataFrame([[0, 1], [1, 1], [0, 2]], "a:int,b:int")

    # test single df
    with FugueWorkflow() as dag:
        builder.add_dfs(WorkflowDataFrames(x=dag.df(df1)))
        dataset = builder.build(dag)
        assert ["x"] == dataset.dfs
        assert [] == dataset.keys
        df = dataset.data
        df.show()
        df.output(
            assert_count,
            params=dict(n=2,
                        schema=f"__tune_df__x:str,{TUNE_DATASET_TRIALS}:str"),
        )

    space = Space(b=Rand(0, 1), a=1, c=Grid(2, 3), d=Grid("a", "b"))
    df2 = ArrayDataFrame([[0, 1], [1, 1], [3, 2]], "a:int,bb:int")
    df3 = ArrayDataFrame([[10, 1], [11, 1], [10, 2]], "a:int,c:int")
    builder = TuneDatasetBuilder(space)
    engine = NativeExecutionEngine(conf={TUNE_TEMP_PATH: str(tmpdir)})

    # test multiple dfs, batch_size and config
    with FugueWorkflow(engine) as dag:
        dfs = WorkflowDataFrames(
            a=dag.df(df1).partition_by("a"), b=dag.df(df2).partition_by("a")
        )
        dataset = (
            builder.add_dfs(dfs, "inner")
            .add_df("c", dag.df(df3), "cross")
            .build(dag)
        )
        assert ["a"] == dataset.keys
        assert ["a", "b", "c"] == dataset.dfs
        df = dataset.data
        df.show()
        df.output(
            assert_count,
            params=dict(
                n=8,
                schema="a:int,__tune_df__a:str,__tune_df__b:str,"
                f"__tune_df__c:str,{TUNE_DATASET_TRIALS}:str",
            ),
        )

        df = builder.build(dag, batch_size=3).data
        df.show()
        df.output(
            assert_count,
            params=dict(
                n=4,
                schema="a:int,__tune_df__a:str,__tune_df__b:str,"
                f"__tune_df__c:str,{TUNE_DATASET_TRIALS}:str",
            ),
        )
Example #10
    def make_engine(self):
        e = NativeExecutionEngine(dict(test=True))
        e.set_sql_engine(SqliteEngine(e))
        return e