Example 1
def test_out_transform():
    class OT(OutputTransformer):
        def process(self, df):
            return

    o = _to_output_transformer(OT)
    w = FugueWorkflow()
    w.df([[0], [1]], "a:int", data_determiner=to_uuid).out_transform(
        o, params=dict(n=2)
    )
    assert_eq(
        """
    create [[0],[1]] schema a:int
    outtransform using OT(n=2)
    """,
        w,
    )

    w = FugueWorkflow()
    w.df([[0], [1]], "a:int", data_determiner=to_uuid).partition(
        by=["a"], presort="b DESC", num="ROWCOUNT/2"
    ).out_transform(mock_transformer, params=dict(n=2))
    assert_eq(
        """
    create [[0],[1]] schema a:int

    outtransform
        prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2)
    """,
        w,
    )
Example 2
def test_process_stack_space(tmpdir):
    space1 = ss(LinearRegression, normalize=Grid(True, False))
    space2 = ss(LinearRegression, fit_intercept=Grid(True, False))
    dag = FugueWorkflow()
    result0 = build_sk_cv(
        space1,
        dag.df(_create_mock_data()),
        scoring="neg_mean_absolute_error",
        cv=2,
        label_col="l",
        feature_prefix="f_",
    ).tune(distributable=False, serialize_path=str(tmpdir))
    res0 = result0.process(_process_stack_space,
                           params=dict(keys=[], space=space2))
    res0.show()

    result1 = build_sk_cv(
        space1,
        dag.df(_create_mock_data()).partition(by=["p"]),
        scoring="neg_mean_absolute_error",
        cv=2,
        label_col="l",
        feature_prefix="f_",
    ).tune(distributable=False, serialize_path=str(tmpdir))
    res1 = result1.process(_process_stack_space,
                           params=dict(keys=["p"], space=space2))
    dag.run()

    assert 2 == len(res0.result.as_array())
    assert 8 == len(res1.result.as_array())
Example 3
def make_dataset(
    self,
    dag: FugueWorkflow,
    dataset: Any,
    df: Any = None,
    df_name: str = TUNE_DATASET_DF_DEFAULT_NAME,
    test_df: Any = None,
    test_df_name: str = TUNE_DATASET_VALIDATION_DF_DEFAULT_NAME,
    partition_keys: Optional[List[str]] = None,
    temp_path: str = "",
) -> TuneDataset:
    assert_or_throw(dataset is not None,
                    TuneCompileError("dataset can't be None"))
    if isinstance(dataset, TuneDataset):
        assert_or_throw(
            df is None,
            TuneCompileError("can't set df when dataset is TuneDataset"))
        return dataset
    if isinstance(dataset, Space):
        path = self.get_path_or_temp(temp_path)
        builder = TuneDatasetBuilder(dataset, path)
        if df is not None:
            wdf = dag.df(df)
            if partition_keys is not None and len(partition_keys) > 0:
                wdf = wdf.partition_by(*partition_keys)
            builder.add_df(df_name, wdf)
        if test_df is not None:
            wdf = dag.df(test_df)
            how = "cross"
            if partition_keys is not None and len(partition_keys) > 0:
                wdf = wdf.partition_by(*partition_keys)
                how = "inner"
            builder.add_df(test_df_name, wdf, how=how)
        return builder.build(dag, batch_size=1, shuffle=True)
    raise TuneCompileError(f"{dataset} can't be converted to TuneDataset")
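A hedged usage sketch for make_dataset: the owning object is hypothetical (called factory here), and Space, Grid, and FugueWorkflow are the same names used in the other examples, with imports from the surrounding project assumed.

import pandas as pd

dag = FugueWorkflow()
space = Space(a=Grid(1, 2, 3))
train = pd.DataFrame([[0, 1.0, 2.0], [1, 3.0, 4.0]], columns=["p", "f_x", "l"])

# partition_keys makes each value of "p" its own tuning partition;
# temp_path is a hypothetical location for the serialized partitions.
dataset = factory.make_dataset(  # `factory` is a hypothetical owner object
    dag,
    space,
    df=train,
    partition_keys=["p"],
    temp_path="/tmp/tune_data",
)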
Example 4
def test_workflow_dataframes():
    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    df2 = dag1.df([[0]], "b:int")
    dag2 = FugueWorkflow()
    df3 = dag2.df([[0]], "a:int")

    dfs1 = WorkflowDataFrames(a=df1, b=df2)
    assert dfs1["a"] is df1
    assert dfs1["b"] is df2

    dfs2 = WorkflowDataFrames(dfs1, aa=df1, bb=df2)
    assert 4 == len(dfs2)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=df3)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=ArrayDataFrame([[0]], "a:int"))

    dag = FugueWorkflow()
    df = dag.df([[0, 1], [1, 1]], "a:int,b:int")
    assert df.partition_spec.empty
    df2 = df.partition(by=["a"])
    assert df.partition_spec.empty
    assert df2.partition_spec == PartitionSpec(by=["a"])
    df3 = df.partition_by("a", "b")
    assert df.partition_spec.empty
    assert df3.partition_spec == PartitionSpec(by=["a", "b"])
    df4 = df.per_partition_by("a", "b")
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec(by=["a", "b"], algo="even")
    df4 = df.per_row()
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec("per_row")
Example 5
def test_study(tmpdir):
    space = Space(a=Grid(-2, 0, 1))
    input_df = pd.DataFrame([[0, 1], [1, 1], [0, 2]], columns=["a", "b"])
    dag = FugueWorkflow()
    monitor = M()

    # no data partition
    builder = TuneDatasetBuilder(space,
                                 str(tmpdir)).add_df("b", dag.df(input_df))
    dataset = builder.build(dag, 1)
    for distributed in [True, False, None]:
        # min_better = True
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective),
            dataset=dataset,
            distributed=distributed,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[3.0, 4.0, 7.0]))
        result.result(2)[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[3.0, 4.0]))

        # min_better = False
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective, min_better=False),
            dataset=dataset,
            distributed=distributed,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[-7.0, -4.0, -3.0]))
        result.result(2)[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[-7.0, -4.0]))

    # with data partition
    builder = TuneDatasetBuilder(space, str(tmpdir)).add_df(
        "b",
        dag.df(input_df).partition_by("a"))
    dataset = builder.build(dag, 1)
    for distributed in [True, False, None]:
        result = optimize_noniterative(
            objective=to_noniterative_objective(objective),
            dataset=dataset,
            distributed=distributed,
            monitor=monitor,
        )
        result.result()[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[2.0, 3.0, 6.0, 1.0, 2.0, 5.0]))
        result.result(1)[[TUNE_REPORT, TUNE_REPORT_METRIC]].output(
            assert_metric, params=dict(metrics=[1.0, 2.0]))

    dag.run()

    assert 3 * 3 * 2 == len(monitor._reports)
Example 6
def test_yield(tmpdir):
    df = pd.DataFrame([[0, 0]], columns=["a", "b"])

    # schema: *
    def t(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(b=df.b + 1)

    dag = FugueWorkflow()
    dag.df(df).transform(t).yield_dataframe_as("x")
    result = dag.run()["x"]
    assert [[0, 1]] == result.as_array()

    dag1 = FugueWorkflow()
    dag1.df(df).transform(t).yield_file_as("x")
    dag1.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    dag2 = FugueWorkflow()
    dag2.df(dag1.yields["x"]).transform(t).yield_dataframe_as("y")
    result = dag2.run("",
                      {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})["y"]
    assert [[0, 2]] == result.as_array()

    dag3 = FugueWorkflow()
    dag3.df(dag2.yields["y"]).transform(t).yield_dataframe_as("z")
    result = dag3.run()["z"]
    assert [[0, 3]] == result.as_array()
Example 7
def test_auto_persist():
    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    df1.show()
    id1 = dag1.spec_uuid()

    dag2 = FugueWorkflow(NativeExecutionEngine({"fugue.workflow.auto_persist": True}))
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").weak_checkpoint(level=None)
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()

    assert id1 == id2
    assert id2 == id3

    dag2 = FugueWorkflow(
        NativeExecutionEngine(
            {
                "fugue.workflow.auto_persist": True,
                "fugue.workflow.auto_persist_value": "abc",
            }
        )
    )
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").weak_checkpoint(level="abc")
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()

    assert id2 == id3

    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    id1 = dag1.spec_uuid()

    dag2 = FugueWorkflow(NativeExecutionEngine({"fugue.workflow.auto_persist": True}))
    df1 = dag2.df([[0]], "a:int")
    df1.show()  # auto persist will not trigger
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").weak_checkpoint(level=None)
    df1.show()
    id3 = dag3.spec_uuid()

    assert id1 == id2
    assert id2 == id3  # checkpoint, including auto_persist doesn't change determinism
Example 8
def test_auto_persist():
    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    df1.show()
    id1 = dag1.spec_uuid()

    dag2 = FugueWorkflow(
        NativeExecutionEngine({"fugue.workflow.auto_persist": True}))
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").persist()
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()

    assert id1 != id2
    assert id2 == id3

    dag2 = FugueWorkflow(
        NativeExecutionEngine({
            "fugue.workflow.auto_persist": True,
            "fugue.workflow.auto_persist_value": "abc"
        }))
    df1 = dag2.df([[0]], "a:int")
    df1.show()
    df1.show()
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").persist("abc")
    df1.show()
    df1.show()
    id3 = dag3.spec_uuid()

    assert id2 == id3

    dag1 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag1.df([[0]], "a:int")
    df1.show()
    id1 = dag1.spec_uuid()

    dag2 = FugueWorkflow(
        NativeExecutionEngine({"fugue.workflow.auto_persist": True}))
    df1 = dag2.df([[0]], "a:int")
    df1.show()  # auto persist will not trigger
    id2 = dag2.spec_uuid()

    dag3 = FugueWorkflow(NativeExecutionEngine())
    df1 = dag3.df([[0]], "a:int").persist()
    df1.show()
    id3 = dag3.spec_uuid()

    assert id1 == id2
    assert id2 != id3
Example 9
def test_workflow():
    builder = FugueWorkflow()

    a = builder.create_data([[0], [0], [1]], "a:int")
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()

    raises(FugueWorkflowCompileError, lambda: builder.df(123))

    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()

    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int")
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]],
          "a:int,b:int")
Example 10
def test_runtime_exception():
    if sys.version_info < (3, 7):
        return

    def tr(df: pd.DataFrame) -> pd.DataFrame:
        raise Exception

    def show(df):
        df.show()

    dag = FugueWorkflow()
    df = dag.df([[0]], "a:int")
    df = df.transform(tr, schema="*")
    show(df)

    try:
        dag.run()
    except Exception:
        assert len(traceback.extract_tb(sys.exc_info()[2])) < 10

    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: False})
    except Exception:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10

    try:
        dag.run("native", {FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: ""})
    except Exception:
        assert len(traceback.extract_tb(sys.exc_info()[2])) > 10
Example 11
    def test_run_ibis_duck(self):
        def _test1(con: ibis.BaseBackend) -> ibis.Expr:
            tb = con.table("a")
            return tb

        def _test2(con: ibis.BaseBackend) -> ibis.Expr:
            tb = con.table("a")
            return tb.mutate(c=tb.a + tb.b)

        dag = FugueWorkflow()
        df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
        res = run_ibis(_test1, ibis_engine="duck", a=df)
        res.assert_eq(df)
        df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
        res = run_ibis(_test2, ibis_engine="duckdb", a=df)
        df2 = dag.df([[0, 1, 1], [2, 3, 5]], "a:long,b:long,c:long")
        res.assert_eq(df2)
        dag.run(NativeExecutionEngine())
Example 12
def test_out_transform(tmpdir):
    pdf = pd.DataFrame([[1, 10], [0, 0], [1, 1], [0, 20]], columns=["a", "b"])

    class T:
        def __init__(self):
            self.n = 0

        def f(self, df: Iterable[Dict[str, Any]]) -> None:
            self.n += 1

    t = T()
    out_transform(pdf, t.f)
    assert 1 == t.n

    t = T()
    out_transform(pdf, t.f, partition=dict(by=["a"]))
    assert 2 == t.n

    dag = FugueWorkflow()
    dag.df(pdf).yield_dataframe_as("x1")
    dag.df(pdf).yield_dataframe_as("x2")
    dag.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    t = T()
    out_transform(dag.yields["x1"], t.f)
    assert 1 == t.n

    t = T()
    out_transform(
        dag.yields["x2"],
        t.f,
        partition=dict(by=["a"]),
        engine_conf={FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)},
    )
    assert 2 == t.n

    # schema: *
    def f3(df: pd.DataFrame, called: callable) -> pd.DataFrame:
        called()
        return df

    cb = Callback()
    result = out_transform(pdf, f3, callback=cb.called)
    assert 1 == cb.ct
Example 13
def test_yield():
    dag = FugueWorkflow()
    dag.df([[0]], "a:int32").show()
    id0 = dag.spec_uuid()
    x = FugueWorkflow().df([[0]], "a:int32")
    x.yield_file_as("x")
    x.show()
    id1 = x.workflow.spec_uuid()
    x = FugueWorkflow().df([[0]], "a:int32")
    x.deterministic_checkpoint().yield_file_as("y")
    x.show()
    id2 = x.workflow.spec_uuid()
    x = FugueWorkflow().df([[0]], "a:int32")
    x.deterministic_checkpoint().yield_dataframe_as("z")
    x.show()
    id3 = x.workflow.spec_uuid()
    # yield doesn't change determinism
    assert id0 == id1
    assert id0 == id2
    assert id0 == id3
Example 14
def test_transform_from_yield(tmpdir):
    # schema: *,x:int
    def f(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(x=1)

    dag = FugueWorkflow()
    dag.df([[0]], "a:int").yield_dataframe_as("x1")
    dag.df([[1]], "b:int").yield_dataframe_as("x2")
    dag.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)})

    result = transform(dag.yields["x1"], f)
    assert isinstance(result, DataFrame)
    assert result.as_array(type_safe=True) == [[0, 1]]

    result = transform(
        dag.yields["x2"],
        f,
        engine_conf={FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: str(tmpdir)},
    )
    assert isinstance(result, DataFrame)
    assert result.as_array(type_safe=True) == [[1, 1]]
Example 15
def test_invalid_module():
    # pylint: disable=no-value-for-parameter

    @module()
    def o1(wf: FugueWorkflow, df1: WorkflowDataFrame,
           df2: WorkflowDataFrame) -> None:
        pass

    @module()
    def o2(wf: FugueWorkflow, dfs: WorkflowDataFrames) -> None:
        pass

    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    dag2 = FugueWorkflow()
    df2 = dag2.df([[1]], "a:int")

    with raises(ValueError):
        o1(df1, df2)

    with raises(ValueError):
        o2(WorkflowDataFrames(a=df1, b=df2))
Example 16
def trim_index(
    compute_engine: FugueExecutionEngine,
    df_graph: FugueDataFrame,
    indexed: bool = False,
    directed: bool = True,
    max_out_deg: int = 0,
    random_seed: Optional[int] = None,
) -> Tuple[FugueDataFrame, Optional[FugueDataFrame]]:
    """
    The very first steps to treat the input graph:
    1) basic validation of the input graph format: at least have ["src", "dst"] cols,
       it will be an unweighted graph if no "weight" col.
    2) trim some edges to avoid super hotspot vertices: random sampling will be done
       on all the edges of a vertex if the number of edges is greater than a threshold,
       this is critical to reduce data skewness and save disk space
    3) index the graph vertices by using sequential integers to represent vertices,
       this is critical to save memory

    :param compute_engine: an execution engine supported by Fugue
    :param df_graph: the input graph data as general Fugue dataframe
    :param indexed: if the input graph is using sequential integers to note vertices
    :param directed: if the graph is directed or not
    :param max_out_deg: the threshold for trimming hotspot vertices, set it to <= 0
                        to turn off trimming
    :param random_seed: optional random seed, for testing only

    Returns a validated, trimmed, and indexed graph
    """
    logging.info("trim_index(): start validating, trimming, and indexing ...")
    if "src" not in df_graph.schema or "dst" not in df_graph.schema:
        raise ValueError(
            f"Input graph NOT in the right format: {df_graph.schema}")

    params = {"max_out_degree": max_out_deg, "random_seed": random_seed}
    dag = FugueWorkflow(compute_engine)
    df = (dag.df(df_graph).partition(by=["src"]).transform(
        trim_hotspot_vertices,
        schema="*",
        params=params,
    ).compute())

    name_id = None
    if indexed is True:
        return df, name_id
    if isinstance(compute_engine, SparkExecutionEngine):
        df_res, name_id = index_graph_spark(df.native,
                                            directed)  # type: ignore
        return SparkDataFrame(df_res), SparkDataFrame(name_id)
    else:
        df_res, name_id = index_graph_pandas(df.as_pandas(), directed)
        return PandasDataFrame(df_res), PandasDataFrame(name_id)
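A hedged usage sketch for trim_index on a tiny edge list, run locally so the pandas branch is taken; imports of trim_index, NativeExecutionEngine, and PandasDataFrame from the surrounding project are assumed.

import pandas as pd

edges = pd.DataFrame(
    [["a", "b", 1.0], ["a", "c", 1.0], ["b", "c", 1.0]],
    columns=["src", "dst", "weight"],
)
# cap every vertex at 2 outgoing edges and re-index vertices as integers
df_indexed, name_id = trim_index(
    NativeExecutionEngine(),
    PandasDataFrame(edges),
    directed=True,
    max_out_deg=2,
    random_seed=42,
)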
Example 17
def test_build_sk_cv(tmpdir):
    space = sum([
        ss(LinearRegression, fit_intercept=Grid(True, False)),
        ss(LinearRegression, normalize=Grid(True, False)),
    ])
    dag = FugueWorkflow()
    build_sk_cv(
        space,
        dag.df(_create_mock_data()),
        scoring="neg_mean_absolute_error",
        cv=4,
        label_col="l",
        feature_prefix="f_",
        save_path=str(tmpdir),
    ).tune(distributable=False, serialize_path=str(tmpdir)).show()
    dag.run()
Example 18
def test_fill():
    dag = FugueWorkflow()
    a = dag.df([[None, 1], [1, None]], "a:int, b:int", data_determiner=to_uuid)
    b = a.fillna({"a": 99, "b": -99})
    assert_eq(
        """
    a=create [[NULL, 1],[1, NULL]] schema a:int, b:int
    fill nulls params a:99, b:-99 from a""",
        dag,
    )
    assert_eq(
        """
    create [[NULL, 1],[1, NULL]] schema a:int, b:int
    fill nulls (a:99, b:-99)""",
        dag,
    )
Example 19
def space_to_df(wf: FugueWorkflow,
                space: Space,
                batch_size: int = 1,
                shuffle: bool = True) -> WorkflowDataFrame:
    def get_data() -> Iterable[List[Any]]:
        it = list(space.encode())  # type: ignore
        if shuffle:
            random.seed(0)
            random.shuffle(it)
        res: List[Any] = []
        for a in it:
            res.append(a)
            if batch_size == len(res):
                yield [json.dumps(res)]
                res = []
        if len(res) > 0:
            yield [json.dumps(res)]

    return wf.df(IterableDataFrame(get_data(), "__fmin_params__:str"))
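A brief usage sketch for space_to_df; each row of the returned WorkflowDataFrame is a JSON-encoded batch of encoded configurations (Space and Grid as in the other examples, imports assumed).

wf = FugueWorkflow()
space = Space(a=Grid(1, 2), b=Grid("x", "y"))  # 4 configurations in total
params_df = space_to_df(wf, space, batch_size=2)  # -> 2 rows, 2 configs each
params_df.show()
wf.run()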
Example 20
def suggest_sk_model(
    space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
) -> List[Dict[str, Any]]:
    e = make_execution_engine(execution_engine)
    model_path = serialize_path if save_model else ""

    dag = FugueWorkflow()
    df = dag.df(train_df)
    if len(partition_keys) > 0:
        df = df.partition(by=partition_keys)
    skcv = build_sk_cv(
        space=space,
        train_df=df,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
        save_path=model_path,
    )
    result = skcv.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    ).persist()
    best = select_best(result, top=top_n) if top_n > 0 else result
    visualize_top(result, top=visualize_top_n)
    dag.run(e)
    return list(best.result.as_dict_iterable())
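A hedged sketch of calling suggest_sk_model, reusing the ss/Grid/LinearRegression pattern from the other examples; the dataframe and paths below are made up for illustration, and imports of ss, Grid, and suggest_sk_model from the surrounding project are assumed.

import pandas as pd
from sklearn.linear_model import LinearRegression

train = pd.DataFrame(
    [[1.0, 2.0, 5.0], [2.0, 3.0, 8.0], [3.0, 4.0, 11.0]],
    columns=["f_a", "f_b", "label"],
)
space = ss(LinearRegression, fit_intercept=Grid(True, False))
best = suggest_sk_model(
    space,
    train,
    scoring="neg_mean_absolute_error",
    serialize_path="/tmp/tune_serialize",  # hypothetical path
    cv=3,
    label_col="label",
    feature_prefix="f_",
    top_n=1,
)  # returns a list of dicts describing the best configuration(s)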
Example 21
    def _space_to_df(self,
                     wf: FugueWorkflow,
                     batch_size: int = 1,
                     shuffle: bool = True) -> WorkflowDataFrame:
        def get_data() -> Iterable[List[Any]]:
            it = list(self._space)  # type: ignore
            if shuffle:
                random.seed(0)
                random.shuffle(it)
            res: List[Any] = []
            for a in it:
                res.append(a)
                if batch_size == len(res):
                    yield [pickle.dumps(res)]
                    res = []
            if len(res) > 0:
                yield [pickle.dumps(res)]

        return wf.df(
            IterableDataFrame(get_data(),
                              f"{TUNE_DATASET_PARAMS_PREFIX}:binary"))
Example 22
def test_modified_exception():
    if sys.version_info < (3, 7):
        return

    def tr(df: pd.DataFrame) -> pd.DataFrame:
        raise Exception

    def show(df):
        df.show()

    def tt(df):
        __modified_exception__ = NotImplementedError()
        return df.transform(tr, schema="*")

    dag = FugueWorkflow()
    df = dag.df([[0]], "a:int")
    df = tt(df)
    show(df)

    try:
        dag.run()
    except Exception as ex:
        assert isinstance(ex.__cause__, NotImplementedError)
Example 23
def test_head():
    dag = FugueWorkflow()
    a = dag.df(
        [[None, 1], [None, 2], [1, None], [1, 2]],
        "a:double, b:double",
        data_determiner=to_uuid,
    )
    b = a.partition(by=["a"], presort="b desc").take(1, na_position="first")
    c = b.take(1, presort="b desc", na_position="first")
    assert_eq(
        """
    a=create [[NULL, 1], [NULL, 2], [1, NULL], [1, 2]] schema a:double, b:double
    b=take 1 row from a prepartition by a presort b desc nulls first
    c=take 1 row from b presort b desc nulls first""",
        dag,
    )
    # anonymous
    assert_eq(
        """
    create [[NULL, 1], [NULL, 2], [1, NULL], [1, 2]] schema a:double, b:double
    take 1 row prepartition by a presort b desc nulls first
    take 1 row presort b desc nulls first""",
        dag,
    )
Example 24
def suggest_sk_stacking_model(
    space: Space,
    stack_space: Space,
    train_df: Any,
    scoring: str,
    serialize_path: str,
    cv: int = 5,
    feature_prefix: str = "",
    label_col: str = "label",
    save_model: bool = False,
    partition_keys: List[str] = _EMPTY_LIST,
    top_n: int = 1,
    visualize_top_n: int = 0,
    objective_runner: Optional[ObjectiveRunner] = None,
    distributable: Optional[bool] = None,
    execution_engine: Any = None,
    stack_cv: int = 2,
    stack_method: str = "auto",
    stack_passthrough: bool = False,
) -> List[Dict[str, Any]]:
    e = make_execution_engine(execution_engine)
    model_path = serialize_path if save_model else ""

    dag = FugueWorkflow()
    df = dag.df(train_df)
    if len(partition_keys) > 0:
        df = df.partition(by=partition_keys)
    skcv = build_sk_cv(
        space=space,
        train_df=df,
        scoring=scoring,
        cv=cv,
        feature_prefix=feature_prefix,
        label_col=label_col,
    )
    result = skcv.tune(
        objective_runner=objective_runner,
        distributable=distributable,
        serialize_path=serialize_path,
        shuffle=True,
    ).persist()
    best_models = select_best(result.transform(_extract_model), top=1)
    if top_n > 0:
        best_models = select_best(best_models.drop(["_sk__model"]), top=top_n)
    kwargs = Space(
        _sk__scoring=scoring,
        _sk__cv=cv,
        _sk__feature_prefix=feature_prefix,
        _sk__label_col=label_col,
        _sk__save_path=model_path,
        _sk__stack_cv=stack_cv,
        _sk__method=stack_method,
        _sk__passthrough=stack_passthrough,
    )
    space_df = best_models.process(
        _process_stack_space,
        params=dict(keys=partition_keys, space=stack_space * kwargs),
    )
    data = serialize_df(df, name="_sk__train_df", path=serialize_path)
    if len(partition_keys) > 0:
        data = data.inner_join(space_df.broadcast())
    else:
        data = data.cross_join(space_df.broadcast())
    result = tune(
        data,
        tunable=tunable(_sk_stack_cv),
        distributable=distributable,
        objective_runner=objective_runner,
    )
    best = select_best(result, top=1)
    visualize_top(result, top=visualize_top_n)
    dag.run(e)
    return list(best.result.as_dict_iterable())
Example 25
def input3(wf: FugueWorkflow, a: int, b: int) -> WorkflowDataFrames:
    return WorkflowDataFrames(a=wf.df([[a]], "a:int"),
                              b=wf.df([[b]], "b:int"))
Example 26
def input2(wf: FugueWorkflow, a: int) -> WorkflowDataFrame:
    return wf.df([[a]], "a:int")
Example 27
def input1(wf: FugueWorkflow) -> WorkflowDataFrame:
    return wf.df([[0]], "a:int")
Example 28
def create(wf: FugueWorkflow, n: int = 1) -> WorkflowDataFrame:
    return wf.df([[n]], "a:int")
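A hedged sketch of how helpers like the ones above are typically used as Fugue modules. It assumes the module() decorator from Example 15 and that a module whose first parameter is a FugueWorkflow can be invoked with an existing workflow instance; the function name below is made up for illustration.

@module()
def make_single_row(wf: FugueWorkflow, n: int = 1) -> WorkflowDataFrame:
    return wf.df([[n]], "a:int")

dag = FugueWorkflow()
make_single_row(dag, n=2).show()  # assumption: wf is taken from the first argument
dag.run()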