コード例 #1
0
def test_transform():
    """Check ``transform`` on a pandas frame, with and without partitioning.

    Covers an explicitly passed schema, a function whose schema is declared
    in the comment above it, a partitioned run, and a wrapped input frame.
    """
    rows = [[1, 10], [0, 0], [1, 1], [0, 20]]
    pdf = pd.DataFrame(rows, columns=["a", "b"])

    def f1(df: pd.DataFrame) -> pd.DataFrame:
        """Return the one row with the smallest ``b``."""
        ordered = df.sort_values("b")
        return ordered.head(1)

    # f1 has no schema comment, so the schema is supplied as an argument.
    explicit = transform(pdf, f1, schema="*")
    assert isinstance(explicit, pd.DataFrame)
    assert [[0, 0]] == explicit.values.tolist()

    # NOTE(review): transform(pdf, f2) below passes no schema, so the comment
    # right above f2 presumably supplies it -- keep that comment verbatim and
    # directly above the def.

    # schema: *
    def f2(df: pd.DataFrame) -> pd.DataFrame:
        """Return the one row with the smallest ``b``."""
        ordered = df.sort_values("b")
        return ordered.head(1)

    implicit = transform(pdf, f2)
    assert isinstance(implicit, pd.DataFrame)
    assert [[0, 0]] == implicit.values.tolist()

    # Partitioned by "a": one row per distinct key; order the rows by the
    # first column before comparing.
    per_key = transform(pdf, f2, partition={"by": ["a"]})
    assert isinstance(per_key, pd.DataFrame)
    got = per_key.values.tolist()
    got.sort(key=lambda row: row[0])
    assert got == [[0, 0], [1, 1]]

    # A PandasDataFrame input is expected to come back as a DataFrame.
    wrapped = transform(PandasDataFrame(pdf), f2)
    assert isinstance(wrapped, DataFrame)
コード例 #2
0
ファイル: test_execution_engine.py プロジェクト: gityow/fugue
def test_transform():
    """Exercise ``transform`` with a callback using ``engine="dask"``.

    Three runs: a local result with a callback, a non-local result without
    one, and a persisted non-local result with a callback.
    """

    class CB:
        """Thread-safe accumulator used as the callback target."""

        def __init__(self):
            self.n = 0
            self._lock = RLock()

        def add(self, n):
            # Guard the update with the lock before touching the counter.
            with self._lock:
                self.n = self.n + n

    def tr(df: List[List[Any]], add: Optional[callable]) -> List[List[Any]]:
        """Report the row count via ``add`` (if given) and pickle column 0."""
        if add is not None:
            add(len(df))
        pickled = []
        for row in df:
            pickled.append([pickle.dumps(row[0])])
        return pickled

    pdf = pd.DataFrame({"a": list(range(5))})
    # Keyword arguments shared by all three calls below.
    shared = dict(
        schema="b:binary",
        force_output_fugue_dataframe=True,
        engine="dask",
    )

    cb = CB()
    local_res = transform(pdf, tr, callback=cb.add, as_local=True, **shared)
    assert local_res.is_local
    assert local_res.count() == 5
    assert cb.n == 5

    # No callback and no as_local: the result is not local.
    remote_res = transform(pdf, tr, **shared)
    assert not remote_res.is_local
    assert remote_res.count() == 5

    cb = CB()

    # when you have a persist, you can use callback
    persisted_res = transform(pdf, tr, callback=cb.add, persist=True, **shared)
    assert not persisted_res.is_local
    assert persisted_res.count() == 5
    assert cb.n == 5
コード例 #3
0
    def test_interfaceless(self):
        """Run a comment-annotated pandas function over a Spark frame.

        The input is partitioned by ``a``; the result is expected to be an
        ``SDataFrame`` with one row per key.
        """
        rows = [[1, 10], [0, 0], [1, 1], [0, 20]]
        sdf = self.spark_session.createDataFrame(rows, "a int,b int")

        # NOTE(review): no schema argument is passed to transform below, so
        # the comment directly above f1 presumably supplies it -- keep it
        # verbatim and adjacent to the def.

        # schema:*
        def f1(df: pd.DataFrame) -> pd.DataFrame:
            """Return the one row with the smallest ``b``."""
            ordered = df.sort_values("b")
            return ordered.head(1)

        result = transform(sdf, f1, partition={"by": ["a"]}, engine=self.engine)
        assert isinstance(result, SDataFrame)

        # Sort by "a" after collecting to pandas, then compare the rows.
        collected = result.toPandas().sort_values(["a"])
        assert collected.values.tolist() == [[0, 0], [1, 1]]
コード例 #4
0
ファイル: test_interfaceless.py プロジェクト: gityow/fugue
def test_transform_from_yield(tmpdir):
    """Feed dataframes yielded by a finished workflow into ``transform``.

    The workflow is run with ``tmpdir`` as its checkpoint path; the second
    ``transform`` call passes the same path through ``engine_conf``.
    """

    # schema: *,x:int
    def f(df: pd.DataFrame) -> pd.DataFrame:
        """Append a constant column ``x`` equal to 1."""
        return df.assign(x=1)

    checkpoint_dir = str(tmpdir)

    workflow = FugueWorkflow()
    workflow.df([[0]], "a:int").yield_dataframe_as("x1")
    workflow.df([[1]], "b:int").yield_dataframe_as("x2")
    workflow.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: checkpoint_dir})

    # First yield, default configuration.
    first = transform(workflow.yields["x1"], f)
    assert isinstance(first, DataFrame)
    assert [[0, 1]] == first.as_array(type_safe=True)

    # Second yield, with the checkpoint path passed explicitly.
    conf = {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: checkpoint_dir}
    second = transform(workflow.yields["x2"], f, engine_conf=conf)
    assert isinstance(second, DataFrame)
    assert [[1, 1]] == second.as_array(type_safe=True)
コード例 #5
0
ファイル: test_interfaceless.py プロジェクト: gityow/fugue
def test_transform():
    """Check ``transform`` on a pandas frame, including output type and callback.

    Covers an explicit schema, a comment-declared schema, partitioned runs
    (plain and with ``force_output_fugue_dataframe``), a wrapped input frame,
    and a function that receives a callback.
    """
    rows = [[1, 10], [0, 0], [1, 1], [0, 20]]
    pdf = pd.DataFrame(rows, columns=["a", "b"])
    by_a = {"by": ["a"]}

    def f1(df: pd.DataFrame) -> pd.DataFrame:
        """Return the one row with the smallest ``b``."""
        ordered = df.sort_values("b")
        return ordered.head(1)

    # f1 has no schema comment, so the schema is supplied as an argument.
    explicit = transform(pdf, f1, schema="*")
    assert isinstance(explicit, pd.DataFrame)
    assert [[0, 0]] == explicit.values.tolist()

    # NOTE(review): the calls on f2 and f3 below pass no schema, so the
    # comments right above those defs presumably supply it -- keep them
    # verbatim and adjacent to the defs.

    # schema: *
    def f2(df: pd.DataFrame) -> pd.DataFrame:
        """Return the one row with the smallest ``b``."""
        ordered = df.sort_values("b")
        return ordered.head(1)

    implicit = transform(pdf, f2)
    assert isinstance(implicit, pd.DataFrame)
    assert [[0, 0]] == implicit.values.tolist()

    # Partitioned by "a": one row per key, ordered by the first column
    # before comparing.
    per_key = transform(pdf, f2, partition=by_a)
    assert isinstance(per_key, pd.DataFrame)
    got = per_key.values.tolist()
    got.sort(key=lambda row: row[0])
    assert got == [[0, 0], [1, 1]]

    # Same partitioned call, now requesting the DataFrame wrapper as output.
    forced = transform(
        pdf, f2, partition=by_a, force_output_fugue_dataframe=True
    )
    assert isinstance(forced, DataFrame)

    # A PandasDataFrame input is expected to come back as a DataFrame.
    wrapped = transform(PandasDataFrame(pdf), f2)
    assert isinstance(wrapped, DataFrame)

    # schema: *
    def f3(df: pd.DataFrame, called: callable) -> pd.DataFrame:
        """Invoke the callback once and pass the frame through unchanged."""
        called()
        return df

    cb = Callback()
    result = transform(pdf, f3, callback=cb.called)
    assert cb.ct == 1