def test_transform():
    """Check ``transform`` on a small local pandas frame.

    Four calls are made: one with an explicit ``schema="*"``, one that passes
    no schema for a transformer carrying a ``# schema: *`` comment, one
    partitioned by column ``a``, and one whose input is wrapped in
    ``PandasDataFrame``.

    NOTE(review): the ``# schema: *`` comment above ``f2`` looks like a Fugue
    schema hint that is read from the source; keep it directly above the
    ``def`` -- confirm before moving or rewording it.
    """
    source = pd.DataFrame(
        [[1, 10], [0, 0], [1, 1], [0, 20]],
        columns=["a", "b"],
    )

    def f1(df: pd.DataFrame) -> pd.DataFrame:
        ordered = df.sort_values("b")
        return ordered.head(1)

    out = transform(source, f1, schema="*")
    assert isinstance(out, pd.DataFrame)
    assert [[0, 0]] == out.values.tolist()

    # schema: *
    def f2(df: pd.DataFrame) -> pd.DataFrame:
        ordered = df.sort_values("b")
        return ordered.head(1)

    out = transform(source, f2)
    assert isinstance(out, pd.DataFrame)
    assert [[0, 0]] == out.values.tolist()

    out = transform(source, f2, partition={"by": ["a"]})
    assert isinstance(out, pd.DataFrame)
    # Row order across partitions is not asserted; sort by the key column.
    rows = out.values.tolist()
    rows.sort(key=lambda row: row[0])
    assert rows == [[0, 0], [1, 1]]

    wrapped = PandasDataFrame(source)
    assert isinstance(transform(wrapped, f2), DataFrame)
def test_transform():
    """Check ``transform`` with ``engine="dask"`` and an optional callback.

    A lock-protected counter is handed in as the callback, and the transformer
    reports the size of each chunk it receives.  Three calls are made:

    * callback with ``as_local=True`` -- result is local, counter reaches 5;
    * no callback -- result is not local;
    * callback with ``persist=True`` -- result is not local, a fresh counter
      reaches 5.
    """

    class CB:
        def __init__(self):
            self._lock = RLock()
            self.n = 0

        def add(self, n):
            # Guard the increment with the lock held by this instance.
            with self._lock:
                self.n += n

    def tr(df: List[List[Any]], add: Optional[callable]) -> List[List[Any]]:
        if add is not None:
            add(len(df))
        pickled = []
        for x in df:
            pickled.append([pickle.dumps(x[0])])
        return pickled

    pdf = pd.DataFrame({"a": list(range(5))})
    # Keyword arguments shared by every call below.
    shared = {
        "schema": "b:binary",
        "force_output_fugue_dataframe": True,
        "engine": "dask",
    }

    cb = CB()
    res = transform(pdf, tr, callback=cb.add, as_local=True, **shared)
    assert res.is_local
    assert res.count() == 5
    assert cb.n == 5

    res = transform(pdf, tr, **shared)
    assert not res.is_local
    assert res.count() == 5

    cb = CB()
    res = transform(
        pdf,
        tr,
        callback=cb.add,
        persist=True,  # when you have a persist, you can use callback
        **shared,
    )
    assert not res.is_local
    assert res.count() == 5
    assert cb.n == 5
def test_interfaceless(self):
    """Check ``transform`` on a Spark DataFrame partitioned by ``a``.

    The input is built from ``self.spark_session`` and the call uses
    ``self.engine``.  The result must be an ``SDataFrame`` holding the row
    with the smallest ``b`` for each value of ``a``.

    NOTE(review): the ``# schema:*`` comment above ``f1`` looks like a Fugue
    schema hint that is read from the source; keep it directly above the
    ``def`` -- confirm before moving or rewording it.
    """
    rows = [[1, 10], [0, 0], [1, 1], [0, 20]]
    sdf = self.spark_session.createDataFrame(rows, "a int,b int")

    # schema:*
    def f1(df: pd.DataFrame) -> pd.DataFrame:
        ordered = df.sort_values("b")
        return ordered.head(1)

    result = transform(sdf, f1, partition={"by": ["a"]}, engine=self.engine)
    assert isinstance(result, SDataFrame)

    # Sort by the partition key so the comparison does not depend on row order.
    collected = result.toPandas().sort_values(["a"])
    assert collected.values.tolist() == [[0, 0], [1, 1]]
def test_transform_from_yield(tmpdir):
    """Check ``transform`` on dataframes yielded by a ``FugueWorkflow``.

    Two single-row frames are yielded as ``x1`` and ``x2`` from a workflow run
    whose checkpoint path is ``tmpdir``.  Each yielded frame is then passed to
    ``transform`` -- the second time with the same checkpoint path supplied
    through ``engine_conf`` -- and must come back with an extra column
    ``x`` equal to 1.

    NOTE(review): the ``# schema: *,x:int`` comment above ``f`` looks like a
    Fugue schema hint that is read from the source; keep it directly above the
    ``def`` -- confirm before moving or rewording it.
    """

    # schema: *,x:int
    def f(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(x=1)

    checkpoint_dir = str(tmpdir)

    dag = FugueWorkflow()
    dag.df([[0]], "a:int").yield_dataframe_as("x1")
    dag.df([[1]], "b:int").yield_dataframe_as("x2")
    dag.run("", {FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: checkpoint_dir})

    first = transform(dag.yields["x1"], f)
    assert isinstance(first, DataFrame)
    assert [[0, 1]] == first.as_array(type_safe=True)

    second = transform(
        dag.yields["x2"],
        f,
        engine_conf={FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH: checkpoint_dir},
    )
    assert isinstance(second, DataFrame)
    assert [[1, 1]] == second.as_array(type_safe=True)
def test_transform():
    """Check ``transform`` on a small local pandas frame, including callbacks.

    Calls covered: explicit ``schema="*"``; no schema for a transformer
    carrying a ``# schema: *`` comment; partitioned by ``a``; partitioned with
    ``force_output_fugue_dataframe=True``; input wrapped in
    ``PandasDataFrame``; and a transformer that invokes a callback, after
    which ``cb.ct`` must be 1.

    NOTE(review): the ``# schema: *`` comments above ``f2`` and ``f3`` look
    like Fugue schema hints that are read from the source; keep each directly
    above its ``def`` -- confirm before moving or rewording them.
    """
    source = pd.DataFrame(
        [[1, 10], [0, 0], [1, 1], [0, 20]],
        columns=["a", "b"],
    )
    by_a = {"by": ["a"]}

    def f1(df: pd.DataFrame) -> pd.DataFrame:
        ordered = df.sort_values("b")
        return ordered.head(1)

    out = transform(source, f1, schema="*")
    assert isinstance(out, pd.DataFrame)
    assert [[0, 0]] == out.values.tolist()

    # schema: *
    def f2(df: pd.DataFrame) -> pd.DataFrame:
        ordered = df.sort_values("b")
        return ordered.head(1)

    out = transform(source, f2)
    assert isinstance(out, pd.DataFrame)
    assert [[0, 0]] == out.values.tolist()

    out = transform(source, f2, partition=by_a)
    assert isinstance(out, pd.DataFrame)
    # Row order across partitions is not asserted; sort by the key column.
    rows = out.values.tolist()
    rows.sort(key=lambda row: row[0])
    assert rows == [[0, 0], [1, 1]]

    out = transform(
        source,
        f2,
        partition=by_a,
        force_output_fugue_dataframe=True,
    )
    assert isinstance(out, DataFrame)

    wrapped = PandasDataFrame(source)
    assert isinstance(transform(wrapped, f2), DataFrame)

    # schema: *
    def f3(df: pd.DataFrame, called: callable) -> pd.DataFrame:
        called()
        return df

    cb = Callback()
    out = transform(source, f3, callback=cb.called)
    assert cb.ct == 1