import nvtabular as nvt
import nvtabular.ops as ops

# NOTE: the pytest fixtures (tmpdir, df, dataset, engine, ...) and the
# mycols_pq / mycols_csv column lists are assumed to be provided by the
# surrounding test suite's conftest.


def test_join_external_workflow(tmpdir, df, dataset, engine, preproc):
    # Define "external" table
    how = "left"
    drop_duplicates = True
    cache = "device"
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()

    # Define Op
    on = "id"
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    merge_op = ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    # Define Workflow
    processor = nvt.Workflow(
        cat_names=["name-cat", "name-string"],
        cont_names=["x", "y", "id"],
        label_name=["label"],
    )
    if preproc == "cat":
        processor.add_cat_preprocess(merge_op)
    elif preproc == "cont":
        processor.add_cont_preprocess(merge_op)
    elif preproc == "feat":
        processor.add_feature(merge_op)
    else:
        processor.add_preprocess(merge_op)
    processor.finalize()
    processor.apply(dataset, output_format=None)

    # Validate
    for gdf, part in zip(dataset.to_iter(), processor.get_ddf().partitions):
        new_gdf = part.compute(scheduler="synchronous")
        assert len(gdf) == len(new_gdf)
        # The joined column should equal id + shift on every row
        # (the original ".all() == .all()" comparison only compared two booleans).
        assert ((new_gdf["id"] + shift) == new_gdf["new_col"]).all()
        # The join should preserve the original "id" values.
        assert (gdf["id"].sort_values().values == new_gdf["id"].sort_values().values).all()
        assert "new_col_2" in new_gdf.columns
        assert "new_col_3" not in new_gdf.columns
def test_join_external(tmpdir, df, dataset, engine, kind_ext, cache, how, drop_duplicates):
    # Define "external" table
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()

    # Convert the external table to the requested representation
    if kind_ext == "pandas":
        df_ext = df_ext.to_pandas()
    elif kind_ext == "arrow":
        df_ext = df_ext.to_arrow()
    elif kind_ext == "parquet":
        path = tmpdir.join("external.parquet")
        df_ext.to_parquet(path)
        df_ext = path
    elif kind_ext == "csv":
        path = tmpdir.join("external.csv")
        df_ext.to_csv(path)
        df_ext = path

    # Define Op
    on = "id"
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    merge_op = ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )
    columns = mycols_pq if engine == "parquet" else mycols_csv
    columns_ctx = {}
    columns_ctx["all"] = {}
    columns_ctx["all"]["base"] = columns

    # Iterate, apply op, and check result
    for gdf in dataset.to_iter():
        new_gdf = merge_op.apply_op(gdf, columns_ctx, "all")
        check_gdf = gdf.merge(df_ext_check, how=how, on=on)
        assert len(check_gdf) == len(new_gdf)
        # The joined column should equal id + shift on every row
        # (the original ".all() == .all()" comparison only compared two booleans).
        assert ((new_gdf["id"] + shift) == new_gdf["new_col"]).all()
        # Every joined "id" must come from the original frame.
        assert new_gdf["id"].isin(gdf["id"]).all()
        assert "new_col_2" in new_gdf.columns
        assert "new_col_3" not in new_gdf.columns
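

# A minimal, pandas-only sketch of the reference behavior the tests above
# check: JoinExternal is expected to act like a plain merge on the key column,
# restricted to columns_ext, with duplicates optionally dropped from the
# external table first. The names and values below (left, external, shift=100)
# are illustrative only and do not use nvtabular.
def _reference_join_sketch():
    import pandas as pd

    left = pd.DataFrame({"id": [0, 1, 2, 2], "x": [1.0, 2.0, 3.0, 4.0]})
    external = pd.DataFrame(
        {
            "id": [0, 1, 2],
            "new_col": [100, 101, 102],  # id + shift
            "new_col_2": ["keep"] * 3,
            "new_col_3": ["ignore"] * 3,  # not in columns_ext -> dropped
        }
    )
    columns_ext = ["id", "new_col", "new_col_2"]
    external = external[columns_ext].drop_duplicates(ignore_index=True)
    joined = left.merge(external, how="left", on="id")
    assert (joined["new_col"] == joined["id"] + 100).all()
    assert "new_col_3" not in joined.columns
    return joined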