def test_chaining_3():
    """Chain Dropna, JoinGroupby, and a LambdaOp/Rename pipeline in one Workflow.

    Builds a small ads-style frame, computes per-ad click sum/count via
    JoinGroupby, derives a CTR-like column with a LambdaOp, and checks that
    the expected output columns are present after fit/transform.
    """
    source_df = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    # Three op graphs: null removal, groupby stats, and a derived-ratio branch.
    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    joined_lambda = (
        joined
        >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"])
        >> ops.Rename(postfix="_ctr")
    )

    workflow = Workflow(platform_features + joined + joined_lambda)
    dataset = nvt.Dataset(source_df, engine="parquet")
    workflow.fit(dataset)

    transformed = workflow.transform(dataset).to_ddf().compute()

    expected_cols = ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"]
    assert all(col in transformed.columns for col in expected_cols)
def test_dropna(tmpdir, df, dataset, engine):
    """Dropna via the column-selector Workflow API removes every null row.

    Parameters are pytest fixtures: a temp directory, a reference DataFrame,
    an nvt.Dataset over it, and the file-engine name ("parquet" or csv).
    """
    columns = mycols_pq if engine == "parquet" else mycols_csv
    dropna_features = columns >> ops.Dropna()

    processor = nvt.Workflow(dropna_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # Compare the actual column sets. The original
    # `new_gdf.columns.all() == df.columns.all()` collapsed each Index to a
    # single truthy scalar, so it passed even for mismatched schemas.
    assert set(new_gdf.columns) == set(df.columns)
    # isnull().any() flags a column containing *any* null. The original used
    # .all(), which only fires when an entire column is null, so partial
    # nulls — exactly what Dropna must remove — went undetected.
    assert new_gdf.isnull().any().sum() == 0, "null values exist"
def test_dropna(tmpdir, df, dataset, engine):
    """Dropna via the legacy apply_op API removes nulls from each chunk.

    NOTE(review): this function shares its name with another ``test_dropna``
    in this module, so whichever is defined later shadows the other and only
    one is collected by pytest — consider renaming one of them.
    """
    dropna = ops.Dropna()
    columns = mycols_pq if engine == "parquet" else mycols_csv
    # Legacy columns-context structure expected by apply_op.
    columns_ctx = {"all": {"base": columns}}

    for gdf in dataset.to_iter():
        new_gdf = dropna.apply_op(gdf, columns_ctx, "all")
        # Compare actual column sets; the original Index.all() == Index.all()
        # comparison reduced each side to one scalar and was near-vacuous.
        assert set(new_gdf.columns) == set(gdf.columns)
        # any() catches partial nulls; the original all() only fired when an
        # entire column was null, missing rows Dropna failed to remove.
        assert new_gdf.isnull().any().sum() == 0, "null values exist"