def _test_dataframe_merge(backend, protocol, n_workers):
    """Compare ``dataframe_merge`` with a plain pandas merge on a local cluster.

    Two frames sharing a "key" column are built with pandas, merged locally
    to obtain the expected result, then distributed (optionally as cudf
    frames) and merged through ``dataframe_merge``.

    Parameters
    ----------
    backend
        When equal to ``"cudf"`` the inputs are converted with
        ``cudf.DataFrame.from_pandas`` and the result is checked with cudf's
        ``assert_eq``; the test is skipped if cudf cannot be imported.
        Any other value keeps pandas frames and checks with
        ``pd.testing.assert_frame_equal``.
    protocol
        Forwarded to ``LocalCluster(protocol=...)``.
    n_workers
        Number of single-threaded worker processes; also drives the row
        count (``n_workers * 10``) and the partition counts.
    """
    use_cudf = backend == "cudf"
    if use_cudf:
        cudf = pytest.importorskip("cudf")
        from cudf.tests.utils import assert_eq
    else:
        from dask.dataframe.utils import assert_eq

    # Push the UCX transport selection into dask's global configuration.
    ucx_settings = {"ucx": {"TLS": "tcp,sockcm,cuda_copy"}}
    dask.config.update(dask.config.global_config, ucx_settings, priority="new")

    with LocalCluster(
        protocol=protocol,
        dashboard_address=None,
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    ) as cluster, Client(cluster):
        nrows = n_workers * 10
        tail_start = nrows // 3

        # Let's make some dataframes that we can join on the "key" column
        left = pd.DataFrame(
            {"key": np.arange(nrows), "payload1": np.arange(nrows)}
        )
        shuffled_keys = np.arange(nrows)
        np.random.shuffle(shuffled_keys)
        # The right-hand frame only carries the last two thirds of the
        # shuffled keys, so some keys on the left have no match.
        right = pd.DataFrame(
            {
                "key": shuffled_keys[tail_start:],
                "payload2": np.arange(nrows)[tail_start:],
            }
        )
        expected = left.merge(right).set_index("key")

        if use_cudf:
            left = cudf.DataFrame.from_pandas(left)
            right = cudf.DataFrame.from_pandas(right)

        # Deliberately use different partition counts on each side, never
        # dropping below one partition.
        ddf_left = dd.from_pandas(left, npartitions=n_workers + 1)
        ddf_right = dd.from_pandas(right, npartitions=max(n_workers - 1, 1))

        merged = dataframe_merge(ddf_left, ddf_right, on="key")
        got = merged.set_index("key").compute()

        check = assert_eq if use_cudf else pd.testing.assert_frame_equal
        check(got, expected)
def _test_dataframe_merge_empty_partitions(nrows, npartitions):
    """Merge two ``nrows``-row frames split into ``npartitions`` partitions.

    Starts a TCP ``LocalCluster`` with ``npartitions`` single-threaded worker
    processes, merges two pandas-backed dask dataframes on "key" through
    ``dataframe_merge`` and asserts the result equals the plain pandas merge.

    Parameters
    ----------
    nrows
        Number of rows in each input frame.
    npartitions
        Requested partition count for both inputs and the number of workers.
        NOTE(review): the name suggests callers pass fewer rows than
        partitions; nothing here enforces that.
    """
    with LocalCluster(
        protocol="tcp",
        dashboard_address=None,
        n_workers=npartitions,
        threads_per_worker=1,
        processes=True,
    ) as cluster, Client(cluster):
        left = pd.DataFrame(
            {"key": np.arange(nrows), "payload1": np.arange(nrows)}
        )

        # Same key set on the right, but in random order.
        shuffled_keys = np.arange(nrows)
        np.random.shuffle(shuffled_keys)
        right = pd.DataFrame(
            {"key": shuffled_keys, "payload2": np.arange(nrows)}
        )

        expected = left.merge(right).set_index("key")

        ddf_left = dd.from_pandas(left, npartitions=npartitions)
        ddf_right = dd.from_pandas(right, npartitions=npartitions)

        merged = dataframe_merge(ddf_left, ddf_right, on="key")
        got = merged.set_index("key").compute()

        pd.testing.assert_frame_equal(got, expected)
def _test_dataframe_merge(backend, protocol, n_workers):
    """Compare ``dataframe_merge`` with a plain pandas merge on a local cluster.

    Both inputs are created with pandas, merged locally for the expected
    result, optionally converted to cudf, distributed with differing
    partition counts and merged via ``dataframe_merge``. The comparison is
    always done on pandas frames.

    Parameters
    ----------
    backend
        When equal to ``"cudf"`` the inputs are converted with
        ``cudf.DataFrame.from_pandas`` and the computed result is converted
        back with ``to_pandas()`` before comparison. ``cudf`` is resolved
        from the enclosing scope, not imported here.
    protocol
        Forwarded to ``LocalCluster(protocol=...)``.
    n_workers
        Number of single-threaded worker processes; also drives the row
        count (``n_workers * 10``) and the partition counts.
    """
    use_cudf = backend == "cudf"

    with LocalCluster(
        protocol=protocol,
        dashboard_address=None,
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    ) as cluster, Client(cluster):
        nrows = n_workers * 10
        tail_start = nrows // 3

        # Let's make some dataframes that we can join on the "key" column
        left = pd.DataFrame(
            {"key": np.arange(nrows), "payload1": np.arange(nrows)}
        )
        shuffled_keys = np.arange(nrows)
        np.random.shuffle(shuffled_keys)
        # Only the last two thirds of the shuffled keys go to the right side.
        right = pd.DataFrame(
            {
                "key": shuffled_keys[tail_start:],
                "payload2": np.arange(nrows)[tail_start:],
            }
        )
        expected = left.merge(right).set_index("key")

        if use_cudf:
            left = cudf.DataFrame.from_pandas(left)
            right = cudf.DataFrame.from_pandas(right)

        # Different partition counts per side; the right side never goes
        # below a single partition.
        ddf_left = dd.from_pandas(left, npartitions=n_workers + 1)
        ddf_right = dd.from_pandas(right, npartitions=max(n_workers - 1, 1))

        merged = dataframe_merge(ddf_left, ddf_right, on="key")
        got = merged.set_index("key").compute()

        if use_cudf:
            got = got.to_pandas()
            got.index.names = ["key"]  # TODO: this shouldn't be needed

        pd.testing.assert_frame_equal(got, expected)
def merge_explicit_comms(args, ddf1, ddf2):
    """Time an explicit-comms merge of ``ddf1`` and ``ddf2`` on "key".

    The merge is built with ``explicit_comms.dataframe_merge``, persisted,
    and waited on; the elapsed time between the two ``clock()`` readings is
    returned.

    Parameters
    ----------
    args
        Accepted for interface compatibility; not used in this function.
    ddf1, ddf2
        The two dataframes handed to ``explicit_comms.dataframe_merge``.

    Returns
    -------
    The difference between ``clock()`` after the wait and ``clock()`` before
    the merge was started.
    """
    started = clock()
    merged = explicit_comms.dataframe_merge(ddf1, ddf2, on="key")
    # Block until the persisted result is fully computed so the timing
    # covers the whole merge rather than just graph construction.
    wait(merged.persist())
    return clock() - started