def _test_dataframe_merge(backend, protocol, n_workers):
    """Compare ``dataframe_merge`` on "key" with a plain pandas merge.

    A ``LocalCluster`` of ``n_workers`` single-threaded worker processes is
    started with the given ``protocol``. Two frames sharing a "key" column
    are built, merged through ``dataframe_merge``, indexed by "key" and
    computed; the result must equal ``df1.merge(df2).set_index("key")``
    evaluated in pandas.

    Parameters
    ----------
    backend
        ``"cudf"`` converts both inputs to cuDF frames (cuDF is obtained via
        ``pytest.importorskip``) and compares with cuDF's ``assert_eq``; any
        other value keeps pandas frames and compares with
        ``pd.testing.assert_frame_equal``.
    protocol
        Forwarded to ``LocalCluster(protocol=...)``.
    n_workers
        Number of worker processes. Also fixes the row count
        (``n_workers * 10``) and the partition counts of both inputs.
    """
    use_cudf = backend == "cudf"
    if use_cudf:
        cudf = pytest.importorskip("cudf")
        from cudf.tests.utils import assert_eq
    else:
        # Not used further down: the non-cudf path asserts with pd.testing.
        from dask.dataframe.utils import assert_eq  # noqa: F401

    # Applied to the process-wide dask config before the cluster starts.
    ucx_overrides = {"ucx": {"TLS": "tcp,sockcm,cuda_copy"}}
    dask.config.update(dask.config.global_config, ucx_overrides, priority="new")

    cluster_options = dict(
        protocol=protocol,
        dashboard_address=None,
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    )
    with LocalCluster(**cluster_options) as cluster, Client(cluster):
        row_count = n_workers * 10
        first_kept = row_count // 3

        # Left side: every key 0..row_count-1, in order.
        left = pd.DataFrame(
            {"key": np.arange(row_count), "payload1": np.arange(row_count)}
        )

        # Right side: shuffled keys with the first third dropped, so only a
        # subset of the left keys finds a partner.
        shuffled_keys = np.arange(row_count)
        np.random.shuffle(shuffled_keys)
        right = pd.DataFrame(
            {
                "key": shuffled_keys[first_kept:],
                "payload2": np.arange(row_count)[first_kept:],
            }
        )

        # Reference result, computed before any backend conversion.
        expected = left.merge(right).set_index("key")

        if use_cudf:
            left = cudf.DataFrame.from_pandas(left)
            right = cudf.DataFrame.from_pandas(right)

        # The two inputs deliberately get different partition counts; the
        # right side never goes below one partition.
        if n_workers > 1:
            right_partitions = n_workers - 1
        else:
            right_partitions = 1

        left_ddf = dd.from_pandas(left, npartitions=n_workers + 1)
        right_ddf = dd.from_pandas(right, npartitions=right_partitions)

        merged = dataframe_merge(left_ddf, right_ddf, on="key")
        got = merged.set_index("key").compute()

        if use_cudf:
            assert_eq(got, expected)
        else:
            pd.testing.assert_frame_equal(got, expected)
Exemple #2
0
def _test_dataframe_merge_empty_partitions(nrows, npartitions):
    """Check ``dataframe_merge`` when both inputs use ``npartitions`` partitions.

    Runs on a TCP ``LocalCluster`` with ``npartitions`` single-threaded worker
    processes. Both inputs have ``nrows`` rows; the right-hand keys are a
    shuffled copy of the left-hand keys. The computed result must equal the
    pandas merge of the same frames, indexed by "key".

    NOTE(review): the name suggests callers pick ``nrows`` / ``npartitions``
    so that some partitions end up empty -- confirm against the callers.

    Parameters
    ----------
    nrows
        Number of rows in each input frame.
    npartitions
        Partition count for both dask frames and the number of workers.
    """
    cluster_options = dict(
        protocol="tcp",
        dashboard_address=None,
        n_workers=npartitions,
        threads_per_worker=1,
        processes=True,
    )
    with LocalCluster(**cluster_options) as cluster, Client(cluster):
        left = pd.DataFrame(
            {"key": np.arange(nrows), "payload1": np.arange(nrows)}
        )

        # Same key set as the left side, but in random order.
        shuffled_keys = np.arange(nrows)
        np.random.shuffle(shuffled_keys)
        right = pd.DataFrame(
            {"key": shuffled_keys, "payload2": np.arange(nrows)}
        )

        # Reference result from plain pandas.
        expected = left.merge(right).set_index("key")

        left_ddf = dd.from_pandas(left, npartitions=npartitions)
        right_ddf = dd.from_pandas(right, npartitions=npartitions)

        merged = dataframe_merge(left_ddf, right_ddf, on="key")
        got = merged.set_index("key").compute()

        pd.testing.assert_frame_equal(got, expected)
Exemple #3
0
def _test_dataframe_merge(backend, protocol, n_workers):
    """Compare ``dataframe_merge`` on "key" with a plain pandas merge.

    A ``LocalCluster`` of ``n_workers`` single-threaded worker processes is
    started with the given ``protocol``. The merged dask frame is indexed by
    "key", computed, and compared with ``df1.merge(df2).set_index("key")``
    using ``pd.testing.assert_frame_equal``.

    Parameters
    ----------
    backend
        ``"cudf"`` converts both inputs to cuDF frames and converts the result
        back to pandas before comparing; any other value stays in pandas.
    protocol
        Forwarded to ``LocalCluster(protocol=...)``.
    n_workers
        Number of worker processes. Also fixes the row count
        (``n_workers * 10``) and the partition counts of both inputs.
    """
    use_cudf = backend == "cudf"

    cluster_options = dict(
        protocol=protocol,
        dashboard_address=None,
        n_workers=n_workers,
        threads_per_worker=1,
        processes=True,
    )
    with LocalCluster(**cluster_options) as cluster, Client(cluster):
        row_count = n_workers * 10
        first_kept = row_count // 3

        # Left side: every key 0..row_count-1, in order.
        left = pd.DataFrame(
            {"key": np.arange(row_count), "payload1": np.arange(row_count)}
        )

        # Right side: shuffled keys with the first third dropped, so only a
        # subset of the left keys finds a partner.
        shuffled_keys = np.arange(row_count)
        np.random.shuffle(shuffled_keys)
        right = pd.DataFrame(
            {
                "key": shuffled_keys[first_kept:],
                "payload2": np.arange(row_count)[first_kept:],
            }
        )

        # Reference result, computed before any backend conversion.
        expected = left.merge(right).set_index("key")

        if use_cudf:
            # NOTE(review): `cudf` is not bound inside this function, so it
            # has to come from module scope -- confirm the import exists.
            left = cudf.DataFrame.from_pandas(left)
            right = cudf.DataFrame.from_pandas(right)

        # The two inputs deliberately get different partition counts; the
        # right side never goes below one partition.
        if n_workers > 1:
            right_partitions = n_workers - 1
        else:
            right_partitions = 1

        left_ddf = dd.from_pandas(left, npartitions=n_workers + 1)
        right_ddf = dd.from_pandas(right, npartitions=right_partitions)

        merged = dataframe_merge(left_ddf, right_ddf, on="key")
        got = merged.set_index("key").compute()

        if use_cudf:
            # Bring the result back to pandas so one assertion covers both
            # backends.
            got = got.to_pandas()
            got.index.names = ["key"]  # TODO: this shouldn't be needed

        pd.testing.assert_frame_equal(got, expected)
Exemple #4
0
def merge_explicit_comms(args, ddf1, ddf2):
    """Time one ``explicit_comms.dataframe_merge`` of two frames on "key".

    The merge result is persisted and waited on, so the measurement spans
    the call through to ``wait`` returning.

    Parameters
    ----------
    args
        Accepted but not read by this function.
    ddf1, ddf2
        The two frames handed to ``explicit_comms.dataframe_merge``.

    Returns
    -------
    The difference between two ``clock()`` readings taken around the merge
    (units are whatever ``clock`` reports).
    """
    started = clock()
    merged = explicit_comms.dataframe_merge(ddf1, ddf2, on="key")
    wait(merged.persist())
    return clock() - started