Beispiel #1
0
def test_from_edgelist(dask_client):
    """
    Build a DiGraph from the karate dataset two ways - through
    cugraph.from_edgelist() and through DiGraph.from_dask_cudf_edgelist() -
    and assert that the EdgeList attributes of both graphs compare equal.
    """
    # NOTE(review): this relies on `EdgeList ==` being a meaningful
    # comparison - confirm it is not a trivially-true identity check.
    karate_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={karate_path}")

    read_opts = {
        "chunksize": dcg.get_chunksize(karate_path),
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }
    edges_ddf = dask_cudf.read_csv(karate_path, **read_opts)

    # Same column mapping for both construction paths.
    edge_cols = {"source": "src", "destination": "dst", "edge_attr": "value"}

    via_factory = cugraph.from_edgelist(
        edges_ddf, create_using=cugraph.DiGraph, **edge_cols
    )

    via_method = cugraph.DiGraph()
    via_method.from_dask_cudf_edgelist(edges_ddf, **edge_cols)

    assert via_factory.EdgeList == via_method.EdgeList
Beispiel #2
0
def test_compute_local_data(client_connection):
    """
    Call DiGraph.compute_local_data(by='dst') on the karate graph and check
    that the recorded key is 'dst', that Comms is initialized, and that the
    'edges' / 'verts' entries sum to the graph's edge and node counts.
    """
    gc.collect()

    karate_path = r"../datasets/karate.csv"
    edges_ddf = dask_cudf.read_csv(
        karate_path,
        chunksize=dcg.get_chunksize(karate_path),
        delimiter=' ',
        names=['src', 'dst', 'value'],
        dtype=['int32', 'int32', 'float32'],
    )

    graph = cugraph.DiGraph()
    graph.from_dask_cudf_edgelist(
        edges_ddf, source='src', destination='dst', edge_attr='value'
    )

    # Populate graph.local_data, then pull out both recorded entries.
    graph.compute_local_data(by='dst')
    part_info = graph.local_data['data']
    partition_key = graph.local_data['by']

    assert partition_key == 'dst'
    assert Comms.is_initialized()

    edge_total = part_info.local_data['edges'].sum()
    assert edge_total == graph.number_of_edges()
    vert_total = part_info.local_data['verts'].sum()
    assert vert_total == graph.number_of_nodes()
def test_from_edgelist(client_connection):
    """
    Build a DiGraph from the karate dataset through cugraph.from_edgelist()
    and through DiGraph.from_dask_cudf_edgelist(), then assert that the
    EdgeList attributes of both graphs compare equal.
    """
    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    # NOTE(review): this relies on `EdgeList ==` being a meaningful
    # comparison - confirm it is not a trivially-true identity check.
    karate_path = r"../datasets/karate.csv"
    print(f"dataset={karate_path}")

    read_opts = {
        "chunksize": dcg.get_chunksize(karate_path),
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }
    edges_ddf = dask_cudf.read_csv(karate_path, **read_opts)

    # Same column mapping for both construction paths.
    edge_cols = {"source": "src", "destination": "dst", "edge_attr": "value"}

    via_factory = cugraph.from_edgelist(
        edges_ddf, create_using=cugraph.DiGraph, **edge_cols
    )

    via_method = cugraph.DiGraph()
    via_method.from_dask_cudf_edgelist(edges_ddf, **edge_cols)

    assert via_factory.EdgeList == via_method.EdgeList
Beispiel #4
0
def test_compute_local_data(client_connection):
    """
    Call DiGraph.compute_local_data(by="dst") on the karate graph and check
    that the recorded key is "dst", that Comms is initialized, and that the
    "edges" / "verts" entries sum to the graph's edge and node counts.
    """
    gc.collect()

    karate_path = r"../datasets/karate.csv"
    read_opts = {
        "chunksize": dcg.get_chunksize(karate_path),
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }
    edges_ddf = dask_cudf.read_csv(karate_path, **read_opts)

    graph = cugraph.DiGraph()
    graph.from_dask_cudf_edgelist(
        edges_ddf, source="src", destination="dst", edge_attr="value"
    )

    # Populate graph.local_data, then pull out both recorded entries.
    graph.compute_local_data(by="dst")
    part_info = graph.local_data["data"]
    partition_key = graph.local_data["by"]

    assert partition_key == "dst"
    assert Comms.is_initialized()

    edge_total = part_info.local_data["edges"].sum()
    assert edge_total == graph.number_of_edges()
    vert_total = part_info.local_data["verts"].sum()
    assert vert_total == graph.number_of_nodes()
Beispiel #5
0
def test_consolidation(graph_file):
    """
    Compare a NetworkX DiGraph and a cugraph DiGraph built from graph_file.

    A LocalCUDACluster and a dask Client are created for the duration of the
    test. Both are registered with an ExitStack so they are closed (client
    first, then cluster) even when graph construction or the assertion
    fails; previously a failure left them running.

    Parameters
    ----------
    graph_file
        Path of the space-delimited edge-list CSV file to load.
    """
    from contextlib import ExitStack

    gc.collect()

    with ExitStack() as cleanup:
        # Callbacks run in reverse registration order on exit:
        # client.close() first, then cluster.close().
        cluster = LocalCUDACluster()
        cleanup.callback(cluster.close)
        client = Client(cluster)
        cleanup.callback(client.close)

        chunksize = dcg.get_chunksize(graph_file)

        M = utils.read_csv_for_nx(graph_file)

        # Host-side edge list used to build the NetworkX reference graph.
        df = pd.DataFrame()
        df['source'] = pd.Series(M['0'])
        df['target'] = pd.Series(M['1'])

        ddf = dask_cudf.read_csv(graph_file,
                                 chunksize=chunksize,
                                 delimiter=' ',
                                 names=['source', 'target', 'weight'],
                                 dtype=['int32', 'int32', 'float32'],
                                 header=None)

        Gnx = nx.from_pandas_edgelist(df,
                                      source='source',
                                      target='target',
                                      create_using=nx.DiGraph)
        G = cugraph.from_cudf_edgelist(ddf,
                                       source='source',
                                       destination='target',
                                       create_using=cugraph.DiGraph)

        assert compare_graphs(Gnx, G)
        Gnx.clear()
        G.clear()
def test_dask_katz_centrality(client_connection):
    """
    Compare dcg.katz_centrality() on the karate graph against NetworkX's
    katz_centrality() computed with the same alpha.

    alpha is 1 / (largest out-degree + 1), taken from the single-GPU graph.
    The test fails if any vertex differs by more than 1.0e-05 * 1.1.
    """
    gc.collect()

    karate_path = r"../datasets/karate.csv"
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        karate_path, chunksize=dcg.get_chunksize(karate_path), **read_opts
    )
    edges_df = cudf.read_csv(karate_path, **read_opts)

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst")

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")

    top_row = sg_graph.degrees().nlargest(n=1, columns="out_degree")
    katz_alpha = 1 / (top_row["out_degree"].iloc[0] + 1)

    actual = dcg.katz_centrality(
        mg_graph, alpha=katz_alpha, tol=1e-6
    ).compute()

    # Reference result from NetworkX, converted to a cudf DataFrame.
    import networkx as nx
    from cugraph.tests import utils
    nx_edges = utils.read_csv_for_nx(karate_path)
    nx_graph = nx.from_pandas_edgelist(
        nx_edges, create_using=nx.DiGraph(), source="0", target="1"
    )
    nx_scores = nx.katz_centrality(nx_graph, alpha=katz_alpha)
    import pandas as pd
    expected = cudf.DataFrame(
        pd.DataFrame(nx_scores.items(), columns=['vertex', 'katz_centrality'])
    )
    tol = 1.0e-05

    merged = expected.merge(
        actual, on="vertex", suffixes=["_local", "_dask"]
    )

    mismatches = sum(
        1
        for row in range(len(merged))
        if abs(merged["katz_centrality_local"].iloc[row]
               - merged["katz_centrality_dask"].iloc[row]) > tol * 1.1
    )
    assert mismatches == 0
Beispiel #7
0
def test_dask_bfs(dask_client):
    """
    Run BFS from start vertices [0, 1000] on the netscience edge list
    concatenated with a copy whose vertex ids are shifted by 1000, and
    assert that dcg.bfs() and cugraph.bfs() report the same distances.
    """
    gc.collect()

    dataset = (RAPIDS_DATASET_ROOT_DIR_PATH /
               "netscience.csv").as_posix()

    print(f"dataset={dataset}")
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        dataset, chunksize=dcg.get_chunksize(dataset), **read_opts
    )

    def modify_dataset(df):
        # Append a copy of the edges with src/dst offset by 1000.
        shifted = cudf.DataFrame()
        shifted['src'] = df['src'] + 1000
        shifted['dst'] = df['dst'] + 1000
        shifted['value'] = df['value']
        return cudf.concat([df, shifted])

    edges_ddf = edges_ddf.map_partitions(modify_dataset, meta=edges_ddf._meta)

    edges_df = modify_dataset(cudf.read_csv(dataset, **read_opts))

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst")

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")

    expected = cugraph.bfs(sg_graph, [0, 1000])
    actual = dcg.bfs(mg_graph, [0, 1000]).compute()

    merged = expected.merge(actual,
                            on="vertex",
                            suffixes=["_local", "_dask"])

    mismatches = sum(
        1
        for row in range(len(merged))
        if (merged["distance_local"].iloc[row] !=
            merged["distance_dask"].iloc[row])
    )
    assert mismatches == 0
Beispiel #8
0
def test_dask_pagerank(client_connection, personalization_perc):
    """
    Compare dcg.pagerank() with cugraph.pagerank() on the karate graph.

    When personalization_perc is non-zero, local data is computed by "dst"
    and a personalization value is built with personalize() and passed to
    both calls. The test fails on a length mismatch or if any vertex
    differs by more than 1.0e-05 * 1.1.
    """
    gc.collect()

    karate_path = r"../datasets/karate.csv"
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        karate_path, chunksize=dcg.get_chunksize(karate_path), **read_opts
    )
    edges_df = cudf.read_csv(karate_path, **read_opts)

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst")

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")

    # Pre compute local data and personalize
    personalization = None
    if personalization_perc != 0:
        mg_graph.compute_local_data(by="dst")
        personalization = personalize(mg_graph.number_of_vertices(),
                                      personalization_perc)

    pr_args = {"personalization": personalization, "tol": 1e-6}
    expected = cugraph.pagerank(sg_graph, **pr_args)
    actual = dcg.pagerank(mg_graph, **pr_args).compute()

    tol = 1.0e-05

    assert len(expected) == len(actual)

    merged = expected.merge(actual,
                            on="vertex",
                            suffixes=["_local", "_dask"])

    mismatches = sum(
        1
        for row in range(len(merged))
        if abs(merged["pagerank_local"].iloc[row] -
               merged["pagerank_dask"].iloc[row]) > tol * 1.1
    )
    assert mismatches == 0
Beispiel #9
0
def test_dask_pagerank(client_connection, personalization_perc):
    """
    Compare dcg.pagerank() with cugraph.pagerank() on the karate graph.

    When personalization_perc is non-zero, personalize() is called with the
    single-GPU graph's nodes and the first element of its result is passed
    as the personalization to both calls. The test fails on a length
    mismatch or if any vertex differs by more than 1.0e-05 * 1.1.
    """
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    karate_path = r"../datasets/karate.csv"
    print(f"dataset={karate_path}")
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        karate_path, chunksize=dcg.get_chunksize(karate_path), **read_opts
    )
    edges_df = cudf.read_csv(karate_path, **read_opts)

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst")

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, _ = personalize(sg_graph.nodes(),
                                         personalization_perc)

    pr_args = {"personalization": personalization, "tol": 1e-6}
    expected = cugraph.pagerank(sg_graph, **pr_args)
    actual = dcg.pagerank(mg_graph, **pr_args).compute()

    tol = 1.0e-05

    assert len(expected) == len(actual)

    merged = expected.merge(actual,
                            on="vertex",
                            suffixes=["_local", "_dask"])

    mismatches = sum(
        1
        for row in range(len(merged))
        if abs(merged["pagerank_local"].iloc[row] -
               merged["pagerank_dask"].iloc[row]) > tol * 1.1
    )
    assert mismatches == 0
Beispiel #10
0
def test_dask_bfs_multi_column_depthlimit(dask_client):
    """
    Run depth-limited BFS (depth_limit=18) on a netscience graph whose
    vertices are identified by two columns, and compare dcg.bfs() with
    cugraph.bfs().

    A row counts as a mismatch only when both distances are <= depth_limit
    and they differ.
    """
    gc.collect()

    dataset = (RAPIDS_DATASET_ROOT_DIR_PATH /
               "netscience.csv").as_posix()
    print(f"dataset={dataset}")
    read_opts = {
        "delimiter": " ",
        "names": ["src_a", "dst_a", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        dataset, chunksize=dcg.get_chunksize(dataset), **read_opts
    )
    # Second vertex-id column: the first one offset by 1000.
    edges_ddf['src_b'] = edges_ddf['src_a'] + 1000
    edges_ddf['dst_b'] = edges_ddf['dst_a'] + 1000

    edges_df = cudf.read_csv(dataset, **read_opts)
    edges_df['src_b'] = edges_df['src_a'] + 1000
    edges_df['dst_b'] = edges_df['dst_a'] + 1000

    src_cols = ["src_a", "src_b"]
    dst_cols = ["dst_a", "dst_b"]

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, src_cols, dst_cols)

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, src_cols, dst_cols)

    start = cudf.DataFrame()
    start['a'] = [0]
    start['b'] = [1000]

    depth_limit = 18
    expected = cugraph.bfs(sg_graph, start, depth_limit=depth_limit)
    actual = dcg.bfs(mg_graph, start, depth_limit=depth_limit).compute()

    merged = expected.merge(actual,
                            on=["0_vertex", "1_vertex"],
                            suffixes=["_local", "_dask"])

    mismatches = sum(
        1
        for row in range(len(merged))
        if (merged["distance_local"].iloc[row] <= depth_limit
            and merged["distance_dask"].iloc[row] <= depth_limit
            and merged["distance_local"].iloc[row] !=
            merged["distance_dask"].iloc[row])
    )
    assert mismatches == 0
Beispiel #11
0
def test_dask_pagerank(dask_client, personalization_perc):
    """
    Compare dcg.pagerank() with cugraph.pagerank() on the karate graph.

    When personalization_perc is non-zero, personalize() is called with the
    single-GPU graph's nodes and the first element of its result is passed
    as the personalization to both calls. The test fails on a length
    mismatch or if any vertex differs by more than 1.0e-05 * 1.1.
    """
    gc.collect()

    karate_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={karate_path}")
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        karate_path, chunksize=dcg.get_chunksize(karate_path), **read_opts
    )
    edges_df = cudf.read_csv(karate_path, **read_opts)

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst")

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, _ = personalize(sg_graph.nodes(),
                                         personalization_perc)

    pr_args = {"personalization": personalization, "tol": 1e-6}
    expected = cugraph.pagerank(sg_graph, **pr_args)
    actual = dcg.pagerank(mg_graph, **pr_args).compute()

    tol = 1.0e-05

    assert len(expected) == len(actual)

    merged = expected.merge(actual,
                            on="vertex",
                            suffixes=["_local", "_dask"])

    mismatches = sum(
        1
        for row in range(len(merged))
        if abs(merged["pagerank_local"].iloc[row] -
               merged["pagerank_dask"].iloc[row]) > tol * 1.1
    )
    assert mismatches == 0
Beispiel #12
0
def test_dask_pagerank(client_connection):
    """
    Compare dcg.pagerank() with cugraph.pagerank() on the karate graph using
    default arguments, print the number of mismatching vertices, and fail on
    a length mismatch or any difference above 1.0e-05 * 1.1.
    """
    gc.collect()

    pandas.set_option("display.max_rows", 10000)

    karate_path = r"../datasets/karate.csv"
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        karate_path, chunksize=dcg.get_chunksize(karate_path), **read_opts
    )
    edges_df = cudf.read_csv(karate_path, **read_opts)

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst")

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")

    # NOTE: the local-data pre-computation step
    # (dg.compute_local_data(by='dst')) is disabled here.

    expected = cugraph.pagerank(sg_graph)
    actual = dcg.pagerank(mg_graph)

    tol = 1.0e-05

    assert len(expected) == len(actual)

    merged = expected.merge(actual,
                            on="vertex",
                            suffixes=["_local", "_dask"])

    mismatches = sum(
        1
        for row in range(len(merged))
        if abs(merged["pagerank_local"].iloc[row] -
               merged["pagerank_dask"].iloc[row]) > tol * 1.1
    )
    print("Mismatches:", mismatches)
    assert mismatches == 0
Beispiel #13
0
def test_directed_graph_renumber_false(renumber, dask_client):
    """
    Expect from_dask_cudf_edgelist() on a directed cugraph.Graph to raise
    ValueError when called with the parametrized ``renumber`` value.
    """
    karate_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()

    read_opts = {
        "chunksize": dcg.get_chunksize(karate_path),
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }
    edges_ddf = dask_cudf.read_csv(karate_path, **read_opts)
    directed_graph = cugraph.Graph(directed=True)

    with pytest.raises(ValueError):
        directed_graph.from_dask_cudf_edgelist(
            edges_ddf, "src", "dst", renumber=renumber
        )
Beispiel #14
0
def test_dask_sssp(dask_client):
    """
    Run single-source shortest path from vertex 0 on the weighted netscience
    graph and assert that dcg.sssp() and cugraph.sssp() report the same
    distance for every vertex.
    """
    gc.collect()

    dataset = (RAPIDS_DATASET_ROOT_DIR_PATH /
               "netscience.csv").as_posix()
    print(f"dataset={dataset}")
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        dataset, chunksize=dcg.get_chunksize(dataset), **read_opts
    )
    edges_df = cudf.read_csv(dataset, **read_opts)

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst", "value", renumber=True)

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst", "value")

    expected = cugraph.sssp(sg_graph, 0)
    print(expected)
    actual = dcg.sssp(mg_graph, 0).compute()

    merged = expected.merge(
        actual, on="vertex", suffixes=["_local", "_dask"]
    )

    mismatches = sum(
        1
        for row in range(len(merged))
        if (merged["distance_local"].iloc[row]
            != merged["distance_dask"].iloc[row])
    )
    assert mismatches == 0
Beispiel #15
0
def test_dask_sssp(client_connection):
    """
    Run single-source shortest path from vertex 0 on the weighted netscience
    graph and assert that dcg.sssp() and cugraph.sssp() report the same
    distance for every vertex.
    """
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    dataset = r"../datasets/netscience.csv"
    print(f"dataset={dataset}")
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        dataset, chunksize=dcg.get_chunksize(dataset), **read_opts
    )
    edges_df = cudf.read_csv(dataset, **read_opts)

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst", "value", renumber=True)

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst", "value")

    expected = cugraph.sssp(sg_graph, 0)
    print(expected)
    actual = dcg.sssp(mg_graph, 0).compute()

    merged = expected.merge(actual,
                            on="vertex",
                            suffixes=["_local", "_dask"])

    mismatches = sum(
        1
        for row in range(len(merged))
        if (merged["distance_local"].iloc[row] !=
            merged["distance_dask"].iloc[row])
    )
    assert mismatches == 0
Beispiel #16
0
def test_dask_pagerank(dask_client):
    """
    Compare dcg.pagerank() with cugraph.pagerank() on the karate graph built
    as cugraph.Graph(directed=True), print the number of mismatching
    vertices, and fail on a length mismatch or any difference above
    1.0e-05 * 1.1.
    """
    pandas.set_option("display.max_rows", 10000)

    karate_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        karate_path, chunksize=dcg.get_chunksize(karate_path), **read_opts
    )
    edges_df = cudf.read_csv(karate_path, **read_opts)

    sg_graph = cugraph.Graph(directed=True)
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst")

    mg_graph = cugraph.Graph(directed=True)
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")

    expected = cugraph.pagerank(sg_graph)
    actual = dcg.pagerank(mg_graph).compute()

    tol = 1.0e-05

    assert len(expected) == len(actual)

    merged = expected.merge(actual,
                            on="vertex",
                            suffixes=["_local", "_dask"])

    mismatches = sum(
        1
        for row in range(len(merged))
        if abs(merged["pagerank_local"].iloc[row] -
               merged["pagerank_dask"].iloc[row]) > tol * 1.1
    )
    print("Mismatches:", mismatches)
    assert mismatches == 0
Beispiel #17
0
def test_dask_wcc(client_connection):
    """
    Run weakly connected components on the netscience graph with both
    cugraph and dcg, and assert that every group of vertices sharing a
    single-GPU label also shares one multi-GPU label.
    """
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    dataset = r"../datasets/netscience.csv"
    print(f"dataset={dataset}")
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        dataset, chunksize=dcg.get_chunksize(dataset), **read_opts
    )
    edges_df = cudf.read_csv(dataset, **read_opts)

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst", renumber=True)

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")

    expected = cugraph.weakly_connected_components(sg_graph)
    actual = dcg.weakly_connected_components(mg_graph).compute()

    merged = expected.merge(actual,
                            on="vertex",
                            suffixes=["_local", "_dask"])

    local_labels = merged['labels_local'].unique()

    # Label values themselves may differ; only the grouping is compared.
    for label in local_labels.values.tolist():
        component = merged[merged['labels_local'] == label]
        mg_labels = component['labels_dask']
        assert (mg_labels.iloc[0] == mg_labels).all()
Beispiel #18
0
def test_dask_bfs():
    """
    Compare dcg.bfs() with cugraph.bfs() from vertex 0 on the netscience
    graph, using a LocalCUDACluster / Client / Comms set up by the test.

    The cluster, client and comms are registered with an ExitStack so that
    Comms.destroy(), client.close() and cluster.close() run (in that order)
    even when graph construction or an assertion fails; previously a
    failure skipped all three and leaked the cluster.
    """
    from contextlib import ExitStack

    gc.collect()

    with ExitStack() as cleanup:
        # Callbacks run in reverse registration order on exit, matching the
        # original teardown sequence: Comms, then client, then cluster.
        cluster = LocalCUDACluster()
        cleanup.callback(cluster.close)
        client = Client(cluster)
        cleanup.callback(client.close)
        Comms.initialize()
        cleanup.callback(Comms.destroy)

        input_data_path = r"../datasets/netscience.csv"
        chunksize = dcg.get_chunksize(input_data_path)

        ddf = dask_cudf.read_csv(input_data_path,
                                 chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])

        df = cudf.read_csv(input_data_path,
                           delimiter=' ',
                           names=['src', 'dst', 'value'],
                           dtype=['int32', 'int32', 'float32'])

        g = cugraph.DiGraph()
        g.from_cudf_edgelist(df, 'src', 'dst', renumber=True)

        dg = cugraph.DiGraph()
        dg.from_dask_cudf_edgelist(ddf, renumber=True)

        expected_dist = cugraph.bfs(g, 0)
        result_dist = dcg.bfs(dg, 0, True)

        compare_dist = expected_dist.merge(result_dist,
                                           on="vertex",
                                           suffixes=['_local', '_dask'])

        # Count vertices whose two distances disagree.
        err = sum(
            1
            for i in range(len(compare_dist))
            if (compare_dist['distance_local'].iloc[i] !=
                compare_dist['distance_dask'].iloc[i])
        )
        assert err == 0
Beispiel #19
0
def daskGraphFromDataset(request, client_connection):
    """
    Returns a new cugraph.DiGraph built from a dask_cudf edge list read from
    the dataset file param (request.param).
    """
    # Parameterized fixtures do not associate param names with param values
    # on their own, so the helper is invoked explicitly to do it.
    setFixtureParamNames(request, ["dataset"])
    dataset_path = request.param

    edges_ddf = dask_cudf.read_csv(
        dataset_path,
        chunksize=dcg.get_chunksize(dataset_path),
        delimiter=' ',
        names=['src', 'dst', 'value'],
        dtype=['int32', 'int32', 'float32'],
    )

    graph = cugraph.DiGraph()
    graph.from_dask_cudf_edgelist(edges_ddf, 'src', 'dst')
    return graph
Beispiel #20
0
def test_dask_wcc(dask_client):
    """
    Run weakly connected components on the netscience graph with both
    cugraph and dcg, and assert that every group of vertices sharing a
    single-GPU label also shares one multi-GPU label.
    """
    gc.collect()

    dataset = (RAPIDS_DATASET_ROOT_DIR_PATH /
               "netscience.csv").as_posix()
    print(f"dataset={dataset}")
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        dataset, chunksize=dcg.get_chunksize(dataset), **read_opts
    )
    edges_df = cudf.read_csv(dataset, **read_opts)

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst", renumber=True)

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")

    expected = cugraph.weakly_connected_components(sg_graph)
    actual = dcg.weakly_connected_components(mg_graph).compute()

    merged = expected.merge(actual,
                            on="vertex",
                            suffixes=["_local", "_dask"])

    local_labels = merged['labels_local'].unique()

    # Label values themselves may differ; only the grouping is compared.
    for label in local_labels.values.tolist():
        component = merged[merged['labels_local'] == label]
        mg_labels = component['labels_dask']
        assert (mg_labels.iloc[0] == mg_labels).all()
Beispiel #21
0
def test_dask_bfs(client_connection):
    """
    Run BFS from vertex 0 on the netscience graph and assert that
    dcg.bfs() and cugraph.bfs() report the same distance for every vertex.
    """
    gc.collect()

    dataset = r"../datasets/netscience.csv"
    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    edges_ddf = dask_cudf.read_csv(
        dataset, chunksize=dcg.get_chunksize(dataset), **read_opts
    )
    edges_df = cudf.read_csv(dataset, **read_opts)

    sg_graph = cugraph.DiGraph()
    sg_graph.from_cudf_edgelist(edges_df, "src", "dst", renumber=True)

    mg_graph = cugraph.DiGraph()
    mg_graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")

    expected = cugraph.bfs(sg_graph, 0)
    actual = dcg.bfs(mg_graph, 0, True).compute()

    merged = expected.merge(actual,
                            on="vertex",
                            suffixes=["_local", "_dask"])

    mismatches = sum(
        1
        for row in range(len(merged))
        if (merged["distance_local"].iloc[row] !=
            merged["distance_dask"].iloc[row])
    )
    assert mismatches == 0
Beispiel #22
0
def test_from_edgelist(client_connection):
    """
    Build a DiGraph from the karate dataset through cugraph.from_edgelist()
    and through DiGraph.from_dask_cudf_edgelist(), then assert that the
    EdgeList attributes of both graphs compare equal.
    """
    # NOTE(review): this relies on `EdgeList ==` being a meaningful
    # comparison - confirm it is not a trivially-true identity check.
    karate_path = r"../datasets/karate.csv"

    read_opts = {
        "chunksize": dcg.get_chunksize(karate_path),
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }
    edges_ddf = dask_cudf.read_csv(karate_path, **read_opts)

    # Same column mapping for both construction paths.
    edge_cols = {"source": "src", "destination": "dst", "edge_attr": "value"}

    via_factory = cugraph.from_edgelist(
        edges_ddf, create_using=cugraph.DiGraph, **edge_cols
    )

    via_method = cugraph.DiGraph()
    via_method.from_dask_cudf_edgelist(edges_ddf, **edge_cols)

    assert via_factory.EdgeList == via_method.EdgeList
Beispiel #23
0
def daskGraphFromDataset(request, dask_client):
    """
    Returns a new cugraph.DiGraph built from a dask_cudf edge list read from
    the dataset file param (request.param).
    """
    # Parameterized fixtures do not associate param names with param values
    # on their own, so the helper is invoked explicitly to do it.
    setFixtureParamNames(request, ["dataset"])
    dataset_path = request.param

    read_opts = {
        "chunksize": dcg.get_chunksize(dataset_path),
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }
    edges_ddf = dask_cudf.read_csv(dataset_path, **read_opts)

    graph = cugraph.DiGraph()
    graph.from_dask_cudf_edgelist(edges_ddf, "src", "dst")
    return graph
Beispiel #24
0
def test_dask_pagerank(client_connection):
    """
    Run dcg.pagerank() on two distributed graphs (karate and dolphins) with
    the same communicator and compare each result with single-GPU
    cugraph.pagerank().

    Prints the mismatch count per dataset and fails on a length mismatch or
    any difference above 1.0e-05 * 1.1.
    """
    gc.collect()

    tol = 1.0e-05

    def count_mismatches(merged):
        # Number of rows whose local/dask pagerank differ beyond tolerance.
        return sum(
            1
            for row in range(len(merged))
            if abs(merged['pagerank_local'].iloc[row] -
                   merged['pagerank_dask'].iloc[row]) > tol * 1.1
        )

    read_opts = {
        'delimiter': ' ',
        'names': ['src', 'dst', 'value'],
        'dtype': ['int32', 'int32', 'float32'],
    }

    input_data_path1 = r"../datasets/karate.csv"
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    chunksize2 = dcg.get_chunksize(input_data_path2)

    # Multi-GPU pagerank on both graphs.
    mg_graph1 = cugraph.DiGraph()
    mg_graph1.from_dask_cudf_edgelist(
        dask_cudf.read_csv(input_data_path1, chunksize=chunksize1,
                           **read_opts),
        'src', 'dst')
    actual1 = dcg.pagerank(mg_graph1)

    mg_graph2 = cugraph.DiGraph()
    mg_graph2.from_dask_cudf_edgelist(
        dask_cudf.read_csv(input_data_path2, chunksize=chunksize2,
                           **read_opts),
        'src', 'dst')
    actual2 = dcg.pagerank(mg_graph2)

    # Calculate single GPU pagerank for verification of results
    sg_graph1 = cugraph.DiGraph()
    sg_graph1.from_cudf_edgelist(
        cudf.read_csv(input_data_path1, **read_opts), 'src', 'dst')
    expected1 = cugraph.pagerank(sg_graph1)

    sg_graph2 = cugraph.DiGraph()
    sg_graph2.from_cudf_edgelist(
        cudf.read_csv(input_data_path2, **read_opts), 'src', 'dst')
    expected2 = cugraph.pagerank(sg_graph2)

    # Compare and verify pagerank results
    merged1 = expected1.merge(actual1,
                              on="vertex",
                              suffixes=['_local', '_dask'])

    assert len(expected1) == len(actual1)

    err1 = count_mismatches(merged1)
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected2) == len(actual2)

    merged2 = expected2.merge(actual2,
                              on="vertex",
                              suffixes=['_local', '_dask'])

    err2 = count_mismatches(merged2)
    print("Mismatches in ", input_data_path2, ": ", err2)
    assert err1 == err2 == 0
Beispiel #25
0
def test_dask_pagerank(client_connection):
    """
    Run dcg.pagerank() on two distributed graphs (karate and dolphins) with
    the same communicator and compare each computed result with single-GPU
    cugraph.pagerank().

    Prints the mismatch count per dataset and fails on a length mismatch or
    any difference above 1.0e-05 * 1.1.
    """
    gc.collect()

    tol = 1.0e-05

    def count_mismatches(merged):
        # Number of rows whose local/dask pagerank differ beyond tolerance.
        return sum(
            1
            for row in range(len(merged))
            if abs(merged["pagerank_local"].iloc[row] -
                   merged["pagerank_dask"].iloc[row]) > tol * 1.1
        )

    read_opts = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path1 = r"../datasets/karate.csv"
    print(f"dataset1={input_data_path1}")
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    print(f"dataset2={input_data_path2}")
    chunksize2 = dcg.get_chunksize(input_data_path2)

    # Multi-GPU pagerank on both graphs.
    mg_graph1 = cugraph.DiGraph()
    mg_graph1.from_dask_cudf_edgelist(
        dask_cudf.read_csv(input_data_path1, chunksize=chunksize1,
                           **read_opts),
        "src", "dst")
    actual1 = dcg.pagerank(mg_graph1).compute()

    mg_graph2 = cugraph.DiGraph()
    mg_graph2.from_dask_cudf_edgelist(
        dask_cudf.read_csv(input_data_path2, chunksize=chunksize2,
                           **read_opts),
        "src", "dst")
    actual2 = dcg.pagerank(mg_graph2).compute()

    # Calculate single GPU pagerank for verification of results
    sg_graph1 = cugraph.DiGraph()
    sg_graph1.from_cudf_edgelist(
        cudf.read_csv(input_data_path1, **read_opts), "src", "dst")
    expected1 = cugraph.pagerank(sg_graph1)

    sg_graph2 = cugraph.DiGraph()
    sg_graph2.from_cudf_edgelist(
        cudf.read_csv(input_data_path2, **read_opts), "src", "dst")
    expected2 = cugraph.pagerank(sg_graph2)

    # Compare and verify pagerank results
    merged1 = expected1.merge(actual1,
                              on="vertex",
                              suffixes=["_local", "_dask"])

    assert len(expected1) == len(actual1)

    err1 = count_mismatches(merged1)
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected2) == len(actual2)

    merged2 = expected2.merge(actual2,
                              on="vertex",
                              suffixes=["_local", "_dask"])

    err2 = count_mismatches(merged2)
    print("Mismatches in ", input_data_path2, ": ", err2)
    assert err1 == err2 == 0