Example 1
def cugraph_nx_call(G, max_iter, tol, alpha, personalization, nstart):
    # cugraph Pagerank Call
    t1 = time.time()
    pr = cugraph.pagerank(
        G,
        alpha=alpha,
        max_iter=max_iter,
        tol=tol,
        personalization=personalization,
        nstart=nstart,
    )
    t2 = time.time() - t1
    print("Cugraph Time : " + str(t2))

    return pr
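
The helper above times PageRank on a prebuilt graph G. A minimal driver might look like the sketch below; it is not part of the original test, the file path and parameter values are illustrative, and it assumes the same imports (time, cudf, cugraph) that Example 1 itself relies on.

# Hypothetical driver for cugraph_nx_call (sketch only; path and parameters are illustrative).
import time
import cudf
import cugraph

cu_M = cudf.read_csv("../datasets/karate.csv", delimiter=" ",
                     names=["src", "dst", "value"],
                     dtype=["int32", "int32", "float32"])
G = cugraph.Graph()
G.from_cudf_edgelist(cu_M, source="src", destination="dst")

# No personalization and no warm start; alpha/max_iter/tol follow common defaults.
pr = cugraph_nx_call(G, max_iter=100, tol=1.0e-5, alpha=0.85,
                     personalization=None, nstart=None)
print(pr.head())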
Example 2
def test_dask_pagerank(client_connection):
    gc.collect()

    pandas.set_option("display.max_rows", 10000)

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg).compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    print("Mismatches:", err)
    assert err == 0
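
These multi-GPU tests depend on a client_connection (or, later, dask_client) pytest fixture that starts a Dask-CUDA cluster and initializes cugraph's communicator; the fixture is not shown here. One plausible shape is sketched below, hedged because the comms module path and setup details vary across cugraph releases.

# Hypothetical fixture sketch; the real test suite ships its own version.
import pytest
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from cugraph.dask.comms import comms as Comms  # older releases exposed this as cugraph.comms

@pytest.fixture(scope="module")
def client_connection():
    # Spin up one Dask worker per visible GPU and wire up cugraph's communicator.
    cluster = LocalCUDACluster()
    client = Client(cluster)
    Comms.initialize(p2p=True)
    yield client
    # Tear everything down after the module's tests finish.
    Comms.destroy()
    client.close()
    cluster.close()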
Example 3
def test_dask_pagerank(client_connection):
    gc.collect()

    pandas.set_option('display.max_rows', 10000)

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(input_data_path,
                             chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    df = cudf.read_csv(input_data_path,
                       delimiter=' ',
                       names=['src', 'dst', 'value'],
                       dtype=['int32', 'int32', 'float32'])

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, 'src', 'dst')

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')

    # Pre compute local data
    # dg.compute_local_data(by='dst')

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg)

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=['_local', '_dask'])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr['pagerank_local'].iloc[i] -
                   compare_pr['pagerank_dask'].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    print("Mismatches:", err)
    assert err == 0
Example 4
def cugraph_call(cu_M, max_iter, tol, alpha, personalization, nstart):
    # cugraph Pagerank Call
    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1')
    t1 = time.time()
    df = cugraph.pagerank(G, alpha=alpha, max_iter=max_iter, tol=tol,
                          personalization=personalization, nstart=nstart)
    t2 = time.time() - t1
    print('Cugraph Time : ' + str(t2))

    # Sort Pagerank values
    sorted_pr = []
    pr_scores = df['pagerank'].to_array()
    for i, rank in enumerate(pr_scores):
        sorted_pr.append((i, rank))

    return sorted_pr
Example 5
def test_dask_pagerank(dask_client):
    pandas.set_option("display.max_rows", 10000)

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.Graph(directed=True)
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.Graph(directed=True)
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg).compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    print("Mismatches:", err)
    assert err == 0
Example 6
def test_pagerank():
    import cugraph

    gdf = read_csv_file(csvFile)
    sources = gdf['0']
    destinations = gdf['1']

    # Assuming that data has been loaded into a cuDF (using read_csv) Dataframe
    # create a Graph using the source and destination vertex pairs
    G = cugraph.Graph()
    G.add_edge_list(sources, destinations, None)

    # Call cugraph.pagerank to get the pagerank scores
    gdf_page = cugraph.pagerank(G)

    assert len(expectedPageRanks) == len(gdf_page["pagerank"])
    for (actual, expected) in zip(gdf_page["pagerank"], expectedPageRanks):
        assert actual == pytest.approx(expected)
Example 7
def test_pagerank():
    import cugraph

    gdf = read_csv_file(csvFile)

    # Assuming that data has been loaded into a cuDF (using read_csv) Dataframe
    # create a Graph using the source and destination vertex pairs
    G = cugraph.Graph()
    G.from_cudf_edgelist(gdf, "0", "1")

    # Call cugraph.pagerank to get the pagerank scores
    # Sort values since renumbering may have changed expected order
    gdf_page = cugraph.pagerank(G)
    gdf_page = gdf_page.sort_values('vertex').reset_index(drop=True)

    assert len(expectedPageRanks) == len(gdf_page["pagerank"])
    for (actual, expected) in zip(gdf_page["pagerank"].to_pandas(),
                                  expectedPageRanks):
        assert actual == pytest.approx(expected)
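
Examples 6 and 7 assume a read_csv_file helper, a csvFile path, and precomputed expectedPageRanks, none of which are shown. A plausible sketch of the helper is given below, consistent with the "0"/"1" column names used above; the expected scores themselves would come from a stored baseline or an independent PageRank run.

# Hypothetical helper; column names match the "0"/"1" accesses above.
import cudf

def read_csv_file(csv_file):
    # Space-delimited edge list: source, destination, weight.
    return cudf.read_csv(csv_file, delimiter=" ",
                         names=["0", "1", "2"],
                         dtype=["int32", "int32", "float32"])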
Example 8
def data(df):
    net = cudf.from_pandas(df)

    net['to'] = net['to'].astype('int32')
    net['from'] = net['from'].astype('int32')

    n = net.iloc[0, 0]

    G = cugraph.Graph()
    G.add_edge_list(net['from'], net['to'], None)
    out_bfs = cugraph.bfs(G, n, directed=True)
    out_page = cugraph.pagerank(G)
    out_bfs = out_bfs.to_pandas()
    out_page = out_page.to_pandas()

    out_bfs.loc[out_bfs['distance'] < 3, 'group'] = 2
    out_bfs.loc[out_bfs['distance'] == 3, 'group'] = 0
    out_bfs.loc[out_bfs['distance'] > 3, 'group'] = 1
    out_bfs = out_bfs[['vertex', 'group']]
    return out_bfs, out_page
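
The data helper above expects a pandas edge list with from/to columns, uses the first value of its first column as the BFS root, and is written against the legacy add_edge_list / bfs(..., directed=True) API. An illustrative call is sketched below; the edge list is made up and assumes a cugraph version that still exposes that API.

# Illustrative input for data(); the edges are arbitrary.
import pandas as pd

edges = pd.DataFrame({"from": [0, 0, 1, 2, 3],
                      "to":   [1, 2, 3, 3, 4]})
groups, ranks = data(edges)   # BFS root is edges.iloc[0, 0] == 0
print(groups.head())
print(ranks.head())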
Example 9
def cugraph_Call(M, max_iter, tol, alpha):

    # Device data
    sources = cudf.Series(M.row)
    destinations = cudf.Series(M.col)
    # values = cudf.Series(np.ones(len(sources), dtype = np.float64))

    # cugraph Pagerank Call
    G = cugraph.Graph()
    G.add_edge_list(sources, destinations, None)
    t1 = time.time()
    df = cugraph.pagerank(G, alpha=alpha, max_iter=max_iter, tol=tol)
    t2 = time.time() - t1
    print('Time : ' + str(t2))

    # Sort Pagerank values
    sorted_pr = []
    for i, rank in enumerate(df['pagerank']):
        sorted_pr.append((i, rank))

    return sorted(sorted_pr, key=lambda x: x[1], reverse=True)
Example 10
def cugraph_call(G, max_iter, tol, alpha, personalization, nstart):
    # cugraph Pagerank Call
    t1 = time.time()
    df = cugraph.pagerank(
        G,
        alpha=alpha,
        max_iter=max_iter,
        tol=tol,
        personalization=personalization,
        nstart=nstart,
    )
    t2 = time.time() - t1
    print("Cugraph Time : " + str(t2))

    # Sort Pagerank values
    sorted_pr = []

    df = df.sort_values("vertex").reset_index(drop=True)

    pr_scores = df["pagerank"].to_array()
    for i, rank in enumerate(pr_scores):
        sorted_pr.append((i, rank))

    return sorted_pr
Example 11
def pagerank(G):
    return cugraph.pagerank(G)
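
This wrapper simply forwards to cugraph.pagerank, which returns a cudf DataFrame with vertex and pagerank columns (the other examples merge on exactly those names). A minimal usage sketch with a made-up three-edge graph:

# Illustrative usage of the wrapper; the edge list is arbitrary.
import cudf
import cugraph

edges = cudf.DataFrame({"src": [0, 1, 2], "dst": [1, 2, 0]})
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst")

scores = pagerank(G)
print(scores.sort_values("pagerank", ascending=False))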
Example 12
def test_dask_pagerank(client_connection):
    gc.collect()

    # Initialize and run pagerank on two distributed graphs
    # with same communicator

    input_data_path1 = r"../datasets/karate.csv"
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    chunksize2 = dcg.get_chunksize(input_data_path2)

    ddf1 = dask_cudf.read_csv(input_data_path1,
                              chunksize=chunksize1,
                              delimiter=' ',
                              names=['src', 'dst', 'value'],
                              dtype=['int32', 'int32', 'float32'])

    dg1 = cugraph.DiGraph()
    dg1.from_dask_cudf_edgelist(ddf1, 'src', 'dst')

    result_pr1 = dcg.pagerank(dg1)

    ddf2 = dask_cudf.read_csv(input_data_path2,
                              chunksize=chunksize2,
                              delimiter=' ',
                              names=['src', 'dst', 'value'],
                              dtype=['int32', 'int32', 'float32'])

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf2, 'src', 'dst')

    result_pr2 = dcg.pagerank(dg2)

    # Calculate single GPU pagerank for verification of results
    df1 = cudf.read_csv(input_data_path1,
                        delimiter=' ',
                        names=['src', 'dst', 'value'],
                        dtype=['int32', 'int32', 'float32'])

    g1 = cugraph.DiGraph()
    g1.from_cudf_edgelist(df1, 'src', 'dst')
    expected_pr1 = cugraph.pagerank(g1)

    df2 = cudf.read_csv(input_data_path2,
                        delimiter=' ',
                        names=['src', 'dst', 'value'],
                        dtype=['int32', 'int32', 'float32'])

    g2 = cugraph.DiGraph()
    g2.from_cudf_edgelist(df2, 'src', 'dst')
    expected_pr2 = cugraph.pagerank(g2)

    # Compare and verify pagerank results

    err1 = 0
    err2 = 0
    tol = 1.0e-05

    compare_pr1 = expected_pr1.merge(result_pr1,
                                     on="vertex",
                                     suffixes=['_local', '_dask'])

    assert len(expected_pr1) == len(result_pr1)

    for i in range(len(compare_pr1)):
        diff = abs(compare_pr1['pagerank_local'].iloc[i] -
                   compare_pr1['pagerank_dask'].iloc[i])
        if diff > tol * 1.1:
            err1 = err1 + 1
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected_pr2) == len(result_pr2)

    compare_pr2 = expected_pr2.merge(result_pr2,
                                     on="vertex",
                                     suffixes=['_local', '_dask'])

    for i in range(len(compare_pr2)):
        diff = abs(compare_pr2['pagerank_local'].iloc[i] -
                   compare_pr2['pagerank_dask'].iloc[i])
        if diff > tol * 1.1:
            err2 = err2 + 1
    print("Mismatches in ", input_data_path2, ": ", err2)
    assert err1 == err2 == 0
Example 13
def cugraph_pagerank(graph: CuGraph, damping: float, maxiter: int,
                     tolerance: float) -> CuDFNodeMap:
    pagerank = cugraph.pagerank(
        graph.value, alpha=damping, max_iter=maxiter,
        tol=tolerance).set_index("vertex")["pagerank"]
    return CuDFNodeMap(pagerank)
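
The CuGraph and CuDFNodeMap types here come from a plugin wrapper, not from cugraph itself. Without those wrappers, the same vertex-indexed Series can be produced directly; a sketch assuming a plain cugraph.Graph named G and illustrative parameter values:

# Plain-cudf equivalent of the wrapper body above (sketch only).
pr_series = cugraph.pagerank(G, alpha=0.85, max_iter=100,
                             tol=1.0e-6).set_index("vertex")["pagerank"]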
Example 14
def test_dask_pagerank(client_connection):
    gc.collect()

    # Initialize and run pagerank on two distributed graphs
    # with same communicator

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path1 = r"../datasets/karate.csv"
    print(f"dataset1={input_data_path1}")
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    print(f"dataset2={input_data_path2}")
    chunksize2 = dcg.get_chunksize(input_data_path2)

    ddf1 = dask_cudf.read_csv(
        input_data_path1,
        chunksize=chunksize1,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.DiGraph()
    dg1.from_dask_cudf_edgelist(ddf1, "src", "dst")

    result_pr1 = dcg.pagerank(dg1).compute()

    ddf2 = dask_cudf.read_csv(
        input_data_path2,
        chunksize=chunksize2,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf2, "src", "dst")

    result_pr2 = dcg.pagerank(dg2).compute()

    # Calculate single GPU pagerank for verification of results
    df1 = cudf.read_csv(
        input_data_path1,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g1 = cugraph.DiGraph()
    g1.from_cudf_edgelist(df1, "src", "dst")
    expected_pr1 = cugraph.pagerank(g1)

    df2 = cudf.read_csv(
        input_data_path2,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g2 = cugraph.DiGraph()
    g2.from_cudf_edgelist(df2, "src", "dst")
    expected_pr2 = cugraph.pagerank(g2)

    # Compare and verify pagerank results

    err1 = 0
    err2 = 0
    tol = 1.0e-05

    compare_pr1 = expected_pr1.merge(result_pr1,
                                     on="vertex",
                                     suffixes=["_local", "_dask"])

    assert len(expected_pr1) == len(result_pr1)

    for i in range(len(compare_pr1)):
        diff = abs(compare_pr1["pagerank_local"].iloc[i] -
                   compare_pr1["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err1 = err1 + 1
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected_pr2) == len(result_pr2)

    compare_pr2 = expected_pr2.merge(result_pr2,
                                     on="vertex",
                                     suffixes=["_local", "_dask"])

    for i in range(len(compare_pr2)):
        diff = abs(compare_pr2["pagerank_local"].iloc[i] -
                   compare_pr2["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err2 = err2 + 1
    print("Mismatches in ", input_data_path2, ": ", err2)
    assert err1 == err2 == 0
Example 15
def test_pagerank_multi_column(graph_file, max_iter, tol, alpha,
                               personalization_perc, has_guess):
    gc.collect()

    # NetworkX PageRank
    M = utils.read_csv_for_nx(graph_file)
    nnz_vtx = np.unique(M[['0', '1']])

    Gnx = nx.from_pandas_edgelist(M,
                                  source="0",
                                  target="1",
                                  edge_attr="weight",
                                  create_using=nx.DiGraph())

    networkx_pr, networkx_prsn = networkx_call(Gnx, max_iter, tol, alpha,
                                               personalization_perc, nnz_vtx)

    cu_nstart = None
    if has_guess == 1:
        cu_nstart_temp = cudify(networkx_pr)
        max_iter = 100
        cu_nstart = cudf.DataFrame()
        cu_nstart["vertex_0"] = cu_nstart_temp["vertex"]
        cu_nstart["vertex_1"] = cu_nstart["vertex_0"] + 1000
        cu_nstart["values"] = cu_nstart_temp["values"]

    cu_prsn_temp = cudify(networkx_prsn)
    if cu_prsn_temp is not None:
        cu_prsn = cudf.DataFrame()
        cu_prsn["vertex_0"] = cu_prsn_temp["vertex"]
        cu_prsn["vertex_1"] = cu_prsn["vertex_0"] + 1000
        cu_prsn["values"] = cu_prsn_temp["values"]
    else:
        cu_prsn = cu_prsn_temp

    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(M["0"])
    cu_M["dst_0"] = cudf.Series(M["1"])
    cu_M["src_1"] = cu_M["src_0"] + 1000
    cu_M["dst_1"] = cu_M["dst_0"] + 1000
    cu_M["weights"] = cudf.Series(M["weight"])

    cu_G = cugraph.DiGraph()
    cu_G.from_cudf_edgelist(cu_M,
                            source=["src_0", "src_1"],
                            destination=["dst_0", "dst_1"],
                            edge_attr="weights")

    df = cugraph.pagerank(
        cu_G,
        alpha=alpha,
        max_iter=max_iter,
        tol=tol,
        personalization=cu_prsn,
        nstart=cu_nstart,
    )

    cugraph_pr = []

    df = df.sort_values("0_vertex").reset_index(drop=True)

    pr_scores = df["pagerank"].to_array()
    for i, rank in enumerate(pr_scores):
        cugraph_pr.append((i, rank))

    # Calculating mismatch
    networkx_pr = sorted(networkx_pr.items(), key=lambda x: x[0])
    err = 0
    assert len(cugraph_pr) == len(networkx_pr)
    for i in range(len(cugraph_pr)):
        if (abs(cugraph_pr[i][1] - networkx_pr[i][1]) > tol * 1.1
                and cugraph_pr[i][0] == networkx_pr[i][0]):
            err = err + 1
    print("Mismatches:", err)
    assert err < (0.01 * len(cugraph_pr))
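
This test relies on two helpers that are not shown: networkx_call (the NetworkX reference run) and cudify, which converts a NetworkX result dict into a cudf DataFrame with vertex and values columns, the names the test reads. A plausible sketch of cudify is given below, returning None for an empty input as the is-not-None check above suggests.

# Hypothetical cudify helper; column names follow the accesses above.
import cudf

def cudify(d):
    # Empty or missing NetworkX result -> no personalization / warm start.
    if not d:
        return None
    df = cudf.DataFrame()
    df["vertex"] = list(d.keys())
    df["values"] = list(d.values())
    return df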