def _calc_bc_subset_fixed(
    G, Gnx, normalized, weight, endpoints, k, seed, result_dtype
):
    assert isinstance(k, int), (
        "This test is meant for verifying coherence "
        "when k is given as an int"
    )
    # In the fixed set we compare cu_bc against itself: we call
    # random.seed(seed) with the same seed and then sample from the graph's
    # vertex range, so both calls use the same sources
    if seed is None:
        seed = 123  # random.seed(None) uses time, but we want the same sources
    random.seed(seed)  # It will be called again in cugraph's call
    sources = random.sample(range(G.number_of_vertices()), k)

    if G.renumbered:
        sources_df = cudf.DataFrame({"src": sources})
        sources = G.unrenumber(sources_df, "src")["src"].to_pandas().tolist()

    # The first call proceeds to the random sampling in the same fashion as
    # the lines above
    df = cugraph.betweenness_centrality(
        G,
        k=k,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        seed=seed,
        result_dtype=result_dtype,
    )
    sorted_df = (
        df.sort_values("vertex")
        .rename(columns={"betweenness_centrality": "cu_bc"}, copy=False)
        .reset_index(drop=True)
    )

    # The second call processes sources that were already sampled. We set
    # seed to None, as (k: int, seed: not None) should not be normal behavior
    df2 = cugraph.betweenness_centrality(
        G,
        k=sources,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        seed=None,
        result_dtype=result_dtype,
    )
    sorted_df2 = (
        df2.sort_values("vertex")
        .rename(columns={"betweenness_centrality": "ref_bc"}, copy=False)
        .reset_index(drop=True)
    )

    merged_sorted_df = cudf.concat(
        [sorted_df, sorted_df2["ref_bc"]], axis=1, sort=False
    )
    return merged_sorted_df
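# A minimal sketch (not part of the original helpers) of how a test might
# consume the merged dataframe returned by the df-based helpers in this file
# (_calc_bc_subset_fixed, _calc_bc_full, _calc_bc_subset). The helper name
# compare_scores and the epsilon value are illustrative assumptions, and it
# assumes a cudf version that provides Series.to_numpy.
import numpy as np

def compare_scores(merged_sorted_df, epsilon=1e-6):
    # Both columns are aligned on the same sorted vertex order, so an
    # element-wise comparison with an absolute tolerance is sufficient
    cu = merged_sorted_df["cu_bc"].to_numpy()
    ref = merged_sorted_df["ref_bc"].to_numpy()
    assert np.allclose(cu, ref, atol=epsilon), (
        "betweenness centrality scores differ by more than epsilon"
    )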
def test_betweenness_centrality_nx(graph_file, directed, edgevals):
    prepare_test()
    Gnx = utils.generate_nx_graph_from_file(graph_file, directed, edgevals)
    nx_bc = nx.betweenness_centrality(Gnx)
    cu_bc = cugraph.betweenness_centrality(Gnx)

    # Calculating mismatch
    networkx_bc = sorted(nx_bc.items(), key=lambda x: x[0])
    cugraph_bc = sorted(cu_bc.items(), key=lambda x: x[0])
    err = 0
    assert len(cugraph_bc) == len(networkx_bc)
    for i in range(len(cugraph_bc)):
        if (
            abs(cugraph_bc[i][1] - networkx_bc[i][1]) > 0.01
            and cugraph_bc[i][0] == networkx_bc[i][0]
        ):
            err = err + 1
            print(f"{cugraph_bc[i][1]} and {networkx_bc[i][1]}")
    print("Mismatches:", err)
    assert err < (0.01 * len(cugraph_bc))
def _calc_bc_full(
    G, Gnx, normalized, weight, endpoints, k, seed, result_dtype
):
    df = cugraph.betweenness_centrality(
        G,
        k=k,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        result_dtype=result_dtype,
    )
    assert df["betweenness_centrality"].dtype == result_dtype, (
        "'betweenness_centrality' column does not have the expected type"
    )
    nx_bc = nx.betweenness_centrality(
        Gnx, k=k, normalized=normalized, weight=weight, endpoints=endpoints
    )

    sorted_df = (
        df.sort_values("vertex")
        .rename(columns={"betweenness_centrality": "cu_bc"}, copy=False)
        .reset_index(drop=True)
    )
    _, nx_bc = zip(*sorted(nx_bc.items()))
    nx_df = cudf.DataFrame({"ref_bc": nx_bc})
    merged_sorted_df = cudf.concat([sorted_df, nx_df], axis=1, sort=False)
    return merged_sorted_df
def _calc_bc_subset(
    G, Gnx, normalized, weight, endpoints, k, seed, result_dtype
):
    # NOTE: The NetworkX API does not allow passing a list of vertices;
    # its sampling operates on Gnx.nodes() directly. We first mimic the
    # acquisition of the nodes so both calls compare the same sources.
    random.seed(seed)  # It will be called again in nx's call
    sources = random.sample(list(Gnx.nodes()), k)
    df = cugraph.betweenness_centrality(
        G,
        k=sources,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        result_dtype=result_dtype,
    )
    sorted_df = (
        df.sort_values("vertex")
        .rename(columns={"betweenness_centrality": "cu_bc"}, copy=False)
        .reset_index(drop=True)
    )
    nx_bc = nx.betweenness_centrality(
        Gnx,
        k=k,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        seed=seed,
    )
    _, nx_bc = zip(*sorted(nx_bc.items()))
    nx_df = cudf.DataFrame({"ref_bc": nx_bc})
    merged_sorted_df = cudf.concat([sorted_df, nx_df], axis=1, sort=False)
    return merged_sorted_df
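# Hypothetical sanity check (not in the original tests) illustrating why the
# seeding trick above reproduces NetworkX's sampling: NetworkX wraps an int
# seed in random.Random(seed) and samples list(G.nodes()), which draws the
# same sequence as the globally seeded random module.
import random

rng = random.Random(42)
random.seed(42)
assert random.sample(range(100), 5) == rng.sample(range(100), 5)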
def calc_between_centralities(dg):
    # Sort vertices by betweenness centrality score, highest first
    between_centers = cnx.betweenness_centrality(dg)
    return {
        k: v
        for k, v in sorted(
            between_centers.items(), key=lambda x: x[1], reverse=True
        )
    }
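# A brief usage sketch (the graph dg and the top-10 cutoff are illustrative
# assumptions): since calc_between_centralities returns a dict already sorted
# by score in descending order, the most central vertices can be read off
# the front.
import itertools

ranked = calc_between_centralities(dg)  # dg: a directed graph built elsewhere
top_10 = dict(itertools.islice(ranked.items(), 10))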
def _calc_bc_subset_fixed(
    G, Gnx, normalized, weight, endpoints, k, seed, result_dtype
):
    assert isinstance(k, int), (
        "This test is meant for verifying coherence "
        "when k is given as an int"
    )
    # In the fixed set we compare cu_bc against itself: we call
    # random.seed(seed) with the same seed and then sample from the graph's
    # vertex range, so both calls use the same sources
    if seed is None:
        seed = 123  # random.seed(None) uses time, but we want the same sources
    random.seed(seed)  # It will be called again in cugraph's call
    sources = random.sample(range(G.number_of_vertices()), k)

    # The first call proceeds to the random sampling in the same fashion as
    # the lines above
    df = cugraph.betweenness_centrality(
        G,
        k=k,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        seed=seed,
        result_dtype=result_dtype,
    )

    # The second call processes sources that were already sampled. We set
    # seed to None, as (k: int, seed: not None) should not be normal behavior
    df2 = cugraph.betweenness_centrality(
        G,
        k=sources,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        seed=None,
        result_dtype=result_dtype,
    )

    cu_bc = {
        key: score
        for key, score in zip(
            df["vertex"].to_array(),
            df["betweenness_centrality"].to_array(),
        )
    }
    cu_bc2 = {
        key: score
        for key, score in zip(
            df2["vertex"].to_array(),
            df2["betweenness_centrality"].to_array(),
        )
    }
    return cu_bc, cu_bc2
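# A minimal sketch (hypothetical helper, not part of the original file) of
# checking the two dicts returned above for coherence: both calls sample the
# same sources, so the scores should agree up to a small tolerance.
def compare_bc_dicts(cu_bc, cu_bc2, epsilon=1e-6):
    assert set(cu_bc) == set(cu_bc2)
    for vertex, score in cu_bc.items():
        assert abs(score - cu_bc2[vertex]) <= epsilon, (
            f"mismatch on vertex {vertex}"
        )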
def calc_betweenness_centrality(graph_file, normalized=True):
    cu_M = utils.read_csv_file(graph_file)
    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1")
    df = cugraph.betweenness_centrality(G, normalized=normalized)
    # Align on vertex order before attaching the NetworkX reference column
    df = df.sort_values("vertex").reset_index(drop=True)

    NM = utils.read_csv_for_nx(graph_file)
    Gnx = nx.from_pandas_edgelist(
        NM, create_using=nx.DiGraph(), source="0", target="1"
    )
    nb = nx.betweenness_centrality(Gnx, normalized=normalized)
    pdf = [nb[k] for k in sorted(nb.keys())]
    df["nx"] = pdf
    df = df.rename(columns={"betweenness_centrality": "cu"})
    return df
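# A short usage sketch (the dataset path and the 0.01 tolerance are
# illustrative assumptions) showing how the returned frame lines up the
# cuGraph and NetworkX scores for a row-wise comparison:
scores = calc_betweenness_centrality("datasets/karate.csv")
diff = (scores["cu"] - scores["nx"]).abs()
assert diff.max() < 0.01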
def _calc_bc_full(
    G, Gnx, normalized, weight, endpoints, k, seed, result_dtype
):
    # The full variant uses every vertex as a source, so k and seed are
    # intentionally not forwarded
    df = cugraph.betweenness_centrality(
        G,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        result_dtype=result_dtype,
    )
    assert df["betweenness_centrality"].dtype == result_dtype, (
        "'betweenness_centrality' column does not have the expected type"
    )
    nx_bc = nx.betweenness_centrality(
        Gnx, normalized=normalized, weight=weight, endpoints=endpoints
    )

    cu_bc = {
        key: score
        for key, score in zip(
            df["vertex"].to_array(),
            df["betweenness_centrality"].to_array(),
        )
    }
    return cu_bc, nx_bc
def _calc_bc_subset(
    G, Gnx, normalized, weight, endpoints, k, seed, result_dtype
):
    # NOTE: The NetworkX API does not allow passing a list of vertices;
    # its sampling operates on Gnx.nodes() directly. We first mimic the
    # acquisition of the nodes so both calls compare the same sources.
    random.seed(seed)  # It will be called again in nx's call
    sources = random.sample(list(Gnx.nodes()), k)
    df = cugraph.betweenness_centrality(
        G,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        k=sources,
        result_dtype=result_dtype,
    )
    # Pass weight and endpoints here too, so the NetworkX reference is
    # computed under the same options as the cuGraph call
    nx_bc = nx.betweenness_centrality(
        Gnx,
        normalized=normalized,
        weight=weight,
        endpoints=endpoints,
        k=k,
        seed=seed,
    )

    cu_bc = {
        key: score
        for key, score in zip(
            df["vertex"].to_array(),
            df["betweenness_centrality"].to_array(),
        )
    }
    return cu_bc, nx_bc
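# A minimal sketch (hypothetical helper and tolerance, not from the original
# file) of comparing the cuGraph dict against the NetworkX reference dict
# returned by the dict-based helpers above, allowing a small floating-point
# tolerance between the two implementations:
def compare_cu_vs_nx(cu_bc, nx_bc, epsilon=0.01):
    assert len(cu_bc) == len(nx_bc)
    mismatches = sum(
        1 for v in cu_bc if abs(cu_bc[v] - nx_bc[v]) > epsilon
    )
    assert mismatches == 0, f"{mismatches} vertices exceed epsilon"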