def common_neighbors(G, fn, t=0.5):
    G = G.to_undirected()
    if os.path.isfile(fn):
        H = G.copy()
        found = nx.read_edgelist(fn, nodetype=int, data=False)
        H.add_edges_from(found.edges_iter())
        jacc_iter = nx.jaccard_coefficient(G, nx.non_edges(H))
        print "Appending to %s" % fn
        outfile = open(fn, 'a', 1)
        i = found.number_of_nodes()
    else:
        jacc_iter = nx.jaccard_coefficient(G)
        outfile = open(fn, 'w', 1)
        i = 0
        outfile.write("#vertex u; vertex v; their jaccard coef\n")
    cur = -1
    print "Starting jacc loop %s with threshold %s" % (time.strftime("%H:%M:%S"), t)
    for pair in jacc_iter:
        if pair[2] >= t:
            outfile.write("%s %s %f\n" % (pair[0], pair[1], pair[2]))
        if pair[0] != cur:
            cur = pair[0]
            i += 1
            print "%s: %s" % (i, cur)
    outfile.close()
    print "Done writing %s" % fn
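# The snippet above targets Python 2 and NetworkX 1.x (print statements,
# Graph.edges_iter()). A minimal Python 3 / NetworkX 2.x sketch of the same
# resume-and-append pattern, assuming the same on-disk edge-list format:
import os
import time

import networkx as nx

def common_neighbors_nx2(G, fn, t=0.5):
    G = G.to_undirected()
    if os.path.isfile(fn):
        # Resume: mark already-scored pairs as edges of H so non_edges(H) skips them.
        H = G.copy()
        found = nx.read_edgelist(fn, nodetype=int, data=False)
        H.add_edges_from(found.edges())  # edges_iter() was removed in NetworkX 2.x
        jacc_iter = nx.jaccard_coefficient(G, nx.non_edges(H))
        mode = 'a'
    else:
        jacc_iter = nx.jaccard_coefficient(G)
        mode = 'w'
    with open(fn, mode, 1) as outfile:
        if mode == 'w':
            outfile.write("#vertex u; vertex v; their jaccard coef\n")
        print("Starting jacc loop %s with threshold %s" % (time.strftime("%H:%M:%S"), t))
        for u, v, p in jacc_iter:
            if p >= t:
                outfile.write("%s %s %f\n" % (u, v, p))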
def get_link_pred_auc(graph, pos_test, neg_test):
    jc_pos_test_pred = nx.jaccard_coefficient(graph, pos_test)
    jc_neg_test_pred = nx.jaccard_coefficient(graph, neg_test)
    jc_pos_score = [p for _, _, p in jc_pos_test_pred]
    jc_neg_score = [n for _, _, n in jc_neg_test_pred]
    jc_all_labels = [1] * len(jc_pos_score) + [0] * len(jc_neg_score)
    jc_all_scores = jc_pos_score + jc_neg_score
    jc_auc = metrics.roc_auc_score(jc_all_labels, jc_all_scores)

    # Note: despite the aa_ prefix, these scores come from resource_allocation_index.
    aa_pos_test_pred = nx.resource_allocation_index(graph, pos_test)
    aa_neg_test_pred = nx.resource_allocation_index(graph, neg_test)
    aa_pos_score = [p for _, _, p in aa_pos_test_pred]
    aa_neg_score = [n for _, _, n in aa_neg_test_pred]
    aa_all_labels = [1] * len(aa_pos_score) + [0] * len(aa_neg_score)
    aa_all_scores = aa_pos_score + aa_neg_score
    aa_auc = metrics.roc_auc_score(aa_all_labels, aa_all_scores)
    return jc_auc, aa_auc
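# A minimal driver for get_link_pred_auc (illustrative, not from the original
# project): `metrics` is assumed to be sklearn.metrics; positive pairs are
# held-out true edges, negative pairs are sampled non-edges.
import random

import networkx as nx
from sklearn import metrics

G = nx.karate_club_graph()
pos_test = random.sample(list(G.edges()), 10)        # held-out true edges
neg_test = random.sample(list(nx.non_edges(G)), 10)  # sampled non-edges
G_train = G.copy()
G_train.remove_edges_from(pos_test)  # score against the graph without the held-out edges
print(get_link_pred_auc(G_train, pos_test, neg_test))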
def compute_indexes(G: nx.Graph, method, negative, positive):
    if method == 'resource_allocation':
        return (nx.resource_allocation_index(G, negative),
                nx.resource_allocation_index(G, positive))
    elif method == 'jaccard_coefficient':
        return (nx.jaccard_coefficient(G, negative),
                nx.jaccard_coefficient(G, positive))
    elif method == 'adamic_adar':
        return (nx.adamic_adar_index(G, negative),
                nx.adamic_adar_index(G, positive))
    elif method == 'preferential_attachment':
        return (nx.preferential_attachment(G, negative),
                nx.preferential_attachment(G, positive))
    elif method == 'sorensen_neighbours':
        return ([(u, v, sorensen_index(G, u, v)) for u, v in negative],
                [(u, v, sorensen_index(G, u, v)) for u, v in positive])
    elif method == 'community':
        c = louvain(G)
        commLabels = c.communities
        comms = c.to_node_community_map()
        return ([(u, v, community_index(G, u, v, commLabels, comms)) for u, v in negative],
                [(u, v, community_index(G, u, v, commLabels, comms)) for u, v in positive])
    else:
        raise NameError('The given method is not supported')
def main():
    G = read_edgelist('testdata/demo.net')
    M = nx.to_numpy_matrix(G)
    print M.dot(M.T)
    # jaccard_coefficient expects an iterable of node *pairs*, not bare nodes,
    # so score 'a' against every other node instead of passing ['a'].
    pairs = [('a', v) for v in G if v != 'a']
    for u, v, p in nx.jaccard_coefficient(G, pairs):
        print u, v, p
def jaccard_coefficient_scores(g_train, train_test_split):
    if g_train.is_directed():  # Jaccard coef only works for undirected graphs
        g_train = g_train.to_undirected()

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split  # Unpack input

    start_time = time.time()
    jc_scores = {}

    # Calculate scores
    jc_matrix = np.zeros(adj_train.shape)
    for u, v, p in nx.jaccard_coefficient(g_train, ebunch=get_ebunch(train_test_split)):
        # (u, v) = node indices, p = Jaccard coefficient
        jc_matrix[u][v] = p
        jc_matrix[v][u] = p  # make sure it's symmetric
    jc_matrix = jc_matrix / jc_matrix.max()  # Normalize matrix

    runtime = time.time() - start_time
    jc_roc, jc_ap = get_roc_score(test_edges, test_edges_false, jc_matrix)

    jc_scores['test_roc'] = jc_roc
    # jc_scores['test_roc_curve'] = jc_roc_curve
    jc_scores['test_ap'] = jc_ap
    jc_scores['runtime'] = runtime
    return jc_scores
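# get_ebunch is defined elsewhere in the original project. A plausible minimal
# stand-in (an assumption, not the original implementation): score every
# held-out pair, real and fake, as (u, v) index tuples.
import numpy as np

def get_ebunch(train_test_split):
    (adj_train, train_edges, train_edges_false, val_edges, val_edges_false,
     test_edges, test_edges_false) = train_test_split
    # Assumes test_edges / test_edges_false are arrays of node-index pairs.
    pairs = np.concatenate([test_edges, test_edges_false], axis=0)
    return [(int(u), int(v)) for u, v in pairs]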
def train(self, train_graph, test_edges):
    pos_samples, neg_samples = test_edges
    n = train_graph.number_of_nodes()
    coeff_matrix = np.zeros(shape=(n, n), dtype=np.float)

    samples = pos_samples + neg_samples
    preds = nx.jaccard_coefficient(train_graph, samples)
    for i, j, p in preds:
        coeff_matrix[int(i), int(j)] = p
        coeff_matrix[int(j), int(i)] = p
    coeff_matrix = coeff_matrix / coeff_matrix.max()

    ytrue = [1 for _ in range(len(pos_samples))] \
        + [0 for _ in range(len(neg_samples))]
    y_score = [coeff_matrix[int(edge[0]), int(edge[1])] for edge in pos_samples] \
        + [coeff_matrix[int(edge[0]), int(edge[1])] for edge in neg_samples]

    auc = roc_auc_score(y_true=ytrue, y_score=y_score)
    print(auc)
    return auc
def main():
    G = nx.Graph()
    dfr = pd.read_csv('Followers.csv')
    #print(dfr.values)
    Followers = dfr['follows'].tolist()
    Followers_common = []
    c = Counter(Followers)
    for userid, count in c.most_common():
        if count > 1:
            Followers_common.append(userid)
    df_filtered = dfr.loc[dfr['follows'].isin(Followers_common)]
    for index, row in df_filtered.iterrows():
        G.add_edge(row['follows'], row['userids'])
    result = girvan_newman(G, 4)
    #print(len(result))
    count = []
    for r in result:
        count.append(len(r))
    average = sum(count) / len(result)
    prediction = nx.jaccard_coefficient(G)
    w_graph = nx.Graph()
    for u, v, p in prediction:
        #print('(%d, %d) -> %.8f' % (u, v, p))
        w_graph.add_edge(u, v, weight=p)
    nx.draw(w_graph, node_size=50, width=0.25, alpha=0.25)
    plt.savefig('Graph_Weighted.png')
    with open("Communities.txt", 'w') as outfile:
        outfile.write("%d\n%.2f\n" % (len(result), float(average)))
def extract_features(self, prediction_set=None):
    edge_features = defaultdict(dict)

    print '{0} | extract_features: res_alloc'.format(str(datetime.now()))
    res_alloc = nx.resource_allocation_index(self.G, ebunch=prediction_set)
    self.append_features(edge_features, feature_name='res_alloc', feature_list=res_alloc)

    print '{0} | extract_features: jaccard_coef'.format(str(datetime.now()))
    jaccard_coef = nx.jaccard_coefficient(self.G, ebunch=prediction_set)
    self.append_features(edge_features, feature_name='jaccard_coef', feature_list=jaccard_coef)

    print '{0} | extract_features: adamic_adar'.format(str(datetime.now()))
    adamic_adar = nx.adamic_adar_index(self.G, ebunch=prediction_set)
    self.append_features(edge_features, feature_name='adamic_adar', feature_list=adamic_adar)

    print '{0} | extract_features: pref_attachment'.format(str(datetime.now()))
    pref_attachment = nx.preferential_attachment(self.G, ebunch=prediction_set)
    self.append_features(edge_features, feature_name='pref_attachment', feature_list=pref_attachment)

    # reformat feature dictionary to a dataframe object
    df, feature_names = self.feature_dict_to_df(edge_features)
    return df, feature_names
def retweet_similarity_network(G):
    V = list(G.nodes())
    print(f'{len(V)} nodes in retweet network')
    ebunch = []
    for counter, u in enumerate(V):
        for v in V[counter + 1:]:
            if G.has_node(v) and G.has_node(u):
                ebunch.append((u, v))
    preds = nx.jaccard_coefficient(G.to_undirected(), ebunch)
    print(len(ebunch), " node pairs to check Jaccard index")
    print("Create similarity graph between nodes using Jaccard coefficient based on retweets")
    counter = 0
    Gsim = nx.Graph()
    ne = 0
    for u, v, s in preds:
        counter += 1
        if s > 0:
            Gsim.add_edge(u, v, weight=s)
            ne += 1
        if counter % 1e6 == 0:
            print(counter, ne, " positive weights")
    nv = Gsim.number_of_nodes()
    ne = Gsim.number_of_edges()
    print("Gsim has %s nodes, %s edges" % (nv, ne))
    return Gsim
def L_P_JC(network):
    num_add = 0  # the number of edges to be added
    nodes_pair_without_edge = []  # the pairs of nodes without edges
    probability_add = []  # the probabilities of the pairs of nodes to be added
    score = 0  # the score of each pair of nodes in the link prediction model
    total_score_without_edge = 0.0  # the sum of scores of pairs of nodes without an edge

    # calculate the score of each pair of nodes
    for i, elei in enumerate(list(network.nodes()), 1):
        for j, elej in enumerate(list(network.nodes()), 1):
            if i >= j:
                continue
            if not network.has_edge(elei, elej):
                try:
                    pre = nx.jaccard_coefficient(network, [(elei, elej)])
                    for u, v, s in pre:
                        score = s
                except:
                    continue
                total_score_without_edge += score
                nodes_pair_without_edge.append((elei, elej, score))

    for a, b, c in nodes_pair_without_edge:
        # calculate the probabilities of edges to be added
        probability_add.append(c / total_score_without_edge)

    # select edges to be added according to probabilities
    edges_add = calculate_param.prob_select_distinct(nodes_pair_without_edge,
                                                     probability_add, num_add)
    for a, b, c in edges_add:
        network.add_edge(a, b)  # add selected edges
    return True
def networkx_call(M):
    sources = M['0']
    destinations = M['1']
    edges = []
    for i in range(len(M)):
        edges.append((sources[i], destinations[i]))
    edges = sorted(edges)
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we do this
    # explicitly
    print('Format conversion ... ')
    Gnx = nx.from_pandas_edgelist(M, source='0', target='1',
                                  edge_attr='weight', create_using=nx.Graph())

    # Networkx Jaccard Call
    print('Solving... ')
    t1 = time.time()
    preds = nx.jaccard_coefficient(Gnx, edges)
    t2 = time.time() - t1
    print('Time : ' + str(t2))

    src = []
    dst = []
    coeff = []
    for u, v, p in preds:
        src.append(u)
        dst.append(v)
        coeff.append(p)
    return src, dst, coeff
def networkx_call(M):
    sources = M["0"]
    destinations = M["1"]
    edges = []
    for i in range(len(sources)):
        edges.append((sources[i], destinations[i]))
        edges.append((destinations[i], sources[i]))
    edges = list(dict.fromkeys(edges))
    edges = sorted(edges)
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we do this
    # explicitly
    print("Format conversion ... ")

    # NetworkX graph
    Gnx = nx.from_pandas_edgelist(M, source="0", target="1",
                                  create_using=nx.Graph())

    # Networkx Jaccard Call
    print("Solving... ")
    t1 = time.time()
    preds = nx.jaccard_coefficient(Gnx, edges)
    t2 = time.time() - t1
    print("Time : " + str(t2))

    coeff = []
    for u, v, p in preds:
        coeff.append(p)
    return coeff
def networkx_call(M):
    M = M.tocsr()
    M = M.tocoo()
    sources = M.row
    destinations = M.col
    edges = []
    for i in range(len(sources)):
        edges.append((sources[i], destinations[i]))
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we do this
    # explicitly
    print('Format conversion ... ')

    # Directed NetworkX graph
    G = nx.DiGraph(M)
    Gnx = G.to_undirected()

    # Networkx Jaccard Call
    print('Solving... ')
    t1 = time.time()
    preds = nx.jaccard_coefficient(Gnx, edges)
    t2 = time.time() - t1
    print('Time : ' + str(t2))

    src = []
    dst = []
    coeff = []
    for u, v, p in preds:
        src.append(u)
        dst.append(v)
        coeff.append(p)
    return src, dst, coeff
def networkx_call(M, benchmark_callable=None):
    sources = M["0"]
    destinations = M["1"]
    edges = []
    for i in range(len(M)):
        edges.append((sources[i], destinations[i]))
        edges.append((destinations[i], sources[i]))
    edges = list(dict.fromkeys(edges))
    edges = sorted(edges)
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we do this
    # explicitly
    print("Format conversion ... ")
    Gnx = nx.from_pandas_edgelist(
        M, source="0", target="1", edge_attr="weight", create_using=nx.Graph()
    )

    # Networkx Jaccard Call
    print("Solving... ")
    if benchmark_callable is not None:
        preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges)
    else:
        preds = nx.jaccard_coefficient(Gnx, edges)

    src = []
    dst = []
    coeff = []
    for u, v, p in preds:
        src.append(u)
        dst.append(v)
        # Conversion from Networkx Jaccard to Sorensen
        # No networkX equivalent
        coeff.append((2 * p) / (1 + p))
    return src, dst, coeff
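# Why (2 * p) / (1 + p) converts a Jaccard score to a Sorensen score:
# with i = |A ∩ B| and u = |A ∪ B| = |A| + |B| - i, Jaccard is J = i / u and
# Sorensen is S = 2i / (|A| + |B|) = 2i / (u + i) = 2J / (1 + J).
# A quick sanity check on a toy pair of neighbor sets (illustrative only,
# not part of the original test):
A = {1, 2, 3, 4}
B = {3, 4, 5}
i = len(A & B)
J = i / len(A | B)
S = 2 * i / (len(A) + len(B))
assert abs(S - (2 * J) / (1 + J)) < 1e-12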
def get_test_features():
    features = []
    count = 0
    print("Generating test data features......")
    for temp_data in test_edges:
        if count % 100 == 0:
            print(count)
        count += 1
        feature = []
        try:
            preds = nx.resource_allocation_index(G, [temp_data])
            for u, v, p in preds:
                feature.append(p)
            preds = nx.jaccard_coefficient(G, [temp_data])
            for u, v, p in preds:
                feature.append(p)
        except:
            print("one error at: " + str(count))
            pass
        features.append(feature)
    print("positive features: " + str(len(features)))
    return features
def new_connections_predictions():
    df = future_connections
    df['jaccard_coefficient'] = [x[2] for x in nx.jaccard_coefficient(G, df.index)]
    df['resource_allocation_index'] = [x[2] for x in nx.resource_allocation_index(G, df.index)]
    df['preferential_attachment'] = [x[2] for x in nx.preferential_attachment(G, df.index)]
    df['common_neighbors'] = df.index.map(
        lambda ind: len(list(nx.common_neighbors(G, ind[0], ind[1]))))
    print('.......we have extracted all the features......')
    df_train = df[~pd.isnull(df['Future Connection'])]
    df_test = df[pd.isnull(df['Future Connection'])]
    features = ['jaccard_coefficient', 'resource_allocation_index',
                'preferential_attachment', 'common_neighbors']
    X_train = df_train[features]
    Y_train = df_train['Future Connection']
    X_test = df_test[features]
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf = LogisticRegression(solver='liblinear', random_state=14)
    clf.fit(X_train_scaled, Y_train)
    predictions = np.round(clf.predict_proba(X_test_scaled)[:, 1], 2)
    results = pd.Series(data=predictions, index=X_test.index)
    results = results.sort_values(ascending=False)
    return results

# print(new_connections_predictions())
def link_prediction_with_metrics(subgraph, tuples, df):
    jaccard_coefficient_list = list(nx.jaccard_coefficient(subgraph, tuples))
    y_test = create_test_data(jaccard_coefficient_list)
    print(
        f"ROC AUC Score with Jaccard Coefficient: {roc_auc_score(df['link'], y_test)}\n"
        f"Average Precision with Jaccard Coefficient: {average_precision_score(df['link'], y_test)}"
    )

    adamic_adar_list = list(nx.adamic_adar_index(subgraph, tuples))
    y_test = create_test_data(adamic_adar_list)
    print(
        f"ROC AUC Score with Adamic Adar Index: {roc_auc_score(df['link'], y_test)}\n"
        f"Average Precision with Adamic Adar Index: {average_precision_score(df['link'], y_test)}"
    )

    preferential_attachment_list = list(nx.preferential_attachment(subgraph, tuples))
    y_test = create_test_data(preferential_attachment_list)
    print(
        f"ROC AUC Score with Preferential Attachment: {roc_auc_score(df['link'], y_test)}\n"
        f"Average Precision with Preferential Attachment: {average_precision_score(df['link'], y_test)}"
    )

    resource_allocation_list = list(nx.resource_allocation_index(subgraph, tuples))
    y_test = create_test_data(resource_allocation_list)
    print(
        f"ROC AUC Score with Resource Allocation Index: {roc_auc_score(df['link'], y_test)}\n"
        f"Average Precision with Resource Allocation Index: {average_precision_score(df['link'], y_test)}"
    )
def jaccard_coefficient(graph, author_osn_id, labeled_author_osn_id):
    if not nx.is_directed(graph):
        pair = [(author_osn_id, labeled_author_osn_id)]
        jaccard_coefficient_iterator = nx.jaccard_coefficient(graph, pair)
        jaccard_coefficient_score = LinkPredictionStaticFunctions.get_score_from_iterator(
            jaccard_coefficient_iterator)
        return jaccard_coefficient_score
    return 0
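# LinkPredictionStaticFunctions.get_score_from_iterator is external to this
# snippet. A plausible minimal stand-in (an assumption, not the original
# implementation), given that the ebunch holds exactly one pair:
def get_score_from_iterator(score_iterator):
    # Each item is a (u, v, score) triple; return the score of the single pair.
    for _, _, score in score_iterator:
        return score
    return 0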
def create_features(self, G_train, edge_bunch):
    i = 0
    X = []
    page_rank = nx.pagerank_scipy(G_train)
    for pair in edge_bunch:
        common_neighbors = len(list(nx.common_neighbors(G_train, pair[0], pair[1])))
        # jaccard_coefficient and adamic_adar_index return generators;
        # next(...) pulls the single (u, v, score) triple for this pair
        # (the Python 2-only .next() method is replaced by the next() builtin)
        jaccard_coefficient = next(nx.jaccard_coefficient(G_train, [pair]))[2]
        adamic_adar = next(nx.adamic_adar_index(G_train, [pair]))[2]
        degree_0 = nx.degree(G_train, pair[0])
        degree_1 = nx.degree(G_train, pair[1])
        prod = degree_0 * degree_1
        page_rank_0 = page_rank[pair[0]]
        page_rank_1 = page_rank[pair[1]]

        f = [
            degree_0,
            degree_1,
            prod,
            common_neighbors,
            jaccard_coefficient,
            adamic_adar,
            page_rank_0,
            page_rank_1,
        ]
        X.append(f)

        i += 1
        if i % 1000000 == 0:
            print(i)

    return np.array(X)
def networkx_call(M, benchmark_callable=None):
    sources = M["0"]
    destinations = M["1"]
    edges = []
    for i in range(len(sources)):
        edges.append((sources[i], destinations[i]))
        edges.append((destinations[i], sources[i]))
    edges = list(dict.fromkeys(edges))
    edges = sorted(edges)
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we do this
    # explicitly
    print("Format conversion ... ")

    # NetworkX graph
    Gnx = nx.from_pandas_edgelist(M, source="0", target="1",
                                  create_using=nx.Graph())

    # Networkx Jaccard Call
    print("Solving... ")
    if benchmark_callable is not None:
        preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges)
    else:
        preds = nx.jaccard_coefficient(Gnx, edges)

    coeff = []
    for u, v, p in preds:
        # FIXME: Use known correct values of WSorensen for few graphs,
        # hardcode it and compare to Cugraph WSorensen
        # to get a more robust test
        # Conversion from Networkx Jaccard to Sorensen
        coeff.append((2 * p) / (1 + p))
    return coeff
def test_jaccard_two_hop_edge_vals(managed, pool, graph_file):
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27
    )
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    M = M.tocsr()
    Gnx = nx.DiGraph(M).to_undirected()

    G = cugraph.Graph()
    row_offsets = cudf.Series(M.indptr)
    col_indices = cudf.Series(M.indices)
    values = cudf.Series(M.data)
    G.from_cudf_adjlist(row_offsets, col_indices, values)

    pairs = G.get_two_hop_neighbors()
    nx_pairs = []
    for i in range(len(pairs)):
        nx_pairs.append((pairs['first'][i], pairs['second'][i]))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs['first'], pairs['second'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df['jaccard_coeff'][i])
        assert diff < 1.0e-6
def create_related_gene_network(gene_network_nx, threshold, genes):
    rgn = nx.Graph()
    for g1, g2 in gene_network_nx.edges:
        if g1 in genes and g2 in genes:
            rgn.add_edge(g1, g2, weight=1.0)
    similarity = nx.jaccard_coefficient(gene_network_nx)
    for g1, g2, coefficient in similarity:
        #print('({}, {}) -> {:.4f}'.format(g1, g2, coefficient))
        # add to the RGN only edges whose coefficient exceeds the threshold
        # and whose genes are in the BMM
        if coefficient > threshold and g1 in genes and g2 in genes:
            rgn.add_edge(g1, g2, weight=coefficient)

    ## routine to print the graph
    #pos = nx.circular_layout(rgn)  # positions for all nodes
    ## nodes
    #nx.draw_networkx_nodes(rgn, pos, node_size=300)
    ## edges
    #nx.draw_networkx_edges(rgn, pos, width=1)
    #labels = nx.get_edge_attributes(rgn, 'weight')
    #nx.draw_networkx_edge_labels(rgn, pos, font_size=8, edge_labels=labels)
    ## labels
    #nx.draw_networkx_labels(rgn, pos, font_size=10)
    #plt.axis('off')
    #plt.show()

    return rgn
def test_jaccard_two_hop_edge_vals(managed, pool, graph_file):
    gc.collect()

    rmm.reinitialize(managed_memory=managed,
                     pool_allocator=pool,
                     initial_pool_size=2 << 27)
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)
    Gnx = nx.from_pandas_edgelist(M, source='0', target='1',
                                  edge_attr='weight', create_using=nx.Graph())
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

    pairs = G.get_two_hop_neighbors()
    nx_pairs = []
    for i in range(len(pairs)):
        nx_pairs.append((pairs['first'][i], pairs['second'][i]))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs)
    df = df.sort_values(by=['source', 'destination'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df['jaccard_coeff'][i])
        assert diff < 1.0e-6
def test_jaccard_two_hop(managed, pool, graph_file):
    gc.collect()

    rmm.finalize()
    rmm_cfg.use_managed_memory = managed
    rmm_cfg.use_pool_allocator = pool
    rmm.initialize()

    assert rmm.is_initialized()

    M = read_mtx_file(graph_file)
    M = M.tocsr()
    Gnx = nx.DiGraph(M).to_undirected()

    G = cugraph.Graph()
    row_offsets = cudf.Series(M.indptr)
    col_indices = cudf.Series(M.indices)
    G.add_adj_list(row_offsets, col_indices, None)

    pairs = G.get_two_hop_neighbors()
    nx_pairs = []
    for i in range(len(pairs)):
        nx_pairs.append((pairs['first'][i], pairs['second'][i]))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs['first'], pairs['second'])
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df['jaccard_coeff'][i])
        assert diff < 1.0e-6
def compare_sorensen_two_hop(G, Gnx):
    """
    Compute both cugraph and nx sorensen after extracting the two hop
    neighbors from G and compare both results
    """
    pairs = (
        G.get_two_hop_neighbors()
        .sort_values(["first", "second"])
        .reset_index(drop=True)
    )
    nx_pairs = list(pairs.to_records(index=False))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        # FIXME: Use known correct values of Sorensen for few graphs,
        # hardcode it and compare to Cugraph Sorensen to get a more robust test
        # Conversion from Networkx Jaccard to Sorensen
        # No networkX equivalent
        nx_coeff.append((2 * p) / (1 + p))
    df = cugraph.sorensen(G, pairs)
    df = df.sort_values(by=["source", "destination"]).reset_index(drop=True)
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df["sorensen_coeff"].iloc[i])
        assert diff < 1.0e-6
def edgefeat(g, norm=False, fil='ricci'):
    """
    wrapper for edge_probability and ricciCurvature computation
    :param g: graph
    :param fil: edge_p/ricci/jaccard
    :param norm: whether to normalize edge values or not
    :return: gp, a dense numpy array of shape (n_node, n_node)
    """
    g = nx.convert_node_labels_to_integers(g)
    assert nx.is_connected(g)
    adj_m = nx.adj_matrix(g).todense()  # dense matrix
    gp = np.zeros((len(g), len(g)))

    try:
        if fil == 'edge_p':
            gp = np.array(smoother(adj_m, h=0.3))
            gp = np.multiply(adj_m, gp)
        elif fil == 'ricci':
            g = ricciCurvature(g, alpha=0.5, weight='weight')
            ricci_dict = nx.get_edge_attributes(g, 'ricciCurvature')
            for u, v in ricci_dict.keys():
                gp[u][v] = ricci_dict[(u, v)]
            gp += gp.T
        elif fil == 'jaccard':
            # important since jaccard can also be defined on non-edges
            jac_list = nx.jaccard_coefficient(g, g.edges())
            for u, v, jac in jac_list:
                gp[u][v] = jac
            gp += gp.T
    except AssertionError:
        print('Have not implemented fil %s. Treat as all zeros' % fil)
        gp = np.zeros((len(g), len(g)))
    assert (gp == gp.T).all()
    if norm:
        # normalize by the largest absolute entry of the whole matrix;
        # max(abs(gp)) on a 2-D array would compare rows, not entries
        gp = gp / float(np.abs(gp).max())
    return gp
def SimilarityMeasures(G):
    # resource_allocation_index
    preds = nx.resource_allocation_index(G, [(1, 2), (3, 4), (1, 4), (5, 6), (3, 5)])
    for u, v, p in preds:
        print('(%d, %d) -> %.8f' % (u, v, p))
    print('****************************')

    # Common neighbours
    print(sorted(nx.common_neighbors(G, 1, 2)))
    print('****************************')

    # jaccard coefficient
    preds = nx.jaccard_coefficient(G, [(1, 2), (3, 4), (1, 4), (5, 6), (3, 5)])
    for u, v, p in preds:
        print('(%d, %d) -> %.8f' % (u, v, p))
    print('****************************')

    # Adamic-Adar
    preds = nx.adamic_adar_index(G, [(1, 2), (3, 4), (1, 4), (5, 6), (3, 5)])
    for u, v, p in preds:
        print('(%d, %d) -> %.8f' % (u, v, p))
    print('****************************')

    # Preferential Attachment (PA)
    preds = nx.preferential_attachment(G, [(1, 2), (3, 4), (1, 4), (5, 6), (3, 5)])
    for u, v, p in preds:
        print('(%d, %d) -> %.8f' % (u, v, p))
    print('****************************')
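# A minimal driver for SimilarityMeasures (illustrative only; any small graph
# containing nodes 1 through 6 works, this particular edge set is an assumption):
import networkx as nx

G = nx.Graph([(1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (1, 3), (2, 4)])
SimilarityMeasures(G)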
def networkx_call(M, benchmark_callable=None):
    sources = M["0"]
    destinations = M["1"]
    edges = []
    for i in range(len(sources)):
        edges.append((sources[i], destinations[i]))
        edges.append((destinations[i], sources[i]))
    edges = list(dict.fromkeys(edges))
    edges = sorted(edges)
    # in NVGRAPH tests we read as CSR and feed as CSC, so here we do this
    # explicitly
    print("Format conversion ... ")

    # NetworkX graph
    Gnx = nx.from_pandas_edgelist(M, source="0", target="1",
                                  create_using=nx.Graph())

    # Networkx Jaccard Call
    print("Solving... ")
    if benchmark_callable is not None:
        preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges)
    else:
        preds = nx.jaccard_coefficient(Gnx, edges)

    coeff = []
    for u, v, p in preds:
        coeff.append(p)
    return coeff
def feature_calculate(g, data, column_names, save_to):
    pairs = list(map(lambda x: (x[0], x[1]), data))
    jaccard = nx.jaccard_coefficient(g, pairs)
    preferential = nx.preferential_attachment(g, pairs)
    rai = nx.resource_allocation_index(g, pairs)
    # shortest path
    total = len(data)
    current = 0
    for row_data in zip(data, jaccard, preferential, rai):
        row = row_data[0]
        try:
            thisjaccard = row_data[1][2]
        except:
            thisjaccard = -1
        try:
            thispreferential = row_data[2][2]
        except:
            thispreferential = -1
        try:
            thisrai = row_data[3][2]
        except:
            thisrai = -1
        # pred = row_data[1]
        # resource_allocation_index = pred[2]
        if current % 1000 == 0:
            ut.log("calculating {}/{}...".format(current, total))
        path_length = 99999
        try:
            path = nx.shortest_path(g, row[0], row[1], 'weight')
            path_length = len(path)
        except:
            pass
        # shortest path
        row.insert(-1, path_length)
        # jaccard
        row.insert(-1, thisjaccard)
        # preferential
        row.insert(-1, thispreferential)
        # rai
        row.insert(-1, thisrai)
        current += 1
    original_columns_titles = list(column_names)
    original_columns_titles.insert(-1, "shortest_path_count")
    original_columns_titles.insert(-1, "jaccard")
    original_columns_titles.insert(-1, "preferential")
    original_columns_titles.insert(-1, "rai")
    data.insert(0, original_columns_titles)
    ut.write_list_csv(save_to, data)
def jaccard_cancel_list(nodes, G, effort):
    print("\n \t Calculating jaccard coefficients", end="")
    Removed_Edge = list()
    global jaccard_list
    max_number_of_edges = 0
    if len(jaccard_list) > 1:
        if effort != 0:
            max_number_of_edges = int(len(jaccard_list) * (effort / 100)) - 1
            Removed_Edge = jaccard_list[0:max_number_of_edges]
    else:
        for node in G:
            successors = G.successors(node)
            for successor in successors:
                jaccard = nx.jaccard_coefficient(nx.Graph(G), [(node, successor)])
                for node1, node2, jacc_coff in jaccard:
                    if jacc_coff < 0.4:
                        jaccard_list.append((node, successor))
        if effort != 0:
            max_number_of_edges = int(len(jaccard_list) * (effort / 100)) - 1
            Removed_Edge = jaccard_list[0:max_number_of_edges]
    return Removed_Edge
def test_jaccard_two_hop_edge_vals(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)
    Gnx = nx.from_pandas_edgelist(M, source="0", target="1",
                                  edge_attr="weight", create_using=nx.Graph())
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")

    pairs = (G.get_two_hop_neighbors()
             .sort_values(["first", "second"])
             .reset_index(drop=True))
    nx_pairs = []
    for i in range(len(pairs)):
        nx_pairs.append((pairs["first"].iloc[i], pairs["second"].iloc[i]))
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = []
    for u, v, p in preds:
        nx_coeff.append(p)
    df = cugraph.jaccard(G, pairs)
    df = df.sort_values(by=["source", "destination"]).reset_index(drop=True)
    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i])
        assert diff < 1.0e-6
def make_jacc_predG(dirG, testG):
    # Given the original digraph dirG, and the testG digraph to test against,
    # return predicted digraph predG with arcs matching testG,
    # each with the predicted likelihood according to the Jaccard coefficient
    undG = dirG.to_undirected()
    undir_jaccs = nx.Graph()
    undir_jaccs.add_weighted_edges_from(
        nx.jaccard_coefficient(undG, testG.edges_iter()))
    return make_predG_from_jacc(undir_jaccs, dirG, testG)
def Jaccard_coef(features, G):
    J = []
    for i in range(features.shape[0]):
        a = features['From'][i]
        b = features['To'][i]
        pred = nx.jaccard_coefficient(G, [(a, b)])
        for u, v, p in pred:
            J.append(p)
    return J
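# Illustrative driver for Jaccard_coef, assuming `features` is a pandas
# DataFrame with 'From'/'To' node columns (the column names come from the
# snippet above; the data here is made up):
import networkx as nx
import pandas as pd

G = nx.karate_club_graph()
features = pd.DataFrame({'From': [0, 1, 2], 'To': [33, 32, 8]})
print(Jaccard_coef(features, G))  # one coefficient per (From, To) row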
def sort_edges_by_jaccard_index(graph, edges):
    #from random import shuffle
    # shuffle(edges)
    edges_sorted = sorted(list(nx.jaccard_coefficient(graph, edges)),
                          key=lambda l: l[2], reverse=True,
                          cmp=compare_with_ties)
    unique_j_edges = len(np.unique(np.array(edges_sorted)[:, 2]))
    total_j_edges = len(edges)
    print "Degeneracy= ", 1.0 - float(unique_j_edges) / float(total_j_edges)
    return ([(row[0], row[1]) for row in edges_sorted],
            [row[2] for row in edges_sorted])
def calculate_jaccard_similarity(graph):
    preds = nx.jaccard_coefficient(graph, graph.edges())
    hash_new = {}
    for u, v, p in preds:
        hash_new[str(u) + ',' + str(v)] = p
    arr_nodes = max(hash_new.iteritems(), key=operator.itemgetter(1))[0]
    arr_value = max(hash_new.iteritems(), key=operator.itemgetter(1))[1]
    print("nodes")
    print(arr_nodes)
    print("Jaccard Coefficient")
    print(arr_value)
def graph_stats(distance_couple, net):
    distances = []
    common_neighbors = []
    jaccard = []
    adamic = []
    edge_bet = []
    edge_betweeness = nx.edge_betweenness_centrality(net)
    for couple in distance_couple:
        distances.append(couple[1])
        common_neighbors.append(len(list(nx.common_neighbors(net, couple[0][0], couple[0][1]))))
        jaccard.append(list(nx.jaccard_coefficient(net, [(couple[0][0], couple[0][1])]))[0][2])
        adamic.append(list(nx.adamic_adar_index(net, [(couple[0][0], couple[0][1])]))[0][2])
        try:
            edge_bet.append(edge_betweeness[couple[0]])
        except KeyError:
            edge_bet.append(edge_betweeness[(couple[0][1], couple[0][0])])
    r_dist = 10.0 / max(distances)
    r_n = 10.0 / max(common_neighbors)
    r_j = 10.0 / max(jaccard)
    r_a = 10.0 / max(adamic)
    r_e = 10.0 / max(edge_bet)
    distances = [j * r_dist for j in distances]
    common_neighbors = [j * r_n for j in common_neighbors]
    jaccard = [j * r_j for j in jaccard]
    adamic = [j * r_a for j in adamic]
    edge_bet = [j * r_e for j in edge_bet]

    plt.loglog(common_neighbors, color='b', label='common_neighbors')
    plt.loglog(distances, color='r', label='distances')
    plt.savefig('node_similarity/stats_cm.png', format='png')
    plt.close()
    plt.loglog(jaccard, color='b', label='jaccard')
    plt.loglog(distances, color='r', label='distances')
    plt.savefig('node_similarity/stats_j.png', format='png')
    plt.close()
    plt.loglog(adamic, color='b', label='adamic')
    plt.loglog(distances, color='r', label='distances')
    plt.savefig('node_similarity/stats_aa.png', format='png')
    plt.close()
    plt.loglog(edge_bet, color='b', label='edge betweenness')
    plt.loglog(distances, color='r', label='distances')
    plt.savefig('node_similarity/stats_eb.png', format='png')
    plt.close()
def users_to_recommend(self, nb_reco_user=5):
    """
    Compute the authors to recommend to the user based on link prediction and similarity.
    :param nb_reco_user: number of users to recommend
    :return: list of recommended author names
    """
    ebunch = []
    authors = set(self.graph.nodes())
    authors.remove(self.user.id)
    for a in self.authors_liked:
        authors.remove(a)
    for author in authors:
        ebunch.append((self.user.id, author))
    preds = nx.jaccard_coefficient(self.graph, ebunch)
    reco_prio = []
    for u, a, p in preds:
        reco_prio.append({'author_name': a, 'prio': p})
    reco_prio = sorted(reco_prio, key=lambda k: k['prio'], reverse=True)[:nb_reco_user]
    reco_prio = self.rerank_authors(reco_prio)
    return [x['author_name'] for x in reco_prio]
import networkx as nx

G = nx.read_graphml('networkx_graph.graphml')
undirected_G = G.to_undirected()
jaccard_similarity = nx.jaccard_coefficient(undirected_G)
# each item is a (u, v, p) triple, so sort on index 2, the coefficient
sorted_js = sorted(jaccard_similarity, key=lambda tup: tup[2], reverse=True)
print "most similar nodes by jaccard similarity: " + str(sorted_js[0])
import networkx as nx
import matplotlib.pyplot as plt
import operator

# read_edgelist's second positional argument is the comment character,
# not a file mode, so the stray 'rb' is dropped
G = nx.read_edgelist("anoncsv.csv", delimiter=',')

#PageRank
c = nx.pagerank(G)
sorted_c = sorted(c.items(), key=operator.itemgetter(1))

#EigenVector Centrality
c = nx.eigenvector_centrality(G)
sorted_c = sorted(c.items(), key=operator.itemgetter(1))

#Degree centrality
c = nx.degree_centrality(G)
sorted_c = sorted(c.items(), key=operator.itemgetter(1))

#jaccard similarity
j = nx.jaccard_coefficient(G)
for u, v, p in j:
    print u, v, p
    print '\n'
## The following code lines find the rank correlation among the 3 centrality values. Values are also stored in the rank_correlations.txt file.
f = open("C:\\Users\\tyagi\\Desktop\\Fall2015\\SMM\\Project 1\\Phase 1\\P3\\rank_correlations.txt", 'w')
tau, p_value = st.kendalltau(pg, ev)
print("Rank correlation between PG and EV : (Tau = '%f', p_value = '%f')" % (tau, p_value))
f.write("Rank correlation between PG and EV : Tau = " + str(tau) + ', p_value = ' + str(p_value) + '\n')
tau, p_value = st.kendalltau(pg, deg)
print("Rank correlation between PG and DC : (Tau = '%f', p_value = '%f')" % (tau, p_value))
f.write("Rank correlation between PG and DC : Tau = " + str(tau) + ', p_value = ' + str(p_value) + '\n')
tau, p_value = st.kendalltau(ev, deg)
print("Rank correlation between EV and DC : (Tau = '%f', p_value = '%f')" % (tau, p_value))
f.write("Rank correlation between EV and DC : Tau = " + str(tau) + ', p_value = ' + str(p_value) + '\n')
f.close()

# Similarity using JACCARD Coefficient
print("Finding jaccard similarity :")
# jaccard_coefficient() returns an iterator over triples (u, v, p), where u and v are the edge's
# endpoints and p is the Jaccard coefficient. Collect them in a list and sort by the coefficient.
jac_coef = list(nx.jaccard_coefficient(h, h.edges()))
jac_coef.sort(key=lambda t: t[2], reverse=True)  # sort by coefficient, not by node label
print("Top two pair of nodes in terms of jaccard coefficient are :")
i = 0
while i < 2:
    print(jac_coef[i])
    i += 1
import networkx as nx
import operator
# calculate jaccard coefficient
import csv

fe = open('anonymized_edge_list.csv', 'rb')
reader = csv.reader(fe)
#column = []
#c_p = []
G = nx.Graph()
edge_list = []
for x in reader:
    edge_list.append(x)
    print x
    G.add_edge(x[0], x[1])

fja = open('jaccard.csv', 'wb')
writer = csv.writer(fja)
array = nx.jaccard_coefficient(G, edge_list)
for c, v, p in array:
    l = [c, v, p]
    writer.writerow(l)
    # column.append(l)
fja.close()
fe.close()
def link_prediction(G, query_nodes, target_nodes, n_edges, start_dist, alg="ra"):
    """Selects a set of links between the query and target sets based on the
    scores calculated by a standard link-prediction algorithm from the
    networkx library.

    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected.
    query_nodes : list
        The set of nodes from which the random walker starts.
    target_nodes : list
        The set of nodes where the random walker ends.
    n_edges : integer
        The number of links to be added.
    start_dist : list
        The starting distribution over the query set.
    alg : string
        A string describing the link-prediction algorithm to be used.

    Returns
    -------
    links : list
        The set of links that reduce the absorbing RW centrality.
    ac_scores : list
        The set of scores of adding the links.
    """
    assert alg in ["ra", "pa", "jaccard", "aa"], \
        "alg must be one of [\"ra\", \"pa\", \"jaccard\", \"aa\"]."
    H = G.copy()
    query_set_size = len(query_nodes)
    map_query_to_org = dict(zip(query_nodes, range(query_set_size)))
    P = csc_matrix(nx.google_matrix(H, alpha=1))
    P_abs = P[list(query_nodes), :][:, list(query_nodes)]
    F = compute_fundamental(P_abs)
    # initial absorbing centrality: start_dist dotted with the row sums of F,
    # matching the per-step computation in the loop below
    row_sums = start_dist.dot(F.sum(axis=1))[0, 0]
    candidates = list(product(query_nodes, target_nodes))
    eligible = [candidates[i] for i in range(len(candidates))
                if not H.has_edge(candidates[i][0], candidates[i][1])]
    links_to_add = []
    if alg == 'ra':
        preds = nx.resource_allocation_index(H, eligible)
    elif alg == 'jaccard':
        preds = nx.jaccard_coefficient(H, eligible)
    elif alg == 'aa':
        preds = nx.adamic_adar_index(H, eligible)
    elif alg == 'pa':
        preds = nx.preferential_attachment(H, eligible)
    for u, v, p in preds:
        links_to_add.append((u, v, p))
    links_to_add.sort(key=lambda x: x[2], reverse=True)

    ac_scores = []
    ac_scores.append(row_sums)
    i = 0
    while i < n_edges:
        F_updated = update_fundamental_mat(F, H, map_query_to_org, links_to_add[i][0])
        H.add_edge(links_to_add[i][0], links_to_add[i][1])
        abs_cen = start_dist.dot(F_updated.sum(axis=1))[0, 0]
        F = F_updated
        ac_scores.append(abs_cen)
        i += 1
    return links_to_add, ac_scores
import networkx as nx
import heapq

with open('U_tuples.csv', mode='r') as infile:
    G = nx.read_edgelist(infile, delimiter=',', create_using=nx.DiGraph())
UG = G.to_undirected()
preds = nx.jaccard_coefficient(UG)

nd = {}
count = 0
check = 10000

def jac_check():
    ndmax = heapq.nlargest(1, nd, nd.get)
    if nd[ndmax[0]] == 1:
        with open('jaccard.txt', mode='wb') as outfile:
            outfile.write("Jaccard Similarity Pair: " + str(ndmax) + "\n")
            outfile.write("Jaccard Similarity Maximum: " + str(nd[ndmax[0]]) + "\n")

for u, v, p in preds:
    nd[(u, v)] = p
    count += 1
    print 'Jaccard: ', count
    ndmax = []
    if count == check:
        jac_check()
        check += 10000 + count

print nd

#ndmax = []
#ndmax = heapq.nlargest(1,nd,nd.get)
#print ndmax