def adamic_adar_scores(g_train, train_test_split):
    if g_train.is_directed():  # Only works for undirected graphs
        g_train = g_train.to_undirected()

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split  # Unpack input

    start_time = time.time()
    aa_scores = {}

    # Calculate scores
    aa_matrix = np.zeros(adj_train.shape)
    for u, v, p in nx.adamic_adar_index(g_train, ebunch=get_ebunch(train_test_split)):
        # (u, v) = node indices, p = Adamic-Adar index
        aa_matrix[u][v] = p
        aa_matrix[v][u] = p  # make sure it's symmetric
    aa_matrix = aa_matrix / aa_matrix.max()  # Normalize matrix

    runtime = time.time() - start_time
    aa_roc, aa_ap = get_roc_score(test_edges, test_edges_false, aa_matrix)

    aa_scores['test_roc'] = aa_roc
    # aa_scores['test_roc_curve'] = aa_roc_curve
    aa_scores['test_ap'] = aa_ap
    aa_scores['runtime'] = runtime
    return aa_scores
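# Minimal, self-contained sketch (toy graph, hypothetical node pairs) of what the loop
# above consumes: nx.adamic_adar_index yields (u, v, p) triples, where p is the sum over
# common neighbors w of 1 / log(degree(w)). Everything here is illustrative only.
import networkx as nx
import numpy as np

_g = nx.Graph([(0, 1), (0, 2), (1, 2), (2, 3)])
_pairs = [(0, 3), (1, 3)]
_aa = np.zeros((4, 4))
for u, v, p in nx.adamic_adar_index(_g, _pairs):
    _aa[u][v] = p
    _aa[v][u] = p
# _aa now holds the symmetric, unnormalized Adamic-Adar scores for the sampled pairs.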
def getaa(i):
    t = time.time()
    l = list(nx.adamic_adar_index(g, E[i:i + step]))
    t = time.time() - t
    print(' finished edges [{: 7d}, {: 7d}) in {: 7.2f} s\n'
          .format(i, i + step, t), end='')
    sys.stdout.flush()
    return l
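# Hedged sketch of how a chunked worker like getaa might be driven in parallel.
# 'g', 'E', and 'step' are assumed to be module-level globals (as getaa itself assumes);
# the pool size and the driver function name are illustrative, not part of the original.
from multiprocessing import Pool

def score_all_pairs():
    starts = range(0, len(E), step)
    with Pool(4) as pool:
        chunks = pool.map(getaa, starts)
    return [triple for chunk in chunks for triple in chunk]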
def train_feature(fn_list, input_file='../../output/svm_feature_merge/training.txt(split)'): percentage_of_initial_adopter = 0.1 g = load_graph('../../data/graph.txt') b = load_json('../../data/Business.txt') b = {b['business_id'][i]: ((b['latitude'][i], b['longitude'][i]), b['stars'][i]) for i in range(len(b['business_id']))} u_location = load_user_location('../../data/user_location.txt') t = load_idea(input_file) for fn in fn_list: fn = '../../output/svm_feature_merge/' + fn d = __load(fn) with open(fn+'(f0).txt', 'w') as f0: with open(fn+'(f1).txt', 'w') as f1: with open(fn+'(f2).txt', 'w') as f2: with open(fn+'(f3).txt', 'w') as f3: for k, v in d.items(): print k user = [q for q, qq in v] label = [qq for q, qq in v] n = get_node_by_idea2(t, k) print len(n['date']) index = sorted(range(len(n['date'])), key=lambda k: n['date'][k]) # level = [n['level'][i] for i in index] node = [n['node'][i] for i in index] initial_adopters = node[: int(percentage_of_initial_adopter*len(node))] assert(len(set(user) & set(initial_adopters)) == 0) # b_avg_stars = np.average([level[: 0.1*len(level)]]) for e, lb in zip(user,label): features = [] # Distance with business features.append(__distance(b[k][0], u_location[e])) # Difference between user average stars and business average stars by adopters # Average distance with initial adopters features.append(np.average([__distance(u_location[a], u_location[e]) for a in initial_adopters])) # Percentage of friends in initial adopters features.append(float(sum([1 if g.has_edge(e, a) else 0 for a in initial_adopters]))/len(initial_adopters)) # Average adar preds = nx.adamic_adar_index(g, [(e, a) for a in initial_adopters if g.has_node(a) and g.has_node(e)]) try: preds = [p for u, v, p in preds] preds = preds if preds else 0 except Exception: preds = 0 features.append(np.average(preds)) f0.write('{0} {1} {2} {3}\n'.format(k, e, features[0], lb)) f1.write('{0} {1} {2} {3}\n'.format(k, e, features[1], lb)) f2.write('{0} {1} {2} {3}\n'.format(k, e, features[2], lb)) f3.write('{0} {1} {2} {3}\n'.format(k, e, features[3], lb))
def predictLinksAdamicAdar(nodesAtHop, itemNodeIds, userNodeIds, directory, item):
    scores = {}
    GCombined = nx.read_edgelist(directory + 'Edge_List_Combined_' + item + '.txt')
    preds = nx.adamic_adar_index(GCombined)
    for u, v, p in preds:
        if u not in scores:
            scores[u] = {}
        scores[u][v] = p
    with open(directory + 'AdamicAdar', 'wb') as outfile:
        pickle.dump(scores, outfile)
def graph_stats(distance_couple, net):
    distances = []
    common_neighbors = []
    jaccard = []
    adamic = []
    edge_bet = []
    edge_betweeness = nx.edge_betweenness_centrality(net)
    for couple in distance_couple:
        distances.append(couple[1])
        common_neighbors.append(len(list(nx.common_neighbors(net, couple[0][0], couple[0][1]))))
        jaccard.append(list(nx.jaccard_coefficient(net, [(couple[0][0], couple[0][1])]))[0][2])
        adamic.append(list(nx.adamic_adar_index(net, [(couple[0][0], couple[0][1])]))[0][2])
        try:
            edge_bet.append(edge_betweeness[couple[0]])
        except KeyError:
            edge_bet.append(edge_betweeness[(couple[0][1], couple[0][0])])
    r_dist = 10.0 / max(distances)
    r_n = 10.0 / max(common_neighbors)
    r_j = 10.0 / max(jaccard)
    r_a = 10.0 / max(adamic)
    r_e = 10.0 / max(edge_bet)
    distances = [j * r_dist for j in distances]
    common_neighbors = [j * r_n for j in common_neighbors]
    jaccard = [j * r_j for j in jaccard]
    adamic = [j * r_a for j in adamic]
    edge_bet = [j * r_e for j in edge_bet]
    plt.loglog(common_neighbors, color='b', label='common_neighbors')
    plt.loglog(distances, color='r', label='distances')
    plt.savefig('node_similarity/stats_cm.png', format='png')
    plt.close()
    plt.loglog(jaccard, color='b', label='jaccard')
    plt.loglog(distances, color='r', label='distances')
    plt.savefig('node_similarity/stats_j.png', format='png')
    plt.close()
    plt.loglog(adamic, color='b', label='adamic')
    plt.loglog(distances, color='r', label='distances')
    plt.savefig('node_similarity/stats_aa.png', format='png')
    plt.close()
    plt.loglog(edge_bet, color='b', label='edge betweenness')
    plt.loglog(distances, color='r', label='distances')
    plt.savefig('node_similarity/stats_eb.png', format='png')
    plt.close()
def train_feature(input_file='../../data/training.txt(split)', output='../../output/svm_training.txt'): percentage_of_initial_adopter = 0.1 g = load_graph('../../data/graph.txt') b = load_json('../../data/Business.txt') b = {b['business_id'][i]: ((b['latitude'][i], b['longitude'][i]), b['stars'][i]) for i in range(len(b['business_id']))} u_location = load_user_location('../../data/user_location.txt') t = load_idea(input_file) ideas = list(set(t['idea'])) l = None for m in range(len(ideas)): print m n = get_node_by_idea2(t, ideas[m]) index = sorted(range(len(n['date'])), key=lambda k: n['date'][k]) # level = [n['level'][i] for i in index] node = [n['node'][i] for i in index] initial_adopters = node[: int(percentage_of_initial_adopter*len(node))] laters = node[int(percentage_of_initial_adopter*len(node)):] # b_avg_stars = np.average([level[: 0.1*len(level)]]) for e in laters: features = [] # Distance with business features.append(__distance(b[ideas[m]][0], u_location[e])) # Difference between user average stars and business average stars by adopters # Average distance with initial adopters features.append(np.average([__distance(u_location[a], u_location[e]) for a in initial_adopters])) # Percentage of friends in initial adopters features.append(float(sum([1 if g.has_edge(e, a) else 0 for a in initial_adopters]))/len(initial_adopters)) # Average adar preds = nx.adamic_adar_index(g, [(e, a) for a in initial_adopters if g.has_node(a) and g.has_node(e)]) try: preds = [p for u, v, p in preds] preds = preds if preds else 0 except Exception: preds = 0 features.append(np.average(preds)) features = np.array([features]) l = features if l is None else np.concatenate((l, features)) dump_svmlight_file(l, [1]*l.shape[0], output)
def test_feature(input='../../data/test_data/test_data_q1.txt', output='../../output/svm_testing_q1_'): g = load_graph('../../data/graph.txt') b = load_json('../../data/Business.txt') b = {b['business_id'][i]: ((b['latitude'][i], b['longitude'][i]), b['stars'][i]) for i in range(len(b['business_id']))} u_location = load_user_location('../../data/user_location.txt') t = load_idea('../../data/testing.txt') with open('../../data/testing_business.txt', 'r') as f: tb = f.read().strip().split() with open(input, 'r') as f: test = [[e for e in l.strip().split()] for l in f] for i, (business, initial_adopters) in enumerate(zip(tb, test)): print i l = None answers = [] n = get_node_by_idea2(t, business) ans = set(n['node'])-set(initial_adopters) candidates = set(g.nodes())-set(initial_adopters) for e in candidates: features = [] # Distance with business features.append(__distance(b[business][0], u_location[e])) # Difference between user average stars and business average stars by adopters # Average distance with initial adopters features.append(np.average([__distance(u_location[a], u_location[e]) for a in initial_adopters])) # Percentage of friends in initial adopters features.append(float(sum([1 if g.has_edge(e, a) else 0 for a in initial_adopters]))/len(initial_adopters)) # Average adar preds = nx.adamic_adar_index(g, [(e, a) for a in initial_adopters if g.has_node(a) and g.has_node(e)]) try: preds = [p for u, v, p in preds] preds = preds if preds else 0 except Exception: preds = 0 features.append(np.average(preds)) features = np.array([features]) l = features if l is None else np.concatenate((l, features)) answers.append(1 if e in ans else 0) dump_svmlight_file(l, answers, output+str(i)+'.txt')
def calculate_similarities(E1, tmp_string=None): features = [] ytarget = [] tmp_feat = {} tmp_y = {} temp_common_neighbors = [] node_list = list(E1.nodes) for node1 in node_list: for node2 in node_list: connected = 0 if tmp_string == "n": if nodes_connected(E1, node1, node2): continue else: temp = nx.jaccard_coefficient(E1, [(node1, node2)]) for u, v, p in temp: tmp_feat[u, v] = [p] tmp_y[u, v] = connected temp = nx.adamic_adar_index(E1, [(node1, node2)]) try: for u, v, p in temp: tmp_feat[u, v].append(p) tmp_y[u, v] = connected except: tmp_feat[u, v].append(0.0) tmp_y[u, v] = connected temp = nx.preferential_attachment(E1, [(node1, node2)]) for u, v, p in temp: tmp_feat[u, v].append(p) tmp_y[u, v] = connected temp = sorted(nx.common_neighbors(E1, node1, node2)) temp_common_neighbors = [] temp = sorted(nx.common_neighbors(E1, node1, node2)) for common_neighbor in temp: temp_common_neighbors.append(common_neighbor) tmp_feat[node1, node2].append(len(temp_common_neighbors)) #Ill use Exception Checking cause we have DiGraph and maybe there is no path between to nodes #so an error would compile try: length = nx.shortest_path_length(E1, node1, node2) tmp_feat[node1, node2].append(length) tmp_y[node1, node2] = connected except: tmp_feat[node1, node2].append(0.0) tmp_y[node1, node2] = connected else: if nodes_connected(E1, node1, node2): connected = 1 temp = nx.jaccard_coefficient(E1, [(node1, node2)]) for u, v, p in temp: tmp_feat[u, v] = [p] tmp_y[u, v] = connected temp = nx.adamic_adar_index(E1, [(node1, node2)]) try: for u, v, p in temp: tmp_feat[u, v].append(p) tmp_y[u, v] = connected except: tmp_feat[u, v].append(0.0) tmp_y[u, v] = connected temp = nx.preferential_attachment(E1, [(node1, node2)]) for u, v, p in temp: tmp_feat[u, v].append(p) tmp_y[u, v] = connected temp = sorted(nx.common_neighbors(E1, node1, node2)) temp_common_neighbors = [] temp = sorted(nx.common_neighbors(E1, node1, node2)) for common_neighbor in temp: temp_common_neighbors.append(common_neighbor) tmp_feat[node1, node2].append(len(temp_common_neighbors)) #Ill use Exception Checking cause we have DiGraph and maybe there is no path between to nodes #so an error would compile try: length = nx.shortest_path_length(E1, node1, node2) tmp_feat[node1, node2].append(length) tmp_y[node1, node2] = connected except: tmp_feat[node1, node2].append(0.0) tmp_y[node1, node2] = connected for key, value in tmp_feat.items(): features.append(value) for key, value in tmp_y.items(): ytarget.append(value) features_numpy = numpy.array(features) y_target_numpy = numpy.array(ytarget) return features_numpy, y_target_numpy
    G, [(i, j)]))  # do >0
# print(prediction2)
# print(prediction2[0][2])
# prediction2 = prediction2[0][2]
# print('Model2 Pred:' + str(prediction2))

########## Jaccard Coefficient ##########
prediction3 = sorted(nx.jaccard_coefficient(G, [(i, j)]))  # do >0
# print(prediction3)
# print(prediction3[0][2])
# prediction3 = prediction3[0][2]
# print('Model3 Pred:' + str(prediction3))

########## Adamic Adar Index ##########
prediction4 = sorted(nx.adamic_adar_index(G, [(i, j)]))  # do >0
# print(prediction4)
# print(prediction4[0][2])
# prediction4 = prediction4[0][2]
# print('Model4 Pred:' + str(prediction4))

########## Preferential Attachment ##########
prediction5 = sorted(nx.preferential_attachment(
    G, [(i, j)]))  # do >0
# print(prediction5)
# print(prediction5[0][2])
# prediction5 = prediction5[0][2]
# print('Model5 Pred:' + str(prediction5))

########## Tried a couple of different ways to get the score up ##########
########## Really thought something like this would work well ##########
def predict(self, node_pairs):
    predictions = adamic_adar_index(self.graph, node_pairs)
    return list(predictions)
def link_prediction(G, query_nodes, target_nodes, n_edges, start_dist, alg = "ra"): """Selects a random set of links between based on the scores calculated by a standard link-prediction algorithm from networkx library Parameters ---------- G : Networkx graph The graph from which the team will be selected. query : list The set of nodes from which random walker starts. target : list The set of nodes from where the random walker ends. n_edges : integer the number of links to be added start_dist: list The starting distribution over the query set alg: string A string describing the link-prediction algorithm to be used Returns ------- links : list The set of links that reduce the absorbing RW centrality ac_scores: list The set of scores of adding the links """ assert alg in ["ra", "pa", "jaccard", "aa"], "alg must be one of [\"ra\", \"pa\", \"jaccard\", \"aa\"]." H = G.copy() query_set_size = len(query_nodes) map_query_to_org = dict(zip(query_nodes, range(query_set_size))) P = csc_matrix(nx.google_matrix(H, alpha=1)) P_abs = P[list(query_nodes),:][:,list(query_nodes)] F = compute_fundamental(P_abs) row_sums = start_dist.dot(F.sum())[0,0] candidates = list(product(query_nodes, target_nodes)) eligible = [candidates[i] for i in range(len(candidates)) if H.has_edge(candidates[i][0], candidates[i][1]) == False] links_to_add = [] if alg == 'ra': preds = nx.resource_allocation_index(H, eligible) elif alg == 'jaccard': preds = nx.jaccard_coefficient(H, eligible) elif alg == 'aa': preds = nx.adamic_adar_index(H, eligible) elif alg == 'pa': preds = nx.preferential_attachment(H, eligible) for u,v,p in preds: links_to_add.append((u,v,p)) links_to_add.sort(key=lambda x: x[2], reverse = True) ac_scores = [] ac_scores.append(row_sums) i = 0 while i < n_edges: F_updated = update_fundamental_mat(F, H, map_query_to_org, links_to_add[i][0]) H.add_edge(links_to_add[i][0], links_to_add[i][1]) abs_cen = start_dist.dot(F_updated.sum(axis = 1))[0,0] F = F_updated ac_scores.append(abs_cen) i += 1 return links_to_add, ac_scores
print('Reading %s_topological_network.csv...' % (prog_languages[prog_lang_id]))
t_network = []
for row in data:
    dev_id_1 = int(row[0])
    dev_id_2 = int(row[1])
    t_network.append((dev_id_1, dev_id_2))
csvfile.close()

with open('../Files/topological_metrics.csv', 'a') as a:
    metrics_file = csv.writer(a, delimiter=',')
    print('Writing topological metrics for', prog_languages[prog_lang_id])
    for dev_pair in t_network:
        neighborhood_overlap = nx.jaccard_coefficient(G, [dev_pair])
        adamic_adar = nx.adamic_adar_index(G, [dev_pair])
        preferential_attachment = nx.preferential_attachment(G, [dev_pair])
        for u, v, p in neighborhood_overlap:
            NO = p
        for u, v, p in adamic_adar:
            AA = p
        for u, v, p in preferential_attachment:
            PA = p
        metrics_file.writerow(
            [prog_lang_id, dev_pair[0], dev_pair[1], NO, AA, PA])
a.close()
def calc_indexes(G, test_edges, aa=False):
    """
    Input -
        G - the graph based upon which we calculate the indices
        test_edges - the edges for which we calculate the indices
        aa - whether to calculate the Adamic-Adar index or not

    Calculates all the indices of the graph for the test_edges, such as
    1) Common Neighbors
    2) Jaccard
    3) Preferential Attachment
    4) Adamic-Adar
    """
    # All indexes stored as a dictionary
    indexes = {}

    # Initializing
    jaccard_arr = []
    common_arr = []
    preferential_arr = []
    adamic_adder = []

    aa1 = nx.adamic_adar_index(G, test_edges)
    jc = nx.jaccard_coefficient(G, test_edges)
    pa = nx.preferential_attachment(G, test_edges)

    for edge in test_edges:
        # Loading the nodes
        node1 = edge[0]
        node2 = edge[1]
        node_list = [node1, node2]

        # Neighbors of the nodes
        node1_neighbors = set(list(G.neighbors(node1)))
        node2_neighbors = set(list(G.neighbors(node2)))

        # Union of the neighbors
        union_neighbors = list(node1_neighbors.union(node2_neighbors))

        # Intersection of the neighbors
        intersection_neighbors = list(
            node1_neighbors.intersection(node2_neighbors))

        # Jaccard Index
        jaccard_index = 0
        if len(union_neighbors) != 0:
            jaccard_index = len(intersection_neighbors) / len(union_neighbors)

        # Common Neighbors
        common_neighbors = len(intersection_neighbors)

        # Preferential Attachment
        preferential_attachment = len(node1_neighbors) * len(node2_neighbors)

        # Appending the indexes into the arrays
        jaccard_arr.append(jaccard_index)
        common_arr.append(common_neighbors)
        preferential_arr.append(preferential_attachment)

        # If the Adamic-Adar index is to be calculated
        if aa == True:
            adamic_add = 0
            for neighbor in intersection_neighbors:
                nfneighbors_of_neighbors = len(list(G.neighbors(neighbor)))
                # Adamic-Adar doesn't make sense for nfneighbors_of_neighbors == 0 or 1
                if nfneighbors_of_neighbors != 1 and nfneighbors_of_neighbors != 0:
                    aa_score = 1 / np.log(nfneighbors_of_neighbors)
                    adamic_add += aa_score
            adamic_adder.append(adamic_add)

    indexes['jaccard_index'] = jaccard_arr
    indexes['common_neighbors'] = common_arr
    indexes['preferential_attachment'] = preferential_arr
    indexes['adamic_adder'] = adamic_adder

    # print(adamic_adder[:10], aa[:10])
    # print(jaccard_arr[:10], jc[:10])
    # print(preferential_arr[:10], pa[:10])
    # print(indexes['adamic_adder'])
    # print(adamic_adder)
    # print(jaccard_arr)
    # for u, v, p in jc:
    #     print(u, v, p)
    #     ind = test_edges.index((u, v))
    #     print(jaccard_arr[ind], ind)
    # for u, v, p in pa:
    #     print(u, v, p)
    #     ind = test_edges.index((u, v))
    #     print(preferential_arr[ind], ind)
    # for u, v, p in aa1:
    #     print(u, v, p)
    #     ind = test_edges.index((u, v))
    #     print(adamic_adder[ind], ind)

    return indexes
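# Hedged sanity check on a toy graph (illustrative only; assumes numpy is imported as np,
# as calc_indexes above already requires): the manual Adamic-Adar accumulation should
# agree with nx.adamic_adar_index for the same pairs, since both use the natural log.
import networkx as nx

_g = nx.Graph([('a', 'b'), ('a', 'c'), ('b', 'c'), ('c', 'd')])
_pairs = [('a', 'd'), ('b', 'd')]
_manual = calc_indexes(_g, _pairs, aa=True)['adamic_adder']
_from_nx = [p for _, _, p in nx.adamic_adar_index(_g, _pairs)]
assert all(abs(m - n) < 1e-12 for m, n in zip(_manual, _from_nx))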
    auth2.append(b)

length = len(auth1)
print length
for i in range(0, length):
    for j in range(0, length):
        lab = 0
        if (auth1[i] != auth2[j]):
            fet = ''
            d = 0
            f = 0
            pred1 = nx.adamic_adar_index(g, [(auth1[i], auth2[j])])
            pred2 = nx.jaccard_coefficient(g, [(auth1[i], auth2[j])])
            pred3 = nx.preferential_attachment(g, [(auth1[i], auth2[j])])
            for item in pred1:
                x, y, z = item
                if (z == 0.0):
                    d = 1
                fet = fet + str(z) + ' '
                #print x + ' '+ y +' ' + str(z)
            for item in pred2:
                x, y, z = item
                if (z == 0.0):
                    f = 1
                fet = fet + str(z) + ' '
def get_feature(source, target): features = {} def set_feature(name, val): if name not in features: features[name] = val def cosine_distance(node_list1, node_list2): id2index = dict([ (id, i) for i, id in enumerate(set(node_list1 + node_list2)) ]) a = np.zeros((len(id2index), )) b = np.zeros((len(id2index), )) for key in node_list1: a[id2index[key]] = 1 for key in node_list2: b[id2index[key]] = 1 #return distance.cosine(a, b) try: source_succ = set(digraph.successors(source)) source_pred = set(digraph.predecessors(source)) target_succ = set(digraph.successors(target)) target_pred = set(digraph.predecessors(target)) set_feature('len_source_successors', len(source_succ)) set_feature('len_target_successors', len(target_succ)) set_feature('len_source_predecessors', len(source_pred)) set_feature('len_target_predecessors', len(target_pred)) common_succ = len(source_succ.intersection(target_succ)) common_pred = len(source_pred.intersection(source_pred)) set_feature('common_successor_number', common_succ) set_feature('common_predecessor_number', common_pred) succ_union = source_succ.union(target_succ) pred_union = source_pred.union(target_pred) set_feature( 'jaccard_distance_between_successors', common_succ / len(succ_union) if len(succ_union) != 0 else 0) set_feature( 'jaccard_distance_between_predecessors', common_pred / len(pred_union) if len(pred_union) != 0 else 0) #set_feature('successor_cosine', cosine_distance(data[source], data[target])) #set_feature('predecessor_cosine', cosine_distance(source_pred, target_pred)) set_feature( 'shortest_path', nx.shortest_path_length(digraph, source, target) if digraph.has_edge(source, target) else 0) pref_attch = nx.preferential_attachment(graph, [(source, target)]) for u, v, p in pref_attch: set_feature('preference_attachment', p) # if graph.has_edge(source, target) else 0) aa_index = nx.adamic_adar_index(graph, [(source, target)]) for u, v, p in aa_index: set_feature('adamic_adar_index', p) # if graph.has_edge(source, target) else 0) jcd_coe = nx.jaccard_coefficient(graph, [(source, target)]) for u, v, p in jcd_coe: set_feature('jaccard_coefficient', p) # if graph.has_edge(source, target) else 0) reallo_index = nx.resource_allocation_index( graph, [(source, target)]) for u, v, p in reallo_index: set_feature('resource_allocation_index', p) # if graph.has_edge(source, target) else 0) set_feature('cluster_source', nx.clustering(graph, source)) set_feature('cluster_target', nx.clustering(graph, target)) set_feature('source_pagerank', pagerank[source]) set_feature('target_pagerank', pagerank[target]) set_feature('source_authorities', auth[source]) set_feature('target_authorities', auth[target]) set_feature('source_hubs', hub[source]) set_feature('target_hubs', hub[target]) #set_feature('source_core_num', core[source]) #set_feature('target_core_num', core[target]) except: pass return features
print("Number of edges deleted : %d" % edge_subset_G_size) print("Number of edges remaining : %d" % (t - edge_subset_G_size)) #6 Create a train set of 80 percent from G_test edges_to_remove_from_G_test = 0.201 removed_edges_from_G_test = random.sample( G_test.edges(), int(edges_to_remove_from_G_test * G_test.number_of_edges())) G_train = G_test.copy() G_train.remove_edges_from(removed_edges_from_G_test) edge_subset_G_test_size = len(list(removed_edges_from_G_test)) print("Number of edges deleted : %d" % edge_subset_G_test_size) print("Number of edges remaining : %d" % (t - edge_subset_G_size - edge_subset_G_test_size)) #6 Transform G_train and G_test to undirected G_train = G_train.to_undirected() G_test = G_test.to_undirected() #7 Calculate AA AUC pred_aa_train = list(nx.adamic_adar_index(G_train)) pred_aa_test = list(nx.adamic_adar_index(G_test)) score_aa, label_aa = zip(*[(s, (u, v) in removed_edges_from_G) for (u, v, s) in pred_aa_test]) auc_aa = roc_auc_score(label_aa, score_aa) #8 Print AUC and prediciton calculation time t2 = datetime.now() delta = t2 - t1 print(auc_aa, delta.seconds)
def make_adamic_adar_index_predG(dirG, testG):
    undG = dirG.to_undirected()
    undir_AAs = nx.Graph()
    undir_AAs.add_weighted_edges_from(nx.adamic_adar_index(undG, testG.edges_iter()))
    return make_predG_from_jacc(undir_AAs, dirG, testG)
random.shuffle(mytrain)
del test[0]
g = nx.Graph()
connect = []
for i in range(20000):
    t = train[i].split()
    for j in range(len(t)):
        g.add_edge(t[0], t[j])
for i in range(19999):
    t1 = train[i].split()
    t2 = train[i + 1].split()
    if g.has_edge(t1[0], t2[0]):
        connect.append([
            len(list(nx.common_neighbors(g, t1[0], t2[0]))),
            list(nx.adamic_adar_index(g, [(t1[0], t2[0])]))[0][2],
            list(nx.preferential_attachment(g, [(t1[0], t2[0])]))[0][2],
            list(nx.jaccard_coefficient(g, [(t1[0], t2[0])]))[0][2],
            list(nx.resource_allocation_index(g, [(t1[0], t2[0])]))[0][2],
            1
        ])
    else:
        connect.append([
            len(list(nx.common_neighbors(g, t1[0], t2[0]))),
            list(nx.adamic_adar_index(g, [(t1[0], t2[0])]))[0][2],
            list(nx.preferential_attachment(g, [(t1[0], t2[0])]))[0][2],
            list(nx.jaccard_coefficient(g, [(t1[0], t2[0])]))[0][2],
            list(nx.resource_allocation_index(g, [(t1[0], t2[0])]))[0][2],
            0
        ])
for i in range(300000):
    t0, t1, t2 = mytrain[i].split()
def adamic_adar(graph):
    output_file = open("data/imdb_b__adamic_adar", "w")
    for (i, (u, v, score)) in enumerate(nx.adamic_adar_index(graph, graph.edges_iter())):
        print i
        output_file.write("\t".join(map(str, [u, v, score])) + "\n")
    output_file.close()
    # plt.plot(fpr, tpr, marker='.', label=leg)
    plt.plot(recall, precision, marker='.', label=leg)
    plt.legend()
    return roc_score, ap_score, precision, recall


# ## 3. Adamic-Adar

# In[7]:

# Compute Adamic-Adar indexes from g_train
aa_matrix = np.zeros(adj.shape)
for u, v, p in nx.adamic_adar_index(g_train):  # (u, v) = node indices, p = Adamic-Adar index
    aa_matrix[u][v] = p
    aa_matrix[v][u] = p  # make sure it's symmetric

# Normalize array
aa_matrix = aa_matrix / aa_matrix.max()


# In[8]:

# Calculate ROC AUC and Average Precision
k = 0
aa_roc, aa_ap, precision, recall = get_roc_score1(test_edges, test_edges_false, aa_matrix)

print('Adamic-Adar Test ROC score: ', str(aa_roc))
# Drawing the Graph using matplotlib
nx.draw_networkx(UG, node_color=['red'], with_labels=True)
plt.show()

all_nodes = []
nodes = list(nx.nodes(UG))

# loop to append the pairs of nodes to a list
for x in nodes:
    for y in nodes:
        if (x != y and not all_nodes.__contains__({x, y})):
            all_nodes.append({x, y})

# Implementing Adamic-Adar
adamic_adar = nx.adamic_adar_index(UG, all_nodes)  # the graph and the pairs of nodes as parameters

# Print the values
print(" \nAdamic-Adar implementation \n")
max = 0
for u, v, p in adamic_adar:
    if (p > max):
        max = p
        sim_u = u
        sim_v = v
    print('{}, {} -> {:.5f}'.format(u, v, p))

print(f'\nThe most similar according to Adamic-Adar: {sim_u}, {sim_v} -> {max:.5f}')
# According to these results, the highest Adamic-Adar similarity is between u2 and u3
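# A possibly simpler way (hedged sketch, not part of the original) to build the unordered,
# non-repeating pairs above: itertools.combinations yields each pair exactly once as a
# tuple, which is the shape nx.adamic_adar_index expects. 'UG' is the graph defined above.
from itertools import combinations

pairs = list(combinations(UG.nodes(), 2))
adamic_adar = nx.adamic_adar_index(UG, pairs)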
    linklist.append([int(data[0]), int(data[1])])
    nodepair_set[random.randint(0, n_folds - 1)].append([int(data[0]), int(data[1])])
    # new_line = data[0] + ' ' + data[1] + ' 1\n'
    # f_w.write(new_line)

train_list = []
for templist in nodepair_set[0:8]:
    train_list = train_list + templist
test_list = nodepair_set[9]

nodelist = create_vertex(linklist)
train_adj = create_adjmatrix(train_list, nodelist)
test_adj = create_adjmatrix(test_list, nodelist)
# print(train_adj)

sim_cn = np.dot(train_adj, train_adj)
sim_bifan = np.dot(np.dot(train_adj, train_adj.T), train_adj)
# NOTE: nx.adamic_adar_index expects a networkx graph, not an adjacency matrix; the call
# below likely needs a graph built from train_adj (or a matrix-based AA implementation)
# to produce a similarity matrix comparable to the other sim_* variables.
sim_AA = nx.adamic_adar_index(train_adj)
sim_RA = AA(train_adj)
sim_IP = IP(train_adj, 0.8)
# sim_jaccard = Jaccard(train_adj)

# cn_score_1 = AUC.Calculation_AUC(train_adj, test_adj, sim_cn, len(nodelist))
# cn_score_2 = evaluationMetric.cal_AUC(train_adj, test_adj, sim_cn, 10000)
cn_score_3 = metric.auc_score(sim_cn, test_adj, train_adj, 'cc')
AA_score_3 = metric.auc_score(sim_AA, test_adj, train_adj, 'cc')
RA_score_3 = metric.auc_score(sim_RA, test_adj, train_adj, 'cc')
bifan_score_3 = metric.auc_score(sim_bifan, test_adj, train_adj, 'cc')
IP_score = metric.auc_score(sim_IP, test_adj, train_adj, 'cc')

print(cn_score_3)
print(AA_score_3)
rr = []

# for i in corenodes:
#     if i not in dataset:
#         print(i)

# for test
# for i in corenodes:
#     preds = nx.adamic_adar_index(G, nonedges(G, i))
#     tenlargest = heapq.nlargest(100, preds, key=lambda x: x[2])
#     for j in tenlargest:
#         rr.append(j)
# print(len(rr))  # == 21550
# result = heapq.nlargest(4000, rr, key=lambda x: x[2])

# for val
for i in val:
    preds = nx.adamic_adar_index(G, nonedges(G, i))
    tenlargest = heapq.nlargest(1000, preds, key=lambda x: x[2])
    for j in tenlargest:
        rr.append(j)
print(len(rr))  # == 21550
result = heapq.nlargest(2000, rr, key=lambda x: x[2])

endtime = datetime.datetime.now()
print("time", (endtime - starttime).seconds)

count = 0
for i in result:
    if i[2] == 0:
        count += 1
print(count)
print(len(result))
N = G.number_of_nodes()
nodelist = list(G.nodes())
print(nx.info(G))
print(nx.number_of_nodes(G))
print(nx.number_of_edges(G))
print(nx.is_directed(G))


def nonedges(G, u):
    # a generator with (u, v) for every non-neighbor v
    for v in nx.non_neighbors(G, u):
        yield (u, v)


for u in G.nodes():
    adar = nx.adamic_adar_index(G, nonedges(G, u))
    for v in nx.non_neighbors(G, u):
        com = nx.common_neighbors(G, u, v)
    jac = nx.jaccard_coefficient(G, nonedges(G, u))
    res = nx.resource_allocation_index(G, nonedges(G, u))
    pre = nx.preferential_attachment(G, nonedges(G, u))

allm = {m: {} for m in methods}

toponame = "datasetFourSquare" + ".csv"
with open(toponame, "w", newline="", encoding="utf-8") as f:  # binary mode for windows \r\n prob
    writer = csv.writer(f, delimiter=',')
    writer.writerow(['node1', 'node2'] + methods + ['label'])
    for p in total_edges:
        n1, n2 = p
# with open("adj_matrix.csv", "wb") as f: # writer = csv.writer(f) # writer.writerows(AJ_matrix) AJ_matrix = nx.adjacency_matrix(G) #print AJ_matrix(0,0) print AJ_matrix.shape # for ii in range(0, 1): # tmp_missing_links = [] # for jj in range(0, len(node_sets)): # if AJ_matrix[ii,jj] == 0: # tmp_missing_links.append((int(final_nodes[ii]),int(final_nodes[jj]))) # print (int(final_nodes[ii]),int(final_nodes[jj])) preds = nx.adamic_adar_index(G, ebunch=None) print max(need_to_check) #initial_node = 1 missing_links = [] old_u = 1 tmp_missing_links_score = [] tmp_missing_links_u = [] tmp_missing_links_v = [] for u, v, score in preds: if int(u) in need_to_check[0:1000]: #print u, old_u if int(u) == old_u: #print executed tmp_missing_links_score.append(score)
def set_edge_weight(self, edge_weight_method='weight'): if edge_weight_method == 'weight': return # Centrality based methods elif edge_weight_method == 'edge_betweenness_centrality': print("comptuing edge_betweenness_centrality..") C = nx.edge_betweenness_centrality(self.G, weight='weight') print("done!") elif edge_weight_method == 'edge_betweenness_centrality_subset': print("comptuing edge_betweenness_centrality_subset..") C = nx.edge_current_flow_betweenness_centrality(self.G, weight='weight') print('done') elif edge_weight_method == 'edge_current_flow_betweenness_centrality_subset': print( "comptuing edge_current_flow_betweenness_centrality_subset..") C = nx.edge_current_flow_betweenness_centrality_subset( self.G, weight='weight') print('done') elif edge_weight_method == 'edge_load_centrality': print("comptuing edge_load_centrality..") C = nx.edge_load_centrality(self.G) print('done!') # Link Prediction based methods elif edge_weight_method == 'adamic_adar_index': print("comptuing adamic_adar_index ..") preds = nx.adamic_adar_index(self.G, self.G.edges()) C = {} for u, v, p in preds: C[(u, v)] = p elif edge_weight_method == 'ra_index_soundarajan_hopcroft': print("comptuing ra_index_soundarajan_hopcroft ..") preds = nx.ra_index_soundarajan_hopcroft(self.G, self.G.edges()) C = {} for u, v, p in preds: C[(u, v)] = p elif edge_weight_method == 'preferential_attachment': print("comptuing preferential_attachment ..") preds = nx.preferential_attachment(self.G, self.G.edges()) C = {} for u, v, p in preds: C[(u, v)] = p #elif edge_weight_method=='cn_soundarajan_hopcroft': # print("comptuing cn_soundarajan_hopcroft ..") # preds=nx.cn_soundarajan_hopcroft(self.G,self.G.edges()) # C={} # for u, v, p in preds: # C[(u,v)]=p elif edge_weight_method == 'within_inter_cluster': print("comptuing within_inter_cluster ..") preds = nx.within_inter_cluster(self.G, self.G.edges()) C = {} for u, v, p in preds: C[(u, v)] = p elif edge_weight_method == 'resource_allocation_index': print("comptuing resource allocation index ..") preds = nx.resource_allocation_index(self.G, self.G.edges()) C = {} for u, v, p in preds: C[(u, v)] = p elif edge_weight_method == 'jaccard_coefficient': print("comptuing jaccard_coefficient..") preds = nx.jaccard_coefficient(self.G, self.G.edges()) C = {} for u, v, p in preds: C[(u, v)] = p print('done!') for u, v, d in self.G.edges(data=True): if edge_weight_method == None: d['weight'] = 1 else: d['weight'] = C[(u, v)] return 1
def get_edge_weight(self, i, j):
    aa_index = nx.adamic_adar_index(self._G, [(i, j)])
    return six.next(aa_index)[2]
nodes = nx.nodes(DG)
edges = nx.edges(DG)
non_edges = nx.non_edges(DG)

'''Compute HAA, HJC and HRA'''
HAA = []
HJC = []
HRA = []
SD = []
for e in test_pairs:
    if not DG.has_node(e[0]):
        DG.add_node(e[0])
        UDG.add_node(e[0])
    if not DG.has_node(e[1]):
        DG.add_node(e[1])
        UDG.add_node(e[1])
    AA = nx.adamic_adar_index(UDG, [e])
    JC = nx.jaccard_coefficient(UDG, [e])
    RA = nx.resource_allocation_index(UDG, [e])
    spec_diff = DG.in_degree(e[1]) - DG.in_degree(e[0])  # specificity_difference
    SD.append(spec_diff)
    try:
        for u, v, p in AA:
            HAA.append(p)
    except ZeroDivisionError:
        HAA.append(0)
        pass
    try:
        for u, v, p in JC:
            HJC.append(p)
positive_predictions_proba_slpc_DegCent = []
positive_predictions_proba_slpc_EigenCent = []
positive_predictions_proba_slpc_ClosenessCent = []
positive_predictions_proba_slpc_BetweenCent = []
positive_predictions_proba_slpc_PageRank = []

lenedg = len(pedges)
cntr = 0
for edge in pedges:
    cntr += 1
    print("\r {}/{}".format(cntr, lenedg), end="")
    positive_predictions_proba_jcc.append(
        list(nx.jaccard_coefficient(G, [edge]))[0][2])
    positive_predictions_proba_ra.append(
        list(nx.resource_allocation_index(G, [edge]))[0][2])
    positive_predictions_proba_aa.append(
        list(nx.adamic_adar_index(G, [edge]))[0][2])
    positive_predictions_proba_pa.append(
        list(nx.preferential_attachment(G, [edge]))[0][2])
    positive_predictions_proba_cnsh.append(
        list(nx.cn_soundarajan_hopcroft(
            G, [edge]))[0][2])  # needs community information
    positive_predictions_proba_rash.append(
        list(nx.ra_index_soundarajan_hopcroft(
            G, [edge]))[0][2])  # needs community information
    positive_predictions_proba_wic.append(
        list(nx.within_inter_cluster(
            G, [edge]))[0][2])  # needs community information
    positive_predictions_proba_slp_DegCent.append(
        list(SLP_prediction(G, [edge], centrality="DegCent"))[0][2])
    positive_predictions_proba_slp_EigenCent.append(
        list(SLP_prediction(G, [edge], centrality="EigenCent"))[0][2])
    return itertools.combinations(iterable, 2)


G = nx.read_edgelist("./data/drugbank_interactions.tsv", delimiter="\t", nodetype=str)

partition = community.best_partition(G)
nx.set_node_attributes(G, name='community', values=partition)

ap = list(all_pairs(G.nodes()))
cn = cn.cnbors(G, ap)
rai = nx.resource_allocation_index(G, ap)
jc = nx.jaccard_coefficient(G, ap)
aai = nx.adamic_adar_index(G, ap)
pa = nx.preferential_attachment(G, ap)
ccn = nx.cn_soundarajan_hopcroft(G, ap)
cra = nx.ra_index_soundarajan_hopcroft(G, ap)
wic = nx.within_inter_cluster(G, ap, community='community')

u, v, s1, s2, s3, s4, s5, s6, s7, s8, has_edge = ([] for i in range(11))
for m1, m2, m3, m4, m5, m6, m7, m8 in zip(cn, rai, jc, aai, pa, ccn, cra, wic):
    u.append(m1[0])
    v.append(m1[1])
    s1.append(m1[2])
    s2.append(m2[2])
    s3.append(m3[2])
    s4.append(m4[2])
    s5.append(m5[2])
    s6.append(m6[2])
def save_to_file_similarities(E1, j): node_list = list(E1.nodes) temp_short_path, temp_cn, temp_jc, temp_a, temp_pa = {}, {}, {}, {}, {} #calculation start for node1 in node_list: for node2 in node_list: temp = nx.jaccard_coefficient(E1, [(node1, node2)]) for u, v, p in temp: temp_jc[u, v] = p temp = nx.adamic_adar_index(E1, [(node1, node2)]) try: for u, v, p in temp: temp_a[u, v] = p except: temp_a[u, v] = 0.0 temp = nx.preferential_attachment(E1, [(node1, node2)]) for u, v, p in temp: temp_pa[u, v] = p temp = sorted(nx.common_neighbors(E1, node1, node2)) temp_common_neighbors = [] temp = sorted(nx.common_neighbors(E1, node1, node2)) for common_neighbor in temp: temp_common_neighbors.append(common_neighbor) temp_cn[node1, node2] = (len(temp_common_neighbors)) #Ill use Exception Checking cause we have DiGraph and maybe there is no path between to nodes #so an error would compile try: length = nx.shortest_path_length(E1, node1, node2) temp_short_path[node1, node2] = length except: temp_short_path[node1, node2] = 0.0 #calculation ends temp_cn = OrderedDict(sorted(temp_cn.items(), key=lambda kx: kx[1])) temp_short_path = OrderedDict( sorted(temp_short_path.items(), key=lambda kx: kx[1])) temp_jc = OrderedDict(sorted(temp_jc.items(), key=lambda kx: kx[1])) temp_a = OrderedDict(sorted(temp_a.items(), key=lambda kx: kx[1])) temp_pa = OrderedDict(sorted(temp_pa.items(), key=lambda kx: kx[1])) directory = 'Subgraphs/Similarities/' + 'T' + str(j - 1) + '-' + 'T' + str( j + 1) + '_E_' + 'T' + str(j - 1) + '-' + 'T' + str(j) if not os.path.exists(directory): os.makedirs(directory) temp_path = temp_path = directory + '/' + str(E1.graph['name']) save_as_csv(temp_path + '_shortest_path', temp_short_path, ['Node', 'Length of Shortest Path']) save_as_csv(temp_path + '_jaccard_coef', temp_jc, ['(Node1, Node2)', 'Jaccard Coefficient']) save_as_csv(temp_path + '_pref_attach.txt', temp_pa, ['(Node1, Node2)', 'Preferential Attachment']) save_as_csv(temp_path + '_adamic_index.txt', temp_a, ['(Node1, Node2)', 'Adamic Index']) save_as_csv(temp_path + '_common_neigh.txt', temp_cn, ['(Node1, Node2)', '# of Common Neighbors'])
def aa(G, i, j):
    return sorted(nx.adamic_adar_index(G, [(i, j)]))[0][2]
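# Minimal hedged usage of the aa helper above on a toy graph (illustrative names only).
# The expected value is 1/ln(3): node 2, with degree 3, is the sole common neighbor of 0 and 3.
import math
import networkx as nx

_toy = nx.Graph([(0, 1), (0, 2), (1, 2), (2, 3)])
assert math.isclose(aa(_toy, 0, 3), 1 / math.log(3))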
def get_adamic(filepath):
    D, pr, pr_df = get_pagerank(filepath, save_file=False)
    H = D.to_undirected()
    adm = nx.adamic_adar_index(H, ebunch=pr_df["relation"])
    # adm_output = [(item[0], item[1], item[2]) for item in adm]
    pass
    parts_idx = np.where(parts == i)[0]
    parts_graph = train_G.subgraph(parts_idx)

    # find top-1% nodes
    landmarks = []
    for node, degree in sorted(parts_graph.degree().items(),
                               key=lambda item: item[1], reverse=True):
        landmarks.append(node)
        if len(landmarks) > parts_graph.number_of_nodes() * 0.01:
            landmarks_in_train_G.extend(landmarks)
            break

    # save partial graph with original label
    mapping = {n: parts_graph.node[n]['ori_label'] for n in parts_graph.nodes()}
    parts_graph = nx.relabel_nodes(parts_graph, mapping, copy=True)
    for (n, data) in parts_graph.nodes(data=True):
        data.pop('ori_label', None)
    with open(Path('clustering', dataset, '{}.gpickle'.format(i)), 'wb') as f:
        nx.write_gpickle(parts_graph, f)

combinations = list(itertools.combinations(landmarks_in_train_G, 2))
combinations = set(tuple(sorted(item)) for item in combinations)

landmark_graph = nx.Graph()
for u, v, p in nx.adamic_adar_index(train_G, combinations):
    ori_u = train_G.node[u]['ori_label']
    ori_v = train_G.node[v]['ori_label']
    edge_w = 2. / (1 + np.exp(-p / 10)) - 1.
    landmark_graph.add_edge(ori_u, ori_v, attr_dict={'weight': edge_w})
    landmark_graph.add_edge(ori_v, ori_u, attr_dict={'weight': edge_w})

with open(Path('clustering', dataset, 'landmark.gpickle'), 'wb') as f:
    nx.write_gpickle(landmark_graph, f)
def adamicAdar(edges: np.array, output_file: str):
    # Initialize graph.
    graph = nx.read_edgelist("out_graph.txt", nodetype=int, create_using=nx.Graph())
    preds = nx.adamic_adar_index(graph, edges)
    RecommendationPolicies.writeNpToFile(output_file, preds)
def get_features(L, flag): X = [[] for i in range(len(L))] #=====================Social features(user-to-user graph)====================== #g0.adamic adar score if flag['g0'] is True: print("get feature g0") preds = nx.adamic_adar_index(G, L) cnt = 0 for (u, v, p) in preds: X[cnt].append(p) cnt += 1 #g1.jaccard coefficient if flag['g1'] is True: print("get feature g1") preds = nx.jaccard_coefficient(G, L) cnt = 0 for (u, v, p) in preds: X[cnt].append(p) cnt += 1 #g2.resource_allocation if flag['g2'] is True: print("get feature g2") preds = nx.resource_allocation_index(G, L) cnt = 0 for (u, v, p) in preds: X[cnt].append(p) cnt += 1 #g3.preferentail_attachment if flag['g3'] is True: print("get feature g3") preds = nx.preferential_attachment(G, L) cnt = 0 for (u, v, p) in preds: X[cnt].append(p) cnt += 1 #g4.shortest path length if flag['g4'] is True: print("get feature g4") cnt = 0 for (u, v) in L: if G.has_edge(u, v): G.remove_edge(u, v) if nx.has_path(G, u, v): X[cnt].append( nx.shortest_path_length(G, source=u, target=v) / 50000) else: X[cnt].append(1) G.add_edge(u, v) else: if nx.has_path(G, u, v): X[cnt].append( nx.shortest_path_length(G, source=u, target=v) / 50000) else: X[cnt].append(1) cnt += 1 #g5.common neighbors if flag['g5'] is True: print("get feature g5") cnt = 0 for (u, v) in L: if G.has_edge(u, v): G.remove_edge(u, v) T = [w for w in nx.common_neighbors(G, u, v)] G.add_edge(u, v) else: T = [w for w in nx.common_neighbors(G, u, v)] X[cnt].append(len(T)) cnt += 1 #g6.Approximate katz for social graph if flag['g6'] is True: print("get feature g6") cnt = 0 for (u, v) in L: p = 0 if G.has_edge(u, v): G.remove_edge(u, v) for x in G.neighbors(u): for y in G.neighbors(v): if x == y or G.has_edge(x, y): p += 1 G.add_edge(u, v) else: for x in G.neighbors(u): for y in G.neighbors(v): if x == y or G.has_edge(x, y): p += 1 X[cnt].append(p) cnt += 1 #=========================checkin features========================================= #c0.follower number if flag['c0'] is True: print("get feature c0") cnt = 0 for (u, v) in L: X[cnt].append(U[u]['follow_cnt'] * U[v]['follow_cnt']) # fu*fv cnt += 1 #c1.same time same location if flag['c1'] is True: print("get feature c1") cnt = 0 for (u, v) in L: p = calculate_CCC(G, u, v) X[cnt].append(p) cnt += 1 #c2.same time same distinct spot if flag['c2'] is True: print("get deature c2") cnt = 0 for (u, v) in L: p = 0 dis_same_spot = [] for k in C[u]: if k[1] not in dis_same_spot and k in C[v]: dis_same_spot.append(k[1]) p += 1 X[cnt].append(p) cnt += 1 #c3.same distinct spot (not necessarily same time) if flag['c3'] is True: cnt = 0 print("get feature c3") for (u, v) in L: p = 0 dis_same_spot = [] for k in C[u]: if k[1] not in dis_same_spot: for m in C[v]: if k[1] == m[1]: dis_same_spot.append(k[1]) p += 1 break X[cnt].append(p) cnt += 1 #c4.min Entropy if flag['c4'] is True: print("get feature c4") cnt = 0 for (u, v) in L: p = 0 E_list = [] for k in C[u]: if k in C[v]: spot = k[1] if spot in S and S[spot]['entropy'] > 0: E_list.append(S[spot]['entropy']) if len(E_list) > 0: p = min(E_list) X[cnt].append(p) cnt += 1 #c5. 
distance of mean_LL if flag['c5'] is True: cnt = 0 print("get feature c5") for (u, v) in L: dist = np.sqrt((U[u]['mean_LL'][0] - U[v]['mean_LL'][0])**2 + (U[u]['mean_LL'][1] - U[v]['mean_LL'][1])**2) X[cnt].append(dist) cnt += 1 #c6.weighted same location if flag['c6'] is True: print("get feature c6") cnt = 0 for (u, v) in L: p = 0 for k in C[u]: if k in C[v]: spot = k[1] #if spot in S and S[spot]['entropy'] > 0: #p += 1/S[spot]['entropy'] if spot in S: dist = np.sqrt( (S[spot]['LL'][0] - U[u]['mean_LL'][0])**2 + (S[spot]['LL'][1] - U[u]['mean_LL'][1])**2) p += dist dist = np.sqrt( (S[spot]['LL'][0] - U[v]['mean_LL'][0])**2 + (S[spot]['LL'][1] - U[v]['mean_LL'][1])**2) p += dist X[cnt].append(p) cnt += 1 #c7.PP if flag['c7'] is True: print("get feature c7") cnt = 0 for (u, v) in L: p = len(C[u]) * len(C[v]) X[cnt].append(p) cnt += 1 #c8.Total Common Friend Closeness (TCFC) if flag['c8'] is True: print("get feature c8") cnt = 0 for (u, v) in L: p = 0 if G.has_edge(u, v): G.remove_edge(u, v) for w in nx.common_neighbors(G, u, v): T1 = [x for x in nx.common_neighbors(G, u, w)] T2 = [x for x in nx.common_neighbors(G, v, w)] p += len(T1) * len(T2) G.add_edge(u, v) else: for w in nx.common_neighbors(G, u, v): T1 = [x for x in nx.common_neighbors(G, u, w)] T2 = [x for x in nx.common_neighbors(G, v, w)] p += len(T1) * len(T2) X[cnt].append(p) cnt += 1 #c9.Total Common friend Checkin Count (TCFCC) if flag['c9'] is True: print("get feature c9") cnt = 0 for (u, v) in L: p = 0 if G.has_edge(u, v): G.remove_edge(u, v) for w in nx.common_neighbors(G, u, v): p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w) G.add_edge(u, v) else: for w in nx.common_neighbors(G, u, v): p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w) X[cnt].append(p) cnt += 1 #c10. Common Category Checkin Counts Product (CCCP) if flag['c10'] is True: print("get feature c10") cnt = 0 for (u, v) in L: p = 0 for cat in U[u]['cate']: if cat in U[v]['cate']: p += U[u]['cate'][cat] * U[v]['cate'][cat] X[cnt].append(p) cnt += 1 #c11. 
Common Category Checkin Counts Product Ratio(CCCPR) if flag['c11'] is True: print("get feature c11") cnt = 0 for (u, v) in L: p = 0 u_cate_total = sum(U[u]['cate'][cat]**2 for cat in U[u]['cate']) v_cate_total = sum(U[v]['cate'][cat]**2 for cat in U[v]['cate']) for cat in U[u]['cate']: if cat in U[v]['cate']: p += (U[u]['cate'][cat] * U[v]['cate'][cat] / np.sqrt(u_cate_total * v_cate_total)) X[cnt].append(p) cnt += 1 #c12.trip route length all if flag['c12'] is True: print("get feature c12") cnt = 0 for (u, v) in L: tripDayLen1 = list() tripDayLen2 = list() tripDay = "starting" tripLen = 0.0 lastSpot = [0.0, 0.0] for k in C[u]: if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0): if k[1] in S: tripLen += np.sqrt((lastSpot[0] - S[k[1]]['LL'][0])**2 + (lastSpot[1] - S[k[1]]['LL'][1])**2) lastSpot[0] = S[k[1]]['LL'][0] lastSpot[1] = S[k[1]]['LL'][1] else: if k[1] in S: lastSpot[0] = S[k[1]]['LL'][0] lastSpot[1] = S[k[1]]['LL'][1] tripDay = "starting" tripLen2 = 0.0 lastSpot = [0.0, 0.0] for k in C[v]: if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0): if k[1] in S: tripLen2 += np.sqrt( (lastSpot[0] - S[k[1]]['LL'][0])**2 + (lastSpot[1] - S[k[1]]['LL'][1])**2) lastSpot[0] = S[k[1]]['LL'][0] lastSpot[1] = S[k[1]]['LL'][1] else: if k[1] in S: lastSpot[0] = S[k[1]]['LL'][0] lastSpot[1] = S[k[1]]['LL'][1] X[cnt].append(tripLen + tripLen2) cnt += 1 #=========================Heter Graph features===================================== #h0.Approximate katz for bipartite graph if flag['h0'] is True: print("get feature h0") cnt = 0 for (u, v) in L: p = 0 for x in B.neighbors(u): for y in B.neighbors(v): if x == y or B.has_edge(x, y): p += 1 X[cnt].append(p) cnt += 1 #h1.Approximate katz on HB if flag['h1'] is True: print("get feature h1") cnt = 0 for (u, v) in L: p = 0 if HB.has_edge(u, v): HB.remove_edge(u, v) for x in HB.neighbors(u): for y in HB.neighbors(v): if x == y or HB.has_edge(x, y): p += 1 HB.add_edge(u, v) else: for x in HB.neighbors(u): for y in HB.neighbors(v): if x == y or HB.has_edge(x, y): p += 1 X[cnt].append(p) cnt += 1 #h2.Approximate katz on H if flag['h2'] is True: print("get feature h2") cnt = 0 for (u, v) in L: p = 0 if H.has_edge(u, v): H.remove_edge(u, v) for x in H.neighbors(u): for y in H.neighbors(v): if x == y or H.has_edge(x, y): p += 1 H.add_edge(u, v) else: for x in H.neighbors(u): for y in H.neighbors(v): if x == y or H.has_edge(x, y): p += 1 X[cnt].append(p) cnt += 1 #h3.shortest path length on B if flag['h3'] is True: print("get feature h3") cnt = 0 for (u, v) in L: if nx.has_path(B, u, v): X[cnt].append( nx.shortest_path_length(B, source=u, target=v) / 50000) else: X[cnt].append(1) cnt += 1 #h4.clustering coefiicient on H if flag['h4'] is True: print("get feature h4") cnt = 0 for (u, v) in L: if H.has_edge(u, v): H.remove_edge(u, v) p = nx.clustering(H, u) * nx.clustering(H, v) H.add_edge(u, v) else: p = nx.clustering(H, u) * nx.clustering(H, v) X[cnt].append(p) cnt += 1 #h5. number of (user's loc friends)'s loc friends if flag['h5'] is True: print("get feature h5") cnt = 0 for (u, v) in L: counter1 = 0 for neighbor in H.neighbors(u): if not neighbor.isnumeric(): for neighbor2 in H.neighbors(neighbor): if not neighbor.isnumeric(): counter1 += 1 counter2 = 0 for neighbor in H.neighbors(v): if not neighbor.isnumeric(): for neighbor2 in H.neighbors(neighbor): if not neighbor.isnumeric(): counter2 += 1 #print(str(counter1)+" "+str(counter2)+"\n") X[cnt].append(counter1 * counter2) cnt += 1 return X
jaccard = np.zeros(n)
adar = np.zeros(n)
preferential_attachment = np.zeros(n)
resource_allocation_index = np.zeros(n)
common_neighbors = np.zeros(n)

# computing features for training set
for i in tqdm(range(len(id1))):
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.remove_edge(id1[i], id2[i])
    pred = nx.jaccard_coefficient(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    jaccard[i] = pred[0][2]
    pred = nx.adamic_adar_index(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    adar[i] = pred[0][2]
    pred = nx.preferential_attachment(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    preferential_attachment[i] = pred[0][2]
    pred = nx.resource_allocation_index(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    resource_allocation_index[i] = pred[0][2]
    pred = nx.common_neighbors(G, id1[i], id2[i])
    pred = len([u for u in pred])
    common_neighbors[i] = pred
def adamic_adar(graph, output_file_name):
    outputFile = open(output_file_name + "_adamic_adar", 'w')
    for (u, v, score) in nx.adamic_adar_index(graph, graph.edges()):
        line = outputFormat(u, v, score)
        outputFile.write(line)
    outputFile.close()
# Common Neighbors
CN = [(e[0], e[1], len(list(nx.common_neighbors(M, e[0], e[1]))))
      for e in nx.non_edges(M)]
CN.sort(key=operator.itemgetter(2), reverse=True)

# Jaccard coef
jaccard = list(nx.jaccard_coefficient(M))
jaccard.sort(key=operator.itemgetter(2), reverse=True)

# Resource Allocation index
RA = list(nx.resource_allocation_index(M))
RA.sort(key=operator.itemgetter(2), reverse=True)

# Adamic-Adar index
AA = list(nx.adamic_adar_index(M))
AA.sort(key=operator.itemgetter(2), reverse=True)

# Preferential Attachment
PA = list(nx.preferential_attachment(M))
PA.sort(key=operator.itemgetter(2), reverse=True)

# Community Common Neighbors !!! requires graph to have node attribute: 'community' !!!
# CCN = list(nx.cn_soundarajan_hopcroft(M))
# CCN.sort(key=operator.itemgetter(2), reverse=True)

# Community Resource Allocation !!! requires graph to have node attribute: 'community' !!!
# CRA = list(nx.ra_index_soundarajan_hopcroft(M))
# CRA.sort(key=operator.itemgetter(2), reverse=True)

# ###################### Prediction on Future Edge Linkage ####################
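# Hedged helper (a sketch, not part of the original): every ranked list above shares the
# (u, v, score) layout, so the top-k candidate edges for any of them can be read off the
# same way, e.g. top_k(AA) or top_k(jaccard).
def top_k(ranked, k=10):
    return [(u, v) for u, v, score in ranked[:k]]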
nx.draw_networkx(graph_1)


# In[18]:

plt.rcParams["figure.figsize"] = (20, 15)
nx.draw_networkx(graph_2,
                 node_size=np.array(list(nx.pagerank(graph_2, .5).values())) * 10**5)


# ## Link prediction

# Evaluate the similarity, in the sense of the Adamic/Adar index, between all pairs of
# non-adjacent nodes of the undirected graph `graph_1`

# In[19]:

print('\n[+] Exercice 8')
print(top_k_triplets(nx.adamic_adar_index(graph_1), 5))


# Implement the `generic_common_neighbors` function used by `generic_adamic_adar`.

# In[20]:


def generic_common_neighbors(g, u, v):
    """
    Intersection of u's neighbors and v's neighbors
    :g: networkx graph
    :u, v: str, nodes
    :return: list, of common neighbors
    """
    common = set(g.neighbors(u)).intersection(g.neighbors(v))
    return list(common)
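# Hedged sketch (not the notebook's own solution) of a generic_adamic_adar built on the
# generic_common_neighbors helper above, following the standard definition:
# sum over common neighbors w of 1 / log(degree(w)).
import math

def generic_adamic_adar(g, u, v):
    return sum(1 / math.log(g.degree(w)) for w in generic_common_neighbors(g, u, v))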
def gen_topol_feats(A_orig, A, edge_s): """ This function generates the topological features for matrix A (A_tr or A_ho) over edge samples edge_s (edge_tr or edge_ho). Input and Parameters: ------- A: the training or holdout adjacency matrix that the topological features are going to be computed over A_orig: the original adjacency matrix edge_s: the sample set of training or holdout edges that the topological features are going to be computed over Returns: ------- df_feat: data frame of features Examples: ------- >>> gen_topol_feats(A_orig, A_tr, edge_tr) >>> gen_topol_feats(A_orig, A_ho, edge_ho) """ _, edges = adj_to_nodes_edges(A) nodes = [int(iii) for iii in range(A.shape[0])] N = len(nodes) if len(edges.shape) == 1: edges = [(int(iii), int(jjj)) for iii, jjj in [edges]] else: edges = [(int(iii), int(jjj)) for iii, jjj in edges] # define graph G = nx.Graph() G.add_nodes_from(nodes) G.add_edges_from(edges) # average degree (AD) ave_deg_net = np.sum(A) / A.shape[0] # variance of degree distribution (VD) var_deg_net = np.sqrt( np.sum(np.square(np.sum(A, axis=0) - ave_deg_net)) / (A.shape[0] - 1)) # average (local) clustering coefficient (ACC) ave_clust_net = nx.average_clustering(G) # samples chosen - features edge_pairs_f_i = edge_s[:, 0] edge_pairs_f_j = edge_s[:, 1] # local number of triangles for i and j (LNT_i, LNT_j) numtriang_nodes_obj = nx.triangles(G) numtriang_nodes = [] for nn in range(len(nodes)): numtriang_nodes.append(numtriang_nodes_obj[nn]) numtriang1_edges = [] numtriang2_edges = [] for ee in range(len(edge_s)): numtriang1_edges.append(numtriang_nodes[edge_s[ee][0]]) numtriang2_edges.append(numtriang_nodes[edge_s[ee][1]]) # Page rank values for i and j (PR_i, PR_j) page_rank_nodes_obj = nx.pagerank(G) page_rank_nodes = [] for nn in range(len(nodes)): page_rank_nodes.append(page_rank_nodes_obj[nn]) page_rank1_edges = [] page_rank2_edges = [] for ee in range(len(edge_s)): page_rank1_edges.append(page_rank_nodes[edge_s[ee][0]]) page_rank2_edges.append(page_rank_nodes[edge_s[ee][1]]) # j-th entry of the personalized page rank of node i (PPR) page_rank_pers_nodes = [] hot_vec = {} for nn in range(len(nodes)): hot_vec[nn] = 0 for nn in range(len(nodes)): hot_vec_copy = hot_vec.copy() hot_vec_copy[nn] = 1 page_rank_pers_nodes.append( nx.pagerank(G, personalization=hot_vec_copy)) page_rank_pers_edges = [] for ee in range(len(edge_s)): page_rank_pers_edges.append( page_rank_pers_nodes[edge_s[ee][0]][edge_s[ee][1]]) # local clustering coefficients for i and j (LCC_i, LCC_j) clust_nodes_obj = nx.clustering(G) clust_nodes = [] for nn in range(len(nodes)): clust_nodes.append(clust_nodes_obj[nn]) clust1_edges = [] clust2_edges = [] for ee in range(len(edge_s)): clust1_edges.append(clust_nodes[edge_s[ee][0]]) clust2_edges.append(clust_nodes[edge_s[ee][1]]) # average neighbor degrees for i and j (AND_i, AND_j) ave_neigh_deg_nodes_obj = nx.average_neighbor_degree(G) ave_neigh_deg_nodes = [] for nn in range(len(nodes)): ave_neigh_deg_nodes.append(ave_neigh_deg_nodes_obj[nn]) ave_neigh_deg1_edges = [] ave_neigh_deg2_edges = [] for ee in range(len(edge_s)): ave_neigh_deg1_edges.append(ave_neigh_deg_nodes[edge_s[ee][0]]) ave_neigh_deg2_edges.append(ave_neigh_deg_nodes[edge_s[ee][1]]) # degree centralities for i and j (DC_i, DC_j) deg_cent_nodes_obj = nx.degree_centrality(G) deg_cent_nodes = [] for nn in range(len(nodes)): deg_cent_nodes.append(deg_cent_nodes_obj[nn]) deg_cent1_edges = [] deg_cent2_edges = [] for ee in range(len(edge_s)): deg_cent1_edges.append(deg_cent_nodes[edge_s[ee][0]]) 
deg_cent2_edges.append(deg_cent_nodes[edge_s[ee][1]]) # eigenvector centralities for i and j (EC_i, EC_j) tr = 1 toler = 1e-6 while tr == 1: try: eig_cent_nodes_obj = nx.eigenvector_centrality(G, tol=toler) tr = 0 except: toler = toler * 1e1 eig_cent_nodes = [] for nn in range(len(nodes)): eig_cent_nodes.append(eig_cent_nodes_obj[nn]) eig_cent1_edges = [] eig_cent2_edges = [] for ee in range(len(edge_s)): eig_cent1_edges.append(eig_cent_nodes[edge_s[ee][0]]) eig_cent2_edges.append(eig_cent_nodes[edge_s[ee][1]]) # Katz centralities for i and j (KC_i, KC_j) ktz_cent_nodes_obj = nx.katz_centrality_numpy(G) ktz_cent_nodes = [] for nn in range(len(nodes)): ktz_cent_nodes.append(ktz_cent_nodes_obj[nn]) ktz_cent1_edges = [] ktz_cent2_edges = [] for ee in range(len(edge_s)): ktz_cent1_edges.append(ktz_cent_nodes[edge_s[ee][0]]) ktz_cent2_edges.append(ktz_cent_nodes[edge_s[ee][1]]) # Jaccard’s coefficient of neighbor sets of i, j (JC) jacc_coeff_obj = nx.jaccard_coefficient(G, edge_s) jacc_coeff_edges = [] for uu, vv, jj in jacc_coeff_obj: jacc_coeff_edges.append([uu, vv, jj]) df_jacc_coeff = pd.DataFrame(jacc_coeff_edges, columns=['i', 'j', 'jacc_coeff']) df_jacc_coeff['ind'] = df_jacc_coeff.index # resource allocation index of i, j (RA) res_alloc_ind_obj = nx.resource_allocation_index(G, edge_s) res_alloc_ind_edges = [] for uu, vv, jj in res_alloc_ind_obj: res_alloc_ind_edges.append([uu, vv, jj]) df_res_alloc_ind = pd.DataFrame(res_alloc_ind_edges, columns=['i', 'j', 'res_alloc_ind']) df_res_alloc_ind['ind'] = df_res_alloc_ind.index # Adamic/Adar index of i, j (AA) adam_adar_obj = nx.adamic_adar_index(G, edge_s) adam_adar_edges = [] for uu, vv, jj in adam_adar_obj: adam_adar_edges.append([uu, vv, jj]) df_adam_adar = pd.DataFrame(adam_adar_edges, columns=['i', 'j', 'adam_adar']) df_adam_adar['ind'] = df_adam_adar.index df_merge = pd.merge(df_jacc_coeff, df_res_alloc_ind, on=['ind', 'i', 'j'], sort=False) df_merge = pd.merge(df_merge, df_adam_adar, on=['ind', 'i', 'j'], sort=False) # preferential attachment (degree product) of i, j (PA) pref_attach_obj = nx.preferential_attachment(G, edge_s) pref_attach_edges = [] for uu, vv, jj in pref_attach_obj: pref_attach_edges.append([uu, vv, jj]) df_pref_attach = pd.DataFrame(pref_attach_edges, columns=['i', 'j', 'pref_attach']) df_pref_attach['ind'] = df_pref_attach.index # global features: # similarity of connections in the graph with respect to the node degree # degree assortativity (DA) deg_ass_net = nx.degree_assortativity_coefficient(G) # transitivity: fraction of all possible triangles present in G # network transitivity (clustering coefficient) (NT) transit_net = nx.transitivity(G) # network diameter (ND) try: diam_net = nx.diameter(G) except: diam_net = np.inf ave_deg_net = [ave_deg_net for ii in range(10000)] var_deg_net = [var_deg_net for ii in range(10000)] ave_clust_net = [ave_clust_net for ii in range(10000)] deg_ass_net = [deg_ass_net for ii in range(10000)] transit_net = [transit_net for ii in range(10000)] diam_net = [diam_net for ii in range(10000)] com_ne = [] for ee in range(len(edge_s)): com_ne.append( len(sorted(nx.common_neighbors(G, edge_s[ee][0], edge_s[ee][1])))) # closeness centralities for i and j (CC_i, CC_j) closn_cent_nodes_obj = nx.closeness_centrality(G) closn_cent_nodes = [] for nn in range(len(nodes)): closn_cent_nodes.append(closn_cent_nodes_obj[nn]) closn_cent1_edges = [] closn_cent2_edges = [] for ee in range(len(edge_s)): closn_cent1_edges.append(closn_cent_nodes[edge_s[ee][0]]) 
closn_cent2_edges.append(closn_cent_nodes[edge_s[ee][1]]) # shortest path between i, j (SP) short_Mat_aux = nx.shortest_path_length(G) short_Mat = {} for ss in range(N): value = next(short_Mat_aux) short_Mat[value[0]] = value[1] short_path_edges = [] for ee in range(len(edge_s)): if edge_s[ee][1] in short_Mat[edge_s[ee][0]].keys(): short_path_edges.append(short_Mat[edge_s[ee][0]][edge_s[ee][1]]) else: short_path_edges.append(np.inf) # load centralities for i and j (LC_i, LC_j) load_cent_nodes_obj = nx.load_centrality(G, normalized=True) load_cent_nodes = [] for nn in range(len(nodes)): load_cent_nodes.append(load_cent_nodes_obj[nn]) load_cent1_edges = [] load_cent2_edges = [] for ee in range(len(edge_s)): load_cent1_edges.append(load_cent_nodes[edge_s[ee][0]]) load_cent2_edges.append(load_cent_nodes[edge_s[ee][1]]) # shortest-path betweenness centralities for i and j (SPBC_i, SPBC_j) betw_cent_nodes_obj = nx.betweenness_centrality(G, normalized=True) betw_cent_nodes = [] for nn in range(len(nodes)): betw_cent_nodes.append(betw_cent_nodes_obj[nn]) betw_cent1_edges = [] betw_cent2_edges = [] for ee in range(len(edge_s)): betw_cent1_edges.append(betw_cent_nodes[edge_s[ee][0]]) betw_cent2_edges.append(betw_cent_nodes[edge_s[ee][1]]) neigh_ = {} for nn in range(len(nodes)): neigh_[nn] = np.where(A[nn, :])[0] df_pref_attach = [] for ee in range(len(edge_s)): df_pref_attach.append( len(neigh_[edge_s[ee][0]]) * len(neigh_[edge_s[ee][1]])) U, sig, V = np.linalg.svd(A, full_matrices=False) S = np.diag(sig) Atilda = np.dot(U, np.dot(S, V)) Atilda = np.array(Atilda) f_mean = lambda x: np.mean(x) if len(x) > 0 else 0 # entry i, j in low rank approximation (LRA) via singular value decomposition (SVD) svd_edges = [] # dot product of columns i and j in LRA via SVD for each pair of nodes i, j svd_edges_dot = [] # average of entries i and j’s neighbors in low rank approximation svd_edges_mean = [] for ee in range(len(edge_s)): svd_edges.append(Atilda[edge_s[ee][0], edge_s[ee][1]]) svd_edges_dot.append( np.inner(Atilda[edge_s[ee][0], :], Atilda[:, edge_s[ee][1]])) svd_edges_mean.append( f_mean(Atilda[edge_s[ee][0], neigh_[edge_s[ee][1]]])) # Leicht-Holme-Newman index of neighbor sets of i, j (LHN) f_LHN = lambda num, den: 0 if (num == 0 and den == 0) else float(num) / den LHN_edges = [ f_LHN(num, den) for num, den in zip(np.array(com_ne), np.array(df_pref_attach)) ] U, sig, V = np.linalg.svd(A) S = linalg.diagsvd(sig, A.shape[0], A.shape[1]) S_trunc = S.copy() S_trunc[S_trunc < sig[int(np.ceil(np.sqrt(A.shape[0])))]] = 0 Atilda = np.dot(np.dot(U, S_trunc), V) Atilda = np.array(Atilda) f_mean = lambda x: np.mean(x) if len(x) > 0 else 0 # an approximation of LRA (LRA-approx) svd_edges_approx = [] # an approximation of dLRA (dLRA-approx) svd_edges_dot_approx = [] # an approximation of mLRA (mLRA-approx) svd_edges_mean_approx = [] for ee in range(len(edge_s)): svd_edges_approx.append(Atilda[edge_s[ee][0], edge_s[ee][1]]) svd_edges_dot_approx.append( np.inner(Atilda[edge_s[ee][0], :], Atilda[:, edge_s[ee][1]])) svd_edges_mean_approx.append( f_mean(Atilda[edge_s[ee][0], neigh_[edge_s[ee][1]]])) # number of nodes (N) num_nodes = A_orig.shape[0] # number of observed edges (OE) num_edges = int(np.sum(A) / 2) # construct a dictionary of the features d = {'i':edge_pairs_f_i, 'j':edge_pairs_f_j, 'com_ne':com_ne, 'ave_deg_net':ave_deg_net, \ 'var_deg_net':var_deg_net, 'ave_clust_net':ave_clust_net, 'num_triangles_1':numtriang1_edges, 'num_triangles_2':numtriang2_edges, \ 'page_rank_pers_edges':page_rank_pers_edges, 
'pag_rank1':page_rank1_edges, 'pag_rank2':page_rank2_edges, 'clust_coeff1':clust1_edges, 'clust_coeff2':clust2_edges, 'ave_neigh_deg1':ave_neigh_deg1_edges, 'ave_neigh_deg2':ave_neigh_deg2_edges,\ 'eig_cent1':eig_cent1_edges, 'eig_cent2':eig_cent2_edges, 'deg_cent1':deg_cent1_edges, 'deg_cent2':deg_cent2_edges, 'clos_cent1':closn_cent1_edges, 'clos_cent2':closn_cent2_edges, 'betw_cent1':betw_cent1_edges, 'betw_cent2':betw_cent2_edges, \ 'load_cent1':load_cent1_edges, 'load_cent2':load_cent2_edges, 'ktz_cent1':ktz_cent1_edges, 'ktz_cent2':ktz_cent2_edges, 'pref_attach':df_pref_attach, 'LHN':LHN_edges, 'svd_edges':svd_edges,'svd_edges_dot':svd_edges_dot,'svd_edges_mean':svd_edges_mean,\ 'svd_edges_approx':svd_edges_approx,'svd_edges_dot_approx':svd_edges_dot_approx,'svd_edges_mean_approx':svd_edges_mean_approx, 'short_path':short_path_edges, 'deg_assort':deg_ass_net, 'transit_net':transit_net, 'diam_net':diam_net, \ 'num_nodes':num_nodes, 'num_edges':num_edges} # construct a dataframe of the features df_feat = pd.DataFrame(data=d) df_feat['ind'] = df_feat.index df_feat = pd.merge(df_feat, df_merge, on=['ind', 'i', 'j'], sort=False) return df_feat
def sort_edges_by_adamic_adar_index(graph, edges):
    edges_sorted = sorted(list(nx.adamic_adar_index(graph, edges)),
                          key=lambda l: l[2],
                          reverse=True, cmp=compare_with_ties)
    return [(row[0], row[1]) for row in edges_sorted], [row[2] for row in edges_sorted]
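# Hedged Python 3 variant of the sort above: the cmp= keyword only exists in Python 2, so
# a comparator like compare_with_ties (assumed to exist, comparing two scores) has to go
# through functools.cmp_to_key. The _py3 name is illustrative, not from the original.
from functools import cmp_to_key

def sort_edges_by_adamic_adar_index_py3(graph, edges):
    score_key = cmp_to_key(compare_with_ties)
    edges_sorted = sorted(nx.adamic_adar_index(graph, edges),
                          key=lambda row: score_key(row[2]),
                          reverse=True)
    return ([(row[0], row[1]) for row in edges_sorted],
            [row[2] for row in edges_sorted])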