def new_connections_predictions():
    pref_attach = list(nx.preferential_attachment(G))
    df = pd.DataFrame(index=[(x[0], x[1]) for x in pref_attach])
    df['pref_attch'] = [x[2] for x in pref_attach]
    common_neigh = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1]))))
                    for e in nx.non_edges(G)]
    df1 = pd.DataFrame(index=[(x[0], x[1]) for x in common_neigh])
    df1['common_neigh'] = [x[2] for x in common_neigh]
    df = df.join(df1, how='outer')
    df['common_neigh'] = df['common_neigh'].fillna(value=0)
    del df1
    community_common_neigh = list(
        nx.cn_soundarajan_hopcroft(G, community='Department'))
    df1 = pd.DataFrame(index=[(x[0], x[1]) for x in community_common_neigh])
    df1['community_common_neigh'] = [x[2] for x in community_common_neigh]
    df = df.join(df1, how='outer')
    df['community_common_neigh'] = df['community_common_neigh'].fillna(value=0)
    del df1
    community_res_alloc = list(
        nx.ra_index_soundarajan_hopcroft(G, community='Department'))
    df1 = pd.DataFrame(index=[(x[0], x[1]) for x in community_res_alloc])
    df1['community_res_alloc'] = [x[2] for x in community_res_alloc]
    df = df.join(df1, how='outer')
    df['community_res_alloc'] = df['community_res_alloc'].fillna(value=0)
    del df1
    df['res_alloc'] = [x[2] for x in list(nx.resource_allocation_index(G))]
    df['jaccard_coeff'] = [x[2] for x in list(nx.jaccard_coefficient(G))]
    features = [
        'jaccard_coeff', 'res_alloc', 'pref_attch', 'common_neigh',
        'community_common_neigh', 'community_res_alloc'
    ]
    df = future_connections.join(df, how='outer')
    df_train = df[~pd.isnull(df['Future Connection'])]
    df_test = df[pd.isnull(df['Future Connection'])]
    X_train = df_train[features]
    X_test = df_test[features]
    y_train = df_train['Future Connection']
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)  # reuse the scaling fitted on the training set
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10,
                                 random_state=0).fit(X_train_scaled, y_train)
    test_proba = clf.predict_proba(X_test_scaled)[:, 1]
    predictions = pd.Series(test_proba, X_test.index)
    # target = future_connections[pd.isnull(future_connections['Future Connection'])]
    # target['proba'] = [predictions[x] for x in target.index]
    return predictions
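# Hedged evaluation sketch (not part of the snippet above): before scoring the
# unlabeled pairs, the same feature table could be checked on a held-out split.
# `df_train` and `features` are assumed to be the intermediate objects built
# inside new_connections_predictions(); the names below are illustrative only.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    df_train[features], df_train['Future Connection'], random_state=0)
clf_check = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1,
                                   random_state=0).fit(X_tr, y_tr)
print(roc_auc_score(y_val, clf_check.predict_proba(X_val)[:, 1]))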
def generate_positive_features():
    features = []
    count = 0
    print("Generating positive features......")
    for sample in positive_samples:
        if count % 100 == 0:
            print(count)
        count += 1
        feature = []
        try:
            preds = nx.resource_allocation_index(UG, [sample])
            for u, v, p in preds:
                feature.append(p)
            preds = nx.jaccard_coefficient(UG, [sample])
            for u, v, p in preds:
                feature.append(p)
            preds = nx.adamic_adar_index(UG, [sample])
            for u, v, p in preds:
                feature.append(p)
            preds = nx.preferential_attachment(UG, [sample])
            for u, v, p in preds:
                feature.append(p)
            preds = nx.cn_soundarajan_hopcroft(UG, [sample])
            for u, v, p in preds:
                feature.append(p)
            preds = nx.ra_index_soundarajan_hopcroft(UG, [sample])
            for u, v, p in preds:
                feature.append(p)
            preds = nx.within_inter_cluster(UG, [sample])
            for u, v, p in preds:
                feature.append(p)
            feature.append(1)  # label = 1
        except Exception:
            print("one error at: " + str(count))
        features.append(feature)
    print("positive features: " + str(len(features)))
    return features
def getNodeSim(node, g, metric):
    allDatasets = [i for i in g.nodes if str(i).startswith('data_')]
    pairs = generatePairs(node, [i for i in allDatasets if str(i) != node])
    if metric == 'Jaccard':
        preds = nx.jaccard_coefficient(g, pairs)
    elif metric == 'Adamic-Adar':
        preds = nx.adamic_adar_index(g, pairs)
    elif metric == 'Hopcroft':
        preds = nx.ra_index_soundarajan_hopcroft(g, pairs)
    elif metric == 'Cosine':
        return cosineSimilarity(g, pairs)
    else:
        return []
    res = []
    for u, v, p in preds:
        if p > 0.0:
            res.append((u, int(v.replace('data_', '')), p))
    df = pd.DataFrame(res, columns=['x', 'data_set_id', 'score'])
    return df[['data_set_id', 'score']].iloc[:10]
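# generatePairs() and cosineSimilarity() are helpers that are not shown above.
# A minimal sketch of generatePairs, under the assumption that it simply pairs
# the query node with every candidate dataset node (illustrative only):
def generatePairs(node, candidates):
    # Build (node, candidate) tuples suitable as the ebunch argument of the
    # NetworkX link-prediction functions used in getNodeSim().
    return [(node, candidate) for candidate in candidates]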
G.nodes[1]['community'] = 0
G.nodes[2]['community'] = 0
preds = nx.cn_soundarajan_hopcroft(G, [(0, 2)])
for u, v, p in preds:
    print('(%d, %d) -> %d' % (u, v, p))

#%%
import networkx as nx

G = nx.Graph()
G.add_edges_from([(0, 1), (0, 2), (1, 3), (2, 3)])
G.nodes[0]['community'] = 0
G.nodes[1]['community'] = 0
G.nodes[2]['community'] = 1
G.nodes[3]['community'] = 0
nx.draw_networkx(G)
preds = nx.ra_index_soundarajan_hopcroft(G, [(0, 3)])
for u, v, p in preds:
    print('(%d, %d) -> %.8f' % (u, v, p))

#%%
# ER network
import networkx as nx
nx.draw(nx.fast_gnp_random_graph(100, 0.05))

#%%
# WS small-world network
import networkx as nx
nx.draw_spring(nx.watts_strogatz_graph(10, 4, 0))

#%%
# Newman-WS small-world network
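# A plausible continuation of the cell above (assumption: the intended call is
# NetworkX's newman_watts_strogatz_graph, which adds shortcut edges instead of
# rewiring existing ones):
import networkx as nx
nx.draw_spring(nx.newman_watts_strogatz_graph(10, 4, 0.5))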
def ra_index_soundarajan_hopcroft(self):
    return list(
        nx.ra_index_soundarajan_hopcroft(
            self.graph, [(self.node_1, self.node_2)]))[0][2]
G = nx.read_edgelist("./data/drugbank_interactions.tsv", delimiter="\t",
                     nodetype=str)
partition = community.best_partition(G)
nx.set_node_attributes(G, name='community', values=partition)

# all_pairs() and cn.cnbors() are project-specific helpers defined elsewhere.
ap = list(all_pairs(G.nodes()))
cn = cn.cnbors(G, ap)
rai = nx.resource_allocation_index(G, ap)
jc = nx.jaccard_coefficient(G, ap)
aai = nx.adamic_adar_index(G, ap)
pa = nx.preferential_attachment(G, ap)
ccn = nx.cn_soundarajan_hopcroft(G, ap)
cra = nx.ra_index_soundarajan_hopcroft(G, ap)
wic = nx.within_inter_cluster(G, ap, community='community')

u, v, s1, s2, s3, s4, s5, s6, s7, s8, has_edge = ([] for i in range(11))
for m1, m2, m3, m4, m5, m6, m7, m8 in zip(cn, rai, jc, aai, pa, ccn, cra, wic):
    u.append(m1[0])
    v.append(m1[1])
    s1.append(m1[2])
    s2.append(m2[2])
    s3.append(m3[2])
    s4.append(m4[2])
    s5.append(m5[2])
    s6.append(m6[2])
    s7.append(m7[2])
    s8.append(m8[2])
    has_edge.append(int(G.has_edge(m1[0], m1[1])))
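# Hedged follow-up sketch: the parallel lists built above can be assembled into
# a single feature table; the column names here are illustrative, not from the
# original source.
import pandas as pd

feature_df = pd.DataFrame({
    'u': u, 'v': v,
    'common_neighbors': s1,
    'resource_allocation': s2,
    'jaccard': s3,
    'adamic_adar': s4,
    'preferential_attachment': s5,
    'cn_soundarajan_hopcroft': s6,
    'ra_soundarajan_hopcroft': s7,
    'within_inter_cluster': s8,
    'has_edge': has_edge,
})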
def new_connections_predictions():
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score

    future_connections = pd.read_csv(path + 'Future_Connections.csv',
                                     index_col=0, converters={0: eval})

    def communities(row):
        """
        Check whether the two nodes are in the same department.
        Vectorized over rows; use with pd.DataFrame.apply(func, axis=1).
        """
        nodes = row.name
        a = nodes[0]
        b = nodes[1]
        comm_a = G.nodes[a]['Department']
        comm_b = G.nodes[b]['Department']
        if comm_a == comm_b:
            return 1
        else:
            return 0

    future_connections['same_comm'] = future_connections.apply(communities, axis=1)

    # For Soundarajan-Hopcroft algorithms.
    for node in G.nodes():
        G.nodes[node]['community'] = G.nodes[node]['Department']

    pa = list(nx.preferential_attachment(G))
    pa_df = pd.DataFrame(index=[(i[0], i[1]) for i in pa],
                         data={'pref_att': [i[2] for i in pa]})
    cn = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1]))))
          for e in nx.non_edges(G)]
    cn_df = pd.DataFrame(index=[(i[0], i[1]) for i in cn],
                         data={'comm_neigh': [i[2] for i in cn]})
    cnsh = list(nx.cn_soundarajan_hopcroft(G))
    cnsh_df = pd.DataFrame(index=[(i[0], i[1]) for i in cnsh],
                           data={'sh_comm_neigh': [i[2] for i in cnsh]})
    ra = list(nx.resource_allocation_index(G))
    ra_df = pd.DataFrame(index=[(i[0], i[1]) for i in ra],
                         data={'reso_alloc': [i[2] for i in ra]})
    rash = list(nx.ra_index_soundarajan_hopcroft(G))
    rash_df = pd.DataFrame(index=[(i[0], i[1]) for i in rash],
                           data={'sh_reso_alloc': [i[2] for i in rash]})
    jc = [i for i in nx.jaccard_coefficient(G)]
    jc_df = pd.DataFrame(index=[(i[0], i[1]) for i in jc],
                         data={'jacc_coeff': [i[2] for i in jc]})

    for df in [pa_df, cn_df, cnsh_df, ra_df, rash_df, jc_df]:
        future_connections = future_connections.merge(df, how='left',
                                                      left_index=True,
                                                      right_index=True)

    keep = future_connections[~future_connections['Future Connection'].isnull()]
    hold = future_connections[future_connections['Future Connection'].isnull()]
    X_keep = keep.drop('Future Connection', axis=1)
    y_keep = keep['Future Connection']
    X_hold = hold.drop('Future Connection', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X_keep, y_keep,
                                                        random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    # Check ROC AUC performance.
    roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

    probs = clf.predict_proba(X_hold)[:, 1]
    answer = pd.Series(index=X_hold.index, data=probs)
    return answer
for edge in pedges:
    cntr += 1
    print("\r {}/{}".format(cntr, lenedg), end="")
    positive_predictions_proba_jcc.append(
        list(nx.jaccard_coefficient(G, [edge]))[0][2])
    positive_predictions_proba_ra.append(
        list(nx.resource_allocation_index(G, [edge]))[0][2])
    positive_predictions_proba_aa.append(
        list(nx.adamic_adar_index(G, [edge]))[0][2])
    positive_predictions_proba_pa.append(
        list(nx.preferential_attachment(G, [edge]))[0][2])
    positive_predictions_proba_cnsh.append(
        list(nx.cn_soundarajan_hopcroft(G, [edge]))[0][2])  # needs community information
    positive_predictions_proba_rash.append(
        list(nx.ra_index_soundarajan_hopcroft(G, [edge]))[0][2])  # needs community information
    positive_predictions_proba_wic.append(
        list(nx.within_inter_cluster(G, [edge]))[0][2])  # needs community information
    positive_predictions_proba_slp_DegCent.append(
        list(SLP_prediction(G, [edge], centrality="DegCent"))[0][2])
    positive_predictions_proba_slp_EigenCent.append(
        list(SLP_prediction(G, [edge], centrality="EigenCent"))[0][2])
    positive_predictions_proba_slp_ClosenessCent.append(
        list(SLP_prediction(G, [edge], centrality="ClosenessCent"))[0][2])
    positive_predictions_proba_slp_BetweenCent.append(
        list(SLP_prediction(G, [edge], centrality="BetweenCent"))[0][2])
    positive_predictions_proba_slp_PageRank.append(
        list(SLP_prediction(G, [edge], centrality="PageRank"))[0][2])
    positive_predictions_proba_slpc_DegCent.append(
        list(SLPC_prediction(
def set_edge_weight(self, edge_weight_method='weight'):
    if edge_weight_method == 'weight':
        return
    # Centrality-based methods
    elif edge_weight_method == 'edge_betweenness_centrality':
        print("computing edge_betweenness_centrality..")
        C = nx.edge_betweenness_centrality(self.G, weight='weight')
        print("done!")
    elif edge_weight_method == 'edge_betweenness_centrality_subset':
        print("computing edge_betweenness_centrality_subset..")
        C = nx.edge_current_flow_betweenness_centrality(self.G, weight='weight')
        print('done')
    elif edge_weight_method == 'edge_current_flow_betweenness_centrality_subset':
        print("computing edge_current_flow_betweenness_centrality_subset..")
        C = nx.edge_current_flow_betweenness_centrality_subset(
            self.G, weight='weight')
        print('done')
    elif edge_weight_method == 'edge_load_centrality':
        print("computing edge_load_centrality..")
        C = nx.edge_load_centrality(self.G)
        print('done!')
    # Link-prediction-based methods
    elif edge_weight_method == 'adamic_adar_index':
        print("computing adamic_adar_index ..")
        preds = nx.adamic_adar_index(self.G, self.G.edges())
        C = {}
        for u, v, p in preds:
            C[(u, v)] = p
    elif edge_weight_method == 'ra_index_soundarajan_hopcroft':
        print("computing ra_index_soundarajan_hopcroft ..")
        preds = nx.ra_index_soundarajan_hopcroft(self.G, self.G.edges())
        C = {}
        for u, v, p in preds:
            C[(u, v)] = p
    elif edge_weight_method == 'preferential_attachment':
        print("computing preferential_attachment ..")
        preds = nx.preferential_attachment(self.G, self.G.edges())
        C = {}
        for u, v, p in preds:
            C[(u, v)] = p
    # elif edge_weight_method == 'cn_soundarajan_hopcroft':
    #     print("computing cn_soundarajan_hopcroft ..")
    #     preds = nx.cn_soundarajan_hopcroft(self.G, self.G.edges())
    #     C = {}
    #     for u, v, p in preds:
    #         C[(u, v)] = p
    elif edge_weight_method == 'within_inter_cluster':
        print("computing within_inter_cluster ..")
        preds = nx.within_inter_cluster(self.G, self.G.edges())
        C = {}
        for u, v, p in preds:
            C[(u, v)] = p
    elif edge_weight_method == 'resource_allocation_index':
        print("computing resource allocation index ..")
        preds = nx.resource_allocation_index(self.G, self.G.edges())
        C = {}
        for u, v, p in preds:
            C[(u, v)] = p
    elif edge_weight_method == 'jaccard_coefficient':
        print("computing jaccard_coefficient..")
        preds = nx.jaccard_coefficient(self.G, self.G.edges())
        C = {}
        for u, v, p in preds:
            C[(u, v)] = p
    print('done!')
    for u, v, d in self.G.edges(data=True):
        if edge_weight_method is None:
            d['weight'] = 1
        else:
            d['weight'] = C[(u, v)]
    return 1
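# Usage note (assumption): the ra_index_soundarajan_hopcroft and
# within_inter_cluster branches above only work when every node of the graph
# already carries a 'community' attribute. A minimal preparation sketch,
# mirroring the python-louvain call used in the drugbank example earlier in
# this collection (prepare_communities is an illustrative helper name):
import community  # python-louvain
import networkx as nx

def prepare_communities(G):
    # Assign each node its Louvain community so the Soundarajan-Hopcroft
    # weighting methods can be selected safely.
    partition = community.best_partition(G)
    nx.set_node_attributes(G, values=partition, name='community')
    return G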
def compute_variable(self, variable_name, train: bool, load=True, path_to_file=None, save=True): assert variable_name in self.handled_variables, "Variable %s is not handled. Handled variables are : %s" % ( variable_name, str(self.handled_variables)) if load and train: if path_to_file is None and os.path.isfile( "variables/%s.npy" % variable_name): print("Loading STANDARD %s file!" % variable_name) result = np.load("variables/%s.npy" % variable_name) return result[:self.nb_training_samples] elif path_to_file is not None and os.path.isfile(path_to_file): print("Loading CUSTOM %s file!" % variable_name) result = np.load(path_to_file) return result[:self.nb_training_samples] print("Did not find saved %s in `variables` folder." % variable_name) if load and not train: if path_to_file is None and os.path.isfile( "variables/TEST_%s.npy" % variable_name): print("Loading STANDARD TEST_%s file!" % variable_name) result = np.load("variables/TEST_%s.npy" % variable_name) return result[:self.nb_training_samples] elif path_to_file is not None and os.path.isfile(path_to_file): print("Loading CUSTOM %s file!" % variable_name) result = np.load(path_to_file) return result[:self.nb_training_samples] print("Did not find saved TEST_%s in `variables` folder." % variable_name) print("Starting computation of %s..." % variable_name) t1 = time() gd = self.graph_structure.graph_dicts # "graph_dictionaries if train: nb_of_samples = self.nb_training_samples else: nb_of_samples = self.nb_testing_samples result = np.zeros(shape=nb_of_samples) for i in range(nb_of_samples): if train: t = self.train_array[i] else: t = self.test_array[i] if variable_name == "publication_2": result[i] = np.log( len( set(self.node_information.loc[t[0], "publication_2"]) & set(self.node_information.loc[t[1], "publication_2"])) + 1) elif variable_name == "adam_coeff": if train: if t[2] == 1: self.graph_structure.g.remove_edge(t[0], t[1]) result[i] = \ next(nx.algorithms.link_prediction.adamic_adar_index(self.graph_structure.g, [(t[0], t[1])]))[2] self.graph_structure.g.add_edge(t[0], t[1]) else: result[i] = \ next(nx.algorithms.link_prediction.adamic_adar_index(self.graph_structure.g, [(t[0], t[1])]))[2] else: result[i] = \ next(nx.algorithms.link_prediction.adamic_adar_index(self.graph_structure.g, [(t[0], t[1])]))[2] elif variable_name == "overlapping_words_in_title": result[i] = compute_intersection( self.node_information.loc[t[0], "title"], self.node_information.loc[t[1], "title"], self.stemmer, self.stpwds) elif variable_name == "number_of_common_authors": result[i] = nbr_common_authors( self.node_information.loc[t[0], "author"], self.node_information.loc[t[1], "author"]) elif variable_name == "difference_of_years": result[i] = abs(self.node_information.loc[t[0], 'year'] - self.node_information.loc[t[1], 'year']) elif variable_name == "affinity_between_authors": result[i] = compute_affinity_between_authors( self.node_information.loc[t[0], 'author'], self.node_information.loc[t[1], 'author'], self.authors_dict) elif variable_name == "identical_journal": result[i] = np.int(self.node_information.loc[t[0], 'journal'] == self.node_information.loc[t[1], 'journal']) elif variable_name == "l2_distance": result[i] = np.linalg.norm( self.node_information.loc[t[0], 'wv'] - self.node_information.loc[t[1], 'wv']) elif variable_name == "cosine_distance_tfid": v1 = self.node_information.loc[t[0], "wv_tfid"] v2 = self.node_information.loc[t[1], "wv_tfid"] try: b1 = np.isnan(v1) except TypeError: b1 = False try: b2 = np.isnan(v2) except TypeError: b2 = 
False if not b1 and not b2: result[i] = cosine_similarity(v1, v2) else: result[i] = 0 elif variable_name == "l2_distance_between_titles": dst = np.linalg.norm( self.node_information.loc[t[0], 'title_wv'] - self.node_information.loc[t[1], 'title_wv']) if np.isnan(dst): result[i] = 0 else: result[i] = dst # elif variable_name == "cosine_distance_between_titles": # result[i] = cosine_distances( # np.nan_to_num(self.node_information.loc[t[0], 'title_wv']).reshape(-1, 1) - (self.node_information.loc[t[1], 'title_wv']).reshape(-1, 1) # )[0][0] elif variable_name == "common_neighbors": result[i] = len( sorted( nx.common_neighbors(self.graph_structure.g, t[0], t[1]))) elif variable_name == "clustering_coeff": result[i] = gd["clustering_coeff"][ t[0]] * gd["clustering_coeff"][t[1]] elif variable_name == "betweenness": result[i] = gd["betweenness"][t[0]] * gd["betweenness"][t[1]] elif variable_name == "closeness": result[i] = gd["closeness"][t[0]] * gd["closeness"][t[1]] elif variable_name == "degree": result[i] = gd["degree"][t[0]] * gd["degree"][t[1]] elif variable_name == "eigenvector": result[i] = gd["eigenvector"][t[0]] * gd["eigenvector"][t[1]] elif variable_name == "jaccard_coeff": if train: if t[2] == 1: self.graph_structure.g.remove_edge(t[0], t[1]) result[i] = next( nx.jaccard_coefficient(self.graph_structure.g, [(t[0], t[1])]))[2] self.graph_structure.g.add_edge(t[0], t[1]) else: result[i] = next( nx.jaccard_coefficient(self.graph_structure.g, [(t[0], t[1])]))[2] else: result[i] = next( nx.jaccard_coefficient(self.graph_structure.g, [(t[0], t[1])]))[2] elif variable_name == "shortest_path": if train: if t[2] == 1: assert self.graph_structure.g.has_edge( t[0], t[1] ), "There's a problem with the structure of the graph for id %i and %i" % ( t[0], t[1]) self.graph_structure.g.remove_edge(t[0], t[1]) try: result[ i] = 1 / nx.algorithms.shortest_paths.generic.shortest_path_length( self.graph_structure.g, t[0], t[1]) except nx.NetworkXNoPath: result[i] = 0 self.graph_structure.g.add_edge(t[0], t[1]) else: try: result[ i] = 1 / nx.algorithms.shortest_paths.generic.shortest_path_length( self.graph_structure.g, t[0], t[1]) except nx.NetworkXNoPath: result[i] = 0 else: try: result[ i] = 1 / nx.algorithms.shortest_paths.generic.shortest_path_length( self.graph_structure.g, t[0], t[1]) except nx.NetworkXNoPath: result[i] = 0 elif variable_name == "pagerank": result[i] = gd["pagerank"][t[0]] * gd["pagerank"][t[1]] elif variable_name == "community": if self.graph_structure.partition[ t[0]] == self.graph_structure.partition[t[1]]: result[i] = 1 else: result[i] = 0 elif variable_name == "lp_resource_allocation_index": if train: if t[2] == 1: self.graph_structure.g.remove_edge(t[0], t[1]) result[i] = sorted( nx.resource_allocation_index( self.graph_structure.g, [(t[0], t[1])]))[0][2] self.graph_structure.g.add_edge(t[0], t[1]) else: result[i] = sorted( nx.resource_allocation_index( self.graph_structure.g, [(t[0], t[1])]))[0][2] else: result[i] = sorted( nx.resource_allocation_index(self.graph_structure.g, [(t[0], t[1])]))[0][2] elif variable_name == "lp_preferential_attachment": if train: if t[2] == 1: self.graph_structure.g.remove_edge(t[0], t[1]) result[i] = sorted( nx.preferential_attachment(self.graph_structure.g, [(t[0], t[1])]))[0][2] self.graph_structure.g.add_edge(t[0], t[1]) else: result[i] = sorted( nx.preferential_attachment(self.graph_structure.g, [(t[0], t[1])]))[0][2] else: result[i] = sorted( nx.preferential_attachment(self.graph_structure.g, [(t[0], t[1])]))[0][2] elif variable_name == 
"lp_cn_soundarajan": if train: if t[2] == 1: self.graph_structure.g.remove_edge(t[0], t[1]) result[i] = sorted( nx.cn_soundarajan_hopcroft(self.graph_structure.g, [(t[0], t[1])]))[0][2] self.graph_structure.g.add_edge(t[0], t[1]) else: result[i] = sorted( nx.cn_soundarajan_hopcroft(self.graph_structure.g, [(t[0], t[1])]))[0][2] else: result[i] = sorted( nx.cn_soundarajan_hopcroft(self.graph_structure.g, [(t[0], t[1])]))[0][2] elif variable_name == "lp_ra_index_soundarajan": if train: if t[2] == 1: self.graph_structure.g.remove_edge(t[0], t[1]) result[i] = sorted( nx.ra_index_soundarajan_hopcroft( self.graph_structure.g, [(t[0], t[1])]))[0][2] self.graph_structure.g.add_edge(t[0], t[1]) else: result[i] = sorted( nx.ra_index_soundarajan_hopcroft( self.graph_structure.g, [(t[0], t[1])]))[0][2] else: result[i] = sorted( nx.ra_index_soundarajan_hopcroft( self.graph_structure.g, [(t[0], t[1])]))[0][2] elif variable_name == "lp_within_inter_cluster": if train: if t[2] == 1: self.graph_structure.g.remove_edge(t[0], t[1]) result[i] = sorted( nx.within_inter_cluster(self.graph_structure.g, [(t[0], t[1])]))[0][2] self.graph_structure.g.add_edge(t[0], t[1]) else: result[i] = sorted( nx.within_inter_cluster(self.graph_structure.g, [(t[0], t[1])]))[0][2] else: result[i] = sorted( nx.within_inter_cluster(self.graph_structure.g, [(t[0], t[1])]))[0][2] print("Did %s column in %5.1fs" % (variable_name, time() - t1)) if save and train: print("Saved variable %s in `variables` directory." % variable_name) np.save("variables/" + variable_name, result) if save and not train: np.save("variables/TEST_" + variable_name, result) print("Saved variable TEST_%s in `variables` directory." % variable_name) if np.isnan(result).shape[0] >= 1: print("Careful, you have nan values !") result[np.isnan(result)] = 0 return result
def get_community_resource_allocation(G):
    cra = list(nx.ra_index_soundarajan_hopcroft(G))
    cra.sort(key=operator.itemgetter(2), reverse=True)
    return cra
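# Hedged usage sketch for get_community_resource_allocation(): the function
# assumes every node already has a 'community' attribute (the default attribute
# name read by nx.ra_index_soundarajan_hopcroft). Example on the karate club
# graph, mapping its 'club' labels onto that attribute:
import networkx as nx

K = nx.karate_club_graph()
labels = {n: 0 if d['club'] == 'Mr. Hi' else 1 for n, d in K.nodes(data=True)}
nx.set_node_attributes(K, labels, 'community')
print(get_community_resource_allocation(K)[:5])  # five highest-scoring non-edges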
import networkx as nx
import operator

G = nx.from_edgelist([('A', 'C'), ('A', 'E'), ('A', 'D'), ('B', 'D'),
                      ('C', 'G'), ('D', 'G'), ('D', 'H'), ('D', 'E'),
                      ('E', 'H'), ('H', 'F')])

res_alloc = list(nx.resource_allocation_index(G))
print(sorted(res_alloc, key=operator.itemgetter(2), reverse=True))

pref_attach = list(nx.preferential_attachment(G))
print(sorted(pref_attach, key=operator.itemgetter(2), reverse=True))

G.nodes['A']['community'] = 0
G.nodes['B']['community'] = 0
G.nodes['C']['community'] = 0
G.nodes['D']['community'] = 0
G.nodes['G']['community'] = 0
G.nodes['F']['community'] = 1
G.nodes['E']['community'] = 1
G.nodes['H']['community'] = 1

community_common_neigh = list(nx.cn_soundarajan_hopcroft(G))
print(sorted(community_common_neigh, key=operator.itemgetter(2), reverse=True))

community_res_alloc = list(nx.ra_index_soundarajan_hopcroft(G))
print(sorted(community_res_alloc, key=operator.itemgetter(2), reverse=True))
def new_connections_predictions():
    import operator
    # Import preprocessing, selection and metrics
    from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV
    from sklearn.metrics import roc_auc_score
    from sklearn.dummy import DummyClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC, LinearSVC

    # Your Code Here
    df_fc_test_mask = pd.isnull(future_connections.loc[:, 'Future Connection'])
    df = pd.DataFrame()

    # Measure 1: Common Neighbors (intersection)
    # The number of common neighbors of nodes X and Y
    # future_connections['common_neigh']
    L = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1]))))
         for e in nx.non_edges(G)]
    df['pair'] = [(x, y) for x, y, z in L]
    df['common_nb'] = [z for x, y, z in L]
    # L.sort(key=operator.itemgetter(2), reverse=True)
    # print(L)

    # Measure 2: Jaccard Coefficient (intersection over union)
    # Number of common neighbors normalized by the total number of neighbors
    # common_neighbors / total_neighbors
    # future_connections['jaccard']
    df['jaccard'] = pd.Series([z for x, y, z in nx.jaccard_coefficient(G)])
    # L.sort(key=operator.itemgetter(2), reverse=True)
    # print(L)

    # Measure 3: Resource Allocation Index
    # Fraction of a "resource" that a node can send to another through their common neighbors
    # sum(1 / degree_common_neighbor)
    df['resource'] = pd.Series([z for x, y, z in nx.resource_allocation_index(G)])
    # L.sort(key=operator.itemgetter(2), reverse=True)
    # print(L)

    # Measure 4: Adamic-Adar Index
    # Similar to the resource allocation index, but with a log in the denominator
    # sum(1 / log(degree_common_neighbor))
    future_connections['adamic_adar'] = pd.Series([z for x, y, z in nx.adamic_adar_index(G)])
    # L.sort(key=operator.itemgetter(2), reverse=True)
    # print(L)

    # Measure 5: Preferential Attachment
    # In the preferential attachment model, nodes with high degree get more neighbors
    # degree_source * degree_target
    future_connections['pref_att'] = pd.Series([z for x, y, z in nx.preferential_attachment(G)])
    # print(L)

    # Measure 6: Community Common Neighbors
    # Number of common neighbors with a bonus of 1 for each neighbor in the same community
    # f(u) = 1 if same community else 0
    # sum(f(u) * degree)
    for node, dept in nx.get_node_attributes(G, 'Department').items():
        G.nodes[node]['community'] = dept
    future_connections['com_common_nb'] = pd.Series([z for x, y, z in nx.cn_soundarajan_hopcroft(G)])
    # L.sort(key=operator.itemgetter(2), reverse=True)
    # print(L)

    # Measure 7: Community Resource Allocation
    # Similar to the resource allocation index, but only considering nodes in the same community
    # f(u) = 1 if same community else 0
    # sum(f(u) / degree)
    future_connections['com_resource'] = pd.Series([z for x, y, z in nx.ra_index_soundarajan_hopcroft(G)])
    # L.sort(key=operator.itemgetter(2), reverse=True)
    # print(L)

    print(df.head())

    # df_fc_train = future_connections.loc[~df_fc_test_mask, :]
    # df_fc_test = future_connections.loc[df_fc_test_mask, :]
    # y_train = df_fc_train.loc[:, 'Future Connection']
    # y_test = df_fc_test.loc[:, 'Future Connection']
    # X_train = df_fc_train.index
    # X_test = df_fc_test.index

    # def auc_scores(model, *args, k=5, threshold=0.50):
    #     """CV scores"""
    #     X, y = args
    #     predictions = cross_val_predict(model, X, y, cv=k, n_jobs=-1)
    #     print('AUC - Test predict {:.2%}'.format(roc_auc_score(y, predictions)))

    # classifiers = [
    #     # GaussianNB(),
    #     # DecisionTreeClassifier(random_state=0),
    #     # DecisionTreeClassifier(max_depth=3, random_state=0),
    #     # DecisionTreeClassifier(max_depth=4, random_state=0),
    #     # DecisionTreeClassifier(max_depth=5, random_state=0),
    #     # DecisionTreeClassifier(max_depth=6, random_state=0),
    #     GradientBoostingClassifier(random_state=0),
    #     # GradientBoostingClassifier(learning_rate=0.08, random_state=0),
    #     # GradientBoostingClassifier(learning_rate=0.12, random_state=0),
    #     # GradientBoostingClassifier(learning_rate=0.1, max_depth=3, random_state=0),
    #     # GradientBoostingClassifier(learning_rate=0.1, max_depth=4, random_state=0),
    #     # RandomForestClassifier(n_estimators=100, random_state=0),
    #     # AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=0),
    #     # KNeighborsClassifier(),
    #     # KNeighborsClassifier(n_neighbors=4),
    #     # LinearSVC(random_state=0)
    # ]

    # for model in classifiers:
    #     # print('-' * 80)
    #     # print(model)
    #     # Training scores
    #     # clf_train = model.fit(X_train, y_train)
    #     # pred_train = clf_train.predict(X_train)
    #     # print('AUC - Train pred {:.2%}'.format(roc_auc_score(y_train, pred_train)))
    #     # CV scores
    #     clf = model.fit(X_train, y_train)
    #     # auc_scores(clf, X_train, y_train)
    #     # Predict
    #     predicted = clf.predict(X_test)
    #     pred_series = pd.Series(predicted)

    # assert type(pred_series) == pd.Series, 'wtf: ' + str(type(pred_series))
    return pred_series
G.nodes[2]['community'] = 0
G.nodes[3]['community'] = 0
G.nodes[4]['community'] = 1
G.nodes[5]['community'] = 1
G.nodes[6]['community'] = 1
G.nodes[7]['community'] = 1
G.nodes[8]['community'] = 1
L = list(nx.cn_soundarajan_hopcroft(G))
L.sort(key=operator.itemgetter(2), reverse=True); L

# Measure 7: Community Resource Allocation
# Similar to the resource allocation index, but only considering nodes in the same community
# f(u) = 1 if same community else 0
# sum(f(u) / degree)
L = list(nx.ra_index_soundarajan_hopcroft(G))
L.sort(key=operator.itemgetter(2), reverse=True); L

# Summary
# • Link prediction problem: given a network, predict which edges will be formed in the future.
# • 5 basic measures:
#   – Number of Common Neighbors
#   – Jaccard Coefficient
#   – Resource Allocation Index
#   – Adamic-Adar Index
#   – Preferential Attachment Score
# • 2 measures that require community information:
#   – Common Neighbor Soundarajan-Hopcroft Score
#   – Resource Allocation Soundarajan-Hopcroft Score

# ---------------------------------------------> Plot <--------------------------------------------
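# A compact sketch tying the summary together: all seven measures for a single
# candidate pair on the small example graph used earlier in this collection
# (the pair ('A', 'G') is a non-edge; 'community' labels are set as before).
import networkx as nx

G2 = nx.Graph([('A', 'C'), ('A', 'E'), ('A', 'D'), ('B', 'D'), ('C', 'G'),
               ('D', 'G'), ('D', 'H'), ('D', 'E'), ('E', 'H'), ('H', 'F')])
nx.set_node_attributes(G2, {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'G': 0,
                            'E': 1, 'F': 1, 'H': 1}, 'community')
pair = [('A', 'G')]
print(len(list(nx.common_neighbors(G2, 'A', 'G'))))         # common neighbors
print(next(nx.jaccard_coefficient(G2, pair))[2])            # Jaccard coefficient
print(next(nx.resource_allocation_index(G2, pair))[2])      # resource allocation
print(next(nx.adamic_adar_index(G2, pair))[2])              # Adamic-Adar
print(next(nx.preferential_attachment(G2, pair))[2])        # preferential attachment
print(next(nx.cn_soundarajan_hopcroft(G2, pair))[2])        # community common neighbors
print(next(nx.ra_index_soundarajan_hopcroft(G2, pair))[2])  # community resource allocation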
def ra_index_soundarajan_hopcroft(uG, ni, nj, rand_node):
    a, b = nx.ra_index_soundarajan_hopcroft(uG, [(ni, nj), (ni, rand_node)])
    return a[2], b[2]
nx.draw_networkx_labels(G, pos,
                        labels={u: u for t in candidate_edges for u in t},
                        font_size=13, font_weight='bold', font_color='yellow')
plt.axis('off')
plt.tight_layout()
plt.show()

# Create a data frame to store the various link-prediction scores.
df = pd.DataFrame(index=candidate_edges)

# Add generic and community-aware edge features for potential machine learning classification.
df['pref-att'] = list(
    map(operator.itemgetter(2), nx.preferential_attachment(G, candidate_edges)))
df['jaccard-c'] = list(
    map(operator.itemgetter(2), nx.jaccard_coefficient(G, candidate_edges)))
df['aa-idx'] = list(
    map(operator.itemgetter(2), nx.adamic_adar_index(G, candidate_edges)))
df['ccn'] = list(
    map(operator.itemgetter(2), nx.cn_soundarajan_hopcroft(G, candidate_edges, 'club')))
df['cra'] = list(
    map(operator.itemgetter(2), nx.ra_index_soundarajan_hopcroft(G, candidate_edges, 'club')))
print(df)
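# Small follow-up sketch: once the feature frame is filled, the candidate pairs
# can be ranked by any of the scores, e.g. the community-aware resource
# allocation column computed above.
print(df.sort_values('cra', ascending=False).head())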
def get_features(L, flag): X = [[] for i in range(len(L))] #=====================Social features(user-to-user graph)====================== #g0.adamic adar score if flag['g0'] is True: print("get feature g0") preds = nx.adamic_adar_index(G, L) cnt = 0 for (u, v, p) in preds: X[cnt].append(p) cnt += 1 #g1.jaccard coefficient if flag['g1'] is True: print("get feature g1") preds = nx.jaccard_coefficient(G, L) cnt = 0 for (u, v, p) in preds: X[cnt].append(p) cnt += 1 #g2.resource_allocation if flag['g2'] is True: print("get feature g2") preds = nx.resource_allocation_index(G, L) cnt = 0 for (u, v, p) in preds: X[cnt].append(p) cnt += 1 #g3.preferentail_attachment if flag['g3'] is True: print("get feature g3") preds = nx.preferential_attachment(G, L) cnt = 0 for (u, v, p) in preds: X[cnt].append(p) cnt += 1 #g4.shortest path length if flag['g4'] is True: print("get feature g4") cnt = 0 for (u, v) in L: if G.has_edge(u, v): G.remove_edge(u, v) if nx.has_path(G, u, v): X[cnt].append( nx.shortest_path_length(G, source=u, target=v) / 50000) else: X[cnt].append(1) G.add_edge(u, v) else: if nx.has_path(G, u, v): X[cnt].append( nx.shortest_path_length(G, source=u, target=v) / 50000) else: X[cnt].append(1) cnt += 1 #g5.common neighbors if flag['g5'] is True: print("get feature g5") cnt = 0 for (u, v) in L: if G.has_edge(u, v): G.remove_edge(u, v) T = [w for w in nx.common_neighbors(G, u, v)] G.add_edge(u, v) else: T = [w for w in nx.common_neighbors(G, u, v)] X[cnt].append(len(T)) cnt += 1 #g6.Approximate katz for social graph if flag['g6'] is True: print("get feature g6") cnt = 0 for (u, v) in L: p = 0 if G.has_edge(u, v): G.remove_edge(u, v) for x in G.neighbors(u): for y in G.neighbors(v): if x == y or G.has_edge(x, y): p += 1 G.add_edge(u, v) else: for x in G.neighbors(u): for y in G.neighbors(v): if x == y or G.has_edge(x, y): p += 1 X[cnt].append(p) cnt += 1 if flag['g7'] is True: print("get feature g7") cnt = 0 with open("best_part_G.txt", "r") as f: for line in f: v, c = line.split() c = int(c) G.node[v]['community'] = c iters = nx.cn_soundarajan_hopcroft(G, L) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(preds[(u, v)]) cnt += 1 if flag['g8'] is True: print("get feature g8") cnt = 0 with open("best_part_G.txt", "r") as f: for line in f: if line == "": continue v, c = line.split() c = int(c) G.node[v]['community'] = c iters = nx.ra_index_soundarajan_hopcroft(G, L) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(preds[(u, v)]) cnt += 1 if flag['g9'] is True: print("get feature g9") cnt = 0 with open("best_part_G.txt", "r") as f: for line in f: v, c = line.split() c = int(c) G.node[v]['community'] = c iters = nx.within_inter_cluster(G, L, delta=0.5) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(preds[(u, v)]) cnt += 1 if flag['g10'] is True: print("get feature g10") cnt = 0 with open("dendo_G.txt", "r") as f: line = f.readline() p_dict = {(u, v): 0.0 for (u, v) in L} for line in f: if 'level' in line: l = int(line.split()[1]) if l != 0: iters = nx.cn_soundarajan_hopcroft(G, L) for (u, v, p) in iters: p_dict[(u, v)] += p else: v, c = line.split() c = int(c) G.node[v]['community'] = c iters = nx.cn_soundarajan_hopcroft(G, L) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(p_dict[(u, v)] + preds[(u, v)]) cnt += 1 del p_dict del preds if flag['g11'] is True: print("get feature g11") cnt = 0 with open("dendo_G.txt", "r") as f: line = f.readline() p_dict = {(u, v): 0.0 for (u, v) in L} for line in 
f: if 'level' in line: l = int(line.split()[1]) if l != 0: iters = nx.ra_index_soundarajan_hopcroft(G, L) for (u, v, p) in iters: p_dict[(u, v)] += p else: v, c = line.split() c = int(c) G.node[v]['community'] = c iters = nx.ra_index_soundarajan_hopcroft(G, L) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(p_dict[(u, v)] + preds[(u, v)]) cnt += 1 del p_dict del preds if flag['g12'] is True: print("get feature g12") cnt = 0 with open("dendo_G.txt", "r") as f: line = f.readline() p_dict = {(u, v): 0.0 for (u, v) in L} for line in f: if 'level' in line: l = int(line.split()[1]) if l != 0: iters = nx.within_inter_cluster(G, L) for (u, v, p) in iters: p_dict[(u, v)] += p else: v, c = line.split() c = int(c) G.node[v]['community'] = c iters = nx.within_inter_cluster(G, L) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(p_dict[(u, v)] + preds[(u, v)]) cnt += 1 del p_dict del preds #=========================checkin features========================================= #c0.follower number if flag['c0'] is True: print("get feature c0") cnt = 0 for (u, v) in L: X[cnt].append(U[u]['follow_cnt'] * U[v]['follow_cnt']) # fu*fv cnt += 1 #c1.same time same location if flag['c1'] is True: print("get feature c1") cnt = 0 for (u, v) in L: p = calculate_CCC(G, u, v) X[cnt].append(p) cnt += 1 #c2.same time same distinct spot if flag['c2'] is True: print("get deature c2") cnt = 0 for (u, v) in L: p = 0 dis_same_spot = [] for k in C[u]: if k[1] not in dis_same_spot and k in C[v]: dis_same_spot.append(k[1]) p += 1 X[cnt].append(p) cnt += 1 #c3.same distinct spot (not necessarily same time) if flag['c3'] is True: cnt = 0 print("get feature c3") for (u, v) in L: p = 0 dis_same_spot = [] for k in C[u]: if k[1] not in dis_same_spot: for m in C[v]: if k[1] == m[1]: dis_same_spot.append(k[1]) p += 1 break X[cnt].append(p) cnt += 1 #c4.min Entropy if flag['c4'] is True: print("get feature c4") cnt = 0 for (u, v) in L: p = 0 E_list = [] for k in C[u]: if k in C[v]: spot = k[1] if spot in S and S[spot]['entropy'] > 0: E_list.append(S[spot]['entropy']) if len(E_list) > 0: p = min(E_list) X[cnt].append(p) cnt += 1 #c5. 
distance of mean_LL if flag['c5'] is True: cnt = 0 print("get feature c5") for (u, v) in L: dist = np.sqrt((U[u]['mean_LL'][0] - U[v]['mean_LL'][0])**2 + (U[u]['mean_LL'][1] - U[v]['mean_LL'][1])**2) X[cnt].append(dist) cnt += 1 #c6.weighted same location if flag['c6'] is True: print("get feature c6") cnt = 0 for (u, v) in L: p = 0 for k in C[u]: if k in C[v]: spot = k[1] #if spot in S and S[spot]['entropy'] > 0: #p += 1/S[spot]['entropy'] if spot in S: dist = np.sqrt( (S[spot]['LL'][0] - U[u]['mean_LL'][0])**2 + (S[spot]['LL'][1] - U[u]['mean_LL'][1])**2) p += dist dist = np.sqrt( (S[spot]['LL'][0] - U[v]['mean_LL'][0])**2 + (S[spot]['LL'][1] - U[v]['mean_LL'][1])**2) p += dist X[cnt].append(p) cnt += 1 #c7.PP if flag['c7'] is True: print("get feature c7") cnt = 0 for (u, v) in L: p = len(C[u]) * len(C[v]) X[cnt].append(p) cnt += 1 #c8.Total Common Friend Closeness (TCFC) if flag['c8'] is True: print("get feature c8") cnt = 0 for (u, v) in L: p = 0 if G.has_edge(u, v): G.remove_edge(u, v) for w in nx.common_neighbors(G, u, v): T1 = [x for x in nx.common_neighbors(G, u, w)] T2 = [x for x in nx.common_neighbors(G, v, w)] p += len(T1) * len(T2) G.add_edge(u, v) else: for w in nx.common_neighbors(G, u, v): T1 = [x for x in nx.common_neighbors(G, u, w)] T2 = [x for x in nx.common_neighbors(G, v, w)] p += len(T1) * len(T2) X[cnt].append(p) cnt += 1 #c9.Total Common friend Checkin Count (TCFCC) if flag['c9'] is True: print("get feature c9") cnt = 0 for (u, v) in L: p = 0 if G.has_edge(u, v): G.remove_edge(u, v) for w in nx.common_neighbors(G, u, v): p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w) G.add_edge(u, v) else: for w in nx.common_neighbors(G, u, v): p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w) X[cnt].append(p) cnt += 1 #c10. Common Category Checkin Counts Product (CCCP) if flag['c10'] is True: print("get feature c10") cnt = 0 for (u, v) in L: p = 0 for cat in U[u]['cate']: if cat in U[v]['cate']: p += U[u]['cate'][cat] * U[v]['cate'][cat] X[cnt].append(p) cnt += 1 #c11. 
Common Category Checkin Counts Product Ratio(CCCPR) if flag['c11'] is True: print("get feature c11") cnt = 0 for (u, v) in L: p = 0 u_cate_total = sum(U[u]['cate'][cat]**2 for cat in U[u]['cate']) v_cate_total = sum(U[v]['cate'][cat]**2 for cat in U[v]['cate']) for cat in U[u]['cate']: if cat in U[v]['cate']: p += (U[u]['cate'][cat] * U[v]['cate'][cat] / np.sqrt(u_cate_total * v_cate_total)) X[cnt].append(p) cnt += 1 #c12.trip route length all if flag['c12'] is True: print("get feature c12") cnt = 0 for (u, v) in L: tripDayLen1 = list() tripDayLen2 = list() tripDay = "starting" tripLen = 0.0 lastSpot = [0.0, 0.0] for k in C[u]: if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0): if k[1] in S: tripLen += np.sqrt((lastSpot[0] - S[k[1]]['LL'][0])**2 + (lastSpot[1] - S[k[1]]['LL'][1])**2) lastSpot[0] = S[k[1]]['LL'][0] lastSpot[1] = S[k[1]]['LL'][1] else: if k[1] in S: lastSpot[0] = S[k[1]]['LL'][0] lastSpot[1] = S[k[1]]['LL'][1] tripDay = "starting" tripLen2 = 0.0 lastSpot = [0.0, 0.0] for k in C[v]: if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0): if k[1] in S: tripLen2 += np.sqrt( (lastSpot[0] - S[k[1]]['LL'][0])**2 + (lastSpot[1] - S[k[1]]['LL'][1])**2) lastSpot[0] = S[k[1]]['LL'][0] lastSpot[1] = S[k[1]]['LL'][1] else: if k[1] in S: lastSpot[0] = S[k[1]]['LL'][0] lastSpot[1] = S[k[1]]['LL'][1] X[cnt].append(tripLen + tripLen2) cnt += 1 #=========================Heter Graph features===================================== #h0.Approximate katz for bipartite graph if flag['h0'] is True: print("get feature h0") cnt = 0 for (u, v) in L: p = 0 for x in B.neighbors(u): for y in B.neighbors(v): if x == y or B.has_edge(x, y): p += 1 X[cnt].append(p) cnt += 1 #h1.Approximate katz on HB if flag['h1'] is True: print("get feature h1") cnt = 0 for (u, v) in L: p = 0 if HB.has_edge(u, v): HB.remove_edge(u, v) for x in HB.neighbors(u): for y in HB.neighbors(v): if x == y or HB.has_edge(x, y): p += 1 HB.add_edge(u, v) else: for x in HB.neighbors(u): for y in HB.neighbors(v): if x == y or HB.has_edge(x, y): p += 1 X[cnt].append(p) cnt += 1 #h2.Approximate katz on H if flag['h2'] is True: print("get feature h2") cnt = 0 for (u, v) in L: p = 0 if H.has_edge(u, v): H.remove_edge(u, v) for x in H.neighbors(u): for y in H.neighbors(v): if x == y or H.has_edge(x, y): p += 1 H.add_edge(u, v) else: for x in H.neighbors(u): for y in H.neighbors(v): if x == y or H.has_edge(x, y): p += 1 X[cnt].append(p) cnt += 1 #h3.shortest path length on B if flag['h3'] is True: print("get feature h3") cnt = 0 for (u, v) in L: if nx.has_path(B, u, v): X[cnt].append( nx.shortest_path_length(B, source=u, target=v) / 50000) else: X[cnt].append(1) cnt += 1 #h4.clustering coefiicient on H if flag['h4'] is True: print("get feature h4") cnt = 0 for (u, v) in L: if H.has_edge(u, v): H.remove_edge(u, v) p = nx.clustering(H, u) * nx.clustering(H, v) H.add_edge(u, v) else: p = nx.clustering(H, u) * nx.clustering(H, v) X[cnt].append(p) cnt += 1 #h5. number of (user's loc friends)'s loc friends if flag['h5'] is True: print("get feature h5") cnt = 0 for (u, v) in L: counter1 = 0 for neighbor in H.neighbors(u): if not neighbor.isnumeric(): for neighbor2 in H.neighbors(neighbor): if not neighbor.isnumeric(): counter1 += 1 counter2 = 0 for neighbor in H.neighbors(v): if not neighbor.isnumeric(): for neighbor2 in H.neighbors(neighbor): if not neighbor.isnumeric(): counter2 += 1 #print(str(counter1)+" "+str(counter2)+"\n") X[cnt].append(counter1 * counter2) cnt += 1 #h6. 
location friends' degree sum if flag['h6'] is True: print("get feature h6") cnt = 0 for (u, v) in L: counter1 = 0 for locationNeighbor in H.neighbors(u): if not locationNeighbor.isnumeric(): #print(str(locationNeighbor)+"\n") if locationNeighbor in LG: counter1 += LG.degree(locationNeighbor) counter2 = 0 for locationNeighbor in H.neighbors(v): if not locationNeighbor.isnumeric(): if locationNeighbor in LG: counter2 += LG.degree(locationNeighbor) X[cnt].append(counter1 * counter2) cnt += 1 #h7. Approximate katz for social graph if flag['h7'] is True: print("get feature h7") cnt = 0 for (u, v) in L: counter = 0 for node in H.neighbors(u): if not node.isnumeric(): for node2 in H.neighbors(v): if not node2.isnumeric(): if node == node2 or H.has_edge(node, node2): counter += 1 X[cnt].append(counter) cnt += 1 #h8. adamic adar score on H if flag['h8'] is True: print("get feature h8") preds = nx.adamic_adar_index(H, L) cnt = 0 for (u, v, p) in preds: X[cnt].append(p) cnt += 1 #h9. resource_allocation on H if flag['h9'] is True: print("get feature h9") preds = nx.resource_allocation_index(H, L) cnt = 0 for (u, v, p) in preds: X[cnt].append(p) cnt += 1 #h10. shortest path length on H if flag['h10'] is True: print("get feature h10") cnt = 0 for (u, v) in L: if H.has_edge(u, v): H.remove_edge(u, v) if nx.has_path(H, u, v): X[cnt].append( nx.shortest_path_length(H, source=u, target=v) / 50000) else: X[cnt].append(1) H.add_edge(u, v) else: if nx.has_path(H, u, v): X[cnt].append( nx.shortest_path_length(H, source=u, target=v) / 50000) else: X[cnt].append(1) cnt += 1 #h11. common neighbors on H if flag['h11'] is True: print("get feature h11") cnt = 0 for (u, v) in L: if H.has_edge(u, v): H.remove_edge(u, v) T = [w for w in nx.common_neighbors(H, u, v)] H.add_edge(u, v) else: T = [w for w in nx.common_neighbors(H, u, v)] X[cnt].append(len(T)) cnt += 1 #h12.Approximate katz for social graph if flag['h12'] is True: print("get feature h12") cnt = 0 for (u, v) in L: p = 0 if H.has_edge(u, v): H.remove_edge(u, v) for x in H.neighbors(u): for y in H.neighbors(v): if x == y or H.has_edge(x, y): p += 1 H.add_edge(u, v) else: for x in H.neighbors(u): for y in H.neighbors(v): if x == y or H.has_edge(x, y): p += 1 X[cnt].append(p) cnt += 1 if flag['h13'] is True: print("get feature h13") cnt = 0 with open("best_part_HB.txt", "r") as f: for line in f: v, c = line.split() c = int(c) HB.node[v]['community'] = c iters = nx.cn_soundarajan_hopcroft(HB, L) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(preds[(u, v)]) cnt += 1 if flag['h14'] is True: print("get feature h14") cnt = 0 with open("best_part_HB.txt", "r") as f: for line in f: if line == "": continue v, c = line.split() c = int(c) HB.node[v]['community'] = c iters = nx.ra_index_soundarajan_hopcroft(HB, L) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(preds[(u, v)]) cnt += 1 if flag['h15'] is True: print("get feature h15") cnt = 0 with open("best_part_HB.txt", "r") as f: for line in f: v, c = line.split() c = int(c) HB.node[v]['community'] = c iters = nx.within_inter_cluster(HB, L, delta=0.5) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(preds[(u, v)]) cnt += 1 if flag['h16'] is True: print("get feature h16") cnt = 0 with open("dendo_HB.txt", "r") as f: line = f.readline() p_dict = {(u, v): 0.0 for (u, v) in L} for line in f: if 'level' in line: l = int(line.split()[1]) if l != 0: iters = nx.cn_soundarajan_hopcroft(HB, L) for (u, v, p) in iters: p_dict[(u, v)] += p else: v, c = 
line.split() c = int(c) HB.node[v]['community'] = c iters = nx.cn_soundarajan_hopcroft(HB, L) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(p_dict[(u, v)] + preds[(u, v)]) cnt += 1 del p_dict del preds if flag['h17'] is True: print("get feature h17") cnt = 0 with open("dendo_HB.txt", "r") as f: line = f.readline() p_dict = {(u, v): 0.0 for (u, v) in L} for line in f: if 'level' in line: l = int(line.split()[1]) if l != 0: iters = nx.ra_index_soundarajan_hopcroft(HB, L) for (u, v, p) in iters: p_dict[(u, v)] += p else: v, c = line.split() c = int(c) HB.node[v]['community'] = c iters = nx.ra_index_soundarajan_hopcroft(HB, L) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(p_dict[(u, v)] + preds[(u, v)]) cnt += 1 del p_dict del preds if flag['h18'] is True: print("get feature h18") cnt = 0 with open("dendo_HB.txt", "r") as f: line = f.readline() p_dict = {(u, v): 0.0 for (u, v) in L} for line in f: if 'level' in line: l = int(line.split()[1]) if l != 0: iters = nx.within_inter_cluster(HB, L) for (u, v, p) in iters: p_dict[(u, v)] += p else: v, c = line.split() c = int(c) HB.node[v]['community'] = c iters = nx.within_inter_cluster(HB, L) preds = {(u, v): p for (u, v, p) in iters} for (u, v) in L: X[cnt].append(p_dict[(u, v)] + preds[(u, v)]) cnt += 1 del p_dict del preds return X
# Resource Allocation (sum of the fractions of a "resource" the end node receives from the middle nodes, based on their degrees)
ra = list(nx.resource_allocation_index(g))

# Adamic-Adar Index (Resource Allocation with log of degrees)
aa = list(nx.adamic_adar_index(g))

# Preferential Attachment (product of the nodes' degrees)
pa = list(nx.preferential_attachment(g))

# Community Common Neighbors (with a bonus for neighbors in the same community)
g.nodes[0]['community'] = 0
g.nodes[1]['community'] = 1
g.nodes[2]['community'] = 0
g.nodes[3]['community'] = 1
g.nodes[4]['community'] = 1
g.nodes[5]['community'] = 0
g.nodes[6]['community'] = 1
g.nodes[7]['community'] = 1
g.nodes[8]['community'] = 0
g.nodes[9]['community'] = 0
ccn = list(nx.cn_soundarajan_hopcroft(g))

# Community Resource Allocation (only considers nodes in the same community)
g.nodes[0]['community'] = 0
g.nodes[1]['community'] = 1
g.nodes[2]['community'] = 0
g.nodes[3]['community'] = 1
g.nodes[4]['community'] = 1
g.nodes[5]['community'] = 0
g.nodes[6]['community'] = 1
g.nodes[7]['community'] = 1
g.nodes[8]['community'] = 0
g.nodes[9]['community'] = 0
cra = list(nx.ra_index_soundarajan_hopcroft(g))
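# Related community-aware measure used in other snippets of this collection:
# within_inter_cluster reads the same 'community' attribute set above and
# compares within-community common neighbors to the remaining ones (delta
# guards against division by zero). Shown here as a hedged extension of the
# example, reusing the graph g from above.
wic = list(nx.within_inter_cluster(g, delta=0.5))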