def test_prf(fn1, fn2, sth, L):
    """Print a classification report for threshold-based missing-edge prediction.

    fn1/fn2 -- edge-list filenames (before / after snapshot, read via prep.read_edges)
    sth     -- dict mapping edge -> predicted strength
    L       -- rank index used to pick the score threshold
    NOTE(review): edges present in fn2 are labelled 1 and edges absent labelled 0,
    while y_score marks strong edges 1 — so "positive" here means "edge survives".
    """
    y_true = []
    y_score = []
    edges_1 = prep.read_edges(fn1)
    edges_2 = prep.read_edges(fn2)
    # Accumulate strengths per edge (predict_set mirrors sth here since keys are unique).
    predict_set = {}
    for key in sth.keys():
        predict_set[key] = predict_set.get(key, 0.) + sth[key]
    # Sort strongest-first; Python 2 iteritems().
    predict_set = sorted(predict_set.iteritems(), key=lambda d: d[1], reverse=True)
    # Threshold = strength of the L-th strongest edge.
    threshold = predict_set[L][1]
    for i in edges_1:
        if sth[i] > threshold:
            y_score.append(1)
        else:
            y_score.append(0)
    for i in edges_1:
        if i not in edges_2:
            y_true.append(0)
        else:
            y_true.append(1)
    # sklearn report; auc_score is the legacy (pre-0.14) name of roc_auc_score.
    print classification_report(y_true, y_score)
    print auc_score(y_true, y_score)
def cal_clust(fn1, fn2):
    """Score each test edge by the product of its endpoints' local clustering
    coefficients.

    fn1 -- edge list of the full (training) graph
    fn2 -- edge list of the edges to score
    Returns dict mapping edge -> Clust(u) * Clust(v).

    Bug fixed: the original guarded the second coefficient with
    ``edges_among_neig_i == 0`` (the *first* endpoint's counter), so an
    endpoint of degree 1 could reach the division with a zero denominator
    (len-1 == 0) and raise ZeroDivisionError.
    """
    def _local_clust(node, neig, all_edges):
        # Local clustering of one node: 2*links / (deg*(deg-1)).
        nbrs = neig[node]
        links = 0
        for edge in all_edges:
            if (edge[0] in nbrs) and (edge[1] in nbrs):
                links += 1
        # links == 0 also covers deg < 2, where the formula's denominator is 0.
        if links == 0:
            return 0.
        deg = len(nbrs)
        return 2.0 * links / (deg * (deg - 1))

    neig = get_neig(fn1)
    edges = prep.read_edges(fn2)
    all_edges = prep.read_edges(fn1)
    sth = {edge: 0 for edge in edges}
    for x in sth:
        sth[x] = _local_clust(x[0], neig, all_edges) * _local_clust(x[1], neig, all_edges)
    return sth
def cal_auc(fn1, fn2, sth):
    """Compute AUC for missing-edge prediction by sweeping the ranked scores.

    Positives are edges present in fn1 but absent from fn2; edges are ranked
    weakest-first so weak edges are predicted missing first.
    """
    edges_1 = prep.read_edges(fn1)
    edges_2 = prep.read_edges(fn2)
    edges_miss = edges_1 - edges_2          # positives: edges that disappeared
    pos = len(edges_miss)
    neg = len(edges_1) - pos
    # Accumulate per-edge strengths, then rank ascending (weakest first).
    scores = {}
    for key in sth.keys():
        scores[key] = scores.get(key, 0.) + sth[key]
    ranked = sorted(scores.iteritems(), key=lambda kv: kv[1])
    # Build the ROC curve point by point.
    xy_arr = []
    tp = 0.
    fp = 0.
    for edge, _strength in ranked:
        if edge in edges_miss:
            tp += 1
        else:
            fp += 1
        xy_arr.append([fp / neg, tp / pos])
    # Rectangle-rule area under the curve.
    auc = 0.
    prev_x = 0
    for x, y in xy_arr:
        if x != prev_x:
            auc += (x - prev_x) * y
            prev_x = x
    return auc
def test_prf(fn1,fn2,sth,L): y_true=[] y_score=[] edges_1=prep.read_edges(fn1) edges_2=prep.read_edges(fn2) predict_set={} for key in sth.keys(): predict_set[key]=predict_set.get(key,0.)+sth[key] predict_set=sorted(predict_set.iteritems(),key=lambda d:d[1],reverse=True)# threshold=predict_set[L][1] for i in edges_1: if sth[i]>threshold: y_score.append(1) else: y_score.append(0) for i in edges_1: if i not in edges_2: y_true.append(0) else: y_true.append(1) print classification_report(y_true,y_score) print auc_score(y_true,y_score)
def cal_auc(fn1, fn2, sth):
    """Compute AUC for missing-edge prediction (duplicate of the spaced variant).

    Positives = edges in fn1 but not fn2; edges are swept weakest-first and a
    ROC curve is accumulated, then integrated by the rectangle rule.
    """
    edges_1=prep.read_edges(fn1)
    edges_2=prep.read_edges(fn2)
    edges_miss=edges_1-edges_2
    pos=len(edges_miss)
    neg=len(edges_1)-pos
    # Accumulate strengths per edge.
    predict_set={}
    for key in sth.keys():
        predict_set[key]=predict_set.get(key,0.)+sth[key]
    # Ascending sort: weakest edges (most likely missing) come first.
    predict_set=sorted(predict_set.iteritems(),key=lambda d:d[1],reverse=False)  ##predict_set is list
    xy_arr=[]
    tp, fp = 0., 0.
    for i in range(len(predict_set)):
        if (predict_set[i][0] in edges_miss):
            tp+=1
        else:
            fp+=1
        xy_arr.append([fp/neg,tp/pos])
    # Integrate the ROC curve.
    auc=0.
    prev_x=0
    for x,y in xy_arr:
        if x!=prev_x:
            auc+=(x-prev_x)*y
            prev_x=x
    return auc
def cal_miss_ratio(fn1,fn2): edges_1=prep.read_edges(fn1) edges_2=prep.read_edges(fn2) print "edges_1 "+str(len(edges_1)) print "edges_2 "+str(len(edges_2)) miss_ratio=float(len(edges_1-edges_2))/len(edges_1) new_ratio=float(len(edges_2-edges_1))/len(edges_1) print "miss_ratio is "+str(miss_ratio) print "new added ratio is "+str(new_ratio)
def cal_miss_ratio(fn1, fn2):
    """Print edge counts of two snapshots plus the missing/new edge ratios
    (both relative to the fn1 edge count)."""
    edges_1 = prep.read_edges(fn1)
    edges_2 = prep.read_edges(fn2)
    print "edges_1 " + str(len(edges_1))
    print "edges_2 " + str(len(edges_2))
    # Set difference: edges dropped between snapshots / edges newly added.
    miss_ratio = float(len(edges_1 - edges_2)) / len(edges_1)
    new_ratio = float(len(edges_2 - edges_1)) / len(edges_1)
    print "miss_ratio is " + str(miss_ratio)
    print "new added ratio is " + str(new_ratio)
def cal_PA(fn1, fn2):
    """Preferential-attachment score: degree(u) * degree(v) for each test edge."""
    neig = get_neig(fn1)
    test_edges = prep.read_edges(fn2)
    return {e: len(neig[e[0]]) * len(neig[e[1]]) for e in test_edges}
def cal_edge_current_flow_betweenness_centrality(fn1, fn2):
    """Score test edges by current-flow (random-walk) edge betweenness.

    fn1 -- edge list of the full graph; fn2 -- edges to score.
    Returns dict mapping each fn2 edge -> its current-flow betweenness
    (0. when the edge is not found in any component's result).

    Bug fixed: the original iterated ``edge_flow.keys()`` and wrote every
    training-graph edge into ``sth``, so the returned dict was polluted with
    edges that were never requested. It also missed edges that networkx
    reports in the opposite (v, u) orientation — the same key-orientation
    issue noted in cal_edge_betweenness.
    """
    sth = {edge: 0. for edge in prep.read_edges(fn2)}
    G = nx.Graph()
    G.add_edges_from(prep.read_edges(fn1))
    # Current-flow betweenness is only defined per connected component.
    for g in list(nx.connected_component_subgraphs(G)):
        edge_flow = nx.edge_current_flow_betweenness_centrality(
            g, normalized=True, weight=None, dtype=np.float32)
        for x in sth:
            # networkx may report an undirected edge in either orientation.
            if x in edge_flow:
                sth[x] = edge_flow[x]
            elif (x[1], x[0]) in edge_flow:
                sth[x] = edge_flow[(x[1], x[0])]
    return sth
def main():
    """Driver: load the edge list, split into train/test, and run one of the
    link-prediction baselines on the training graph.

    NOTE(review): every call below rebinds ``sim``, so only the last predictor
    (SimRank) takes effect — presumably the others are meant to be toggled by
    commenting lines in/out. Confirm intended usage.
    """
    # path / fn / ratio are module-level settings defined elsewhere in the file.
    edges = prep.read_edges(path + fn)
    net = prep.build_net(edges)
    train, test = prep.divide_net(net, ratio)
    #CN
    sim = CN.predict_link(train)
    #Jaccard
    sim = Jaccard.predict_link(train)
    #AA
    sim = AA.predict_link(train)
    #RA
    sim = RA.predict_link(train)
    #PA
    sim = PA.predict_link(train)
    #Katz
    sim = Katz.predict_link(train)
    #SimRank
    sim = SimRank.predict_link(train)
def cal_edge_betweenness(fn1, fn2):
    """Score each fn2 edge by its shortest-path edge betweenness in the fn1 graph.

    Returns dict mapping edge -> betweenness (0 when the edge key is absent
    from networkx's result).
    """
    edges=prep.read_edges(fn2)
    sth={edge:0. for edge in edges}
    G=nx.Graph()
    edges_all=prep.read_edges(fn1)
    G.add_edges_from(edges_all)
    edge_betweenness=nx.edge_betweenness_centrality(G)
    for x in sth.keys():
        # Rebuild a clean (u, v) tuple before the lookup: per the original
        # author, the keys read by prep carry extra whitespace between the two
        # node ids, so x itself may not match networkx's key — TODO confirm
        # the actual key format.
        u=(x[0], x[1])
        if edge_betweenness.get(u) is not None:
            sth[x]=edge_betweenness.get(u)
        else:
            sth[x]=0
    return sth
def cal_RA(fn1, fn2):
    """Resource-allocation index: sum of 1/degree(z) over common neighbours z.

    fn1 -- edge list used to build the neighbourhood map
    fn2 -- edges to score
    Returns dict mapping edge -> RA score.

    Bug fixed: ``1/(len(neig[z]))`` is integer division under Python 2, so the
    contribution truncated to 0 for every common neighbour with degree > 1 —
    the score was almost always 0.
    """
    neig = get_neig(fn1)
    edges = prep.read_edges(fn2)
    sth = {edge: 0. for edge in edges}
    for x in sth:
        for z in (neig[x[0]] & neig[x[1]]):
            # 1.0 forces true (float) division.
            sth[x] += 1.0 / len(neig[z])
    return sth
def test():
    """Sanity-check SimRank: run it on the full net, dump the similarity
    matrix, and compare against a reference matrix ('oth.txt')."""
    # path / in_fn are module-level settings defined elsewhere in the file.
    edges = prep.read_edges(path + in_fn)
    net = prep.build_net(edges)
    # 0.8 is the SimRank decay factor passed to the predictor.
    my = SimRank.predict_link(net, 0.8)
    output_mat(my, path + 'my.txt')
    oth = read_mat(path + 'oth.txt')
    # cmp_mat presumably returns a diff/flag between the two matrices — verify.
    print cmp_mat(my, oth)
def cal_PA_extend(fn1, fn2):
    """Inverse preferential attachment: 1/(deg(u)*deg(v)), or 0. when either
    endpoint has degree 1."""
    neig = get_neig(fn1)
    sth = {}
    for edge in prep.read_edges(fn2):
        deg_u = len(neig[edge[0]])
        deg_v = len(neig[edge[1]])
        if deg_u == 1 or deg_v == 1:
            # Leaf endpoint: score forced to zero.
            sth[edge] = 0.
        else:
            sth[edge] = 1.0 / (deg_u * deg_v)
    return sth
def cal_communicability_centrality(fn1, fn2):
    """Score each fn2 edge by the min/max ratio of its endpoints'
    communicability (subgraph) centralities in the fn1 graph.

    Returns dict mapping edge -> min(c_u, c_v) / (max(c_u, c_v) + 1).

    Fix: removed the leftover debug ``print n1, n2`` that wrote two values to
    stdout for every single edge.
    """
    edges = prep.read_edges(fn2)
    sth = {edge: 0. for edge in edges}
    G = nx.Graph()
    G.add_edges_from(prep.read_edges(fn1))
    # NOTE: communicability_centrality was renamed subgraph_centrality in
    # newer networkx releases.
    communicability = nx.communicability_centrality(G)
    for x in sth:
        n1 = communicability.get(x[0])
        n2 = communicability.get(x[1])
        # Ratio in [0, 1); the +1 keeps the denominator a step above the
        # numerator (centralities here are positive).
        sth[x] = float(min(n1, n2)) / (max(n1, n2) + 1)
    return sth
def cal_auc(fn1, fn2, sth):
    """Compute AUC two ways: curve integration (returned) and the tie-aware
    rank formula (printed).

    Positives = edges in fn1 but not fn2. ``rank``/``num`` accumulate, per
    distinct score value, the sum of (descending) rank positions and the tie
    count, so tied scores share their average rank.
    """
    edges_1 = prep.read_edges(fn1)
    edges_2 = prep.read_edges(fn2)
    edges_miss = edges_1 - edges_2
    pos = len(edges_miss)
    neg = len(edges_1) - pos
    # Accumulate strengths per edge.
    predict_set = {}
    for key in sth:
        predict_set[key] = predict_set.get(key, 0.) + sth[key]
    # Ascending by strength: weakest (most likely missing) first.
    predict_set = sorted(predict_set.iteritems(), key=lambda d: d[1], reverse=False)  ##predict_set is list
    xy_arr = []
    tp, fp = 0., 0.
    summ = 0.0
    rank = {}
    num = {}
    # First pass: for each distinct score, total descending rank and tie count.
    for i in range(len(predict_set)):
        rank[predict_set[i][1]] = rank.get(predict_set[i][1], 0) + len(predict_set) - i
        num[predict_set[i][1]] = num.get(predict_set[i][1], 0) + 1
    # Second pass: sweep the ranking, accumulating ROC points and, for each
    # positive, its average (tie-shared) rank.
    for i in range(len(predict_set)):
        if (predict_set[i][0] in edges_miss):
            tp += 1
            summ += float(rank[predict_set[i][1]]) / num[predict_set[i][1]]
        else:
            fp += 1
        xy_arr.append([fp / neg, tp / pos])
    # Rectangle-rule area under the ROC curve.
    auc = 0.
    prev_x = 0
    for x, y in xy_arr:
        if x != prev_x:
            auc += (x - prev_x) * y
            prev_x = x
    # Python 2 print STATEMENT: this prints the whole expression
    # (sum_of_positive_ranks - pos*(pos+1)/2) / neg / pos — the Mann-Whitney
    # form of AUC — not print(...) divided afterwards.
    print(float(summ) - pos * (pos + 1) / 2) / neg / pos
    return auc
def cal_triangles(fn1, fn2):
    """Score each fn2 edge by the min/max ratio of the triangle counts of its
    endpoints in the fn1 graph: min(t_u, t_v) / (max(t_u, t_v) + 1)."""
    G = nx.Graph()
    G.add_edges_from(prep.read_edges(fn1))
    sth = {edge: 0. for edge in prep.read_edges(fn2)}
    for edge in sth:
        t_u = nx.triangles(G, edge[0])
        t_v = nx.triangles(G, edge[1])
        # +1 avoids division by zero when neither endpoint is in a triangle.
        sth[edge] = float(min(t_u, t_v)) / (max(t_u, t_v) + 1)
    return sth
def cal_embed(fn1, fn2):
    """Embeddedness: shared neighbours over the union of the endpoints' other
    neighbours, i.e. cn / ((deg_u - 1) + (deg_v - 1) - cn)."""
    neig = get_neig(fn1)
    sth = {edge: 0 for edge in prep.read_edges(fn2)}
    for edge in sth:
        shared = neig[edge[0]] & neig[edge[1]]
        if not shared:
            sth[edge] = 0.0
        else:
            # Subtract 1 from each degree to drop the edge's own endpoints.
            others = (len(neig[edge[0]]) - 1) + (len(neig[edge[1]]) - 1) - len(shared)
            sth[edge] = float(len(shared)) / others
    return sth
def cal_precision_recall(fn1, fn2, sth, L):
    """Precision/recall/F1 of predicting the L weakest edges as missing.

    fn1/fn2 -- edge-list filenames (before/after); positives are edges that
    disappeared between them.
    sth     -- dict mapping edge -> strength
    L       -- number of weakest edges to predict as missing
    Returns (precision, recall, F1).

    Bug fixed: F1 divided by ``precision + recall`` with no guard, raising
    ZeroDivisionError whenever none of the L weakest edges was actually
    missing; F1 is now defined as 0 in that case.
    """
    edges_1 = prep.read_edges(fn1)
    edges_2 = prep.read_edges(fn2)
    edges_miss = edges_1 - edges_2
    # Accumulate strengths per edge, then rank weakest-first.
    predict_set = {}
    for key in sth.keys():
        predict_set[key] = predict_set.get(key, 0.) + sth[key]
    predict_set = sorted(predict_set.iteritems(), key=lambda d: d[1])
    # The L weakest edges are the "predicted missing" set.
    predict_L = set(edge for edge, _ in predict_set[:L])
    hits = len(predict_L & edges_miss)
    precision = float(hits) / len(predict_L)
    recall = float(hits) / len(edges_miss)
    if precision + recall == 0:
        # No overlap at all: harmonic mean is defined as 0 here.
        F1 = 0.
    else:
        F1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, F1
def cal_JC(fn1, fn2):
    """Jaccard-style coefficient: |N(u) & N(v)| / (|N(u) | N(v)| - 2); the -2
    removes u and v themselves from the union. Degree-1 endpoints score 0."""
    neig = get_neig(fn1)
    sth = {}
    for edge in prep.read_edges(fn2):
        nbrs_u = neig[edge[0]]
        nbrs_v = neig[edge[1]]
        if len(nbrs_u) == 1 or len(nbrs_v) == 1:
            # A leaf endpoint would make the adjusted union empty.
            sth[edge] = 0
        else:
            sth[edge] = float(len(nbrs_u & nbrs_v)) / (len(nbrs_u | nbrs_v) - 2)
    return sth
def get_neig(fn):
    """Build the adjacency map of the graph in file fn.

    Returns dict mapping every node to the set of its neighbours (undirected:
    each edge is recorded in both directions).
    """
    neig = {}
    for node in prep.read_nodes(fn):
        neig[node] = set()
    for edge in prep.read_edges(fn):
        a, b = edge[0], edge[1]
        neig[a].add(b)
        neig[b].add(a)
    return neig
def cal_precision_recall(fn1, fn2, sth, L):
    """Precision/recall/F1 of flagging the L weakest-scored edges as missing.

    Positives are edges present in fn1 but absent from fn2.
    Returns (precision, recall, F1).
    """
    edges_1 = prep.read_edges(fn1)
    edges_2 = prep.read_edges(fn2)
    missing = edges_1 - edges_2
    # Aggregate strengths, then rank ascending so weak edges come first.
    scores = {}
    for key in sth.keys():
        scores[key] = scores.get(key, 0.) + sth[key]
    ranked = sorted(scores.iteritems(), key=lambda kv: kv[1])
    # Take the L weakest edges as the predicted-missing set.
    weakest = set()
    for edge, _strength in ranked[:L]:
        weakest.add(edge)
    overlap = len(weakest & missing)
    precision = float(overlap) / len(weakest)
    recall = float(overlap) / len(missing)
    F1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, F1
def cal_degree_ratio(fn1, fn2):
    """Score each fn2 edge by the degree ratio min(deg_u, deg_v) / max(deg_u, deg_v)
    of its endpoints in the fn1 graph.

    Improvements: the body duplicated get_neig() line for line — it now calls
    the shared helper — and the two-branch ratio is collapsed to min/max
    (identical result: float(i)/j when i <= j, else float(j)/i).
    """
    neig = get_neig(fn1)
    sth = {edge: 0 for edge in prep.read_edges(fn2)}
    for x in sth:
        deg_u = len(neig[x[0]])
        deg_v = len(neig[x[1]])
        sth[x] = float(min(deg_u, deg_v)) / max(deg_u, deg_v)
    return sth
import construct_graph
import preprocess

# Build, persist and draw the 1000-node subgraph: create nodes from the three
# sqlite databases, attach edges from the edge list, then pickle the graph.
# All paths are hard-coded to ../datasets/subset/.
graph = construct_graph.Graph(logfile='../datasets/subset/construct_graph.log')
# Restrict node creation to the URLs listed in the node-list file.
subnodes = preprocess.read_list(path='../datasets/subset/1000_nodelist_url.txt')
graph.create_nodes_from_db(longabsdb_path='../datasets/subset/1000_long_abstracts.db', labelsdb_path='../datasets/subset/1000_labels.db', lookupdb_path='../datasets/subset/1000_nodes_lookup.db', subnodes=subnodes)
print('nodes created..')
edges = preprocess.read_edges(path='../datasets/subset/1000_edgelist.txt')
print('edges are read...')
graph.create_edges_from_list(edges=edges)
print('edges are created...')
# Serialize the finished graph for later experiments.
graph.write_graph(path='../datasets/subset/1000_graph_sub.gpickle')
print('graph is written...')
graph.draw()