def plot_embeddings(embeddings, ):
    """Reduce the entity embeddings to 2-D with t-SNE and display them.

    :param embeddings: mapping from node id to its embedding vector
    :return: None (shows a matplotlib figure)
    """
    X, Y = read_node_label(
        '/Users/admin/Desktop/GraphEmbedding-deeplearning/data/XunYiWenYao/寻医问药category.txt'
    )
    vectors = np.array([embeddings[node] for node in X])
    positions = TSNE(n_components=2).fit_transform(vectors)
    # Group node indices by their first label so each class gets one colour.
    label_groups = {}
    for idx, labels in enumerate(Y):
        label_groups.setdefault(labels[0], []).append(idx)
    for label, members in label_groups.items():
        plt.scatter(positions[members, 0], positions[members, 1], label=label)
    plt.legend()
    plt.show()
def plot(embeddings):
    """Visualise embeddings keyed by 1-based string node ids ('1'..'N').

    Reduces the vectors to 2-D with t-SNE and scatter-plots them, coloured
    by the community label read from the Net300 dataset.

    :param embeddings: mapping from str(node_id) to embedding vector
    :return: None (shows a matplotlib figure)
    """
    # Node ids in the edgelist start at 1, so collect vectors in id order.
    emb_list = np.array([embeddings[str(i)] for i in range(1, len(embeddings) + 1)])
    node_pos = TSNE(n_components=2).fit_transform(emb_list)
    X, Y = read_node_label('../data/Net300/community.dat', is_net=True)
    # Removed the unused `colors` list and the dead commented-out 3-D code
    # that cluttered the original body.
    # Map each community label to the list of node indices carrying it.
    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], []).append(i)
    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()  # one legend entry per community
    plt.show()
def plot_embeddings(embeddings, ):
    """Project the Brazil-airport embeddings to 2-D via t-SNE and plot them,
    one colour per ground-truth label."""
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt',
                           skip_head=True)
    vectors = np.array([embeddings[node] for node in X])
    positions = TSNE(n_components=2).fit_transform(vectors)
    groups = {}
    for idx, labels in enumerate(Y):
        groups.setdefault(labels[0], []).append(idx)
    for label, members in groups.items():
        plt.scatter(positions[members, 0], positions[members, 1], label=label)
    plt.legend()
    plt.show()
def evaluate_embeddings(embeddings):
    """Train a logistic-regression node classifier on 80% of the labelled
    Brazil-airport nodes and evaluate it on the remaining 20%."""
    X, Y = read_node_label('../data/flight/labels-brazil-airports.txt',
                           skip_head=True)
    train_fraction = 0.8
    print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100))
    classifier = Classifier(embeddings=embeddings, clf=LogisticRegression())
    classifier.split_train_evaluate(X, Y, train_fraction)
def evaluate_embeddings(embeddings):
    """Score embedding quality by node classification on the wiki labels."""
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    train_fraction = 0.8
    print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100))
    classifier = Classifier(embeddings=embeddings, clf=LogisticRegression())
    classifier.split_train_evaluate(X, Y, train_fraction)
def plot_embeddings_3D(embeddings,):
    """Reduce the embeddings to 3-D with t-SNE and show a 3-D scatter plot.

    :param embeddings: mapping from node id to embedding vector
    :return: None (shows a matplotlib figure)
    """
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    emb_list = np.array([embeddings[k] for k in X])
    model = TSNE(n_components=3)
    node_pos = model.fit_transform(emb_list)  # (n_nodes, 3) coordinate matrix
    # Build a label -> node-index list so each class is drawn in one colour.
    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], []).append(i)
    fig = plt.figure()
    # BUG FIX: `Axes3D(fig)` stopped auto-attaching the axes to the figure in
    # Matplotlib 3.4 (and raises in 3.7+); add_subplot(projection='3d') is
    # the supported way to create 3-D axes.
    ax = fig.add_subplot(111, projection='3d')
    for c, idx in color_idx.items():
        ax.scatter(node_pos[idx, 0], node_pos[idx, 1], node_pos[idx, 2],
                   label=c)
    plt.legend()  # legend of class labels
    plt.show()
def plot_embeddings(embeddings,):
    """Scatter-plot 2-D embeddings directly (no t-SNE), annotating each
    point with its node id.

    :param embeddings: embedding container accepted by ``field_to_json``;
        presumably maps node id -> 2-D vector — TODO confirm upstream type
    :return: None (shows a matplotlib figure)
    """
    X, Y = read_node_label('../data/wiki/karate.txt')
    embeddings = field_to_json(embeddings)
    # PERF FIX: materialise the keys once — the original rebuilt `list(key)`
    # on every loop iteration, which is O(n^2) overall. Dead commented-out
    # t-SNE/edge-drawing code removed.
    keys = list(embeddings.keys())
    emb_list = np.array([embeddings[k] for k in keys])
    for i, node_id in enumerate(keys):
        plt.scatter(emb_list[i, 0], emb_list[i, 1], alpha=0.5, s=150,
                    color='b')
        plt.text(emb_list[i, 0], emb_list[i, 1], node_id,
                 horizontalalignment='center', verticalalignment='center')
    plt.legend()
    plt.show()
def evaluate_embeddings(embeddings):
    """Evaluate embedding quality via node classification on wiki labels."""
    # Read the ground-truth class label of every node.
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    # 80% of the nodes train the classifier; the rest are held out for testing.
    train_fraction = 0.8
    print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100))
    # A logistic-regression classifier over the vectors measures how well
    # the embedding separates the classes.
    classifier = Classifier(embeddings=embeddings, clf=LogisticRegression())
    classifier.split_train_evaluate(X, Y, train_fraction)
def plot_embeddings(embeddings,):
    """Draw the Misérables graph twice via t-SNE positions and save both PNGs.

    First colours nodes by their ground-truth label, then by a k-means
    clustering (K=8) of the embeddings. Relies on module-level globals
    `G` (the networkx graph), `COLOR_MAP`, and `kmeans`.

    :param embeddings: mapping from node id to embedding vector
    :return: (color_dict, cluster) — node->colour for the real labels,
        and the k-means cluster assignment mapping
    """
    X, Y = read_node_label('./data/miserables/miserables_labels.txt')
    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)
    color_idx = {}   # label -> list of node indices
    color_dict = {}  # node id -> colour for its ground-truth label
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)
        color_dict[X[i]] = COLOR_MAP[int(Y[i][0])]
    plt.figure(figsize=(10, 10))
    # Axes filling the figure minus a 5% margin, with no visible frame.
    ax = plt.axes([0.05, 0.05, 1 - 0.05 * 2, 1 - 0.05 * 2])
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_visible(False)
    ax.spines["left"].set_visible(False)
    # t-SNE coordinates double as networkx layout positions.
    pos = {}
    for i, x in enumerate(X):
        pos[x] = node_pos[i]
    color_list = []
    cluster_color_list = []
    for node in G.nodes:
        color_list.append(color_dict[node])
    nx.draw_networkx_nodes(G, pos, node_size=80, node_color=color_list,
                           edgecolors='white', linewidths=0.7)
    #for c, idx in color_idx.items():
    #plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)  # c=node_colors)
    # plt.legend()
    # Timestamp the output file names; NOTE(review): assumes the dated
    # ./pics/miserables/<date>/ directory already exists — confirm.
    t = datetime.datetime.now()
    plt.savefig("./pics/miserables/{}-{}-{}/dw-{}-{}-{}-emb-real.png".format(
        t.year, t.month, t.day, t.hour, t.minute, int(t.second)
    ))
    plt.cla()
    # Second picture: same layout, nodes coloured by k-means cluster.
    cluster = kmeans(embeddings, K=8)
    cluster_color = []
    for node in G.nodes:
        cluster_color.append(COLOR_MAP[cluster[node]])
    nx.draw_networkx_nodes(G, pos, node_size=80, node_color=cluster_color,
                           edgecolors='white', linewidths=0.7)
    plt.savefig("./pics/miserables/{}-{}-{}/dw-{}-{}-{}-emb-cluster.png".format(
        t.year, t.month, t.day, t.hour, t.minute, int(t.second)
    ))
    plt.cla()
    return color_dict, cluster
def evaluate_embeddings(embeddings):
    """Classify the ETH phishing nodes from their embeddings and report
    the split-train-evaluate metrics."""
    X, Y = read_node_label(
        '../data/ETH/Phishing node classification/label.txt', skip_head=True)
    train_fraction = 0.8
    print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100))
    classifier = Classifier(embeddings=embeddings, clf=LogisticRegression())
    classifier.split_train_evaluate(X, Y, train_fraction)
def evaluate_embeddings(embeddings):
    """Split the labelled nodes, train a classifier, and return its metrics.

    :param embeddings: mapping from node id to embedding vector
    :return: whatever ``split_train_evaluate`` reports
    """
    # Alternative label files tried during experiments:
    # X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    # X, Y = read_node_label('../data/flight/labels-brazil-airports.txt', True)
    # X, Y = read_node_label('../data/flight/labels-europe-airports.txt', True)
    X, Y = read_node_label('../data/flight/labels-usa-airports.txt', True)
    train_fraction = 0.8  # fraction of nodes used for training
    print("Training classifier using {:.2%} nodes...".format(train_fraction))
    classifier = Classifier(embeddings=embeddings, clf=LogisticRegression())
    return classifier.split_train_evaluate(X, Y, train_fraction)
def evaluate_embeddings(embeddings):
    """Judge embedding quality with a multi-class node classifier.

    Every entity carries a category label, so a classifier trained on the
    vectors measures how well the embedding separates the categories.

    :param embeddings: mapping from node id to embedding vector
    :return: None (metrics are reported by the classifier)
    """
    X, Y = read_node_label(
        '/Users/admin/Desktop/GraphEmbedding-deeplearning/data/XunYiWenYao/寻医问药category.txt'
    )
    train_fraction = 0.8
    print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100))
    classifier = Classifier(embeddings=embeddings, clf=LogisticRegression())
    classifier.split_train_evaluate(X, Y, train_fraction)
def NMI_Q(embeddings, num_coms):
    """Cluster the embeddings with k-means and score the partition against
    the ground-truth communities.

    :param embeddings: mapping from str(node_id) (1-based) to vector
    :param num_coms: number of communities to ask k-means for
    :return: the NMI between the predicted and the real partition
    """
    # Edgelist node ids start at 1 while emb_list indices start at 0.
    emb_list = [embeddings[str(i)] for i in range(1, len(embeddings) + 1)]
    predict = kmeans_from_vec(emb_list, num_coms)  # member indices start at 0
    predict = [[str(x) for x in com] for com in predict]
    X, Y = read_node_label('../data/Net_mu/Net0.8/community.dat', is_net=True)
    # Ground truth: community label -> list of node indices, as str lists.
    comu_idx = {}
    for i in range(len(X)):
        comu_idx.setdefault(Y[i][0], []).append(i)
    real = [[str(x) for x in comu_idx[key]] for key in comu_idx]
    mni = NMI(predict, real)
    # BUG FIX: the original assigned into `predict_` whose initialisation
    # was commented out, raising NameError; and `predict_ = predict` would
    # have aliased (and clobbered) `predict`. Build a fresh list with the
    # member ids shifted back to 1-based instead.
    predict_ = [[str(int(x) + 1) for x in com] for com in predict]
    # Modularity was computed here in earlier experiments:
    # q = Q(predict_, G)
    # return mni, q
    return mni
def plot_embeddings(embeddings, ):
    """Scatter-plot the first two embedding dimensions of each karate node
    (no t-SNE, no class colouring)."""
    X, Y = read_node_label('../data/wiki/karate.txt', skip_head=True)
    points = np.array([embeddings[node] for node in X])
    for row in points:
        plt.scatter(row[0], row[1])
    plt.legend()
    plt.show()
def evaluate_embeddings(embeddings):
    """Train a logistic-regression classifier on the flight labels and
    print accuracy, recall and F1 on a 40% held-out split.

    :param embeddings: mapping from node id to embedding vector
    :return: None (metrics are printed)
    """
    X, Y = read_node_label('../data/flight/igraph2Label.txt', skip_head=True)
    # Removed the large amount of dead commented-out code (alternative
    # classifiers, manual accuracy loop, debug prints) from the original.
    # Stack the embedding of every labelled node into one feature matrix.
    X = numpy.asarray([embeddings[x] for x in X])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)
    clf = linear_model.LogisticRegression(solver='liblinear')
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Labels are read as strings; cast both sides to int16 so the sklearn
    # metrics compare numerically identical arrays.
    y_test = np.array(y_test).astype(np.int16)
    y_pred = y_pred.astype(np.int16)
    print("acc", accuracy_score(y_test, y_pred))
    print("recall", recall_score(y_test, y_pred))
    print("f1", f1_score(y_test, y_pred))
def plot_embeddings(embeddings, ):
    """Project the friend-baseline embeddings to 2-D with t-SNE and plot
    them, coloured by label."""
    X, Y = read_node_label('../data/wiki/friend_labels_baseline.txt')
    vectors = np.array([embeddings[node] for node in X])
    positions = TSNE(n_components=2).fit_transform(vectors)
    groups = {}
    for idx, labels in enumerate(Y):
        groups.setdefault(labels[0], []).append(idx)
    for label, members in groups.items():
        plt.scatter(positions[members, 0], positions[members, 1], label=label)
    plt.legend()
    plt.show()
def plot_embeddings(embeddings,):
    """t-SNE projection of the wiki embeddings on a large high-dpi canvas,
    one colour per label."""
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    vectors = np.array([embeddings[node] for node in X])
    positions = TSNE(n_components=2).fit_transform(vectors)
    groups = {}
    for idx, labels in enumerate(Y):
        groups.setdefault(labels[0], []).append(idx)
    # High resolution (dpi) and a large canvas (figsize).
    plt.figure(dpi=300, figsize=(24, 12))
    for label, members in groups.items():
        plt.scatter(positions[members, 0], positions[members, 1], label=label)
    plt.legend()
    plt.show()
def plot(embeddings):
    """Draw the Net300 embeddings in 2-D, coloured by community."""
    # Collect the vectors in node-id order (ids are 1-based strings).
    vectors = np.array([embeddings[str(i)] for i in range(1, len(embeddings) + 1)])
    # t-SNE projects the vectors down to 2-D coordinates.
    coords = TSNE(n_components=2).fit_transform(vectors)
    X, Y = read_node_label('../data/Net300/community.dat', is_net=True)
    # e.g. X = ['1', '2', '3', '4'], Y = [['4'], ['2'], ['4'], ['1']]
    groups = {}
    for idx, labels in enumerate(Y):
        groups.setdefault(labels[0], []).append(idx)
    for community, members in groups.items():
        plt.scatter(coords[members, 0], coords[members, 1], label=community)
    plt.legend()
    plt.show()
def plot_embeddings(embeddings,):
    """Reduce the wiki embeddings to 2-D with t-SNE and visualise them.

    :param embeddings: mapping from node id to embedding vector
    """
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    vectors = np.array([embeddings[node] for node in X])
    positions = TSNE(n_components=2).fit_transform(vectors)  # (n, 2) matrix
    # Label -> list of node indices; one scatter call (= one colour) per label.
    groups = {}
    for idx, labels in enumerate(Y):
        groups.setdefault(labels[0], []).append(idx)
    # High resolution (dpi) and a large canvas (figsize).
    plt.figure(dpi=300, figsize=(24, 12))
    for label, members in groups.items():
        plt.scatter(positions[members, 0], positions[members, 1], label=label)
    # plt.legend()  # legend intentionally left disabled, as in the original
    plt.show()
plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) # c=node_colors) plt.legend() plt.show() d_set = [ "Facebook-Page2Page", "PubMed-Diabetes", "Terrorists-Relation" ] # ["Cora", "CiteSeer", "Facebook-Page2Page", "PubMed-Diabetes", "Terrorists-Relation", "Zachary-Karate", "Internet-Industry-Partnerships"] # [sparse, dense] mdl = ["Node2Vec", "SDNE", "DeepWalk", "LINE"] # ["Node2Vec", "SDNE", "Struc2Vec", "DeepWalk", "LINE"] for i in range(len(d_set)): # Load/Prepare data graph_fname = "data/" + d_set[i] + "/" + d_set[i] X, Y = read_node_label(graph_fname + ".labels", skip_head=True) X = np.asarray(X) Y = np.asarray(Y) # Preserve ratio/percentage of samples per class using efficent data-splitting && data-resampling strageies train_frac = 0.8 test_frac = round((1 - train_frac), 1) print("Training classifier using {:.2f}% nodes...".format(train_frac * 100)) if not os.path.isfile(graph_fname + "_strat_train_test.splits"): stratified_data = StratifiedShuffleSplit(n_splits=1, test_size=test_frac, train_size=train_frac, random_state=42) for train_index, test_index in stratified_data.split(X, Y): strat_X_train, strat_y_train = X[train_index], Y[train_index]