def preprocess(name):
    """Load the local data for ``name`` and build the training inputs.

    The supervised target ``adj_label`` has one entry for every pair of
    samples ``(i, j)`` with ``i < j`` that share a label, plus the identity
    diagonal.

    :param name: author name, forwarded to ``load_local_data``
    :return: tuple ``(adj_norm, adj_label, features, pos_weight, norm,
        labels)`` where ``adj_norm`` is the output of ``preprocess_graph`` on
        the training edges, ``adj_label`` is the label matrix converted by
        ``sparse_to_tuple``, ``pos_weight`` is the ratio of negative to
        positive entries of the label matrix and ``norm`` is the loss
        normalisation factor
    :raises ValueError: if no two samples share a label, in which case the
        positive-edge weight is undefined
    """
    adj, features, labels = load_local_data(exp_name, IDF_THRESHOLD, name=name)

    # Some preprocessing
    adj_norm = preprocess_graph(gen_train_edges(adj))

    # All index pairs i < j whose labels are equal. np.nonzero walks the
    # upper triangle in row-major order, i.e. the same order a nested
    # ``for i ... for j > i`` loop would produce.
    # NOTE(review): assumes labels is a flat sequence of scalar labels.
    label_arr = np.asarray(labels)
    same_label = label_arr[:, np.newaxis] == label_arr[np.newaxis, :]
    rows, cols = np.nonzero(np.triu(same_label, k=1))
    if rows.size == 0:
        raise ValueError(
            'no two samples share a label for %r; '
            'positive edge weight is undefined' % (name,))

    n_nodes = features.shape[0]
    adj_label = sp.coo_matrix(
        (np.ones(rows.size), (rows, cols)),
        shape=(n_nodes, n_nodes), dtype=np.float32)

    # Both statistics are taken before the identity diagonal is added.
    n_cells = n_nodes * n_nodes
    pos_weight = float(n_cells - adj_label.sum()) / adj_label.sum()
    norm = n_cells / float((n_cells - adj_label.nnz) * 2)

    adj_label = sparse_to_tuple(adj_label + sp.eye(n_nodes))

    if FLAGS.is_sparse:  # TODO to test
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    print('positive edge weight', pos_weight)  # negative edges/pos edges
    print('norm', norm)
    return adj_norm, adj_label, features, pos_weight, norm, labels
def gae_for_na(name, rawfeature):
    """Train a graph auto-encoder for one author name and evaluate it.

    After training, the node embeddings and the input features are each
    clustered and scored with pairwise precision/recall/F1, and t-SNE
    figures are written under ``settings.PIC_DIR/FINALResult``.

    :param name: author name
    :param rawfeature: feature-set selector forwarded to ``load_local_data``;
        it also decides which set of figures is written
    :return: ``([precision, recall, f1], num_nodes, n_clusters)`` for the
        clustering of the learned embeddings
    :raises ValueError: if ``model_str`` is neither ``'gcn_ae'`` nor
        ``'gcn_vae'``
    """
    # Fail early instead of hitting an unbound optimizer inside the loop.
    if model_str not in ('gcn_ae', 'gcn_vae'):
        raise ValueError('unsupported model_str: %r' % (model_str,))

    adj, features, labels = load_local_data(name=name, rawfeature=rawfeature)

    adj_train = gen_train_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]
    if FLAGS.is_sparse:  # TODO to test
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders
    placeholders = {
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
    }

    # Create model
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, input_feature_dim)
    else:
        model = GCNModelVAE(placeholders, input_feature_dim, num_nodes)

    n_cells = adj.shape[0] * adj.shape[0]
    pos_weight = float(n_cells - adj.sum()) / adj.sum()  # negative edges/pos edges
    print('positive edge weight', pos_weight)
    norm = n_cells / float((n_cells - adj.nnz) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        flat_labels = tf.reshape(
            tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                      validate_indices=False), [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=flat_labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=flat_labels,
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    adj_label = sparse_to_tuple(adj_train + sp.eye(adj_train.shape[0]))

    # The feed dictionary does not change between epochs, so build it once;
    # this also keeps it defined when FLAGS.epochs is 0.
    feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)

    # The context manager closes the session once the embeddings are fetched.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Train model
        for epoch in range(FLAGS.epochs):
            t = time.time()
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            _, avg_cost, avg_accuracy = sess.run(
                [opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss=", "{:.5f}".format(avg_cost),
                  "train_acc=", "{:.5f}".format(avg_accuracy),
                  "time=", "{:.5f}".format(time.time() - t))

        # Embeddings are read with dropout disabled.
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)  # z_mean is better

    n_clusters = len(set(labels))
    emb_norm = normalize_vectors(emb)

    # Score the learned embeddings ...
    clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    print('pairwise precision', '{:.5f}'.format(prec),
          'recall', '{:.5f}'.format(rec), 'f1', '{:.5f}'.format(f1))

    # ... and the input features, as a baseline.
    clusters_pred2 = clustering(features, num_clusters=n_clusters)
    prec2, rec2, f12 = pairwise_precision_recall_f1(clusters_pred2, labels)
    print('pairwise precision', '{:.5f}'.format(prec2),
          'recall', '{:.5f}'.format(rec2), 'f1', '{:.5f}'.format(f12))

    # Labels handed to the plotting helper are shifted by 2.
    # NOTE(review): the reason for the offset is not visible here - confirm
    # against tSNEAnanlyse.
    labels = np.array(labels) + 2
    clusters_pred = np.array(clusters_pred) + 2
    clusters_pred2 = np.array(clusters_pred2) + 2

    def figure_path(template):
        return join(settings.PIC_DIR, "FINALResult", template % (name))

    if rawfeature == RAW_INTER_NAME:
        tSNEAnanlyse(emb_norm, labels,
                     figure_path("rawReature_%s_gae_final_raw.png"))
        tSNEAnanlyse(features, labels,
                     figure_path("rawReature_%s_gae_features_raw.png"))
    elif rawfeature == ATTENTIONFEATURE:
        # Only this branch uses the 2-D projections, so the (expensive)
        # t-SNE fits are done here rather than unconditionally.
        from sklearn.manifold import TSNE
        features_new = TSNE(learning_rate=100).fit_transform(features)
        emb_new = TSNE(learning_rate=100).fit_transform(emb_norm)

        tSNEAnanlyse(emb_new, labels,
                     figure_path("rawReature_%s_gae_final.png"))
        tSNEAnanlyse(features_new, labels,
                     figure_path("rawReature_%s_gae_features.png"))
        tSNEAnanlyse(emb_new, clusters_pred,
                     figure_path("rawReature_%s_gae_final_clusterresult.png"))
        tSNEAnanlyse(features_new, clusters_pred2,
                     figure_path("rawReature_%s_gae_features_clusterresult.png"))
    else:
        tSNEAnanlyse(emb_norm, labels,
                     figure_path("rawReature_%s_gae_final_triplet.png"))
        tSNEAnanlyse(features, labels,
                     figure_path("rawReature_%s_gae_features_triplet.png"))

    return [prec, rec, f1], num_nodes, n_clusters
def gae_for_na(name, n_clusters):
    """Train a graph auto-encoder for one author name and cluster its items.

    The learned embeddings are normalised, clustered into ``n_clusters``
    groups, and the ids returned by ``load_local_data`` are grouped by
    predicted cluster.

    :param name: author name
    :param n_clusters: number of clusters to request; values below 1 are
        raised to 1
    :return: list of clusters, each a list of ids, in order of first
        appearance of each predicted label
    :raises ValueError: if ``model_str`` is neither ``'gcn_ae'`` nor
        ``'gcn_vae'``
    """
    # Fail early instead of hitting an unbound optimizer inside the loop.
    if model_str not in ('gcn_ae', 'gcn_vae'):
        raise ValueError('unsupported model_str: %r' % (model_str,))

    # Adjacency matrix, per-document features, and the ids aligned with the
    # feature rows by index.
    adj, features, pids = load_local_data(name=name)

    # NOTE(review): gen_train_edges appears to only strip the diagonal of
    # adj (csr_matrix) - confirm against its definition.
    adj_train = gen_train_edges(adj)
    adj = adj_train

    # Some preprocessing
    # NOTE(review): presumably returns the normalised adjacency in
    # (coords, values, shape) tuple form - confirm.
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]  # number of nodes
    input_feature_dim = features.shape[1]  # feature dimensionality
    if FLAGS.is_sparse:  # TODO to test
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders: graph inputs that are fed with concrete values at
    # run time.
    placeholders = {
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        # Falls back to 0 when no dropout value is fed.
        'dropout': tf.placeholder_with_default(0., shape=()),
    }

    # Create model
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, input_feature_dim)
    else:
        model = GCNModelVAE(placeholders, input_feature_dim, num_nodes)

    n_cells = adj.shape[0] * adj.shape[0]
    pos_weight = float(n_cells - adj.sum()) / adj.sum()  # negative edges/pos edges
    print('positive edge weight', pos_weight)
    # nnz is the number of stored (non-zero) entries of the sparse matrix.
    norm = n_cells / float((n_cells - adj.nnz) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        flat_labels = tf.reshape(
            tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                      validate_indices=False), [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=flat_labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=flat_labels,
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Reconstruction target: training adjacency plus the identity diagonal,
    # converted to (coords, values, shape) tuple form.
    adj_label = sparse_to_tuple(adj_train + sp.eye(adj_train.shape[0]))

    # The feed dictionary does not change between epochs, so build it once;
    # this also keeps it defined when FLAGS.epochs is 0.
    feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)

    # The context manager closes the session once the embeddings are fetched.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Train model
        for epoch in range(FLAGS.epochs):
            t = time.time()
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            _, avg_cost, avg_accuracy = sess.run(
                [opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss=", "{:.5f}".format(avg_cost),
                  "train_acc=", "{:.5f}".format(avg_accuracy),
                  "time=", "{:.5f}".format(time.time() - t))

        # Encoder output, read with dropout disabled.
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)  # z_mean is better

    emb_norm = normalize_vectors(emb)
    clusters_pred = clustering(emb_norm, num_clusters=max(n_clusters, 1))
    print('clusters_pred: ', clusters_pred)

    # Group ids by predicted cluster label; dict insertion order gives the
    # clusters in order of first appearance.
    grouped = {}
    for i, pred_label in enumerate(clusters_pred):
        grouped.setdefault(str(pred_label), []).append(pids[i])
    return list(grouped.values())
    # NOTE(review): a commented-out legacy block followed this return and
    # was cut off in the reviewed source; it has been dropped.
def gae_for_na(name, mode=0):
    """Train a graph auto-encoder for one author name and store its embeddings.

    The embedding of every id returned by ``load_local_data`` is written to
    the module-level ``local_output`` mapping. Unless ``mode`` is 2, the
    embeddings are also clustered and scored against the true labels.

    :param name: author name
    :param mode: 0 (train) or 1 (val) evaluate the clustering; 2 only
        stores the embeddings and skips evaluation
    :return: ``([precision, recall, f1], num_nodes, n_clusters)``, or
        ``None`` when ``mode`` is 2
    :raises ValueError: if ``model_str`` is neither ``'gcn_ae'`` nor
        ``'gcn_vae'``
    """
    # Fail early instead of hitting an unbound optimizer inside the loop.
    if model_str not in ('gcn_ae', 'gcn_vae'):
        raise ValueError('unsupported model_str: %r' % (model_str,))

    pids, adj, features, labels = load_local_data(name=name)

    adj_train = gen_train_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]
    if FLAGS.is_sparse:  # TODO to test
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders
    placeholders = {
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
    }

    # Create model
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, input_feature_dim)
    else:
        model = GCNModelVAE(placeholders, input_feature_dim, num_nodes)

    n_cells = adj.shape[0] * adj.shape[0]
    pos_weight = float(n_cells - adj.sum()) / adj.sum()  # negative edges/pos edges
    print('positive edge weight', pos_weight)
    norm = n_cells / float((n_cells - adj.nnz) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        flat_labels = tf.reshape(
            tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                      validate_indices=False), [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=flat_labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=flat_labels,
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    adj_label = sparse_to_tuple(adj_train + sp.eye(adj_train.shape[0]))

    # The feed dictionary does not change between epochs, so build it once;
    # this also keeps it defined when FLAGS.epochs is 0.
    feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)

    # The context manager closes the session once the embeddings are fetched.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Train model
        for epoch in range(FLAGS.epochs):
            t = time.time()
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            _, avg_cost, avg_accuracy = sess.run(
                [opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss=", "{:.5f}".format(avg_cost),
                  "train_acc=", "{:.5f}".format(avg_accuracy),
                  "time=", "{:.5f}".format(time.time() - t))

        # Embeddings are read with dropout disabled.
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)  # z_mean is better

    # Publish the per-id embeddings through the module-level mapping.
    for idx, pid in enumerate(pids):
        local_output[pid] = emb[idx]

    # Mode 2 has nothing to evaluate against; return before touching the
    # metrics, which would otherwise be unbound.
    if mode == 2:
        return None

    # Train mode: compute F1
    n_clusters = len(set(labels))
    emb_norm = normalize_vectors(emb)
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    clusterer.fit(emb_norm)
    prec, rec, f1 = pairwise_precision_recall_f1(clusterer.labels_, labels)
    print('pairwise precision', '{:.5f}'.format(prec),
          'recall', '{:.5f}'.format(rec), 'f1', '{:.5f}'.format(f1))
    return [prec, rec, f1], num_nodes, n_clusters