def train(self, adj_list, fea_list, y_train, y_val, y_test,
          train_mask, val_mask, test_mask, y_all, all_mask,
          rawlabels, needtSNE=False, rawFeature=[]):
    prec, rec, f1 = 0.0, 0.0, 0.0
    nb_nodes = fea_list[0].shape[0]
    ft_size = fea_list[0].shape[1]
    nb_classes = y_train.shape[1]
    # nb_classes = len(set(rawlabels))

    # adj = adj.todense()
    # features = features[np.newaxis]  # [1, nb_node, ft_size]
    fea_list = [fea[np.newaxis] for fea in fea_list]
    adj_list = [adj[np.newaxis] for adj in adj_list]
    y_train = y_train[np.newaxis]
    y_val = y_val[np.newaxis]
    y_test = y_test[np.newaxis]
    y_all = y_all[np.newaxis]
    train_mask = train_mask[np.newaxis]
    val_mask = val_mask[np.newaxis]
    test_mask = test_mask[np.newaxis]
    all_mask = all_mask[np.newaxis]

    biases_list = [process.adj_to_bias(adj, [nb_nodes], nhood=1) for adj in adj_list]

    print('build graph...')
    with tf.Graph().as_default():
        with tf.name_scope('input'):
            metric_ftr_in = tf.placeholder(dtype=tf.float32, shape=(nb_nodes, ft_size), name='metric_ftr_in')
            ftr_in_list = [tf.placeholder(dtype=tf.float32,
                                          shape=(batch_size, nb_nodes, ft_size),
                                          name='ftr_in_{}'.format(i))
                           for i in range(len(fea_list))]
            bias_in_list = [tf.placeholder(dtype=tf.float32,
                                           shape=(batch_size, nb_nodes, nb_nodes),
                                           name='bias_in_{}'.format(i))
                            for i in range(len(biases_list))]
            lbl_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes, nb_classes), name='lbl_in')
            msk_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes), name='msk_in')
            attn_drop = tf.placeholder(dtype=tf.float32, shape=(), name='attn_drop')
            ffd_drop = tf.placeholder(dtype=tf.float32, shape=(), name='ffd_drop')
            is_train = tf.placeholder(dtype=tf.bool, shape=(), name='is_train')

        # forward
        logits, final_embedding, att_val, centers_embed, test_final_embeed = model.inference(
            ftr_in_list, nb_classes, nb_nodes, is_train,
            attn_drop, ffd_drop,
            bias_mat_list=bias_in_list,
            hid_units=hid_units, n_heads=n_heads,
            features=fea_list, labels=rawlabels,
            residual=residual, activation=nonlinearity,
            feature_size=ft_size)
        # final_embedding: Tensor("Sum:0", shape=(286, 64), dtype=float32)
        # logits: Tensor("ExpandDims_3:0", shape=(1, 286, 30), dtype=float32)

        # compute the masked loss
        # lab_list = tf.placeholder(dtype=tf.float32, shape=(nb_nodes, ), name='lab_list')
        # ftr_resh = tf.placeholder(dtype=tf.float32, shape=(nb_nodes, ft_size), name='ftr_resh')
        log_resh = tf.reshape(logits, [-1, nb_classes])
        lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
        msk_resh = tf.reshape(msk_in, [-1])
        print("final_embedding: checkout", final_embedding)
        print("logits: checkout", logits)
        print("log_resh: checkout", log_resh)
        # print("ftr_resh: ", ftr_resh)
        print("lab_resh: ", lab_resh)
        print("fea_list: ", fea_list)
        print("centers_embed: ", centers_embed)
        print("batch_size, nb_nodes, nb_classes, ft_size", batch_size, nb_nodes, nb_classes, ft_size)

        osm_caa_loss = OSM_CAA_Loss(batch_size=nb_nodes)
        osm_loss = osm_caa_loss.forward
        # final_embedding: Tensor("Sum:0", shape=(286, 64), dtype=float32)
        # logits: Tensor("ExpandDims_3:0", shape=(1, 286, 30), dtype=float32)
        # log_resh: Tensor("Reshape:0", shape=(286, 30), dtype=float32)
        # ftr_resh: Tensor("ftr_resh:0", shape=(286, 100), dtype=float32)
        # lab_resh: Tensor("Reshape_1:0", shape=(286, 30), dtype=int32)

        osmLoss, checkvalue = osm_loss(final_embedding, rawlabels, centers_embed)
        # osmLoss, checkvalue = osm_loss(metric_ftr_in, rawlabels, centers_embed)
        SoftMaxloss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)

        loss = osmLoss  # why does the loss stay almost constant?
        # loss = SoftMaxloss
        accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)

        # optimize
        train_op = model.training(loss, lr, l2_coef)

        Path = 'pre_trained/{}/{}/{}'.format(dataset, dataset, self.name)
        self.mkdir(Path)
        checkpt_file = '{}/allMP_multi_{}_.ckpt'.format(Path, featype)
        print('model: {}'.format(checkpt_file))

        saver = tf.train.Saver()
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

        vlss_mn = np.inf
        vacc_mx = 0.0
        curr_step = 0

        with tf.Session(config=config) as sess:
            sess.run(init_op)

            train_loss_avg = 0
            train_acc_avg = 0
            val_loss_avg = 0
            val_acc_avg = 0

            for epoch in range(nb_epochs):
                tr_step = 0
                tr_size = fea_list[0].shape[0]

                # ================ training ================
                while tr_step * batch_size < tr_size:
                    fd1 = {i: d[tr_step * batch_size:(tr_step + 1) * batch_size]
                           for i, d in zip(ftr_in_list, fea_list)}
                    fd2 = {i: d[tr_step * batch_size:(tr_step + 1) * batch_size]
                           for i, d in zip(bias_in_list, biases_list)}
                    fd3 = {lbl_in: y_train[tr_step * batch_size:(tr_step + 1) * batch_size],
                           msk_in: train_mask[tr_step * batch_size:(tr_step + 1) * batch_size],
                           metric_ftr_in: rawFeature,
                           is_train: True,
                           attn_drop: 0.6,
                           ffd_drop: 0.6}
                    fd = fd1
                    fd.update(fd2)
                    fd.update(fd3)
                    _, loss_value_tr, acc_tr, att_val_train = sess.run(
                        [train_op, loss, accuracy, att_val], feed_dict=fd)
                    test_check_value = sess.run(checkvalue, feed_dict=fd)
                    print("test_check_value: ", test_check_value)
                    train_loss_avg += loss_value_tr
                    train_acc_avg += acc_tr
                    tr_step += 1

                vl_step = 0
                vl_size = fea_list[0].shape[0]

                # ================ validation ================
                while vl_step * batch_size < vl_size:
                    # fd1 = {ftr_in: features[vl_step * batch_size:(vl_step + 1) * batch_size]}
                    fd1 = {i: d[vl_step * batch_size:(vl_step + 1) * batch_size]
                           for i, d in zip(ftr_in_list, fea_list)}
                    fd2 = {i: d[vl_step * batch_size:(vl_step + 1) * batch_size]
                           for i, d in zip(bias_in_list, biases_list)}
                    fd3 = {lbl_in: y_val[vl_step * batch_size:(vl_step + 1) * batch_size],
                           msk_in: val_mask[vl_step * batch_size:(vl_step + 1) * batch_size],
                           metric_ftr_in: rawFeature,
                           is_train: False,
                           attn_drop: 0.0,
                           ffd_drop: 0.0}
                    fd = fd1
                    fd.update(fd2)
                    fd.update(fd3)
                    loss_value_vl, acc_vl = sess.run([loss, accuracy], feed_dict=fd)
                    val_loss_avg += loss_value_vl
                    val_acc_avg += acc_vl
                    vl_step += 1

                # import pdb; pdb.set_trace()
                print('Epoch: {}, att_val: {}'.format(epoch, np.mean(att_val_train, axis=0)))
                print('Training: loss = %.5f, acc = %.5f | Val: loss = %.5f, acc = %.5f | vl_step: %d, tr_step: %d' %
                      (train_loss_avg / tr_step, train_acc_avg / tr_step,
                       val_loss_avg / vl_step, val_acc_avg / vl_step, vl_step, tr_step))

                # early stopping on validation accuracy / loss
                if val_acc_avg / vl_step >= vacc_mx or val_loss_avg / vl_step <= vlss_mn:
                    if val_acc_avg / vl_step >= vacc_mx and val_loss_avg / vl_step <= vlss_mn:
                        vacc_early_model = val_acc_avg / vl_step
                        vlss_early_model = val_loss_avg / vl_step
                        saver.save(sess, checkpt_file)
                    vacc_mx = np.max((val_acc_avg / vl_step, vacc_mx))
                    vlss_mn = np.min((val_loss_avg / vl_step, vlss_mn))
                    curr_step = 0
                else:
                    curr_step += 1
                    if curr_step == patience:
                        print('Early stop! Min loss: ', vlss_mn, ', Max accuracy: ', vacc_mx)
                        print('Early stop model validation loss: ', vlss_early_model,
                              ', accuracy: ', vacc_early_model)
                        break

                train_loss_avg = 0
                train_acc_avg = 0
                val_loss_avg = 0
                val_acc_avg = 0

            # save the last state, then reload the checkpoint for evaluation
            saver.save(sess, checkpt_file)
            saver.restore(sess, checkpt_file)
            print('load model from : {}'.format(checkpt_file))

            ts_size = fea_list[0].shape[0]
            ts_step = 0
            ts_loss = 0.0
            ts_acc = 0.0

            while ts_step * batch_size < ts_size:
                fd1 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                       for i, d in zip(ftr_in_list, fea_list)}
                fd2 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                       for i, d in zip(bias_in_list, biases_list)}
                fd3 = {lbl_in: y_all[ts_step * batch_size:(ts_step + 1) * batch_size],
                       msk_in: all_mask[ts_step * batch_size:(ts_step + 1) * batch_size],
                       metric_ftr_in: rawFeature,
                       is_train: False,
                       attn_drop: 0.0,
                       ffd_drop: 0.0}
                fd = fd1
                fd.update(fd2)
                fd.update(fd3)
                loss_value_ts, acc_ts, jhy_final_embedding, test_final_embeed_check = sess.run(
                    [loss, accuracy, final_embedding, test_final_embeed], feed_dict=fd)
                ts_loss += loss_value_ts
                ts_acc += acc_ts
                ts_step += 1

            xx = np.expand_dims(jhy_final_embedding, axis=0)[all_mask]
            xx2 = np.expand_dims(test_final_embeed_check, axis=0)[all_mask]
            yy = y_all[all_mask]

            print("check fd")
            print('xx: {}, yy: {}, ts_size: {}, ts_step: {}, batch_size: {}'.format(
                xx.shape, yy.shape, ts_size, ts_step, batch_size))

            labels, numberofLabels = self.getLabel(yy)

            from utils import clustering, pairwise_precision_recall_f1
            clusters_pred = clustering(xx2, num_clusters=numberofLabels)
            prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
            print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
                  ', originNumberOfClusterlabels: ', numberofLabels)

            if needtSNE:
                tSNEAnanlyse(xx, labels, join(settings.PIC_DIR, "HAN", "rawReature_%s_final.png" % (self.name)))
                tSNEAnanlyse(rawFeature, labels, join(settings.PIC_DIR, "HAN", "rawReature_%s_features.png" % (self.name)))
                tSNEAnanlyse(xx2, labels, join(settings.PIC_DIR, "HAN", "rawReature_%s_xx2.png" % (self.name)))
                tSNEAnanlyse(xx, clusters_pred, join(settings.PIC_DIR, "HAN", "rawReature_%s_result_label.png" % (self.name)))

            sess.close()

    return prec, rec, f1, xx2
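# --- Illustrative sketch (not part of the original file) --------------------
# `pairwise_precision_recall_f1` from utils is assumed to compute the standard
# pairwise clustering metric used in name disambiguation: every pair of items
# assigned to the same predicted cluster counts as a predicted pair, every pair
# sharing a ground-truth label counts as a true pair, and precision/recall/F1
# are taken over those pair sets. The helper name below is hypothetical.
from itertools import combinations


def pairwise_prf_sketch(pred_labels, true_labels):
    """Pairwise precision / recall / F1 over all item pairs (sketch only)."""
    pred_pairs = {(i, j) for i, j in combinations(range(len(pred_labels)), 2)
                  if pred_labels[i] == pred_labels[j]}
    true_pairs = {(i, j) for i, j in combinations(range(len(true_labels)), 2)
                  if true_labels[i] == true_labels[j]}
    hit = len(pred_pairs & true_pairs)
    prec = hit / len(pred_pairs) if pred_pairs else 0.0
    rec = hit / len(true_pairs) if true_pairs else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    return prec, rec, f1
# -----------------------------------------------------------------------------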
def gae_for_na(name, rawfeature):
    """
    Train and evaluate disambiguation results for a specific name.
    :param name: author name
    :return: evaluation results
    """
    adj, features, labels = load_local_data(name=name, rawfeature=rawfeature)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train = gen_train_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]
    if FLAGS.is_sparse:  # TODO to test
        # features = sparse_to_tuple(features.tocoo())
        # features_nonzero = features[1].shape[0]
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, input_feature_dim)
    elif model_str == 'gcn_vae':
        model = GCNModelVAE(placeholders, input_feature_dim, num_nodes)

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()  # negative edges / positive edges
    print('positive edge weight', pos_weight)
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.nnz) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                          validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif model_str == 'gcn_vae':
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                           validate_indices=False), [-1]),
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    def get_embs():
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)  # z_mean is better
        return emb

    # Train model
    for epoch in range(FLAGS.epochs):
        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        print("Epoch:", '%04d' % (epoch + 1),
              "train_loss=", "{:.5f}".format(avg_cost),
              "train_acc=", "{:.5f}".format(avg_accuracy),
              "time=", "{:.5f}".format(time.time() - t))

    emb = get_embs()
    n_clusters = len(set(labels))
    emb_norm = normalize_vectors(emb)
    clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    print('pairwise precision', '{:.5f}'.format(prec),
          'recall', '{:.5f}'.format(rec),
          'f1', '{:.5f}'.format(f1))

    clusters_pred2 = clustering(features, num_clusters=n_clusters)
    prec2, rec2, f12 = pairwise_precision_recall_f1(clusters_pred2, labels)
    print('pairwise precision', '{:.5f}'.format(prec2),
          'recall', '{:.5f}'.format(rec2),
          'f1', '{:.5f}'.format(f12))

    from sklearn.manifold import TSNE
    features_new = TSNE(learning_rate=100).fit_transform(features)
    emb_new = TSNE(learning_rate=100).fit_transform(emb_norm)

    labels = np.array(labels) + 2
    clusters_pred = np.array(clusters_pred) + 2
    clusters_pred2 = np.array(clusters_pred2) + 2

    if rawfeature == RAW_INTER_NAME:
        tSNEAnanlyse(emb_norm, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final_raw.png" % (name)))
        tSNEAnanlyse(features, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features_raw.png" % (name)))
    elif rawfeature == ATTENTIONFEATURE:
        tSNEAnanlyse(emb_new, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final.png" % (name)))
        tSNEAnanlyse(features_new, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features.png" % (name)))
        tSNEAnanlyse(emb_new, clusters_pred, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final_clusterresult.png" % (name)))
        tSNEAnanlyse(features_new, clusters_pred2, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features_clusterresult.png" % (name)))
    else:
        tSNEAnanlyse(emb_norm, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final_triplet.png" % (name)))
        tSNEAnanlyse(features, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features_triplet.png" % (name)))

    return [prec, rec, f1], num_nodes, n_clusters
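# --- Illustrative sketch (not part of the original file) --------------------
# `preprocess_graph(adj)` above is assumed to apply the usual GCN
# renormalization A_hat = D^{-1/2} (A + I) D^{-1/2} before feeding the graph
# to the (V)GAE model. A minimal scipy version of that idea (helper name is
# hypothetical; the real function may also convert to a sparse tuple):
import numpy as np
import scipy.sparse as sp


def preprocess_graph_sketch(adj):
    """Symmetrically normalize an adjacency matrix with self-loops (sketch only)."""
    adj_ = sp.coo_matrix(adj) + sp.eye(adj.shape[0])   # A + I
    rowsum = np.array(adj_.sum(1)).flatten()
    d_inv_sqrt = np.power(rowsum, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    d_mat = sp.diags(d_inv_sqrt)
    return (d_mat @ adj_ @ d_mat).tocoo()              # D^-1/2 (A + I) D^-1/2
# -----------------------------------------------------------------------------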
def MetricDebug(self, adj_list, fea_list, y_train, y_val, y_test,
                train_mask, val_mask, test_mask, y_all, all_mask,
                rawlabels, needtSNE=False, rawFeature=[]):
    prec, rec, f1 = 0.0, 0.0, 0.0
    nb_nodes = fea_list[0].shape[0]
    ft_size = fea_list[0].shape[1]
    nb_classes = y_train.shape[1]
    # nb_classes = len(set(rawlabels))

    # adj = adj.todense()
    # features = features[np.newaxis]  # [1, nb_node, ft_size]
    fea_list = [fea[np.newaxis] for fea in fea_list]
    adj_list = [adj[np.newaxis] for adj in adj_list]
    y_train = y_train[np.newaxis]
    y_val = y_val[np.newaxis]
    y_test = y_test[np.newaxis]
    y_all = y_all[np.newaxis]
    train_mask = train_mask[np.newaxis]
    val_mask = val_mask[np.newaxis]
    test_mask = test_mask[np.newaxis]
    all_mask = all_mask[np.newaxis]

    biases_list = [process.adj_to_bias(adj, [nb_nodes], nhood=1) for adj in adj_list]

    print('build graph...')
    with tf.Graph().as_default():
        with tf.name_scope('input'):
            metric_ftr_in = tf.placeholder(dtype=tf.float32, shape=(nb_nodes, ft_size), name='metric_ftr_in')
            ftr_in_list = [tf.placeholder(dtype=tf.float32,
                                          shape=(batch_size, nb_nodes, ft_size),
                                          name='ftr_in_{}'.format(i))
                           for i in range(len(fea_list))]
            bias_in_list = [tf.placeholder(dtype=tf.float32,
                                           shape=(batch_size, nb_nodes, nb_nodes),
                                           name='bias_in_{}'.format(i))
                            for i in range(len(biases_list))]
            lbl_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes, nb_classes), name='lbl_in')
            msk_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes), name='msk_in')
            attn_drop = tf.placeholder(dtype=tf.float32, shape=(), name='attn_drop')
            ffd_drop = tf.placeholder(dtype=tf.float32, shape=(), name='ffd_drop')
            is_train = tf.placeholder(dtype=tf.bool, shape=(), name='is_train')

        # forward
        logits, final_embedding, att_val, centers_embed, test_final_embeed = model.inference(
            ftr_in_list, nb_classes, nb_nodes, is_train,
            attn_drop, ffd_drop,
            bias_mat_list=bias_in_list,
            hid_units=hid_units, n_heads=n_heads,
            features=fea_list, labels=rawlabels,
            residual=residual, activation=nonlinearity,
            feature_size=ft_size)

        log_resh = tf.reshape(logits, [-1, nb_classes])
        lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
        msk_resh = tf.reshape(msk_in, [-1])

        osm_caa_loss = OSM_CAA_Loss(batch_size=nb_nodes)
        osm_loss = osm_caa_loss.forward
        osmLoss, checkvalue = osm_loss(final_embedding, rawlabels, centers_embed)
        SoftMaxloss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)
        loss = osmLoss
        accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)

        # optimize
        train_op = model.training(loss, lr, l2_coef)

        Path = 'pre_trained/{}/{}/{}'.format(dataset, dataset, self.name)
        self.mkdir(Path)
        checkpt_file = '{}/allMP_multi_{}_.ckpt'.format(Path, featype)

        saver = tf.train.Saver()
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

        ts_size = fea_list[0].shape[0]
        ts_step = 0
        ts_loss = 0.0
        ts_acc = 0.0

        with tf.Session(config=config) as sess:
            sess.run(init_op)
            saver.restore(sess, checkpt_file)

            while ts_step * batch_size < ts_size:
                fd1 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                       for i, d in zip(ftr_in_list, fea_list)}
                fd2 = {i: d[ts_step * batch_size:(ts_step + 1) * batch_size]
                       for i, d in zip(bias_in_list, biases_list)}
                fd3 = {lbl_in: y_all[ts_step * batch_size:(ts_step + 1) * batch_size],
                       msk_in: all_mask[ts_step * batch_size:(ts_step + 1) * batch_size],
                       metric_ftr_in: rawFeature,
                       is_train: False,
                       attn_drop: 0.0,
                       ffd_drop: 0.0}
                fd = fd1
                fd.update(fd2)
                fd.update(fd3)
                # fetch the tensor directly (wrapping it in a list would keep the
                # result inside a one-element list and break the masking below)
                test_final_embeed_check = sess.run(test_final_embeed, feed_dict=fd)
                ts_step += 1

            xx2 = np.expand_dims(test_final_embeed_check, axis=0)[all_mask]
            yy = y_all[all_mask]

            labels, numberofLabels = self.getLabel(yy)

            from utils import clustering, pairwise_precision_recall_f1
            clusters_pred = clustering(xx2, num_clusters=numberofLabels)
            prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
            print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
                  ', originNumberOfClusterlabels: ', numberofLabels)

            if needtSNE:
                tSNEAnanlyse(rawFeature, labels, join(settings.PIC_DIR, "MetricLearning", "rawReature_%s_features.png" % (self.name)))
                tSNEAnanlyse(xx2, labels, join(settings.PIC_DIR, "MetricLearning", "rawReature_%s_xx2.png" % (self.name)))
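# --- Illustrative sketch (not part of the original file) --------------------
# `process.adj_to_bias(adj, [nb_nodes], nhood=1)` is assumed to follow the
# original GAT preprocessing: mark nodes reachable within `nhood` hops
# (including self-loops) and turn non-edges into a large negative additive
# bias so that softmax attention ignores them. Single-graph version, helper
# name hypothetical:
import numpy as np


def adj_to_bias_sketch(adj, nhood=1):
    """Turn an [N, N] adjacency matrix into an additive attention bias (sketch only)."""
    nb_nodes = adj.shape[0]
    mt = np.eye(nb_nodes)
    for _ in range(nhood):
        mt = np.matmul(mt, adj + np.eye(nb_nodes))  # reachability within nhood steps
    mt = (mt > 0).astype(np.float32)
    return -1e9 * (1.0 - mt)                        # 0 for edges, -1e9 elsewhere
# -----------------------------------------------------------------------------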
def train(name, needtSNE=False, savefile=True):
    adj, adj2, features, labels, Clusterlabels, Ids = load_local_data(name=name)
    initClusterlabel = Clusterlabels
    oneHotClusterLabels = toOneHot(Clusterlabels)
    num_logits = len(oneHotClusterLabels[0])
    # enc.transform([['Female', 1], ['Male', 4]]).toarray()
    print('debugging ', oneHotClusterLabels.shape)

    originClusterlabels = Clusterlabels
    n_clusters = len(set(labels))
    OldClusterlabels = Clusterlabels
    originNumberOfClusterlabels = len(set(Clusterlabels))

    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]

    adj_norm, adj_label = NormalizedAdj(adj)
    adj_norm2, adj_label2 = NormalizedAdj(adj2)

    if FLAGS.is_sparse:  # TODO to test
        # features = sparse_to_tuple(features.tocoo())
        # features_nonzero = features[1].shape[0]
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'labels': tf.placeholder(tf.int64, shape=(None), name='labels'),
        'graph1': tf.sparse_placeholder(tf.float32),
        'graph2': tf.sparse_placeholder(tf.float32),
        'graph1_orig': tf.sparse_placeholder(tf.float32),
        'graph2_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'epoch': tf.placeholder_with_default(0., shape=()),
        'clusterEpoch': tf.placeholder_with_default(0., shape=())
    }

    # pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()  # negative edges / positive edges
    # norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.nnz) * 2)

    def get_embs():
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean_1, feed_dict=feed_dict)  # z_mean is better
        return emb

    def getGraphDetail(adj):
        pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()  # negative edges / positive edges
        norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.nnz) * 2)
        return {'norm': norm, 'pos_weight': pos_weight}
        # return pos_weight, norm

    # loss1s = []
    # loss2s = []
    # loss3s = []

    n_clusters = len(set(labels))
    graph1 = getGraphDetail(adj)
    graph2 = getGraphDetail(adj2)

    # construct adj_orig
    graph1['labels'] = tf.reshape(
        tf.sparse_tensor_to_dense(placeholders['graph1_orig'], validate_indices=False), [-1])
    graph2['labels'] = tf.reshape(
        tf.sparse_tensor_to_dense(placeholders['graph2_orig'], validate_indices=False), [-1])

    # Train model
    for clusterepoch in range(FLAGS.clusterEpochs):
        print('cluster epoch: ', clusterepoch)
        # tf.reset_default_graph()
        model = BuildModel(placeholders, input_feature_dim, num_nodes,
                           name='model%d' % (clusterepoch), num_logits=num_logits)

        # Session
        # tf.reset_default_graph()
        # sess = tf.InteractiveSession()
        opt = OptimizerDualGCNAutoEncoder(model=model, num_nodes=num_nodes,
                                          z_label=Clusterlabels,
                                          name='model%d' % (clusterepoch),
                                          graph1=graph1, graph2=graph2)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Centers
        # centers = opt.centers

        for epoch in range(FLAGS.epochs):
            # print('epoch: ', epoch)
            # opt.epoch = epoch
            model.epoch = epoch

            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label, adj_norm2, adj_label2,
                                            features, placeholders, Clusterlabels,
                                            epoch, clusterepoch + 1)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # Run single weight update
            # outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
            outs = sess.run([opt.opt_op, opt.cost], feed_dict=feed_dict)

            # [Loss, softmax_loss, loss3, centerloss, reconstructloss] = sess.run(
            #     [opt.cost, opt.softmax_loss, opt.loss3, opt.centerloss, opt.reconstructloss], feed_dict=feed_dict)
            # [Loss, loss3, centerloss, reconstructloss, L2loss] = sess.run(
            #     [opt.cost, opt.loss3, opt.centerloss, opt.reconstructloss, opt.L2loss], feed_dict=feed_dict)
            [Loss, reconstructloss] = sess.run([opt.cost, opt.reconstructloss], feed_dict=feed_dict)

            # print('loss: ', Loss, ', loss1: ', loss1, ', loss2: ', loss2, ', centerloss: ', centerloss, ', acc: ', outs[2])
            print('epoch: ', epoch, ', loss: ', Loss, ', reconstructloss: ', reconstructloss)

        # if clusterepoch != FLAGS.clusterEpochs - 1:
        emb = get_embs()
        X_new = TSNE(learning_rate=100).fit_transform(emb)

        # pick the cluster count with the best silhouette score
        tClusterLabels = []
        Maxscore = -10000
        NumberOfCluster = 0
        for nc in range(2, originNumberOfClusterlabels + 1, 1):
            TempLabels = clustering(X_new, nc)
            score = silhouette_score(X_new, TempLabels)
            print('nc: ', nc, ', score: ', score)
            if score > Maxscore:
                Maxscore = score
                tClusterLabels = TempLabels
                NumberOfCluster = nc

        print('NumberOfCluster: ', NumberOfCluster,
              ', originNumberOfClusterlabels: ', originNumberOfClusterlabels,
              ', Maxscore: ', Maxscore)

        if NumberOfCluster < 0 or NumberOfCluster > originNumberOfClusterlabels:
            continue

        # the cluster count keeps shrinking as intended, so adopt the new labels
        Clusterlabels = tClusterLabels
        originNumberOfClusterlabels = NumberOfCluster

        prec, rec, f1 = pairwise_precision_recall_f1(Clusterlabels, labels)
        print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
              ', originNumberOfClusterlabels: ', originNumberOfClusterlabels)

        Cc = Counter(Clusterlabels)
        print(Cc)

        if needtSNE:
            sNEComparingAnanlyse(emb, OldClusterlabels, labels, Clusterlabels,
                                 savepath=join(settings.PIC_DIR, "%s_%s.png" % (name, clusterepoch)))
            # tSNEAnanlyse(emb, labels, join(settings.PIC_DIR, "%s.png" % (clusterepoch)))

        # tf.reset_default_graph()

    emb = get_embs()
    emb_norm = normalize_vectors(emb)
    clusters_pred = clustering(emb_norm, num_clusters=originNumberOfClusterlabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    print('prec: ', prec, ', rec: ', rec, ', f1: ', f1,
          ', originNumberOfClusterlabels: ', originNumberOfClusterlabels)

    # lossPrint(range(FLAGS.epochs), loss1s, loss2s, loss3s)
    if needtSNE:
        tSNEAnanlyse(emb, labels, join(settings.PIC_DIR, "%s_final.png" % (name)))

    tf.reset_default_graph()
    return [prec, rec, f1], num_nodes, n_clusters
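# --- Illustrative sketch (not part of the original file) --------------------
# The loop above selects the number of clusters by maximising the silhouette
# score over candidate values. The same idea as a standalone helper, using
# scikit-learn directly (KMeans stands in for the repo's `clustering` helper;
# the function name is hypothetical):
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def pick_num_clusters_sketch(embedding, max_clusters):
    """Return (best_k, best_labels) chosen by silhouette score (sketch only)."""
    best_k, best_labels, best_score = 2, None, -np.inf
    for k in range(2, max_clusters + 1):
        labels = KMeans(n_clusters=k, n_init=10).fit_predict(embedding)
        score = silhouette_score(embedding, labels)
        if score > best_score:
            best_k, best_labels, best_score = k, labels, score
    return best_k, best_labels
# -----------------------------------------------------------------------------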
while epoch < epochs:
    _, losscheck, value2 = sess.run([train_op, loss, checkvalue], feed_dict=fd)
    print("epoch: {} loss: {}, checkvalue: {}".format(epoch, losscheck, value2))
    epoch += 1

print("final_embed: ", final_embed)
embedding = sess.run([final_embed], feed_dict=fd)
embedding = embedding[0]
print("embedding: ", embedding)

from utils import clustering, pairwise_precision_recall_f1
clusters_pred = clustering(embedding, num_clusters=nb_class)
prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, rawlabels)
print('prec: ', prec, ', rec: ', rec, ', f1: ', f1, ', originNumberOfClusterlabels: ', nb_class)

tSNEAnanlyse(embedding, rawlabels, join(settings.PIC_DIR, "PureMetricLoss", "%s_final.png" % (name)))
tSNEAnanlyse(features, rawlabels, join(settings.PIC_DIR, "PureMetricLoss", "%s_features.png" % (name)))

# my_KNN(xx, yy)
# my_Kmeans(xx, yy)

sess.close()
        labels.append(aid)
        rf.append(rawFeature.get(pid))
        tf.append(tripletFeature.get(pid))
        attentionf.append(lc_emb.get(pid))

labels = encode_labels(labels)
numberofLabels = len(set(labels))


def clusterTest(embedding, numberofLabels):
    clusters_pred = clustering(embedding, num_clusters=numberofLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    return [prec, rec, f1]


tSNEAnanlyse(rf, labels, join(settings.PIC_DIR, "FINALResult", "%s_rawFeature.png" % (name)))
tSNEAnanlyse(tf, labels, join(settings.PIC_DIR, "FINALResult", "%s_tripletFeature.png" % (name)))
tSNEAnanlyse(attentionf, labels, join(settings.PIC_DIR, "FINALResult", "%s_lcmbFeature.png" % (name)))

Res = {}
Res['rawfeature'] = clusterTest(rf, numberofLabels=numberofLabels)
Res['tripletfeature'] = clusterTest(tf, numberofLabels=numberofLabels)
Res['lcmbfeature'] = clusterTest(attentionf, numberofLabels=numberofLabels)

print("Res: ", Res)
    # (tail of loadFeature, which reads the per-name feature/label file)
    EndIndex = -2
    featurePath = getPATH(name, idf_threshold, 'feature_and_label', ispretrain)
    # idx_features_labels = np.genfromtxt(join(settings.DATA_DIR, 'local', 'graph-{}'.format(idf_threshold)), dtype=np.dtype(str))
    idx_features_labels = np.genfromtxt(featurePath, dtype=np.dtype(str))
    features = np.array(idx_features_labels[:, 1:EndIndex], dtype=np.float32)  # sparse?
    rawlabels = encode_labels(idx_features_labels[:, EndIndex])
    pids = idx_features_labels[:, 0]
    return features, pids, rawlabels


def load_test_names():
    return data_utils.load_json(settings.DATA_DIR, 'test_name_list2.json')


Res = {}
names = load_test_names()
for name in names:
    features, pids, rawlabels = loadFeature(name, ispretrain=False)
    tSNEAnanlyse(features, rawlabels,
                 join(settings.PIC_DIR, "MetricLearning", "rawReature_%s_train.png" % (name)))
    numberofLabels = len(set(rawlabels))
    clusters_pred = clustering(features, num_clusters=numberofLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, rawlabels)
    Res[name] = {"prec": prec, "rec": rec, "f1": f1}

print(Res)
embs_input = []
labels = []
pids = []
for i, aid in enumerate(name_data.keys()):
    if len(name_data[aid]) < 5:  # n_pubs of current author is too small
        continue
    for pid in name_data[aid]:
        cur_emb = lc_input.get(pid)
        if cur_emb is None:
            continue
        embs_input.append(cur_emb)
        pids.append(pid)
        labels.append(aid)

embs_input = np.stack(embs_input)
inter_embs = get_hidden_output(trained_global_model, embs_input)
labels = encode_labels(labels)

for i, pid_ in enumerate(pids):
    res_embs.append(inter_embs[i])

# Clustering and save the result
tSNEAnanlyse(res_embs, labels,
             join(settings.PIC_DIR, "OnlyTriplete", "rawReature_%s_triplet.png" % (name)))
tSNEAnanlyse(embs_input, labels,
             join(settings.PIC_DIR, "OnlyTriplete", "rawReature_%s_features.png" % (name)))
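# --- Illustrative sketch (not part of the original file) --------------------
# `encode_labels`, used throughout these scripts, is assumed to map raw author
# ids (strings) to consecutive integer class ids, e.g. via scikit-learn's
# LabelEncoder. The helper name below is hypothetical:
from sklearn.preprocessing import LabelEncoder


def encode_labels_sketch(raw_labels):
    """Map arbitrary hashable labels to integers 0..K-1 (sketch only)."""
    return LabelEncoder().fit_transform(raw_labels)
# -----------------------------------------------------------------------------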