def main():
    """Cluster every test name and record per-name and averaged scores.

    For each name returned by ``load_test_names()`` this calls ``gae_for_na``
    with ``rawfeature="attention_feature"`` and appends one CSV row (name,
    node count, cluster count and the three values of ``cur_metric``) to
    ``local_clustering_results.csv`` under ``settings.OUT_DIR``. After each
    name the running averages and elapsed time are printed; a final
    ``average`` row is written once all names are processed.
    """
    names = load_test_names()
    out_path = join(settings.OUT_DIR, 'local_clustering_results.csv')
    metrics = np.zeros(3)  # running sums of cur_metric[0], [1], [2]
    cnt = 0

    # The context manager guarantees the CSV is closed even when
    # gae_for_na raises part-way through the name list.
    with codecs.open(out_path, 'w', encoding='utf-8') as wf:
        wf.write('name,n_pubs,n_clusters,precision,recall,f1\n')
        for name in names:
            cur_metric, num_nodes, n_clusters = gae_for_na(
                name, rawfeature="attention_feature")
            wf.write('{0},{1},{2},{3:.5f},{4:.5f},{5:.5f}\n'.format(
                name, num_nodes, n_clusters,
                cur_metric[0], cur_metric[1], cur_metric[2]))
            # Flush per name so partial results survive an interrupted run.
            wf.flush()

            for i, m in enumerate(cur_metric):
                metrics[i] += m
            cnt += 1

            # The F1 shown here is derived from the averaged precision and
            # recall via cal_f1, not from the summed per-name F1 values.
            macro_prec = metrics[0] / cnt
            macro_rec = metrics[1] / cnt
            macro_f1 = cal_f1(macro_prec, macro_rec)
            print('average until now', [macro_prec, macro_rec, macro_f1])
            time_acc = time.time() - start_time
            print(cnt, 'names', time_acc, 'avg time', time_acc / cnt)

        macro_prec = metrics[0] / cnt
        macro_rec = metrics[1] / cnt
        macro_f1 = cal_f1(macro_prec, macro_rec)
        wf.write('average,,,{0:.5f},{1:.5f},{2:.5f}\n'.format(
            macro_prec, macro_rec, macro_f1))
def main():
    """Cluster every test name, log the scores, and dump the cluster results.

    For each name returned by ``load_test_names()`` this calls ``gae_for_na``
    and stores its fourth return value in ``ans[name]``. Names whose metric
    is ``None`` are skipped for scoring. Every scored name gets one CSV row
    in ``local_clustering_results.csv`` under ``settings.OUT_DIR``; a final
    ``average`` row follows, and ``ans`` is then written with ``dump_json``
    to ``local_clustering_results.json``.
    """
    names = load_test_names()  # list of test author names
    ans = {}
    out_path = join(settings.OUT_DIR, 'local_clustering_results.csv')
    metrics = np.zeros(3)  # running sums of the three metric values
    cnt = 0

    # The context manager guarantees the results file is closed even when
    # gae_for_na raises part-way through the name list.
    with codecs.open(out_path, 'w', encoding='utf-8') as wf:
        # Columns: name, number of papers, number of clusters,
        # precision, recall, F1 score.
        wf.write('name,n_pubs,n_clusters,precision,recall,f1\n')
        for name in names:
            # Metric triple [prec, rec, f1], document count, cluster count.
            cur_metric, num_nodes, n_clusters, ans[name] = gae_for_na(name)
            # Identity check: `== None` on an array-like metric would be
            # evaluated elementwise and make this condition ambiguous.
            if cur_metric is None:
                continue
            wf.write('{0},{1},{2},{3:.5f},{4:.5f},{5:.5f}\n'.format(
                name, num_nodes, n_clusters,
                cur_metric[0], cur_metric[1], cur_metric[2]))
            wf.flush()

            # Accumulate each metric so the averages can be reported.
            for i, m in enumerate(cur_metric):
                metrics[i] += m
            cnt += 1

            # Macro scores over the names processed so far.
            macro_prec = metrics[0] / cnt
            macro_rec = metrics[1] / cnt
            macro_f1 = cal_f1(macro_prec, macro_rec)
            print('average until now', [macro_prec, macro_rec, macro_f1])
            # Elapsed time in total and per scored name.
            time_acc = time.time() - start_time
            print(cnt, 'names', time_acc, 'avg time', time_acc / cnt)

        # Final macro scores.
        macro_prec = metrics[0] / cnt
        macro_rec = metrics[1] / cnt
        macro_f1 = cal_f1(macro_prec, macro_rec)
        wf.write('average,,,{0:.5f},{1:.5f},{2:.5f}\n'.format(
            macro_prec, macro_rec, macro_f1))

    dump_json(ans, settings.OUT_DIR, 'local_clustering_results.json', True)
def main():
    """Cluster every test name and record macro and micro averaged scores.

    For each name returned by ``load_test_names(test_dataset_name)`` this
    calls ``gae_for_na`` and appends one CSV row with the name, node count,
    cluster count, the three metric values and the tp/fp/fn counts to
    ``local_clustering_results.csv`` under ``settings.get_out_dir(exp_name)``.
    Names that raise an ordinary exception are reported and skipped. The
    final ``average`` row holds the macro precision/recall/F1 followed by
    the micro precision/recall/F1 computed from the summed tp/fp/fn counts.
    """
    names = load_test_names(test_dataset_name)
    out_path = join(settings.get_out_dir(exp_name),
                    'local_clustering_results.csv')
    metrics = np.zeros(3)  # running sums of cur_metric[0], [1], [2]
    cnt = 0
    tp_fp_fn_sum = np.zeros(3)  # running sums of tp, fp, fn

    # The context manager guarantees the CSV is closed on every exit path.
    with codecs.open(out_path, 'w', encoding='utf-8') as wf:
        # NOTE(review): the header names six columns, but each data row
        # below carries nine values plus a trailing comma.
        wf.write('name,n_pubs,n_clusters,precision,recall,f1\n')
        for name in names:
            try:
                tp_fp_fn, cur_metric, num_nodes, n_clusters = gae_for_na(name)
                wf.write(
                    '{0},{1},{2},{3:.5f},{4:.5f},{5:.5f},{6:.5f},{7:.5f},{8:.5f},\n'.format(
                        name, num_nodes, n_clusters,
                        cur_metric[0], cur_metric[1], cur_metric[2],
                        *tp_fp_fn))
                wf.flush()

                for i, m in enumerate(cur_metric):
                    metrics[i] += m
                cnt += 1
                tp_fp_fn_sum += np.array(tp_fp_fn)

                macro_prec = metrics[0] / cnt
                macro_rec = metrics[1] / cnt
                macro_f1 = cal_f1(macro_prec, macro_rec)
                print('average until now', [macro_prec, macro_rec, macro_f1])
                time_acc = time.time() - start_time
                print(cnt, 'names', time_acc, 'avg time', time_acc / cnt)
            except Exception as exc:
                # Still best effort, but report the skipped name and let
                # KeyboardInterrupt / SystemExit stop the run.
                print('skipping', name, 'because of', repr(exc))
                continue

        macro_prec = metrics[0] / cnt
        macro_rec = metrics[1] / cnt
        macro_f1 = cal_f1(macro_prec, macro_rec)

        tp, fp, fn = tp_fp_fn_sum
        micro_precision = tp / (tp + fp)
        micro_recall = tp / (tp + fn)
        micro_f1 = (2 * micro_precision * micro_recall
                    / (micro_precision + micro_recall))
        # Every field uses `.5f`; the previous `5f` on the last two fields
        # set a minimum width of 5 rather than five decimal places.
        wf.write('average,,,{0:.5f},{1:.5f},{2:.5f},{3:.5f},{4:.5f},{5:.5f}\n'.format(
            macro_prec, macro_rec, macro_f1,
            micro_precision, micro_recall, micro_f1))
def testDataRun():
    """Run the HAN model on every test name, log scores, and store embeddings.

    For each name returned by ``load_test_names()`` this calls
    ``han.prepare_and_train(name=name, needtSNE=True)``, prints and writes
    the returned precision/recall/F1 to ``local_clustering_results.csv``
    under ``settings.OUT_DIR``, and stores each returned embedding under its
    pid in the ``graph_auto_encoder_embedding`` LMDB client. A final
    ``average`` row with the macro scores is written at the end.
    """
    cnt = 0
    metrics = np.zeros(3)  # running sums of prec, rec, f1
    out_path = join(settings.OUT_DIR, 'local_clustering_results.csv')

    # The context manager guarantees the CSV is closed even when training
    # or the LMDB write raises part-way through the name list.
    with codecs.open(out_path, 'w', encoding='utf-8') as wf:
        LMDB_NAME_EMB = "graph_auto_encoder_embedding"
        lc_emb = LMDBClient(LMDB_NAME_EMB)
        han = HAN(lc_emb)

        name_to_pubs_test = load_test_names()
        for name in name_to_pubs_test:
            prec, rec, f1, pids, attentionEmbeddings = han.prepare_and_train(
                name=name, needtSNE=True)
            print(name, prec, rec, f1)
            wf.write('{0},{1:.5f},{2:.5f},{3:.5f}\n'.format(name, prec, rec, f1))
            # Flush per name so partial results survive an interrupted run.
            wf.flush()

            metrics[0] += prec
            metrics[1] += rec
            metrics[2] += f1
            cnt += 1

            # Persist this name's embeddings keyed by publication id.
            for pid, embedding in zip(pids, attentionEmbeddings):
                lc_emb.set(pid, embedding)

        # The macro F1 is derived from the averaged precision and recall,
        # not from the summed per-name F1 values.
        macro_prec = metrics[0] / cnt
        macro_rec = metrics[1] / cnt
        macro_f1 = eval_utils.cal_f1(macro_prec, macro_rec)
        wf.write('average,,,{0:.5f},{1:.5f},{2:.5f}\n'.format(
            macro_prec, macro_rec, macro_f1))
def main():
    """Train the inductive GCN auto-encoder, evaluate it, and save it.

    Builds the TensorFlow placeholders, ``GCNModelInductiveAE`` and
    ``OptimizerInductiveAE``, then for ``FLAGS.epochs`` epochs runs one
    weight update per name in the training split. Afterwards every name in
    the test split is embedded with ``model.z_mean``, clustered into as many
    clusters as it has distinct labels, and scored; the averaged accuracy
    and the macro/micro precision, recall and F1 are printed. Finally the
    session is saved under
    ``<data dir>/local/model-<IDF_THRESHOLD>/<model_name>``.

    Raises:
        ValueError: if ``model_str`` is not ``'gcn_ae'``, the only model
            this function knows how to build.
    """
    # Fail early and clearly; previously an unknown model_str left `opt`
    # unbound and surfaced later as an unrelated error.
    if model_str != 'gcn_ae':
        raise ValueError('unsupported model_str: {!r}'.format(model_str))

    # Define placeholders
    placeholders = {
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'pos_weight': tf.placeholder(tf.float32, shape=()),
        'norm': tf.placeholder(tf.float32),
    }

    # Create model
    model = GCNModelInductiveAE(placeholders, input_feature_dim)

    # Optimizer
    with tf.name_scope('optimizer'):
        opt = OptimizerInductiveAE(
            preds=model.reconstructions,
            labels=tf.reshape(
                tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                          validate_indices=False),
                [-1]),
            pos_weight=model.pos_weight,
            norm=model.norm)

    saver = tf.train.Saver()

    # Initialize session
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer())

        def infer(feed_dict):
            """Return (accuracy, embedding) for feed_dict with dropout off."""
            feed_dict.update({placeholders['dropout']: 0})
            # z_mean is better
            acc, emb = sess.run([opt.accuracy, model.z_mean],
                                feed_dict=feed_dict)
            return acc, emb

        train_name_list, _ = settings.get_split_name_list(train_dataset_name)
        _, test_name_list = settings.get_split_name_list(test_dataset_name)

        # Train model
        for epoch in range(FLAGS.epochs):
            # Time the whole epoch; the timer used to be reset for every
            # name, so "time=" only covered the last name's update.
            t = time.time()
            epoch_avg_cost = 0
            epoch_avg_accuracy = 0
            for name in train_name_list:
                adj_norm, adj_label, features, pos_weight, norm, _ = \
                    load_local_preprocess_result(exp_name, IDF_THRESHOLD, name)

                # Construct feed dictionary
                feed_dict = construct_feed_dict_inductive(
                    adj_norm, adj_label, features, pos_weight, norm,
                    placeholders)
                feed_dict.update({placeholders['dropout']: FLAGS.dropout})

                # Run single weight update
                _, cost, accuracy = sess.run(
                    [opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
                epoch_avg_cost += cost
                epoch_avg_accuracy += accuracy

            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss=", "{:.5f}".format(epoch_avg_cost / len(train_name_list)),
                  "train_acc=", "{:.5f}".format(epoch_avg_accuracy / len(train_name_list)),
                  "time=", "{:.5f}".format(time.time() - t))

        # NOTE(review): evaluation and saving are placed after the training
        # loop (run once). The source formatting did not show whether they
        # were meant to run inside each epoch -- confirm.
        metrics = np.zeros(3)       # running sums of prec, rec, f1
        tp_fp_fn_sum = np.zeros(3)  # running sums of tp, fp, fn
        avg_acc = 0
        for name in test_name_list:
            adj_norm, adj_label, features, pos_weight, norm, labels = \
                load_local_preprocess_result(exp_name, IDF_THRESHOLD, name)
            feed_dict = construct_feed_dict_inductive(
                adj_norm, adj_label, features, pos_weight, norm, placeholders)
            acc, emb = infer(feed_dict)

            # Cluster into as many groups as there are distinct labels.
            n_clusters = len(set(labels))
            emb_norm = normalize_vectors(emb)
            clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
            tp, fp, fn, prec, rec, f1 = pairwise_precision_recall_f1(
                clusters_pred, labels)

            tp_fp_fn_sum += np.array([tp, fp, fn])
            metrics += np.array([prec, rec, f1])
            avg_acc += acc

        macro_prec = metrics[0] / len(test_name_list)
        macro_rec = metrics[1] / len(test_name_list)
        avg_acc /= len(test_name_list)
        macro_f1 = cal_f1(macro_prec, macro_rec)

        tp, fp, fn = tp_fp_fn_sum
        micro_precision = tp / (tp + fp)
        micro_recall = tp / (tp + fn)
        micro_f1 = (2 * micro_precision * micro_recall
                    / (micro_precision + micro_recall))
        # Every field uses `.5f`; the previous `5f` on the last two fields
        # set a minimum width of 5 rather than five decimal places.
        print(
            'average,acc:{0:.5f},macro_prec:{1:.5f},macro_rec:{2:.5f},macro_f1:{3:.5f},micro_precision:{4:.5f},micro_recall:{5:.5f},micro_f1:{6:.5f}\n'
            .format(avg_acc, macro_prec, macro_rec, macro_f1,
                    micro_precision, micro_recall, micro_f1))

        path = join(settings.get_data_dir(exp_name), 'local',
                    'model-{}'.format(IDF_THRESHOLD), model_name)
        saver.save(sess, path)
    finally:
        # Release the session's resources on success and on failure alike.
        sess.close()