# Imports inferred from usage in the functions below. `feeder`, `data_loader`,
# `evaluater`, the model modules (transe_model, oe_model, multi_model,
# soft_model, tf_model) and helpers such as best_threshold, best_f1_threshold,
# calc_auc, kl_divergence_batch, save_model, err_analysis and rel_acc_checker
# are project-local and defined elsewhere.
import os
import time
from datetime import datetime

import numpy as np
import tensorflow as tf
from scipy.stats import pearsonr, spearmanr


def do_eval(sess, error, placeholder, dev, devtest, curr_best, FLAGS,
            error_file_name, rel2idx, word2idx):
    # Pick the error threshold on the dev set.
    feed_dict_dev = feeder.fill_feed_dict(dev, placeholder, rel2idx, 0)
    true_label = feed_dict_dev[placeholder['label_placeholder']]
    pred_error = sess.run(error, feed_dict=feed_dict_dev)
    print('Dev Stats:', end='')
    thresh, _ = best_threshold(pred_error, true_label)

    # Evaluate devtest with the dev-chosen threshold.
    feed_dict_devtest = feeder.fill_feed_dict(devtest, placeholder, rel2idx, 0)
    true_label_devtest = feed_dict_devtest[placeholder['label_placeholder']]
    devtest_he_error = sess.run(error, feed_dict=feed_dict_devtest)
    pred = devtest_he_error <= thresh
    correct = (pred == true_label_devtest)
    accuracy = float(correct.astype('float32').mean())
    wrong_indices = np.logical_not(correct).nonzero()[0]
    wrong_preds = pred[wrong_indices]

    if accuracy > curr_best:
        error_file = open(error_file_name + "_test.txt", 'wt')
        if FLAGS.rel_acc:
            rel_acc_checker(feed_dict_devtest, placeholder, correct, dev,
                            error_file, rel2idx)
        if FLAGS.error_analysis:
            err_analysis(dev, wrong_indices, feed_dict_devtest, placeholder,
                         error_file, rel2idx, word2idx, devtest_he_error)
    return accuracy
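
# `best_threshold` is used above but defined elsewhere in the project. A minimal
# sketch of the assumed behavior only: sweep the predicted errors as candidate
# cut-offs (an example with error <= threshold is predicted positive, matching
# `do_eval`) and return (threshold, accuracy) for the accuracy-maximizing choice.
def best_threshold_sketch(pred_error, true_label):
    pred_error = np.asarray(pred_error)
    true_label = np.asarray(true_label)
    best_thresh, best_acc = 0.0, -1.0
    for thresh in np.unique(pred_error):
        acc = float(((pred_error <= thresh) == true_label).mean())
        if acc > best_acc:
            best_thresh, best_acc = thresh, acc
    return best_thresh, best_acc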
def accuracy_eval(sess, error, placeholder, data_set, rel2idx, FLAGS, error_file_name):
    feed_dict = feeder.fill_feed_dict(data_set, placeholder, rel2idx, 0)
    true_label = feed_dict[placeholder['label_placeholder']]
    pred_error = sess.run(error, feed_dict=feed_dict)
    _, acc = best_f1_threshold(pred_error, true_label)
    print('auc', calc_auc(pred_error, true_label))
    return acc
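
# `calc_auc` is used above but defined elsewhere. A minimal sketch, assuming it
# computes ROC AUC where a lower error means "more positive", so the error is
# negated before scoring (the same convention the commented-out
# average_precision_score call in do_eval uses); sklearn is an assumed dependency.
def calc_auc_sketch(pred_error, true_label):
    from sklearn.metrics import roc_auc_score  # assumed dependency for this sketch
    # roc_auc_score expects higher scores for the positive class.
    return roc_auc_score(true_label, -np.asarray(pred_error))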
def dev_eval(sess, error, placeholder, data_set, rel2idx, FLAGS, error_file_name):
    feed_dict = feeder.fill_feed_dict(data_set, placeholder, rel2idx, 0)
    true_label = feed_dict[placeholder['label_placeholder']]
    pred_error = sess.run(error, feed_dict=feed_dict)
    # Convert the negative log-probability error back to a probability in [0, 1].
    pred_prob = np.exp(-1 * np.asarray(pred_error))
    pred_prob = np.clip(pred_prob, 0, 1)
    kldiv_mean = kl_divergence_batch(pred_prob, true_label)
    # pears_corr = np.corrcoef(pred_prob, true_label)[0, 1]  # Pearson
    pears_corr = pearsonr(pred_prob, true_label)[0]   # Pearson
    spear_corr = spearmanr(pred_prob, true_label)[0]  # Spearman
    return kldiv_mean, pears_corr, spear_corr
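
# `kl_divergence_batch` is used above but defined elsewhere. A minimal sketch,
# assuming it returns the mean binary KL divergence KL(p_true || p_pred) over the
# batch, with clipping so the logarithms stay finite.
def kl_divergence_batch_sketch(pred_prob, true_label, eps=1e-12):
    p = np.clip(np.asarray(true_label, dtype=np.float64), eps, 1 - eps)
    q = np.clip(np.asarray(pred_prob, dtype=np.float64), eps, 1 - eps)
    kl = p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))
    return float(kl.mean())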
def do_eval(sess, error, placeholder, dev, devtest, curr_best, FLAGS,
            error_file_name, rel2idx, word2idx):
    # Pick the error threshold on the dev set.
    feed_dict_dev = feeder.fill_feed_dict(dev, placeholder, rel2idx, 0)
    true_label = feed_dict_dev[placeholder['label_placeholder']]
    pred_error = sess.run(error, feed_dict=feed_dict_dev)
    print('Dev Stats:', end='')
    print('AUC', calc_auc(pred_error, true_label))
    # print('average precision')
    # return average_precision_score(true_label, -pred_error)
    thresh, _ = best_f1_threshold(pred_error, true_label)
    # thresh, _ = best_accu_threshold(pred_error, true_label)

    # Evaluate devtest with the dev-chosen threshold.
    feed_dict_devtest = feeder.fill_feed_dict(devtest, placeholder, rel2idx, 0)
    true_label_devtest = feed_dict_devtest[placeholder['label_placeholder']]
    devtest_he_error = sess.run(error, feed_dict=feed_dict_devtest)
    print('Dev Test AUC', calc_auc(devtest_he_error, true_label_devtest))

    # F1 score calculation.
    tp, tn, fp, fn = 0, 0, 0, 0
    for n in range(len(devtest_he_error)):
        if devtest_he_error[n] <= thresh and true_label_devtest[n] == 1:
            tp += 1
        if devtest_he_error[n] <= thresh and true_label_devtest[n] == 0:
            fp += 1
        if devtest_he_error[n] > thresh and true_label_devtest[n] == 1:
            fn += 1
        if devtest_he_error[n] > thresh and true_label_devtest[n] == 0:
            tn += 1
    if (tp + fp) > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0
    print('precision, recall', precision, recall)
    if precision + recall > 0:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0

    # Accuracy calculation.
    pred = devtest_he_error <= thresh
    correct = (pred == true_label_devtest)
    accuracy = float(correct.astype('float32').mean())
    wrong_indices = np.logical_not(correct).nonzero()[0]
    wrong_preds = pred[wrong_indices]

    if FLAGS.error_analysis:
        error_file = open(error_file_name + "_test.txt", 'wt')
        print('error analysis')
        err_analysis(dev, wrong_indices, feed_dict_devtest, placeholder,
                     error_file, rel2idx, word2idx, devtest_he_error)
    if accuracy > curr_best:
        error_file = open(error_file_name + "_test.txt", 'wt')
        if FLAGS.rel_acc:
            rel_acc_checker(feed_dict_devtest, placeholder, correct, dev,
                            error_file, rel2idx)
        if FLAGS.error_analysis:
            print('error analysis')
            err_analysis(dev, wrong_indices, feed_dict_devtest, placeholder,
                         error_file, rel2idx, word2idx, devtest_he_error)
    return f1
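
# `best_f1_threshold` is used above but defined elsewhere. A minimal sketch,
# assuming it mirrors `best_threshold` but scores each candidate cut-off by F1
# (using the same error <= threshold convention as the loop in do_eval) and
# returns (threshold, best_f1).
def best_f1_threshold_sketch(pred_error, true_label):
    pred_error = np.asarray(pred_error)
    true_label = np.asarray(true_label)
    best_thresh, best_f1 = 0.0, -1.0
    for thresh in np.unique(pred_error):
        pred = pred_error <= thresh
        tp = float(np.sum(np.logical_and(pred, true_label == 1)))
        fp = float(np.sum(np.logical_and(pred, true_label == 0)))
        fn = float(np.sum(np.logical_and(np.logical_not(pred), true_label == 1)))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        if f1 > best_f1:
            best_thresh, best_f1 = thresh, f1
    return best_thresh, best_f1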
def run_training():
    exp_name = 'time' + str(datetime.now()) + 'train_file' + str(FLAGS.train_file) \
        + 'freeze_grad' + str(FLAGS.freeze_grad) + 'neg' + str(FLAGS.neg) \
        + 'model' + str(FLAGS.model) + '_measure' + str(FLAGS.measure) \
        + '_w1' + str(FLAGS.w1) + '_w2' + str(FLAGS.w2) \
        + '_learning_rate' + str(FLAGS.learning_rate) + '_batchsize' + str(FLAGS.batch_size) \
        + '_dim' + str(FLAGS.embed_dim) + '_cube_eps' + str(FLAGS.cube_eps) \
        + '_steps' + str(FLAGS.max_steps) + '_softfreeze' + str(FLAGS.softfreeze) \
        + '_r1' + str(FLAGS.r1) + '_paireval' + str(FLAGS.pair_eval)
    print('experiment file name', exp_name)
    error_file_name = FLAGS.error_file + exp_name + '.txt'
    save_model_name = FLAGS.params_file + exp_name + '.pkl'
    log_folder = FLAGS.log_file + exp_name + '/'

    # Lists of evaluation numbers tracked over training.
    train_acc_list, dev2_acc_list = [], []
    curr_best = 0.0

    # Reading the data set is a one-time cost, so it is fine if it takes a bit longer.
    data_sets = data_loader.read_data_sets(FLAGS)
    if FLAGS.overfit:
        train_data = data_sets.dev
        train_test_data = data_sets.dev
    else:
        train_data = data_sets.train
        train_test_data = data_sets.train_test

    with tf.Graph().as_default():
        print('Build Model...')
        placeholder = feeder.define_placeholder()
        if FLAGS.model == 'transe':
            model = transe_model.tf_model(data_sets, placeholder, FLAGS)
        elif FLAGS.model == 'cube' and FLAGS.rel_size > 1:
            model = multi_model.tf_model(data_sets, placeholder, FLAGS)
        elif FLAGS.model == 'cube' and FLAGS.rel_size == 1:
            model = tf_model(data_sets, placeholder, FLAGS)
        else:
            raise ValueError('no valid model combination: transe or cube')
        eval_neg_prob = model.eval_prob

        print('Build Training Function...')
        train_op = model.training(model.loss, FLAGS.epsilon, FLAGS.learning_rate)
        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)

        if not os.path.exists(log_folder):
            os.makedirs(log_folder)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(log_folder, graph=sess.graph)

        for step in range(FLAGS.max_steps):
            start_time = time.time()
            train_feed_dict = feeder.fill_feed_dict(train_data, placeholder,
                                                    data_sets.rel2idx, FLAGS.batch_size)
            _, loss_value, summary = sess.run([train_op, model.loss, summary_op],
                                              feed_dict=train_feed_dict)
            summary_writer.add_summary(summary, step)
            duration = time.time() - start_time

            if step % FLAGS.print_every == 0:
                print('=' * 100)
                print('Epoch %d: kb_loss = %.5f (%.3f sec)'
                      % (train_data._epochs_completed, loss_value, duration))
                print('Training Stats:', end='')
                train_acc = evaluater.single_eval(sess, eval_neg_prob, placeholder,
                                                  train_test_data, data_sets.rel2idx,
                                                  FLAGS, error_file_name)
                train_acc_list.append(train_acc)
                dev2_acc = evaluater.do_eval(sess, eval_neg_prob, placeholder,
                                             data_sets.dev, data_sets.devtest, curr_best,
                                             FLAGS, error_file_name, data_sets.rel2idx,
                                             data_sets.word2idx)
                dev2_acc_list.append(dev2_acc)
                print("Accuracy for Devtest: %.5f" % dev2_acc)
                if dev2_acc >= curr_best:
                    curr_best = dev2_acc
                    if FLAGS.save:
                        save_model(save_model_name, sess, model)
                print('current best accuracy', curr_best)

        # End-of-training summary.
        print('Average of Top 10 Training Score',
              np.mean(sorted(train_acc_list, reverse=True)[:10]))
        opt_idx = np.argmax(np.asarray(dev2_acc_list))
        print('Epoch', opt_idx)
        print('Best Dev2 Score: %.5f' % dev2_acc_list[opt_idx])
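
# `save_model` is called above but defined elsewhere. A minimal sketch, assuming
# it pickles the current values of the model's trainable variables so they can be
# reloaded later (e.g. as the pre-trained initial embeddings used in the
# "pre_train" branch of the second run_training below).
def save_model_sketch(save_model_name, sess, model):
    import pickle  # assumed dependency for this sketch
    params = {v.name: sess.run(v) for v in tf.trainable_variables()}
    with open(save_model_name, 'wb') as f:
        pickle.dump(params, f)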
def single_eval(sess, error, placeholder, data_set, rel2idx, FLAGS, error_file_name):
    feed_dict = feeder.fill_feed_dict(data_set, placeholder, rel2idx, 0)
    true_label = feed_dict[placeholder['label_placeholder']]
    pred_error = sess.run(error, feed_dict=feed_dict)
    _, acc = best_threshold(pred_error, true_label)
    return acc
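
# `feeder.fill_feed_dict` is shared by every routine above but defined elsewhere.
# A rough sketch of the assumed contract only: every placeholder key other than
# 'label_placeholder' is hypothetical, `next_batch` is a hypothetical accessor,
# and batch_size == 0 is assumed to mean "use the whole data set" (all eval calls
# above pass 0).
def fill_feed_dict_sketch(data_set, placeholder, rel2idx, batch_size):
    t1_idx, t2_idx, rel_idx, label = data_set.next_batch(batch_size)  # hypothetical accessor
    return {
        placeholder['t1_idx_placeholder']: t1_idx,    # hypothetical key
        placeholder['t2_idx_placeholder']: t2_idx,    # hypothetical key
        placeholder['rel_idx_placeholder']: rel_idx,  # hypothetical key
        placeholder['label_placeholder']: label,
    }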
def run_training():
    # exp_name = 'time' + str(datetime.now()) + 'train_file' + str(FLAGS.train_file) \
    #     + 'freeze_grad' + str(FLAGS.freeze_grad) + 'neg' + str(FLAGS.neg) \
    #     + 'model' + str(FLAGS.model) + '_measure' + str(FLAGS.measure) \
    #     + '_w1' + str(FLAGS.w1) + '_w2' + str(FLAGS.w2) \
    #     + '_learning_rate' + str(FLAGS.learning_rate) + '_batchsize' + str(FLAGS.batch_size) \
    #     + '_dim' + str(FLAGS.embed_dim) + '_cube_eps' + str(FLAGS.cube_eps) \
    #     + '_steps' + str(FLAGS.max_steps) + '_softfreeze' + str(FLAGS.softfreeze) \
    #     + '_r1' + str(FLAGS.r1) + '_paireval' + str(FLAGS.pair_eval)
    exp_name = 'time' + str(datetime.now()) + '_EXP' + str(FLAGS.train_dir) + \
        '_w1' + str(FLAGS.w1) + '_w2' + str(FLAGS.w2) + '_r1' + str(FLAGS.r1) + \
        '_dim' + str(FLAGS.embed_dim) + '_lr' + str(FLAGS.learning_rate)
    exp_name = exp_name.replace(":", "-")
    exp_name = exp_name.replace("/", "-")
    print('experiment file name:', exp_name)
    error_file_name = FLAGS.error_file + exp_name + '.txt'
    # Presumably FLAGS.train_dir starts with './', so '_EXP.-' appears in exp_name
    # after the replacements above and the split keeps only the experiment suffix.
    save_model_name = FLAGS.params_file + exp_name.split("_EXP.-")[1] + '.pkl'
    log_folder = FLAGS.log_file + exp_name + '/'
    loss_file = log_folder + 'losses.txt'
    eval_file = log_folder + 'evals.txt'
    dev_res = log_folder + 'dev_results.txt'
    viz_dict_file = log_folder + 'viz_dict.npy'
    viz_dict = {}  # key: epoch_item1_item2, value: conditional probability
    if FLAGS.init_embedding == "pre_train":
        loss_file = log_folder + 'pre_train_losses.txt'
        eval_file = log_folder + 'pre_train_evals.txt'
        dev_res = log_folder + 'pre_train_dev_results.txt'
        if not FLAGS.init_embedding_file:
            FLAGS.init_embedding_file = save_model_name
    curr_best = np.inf  # lower KL divergence is better, so start from infinity

    # Reading the data set is a one-time cost, so it is fine if it takes a bit longer.
    data_sets = data_loader.read_data_sets(FLAGS)
    if FLAGS.overfit:
        train_data = data_sets.dev
        train_test_data = data_sets.dev
    else:
        train_data = data_sets.train
        train_test_data = data_sets.train_test

    with tf.Graph().as_default():
        print('Build Model...')
        placeholder = feeder.define_placeholder()
        if FLAGS.model == 'transe':
            model = transe_model.tf_model(data_sets, placeholder, FLAGS)
        elif FLAGS.model == 'oe':
            print('OE')
            model = oe_model.tf_model(data_sets, placeholder, FLAGS)
        elif FLAGS.model == 'cube' and FLAGS.rel_size > 1:
            model = multi_model.tf_model(data_sets, placeholder, FLAGS)
        elif FLAGS.model == 'cube' and FLAGS.rel_size == 1:
            print('hard box')
            model = tf_model(data_sets, placeholder, FLAGS)
        elif FLAGS.model == 'softbox':
            print('softbox')
            model = soft_model.tf_model(data_sets, placeholder, FLAGS)
        else:
            raise ValueError('no valid model combination: transe, oe, cube, or softbox')
        eval_neg_prob = model.eval_prob

        print('Build Training Function...')
        train_op = model.training(model.loss, FLAGS.epsilon, FLAGS.learning_rate)
        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)
        # gradient plot
        # grad_norm_list = []
        # plt.ion()
        i = 0  # tracks dev-set performance; used to stop if there is no gain

        log_folder = log_folder.replace(":", "-")[:150]
        if not os.path.exists(log_folder):
            os.makedirs(log_folder)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(log_folder, graph=sess.graph)

        if FLAGS.marginal_method == 'softplus' or FLAGS.model == 'box':
            sess.run([model.project_op])

        for step in range(FLAGS.max_steps):
            start_time = time.time()
            train_feed_dict = feeder.fill_feed_dict(train_data, placeholder,
                                                    data_sets.rel2idx, FLAGS.batch_size)
            if FLAGS.marginal_method == 'softplus' or FLAGS.model == 'box':
                sess.run([model.project_op], feed_dict=train_feed_dict)
            _, cond_loss, marg_loss, reg_loss, loss_value, temperature, summary = sess.run(
                [train_op, model.cond_loss, model.marg_loss, model.regularization,
                 model.loss, model.temperature, summary_op],
                feed_dict=train_feed_dict)
            # grad_norm_list.append(grad_norm)
            # moving_average = np.convolve(grad_norm_list, np.ones((50,)) / 50, mode='valid')
            # if step % 5:
            #     plt.figure(1)
            #     plt.plot(moving_average)
            #     plt.draw()
            #     plt.pause(0.0001)
            #     plt.clf()
            # min_embed, delta_embed = sess.run([model.min_embed, model.delta_embed], feed_dict=train_feed_dict)
            debug, loss_value, summary = sess.run([model.debug, model.loss, summary_op],
                                                  feed_dict=train_feed_dict)
            summary_writer.add_summary(summary, step)
            duration = time.time() - start_time

            if (step % FLAGS.print_every == 0) and step > 1:
                print('=' * 100)
                print('step', step)
                print('temperature', temperature)
                if temperature > 0.0001:
                    sess.run(model.temperature_update)
                print('Epoch %d: Total_loss = %.5f (%.3f sec)'
                      % (train_data._epochs_completed, loss_value, duration))
                print('Conditional loss: %.5f, Marginal loss: %.5f, Regularization loss: %.5f'
                      % (cond_loss, marg_loss, reg_loss))
                print('w Stats:', end='')
                loss_tuple = (loss_value, cond_loss, marg_loss, reg_loss)
                # Should be calculated on a subset of the training data, not train_test!
                # train_eval is a tuple of (KL, Pearson, Spearman).
                train_eval = evaluater.kl_corr_eval(sess, eval_neg_prob, placeholder,
                                                    train_test_data, data_sets.rel2idx,
                                                    FLAGS, error_file_name)
                print("Train eval KL & Corr:", train_eval, end='\n')
                with open(loss_file, "a") as lfile:
                    lfile.write(str(loss_tuple)[1:-1] + '\n')
                with open(eval_file, "a") as efile:
                    efile.write(str(train_eval)[1:-1] + '\n')
                # Overwrite any previously saved model with the current model.
                if FLAGS.save:
                    save_model(save_model_name, sess, model)
                if FLAGS.visualize:
                    # Collect data for the confusion-matrix and rectangle visualizations.
                    viz_dict = evaluater.visualization(sess, model, viz_dict,
                                                       train_feed_dict,
                                                       train_data._epochs_completed)

        # DEV SET EVAL -- evaluate on the dev set after training is over.
        dev_err_file = 'dev_' + error_file_name
        dev_eval = evaluater.kl_corr_eval(sess, eval_neg_prob, placeholder,
                                          data_sets.dev, data_sets.rel2idx, FLAGS,
                                          dev_err_file)
        # Save the dev set results.
        print("DEV data eval:", dev_eval)
        with open(dev_res, 'w') as dfile:
            dfile.write(str(dev_eval)[1:-1])
        if FLAGS.visualize:
            print("Saved viz dict to file:", viz_dict_file)
            np.save(viz_dict_file, viz_dict)
            np.save(log_folder + "word2idx.npy", data_sets.word2idx)
            np.save(log_folder + "idx2word.npy", data_sets.idx2word)
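
# Small usage sketch (an assumed consumer, not part of the training code): each
# line of losses.txt / evals.txt written above is a tuple rendered as
# comma-separated values via str(tuple)[1:-1], so the curves can be reloaded for
# plotting like this.
def load_metric_file_sketch(path):
    # Returns an array of shape (num_evals, num_fields).
    return np.loadtxt(path, delimiter=',', ndmin=2)

# Example:
#   losses = load_metric_file_sketch(log_folder + 'losses.txt')  # columns: total, cond, marg, reg
#   evals = load_metric_file_sketch(log_folder + 'evals.txt')    # columns: KL, Pearson, Spearman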
def run_training():
    exp_name = 'time' + str(datetime.now()) + 'train_file' + str(FLAGS.train_file) \
        + 'freeze_grad' + str(FLAGS.freeze_grad) + 'neg' + str(FLAGS.neg) \
        + 'model' + str(FLAGS.model) + '_measure' + str(FLAGS.measure) \
        + '_w1' + str(FLAGS.w1) + '_w2' + str(FLAGS.w2) \
        + '_learning_rate' + str(FLAGS.learning_rate) + '_batchsize' + str(FLAGS.batch_size) \
        + '_dim' + str(FLAGS.embed_dim) + '_cube_eps' + str(FLAGS.cube_eps) \
        + '_steps' + str(FLAGS.max_steps) + '_softfreeze' + str(FLAGS.softfreeze) \
        + '_r1' + str(FLAGS.r1) + '_paireval' + str(FLAGS.pair_eval)
    print('experiment file name', exp_name)
    error_file_name = FLAGS.error_file + exp_name + '.txt'
    save_model_name = FLAGS.params_file + exp_name + '.pkl'
    log_folder = FLAGS.log_file + exp_name + '/'

    # Lists of evaluation numbers tracked over training.
    train_acc_list, dev2_acc_list = [], []
    curr_best = 0.0

    # Reading the data set is a one-time cost, so it is fine if it takes a bit longer.
    data_sets = data_loader.read_data_sets(FLAGS)
    if FLAGS.overfit:
        train_data = data_sets.dev
        train_test_data = data_sets.dev
    else:
        train_data = data_sets.train
        train_test_data = data_sets.train_test

    with tf.Graph().as_default():
        print('Build Model...')
        placeholder = feeder.define_placeholder()
        if FLAGS.model == 'transe':
            model = transe_model.tf_model(data_sets, placeholder, FLAGS)
        elif FLAGS.model == 'oe':
            print('OE')
            model = oe_model.tf_model(data_sets, placeholder, FLAGS)
        elif FLAGS.model == 'cube' and FLAGS.rel_size > 1:
            model = multi_model.tf_model(data_sets, placeholder, FLAGS)
        elif FLAGS.model == 'cube' and FLAGS.rel_size == 1:
            print('hard box')
            model = tf_model(data_sets, placeholder, FLAGS)
        elif FLAGS.model == 'softbox':
            print('softbox')
            model = soft_model.tf_model(data_sets, placeholder, FLAGS)
        else:
            raise ValueError('no valid model combination: transe, oe, cube, or softbox')
        eval_neg_prob = model.eval_prob

        print('Build Training Function...')
        train_op = model.training(model.loss, FLAGS.epsilon, FLAGS.learning_rate)
        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)
        # gradient plot
        # grad_norm_list = []
        # plt.ion()
        i = 0  # counts evaluations without improvement on the dev set

        if not os.path.exists(log_folder):
            os.makedirs(log_folder)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(log_folder, graph=sess.graph)

        if FLAGS.marginal_method == 'softplus' or FLAGS.model == 'box':
            sess.run([model.project_op])

        for step in range(FLAGS.max_steps):
            start_time = time.time()
            train_feed_dict = feeder.fill_feed_dict(train_data, placeholder,
                                                    data_sets.rel2idx, FLAGS.batch_size)
            if FLAGS.marginal_method == 'softplus' or FLAGS.model == 'box':
                sess.run([model.project_op], feed_dict=train_feed_dict)
            _, cond_loss, marg_loss, reg_loss, loss_value, temperature, summary = sess.run(
                [train_op, model.cond_loss, model.marg_loss, model.regularization,
                 model.loss, model.temperature, summary_op],
                feed_dict=train_feed_dict)
            # grad_norm_list.append(grad_norm)
            # moving_average = np.convolve(grad_norm_list, np.ones((50,)) / 50, mode='valid')
            # if step % 5:
            #     plt.figure(1)
            #     plt.plot(moving_average)
            #     plt.draw()
            #     plt.pause(0.0001)
            #     plt.clf()
            # min_embed, delta_embed = sess.run([model.min_embed, model.delta_embed], feed_dict=train_feed_dict)
            debug, loss_value, summary = sess.run([model.debug, model.loss, summary_op],
                                                  feed_dict=train_feed_dict)
            summary_writer.add_summary(summary, step)
            duration = time.time() - start_time

            if (step % FLAGS.print_every == 0) and step > 1:
                print('=' * 100)
                print('step', step)
                print('temperature', temperature)
                if temperature > 0.0001:
                    sess.run(model.temperature_update)
                print('Epoch %d: kb_loss = %.5f (%.3f sec)'
                      % (train_data._epochs_completed, loss_value, duration))
                print('Conditional loss: %.5f, Marginal loss: %.5f, Regularization loss: %.5f'
                      % (cond_loss, marg_loss, reg_loss))
                print('Training Stats:', end='')
                train_acc = evaluater.accuracy_eval(sess, eval_neg_prob, placeholder,
                                                    train_test_data, data_sets.rel2idx,
                                                    FLAGS, error_file_name)
                train_acc_list.append(train_acc)
                dev2_acc = evaluater.do_eval(sess, eval_neg_prob, placeholder,
                                             data_sets.dev, data_sets.devtest, curr_best,
                                             FLAGS, error_file_name, data_sets.rel2idx,
                                             data_sets.word2idx)
                dev2_acc_list.append(dev2_acc)
                print("Accuracy for Devtest: %.5f" % dev2_acc)
                print(i)
                if dev2_acc >= curr_best:
                    i = 0
                    curr_best = dev2_acc
                    if FLAGS.save:
                        save_model(save_model_name, sess, model)
                # elif dev2_acc < curr_best and i < 50:
                #     i += 1
                # elif i >= 50:
                #     sys.exit()
                print('current best accuracy: %.5f' % curr_best)
                print('Average of dev2 score %.5f'
                      % np.mean(sorted(dev2_acc_list, reverse=True)[:10]))

        # End-of-training summary.
        print('Average of Top 10 Training Score',
              np.mean(sorted(train_acc_list, reverse=True)[:10]))
        opt_idx = np.argmax(np.asarray(dev2_acc_list))
        print('Epoch', opt_idx)
        # print('Best Dev2 Score: %.5f' % dev2_acc_list[opt_idx])
        print('Average of dev2 score %.5f'
              % np.mean(sorted(dev2_acc_list, reverse=True)[:10]))
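
# Hypothetical entry point, shown for completeness: the FLAGS definitions and any
# argument parsing live elsewhere in the project, so this only illustrates how a
# TF1-style trainer like the ones above is usually launched.
if __name__ == '__main__':
    run_training()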