def eval_all(esnli_net, criterion_expl, params):
    """Run the full final evaluation of a trained e-SNLI model.

    Evaluates the model on the SNLI dev/test splits (label accuracy,
    explanation perplexity/BLEU, accuracy recovered from generated
    explanations), then runs the sentence encoder through the SentEval
    transfer tasks, and writes a single CSV report (one dev row, one
    test row) comparing against the published InferSent/AllNLI numbers.

    Args:
        esnli_net: trained model; `esnli_net.encoder` is the sentence
            encoder handed to SentEval.
        criterion_expl: loss used for the explanation decoder metrics.
        params: namespace carrying paths, vocab, and eval settings
            (esnli_path, eval_batch_size, current_run_dir, ...).

    Side effects: creates `<current_run_dir>/<dd:mm>_<HH:MM:SS>_aux_tasks.csv`.
    """
    word_index = params.word_index
    batch_size = params.eval_batch_size
    print_every = params.print_every
    current_run_dir = params.current_run_dir
    train_snli_classif = params.train_snli_classif
    use_prototype_senteval = params.use_prototype_senteval

    esnli_net.eval()

    transfer_tasks = [
        'MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC',
        'SICKEntailment', 'SICKRelatedness', 'STS14', 'STSBenchmark'
    ]
    if params.do_image_caption:
        transfer_tasks.append('ImageCaptionRetrieval')

    # Tasks whose headline metric is a classification accuracy; these
    # feed the MACRO/MICRO averages at the end.
    accuracy_tasks = [
        'MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC', 'SICKEntailment'
    ]

    # Published InferSent (AllNLI) results, the baseline for 'Delta'.
    infersent_allnli = {
        'MR': 81.1, 'CR': 86.3, 'SUBJ': 92.4, 'MPQA': 90.2, 'SST2': 84.6,
        'TREC': 88.2, 'MRPC_acc': 76.2, 'MRPC_f1': 83.1,
        'SICKRelatedness': 0.884, 'SICKEntailment': 86.3,
        'STSB_pearson': 75.8, 'STSB_spearman': 75.5
    }

    # The report is built as one CSV column per metric: a header cell plus
    # a dev cell and a test cell.
    headers = ['set']
    row_dev = ['dev']
    row_test = ['test']

    def _add_result(name, dev_val, test_val):
        # Append one column (header + dev/test cells) to the report.
        headers.append(name)
        row_dev.append(dev_val)
        row_test.append(test_val)

    # Auxiliary-task results are saved per run in a timestamped csv file.
    dev_csv = os.path.join(
        current_run_dir,
        time.strftime("%d:%m") + "_" + time.strftime("%H:%M:%S") + "_" +
        "aux_tasks.csv")
    remove_file(dev_csv)

    # ---- SNLI: label accuracy + explanation metrics ----
    expl_no_unk_dev = get_dev_test_original_expl(params.esnli_path, 'dev')
    expl_no_unk_test = get_dev_test_original_expl(params.esnli_path, 'test')

    preproc = params.preproc_expl + "_"
    snli_dev = get_dev_test_with_expl(params.esnli_path, 'dev', preproc,
                                      params.min_freq)
    snli_test = get_dev_test_with_expl(params.esnli_path, 'test', preproc,
                                       params.min_freq)

    snli_sentences = snli_dev['s1'] + snli_dev['s2'] + snli_dev['expl_1'] + \
        snli_dev['expl_2'] + snli_dev['expl_3'] + snli_test['s1'] + \
        snli_test['s2'] + snli_test['expl_1'] + snli_test['expl_2'] + \
        snli_test['expl_3']
    word_vec = build_vocab(snli_sentences, GLOVE_PATH)

    # Keep only words that have a GloVe vector and add boundary tokens.
    # (Replaces the original eval(data_type)[...] string indirection with
    # direct iteration over the two dataset dicts.)
    for dataset in (snli_dev, snli_test):
        for split in ['s1', 's2', 'expl_1', 'expl_2', 'expl_3']:
            dataset[split] = np.array(
                [['<s>'] + [word for word in sent.split() if word in word_vec]
                 + ['</s>'] for sent in dataset[split]])

    final_dev_acc, dev_bleu_score, final_dev_ppl, acc_from_expl_dev = \
        evaluate_snli_final(esnli_net, criterion_expl, 'snli_dev', snli_dev,
                            expl_no_unk_dev, word_vec, word_index, batch_size,
                            print_every, current_run_dir)
    test_acc, test_bleu_score, test_ppl, acc_from_expl_test = \
        evaluate_snli_final(esnli_net, criterion_expl, 'snli_test', snli_test,
                            expl_no_unk_test, word_vec, word_index, batch_size,
                            print_every, current_run_dir)

    _add_result('SNLI-acc', final_dev_acc, test_acc)
    _add_result('SNLI-acc_from_expl', acc_from_expl_dev, acc_from_expl_test)
    _add_result('SNLI-ppl', final_dev_ppl, test_ppl)
    _add_result('SNLI-BLEU', dev_bleu_score, test_bleu_score)

    # ---- SentEval: run best model on downstream transfer tasks ----
    def prepare(params, samples):
        # SentEval hook: build the encoder vocab over the task corpus.
        params.infersent.build_vocab([' '.join(s) for s in samples],
                                     tokenize=False)

    def batcher(params, batch):
        # SentEval hook: encode one batch of pre-tokenized sentences.
        sentences = [' '.join(s) for s in batch]
        embeddings = params.infersent.encode(sentences,
                                             bsize=params.batch_size,
                                             tokenize=False)
        return embeddings

    if use_prototype_senteval:
        # Cheap prototype settings to speed up SentEval; development only.
        params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True,
                           'kfold': 5}
        params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop',
                                         'batch_size': 128, 'tenacity': 3,
                                         'epoch_size': 2}
    else:
        # Final settings.
        params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True,
                           'kfold': 10}
        params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam',
                                         'batch_size': 64, 'tenacity': 5,
                                         'epoch_size': 4}

    params_senteval['infersent'] = esnli_net.encoder
    params_senteval['infersent'].set_glove_path(GLOVE_PATH)

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    results = se.eval(transfer_tasks)
    print("results ", results)

    macro_dev = 0
    micro_dev = 0
    n_total_dev = 0
    macro_test = 0
    micro_test = 0
    n_total_test = 0
    delta = 0  # summed improvement over the InferSent/AllNLI baseline

    for task in transfer_tasks:
        if task in accuracy_tasks:
            if task == 'MRPC':
                # MRPC reports both accuracy and F1; F1 feeds Delta and
                # has no dev value.
                _add_result('MRPC-acc', round(results[task]['devacc'], 1),
                            round(results[task]['acc'], 1))
                _add_result('MRPC-F1', " ", round(results[task]['f1'], 1))
                delta += results[task]['f1'] - infersent_allnli['MRPC_f1']
            else:
                _add_result(task, round(results[task]['devacc'], 1),
                            round(results[task]['acc'], 1))
                delta += results[task]['acc'] - infersent_allnli[task]
            macro_test += round(results[task]['acc'], 1)
            micro_test += round(results[task]['ntest'] * results[task]['acc'], 1)
            n_total_test += results[task]['ntest']
            macro_dev += round(results[task]['devacc'], 1)
            micro_dev += round(results[task]['ndev'] * results[task]['devacc'], 1)
            n_total_dev += results[task]['ndev']
        elif task == "SICKRelatedness":
            _add_result('SICK-R_pearson',
                        round(results[task]['devpearson'], 3),
                        round(results[task]['pearson'], 3))
            # Pearson is a fraction; scale to percentage points for Delta.
            delta += 100 * (results[task]['pearson'] - infersent_allnli[task])
        elif task == "STS14":
            # Unsupervised task: no dev numbers.
            _add_result('STS14_pearson', " ",
                        round(results[task]['all']['pearson']['mean'], 2))
            _add_result('STS14_spearman', " ",
                        round(results[task]['all']['spearman']['mean'], 2))
        elif task == "STSBenchmark":
            _add_result('STSB_pearson', round(results[task]['devpearson'], 3),
                        round(results[task]['pearson'], 3))
            _add_result('STSB_spearman', " ",
                        round(results[task]['spearman'], 3))
            delta += round(100 * results[task]['spearman'], 1) - \
                infersent_allnli['STSB_spearman']
        elif task == "ImageCaptionRetrieval":
            headers += [
                'Caption_retrival_R1', 'Caption_retrival_R5',
                'Caption_retrival_R10', 'Caption_retrival_Medr',
                'Image_retrival_R1', 'Image_retrival_R5',
                'Image_retrival_R10', 'Image_retrival_Medr'
            ]
            row_dev += [" "] * 8  # retrieval has no dev numbers
            # NOTE(review): results[task]['acc'] appears to hold
            # [caption-retrieval metrics, image-retrieval metrics], four
            # values each (R@1, R@5, R@10, median rank) — confirm against
            # SentEval's ImageCaptionRetrieval output.
            for direction in range(2):
                for metric in range(4):
                    row_test.append(results[task]['acc'][direction][metric])

    # Delta averages the 10 baseline comparisons accumulated above
    # (7 plain accuracy tasks + MRPC-F1 + SICK-R + STSB spearman).
    _add_result('Delta', "", round(delta / 10, 2))
    _add_result('MACRO', round(macro_dev / len(accuracy_tasks), 1),
                round(macro_test / len(accuracy_tasks), 1))
    _add_result('MICRO', round(micro_dev / n_total_dev, 1),
                round(micro_test / n_total_test, 1))

    if train_snli_classif:
        # Ignore the trained classifier (it might not even be trained if
        # alpha=0) and train the same MLP architecture on top of the frozen
        # embeddings, to see how it compares to the jointly trained one.
        params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True,
                           'kfold': 10}
        params_senteval['classifier'] = {'nhid': params.fc_dim,
                                         'optim': 'adam', 'batch_size': 128,
                                         'tenacity': 5, 'epoch_size': 4}
        params_senteval['infersent'] = esnli_net.encoder
        params_senteval['infersent'].set_glove_path(GLOVE_PATH)
        se = senteval.engine.SE(params_senteval, batcher, prepare)
        results = se.eval(['SNLI'])
        print("results SNLI classif trained with SentEval ", results)
        _add_result('SNLI_train_classif',
                    round(results['SNLI']['devacc'], 1),
                    round(results['SNLI']['acc'], 1))

    # Write the report last; `with` guarantees the handle is closed even on
    # error (the original held the file open for the whole evaluation and
    # leaked it on exceptions).
    with open(dev_csv, "a") as dev_f:
        writer = csv.writer(dev_f)
        writer.writerow(headers)
        writer.writerow(row_dev)
        writer.writerow(row_test)
random.seed(params.seed) torch.manual_seed(params.seed) torch.cuda.manual_seed(params.seed) """ ALL DATA, some will only be needed for eval for we want to build glove vocab once """ preproc = params.preproc_expl + "_" train = get_train(params.esnli_path, preproc, params.min_freq, params.n_train) snli_dev = get_dev_test_with_expl(params.esnli_path, 'dev', preproc, params.min_freq) all_sentences = train['s1'] + train['s2'] + train['expl_1'] + snli_dev[ 's1'] + snli_dev['s2'] + snli_dev['expl_1'] + snli_dev[ 'expl_2'] + snli_dev['expl_3'] word_vec = build_vocab(all_sentences, GLOVE_PATH) expl_sentences_train = train['expl_1'] word_index_train = get_word_dict(expl_sentences_train) expl_sentences = train['expl_1'] + snli_dev['expl_1'] + snli_dev[ 'expl_2'] + snli_dev['expl_3'] + snli_test['expl_1'] + snli_test[ 'expl_2'] + snli_test['expl_3'] word_index = get_word_dict(expl_sentences) params.word_index = word_index print "difference ", set(word_index.keys()) - set(word_index_train.keys()) if params.n_train == -1: # there may be some words that appear in premise and hypothesis of train as well as in expl of dev or test but not in explanation of train. There was only one as far as i looked but if there are too many we should maybe take care of this. assert len( word_index) - len(word_index_train) < 5, "n words in train " + str( len(word_index_train)) + " while n words in total " + str(