def eval_all(esnli_net, criterion_expl, params):
    """Run the full evaluation suite for a trained e-SNLI model.

    Evaluates on SNLI dev/test (label accuracy, explanation perplexity,
    BLEU, accuracy-from-explanation) and on the SentEval transfer tasks,
    then writes all results to a timestamped CSV (header row + one 'dev'
    row + one 'test' row) in ``params.current_run_dir``.

    Args:
        esnli_net: trained model; its ``encoder`` attribute is used as an
            InferSent-style sentence encoder by SentEval (must support
            ``set_glove_path`` / ``build_vocab`` / ``encode``).
        criterion_expl: loss criterion for the explanation decoder, used
            to compute perplexity.
        params: run configuration namespace (paths, batch sizes, flags).
    """
    word_index = params.word_index
    batch_size = params.eval_batch_size
    print_every = params.print_every
    current_run_dir = params.current_run_dir
    train_snli_classif = params.train_snli_classif
    use_prototype_senteval = params.use_prototype_senteval

    esnli_net.eval()

    transfer_tasks = [
        'MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STS14', 'STSBenchmark'
    ]
    if params.do_image_caption:
        transfer_tasks.append('ImageCaptionRetrieval')

    # Tasks whose headline metric is classification accuracy.
    accuracy_tasks = [
        'MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC', 'SICKEntailment'
    ]

    # Published InferSent (AllNLI) results used as the baseline when
    # computing the 'Delta' column below.
    infersent_allnli = {
        'MR': 81.1,
        'CR': 86.3,
        'SUBJ': 92.4,
        'MPQA': 90.2,
        'SST2': 84.6,
        'TREC': 88.2,
        'MRPC_acc': 76.2,
        'MRPC_f1': 83.1,
        'SICKRelatedness': 0.884,
        'SICKEntailment': 86.3,
        'STSB_pearson': 75.8,
        'STSB_spearman': 75.5
    }

    # save auxiliary tasks results at each epoch in a csv file
    dev_csv = os.path.join(
        current_run_dir,
        time.strftime("%d:%m") + "_" + time.strftime("%H:%M:%S") + "_" +
        "aux_tasks.csv")
    remove_file(dev_csv)
    dev_f = open(dev_csv, "a")
    writer = csv.writer(dev_f)

    headers = []
    headers.append('set')
    row_dev = ['dev']
    row_test = ['test']

    # SNLI: load original (un-preprocessed) explanations for BLEU scoring,
    # and the preprocessed dev/test splits for model evaluation.
    expl_no_unk_dev = get_dev_test_original_expl(params.esnli_path, 'dev')
    expl_no_unk_test = get_dev_test_original_expl(params.esnli_path, 'test')

    preproc = params.preproc_expl + "_"
    snli_dev = get_dev_test_with_expl(params.esnli_path, 'dev', preproc,
                                      params.min_freq)
    snli_test = get_dev_test_with_expl(params.esnli_path, 'test', preproc,
                                       params.min_freq)
    snli_sentences = snli_dev['s1'] + snli_dev['s2'] + snli_dev[
        'expl_1'] + snli_dev['expl_2'] + snli_dev['expl_3'] + snli_test[
            's1'] + snli_test['s2'] + snli_test['expl_1'] + snli_test[
                'expl_2'] + snli_test['expl_3']
    word_vec = build_vocab(snli_sentences, GLOVE_PATH)

    # Wrap each sentence with <s>/</s> markers and drop out-of-vocabulary
    # words. (Replaces the original eval(data_type) lookup — iterating the
    # two dicts directly is equivalent and avoids eval().)
    for split in ['s1', 's2', 'expl_1', 'expl_2', 'expl_3']:
        for dataset in (snli_dev, snli_test):
            dataset[split] = np.array(
                [['<s>'] + [word
                            for word in sent.split() if word in word_vec] +
                 ['</s>'] for sent in dataset[split]])

    final_dev_acc, dev_bleu_score, final_dev_ppl, acc_from_expl_dev = evaluate_snli_final(
        esnli_net, criterion_expl, 'snli_dev', snli_dev, expl_no_unk_dev,
        word_vec, word_index, batch_size, print_every, current_run_dir)
    test_acc, test_bleu_score, test_ppl, acc_from_expl_test = evaluate_snli_final(
        esnli_net, criterion_expl, 'snli_test', snli_test, expl_no_unk_test,
        word_vec, word_index, batch_size, print_every, current_run_dir)

    headers.append('SNLI-acc')
    row_dev.append(final_dev_acc)
    row_test.append(test_acc)

    headers.append('SNLI-acc_from_expl')
    row_dev.append(acc_from_expl_dev)
    row_test.append(acc_from_expl_test)

    headers.append('SNLI-ppl')
    row_dev.append(final_dev_ppl)
    row_test.append(test_ppl)

    headers.append('SNLI-BLEU')
    row_dev.append(dev_bleu_score)
    row_test.append(test_bleu_score)

    # Run best model on downstream tasks.
    def prepare(params, samples):
        # SentEval hook: build the encoder vocabulary from the task corpus.
        params.infersent.build_vocab([' '.join(s) for s in samples],
                                     tokenize=False)

    def batcher(params, batch):
        # SentEval hook: encode a batch of (already tokenized) sentences.
        sentences = [' '.join(s) for s in batch]
        embeddings = params.infersent.encode(sentences,
                                             bsize=params.batch_size,
                                             tokenize=False)
        return embeddings

    # final params
    params_senteval = {
        'task_path': PATH_TO_DATA,
        'usepytorch': True,
        'kfold': 10
    }
    params_senteval['classifier'] = {
        'nhid': 0,
        'optim': 'adam',
        'batch_size': 64,
        'tenacity': 5,
        'epoch_size': 4
    }

    # prototype params to speed up, for development only
    if use_prototype_senteval:
        params_senteval = {
            'task_path': PATH_TO_DATA,
            'usepytorch': True,
            'kfold': 5
        }
        params_senteval['classifier'] = {
            'nhid': 0,
            'optim': 'rmsprop',
            'batch_size': 128,
            'tenacity': 3,
            'epoch_size': 2
        }

    params_senteval['infersent'] = esnli_net.encoder
    params_senteval['infersent'].set_glove_path(GLOVE_PATH)

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    results = se.eval(transfer_tasks)
    print("results ", results)

    # Aggregates over the accuracy tasks; micro is weighted by dataset size.
    macro_dev = 0
    micro_dev = 0
    n_total_dev = 0

    macro_test = 0
    micro_test = 0
    n_total_test = 0

    # Sum of (our score - InferSent baseline) over the comparable metrics.
    delta = 0

    for task in transfer_tasks:
        if task in accuracy_tasks:
            if task == 'MRPC':
                headers.append('MRPC-acc')
                row_dev.append(round(results[task]['devacc'], 1))
                row_test.append(round(results[task]['acc'], 1))

                headers.append('MRPC-F1')
                row_test.append(round(results[task]['f1'], 1))
                row_dev.append(" ")

                # MRPC is compared to the baseline via F1, not accuracy.
                delta += results[task]['f1'] - infersent_allnli['MRPC_f1']
            else:
                headers.append(task)
                row_test.append(round(results[task]['acc'], 1))
                row_dev.append(round(results[task]['devacc'], 1))
                delta += results[task]['acc'] - infersent_allnli[task]

            macro_test += round(results[task]['acc'], 1)
            micro_test += round(results[task]['ntest'] * results[task]['acc'],
                                1)
            n_total_test += results[task]['ntest']

            macro_dev += round(results[task]['devacc'], 1)
            micro_dev += round(results[task]['ndev'] * results[task]['devacc'],
                               1)
            n_total_dev += results[task]['ndev']

        elif task == "SICKRelatedness":
            headers.append('SICK-R_pearson')
            row_test.append(round(results[task]['pearson'], 3))
            row_dev.append(round(results[task]['devpearson'], 3))
            # Correlation is in [0, 1]; scale by 100 to be comparable with
            # the accuracy deltas.
            delta += 100 * (results[task]['pearson'] - infersent_allnli[task])

        elif task == "STS14":
            # STS14 has no dev split, hence the blank dev cells.
            headers.append('STS14_pearson')
            row_dev.append(" ")
            row_test.append(round(results[task]['all']['pearson']['mean'], 2))

            headers.append('STS14_spearman')
            row_test.append(round(results[task]['all']['spearman']['mean'], 2))
            row_dev.append(" ")

        elif task == "STSBenchmark":
            headers.append('STSB_pearson')
            row_dev.append(round(results[task]['devpearson'], 3))
            row_test.append(round(results[task]['pearson'], 3))

            headers.append('STSB_spearman')
            row_test.append(round(results[task]['spearman'], 3))
            row_dev.append(" ")

            delta += round(100 * results[task]['spearman'],
                           1) - infersent_allnli['STSB_spearman']

        elif task == "ImageCaptionRetrieval":
            headers += [
                'Caption_retrival_R1', 'Caption_retrival_R5',
                'Caption_retrival_R10', 'Caption_retrival_Medr',
                'Image_retrival_R1', 'Image_retrival_R5', 'Image_retrival_R10',
                'Image_retrival_Medr'
            ]
            # Retrieval is test-only: 8 blank dev cells, then the 2x4 grid
            # of (caption, image) retrieval metrics.
            for i in range(8):
                row_dev.append(" ")
            for j in range(2):
                for i in range(4):
                    row_test.append(results[task]['acc'][j][i])

    headers.append('Delta')
    # Average over the 10 baseline comparisons accumulated in `delta`
    # (8 accuracy tasks — MRPC via F1 — plus SICK-R and STSB spearman).
    delta = round(delta / 10, 2)
    row_dev.append("")
    row_test.append(delta)

    headers.append('MACRO')
    row_dev.append(round(macro_dev / len(accuracy_tasks), 1))
    row_test.append(round(macro_test / len(accuracy_tasks), 1))

    headers.append('MICRO')
    row_dev.append(round(micro_dev / n_total_dev, 1))
    row_test.append(round(micro_test / n_total_test, 1))

    if train_snli_classif:
        # Ignore the trained classifier(or it might not even be trained if alpha=0) and train the same architecure of MLP classifier on top of the learned embeddings. For the case when we had trained a classifier, let's see how the new one compares to it.
        params_senteval = {
            'task_path': PATH_TO_DATA,
            'usepytorch': True,
            'kfold': 10
        }
        params_senteval['classifier'] = {
            'nhid': params.fc_dim,
            'optim': 'adam',
            'batch_size': 128,
            'tenacity': 5,
            'epoch_size': 4
        }
        params_senteval['infersent'] = esnli_net.encoder
        params_senteval['infersent'].set_glove_path(GLOVE_PATH)

        se = senteval.engine.SE(params_senteval, batcher, prepare)
        results = se.eval(['SNLI'])
        print("results SNLI classif trained with SentEval ", results)

        headers.append('SNLI_train_classif')
        row_dev.append(round(results['SNLI']['devacc'], 1))
        row_test.append(round(results['SNLI']['acc'], 1))

    writer.writerow(headers)
    writer.writerow(row_dev)
    writer.writerow(row_test)
    dev_f.close()
# ---------------------------------------------------------------------------
# Example #2
# ---------------------------------------------------------------------------
# Seed every RNG source so runs are reproducible.
random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

# ALL DATA: some splits are only needed for eval, but we load them here
# since we want to build the GloVe vocabulary once.
preproc = params.preproc_expl + "_"
train = get_train(params.esnli_path, preproc, params.min_freq, params.n_train)

snli_dev = get_dev_test_with_expl(params.esnli_path, 'dev', preproc,
                                  params.min_freq)

# Encoder vocabulary: every sentence seen in train or dev.
all_sentences = train['s1'] + train['s2'] + train['expl_1'] + snli_dev[
    's1'] + snli_dev['s2'] + snli_dev['expl_1'] + snli_dev[
        'expl_2'] + snli_dev['expl_3']
word_vec = build_vocab(all_sentences, GLOVE_PATH)

# Decoder vocabulary: words appearing in explanations.
expl_sentences_train = train['expl_1']
word_index_train = get_word_dict(expl_sentences_train)
# NOTE(review): `snli_test` is not defined earlier in this script —
# confirm the test split is loaded before this point in the full file.
expl_sentences = train['expl_1'] + snli_dev['expl_1'] + snli_dev[
    'expl_2'] + snli_dev['expl_3'] + snli_test['expl_1'] + snli_test[
        'expl_2'] + snli_test['expl_3']
word_index = get_word_dict(expl_sentences)
params.word_index = word_index

# Words that occur in explanations overall but not in train explanations.
# Fixed: Python 2 print statement -> print() call, matching the print()
# calls used elsewhere in this file.
print("difference ", set(word_index.keys()) - set(word_index_train.keys()))
if params.n_train == -1:
    # there may be some words that appear in premise and hypothesis of train as well as in expl of dev or test but not in explanation of train. There was only one as far as i looked but if there are too many we should maybe take care of this.
    assert len(
        word_index) - len(word_index_train) < 5, "n words in train " + str(
            len(word_index_train)) + " while n words in total " + str(