Example #1
0
def eval_meteor_file(run_file, ref_file, tokenizer=None, detokenizer=None):
    """Compute a corpus-level METEOR score from two tab-separated files.

    Every line in both files must carry exactly four tab-separated fields;
    fields 1 and 2 identify the example and field 3 is the text.  System
    outputs are keyed by ``<id>##<>##<segment>``; the reference file may
    contribute several references for the same id.

    Args:
        run_file: path to the system-output TSV (utf-8).
        ref_file: path to the reference TSV (utf-8).
        tokenizer: optional callable applied to each reference text.
        detokenizer: optional callable applied to the tokenizer's output;
            only used when ``tokenizer`` is given.

    Returns:
        dict with a single key ``'METEOR'`` holding the rounded percentage
        (``rounder`` is a module-level helper — presumably numeric rounding;
        verify at its definition).
    """
    run_dict = {}
    with codecs.open(run_file, encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip('\r\n').split('\t', 3)
            assert len(fields) == 4
            run_dict[fields[1] + '##<>##' + fields[2]] = fields[3]

    ref_dict = {}
    with codecs.open(ref_file, encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip('\r\n').split('\t', 3)
            assert len(fields) == 4
            tokenized = fields[3]
            if tokenizer is not None:
                tokenized = detokenizer(tokenizer(fields[3]))
            # Group all references that share the same example id.
            ref_dict.setdefault(fields[1], []).append(tokenized)

    # BUG FIX: the accumulator was previously initialised *inside* the
    # second ``with`` block at a misleading indentation level; it belongs
    # at function scope, before the scoring loop.
    meteor = 0.
    for run_id in run_dict:  # [text1(,text2)] vs text
        meteor += meteor_score(ref_dict[run_id.split('##<>##')[0]],
                               run_dict[run_id])
    return {'METEOR': rounder(meteor * 100 / len(run_dict))}
Example #2
0
def batch_meteor(comments, predicts, nl_i2w):
    """Score every (reference, hypothesis) pair in a batch with METEOR.

    ``batch_evaluate`` converts the raw batches into parallel token lists;
    an empty hypothesis is given a score of 0 instead of being handed to
    the scorer.

    Returns:
        list of per-example METEOR scores, aligned with the references.
    """
    references, hypothesises = batch_evaluate(comments, predicts, nl_i2w)
    scores = []
    for idx, reference in enumerate(references):
        hypothesis = hypothesises[idx]
        # Guard: the scorer cannot handle an empty hypothesis.
        if not hypothesis:
            scores.append(0)
            continue
        _, _, _, score = meteor_score(' '.join(hypothesis),
                                      ' '.join(reference))
        scores.append(score)
    return scores
Example #3
0
def evaluate_on_file(path):
    """Evaluate every prediction dump found directly under *path*.

    Each eligible file is expected to be JSON (loaded via the project's
    ``load_json``) containing records with keys ``node_len``, ``predict``
    and ``true``.  Per-file BLEU-4, Rouge-L and METEOR are computed, length-
    bucketed BLEU curves are written to TensorBoard, and a summary line is
    printed.

    NOTE(review): assumes *path* ends with a path separator — ``file_path``
    is built by plain string concatenation; confirm against callers.
    """
    files = os.listdir(path)
    for file in files:
        # Skip hidden/system entries (e.g. '.DS_Store') and plain-text
        # files; only the JSON prediction dumps are evaluated.
        if file.startswith('.D') or file.endswith('.txt'):
            continue
        # One TensorBoard run per dump, named after the file stem.
        writer = SummaryWriter(logdir='test_log/' + file.split('.')[0] + '/')
        file_path = path + file
        data = load_json(file_path)

        # BLEU buckets: node length in steps of 5 (0..100 -> keys 0..20)
        # and comment length capped at 30 (keys 0..30).
        node_len_bleu = {i: [] for i in range(21)}
        com_len_bleu = {i: [] for i in range(31)}
        bleu = []
        rouge_all = []
        meteor_all = []

        for i, d in enumerate(data):
            # Clamp the node count to 100, then bucket by 5.
            node_len = min(int(d['node_len']), 100)
            node_len = int(node_len / 5)

            predict = d['predict'].strip().split()
            true = d['true'].split()

            com_len = len(true)
            if com_len > 30:
                com_len = 30

            # Degenerate pairs (near-empty prediction or very short
            # reference) get a hard 0 instead of a smoothed BLEU.
            if len(predict) <= 1 or com_len < 4:
                score = 0
            else:
                score = sentence_bleu(
                    [true],
                    predict,
                    smoothing_function=SmoothingFunction().method4)

            # Rouge-L / METEOR operate on the raw strings here; the first
            # two rouge values (precision, recall) are discarded.
            _, _, rouge_s = rouge_l_score(d['predict'], d['true'])
            meteor_s = meteor_score(d['predict'], d['true'])

            node_len_bleu[node_len].append(score)
            com_len_bleu[com_len].append(score)
            bleu.append(score)
            rouge_all.append(rouge_s)
            meteor_all.append(meteor_s)

        # The node-length curve is suppressed for this one dump —
        # presumably it lacks meaningful node lengths; confirm upstream.
        if file not in ['transformer_seq.json']:
            for key, value in node_len_bleu.items():
                if len(value) == 0 or key == 0:
                    continue
                score = np.mean(value)
                # x-axis is the original (un-bucketed) node length.
                writer.add_scalar('node_len_bleu', score, (key) * 5)
        for key, value in com_len_bleu.items():
            if len(value) == 0:
                continue
            score = np.mean(value)
            # Skip all-zero buckets so they don't flatten the curve.
            if score == 0:
                continue
            writer.add_scalar('com_len_bleu', score, key)

        writer.close()
        # round_3 must return a string here (it is concatenated with '+');
        # presumably a 3-decimal formatted mean — verify at its definition.
        print(
            file.split('.')[0] + ': bleu:' + round_3(bleu) + ', rouge:' +
            round_3(rouge_all) + ', meteor:' + round_3(meteor_all))
def main(args):
    """Decode the test set with a saved Seq2Seq checkpoint and report
    BLEU-4, METEOR and Rouge-L.

    Args:
        args: argparse namespace carrying hj_method, kr_method, batch_size,
            beam_size, hidden_size, embed_size, vocab_size, max_len, pad_id,
            n_layers and stop_ix.
    """
    # Setting
    warnings.simplefilter("ignore", UserWarning)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Args Parser
    hj_method = args.hj_method
    kr_method = args.kr_method
    batch_size = args.batch_size
    beam_size = args.beam_size
    hidden_size = args.hidden_size
    embed_size = args.embed_size
    vocab_size = args.vocab_size
    max_len = args.max_len
    padding_index = args.pad_id
    n_layers = args.n_layers
    stop_ix = args.stop_ix

    # Load saved model & Word2vec: pick the lexicographically last
    # checkpoint in the save directory.
    save_path = 'save_{}_{}_{}_maxlen_{}'.format(vocab_size, hj_method,
                                                 kr_method, max_len)
    save_list = sorted(glob.glob(f'./save/{save_path}/*.*'))
    save_pt = save_list[-1]
    print('Will load {} pt file...'.format(save_pt))
    word2vec_hj = Word2Vec.load('./w2v/word2vec_hj_{}_{}.model'.format(
        vocab_size, hj_method))

    # SentencePiece model load
    spm_kr = spm.SentencePieceProcessor()
    spm_kr.Load("./spm/m_korean_{}.model".format(vocab_size))

    # Test data load
    with open('./test_dat.pkl', 'rb') as f:
        test_dat = pickle.load(f)

    test_dataset = CustomDataset(test_dat['test_hanja'],
                                 test_dat['test_korean'])
    test_loader = getDataLoader(test_dataset,
                                pad_index=padding_index,
                                shuffle=False,
                                batch_size=batch_size)

    # Model load
    print('Model loading...')
    encoder = Encoder(vocab_size,
                      embed_size,
                      hidden_size,
                      word2vec_hj,
                      n_layers=n_layers,
                      padding_index=padding_index)
    decoder = Decoder(embed_size,
                      hidden_size,
                      vocab_size,
                      n_layers=n_layers,
                      padding_index=padding_index)
    # BUG FIX: use the device selected above instead of a hard .cuda()
    # call, so CPU-only hosts don't crash.
    seq2seq = Seq2Seq(encoder, decoder, beam_size).to(device)
    print(seq2seq)

    print('Testing...')
    start_time = time.time()
    results = test(seq2seq,
                   test_loader,
                   vocab_size,
                   load_pt=save_pt,
                   stop_ix=stop_ix)
    print(time.time() - start_time)
    print('Done!')

    print("Decoding...")
    pred_list = list()
    for result_text in tqdm(results):
        text = torch.Tensor(result_text).squeeze().tolist()
        text = [int(x) for x in text]
        prediction_sentence = spm_kr.decode_ids(
            text).strip()  # Decode with strip
        pred_list.append(prediction_sentence)
    ref_list = list()
    # Only the first stop_ix references were decoded by test(); align.
    for ref_text in tqdm(test_dat['test_korean'][:stop_ix]):
        ref_list.append(spm_kr.decode_ids(ref_text).strip())
    print('Done!')

    with open(f'./save/{save_path}/test_result.pkl', 'wb') as f:
        pickle.dump({
            'pred': pred_list,
            'reference': ref_list,
        }, f)
    # BUG FIX: the message named the wrong file; report the actual path.
    print(f'Save file; ./save/{save_path}/test_result.pkl')

    # Calculate BLEU Score
    # BUG FIX: the metrics below used test_dat['reference'] and
    # test_dat['pred'], keys that do not exist in test_dat (it holds
    # 'test_hanja'/'test_korean'); score the freshly built ref_list /
    # pred_list instead.
    print('Calculate BLEU4, METEOR, Rogue-L...')
    chencherry = SmoothingFunction()
    bleu4 = corpus_bleu(ref_list,
                        pred_list,
                        smoothing_function=chencherry.method4)
    print('BLEU Score is {}'.format(bleu4))

    # Calculate METEOR Score
    meteor = meteor_score(ref_list, pred_list)
    print('METEOR Score is {}'.format(meteor))

    # Calculate Rouge-L Score
    r = Rouge()
    total_test_length = len(ref_list)
    precision_all = 0
    recall_all = 0
    f_score_all = 0
    for i in range(total_test_length):
        [precision, recall, f_score] = r.rouge_l([ref_list[i]],
                                                 [pred_list[i]])
        precision_all += precision
        recall_all += recall
        f_score_all += f_score
    # BUG FIX: '.foramt' typos would raise AttributeError at runtime.
    print('Precision : {}'.format(round(precision_all / total_test_length, 4)))
    print('Recall : {}'.format(round(recall_all / total_test_length, 4)))
    print('F Score : {}'.format(round(f_score_all / total_test_length, 4)))