def get_chrf_score(self, text1, text2):
    """Return the chrF score between two strings.

    Relies on `from nltk.translate import chrf_score`.
    """
    try:
        return chrf_score.corpus_chrf([text1], [text2], min_len=2)
    except ZeroDivisionError:
        # chrF can divide by zero when the two strings share no character
        # n-grams of length >= min_len.
        return 0.0
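For reference, a minimal standalone call (NLTK assumed installed); the except clause above guards against the ZeroDivisionError that the underlying NLTK code can raise when the inputs share no character n-grams of the required length:

from nltk.translate import chrf_score

# Hypothetical strings; identical inputs score close to 1.0.
print(chrf_score.corpus_chrf(['the cat sat on the mat'],
                             ['the cat sat on the mat'], min_len=2))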
Example #2
# Library imports used below; project-level helpers (GreedySearchDecoder,
# batch2TrainData, calculate_loss, evaluate, morfenize_fi,
# morf_list_to_word_list, prepare_sentence, MAX_LENGTH, EOS_token) are
# assumed to come from the surrounding codebase.
import operator

import numpy as np
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.chrf_score import corpus_chrf
from spacy.lang.fi import Finnish
def calculate_evaluation_metrics(eval_file_name,
                                 voc,
                                 encoder,
                                 decoder,
                                 embedding,
                                 N,
                                 k,
                                 delimiter,
                                 device,
                                 skip_indices=(),
                                 print_indices=(),
                                 morfessor=None):
    """Evaluate a seq2seq model on an N-choose-k QA file and return
    (fraction_of_correct_firsts, fraction_of_N_choose_k, cross_entropy,
    char_cross_entropy, bleu_word, bleu_morf, chrf_word, chrf_morf)."""

    spacy_fi = Finnish()
    searcher = GreedySearchDecoder(encoder, decoder)

    most_common_word = max(voc.word2count.items(),
                           key=operator.itemgetter(1))[0]

    true_first = 0
    true_top_k = 0
    corpus_hypothesis = []
    corpus_references = []
    true_answer_losses = []
    hypotheses_for_humans = []

    df = pd.read_csv(eval_file_name, sep=delimiter, engine='python')
    for index, row in df.iterrows():
        if index in skip_indices:
            continue

        question = row['TEXT'].strip()
        # TODO: what if the question or answer is empty? Ensure such rows are
        # filtered out when the evaluation file is created.
        if morfessor:
            question = morfenize_fi(question, morfessor, spacy_fi)

        answers = row['CHOICE_SENTENCES'].split('|')
        assert len(answers) >= N, \
            "CSV file does not have enough choices for the given value of N"
        answers = answers[:N]  # keep only the first N candidate answers
        assert N >= k, "N must be greater than or equal to k"

        losses = []
        prepared_question = prepare_sentence(question, voc)
        if len(prepared_question) == 0:
            prepared_question = most_common_word

        first_answer = True
        for answer in answers:
            answer = answer.strip()
            if morfessor:
                answer = morfenize_fi(answer, morfessor, spacy_fi)

            prepared_answer = prepare_sentence(answer, voc)
            if len(prepared_answer) == 0:
                prepared_answer = most_common_word

            # For the first (true) answer, record its length for character-
            # normalized perplexity and save the ref/hyp pair for BLEU.
            if first_answer:
                correct_answer_length_char = max(len(prepared_answer), 1)
                correct_answer_length_tokens = max(
                    len(prepared_answer.split(' ')), 1)

                # The true answer's loss is also computed in the loop below;
                # the duplication works around an earlier indexing problem.
                evaluation_batch = [
                    batch2TrainData(voc,
                                    [[prepared_question, prepared_answer]])
                ]
                (input_variable, lengths, target_variable, mask,
                 max_target_len) = evaluation_batch[0]

                loss = calculate_loss(input_variable, lengths, target_variable,
                                      mask, max_target_len, encoder, decoder,
                                      embedding, device, 1)
                true_answer_losses.append([
                    loss, correct_answer_length_char,
                    correct_answer_length_tokens
                ])
                first_answer = False

                # Next is for BLEU
                hypothesis = evaluate(encoder,
                                      decoder,
                                      searcher,
                                      voc,
                                      prepared_question,
                                      device,
                                      max_length=MAX_LENGTH)
                try:
                    first_EOS_index = hypothesis.index(
                        voc.index2word[EOS_token])
                except ValueError:
                    # None of the MAX_LENGTH generated tokens is EOS, so the
                    # whole hypothesis is kept.
                    first_EOS_index = MAX_LENGTH
                hypothesis = hypothesis[:first_EOS_index]
                corpus_hypothesis.append(hypothesis)
                if index in print_indices:
                    hypothesis_string = " ".join(
                        morf_list_to_word_list(hypothesis))
                    hypotheses_for_humans.append(
                        [str(index), row['TEXT'].strip(), hypothesis_string])

                answer_in_tokens = answer.split()
                corpus_references.append(answer_in_tokens)

            evaluation_batch = [
                batch2TrainData(voc, [[prepared_question, prepared_answer]])
            ]
            (input_variable, lengths, target_variable, mask,
             max_target_len) = evaluation_batch[0]

            loss = calculate_loss(input_variable, lengths, target_variable,
                                  mask, max_target_len, encoder, decoder,
                                  embedding, device, 1)
            losses.append(loss)
        if np.argmin(np.asarray(losses)) == 0:
            true_first += 1
        if 0 in np.asarray(losses).argsort()[:k]:
            true_top_k += 1

    fraction_of_correct_firsts = true_first / len(true_answer_losses)
    fraction_of_N_choose_k = true_top_k / len(true_answer_losses)

    np_true_answer_losses = np.asarray(true_answer_losses)
    #perplexity = np.exp(np.mean(np_true_answer_losses[:,0]))
    cross_entropy = np.mean(np_true_answer_losses[:, 0])

    token_to_character_modifier = (np_true_answer_losses[:, 2] /
                                   np_true_answer_losses[:, 1])
    #char_perplexity = np.exp(np.mean(np_true_answer_losses[:,0] * token_to_character_modifier))
    char_cross_entropy = np.mean(np_true_answer_losses[:, 0] *
                                 token_to_character_modifier)

    # corpus_bleu expects one list of references per hypothesis.
    bleu_morf = corpus_bleu([[ref] for ref in corpus_references],
                            corpus_hypothesis)
    chrf_morf = corpus_chrf(corpus_references, corpus_hypothesis)

    corpus_references_word = [
        morf_list_to_word_list(sentence) for sentence in corpus_references
    ]
    corpus_hypothesis_word = [
        morf_list_to_word_list(sentence) for sentence in corpus_hypothesis
    ]
    print(corpus_hypothesis_word)  # debug: word-level hypotheses
    print("FOR HUMANS")
    for answer_for_human in hypotheses_for_humans:
        print(" --- ".join(answer_for_human))

    bleu_word = corpus_bleu([[ref] for ref in corpus_references_word],
                            corpus_hypothesis_word)
    chrf_word = corpus_chrf(corpus_references_word, corpus_hypothesis_word)

    return (fraction_of_correct_firsts, fraction_of_N_choose_k, cross_entropy,
            char_cross_entropy, bleu_word, bleu_morf, chrf_word, chrf_morf)
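The argmin/argsort bookkeeping at the end of the row loop can be illustrated with hypothetical loss values (index 0 always holds the true answer):

import numpy as np

losses = [0.9, 1.4, 0.7]
print(np.argmin(np.asarray(losses)) == 0)     # False: answer 2 has the lowest loss
print(0 in np.asarray(losses).argsort()[:2])  # True: the true answer is in the top 2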
Example #3
from nltk.translate import chrf_score


def corpus_chrF_plus(references, hypotheses):
    '''
    Corpus-level chrF score of the hypotheses against their references.

    Note: despite the name, NLTK's chrf_score implements plain chrF
    (character n-grams only), not the chrF+ variant that adds word n-grams.
    '''
    return chrf_score.corpus_chrf(references, hypotheses)
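A quick usage sketch, assuming NLTK is installed (the sentences are hypothetical):

print(corpus_chrF_plus(['the cat sat on the mat'],
                       ['the cat is on the mat']))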
Example #4
    # (Fragment: inside a loop over prediction/ground-truth pairs defined in
    # code elided above; `acc`, `all`, `result`, and `resultItem` are
    # initialised there. Note that `all` shadows the Python builtin.)
    if match:
        acc += 1
        resultItem.append('---Match---' + '\n')
    all += 1
    resultItem.append('\n')
    result.append(resultItem)

print('ExactMatch:{} all:{} match:{}'.format(acc / all, all, acc))

corpusBLEU = corpus_bleu(pendingCalcBLEUGroundTruth,
                         pendingCalcBLEUCandidate,
                         weights=(0.5, 0.5, 0, 0),
                         smoothing_function=smoothFunction.method3)
print('CorpusBLEU: {}'.format(corpusBLEU))

corpusCHRF = corpus_chrf(pendingCalcCHRFGroundTruth, pendingCalcCHRFCandidate)
print('CorpusCHRF: {}'.format(corpusCHRF))

avgROUGE = rouge.get_scores(pendingCalcROUGECandidate,
                            pendingCalcROUGEGroundTruth,
                            avg=True)
avgROUGE1 = avgROUGE['rouge-1']['f']
avgROUGE2 = avgROUGE['rouge-2']['f']
avgROUGEl = avgROUGE['rouge-l']['f']

print('AvgROUGE1:{}'.format(avgROUGE1))
print('AvgROUGE2:{}'.format(avgROUGE2))
print('AvgROUGEl:{}'.format(avgROUGEl))

avgEditDistanceSimilarity = float(np.mean(editDistanceSimilarityList))
print('AvgEditDistanceSimilarity:{}'.format(avgEditDistanceSimilarity))
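The smoothFunction object used above is defined in the elided code before this fragment; presumably something like the following (SmoothingFunction.method3 is NLTK's NIST geometric sequence smoothing):

from nltk.translate.bleu_score import SmoothingFunction

smoothFunction = SmoothingFunction()  # exposes the .method3 passed to corpus_bleu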
Example #5
# Test the model on the validation set. (`model`, `x_val`, `y_val`, `indx2w`,
# REVERSE, and ATTENTION come from the elided training code above.)
if REVERSE:
    x_val = np.flip(x_val, axis=1)
preds = model.predict(x_val, verbose=0)
print(preds)  # debug: raw prediction array
simpl_sentences_h = []
simpl_sentences_r = []
for i in range(len(preds)):
    print('Source: ', end='')
    try:
        s = ''.join([indx2w[np.argmax(w)] for w in y_val[i]])
        simpl_sentences_h.append(s)
        print(s)
    except KeyError:
        continue
    print('Target: ', end='')
    try:
        if ATTENTION:
            s = ''.join([indx2w[np.argmax(w)] for w in preds[i]])
        else:
            s = ''.join([indx2w[w] for w in preds[i]])
        simpl_sentences_r.append(s)
        print(s)
        print()
    except KeyError:
        continue


# corpus_chrf takes (references, hypotheses): the _h list holds ground-truth
# sentences decoded from y_val, the _r list the model's outputs.
print('Chrf score: %s' % corpus_chrf(simpl_sentences_h, simpl_sentences_r))
# corpus_bleu expects one *list* of references per hypothesis; the strings
# above were joined without spaces, so BLEU here runs over character sequences.
print('Bleu score: %s' %
      corpus_bleu([[ref] for ref in simpl_sentences_h], simpl_sentences_r))
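As a sanity check on corpus_bleu's expected argument shapes, a hypothetical exact-match pair (one list of references per hypothesis, token lists inside) scores 1.0:

from nltk.translate.bleu_score import corpus_bleu

refs = [['the cat sat on the mat'.split()]]  # one hypothesis -> one reference list
hyps = ['the cat sat on the mat'.split()]
print(corpus_bleu(refs, hyps))  # 1.0: all 1-4 gram precisions are perfect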
Example #6
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.chrf_score import corpus_chrf
s1 = [['I', 'am', 'a', 'boy', 'dressed', 'in', 'white', 'shirt', 'black', 'shoe'],
      ['boy', 'with', 'black', 'hair', 'suit']]
s2 = [['I', 'am', 'boy', 'in', 'black', 'suit', 'grey', 'hair']]
ch_s2 = ' '.join(s2[0])
ch_st_s1 = [' '.join(i) for i in s1]
# One hypothesis (s2[0]) scored against two references; unigram-only BLEU.
print(corpus_bleu([s1], s2, weights=(1, 0, 0, 0)))
# NLTK's chrF takes a single reference per hypothesis; passing the list
# ch_st_s1 effectively joins both sentences into one long reference string.
print(corpus_chrf([ch_st_s1], [ch_s2]))
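Since there is only one hypothesis here, sentence_bleu (imported above) gives the same number when handed both references directly:

print(sentence_bleu(s1, s2[0], weights=(1, 0, 0, 0)))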