def get_chrf_score(self, text1, text2):
    """Score a single pair of texts with ``chrf_score.corpus_chrf``.

    Each text is wrapped in a one-element list, with ``text1`` passed as the
    first argument and ``text2`` as the second, and the scorer is called with
    ``min_len=2``.

    Returns:
        The value returned by ``chrf_score.corpus_chrf``, or ``0.`` when the
        scorer raises ``ZeroDivisionError``.
    """
    try:
        score = chrf_score.corpus_chrf([text1], [text2], min_len=2)
    except ZeroDivisionError:
        # The scorer divided by zero (presumably nothing scorable was left
        # in the pair); report that as a zero score instead of failing.
        score = 0.
    return score
def _pair_loss(voc, prepared_question, prepared_answer, encoder, decoder,
               embedding, device):
    """Return the model loss for one (question, answer) pair."""
    # batch2TrainData receives a list of pairs; a single pair is passed here.
    (input_variable, lengths, target_variable, mask,
     max_target_len) = batch2TrainData(
         voc, [[prepared_question, prepared_answer]])
    # The trailing 1 is forwarded unchanged from the original call
    # (presumably the batch size -- confirm against calculate_loss).
    return calculate_loss(input_variable, lengths, target_variable, mask,
                          max_target_len, encoder, decoder, embedding,
                          device, 1)


def calculate_evaluation_metrics(eval_file_name,
                                 voc,
                                 encoder,
                                 decoder,
                                 embedding,
                                 N,
                                 k,
                                 delimiter,
                                 device,
                                 skip_indices=None,
                                 print_indices=None,
                                 morfessor=None):
    """Evaluate the encoder/decoder on a multiple-choice CSV file.

    Every evaluated row supplies a question (column ``TEXT``) and a
    ``|``-separated list of answer choices (column ``CHOICE_SENTENCES``).
    The first choice is treated as the correct answer: its loss feeds the
    cross-entropy figures and its text is the reference for BLEU/chrF,
    scored against a hypothesis generated with greedy search.

    Args:
        eval_file_name: Path of the CSV file, read with ``pandas.read_csv``.
        voc: Vocabulary object; ``word2count`` and ``index2word`` are read.
        encoder: Encoder passed to the searcher, loss and evaluate helpers.
        decoder: Decoder passed to the searcher, loss and evaluate helpers.
        embedding: Embedding forwarded to ``calculate_loss``.
        N: Number of answer choices ranked per row; each row must contain at
            least this many choices.
        k: Size of the "top k" window; must not exceed ``N``.
        delimiter: Column separator of the CSV file.
        device: Device forwarded to ``calculate_loss`` and ``evaluate``.
        skip_indices: Row indices to leave out of the evaluation.
        print_indices: Row indices whose hypothesis is printed in readable
            form at the end.
        morfessor: Optional model; when truthy, questions and answers are
            segmented with ``morfenize_fi`` before being prepared.

    Returns:
        Tuple ``(fraction_of_correct_firsts, fraction_of_N_choose_k,
        cross_entropy, char_cross_entropy, bleu_word, bleu_morf, chrf_word,
        chrf_morf)``.

    Raises:
        AssertionError: If a row has fewer than ``N`` choices or ``N < k``.
        ZeroDivisionError: If no row was evaluated.
    """
    # Avoid mutable default arguments; an empty tuple supports `in` as well.
    skip_indices = () if skip_indices is None else skip_indices
    print_indices = () if print_indices is None else print_indices

    spacy_fi = Finnish()
    searcher = GreedySearchDecoder(encoder, decoder)
    # Fallback text for questions/answers that are empty after preparation.
    most_common_word = max(voc.word2count.items(),
                           key=operator.itemgetter(1))[0]

    true_first = 0
    true_top_k = 0
    corpus_hypothesis = []
    corpus_references = []
    true_answer_losses = []
    hypotheses_for_humans = []

    df = pd.read_csv(eval_file_name, sep=delimiter, engine='python')
    for index, row in df.iterrows():
        if index in skip_indices:
            continue

        # TODO what if question or answer is zero, make sure it is not in
        # create file?
        raw_question = row['TEXT'].strip()
        question = raw_question
        if morfessor:
            question = morfenize_fi(question, morfessor, spacy_fi)

        answers = row['CHOICE_SENTENCES'].split('|')
        assert len(
            answers
        ) >= N, "CSV file does not have enough choices for value of given N"
        assert N >= k, "N is not larger than or equal k"
        # Rank exactly the N requested choices (was a hard-coded 10, which
        # silently ignored the N the assertions and metrics refer to).
        answers = answers[:N]

        prepared_question = prepare_sentence(question, voc)
        if not prepared_question:
            prepared_question = most_common_word

        losses = []
        for answer_position, answer in enumerate(answers):
            answer = answer.strip()
            if morfessor:
                answer = morfenize_fi(answer, morfessor, spacy_fi)
            prepared_answer = prepare_sentence(answer, voc)
            if not prepared_answer:
                prepared_answer = most_common_word

            if answer_position == 0:
                # The first choice is the correct answer: record its loss
                # with its lengths (for character-normalised cross entropy)
                # and collect reference/hypothesis for BLEU and chrF.
                correct_answer_length_char = max(len(prepared_answer), 1)
                correct_answer_length_tokens = max(
                    len(prepared_answer.split(' ')), 1)
                # The loss of the correct answer is deliberately computed
                # here and again below, as in the original implementation.
                loss = _pair_loss(voc, prepared_question, prepared_answer,
                                  encoder, decoder, embedding, device)
                true_answer_losses.append([
                    loss, correct_answer_length_char,
                    correct_answer_length_tokens
                ])

                hypothesis = evaluate(encoder,
                                      decoder,
                                      searcher,
                                      voc,
                                      prepared_question,
                                      device,
                                      max_length=MAX_LENGTH)
                try:
                    first_EOS_index = hypothesis.index(
                        voc.index2word[EOS_token])
                except ValueError:
                    # No EOS was generated: keep up to MAX_LENGTH tokens.
                    first_EOS_index = MAX_LENGTH
                hypothesis = hypothesis[:first_EOS_index]
                corpus_hypothesis.append(hypothesis)

                if index in print_indices:
                    hypothesis_string = " ".join(
                        morf_list_to_word_list(hypothesis))
                    hypotheses_for_humans.append(
                        [str(index), raw_question, hypothesis_string])

                corpus_references.append(answer.split())

            losses.append(
                _pair_loss(voc, prepared_question, prepared_answer, encoder,
                           decoder, embedding, device))

        loss_array = np.asarray(losses)
        # Position 0 is the correct answer; a lower loss means a better rank.
        if np.argmin(loss_array) == 0:
            true_first += 1
        if 0 in loss_array.argsort()[:k]:
            true_top_k += 1

    evaluated_rows = len(true_answer_losses)
    fraction_of_correct_firsts = true_first / evaluated_rows
    fraction_of_N_choose_k = true_top_k / evaluated_rows

    # Columns: loss, answer length in characters, answer length in tokens.
    np_true_answer_losses = np.asarray(true_answer_losses)
    # Perplexities, if needed, are np.exp() of the two values below.
    cross_entropy = np.mean(np_true_answer_losses[:, 0])
    token_to_character_modifier = (np_true_answer_losses[:, 2] /
                                   np_true_answer_losses[:, 1])
    char_cross_entropy = np.mean(np_true_answer_losses[:, 0] *
                                 token_to_character_modifier)

    # NOTE(review): if corpus_bleu is NLTK's, it expects a *list of
    # references* per hypothesis ([[ref_tokens], ...]); one flat token list
    # per row is passed here. Confirm which corpus_bleu is imported.
    bleu_morf = corpus_bleu(corpus_references, corpus_hypothesis)
    chrf_morf = corpus_chrf(corpus_references, corpus_hypothesis)

    corpus_references_word = [
        morf_list_to_word_list(sentence) for sentence in corpus_references
    ]
    corpus_hypothesis_word = [
        morf_list_to_word_list(sentence) for sentence in corpus_hypothesis
    ]
    print(corpus_hypothesis_word)
    print("FOR HUMANS")
    for answer_for_human in hypotheses_for_humans:
        print(" --- ".join(answer_for_human))

    bleu_word = corpus_bleu(corpus_references_word, corpus_hypothesis_word)
    chrf_word = corpus_chrf(corpus_references_word, corpus_hypothesis_word)

    return (fraction_of_correct_firsts, fraction_of_N_choose_k,
            cross_entropy, char_cross_entropy, bleu_word, bleu_morf,
            chrf_word, chrf_morf)
def corpus_chrF_plus(references, hypotheses):
    """Return the corpus-level score from ``chrf_score.corpus_chrf``.

    Both arguments are forwarded unchanged, ``references`` first and
    ``hypotheses`` second. The original author describes the result as the
    macro-average of sentence-level chrF+ scores.
    """
    corpus_score = chrf_score.corpus_chrf(references, hypotheses)
    return corpus_score
# NOTE(review): this span starts in the middle of a definition. `match`,
# `acc`, `all`, `resultItem`, `result`, `smoothFunction`, `rouge` and the
# pendingCalc* / editDistanceSimilarityList collections are all defined
# outside the visible code, and the nesting below is reconstructed from a
# whitespace-collapsed line -- confirm it against the real file.

# Count an exact match and tag the current result item.
if match:
    acc += 1
    resultItem.append('---Match---' + '\n')
# `all` is used as the running total of compared items (it shadows the
# builtin of the same name).
all += 1
resultItem.append('\n')
result.append(resultItem)

# Exact-match ratio over everything counted so far.
print('ExactMatch:{} all:{} match:{}'.format(acc / all, all, acc))

# Corpus BLEU with the weight split evenly over the first two of four
# weights (presumably 1-gram and 2-gram orders), smoothed with method3.
corpusBLEU = corpus_bleu(pendingCalcBLEUGroundTruth, pendingCalcBLEUCandidate, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothFunction.method3)
print('CorpusBLEU: {}'.format(corpusBLEU))

# Corpus-level chrF of the candidates against the ground truth.
corpusCHRF = corpus_chrf(pendingCalcCHRFGroundTruth, pendingCalcCHRFCandidate)
print('CorpusCHRF: {}'.format(corpusCHRF))

# Averaged ROUGE scores; candidates are passed first, ground truth second.
# Only the 'f' entry of each ROUGE variant is reported.
avgROUGE = rouge.get_scores(pendingCalcROUGECandidate, pendingCalcROUGEGroundTruth, avg=True)
avgROUGE1 = avgROUGE['rouge-1']['f']
avgROUGE2 = avgROUGE['rouge-2']['f']
avgROUGEl = avgROUGE['rouge-l']['f']
print('AvgROUGE1:{}'.format(avgROUGE1))
print('AvgROUGE2:{}'.format(avgROUGE2))
print('AvgROUGEl:{}'.format(avgROUGEl))

# Mean of the collected edit-distance similarities, as a plain float.
avgEditDistanceSimilarity = float(np.mean(editDistanceSimilarityList))
print('AvgEditDistanceSimilarity:{}'.format(avgEditDistanceSimilarity))
# Test the model: decode ground truth and predictions, then score them.
if REVERSE:
    x_val = np.flip(x_val, axis=1)
preds = model.predict(x_val, verbose=0)
print(preds)

# Decoded sentences as strings (used for chrF). Entry j of both lists always
# belongs to the same validation sample.
simpl_sentences_h = []  # decoded from y_val
simpl_sentences_r = []  # decoded from the model predictions
# The same sentences as token lists (used for BLEU).
simpl_tokens_h = []
simpl_tokens_r = []

for i, pred in enumerate(preds):
    print('Source: ', end='')
    try:
        tokens_h = [indx2w[np.argmax(w)] for w in y_val[i]]
    except KeyError:
        # An index missing from the vocabulary: skip this sample.
        continue
    sentence_h = ''.join(tokens_h)
    print(sentence_h)

    print('Target: ', end='')
    try:
        if ATTENTION:
            tokens_r = [indx2w[np.argmax(w)] for w in pred]
        else:
            tokens_r = [indx2w[w] for w in pred]
    except KeyError:
        # Skip the whole sample. Nothing has been stored for it yet, so the
        # two sentence lists cannot get out of step.
        continue
    sentence_r = ''.join(tokens_r)
    print(sentence_r)
    print()

    # Store the pair only once both sides decoded successfully.
    simpl_sentences_h.append(sentence_h)
    simpl_sentences_r.append(sentence_r)
    simpl_tokens_h.append(tokens_h)
    simpl_tokens_r.append(tokens_r)

print('Chrf score: %s' % corpus_chrf(simpl_sentences_h, simpl_sentences_r))
# corpus_bleu (NLTK signature assumed) takes, for every hypothesis, a list of
# tokenised references plus the tokenised hypothesis. Passing plain strings
# made it treat every single character of a sentence as a separate reference.
print('Bleu score: %s' % corpus_bleu([[tokens] for tokens in simpl_tokens_h],
                                     simpl_tokens_r))
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.translate.chrf_score import corpus_chrf

# Small demo: score one tokenised sentence (s2) against two tokenised
# sentences (s1) with corpus-level BLEU and chrF.
s1 = [
    ['I', 'am', 'a', 'boy', 'dressed', 'in', 'white', 'shirt', 'black',
     'shoe'],
    ['boy', 'with', 'black', 'hair', 'suit'],
]
s2 = [
    ['I', 'am', 'boy', 'in', 'black', 'suit', 'grey', 'hair'],
]

# String forms for the chrF call: tokens joined with single spaces.
ch_s1 = s1[0]
ch_s2 = " ".join(s2[0])
ch_st_s1 = list(map(" ".join, s1))

# BLEU: both sentences of s1 form the reference set of the single sentence
# in s2; weights (1, 0, 0, 0) put all weight on the first n-gram order.
print(corpus_bleu([s1], s2, weights=(1, 0, 0, 0)))

# chrF: the list of both joined s1 strings is passed as the one reference
# entry for the joined s2 string.
print(corpus_chrf([ch_st_s1], [ch_s2]))