def get_metric(self, reset: bool = False):
    metrics = {"BLEU": bleu.list_bleu([self.ref_list], self.hypo_list)}
    # Compute score for each pair and rouge type
    rouge_list: List[List[Score]] = list(
        map(self._unpack_rouge, self.ref_list, self.hypo_list))
    for i, current_rouge in enumerate(zip(*rouge_list)):
        for j, metric_name in enumerate(self.metric_mapping.keys()):
            metrics[
                f"{self.rouge.rouge_types[i].upper()} {self.metric_mapping[metric_name]}"] = (
                    sum(score[j] for score in current_rouge) /
                    len(current_rouge) * 100)
    if reset:
        self.reset()
    return metrics
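# get_metric above relies on state set up elsewhere in its class: the accumulated
# ref_list/hypo_list, a ROUGE scorer, the _unpack_rouge helper, and metric_mapping.
# A minimal sketch of that surrounding context, assuming Google's rouge-score
# package; the class name and the metric_mapping values are illustrative, not taken
# from the source.
from typing import Dict, List

import bleu
from rouge_score import rouge_scorer
from rouge_score.scoring import Score


class GenerationMetric:  # illustrative name
    def __init__(self, rouge_types=("rouge1", "rouge2", "rougeL")):
        self.rouge = rouge_scorer.RougeScorer(list(rouge_types))
        # Keys must follow the field order of rouge-score's Score namedtuple
        # (precision, recall, fmeasure), since get_metric indexes scores positionally.
        self.metric_mapping: Dict[str, str] = {
            "precision": "P",
            "recall": "R",
            "fmeasure": "F1",
        }
        self.ref_list: List[str] = []
        self.hypo_list: List[str] = []

    def _unpack_rouge(self, ref: str, hypo: str) -> List[Score]:
        # One Score per rouge type, ordered like self.rouge.rouge_types.
        scores = self.rouge.score(ref, hypo)
        return [scores[t] for t in self.rouge.rouge_types]

    def reset(self) -> None:
        self.ref_list, self.hypo_list = [], []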
def __call__(self, prediction):
    preds = prediction.predictions
    preds_size = prediction.predictions_size
    label_ids = prediction.label_ids
    label_size = prediction.label_size
    p_start, l_start = 0, 0
    correct, total = 0, 0
    ref = []
    hyp = []
    if is_rank_0():
        fout = open(self.output_path, "w")
    for idx, (p_size, l_size) in enumerate(zip(preds_size, label_size)):
        p_end = p_start + p_size
        l_end = l_start + l_size
        pred = self.get_sequence(preds[p_start:p_end])
        label = self.get_sequence(label_ids[l_start:l_end])
        p_start = p_end
        l_start = l_end
        if pred == label:
            correct += 1
        total += 1
        if is_rank_0():
            pred_text = self.tokenizer.decode(
                pred,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True).strip()
            label_text = self.tokenizer.decode(
                label,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True).strip()
            ref.append(label_text)
            hyp.append(pred_text)
            fout.write(
                json.dumps({
                    "idx": idx,
                    "pred": pred_text,
                    "label": label_text
                }) + "\n")
    score = list_bleu([ref], hyp)
    return {
        "bleu": score,
        "accuracy": correct / total,
        "correct": correct,
        "total": total
    }
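# The __call__ above expects `prediction` to carry flat, concatenated token arrays
# plus per-example lengths. A small standalone sketch of that layout; the shapes and
# values are assumptions for illustration, not the actual trainer output.
from types import SimpleNamespace

import numpy as np

prediction = SimpleNamespace(
    predictions=np.array([11, 12, 13, 21, 22]),  # example 0: [11, 12, 13]; example 1: [21, 22]
    predictions_size=[3, 2],
    label_ids=np.array([11, 12, 13, 21, 99]),
    label_size=[3, 2],
)

# Same slicing scheme as in __call__: walk the flat array using per-example sizes.
p_start = 0
for p_size in prediction.predictions_size:
    print(prediction.predictions[p_start:p_start + p_size])
    p_start += p_size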
                  K.TOKENIZER_TRG_PATH, True, encoder, decoder)
    target_hyp_sequences.append(hyp)

# pad predicted target sequences
target_hyp_sequences = pad_sequences(target_hyp_sequences,
                                     padding="post",
                                     maxlen=K.SEQUENCE_LENGTH)
target_hyp_sentences = [
    target_tokenizer.decode_ids(s.tolist()) for s in target_hyp_sequences
]

print("Computing Scores.")

# compute BLEU score
bleu_score = list_bleu([target_ref_sentences], target_hyp_sentences)

# compute average ROUGE score
rouge = Rouge()
rouge_scores = rouge.get_scores(target_hyp_sentences,
                                target_ref_sentences,
                                avg=True)

# compute average accuracy score
# important: padding tokens in both sequences would yield a higher accuracy score.
# To avoid this, for each reference and hypothesis pair, we compute the indices of
# the first occurrence of a padding token id, and then take the higher index. We then
# truncate the sequence pair at that index, so that padding tokens have minimal
# influence on the accuracy score.

# truncate both sequences
# will store sequences with removed padding tokens
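# The truncation code announced by the comments above is cut off in this excerpt.
# A rough sketch of one way to implement it, assuming the padded reference id
# sequences exist alongside target_hyp_sequences and that the padding id is known;
# the names target_ref_sequences and pad_token_id below are assumptions.
import numpy as np


def truncated_accuracy(ref_sequences, hyp_sequences, pad_token_id=0):
    """Token accuracy with padding influence minimized, as described above."""
    correct_tokens, total_tokens = 0, 0
    for ref_seq, hyp_seq in zip(ref_sequences, hyp_sequences):
        ref_seq, hyp_seq = np.asarray(ref_seq), np.asarray(hyp_seq)
        # index of the first padding token (full length if there is no padding)
        ref_pad = int(np.argmax(ref_seq == pad_token_id)) if (ref_seq == pad_token_id).any() else len(ref_seq)
        hyp_pad = int(np.argmax(hyp_seq == pad_token_id)) if (hyp_seq == pad_token_id).any() else len(hyp_seq)
        # take the higher of the two indices and truncate both sequences there
        cut = max(ref_pad, hyp_pad)
        correct_tokens += int((ref_seq[:cut] == hyp_seq[:cut]).sum())
        total_tokens += cut
    return correct_tokens / total_tokens if total_tokens else 0.0

# e.g. accuracy = truncated_accuracy(target_ref_sequences, target_hyp_sequences)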
files = [open(i, "r") for i in filenames]
cnt = 0

#for line in test_file:
#    print(cnt, " : ")
#    preds = model.predict([line])
#    cnt += 1
#    pred_file.write(preds[0] + "\n")

for rows in zip(*files):
    preds = model.predict([rows[0]])
    hyp = rows[1][:-1]
    result = ''
    max_score = -1
    for ele in preds[0]:
        bleu_score = list_bleu([ele], [hyp])
        if bleu_score > max_score:
            max_score = bleu_score
            result = ele
    cnt += 1
    pred_file.write(result + "\n")
    print(cnt, " : ", max_score, " : ", result)
from bleu import list_bleu

ref = ['it is a white cat .', 'wow , this dog is huge .']
ref1 = ['This cat is white .', 'wow , this is a huge dog .']
hyp = ['it is a white kitten .', 'wowww , the dog is huge !']
hyp1 = ["it 's a white kitten .", 'wow , this dog is huge !']

assert 34.99 == list_bleu([ref], hyp)
assert 34.99 == list_bleu(ref, hyp)
assert 57.91 == list_bleu([ref, ref1], hyp1)

from bleu import multi_list_bleu

assert [34.99, 53.28] == multi_list_bleu(ref, [hyp, hyp1])
assert [34.99, 57.91] == multi_list_bleu([ref, ref1], [hyp, hyp1])

# if you want to get files that saved the detokenized version of your input lists
bleus, ref_files, hyp_files = multi_list_bleu([ref, ref1], [hyp, hyp1],
                                              return_files=True)
print(ref_files)
# ['TMP_DIR/ref0.txt', 'TMP_DIR/ref1.txt']
print(hyp_files)
# ['TMP_DIR/hyp0.txt', 'TMP_DIR/hyp1.txt']

assert 39.76 == list_bleu([ref], hyp, detok=False)

# or if you want to test multiple hypotheses
assert [39.76, 47.47] == multi_list_bleu([ref, ref1], [hyp, hyp1], detok=False)

from bleu import file_bleu

hyp_file, hyp_file1 = hyp_files
#             machine_n_gram[j] = 0
#
#     #print (sum(machine_n_gram.values()), c)
#     clipped_precision_score.append(sum(machine_n_gram.values()) / c)
#
# #print (clipped_precision_score)
#
# weights = [0.25] * 4
#
# s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, clipped_precision_score))
# s = BP * math.exp(math.fsum(s))
# return s

# original = "It is a guide to action which ensures that the military alwasy obeys the command of the party"
# machine_translated = "It is the guiding principle which guarantees the military forces alwasy being under the command of the party"

# print (bleu_score(original, machine_translated))
# print (sentence_bleu([original.split()], machine_translated.split()))

from bleu import list_bleu

ref = ['it is a white cat .', 'wow , this dog is huge .']
ref1 = ['This cat is white .', 'wow , this is a huge dog .']
hyp = ['it is a white kitten .', 'wowww , the dog is huge !']
hyp1 = ["it 's a white kitten .", 'wow , this dog is huge !']

list_bleu([ref], hyp)
list_bleu([ref, ref1], hyp1)
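# For comparison with the commented-out manual computation above (clipped n-gram
# precision, uniform 4-gram weights, brevity penalty), here is a minimal runnable
# sentence-level BLEU check with NLTK on the same example pair. This is only an
# illustrative cross-check, not the bleu_score helper referenced above; smoothing is
# added because short sentences often have zero higher-order n-gram matches.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = "It is a guide to action which ensures that the military always obeys the command of the party"
candidate = "It is the guiding principle which guarantees the military forces always being under the command of the party"

smooth = SmoothingFunction().method1
nltk_bleu = sentence_bleu([reference.split()], candidate.split(),
                          weights=(0.25, 0.25, 0.25, 0.25),
                          smoothing_function=smooth)
print(f"NLTK sentence BLEU: {nltk_bleu:.4f}")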
TOP_K = 5
HIDDEN_DIM = 32
BATCH_SIZE = 16
CHUNK_SIZE = 32
TRAIN_PATH = "data/french.txt"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dataset = CorpusDataset(TRAIN_PATH, CHUNK_SIZE, BATCH_SIZE)
model = Model(EMBEDDING_DIM, HIDDEN_DIM, len(dataset.vocabulary), device)
model.load_state_dict(torch.load('models/french.pt'))
model.eval()

input_sequence = "Le"

# running BLEU evaluation
ref = [
    top_k(model, input_sequence.split(), 25, dataset.word_to_integer,
          dataset.integer_to_word, TOP_K, sample=True)
]
hyp = [
    'Le comportement de la',
]
output = list_bleu([ref], hyp)

with open('results/turing.txt', 'w+') as f:
    f.write("BLEU Metric: " + str(output))
F1       : 0.18038845327110314
Precision: 0.26704042769616537
Recall   : 0.14676986008799853
'''

"""### BLEU SCORE"""

!pip install bleu

from bleu import list_bleu

KPs = [r[1] for r in RougeScores]
Args = [r[2] for r in RougeScores]
BlueScore = list_bleu([Args], KPs)
BlueScore

"""BLEU Score Result: 0.98 (below 1.0%!!!)

### BERTScore
"""

!pip install bert-score

from bert_score import score

def calc_bert_score(cands, refs):
    Precision, Recall, F1 = score(cands, refs, lang="en", verbose=True)
    return Precision.numpy().tolist(), Recall.numpy().tolist(), F1.numpy().tolist()
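# calc_bert_score is defined above but never invoked in this excerpt. A hypothetical
# usage sketch on the same lists used for BLEU (list_bleu([Args], KPs) treats Args as
# references and KPs as hypotheses, so the same roles are assumed here). bert-score
# downloads a pretrained model on first use, so this needs network access.
P, R, F1_scores = calc_bert_score(cands=KPs, refs=Args)

# per-example scores; average for a corpus-level number
avg_f1 = sum(F1_scores) / len(F1_scores)
print(f"BERTScore F1 (avg): {avg_f1:.4f}")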