def evaluate_specific(train_network, input_lang, pair, name='tracking_pair'):
    """Decode one source sentence and print it with its BLEU score.

    Args:
        train_network: Model object forwarded unchanged to ``evaluate``.
        input_lang: Source-language object forwarded unchanged to ``evaluate``.
        pair: Two-item sequence; ``pair[0]`` is the input sentence and
            ``pair[1]`` is the reference sentence.
        name: Not used by this function; kept so existing callers keep working.
    """
    print('>', pair[0])
    print('=', pair[1])
    output_words = evaluate(train_network, input_lang, pair[0])
    output_sentence = ' '.join(output_words)
    print('<', output_sentence)

    # corpus_bleu takes (list_of_references, hypotheses): every hypothesis is a
    # token list and gets its own list of reference token lists. The previous
    # call passed the joined output string as the references and the raw
    # reference as the hypothesis, which scored character n-grams.
    reference = pair[1]
    # presumably pair[1] is a whitespace-separated string; a pre-tokenised
    # sequence is accepted as well.
    if isinstance(reference, str):
        reference_tokens = reference.split()
    else:
        reference_tokens = list(reference)
    print('BLEU Score',
          bleu_score.corpus_bleu([[reference_tokens]], [output_words]))
def evaluate(self):
    """Score ``self.model`` over ``self.data_loader`` using ``self.measure``.

    Supported measures:
        'accuracy'  - token-level accuracy over all padded, concatenated
                      predictions and targets.
        'BLEU'      - corpus-level BLEU with one reference per prediction.
        'sent_BLEU' - arithmetic mean of per-sentence BLEU scores.

    Returns:
        The metric value computed by the selected scorer.

    Raises:
        ValueError: if ``self.measure`` is not one of the supported names.
        ZeroDivisionError: for 'sent_BLEU' when the loader yields no sentences.
    """
    if self.measure == 'accuracy':
        ys_pred = []
        ys_true = []
        for inputs, targets in self.data_loader:
            predicted = self.model.predict(inputs)
            # Equalise lengths when the model over- or under-generates.
            predicted, targets = self._pad(predicted, targets)
            ys_pred.append(np.concatenate(predicted))
            ys_true.append(np.concatenate(targets))
        ys_pred = np.concatenate(ys_pred)
        ys_true = np.concatenate(ys_true)
        return accuracy_score(ys_true, ys_pred)

    if self.measure == 'BLEU':
        ys_pred = []
        ys_true = []
        for inputs, targets in self.data_loader:
            ys_pred += [np.array(p) for p in self.model.predict(inputs)]
            # The extra leading axis makes each target a one-element list of
            # references, which is the nesting corpus_bleu expects.
            ys_true += [np.array(t)[np.newaxis, :] for t in targets]
        return bleu_score.corpus_bleu(ys_true, ys_pred)

    if self.measure == 'sent_BLEU':
        sent_bleues = []
        for inputs, targets in self.data_loader:
            predicted = self.model.predict(inputs)
            # NOTE(review): called as (candidate, target). NLTK's
            # sentence_bleu takes (references, hypothesis) -- confirm which
            # sentence_bleu is bound in this module.
            sent_bleues += [
                sentence_bleu(np.array(cand), np.array(tgt))
                for cand, tgt in zip(predicted, targets)
            ]
        return sum(sent_bleues) / len(sent_bleues)

    # The old message misspelled 'accuracy' and hid the rejected value.
    raise ValueError(
        "measure must be one of ['accuracy', 'BLEU', 'sent_BLEU'], "
        "got {!r}".format(self.measure))
def evaluate_specific(self, in_seq, out_seq, in_len, person, name='tracking_pair'):
    """Decode one dialogue turn, print it with its BLEU score, plot attention.

    Args:
        in_seq: Sequence of token indices for the input utterance.
        out_seq: Sequence of token indices for the reference response.
        in_len: Length value forwarded to ``self.evaluate``.
        person: Indexable pair; ``person[0]`` / ``person[1]`` are printed via
            ``.data`` and ``person[0].view(1, 1)`` is passed to the model.
        name: Forwarded to ``self.help_fn.show_attention``.
    """
    dialogue = [self.index2word[j] for j in in_seq]
    response = [self.index2word[j] for j in out_seq]
    print('>', dialogue, 'By :', person[0].data)
    print('=', response, 'By :', person[1].data)

    _, output_words, attentions = self.evaluate(
        [in_seq], [out_seq], [in_len], person[0].view(1, 1))

    # Keep the output up to and including the first '<EOS>'; if the model
    # never emitted it, keep the whole sequence.
    try:
        target_index = output_words[0].index('<EOS>') + 1
    except ValueError:
        target_index = len(output_words[0])
    output_words = output_words[0][:target_index]
    attentions = attentions[0, :target_index, :].view(target_index, -1)

    output_sentence = ' '.join(output_words)
    print('<', output_sentence)

    # corpus_bleu takes (list_of_references, hypotheses) with token lists.
    # The previous call passed the joined output string as the references and
    # the reference tokens as the hypothesis.
    print('BLEU Score', bleu_score.corpus_bleu([[response]], [output_words]))
    self.help_fn.show_attention(dialogue, output_words, attentions, name=name)
def evaluate_specific(model, in_seq, out_seq, in_len, person, senti, types, types_2, index2word):
    """Evaluate a single sentence: loss, per-order BLEU and distinct-n stats.

    Args:
        model: Object exposing ``evaluate(in_seqs, out_seqs, in_lens, persons,
            sentis, criterion)`` that returns ``(loss, output_words)``.
        in_seq: Input token indices.
        out_seq: Reference token indices.
        in_len: Length value forwarded to ``model.evaluate``.
        person: Value forwarded to ``model.evaluate``.
        senti: Value forwarded to ``model.evaluate``.
        types: Accumulator passed to and returned by ``distinct1``.
        types_2: Accumulator passed to and returned by ``distinct2``.
        index2word: Mapping from token index to token string.

    Returns:
        Tuple ``(loss_eva, bleu1, bleu2, bleu3, bleu4, num_token, types,
        num_token_2, types_2)``.
    """
    response = [index2word[j] for j in out_seq]
    criterion = nn.NLLLoss(ignore_index=0)
    # Only a single sentence is run here, hence the one-element batches.
    loss_eva, output_words = model.evaluate(
        [in_seq], [out_seq], [in_len], [person], [senti], criterion)

    # Keep the output up to and including the first '<EOS>'; if the model
    # never emitted it, keep the whole sequence.
    try:
        target_index = output_words[0].index('<EOS>') + 1
    except ValueError:
        target_index = len(output_words[0])
    output_words = output_words[0][:target_index]

    # corpus_bleu takes (list_of_references, hypotheses) with token lists.
    # Passing [response] treated each token as a separate reference, and the
    # joined output string was scored character by character.
    references = [[response]]
    hypotheses = [output_words]
    # One-hot weights isolate the precision of a single n-gram order.
    bleu1, bleu2, bleu3, bleu4 = [
        bleu_score.corpus_bleu(references, hypotheses, weights=order_weights)
        for order_weights in (
            (1, 0, 0, 0),
            (0, 1, 0, 0),
            (0, 0, 1, 0),
            (0, 0, 0, 1),
        )
    ]

    num_token, types = distinct1(output_words, types)
    num_token_2, types_2 = distinct2(output_words, types_2)
    return loss_eva, bleu1, bleu2, bleu3, bleu4, num_token, types, num_token_2, types_2
def calc_bleu(refs, hyps):
    """Compute the corpus-level BLEU score.

    Each sequence is cut just before its first ``word2id['<EOS>']`` element.
    Sequences without that element are used whole.

    :param refs: list, reference translations; a list of token lists.
        Elements are compared against ``word2id['<EOS>']``, so they are
        presumably token ids -- confirm against the caller.
    :param hyps: list, translations generated by the model, in the same
        format as ``refs``.
    :return: float, BLEU score (0 to 100)
    """
    eos = word2id['<EOS>']

    def _strip_eos(seq):
        # Previously only hypotheses tolerated a missing EOS; a reference
        # without one raised ValueError from list.index.
        return seq[:seq.index(eos)] if eos in seq else seq

    # corpus_bleu wants a list of references per hypothesis, hence the
    # one-element wrapper around each reference.
    refs = [[_strip_eos(ref)] for ref in refs]
    hyps = [_strip_eos(hyp) for hyp in hyps]
    return 100 * bleu_score.corpus_bleu(refs, hyps)
def calc_bleu(self, refs: List[List[T]], hyps: List[List[T]]):
    """Compute the corpus-level BLEU score, scaled by 100.

    Each sequence is cut just before its first ``self.EOS`` element.
    Sequences without that element are used whole.

    Args:
        refs: reference sentences split by word / word_idx
        hyps: generated sentences split by word / word_idx

    Returns:
        bleu_score (float): [0, 100] score (higher is better)
    """
    eos = self.EOS

    def _strip_eos(seq):
        # Previously only hypotheses tolerated a missing EOS; a reference
        # without one raised ValueError from list.index.
        return seq[:seq.index(eos)] if eos in seq else seq

    # corpus_bleu wants a list of references per hypothesis, hence the
    # one-element wrapper around each reference.
    refs = [[_strip_eos(ref)] for ref in refs]
    hyps = [_strip_eos(hyp) for hyp in hyps]
    return 100 * bleu_score.corpus_bleu(
        refs, hyps, smoothing_function=self.smoothing_function)
def Validation(transformer, n, epoch):
    """Decode the first ``n`` dev pairs and return corpus BLEU scaled by 100.

    Args:
        transformer: Model forwarded to ``evaluate``.
        n: Number of leading entries of the module-level ``pairs_dev`` to use.
        epoch: Not used by this function; kept for caller compatibility.

    Returns:
        ``100 * corpus_bleu`` over all decoded sentences, without smoothing.
    """
    with torch.no_grad():
        inputs, outputs = map(list, zip(*pairs_dev))
        dev_pairs = [
            tensorsFromPair(inputs[i].split(" "), outputs[i].split(" "))
            for i in range(n)
        ]
        dataloader = torch.utils.data.DataLoader(
            MyDataset(dev_pairs),
            batch_size=50,
            shuffle=False,
            num_workers=0,
            collate_fn=my_collate_fn,
        )

        references = []
        hypotheses = []
        # Decoded sentences are matched to references purely by position;
        # this relies on shuffle=False above.
        cnt = 0
        for batch_data in tqdm(dataloader, ascii=True):
            dev_pair = batch_data[0]
            batch = len(dev_pair[0])
            input_tensor = torch.tensor(
                dev_pair[0], dtype=torch.long, device=device)
            output_words = evaluate(transformer, input_tensor, batch, 0)
            for sentence in output_words:
                # One reference per hypothesis, hence the extra list level.
                references.append([outputs[cnt].split(" ")])
                # An empty decode becomes a single empty token.
                hypotheses.append(sentence.split(" ") if sentence else [""])
                cnt += 1

        return 100 * bleu_score.corpus_bleu(references, hypotheses)
#!/usr/bin/env python
# -*- coding: utf8 -*-
# for python3
# Compute corpus-level BLEU: txt1.txt holds the references and txt2.txt the
# MT output, one sentence per line, aligned by line number.
from nltk import word_tokenize
from nltk import bleu_score
from nltk.translate.bleu_score import SmoothingFunction

cc = SmoothingFunction()

# Context managers make sure both files are closed once they have been read.
with open("txt1.txt", encoding='utf-8') as ref_file:
    txt1 = ref_file.read().splitlines()
with open("txt2.txt", encoding='utf-8') as hyp_file:
    txt2 = hyp_file.read().splitlines()

if len(txt2) < len(txt1):
    raise ValueError(
        "txt2.txt has fewer lines ({}) than txt1.txt ({})".format(
            len(txt2), len(txt1)))

# corpus_bleu expects a list of references for every hypothesis. Without the
# extra list level each reference token would be treated as a separate
# reference and scored character by character.
ref = [[word_tokenize(line)] for line in txt1]
# As before, lines of txt2 beyond the length of txt1 are ignored.
hyp = [word_tokenize(line) for line in txt2[:len(txt1)]]

print(bleu_score.corpus_bleu(ref, hyp, smoothing_function=cc.method7))

# Per-sentence scores could be produced with
# bleu_score.sentence_bleu(ref[i], hyp[i], smoothing_function=cc.method7)
# and written to a file such as bleu.txt.
# Toy check of sentence-level BLEU: one reference, one candidate, with the
# weight split evenly between unigram and bigram precision.
# NOTE(review): sentence_bleu is not imported in this fragment -- presumably
# an earlier notebook cell imports it; confirm.
reference = [['this', 'is', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']
score = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0.0))
print(score)


# In[126]:


from nltk import bleu_score
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import corpus_bleu

# `actual` and `predicted` are not defined in this fragment.
# NOTE(review): corpus_bleu expects each entry of `reference` to be a list of
# reference token lists and each entry of `hypothesis` to be a token list --
# confirm the shapes of `actual` and `predicted`.
# This rebinds `reference`, replacing the toy value from the cell above.
reference = actual
hypothesis = predicted
smoothie = SmoothingFunction().method4
print('bleu_score.corpus_bleu(reference, hypothesis): {0}'.
      format(bleu_score.corpus_bleu(reference,hypothesis,smoothing_function=smoothie)))


# In[1]:


import nltk
import pkg_resources
# Bare expression: looks up the installed nltk version string. Presumably
# relied on notebook display; as a plain script the value is discarded.
pkg_resources.get_distribution("nltk").version


# In[15]:


# Attention model
# Training data: encode column 1 of `train` with the German tokenizer.
# NOTE(review): encode_text, ger_tokenizer, ger_max and train are defined
# outside this fragment.
train_X = encode_text(ger_tokenizer,ger_max,train[:,1])
def calc_bleu(refs, hyps):
    """Compute the corpus-level BLEU score, scaled by 100.

    Each sequence is cut just before its first ``EOS`` element. Sequences
    without that element are used whole.

    Args:
        refs: List of reference sequences (lists), one per hypothesis.
        hyps: List of generated sequences (lists).

    Returns:
        ``100 * corpus_bleu`` over the truncated sequences.
    """
    def _strip_eos(seq):
        # Previously only hypotheses tolerated a missing EOS; a reference
        # without one raised ValueError from list.index.
        return seq[:seq.index(EOS)] if EOS in seq else seq

    # corpus_bleu wants a list of references per hypothesis, hence the
    # one-element wrapper around each reference.
    _refs = [[_strip_eos(ref)] for ref in refs]
    _hyps = [_strip_eos(hyp) for hyp in hyps]
    return 100 * bleu_score.corpus_bleu(_refs, _hyps)