import numpy as np


def bleu_score_enc_dec(encoder, decoder, src, tar, batch_size=64):
    '''
    Computes the BLEU score of an encoder-decoder translation model:
    runs batched NMT inference over the source sequences, collects the
    predicted token ids into a preallocated array, strips trailing zero
    padding from both targets and predictions, and scores the predictions
    against the targets with smoothing enabled.
    '''
    n_batches = src.shape[0] // batch_size
    pred = np.zeros((batch_size * n_batches, tar.shape[1]), dtype=np.int32)
    # nmt_infer_generator yields (source, target) batches; only the source
    # side is needed for inference here.
    for b, (s, _) in enumerate(nmt_infer_generator(src, tar, batch_size)):
        pred[b * batch_size:(b + 1) * batch_size] = nmt_infer(
            encoder, decoder, s)
    # Only full batches are decoded, so truncate the references to match;
    # then drop trailing padding zeros so they do not affect the score.
    tar = [np.trim_zeros(t, trim='b') for t in tar[:batch_size * n_batches]]
    pred = [np.trim_zeros(p, trim='b') for p in pred]
    return bleu_score(tar, pred, smooth=True)
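# Quick self-contained illustration of the padding-stripping step above
# (an added sketch, not part of the original pipeline): np.trim_zeros
# with trim='b' removes only *trailing* zeros, so a zero id that appears
# inside a sequence is preserved.
_padded = np.array([5, 9, 2, 0, 0, 0], dtype=np.int32)
assert np.trim_zeros(_padded, trim='b').tolist() == [5, 9, 2]
assert np.trim_zeros(np.array([5, 0, 2, 0]), trim='b').tolist() == [5, 0, 2]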
from nltk.tokenize import word_tokenize  # assumption: NLTK's word tokenizer
import Levenshtein  # python-Levenshtein package


def get_pairwise_edits(text_before, text_after, tokenizer):
    '''
    Aligns each sentence of text_before with its best-matching sentence in
    text_after, searching a +/- w_size window around the same position, and
    keeps the pairs that look like genuine edits: BLEU high enough that the
    sentences match, below 1.0 so something actually changed, and
    Levenshtein distance above a minimum so the change is non-trivial.
    Returns (before, after, preceding context, following context) tuples.
    '''
    min_bleu = 0.5
    min_leven = 3
    sen_before = [sen.strip() for sen in tokenizer.tokenize(text_before)]
    sen_after = [sen.strip() for sen in tokenizer.tokenize(text_after)]
    # Alternative sentence splitting with a spaCy pipeline:
    # sen_before = [sent.string.strip() for sent in tokenizer(text_before).sents]
    # sen_after = [sent.string.strip() for sent in tokenizer(text_after).sents]
    w_size = 4
    edits = set()
    for i in range(len(sen_before)):
        start = max(0, i - w_size)
        end = min(i + w_size, len(sen_after))
        nei_bleus = []
        match_idx = []
        prev_sents_tok = word_tokenize(sen_before[i])
        # prev_sents_tok = sen_before[i].split()
        for j in range(start, end):
            post_sents_tok = word_tokenize(sen_after[j])
            # post_sents_tok = sen_after[j].split()
            bleu = bleu_score(prev_sents_tok, post_sents_tok)
            nei_bleus.append(bleu)
            match_idx.append(j)
        if not nei_bleus:
            continue
        max_bleu = max(nei_bleus)
        idx = nei_bleus.index(max_bleu)
        lev_dist = Levenshtein.distance(sen_before[i],
                                        sen_after[match_idx[idx]])
        if min_bleu < max_bleu < 1.0 and lev_dist > min_leven:
            # Keep the neighbouring sentences as context, or 'NA' at the edges.
            context_before = 'NA' if i == 0 else sen_before[i - 1]
            context_after = ('NA' if i == len(sen_before) - 1
                             else sen_before[i + 1])
            edits.add((sen_before[i], sen_after[match_idx[idx]],
                       context_before, context_after))
    return list(edits)
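# A minimal usage sketch for get_pairwise_edits (an added example, left
# commented out; the NLTK punkt tokenizer below is an assumption -- any
# object with a .tokenize(text) method works, and bleu_score must already
# be in scope). It prints whichever aligned sentence pairs pass the BLEU
# and Levenshtein thresholds:
#
#     import nltk
#     nltk.download('punkt')
#     sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')
#     before = 'We trained the model for ten epochs on the small dataset. Accuracy was low.'
#     after = 'We trained the model for twenty epochs on the full dataset. Accuracy was low.'
#     for old, new, prev_ctx, next_ctx in get_pairwise_edits(before, after, sent_tok):
#         print(old, '->', new)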
import copy
import math

import torch


def run(self):
    self.model.eval()
    total_bleu = 0
    total_f1 = 0
    total_dist1 = 0
    total_dist2 = 0
    total_loss = 0
    print('Run eval...')
    with torch.no_grad():
        for batch_idx, feature in enumerate(self.test_iter):
            utils.feature_to_device(feature, self.device)

            out, out_lm = self.model(feature)
            # Debug: peek at one predicted token from each output head.
            print(self.vocab.itos(out[3, 0].argmax(dim=0).item()),
                  self.vocab.itos(out_lm[3, 0].argmax(dim=0).item()))
            loss, loss_lm = models.AR.loss(self.out_loss_fn, out, out_lm,
                                           feature.resp, feature.lm.y)
            print(loss, loss_lm)
            loss = loss + self.model_config.alpha * loss_lm
            total_loss += loss.item()

            # target includes w1, w2 ... [EOS]; len: max_seq_length + 1
            target = copy.deepcopy(feature.resp[1:])  # feature will be changed
            pred, pred_padded = utils.sample_sequence(
                feature, self.vocab, self.model, self.args)

            pred_tokens = [[self.vocab.itos(k) for k in ks] for ks in pred]
            target_tokens = [[[self.vocab.itos(k) for k in ks]]
                             for ks in target.T.tolist()]

            print('----------------------------------')
            print('Context: ', ''.join(
                [self.vocab.itos(k) for k in feature.context.T.tolist()[0]]))
            print('LM x: ', ''.join(
                [self.vocab.itos(k) for k in feature.lm.x.T.tolist()[0]]))
            print('LM y: ', ''.join(
                [self.vocab.itos(k) for k in feature.lm.y.T.tolist()[0]]))
            print('Pred: ', ''.join(
                [self.vocab.itos(k) for k in pred_padded.T.tolist()[0]]))
            print('Target: ', ''.join(target_tokens[0][0]))
            print('Pred: ', ''.join(
                [self.vocab.itos(k) for k in pred_padded.T.tolist()[-1]]))
            print('Target: ', ''.join(target_tokens[-1][0]))
            print('----------------------------------')

            bleu = metrics.bleu_score(pred_tokens, target_tokens)
            f1 = metrics.f1_score(pred_padded.T.to('cpu'), target.T.to('cpu'))
            # dist1 = metrics.distinct_score([v[:-1] for v in pred])
            dist1 = metrics.distinct_score(pred_tokens)
            dist2 = metrics.distinct_score(pred_tokens, 2)

            total_bleu += bleu
            total_f1 += f1
            total_dist1 += dist1
            total_dist2 += dist2

    l = len(self.test_iter)
    bleu = total_bleu / l
    f1 = total_f1 / l
    dist1 = total_dist1 / l
    dist2 = total_dist2 / l
    # https://stackoverflow.com/questions/59209086/calculate-perplexity-in-pytorch
    # see per-word perplexity:
    # https://github.com/huggingface/transfer-learning-conv-ai/blob/master/convai_evaluation.py#L161
    # https://github.com/facebookresearch/ParlAI/blob/56d46551190a7ffaedccd13534412d43bc7076e5/parlai/scripts/eval_ppl.py
    ppl = math.exp(total_loss / l)

    print(f'\tBleu: {bleu:.8f} | F1: {f1:.8f} | '
          f'Dist1: {dist1:.3f} | Dist2: {dist2:.3f} | PPL: {ppl:7.3f}')
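# Note on the perplexity computed above (an added sketch, not project
# code): total_loss accumulates one loss value per batch, so
# total_loss / len(test_iter) is the mean batch loss, and perplexity is
# its exponential. A self-contained check with a hypothetical loss value:
avg_loss = 2.0  # hypothetical mean cross-entropy over the test iterator
print(f'PPL: {math.exp(avg_loss):7.3f}')  # prints 'PPL:   7.389'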