def test_corpus_bleu(self): ref_file = find('models/wmt15_eval/ref.ru') hyp_file = find('models/wmt15_eval/google.ru') mteval_output_file = find('models/wmt15_eval/mteval-13a.output') # Reads the BLEU scores from the `mteval-13a.output` file. # The order of the list corresponds to the order of the ngrams. with open(mteval_output_file, 'r') as mteval_fin: # The numbers are located in the last 2nd line of the file. # The first and 2nd item in the list are the score and system names. mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1]) with io.open(ref_file, 'r', encoding='utf8') as ref_fin: with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin: # Whitespace tokenize the file. # Note: split() automatically strip(). hypothesis = list(map(lambda x: x.split(), hyp_fin)) # Note that the corpus_bleu input is list of list of references. references = list(map(lambda x: [x.split()],ref_fin)) # Without smoothing. for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores): nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0/i,)*i) # Check that the BLEU scores difference is less than 0.005 . # Note: This is an approximate comparison; as much as # +/- 0.01 BLEU might be "statistically significant", # the actual translation quality might not be. assert abs(mteval_bleu - nltk_bleu) < 0.005 # With the same smoothing method used in mteval-v13a.pl chencherry = SmoothingFunction() for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores): nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0/i,)*i, smoothing_function=chencherry.method3) assert abs(mteval_bleu - nltk_bleu) < 0.005
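# A minimal, self-contained sketch (with made-up sentences, not the WMT15 data above) of the
# weight convention this test relies on: weights=(1.0/n,)*n spreads uniform weight over orders
# 1..n, i.e. plain BLEU-n, and chencherry.method3 is the smoothing applied in the second loop.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]  # one hypothesis, one reference
hypotheses = [['the', 'cat', 'sat', 'on', 'a', 'mat']]

chencherry = SmoothingFunction()
for n in range(1, 5):
    score = corpus_bleu(references, hypotheses,
                        weights=(1.0 / n,) * n,
                        smoothing_function=chencherry.method3)
    print('BLEU-%d: %.4f' % (n, score))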
def main(): logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) args = setup_args() logging.info(args) f = codecs.open('report-%s.csv'% args.model, 'w') csv_f = csv.writer(f, delimiter=',', encoding='utf-8') src_lines = codecs.open(args.src, 'r', 'utf-8').readlines() src_lines_nounk = codecs.open(args.src + '.nounk', 'r', 'utf-8').readlines() target_lines = codecs.open(args.target, 'r', 'utf-8').readlines() target_lines_nounk = codecs.open(args.target + '.nounk', 'r', 'utf-8').readlines() gold_lines = codecs.open(args.gold, 'r', 'utf-8').readlines() gold_lines_nounk = codecs.open(args.gold + '.nounk', 'r', 'utf-8').readlines() data = ['Src', 'Src_UNK', 'Target_UNK', 'Target', 'Gold_UNK', 'Gold', 'BLEU1'] csv_f.writerow(data) num_lines = len(gold_lines) logging.info('Num Lines: %d'% num_lines) references = [] hypotheses = [] for index in range(num_lines): data = [] data.append(src_lines_nounk[index].strip()) data.append(src_lines[index].strip()) data.append(target_lines[index].strip()) data.append(target_lines_nounk[index].strip()) data.append(gold_lines[index].strip()) data.append(gold_lines_nounk[index].strip()) gold = gold_lines[index].strip().split() output = target_lines[index].strip().split() default = 'UNK UNK UNK UNK'.split() if len(output) < 4: bleu_score = 0.0 hypotheses.append(default) else: bleu_score = sentence_bleu([gold], output, weights=(1.0,)) hypotheses.append(output) references.append([gold]) logging.info('sentence:%d bleu:%f'%(index, bleu_score)) data.append(str(bleu_score)) csv_f.writerow(data) final_bleu = corpus_bleu(references, hypotheses) unigram_bleu = corpus_bleu(references, hypotheses, weights=(1.0,)) logging.info('Final BLEU: %f Unigram_BLEU: %f '% (final_bleu, unigram_bleu))
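# Note that final_bleu above is not the average of the per-sentence scores logged in the loop:
# corpus_bleu pools n-gram counts (and the brevity penalty) over the whole set. A tiny
# illustration with made-up tokens:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

references = [[['a', 'b', 'c', 'd']], [['e', 'f', 'g', 'h']]]
hypotheses = [['a', 'b', 'c', 'd'], ['e', 'x']]

avg_sentence = sum(sentence_bleu(r, h, weights=(1.0,))
                   for r, h in zip(references, hypotheses)) / len(hypotheses)
pooled = corpus_bleu(references, hypotheses, weights=(1.0,))
print(avg_sentence, pooled)  # the two aggregates generally differ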
def evaluate(self): bt = time.time() with chainer.no_backprop_mode(): references = [] hypotheses = [] observation = {} with reporter.report_scope(observation): for i in range(0, len(self.test_data), self.batch): src, trg = zip(*self.test_data[i:i + self.batch]) references.extend([[t.tolist()] for t in trg]) src = [chainer.dataset.to_device(self.device, x) for x in src] if self.comm.rank == 0: self.model.translate(src, self.max_length) elif self.comm.rank == 1: ys = [y.tolist() for y in self.model.translate( src, self.max_length)] hypotheses.extend(ys) if self.comm.rank == 1: bleu = bleu_score.corpus_bleu( references, hypotheses, smoothing_function=bleu_score. SmoothingFunction().method1) reporter.report({'bleu': bleu}, self.model) et = time.time() if self.comm.rank == 1: print("BleuEvaluator(single)::evaluate(): " "took {:.3f} [s]".format(et - bt)) sys.stdout.flush() return observation
def bleu_1(decoded, references):
    listed_references = [[s] for s in references]
    bleu_1 = 100 * corpus_bleu(listed_references, decoded,
                               weights=[1.0, 0, 0, 0],
                               smoothing_function=bleu_smoothing)
    return bleu_1
def test_corpus_bleu_with_emulate_multibleu(self):
    hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
    ref = str("Their tasks include changing a pump on the faulty stokehold ."
              "Likewise , two species that are very similar in morphology "
              "were distinguished using genetics .")
    references = [[ref.split()]]
    hypotheses = [hyp.split()]
    try:
        # Check that the warning is raised, since the hypothesis contains 0 counts of 2-gram overlaps.
        with self.assertWarns(UserWarning):
            # Verify that the BLEU output is undesirably high for such a degenerate hypothesis.
            self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.4309, places=4)
    except AttributeError:
        pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
    desired_output = corpus_bleu(references, hypotheses, emulate_multibleu=True)
    assert desired_output == 0.0
def eval(): # Load graph g = Graph(is_training=False) print("Graph loaded") # Load data X, Sources, Targets = load_test_data() de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() # X, Sources, Targets = X[:33], Sources[:33], Targets[:33] # Start session with g.graph.as_default(): sv = tf.train.Supervisor() with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: ## Restore parameters sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) print("Restored!") ## Get model name mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name ## Inference if not os.path.exists('results'): os.mkdir('results') with codecs.open("results/" + mname, "w", "utf-8") as fout: list_of_refs, hypotheses = [], [] for i in range(len(X) // hp.batch_size): ### Get mini-batches x = X[i*hp.batch_size: (i+1)*hp.batch_size] sources = Sources[i*hp.batch_size: (i+1)*hp.batch_size] targets = Targets[i*hp.batch_size: (i+1)*hp.batch_size] ### Autoregressive inference preds = np.zeros((hp.batch_size, hp.maxlen), np.int32) for j in range(hp.maxlen): _preds = sess.run(g.preds, {g.x: x, g.y: preds}) preds[:, j] = _preds[:, j] ### Write to file for source, target, pred in zip(sources, targets, preds): # sentence-wise got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip() fout.write("- source: " + source +"\n") fout.write("- expected: " + target + "\n") fout.write("- got: " + got + "\n\n") fout.flush() # bleu score ref = target.split() hypothesis = got.split() if len(ref) > 3 and len(hypothesis) > 3: list_of_refs.append([ref]) hypotheses.append(hypothesis) ## Calculate bleu score score = corpus_bleu(list_of_refs, hypotheses) fout.write("Bleu Score = " + str(100*score))
def evaluation(data, classifier, normalizer, pca, params, limit=1000): (inputs, references, candidates) = data bleu_references = [[x] for x in references] bleu_hypotheses_baseline = best_baseline(inputs, candidates) baseline_blue = corpus_bleu(bleu_references, bleu_hypotheses_baseline) print("Baseline BLEU: %0.10f" % baseline_blue) bleu_hypotheses_reranking = best_reranking(inputs, candidates, classifier, normalizer, pca, params, limit) reranking_blue = corpus_bleu(bleu_references, bleu_hypotheses_reranking) print("Reranking BLEU: %0.10f" % reranking_blue) blue_diff = reranking_blue - baseline_blue print("BLEU Diff: %0.10f" % blue_diff) return baseline_blue, reranking_blue, blue_diff, bleu_hypotheses_reranking
def compute_BLEU_score_corpus(): print("Generate Captions") print("Loading Vocab") with open('models/vocab_list_'+dataname+'.pkl', 'rb') as f: vocabs = pickle.load(f) for v in vocabs: if v not in vocab: update_vocab(v) generating_model = load_model(rnn_model_name, 'models/'+dataname+'/best_' + rnn_model_name + '_model_'+ str(image_caption_model)+'_output_rnn_'+str(output_rnn_dim)+'_weights_iteration_' + str(iteration) + '.h5') print("Loading Image Caption dict") with open('dataset/'+dataname+'/image_caption_dict.pkl', 'rb') as f: image_caption_dict = pickle.load(f) sentences = [] references = [] i = 0 # Calculate Bleu-n weights = [0.25,0.25,0.25,0.25] for key, value in image_caption_dict.iteritems(): print(str(i)+'/'+str(len(image_caption_dict.keys()))) i += 1 image_path_new = image_path+key result = get_caption(generating_model, image_path_new, value['image_data']) sentences.append(result) reference = [[str(word).lower() for word in x['tokens']] for x in value['sentences']] references.append(reference) corpus_score = corpus_bleu(references, sentences, weights) * 100 print(corpus_score) return corpus_score
def __call__(self, trainer): with chainer.no_backprop_mode(): references = [] hypotheses = [] for i in range(0, len(self.test_data), self.batch): sources, targets = zip(*self.test_data[i:i + self.batch]) references.extend([[t.tolist()] for t in targets]) sources = [ chainer.dataset.to_device(self.device, x) for x in sources] ys = [y.tolist() for y in self.model.translate(sources, self.max_length)] hypotheses.extend(ys) bleu = bleu_score.corpus_bleu( references, hypotheses, smoothing_function=bleu_score.SmoothingFunction().method1) reporter.report({self.key: bleu})
def compute_corpus_level_bleu_score(references: List[List[str]],
                                    hypotheses: List[Hypothesis]) -> float:
    """
    Given decoding results and reference sentences, compute corpus-level BLEU score.

    Args:
        references: a list of gold-standard reference target sentences
        hypotheses: a list of hypotheses, one for each reference

    Returns:
        bleu_score: corpus-level BLEU score
    """
    if references[0][0] == '<s>':
        references = [ref[1:-1] for ref in references]
    bleu_score = corpus_bleu([[ref] for ref in references],
                             [hyp.value for hyp in hypotheses])
    return bleu_score
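# A hypothetical usage sketch of the helper above. `Hypothesis` is assumed here to be a
# namedtuple exposing a `.value` field with the decoded tokens; the <s> ... </s> sentinels
# are stripped from the references before scoring.
from collections import namedtuple

Hypothesis = namedtuple('Hypothesis', ['value', 'score'])  # assumed shape

references = [['<s>', 'he', 'reads', 'a', 'book', '</s>']]
hypotheses = [Hypothesis(value=['he', 'reads', 'a', 'book'], score=0.0)]
print(compute_corpus_level_bleu_score(references, hypotheses))  # 1.0 for an exact match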
def bleu_4_dedup(decoded, references):
    listed_references = [[s] for s in references]
    deduplicated_sentences = []
    for sentence in decoded:
        last_w = None
        dedup_snt = []
        for word in sentence:
            if word != last_w:
                dedup_snt.append(word)
            last_w = word
        deduplicated_sentences.append(dedup_snt)
    bleu_4 = 100 * corpus_bleu(listed_references, deduplicated_sentences,
                               weights=[0.25, 0.25, 0.25, 0.25],
                               smoothing_function=bleu_smoothing)
    return bleu_4
def compute_bleu(batch_in, predicted): weights = [] n = 4 for i in range(n): weights.append(float(1.0 / n)) # Create hypothesis and reference arrays taking 5 predicted captions per image (maybe we could modify this?) # Initialize hypothesis and reference arrays hypotheses = [] references = [] for j, (_, sentence_in, _) in enumerate(batch_in): references.append([sentence_in]) hypotheses.append(tkn.tokenize(predicted[j])) # Compute BLEU score score = corpus_bleu(references, hypotheses, weights=weights) # Display BLEU score # logging.info('BLEU-{} score: {}'.format(n, score)) return score
def evaluate(self): bt = time.time() with chainer.no_backprop_mode(): references = [] hypotheses = [] observation = {} with reporter.report_scope(observation): for i in range(0, len(self.test_data), self.batch): src, trg = zip(*self.test_data[i:i + self.batch]) references.extend([[t.tolist()] for t in trg]) src = [chainer.dataset.to_device(self.device, x) for x in src] ys = [y.tolist() for y in self.model.translate(src, self.max_length)] hypotheses.extend(ys) bleu = bleu_score.corpus_bleu( references, hypotheses, smoothing_function=bleu_score.SmoothingFunction().method1) reporter.report({'bleu': bleu}, self.model) et = time.time() if self.comm is not None: # This evaluator is called via chainermn.MultiNodeEvaluator for i in range(0, self.comm.size): print('BleuEvaluator::evaluate(): ' 'took {:.3f} [s]'.format(et - bt)) sys.stdout.flush() self.comm.mpi_comm.Barrier() else: # This evaluator is called from a conventional # Chainer exntension print('BleuEvaluator(single)::evaluate(): ' 'took {:.3f} [s]'.format(et - bt)) sys.stdout.flush() return observation
print(score)  # prints a perfect score, because the candidate exactly matches one of the reference sentences

reference = [['the', 'cat', 'is', 'sitting', 'on', 'the', 'mat']]
test = ['on', 'the', 'mat', 'is', 'a', 'cat']
# The hypothesis contains 0 counts of 4-gram overlaps.
print(sentence_bleu(reference, test))  # 5.5546715329196825e-78

test = ['the', 'cat', 'is', 'sitting', 'on', 'mat']
print(sentence_bleu(reference, test))  # 0.6731821382417487

##################################################################
## Part 2: corpus_bleu: computes a BLEU score over multiple sentences (e.g. a paragraph or a document)
# The references must be given as a list of documents, where each document is a list of reference
# sentences and each alternative reference is itself a list of tokens -- that is, the references
# are a list of lists of lists of tokens.
# The candidates must be given as a list in which each document is a list of tokens -- that is,
# the candidates are a list of lists of tokens.
references = [[['this', 'is', 'a', 'test'], ['this', 'is', 'test']]]  # two references for one document
candidates = [['this', 'is', 'a', 'test']]
score = corpus_bleu(references, candidates)
print(score)  # 1.0; running this example prints a perfect score, just as before

##################################################################
## Cumulative and individual BLEU scores
# The BLEU scoring functions in NLTK let you assign weights to the different n-gram orders
# when computing the score.
# This gives you the flexibility to compute different kinds of BLEU scores, such as individual
# and cumulative n-gram scores.

## Individual n-gram scores
# An individual n-gram score evaluates matches of one specific order only, e.g. single words
# (1-grams) or word pairs (2-grams, i.e. bigrams).
# The weights are given as an array in which each index corresponds to the n-gram order.
# To compute the BLEU score for 1-gram matches only, set the 1-gram weight to 1 and the
# 2-, 3- and 4-gram weights to 0, i.e. weights=(1, 0, 0, 0):
## 1-gram individual BLEU
reference = [['this', 'is', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']
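# The tutorial is cut off after setting up the 1-gram example; a plausible continuation (repeating
# the reference and candidate for self-containment) contrasts individual and cumulative weights:
from nltk.translate.bleu_score import sentence_bleu

reference = [['this', 'is', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']

# Individual n-gram scores put all the weight on a single order.
print('Individual 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
print('Individual 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 1, 0, 0)))

# Cumulative n-gram scores spread uniform weight over orders 1..n; the default
# weights=(0.25, 0.25, 0.25, 0.25) give the usual cumulative BLEU-4.
print('Cumulative 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)))
# NLTK warns here that there are no 3-/4-gram overlaps, and the cumulative 4-gram score
# collapses towards zero for this short candidate.
print('Cumulative 4-gram: %f' % sentence_bleu(reference, candidate))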
def evaluate(opt_translates): eval_stats = [] eval_results = [] not_in_train = 0 total = 0 for opt_translate in opt_translates: src_lines = [] with open(opt_translate.src, 'r') as f: src_lines = f.readlines() with open(opt_translate.output, 'r') as handle: lines = [line.strip() for line in handle] for i, l in enumerate(lines): di = opt.parser.test_src_to_di[opt_translate.predicate][ src_lines[i].strip()] total += 1 stats = '\n++++NOT IN TRAIN++++' if src_lines[i].strip() not in opt.parser.train_src_to_di[ opt_translate.predicate]: not_in_train += 1 stats = '' lexicalized_l = lexicalize_word_sequence( l.split(), di.input.delexicalizationMap) stats += '\nSRC:' + str(src_lines[i]) + 'PRED:' + str( opt_translate.predicate) + '\nMR:' + str( di.input.attributeValues) + '\nREAL: ' + ' '.join( lexicalized_l) + '\nDREF: ' + str( di.directReference) logger.info(stats) eval_stats.append(di.output.evaluateAgainst(lexicalized_l)) eval_results.append( (" ".join(lexicalized_l), di.output.evaluationReferences)) stats = '\nEREF: ' + str( eval_stats[-1].refs) + '\nBLEU: ' + str( eval_stats[-1].BLEU) + '\n' logger.info(stats) if (' '.join(lexicalized_l)).strip() == str( di.directReference).strip( ) and eval_stats[-1].BLEU != 1.0: exit() realizations = [] references = [] for realization, refs in eval_results: realizations.append(realization) references.append(refs) corpusBLEU = corpus_bleu(references, realizations) bleu = numpy.average([e.BLEU for e in eval_stats]) rouge = numpy.average([e.ROUGE for e in eval_stats]) coverage = numpy.average([e.COVERAGE for e in eval_stats]) print("corpusBLEU:", corpusBLEU) print("BLEU:", bleu) print("smoothBLEU:", numpy.average([e.BLEUSmooth for e in eval_stats])) print("ROUGE:", rouge) print("COVERAGE:", coverage) print("NOT IN TRAIN:", not_in_train, '/', total) return corpusBLEU, bleu, rouge, coverage
eq = (preds==targs).float() indy_acc = eq[bitmask].mean() eq[~bitmask] = 1 eq = eq.reshape(og_shape) acc = (eq.sum(-1)==sl).float().mean() bleu_trgs=targs.reshape(og_shape).data.cpu().numpy() trg_ends = np.argmax((bleu_trgs==stop_idx),axis=1) bleu_prds=preds.reshape(og_shape).data.cpu().numpy() prd_ends = np.argmax((bleu_prds==stop_idx),axis=1) btrgs = [] bprds = [] for i in range(len(bleu_trgs)): temp = bleu_trgs[i,None,:trg_ends[i]].tolist() btrgs.append(temp) bprds.append(bleu_prds[i,:prd_ends[i]].tolist()) bleu = corpus_bleu(btrgs,bprds) avg_bleu += bleu avg_acc += acc.item() avg_indy_acc += indy_acc.item() avg_loss += loss.item() s="Loss:{:.5f} | Acc:{:.5f} | Bleu:{:.5f} | {:.0f}%" s = s.format(loss.item(), acc.item(), bleu, b/len(X)*100) print(s, end=len(s)*" " + "\r") if hyps['exp_name']=="test" and b > 5: break val_avg_bleu = avg_bleu/n_loops val_avg_loss = avg_loss/n_loops val_avg_acc = avg_acc/n_loops val_avg_indy = avg_indy_acc/n_loops
def validate(val_loader, encoder, decoder, criterion, rev_word_map): """ Performs one epoch's validation. :param val_loader: DataLoader for validation data. :param encoder: encoder model :param decoder: decoder model :param criterion: loss layer :return: BLEU-4 score """ # eval mode decoder.eval() if encoder is not None: encoder.eval() # meter batch_time = AverageMeter() losses = AverageMeter() top5accs = AverageMeter() start = time.time() references = list( ) # references (true captions) for calculating BLEU-4 score hypotheses = list() # hypotheses (predictions) # explicitly disable gradient calculation to avoid CUDA memory error # solves the issue #57 with torch.no_grad(): # Batches for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader): # break after one epoch if debugging locally if (args.run_local or args.debug) and i > 2: break # Move to device, if available imgs = imgs.to(device) caps = caps.to(device) caplens = caplens.to(device) # Forward prop. if encoder is not None: imgs = encoder(imgs) scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder( imgs, caps, caplens) # Since we decoded starting with <start>, the targets are all words after <start>, up to <end> targets = caps_sorted[:, 1:] # Remove timesteps that we didn't decode at, or are pads # pack_padded_sequence is an easy trick to do this scores_copy = scores.clone() scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data # Calculate loss loss = criterion(scores, targets) # Add doubly stochastic attention regularization # We know the weights sum to 1 at a given timestep. But we also encourage # the weights at a single pixel p to sum to 1 across all timesteps T # This means we want the model to attend to every pixel over the course of generating # the entire sequence. Therefore, we try to minimize the difference between 1 and the sum of # a pixel's weights across all timesteps loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean() # Keep track of metrics_roc_and_more losses.update(loss.item(), sum(decode_lengths)) top5 = accuracy(scores, targets, 5) top5accs.update(top5, sum(decode_lengths)) batch_time.update(time.time() - start) start = time.time() if i % print_freq == 0: print( '4 Validation: [{0}/{1}]\t' 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format( i, len(val_loader), batch_time=batch_time, loss=losses, top5=top5accs)) # Store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] 
# References allcaps = allcaps[ sort_ind] # because images were sorted in the decoder for j in range(allcaps.shape[0]): # for each example img_caps = allcaps[j].tolist() img_captions = list( map( lambda c: [ w for w in c if w not in {word_map['<start>'], word_map['<pad>']} ], img_caps)) # remove <start> and pads references.append(img_captions) # Hypotheses # get for each example the max pred at each time step (batch size, max length caption) pred_values, preds_ind = torch.max(scores_copy, dim=2) preds_ind = preds_ind.tolist() temp_preds = list() # remove pads for j, p in enumerate(preds_ind): temp_preds.append(preds_ind[j][:decode_lengths[j]]) preds_ind = temp_preds hypotheses.extend(preds_ind) assert len(references) == len(hypotheses) if (i + 1) % 300 == 0: print('-1 ************print captions***********') num_to_print = 0 for h in hypotheses: if num_to_print < 100: words = [] for w in h: words.append(rev_word_map[w]) print('1 ' + ' '.join(words)) num_to_print += 1 else: break print('2 **************************************') # Calculate BLEU-4 scores bleu4 = corpus_bleu(references, hypotheses) print( '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n' .format(loss=losses, top5=top5accs, bleu=bleu4)) return bleu4
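# The shape described in the comment above ("references = [[ref1a, ref1b, ...], ...]") in a tiny,
# standalone form with hypothetical token ids:
from nltk.translate.bleu_score import corpus_bleu

references = [
    [[5, 8, 9, 2], [5, 8, 7, 2]],  # several reference captions for image 1
    [[4, 4, 6, 1]],                # a single reference caption for image 2
]
hypotheses = [[5, 8, 9, 2], [4, 6, 1]]  # exactly one hypothesis per image
print(corpus_bleu(references, hypotheses))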
max_test_encoder_sequence_length = max([len(txt) for txt in test_input_texts]) test_encoder_input_data = np.zeros( (num_test_samples, max_test_encoder_sequence_length, num_encoder_tokens), dtype='float32') for i in range(num_test_samples): for j in range(len(test_input_texts[i])): if test_input_texts[i][j] in input_words: test_encoder_input_data[i, j, input_words[test_input_texts[i][j]]] = 1 references=[] hypotheses=[] with open('{}.txt'.format(int(time.time())),'w') as fi: for i in range(num_samples): decoded_sentence=decode_sequence(encoder_input_data[i:i+1]) references.append([target_texts[i][1:-1]]) hypotheses.append(decoded_sentence[:-1]) print(corpus_bleu(references,hypotheses),file=fi) references=[] hypotheses=[] for i in range(num_test_samples): decoded_sentence=decode_sequence(test_encoder_input_data[i:i+1]) references.append([test_target_texts[i][1:-1]]) hypotheses.append(decoded_sentence[:-1]) print(corpus_bleu(references,hypotheses),file=fi)
for i in range(n): weights.append(float(1.0/n)) # Initialize hypothesis and reference arrays hypotheses = [] references = [] # Create hypothesis and reference arrays taking 5 predicted captions per image (maybe we could modify this?) for row in dataset['images']: caption = row['sentences'][0]['tokens'] predicted_0 = row['predicted caption'][0] predicted_1 = row['predicted caption'][1] predicted_2 = row['predicted caption'][2] predicted_3 = row['predicted caption'][3] predicted_4 = row['predicted caption'][4] references.append([caption]) references.append([caption]) references.append([caption]) references.append([caption]) references.append([caption]) hypotheses.append(tkn.tokenize(predicted_0)) hypotheses.append(tkn.tokenize(predicted_1)) hypotheses.append(tkn.tokenize(predicted_2)) hypotheses.append(tkn.tokenize(predicted_3)) hypotheses.append(tkn.tokenize(predicted_4)) # Compute BLEU score bleu_score = corpus_bleu(references, hypotheses, weights=weights) # Display BLEU score print 'BLEU score: ' + str(bleu_score)
def validate(val_loader, net, encoder, decoder, criterion, word_map): net.eval() encoder.eval() decoder.eval() batch_time = AverageMeter() losses = AverageMeter() top3accs = AverageMeter() start = time.time() references = list() hypotheses = list() with torch.no_grad(): # Batches for i, (imgs1, imgs2, caps, caplens, allcaps) in enumerate(val_loader): imgs1 = imgs1.to(device) imgs2 = imgs2.to(device) caps = caps.to(device) caplens = caplens.to(device) im1_enc = net(imgs1) im2_enc = net(imgs2) # Forward prop. l_bef, l_aft, alpha_bef, alpha_aft = encoder(im1_enc, im2_enc) scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(l_bef, l_aft, caps, caplens) targets = caps_sorted[:, 1:] scores_copy = scores.clone() scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data loss = criterion(scores, targets) # TODO # Add doubly stochastic attention regularization losses.update(loss.item(), sum(decode_lengths)) top3 = accuracy(scores, targets, 3) top3accs.update(top3, sum(decode_lengths)) batch_time.update(time.time() - start) start = time.time() if i % print_freq == 0: print('Validation: [{0}/{1}]\t' 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Top-3 Accuracy {top3.val:.3f} ({top3.avg:.3f})\t'.format(i, len(val_loader), batch_time=batch_time, loss=losses, top3=top3accs)) # References allcaps = allcaps[sort_ind] for j in range(allcaps.shape[0]): img_caps = allcaps[j].tolist() img_captions = list( map(lambda c: [w for w in c if w not in [0, 1, 2]], img_caps)) references.append(img_captions) # Hypotheses _, preds = torch.max(scores_copy, dim=2) preds = preds.tolist() temp_preds = list() for j, p in enumerate(preds): temp_preds.append(preds[j][:decode_lengths[j]]) preds = temp_preds hypotheses.extend(preds) assert len(references) == len(hypotheses) weights1 = (1.0, 0.0, 0.0, 0.0) weights2 = (0.5, 0.5, 0.0, 0.0) weights3 = (0.33, 0.33, 0.33, 0.0) weights4 = (0.25, 0.25, 0.25, 0.25) bleu1 = corpus_bleu(references, hypotheses, weights1) bleu2 = corpus_bleu(references, hypotheses, weights2) bleu3 = corpus_bleu(references, hypotheses, weights3) bleu4 = corpus_bleu(references, hypotheses, weights4) print( '\n * LOSS - {loss.avg:.3f}, TOP-3 ACCURACY - {top3.avg:.3f}, BLEU-1 - {bleu11}, BLEU-2 - {bleu22}, BLEU-3 - {bleu33}, BLEU-4 - {bleu44},\n'.format( loss=losses, top3=top3accs, bleu11=bleu1, bleu22=bleu2, bleu33=bleu3, bleu44=bleu4, )) return bleu4
print("Loading initial model from {}...".format(ckpt_path)) optimistic_restore(sess, ckpt_path) try: step = 0 while True: step += 1 print("training step {}...".format(step)) training_start_time = time.time() ops = [gleu_train_op, global_step_var, graph_sums_op, mle_loss, gleu_score, preds, noised_y] _, global_step, graph_sums, loss, gleu, pred_values, y_values = sess.run(ops) training_step_time = time.time() - training_start_time # Compute batch BLEU and GLEU and save summaries of them cropped_y = [[_crop(y_values[k, :], EOS)] for k in range(batch_size)] cropped_preds = [_crop(pred_values[k, :], EOS) for k in range(batch_size)] nltk_bleu = corpus_bleu(cropped_y, cropped_preds, emulate_multibleu=True) nltk_gleu, nltk_n_match, nltk_n_all = custom_corpus_gleu(cropped_y, cropped_preds) sums = { 'nltk.bleu': nltk_bleu, 'nltk.gleu': nltk_gleu, 'nltk.n_match': nltk_n_match, 'nltk.n_all': nltk_n_all, } sum_writer.add_summary(graph_sums, global_step=global_step) for label, measure in sums.items(): summary = tf.Summary(value=[tf.Summary.Value(tag=label, simple_value=measure)]) sum_writer.add_summary(summary, global_step=global_step) print("step took {:.2f} seconds".format(training_step_time))
def evaluate(beam_size): """ Evaluation :param beam_size: beam size at which to generate captions for evaluation :return: BLEU-4 score """ # DataLoader loader = torch.utils.data.DataLoader( CaptionDataset(data_folder, data_name, 'TEST', transform=transforms.Compose([normalize])), batch_size=1, shuffle=False, num_workers=1, pin_memory=True) # TODO: Batched Beam Search # Therefore, do not use a batch_size greater than 1 - IMPORTANT! # Lists to store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] references = list() hypotheses = list() # For each image for i, (image, caps, caplens, allcaps) in enumerate( tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))): if i > 14: break k = beam_size # Move to GPU device, if available image = image.to(device) # (1, 3, 256, 256) # Encode img, img_mean = model.imgEncoder(image) # (1, enc_image_size, enc_image_size, encoder_dim) enc_image_size = img.size(1) encoder_dim = img.size(3) # Flatten encoding encoder_out = img.view(1, -1, encoder_dim) # (1, num_pixels, encoder_dim) num_pixels = encoder_out.size(1) # We'll treat the problem as having a batch size of k encoder_out = encoder_out.expand(k, num_pixels, encoder_dim) # (k, num_pixels, encoder_dim) # Tensor to store top k previous words at each step; now they're just <start> k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device) # (k, 1) # Tensor to store top k sequences; now they're just <start> seqs = k_prev_words # (k, 1) # Tensor to store top k sequences' scores; now they're just 0 top_k_scores = torch.zeros(k, 1).to(device) # (k, 1) # Lists to store completed sequences and scores complete_seqs = list() complete_seqs_scores = list() # Start decoding step = 1 if standard_gaussian: z = torch.randn([k, latent_size]) if torch.cuda.is_available(): z = z.cuda() else: h2 = torch.relu(model.fc2(img_mean)) mu2 = model.hidden2mu2(h2) logv2 = model.hidden2logv2(h2) std = torch.exp(0.5 * logv2) z = torch.randn([k, latent_size]) if torch.cuda.is_available(): z = z.cuda() z = z * std + mu2 h, c = model.attnDecoder.init_hidden_state(z) smth_wrong = False # s is a number less than or equal to k, because sequences are removed from this process once they hit <end> while True: embeddings = model.attnDecoder.embedding(k_prev_words).squeeze(1) # (s, embed_dim) awe, _ = model.attnDecoder.attention(encoder_out, h) # (s, encoder_dim), (s, num_pixels) gate = model.attnDecoder.sigmoid(model.attnDecoder.f_beta(h)) # gating scalar, (s, encoder_dim) awe = gate * awe h, c = model.attnDecoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c)) # (s, decoder_dim) scores = model.attnDecoder.fc(h) # (s, vocab_size) scores = F.log_softmax(scores, dim=1) # Add scores = top_k_scores.expand_as(scores) + scores # (s, vocab_size) # For the first step, all k points will have the same scores (since same k previous words, h, c) if step == 1: top_k_scores, top_k_words = scores[0].topk(k, 0, True, True) # (s) else: # Unroll and find top scores, and their unrolled indices top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True) # (s) # Convert unrolled indices to actual indices of scores prev_word_inds = top_k_words / vocab_size # (s) next_word_inds = top_k_words % vocab_size # (s) # Add new words to sequences seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1) # (s, step+1) # 
Which sequences are incomplete (didn't reach <end>)? incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>']] complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds)) # Set aside complete sequences if len(complete_inds) > 0: complete_seqs.extend(seqs[complete_inds].tolist()) complete_seqs_scores.extend(top_k_scores[complete_inds]) k -= len(complete_inds) # reduce beam length accordingly # Proceed with incomplete sequences if k == 0: break seqs = seqs[incomplete_inds] h = h[prev_word_inds[incomplete_inds]] c = c[prev_word_inds[incomplete_inds]] encoder_out = encoder_out[prev_word_inds[incomplete_inds]] top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1) k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1) # Break if things have been going on too long if step > 50: smth_wrong = True break step += 1 if smth_wrong is not True: i = complete_seqs_scores.index(max(complete_seqs_scores)) seq = complete_seqs[i] else: seq = seqs[0][:20] seq = [x.item() for x in seq] # i = complete_seqs_scores.index(max(complete_seqs_scores)) # seq = complete_seqs[i] # References img_caps = allcaps[0].tolist() img_captions = list( map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}], img_caps)) # remove <start> and pads references.append(img_captions) # Hypotheses hypotheses.append([w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}]) print(' '.join([rev_word_map[x] for x in hypotheses[-1]])) assert len(references) == len(hypotheses) # Calculate BLEU-4 scores bleu4 = corpus_bleu(references, hypotheses) return bleu4
def bleu(self, reference, candidate):
    bleu4 = corpus_bleu(reference, candidate,
                        weights=(0.25, 0.25, 0.25, 0.25)) * 100
    return bleu4
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from nltk.translate import bleu_score, meteor_score
import numpy as np

x = [1, 2, 2, 3]
hyp = ['am', 'leg']
ref = ['am', 'legend']
# print(bleu_score.sentence_bleu([ref], hyp, weights=[0.5, 0.5]))
print(bleu_score.corpus_bleu([[ref]], [hyp], weights=[1.0]))
# plt.imsave(path)
# import tensorflow as tf
# print(tf.__version__)
# conda config --add channels conda-forge
# conda install keras opencv shapely tensorflow gensim pandas imgaug
# pip install --upgrade tensorflow==2.0.0-beta1
# pip install "C:\Users\omarm\Downloads\Shapely-1.6.4.post2-cp37-cp37m-win_amd64.whl" matplotlib pandas imgaug gensim tensorflow==2.0.0-beta1
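# Sanity check (same toy hyp/ref as above): for a single sentence pair, corpus_bleu over
# one-element lists gives the same value as sentence_bleu.
from nltk.translate import bleu_score

hyp = ['am', 'leg']
ref = ['am', 'legend']
print(bleu_score.sentence_bleu([ref], hyp, weights=[1.0]))    # 0.5
print(bleu_score.corpus_bleu([[ref]], [hyp], weights=[1.0]))  # 0.5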
def validate(validation_loader, model_network, criterion_func): model_network.eval() batch_time = avgValsTracker() losses = avgValsTracker() top5accs = avgValsTracker() start = time.time() references = list( ) # references (true captions) for calculating BLEU-4 score hypotheses = list() # hypotheses (predictions) # explicitly disable gradient calculation to avoid CUDA memory error # solves the issue #57 with torch.no_grad(): # Batches for i, (imgs, caps, caplens, allcaps) in enumerate(validation_loader): # Move to device, if available imgs = imgs.to(device) caps = caps.to(device) caplens = caplens.to(device) # Forwardla scores, caps_sorted, decode_lengths, alphas, sort_ind = model( imgs, caps, caplens) # Since we decoded starting with <start>, the targets are all words after <start>, up to <end> targets = caps_sorted[:, 1:] # Remove time-steps that we didn't decode at, or are pads # pack_padded_sequence is an easy trick to do this scores_copy = scores.clone() scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data # Calculate loss loss = criterion_func(scores, targets) # Add doubly stochastic attention regularization loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean() # Keep track of metrics losses.update(loss.item(), sum(decode_lengths)) top5 = accuracy(scores, targets, 5) top5accs.update(top5, sum(decode_lengths)) batch_time.update(time.time() - start) start = time.time() if i % 100 == 0: print( 'Validation: [{0}/{1}]\t' 'Batch Time {batch_time.value:.3f} ({batch_time.average:.3f})\t' 'Loss {loss.value:.4f} ({loss.average:.4f})\t' 'Top-5 Accuracy {top5.value:.3f} ({top5.average:.3f})\t'. format(i, len(validation_loader), batch_time=batch_time, loss=losses, top5=top5accs)) # Store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] # References # allcaps = allcaps[sort_ind] # because images were sorted in the decoder for j in range(allcaps.shape[0]): img_caps = allcaps[j].tolist() for idxx in range(len(img_caps) - 1, -1, -1): if img_caps[idxx][0] == -1: del img_caps[idxx] img_captions = list( map( lambda c: [w for w in c if w not in {word_map['x_START_']}], img_caps)) # remove <start> and pads references.append(img_captions) # Hypotheses _, preds = torch.max(scores_copy, dim=2) preds = preds.tolist() temp_preds = list() for j, p in enumerate(preds): temp_preds.append(preds[j][:decode_lengths[j]]) # remove pads preds = temp_preds hypotheses.extend(preds) assert len(references) == len(hypotheses) # Calculate BLEU-4 scores bleu4 = corpus_bleu(references, hypotheses) print( '\n * LOSS: {loss.avg:.3f}, TOP-5 ACCURACY: {top5.avg:.3f}, BLEU-4: {bleu}\n' .format(loss=losses, top5=top5accs, bleu=bleu4)) return bleu4
def evaluate(self, model, data, vocabs=None, use_concept=False, log_dir=None, embed=None, cur_step=0): """ Evaluate a model on given dataset and return performance. Args: model (seq2seq.models): model to evaluate data (seq2seq.dataset.dataset.Dataset): dataset to evaluate against Returns: loss (float): loss of the given model on the given dataset """ eval_limit = 5000 step_limit = int(eval_limit / self.batch_size) model.eval() loss = self.loss loss.reset() match = 0 total = 0 device = torch.device('cuda', 0) if torch.cuda.is_available() else None batch_iterator = torchtext.data.BucketIterator( dataset=data, batch_size=self.batch_size, sort=True, sort_key=lambda x: len(x.src), device=device, train=False) tgt_vocab = data.fields[seq2seq.tgt_field_name].vocab src_vocab = data.fields[seq2seq.src_field_name].vocab pad = tgt_vocab.stoi[data.fields[seq2seq.tgt_field_name].pad_token] cnt = 0 loss_sum = 0 context_corpus = [] reference_corpus = [] prediction_corpus = [] state_corpus = [] with torch.no_grad(): for batch in batch_iterator: print(cnt) cnt += 1 input_variables, input_lengths = getattr( batch, seq2seq.src_field_name) if torch.cuda.is_available(): input_index = input_variables.cpu().numpy() else: input_index = input_variables.numpy() input_words = [[src_vocab.itos[word] for word in line] for line in input_index] context_corpus.extend(input_words) if use_concept: concept, _ = getattr(batch, seq2seq.cpt_field_name) else: concept = [] target_variables = getattr(batch, seq2seq.tgt_field_name) if use_concept: (decoder_outputs, decoder_hidden, other), state_loss, state_print = model( input_variables, input_lengths.tolist(), target_variables, concept=concept, vocabs=vocabs, use_concept=use_concept, track_state=use_concept) state_corpus.extend(state_print) """ decoder_outputs, decoder_hidden, other = model(input_variables, input_lengths.tolist(), target_variables, concept=concept, vocabs=vocabs, use_concept=use_concept, track_state=False) """ else: decoder_outputs, decoder_hidden, other = model( input_variables, input_lengths.tolist(), target_variables, vocabs=vocabs) # Evaluation seqlist = other['sequence'] reference = [] prediction = [] for step, step_output in enumerate(decoder_outputs): target = target_variables[:, step + 1] loss.eval_batch( step_output.view(target_variables.size(0), -1), target) non_padding = target.ne(pad) correct = seqlist[step].view(-1).eq(target).masked_select( non_padding).sum().item() match += correct total += non_padding.sum().item() if torch.cuda.is_available(): pred = seqlist[step].view(-1).cpu().numpy() tgt = target.view(-1).cpu().numpy() else: pred = seqlist[step].view(-1).numpy() tgt = target.view(-1).numpy() for i in range(len(step_output)): target_char = tgt_vocab.itos[tgt[i]] pred_char = tgt_vocab.itos[pred[i]] if target_char != '<pad>': if len(reference) >= i + 1: reference[i].append(target_char) else: reference.append([target_char]) if pred_char != '<pad>': if len(prediction) >= i + 1: if prediction[i][-1] != '<eos>': prediction[i].append(pred_char) else: prediction.append([pred_char]) for i in range(len(reference)): reference[i] = reference[i][:-1] prediction[i] = prediction[i][:-1] reference_corpus.extend([[line] for line in reference]) prediction_corpus.extend(prediction) if cnt > step_limit: break bleu = corpus_bleu(reference_corpus, prediction_corpus, smoothing_function=smoothie) # embedding = embed.eval_embedding(reference_corpus, prediction_corpus) distinct_1 = distinct(prediction_corpus, 1) distinct_2 = distinct(prediction_corpus, 2) print("Corpus 
BLEU: ", bleu) # print("Embedding dist: ", embedding) print("Distinct-1: ", distinct_1) print("Distinct-2: ", distinct_2) with open(log_dir + '/log.txt', 'a+', encoding='utf-8') as file: file.write("Distinct-1: " + str(distinct_1) + '\n') file.write("Distinct-2: " + str(distinct_2) + '\n\n') with open(log_dir + '/log-' + str(cur_step), 'w', encoding='utf-8') as file: file.write("Corpus BLEU: " + str(bleu) + '\n') # file.write("Embedding Dist: " + str(embedding) + '\n') file.write("Distinct-1: " + str(distinct_1) + '\n') file.write("Distinct-2: " + str(distinct_2) + '\n\n') for i in range(len(reference_corpus)): file.write("Context: " + '\n') context_str = " ".join(context_corpus[i]) context_list = context_str.split('<eou>') for j in range(len(context_list)): file.write(context_list[j] + '\n') if use_concept and state_corpus: file.write("\nStates: " + '\n') cd_pairs = zip(state_corpus[i][0], state_corpus[i][1]) cd_pairs = sorted(set(cd_pairs), key=lambda x: x[1]) for j in range(len(state_corpus[i][0])): file.write("Concept: {}. Prob: {}.\n".format( cd_pairs[j][0], cd_pairs[j][1])) file.write("\nGold: " + ' '.join(reference_corpus[i][0]) + '\n\n') file.write("Response: " + ' '.join(prediction_corpus[i]) + '\n\n') file.write('\n') if total == 0: accuracy = float('nan') else: accuracy = match / total return loss.get_loss(), accuracy
#weights[n-1] = 1 weights = [] for i in range(n): weights.append(float(1.0 / n)) # Create hypothesis and reference arrays taking 5 predicted captions per image (maybe we could modify this?) for conf_value in confidence: samples = 0 # Initialize hypothesis and reference arrays hypotheses = [] references = [] for row in dataset['images']: caption = row['sentences'][0]['tokens'] #for idx in range(len(row['predicted caption'])): # if float(row['confidence'][idx]) >= conf_value and row['split'] in ['test'] and row['predicted caption'][idx]: # samples += 1 # predicted = row['predicted caption'][idx] # references.append([caption]) # hypotheses.append(tkn.tokenize(predicted)) #if float(row['top confidence']) >= conf_value and row['split'] in ['test'] and row['top caption']: if float(row['top confidence']) >= conf_value and row['top caption']: samples += 1 predicted = row['top caption'] references.append([caption]) hypotheses.append(tkn.tokenize(predicted)) # Compute BLEU score bleu_score = corpus_bleu(references, hypotheses, weights=weights) # Display BLEU score print 'Confidence: {0} BLEU-{1} score: {2} Samples: {3}'.format(conf_value, n, bleu_score, samples)
def evaluate_decode_results(dataset, decode_results, verbose=True): from lang.py.parse import tokenize_code, de_canonicalize_code # tokenize_code = tokenize_for_bleu_eval import ast assert dataset.count == len(decode_results) f = f_decode = None if verbose: f = open(dataset.name + '.exact_match', 'w') exact_match_ids = [] f_decode = open(dataset.name + '.decode_results.txt', 'w') eid_to_annot = dict() if config.data_type == 'django': for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)): eid_to_annot[raw_id] = line.strip() f_bleu_eval_ref = open(dataset.name + '.ref', 'w') f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w') f_generated_code = open(dataset.name + '.geneated_code', 'w') logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count) cum_oracle_bleu = 0.0 cum_oracle_acc = 0.0 cum_bleu = 0.0 cum_acc = 0.0 sm = SmoothingFunction() all_references = [] all_predictions = [] if all(len(cand) == 0 for cand in decode_results): logging.ERROR('Empty decoding results for the current dataset!') return -1, -1 for eid in range(dataset.count): example = dataset.examples[eid] ref_code = example.code ref_ast_tree = ast.parse(ref_code).body[0] refer_source = astor.to_source(ref_ast_tree).strip() # refer_source = ref_code refer_tokens = tokenize_code(refer_source) cur_example_correct = False decode_cands = decode_results[eid] if len(decode_cands) == 0: continue decode_cand = decode_cands[0] cid, cand, ast_tree, code = decode_cand code = astor.to_source(ast_tree).strip() # simple_url_2_re = re.compile('_STR:0_', re.)) try: predict_tokens = tokenize_code(code) except: logging.error('error in tokenizing [%s]', code) continue if refer_tokens == predict_tokens: cum_acc += 1 cur_example_correct = True if verbose: exact_match_ids.append(example.raw_id) f.write('-' * 60 + '\n') f.write('example_id: %d\n' % example.raw_id) f.write(code + '\n') f.write('-' * 60 + '\n') if config.data_type == 'django': ref_code_for_bleu = example.meta_data['raw_code'] pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code']) # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code']) # convert canonicalized code to raw code for literal, place_holder in example.meta_data['str_map'].iteritems(): pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal) # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal) elif config.data_type == 'hs': ref_code_for_bleu = ref_code pred_code_for_bleu = code # we apply Ling Wang's trick when evaluating BLEU scores refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu) pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu) # The if-chunk below is for debugging purpose, sometimes the reference cannot match with the prediction # because of inconsistent quotes (e.g., single quotes in reference, double quotes in prediction). # However most of these cases are solved by cannonicalizing the reference code using astor (parse the reference # into AST, and regenerate the code. Use this regenerated one as the reference) weired = False if refer_tokens_for_bleu == pred_tokens_for_bleu and refer_tokens != predict_tokens: # cum_acc += 1 weired = True elif refer_tokens == predict_tokens: # weired! 
# weired = True pass shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu) all_references.append([refer_tokens_for_bleu]) all_predictions.append(pred_tokens_for_bleu) # try: ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu)) bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3) cum_bleu += bleu_score # except: # pass if verbose: print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score) f_decode.write('-' * 60 + '\n') f_decode.write('example_id: %d\n' % example.raw_id) f_decode.write('intent: \n') if config.data_type == 'django': f_decode.write(eid_to_annot[example.raw_id] + '\n') elif config.data_type == 'hs': f_decode.write(' '.join(example.query) + '\n') f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n') f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n') f_decode.write('canonicalized reference: \n') f_decode.write(refer_source + '\n') f_decode.write('canonicalized prediction: \n') f_decode.write(code + '\n') f_decode.write('reference code for bleu calculation: \n') f_decode.write(ref_code_for_bleu + '\n') f_decode.write('predicted code for bleu calculation: \n') f_decode.write(pred_code_for_bleu + '\n') f_decode.write('pred_shorter_than_ref: %s\n' % shorter) f_decode.write('weired: %s\n' % weired) f_decode.write('-' * 60 + '\n') # for Hiro's evaluation f_generated_code.write(pred_code_for_bleu.replace('\n', '#NEWLINE#') + '\n') # compute oracle best_score = 0. cur_oracle_acc = 0. for decode_cand in decode_cands[:config.beam_size]: cid, cand, ast_tree, code = decode_cand try: code = astor.to_source(ast_tree).strip() predict_tokens = tokenize_code(code) if predict_tokens == refer_tokens: cur_oracle_acc = 1 if config.data_type == 'django': pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code']) # convert canonicalized code to raw code for literal, place_holder in example.meta_data['str_map'].iteritems(): pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal) elif config.data_type == 'hs': pred_code_for_bleu = code # we apply Ling Wang's trick when evaluating BLEU scores pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu) ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu)) bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3) if bleu_score > best_score: best_score = bleu_score except: continue cum_oracle_bleu += best_score cum_oracle_acc += cur_oracle_acc cum_bleu /= dataset.count cum_acc /= dataset.count cum_oracle_bleu /= dataset.count cum_oracle_acc /= dataset.count logging.info('corpus level bleu: %f', corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3)) logging.info('sentence level bleu: %f', cum_bleu) logging.info('accuracy: %f', cum_acc) logging.info('oracle bleu: %f', cum_oracle_bleu) logging.info('oracle accuracy: %f', cum_oracle_acc) if verbose: f.write(', '.join(str(i) for i in exact_match_ids)) f.close() f_decode.close() f_bleu_eval_ref.close() f_bleu_eval_hyp.close() f_generated_code.close() return cum_bleu, cum_acc
def test_bleu_bug():
    ref = [[[1, 3], [3], [4]]]
    gen = [[1]]
    with pytest.raises(ZeroDivisionError):
        corpus_bleu(ref, gen, smoothing_function=SmoothingFunction().method3)
def eval_wordpiece_bleu(models, dataloader, recog_params, epoch, recog_dir=None, streaming=False, progressbar=False, fine_grained=False, oracle=False, teacher_force=False): """Evaluate a wordpiece-level model by corpus-level BLEU. Args: models (List): models to evaluate dataloader (torch.utils.data.DataLoader): evaluation dataloader recog_params (omegaconf.dictconfig.DictConfig): decoding hyperparameters epoch (int): current epoch recog_dir (str): directory path to save hypotheses streaming (bool): streaming decoding for session-level evaluation progressbar (bool): visualize progressbar fine_grained (bool): calculate fine-grained corpus-level BLEU distributions based on input lengths oracle (bool): calculate oracle corpsu-level BLEU teacher_force (bool): conduct decoding in teacher-forcing mode Returns: c_bleu (float): corpus-level 4-gram BLEU """ if recog_dir is None: recog_dir = 'decode_' + dataloader.set + '_ep' + \ str(epoch) + '_beam' + str(recog_params.get('recog_beam_width')) recog_dir += '_lp' + str(recog_params.get('recog_length_penalty')) recog_dir += '_cp' + str(recog_params.get('recog_coverage_penalty')) recog_dir += '_' + str(recog_params.get('recog_min_len_ratio')) + '_' + \ str(recog_params.get('recog_max_len_ratio')) recog_dir += '_lm' + str(recog_params.get('recog_lm_weight')) ref_trn_path = mkdir_join(models[0].save_path, recog_dir, 'ref.trn') hyp_trn_path = mkdir_join(models[0].save_path, recog_dir, 'hyp.trn') else: ref_trn_path = mkdir_join(recog_dir, 'ref.trn') hyp_trn_path = mkdir_join(recog_dir, 'hyp.trn') list_of_references_dist = { } # calculate corpus-level BLEU distribution bucketed by input lengths hypotheses_dist = {} hypotheses_oracle = [] n_oracle_hit = 0 n_utt = 0 # Reset data counter dataloader.reset(recog_params.get('recog_batch_size')) if progressbar: pbar = tqdm(total=len(dataloader)) list_of_references = [] hypotheses = [] with codecs.open(hyp_trn_path, 'w', encoding='utf-8') as f_hyp, \ codecs.open(ref_trn_path, 'w', encoding='utf-8') as f_ref: while True: batch, is_new_epoch = dataloader.next( recog_params.get('recog_batch_size')) if streaming or recog_params.get('recog_block_sync'): nbest_hyps_id = models[0].decode_streaming( batch['xs'], recog_params, dataloader.idx2token[0], exclude_eos=True)[0] else: nbest_hyps_id = models[0].decode( batch['xs'], recog_params, idx2token=dataloader.idx2token[0], exclude_eos=True, refs_id=batch['ys'], utt_ids=batch['utt_ids'], speakers=batch['sessions' if dataloader.corpus == 'swbd' else 'speakers'], ensemble_models=models[1:] if len(models) > 1 else [])[0] for b in range(len(batch['xs'])): ref = batch['text'][b] if ref[0] == '<': ref = ref.split('>')[1] nbest_hyps = [ dataloader.idx2token[0](hyp_id) for hyp_id in nbest_hyps_id[b] ] # Write to trn # speaker = str(batch['speakers'][b]).replace('-', '_') if streaming: utt_id = str(batch['utt_ids'][b]) + '_0000000_0000001' else: utt_id = str(batch['utt_ids'][b]) f_ref.write(ref + '\n') f_hyp.write(nbest_hyps[0] + '\n') logger.debug('utt-id (%d/%d): %s' % (n_utt + 1, len(dataloader), utt_id)) logger.debug('Ref: %s' % ref) logger.debug('Hyp: %s' % nbest_hyps[0]) logger.debug('-' * 150) if not streaming: list_of_references += [[ref.split(' ')]] hypotheses += [nbest_hyps[0].split(' ')] if fine_grained: xlen_bin = (batch['xlens'][b] // 200 + 1) * 200 if xlen_bin in hypotheses_dist.keys(): list_of_references_dist[xlen_bin] += [[ ref.split(' ') ]] hypotheses_dist[xlen_bin] += [hypotheses[-1]] else: list_of_references_dist[xlen_bin] = [[ ref.split(' ') ]] 
hypotheses_dist[xlen_bin] = [hypotheses[-1]] # Compute oracle corpus-level BLEU (selected by sentence-level BLEU) if oracle and len(nbest_hyps) > 1: s_blues_b = [ sentence_bleu(ref.split(' '), hyp_n.split(' ')) for hyp_n in nbest_hyps ] oracle_idx = np.argmax(np.array(s_blues_b)) if oracle_idx == 0: n_oracle_hit += len(batch['utt_ids']) hypotheses_oracle += [ nbest_hyps[oracle_idx].split(' ') ] n_utt += len(batch['utt_ids']) if progressbar: pbar.update(len(batch['utt_ids'])) if is_new_epoch: break if progressbar: pbar.close() # Reset data counters dataloader.reset() c_bleu = corpus_bleu(list_of_references, hypotheses) * 100 if not streaming: if oracle: c_bleu_oracle = corpus_bleu(list_of_references, hypotheses_oracle) * 100 oracle_hit_rate = n_oracle_hit * 100 / n_utt logger.info('Oracle corpus-level BLEU (%s): %.2f %%' % (dataloader.set, c_bleu_oracle)) logger.info('Oracle hit rate (%s): %.2f %%' % (dataloader.set, oracle_hit_rate)) if fine_grained: for len_bin, hypotheses_bin in sorted(hypotheses_dist.items(), key=lambda x: x[0]): c_bleu_bin = corpus_bleu(list_of_references_dist[len_bin], hypotheses_bin) * 100 logger.info(' corpus-level BLEU (%s): %.2f %% (%d)' % (dataloader.set, c_bleu_bin, len_bin)) logger.debug('Corpus-level BLEU (%s): %.2f %%' % (dataloader.set, c_bleu)) return c_bleu
def validate(val_loader, decoder, criterion_ce, i2w, device, print_freq, word_map, current_epoch, break_flag, top_x, smoothing_method, print_flag): """ Performs one epoch's validation. :param val_loader: DataLoader for validation data. :param decoder: decoder model :param criterion_ce: cross entropy loss layer :param criterion_dis : discriminative loss layer :return: BLEU-4 score """ decoder.eval() # eval mode (no dropout or batchnorm) batch_time = AverageMeter() losses = AverageMeter() top5accs = AverageMeter() start = time.time() references = list( ) # references (true captions) for calculating BLEU-4 score hypotheses = list() # hypotheses (predictions) # Batches with torch.no_grad(): for i, data in enumerate(val_loader): if break_flag and i == 5: break # only 5 batches print('val i', i) imgs, caps, caplens, allcaps = data # Move to device, if available imgs = imgs.to(device) caps = caps.to(device) caplens = caplens.to(device) scores, caps_sorted, decode_lengths, sort_ind = decoder( imgs, caps, caplens) # Since we decoded starting with <start>, the targets are all words after <start>, up to <end> targets = caps_sorted[:, 1:] if print_flag: print_predictions(scores, targets, i2w) # Remove timesteps that we didn't decode at, or are pads # pack_padded_sequence is an easy trick to do this scores_copy = scores.clone() scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True) targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True) # Calculate loss loss = criterion_ce(scores, targets) # Keep track of metrics losses.update(loss.item(), sum(decode_lengths)) top5 = accuracy(scores, targets, top_x) top5accs.update(top5, sum(decode_lengths)) batch_time.update(time.time() - start) start = time.time() if i % print_freq == 0: print( 'Validation: [{0}/{1}]\t' 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Top-{topx} Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'. format(i, len(val_loader), batch_time=batch_time, loss=losses, topx=top_x, top5=top5accs)) # Store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] # References allcaps = allcaps[ sort_ind] # because images were sorted in the decoder # DIDEC caps of other participants come here # print(allcaps.shape) for j in range(allcaps.shape[0]): img_caps = allcaps[j].tolist() img_captions = list( map( lambda c: [ w for w in c if w not in {word_map['<start>'], word_map['<pad>']} ], img_caps)) # remove <start> and pads refs_per_img = [] for ic in img_captions: if len(ic) > 0: refs_per_img.append(ic) references.append(refs_per_img) # Hypotheses _, preds = torch.max(scores_copy, dim=2) # print('scr', scores_copy) # print('preds', preds) preds = preds.tolist() temp_preds = list() for j, p in enumerate(preds): temp_preds.append( preds[j][:decode_lengths[j]] ) # remove pads #SUPERFLUOUS ENDS stay for teacher forcing preds = temp_preds hypotheses.extend(preds) assert len(references) == len(hypotheses) # Calculate BLEU-4 scores #print('refshyps') #print(references) #print(hypotheses) bleu4 = corpus_bleu(references, hypotheses, smoothing_function=smoothing_method) bleu4 = round(bleu4, 4) print( '\n * LOSS - {loss.avg:.3f}, TOP-{topx} ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n' .format(loss=losses, topx=top_x, top5=top5accs, bleu=bleu4)) return bleu4, losses
def evaluate_with_beam(beam_width, data_name, model, encoder, decoder, word_map, word_map_start, word_map_end, rev_word_map): """ Evaluation :param beam_size: beam size at which to generate captions for evaluation :return: BLEU-4 score """ # DataLoader loader = torch.utils.data.DataLoader(CaptionDataset( data_folder, data_name, 'TEST', transform=transforms.Compose([normalize])), batch_size=1, shuffle=True, num_workers=1, pin_memory=True) # TODO: Batched Beam Search # Therefore, do not use a batch_size greater than 1 - IMPORTANT! # Lists to store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] references = list() hypotheses = list() references_zh = list() hypotheses_zh = list() # For each image for i, (image, caps, caplens, allcaps) in enumerate( tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_width))): #if i % 1000 != 0: continue # Move to GPU device, if available image = image.to(device) # (1, 3, 256, 256) # Encode encoder_out = encoder( image) # (1, enc_image_size, enc_image_size, encoder_dim) enc_image_size = encoder_out.size(1) encoder_dim = encoder_out.size(3) # Flatten encoding encoder_out = encoder_out.view( 1, -1, encoder_dim) # (1, num_pixels, encoder_dim) num_pixels = encoder_out.size(1) # Decode seq, _ = decode_one(decoder, encoder_out, encoder_dim, enc_image_size, word_map_start, word_map_end, beam_width) # References img_caps = allcaps[0].tolist() img_captions = list( map( lambda c: [ w for w in c if w not in { word_map['<start>'], word_map['<end>'], word_map[ '<pad>'] } ], img_caps)) # remove <start> and pads #img_captions_zh = [" ".join(rev_word_map[word] for word in sentence) for sentence in img_captions] img_captions_zh = [ " ".join(str(word) for word in sentence) for sentence in img_captions ] references.append(img_captions) references_zh.append(img_captions_zh) # Hypotheses hypothese = [ w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']} ] hypotheses.append(hypothese) #hypothese_zh = [rev_word_map[item] for item in hypothese] hypothese_zh = [str(item) for item in hypothese] hypothese_zh = " ".join(hypothese_zh) hypotheses_zh.append(hypothese_zh) assert len(references) == len(hypotheses) assert len(references_zh) == len(hypotheses_zh) # Calculate BLEU-4 scores bleu1nltk = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0)) bleu2nltk = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0)) bleu3nltk = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0)) bleu4nltk = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)) nlgeval = NLGEval() # loads the models metrics_dict = nlgeval.compute_metrics(references_zh, hypotheses_zh) metrics_dict["bleu1nltk"] = bleu1nltk metrics_dict["bleu2nltk"] = bleu2nltk metrics_dict["bleu3nltk"] = bleu3nltk metrics_dict["bleu4nltk"] = bleu4nltk # write result to json file #print(metrics_dict) output = { "model": model[15:-8], "beam_width": beam_width, "scores": metrics_dict } output_file_name = "../evaluation/" + model[15:-8] + ".txt" f = open(output_file_name, "a+") f.writelines(json.dumps(str(output))) f.writelines("\n") f.close() return metrics_dict
def self_bleu(sents):
    return bleu.corpus_bleu(
        [[s for (j, s) in enumerate(sents) if j != i] for i in range(len(sents))],
        sents)
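# Hedged usage sketch for the self_bleu helper above, on a tiny made-up corpus: each
# sentence is scored against all of the other sentences. Smoothing is added here only
# because such short toy sentences otherwise have no higher-order n-gram matches.
from nltk.translate import bleu_score as bleu
from nltk.translate.bleu_score import SmoothingFunction

toy_sents = [
    'the cat sat on the mat'.split(),
    'a cat sat on a mat'.split(),
    'the dog slept on the rug'.split(),
]
print(bleu.corpus_bleu(
    [[s for (j, s) in enumerate(toy_sents) if j != i] for i in range(len(toy_sents))],
    toy_sents,
    smoothing_function=SmoothingFunction().method1))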
def validate(val_loader, encoder, decoder, criterion): """ Performs one epoch's validation. :param val_loader: DataLoader for validation data. :param encoder: encoder model :param decoder: decoder model :param criterion: loss layer :return: BLEU-4 score """ decoder.eval() # eval mode (no dropout or batchnorm) if encoder is not None: encoder.eval() batch_time = AverageMeter() losses = AverageMeter() top5accs = AverageMeter() start = time.time() references = list( ) # references (true captions) for calculating BLEU-4 score hypotheses = list() # hypotheses (predictions) # Batches for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader): # Move to device, if available imgs = imgs.to(device) caps = caps.to(device) caplens = caplens.to(device) # Forward prop. if encoder is not None: imgs = encoder(imgs) scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder( imgs, caps, caplens) # Since we decoded starting with <start>, the targets are all words after <start>, up to <end> targets = caps_sorted[:, 1:] # Remove timesteps that we didn't decode at, or are pads # pack_padded_sequence is an easy trick to do this scores_copy = scores.clone() scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True) targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True) # Calculate loss loss = criterion(scores, targets) # Add doubly stochastic attention regularization loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean() # Keep track of metrics losses.update(loss.item(), sum(decode_lengths)) top5 = accuracy(scores, targets, 5) top5accs.update(top5, sum(decode_lengths)) batch_time.update(time.time() - start) start = time.time() if i % print_freq == 0: print('Validation: [{0}/{1}]\t' 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format( i, len(val_loader), batch_time=batch_time, loss=losses, top5=top5accs)) # Store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] # References allcaps = allcaps[ sort_ind] # because images were sorted in the decoder for j in range(allcaps.shape[0]): img_caps = allcaps[j].tolist() img_captions = list( map( lambda c: [ w for w in c if w not in {word_map['<start>'], word_map['<pad>']} ], img_caps)) # remove <start> and pads references.append(img_captions) # Hypotheses _, preds = torch.max(scores_copy, dim=2) preds = preds.tolist() temp_preds = list() for j, p in enumerate(preds): temp_preds.append(preds[j][:decode_lengths[j]]) # remove pads preds = temp_preds hypotheses.extend(preds) assert len(references) == len(hypotheses) # Calculate BLEU-4 scores bleu4 = corpus_bleu(references, hypotheses) print( '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n' .format(loss=losses, top5=top5accs, bleu=bleu4)) return bleu4
def validate(val_loader, encoder, decoder, criterion): decoder.eval() if encoder is not None: encoder.eval() batch_time = AverageMeter() losses = AverageMeter() top5accs = AverageMeter() start = time.time() references = list() # true captions for calculating the bleu scores hypotheses = list() # hypotheses (predictions) with torch.no_grad(): # batches for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader): # move to device, if available imgs = imgs.to(device) caps = caps.to(device) caplens = caplens.unsqueeze(1).to(device) # forward prop if encoder is not None: imgs = encoder(imgs) scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder( imgs, caps, caplens) targets = caps_sorted[:, 1:] scores_copy = scores.clone() scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data # calculate loss loss = criterion(scores, targets) # doubly stochastic attention regularization loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean() # keep track of metrics losses.update(loss.item(), sum(decode_lengths)) top5 = accuracy(scores, targets, 5) top5accs.update(top5, sum(decode_lengths)) batch_time.update(time.time() - start) start = time.time() if i % print_freq == 0: print( 'Validation: [{0}/{1}]\t' 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format( i, len(val_loader), batch_time=batch_time, loss=losses, top5=top5accs)) allcaps = allcaps[sort_ind] for j in range(allcaps.shape[0]): img_caps = allcaps[j].tolist() # check if it has <sos> token and remove it references.append(img_caps) # hypotheses _, preds = torch.max(scores_copy, dim=2) preds = preds.tolist() temp_preds = list() for j, p in enumerate(preds): temp_preds.append(preds[j][:decode_lengths[j]]) # remove pads preds = temp_preds hypotheses.extend(preds) assert len(references) == len(hypotheses) # Calculate BLEU-4 scores bleu4 = corpus_bleu(references, hypotheses) print( '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n' .format(loss=losses, top5=top5accs, bleu=bleu4)) return bleu4
def evaluate(beam_size): """ Evaluation :param beam_size: beam size at which to generate captions for evaluation :return: BLEU-4 score """ # DataLoader loader = torch.utils.data.DataLoader( CaptionDataset(data_folder, data_name, 'TEST', transform=transforms.Compose([normalize])), batch_size=1, shuffle=False, num_workers=1, pin_memory=True) # TODO: Batched Beam Search # Lists to store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] references = list() hypotheses = list() #bleu1_list=list() #bleu2_list=list() #bleu3_list=list() #bleu4_list=list() #result=[] # For each image for i, (image, caps, caplens, allcaps) in enumerate( tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))): k = beam_size #print(i) # Move to GPU device, if available image = image.to(device) # (1, 3, 256, 256) # Encode encoder_out = encoder(image) # (1, enc_image_size, enc_image_size, encoder_dim) enc_image_size = encoder_out.size(1) encoder_dim = encoder_out.size(3) # Flatten encoding encoder_out = encoder_out.view(1, -1, encoder_dim) # (1, num_pixels, encoder_dim) num_pixels = encoder_out.size(1) # We'll treat the problem as having a batch size of k encoder_out = encoder_out.expand(k, num_pixels, encoder_dim) # (k, num_pixels, encoder_dim) # Tensor to store top k previous words at each step; now they're just <start> k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device) # (k, 1) # Tensor to store top k sequences; now they're just <start> seqs = k_prev_words # (k, 1) # Tensor to store top k sequences' scores; now they're just 0 top_k_scores = torch.zeros(k, 1).to(device) # (k, 1) # Lists to store completed sequences and scores complete_seqs = list() complete_seqs_scores = list() # Start decoding step = 1 h, c = decoder.init_hidden_state(encoder_out) # s is a number less than or equal to k, because sequences are removed from this process once they hit <end> while True: embeddings = decoder.embedding(k_prev_words).squeeze(1) # (s, embed_dim) awe, _ = decoder.attention(encoder_out, h) # (s, encoder_dim), (s, num_pixels) gate = decoder.sigmoid(decoder.f_beta(h)) # gating scalar, (s, encoder_dim) awe = gate * awe h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c)) # (s, decoder_dim) scores = decoder.fc(h) # (s, vocab_size) scores = F.log_softmax(scores, dim=1) # Add scores = top_k_scores.expand_as(scores) + scores # (s, vocab_size) # For the first step, all k points will have the same scores (since same k previous words, h, c) if step == 1: top_k_scores, top_k_words = scores[0].topk(k, 0, True, True) # (s) else: # Unroll and find top scores, and their unrolled indices top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True) # (s) # Convert unrolled indices to actual indices of scores prev_word_inds = top_k_words / vocab_size # (s) next_word_inds = top_k_words % vocab_size # (s) # Add new words to sequences seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1) # (s, step+1) # Which sequences are incomplete (didn't reach <end>)? 
incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>']] complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds)) # Set aside complete sequences if len(complete_inds) > 0: complete_seqs.extend(seqs[complete_inds].tolist()) complete_seqs_scores.extend(top_k_scores[complete_inds]) k -= len(complete_inds) # reduce beam length accordingly # Proceed with incomplete sequences if k == 0: break seqs = seqs[incomplete_inds] h = h[prev_word_inds[incomplete_inds]] c = c[prev_word_inds[incomplete_inds]] encoder_out = encoder_out[prev_word_inds[incomplete_inds]] top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1) k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1) # Break if things have been going on too long if step > 50: break step += 1 i = complete_seqs_scores.index(max(complete_seqs_scores)) seq = complete_seqs[i] # References img_caps = allcaps[0].tolist() img_captions = list( map(lambda c: [rev_word_map[w] for w in c if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}], img_caps)) # remove <start> and pads references.append(img_captions) #print(allcaps[0]) #break # Hypotheses hypotheses.append([rev_word_map[w] for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}]) #result_dict={} #result_dict['num']=i #result_dict['reference']=img_captions #result_dict['hypotheses']=[rev_word_map[w] for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}] #result.append(result_dict) assert len(references) == len(hypotheses) #bleu1 = sentence_bleu(img_captions,[w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],weights=(1, 0, 0, 0)) #bleu2 = sentence_bleu(img_captions,[w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],weights=(0.5, 0.5, 0, 0)) #bleu3 = sentence_bleu(img_captions,[w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],weights=(0.33, 0.33, 0.33, 0)) #bleu4 = sentence_bleu(img_captions,[w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],weights=(0.25, 0.25, 0.25, 0.25)) #print(bleu1) #bleu1_list.append(bleu1) #bleu2_list.append(bleu2) #bleu3_list.append(bleu3) #bleu4_list.append(bleu4) #print(len(bleu_list)) #print(img_captions) #print([rev_word_map[w] for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}]) #bleu_dict={} #bleu_dict['bleu1']=bleu1_list #bleu_dict['bleu2']=bleu2_list #bleu_dict['bleu3']=bleu3_list #bleu_dict['bleu4']=bleu4_list #print(len(references)) #print(len(hypotheses)) #with open('./result_hypothese.json','w') as f: # json.dump(result,f) #=========matplotlib============ #print(len(bleu_list)) #print(bleu_list) #plt.hist(bleu_list,bins=20,normed=0,facecolor='blue',edgecolor='black') #plt.bar(range(len(bleu_list)),bleu_list,fc='b') #plt.plot(range(len(bleu_list)),bleu_list) #plt.xlim((0,5000)) #plt.ylim((0,1)) #plt.xticks(np.arange(0,5000,1000)) #plt.yticks(np.arange(0,1,0.1)) #plt.xlabel('image') #plt.ylabel('bleu 1 scores') #plt.title('bleu 1') #plt.show() #sns.distplot(bleu1_list) #plt.show() # Calculate BLEU-4 scores bleu4 = corpus_bleu(references, hypotheses,weights=(0.25,0.25,0.25,0.25),emulate_multibleu=True) bleu3 = corpus_bleu(references, hypotheses,weights=(0.33,0.33,0.33,0),emulate_multibleu=True) bleu2 = corpus_bleu(references, hypotheses,weights=(0.5, 0.5, 0, 0),emulate_multibleu=True) bleu1 = corpus_bleu(references, 
hypotheses,weights=(1, 0, 0, 0),emulate_multibleu=True) return bleu4,bleu3,bleu2,bleu1
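# A small self-contained sketch (toy data) of the cumulative weight tuples used above
# to get BLEU-1 through BLEU-4 from the same references and hypotheses. The
# emulate_multibleu flag is omitted here because it is not available in every NLTK
# release.
from nltk.translate.bleu_score import corpus_bleu

refs = [[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']]]
hyps = [['the', 'fast', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']]
for name, weights in [('BLEU-1', (1, 0, 0, 0)),
                      ('BLEU-2', (0.5, 0.5, 0, 0)),
                      ('BLEU-3', (0.33, 0.33, 0.33, 0)),
                      ('BLEU-4', (0.25, 0.25, 0.25, 0.25))]:
    print(name, corpus_bleu(refs, hyps, weights=weights))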
sources = Variable(sources.cuda(), volatile=True) M1.zero_grad() M2.zero_grad() logits = M1(sources, None) logits = torch.max(logits.data.cpu(), 2)[1] logits = [list(x) for x in logits] hyp = [x[:x.index(1)] if 1 in x else x for x in logits] hyp = [[DS1.vocab[x] for x in y] for y in hyp] inters.extend(hyp) sources2 = (hyp, targets) s2, _ = DS2.pad_batch(sources2, targ=False) s2 = Variable(s2.cuda(), volatile=True) logits2 = M2(s2, None) logits2 = torch.max(logits2.data.cpu(), 2)[1] logits2 = [list(x) for x in logits2] hyp2 = [x[:x.index(1)] if 1 in x else x for x in logits2] hyp2 = [[DS2.vocab[x] for x in y] for y in hyp2] hyps.extend(hyp2) refs.extend(targets) bleu = corpus_bleu(refs, hyps, emulate_multibleu=True, smoothing_function=cc.method3) print(bleu) with open(args.savestr + "hyps", 'w') as f: hyps = [' '.join(x) for x in hyps] f.write('\n'.join(hyps)) with open(args.savestr + "refs", 'w') as f: refs = ['\t'.join([' '.join(x) for x in y]) for y in refs] f.write('\n'.join(refs))
def eval(): # Load graph g = Graph(is_training=False) print("Graph loaded") # Load data X, Sources, Targets = load_test_data() cn2idx, idx2cn = load_cn_vocab() en2idx, idx2en = load_en_vocab() # X, Sources, Targets = X[:33], Sources[:33], Targets[:33] # Start session with g.graph.as_default(): # A training helper that checkpoints models and computes summaries. sv = tf.train.Supervisor() with sv.managed_session(config=tf.ConfigProto( allow_soft_placement=True)) as sess: ## Restore parameters sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) print("Restored!") ## Get model name mname = open(hp.logdir + '\checkpoint', 'r').read().split('"')[1] # model name mname = re.findall(r'results(.*)', mname)[0] ## Inference result_trans = hp.logdir + '\\translation' if not os.path.exists(result_trans): os.mkdir(result_trans) with codecs.open(result_trans + mname, "w", "utf-8") as fout: list_of_refs, hypotheses = [], [] if len(X) // hp.batch_size == 0: iteration = 1 else: iteration = len(X) // hp.batch_size for i in range(iteration): if iteration == 1: x = X sources = Sources targets = Targets preds = np.zeros((len(Sources), hp.maxlen), np.int32) else: ### Get mini-batches x = X[i * hp.batch_size:(i + 1) * hp.batch_size] sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size] targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size] ### Autoregressive inference preds = np.zeros((hp.batch_size, hp.maxlen), np.int32) for j in range(hp.maxlen): _preds = sess.run(g.preds, {g.x: x, g.y: preds}) preds[:, j] = _preds[:, j] ### Write to file for source, target, pred in zip(sources, targets, preds): # sentence-wise got = " ".join( idx2en[idx] for idx in pred).split("</S>")[0].strip() fout.write("- source: " + source + "\n") fout.write("- expected: " + target + "\n") fout.write("- got: " + got + "\n\n") fout.flush() # bleu score ref = target.split() hypothesis = got.split() if len(ref) > 3 and len(hypothesis) > 3: list_of_refs.append([ref]) hypotheses.append(hypothesis) ## Calculate bleu score score = corpus_bleu(list_of_refs, hypotheses) fout.write("Bleu Score = " + str(100 * score))
def per_item_dialog_bleu(y_true, y_predicted):
    y_true = (y['text'] for dialog in y_true for y in dialog)
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])
if di != 0: answer += output_lang.index2word[topi.item()] decoder_output, decoder_hidden, decoder_attention = decoder( decoder_input, decoder_hidden, encoder_outputs) topv, topi = decoder_output.topk(1) decoder_input = topi.squeeze().detach() # detach from history as input loss += criterion(decoder_output, target_tensor[di]) if decoder_input.item() == EOS_token: break target = [output_lang.index2word[i.item()] for i in target_tensor] loss.backward() bleu_score = corpus_bleu([[target]], [list(answer)], weights=(1, 0, 0, 0)) target = ''.join(target) # if '!' == target[-1]: # target = target[:-1] ind_acc = 0 # if '!' == target[-1]: # target = target[:-1] # if '?' in answer: # target = target[:target.find('?')] # if '!' == answer[-1]: # answer = answer[:-1] acc = 1 if target == answer else 0
def bleu(y_true, y_predicted):
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])
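# Hedged usage sketch for the helper above (made-up strings): raw references and
# predictions are lower-cased and whitespace-tokenized, and each reference is wrapped
# in its own single-element list before calling corpus_bleu.
from nltk.translate.bleu_score import corpus_bleu

y_true = ['The cat sat on the mat .', 'A man rides a bike .']
y_predicted = ['the cat sat on the mat .', 'a man rides his bike .']
print(corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                  [y_p.lower().split() for y_p in y_predicted]))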
def analyze_decode_results(dataset, decode_results, verbose=True): from lang.py.parse import tokenize_code, de_canonicalize_code # tokenize_code = tokenize_for_bleu_eval import ast assert dataset.count == len(decode_results) f = f_decode = None if verbose: f = open(dataset.name + '.exact_match', 'w') exact_match_ids = [] f_decode = open(dataset.name + '.decode_results.txt', 'w') eid_to_annot = dict() if config.data_type == 'django': for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)): eid_to_annot[raw_id] = line.strip() f_bleu_eval_ref = open(dataset.name + '.ref', 'w') f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w') logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count) cum_oracle_bleu = 0.0 cum_oracle_acc = 0.0 cum_bleu = 0.0 cum_acc = 0.0 sm = SmoothingFunction() all_references = [] all_predictions = [] if all(len(cand) == 0 for cand in decode_results): logging.ERROR('Empty decoding results for the current dataset!') return -1, -1 binned_results_dict = defaultdict(list) def get_binned_key(ast_size): cutoff = 50 if config.data_type == 'django' else 250 k = 10 if config.data_type == 'django' else 25 # for hs if ast_size >= cutoff: return '%d - inf' % cutoff lower = int(ast_size / k) * k upper = lower + k key = '%d - %d' % (lower, upper) return key for eid in range(dataset.count): example = dataset.examples[eid] ref_code = example.code ref_ast_tree = ast.parse(ref_code).body[0] refer_source = astor.to_source(ref_ast_tree).strip() # refer_source = ref_code refer_tokens = tokenize_code(refer_source) cur_example_acc = 0.0 decode_cands = decode_results[eid] if len(decode_cands) == 0: continue decode_cand = decode_cands[0] cid, cand, ast_tree, code = decode_cand code = astor.to_source(ast_tree).strip() # simple_url_2_re = re.compile('_STR:0_', re.)) try: predict_tokens = tokenize_code(code) except: logging.error('error in tokenizing [%s]', code) continue if refer_tokens == predict_tokens: cum_acc += 1 cur_example_acc = 1.0 if verbose: exact_match_ids.append(example.raw_id) f.write('-' * 60 + '\n') f.write('example_id: %d\n' % example.raw_id) f.write(code + '\n') f.write('-' * 60 + '\n') if config.data_type == 'django': ref_code_for_bleu = example.meta_data['raw_code'] pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code']) # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code']) # convert canonicalized code to raw code for literal, place_holder in example.meta_data['str_map'].iteritems(): pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal) # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal) elif config.data_type == 'hs': ref_code_for_bleu = ref_code pred_code_for_bleu = code # we apply Ling Wang's trick when evaluating BLEU scores refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu) pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu) shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu) all_references.append([refer_tokens_for_bleu]) all_predictions.append(pred_tokens_for_bleu) # try: ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu)) bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3) cum_bleu += bleu_score # except: # pass if verbose: print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score) f_decode.write('-' * 60 + '\n') f_decode.write('example_id: %d\n' % example.raw_id) f_decode.write('intent: \n') if 
config.data_type == 'django': f_decode.write(eid_to_annot[example.raw_id] + '\n') elif config.data_type == 'hs': f_decode.write(' '.join(example.query) + '\n') f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n') f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n') f_decode.write('canonicalized reference: \n') f_decode.write(refer_source + '\n') f_decode.write('canonicalized prediction: \n') f_decode.write(code + '\n') f_decode.write('reference code for bleu calculation: \n') f_decode.write(ref_code_for_bleu + '\n') f_decode.write('predicted code for bleu calculation: \n') f_decode.write(pred_code_for_bleu + '\n') f_decode.write('pred_shorter_than_ref: %s\n' % shorter) # f_decode.write('weired: %s\n' % weired) f_decode.write('-' * 60 + '\n') # compute oracle best_bleu_score = 0. cur_oracle_acc = 0. for decode_cand in decode_cands[:config.beam_size]: cid, cand, ast_tree, code = decode_cand try: code = astor.to_source(ast_tree).strip() predict_tokens = tokenize_code(code) if predict_tokens == refer_tokens: cur_oracle_acc = 1. if config.data_type == 'django': pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code']) # convert canonicalized code to raw code for literal, place_holder in example.meta_data['str_map'].iteritems(): pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal) elif config.data_type == 'hs': pred_code_for_bleu = code # we apply Ling Wang's trick when evaluating BLEU scores pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu) ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu)) cand_bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3) if cand_bleu_score > best_bleu_score: best_bleu_score = cand_bleu_score except: continue cum_oracle_bleu += best_bleu_score cum_oracle_acc += cur_oracle_acc ref_ast_size = example.parse_tree.size binned_key = get_binned_key(ref_ast_size) binned_results_dict[binned_key].append((bleu_score, cur_example_acc, best_bleu_score, cur_oracle_acc)) cum_bleu /= dataset.count cum_acc /= dataset.count cum_oracle_bleu /= dataset.count cum_oracle_acc /= dataset.count logging.info('corpus level bleu: %f', corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3)) logging.info('sentence level bleu: %f', cum_bleu) logging.info('accuracy: %f', cum_acc) logging.info('oracle bleu: %f', cum_oracle_bleu) logging.info('oracle accuracy: %f', cum_oracle_acc) keys = sorted(binned_results_dict, key=lambda x: int(x.split(' - ')[0])) Y = [[], [], [], []] X = [] for binned_key in keys: entry = binned_results_dict[binned_key] avg_bleu = np.average([t[0] for t in entry]) avg_acc = np.average([t[1] for t in entry]) avg_oracle_bleu = np.average([t[2] for t in entry]) avg_oracle_acc = np.average([t[3] for t in entry]) print binned_key, avg_bleu, avg_acc, avg_oracle_bleu, avg_oracle_acc, len(entry) Y[0].append(avg_bleu) Y[1].append(avg_acc) Y[2].append(avg_oracle_bleu) Y[3].append(avg_oracle_acc) X.append(int(binned_key.split(' - ')[0])) import matplotlib.pyplot as plt from pylab import rcParams rcParams['figure.figsize'] = 6, 2.5 if config.data_type == 'django': fig, ax = plt.subplots() ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2) # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2) ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2) # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2) ax.set_ylabel('Performance') ax.set_xlabel('Reference AST Size (# nodes)') plt.legend(loc='upper right', ncol=6) 
plt.tight_layout() # plt.savefig('django_acc_ast_size.pdf', dpi=300) # os.system('pcrop.sh django_acc_ast_size.pdf') plt.savefig('django_perf_ast_size.pdf', dpi=300) os.system('pcrop.sh django_perf_ast_size.pdf') else: fig, ax = plt.subplots() ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2) # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2) ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2) # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2) ax.set_ylabel('Performance') ax.set_xlabel('Reference AST Size (# nodes)') plt.legend(loc='upper right', ncol=6) plt.tight_layout() # plt.savefig('hs_bleu_ast_size.pdf', dpi=300) # os.system('pcrop.sh hs_bleu_ast_size.pdf') plt.savefig('hs_perf_ast_size.pdf', dpi=300) os.system('pcrop.sh hs_perf_ast_size.pdf') if verbose: f.write(', '.join(str(i) for i in exact_match_ids)) f.close() f_decode.close() f_bleu_eval_ref.close() f_bleu_eval_hyp.close() return cum_bleu, cum_acc
def validation(beam_size): """ Evaluation Process :param beam_size: beam size at which to generate captions for evaluation :return: BLEU-4 score """ references = list( ) # references (true captions) for calculating BLEU-4 score hypotheses = list() # hypotheses (predictions) start_time = datetime.datetime.now() print("Start training at: ", start_time) for j, (images, captions, caplens, all_caps) in enumerate(val_loader): k = beam_size start = datetime.datetime.now() images = images.to(device) # Forward encoder_out = encoder(images) enc_image_size = encoder_out.size(1) encoder_dim = encoder_out.size(-1) ## Flatten encoding`` encoder_out = encoder_out.view( batch_size, -1, encoder_dim) # (1, num_pixels, encoder_dim) num_pixels = encoder_out.size(1) ## We'll treat the problem as having a batch size of k` encoder_out = encoder_out.expand(k, num_pixels, encoder_dim) ## Tensor to store top k previous words at each step; now they're just <start>` prev_words = torch.LongTensor([[word_map["<start>"]]] * k).to( device) # (k, 1) ## Tensor to store top k sequences; now they're just <start>` seqs = prev_words # (k, 1) ## Tensor to store top k sequences' scores; now they're just 0` seqs_scores = torch.zeros([k, 1]).to(device) # (k, 1) # Initialize lists complete_seqs = [] complete_scores = [] print("start decode") # Decode step = 1 h, c = decoder.init_hidden_state(encoder_out) # (k, decoder_dim) ## Iterate until all k sequences are completed while True: # Compute scores of current k previous words embeddings = decoder.embedding(prev_words).squeeze( 1) # (k, 1, embed_dim) to (k, embed_dim) h, c = decoder.decode_step( embeddings, # (1, embed_dim) (h, c)) # (1, decoder_dim) scores = decoder.fc(decoder.dropout(h)) # (k, vocab_size) scores = F.log_softmax(scores, dim=2) # Add (i.e. 
multiply because of 'log' above) to current scores scores = seqs_scores.expand_as(scores) + scores # Take the maximum k elements in (k * vocab_size) combinations if step == 1: ## Initialize top_scores, top_k_locations = scores[0].topk(k, 0, True, True) else: top_scores, top_k_locations = scores.view(-1).topk( k, 0, True, True) # Row and Column indices of k largest elements top_k_prev_ind = top_k_locations // vocab_size # (k, 1) top_k_next_ind = top_k_locations % vocab_size # (k, 1) # Update sequences seqs = torch.cat( [seqs[top_k_prev_ind], top_k_next_ind.unsqueeze(1)], dim=1) # (k, step+1) # Check whether a sequence is completed comp_seqs_ind = [ j for j, next_word in enumerate(top_k_next_ind) if next_word == word_map["<end>"] ] incomp_seqs_ind = list( set(range(seqs.size(0))) - set(comp_seqs_ind)) # Deal with completed sequences if len(comp_seqs_ind) > 0: complete_seqs.extend(seqs[comp_seqs_ind].tolist()) complete_scores.extend(seqs_scores[comp_seqs_ind]) k -= len(comp_seqs_ind) # reduce beam length # Deal with incomplete sequences if k == 0: break seqs = seqs[incomp_seqs_ind] h = h[top_k_prev_ind[incomp_seqs_ind]] c = c[top_k_prev_ind[incomp_seqs_ind]] encoder_out = encoder_out[top_k_prev_ind[incomp_seqs_ind]] #seqs_scores = seqs_scores[incomp_seqs_ind].unsqueeze(1) seqs_scores = seqs_scores[incomp_seqs_ind] #prev_words = top_k_next_ind[incomp_seqs_ind].unsqueeze(1) prev_words = top_k_next_ind[incomp_seqs_ind] # Break if things have been going on too long if step > 50: break step += 1 max_i = np.argmax(complete_scores) #max_i = complete_scores.index(max(complete_scores)) max_seq = complete_seqs[max_i] ## Store references (true captions), and hypothesis (prediction) for each image ## If for n images, we have n hypotheses, and references a, b, c... for each image, we need - ## references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] # references # all_caps = all_caps[sort_ind] print("reference") for k in range(all_caps.shape[0]): img_caps = all_caps[k].tolist() img_captions = list( map( lambda c: [ w for w in c if w not in {word_map["<start>"], word_map["<pad>"]} ], img_caps)) references.append(img_captions) # hypotheses hypotheses.append([ w for w in max_seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']} ]) assert len(references) == len(hypotheses) ## Compute BLEU-4 Scores bleu4 = corpus_bleu(references, hypotheses) return bleu4
def per_item_bleu(y_true, y_predicted):
    y_predicted = itertools.chain(*y_predicted)
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])
def analyze_decode_results(dataset, decode_results, verbose=True): from lang.py.parse import tokenize_code, de_canonicalize_code # tokenize_code = tokenize_for_bleu_eval import ast assert dataset.count == len(decode_results) f = f_decode = None if verbose: f = open(dataset.name + '.exact_match', 'w') exact_match_ids = [] f_decode = open(dataset.name + '.decode_results.txt', 'w') eid_to_annot = dict() if data_type == 'django': for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)): eid_to_annot[raw_id] = line.strip() f_bleu_eval_ref = open(dataset.name + '.ref', 'w') f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w') logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count) cum_oracle_bleu = 0.0 cum_oracle_acc = 0.0 cum_bleu = 0.0 cum_acc = 0.0 sm = SmoothingFunction() all_references = [] all_predictions = [] if all(len(cand) == 0 for cand in decode_results): logging.ERROR('Empty decoding results for the current dataset!') return -1, -1 binned_results_dict = defaultdict(list) def get_binned_key(ast_size): cutoff = 50 if data_type == 'django' else 250 k = 10 if data_type == 'django' else 25 # for hs if ast_size >= cutoff: return '%d - inf' % cutoff lower = int(ast_size / k) * k upper = lower + k key = '%d - %d' % (lower, upper) return key for eid in range(dataset.count): example = dataset.examples[eid] ref_code = example.code ref_ast_tree = ast.parse(ref_code).body[0] refer_source = astor.to_source(ref_ast_tree).strip() # refer_source = ref_code refer_tokens = tokenize_code(refer_source) cur_example_acc = 0.0 decode_cands = decode_results[eid] if len(decode_cands) == 0: continue decode_cand = decode_cands[0] cid, cand, ast_tree, code = decode_cand code = astor.to_source(ast_tree).strip() # simple_url_2_re = re.compile('_STR:0_', re.)) try: predict_tokens = tokenize_code(code) except: logging.error('error in tokenizing [%s]', code) continue if refer_tokens == predict_tokens: cum_acc += 1 cur_example_acc = 1.0 if verbose: exact_match_ids.append(example.raw_id) f.write('-' * 60 + '\n') f.write('example_id: %d\n' % example.raw_id) f.write(code + '\n') f.write('-' * 60 + '\n') if data_type == 'django': ref_code_for_bleu = example.meta_data['raw_code'] pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code']) # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code']) # convert canonicalized code to raw code for literal, place_holder in example.meta_data['str_map'].iteritems(): pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal) # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal) elif data_type == 'hs': ref_code_for_bleu = ref_code pred_code_for_bleu = code # we apply Ling Wang's trick when evaluating BLEU scores refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu) pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu) shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu) all_references.append([refer_tokens_for_bleu]) all_predictions.append(pred_tokens_for_bleu) # try: ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu)) bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3) cum_bleu += bleu_score # except: # pass if verbose: print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score) f_decode.write('-' * 60 + '\n') f_decode.write('example_id: %d\n' % example.raw_id) f_decode.write('intent: \n') if data_type == 'django': 
f_decode.write(eid_to_annot[example.raw_id] + '\n') elif data_type == 'hs': f_decode.write(' '.join(example.query) + '\n') f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n') f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n') f_decode.write('canonicalized reference: \n') f_decode.write(refer_source + '\n') f_decode.write('canonicalized prediction: \n') f_decode.write(code + '\n') f_decode.write('reference code for bleu calculation: \n') f_decode.write(ref_code_for_bleu + '\n') f_decode.write('predicted code for bleu calculation: \n') f_decode.write(pred_code_for_bleu + '\n') f_decode.write('pred_shorter_than_ref: %s\n' % shorter) # f_decode.write('weired: %s\n' % weired) f_decode.write('-' * 60 + '\n') # compute oracle best_bleu_score = 0. cur_oracle_acc = 0. for ast_tree in decode_results: try: code = astor.to_source(ast_tree).strip() predict_tokens = tokenize_code(code) if predict_tokens == refer_tokens: cur_oracle_acc = 1. if data_type == 'django': pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code']) # convert canonicalized code to raw code for literal, place_holder in example.meta_data['str_map'].iteritems(): pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal) elif data_type == 'hs': pred_code_for_bleu = code # we apply Ling Wang's trick when evaluating BLEU scores pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu) ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu)) cand_bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3) if cand_bleu_score > best_bleu_score: best_bleu_score = cand_bleu_score except: continue cum_oracle_bleu += best_bleu_score cum_oracle_acc += cur_oracle_acc ref_ast_size = example.parse_tree.size binned_key = get_binned_key(ref_ast_size) binned_results_dict[binned_key].append((bleu_score, cur_example_acc, best_bleu_score, cur_oracle_acc)) cum_bleu /= dataset.count cum_acc /= dataset.count cum_oracle_bleu /= dataset.count cum_oracle_acc /= dataset.count logging.info('corpus level bleu: %f', corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3)) logging.info('sentence level bleu: %f', cum_bleu) logging.info('accuracy: %f', cum_acc) logging.info('oracle bleu: %f', cum_oracle_bleu) logging.info('oracle accuracy: %f', cum_oracle_acc) keys = sorted(binned_results_dict, key=lambda x: int(x.split(' - ')[0])) Y = [[], [], [], []] X = [] for binned_key in keys: entry = binned_results_dict[binned_key] avg_bleu = np.average([t[0] for t in entry]) avg_acc = np.average([t[1] for t in entry]) avg_oracle_bleu = np.average([t[2] for t in entry]) avg_oracle_acc = np.average([t[3] for t in entry]) print binned_key, avg_bleu, avg_acc, avg_oracle_bleu, avg_oracle_acc, len(entry) Y[0].append(avg_bleu) Y[1].append(avg_acc) Y[2].append(avg_oracle_bleu) Y[3].append(avg_oracle_acc) X.append(int(binned_key.split(' - ')[0])) import matplotlib.pyplot as plt from pylab import rcParams rcParams['figure.figsize'] = 6, 2.5 if data_type == 'django': fig, ax = plt.subplots() ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2) # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2) ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2) # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2) ax.set_ylabel('Performance') ax.set_xlabel('Reference AST Size (# nodes)') plt.legend(loc='upper right', ncol=6) plt.tight_layout() # plt.savefig('django_acc_ast_size.pdf', dpi=300) # os.system('pcrop.sh django_acc_ast_size.pdf') 
plt.savefig('django_perf_ast_size.pdf', dpi=300) os.system('pcrop.sh django_perf_ast_size.pdf') else: fig, ax = plt.subplots() ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2) # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2) ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2) # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2) ax.set_ylabel('Performance') ax.set_xlabel('Reference AST Size (# nodes)') plt.legend(loc='upper right', ncol=6) plt.tight_layout() # plt.savefig('hs_bleu_ast_size.pdf', dpi=300) # os.system('pcrop.sh hs_bleu_ast_size.pdf') plt.savefig('hs_perf_ast_size.pdf', dpi=300) os.system('pcrop.sh hs_perf_ast_size.pdf') if verbose: f.write(', '.join(str(i) for i in exact_match_ids)) f.close() f_decode.close() f_bleu_eval_ref.close() f_bleu_eval_hyp.close() return cum_bleu, cum_acc
from nltk.translate import bleu_score

# use sentence_bleu to evaluate single-sentence paraphrases.
# reference = known good translation into the destination language
reference = 'The king is staying up all night drinking and dancing'.split(' ')
# hypothesis = system's translation into the destination language
hypothesis1 = 'The king doth wake tonight and takes his rouse'.split(' ')
hypothesis2 = 'The king stays up tonight and takes his rouse'.split(' ')
hypothesis3 = 'The king stays up tonight drinking and dancing'.split(' ')
hypothesis4 = 'The king stays up all night drinking and dancing'.split(' ')

for hyp in [hypothesis1, hypothesis2, hypothesis3, hypothesis4, reference]:
    print(bleu_score.sentence_bleu([reference], hyp))

# use corpus_bleu to evaluate multi-sentence paraphrases.
reference1 = 'the musicians make a ruckus to celebrate his draining another cup.'.split(' ')
hypothesis1_1 = 'The kettle-drum and trumpet thus bray out The triumph of his pledge.'.split(' ')
print(bleu_score.corpus_bleu(
    [[reference], [reference1]],  # list of references for each sentence in the corpus
    [hypothesis1, hypothesis1_1]))  # 1 hypothesis for each sentence in the corpus
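# Hedged follow-on sketch: the same kind of corpus-level call with Chen & Cherry
# smoothing (SmoothingFunction().method1 / .method3), which several snippets in this
# file use to avoid an effectively-zero score when some higher-order n-gram never
# matches.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

chencherry = SmoothingFunction()
refs = [[['the', 'king', 'is', 'staying', 'up', 'all', 'night']]]
hyps = [['the', 'king', 'stays', 'up', 'tonight']]
print(corpus_bleu(refs, hyps))  # effectively zero without smoothing (NLTK warns about 0 n-gram overlaps)
print(corpus_bleu(refs, hyps, smoothing_function=chencherry.method1))
print(corpus_bleu(refs, hyps, smoothing_function=chencherry.method3))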
def evaluate_decode_results(data_type, dataset, decode_results, verbose=True): from lang.py.parse import tokenize_code, de_canonicalize_code # tokenize_code = tokenize_for_bleu_eval import ast assert dataset.count == len(decode_results) f = f_decode = None if verbose: f = open(dataset.name + '.exact_match', 'w') exact_match_ids = [] f_decode = open(dataset.name + '.decode_results.txt', 'w') eid_to_annot = dict() if data_type == 'django': for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)): eid_to_annot[raw_id] = line.strip() f_bleu_eval_ref = open(dataset.name + '.ref', 'w') f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w') f_generated_code = open(dataset.name + '.geneated_code', 'w') logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count) cum_oracle_bleu = 0.0 cum_oracle_acc = 0.0 cum_bleu = 0.0 cum_acc = 0.0 sm = SmoothingFunction() all_references = [] all_predictions = [] for eid in range(dataset.count): example = dataset.examples[eid] ref_code = example.code ref_ast_tree = ast.parse(ref_code).body[0] refer_source = astor.to_source(ref_ast_tree).strip() # refer_source = ref_code refer_tokens = tokenize_code(refer_source) cur_example_correct = False ast_tree = decode_results[eid] code = astor.to_source(ast_tree).strip() # simple_url_2_re = re.compile('_STR:0_', re.)) try: predict_tokens = tokenize_code(code) except: logging.error('error in tokenizing [%s]', code) continue if refer_tokens == predict_tokens: cum_acc += 1 cur_example_correct = True if verbose: exact_match_ids.append(example.raw_id) f.write('-' * 60 + '\n') f.write('example_id: %d\n' % example.raw_id) f.write(code + '\n') f.write('-' * 60 + '\n') if data_type == 'django': ref_code_for_bleu = example.meta_data['raw_code'] pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code']) # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code']) # convert canonicalized code to raw code for literal, place_holder in example.meta_data['str_map'].iteritems(): pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal) # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal) elif data_type == 'hs': ref_code_for_bleu = ref_code pred_code_for_bleu = code # we apply Ling Wang's trick when evaluating BLEU scores refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu) pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu) # The if-chunk below is for debugging purpose, sometimes the reference cannot match with the prediction # because of inconsistent quotes (e.g., single quotes in reference, double quotes in prediction). # However most of these cases are solved by cannonicalizing the reference code using astor (parse the reference # into AST, and regenerate the code. Use this regenerated one as the reference) weired = False if refer_tokens_for_bleu == pred_tokens_for_bleu and refer_tokens != predict_tokens: # cum_acc += 1 weired = True elif refer_tokens == predict_tokens: # weired! 
# weired = True pass shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu) all_references.append([refer_tokens_for_bleu]) all_predictions.append(pred_tokens_for_bleu) # try: ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu)) bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3) cum_bleu += bleu_score # except: # pass if verbose: print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score) f_decode.write('-' * 60 + '\n') f_decode.write('example_id: %d\n' % example.raw_id) f_decode.write('intent: \n') if data_type == 'django': f_decode.write(eid_to_annot[example.raw_id] + '\n') elif data_type == 'hs': f_decode.write(' '.join(example.query) + '\n') f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n') f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n') f_decode.write('canonicalized reference: \n') f_decode.write(refer_source + '\n') f_decode.write('canonicalized prediction: \n') f_decode.write(code + '\n') f_decode.write('reference code for bleu calculation: \n') f_decode.write(ref_code_for_bleu + '\n') f_decode.write('predicted code for bleu calculation: \n') f_decode.write(pred_code_for_bleu + '\n') f_decode.write('pred_shorter_than_ref: %s\n' % shorter) f_decode.write('weired: %s\n' % weired) f_decode.write('-' * 60 + '\n') # for Hiro's evaluation f_generated_code.write(pred_code_for_bleu.replace('\n', '#NEWLINE#') + '\n') # compute oracle best_score = 0. cur_oracle_acc = 0. for ast_tree in decode_results: try: code = astor.to_source(ast_tree).strip() predict_tokens = tokenize_code(code) if predict_tokens == refer_tokens: cur_oracle_acc = 1 if data_type == 'django': pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code']) # convert canonicalized code to raw code for literal, place_holder in example.meta_data['str_map'].iteritems(): pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal) elif data_type == 'hs': pred_code_for_bleu = code # we apply Ling Wang's trick when evaluating BLEU scores pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu) ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu)) bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3) if bleu_score > best_score: best_score = bleu_score except: continue cum_oracle_bleu += best_score cum_oracle_acc += cur_oracle_acc cum_bleu /= dataset.count cum_acc /= dataset.count cum_oracle_bleu /= dataset.count cum_oracle_acc /= dataset.count logging.info('corpus level bleu: %f', corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3)) logging.info('sentence level bleu: %f', cum_bleu) logging.info('accuracy: %f', cum_acc) logging.info('oracle bleu: %f', cum_oracle_bleu) logging.info('oracle accuracy: %f', cum_oracle_acc) if verbose: f.write(', '.join(str(i) for i in exact_match_ids)) f.close() f_decode.close() f_bleu_eval_ref.close() f_bleu_eval_hyp.close() f_generated_code.close() print cum_bleu, cum_acc return cum_bleu, cum_acc
for line in f: vocab.append(line.rstrip('\n')) f.close() stringSentences = [] for sentence in sentences: stringSentence = [] for wordIndex in sentence: stringSentence.append(vocab[wordIndex-1]) stringSentences.append(stringSentence) #stringSentences = stringSentences[:1790] + stringSentences[2100:] #hypotheses = hypotheses[:1790] + hypotheses[2100:] stringSentencesDev = stringSentences[2000:] hypothesesDev = hypotheses[2000:] stringSentencesTest = stringSentences[2000:] hypothesesTest = hypotheses[2000:] """ scores = [] for i,j in zip(stringSentences, hypotheses): references = [i] scores.append(bleu_score.sentence_bleu(references, j)) average = np.average(np.array(scores)) #print bleu_score.corpus_bleu(stringSentences, hypotheses) print average """ print bleu_score.corpus_bleu(stringSentencesDev, hypothesesDev)
args = parser.parse_args()

wer_score_file = os.path.dirname(args.input_1) + "/" + os.path.splitext(
    os.path.basename(args.input_1))[0] + "_evaluated_WER_and_BLEU.txt"
hypotheses = args.input_1
target = args.input_2

import nltk
from nltk.translate.bleu_score import corpus_bleu

corpus_tokenized = [s.split() for s in reference_input]

references = [[['this', 'is', 'a', 'test'], ['this', 'is', 'test']]]
candidates = [['this', 'is', 'a', 'test']]
score = corpus_bleu(references, candidates)
print(score)


def levenshtein(src, trg, sub_cost=1.0, del_cost=1.0, ins_cost=1.0, randomize=True):
    DEL, INS, KEEP, SUB = range(4)
    op_names = 'delete', 'insert', 'keep', 'sub'
    costs = np.zeros((len(trg) + 1, len(src) + 1))
    ops = np.zeros((len(trg) + 1, len(src) + 1), dtype=np.int32)
def get_bleu(references, hypotheses):
    # compute BLEU
    bleu_score = corpus_bleu([[ref[1:-1]] for ref in references],
                             [hyp[1:-1] for hyp in hypotheses])
    return bleu_score
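# Toy call illustrating the helper above, assuming (as a guess about the data format)
# that every sequence is framed by boundary markers which the [1:-1] slices drop.
from nltk.translate.bleu_score import corpus_bleu

references = [['<s>', 'he', 'reads', 'a', 'good', 'book', '</s>']]
hypotheses = [['<s>', 'he', 'reads', 'a', 'good', 'book', '</s>']]
print(corpus_bleu([[ref[1:-1]] for ref in references],
                  [hyp[1:-1] for hyp in hypotheses]))  # 1.0 once the markers are stripped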
def evaluate(sess, dataloader, model, ksave_dir, mode='valid'): if mode == 'valid': # texts_path = "original_data/valid.summary" texts_path = "processed_data/valid/valid.box.val" gold_path = gold_path_valid evalset = dataloader.dev_set else: # texts_path = "original_data/test.summary" texts_path = "processed_data/test/test.box.val" gold_path = gold_path_test evalset = dataloader.test_set # for copy words from the infoboxes texts = open(texts_path, 'rt', encoding="UTF8").read().strip().split('\n') texts = [list(t.strip().split()) for t in texts] v = Vocab() # with copy pred_list, pred_list_copy, gold_list = [], [], [] pred_unk, pred_mask = [], [] k = 0 for x in dataloader.batch_iter(evalset, FLAGS.batch_size, False): predictions, atts = model.generate(x, sess) atts = np.squeeze(atts) idx = 0 for summary in np.array(predictions): with open(pred_path + str(k), 'w', -1, "utf-8") as sw: summary = list(summary) if 2 in summary: summary = summary[:summary. index(2)] if summary[0] != 2 else [2] real_sum, unk_sum, mask_sum = [], [], [] for tk, tid in enumerate(summary): if tid == 3: sub = texts[k][np.argmax(atts[tk, :len(texts[k]), idx])] real_sum.append(sub) mask_sum.append("**" + str(sub) + "**") else: real_sum.append(v.id2word(tid)) mask_sum.append(v.id2word(tid)) unk_sum.append(v.id2word(tid)) sw.write(" ".join([str(x) for x in real_sum]) + '\n') pred_list.append([str(x) for x in real_sum]) pred_unk.append([str(x) for x in unk_sum]) pred_mask.append([str(x) for x in mask_sum]) k += 1 idx += 1 write_word(pred_mask, ksave_dir, mode + "_summary_copy.txt") write_word(pred_unk, ksave_dir, mode + "_summary_unk.txt") for tk in range(k): with open(gold_path + str(tk), 'r', -1, "utf-8") as g: gold_list.append([g.read().strip().split()]) gold_set = [[gold_path + str(i)] for i in range(k)] pred_set = [pred_path + str(i) for i in range(k)] # recall_tmp, precision_tmp, F_measure_tmp = [],[],[] # scorer = rouge_scorer.RougeScorer(['rouge1']) # for i in range(len(pred_set)) : # pred = open(pred_set[i], "rt", encoding="UTF8") # pred_lines = pred.readlines() # gold = open(gold_set[i][0], "rt", encoding="UTF8") # gold_lines = gold.readlines() # scores = scorer.score(pred_lines[0], gold_lines[0]) # result = list(scores.values()) # recall_tmp.append(result[0][1]) # precision_tmp.append(result[0][0]) # F_measure_tmp.append(result[0][2]) # recall = np.mean(recall_tmp) # precision = np.mean(precision_tmp) # F_measure = np.mean(F_measure_tmp) F_measure1_tmp, F_measure2_tmp, F_measure3_tmp = [], [], [] scorer1 = rouge_scorer.RougeScorer(['rouge1']) scorer2 = rouge_scorer.RougeScorer(['rouge2']) scorer3 = rouge_scorer.RougeScorer(['rouge3']) for i in range(len(pred_set)): pred = open(pred_set[i], "rt", encoding="UTF8") pred_lines = pred.readlines() gold = open(gold_set[i][0], "rt", encoding="UTF8") gold_lines = gold.readlines() scores1 = scorer1.score(pred_lines[0], gold_lines[0]) scores2 = scorer2.score(pred_lines[0], gold_lines[0]) scores3 = scorer3.score(pred_lines[0], gold_lines[0]) result1 = list(scores1.values()) result2 = list(scores2.values()) result3 = list(scores3.values()) F_measure1_tmp.append(result1[0][2]) F_measure2_tmp.append(result2[0][2]) F_measure3_tmp.append(result3[0][2]) F_measure1 = np.mean(F_measure1_tmp) F_measure2 = np.mean(F_measure2_tmp) F_measure3 = np.mean(F_measure3_tmp) bleu = corpus_bleu(gold_list, pred_list) # copy_result = "with copy F_measure: %s Recall: %s Precision: %s BLEU: %s\n" % \ # (str(F_measure), str(recall), str(precision), str(bleu)) copy_result = "with copy F_measure of 
ROUGE1: %s ROUGE2: %s ROUGE3: %s BLEU: %s\n" % \ (str(F_measure1), str(F_measure2), str(F_measure3), str(bleu)) # print copy_result # for tk in range(k): # with open(pred_path + str(tk), 'w', -1 ,"utf-8") as sw: # sw.write(" ".join(pred_unk[tk]) + '\n') # bleu = corpus_bleu(gold_list, pred_unk) # # nocopy_result = "without copy F_measure: %s Recall: %s Precision: %s BLEU: %s\n" % \ # # (str(F_measure), str(recall), str(precision), str(bleu)) # nocopy_result = "without copy F_measure of ROUGE1: %s ROUGE2: %s ROUGE3: %s BLEU: %s\n" % \ # (str(F_measure1), str(F_measure2), str(F_measure3), str(bleu)) # print nocopy_result result = copy_result #+ nocopy_result # print result if mode == 'valid': print(result) # wandb.log({'F_measure1' : F_measure1, 'F_measure2' : F_measure2, 'F_measure3' : F_measure3, 'BLEU' : bleu}) return result
def generator_test_max_example(self, positive_dir, negative_dir, num_batch): self.temp_positive_dir = positive_dir self.temp_negative_dir = negative_dir if not os.path.exists(self.temp_positive_dir): os.mkdir(self.temp_positive_dir) if not os.path.exists(self.temp_negative_dir): os.mkdir(self.temp_negative_dir) shutil.rmtree(self.temp_negative_dir) shutil.rmtree(self.temp_positive_dir) if not os.path.exists(self.temp_positive_dir): os.mkdir(self.temp_positive_dir) if not os.path.exists(self.temp_negative_dir): os.mkdir(self.temp_negative_dir) counter = 0 batches = self.test_batches step = 0 list_hop = [] list_ref = [] while step < num_batch: batch = batches[step] step += 1 decode_result = self._model.max_generator(self._sess, batch) #decode_result = self._model.run_eval_given_step(self._sess, self.batches[self.current_batch]) for i in range(FLAGS.batch_size): decoded_words_all = [] original_review = batch.original_review_output[i] for j in range(FLAGS.max_dec_sen_num): output_ids = [int(t) for t in decode_result['generated'][i][j]][1:] decoded_words = data.outputids2words(output_ids, self._vocab, None) # Remove the [STOP] token from decoded_words, if necessary try: fst_stop_idx = decoded_words.index(data.STOP_DECODING) # index of the (first) [STOP] symbol decoded_words = decoded_words[:fst_stop_idx] except ValueError: decoded_words = decoded_words if len(decoded_words)<2: continue if len(decoded_words_all)>0: new_set1 =set(decoded_words_all[len(decoded_words_all)-1].split()) new_set2= set(decoded_words) if len(new_set1 & new_set2) > 0.5 * len(new_set2): continue decoded_output = ' '.join(decoded_words).strip() # single string decoded_words_all.append(decoded_output) decoded_words_all = ' '.join(decoded_words_all).strip() try: fst_stop_idx = decoded_words_all.index( data.STOP_DECODING_DOCUMENT) # index of the (first) [STOP] symbol decoded_words_all = decoded_words_all[:fst_stop_idx] except ValueError: decoded_words_all = decoded_words_all decoded_words_all = decoded_words_all.replace("[UNK] ", "") decoded_words_all = decoded_words_all.replace("[UNK]", "") decoded_words_all, _ = re.subn(r"(! ){2,}", "! ", decoded_words_all) decoded_words_all, _ = re.subn(r"(\. ){2,}", ". ", decoded_words_all) self.write_negtive_temp_to_json(original_review, decoded_words_all, counter) list_ref.append([nltk.word_tokenize(original_review)]) list_hop.append(nltk.word_tokenize(decoded_words_all)) counter += 1 # this is how many examples we've decoded '''self.current_batch +=1 if self.current_batch >= len(self.batches): self.current_batch = 0''' bleu_score = corpus_bleu(list_ref, list_hop) tf.logging.info('bleu: ' + str(bleu_score)) eva = Evaluate() eva.diversity_evaluate(negative_dir + "/*")
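# Hedged sketch of the tokenize-then-score pattern above: raw strings are split into
# word tokens with nltk.word_tokenize (assumes the NLTK tokenizer data has been
# downloaded) before computing corpus BLEU. The review texts are invented examples.
import nltk
from nltk.translate.bleu_score import corpus_bleu

originals = ['The service was friendly and fast.', 'The room was small but clean.']
generated = ['The service was friendly and quick.', 'The room was clean but small.']
list_ref = [[nltk.word_tokenize(o)] for o in originals]
list_hop = [nltk.word_tokenize(g) for g in generated]
print(corpus_bleu(list_ref, list_hop))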
scores_ = scores.view(-1, max(decode_lens), scores.size(-1)) recon_scores = torch.argmax(scores_, -1) # Convert to text for bleu score run_preprocess = \ lambda x: remove_tokens(x, [SOS_TOKEN, EOS_TOKEN, PAD_TOKEN]) all_recon_captions.extend( [run_preprocess(sent) \ for sent in tensor2text(recon_scores, train_vocab)]) # Note (BP): Wrap in all_gold_captions.extend( [[run_preprocess(sent)] \ for sent in tensor2text(captions_sorted, test_vocab)]) pbar.update() pbar.close() # Bleu score bleu_score = corpus_bleu(all_gold_captions, all_recon_captions) logging.info("Corpus bleu:\t{}".format(round(bleu_score, 4))) # Attention plot # Loss plot if args.create_losses_plot: losses_fp = os.path.join(args.model_dir, 'losses.csv') df_losses = pd.read_csv(losses_fp) sns.lineplot(x="epochs", y="val", hue="typ", data=df_losses) losses_out_fp = os.path.join(args.model_dir, "loss.png") logging.info("Saving losses plot to {}".format(losses_out_fp)) plt.savefig(losses_out_fp)