def language_eval_excoco(predictions, predictions_bleu, sents_label_eval, loader):
    Scorer = CiderD()
    Bleu_scorer = Bleu(4)
    METEOR_scorer = Meteor()
    ROUGE_scorer = Rouge()

    c_score, _ = Scorer.compute_score(sents_label_eval, predictions)
    b_score, _ = Bleu_scorer.compute_score(sents_label_eval, predictions_bleu)
    m_score, _ = METEOR_scorer.compute_score(sents_label_eval, predictions_bleu)
    r_score, _ = ROUGE_scorer.compute_score(sents_label_eval, predictions_bleu)

    print('Evaluating {} samples'.format(len(predictions)))
    print('Bleu_1 : ' + str(b_score[0]))
    print('Bleu_2 : ' + str(b_score[1]))
    print('Bleu_3 : ' + str(b_score[2]))
    print('Bleu_4 : ' + str(b_score[3]))
    print('METEOR : ' + str(m_score))
    print('ROUGE_L : ' + str(r_score))
    print('CIDEr : ' + str(c_score))

    lang_stat = {}
    lang_stat['BLEU_1'] = b_score[0]
    lang_stat['BLEU_2'] = b_score[1]
    lang_stat['BLEU_3'] = b_score[2]
    lang_stat['BLEU_4'] = b_score[3]
    lang_stat['METEOR'] = m_score
    lang_stat['ROUGE_L'] = r_score
    lang_stat['CIDEr'] = c_score
    return lang_stat
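# Why two prediction arguments: CiderD (the consensus-based CIDEr variant used
# in self-critical training codebases) takes its results as a list of
# {'image_id', 'caption'} records, while Bleu/METEOR/ROUGE take an id-keyed
# dict of one-hypothesis lists. A minimal, hedged sketch of the two layouts
# the function above appears to expect (shapes assumed, not taken from the
# surrounding codebase):
sents_label_eval = {0: ['a man rides a horse', 'a person on a horse']}   # references per id
predictions = [{'image_id': 0, 'caption': ['a man is riding a horse']}]  # CiderD-style
predictions_bleu = {0: ['a man is riding a horse']}                      # Bleu/METEOR/ROUGE-style
# lang_stat = language_eval_excoco(predictions, predictions_bleu,
#                                  sents_label_eval, loader=None)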
def test(model, dataloader, args):
    scorer = Bleu(4)
    m_scorer = Meteor()
    r_scorer = Rouge()
    hyp = []
    ref = []
    model.eval()
    gold_file = open('tmp_gold.txt', 'w')
    pred_file = open('tmp_pred.txt', 'w')
    with tqdm(dataloader, desc='Test ', mininterval=1) as tq:
        for batch in tq:
            with torch.no_grad():
                seq = model(batch, beam_size=args.beam_size)
            r = write_txt(batch, batch['tgt_text'], gold_file, args)
            h = write_txt(batch, seq, pred_file, args)
            hyp.extend(h)
            ref.extend(r)
    hyp = dict(zip(range(len(hyp)), hyp))
    ref = dict(zip(range(len(ref)), ref))
    print(hyp[0], ref[0])
    print('BLEU INP', len(hyp), len(ref))
    print('BLEU', scorer.compute_score(ref, hyp)[0])
    print('METEOR', m_scorer.compute_score(ref, hyp)[0])
    print('ROUGE_L', r_scorer.compute_score(ref, hyp)[0])
    gold_file.close()
    pred_file.close()
def test(model_path='models/model-61', video_feat_path=video_feat_path):
    train_data, test_data = get_video_data(video_data_path, video_feat_path, train_ratio=0.7)
    test_videos = test_data['video_path'].values
    test_captions = test_data['Description'].values
    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    # Group consecutive rows for the same video, collecting all of its
    # reference captions into one list.
    test_videos_unique = list()
    test_captions_list = list()
    for (video, caption) in zip(test_videos, test_captions):
        if len(test_videos_unique) == 0 or test_videos_unique[-1] != video:
            test_videos_unique.append(video)
            test_captions_list.append([caption])
        else:
            test_captions_list[-1].append(caption)

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_embed=dim_embed,
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        encoder_max_sequence_length=encoder_step,
        decoder_max_sentence_length=decoder_step,
        bias_init_vector=None)

    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    scorer = Meteor()
    scorer_bleu = Bleu(4)
    GTS = defaultdict(list)
    RES = defaultdict(list)
    counter = 0
    for (video_feat_path, caption) in zip(test_videos_unique, test_captions_list):
        generated_sentence = gen_sentence(
            sess, video_tf, video_mask_tf, caption_tf, video_feat_path, ixtoword)
        print(video_feat_path, generated_sentence)
        #print caption
        GTS[str(counter)] = [{'image_id': str(counter), 'cap_id': i, 'caption': s}
                             for i, s in enumerate(caption)]
        RES[str(counter)] = [{'image_id': str(counter),
                              'caption': generated_sentence[:-2] + '.'}]
        #GTS[video_feat_path] = caption
        #RES[video_feat_path] = [generated_sentence[:-2] + '.']
        counter += 1

    #ipdb.set_trace()
    tokenizer = PTBTokenizer()
    GTS = tokenizer.tokenize(GTS)
    RES = tokenizer.tokenize(RES)
    score, scores = scorer.compute_score(GTS, RES)
    print("METEOR", score)
    score, scores = scorer_bleu.compute_score(GTS, RES)
    print("BLEU", score)
def _define_metrics(gts, res):
    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)
    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)
    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)
    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)
    for i in range(4):
        bleu[i] = round(bleu[i], 4)
    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
def eval(result_gts_path, result_res_path):
    with open(result_gts_path, 'r') as file:
        gts_dict = json.load(file)
    with open(result_res_path, 'r') as file:
        res_dict = json.load(file)

    bleu_score = Bleu(n=4)
    bleu, _ = bleu_score.compute_score(gts=gts_dict, res=res_dict)
    meteor_score = Meteor()
    meteor, _ = meteor_score.compute_score(gts=gts_dict, res=res_dict)
    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts_dict, res=res_dict)
    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts_dict, res=res_dict)
    return bleu, meteor, rouge, cider
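# The eval() helper above assumes both JSON files are already keyed the way
# the pycocoevalcap scorers expect: every key maps to a list of plain
# strings, with exactly one hypothesis per key on the res side, and the two
# files sharing the same key set. A minimal sketch with hypothetical contents:
import json

gts = {"391895": ["a man riding a motorcycle", "a person on a motorbike"]}
res = {"391895": ["a man rides a motorcycle"]}  # one hypothesis per key

with open('result_gts.json', 'w') as f:
    json.dump(gts, f)
with open('result_res.json', 'w') as f:
    json.dump(res, f)

# bleu, meteor, rouge, cider = eval('result_gts.json', 'result_res.json')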
def calculate_metric(rnn, meteor=None):
    gts = {}
    res = {}
    lp_avg = 0.0
    lp_c = 0
    for idx in range(rnn.V_valid.shape[0]):
        iid = rnn.Id_valid[idx]
        if iid not in gts:
            gts[iid] = []
        #gts[iid].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1]))
        gts[iid] = [' '.join(rnn.dp.tokens[i][::-1])
                    for i in rnn.dp.img_id_to_tokens[iid]]
        if iid in res:
            continue
        res[iid] = []
        #pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX))
        (lp, pos_sen) = decoder_beamsearch(rnn, rnn.V_valid[idx], senti=1.0, beam_size=1)
        pos_sen = pos_sen[:-1]
        print(' '.join(pos_sen[::-1]))
        res[iid].append(' '.join(pos_sen[::-1]))
        lp_avg += np.exp(lp)
        lp_c += 1
    lp_avg /= float(lp_c)
    # return lp_avg  # returning here would make the metric computation below unreachable

    bleu = Bleu()
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    if meteor is None:
        meteor = Meteor()
    print("Meteor:")
    mscore = meteor.compute_score(gts, res)[0]
    print("Positive:", mscore)
    return mscore
def check_meteor_works():
    try:
        met = Meteor()
    except (AttributeError, FileNotFoundError) as e:
        print(f"Meteor couldn't start due to {e}")
        met = None
    gts = {
        "datapoint1": ["hello my name is", "meteor test program"],
        "datapoint2": ["another test sentence", "this the end of the test."]
    }
    refs = {
        "datapoint1": ["is my name really meteor"],
        "datapoint2": ["probably another test sentence"]
    }
    try:
        output = met.compute_score(gts, refs)
    except (ValueError, FileNotFoundError, AttributeError) as e:
        print(f"{e.__class__.__name__}: {e}")
        if met is not None:
            # Release the scorer's internal lock so later calls don't deadlock.
            met.lock.release()
        return False
    print(output)
    return True
def coco_caption_metrics(predictions_list, image_id_list,
                         vocabulary_path='data/vocabulary.json',
                         max_caption_length=25, batch_size=32, is_training=True):
    with open(vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {}
    for i in range(len(vocabulary_list)):
        word2id[vocabulary_list[i]] = i
    id2word = {v: k for k, v in word2id.items()}

    with open('data/captions_gt.json', 'r') as file:
        captions_gt_dict = json.load(file)

    gts = {}
    res = {}
    for i in range(len(predictions_list)):
        for j in range(batch_size):
            # Decode the predicted word ids back into words.
            sen_input, sen_ground_truth = [], []
            for k in range(max_caption_length):
                id_input = int(predictions_list[i][k][j])
                sen_input.append(id2word[id_input])
            # Cut the sentence at the end-of-sentence token.
            sen_pre = []
            for n in range(max_caption_length):
                word = sen_input[n]
                if word != '</S>':
                    sen_pre.append(word)
                else:
                    break
            str_input = ' '.join(sen_pre)

            image_id = image_id_list[i][j][0]
            # print(image_id)
            res[image_id] = [str_input]
            gts[image_id] = captions_gt_dict[str(image_id)]

    if not is_training:
        # for key in gts.keys():
        #     str_input = res[key]
        #     str_grundtruth = gts[key]
        #     print(key)
        #     print(str_input)
        #     print(str_grundtruth)
        #     print('*' * 100)
        with open('data/result/result_res.json', 'w') as file:
            json.dump(res, file)
        with open('data/result/result_gts.json', 'w') as file:
            json.dump(gts, file)
        # print('result.json get success')

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)
    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)
    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)
    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)
    for i in range(4):
        bleu[i] = round(bleu[i], 4)
    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
class RealTransformer(nn.Module):
    def __init__(self, d_model, encoder, vocab_trg, d_hidden=2048,
                 n_layers=6, n_heads=8, drop_ratio=0.1):
        super().__init__()
        # self.encoder = Encoder(d_model, d_hidden, n_vocab_src, n_layers,
        #                        n_heads, drop_ratio)
        self.encoder = encoder
        self.decoder = Decoder(d_model, d_hidden, vocab_trg, n_layers,
                               n_heads, drop_ratio)
        self.n_layers = n_layers
        self.tokenizer = PTBTokenizer()

    # Convert indices back into words and strip the special tokens.
    def denum(self, data):
        return ' '.join(self.decoder.vocab.itos[i] for i in data).replace(
            ' <eos>', '').replace(' <pad>', '').replace(' .', '').replace('  ', ' ')

    # x: (5, 480, 1024)  s: (5, 20)  x_mask: (5, 480, 1)
    def forward(self, x, s, x_mask=None, sample_prob=0):
        encoding = self.encoder(x, x_mask)  # [(5,480,1024), (5,480,1024)]
        max_sent_len = 20
        if not self.training:
            if isinstance(s, list):
                hiddens, _ = self.decoder.greedy(encoding, max_sent_len)
                h = hiddens[-1]
                targets = None
            else:
                h = self.decoder(s[:, :-1].contiguous(), encoding)
                targets, h = mask(s[:, 1:].contiguous(), h)
            logits = self.decoder.out(h)
        else:
            if sample_prob == 0:
                # (5,19), [(5,480,1024),(5,480,1024)] --> (5,19,1024)
                h = self.decoder(s[:, :-1].contiguous(), encoding)
                # Use the mask to drop <pad> positions in the sentence and
                # gather the corresponding features.
                targets, h = mask(s[:, 1:].contiguous(), h)  # targets: (63)  h: (63,1024)
                logits = self.decoder.out(h)
            else:
                model_pred = self.decoder.sampling(encoding, s, s.size(1) - 2,
                                                   sample_prob, is_argmax=True)
                model_pred.detach_()
                new_y = torch.cat((Variable(
                    model_pred.data.new(s.size(0), 1).long().fill_(
                        self.decoder.vocab.stoi['<init>'])), model_pred), 1)
                h = self.decoder(new_y, encoding)
                targets, h = mask(s[:, 1:].contiguous(), h)
                logits = self.decoder.out(h)
        return logits, targets  # (63,24) / (63)

    # x: (91, 480, 1024)
    # x_mask: (91, 480, 1)
    # T: 20
    def greedy(self, x, x_mask, T):
        encoding = self.encoder(x, x_mask)  # [(91,480,1024), (91,480,1024)]
        _, pred = self.decoder.greedy(encoding, T)  # (91, 20)
        sent_lst = []
        for i in range(pred.data.size(0)):
            sent_lst.append(self.denum(pred.data[i]))
        return sent_lst  # (91, 20)

    # ----------------------------------- scst_loss -----------------------------------#
    """
    scst_loss indicates self-critical sequence training (as in
    https://arxiv.org/abs/1612.00563). We didn't report results w/ this
    training loss and hence it's deprecated.
    Still, we keep this option out there in case people need it (you might
    need to upgrade some of the code to pytorch 0.4).
    """
    def scst(self, x, x_mask, s):
        self.scorer = Meteor()
        encoding = self.encoder(x, x_mask)

        # greedy part
        _, pred = self.decoder.greedy(encoding, s.size(1) - 1)
        pred_greedy = []
        for i in range(pred.data.size(0)):
            pred_greedy.append(self.denum(pred.data[i]))
        del pred

        # sampling part
        model_pred = self.decoder.sampling(encoding, s, s.size(1) - 2,
                                           sample_prob=1, is_argmax=False)
        model_pred.detach_()
        new_y = torch.cat((Variable(
            model_pred.data.new(s.size(0), 1).long().fill_(
                self.decoder.vocab.stoi['<init>'])), model_pred), 1)
        h = self.decoder(new_y, encoding)
        B, T, H = h.size()
        logits = self.decoder.out(h.view(-1, H))  # .view(B, T, -1)
        mask = (s[:, 1:] != 1).float()
        _, pred_sample = torch.max(logits, -1)
        p_model = F.log_softmax(logits, dim=-1)
        logp = p_model[torch.arange(0, B * T).type(logits.data.type()).long(),
                       pred_sample.data].view(B, T)
        pred_sample = pred_sample.view(B, T)
        assert pred_sample.size(0) == len(pred_greedy), (
            'pred_sample should have the same number of sentences as in '
            'pred_greedy, got {} and {} instead'.format(B, len(pred_greedy)))
        assert pred_sample.size() == (B, T), 'unexpected pred_sample size'
        pred_sample.detach_()

        # rewards
        sentence_greedy, sentence_sample, sentence_gt = {}, {}, {}
        for i in range(len(pred_greedy)):
            sentence_greedy[i] = [{'caption': pred_greedy[i]}]
            sentence_sample[i] = [{'caption': self.denum(pred_sample.data[i])}]
            sentence_gt[i] = [{'caption': self.denum(s.data[i, 1:])}]
        tok_greedy = self.tokenizer.tokenize(sentence_greedy)
        tok_sample = self.tokenizer.tokenize(sentence_sample)
        tok_gt = self.tokenizer.tokenize(sentence_gt)
        _, r_greedy = self.scorer.compute_score(tok_gt, tok_greedy)
        _, r_sample = self.scorer.compute_score(tok_gt, tok_sample)
        # Note the unpacking order: r_s iterates over r_greedy and r_g over
        # r_sample, so r_diff[i] = reward(greedy) - reward(sample).
        r_diff = [r_s - r_g for (r_s, r_g) in zip(r_greedy, r_sample)]
        r_diff = Variable(torch.Tensor(r_diff).type(logp.data.type()))
        loss = -torch.mean(torch.sum(r_diff.view(-1, 1) * logp * mask, 1))
        return loss
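# The scst() method above wires METEOR into a self-critical baseline: the
# greedy rollout's reward acts as the baseline for the sampled rollout. A
# generic sketch of that reward-difference idea on plain numbers (this is the
# textbook SCST sign convention, not the repo's exact code; as noted in the
# comment above, the method's own zip() yields reward(greedy) - reward(sample)):
import torch

# Hypothetical per-sentence METEOR rewards for a batch of 3 captions.
r_sample = torch.tensor([0.20, 0.35, 0.10])   # rewards of sampled captions
r_greedy = torch.tensor([0.25, 0.30, 0.10])   # rewards of greedy captions (baseline)
logp = torch.tensor([[-1.2, -0.8],
                     [-0.5, -0.9],
                     [-1.0, -1.1]])           # (B, T) token log-probs of the samples
pad_mask = torch.ones_like(logp)              # 1 for real tokens, 0 for padding

# Standard SCST: increase the likelihood of samples that beat the baseline.
advantage = (r_sample - r_greedy).view(-1, 1)  # (B, 1)
loss = -torch.mean(torch.sum(advantage * logp * pad_mask, 1))
print(loss.item())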
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU, METEOR, CIDEr, ROUGE-L and SPICE scores
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'TEST',
                       transform=transforms.Compose([normalize])),
        batch_size=1, shuffle=True, num_workers=0, pin_memory=False)
    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Structures to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = dict()
    hypotheses = dict()

    # For each image
    for j, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):
        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        attrs, encoder_out = encoder(image)
        attrs = attrs.expand(3, attrs_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)
        encoder_out = encoder_out.view(1, -1, encoder_dim)
        num_pixels = encoder_out.size(1)
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)
        x0 = decoder.init_x0(attrs)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h1, c1, h2, c2 = decoder.init_hidden_state(attrs, encoder_out, zero=True)
        h1, c1 = decoder.decode_step1(x0, (h1, c1))

        # s is a number less than or equal to k, because sequences are removed
        # from this process once they hit <end>
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)
            h1, c1 = decoder.decode_step1(embeddings, (h1, c1))
            awe, _ = decoder.attention(encoder_out, h1, h2)
            # gate = decoder.sigmoid(decoder.f_beta(h2))
            # awe = gate * awe
            h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1), (h2, c2))
            scores = decoder.fc2(decoder.dropout2(h2))
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores
            # (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
            else:
                # Unroll and find the k highest scores across all beams,
                # and their unrolled indices  # (s)
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)

            # Convert unrolled indices to actual indices of scores;
            # prev_word_inds recovers which beam each top score came from
            # (on newer PyTorch use torch.div(..., rounding_mode='floor'))
            prev_word_inds = top_k_words / vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)],
                             dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds)
                               if next_word != word_map['<end>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(lambda c: [rev_word_map[w] for w in c
                           if w not in {word_map['<start>'], word_map['<end>'],
                                        word_map['<pad>']}],
                img_caps))  # remove <start> and pads
        img_caps = [' '.join(c) for c in img_captions]
        # print(img_caps)
        references[str(j)] = img_caps

        # Hypotheses
        hypothesis = [rev_word_map[w] for w in seq
                      if w not in {word_map['<start>'], word_map['<end>'],
                                   word_map['<pad>']}]
        hypothesis = [' '.join(hypothesis)]
        # print(hypothesis)
        hypotheses[str(j)] = hypothesis

        assert len(references) == len(hypotheses)

    # Calculate BLEU, METEOR, CIDEr, ROUGE-L and SPICE scores
    m1 = Bleu()
    m2 = Meteor()
    m3 = Cider()
    m4 = Rouge()
    m5 = Spice()
    (score1, scores1) = m1.compute_score(references, hypotheses)
    (score2, scores2) = m2.compute_score(references, hypotheses)
    (score3, scores3) = m3.compute_score(references, hypotheses)
    (score4, scores4) = m4.compute_score(references, hypotheses)
    (score5, scores5) = m5.compute_score(references, hypotheses)

    return score1, score2, score3, score4, score5
    rouge, _ = rouge_obj.compute_score(wtd, wrd)
    rouges.append(rouge)

print(np.mean(rouges))
with open("%s-rouges.txt" % system, 'w') as outf:
    for r in rouges:
        outf.write(str(r) + '\n')

for i in range(len(ref1_strs)):
    word_target_dict[i] = [ref1_strs[i], ref2_strs[i]]
    word_response_dict[i] = [sys_strs[i]]

bleu_score, bleu_scores = bleu_obj.compute_score(word_target_dict, word_response_dict)
bleu1_score, _, _, bleu4_score = bleu_score
bleu1_scores, _, _, bleu4_scores = bleu_scores
meteor_score, meteor_scores = meteor_obj.compute_score(word_target_dict, word_response_dict)
rouge_score, rouge_scores = rouge_obj.compute_score(word_target_dict, word_response_dict)
cider_score, cider_scores = cider_obj.compute_score(word_target_dict, word_response_dict)

print("ROUGE-L: ", rouge_score)
print("BLEU-1: ", bleu1_score)
print("BLEU-4: ", bleu4_score)
print("METEOR: ", meteor_score)
print("CIDEr: ", cider_score)
class RealTransformer(nn.Module):
    # Captions the proposal object. For each proposal, the encoder runs a
    # forward pass again (with the proposal mask).
    def __init__(self, d_model, encoder, vocab_trg, d_hidden=2048,
                 n_layers=6, n_heads=8, drop_ratio=0.1):
        super().__init__()
        # self.encoder = Encoder(d_model, d_hidden, n_vocab_src, n_layers,
        #                        n_heads, drop_ratio)
        self.encoder = encoder
        self.decoder = Decoder(d_model, d_hidden, vocab_trg, n_layers,
                               n_heads, drop_ratio)
        self.n_layers = n_layers
        self.tokenizer = PTBTokenizer()

    def denum(self, data):
        return ' '.join(self.decoder.vocab.itos[i] for i in data).replace(
            ' <eos>', '').replace(' <pad>', '').replace(' .', '').replace('  ', ' ')

    def forward(self, x, s, x_mask=None, sample_prob=0):
        # s is the sentence (hidden-state input), x is the feature sequence,
        # x_mask is the proposal mask.
        encoding = self.encoder(x, x_mask)  # encode
        max_sent_len = 20  # sentence length
        if not self.training:  # inference mode
            if isinstance(s, list):
                hiddens, _ = self.decoder.greedy(encoding, max_sent_len)
                h = hiddens[-1]  # take all states of the last layer
                targets = None
            else:
                # roll the existing hidden state forward one step
                h = self.decoder(s[:, :-1].contiguous(), encoding)
                targets, h = mask(s[:, 1:].contiguous(), h)
            logits = self.decoder.out(h)
        else:  # training mode
            if sample_prob == 0:  # everything drawn from the ground truth
                h = self.decoder(s[:, :-1].contiguous(), encoding)  # one step is enough
                targets, h = mask(s[:, 1:].contiguous(), h)
                """
                def mask(targets, out):
                    mask = (targets != 1)
                    out_mask = mask.unsqueeze(-1).expand_as(out)
                    return targets[mask], out[out_mask].view(-1, out.size(-1))
                """
                # the start symbol is dropped
                logits = self.decoder.out(h)
            else:
                model_pred = self.decoder.sampling(encoding, s, s.size(1) - 2,
                                                   sample_prob, is_argmax=True)
                model_pred.detach_()
                new_y = torch.cat((Variable(
                    model_pred.data.new(s.size(0), 1).long().fill_(
                        self.decoder.vocab.stoi['<init>'])), model_pred), 1)
                h = self.decoder(new_y, encoding)
                targets, h = mask(s[:, 1:].contiguous(), h)
                logits = self.decoder.out(h)
        return logits, targets

    def greedy(self, x, x_mask, T):
        encoding = self.encoder(x, x_mask)  # encode visual content
        # get the predicted sentence via the greedy strategy
        _, pred = self.decoder.greedy(encoding, T)
        sent_lst = []
        for i in range(pred.data.size(0)):
            sent_lst.append(self.denum(pred.data[i]))
        return sent_lst

    def scst(self, x, x_mask, s):
        self.scorer = Meteor()
        encoding = self.encoder(x, x_mask)

        # greedy part
        _, pred = self.decoder.greedy(encoding, s.size(1) - 1)
        pred_greedy = []
        for i in range(pred.data.size(0)):
            pred_greedy.append(self.denum(pred.data[i]))
        del pred

        # sampling part
        model_pred = self.decoder.sampling(encoding, s, s.size(1) - 2,
                                           sample_prob=1, is_argmax=False)
        model_pred.detach_()
        new_y = torch.cat((Variable(
            model_pred.data.new(s.size(0), 1).long().fill_(
                self.decoder.vocab.stoi['<init>'])), model_pred), 1)
        h = self.decoder(new_y, encoding)
        B, T, H = h.size()
        logits = self.decoder.out(h.view(-1, H))  # .view(B, T, -1)
        mask = (s[:, 1:] != 1).float()
        _, pred_sample = torch.max(logits, -1)
        p_model = F.log_softmax(logits, dim=-1)
        logp = p_model[torch.arange(0, B * T).type(logits.data.type()).long(),
                       pred_sample.data].view(B, T)
        pred_sample = pred_sample.view(B, T)
        assert pred_sample.size(0) == len(pred_greedy), (
            'pred_sample should have the same number of sentences as in '
            'pred_greedy, got {} and {} instead'.format(B, len(pred_greedy)))
        assert pred_sample.size() == (B, T), 'unexpected pred_sample size'
        pred_sample.detach_()

        # rewards
        sentence_greedy, sentence_sample, sentence_gt = {}, {}, {}
        for i in range(len(pred_greedy)):
            sentence_greedy[i] = [{'caption': pred_greedy[i]}]
            sentence_sample[i] = [{'caption': self.denum(pred_sample.data[i])}]
            sentence_gt[i] = [{'caption': self.denum(s.data[i, 1:])}]
        tok_greedy = self.tokenizer.tokenize(sentence_greedy)
        tok_sample = self.tokenizer.tokenize(sentence_sample)
        tok_gt = self.tokenizer.tokenize(sentence_gt)
        _, r_greedy = self.scorer.compute_score(tok_gt, tok_greedy)
        _, r_sample = self.scorer.compute_score(tok_gt, tok_sample)
        r_diff = [r_s - r_g for (r_s, r_g) in zip(r_greedy, r_sample)]
        r_diff = Variable(torch.Tensor(r_diff).type(logp.data.type()))
        loss = -torch.mean(torch.sum(r_diff.view(-1, 1) * logp * mask, 1))
        return loss
def meteor(gts, res):
    scorer = Meteor()
    score, scores = scorer.compute_score(gts, res)
    out_file.write('METEOR = %s' % score + '\n')
class CaptionEvaluator(object):
    def __init__(self, rtranslator):
        self.tokenizer = PTBTokenizer()
        self.scorer = Meteor()
        self.rtranslator = rtranslator

    def evaluate(self, gts, res):
        _, scores = self.scorer.compute_score(gts, res)
        return scores

    def build_loss(self, sl_conf, video_feat, video_len, video_mask, sent_gd,
                   model_cg):
        """
        :param input_caption:
        """
        return self.build_loss_v1(sl_conf, video_feat, video_len, video_mask,
                                  sent_gd, model_cg)

    def build_loss_v1(self, sl_conf, video_feat, video_len, video_mask,
                      sent_gd, model_cg):
        """
        :param sl_conf: (batch, n_anchor)
        :param sl_gather_idx: (batch, )
        :param video_feat: (batch, ~, ~)
        :param video_len: (batch, 2)
        :param video_mask: (batch, ~, 1)
        :param model_cg:
        :return:
        """
        initial_anchors = params['anchor_list']
        n_anchors = len(initial_anchors)
        batch_size = video_feat.size(0)
        ts_seq = Variable(FloatTensor(initial_anchors).repeat(batch_size, 1))
        ts_gather_idx = Variable(
            LongTensor(range(batch_size)).unsqueeze(1).repeat(1, n_anchors).view(-1))
        _, sent_pred, sent_len, sent_mask = model_cg.forward(
            video_feat, video_len, video_mask, ts_seq, ts_gather_idx)
        sent_pred = sent_pred.view(batch_size, n_anchors, -1)

        cur_res = {}
        cur_gts = {}
        for idxi, gts_caption in enumerate(sent_gd):
            cur_gts[idxi] = [{
                'caption': remove_nonascii(
                    self.rtranslator.rtranslate(gts_caption.cpu().data.numpy()))
            }]
            for idxj in range(n_anchors):
                cur_res[idxi * n_anchors + idxj] = [{
                    'caption': remove_nonascii(
                        self.rtranslator.rtranslate(
                            sent_pred[idxi, idxj].cpu().data.numpy()))
                }]
        tokenize_res = self.tokenizer.tokenize(cur_res)
        tokenize_gts = self.tokenizer.tokenize(cur_gts)
        res = {i: {j: tokenize_res[i * n_anchors + j] for j in range(n_anchors)}
               for i in range(sent_gd.size(0))}
        gts = {i: {j: tokenize_gts[i] for j in range(n_anchors)}
               for i in range(sent_gd.size(0))}
        scores = []
        for i in range(sent_gd.size(0)):
            score = self.evaluate(gts[i], res[i])
            scores.append(score)
        approx_ground_truth = Variable(
            torch.from_numpy(np.array(scores).argmax(1)).cuda())
        return F.cross_entropy(sl_conf, approx_ground_truth)
class RefineTransformer(nn.Module):
    def __init__(self, d_model, encoder, vocab_trg, d_hidden=2048,
                 n_layers=6, n_heads=8, drop_ratio=0.1):
        super().__init__()
        # self.encoder = Encoder(d_model, d_hidden, n_vocab_src, n_layers,
        #                        n_heads, drop_ratio)
        self.encoder = encoder
        self.decoder = Decoder(d_model, d_hidden, vocab_trg, n_layers,
                               n_heads, drop_ratio)
        self.n_layers = n_layers
        self.tokenizer = PTBTokenizer()

    def denum(self, data):
        return ' '.join(self.decoder.vocab.itos[i] for i in data).replace(
            ' <eos>', '').replace(' <pad>', '').replace(' .', '').replace('  ', ' ')

    def forward(self, x, s, x_mask=None, sample_prob=0):
        encoding = self.encoder(x, x_mask)
        max_sent_len = 20
        if not self.training:
            if isinstance(s, list):
                hiddens, _ = self.decoder.greedy(encoding, max_sent_len)
                h = hiddens[-1]
                targets = None
            else:
                h = self.decoder(s[:, :-1].contiguous(), encoding)
                targets, h = mask(s[:, 1:].contiguous(), h)
            logits = self.decoder.out(h)
        else:
            if sample_prob == 0:
                h = self.decoder(s[:, :-1].contiguous(), encoding)
                targets, h = mask(s[:, 1:].contiguous(), h)
                logits = self.decoder.out(h)
            else:
                model_pred = self.decoder.sampling(encoding, s, s.size(1) - 2,
                                                   sample_prob, is_argmax=True)
                model_pred.detach_()
                new_y = torch.cat((Variable(
                    model_pred.data.new(s.size(0), 1).long().fill_(
                        self.decoder.vocab.stoi['<init>'])), model_pred), 1)
                h = self.decoder(new_y, encoding)
                targets, h = mask(s[:, 1:].contiguous(), h)
                logits = self.decoder.out(h)
        return logits, targets

    def greedy(self, x, x_mask, T):
        encoding = self.encoder(x, x_mask)
        _, pred = self.decoder.greedy(encoding, T)
        sent_lst = []
        for i in range(pred.data.size(0)):
            sent_lst.append(self.denum(pred.data[i]))
        return sent_lst

    def scst(self, x, x_mask, s):
        self.scorer = Meteor()
        encoding = self.encoder(x, x_mask)

        # greedy part
        _, pred = self.decoder.greedy(encoding, s.size(1) - 1)
        pred_greedy = []
        for i in range(pred.data.size(0)):
            pred_greedy.append(self.denum(pred.data[i]))
        del pred

        # sampling part
        model_pred = self.decoder.sampling(encoding, s, s.size(1) - 2,
                                           sample_prob=1, is_argmax=False)
        model_pred.detach_()
        new_y = torch.cat((Variable(
            model_pred.data.new(s.size(0), 1).long().fill_(
                self.decoder.vocab.stoi['<init>'])), model_pred), 1)
        h = self.decoder(new_y, encoding)
        B, T, H = h.size()
        logits = self.decoder.out(h.view(-1, H))  # .view(B, T, -1)
        mask = (s[:, 1:] != 1).float()
        _, pred_sample = torch.max(logits, -1)
        p_model = F.log_softmax(logits, dim=-1)
        logp = p_model[torch.arange(0, B * T).type(logits.data.type()).long(),
                       pred_sample.data].view(B, T)
        pred_sample = pred_sample.view(B, T)
        assert pred_sample.size(0) == len(pred_greedy), (
            'pred_sample should have the same number of sentences as in '
            'pred_greedy, got {} and {} instead'.format(B, len(pred_greedy)))
        assert pred_sample.size() == (B, T), 'unexpected pred_sample size'
        pred_sample.detach_()

        # rewards
        sentence_greedy, sentence_sample, sentence_gt = {}, {}, {}
        for i in range(len(pred_greedy)):
            sentence_greedy[i] = [{'caption': pred_greedy[i]}]
            sentence_sample[i] = [{'caption': self.denum(pred_sample.data[i])}]
            sentence_gt[i] = [{'caption': self.denum(s.data[i, 1:])}]
        tok_greedy = self.tokenizer.tokenize(sentence_greedy)
        tok_sample = self.tokenizer.tokenize(sentence_sample)
        tok_gt = self.tokenizer.tokenize(sentence_gt)
        _, r_greedy = self.scorer.compute_score(tok_gt, tok_greedy)
        _, r_sample = self.scorer.compute_score(tok_gt, tok_sample)
        r_diff = [r_s - r_g for (r_s, r_g) in zip(r_greedy, r_sample)]
        r_diff = Variable(torch.Tensor(r_diff).type(logp.data.type()))
        loss = -torch.mean(torch.sum(r_diff.view(-1, 1) * logp * mask, 1))
        return loss
def end_epoch(self):
    path = Path(Options()["exp.dir"])
    dirname = path.joinpath("generated_sentences")

    # Create directory if it does not exist
    if not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
        except OSError as exc:
            # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    # Dump sentences to the directory
    for field in ["action", "justification"]:
        for key in ["ground_truth", "predicted"]:
            filepath = dirname.joinpath("%s_%s.txt" % (key, field))
            with open(filepath, "w") as f:
                f.write("\n".join(self.sentences[key][field]))

    # Compute NLP quality scores (bleu, meteor, cider...)
    for field in ["action", "justification"]:
        cider = Cider()
        bleu = Bleu()
        meteor = Meteor()
        # Check if this is not empty
        if len(self.sentences["ground_truth"][field]) > 0:
            ground_truth = {
                i: [sentence]
                for i, sentence in enumerate(self.sentences["ground_truth"][field])
            }
            predicted = {
                i: [sentence]
                for i, sentence in enumerate(self.sentences["predicted"][field])
            }
            cider_score, _ = cider.compute_score(ground_truth, predicted)
            cider_score = cider_score * 100  # Convert to percentage
            bleus_score, _ = bleu.compute_score(ground_truth, predicted)
            bleu_score = bleus_score[3] * 100  # Take bleu-4 and convert to percentage
            meteor_score, _ = meteor.compute_score(ground_truth, predicted)
            meteor_score = meteor_score * 100  # Convert to percentage
        else:
            # Otherwise all scores are 0
            cider_score, bleu_score, meteor_score = 0, 0, 0

        Logger().log_value('%s_epoch.cider_%s' % (self.mode, field),
                           cider_score, should_print=True)
        Logger().log_value('%s_epoch.bleucoco_%s' % (self.mode, field),
                           bleu_score, should_print=True)
        Logger().log_value('%s_epoch.meteorcoco_%s' % (self.mode, field),
                           meteor_score, should_print=True)

    # Reset sentences
    self.sentences = {
        "ground_truth": {"action": [], "justification": []},
        "predicted": {"action": [], "justification": []},
    }
    return
def run_load_gap_filler(pretrained_filename,
                        do_bleu=False,
                        must_have_anp=False,
                        copy_if_no_anp=False,
                        replace_adj=False,
                        get_human=False,
                        semi_human=False):
    rnn = RNNModel()
    rnn.load_model(pretrained_filename)
    rnn.conf['VAL_SPLIT'] = RNNDataProvider.TEST

    if get_human:
        id_to_caps = pickle.load(open("coco_mturk/id_to_caps.pik", "rb"))

    rnn.build_model_core()
    rnn.load_val_dataset()
    rnn.build_sentence_generator()
    rnn.build_perplexity_calculator()
    #print rnn.sample_sentence(rnn.V_valid[0])
    #print decoder_beamsearch2(rnn, rnn.V_valid[0])
    #print decoder_beamsearch(rnn, rnn.V_valid[0])
    #calculate_metric(rnn)
    #sys.exit(0)

    pos_sentence_res = []
    pos_att_res = []
    des_sentence_res = []
    des_att_res = []
    img_files = []
    img_ids = []
    id_to_sentences = {}
    seen_ids = set()

    if 'added_words' in rnn.conf:
        new_words = set([w[0] for w in rnn.conf['added_words']])
    else:
        new_words = set()

    num_ignore = 0
    num_not_ignore = 0
    for idx in range(rnn.V_valid.shape[0]):
        img_file = rnn.dp.img_id_to_filename[rnn.Id_valid[idx]]
        img_id = rnn.Id_valid[idx]
        if img_id not in id_to_sentences:
            id_to_sentences[img_id] = []
        #id_to_sentences[img_id].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1]))
        if replace_adj:
            id_to_sentences[img_id] = [
                ' '.join(do_replace_adj(rnn.dp.tokens[i])[::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        elif get_human:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
            np.random.shuffle(id_to_sentences[img_id])
            print(len(id_to_sentences[img_id]))
            human_sen_pos = id_to_sentences[img_id].pop()
            print(len(id_to_sentences[img_id]))
            if not id_to_sentences[img_id]:
                continue
        else:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        #print id_to_sentences[img_id]
        if img_id in seen_ids:
            continue
        seen_ids.add(img_id)

        if get_human and not semi_human:
            pos_sen = human_sen_pos.split()[::-1]
            np.random.shuffle(id_to_caps[img_id])
            des_sen = id_to_caps[img_id][0][::-1]
        else:
            lp, pos_sen, pos_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=1.0, beam_size=5)
            lp, des_sen, des_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=-1.0, beam_size=5)
            pos_sen = pos_sen[:-1]
            des_sen = des_sen[:-1]
            #des_att = des_att[:-1]
            pos_att = pos_att[:-1]
            #pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX))
            pos_att = np.array(pos_att)
            pos_att = pos_att.flatten()
            #des_att = np.array(des_att)
            #des_att = des_att.flatten()
        des_att = np.zeros((len(des_sen), ))
        #pos_att = np.zeros((len(pos_sen),))

        if must_have_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                num_ignore += 1
                continue
            num_not_ignore += 1

        if copy_if_no_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                pos_sen = des_sen

        if replace_adj:
            pos_sen = do_replace_adj(pos_sen[::-1])[::-1]
            des_sen = do_replace_adj(des_sen[::-1])[::-1]
        #des_sen, des_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([-1.0], dtype=theano.config.floatX))

        # Highlight strongly attended words with a background colour.
        new_pos_sen = []
        for vv, a in zip(pos_sen, pos_att):
            out = vv
            col = ""
            if a > 0.75:
                col = "#FF3300"
            elif a > 0.5:
                col = "#FF5C33"
            elif a > 0.25:
                col = "#FF8566"
            #if a > 0.75:
            #    col = "#33CC33"  # "#3366FF"
            #elif a > 0.5:
            #    col = "#70DB70"  # "#5C85FF"
            #elif a > 0.25:
            #    col = "#ADEBAD"  # "#85A3FF"
            if col:
                out = "<font style='background-color: %s'>%s</font>" % (col, vv)
            new_pos_sen.append(out)
        pos_sen = new_pos_sen
        print(pos_sen)
        print(pos_att)
        print(des_sen)

        print_it = False
        for v in pos_sen:
            if v in new_words:
                print_it = True
        if print_it:
            # zip() returns an iterator in Python 3, so materialise it
            # before reversing.
            for x in list(zip(pos_sen, pos_att))[::-1]:
                print(x[0], end=' ')
            print("")
        #for x in zip(pos_sen, pos_att)[::-1]:
        #    print x[0],
        #print ""
        #for x in zip(des_sen, des_att)[::-1]:
        #    print x[0],
        #print "\n"

        pos_att = pos_att[:len(pos_sen)]
        des_att = des_att[:len(des_sen)]

        pos_sentence_res.append(pos_sen[::-1])
        pos_att_res.append(np.exp(pos_att[::-1]))
        des_sentence_res.append(des_sen[::-1])
        des_att_res.append(np.exp(des_att[::-1]))
        img_files.append(img_file)
        img_ids.append(img_id)

    output = {
        'pos_sen': pos_sentence_res,
        'pos_att': pos_att_res,
        'des_sen': des_sentence_res,
        'des_att': des_att_res,
        'img_files': img_files,
        'img_ids': img_ids
    }
    pickle.dump(output, open("output_data/sen_att_pos_01.pik", "wb"), protocol=2)

    if must_have_anp:
        print("Must have ANP % removed:",
              num_ignore / float(num_not_ignore) * 100.0)

    print("getting Positive perplexity")
    print(rnn.get_val_perplexity())
    print("got perplexity")

    print("getting Descriptive perplexity")
    print(rnn.get_val_perplexity(base=True))
    print("got perplexity")

    gts = {}
    res = {}
    fout = open("eval/output_pos", "w")
    for line, iid in zip(pos_sentence_res, img_ids):
        fout.write(' '.join(line) + '\n')
        if iid not in res:
            res[iid] = []
        res[iid].append(' '.join(line))
    fout.close()

    res_des = {}
    fout = open("eval/output_des", "w")
    for line, iid in zip(des_sentence_res, img_ids):
        fout.write(' '.join(line) + '\n')
        if iid not in res_des:
            res_des[iid] = []
        res_des[iid].append(' '.join(line))
    fout.close()

    for i in range(3):
        fout = open("eval/reference%d" % i, "w")
        for cid in img_ids:
            if cid not in gts:
                gts[cid] = []
            if len(id_to_sentences[cid]) > i:
                gts[cid].append(id_to_sentences[cid][i])
                fout.write(id_to_sentences[cid][i] + "\n")
            else:
                fout.write("\n")
        fout.close()

    bleu = Bleu()
    #for i in gts.keys()[:10]:
    #    print gts[i]
    #    print res_des[i]
    #    print res[i]
    #    print ""
    total_ref_sentences = 0
    for i in list(gts.keys()):
        total_ref_sentences += len(gts[i])
    print("Total ref sentences:", total_ref_sentences)
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    print("Descriptive:", bleu.compute_score(gts, res_des)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    print("Descriptive:", rouge.compute_score(gts, res_des)[0])
    cider = Cider()
    print("Cider:")
    print("Positive:", cider.compute_score(gts, res)[0])
    print("Descriptive:", cider.compute_score(gts, res_des)[0])
    meteor = Meteor()
    print("Meteor:")
    print("Positive:", meteor.compute_score(gts, res)[0])
    print("Descriptive:", meteor.compute_score(gts, res_des)[0])
def test(model_path='models/model-61'):
    captions = get_video_data(video_data_path_test, video_feat_path_test, is_test=True)
    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())
    model = VideoCaptionGenerator(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_embed=dim_embed,
        dim_hidden=dim_hidden,
        batch_size=1,
        dim_obj_feats=dim_obj_feats,
        n_obj_feats=n_obj_feats,
        #encoder_max_sequence_length=encoder_step,
        decoder_max_sentence_length=decoder_step,
        bias_init_vector=None)
    _, tf_obj_feats, tf_video_mask, _, _, tf_generated_words, tf_generated_att = \
        model.build_model(is_test=True)
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    scorer = Meteor()
    scorer_bleu = Bleu(4)
    GTS = defaultdict(list)
    RES = defaultdict(list)
    counter = 0
    for vid, caption in captions.items():
        print(counter)
        if False:
            # Collect frames.
            cap = cv2.VideoCapture(os.path.join(video_path_test, vid + '.mp4'))
            frames = list()
            while True:
                ret, im = cap.read()
                if ret is False:
                    break
                frames.append(im)
            # Load meta data.
            #with open(os.path.join(meta_data_path_test, vid+'.mp4.txt'), 'r') as f:
            #    meta_data = json.load(f)
            #    all_feats = meta_data['features']
        generated_sentence, generated_att, _ = gen_sentence(
            sess, tf_video_mask, tf_obj_feats, tf_generated_words,
            tf_generated_att, vid, ixtoword)
        #generated_sentence_test, weights = gen_sentence(
        #    sess, video_tf, video_mask_tf, caption_tf, vid, ixtoword, weights_tf, 0.3)
        generated_att = [att[:, 0, 0] for att in generated_att]
        #print generated_att
        print(vid, generated_sentence[:-2])
        #plt.plot(generated_att)
        #plt.show()
        #print generated_sentence_test
        #print caption
        if False:
            # Visualise, for each generated word, the frames or object
            # features that received the highest attention weights.
            words = generated_sentence.split(' ')
            feats = list()
            for i, w in enumerate(words):
                i_best_feat_list = np.argsort(generated_att[i])[::-1]
                imgs = list()
                for i_best_feat in i_best_feat_list:
                    weight = generated_att[i][i_best_feat]
                    if weight < 0.1:
                        break
                    print(w, i_best_feat)
                    if all_feats is None or len(all_feats) == 0:
                        im = cv2.resize(
                            frames[:len(frames):len(frames) // 4][i_best_feat],
                            (300, 300))
                    else:
                        feat = all_feats[i_best_feat]
                        i_frame = feat[0]
                        bbox = feat[2]
                        im = np.copy(frames[i_frame][bbox[2]:bbox[3],
                                                     bbox[0]:bbox[1]])
                        im = cv2.resize(im, (300, 300))
                    constant = cv2.copyMakeBorder(im, 10, 10, 10, 10,
                                                  cv2.BORDER_CONSTANT,
                                                  value=[0, 0, 0])
                    violet = np.zeros((30, constant.shape[1], 3), np.uint8)
                    violet[:] = (255, 255, 255)
                    vcat = cv2.vconcat((violet, constant))
                    cv2.putText(vcat, str(weight), (10, 20),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, 0)
                    imgs.append(vcat)
                if imgs:
                    final_img = cv2.hconcat(imgs)
                    cv2.imshow('test', final_img)
                    cv2.waitKey(10000)
        GTS[str(counter)] = [{
            'image_id': str(counter),
            'cap_id': i,
            'caption': ' '.join(s)
        } for i, s in enumerate(caption)]
        RES[str(counter)] = [{
            'image_id': str(counter),
            'caption': generated_sentence[:-2]
        }]
        #GTS[vid] = caption
        #RES[vid] = [generated_sentence[:-2] + '.']
        counter += 1
        #words = generated_sentence.split(' ')
        #fig = plt.figure()
        #for i in range(len(words)):
        #    w = weights[i]
        #    ax = fig.add_subplot(len(words), 1, i+1)
        #    ax.set_title(words[i])
        #    ax.plot(range(len(w)), [ww[0] for ww in w], 'b')
        #plt.show()

    #ipdb.set_trace()
    tokenizer = PTBTokenizer()
    GTS = tokenizer.tokenize(GTS)
    RES = tokenizer.tokenize(RES)
    score, scores = scorer.compute_score(GTS, RES)
    print("METEOR", score)
    score, scores = scorer_bleu.compute_score(GTS, RES)
    print("BLEU", score)
import sys
sys.path.append('../third_party/densevid_eval/coco-caption')

from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.meteor.meteor import Meteor

src = {0: [{'caption': "this could be a good time, but not then."}]}
tgt = {0: [{'caption': "this is not good at all, time will say."}]}
src_1 = {0: ["this could be a good time, but not then."]}
tgt_1 = {0: ["this is not good at all, time will say."]}

tokenizer = PTBTokenizer()
meteor = Meteor()

src_t = tokenizer.tokenize(src)
tgt_t = tokenizer.tokenize(tgt)

score = meteor.compute_score(src_t, tgt_t)
score_1 = meteor.compute_score(src_1, tgt_1)

import pdb
pdb.set_trace()
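# The snippet above exists to compare Meteor on tokenized versus raw inputs.
# PTBTokenizer consumes the COCO-style {id: [{'caption': str}, ...]} layout
# and returns the plain {id: [str, ...]} layout that compute_score expects,
# lowercasing and stripping punctuation along the way (it shells out to the
# Stanford PTB tokenizer jar, which must be reachable as in the sys.path
# setup above). A minimal sketch of that shape change:
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

tokenizer = PTBTokenizer()
coco_style = {0: [{'caption': "A man, riding a horse."}]}
tokenized = tokenizer.tokenize(coco_style)
print(tokenized)  # expected: {0: ['a man riding a horse']}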
def test(model_path='models/model-37', video_feat_path=video_feat_path):
    train_data, test_data = get_video_data(video_data_path, video_feat_path, train_ratio=0.9)
    test_videos = test_data['video_path'].values
    test_captions = test_data['Description'].values
    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    test_videos_unique = list()
    test_captions_list = list()
    for (video, caption) in zip(test_videos, test_captions):
        if len(test_videos_unique) == 0 or test_videos_unique[-1] != video:
            test_videos_unique.append(video)
            test_captions_list.append([caption])
        else:
            test_captions_list[-1].append(caption)

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        encoder_max_sequence_length=n_frame_step,
        decoder_max_sentence_length=n_frame_step,
        bias_init_vector=None)

    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    scorer = Meteor()
    GTS = dict()
    RES = dict()
    for (video_feat_path, caption) in zip(test_videos_unique, test_captions_list):
        print(video_feat_path)
        print(caption)
        video_feat = np.load(video_feat_path)[None, ...]
        # Subsample a fixed number of evenly spaced frames.
        interval_frame = video_feat.shape[1] // n_frame_step
        video_feat = video_feat[:, range(0, n_frame_step * interval_frame,
                                         interval_frame), :]
        video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
        #video_feat = sampling(video_feat, 0.3)

        generated_word_index = sess.run(caption_tf,
                                        feed_dict={video_tf: video_feat,
                                                   video_mask_tf: video_mask})
        #probs_val = sess.run(probs_tf, feed_dict={video_tf: video_feat})
        #embed_val = sess.run(last_embed_tf, feed_dict={video_tf: video_feat})
        generated_words = ixtoword[generated_word_index]
        # Truncate everything after the first full stop.
        punctuation = np.argmax(np.array(generated_words) == '.') + 1
        generated_words = generated_words[:punctuation]
        generated_sentence = ' '.join(generated_words)
        print(generated_sentence)

        GTS[video_feat_path] = caption
        RES[video_feat_path] = [generated_sentence[:-2] + '.']

    score, scores = scorer.compute_score(GTS, RES)
    print(score)
    ipdb.set_trace()
def meteor():
    scorer = Meteor()
    score, scores = scorer.compute_score(gts, res)
    print('METEOR = %s' % score)
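# Both meteor() helpers above (this one and the meteor(gts, res) variant
# earlier) rely on the same scorer contract. A self-contained sketch with
# hypothetical inline data standing in for the gts/res globals this function
# reads; Meteor.compute_score asserts that both dicts share the same keys:
from pycocoevalcap.meteor.meteor import Meteor

gts = {'0': ['a dog runs across the field', 'a dog is running outside'],
       '1': ['two people talk on a bench']}
res = {'0': ['a dog running in a field'],
       '1': ['two people are sitting on a bench']}

scorer = Meteor()
score, scores = scorer.compute_score(gts, res)
print('METEOR = %s' % score)  # corpus-level score
print(scores)                 # per-key scores, in key order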
def test(model_path='models/model-61', video_feat_path=video_feat_path):
    train_data, test_data = get_video_data(video_data_path, video_feat_path, train_ratio=0.7)
    test_videos = test_data['video_path'].values
    test_captions = test_data['Description'].values
    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    test_videos_unique = list()
    test_captions_list = list()
    for (video, caption) in zip(test_videos, test_captions):
        if len(test_videos_unique) == 0 or test_videos_unique[-1] != video:
            test_videos_unique.append(video)
            test_captions_list.append([caption])
        else:
            test_captions_list[-1].append(caption)

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_embed=dim_embed,
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        encoder_max_sequence_length=encoder_step,
        decoder_max_sentence_length=decoder_step,
        bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_obj_feats, tf_caption, tf_caption_mask, tf_probs = \
        model.build_model(is_test=True)
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    scorer = Meteor()
    scorer_bleu = Bleu(4)
    GTS = defaultdict(list)
    RES = defaultdict(list)
    counter = 0
    for (vid, caption) in zip(test_videos_unique, test_captions_list):
        generated_sentence = gen_sentence(sess, tf_video, tf_video_mask,
                                          tf_obj_feats, tf_caption, vid,
                                          ixtoword, 1)
        #generated_sentence_test, weights = gen_sentence(
        #    sess, video_tf, video_mask_tf, caption_tf, vid, ixtoword, weights_tf, 0.3)
        print(vid, generated_sentence)
        #print generated_sentence_test
        #print caption
        GTS[str(counter)] = [{
            'image_id': str(counter),
            'cap_id': i,
            'caption': s
        } for i, s in enumerate(caption)]
        RES[str(counter)] = [{
            'image_id': str(counter),
            'caption': generated_sentence[:-2] + '.'
        }]
        #GTS[vid] = caption
        #RES[vid] = [generated_sentence[:-2] + '.']
        counter += 1
        #words = generated_sentence.split(' ')
        #fig = plt.figure()
        #for i in range(len(words)):
        #    w = weights[i]
        #    ax = fig.add_subplot(len(words), 1, i+1)
        #    ax.set_title(words[i])
        #    ax.plot(range(len(w)), [ww[0] for ww in w], 'b')
        #plt.show()

    ipdb.set_trace()
    tokenizer = PTBTokenizer()
    GTS = tokenizer.tokenize(GTS)
    RES = tokenizer.tokenize(RES)
    score, scores = scorer.compute_score(GTS, RES)
    print("METEOR", score)
    score, scores = scorer_bleu.compute_score(GTS, RES)
    print("BLEU", score)