def get_corpus_bleu(model, data_loader, vocabs, device, beam_size):
    import torch
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    # Define scorers
    scorer_bleu = Bleu(4)
    scorer_rouge = Rouge()
    scorer_cider = Cider()

    sequences_ref = {}
    sequences_gen = {}
    bad_words = ['<SOS>', '<EOS>', '<UNK>']
    bad_toks = [vocabs['word_vocab'](i) for i in bad_words]

    # Generation loop
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            captions = data['captions']
            length = captions.size(1) - 1
            targets = captions.narrow(1, 1, length)
            images = data['images'].to(device)
            topics = data['topics'].to(device)
            predictions = model.sample_v2(images, topics, beam_size=beam_size)

            sequences_ref[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in targets[0]
                    if j.item() not in bad_toks
                ])
            ]
            sequences_gen[i] = [
                " ".join([
                    vocabs['word_vocab'](j.item()) for j in predictions[0][1]
                    if j.item() not in bad_toks
                ])
            ]
            # sequences_gen[i] = [" ".join([vocabs['word_vocab'](j) for j in predictions[0] if j not in bad_toks])]

    # Get scores
    bleu_score, bleu_scores = scorer_bleu.compute_score(sequences_ref, sequences_gen)
    rouge_score, rouge_scores = scorer_rouge.compute_score(sequences_ref, sequences_gen)
    cider_score, cider_scores = scorer_cider.compute_score(sequences_ref, sequences_gen)

    scores = {
        'bleu_score': bleu_score,
        'rouge_score': rouge_score,
        'cider_score': cider_score
    }
    print(scores)
    return scores
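# All of the pycocoevalcap scorers used above consume the same structure: two
# dicts with identical keys, mapping each sample id to a list of reference
# sentences and to a single-element list holding the hypothesis. A minimal,
# self-contained sketch of that contract (toy sentences, not from the original
# data loader):
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge

# Both dicts must share the same keys; references may contain several
# sentences per key, hypotheses exactly one.
refs = {0: ["a man rides a horse", "a person riding a horse"],
        1: ["a dog runs across the grass"]}
hyps = {0: ["a man riding a horse"],
        1: ["a dog running in the grass"]}

bleu_score, _ = Bleu(4).compute_score(refs, hyps)    # [BLEU-1, ..., BLEU-4]
rouge_score, _ = Rouge().compute_score(refs, hyps)   # corpus ROUGE-L
cider_score, _ = Cider().compute_score(refs, hyps)   # corpus CIDEr
print(bleu_score, rouge_score, cider_score)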
def compute_batch_score(decode_res, key2refs, keys, start_idx, end_idx,
                        vocabulary, scorer):
    """
    Args:
        decode_res: decoding results of model, [N, max_length]
        key2refs: references of all samples, dict(<key> -> [ref_1, ref_2, ..., ref_n])
        keys: keys of this batch, used to match decode results and refs
    Return:
        scores of this batch, [N,]
    """
    import numpy as np

    if scorer is None:
        from pycocoevalcap.cider.cider import Cider
        scorer = Cider()

    hypothesis = {}
    references = {}

    for i in range(len(keys)):
        if keys[i] in hypothesis.keys():
            continue

        # prepare candidate sentence
        candidate = []
        for w_t in decode_res[i]:
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            candidate.append(vocabulary.idx2word[w_t])
        hypothesis[keys[i]] = [" ".join(candidate), ]

        # prepare reference sentences
        references[keys[i]] = key2refs[keys[i]]

    score, scores = scorer.compute_score(references, hypothesis)
    key2score = {key: scores[i] for i, key in enumerate(references.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]
    return results
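# Hypothetical usage sketch for compute_batch_score above; the toy vocabulary
# class and token ids are illustrative stand-ins, not from the original repo.
# The per-sample CIDEr values come back as a numpy array aligned with the
# batch order, which is what makes them usable as per-sample rewards.
import numpy as np

class _ToyVocab:                      # stand-in for the real Vocabulary class
    idx2word = {0: "<sos>", 1: "<eos>", 2: "a", 3: "cat", 4: "sleeps"}

decode_res = np.array([[0, 2, 3, 4, 1],      # "<sos> a cat sleeps <eos>"
                       [0, 2, 3, 1, 1]])     # "<sos> a cat <eos>"
keys = ["img_1", "img_2"]
key2refs = {"img_1": ["a cat sleeps on the sofa"],
            "img_2": ["a cat is resting"]}

batch_scores = compute_batch_score(decode_res, key2refs, keys,
                                   start_idx=0, end_idx=1,
                                   vocabulary=_ToyVocab(), scorer=None)
print(batch_scores)                   # shape (2,), one CIDEr value per sample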
def _define_metrics(gts, res):
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.rouge.rouge import Rouge

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
def compute_cider_score(decode_res, keys, gts, start_idx, end_idx, vocabulary):
    """
    Args:
        decode_res: decoding results of model, [B, max_length]
        keys: keys of this batch, tuple [B,]
        gts: ground truth sentences of all audios, dict(<key> -> [ref_1, ref_2, ..., ref_n])
    Return:
        score: scores of this batch, [B,]
    """
    import numpy as np
    from pycocoevalcap.cider.cider import Cider

    scorer = Cider()
    hypothesis = {}
    references = {}

    for i in range(decode_res.shape[0]):
        if keys[i] in hypothesis:
            continue

        # prepare candidate
        candidate = []
        for t, w_t in enumerate(decode_res[i]):
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            else:
                candidate.append(vocabulary.idx2word[w_t])
        hypothesis[keys[i]] = [" ".join(candidate), ]

        # prepare reference
        references[keys[i]] = gts[keys[i]]

    (score, scores) = scorer.compute_score(references, hypothesis)
    key2score = {key: scores[i] for i, key in enumerate(hypothesis.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]
    return results
def eval(result_gts_path, result_res_path):
    import json
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.rouge.rouge import Rouge

    with open(result_gts_path, 'r') as file:
        gts_dict = json.load(file)
    with open(result_res_path, 'r') as file:
        res_dict = json.load(file)

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts_dict, res=res_dict)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts_dict, res=res_dict)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts_dict, res=res_dict)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts_dict, res=res_dict)

    return bleu, meteor, rouge, cider
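# A small end-to-end sketch for eval() above: write toy gts/res dicts to JSON
# in the expected {image_id: [sentence, ...]} format and score them. The file
# names and sentences are illustrative only, and METEOR assumes a local Java
# runtime is available for pycocoevalcap.
import json

toy_gts = {"1": ["a man rides a horse", "a person on a horse"],
           "2": ["a dog runs on the beach"]}
toy_res = {"1": ["a man riding a horse"],
           "2": ["a dog running on the beach"]}

with open("toy_gts.json", "w") as f:
    json.dump(toy_gts, f)
with open("toy_res.json", "w") as f:
    json.dump(toy_res, f)

bleu, meteor, rouge, cider = eval("toy_gts.json", "toy_res.json")
print(bleu, meteor, rouge, cider)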
def compute_batch_score(decode_res, refs, keys, start_idx, end_idx, vocabulary,
                        scorer):
    """
    Args:
        decode_res: decoding results of model, [B, max_length]
        refs: references of all samples, dict(<key> -> [ref_1, ref_2, ..., ref_n])
        keys: keys of this batch, used to match decode results and refs
    Return:
        scores of this batch, [B,]
    """
    if scorer is None:
        from pycocoevalcap.cider.cider import Cider
        scorer = Cider()

    key2pred = {}
    key2refs = {}

    for i in range(len(keys)):
        # prepare candidate sentence
        candidate = []
        for w_t in decode_res[i]:
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            candidate.append(vocabulary.idx2word[w_t])
        key2pred[i] = [" ".join(candidate), ]

        # prepare reference sentences
        key2refs[i] = refs[keys[i]]

    score, scores = scorer.compute_score(key2refs, key2pred)
    return scores
def cider():
    from pycocoevalcap.cider.cider import Cider

    scorer = Cider()
    # scorer += (hypo[0], ref1)
    (score, scores) = scorer.compute_score(gts, res)
    print('cider = %s' % score)
def cider(gts, res):
    from pycocoevalcap.cider.cider import Cider

    scorer = Cider()
    (score, scores) = scorer.compute_score(gts, res)
    out_file.write('CIDEr = %s' % score + '\n')
def coco_caption_metrics(predictions_list, image_id_list,
                         vocabulary_path='data/vocabulary.json',
                         max_caption_length=25, batch_size=32,
                         is_training=True):
    with open(vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {}
    for i in range(len(vocabulary_list)):
        word2id[vocabulary_list[i]] = i
    id2word = {v: k for k, v in word2id.items()}

    with open('data/captions_gt.json', 'r') as file:
        captions_gt_dict = json.load(file)

    gts = {}
    res = {}
    for i in range(len(predictions_list)):
        for j in range(batch_size):
            sen_input, sen_ground_truth = [], []
            for k in range(max_caption_length):
                id_input = int(predictions_list[i][k][j])
                sen_input.append(id2word[id_input])

            # cut the predicted sentence at the first end-of-sentence token
            sen_pre = []
            for n in range(max_caption_length):
                word = sen_input[n]
                if word != '</S>':
                    sen_pre.append(word)
                else:
                    break
            str_input = ' '.join(sen_pre)

            image_id = image_id_list[i][j][0]
            # print(image_id)
            res[image_id] = [str_input]
            gts[image_id] = captions_gt_dict[str(image_id)]

    if not is_training:
        # for key in gts.keys():
        #     str_input = res[key]
        #     str_ground_truth = gts[key]
        #     print(key)
        #     print(str_input)
        #     print(str_ground_truth)
        #     print('*' * 100)
        with open('data/result/result_res.json', 'w') as file:
            json.dump(res, file)
        with open('data/result/result_gts.json', 'w') as file:
            json.dump(gts, file)
        # print('result.json get success')

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)
    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
    # if (_count % 1000 == 0):
    #     print(_count)
    # _count = _count + 1

print('  wmd_done\n  time: {}'.format(datetime.now() - time_now))
print('  wmd_middle\n  time: {}'.format(time_now2 - time_now1))

#######################################################################################################
# ------------------------------------------- 18: CIDEr ----------------------------------------------
print('--starting Cider--')
df_mode = 'coco-val-df'
time_now = datetime.now()
CIDer = Cider(df=df_mode)  # using coco-val-df as the tf-idf document frequencies
cider_all, cider_scores = CIDer.compute_score(ref, hypo, ImgId)
print('--Cider done--\n  time: {}'.format(datetime.now() - time_now))

#######################################################################################################
# ------------------------------------------- 19: SPICE ----------------------------------------------
# Spice scores
spice_scores = []
with open(spice_score_path) as file:
    sp_file = json.load(file)
for ids in ImgId:
    # the scores json was saved with str keys (same convention as the meteor scores json)
    spice_scores.append(sp_file[str(ids)])
print('  Spice_done\n  time: {}'.format(datetime.now() - time_now))
def end_epoch(self):
    path = Path(Options()["exp.dir"])
    dirname = path.joinpath("generated_sentences")

    # Create directory if it does not exist
    if not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
        except OSError as exc:
            # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    # Dump sentences to the directory
    for field in ["action", "justification"]:
        for key in ["ground_truth", "predicted"]:
            filepath = dirname.joinpath("%s_%s.txt" % (key, field))
            with open(filepath, "w") as f:
                f.write("\n".join(self.sentences[key][field]))

    # Compute NLP quality scores (bleu, meteor, cider...)
    for field in ["action", "justification"]:
        cider = Cider()
        bleu = Bleu()
        meteor = Meteor()

        # Check that this field is not empty
        if len(self.sentences["ground_truth"][field]) > 0:
            ground_truth = {
                i: [sentence]
                for i, sentence in enumerate(self.sentences["ground_truth"][field])
            }
            predicted = {
                i: [sentence]
                for i, sentence in enumerate(self.sentences["predicted"][field])
            }
            cider_score, _ = cider.compute_score(ground_truth, predicted)
            cider_score = cider_score * 100  # Convert to percentage
            bleus_score, _ = bleu.compute_score(ground_truth, predicted)
            bleu_score = bleus_score[3] * 100  # Take BLEU-4 and convert to percentage
            meteor_score, _ = meteor.compute_score(ground_truth, predicted)
            meteor_score = meteor_score * 100  # Convert to percentage
        else:
            # Otherwise all scores are 0
            cider_score, bleu_score, meteor_score = 0, 0, 0

        Logger().log_value('%s_epoch.cider_%s' % (self.mode, field),
                           cider_score, should_print=True)
        Logger().log_value('%s_epoch.bleucoco_%s' % (self.mode, field),
                           bleu_score, should_print=True)
        Logger().log_value('%s_epoch.meteorcoco_%s' % (self.mode, field),
                           meteor_score, should_print=True)

    # Reset sentences
    self.sentences = {
        "ground_truth": {"action": [], "justification": []},
        "predicted": {"action": [], "justification": []},
    }
    return
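# Bleu.compute_score returns a list of corpus-level scores for BLEU-1 through
# BLEU-4 (plus per-sentence lists), which is why end_epoch() above indexes
# bleus_score[3] to report BLEU-4. A toy illustration (made-up sentences):
from pycocoevalcap.bleu.bleu import Bleu

ground_truth = {0: ["the car turns left at the intersection"]}
predicted = {0: ["the car turns left"]}

bleus, per_sentence = Bleu(4).compute_score(ground_truth, predicted)
print(len(bleus))        # 4 corpus-level values, BLEU-1 .. BLEU-4
print(bleus[3] * 100)    # BLEU-4 as a percentage, as logged above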
class SelfCriticalSequenceTrainingCriterion(FairseqCriterion):
    def __init__(self, args, task):
        super().__init__(args, task)

        self.task = task
        self.generator = SimpleSequenceGenerator(
            beam=args.scst_beam,
            penalty=args.scst_penalty,
            max_pos=args.max_target_positions,
            eos_index=task.target_dictionary.eos_index)

        # Needed for decoding model output to string
        self.conf_tokenizer = encoders.build_tokenizer(args)
        self.conf_decoder = encoders.build_bpe(args)
        self.captions_dict = task.target_dictionary

        # Tokenizer needed for computing CIDEr scores
        self.tokenizer = PTBTokenizer()
        self.scorer = Cider()

    @staticmethod
    def add_args(parser):
        parser.add_argument('--scst-beam', type=int, default=5,
                            help='beam size')
        parser.add_argument('--scst-penalty', type=float, default=1.0,
                            help='beam search length penalty')
        parser.add_argument('--scst-validation-set-size', type=int, default=0, metavar='N',
                            help='limited size of validation set')

    @property
    def image_ids(self):
        return self.task.dataset('train').img_ds.image_ids

    def decode(self, x):
        """Decode model output to string."""
        x = self.captions_dict.string(x)
        x = self.conf_decoder.decode(x)
        return self.conf_tokenizer.decode(x)

    def generate(self, model, sample):
        """Generate captions using (simple) beam search."""
        tgt_captions = dict()
        gen_captions = dict()

        scores, _, tokens, _ = self.generator.generate(model, sample)

        counter = 0
        for i, tb in enumerate(tokens):
            image_id = self.image_ids[i]
            image_captions = sample['target'][i]
            for t in tb:
                counter += 1
                decoded = self.decode(t)
                tgt_captions[counter] = image_captions
                gen_captions[counter] = [{
                    'image_id': image_id,
                    'caption': decoded,
                    'id': 1
                }]

        gen_captions = self.tokenizer.tokenize(gen_captions)
        return tgt_captions, gen_captions, scores

    def forward(self, model, sample, reduce=True):
        sample_indices = sample['id']
        sample_device = sample_indices.device

        tgt_captions, gen_captions, scores = self.generate(model, sample)

        _, reward = self.scorer.compute_score(tgt_captions, gen_captions)
        reward = torch.from_numpy(reward).to(device=sample_device).view(scores.shape)

        # Mean of rewards is used as baseline rather than greedy
        # decoding (see also https://arxiv.org/abs/1912.08226).
        reward_baseline = torch.mean(reward, dim=1, keepdim=True)

        loss = -scores * (reward - reward_baseline)
        loss = loss.mean()

        sample_nsentences = sample['nsentences']
        sample_ntokens = sample['ntokens']

        logging_output = {
            'loss': loss.data,
            'ntokens': sample_ntokens,
            'nsentences': sample_nsentences,
            'sample_size': sample_nsentences,
        }
        return loss, sample_nsentences, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        return {
            'loss': sum(log.get('loss', 0) for log in logging_outputs),
            'ntokens': sum(log.get('ntokens', 0) for log in logging_outputs),
            'nsentences': sum(log.get('nsentences', 0) for log in logging_outputs),
            'sample_size': sum(log.get('sample_size', 0) for log in logging_outputs)
        }
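# Minimal sketch of the self-critical reward/baseline arithmetic used in
# forward() above. The tensors here are made-up stand-ins for the beam
# log-probabilities ("scores", [B, beam]) and the per-beam CIDEr rewards
# ("reward", [B, beam]); only the shapes and the baseline step match the code.
import torch

scores = torch.tensor([[-1.2, -1.5, -1.9],     # log-prob of each beam
                       [-0.8, -1.1, -1.4]])
reward = torch.tensor([[0.9, 0.6, 0.3],        # CIDEr of each beam
                       [0.7, 0.7, 0.1]])

# The mean reward over each sample's beams serves as the baseline, instead of
# a separate greedy rollout (cf. https://arxiv.org/abs/1912.08226).
reward_baseline = reward.mean(dim=1, keepdim=True)
loss = -(scores * (reward - reward_baseline)).mean()
print(loss)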
    rouge, _ = rouge_obj.compute_score(wtd, wrd)
    rouges.append(rouge)

print(np.mean(rouges))
with open("%s-rouges.txt" % system, 'w') as outf:
    for r in rouges:
        outf.write(str(r) + '\n')

for i in range(len(ref1_strs)):
    word_target_dict[i] = [ref1_strs[i], ref2_strs[i]]
    word_response_dict[i] = [sys_strs[i]]

bleu_score, bleu_scores = bleu_obj.compute_score(word_target_dict, word_response_dict)
bleu1_score, _, _, bleu4_score = bleu_score
bleu1_scores, _, _, bleu4_scores = bleu_scores
meteor_score, meteor_scores = meteor_obj.compute_score(word_target_dict, word_response_dict)
rouge_score, rouge_scores = rouge_obj.compute_score(word_target_dict, word_response_dict)
cider_score, cider_scores = cider_obj.compute_score(word_target_dict, word_response_dict)

print("ROUGE-L: ", rouge_score)
print("BLEU-1: ", bleu1_score)
print("BLEU-4: ", bleu4_score)
print("METEOR: ", meteor_score)
print("CIDEr: ", cider_score)
def coco_caption_metrics_hier(predicts_list, sentences_list, image_id_list,
                              config, batch_size=26, is_training=True):
    with open(config.vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {}
    for i in range(len(vocabulary_list)):
        word2id[vocabulary_list[i]] = i
    id2word = {v: k for k, v in word2id.items()}

    gts = {}
    res = {}
    for i in range(len(predicts_list)):
        for j in range(batch_size):
            sent_pre, sent_gt = [], []
            for k in range(config.max_sentence_num * config.max_sentence_length):
                id_input = int(predicts_list[i][k][j])
                sent_pre.append(id2word[id_input])

                id_gt = sentences_list[i][j][k]
                if id2word[id_gt] != '</S>' and id2word[id_gt] != '<EOS>':
                    sent_gt.append(id2word[id_gt])

            # cut each predicted sentence at its first end-of-sentence token
            # sent_pre2 = sent_pre
            sent_pre2 = []
            for n in range(config.max_sentence_num):
                for m in range(config.max_sentence_length):
                    word = sent_pre[n * config.max_sentence_length + m]
                    if word != '</S>':
                        sent_pre2.append(word)
                    else:
                        break

            str_pre, str_gt = ' '.join(sent_pre2), ' '.join(sent_gt)
            image_id = image_id_list[i][j][0]
            gts[str(image_id)] = [str_gt]
            res[str(image_id)] = [str_pre]

    if not is_training:
        with open(config.result_gts_path, 'w') as file:
            json.dump(gts, file)
        with open(config.result_res_path, 'w') as file:
            json.dump(res, file)

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    # meteor_scorer = Meteor()
    # meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    # return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
    return bleu, round(rouge, 4), round(cider, 4)
def evaluate(beam_size):
    """
    Evaluation.

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU, METEOR, CIDEr, ROUGE-L and SPICE scores
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'TEST',
                       transform=transforms.Compose([normalize])),
        batch_size=1, shuffle=True, num_workers=0, pin_memory=False)
    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Dicts to store references (true captions) and hypotheses (predictions) for each image
    # If for n images we have n hypotheses and references a, b, c... per image, we need:
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = dict()
    hypotheses = dict()

    # For each image
    for j, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        attrs, encoder_out = encoder(image)
        attrs = attrs.expand(3, attrs_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)
        encoder_out = encoder_out.view(1, -1, encoder_dim)
        num_pixels = encoder_out.size(1)
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)
        x0 = decoder.init_x0(attrs)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h1, c1, h2, c2 = decoder.init_hidden_state(attrs, encoder_out, zero=True)
        h1, c1 = decoder.decode_step1(x0, (h1, c1))

        # s is a number less than or equal to k, because sequences are removed
        # from this process once they hit <end>
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)
            h1, c1 = decoder.decode_step1(embeddings, (h1, c1))
            awe, _ = decoder.attention(encoder_out, h1, h2)
            # gate = decoder.sigmoid(decoder.f_beta(h2))
            # awe = gate * awe
            h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1), (h2, c2))

            scores = decoder.fc2(decoder.dropout2(h2))
            scores = F.log_softmax(scores, dim=1)

            # Add accumulated sequence scores
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points have the same scores (same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
            else:
                # Unroll and find the k highest scores over all beams, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores:
            # prev_word_inds recovers which beam each top candidate came from
            prev_word_inds = top_k_words // vocab_size  # (s)
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h1 = h1[prev_word_inds[incomplete_inds]]
            c1 = c1[prev_word_inds[incomplete_inds]]
            h2 = h2[prev_word_inds[incomplete_inds]]
            c2 = c2[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [rev_word_map[w] for w in c
                           if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],
                img_caps))  # remove <start> and pads
        img_caps = [' '.join(c) for c in img_captions]
        # print(img_caps)
        references[str(j)] = img_caps

        # Hypotheses
        hypothesis = [rev_word_map[w] for w in seq
                      if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}]
        hypothesis = [' '.join(hypothesis)]
        # print(hypothesis)
        hypotheses[str(j)] = hypothesis

        assert len(references) == len(hypotheses)

    # Calculate BLEU, METEOR, CIDEr, ROUGE-L and SPICE scores
    m1 = Bleu()
    m2 = Meteor()
    m3 = Cider()
    m4 = Rouge()
    m5 = Spice()
    (score1, scores1) = m1.compute_score(references, hypotheses)
    (score2, scores2) = m2.compute_score(references, hypotheses)
    (score3, scores3) = m3.compute_score(references, hypotheses)
    (score4, scores4) = m4.compute_score(references, hypotheses)
    (score5, scores5) = m5.compute_score(references, hypotheses)

    return score1, score2, score3, score4, score5
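# Worked example of the beam-search index bookkeeping in evaluate() above:
# after top-k over the flattened [k, vocab_size] score matrix, integer division
# and modulo recover which beam each candidate came from and which word it
# adds. Toy sizes only (k = 2 beams, vocab_size = 5).
import torch

vocab_size = 5
scores = torch.tensor([[0.1, 0.0, 0.7, 0.0, 0.2],     # beam 0
                       [0.0, 0.6, 0.0, 0.3, 0.1]])    # beam 1
top_k_scores, top_k_words = scores.view(-1).topk(2, 0, True, True)

prev_word_inds = top_k_words // vocab_size   # tensor([0, 1]): source beams
next_word_inds = top_k_words % vocab_size    # tensor([2, 1]): chosen words
print(prev_word_inds, next_word_inds)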
def eval_cv(engine, key2pred, key2refs):
    scorer = Cider(zh=zh)
    score, scores = scorer.compute_score(key2refs, key2pred)
    engine.state.metrics["score"] = score
    key2pred.clear()
def run_load_gap_filler(pretrained_filename,
                        do_bleu=False,
                        must_have_anp=False,
                        copy_if_no_anp=False,
                        replace_adj=False,
                        get_human=False,
                        semi_human=False):
    rnn = RNNModel()
    rnn.load_model(pretrained_filename)
    rnn.conf['VAL_SPLIT'] = RNNDataProvider.TEST

    if get_human:
        id_to_caps = pickle.load(open("coco_mturk/id_to_caps.pik", "rb"))

    rnn.build_model_core()
    rnn.load_val_dataset()

    rnn.build_sentence_generator()
    rnn.build_perplexity_calculator()

    # print rnn.sample_sentence(rnn.V_valid[0])
    # print decoder_beamsearch2(rnn, rnn.V_valid[0])
    # print decoder_beamsearch(rnn, rnn.V_valid[0])
    # calculate_metric(rnn)
    # sys.exit(0)

    pos_sentence_res = []
    pos_att_res = []
    des_sentence_res = []
    des_att_res = []
    img_files = []
    img_ids = []

    id_to_sentences = {}
    seen_ids = set()

    if 'added_words' in rnn.conf:
        new_words = set([w[0] for w in rnn.conf['added_words']])
    else:
        new_words = set()

    num_ignore = 0
    num_not_ignore = 0

    for idx in range(rnn.V_valid.shape[0]):
        img_file = rnn.dp.img_id_to_filename[rnn.Id_valid[idx]]
        img_id = rnn.Id_valid[idx]

        if img_id not in id_to_sentences:
            id_to_sentences[img_id] = []
        # id_to_sentences[img_id].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1]))

        if replace_adj:
            id_to_sentences[img_id] = [
                ' '.join(do_replace_adj(rnn.dp.tokens[i])[::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        elif get_human:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
            np.random.shuffle(id_to_sentences[img_id])
            print(len(id_to_sentences[img_id]))
            human_sen_pos = id_to_sentences[img_id].pop()
            print(len(id_to_sentences[img_id]))
            if not id_to_sentences[img_id]:
                continue
        else:
            id_to_sentences[img_id] = [
                ' '.join(rnn.dp.tokens[i][::-1])
                for i in rnn.dp.img_id_to_tokens[img_id]
            ]
        # print id_to_sentences[img_id]

        if img_id in seen_ids:
            continue
        seen_ids.add(img_id)

        if get_human and not semi_human:
            pos_sen = human_sen_pos.split()[::-1]
            np.random.shuffle(id_to_caps[img_id])
            des_sen = id_to_caps[img_id][0][::-1]
        else:
            lp, pos_sen, pos_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=1.0, beam_size=5)
            lp, des_sen, des_att = decoder_beamsearch_with_attention(
                rnn, rnn.V_valid[idx], senti=-1.0, beam_size=5)
            pos_sen = pos_sen[:-1]
            des_sen = des_sen[:-1]
            # des_att = des_att[:-1]
            pos_att = pos_att[:-1]
        # pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX))
        pos_att = np.array(pos_att)
        pos_att = pos_att.flatten()
        # des_att = np.array(des_att)
        # des_att = des_att.flatten()
        des_att = np.zeros((len(des_sen), ))
        # pos_att = np.zeros((len(pos_sen),))

        if must_have_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                num_ignore += 1
                continue
            num_not_ignore += 1

        if copy_if_no_anp:
            if not sentence_has_anp(pos_sen[::-1]):
                pos_sen = des_sen

        if replace_adj:
            pos_sen = do_replace_adj(pos_sen[::-1])[::-1]
            des_sen = do_replace_adj(des_sen[::-1])[::-1]

        # des_sen, des_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([-1.0], dtype=theano.config.floatX))

        # highlight highly-attended words with HTML background colours
        new_pos_sen = []
        for vv, a in zip(pos_sen, pos_att):
            out = vv
            col = ""
            if a > 0.75:
                col = "#FF3300"
            elif a > 0.5:
                col = "#FF5C33"
            elif a > 0.25:
                col = "#FF8566"
            # if a > 0.75:
            #     col = "#33CC33"  # "#3366FF"
            # elif a > 0.5:
            #     col = "#70DB70"  # "#5C85FF"
            # elif a > 0.25:
            #     col = "#ADEBAD"  # "#85A3FF"
            if col:
                out = "<font style='background-color: %s'>%s</font>" % (col, vv)
            new_pos_sen.append(out)
        pos_sen = new_pos_sen
        print(pos_sen)
        print(pos_att)
        print(des_sen)

        print_it = False
        for v in pos_sen:
            if v in new_words:
                print_it = True
        if print_it:
            # zip() must be wrapped in list() before slicing in Python 3
            for x in list(zip(pos_sen, pos_att))[::-1]:
                print(x[0], end=' ')
            print("")

        # for x in zip(pos_sen, pos_att)[::-1]:
        #     print x[0],
        # print ""
        # for x in zip(des_sen, des_att)[::-1]:
        #     print x[0],
        # print "\n"

        pos_att = pos_att[:len(pos_sen)]
        des_att = des_att[:len(des_sen)]

        pos_sentence_res.append(pos_sen[::-1])
        pos_att_res.append(np.exp(pos_att[::-1]))
        des_sentence_res.append(des_sen[::-1])
        des_att_res.append(np.exp(des_att[::-1]))
        img_files.append(img_file)
        img_ids.append(img_id)

    output = {
        'pos_sen': pos_sentence_res,
        'pos_att': pos_att_res,
        'des_sen': des_sentence_res,
        'des_att': des_att_res,
        'img_files': img_files,
        'img_ids': img_ids
    }
    pickle.dump(output, open("output_data/sen_att_pos_01.pik", "wb"), protocol=2)

    if must_have_anp:
        print("Must have ANP % removed:",
              num_ignore / float(num_not_ignore) * 100.0)

    print("getting Positive perplexity")
    print(rnn.get_val_perplexity())
    print("got perplexity")

    print("getting Descriptive perplexity")
    print(rnn.get_val_perplexity(base=True))
    print("got perplexity")

    gts = {}
    res = {}

    fout = open("eval/output_pos", "w")
    for line, iid in zip(pos_sentence_res, img_ids):
        fout.write(' '.join(line) + '\n')
        if iid not in res:
            res[iid] = []
        res[iid].append(' '.join(line))
    fout.close()

    res_des = {}
    fout = open("eval/output_des", "w")
    for line, iid in zip(des_sentence_res, img_ids):
        fout.write(' '.join(line) + '\n')
        if iid not in res_des:
            res_des[iid] = []
        res_des[iid].append(' '.join(line))
    fout.close()

    for i in range(3):
        fout = open("eval/reference%d" % i, "w")
        for cid in img_ids:
            if cid not in gts:
                gts[cid] = []
            if len(id_to_sentences[cid]) > i:
                gts[cid].append(id_to_sentences[cid][i])
                fout.write(id_to_sentences[cid][i] + "\n")
            else:
                fout.write("\n")
        fout.close()

    bleu = Bleu()
    # for i in gts.keys()[:10]:
    #     print gts[i]
    #     print res_des[i]
    #     print res[i]
    #     print ""
    total_ref_sentences = 0
    for i in list(gts.keys()):
        total_ref_sentences += len(gts[i])
    print("Total ref sentences:", total_ref_sentences)

    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    print("Descriptive:", bleu.compute_score(gts, res_des)[0])

    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    print("Descriptive:", rouge.compute_score(gts, res_des)[0])

    cider = Cider()
    print("Cider:")
    print("Positive:", cider.compute_score(gts, res)[0])
    print("Descriptive:", cider.compute_score(gts, res_des)[0])

    meteor = Meteor()
    print("Meteor:")
    print("Positive:", meteor.compute_score(gts, res)[0])
    print("Descriptive:", meteor.compute_score(gts, res_des)[0])