def compute_score(self, gts, res):
    """
    :param gts: dict mapping image id to a list of reference captions
    :param res: dict mapping image id to a list holding a single candidate caption
    :return: corpus-level [BLEU-1..BLEU-n] scores and the per-image score lists
    """
    assert(gts.keys() == res.keys())
    imgIds = gts.keys()

    bleu_scorer = BleuScorer(n=self._n)
    for id in imgIds:
        hypo = res[id]
        ref = gts[id]

        # Sanity check.
        assert(type(hypo) is list)
        assert(len(hypo) == 1)
        assert(type(ref) is list)
        assert(len(ref) > 0)

        bleu_scorer += (hypo[0], ref)

    score, scores = bleu_scorer.compute_score(option='closest', verbose=1)

    return score, scores

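# A minimal usage sketch for the method above (hypothetical values; an
# instance of the enclosing scorer class is assumed to exist as `scorer`).
# Both dicts are keyed by image id: each res entry is a one-element list
# holding the candidate caption, each gts entry lists the references.
gts = {1: ["a dog runs in the park", "a dog is running outside"]}
res = {1: ["a dog runs through a park"]}
score, scores = scorer.compute_score(gts, res)
# score: corpus-level [BLEU-1, ..., BLEU-n]; scores: per-image score lists.
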
def compute_score(self, gts, res):
    assert (gts.keys() == res.keys())
    imgIds = gts.keys()

    bleu_scorer = BleuScorer(n=self._n)
    for id in imgIds:
        hypo = res[id]
        ref = gts[id]

        # Sanity check.
        assert (type(hypo) is list)
        assert (len(hypo) == 1)
        assert (type(ref) is list)
        assert (len(ref) >= 1)

        bleu_scorer += (hypo[0], ref)

    # Alternative reference-length options: 'shortest', 'average'.
    score, scores = bleu_scorer.compute_score(option='closest', verbose=1)

    return score, scores

class Bleu:
    def __init__(self, n=4):
        # Default: compute BLEU score up to 4-grams.
        self._n = n
        self._hypo_for_image = {}
        self.ref_for_image = {}
        self.bleu_scorer = BleuScorer(n=self._n)

    def compute_score(self, gts, res):
        self.bleu_scorer.clear()
        for res_id in res:
            hypo = res_id['caption']
            ref = gts[res_id['image_id']]

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) >= 1)

            self.bleu_scorer += (hypo[0], ref)

        # Alternative reference-length options: 'shortest', 'average'.
        score, scores = self.bleu_scorer.compute_score(option='closest',
                                                       verbose=0)

        return score, scores

    def method(self):
        return "Bleu"

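# Hedged usage sketch for the Bleu wrapper above. Unlike the dict-keyed
# variants, its compute_score expects `res` as an iterable of entries with
# 'image_id' and 'caption' keys; the ids and captions here are invented.
scorer = Bleu(n=4)
gts = {42: ["a man rides a horse", "a person on a horse"]}
res = [{'image_id': 42, 'caption': ["a man riding a horse"]}]
score, scores = scorer.compute_score(gts, res)
print(scorer.method(), score)  # e.g. Bleu [b1, b2, b3, b4]
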
def compute_score_for_consensus2(self, gts, subGts):
    assert(len(gts.keys()) == 1)
    assert(gts.keys() == subGts.keys())
    imgIds = gts.keys()

    bleu_scorer = BleuScorer(n=self._n)
    for id in imgIds:
        ref = gts[id]
        subRef = subGts[id]

        # Sanity check.
        assert(type(subRef) is list)
        assert(type(ref) is list)
        assert(len(ref) > 1)

        # Score every reference against every sub-reference, one pair at a time.
        for r1 in ref:
            for r2 in subRef:
                bleu_scorer += (r1, [r2])

    score, scores = bleu_scorer.compute_score(option='closest', verbose=0)

    # scores has shape (n, len(ref) * len(subRef)): sum BLEU-1..n per pair,
    # then sum each reference's pair scores over all sub-references.
    scores = np.sum(scores, axis=0)
    scores = scores.reshape((len(ref), len(subRef)))
    scores = np.sum(scores, axis=1)

    return score, scores

def compute_score(self, gts, res):
    assert(gts.keys() == res.keys())
    imgIds = gts.keys()

    bleu_scorer = BleuScorer(n=self._n)
    for id in imgIds:
        hypo = res[id]
        ref = gts[id]

        # Sanity check. The one-hypothesis check is dropped here to handle
        # the paragraph-level case, where res holds several sentences.
        assert(type(hypo) is list)
        assert(type(ref) is list)
        assert(len(ref) >= 1)

        # Score every hypothesis sentence against the references.
        for hypo_element in hypo:
            bleu_scorer += (hypo_element, ref)

    # Alternative reference-length options: 'shortest', 'average'.
    score, scores = bleu_scorer.compute_score(option='closest', verbose=1)

    return score, scores

class evalSentence:
    def __init__(self, coco, useBleu=False, useCider=False):
        self.coco = coco
        self.useBleu = useBleu
        self.useCider = useCider
        self.params = {'image_id': coco.getImgIds()}

        imgIds = self.params['image_id']
        gts = {}
        for imgId in imgIds:
            gts[imgId] = self.coco.imgToAnns[imgId]

        if self.useBleu:
            self.b_scorer = BleuScorer()
        if self.useCider:
            self.c_scorer = CiderScorer()

        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(gts)

        for imgId in imgIds:
            ref = gts[imgId]
            assert (type(ref) is list)
            assert (len(ref) > 0)
            if self.useCider:
                self.c_scorer += (None, ref)

        if self.useCider:
            self.c_scorer.compute_doc_freq()
            assert (len(self.c_scorer.ctest) >= max(
                self.c_scorer.document_frequency.values()))

    def eval_cider(self, test, ref):
        assert (self.useCider)
        c_score = self.c_scorer.compute_cider(test, ref)
        return np.array(c_score)

    def eval_bleu(self, test, ref):
        assert (self.useBleu)
        self.b_scorer.reset_list()
        for ts, rs in zip(test, ref):
            self.b_scorer += (ts, rs)
        b_score, b_scores = self.b_scorer.compute_score()
        return b_scores[3]  # return bleu_4

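# Hedged usage sketch for evalSentence. `coco` is assumed to be a loaded
# COCO annotations object; the argument shapes follow the zip(test, ref)
# loop in eval_bleu (parallel lists of hypothesis strings and reference
# lists), and the eval_cider call assumes compute_cider takes the same.
evaluator = evalSentence(coco, useBleu=True, useCider=True)
test = ["a cat sits on a mat"]
refs = [["a cat is sitting on a mat", "a small cat on a rug"]]
print(evaluator.eval_bleu(test, refs))   # per-sentence BLEU-4
print(evaluator.eval_cider(test, refs))  # per-sentence CIDEr scores
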
model.load_state_dict(torch.load("latest_model_49.pt"))
model.to(device)

# transforms.Scale is deprecated in newer torchvision; Resize is its replacement.
scaler = transforms.Scale((224, 224))
totensor = transforms.ToTensor()
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

orders = torch.cat([
    torch.arange(MAX_SEQ_LEN, dtype=torch.long, device=device).unsqueeze(0)
    for _ in range(BATCHSIZE)
], dim=0)

bleu_scorer = BleuScorer(n=4)
result_json = []

model.eval()
for ids in val_loader:
    ids = ids.squeeze(1)
    captions = [val[val.id == a.item()].iloc[0].captions for a in ids]
    filenames = [val[val.id == a.item()].iloc[0].filename for a in ids]
    filenames = [val_folder_path + filename for filename in filenames]
    # filenames = [test_folder_path + filename for filename in filenames]

    image_features = torch.cat([
        get_image(filename, scaler, totensor, normalize)
        for filename in filenames
    ], dim=0).to(device)

    if len(filenames) == BATCHSIZE:

def evaluate_stylize(G12, G21, loader, opts, split='test'):
    """
    Evaluates sentence generation from both generators using BLEU (B1-B4).
    """
    depleted = False
    sents_s1_all = []      # GT s1
    sents_s2_hat_all = []  # GT s1 -> s2 hat
    sents_s2_all = []      # GT s2
    sents_s1_hat_all = []  # GT s2 -> s1 hat

    while not depleted:
        # Sents: batch_size x max_length [w1, w2, ..., <eos>, <pad>, <pad>, ...]
        # Masks: batch_size x max_length [ 1,  1, ...,     1,     0,     0, ...]
        if split == 'train':
            sents_s1, masks_s1, sents_s2, masks_s2, depleted = loader.next_batch_train()
        elif split == 'val':
            sents_s1, masks_s1, sents_s2, masks_s2, depleted = loader.next_batch_val()
        elif split == 'test':
            sents_s1, masks_s1, sents_s2, masks_s2, depleted = loader.next_batch_test()

        batch_size = sents_s1.shape[0]
        # Assuming both styles have the same max_length.
        max_length = sents_s1.shape[1]

        # The source input must not contain a start token.
        input_sents_s1 = torch.LongTensor(sents_s1)
        input_sents_s2 = torch.LongTensor(sents_s2)
        if use_cuda:
            input_sents_s1 = input_sents_s1.cuda()
            input_sents_s2 = input_sents_s2.cuda()
        input_sents_s1 = Variable(input_sents_s1)
        input_sents_s2 = Variable(input_sents_s2)

        # Encode the input source sentences.
        input_sents_s1_encoded, hidden_12 = G12.encode(input_sents_s1)
        input_sents_s2_encoded, hidden_21 = G21.encode(input_sents_s2)

        # Generate the predicted targets; the initial decoder input must be
        # the start token.
        decoder_input_s2_hat = Variable(
            torch.LongTensor(np.ones((batch_size, 1)) * opts.start_idx_s2))
        decoder_input_s1_hat = Variable(
            torch.LongTensor(np.ones((batch_size, 1)) * opts.start_idx_s1))
        if use_cuda:
            decoder_input_s2_hat = decoder_input_s2_hat.cuda()
            decoder_input_s1_hat = decoder_input_s1_hat.cuda()

        rollouts_s2_hat, _ = G12.decoder_rollout(max_length, decoder_input_s2_hat,
                                                 hidden_12, input_sents_s1_encoded,
                                                 opts.alpha)
        rollouts_s1_hat, _ = G21.decoder_rollout(max_length, decoder_input_s1_hat,
                                                 hidden_21, input_sents_s2_encoded,
                                                 opts.alpha)

        sents_s2_hat = rollouts_s2_hat.data.cpu().numpy().astype(int)
        sents_s1_hat = rollouts_s1_hat.data.cpu().numpy().astype(int)

        # Convert token ids back to string sentences.
        sents_s1_all.extend(get_sentence_from_np(sents_s1, loader, src=True))
        sents_s1_hat_all.extend(get_sentence_from_np(sents_s1_hat, loader, src=True))
        sents_s2_all.extend(get_sentence_from_np(sents_s2, loader, src=False))
        sents_s2_hat_all.extend(get_sentence_from_np(sents_s2_hat, loader, src=False))

    # Compute BLEU scores in both transfer directions.
    bleu_scorer_G21 = BleuScorer(n=4)
    bleu_scorer_G12 = BleuScorer(n=4)
    for i in range(len(sents_s1_all)):
        bleu_scorer_G21 += (sents_s1_hat_all[i], [sents_s1_all[i]])
        bleu_scorer_G12 += (sents_s2_hat_all[i], [sents_s2_all[i]])
    bleu_G21, _ = bleu_scorer_G21.compute_score(option='closest')
    bleu_G12, _ = bleu_scorer_G12.compute_score(option='closest')

    print('BLEU scores for Style 1 to 2 ===> B1: %.3f B2: %.3f B3: %.3f B4: %.3f'
          % (bleu_G12[0], bleu_G12[1], bleu_G12[2], bleu_G12[3]))
    print('BLEU scores for Style 2 to 1 ===> B1: %.3f B2: %.3f B3: %.3f B4: %.3f'
          % (bleu_G21[0], bleu_G21[1], bleu_G21[2], bleu_G21[3]))

# from https://github.com/mtanti/coco-caption/blob/master/pycocoevalcap/bleu/bleu_scorer.py
from bleu_scorer import BleuScorer

true_sentences = []
pred_sentences = []

# Keep only the text before the first [SEP] token on each line.
with open("eval/preds.txt", "r") as f:
    for pred in f:
        pred = pred.split("[SEP]", 1)[0]
        pred_sentences.append(pred)

with open("eval/golds.txt", "r") as g:
    for true in g:
        true = true.split("[SEP]", 1)[0]
        true_sentences.append([true])

bleu_scorer = BleuScorer(n=4)  # up to 4-grams
for true, pred in zip(true_sentences, pred_sentences):
    bleu_scorer += (pred, true)

scores, instance_scores = bleu_scorer.compute_score(option='closest', verbose=0)
print("BLEU1 score: ", scores[0])
print("BLEU4 score: ", scores[3])

def bleu(output, ref):
    scorer = BleuScorer(n=4)
    scorer += (output.lower(), [ref.lower()])
    score, _ = scorer.compute_score()
    return score
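
# Example: scoring one sentence pair with the helper above; compute_score
# with its default option returns the [BLEU-1, ..., BLEU-4] list.
print(bleu("The cat sat on the mat", "A cat is sitting on the mat"))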