def setup(self, metrics, noatt_gt_path, regions_gt_path, example_ids):
    """Prepare scorers and ground-truth captions for evaluation.

    Args:
        metrics: iterable of metric names to enable; unrecognized names
            are silently ignored (same as the original if-chain).
        noatt_gt_path: path to a JSON file with a top-level 'gts' mapping
            of base example id -> reference captions.
        regions_gt_path: optional path to a region-level ground-truth JSON
            (same 'gts' layout, keyed by the full example id); when None,
            ``self.gts_regions`` is not created.
        example_ids: ids of the form '<base_id>_<suffix>' — the part before
            the first '_' indexes the no-attention ground truth.
            # assumes every base id is present in the GT file — TODO confirm
    """
    # Metric name -> scorer factory. Factories keep construction lazy so
    # only the requested scorers are instantiated (Meteor/SPICE spawn JVMs).
    scorer_factories = {
        'BLEU-1': lambda: Bleu(n=1),
        'BLEU-2': lambda: Bleu(n=2),
        'BLEU-3': lambda: Bleu(n=3),
        'BLEU-4': lambda: Bleu(n=4),
        'ROUGE-L': Rouge,
        'METEOR': Meteor,
        'CIDEr': Cider,
        'SPICE': Spice,
    }
    self.scorers = {
        metric: scorer_factories[metric]()
        for metric in metrics
        if metric in scorer_factories
    }

    # Load all ground-truth captions for text-metric evaluation.
    with open(noatt_gt_path) as gt_file:
        gts_all_base = PTBTokenizer.tokenize(json.load(gt_file)['gts'])

    # Match our example_ids to the base captions (without the _X suffix).
    self.gts_all = {
        example_id: gts_all_base[example_id.split('_')[0]]
        for example_id in example_ids
    }

    if regions_gt_path is not None:
        with open(regions_gt_path) as gt_file:
            gts_regions_base = PTBTokenizer.tokenize(
                json.load(gt_file)['gts'])
        # Region GT is keyed by the full example id (suffix included).
        self.gts_regions = {
            example_id: gts_regions_base[example_id]
            for example_id in example_ids
        }
# NOTE(review): this fragment begins mid-way through a per-example loop over
# index ``i`` (``pred_cap``, ``gt_captions``, ``nw_aligner``, ``scores_nw``,
# ``gts``, ``gen`` are defined outside the visible span). The original
# indentation was lost; statement order is preserved exactly, and presumably
# the metric computation below ran once after the loop — TODO confirm.

# Needleman-Wunsch alignment: average the alignment score of the predicted
# caption against every ground-truth caption for example i.
score_nw = 0.
for c in gt_captions[i]:
    score = nw_aligner.score(c, pred_cap)
    score_nw += score
scores_nw.append(score_nw / len(gt_captions[i]))
gts[i] = gt_captions[i]
gen[i] = [pred_cap]

# Tokenize references and generations, then report the standard caption
# metrics (BLEU-1..4, METEOR, ROUGE-L, CIDEr, SPICE).
gts_t = PTBTokenizer.tokenize(gts)
gen_t = PTBTokenizer.tokenize(gen)
val_bleu, _ = Bleu(n=4).compute_score(gts_t, gen_t)
# Fixed label typo: was 'Blue_1'.
method = ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']
for metric, score in zip(method, val_bleu):
    print(metric, score)
val_meteor, _ = Meteor().compute_score(gts_t, gen_t)
print('METEOR', val_meteor)
val_rouge, _ = Rouge().compute_score(gts_t, gen_t)
print('ROUGE_L', val_rouge)
val_cider, _ = Cider().compute_score(gts_t, gen_t)
print('CIDEr', val_cider)
val_spice, _ = Spice().compute_score(gts_t, gen_t)
print('SPICE', val_spice)
print('NW Alignment Score', np.mean(scores_nw))