def language_eval(dataset, preds, preds_n, eval_kwargs, split):
    model_id = eval_kwargs['id']
    eval_oracle = eval_kwargs.get('eval_oracle', 0)

    # create output dictionary
    out = {}

    if len(preds_n) > 0:
        # vocab size and novel sentences
        if 'coco' in dataset:
            dataset_file = 'data/dataset_coco.json'
        elif 'flickr30k' in dataset or 'f30k' in dataset:
            dataset_file = 'data/dataset_flickr30k.json'
        training_sentences = set([' '.join(__['tokens'])
                                  for _ in json.load(open(dataset_file))['images']
                                  if not _['split'] in ['val', 'test']
                                  for __ in _['sentences']])
        generated_sentences = set([_['caption'] for _ in preds_n])
        novels = generated_sentences - training_sentences
        out['novel_sentences'] = float(len(novels)) / len(preds_n)
        tmp = [_.split() for _ in generated_sentences]
        words = []
        for _ in tmp:
            words += _
        out['vocab_size'] = len(set(words))

    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    cache_path = os.path.join('eval_results/', '.cache_' + model_id + '_' + split + '.json')

    coco = getCOCO(dataset)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set
    preds_filt = [p for p in preds if p['image_id'] in valids]
    mean_perplexity = sum([_['perplexity'] for _ in preds_filt]) / len(preds_filt)
    mean_entropy = sum([_['entropy'] for _ in preds_filt]) / len(preds_filt)
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt, open(cache_path, 'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    for metric, score in cocoEval.eval.items():
        out[metric] = score
    # Add mean perplexity
    out['perplexity'] = mean_perplexity
    out['entropy'] = mean_entropy

    imgToEval = cocoEval.imgToEval
    for k in list(imgToEval.values())[0]['SPICE'].keys():
        if k != 'All':
            out['SPICE_' + k] = np.array([v['SPICE'][k]['f'] for v in imgToEval.values()])
            out['SPICE_' + k] = (out['SPICE_' + k][out['SPICE_' + k] == out['SPICE_' + k]]).mean()
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption

    if len(preds_n) > 0:
        import eval_multi
        cache_path_n = os.path.join('eval_results/', '.cache_' + model_id + '_' + split + '_n.json')
        spice_n = eval_multi.eval_spice_n(dataset, preds_n, model_id, split)
        out.update(spice_n['overall'])
        div_stats = eval_multi.eval_div_stats(dataset, preds_n, model_id, split)
        out.update(div_stats['overall'])
        if eval_oracle:
            oracle = eval_multi.eval_oracle(dataset, preds_n, model_id, split)
            out.update(oracle['overall'])
        else:
            oracle = None
        self_cider = eval_multi.eval_self_cider(dataset, preds_n, model_id, split)
        out.update(self_cider['overall'])
        with open(cache_path_n, 'w') as outfile:
            json.dump({'spice_n': spice_n, 'div_stats': div_stats,
                       'oracle': oracle, 'self_cider': self_cider}, outfile)

    out['bad_count_rate'] = sum([count_bad(_['caption']) for _ in preds_filt]) / float(len(preds_filt))
    outfile_path = os.path.join('eval_results/', model_id + '_' + split + '.json')
    with open(outfile_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)
    return out
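# --- Hedged sketch, not part of the original file ------------------------------
# The function above assumes that json, os, np (numpy), COCOEvalCap, eval_multi,
# getCOCO and count_bad are available from the surrounding module. The two helper
# sketches below only illustrate plausible implementations of the last two: the
# COCO annotation path matches the annFile used in the next definition, but the
# f30k path and the "bad ending" word list are assumptions, not confirmed values.
from pycocotools.coco import COCO


def getCOCO_sketch(dataset):
    # Map a dataset name onto its ground-truth annotation file.
    if 'coco' in dataset:
        annFile = 'coco-caption/annotations/captions_val2014.json'
    elif 'flickr30k' in dataset or 'f30k' in dataset:
        annFile = 'data/f30k_captions4eval.json'  # assumed path
    return COCO(annFile)


def count_bad_sketch(caption):
    # Return 1 if the caption ends on a dangling function word (a common
    # greedy-decoding artifact), else 0. The word list is illustrative only.
    bad_endings = {'a', 'an', 'the', 'in', 'for', 'at', 'of', 'with', 'on', 'to'}
    words = caption.strip().split()
    return int(bool(words) and words[-1] in bad_endings)
# --------------------------------------------------------------------------------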
def language_eval(dataset, preds, preds_n, eval_kwargs, split):
    model_id = eval_kwargs['id']
    eval_oracle = eval_kwargs.get('eval_oracle', 0)

    import sys
    sys.path.append("coco-caption")
    annFile = 'coco-caption/annotations/captions_val2014.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/', '.cache_' + model_id + '_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    mean_perplexity = sum([_['perplexity'] for _ in preds_filt]) / len(preds_filt)
    mean_entropy = sum([_['entropy'] for _ in preds_filt]) / len(preds_filt)
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt, open(cache_path, 'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score
    # Add mean perplexity
    out['perplexity'] = mean_perplexity
    out['entropy'] = mean_entropy

    imgToEval = cocoEval.imgToEval
    for k in list(imgToEval.values())[0]['SPICE'].keys():
        if k != 'All':
            out['SPICE_' + k] = np.array([v['SPICE'][k]['f'] for v in imgToEval.values()])
            out['SPICE_' + k] = (out['SPICE_' + k][out['SPICE_' + k] == out['SPICE_' + k]]).mean()
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption

    if len(preds_n) > 0:
        cache_path_n = os.path.join('eval_results/', '.cache_' + model_id + '_' + split + '_n.json')
        spice_n = eval_multi.eval_spice_n(preds_n, model_id, split)
        out.update(spice_n['overall'])
        div_stats = eval_multi.eval_div_stats(preds_n, model_id, split)
        out.update(div_stats['overall'])
        if eval_oracle:
            oracle = eval_multi.eval_oracle(preds_n, model_id, split)
            out.update(oracle['overall'])
        else:
            oracle = None  # avoid a NameError when the oracle evaluation is skipped
        with open(cache_path_n, 'w') as outfile:
            json.dump({'spice_n': spice_n, 'div_stats': div_stats, 'oracle': oracle}, outfile)

    out['bad_count_rate'] = sum([count_bad(_['caption']) for _ in preds_filt]) / float(len(preds_filt))
    outfile_path = os.path.join('eval_results/', model_id + '_' + split + '.json')
    with open(outfile_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)
    return out
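# --- Hedged usage sketch, not part of the original file -------------------------
# language_eval expects every prediction dict to carry 'image_id', 'caption',
# 'perplexity' and 'entropy', and eval_kwargs to carry 'id' (the optional
# 'eval_oracle' flag defaults to 0). The image id and caption below are made-up
# placeholders; real ids must exist in the COCO validation annotations, otherwise
# preds_filt ends up empty and the mean perplexity/entropy division fails.
if __name__ == '__main__':
    example_preds = [
        {'image_id': 184613, 'caption': 'a man riding a wave on a surfboard',
         'perplexity': 7.2, 'entropy': 2.1},
    ]
    example_kwargs = {'id': 'demo_model', 'eval_oracle': 0}
    # preds_n feeds the diversity/oracle metrics; pass [] to score single captions only.
    scores = language_eval('coco', example_preds, [], example_kwargs, 'val')
    print(scores)
# --------------------------------------------------------------------------------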