Example #1
def coco_eval(candidates_file, references_file):
  """
    Given the candidates and references, the coco-caption module is 
    used to calculate various metrics. Returns a list of dictionaries containing:
    -BLEU
    -ROUGE
    -METEOR
    -CIDEr
  """

  # This is used to suppress the output of coco-eval:
  old_stdout = sys.stdout
  sys.stdout = open(os.devnull, "w")
  try:
    # Derived from example code in coco-captions repo
    coco    = COCO( references_file )
    cocoRes = coco.loadRes( candidates_file )
  
    cocoEval = COCOEvalCap(coco, cocoRes)

    cocoEval.evaluate()
  finally:
    # Change back to standard output
    sys.stdout.close()
    sys.stdout = old_stdout
  
  return cocoEval.evalImgs
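
A minimal usage sketch of the helper above (the file names, ids, and captions are placeholders, not part of the original example): candidates_file is a COCO results-style JSON, i.e. a list of {"image_id", "caption"} records, and references_file is a COCO-format captions annotation file.

import json

# Hypothetical candidate captions; the image ids must exist in references.json.
candidates = [
    {"image_id": 42, "caption": "a dog running across a grassy field"},
    {"image_id": 73, "caption": "two people riding bicycles down a street"},
]
with open("candidates.json", "w") as f:
    json.dump(candidates, f)

per_image_scores = coco_eval("candidates.json", "references.json")
print(per_image_scores[0])  # e.g. {'image_id': 42, 'Bleu_1': ..., 'CIDEr': ...}

Example #2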
def language_eval(dataset, preds):
    import sys
    if 'coco' in dataset:
        sys.path.append("coco-caption")
        annFile = 'coco-caption/annotations/captions_val2014.json'
    else:
        sys.path.append("f30k-caption")
        annFile = 'f30k-caption/annotations/dataset_flickr30k.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt, open('tmp.json', 'w')) # serialize to temporary json file. Sigh, COCO API...

    resFile = 'tmp.json'
    cocoRes = coco.loadRes(resFile)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    return out
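
A hedged usage sketch for language_eval above (the image ids and captions are placeholders and must exist in the annotation file):

preds = [
    {'image_id': 391895, 'caption': 'a man riding a motorcycle on a dirt road'},
    {'image_id': 522418, 'caption': 'a woman cutting a cake with a knife'},
]
scores = language_eval('coco', preds)
for metric, score in scores.items():
    print('%s: %.3f' % (metric, score))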
Example #3
def main():
  HASH_IMG_NAME = True
  pylab.rcParams['figure.figsize'] = (10.0, 8.0)
  json.encoder.FLOAT_REPR = lambda o: format(o, '.3f')

  parser = argparse.ArgumentParser()
  parser.add_argument("-i", "--inputfile", type=str, required=True,
      help='File containing model-generated/hypothesis sentences.')
  parser.add_argument("-r", "--references", type=str, required=True,
      help='JSON File containing references/groundtruth sentences.')
  args = parser.parse_args()
  prediction_file = args.inputfile
  reference_file = args.references
  json_predictions_file = '{0}.json'.format(prediction_file)
  
  crf = CocoResFormat()
  crf.read_file(prediction_file, HASH_IMG_NAME)
  crf.dump_json(json_predictions_file)
   
  # create coco object and cocoRes object.
  coco = COCO(reference_file)
  cocoRes = coco.loadRes(json_predictions_file)
  
  # create cocoEval object.
  cocoEval = COCOEvalCap(coco, cocoRes)
  
  # evaluate results
  cocoEval.evaluate()
  
  # print output evaluation scores
  for metric, score in cocoEval.eval.items():
    print('%s: %.3f' % (metric, score))
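Example #4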
def coco_eval(ann_fn, json_fn, save_fn):
    coco = COCO(ann_fn)
    coco_res = coco.loadRes(json_fn)
    coco_evaluator = COCOEvalCap(coco, coco_res)
    # Comment out the line below to evaluate the full validation or testing set.
    coco_evaluator.params['image_id'] = coco_res.getImgIds()
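    # Note: the stock coco-caption COCOEvalCap.evaluate() takes no arguments;
    # passing save_fn here assumes a locally modified evaluator that also
    # writes its results to save_fn.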
    coco_evaluator.evaluate(save_fn)
Example #5
def main(argv):
    input_json = 'results/' + sys.argv[1]

    annFile = 'annotations/captions_val2014.json'
    coco = COCO(annFile)
    valids = coco.getImgIds()

    checkpoint = json.load(open(input_json, 'r'))
    preds = checkpoint['val_predictions']

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt, open('tmp.json', 'w')) # serialize to temporary json file. Sigh, COCO API...

    resFile = 'tmp.json'
    cocoRes = coco.loadRes(resFile)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score
    # serialize to file, to be read from Lua
    json.dump(out, open(input_json + '_out.json', 'w'))
Example #6
def language_eval(input_data, savedir, split):
  if type(input_data) == str: # Filename given.
    checkpoint = json.load(open(input_data, 'r'))
    preds = checkpoint
  elif type(input_data) == list: # Direct predictions given.
    preds = input_data

  annFile = 'third_party/coco-caption/annotations/captions_val2014.json'
  coco = COCO(annFile)
  valids = coco.getImgIds()

  # Filter results to only those in MSCOCO validation set (will be about a third)
  preds_filt = [p for p in preds if p['image_id'] in valids]
  print('Using %d/%d predictions' % (len(preds_filt), len(preds)))
  resFile = osp.join(savedir, 'result_%s.json' % (split))
  json.dump(preds_filt, open(resFile, 'w')) # Serialize to temporary json file. Sigh, COCO API...

  cocoRes = coco.loadRes(resFile)
  cocoEval = COCOEvalCap(coco, cocoRes)
  cocoEval.params['image_id'] = cocoRes.getImgIds()
  cocoEval.evaluate()

  # Create output dictionary.
  out = {}
  for metric, score in cocoEval.eval.items():
    out[metric] = score

  # Return aggregate and per image score.
  return out, cocoEval.evalImgs
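Example #7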
def evaluateModel(model_json):
    cocoRes = coco.loadRes(model_json)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()  
    cocoEval.evaluate()
    results = {}
    for metric, score in cocoEval.eval.items():
        results[metric] = score
    return results
Example #8
def score_generation(gt_filename=None, generation_result=None):

  coco_dict = read_json(generation_result)
  coco = COCO(gt_filename)
  generation_coco = coco.loadRes(generation_result)
  coco_evaluator = COCOEvalCap(coco, generation_coco)
  #coco_image_ids = [self.sg.image_path_to_id[image_path]
  #                  for image_path in self.images]
  coco_image_ids = [j['image_id'] for j in coco_dict]
  coco_evaluator.params['image_id'] = coco_image_ids
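  # Note: return_results is not an argument of the stock COCOEvalCap.evaluate();
  # this call assumes a customized evaluator that returns the score dictionary.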
  results = coco_evaluator.evaluate(return_results=True)
  return results
Example #9
def measure(prediction_txt_path, reference):
    # Convert the txt-format predictions into the format required by the evaluation code.
    crf = CocoResFormat()
    crf.read_file(prediction_txt_path, True)

    # crf.res holds the predictions after format conversion.
    cocoRes = reference.loadRes(crf.res)
    cocoEval = COCOEvalCap(reference, cocoRes)

    cocoEval.evaluate()

    for metric, score in cocoEval.eval.items():
        print('%s: %.3f' % (metric, score))
    return cocoEval.eval
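Example #10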
def language_eval(dataset, preds, model_id, split):
    import sys
    if 'coco' in dataset:
        sys.path.append("coco-caption")
        annFile = 'coco-caption/annotations/captions_val2014.json'
    elif 'msvd' in dataset:
        sys.path.append('coco-caption')
        annFile = 'coco-caption/annotations/coco_ref_msvd.json'
    elif 'kuaishou' in dataset:
        sys.path.append('coco-caption')
        annFile = 'coco-caption/annotations/coco_ref_kuaishou.json'
    else:
        sys.path.append("f30k-caption")
        annFile = 'f30k-caption/annotations/dataset_flickr30k.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/', model_id + '_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption
    with open(cache_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    return out
Example #11
  def coco_val_eval(self, pred_path, result_path):
    """Evaluate the predicted sentences on MS COCO validation."""
    sys.path.append('./external/coco-caption')
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    coco = COCO('./external/coco-caption/annotations/captions_val2014.json')
    cocoRes = coco.loadRes(pred_path)

    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    with open(result_path, 'w') as fout:
      for metric, score in cocoEval.eval.items():
        print('%s: %.3f' % (metric, score), file=fout)
Example #12
def run(dataset,algName,outDir):

    pylab.rcParams['figure.figsize'] = (10.0, 8.0)

    import json
    from json import encoder
    encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    # set up file names and paths
    # dataDir='./data/'+dataset
    # dataDir= '/media/SSD/projects/NeuralTalkAnimator'
    dataType='val'

    # annFile='%s/annotations/captions_%s.json'%(dataDir,dataType)
    # annFile='/media/SSD/projects/NeuralTalkAnimator/data/youtube2text/captions_val2014.json'
    dataDir = 'data/'+dataset
    annFile='%s/captions_%s.json'%(dataDir,dataType)
    subtypes=['results', 'evalImgs', 'eval']
    [resFile, evalImgsFile, evalFile]= \
    ['%s/captions_%s_%s_%s.json'%(outDir,dataType,algName,subtype) for subtype in subtypes]

    coco = COCO(annFile)
    cocoRes = coco.loadRes(resFile)

    # create cocoEval object by taking coco and cocoRes
    cocoEval = COCOEvalCap(coco, cocoRes)

    # evaluate on a subset of images by setting
    # cocoEval.params['image_id'] = cocoRes.getImgIds()
    # please remove this line when evaluating the full validation set
    cocoEval.params['image_id'] = cocoRes.getImgIds()

    # evaluate results
    cocoEval.evaluate()

    # print output evaluation scores
    scores = list()
    for metric, score in cocoEval.eval.items():
        print('%s: %.3f' % (metric, score))
        scores.append(score)

    print('inside metrics')
    return scores
Example #13
import sys
import json

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

input_json = sys.argv[1]

annFile = 'annotations/captions_val2014.json'
coco = COCO(annFile)
valids = coco.getImgIds()

checkpoint = json.load(open(input_json, 'r'))
preds = checkpoint['val_predictions']

# filter results to only those in MSCOCO validation set (will be about a third)
preds_filt = [p for p in preds if p['image_id'] in valids]
# preds_filt = preds
print('using %d/%d predictions' % (len(preds_filt), len(preds)))
json.dump(preds_filt, open('tmp.json', 'w')) # serialize to temporary json file. Sigh, COCO API...

resFile = 'tmp.json'
cocoRes = coco.loadRes(resFile)
cocoEval = COCOEvalCap(coco, cocoRes)
cocoEval.params['image_id'] = cocoRes.getImgIds()
cocoEval.evaluate()

# create output dictionary
out = {}
for metric, score in cocoEval.eval.items():
    out[metric] = score
# serialize to file, to be read from Lua
json.dump(out, open(input_json + '_out.json', 'w'))

Example #14
def language_eval(dataset, preds, preds_n, eval_kwargs, split):
    model_id = eval_kwargs['id']
    eval_oracle = eval_kwargs.get('eval_oracle', 0)

    import sys
    sys.path.append("coco-caption")
    annFile = 'coco-caption/annotations/captions_val2014.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap

    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/',
                              '.cache_' + model_id + '_' + split + '.json')

    coco = COCO(annFile)
    valids = coco.getImgIds()

    # filter results to only those in MSCOCO validation set (will be about a third)
    preds_filt = [p for p in preds if p['image_id'] in valids]
    mean_perplexity = sum([_['perplexity']
                           for _ in preds_filt]) / len(preds_filt)
    mean_entropy = sum([_['entropy'] for _ in preds_filt]) / len(preds_filt)
    print('using %d/%d predictions' % (len(preds_filt), len(preds)))
    json.dump(preds_filt,
              open(cache_path,
                   'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score
    # Add mean perplexity
    out['perplexity'] = mean_perplexity
    out['entropy'] = mean_entropy

    imgToEval = cocoEval.imgToEval
    for k in list(imgToEval.values())[0]['SPICE'].keys():
        if k != 'All':
            spice_scores = np.array([v['SPICE'][k]['f'] for v in imgToEval.values()])
            # mean over non-NaN entries (x == x filters out NaN)
            out['SPICE_' + k] = spice_scores[spice_scores == spice_scores].mean()
    for p in preds_filt:
        image_id, caption = p['image_id'], p['caption']
        imgToEval[image_id]['caption'] = caption

    if len(preds_n) > 0:
        cache_path_n = os.path.join(
            'eval_results/', '.cache_' + model_id + '_' + split + '_n.json')
        spice_n = eval_multi.eval_spice_n(preds_n, model_id, split)
        out.update(spice_n['overall'])
        div_stats = eval_multi.eval_div_stats(preds_n, model_id, split)
        out.update(div_stats['overall'])
        oracle = None
        if eval_oracle:
            oracle = eval_multi.eval_oracle(preds_n, model_id, split)
            out.update(oracle['overall'])
        with open(cache_path_n, 'w') as outfile:
            json.dump(
                {
                    'spice_n': spice_n,
                    'div_stats': div_stats,
                    'oracle': oracle
                }, outfile)

    out['bad_count_rate'] = sum([count_bad(_['caption'])
                                 for _ in preds_filt]) / float(len(preds_filt))
    outfile_path = os.path.join('eval_results/',
                                model_id + '_' + split + '.json')
    with open(outfile_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    return out
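Example #15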
annFile='./annotations/captions_val2014.json'

# create coco object and cocoRes object
coco = COCO(annFile)

all_results_json=[]

for i in range(50):
    resFile = model_dir + '/caption_model%d.json' % i
    print(resFile)


    cocoRes = coco.loadRes(resFile)
    # create cocoEval object by taking coco and cocoRes
    cocoEval = COCOEvalCap(coco, cocoRes)

    # evaluate on a subset of images by setting
    # cocoEval.params['image_id'] = cocoRes.getImgIds()
    # please remove this line when evaluating the full validation set
    #cocoEval.params['image_id'] = cocoRes.getImgIds()

    #evaluate results
    cocoEval.evaluate()

    # print output evaluation scores
    results={}
    for metric, score in cocoEval.eval.items():
        results[metric]=score
    all_results_json.append(results)
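
The loop above only accumulates per-checkpoint scores. A hedged follow-up sketch (the output file name is hypothetical; all_results_json and model_dir are reused from above) that saves them and reports the checkpoint with the best CIDEr:

with open(model_dir + '/all_caption_scores.json', 'w') as f:
    json.dump(all_results_json, f)

best = max(range(len(all_results_json)), key=lambda i: all_results_json[i]['CIDEr'])
print('best checkpoint: %d (CIDEr %.3f)' % (best, all_results_json[best]['CIDEr']))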
Example #16
def language_eval(dataset, preds, model_id, split):
    import sys
    sys.path.append("coco-caption")
    if 'coco' in dataset:
        annFile = 'coco-caption/annotations/captions_val2014.json'
    elif 'flickr30k' in dataset or 'f30k' in dataset:
        annFile = 'coco-caption/f30k_captions4eval.json'
    elif 'person' in dataset:
        annFile='coco-caption/person_captions4eval.json'
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap
    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/', '.cache_'+ model_id + '_' + split + '.json')
    best_cider=0
    #gdindex=[0,1,2,3,4]
    gdindex=[-1]
    cider_list =[]
    for i in gdindex:
        annFile='coco-caption/person_captions4eval_'+str(i)+'.json'
        print(annFile)
        coco = COCO(annFile)    
        valids = coco.getImgIds()

        # filter results to only those in MSCOCO validation set (will be about a third)
        preds_filt = [p for p in preds if p['image_id'] in valids]
        print('using %d/%d predictions' % (len(preds_filt), len(preds)))
        json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API...
        cocoRes = coco.loadRes(cache_path)
        cocoEval = COCOEvalCap(coco, cocoRes)
        cocoEval.params['image_id'] = cocoRes.getImgIds()
        cocoEval.evaluate()
        cider_list.append(cocoEval.eval['CIDEr'])
        # create output dictionary
        if cocoEval.eval['CIDEr']>=best_cider:
            best_cider = cocoEval.eval['CIDEr']
            out = {}
            for metric, score in cocoEval.eval.items():
                out[metric] = score

            imgToEval = cocoEval.imgToEval
                # collect SPICE_sub_score
            #for k in imgToEval.values()[0]['SPICE'].keys():
            #    if k != 'All':
            #        out['SPICE_'+k] = np.array([v['SPICE'][k]['f'] for v in  imgToEval.values()])
            #        out['SPICE_'+k] = (out['SPICE_'+k][out['SPICE_'+k]==out['SPICE_'+k]]).mean()
            
            for p in preds_filt:
                image_id, caption = p['image_id'], p['caption']
                imgToEval[image_id]['caption'] = caption
            #update predictions
            for j in range(len(preds)):
                if preds[j]['image_id'] in imgToEval:
                    preds[j]['eval'] = imgToEval[preds[j]['image_id']]

            out['bad_count_rate'] = sum([count_bad(_['caption']) for _ in preds_filt]) / float(len(preds_filt))
        else:
            continue
    outfile_path = os.path.join('eval_results/', model_id + '_' + split + '.json')
    with open(outfile_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)
    cider_list=np.array(cider_list)
    print("min:",np.min(cider_list)," max:",np.max(cider_list)," mean:",np.mean(cider_list)," std:",np.std(cider_list))
    return out
Example #17
import os
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

coco = COCO(
    os.path.expanduser(
        "~/Projects/datasets/COCO/annotations/captions_val2017.json"))
cocoRes = coco.loadRes(
    os.path.expanduser("~/Projects/datasets/COCO/results_3.json"))

# create cocoEval object by taking coco and cocoRes
cocoEval = COCOEvalCap(coco, cocoRes)

# evaluate on a subset of images by setting
# cocoEval.params['image_id'] = cocoRes.getImgIds()
# please remove this line when evaluating the full validation set
cocoEval.params['image_id'] = cocoRes.getImgIds()

# evaluate results
# SPICE will take a few minutes the first time, but speeds up due to caching
cocoEval.evaluate()
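
As in the other examples, the aggregate scores can then be read back from cocoEval.eval, e.g.:

for metric, score in cocoEval.eval.items():
    print('%s: %.3f' % (metric, score))

Example #18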
  def generation_experiment(self, strategy, max_batch_size=1000):
    # Compute image descriptors.
    print('Computing image descriptors')
    self.compute_descriptors()

    do_batches = (strategy['type'] == 'beam' and strategy['beam_size'] == 1) or \
        (strategy['type'] == 'sample' and
         ('temp' not in strategy or strategy['temp'] in (1, float('inf'))) and
         ('num' not in strategy or strategy['num'] == 1))

    num_images = len(self.images)
    batch_size = min(max_batch_size, num_images) if do_batches else 1

    # Generate captions for all images.
    all_captions = [None] * num_images
    for image_index in range(0, num_images, batch_size):
      batch_end_index = min(image_index + batch_size, num_images)
      sys.stdout.write("\rGenerating captions for image %d/%d" %
                       (image_index, num_images))
      sys.stdout.flush()
      if do_batches:
        if strategy['type'] == 'beam' or \
            ('temp' in strategy and strategy['temp'] == float('inf')):
          temp = float('inf')
        else:
          temp = strategy['temp'] if 'temp' in strategy else 1
        output_captions, output_probs = self.captioner.sample_captions(
            self.descriptors[image_index:batch_end_index], temp=temp)
        for batch_index, output in zip(range(image_index, batch_end_index),
                                       output_captions):
          all_captions[batch_index] = output
      else:
        for batch_image_index in range(image_index, batch_end_index):
          captions, caption_probs = self.captioner.predict_caption(
              self.descriptors[batch_image_index], strategy=strategy)
          best_caption, max_log_prob = None, None
          for caption, probs in zip(captions, caption_probs):
            log_prob = gen_stats(probs)['log_p']
            if best_caption is None or \
                (best_caption is not None and log_prob > max_log_prob):
              best_caption, max_log_prob = caption, log_prob
          all_captions[batch_image_index] = best_caption
    sys.stdout.write('\n')

    # Compute the number of reference files as the maximum number of ground
    # truth captions of any image in the dataset.
    num_reference_files = 0
    for captions in self.dataset.values():
      if len(captions) > num_reference_files:
        num_reference_files = len(captions)
    if num_reference_files <= 0:
      raise Exception('No reference captions.')

    # Collect model/reference captions, formatting the model's captions and
    # each set of reference captions as a list of len(self.images) strings.
    exp_dir = '%s/generation' % self.cache_dir
    if not os.path.exists(exp_dir):
      os.makedirs(exp_dir)
    # For each image, write out the highest probability caption.
    model_captions = [''] * len(self.images)
    reference_captions = [([''] * len(self.images)) for _ in range(num_reference_files)]
    for image_index, image in enumerate(self.images):
      caption = self.captioner.sentence(all_captions[image_index])
      model_captions[image_index] = caption
      for reference_index, (_, caption) in enumerate(self.dataset[image]):
        caption = ' '.join(caption)
        reference_captions[reference_index][image_index] = caption

    coco_image_ids = [self.sg.image_path_to_id[image_path]
                      for image_path in self.images]
    generation_result = [{
      'image_id': self.sg.image_path_to_id[image_path],
      'caption': model_captions[image_index]
    } for (image_index, image_path) in enumerate(self.images)]
    json_filename = '%s/generation_result.json' % self.cache_dir
    print('Dumping result to file: %s' % json_filename)
    with open(json_filename, 'w') as json_file:
      json.dump(generation_result, json_file)
    generation_result = self.sg.coco.loadRes(json_filename)
    coco_evaluator = COCOEvalCap(self.sg.coco, generation_result)
    coco_evaluator.params['image_id'] = coco_image_ids
    coco_evaluator.evaluate()
Example #19
def language_eval(type, preds, model_id, split):
    import sys
    if 'coco' in type:
        annFile = 'coco-caption/annotations/captions_val2014.json'
        sys.path.append("coco-caption")
        print("Load reference file from: {}".format(annFile))
        from pycocotools.coco import COCO
        from pycocoevalcap.eval import COCOEvalCap
    elif '30k' in type:
        annFile = 'coco-caption/annotations/flickr30k_val.json'
        sys.path.append("coco-caption")
        print("Load reference file from: {}".format(annFile))
        from pycocotools.coco import COCO
        from pycocoevalcap.eval import COCOEvalCap
    elif 'zh' in type:
        annFile = 'data/aic_i2t/eval_reference.json'
        sys.path.append("AI_Challenger/Evaluation/caption_eval")
        print("Load reference file from: {}".format(annFile))
        from coco_caption.pycxtools.coco import COCO
        from coco_caption.pycxevalcap.eval import COCOEvalCap
    else:
        raise Exception('Current eval type is not recognizable.')

    encoder.FLOAT_REPR = lambda o: format(o, '.3f')

    if not os.path.isdir('eval_results'):
        os.mkdir('eval_results')
    cache_path = os.path.join('eval_results/',
                              type + '_' + model_id + '_' + split + '.json')
    print("Load cache path is:" + cache_path)
    coco = COCO(annFile)
    valids = coco.getImgIds()
    # filter results to only those in MSCOCO validation set (will be about a third)
    if 'coco' in type:
        preds_filt = [p for p in preds if p['image_id'] in valids]
        print('using %d/%d predictions' % (len(preds_filt), len(preds)))
        json.dump(preds_filt, open(
            cache_path,
            'w'))  # serialize to temporary json file. Sigh, COCO API...
    elif '30k' in type:
        preds_filt = [{
            'caption': p['caption'],
            'image_id': str(p['image_id'])
        } for p in preds if p['image_id'] in valids]
        print('using %d/%d predictions' % (len(preds_filt), len(preds)))
        json.dump(preds_filt, open(
            cache_path,
            'w'))  # serialize to temporary json file, as in the other branches
    else:
        json.dump(preds, open(
            cache_path,
            'w'))  # serialize to temporary json file. Sigh, COCO API...

    cocoRes = coco.loadRes(cache_path)
    cocoEval = COCOEvalCap(coco, cocoRes)
    cocoEval.params['image_id'] = cocoRes.getImgIds()
    print(len(set(cocoRes.getImgIds()) & set(coco.getImgIds())))
    cocoEval.evaluate()

    # create output dictionary
    out = {}
    for metric, score in cocoEval.eval.items():
        out[metric] = score

    imgToEval = cocoEval.imgToEval
    # for p in preds:
    #     image_id, caption = p['image_id'], p['caption']
    #     imgToEval[image_id]['caption'] = caption
    with open(cache_path, 'w') as outfile:
        json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)

    return out