def getTfIdfWeights(params):
    # Build (or load from cache) document-frequency counts of reference-caption
    # n-grams; these serve as the idf statistics for tf-idf style weighting.
    if params.get('tfIdf_file', '') == '':
        dataset = json.load(open('/triton/ics/project/imagedb/picsom/databases/COCO/download/annotations/captions_train2014.json', 'r'))
        tokenizer = PTBTokenizer()
        # Group the raw annotations by image id before tokenizing.
        origRefs = {}
        curr_keys = set()
        n = params.get('max_ngram', 4)
        for anns in dataset['annotations']:
            if anns['image_id'] not in curr_keys:
                origRefs[anns['image_id']] = []
                curr_keys.add(anns['image_id'])
            origRefs[anns['image_id']].append(anns)
        origRefs = tokenizer.tokenize(origRefs)
        # Count, for every n-gram, the number of images whose reference set
        # contains it at least once (its document frequency).
        doc_freq = Counter()
        for imgid, refs in origRefs.iteritems():
            rCounter = Counter()
            for s in refs:
                rCounter += precook(s, n)
            for ngram in rCounter:
                doc_freq[ngram] += 1
        tfidf = {'doc_freq': doc_freq, 'N': len(origRefs)}
        # Pickle in binary mode so the cache file is portable.
        pickle.dump(tfidf, open('tf_idf_ngrams_4_allTrnRefs.p', 'wb'))
    else:
        tfidf = pickle.load(open(params.get('tfIdf_file'), 'rb'))
    return tfidf
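# --- Usage sketch (illustrative; not part of the original file) ---
# getTfIdfWeights reads two keys from params: 'tfIdf_file' (path to a cached
# pickle, empty string to recompute) and 'max_ngram'. The cache file name
# below is the one the function itself writes; the params dicts here are
# hypothetical examples.
#
#   tfidf = getTfIdfWeights({'max_ngram': 4, 'tfIdf_file': ''})
#   tfidf = getTfIdfWeights({'tfIdf_file': 'tf_idf_ngrams_4_allTrnRefs.p'})
#   doc_freq, n_docs = tfidf['doc_freq'], tfidf['N']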
def eval_prep_refs(split, dp, eval_metric):
    # Gather the ground-truth references for this split, keyed by coco image id.
    refsById = defaultdict(list)
    for s in dp.iterSentences(split=split):
        refsById[s['cocoid']].append({'image_id': s['cocoid'], 'id': s['sentid'],
                                      'caption': s['raw']})
    tokenizer = PTBTokenizer()
    refsById = tokenizer.tokenize(refsById)
    # Instantiate one scorer per requested metric.
    if not isinstance(eval_metric, list):
        eval_metric = [eval_metric]
    scorer = []
    scorer_name = []
    for evm in eval_metric:
        if evm == 'meteor':
            scorer.append(Meteor())
            scorer_name.append("METEOR")
        elif evm == 'spice':
            scorer.append(Spice())
            scorer_name.append("Spice")
        elif evm == 'cider':
            scorer.append(Cider())
            scorer_name.append("CIDEr")
        elif evm == 'rouge':
            scorer.append(Rouge())
            scorer_name.append("ROUGE_L")
        elif evm[:4] == 'bleu':
            # 'bleu_k' yields a single scorer that reports Bleu_1 .. Bleu_k at
            # once, hence a list of names is appended for this one scorer.
            bn = int(evm.split('_')[1])
            scorer.append(Bleu(bn))
            scorer_name.append(["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"][:bn])
        elif evm == 'len':
            scorer.append(lenComputer())
            scorer_name.append("Mean_Len")
        elif evm[:3] == 'div':
            dn = int(evm.split('_')[1])
            scorer.append(divComputer(dn))
            scorer_name.append("Global_Div_" + str(dn))
        elif evm[:6] == 'lcldiv':
            dn = int(evm.split('_')[1])
            scorer.append(lcldivComputer(dn))
            scorer_name.append("Local_Div_" + str(dn))
        else:
            raise ValueError('ERROR: %s --> Unsupported eval metric' % (evm))
    return refsById, {'scr_fn': scorer, 'scr_name': scorer_name, 'tokenizer': tokenizer}
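# --- Usage sketch (illustrative; not part of the original file) ---
# eval_prep_refs returns the tokenized references plus paired scorer objects
# and names; every scorer follows the pycocoevalcap compute_score(gts, res)
# convention. 'dp' is assumed to be a data provider exposing iterSentences(),
# and candsById a {image_id: [tokenized caption]} dict built by the caller.
#
#   refsById, evaluator = eval_prep_refs('val', dp, ['cider', 'bleu_4'])
#   for scr, name in zip(evaluator['scr_fn'], evaluator['scr_name']):
#       score, scores = scr.compute_score(refsById, candsById)
#       print name, score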
def main(params):
    tokenizer = PTBTokenizer()
    for resF in params['resFileList']:
        caps = json.load(open(resF, 'r'))
        capsById = {}
        idTocaps = {}
        # Keep at most keepN captions per image (the top candidate plus
        # keepN-1 from candidatelist); None means keep everything.
        n_cands = params['keepN'] - 1 if params['keepN'] is not None else None
        n = 0
        for i, img in enumerate(caps['imgblobs']):
            imgid = int(img['img_path'].split('_')[-1].split('.')[0])
            capsById[imgid] = [{'image_id': imgid, 'caption': img['candidate']['text'], 'id': n}]
            idTocaps[imgid] = i
            n += 1
            capsById[imgid].extend([{'image_id': imgid, 'caption': cd['text'], 'id': n + j}
                                    for j, cd in enumerate(img['candidatelist'][:n_cands])])
            # Pad short candidate lists by repeating the last caption, so every
            # image ends up with exactly keepN entries (only when keepN is set).
            if n_cands is not None and len(capsById[imgid]) < (n_cands + 1):
                capsById[imgid].extend([capsById[imgid][-1]
                                        for _ in xrange(n_cands + 1 - len(capsById[imgid]))])
            n += len(capsById[imgid]) - 1
        n_caps_perimg = len(capsById[capsById.keys()[0]])
        print 'Candidates per image: %d' % n_caps_perimg
        capsById = tokenizer.tokenize(capsById)

        div_1, adiv_1 = compute_div_n(capsById, 1)
        div_2, adiv_2 = compute_div_n(capsById, 2)
        globdiv_1, _ = compute_global_div_n(capsById, 1)
        print 'Diversity Statistics are as follows: \n Div1: %.2f, Div2: %.2f, gDiv1: %d\n' % (div_1, div_2, globdiv_1)

        if params['compute_mbleu']:
            scorer = Bleu(4)
            # Run 1-vs-rest BLEU: score each candidate against the image's
            # remaining candidates; low mBLEU indicates a diverse caption set.
            all_scrs = []
            scrperimg = np.zeros((n_caps_perimg, len(capsById)))
            for i in xrange(n_caps_perimg):
                tempRefsById = {}
                candsById = {}
                for k in capsById:
                    tempRefsById[k] = capsById[k][:i] + capsById[k][i + 1:]
                    candsById[k] = [capsById[k][i]]
                score, scores = scorer.compute_score(tempRefsById, candsById)
                all_scrs.append(score)
                scrperimg[i, :] = scores[1]  # per-image Bleu_2
            all_scrs = np.array(all_scrs)

            if params['writeback']:
                # Write the mBleu_2 scores back into the result json.
                for i, imgid in enumerate(capsById.keys()):
                    caps['imgblobs'][idTocaps[imgid]]['mBleu_2'] = scrperimg[:, i].mean()
                    caps['imgblobs'][idTocaps[imgid]]['candidate']['mBleu_2'] = scrperimg[0, i]
                    for j, st in enumerate(caps['imgblobs'][idTocaps[imgid]]['candidatelist'][:n_cands]):
                        caps['imgblobs'][idTocaps[imgid]]['candidatelist'][j]['mBleu_2'] = scrperimg[1 + j, i]
                json.dump(caps, open(resF, 'w'))

            print 'Mean mutual Bleu scores on this set are:\nmBLeu_1, mBLeu_2, mBLeu_3, mBLeu_4\n'
            print all_scrs.mean(axis=0)
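# Hedged sketch (not the repo's implementation): compute_div_n is defined
# elsewhere in this codebase. This stand-in assumes the usual definition of
# Div-n, i.e. per image the ratio of distinct n-grams to total n-grams over
# that image's candidate set, returned together with the mean distinct count.
def compute_div_n_sketch(capsById, n=1):
    ratios, counts = [], []
    for imgid in capsById:
        ngrams = []
        for cap in capsById[imgid]:  # tokenized caption strings
            toks = cap.split()
            ngrams.extend([tuple(toks[i:i + n]) for i in xrange(len(toks) - n + 1)])
        if ngrams:
            ratios.append(float(len(set(ngrams))) / len(ngrams))
            counts.append(len(set(ngrams)))
    return sum(ratios) / len(ratios), sum(counts) / float(len(counts))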
def main(params):
    resInp = json.load(open(params['inputCands'], 'r'))
    resGtImgid = defaultdict(list)
    resCandsImgid = defaultdict(list)
    icnt = 0
    # Collect candidate translations ('trans') and ground truth ('sent'),
    # keyed by 'docid+sentid'. The final token of each string is dropped.
    for i, doc in enumerate(resInp['docs']):
        imgid = str(i)
        for j, st in enumerate(doc['sents']):
            if isinstance(st, list):
                for sent in st:
                    resCandsImgid[imgid + '+' + str(j)].append({'image_id': imgid, 'caption': ' '.join(sent['trans'].split()[:-1]), 'id': icnt})
                    resGtImgid[imgid + '+' + str(j)].append({'image_id': imgid, 'caption': ' '.join(sent['sent'].split()[:-1]), 'id': icnt})
                    icnt += 1
            else:
                resCandsImgid[imgid + '+' + str(j)].append({'image_id': imgid, 'caption': ' '.join(st['trans'].split()[:-1]), 'id': icnt})
                resGtImgid[imgid + '+' + str(j)].append({'image_id': imgid, 'caption': ' '.join(st['sent'].split()[:-1]), 'id': icnt})
                icnt += 1

    tokenizer = PTBTokenizer()
    resCandsImgid = tokenizer.tokenize(resCandsImgid)
    resGtImgid = tokenizer.tokenize(resGtImgid)

    eval_metric = params['eval_metric']
    if eval_metric == 'meteor':
        scorer = Meteor()
        scorer_name = "METEOR"
    elif eval_metric == 'cider':
        scorer = Cider()
        scorer_name = "CIDEr"
    elif eval_metric == 'rouge':
        scorer = Rouge()
        scorer_name = "ROUGE_L"
    elif eval_metric[:4] == 'bleu':
        bn = int(eval_metric.split('_')[1])
        scorer = Bleu(bn)
        scorer_name = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"][:bn]
    else:
        raise ValueError('ERROR: %s --> Unsupported eval metric' % (eval_metric))

    # Bucket keys by how many candidates they have, then score the i-th
    # candidate of every key in one batched compute_score call per i.
    lenDict = defaultdict(list)
    for k in resCandsImgid:
        lenDict[len(resCandsImgid[k])].append(k)
    maxlen = max(lenDict.keys())
    print 'Max length: %d' % maxlen
    for i in xrange(maxlen):
        res = {}
        gts = {}
        for k in resGtImgid.keys():
            if i < len(resCandsImgid[k]):
                res[k] = [resCandsImgid[k][i]]
                gts[k] = resGtImgid[k]
        print 'Now in %d, Lengths %d' % (i, len(gts))
        t0 = time.time()
        score, scores = scorer.compute_score(gts, res)
        dt = time.time() - t0
        print 'Done %d in %.3fs, score = %s' % (i, dt, score)
        for si, k in enumerate(gts.keys()):
            idx, sidx = map(int, k.split('+'))
            # Attach the score to the sentence entry it belongs to.
            trg = resInp['docs'][idx]['sents'][sidx]
            if isinstance(trg, list):
                trg = trg[i]
            if isinstance(scorer_name, list):
                # Bleu returns one per-image score list per n-gram order.
                for bi, bname in enumerate(scorer_name):
                    trg[bname] = scores[bi][si]
            else:
                trg[scorer_name] = scores[si]
        assert (len(gts) == si + 1)
    #pickle.dump(candScoresImgid,open('candScrMeteor_4AuxCmmePgoogSwapPposJJ_fullVal.json','w'))
    json.dump(resInp, open(params['inputCands'], 'w'))
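# --- Expected input shape (assumed from the reads above; illustrative) ---
# params['inputCands'] points at a json along the lines of:
#   {"docs": [
#       {"sents": [
#           {"trans": "hyp tokens ... <eos>", "sent": "ref tokens ... <eos>"},
#           [{"trans": "...", "sent": "..."}, {"trans": "...", "sent": "..."}]
#       ]},
#       ...]}
# i.e. each sentence slot is either a single hypothesis/reference pair or a
# list of variants. The [:-1] slicing above drops the final token of every
# string (an end-of-sentence marker, by the look of it).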
# Score every candidate caption against the full val2014 reference set with
# METEOR, one candidate position per batched compute_score call.
from eval.mseval.pycocoevalcap.meteor.meteor import Meteor
from eval.mseval.pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

dataset = json.load(open('/triton/ics/project/imagedb/picsom/databases/COCO/download/annotations/captions_val2014.json', 'r'))
resMulti = json.load(open('example_images/result_struct_4AuxCmmePgoogSwapPposJJ_fullVal.json', 'r'))

resAllImgid = defaultdict(list)
for img in dataset['annotations']:
    resAllImgid[img['image_id']].append(img)

resCandsImgid = defaultdict(list)
icnt = 0
for img in resMulti['imgblobs']:
    imgid = int(img['img_path'].split('_')[-1].split('.')[0])
    for s in img['candidatelist']:
        resCandsImgid[imgid].append({'image_id': imgid, 'caption': s['text'], 'id': icnt})
        icnt += 1

tokenizer = PTBTokenizer()
resCandsImgid = tokenizer.tokenize(resCandsImgid)
resAllImgid = tokenizer.tokenize(resAllImgid)
scorer = Meteor()

lenDict = defaultdict(list)
for k in resCandsImgid:
    lenDict[len(resCandsImgid[k])].append(k)
maxlen = max(lenDict.keys())
print 'Max candidates per image: %d' % maxlen

candScoresImgid = defaultdict(list)
for i in xrange(maxlen):
    res = {}
    gts = {}
    for k in resAllImgid.keys():
        # Score the i-th candidate of every image that has one; this loop
        # body mirrors the per-candidate scoring loop used elsewhere above.
        if i < len(resCandsImgid[k]):
            res[k] = [resCandsImgid[k][i]]
            gts[k] = resAllImgid[k]
    score, scores = scorer.compute_score(gts, res)
    for si, k in enumerate(gts.keys()):
        candScoresImgid[k].append(scores[si])
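# Hedged follow-up (illustrative; not in the original script): with
# candScoresImgid filled as above (image id -> METEOR score per candidate
# rank), the mean score at each rank could be summarised like this.
def mean_score_per_rank(candScoresImgid, maxlen):
    means = []
    for i in xrange(maxlen):
        scrs = [s[i] for s in candScoresImgid.values() if i < len(s)]
        means.append(sum(scrs) / float(len(scrs)) if scrs else 0.0)
    return means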
def main(params):
    tokenizer = PTBTokenizer()
    scorer = Spice(multihyp=1)

    # Tokenize the ground-truth references, keyed by image id.
    refsJs = json.load(open(params['refdata'], 'r'))
    refsById = defaultdict(list)
    for i, ann in enumerate(refsJs['annotations']):
        refsById[ann['image_id']].append({'image_id': ann['image_id'], 'id': i, 'caption': ann['caption']})
    refsById = tokenizer.tokenize(refsById)

    n_cands = params['keepN'] - 1 if params['keepN'] is not None else None
    for resF in params['resFileList']:
        caps = json.load(open(resF, 'r'))
        capsById = {}
        n = 0
        n_cands_per_img = np.zeros((len(caps['imgblobs'])), dtype=np.int32)
        for i, img in enumerate(caps['imgblobs']):
            imgid = int(img['img_path'].split('_')[-1].split('.')[0])
            capsById[imgid] = [{'image_id': imgid, 'caption': img['candidate']['text'], 'id': n}]
            n += 1
            capsById[imgid].extend([{'image_id': imgid, 'caption': cd['text'], 'id': n + j}
                                    for j, cd in enumerate(img['candidatelist'][:n_cands])])
            n += len(capsById[imgid]) - 1
            n_cands_per_img[i] = len(capsById[imgid])

        capsById = tokenizer.tokenize(capsById)
        print 'Maximum number of candidates is %d, mean is %.2f' % (n_cands_per_img.max(), n_cands_per_img.mean())

        # Keep only images that appear in both the references and candidates.
        refToks = {imgid: refsById[imgid] for imgid in capsById if imgid in refsById}
        if len(refToks) < len(capsById):
            capsById = {imgid: capsById[imgid] for imgid in refToks}
        n_refs_perimg = len(refToks[refToks.keys()[0]])

        all_scrs = []
        # Optionally swap candidates and references to evaluate in reverse.
        if params['rev_eval'] == 1:
            capsById, refToks = refToks, capsById

        if params['iterativeEval']:
            npfilename = osp.join('scorelogs', osp.basename(resF).split('.')[0] + '_iterativeSpice_%d' % (params['keepN']))
            if params['refdata'] != '/BS/databases/coco/annotations/captions_val2014.json':
                npfilename += '_visgenome'
            if params['singleHyp']:
                npfilename += '_singlehyp'
            iterIdces = np.arange(n_cands_per_img.max(), dtype=np.int32)
        else:
            iterIdces = [n_cands_per_img.max() - 1]

        mean_scr = np.zeros((len(iterIdces)))
        prec = np.zeros((len(iterIdces), len(capsById), 7))
        rec = np.zeros((len(iterIdces), len(capsById), 7))
        f_scr = np.zeros((len(iterIdces), len(capsById), 7))

        for ii, idx in enumerate(iterIdces):
            # Either a single hypothesis (the idx-th candidate, clamped to the
            # list length) or all candidates up to idx, per image.
            if params['singleHyp']:
                candsInp = {imgid: [capsById[imgid][min(idx, len(capsById[imgid]) - 1)]] for imgid in capsById}
            else:
                candsInp = {imgid: capsById[imgid][:idx + 1] for imgid in capsById}
            mean_scr[ii], all_scores = scorer.compute_score(refToks, candsInp)
            # Collect per-category precision / recall / F-score.
            categories = all_scores[0].keys()
            for i, scr in enumerate(all_scores):
                for j, cat in enumerate(categories):
                    f_scr[ii, i, j] = scr[cat]['f']
                    prec[ii, i, j] = scr[cat]['pr']
                    rec[ii, i, j] = scr[cat]['re']
            print 'At idx %d, prec = %.3f, rec = %.3f' % (idx, prec[ii, :, 0].mean(), rec[ii, :, 0].mean())

        if params['iterativeEval']:
            np.savez(npfilename + '.npz', mean_scr=mean_scr, prec=prec, rec=rec, keys=refToks.keys())

        prec = np.nan_to_num(prec)
        rec = np.nan_to_num(rec)
        print '---------------------\nmean spice is %.3f\n---------------------\n Per category scores are' % (mean_scr[-1])
        for j, cat in enumerate(categories):
            print '%12s: f = %.3f, prec = %.3f, rec = %.3f' % (cat, np.nan_to_num(f_scr[-1, :, j]).mean(), prec[-1, :, j].mean(), rec[-1, :, j].mean())
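# --- Reading the saved score log (illustrative; not in the original file) ---
# When iterativeEval is set, main() saves mean_scr / prec / rec per candidate
# count under scorelogs/. The file name below is a hypothetical example built
# from the npfilename pattern above.
#
#   log = np.load('scorelogs/myresult_iterativeSpice_10.npz')
#   mean_scr = log['mean_scr']           # SPICE after 1..k candidates
#   prec, rec = log['prec'], log['rec']  # per-(iteration, image, category)
#   print 'SPICE vs. number of candidates:', mean_scr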