예제 #1
0
def process(options):
    """Tune combination weights with coordinate ascent and store the best set.

    Reads ``overwrite``, ``inputeFile`` and ``weightFile`` from *options*.
    Both file names are resolved inside the ``result`` directory. The
    process exits with status 0 when ``checkToSkip`` returns a true value
    for the weight file. Otherwise the best weights found are printed and
    written, space separated, to the weight file.
    """
    overwrite = options.overwrite
    inputeFile = options.inputeFile
    weightFile = os.path.join('result', options.weightFile)

    if checkToSkip(weightFile, overwrite):
        sys.exit(0)
    makedirsforfile(weightFile)

    test()
    print('-' * 70)
    best_perf = -10
    best_alpha = None

    sigma = 0.001
    data = load_data(os.path.join('result', inputeFile))
    # Only one optimisation run is performed; the best-of-N bookkeeping is
    # kept so that more runs can be added by changing the range.
    for i in range(1):
        perf, alpha = coordinate_ascent(data, sigma)
        if perf > best_perf:
            best_perf = perf
            best_alpha = alpha
        print('*' * 70)
    print('optimized wights: %s' % ' '.join(['%g' % x for x in best_alpha]))
    print('best tuned performance: %s' % (best_perf,))

    # Use a context manager so the handle is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(weightFile, 'w') as fw:
        fw.write(' '.join(map(str, best_alpha)))
    print('optimized wight parameters have written into %s' % weightFile)
예제 #2
0
def process(options, source_dir, feat_dim, imsetfile, result_dir):
    """Extract the features of a list of images into ``result_dir``.

    The image ids are read from *imsetfile* (one per line), looked up in the
    ``BigFile`` built from *source_dir* and *feat_dim* in blocks of
    ``options.blocksize``, and written as float32 vectors to
    ``feature.bin``. The ids returned by the reader are written, space
    separated, to ``id.txt`` in the same directory.

    Exits with status 0 when ``checkToSkip`` returns a true value for
    ``feature.bin``.
    """
    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    with open(imsetfile) as fr:
        imset = [line.strip() for line in fr]
    print("requested %d" % len(imset))

    featurefile = BigFile(source_dir, feat_dim)

    makedirsforfile(resultfile)

    done = []
    start = 0

    # The context manager guarantees the binary file is closed even if a
    # block read fails half way through.
    with open(resultfile, 'wb') as fw:
        while start < len(imset):
            end = min(len(imset), start + options.blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
            renamed, vectors = featurefile.read(imset[start:end])
            for vec in vectors:
                np.array(vec, dtype=np.float32).tofile(fw)
            done += renamed
            start = end

    # Sanity check: the reader must not report the same id twice.
    assert(len(done) == len(set(done)))
    idfile = os.path.join(result_dir, 'id.txt')
    with open(idfile, 'w') as fw:
        fw.write(' '.join(done))

    print('%d requested, %d obtained' % (len(imset), len(done)))
예제 #3
0
def process(options, tagfile, tpp):
    """Stem or lemmatize the tags of every record in *tagfile*.

    *tpp* selects the processing: ``"stem"`` uses ``nltk.PorterStemmer``
    with ``stemming``; anything else uses ``nltk.WordNetLemmatizer`` with
    ``lemmatize``. Each input line is split on whitespace; the first two
    fields are copied and the remaining fields are treated as tags. Lines
    with no tags are counted as parsed but not written.

    The result is written as UTF-8 to ``id.userid.<tpp>tags.txt`` next to
    *tagfile*. Returns 0 when ``checkToSkip`` returns a true value for that
    file; otherwise returns None.
    """
    if "stem" == tpp:
        worker = nltk.PorterStemmer()
        func = stemming
    else:
        worker = nltk.WordNetLemmatizer()
        func = lemmatize

    resultfile = os.path.join(os.path.split(tagfile)[0], 'id.userid.%stags.txt' % tpp)
    if checkToSkip(resultfile, options.overwrite):
        return 0

    makedirsforfile(resultfile)

    parsed = 0
    obtained = 0
    # Open the input first so that an unreadable tag file does not leave an
    # empty result file behind; both handles are closed on any exit path.
    with open(tagfile) as fr, codecs.open(resultfile, "w", encoding='utf8') as fw:
        for line in fr:
            elems = line.strip().split()
            parsed += 1
            if len(elems) > 2:
                newtags = []
                for tag in elems[2:]:
                    try:
                        newtag = func(worker, tag.lower())
                    except Exception:
                        # Best effort: keep the raw tag when processing fails,
                        # but let KeyboardInterrupt/SystemExit propagate.
                        newtag = tag

                    newtags.append(newtag.decode('utf-8'))

                newline = "\t".join([elems[0], elems[1], " ".join(newtags)])
                fw.write('%s\n' % newline)
                obtained += 1
    print ('%d lines parsed, %d records obtained' % (parsed, obtained) )
예제 #4
0
def process(options, conceptfile, tagvotesfile, resultfile):
    """Convert a tag-votes text file into a pickled score matrix.

    *conceptfile* lists one concept per line. Each line of *tagvotesfile*
    is ``<image id> <tag> <score> <tag> <score> ...``. The pickle written to
    *resultfile* holds a dict with ``concepts``, ``id_images`` (ints) and
    ``scores``, an images-by-concepts array initialised to -1e4 and filled
    with the scores of tags that are listed concepts.

    Returns 0 when ``checkToSkip`` returns a true value for *resultfile*;
    otherwise returns None.
    """
    if checkToSkip(resultfile, options.overwrite):
        return 0

    with open(conceptfile) as fr:
        concepts = [line.strip() for line in fr]
    concept2index = dict(zip(concepts, range(len(concepts))))

    with open(tagvotesfile) as fr:
        data = fr.readlines()
    print ('%d instances to dump' % len(data))

    concept_num = len(concepts)
    image_num = len(data)
    scores = np.zeros((image_num, concept_num)) - 1e4
    id_images = [None] * image_num

    for i, line in enumerate(data):
        elems = line.split()
        id_images[i] = int(elems[0])
        votes = elems[1:]
        # Votes come as (tag, score) pairs; the score is parsed before the
        # concept lookup so malformed numbers are reported for every tag.
        for k in range(0, len(votes), 2):
            tag = votes[k]
            score = float(votes[k+1])
            j = concept2index.get(tag, -1)
            if j >= 0:
                scores[i,j] = score

    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as output:
        pickle.dump({'concepts':concepts, 'id_images':id_images, 'scores':scores}, output, -1)
예제 #5
0
def process(options, collection, annotationName, simdir, resultfile):
    """Collect per-concept ranking files into one pickled score matrix.

    For every concept returned by ``readConcepts`` the file
    ``<simdir>/<concept>.txt`` is read with ``readRankingResults`` and its
    (image, score) pairs are stored in an images-by-concepts array that is
    initialised to -1e4. The pickle written to *resultfile* holds a dict
    with ``concepts``, ``id_images`` (as ints) and ``scores``.

    Returns 0 when ``checkToSkip`` returns a true value for *resultfile*;
    otherwise returns None. An image that appears in a ranking file but not
    in the image set raises KeyError.
    """
    rootpath = options.rootpath

    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)

    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print ('%d instances, %d concepts to dump -> %s' % (image_num, concept_num, resultfile))

    scores = np.zeros((image_num, concept_num)) - 1e4

    for c_id, concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        for im, score in readRankingResults(simfile):
            scores[im2index[im], c_id] = score

    makedirsforfile(resultfile)
    # A real list (not a lazy map object) keeps the payload picklable on
    # every Python version; the file is closed even if dumping fails.
    payload = {'concepts':concepts, 'id_images':[int(x) for x in id_images], 'scores':scores}
    with open(resultfile, 'wb') as output:
        pickle.dump(payload, output, -1)
예제 #6
0
def process(options, collection, annotationName, pos_num):
    """Generate randomly sampled positive/negative annotation bags.

    For every combination of ``options.pos_bag_num`` positive bags and
    ``options.neg_bag_num`` negative bags a new annotation name is derived
    from *annotationName* (which must end in ``.txt``). The concept list is
    written for each new annotation, the list of new annotation names is
    written to a file under ``annotationfiles``, and for each concept a
    bag of at most *pos_num* positives plus randomly drawn negatives is
    written with ``writeAnnotations``.

    Returns 0 when every concept file of the new annotations was skipped by
    ``checkToSkip``; otherwise returns None.
    """
    assert(annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    # Template with two remaining '%d' slots: positive and negative bag index.
    annotationNameStr = annotationName[:-4] + ('.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)

    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath,collection,'Annotations',newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts,resultfile)

    first,second,last = annotationNameStr.split('%d')
    scriptfile = os.path.join(rootpath,collection,'annotationfiles',first + '0-%d'%(pos_bag_num-1) + second + '0-%d'%(neg_bag_num-1) + last)

    makedirsforfile(scriptfile)
    with open(scriptfile,'w') as fout:
        fout.write('\n'.join(newAnnotationNames) + '\n')

    if len(newAnnotationNames) == skip:
        return 0

    for concept in concepts:
        names,labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        positivePool = [name for name, label in zip(names, labels) if label > 0]
        negativePool = [name for name, label in zip(names, labels) if label < 0]

        for idxp in range(pos_bag_num):
            # The positive bag is drawn once per positive index and shared by
            # all negative bags of that index.
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool
            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath,collection,'Annotations','Image',newAnnotationName,'%s.txt'%concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue
                # At least 1000 negatives are requested, capped by the pool size.
                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)

                assert(len(set(positiveBag).intersection(set(negativeBag))) == 0)
                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept,anno_idx,len(positiveBag),len(negativeBag),resultfile))
                writeAnnotations(positiveBag + negativeBag, [1]*len(positiveBag) + [-1]*len(negativeBag), resultfile)
예제 #7
0
def process(options, testCollection, trainCollection, tagsimMethod):
    """Estimate tag relevance for test images with a tag-similarity method.

    The estimator is taken from ``SIM_TO_TAGREL[tagsimMethod]``. Each output
    line is ``<image id> <tag> <vote> ...`` and goes to ``id.tagvotes.txt``
    under the ``tagrel`` directory of *testCollection*. When
    ``options.numjobs`` is greater than 1 the file name carries the
    job numbers and only every ``numjobs``-th image (selected by
    ``options.job``) is processed. Image ids found in ``options.donefile``
    are skipped.

    Exits with status 0 when ``checkToSkip`` returns a true value for the
    result file.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    testsetName = options.testset if options.testset else testCollection
    numjobs = options.numjobs
    job = options.job
    useWnVob = 1

    outputName = tagsimMethod + '-wn' if useWnVob else tagsimMethod

    if tagsimMethod == 'wns':
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, outputName,'id.tagvotes.txt')
    else:
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, trainCollection, outputName,'id.tagvotes.txt')
    if numjobs>1:
        resultfile = resultfile.replace("id.tagvotes.txt", "id.tagvotes.%d.%d.txt" % (numjobs,job))

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    makedirsforfile(resultfile)

    # The done file is optional: any ordinary failure to read it means
    # "nothing done yet". The last line is dropped on purpose, as in the
    # original code. Interrupts and exit requests are no longer swallowed.
    try:
        with open(options.donefile) as fr:
            doneset = set([str.split(x)[0] for x in fr.readlines()[:-1]])
    except Exception:
        doneset = set()

    printStatus(INFO, "done set: %d" % len(doneset))

    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [x for i, x in enumerate(testImageSet) if (i%numjobs+1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs,job,len(testImageSet),resultfile) )

    testreader = TagReader(testCollection, rootpath=rootpath)

    if tagsimMethod == "wns":
        tagrel = SIM_TO_TAGREL["wns"](trainCollection, useWnVob, "wup", rootpath)
    else:
        tagrel = SIM_TO_TAGREL[tagsimMethod](trainCollection, useWnVob, rootpath)

    done = 0
    with open(resultfile, "w") as fw:
        for qry_id in testImageSet:
            qry_tags = testreader.get(qry_id)
            tagvotes = tagrel.estimate(qry_tags)
            newline = qry_id + " " + " ".join(["%s %s" % (tag, niceNumber(vote,8)) for (tag,vote) in tagvotes])
            fw.write(newline+"\n")
            done += 1
            if done%1000 == 0:
                printStatus(INFO, "%d done" % done)
    printStatus(INFO, "%d done" % done)
예제 #8
0
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    """Tag the test images with a ``ModelArray`` of fastlinear models.

    The features of *testCollection* are streamed with ``StreamFile``; for
    each image of this job's share the model array predicts (tag, vote)
    pairs, optionally truncated to the first ``options.topk``, which are
    written as one line to ``id.tagvotes.txt`` (suffixed with the job
    numbers when ``options.numjobs`` is greater than 1).

    Returns 0 when ``checkToSkip`` returns a true value for the result
    file, otherwise the number of images written.
    """
    assert(modelName.startswith('fastlinear'))

    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    topk = options.topk

    outputName = '%s,%s' % (feature,modelName)

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs>1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    # Keep only this job's share, as a set for fast membership tests.
    test_imset = set(x for i, x in enumerate(test_imset) if i%numjobs+1 == job)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,nr_of_test_images,resultfile))

    ma = ModelArray(trainCollection, trainAnnotationName, feature, modelName, rootpath=rootpath)

    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)

    done = 0

    # Both the result file and the feature stream are released even when a
    # prediction fails part way through.
    with open(resultfile, "w") as fw:
        feat_file.open()
        try:
            for _id, _vec in feat_file:
                if _id not in test_imset:
                    continue

                res = ma.predict([_vec],prob=0)
                tagvotes = res[0]
                if topk>0:
                    tagvotes = tagvotes[:topk]
                newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote,6)) for (tag,vote) in tagvotes]))
                fw.write(newline)
                done += 1
                if done % 1e4  == 0:
                    printStatus(INFO, "%d done" % done)
        finally:
            feat_file.close()

    printStatus(INFO, "%d done" % (done))
    return done
예제 #9
0
def process(options, testCollection):
    """Tag the images of *testCollection* with a zero-shot tagger.

    Image features are read from the ``FeatureData/<pY0>`` directory in
    blocks of 2000, embedded with ``Image2Vec`` (built from the ``Y0``
    synset vectors) and tagged with ``ZeroshotTagger`` (built for the ``Y1``
    synsets), keeping ``options.r`` predictions per image. One line per
    image, ``<id> <tag> <score> ...``, is written to ``id.tagvotes.txt``.

    Returns 0 when ``checkToSkip`` returns a true value for the result
    file; otherwise returns None.
    """
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding_model = options.embedding
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    blocksize = 2000

    embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)
    # Both synset sets must already have vectors for this embedding.
    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, embedding_name, pY0, 'id.tagvotes.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, embedding_name)
    i2v = Image2Vec(label_file, label2vec_dir)

    tagger = ZeroshotTagger(synset_name=Y1, embedding_name=embedding_name, rootpath=rootpath)

    imset = readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)

    printStatus(INFO, 'tagging %d images' % len(imset))
    makedirsforfile(resfile)

    start = 0
    # The context manager closes the result file even if tagging fails.
    with open(resfile, 'w') as fw:
        while start < len(imset):
            end = min(len(imset), start + blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end))
            todo = imset[start:end]
            if not todo:
                break

            renamed, vectors = feat_file.read(todo)
            output = []
            for _id, _vec in zip(renamed, vectors):
                im_vec = i2v.embedding(_vec)
                pred = tagger.predict(im_vec, topk=r)
                output.append('%s %s\n' % (_id, ' '.join(['%s %s'%(x[0],x[1]) for x in pred])))
            start = end
            fw.write(''.join(output))
예제 #10
0
파일: negbp.py 프로젝트: Peratham/jingwei
def process(options, trainCollection, baseAnnotationName, startAnnotationName, feature, modelName):
    """Learn one model per concept with ``NegativeBootstrap`` and save it.

    *modelName* must be ``'fik'`` or ``'fastlinear'``; the matching
    train/compress/save functions are imported into the module globals
    ``train_model``, ``compress_model`` and ``save_model``. Only this job's
    share of the concepts (``options.numjobs`` / ``options.job``) is
    processed, and a concept is skipped when ``checkToSkip`` returns a true
    value for its model file. Afterwards the concept list is written for
    the new annotation name and a summary is logged.
    """
    global train_model, compress_model, save_model
    assert(modelName in ['fik', 'fastlinear'])
    use_fik = ('fik' == modelName)
    if use_fik:
        from model_based.svms.fiksvm.svmutil import svm_train as train_model
        from model_based.svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from model_based.svms.fiksvm.fiksvm import fiksvm_save_model as save_model
    else:
        from model_based.svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from model_based.svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from model_based.svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    params = {
        'rootpath': rootpath,
        'trainCollection': trainCollection,
        'baseAnnotationName': baseAnnotationName,
        'startAnnotationName': startAnnotationName,
        'feature': feature,
        'model': modelName,
        'strategy': options.strategy,
        'iterations': options.iterations,
        'npr': options.npr,
        'nr_bins': options.nr_bins,
    }

    concepts = readConcepts(trainCollection, startAnnotationName, rootpath)
    newAnnotationName = get_new_annotation_name(params)
    newModelName = get_model_name(params)
    modeldir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, newModelName)

    # NOTE(review): the existence test looks at '<concept>.txt' while models
    # are saved as '<concept>.model' below -- confirm this is intended.
    pending = []
    for name in concepts:
        if overwrite or not os.path.exists(os.path.join(modeldir, '%s.txt' % name)):
            pending.append(name)
    # Round-robin split of the pending concepts over the jobs.
    activeConcepts = [name for pos, name in enumerate(pending) if (pos % options.numjobs + 1) == options.job]

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    params['feat_file'] = BigFile(feat_dir)

    if use_fik:
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # First line: per-dimension minima; second line: maxima.
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]

    s_time = time.time()

    for concept in activeConcepts:
        printStatus(INFO, 'processing %s' % concept)
        modelfile = os.path.join(modeldir, '%s.model' % concept)
        if not checkToSkip(modelfile, overwrite):
            new_model = NegativeBootstrap.learn(concept, params)
            makedirsforfile(modelfile)
            printStatus(INFO, 'save model to %s' % modelfile)
            save_model(modelfile, new_model)
            printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(activeConcepts), ' '.join(activeConcepts)))
    printStatus(INFO, 'models stored at %s' % modeldir)
    printStatus(INFO, '%g seconds in total' % timecost)
예제 #11
0
def process(options, trainCollection, annotationName, testCollection):
    """Tag test images with a tag co-occurrence tagger.

    ``TagCooccurTagger`` is used when ``options.kc`` is below 1e-6,
    otherwise ``TagCooccurPlusTagger`` with ``options.feature``. The tagger
    parameters ``m``, ``k_r``, ``k_d``, ``k_s``, ``k_c`` and ``add_bonus``
    are copied from *options*. For every test image one line
    ``<id> <tag> <vote> ...`` is written to ``id.tagvotes.txt``.

    Exits with status 0 when ``checkToSkip`` returns a true value for the
    result file.
    """
    rootpath = options.rootpath
    m = options.m
    k_r = options.kr
    k_d = options.kd
    k_s = options.ks
    k_c = options.kc
    feature = options.feature
    add_bonus = options.bonus
    overwrite = options.overwrite

    #outputName = 'cotag,m%d,kr%d,kd%d,ks%d,kc%d,bonus%d'%(m,k_r,k_d,k_s,k_c,add_bonus)
    outputName = 'cotag' # simplify the outputName to reduce the length of the result filename
    outputName = os.path.join(outputName, feature) if (k_c>1e-6) else outputName
    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, annotationName, outputName, 'id.tagvotes.txt')

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    testImageSet = readImageSet(testCollection, testCollection, rootpath=rootpath)
    test_tag_reader = TagReader(testCollection, rootpath=rootpath)

    if k_c < 1e-6:
        tagger = TagCooccurTagger(testCollection, trainCollection, annotationName, rootpath=rootpath)
    else:
        tagger = TagCooccurPlusTagger(testCollection, trainCollection, annotationName, feature=feature, rootpath=rootpath)
    tagger.m = m
    tagger.k_r = k_r
    tagger.k_d = k_d
    tagger.k_s = k_s
    tagger.k_c = k_c
    tagger.add_bonus = add_bonus

    makedirsforfile(resultfile)

    output = []
    done = 0
    # The context manager closes the result file even if a prediction fails.
    with open(resultfile, 'w') as fw:
        for im in testImageSet:
            user_tags = test_tag_reader.get(im)
            tagvotes = tagger.predict(content=im, context=user_tags)
            newline = '%s %s' % (im, ' '.join(['%s %s'%(x[0], niceNumber(x[1],6)) for x in tagvotes]))
            output.append(newline)
            done += 1
            # Flush the buffered lines every 10000 images.
            if len(output) % 1e4 == 0:
                fw.write('\n'.join(output) + '\n')
                output = []
                printStatus(INFO, '%d done' % done)
        if output:
            fw.write('\n'.join(output) + '\n')
    printStatus(INFO, '%d done' % done)
예제 #12
0
def process(options, collection, conceptfile):
    """Write, per concept, the images whose tags hit that concept.

    Concepts are read from *conceptfile*; blank lines and lines starting
    with ``#`` are ignored. A concept written as ``a-b-c`` is treated as a
    conjunction: an image must be in the hit list of every part. Images
    listed in ``ImageSets/holdout.txt`` (if that file can be read) are
    removed. Results go to ``<rootpath>/<collection>/tagged,<tpp>/<concept>.txt``;
    no file is written for a concept without hits.

    Returns 0 when every concept is skipped by ``checkToSkip``; otherwise
    returns None.
    """
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    with open(conceptfile) as fr:
        concepts = [x.strip() for x in fr if x.strip() and not x.strip().startswith('#')]
    resultdir = os.path.join(rootpath, collection, 'tagged,%s'%tpp)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, '%s.txt'%concept)
        if checkToSkip(resultfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        printStatus(INFO, 'nothing to do')
        return 0

    # The hold-out list is optional: only a failure to open or read the file
    # is treated as "no hold-out set"; other errors are no longer hidden.
    holdoutfile = os.path.join(rootpath,collection,'ImageSets','holdout.txt')
    try:
        with open(holdoutfile) as fr:
            holdoutSet = set([line.strip() for line in fr])
    except IOError:
        holdoutSet = set()

    hitlists = buildHitlists(collection, todo, tpp, rootpath)
    min_hit = 1e6
    max_hit = 0

    for concept in todo:
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resultfile,overwrite):
            continue
        subconcepts = concept.split('-')
        labeledSet = set(hitlists[subconcepts[0]])
        for sub in subconcepts[1:]:
            labeledSet = labeledSet.intersection(hitlists[sub])
        labeledSet = labeledSet.difference(holdoutSet)
        if len(labeledSet) == 0:
            printStatus(INFO, '%s has ZERO hit' % concept)
        else:
            printStatus(INFO, '%s, %d hits -> %s' %(concept, len(labeledSet), resultfile))
            makedirsforfile(resultfile)
            with open(resultfile, 'w') as fw:
                fw.write('\n'.join(labeledSet) + '\n')
        max_hit = max(max_hit, len(labeledSet))
        min_hit = min(min_hit, len(labeledSet))

    printStatus(INFO, 'max hits: %d, min hits: %d' % (max_hit, min_hit))
예제 #13
0
파일: im2vec.py 프로젝트: silasxue/hierse
def process(options, label_file, label2vec_dir, testCollection, feature, new_feature):
    """Embed the images of a subset with ``Image2Vec`` and save the vectors.

    Image ids are read from ``ImageSets/<subset>.txt`` (the subset defaults
    to *testCollection*). Their *feature* vectors are read in blocks of
    ``options.blocksize``, converted with ``Image2Vec.embedding(vec, k)``
    and written, one ``<id> <values...>`` line per image, to
    ``FeatureData/<new_feature>/id.feature.txt``.

    Returns 0 when ``checkToSkip`` returns a true value for the result
    file, otherwise the number of images written.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    k = options.k
    blocksize = options.blocksize
    subset = options.subset if options.subset else testCollection

    resfile = os.path.join(rootpath, testCollection, 'FeatureData', new_feature, 'id.feature.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    imsetfile = os.path.join(rootpath, testCollection, 'ImageSets', '%s.txt' % subset)
    with open(imsetfile) as fr:
        imset = [line.strip() for line in fr]
    printStatus(INFO, '%d images to do' % len(imset))

    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))

    im2vec = Image2Vec(label_file, label2vec_dir)

    makedirsforfile(resfile)

    read_time = 0
    run_time = 0
    start = 0
    done = 0

    # The context manager closes the result file even if a block fails.
    with open(resfile, 'w') as fw:
        while start < len(imset):
            end = min(len(imset), start + blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

            s_time = time.time()
            renamed, test_X = feat_file.read(imset[start:end])
            read_time += time.time() - s_time

            s_time = time.time()
            output = [None] * len(renamed)
            for i, name in enumerate(renamed):
                vec = im2vec.embedding(test_X[i], k)
                output[i] = '%s %s\n' % (name, " ".join([niceNumber(x,6) for x in vec]))
            run_time += time.time() - s_time
            start = end
            fw.write(''.join(output))
            done += len(output)

    printStatus(INFO, "%d done. read time %g seconds, run_time %g seconds" % (done, read_time, run_time))
    return done
예제 #14
0
파일: tagpos.py 프로젝트: Peratham/jingwei
def process(options, collection):
    """Score each image's tags by their position in the tag list.

    For every image the tags returned by ``TagReader.get`` are
    de-duplicated in order of first appearance; the tag at position ``i``
    of ``n`` distinct tags gets the vote ``1 - i/n``. One line
    ``<id> <tag> <vote> ...`` per image is written to ``id.tagvotes.txt``
    under ``tagrel/<collection>/tagpos,<tpp>``.

    Exits with status 0 when ``checkToSkip`` returns a true value for the
    result file.
    """
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, collection, "tagrel", collection, 'tagpos,%s'%tpp, 'id.tagvotes.txt')
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    imset = readImageSet(collection, collection, rootpath)
    printStatus(INFO, 'working on %d test images -> %s' % (len(imset),resultfile))

    reader = TagReader(collection,tpp=tpp,rootpath=rootpath)

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")
    pending = []
    done = 0

    for im in imset:
        seen = set()
        ordered = []
        for tag in str.split(reader.get(im)):
            if tag in seen:
                continue
            ordered.append(tag)
            seen.add(tag)
        assert(len(ordered) == len(seen))

        total = len(ordered)
        tagvotes = [(tag, 1.0-float(pos)/total) for pos, tag in enumerate(ordered)]
        votes_text = " ".join(["%s %g" % (tag, vote) for tag, vote in tagvotes])
        pending.append("%s %s" % (im, votes_text) + "\n")
        done += 1

        # The buffer is emptied whenever it reaches 10000 lines, so its
        # length never exceeds that value.
        if len(pending) == 10000:
            preview = ' '.join(['%s:%g' % (tag, vote) for tag, vote in tagvotes[:3]])
            printStatus(INFO, '%d %s %s' % (done,im,preview))
            fw.write("".join(pending))
            fw.flush()
            pending = []

    if pending:
        fw.write("".join(pending))
    fw.close()
    printStatus(INFO, 'done')
예제 #15
0
def process(options, workingCollection, annotationName, outputpkl):
    """Dump a matrix of uniformly random scores for a collection.

    The pickle written to *outputpkl* holds a dict with ``concepts``,
    ``id_images`` (as ints) and ``scores``, an images-by-concepts array
    drawn from ``np.random.rand``.

    Returns 0 when ``checkToSkip`` returns a true value for *outputpkl*;
    otherwise returns None.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
    # binary mode; text mode corrupts the data on platforms that translate
    # newlines and is rejected outright by Python 3.
    payload = {'concepts':concepts, 'id_images':[int(x) for x in id_images], 'scores':tagmatrix}
    with open(resultfile, 'wb') as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)
0
def process(options, feat_dir):
    """Normalise every feature vector of a feature directory.

    Reads ``shape.txt`` (number of images and feature dimension) and
    ``feature.bin`` (float32 values) from *feat_dir*. Each vector is
    optionally transformed with a signed square root (``options.ssr``) and
    then divided by its L1 norm when ``options.p`` is 1, or by its L2 norm
    otherwise. The result is written to a sibling directory whose name is
    *feat_dir* plus ``ssr`` (if used) and ``l<p>``, together with copies of
    ``id.txt`` and ``shape.txt``.

    Returns 0 when ``checkToSkip`` returns a true value for the new
    ``feature.bin``; otherwise returns None.
    """
    newname = 'ssr' if options.ssr else ''
    newname += 'l%d' % options.p
    resfile = os.path.join(feat_dir.rstrip('/\\') + newname, 'feature.bin')
    if checkToSkip(resfile, options.overwrite):
        return 0

    with open(os.path.join(feat_dir, 'shape.txt')) as fr:
        nr_of_images, feat_dim = map(int, fr.readline().strip().split())

    res = array.array('f')
    result_dir = os.path.split(resfile)[0]

    # Input is opened before the output directory is created, and both
    # handles are closed even if the input is shorter than shape.txt claims.
    with open(os.path.join(feat_dir,'feature.bin'), 'rb') as fr:
        makedirsforfile(resfile)
        with open(resfile, 'wb') as fw:
            print ('>>> writing results to %s' % resfile)

            for i in xrange(nr_of_images):
                res.fromfile(fr, feat_dim)
                vec = res
                if options.ssr:
                    vec = [np.sign(x) * np.sqrt(abs(x)) for x in vec]
                # The small constant avoids division by zero for all-zero vectors.
                if options.p == 1:
                    Z = sum(abs(x) for x in vec) + 1e-9
                else:
                    Z = np.sqrt(sum([x**2 for x in vec])) + 1e-9
                if i % 1e4 == 0:
                    print ('image_%d, norm_%d=%g' % (i, options.p, Z))
                vec = [x/Z for x in vec]
                # Empty the reusable read buffer for the next vector.
                del res[:]
                np.array(vec, dtype=np.float32).tofile(fw)

    print ('>>> %d lines parsed' % nr_of_images)
    shutil.copyfile(os.path.join(feat_dir,'id.txt'), os.path.join(result_dir, 'id.txt'))

    shapefile = os.path.join(result_dir, 'shape.txt')
    with open(shapefile, 'w') as fw:
        fw.write('%d %d' % (nr_of_images, feat_dim))
예제 #17
0
def process(options, collection):
    """Combine the per-query results of several methods into one file.

    Source 0 is the ``concepts<collection>.txt`` annotation directory of
    the collection; further source directories are listed, one per line, in
    ``result/<options.inputfile>``. For every query id the file
    ``<qid>.txt`` of each source is read with ``readAnnotations`` and
    sorted by name in descending order; all sources must list the same
    names. Each output line is ``<qid> <name> <value of source 0> ...``.

    Exits with status 0 when ``checkToSkip`` returns a true value for
    ``result/<options.resultname>``.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    inputfile = options.inputfile
    resultname = options.resultname

    result_file = os.path.join('result', resultname)
    if checkToSkip(result_file, overwrite):
        sys.exit(0)
    makedirsforfile(result_file)

    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)

    num2file = {}
    num2file[0] = os.path.join(rootpath, collection, 'Annotations', 'Image', 'concepts%s.txt' % collection)
    method_count = 1
    with open(os.path.join('result',inputfile)) as fr:
        for line in fr:
            num2file[method_count] = line.strip()
            method_count += 1

    # The context manager closes the output even if a source file is
    # missing or the name lists disagree.
    with open(result_file, "w") as fout:
        for qid in qid_list:
            name2feature = {}
            for fnum in xrange(method_count):
                data_file = os.path.join( num2file[fnum], '%s.txt' % qid)
                data = readAnnotations(data_file)
                data.sort(key=lambda v:v[0], reverse=True)
                names = [x[0] for x in data]
                labels = [x[1] for x in data]
                if fnum == 0:
                    # The first source defines the reference name order.
                    key_names = names
                    for name, label in zip(names, labels):
                        name2feature[name] = [label]
                else:
                    assert(checkSameList(key_names, names))
                    for name, label in zip(names, labels):
                        name2feature[name].append(label)
            for img in key_names:
                fout.write('%s ' % qid + img + ' ' + ' '.join(name2feature[img]) + '\n')

    print('Combined result of different written into %s' % result_file)
def process(options, trainCollection, annotationName):
    """Precompute, for every tag, the co-occurrence rank of each concept.

    For each tag in ``TagCooccurBase.vob`` the full co-occurrence ranking
    (``top_cooccur(tag, -1)``) is scanned and the 1-based position of every
    concept is stored in a tags-by-concepts integer matrix. Concepts that do
    not appear in a tag's ranking keep the default rank, which equals the
    number of tags. The pickle written to
    ``TextData/tag.concept-rank.<annotationName>.pkl`` holds a dict with
    ``tags``, ``concepts`` and ``rank_matrix``.

    Returns 0 when ``checkToSkip`` returns a true value for the result
    file; otherwise returns None.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, trainCollection, 'TextData', 'tag.concept-rank.%s.pkl' % annotationName)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    concept_num = len(concepts)
    concept2index = dict(zip(concepts, range(concept_num)))
    tcb = TagCooccurBase(trainCollection, rootpath=rootpath)
    tag_num = tcb.tag_num()
    DEFAULT_RANK = tag_num
    # The builtin int is what the removed alias np.int referred to.
    rank_matrix = np.zeros((tag_num, concept_num), dtype=int) + DEFAULT_RANK
    tag_list = []

    for i,u in enumerate(tcb.vob):
        ranklist = tcb.top_cooccur(u,-1)

        hit = 0
        for j,x in enumerate(ranklist):
            idx = concept2index.get(x[0], -1)
            if idx>=0:
                rank_matrix[i,idx] = j+1
                hit += 1
                # Stop as soon as every concept has been ranked.
                if hit == concept_num:
                    break
        tag_list.append(u)

        if (i+1) % 1e4 == 0:
            printStatus(INFO, '%d done' % (i+1) )

    assert(len(tag_list) == tag_num)

    # cPickle only exists on Python 2; fall back to the standard module.
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as output:
        pickle.dump({'tags':tag_list, 'concepts':concepts, 'rank_matrix':rank_matrix}, output, -1)
    printStatus(INFO, '%dx%d dumped to %s' % (tag_num, concept_num, resultfile))
예제 #19
0
def process(options, collection, feature):
    """Build a per-tag feature index for a collection.

    For each tag of this job's share (tags are sorted and split
    round-robin over ``options.numjobs``), the features of at most 1000
    images from the tag's hit list are read and stored under
    ``FeatureIndex/<feature>/<first two characters of tag>/<tag>`` as
    ``feature.bin`` (float32), ``id.txt`` and ``shape.txt``. A tag is
    skipped when ``checkToSkip`` returns a true value for its
    ``feature.bin`` or when no feature vector could be read for it.
    """
    rootpath = options.rootpath
    tpp = options.tpp
    k = 1000  # options.k
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite

    feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
    feat_file = BigFile(feat_dir)
    hitlists = buildHitLists(collection, tpp, rootpath)
    printStatus(INFO, "nr of tags: %d" % len(hitlists))

    vob = sorted(hitlists.keys())
    vob = [tag for i, tag in enumerate(vob) if i % numjobs == job - 1]
    printStatus(INFO, "working on %d-%d: %d tags" % (numjobs, job, len(vob)))

    for tag_idx, tag in enumerate(vob):
        resultdir = os.path.join(rootpath, collection, "FeatureIndex", feature, tag[:2], tag)
        binfile = os.path.join(resultdir, "feature.bin")
        if checkToSkip(binfile, overwrite):
            continue

        hitlist = hitlists[tag]
        hitlist = hitlist[:k]  # keep at most 1000 images per tag
        renamed, vecs = feat_file.read(hitlist)

        if len(renamed) == 0:
            # Without any vector the feature dimension is unknown. Skip
            # before writing anything, so that no feature.bin/id.txt is left
            # behind without a matching shape.txt.
            printStatus(INFO, "%d - %s, no features found, skipped" % (tag_idx, tag))
            continue

        makedirsforfile(binfile)
        np.array(vecs).astype(np.float32).tofile(binfile)
        idfile = os.path.join(resultdir, "id.txt")
        with open(idfile, "w") as fw:
            fw.write(" ".join(renamed))

        shapefile = os.path.join(resultdir, "shape.txt")
        with open(shapefile, "w") as fw:
            fw.write("%d %d" % (len(renamed), len(vecs[0])))

        if tag_idx % 1e3 == 0:
            printStatus(INFO, "%d - %s, %d images" % (tag_idx, tag, len(hitlist)))
예제 #20
0
def process(options, model_name, concept_file, weight_dir, result_dir):
    """Fuse several per-concept models into a single weighted model.

    For each concept listed in ``concept_file`` (blank lines and lines starting
    with '#' are ignored), ``<weight_dir>/<concept>.txt`` is read; every line
    holds ``<weight> <model_dir>``. The models found in those directories are
    combined with ``add_fastsvm`` and the result is saved as
    ``<result_dir>/<concept>.model``.

    Args:
        options: parsed options providing ``rootpath`` and ``overwrite``.
        model_name: 'fastlinear' selects the fastlinear loader/saver, anything
            else selects fiksvm.
        concept_file: text file with one concept per line.
        weight_dir: directory holding the per-concept weight files.
        result_dir: directory receiving the fused models.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    if model_name != 'fastlinear':
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model

    # Read the concept list, dropping empty lines and '#' comments.
    concepts = []
    for raw in open(concept_file).readlines():
        name = raw.strip()
        if name and not name.startswith('#'):
            concepts.append(name)

    # Concepts whose fused model already exists are skipped unless overwriting.
    todo = []
    for name in concepts:
        if overwrite or not os.path.exists(os.path.join(result_dir, '%s.model' % name)):
            todo.append(name)
    printStatus(INFO, '%d concepts to do' % len(todo))

    for concept in todo:
        weight_file = os.path.join(weight_dir, '%s.txt' % concept)
        entries = [ln.strip() for ln in open(weight_file).readlines()]
        assert(len(entries) >= 2)

        weights = []
        models = []
        for entry in entries:
            w, model_dir = entry.split()
            weights.append(float(w))
            # Relative model directories are resolved against rootpath.
            if not model_dir.startswith(rootpath):
                model_dir = os.path.join(rootpath, model_dir)
            assert (model_dir.find(model_name)>0)
            models.append(load_model(os.path.join(model_dir, '%s.model' % concept)))

        # The first two models are merged with their own weights; each
        # further model is added onto the running result (weight 1).
        new_model = models[0]
        new_model.add_fastsvm(models[1], weights[0], weights[1])
        for extra_model, extra_weight in zip(models[2:], weights[2:]):
            new_model.add_fastsvm(extra_model, 1, extra_weight)

        new_model_file = os.path.join(result_dir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        save_model(new_model_file, new_model)
예제 #21
0
파일: im2fea.py 프로젝트: danieljf24/cmrf
def process(options, feat_dir, imsetfile, result_dir):

    resultfile = os.path.join(result_dir, "feature.bin")
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    feat_file = BigFile(feat_dir)

    makedirsforfile(resultfile)
    fw = open(resultfile, "wb")

    done = []
    start = 0

    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, "processing images from %d to %d" % (start, end - 1))
        toread = imset[start:end]
        if len(toread) == 0:
            break
        renamed, vectors = feat_file.read(toread)
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert len(done) == len(set(done))
    with open(os.path.join(result_dir, "id.txt"), "w") as fw:
        fw.write(" ".join(done))
        fw.close()

    with open(os.path.join(result_dir, "shape.txt"), "w") as fw:
        fw.write("%d %d" % (len(done), feat_file.ndims))
        fw.close()
    print "%d requested, %d obtained" % (len(imset), len(done))
예제 #22
0
def process(options, synset_file, synset_name):
    """Embed every synset listed in ``synset_file`` and store the vectors.

    Output goes to
    ``<rootpath>/synset2vec/<synset_name>/<corpus>,<word2vec>,<embedding>/``:
    ``feature.bin`` (float32 vectors), ``id.txt`` (space-separated ids of the
    synsets that could be embedded) and ``shape.txt`` ("<count> <dim>").
    Synsets for which the encoder returns ``None`` are left out.

    Args:
        options: parsed options providing ``overwrite``, ``rootpath``,
            ``corpus``, ``word2vec`` and ``embedding``.
        synset_file: text file with one synset id per line.
        synset_name: name of the synset collection (used in the output path).

    Returns:
        0 when the result already exists and is not overwritten, else None.
    """
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding = options.embedding

    resdir = os.path.join(rootpath, 'synset2vec', synset_name, '%s,%s,%s' % (corpus, word2vec_model, embedding))
    resfile = os.path.join(resdir, 'feature.bin')
    if checkToSkip(resfile, overwrite):
        return 0

    with open(synset_file) as fin:
        synsets = [line.strip() for line in fin]
    s2v = get_synset_encoder(embedding)(corpus, word2vec_model, rootpath=rootpath)
    makedirsforfile(resfile)

    good = []
    with open(resfile, 'wb') as fw:
        for wnid in synsets:
            vec = s2v.embedding(wnid)

            if vec is not None:
                np.array(vec, dtype=np.float32).tofile(fw)
                good.append(wnid)

    # Report len(synsets) rather than the loop index: the index is undefined
    # when the synset list is empty.
    printStatus(INFO, '%d done, %d okay' % (len(synsets), len(good)))

    with open(os.path.join(resdir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(good))

    with open(os.path.join(resdir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(good), s2v.get_feat_dim()))
예제 #23
0
def process(options, workingCollection, annotationName, outputpkl):
    """Score concepts per image from the position of user tags and pickle the result.

    Each line of ``<rootpath>/<workingCollection>/TextData/id.userid.lemmtags.txt``
    is split on tabs into ``<image id>, <ignored>, <tags>``. A tag that is one
    of the concepts and appears at (0-based) position ``p`` in the tag list
    gives that concept the score ``1 / (1 + p)`` for the image. Tags that are
    not concepts are ignored.

    The pickle holds a dict with keys ``'concepts'``, ``'id_images'`` (ints,
    in file order) and ``'scores'`` (images x concepts array).

    Args:
        options: parsed options providing ``rootpath``, ``overwrite`` and
            ``random`` (add small random scores to break ties).
        workingCollection: name of the image collection.
        annotationName: name of the concept list.
        outputpkl: path of the pickle file to write.

    Returns:
        0 when the result already exists and is not overwritten, else None.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    random = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    # The image set is only used to size the matrix; the ids that are saved
    # are the ones read from the tag file below.
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    id_images = []
    tag2idx = dict(zip(concepts, xrange(len(concepts))))
    with open(os.path.join(rootpath, workingCollection, 'TextData', 'id.userid.lemmtags.txt')) as f:
        for cnt, line in enumerate(f):
            id_img, _, tags = line.split('\t')
            # Keep the original tag position as rank, but only for tags that
            # are concepts. Mapping unknown tags to index -1 (as a plain
            # ``dict.get(x, -1)`` would) silently writes into the LAST concept
            # column, because numpy treats -1 as "last element".
            ranked = [(tag2idx[tag], pos) for pos, tag in enumerate(tags.split()) if tag in tag2idx]
            if ranked:
                idx = np.array([x[0] for x in ranked])
                vals = 1. / (1. + np.array([x[1] for x in ranked]))
                tagmatrix[cnt, idx] = vals

            id_images.append(id_img)

    # random rank for untagged images
    if random:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # HIGHEST_PROTOCOL is a binary protocol, so the file must be opened in binary mode.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images':[int(x) for x in id_images], 'scores':tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
예제 #24
0
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma =options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex', collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)
    
    # inpute of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query =  dict(zip(qid_list, query_list))
    
    # inpute of image
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")


    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(collection, 'concepts%s.txt' % collection, qid, False, rootpath)

        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(calParzen(img_feats.read_one(imidx), test_X , sigma))

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list), key=lambda v:v[2], reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
             writeRankingResult(ranking_result_path, qid2iid_label_score)
             qid2iid_label_score = {}


    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0*sum(qid2dcg.values())/ len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file,'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
예제 #25
0
def process(options, workingCollection, feature):
    """Build a k-NN based graph Laplacian over the images of a collection and save it as .mat.

    The number of neighbours per image is ``floor(#images * options.kratio)``.
    Neighbour entries are made symmetric, duplicate (row, col) pairs are
    removed, values are turned into weights ``exp(-v / sigma)`` with
    ``sigma = median(v) ** 2``, and the matrix "column sums on the diagonal,
    negated weights elsewhere" is written to
    ``<rootpath>/<collection>/LaplacianI/<collection>/<feature>,<distance>knn,<kratio>/laplacianI.mat``
    under the keys ``'im_similarity'`` and ``'sigma'``.

    Args:
        options: parsed options providing ``rootpath``, ``kratio``,
            ``distance`` and ``overwrite``.
        workingCollection: name of the image collection.
        feature: name of the feature used for the neighbour search.

    Returns:
        0 when the result already exists and is not overwritten, else None.
        The interpreter exits with status 1 when an image has fewer usable
        neighbours than required or a ValueError escapes the neighbour lookup.
    """
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection, '%s,%s,%f'%(feature,nnName,k_ratio), 'laplacianI.mat')

    if checkToSkip(resultfile, overwrite):
        return 0

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    # sorted so that bisect_index can map an image id to its row number
    workingSet.sort()

    tot_images = len(workingSet)
    printStatus(INFO, '%d images' % (tot_images))

    K_neighs = int(math.floor(len(workingSet) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    # Each (image, neighbour) pair is stored twice (i->j and j->i), hence the factor 2.
    printStatus(INFO, 'Allocating I,J,V arrays')
    I = np.zeros((K_neighs * tot_images * 2))
    J = np.zeros((K_neighs * tot_images * 2))
    V = np.zeros((K_neighs * tot_images * 2))
    n_entries = 0

    # distances
    printStatus(INFO, 'Starting to fill I,J,V arrays')
    for i in xrange(tot_images):
        try:
            # ask for twice as many neighbours, since some are filtered out below
            neighbors = _get_neighbors('%s,%s' % (workingCollection, workingSet[i]), rootpath, K_neighs*2, feature, distance)
            # remove images with features but not in the working set
            # (presumably x[0] is the neighbour id and x[1] its distance -- TODO confirm)
            NNrow = []
            NNDrow = []
            new_neighs = []
            for x in neighbors:
                try:
                    NNrow.append(bisect_index(workingSet, x[0]))
                    NNDrow.append(x[1])
                    new_neighs.append(x)
                except ValueError:
                    pass
            #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
            #NNDrow = np.array([x[1] for x in neighbors])
            NNrow = np.array(NNrow)
            NNDrow = np.array(NNDrow)
            neighbors = new_neighs[0:K_neighs]
        except ValueError:
            printStatus(INFO, 'ERROR: id_img %s has non-standard format!' % (workingSet[i]))
            sys.exit(1)

        if len(neighbors) < K_neighs:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (workingSet[i], len(neighbors), K_neighs))
            sys.exit(1)

        if (i+1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i+1, tot_images))
        # Only the first K_neighs entries of NNrow/NNDrow are used; self-matches are skipped.
        for k in xrange(K_neighs):
            if i != int(NNrow[k]): # -1 zero on the diagonal for a later step
                I[n_entries] = i
                J[n_entries] = int(NNrow[k]) # -1
                V[n_entries] = NNDrow[k]
                n_entries += 1
                I[n_entries] = int(NNrow[k]) # -1
                J[n_entries] = i
                V[n_entries] = NNDrow[k]
                n_entries += 1

    # drop the unused tail (skipped self-matches leave slots empty)
    I = I[0:n_entries]
    J = J[0:n_entries]
    V = V[0:n_entries]

    printStatus(INFO, 'Removing duplicates')
    # Sort by I, then J, then V, and view each (I, J) row as one opaque item
    # so that np.unique can de-duplicate index pairs; return_index selects one
    # representative entry per pair.
    ind = np.lexsort((V,J,I))
    I = I[ind]
    J = J[ind]
    V = V[ind]
    a = np.concatenate([I.reshape(1, len(I)), J.reshape(1, len(J))], axis=0).T
    b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    del a
    _, idx = np.unique(b, return_index=True)
    del b

    I = I[idx]
    J = J[idx]
    V = V[idx]

    printStatus(INFO, 'Computing the final laplacian matrix')
    sigma = np.median(V) ** 2.;
    printStatus(INFO, 'Estimated sigma^2 = %f' % sigma)
    # turn the neighbour values into weights
    V = np.exp(-V / sigma)

    # column sums of the weight matrix become the diagonal entries
    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tocsr()
    new_diag = matrix.sum(axis=0).T
    # off-diagonal entries of the final matrix are the negated weights
    V = -V

    I_add = np.zeros((tot_images))
    J_add = np.zeros((tot_images))
    V_add = np.zeros((tot_images))
    for i,v in enumerate(new_diag):
        I_add[i] = i
        J_add[i] = i
        V_add[i] = v

    I = np.append(I, I_add)
    J = np.append(J, J_add)
    V = np.append(V, V_add)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tolil()

    printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'im_similarity' : matrix, 'sigma' : sigma})
예제 #26
0
def process(options, trainCollection, trainAnnotationName, feature):
    """Train one class-weighted liblinear model per concept and save it as a fastlinear model.

    Concepts whose model file already exists are skipped (unless
    overwriting); the remaining ones are distributed round-robin over
    ``options.numjobs`` jobs. When ``options.best_param_dir`` is set, C and
    the sigmoid parameters A/B are parsed from the first line of
    ``<best_param_dir>/<concept>.txt`` and the model directory is named
    'fastlinear-tuned'; otherwise C=1, A=0, B=0 and the directory is
    'fastlinear'.

    Args:
        options: parsed options providing ``rootpath``, ``best_param_dir``,
            ``overwrite``, ``numjobs`` and ``job`` (1-based).
        trainCollection: name of the training collection.
        trainAnnotationName: name of the annotation (concept list).
        feature: name of the feature under ``FeatureData``.

    Returns:
        Number of concepts trained by this job (0 if nothing was to do).
    """
    import re
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    best_param_dir = options.best_param_dir
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    beta = 0.5  # share of the total weight given to the positive class

    modelName = 'fastlinear'
    if best_param_dir:
        modelName += '-tuned'
    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [concept for i, concept in enumerate(todo) if i % numjobs == (job - 1)]
    if not todo:
        return 0

    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))

    feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            with open(param_file) as fin:
                m = p.search(fin.readline().strip())
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        # Class weights are inversely proportional to class size. The counts
        # are deliberately NOT called ``np`` so the numpy alias is not shadowed.
        nr_pos = sum(1 for lab in labels if 1 == lab)
        nr_neg = sum(1 for lab in labels if -1 == lab)
        wp = float(beta) * (nr_pos + nr_neg) / nr_pos
        wn = (1.0-beta) * (nr_pos + nr_neg) / nr_neg

        # no bias term added by setting "-B -1"
        svm_params = '-w1 %g -w-1 %g -s 2 -B -1 -q' % (wp*C, wn*C)
        model = liblinear_train(y, vectors, svm_params)
        newmodel = liblinear_to_fastlinear([model], [1.0], feat_file.ndims)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s'%model_file_name)
        fastlinear_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        reloaded = fastlinear_load_model(model_file_name)
        assert(reloaded is not None)
        # NOTE(review): A/B are compared on the in-memory model only; the
        # reloaded model's values are not compared because the precision of
        # the on-disk format is not visible here.
        assert(abs(newmodel.get_probAB()[0]-A)<1e-6)
        assert(abs(newmodel.get_probAB()[1]-B)<1e-6)

    return len(todo)
예제 #27
0
def process(options, workingCollection):
    """Build the tag-similarity matrix (or one chunk of it) and save it as .mat.

    The similarity of two tags is ``context + semantic``:
    ``context`` is ``ratiocs`` times the number of rows of the tag matrix in
    which the two tag columns sum to more than 1.5, divided by the sum of the
    two tag frequencies; ``semantic`` is ``(1 - ratiocs)`` times
    ``tag_semantic_similarity`` on the Brown-corpus information content,
    clipped at 0.

    With a single chunk the matrix is turned into a Laplacian (column sums on
    the diagonal, negated similarities elsewhere) and saved as
    ``laplacianT.mat``; with several chunks only the rows of this chunk are
    filled and the partial similarity matrix is saved as
    ``laplacianT_<chunk>.mat`` (0-based chunk index). The key in the .mat
    file is ``'tag_similarity'``.

    Args:
        options: parsed options providing ``rootpath``, ``overwrite``,
            ``chunk`` (1-based), ``nchunks`` and ``ratiocs``.
        workingCollection: name of the collection.

    Returns:
        0 when the result already exists and is not overwritten, else None.
        The interpreter exits with status 1 when the tags file is missing.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    chunk = options.chunk - 1
    n_chunks = options.nchunks
    ratio_cs = options.ratiocs
    assert chunk < n_chunks and chunk >= 0 and n_chunks > 0

    printStatus(INFO, 'RatioCS = %f' % ratio_cs)

    printStatus(INFO, 'Using Brown Corpus for the ic')
    brown_ic = wordnet_ic.ic('ic-brown.dat')

    tags_file = os.path.join(rootpath, workingCollection, 'TextData', 'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tags_file):
        printStatus(INFO, 'Tags file not found at %s Did you run wordnet_frequency_tags.py ?' % tags_file)
        sys.exit(1)

    if n_chunks > 1:
        resultfile = os.path.join(rootpath, workingCollection, 'LaplacianT', '%f'%(ratio_cs), 'laplacianT_%d.mat' % chunk)
    else:
        resultfile = os.path.join(rootpath, workingCollection, 'LaplacianT', '%f'%(ratio_cs), 'laplacianT.mat')
    if checkToSkip(resultfile, overwrite):
        return 0

    # Both datasets are copied into memory, so the file can be closed right away.
    tags_data = h5py.File(tags_file, 'r')
    try:
        vocab = list(tags_data['vocab'][:])
        tagmatrix = tags_data['tagmatrix'][:]
    finally:
        tags_data.close()
    N_tags = len(vocab)

    # single tag frequency
    frequency = tagmatrix.sum(axis=0)
    assert len(frequency) == len(vocab), "%d != %d" % (len(frequency), len(vocab))

    final_matrix = np.zeros((N_tags, N_tags))

    # similarity matrix
    printStatus(INFO, 'Building the similarity matrix')
    chunk_size = int(math.floor(N_tags / n_chunks))
    start_chunk = chunk * chunk_size
    if chunk == (n_chunks - 1):
        # the last chunk also takes the remainder
        end_chunk = N_tags
    else:
        end_chunk = (chunk + 1) * chunk_size

    for i in xrange(start_chunk, end_chunk):
        if i % 100 == 0:
            printStatus(INFO, '%d / %d done' % (i+1, end_chunk))
        # only the upper triangle is computed; the value is mirrored
        for k in xrange(i+1, N_tags):
            context = ratio_cs * np.sum(tagmatrix[:, [i, k]].sum(axis=1) > 1.5) / (frequency[i] + frequency[k])
            semantic = max(0, (1. - ratio_cs) * tag_semantic_similarity(vocab[i], vocab[k], brown_ic))
            final_matrix[i, k] = context + semantic
            final_matrix[k, i] = final_matrix[i, k]

    if n_chunks < 2:
        # laplacian: only possible when the whole matrix was computed here
        printStatus(INFO, 'Computing the laplacian matrix')
        new_diag = final_matrix.sum(axis=0).T
        final_matrix = - final_matrix
        for i in xrange(N_tags):
            final_matrix[i, i] = new_diag[i]
        printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    else:
        printStatus(INFO, 'Saving partial matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'tag_similarity' : final_matrix})
예제 #28
0
def process(options, testCollection, trainCollection, annotationName, feature):
    """Store nearest-neighbour index/distance matrices for TagProp in HDF5 files.

    Two files are produced:

    * ``nn_train.h5`` (skipped when ``options.onlytest`` is set) with ``NN``
      and ``NND``: for every training image the row numbers (in the sorted
      training set) and the values of its first ``k + 1`` neighbours, plus
      ``trainSet`` and ``concepts``.
    * ``nn_test.h5`` with ``NNT`` and ``NNDT``: the same for every test image
      with ``k`` neighbours, plus ``trainSet``, ``testSet`` and ``concepts``.

    Args:
        options: parsed options providing ``rootpath``, ``k``, ``distance``,
            ``overwrite`` and ``onlytest``.
        testCollection: name of the test collection.
        trainCollection: name of the training collection.
        annotationName: name of the annotation (concept list).
        feature: name of the feature used for the neighbour search.

    Returns:
        0 when a result already exists and is not overwritten, else None.
        The interpreter exits with status 1 when an image has too few neighbours.
    """
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    overwrite = options.overwrite
    onlytest = options.onlytest
    testset = testCollection

    nnName = distance + "knn"
    setting = '%s,%s,%d' % (feature, nnName, k)
    resultfile_train = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection, setting, 'nn_train.h5')
    resultfile_test = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection, annotationName, setting, 'nn_test.h5')

    skip_train = (not onlytest) and checkToSkip(resultfile_train, overwrite)
    if skip_train or checkToSkip(resultfile_test, overwrite):
        return 0

    testSet = readImageSet(testCollection, testset, rootpath)
    trainSet = readImageSet(trainCollection, trainCollection, rootpath)
    # sorted so that bisect_index can translate an image id into a row number
    testSet.sort()
    trainSet.sort()

    tagger = NAME_TO_TAGGER["preknn"](trainCollection, annotationName, feature, distance, rootpath=rootpath, k=1001)

    nr_test = len(testSet)
    nr_train = len(trainSet)
    printStatus(INFO, '%d test images, %d train images' % (nr_test, nr_train))

    # train -> train nearest neighbors
    if not onlytest:
        width = k + 1
        printStatus(INFO, 'Allocating NN, NND matrices')
        NN = np.zeros((nr_train, width), dtype=np.int32)
        NND = np.zeros((nr_train, width))

        printStatus(INFO, 'Filling NN, NND matrices')
        for row, id_img in enumerate(trainSet):
            neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (trainCollection, id_img))
            if len(neighbors) < width:
                printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), width))
                sys.exit(1)

            # every returned neighbour is looked up, only the first ``width`` are kept
            positions = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
            values = np.array([x[1] for x in neighbors])

            NN[row, :] = positions[:width]
            NND[row, :] = values[:width]

            if row % 1000 == 0:
                printStatus(INFO, '%d / %d images' % (row, nr_train))

        printStatus(INFO, 'Saving train matrices to file %s' % (resultfile_train))
        makedirsforfile(resultfile_train)
        fout = h5py.File(resultfile_train, 'w')
        fout['NN'] = NN
        fout['NND'] = NND
        fout['trainSet'] = trainSet
        fout['concepts'] = tagger.concepts
        fout.close()

        # release the train matrices before allocating the test ones
        del NN
        del NND

    # test -> train nearest neighbors
    printStatus(INFO, 'Allocating NNT, NNDT matrices')
    NNT = np.zeros((nr_test, k), dtype=np.int32)
    NNDT = np.zeros((nr_test, k))

    printStatus(INFO, 'Filling NNT, NNDT matrices')
    for row, id_img in enumerate(testSet):
        neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (testCollection, id_img))
        if len(neighbors) < k:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k))
            sys.exit(1)

        positions = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
        values = np.array([x[1] for x in neighbors])

        NNT[row, :] = positions[:k]
        NNDT[row, :] = values[:k]

        if row % 1000 == 0:
            printStatus(INFO, '%d / %d images' % (row, nr_test))

    printStatus(INFO, 'Saving test matrices to file %s' % (resultfile_test))
    makedirsforfile(resultfile_test)
    fout = h5py.File(resultfile_test, 'w')
    fout['NNT'] = NNT
    fout['NNDT'] = NNDT
    fout['trainSet'] = trainSet
    fout['testSet'] = testSet
    fout['concepts'] = tagger.concepts
    fout.close()
예제 #29
0
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    """Apply all concept models to the test images and write the tag votes.

    One line per image is written to ``id.tagvotes.txt`` (suffixed with
    ``.<numjobs>.<job>`` when several jobs are used):
    ``<image id> <tag> <score> <tag> <score> ...`` with tags ordered by
    descending score.

    Args:
        options: parsed options providing ``rootpath``, ``overwrite``,
            ``prob_output``, ``numjobs``, ``job`` (1-based) and ``blocksize``.
        testCollection: name of the test collection.
        trainCollection: name of the collection the models were trained on.
        trainAnnotationName: name of the annotation (concept list).
        feature: name of the feature under ``FeatureData``.
        modelName: model directory name; names starting with 'fik' are loaded
            with fiksvm, all others with fastlinear.

    Returns:
        Number of images processed; 0 when the result already exists and is
        not overwritten or when a model could not be loaded.
    """
    if not modelName.startswith('fik'):
        from fastlinear.fastlinear import fastlinear_load_model as load_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize

    outputName = '%s,%s' % (feature,modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs>1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    # images are distributed round-robin over the jobs; ``job`` is 1-based
    all_images = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [im for pos, im in enumerate(all_images) if pos % numjobs + 1 == job]
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,nr_of_test_images,resultfile))

    # load every concept model up front; give up as soon as one is missing
    model_dir = os.path.join(rootpath,trainCollection,'Models',trainAnnotationName,feature, modelName)
    models = []
    for concept in concepts:
        loaded = load_model(os.path.join(model_dir, '%s.model' % concept))
        models.append(loaded)
        if loaded is None:
            return 0

    feat_file = BigFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0
    test_time = 0
    done = 0
    start = 0

    while start < nr_of_test_images:
        end = min(nr_of_test_images, start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

        tick = time.time()
        renamed, test_X = feat_file.read(test_imset[start:end])
        read_time += time.time() - tick

        tick = time.time()
        output = []
        for name, vec in zip(renamed, test_X[:len(renamed)]):
            if prob_output:
                scores = [model.predict_probability(vec) for model in models]
            else:
                scores = [model.predict(vec) for model in models]
            tagvotes = sorted(zip(concepts, scores), key=lambda v:v[1], reverse=True)
            votes = " ".join(["%s %s" % (tag, niceNumber(vote,6)) for (tag,vote) in tagvotes])
            output.append('%s %s\n' % (name, votes))
        test_time += time.time() - tick

        start = end
        fw.write(''.join(output))
        fw.flush()
        done += len(output)

    # done
    printStatus(INFO, "%d done. read time %g seconds, test_time %g seconds" % (done, read_time, test_time))
    fw.close()
    return done
예제 #30
0
def process(options, trainCollection, valCollection, testCollection):
    """Train a W2VV_MS model, keep the weights of the best validation epoch, then run the test script.

    Steps, in order:
      1. read the run options and the model configuration
         ``w2vv_configs/<options.model_config>.py``;
      2. build the three text encoders (rnn / bow / w2v, taken from
         ``config.text_style`` split on '@');
      3. build and compile the model, optionally initialising it from
         ``options.init_model_from``;
      4. train for at most ``config.max_epochs`` epochs, validating after each
         epoch, saving weights when validation improves, decaying the
         learning rate and stopping early otherwise;
      5. write the training-loss and validation-performance histories;
      6. generate ``do_test_<config>_<testCollection>.sh`` from
         ``TEMPLATE_do_test.sh`` and execute it.

    The interpreter exits with status 0 when the text style is unsupported or
    when ``model.json`` already exists and ``options.overwrite`` is not set.

    Args:
        options: parsed options providing ``rootpath``, ``overwrite``,
            ``checkpoint``, ``init_model_from``, ``unroll``, ``corpus``,
            ``word2vec``, ``batch_size``, ``model_config`` and ``val_metric``.
        trainCollection: name of the training collection.
        valCollection: name of the validation collection.
        testCollection: name of the test collection.
    """
    # all three collections must be in the same language
    lang = which_language(trainCollection)
    assert(which_language(trainCollection) == which_language(valCollection))
    assert(which_language(trainCollection) == which_language(testCollection))

    rootpath = options.rootpath
    overwrite =  options.overwrite
    checkpoint = options.checkpoint
    init_model_from = options.init_model_from
    unroll = options.unroll
    corpus = options.corpus
    word2vec = options.word2vec
    batch_size = options.batch_size
    
    w2vv_config = options.model_config
    config = load_config('w2vv_configs/%s.py' % w2vv_config)

    img_feature = config.img_feature
    set_style = config.set_style
    # text embedding style (word2vec, bag-of-words, word hashing)
    text_style = config.text_style
    L1_normalize = config.L1_normalize
    L2_normalize = config.L2_normalize
    
    bow_vocab = config.bow_vocab+'.txt'

    l2_p = config.l2_p
    dropout = config.dropout
    
    max_epochs= config.max_epochs
    optimizer = config.optimizer
    loss_fun = config.loss_fun
    lr = config.lr
    clipnorm = config.clipnorm
    activation = config.activation
    sequences = config.sequences

    # lstm
    sent_maxlen = config.sent_maxlen
    embed_size = config.embed_size
    we_trainable = config.we_trainable
    lstm_size = config.lstm_size

    # layer sizes are given as a '-' separated string, e.g. "a-b-c"
    n_layers = map(int, config.n_layers.strip().split('-'))

    # the name of the initial model (last path component) becomes part of the output path
    if init_model_from != '':
        init_model_name = init_model_from.strip().split("/")[-1]
        train_style = INFO + "_" + init_model_name
    else:
        train_style = INFO

    rnn_style, bow_style, w2v_style = text_style.strip().split('@')
    
    # text embedding style
    model_info = w2vv_config

    if 'lstm' in text_style or 'gru' in text_style:
        if lang == 'zh':
            w2v_data_path = os.path.join(rootpath, 'zh_w2v', 'model', 'zh_jieba.model')
        else:
            w2v_data_path = os.path.join(rootpath, "word2vec", corpus, word2vec)

        # bag-of-words vocabulary file path
        text_data_path = os.path.join(rootpath, trainCollection, "TextData", "vocabulary", "bow", bow_vocab)
        bow_data_path = os.path.join(rootpath, trainCollection, "TextData", "vocabulary", bow_style, bow_vocab)
        
        # text embedding (text representation)
        text2vec = get_text_encoder(rnn_style)(text_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize, maxlen=sent_maxlen)
        bow2vec = get_text_encoder(bow_style)(bow_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize)
        w2v2vec = get_text_encoder(w2v_style)(w2v_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize)
        # a first layer size of 0 means "derive it from the encoders";
        # otherwise it must match the bow + w2v dimensionality
        if n_layers[0] == 0:
            n_layers[0] = bow2vec.ndims + w2v2vec.ndims
        else:
            assert n_layers[0] == bow2vec.ndims + w2v2vec.ndims

        # log file
        checkpoint_dir = os.path.join(rootpath, trainCollection, checkpoint, valCollection, train_style, model_info)

    else:
        logger.info("%s is not supported, please check the 'text_style' parameter", text_style)
        sys.exit(0)

    train_loss_hist_file = os.path.join(checkpoint_dir, 'train_loss_hist.txt')
    val_per_hist_file = os.path.join(checkpoint_dir, 'val_per_hist.txt')
    model_file_name = os.path.join(checkpoint_dir, 'model.json')
    model_img_name = os.path.join(checkpoint_dir, 'model.png')

    logger.info(model_file_name)
    if checkToSkip(model_file_name, overwrite):
        sys.exit(0)

    makedirsforfile(val_per_hist_file)

    # img2vec
    # NOTE(review): training and validation features are read from the same
    # FULL_COLLECTION path, through two separate BigFile objects.
    img_feat_path = os.path.join(rootpath, FULL_COLLECTION, 'FeatureData', img_feature)
    img_feats = BigFile(img_feat_path)

    val_img_feat_path = os.path.join(rootpath, FULL_COLLECTION, 'FeatureData', img_feature)
    val_img_feats = BigFile(val_img_feat_path)

    # dataset 
    train_file = os.path.join(rootpath, trainCollection, 'TextData', '%s.caption.txt' % trainCollection)

    # training set
    # print "loss function: ", loss_fun
    dataset_style = 'sent_' + loss_fun
    DataSet = get_dataset(dataset_style)

    # represent text on the fly
    trainData = DataSet(train_file, batch_size, text2vec, bow2vec, w2v2vec, img_feats, flag_maxlen=True, maxlen=sent_maxlen)

    # get pre-trained word embedding
    we_weights = get_we_parameter(text2vec.vocab, w2v_data_path, lang)
    # define word2visualvec model
    w2vv = W2VV_MS( text2vec.nvocab, sent_maxlen, embed_size, we_weights, we_trainable, lstm_size, n_layers, dropout, l2_p, activation=activation, lstm_style=rnn_style, sequences=sequences, unroll=unroll)

    # the architecture (json) and its picture are stored before training starts
    w2vv.save_json_model(model_file_name)
    w2vv.plot(model_img_name)
    w2vv.compile_model(optimizer, loss_fun, learning_rate = lr, clipnorm=clipnorm)
   

    if options.init_model_from != '':
        logger.info('initialize the model from %s', options.init_model_from)
        w2vv.init_model(options.init_model_from)

    # preparation for validation
    val_sent_file = os.path.join(rootpath, valCollection, 'TextData', '%s.caption.txt' % valCollection)
    val_sents_id, val_sents, val_id2sents = readSentsInfo(val_sent_file)
    val_img_list = map(str.strip, open(os.path.join(rootpath, valCollection,  set_style, '%s.txt' % valCollection)).readlines())

    # Encode the validation sentences once; a sentence is kept only when all
    # three encoders return a vector. Input 1 is the rnn encoding, input 2 the
    # concatenation of the bow and w2v vectors.
    sent_feats_1 = []
    sent_feats_2 = []
    new_val_sents_id = []
    for index, sent in enumerate(val_sents):
        sent_vec = text2vec.mapping(sent)
        bow_vec = bow2vec.mapping(sent)
        w2v_vec = w2v2vec.mapping(sent)
        if sent_vec is not None and bow_vec is not None and w2v_vec is not None:
            sent_feats_1.append(sent_vec)
            sent_feats_2.append(list(bow_vec) + list(w2v_vec))
            new_val_sents_id.append(val_sents_id[index])
    sent_feats_1 = pad_sequences(sent_feats_1, maxlen=sent_maxlen,  truncating='post')

    simer = get_simer('cosine_batch')()
    scorer = getScorer(options.val_metric)

    count = 0                   # consecutive epochs without improvement
    lr_count = 0                # epochs since the last learning-rate decay
    best_validation_perf = 0
    best_epoch = -1             # -1 until a first set of weights has been saved
    train_loss_hist = []
    val_per_hist = []
    n_train_batches = int(np.ceil( 1.0 * trainData.datasize / batch_size ))
    # ``datasize`` is only used as the target of the progress bar
    if loss_fun == 'ctl':
        datasize = 2*trainData.datasize
    else:
        datasize = trainData.datasize
    for epoch in range(max_epochs):
        logger.info('Epoch %d', epoch)
        logger.info("Training..., learning rate: %g", w2vv.get_lr())
        
        train_loss_epoch = []
        train_progbar = generic_utils.Progbar(datasize)
        trainBatchIter = trainData.getBatchData()
        for minibatch_index in xrange(n_train_batches):
            train_X_batch, train_Y_batch = trainBatchIter.next()
            loss = w2vv.model.train_on_batch(train_X_batch, train_Y_batch)
            train_progbar.add(train_X_batch[0].shape[0], values=[("train loss", loss)])

            train_loss_epoch.append(loss)

        train_loss_hist.append(np.mean(train_loss_epoch))

        this_validation_perf = do_validation(val_img_list, val_img_feats, new_val_sents_id, sent_feats_1, sent_feats_2, simer, scorer, w2vv)
        val_per_hist.append(this_validation_perf)

        logger.info('previous_best_performance: %g', best_validation_perf)
        logger.info('current_performance: %g', this_validation_perf)

        fout_file = os.path.join(checkpoint_dir, 'epoch_%d.h5' % ( epoch))

        lr_count += 1
        if this_validation_perf > best_validation_perf:
            best_validation_perf = this_validation_perf          
            count = 0

            # save best model
            w2vv.model.save_weights(fout_file)
            # only the weights of the best epoch are kept on disk
            if best_epoch != -1:
                os.system('rm '+ os.path.join(checkpoint_dir, 'epoch_%d.h5' % (best_epoch)))
            best_epoch = epoch
        else:
            # when the validation performance has decreased after an epoch,
            # we divide the learning rate by 2 and continue training;
            # but we use each learning rate for at least 3 epochs.
            if lr_count > 2:
                w2vv.decay_lr(0.5)
                lr_count = 0
            count += 1
            # stop after more than 10 consecutive epochs without improvement
            if count > 10:
                print ("Early stopping happend")
                break


    # training loss per epoch, in epoch order
    sorted_epoch_loss = zip(range(len(train_loss_hist)), train_loss_hist)
    with open(train_loss_hist_file, 'w') as fout:
        for i, loss in sorted_epoch_loss:
            fout.write("epoch_" + str(i) + " " + str(loss) + "\n")


    # validation performance per epoch, best epoch first
    sorted_epoch_perf = sorted(zip(range(len(val_per_hist)), val_per_hist), key = lambda x: x[1], reverse=True)
    with open(val_per_hist_file, 'w') as fout:
        for i, perf in sorted_epoch_perf:
            fout.write("epoch_" + str(i) + " " + str(perf) + "\n")


    # generate the shell script for test
    # NOTE(review): the weight file name is taken from the top entry of
    # sorted_epoch_perf; if no epoch ever scored above 0, no weights were
    # saved and the referenced epoch_<n>.h5 does not exist -- confirm this
    # cannot happen in practice.
    templete = ''.join(open( 'TEMPLATE_do_test.sh').readlines())
    striptStr = templete.replace('@@@rootpath@@@', rootpath)
    striptStr = striptStr.replace('@@@overwrite@@@', str(overwrite))
    striptStr = striptStr.replace('@@@trainCollection@@@', trainCollection)
    striptStr = striptStr.replace('@@@testCollection@@@', '%s %s'%(valCollection, testCollection))
    striptStr = striptStr.replace('@@@model_config@@@', w2vv_config)
    striptStr = striptStr.replace('@@@set_style@@@', set_style)
    striptStr = striptStr.replace('@@@model_path@@@', checkpoint_dir)
    striptStr = striptStr.replace('@@@model_name@@@', 'model.json')
    striptStr = striptStr.replace('@@@weight_name@@@', 'epoch_%d.h5' % sorted_epoch_perf[0][0])
    runfile = 'do_test_%s_%s.sh' % (w2vv_config, testCollection)
    open( runfile, 'w' ).write(striptStr+'\n')
    # make the generated script executable and run it from the current directory
    os.system('chmod +x %s' % runfile)
    os.system('./%s' % runfile)
예제 #31
0
def process(opt, trainCollection, valCollection, testCollection):
    """Train a multi-scale text-to-visual model and emit a test script.

    The run is stored in a checkpoint directory named after the SHA-1 of the
    concatenated configuration string.  For up to ``opt.max_epochs`` epochs
    the model is trained on ``trainCollection`` and scored on
    ``valCollection``; only the weights of the best epoch so far are kept on
    disk, ``model.decay_lr(0.5)`` is called when validation does not improve
    (at most once every 3 epochs), and training stops once more than 10
    epochs in a row fail to improve.

    Afterwards the per-epoch validation scores are written to
    ``val_per_hist.txt`` (best first), ``do_test_<testCollection>.sh`` is
    rendered from ``TEMPLATE_do_test.sh`` in the current directory, and the
    best epoch's weights are copied to ``best_model.h5``.

    Args:
        opt: options object.  It is mutated here (``n_text_layers``,
            ``rnn_style``, ``vocab_size`` and ``embed_size`` are assigned)
            and then pickled to ``option.pkl`` in the checkpoint directory.
        trainCollection: collection providing the training captions and
            image features.
        valCollection: collection used for validation after every epoch.
        testCollection: collection name substituted into the test script.

    The function calls ``sys.exit(0)`` when the text style / model name
    combination is not supported, or when ``checkToSkip`` reports that
    ``val_per_hist.txt`` should not be regenerated.
    """

    rootpath = opt.rootpath
    overwrite = opt.overwrite

    # '-'-separated layer sizes -> ints.  Item assignment further down
    # (opt.n_text_layers[0] = ...) needs this to be a list, as Python 2's
    # map() returns.
    opt.n_text_layers = map(int, opt.n_text_layers.strip().split('-'))

    # Fine-tuning runs carry the name of the initial model in their style tag.
    if opt.init_model_from != '':
        init_model_name = opt.init_model_from.strip().split("/")[-1]
        train_style = opt.model_name + "_" + INFO + "_ft_" + init_model_name
    else:
        train_style = opt.model_name + "_" + INFO

    # text embedding style
    # Only "<rnn>@<bow>@<w2v>" styles combined with a "*_ms" model are
    # accepted; anything else terminates the process below.
    if '@' in opt.text_style and opt.model_name.endswith('_ms'):
        rnn_style, bow_style, w2v_style = opt.text_style.strip().split('@')
        opt.rnn_style = rnn_style
        text_data_path = os.path.join(rootpath, trainCollection, "TextData",
                                      "vocabulary", "bow", opt.rnn_vocab)
        bow_data_path = os.path.join(rootpath, trainCollection, "TextData",
                                     "vocabulary", bow_style, opt.bow_vocab)
        w2v_data_path = os.path.join(rootpath, "word2vec", opt.corpus,
                                     opt.word2vec)
        text_name = opt.bow_vocab + "_rnn_%d_%s_sent_%d" % (
            opt.rnn_size, opt.rnn_vocab, opt.sent_maxlen)
    else:
        print opt.text_style + " is not supported, please check the 'text_style' parameter"
        sys.exit(0)

    optm_style = opt.optimizer + '_clipnorm_%.1f_lr_%.5f_dp_%.2f_l2_%.5f_%s_bs_%d' % \
        (opt.clipnorm, opt.lr, opt.dropout, opt.l2_p, opt.loss_fun, opt.batch_size)
    model_style = "-".join(map(
        str, opt.n_text_layers)) + '_' + opt.hidden_act + '_' + opt.simi_fun

    # All configuration pieces are concatenated without a separator and
    # hashed, so every distinct configuration gets its own directory.
    model_id = "".join([
        opt.checkpoint, 'w2vv', valCollection, train_style,
        opt.text_style + '_' + text_name, opt.img_feature, optm_style,
        model_style, opt.postfix
    ])
    checkpoint_dir = os.path.join(rootpath, trainCollection, "train_results",
                                  hashlib.sha1(model_id).hexdigest())

    # output visualization script
    # (written to the current working directory; takes the port as $1)
    runfile_vis = 'do_visual.sh'
    open(runfile_vis, 'w').write(
        'port=$1\ntensorboard --logdir %s --port $port' % checkpoint_dir)
    os.system('chmod +x %s' % runfile_vis)

    # val_per_hist.txt is only written at the very end of this function, so
    # it doubles as the marker that checkToSkip inspects.
    val_per_hist_file = os.path.join(checkpoint_dir, 'val_per_hist.txt')
    if checkToSkip(val_per_hist_file, overwrite):
        sys.exit(0)
    # else:
    #     if os.path.exists(checkpoint_dir):
    #         os.system("rm -r " + checkpoint_dir)
    makedirsforfile(val_per_hist_file)
    model_file_name = os.path.join(checkpoint_dir, 'model.json')
    model_img_name = os.path.join(checkpoint_dir, 'model.png')
    tb_logger.configure(checkpoint_dir, flush_secs=5)

    # text embedding (text representation)
    if '@' in opt.text_style and opt.model_name.endswith('_ms'):
        text2vec = get_text_encoder(rnn_style)(text_data_path)
        bow2vec = get_text_encoder(bow_style)(bow_data_path)
        w2v2vec = get_text_encoder(w2v_style)(w2v_data_path)
        # A first layer size of 0 means "derive it from the encoders";
        # otherwise the configured size must match bow + w2v dimensions.
        if opt.n_text_layers[0] == 0:
            opt.n_text_layers[0] = bow2vec.ndims + w2v2vec.ndims
        else:
            assert opt.n_text_layers[0] == bow2vec.ndims + w2v2vec.ndims
        opt.vocab_size = text2vec.n_vocab
        opt.embed_size = w2v2vec.ndims
    else:
        # NOTE(review): this branch looks unreachable -- the same condition
        # already triggered sys.exit(0) above -- and bow2vec / w2v2vec, which
        # are used unconditionally below, would be undefined here.
        text2vec = get_text_encoder(opt.text_style)(text_data_path,
                                                    ndims=opt.n_text_layers[0])
        if opt.n_text_layers[0] == 0:
            opt.n_text_layers[0] = text2vec.ndims

    # img2vec
    img_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData',
                                 opt.img_feature)
    img_feats = BigFile(img_feat_path)

    val_img_feat_path = os.path.join(rootpath, valCollection, 'FeatureData',
                                     opt.img_feature)
    val_img_feats = BigFile(val_img_feat_path)

    # write out options for evaluation
    pkl_file = os.path.join(checkpoint_dir, 'option.pkl')
    writePkl(opt, pkl_file)

    # define word2visualvec model
    if opt.model_name.endswith('_ms'):
        we_weights = get_we_parameter(text2vec.vocab, w2v_data_path)
        print we_weights.shape
        model = get_model(opt.model_name)(opt, we_weights=we_weights)
    else:
        model = get_model(opt.model_name)(opt)
    model.save_json_model(model_file_name)
    model.plot(model_img_name)
    model.compile_model(opt.loss_fun, opt=opt)
    if opt.init_model_from != '':
        print '*' * 20
        print 'initialize the model form ' + opt.init_model_from
        print '*' * 20
        model.init_model(opt.init_model_from)

    # training set
    caption_file = os.path.join(rootpath, trainCollection, 'TextData',
                                '%s.caption.txt' % trainCollection)
    trainData = PairDataSet_MS(caption_file,
                               opt.batch_size,
                               text2vec,
                               bow2vec,
                               w2v2vec,
                               img_feats,
                               flag_maxlen=True,
                               maxlen=opt.sent_maxlen)

    val_sent_file = os.path.join(rootpath, valCollection, 'TextData',
                                 '%s.caption.txt' % valCollection)
    val_img_list, val_sents_id, val_sents = readImgSents(val_sent_file)

    losser = get_losser(opt.simi_fun)()

    # Training state:
    #   n_step    - global mini-batch counter used as the logging step
    #   count     - epochs since the last validation improvement
    #   lr_count  - epochs since the learning rate was last decayed
    #   best_epoch - epoch whose weights are currently on disk (-1: none)
    best_validation_perf = 0
    n_step = 0
    count = 0
    lr_count = 0
    best_epoch = -1
    val_per_hist = []
    for epoch in range(opt.max_epochs):
        print '\nEpoch', epoch
        print "Training..."
        print "learning rate: ", model.get_lr()
        tb_logger.log_value('lr', model.get_lr(), step=n_step)

        train_progbar = generic_utils.Progbar(trainData.datasize)
        trainBatchIter = trainData.getBatchData()
        for minibatch_index in xrange(trainData.max_batch_size):
            n_step += 1
            img_X_batch, text_X_batch = trainBatchIter.next()
            # Text is the model input, the image feature is the target.
            loss_batch = model.model.train_on_batch(text_X_batch, img_X_batch)
            train_progbar.add(img_X_batch.shape[0],
                              values=[("loss", loss_batch)])

            tb_logger.log_value('loss', loss_batch, step=n_step)
            tb_logger.log_value('n_step', n_step, step=n_step)

        print "\nValidating..."
        all_errors = pred_mutual_error_ms(val_img_list,
                                          val_sents,
                                          model,
                                          text2vec,
                                          bow2vec,
                                          w2v2vec,
                                          val_img_feats,
                                          losser,
                                          opt=opt)

        this_validation_perf = cal_val_perf(all_errors, opt=opt)
        tb_logger.log_value('val_accuracy', this_validation_perf, step=n_step)

        val_per_hist.append(this_validation_perf)

        print 'previous_best_performance: %.3f' % best_validation_perf
        print 'current_performance: %.3f' % this_validation_perf

        fout_file = os.path.join(checkpoint_dir, 'epoch_%d.h5' % (epoch))

        lr_count += 1
        # Strict '>' against an initial best of 0: an epoch only counts as an
        # improvement (and gets its weights saved) if it scores above 0 and
        # above every earlier epoch.
        if this_validation_perf > best_validation_perf:
            best_validation_perf = this_validation_perf
            count = 0

            # save model
            model.model.save_weights(fout_file)
            # Remove the previously best weights so only one epoch_*.h5 file
            # written by this run remains.
            if best_epoch != -1:
                os.system('rm ' + os.path.join(checkpoint_dir, 'epoch_%d.h5' %
                                               (best_epoch)))
            best_epoch = epoch

        else:
            # when the validation performance has decreased after an epoch,
            # we divide the learning rate by 2 and continue training;
            # but we use each learning rate for at least 3 epochs.
            if lr_count > 2:
                model.decay_lr(0.5)
                lr_count = 0
            count += 1
            if count > 10:
                print("Early stopping happened")
                break

    # Epoch indices paired with their validation score, best score first.
    sorted_epoch_perf = sorted(zip(range(len(val_per_hist)), val_per_hist),
                               key=lambda x: x[1],
                               reverse=True)
    with open(val_per_hist_file, 'w') as fout:
        for i, perf in sorted_epoch_perf:
            fout.write("epoch_" + str(i) + " " + str(perf) + "\n")

    # generate the shell script for test
    # NOTE(review): sorted_epoch_perf[0] raises IndexError when no epoch ran
    # (opt.max_epochs == 0), and if no epoch ever scored above 0 the
    # referenced epoch_*.h5 file was never saved -- confirm these cases
    # cannot occur in practice.
    templete = ''.join(open('TEMPLATE_do_test.sh').readlines())
    striptStr = templete.replace('@@@rootpath@@@', rootpath)
    striptStr = striptStr.replace('@@@trainCollection@@@', trainCollection)
    striptStr = striptStr.replace('@@@valCollection@@@', valCollection)
    striptStr = striptStr.replace('@@@testCollection@@@', testCollection)
    striptStr = striptStr.replace('@@@model_path@@@', checkpoint_dir)
    striptStr = striptStr.replace('@@@weight_name@@@',
                                  'epoch_%d.h5' % sorted_epoch_perf[0][0])
    striptStr = striptStr.replace('@@@n_caption@@@', str(opt.n_caption))

    print os.path.join(checkpoint_dir, 'epoch_%d.h5' % sorted_epoch_perf[0][0])
    runfile = 'do_test_%s.sh' % (testCollection)
    open(runfile, 'w').write(striptStr + '\n')
    os.system('chmod +x %s' % runfile)
    # os.system('./'+runfile)
    # Keep a copy of the best weights under a fixed, epoch-independent name.
    os.system('cp %s/epoch_%d.h5 %s/best_model.h5' %
              (checkpoint_dir, sorted_epoch_perf[0][0], checkpoint_dir))
예제 #32
0
# Script fragment: prepare the paths for merging the feature data of two
# collections into a combined "<coll1>+<coll2>" collection.
# Expects three positional arguments: two collection names and a feature name.
if len(sys.argv) < 4:
    print "Usage: merge_datasets.py trainCollection testCollection feature"
    sys.exit(1)

coll1 = sys.argv[1]
coll2 = sys.argv[2]
feature = sys.argv[3]

# NOTE(review): `datapath` is not defined in the visible part of this
# snippet -- presumably set earlier in the original script; confirm.
coll1_features_file = "%s/%s/FeatureData/%s/feature.bin" % (datapath, coll1,
                                                            feature)
coll2_features_file = "%s/%s/FeatureData/%s/feature.bin" % (datapath, coll2,
                                                            feature)
new_features_file = "%s/%s+%s/FeatureData/%s/feature.bin" % (datapath, coll1,
                                                             coll2, feature)
makedirsforfile(new_features_file)

coll1_shape_file = "%s/%s/FeatureData/%s/shape.txt" % (datapath, coll1,
                                                       feature)
coll2_shape_file = "%s/%s/FeatureData/%s/shape.txt" % (datapath, coll2,
                                                       feature)
new_shape_file = "%s/%s+%s/FeatureData/%s/shape.txt" % (datapath, coll1, coll2,
                                                        feature)
makedirsforfile(new_shape_file)

# shape file
# Each shape.txt is read as "<number of images> <feature dimension>"; both
# collections must report the same feature dimension.
# NOTE(review): nothing is written to `fout` in the visible code, so this
# leaves an empty shape file -- the snippet appears to be truncated here.
with open(new_shape_file, 'w') as fout:
    imA, featA = open(coll1_shape_file).read().strip().split(" ")
    imB, featB = open(coll2_shape_file).read().strip().split(" ")
    assert featA == featB
def process(options, collection, annotationName, pos_num):
    """Create randomly sampled positive/negative annotation bags.

    For every (positive bag, negative bag) index pair a new annotation set
    named ``<stem>.random<pos_num>.<p>.npr<ratio>.<n>.txt`` is registered,
    and a list of all those names is written under ``annotationfiles``.
    Then, per concept, up to ``pos_num`` positives are sampled once per
    positive bag, and for every negative bag a fresh sample of negatives is
    drawn and written together with the positives.

    Args:
        options: provides ``rootpath``, ``pos_bag_num``, ``neg_bag_num``,
            ``neg_pos_ratio`` and ``overwrite``.
        collection: name of the collection holding the annotations.
        annotationName: source annotation set; must end with ``.txt``.
        pos_num: maximum number of positive examples per bag.

    Returns:
        0 when every annotation set was skipped, otherwise None.
    """
    assert (annotationName.endswith('.txt'))
    rootpath = options.rootpath
    n_pos_bags = options.pos_bag_num
    n_neg_bags = options.neg_bag_num
    npr = options.neg_pos_ratio

    # Two '%d' slots remain open: positive-bag index, negative-bag index.
    name_template = ''.join([
        annotationName[:-4], '.random%d' % pos_num, '.%d', '.npr%d' % npr,
        '.%d.txt'
    ])

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    # Flat list ordered by positive bag first, negative bag second, so the
    # entry for (p, n) sits at index p * n_neg_bags + n.
    newAnnotationNames = [
        name_template % (p, n)
        for p in range(n_pos_bags)
        for n in range(n_neg_bags)
    ]

    skip = 0
    for bag_name in newAnnotationNames:
        concept_target = os.path.join(rootpath, collection, 'Annotations',
                                      bag_name)
        if checkToSkip(concept_target, options.overwrite):
            skip += 1
        else:
            writeConcepts(concepts, concept_target)

    # The list file is named like the template with index ranges filled in.
    first, second, last = name_template.split('%d')
    pos_range = '0-%d' % (n_pos_bags - 1)
    neg_range = '0-%d' % (n_neg_bags - 1)
    scriptfile = os.path.join(rootpath, collection, 'annotationfiles',
                              first + pos_range + second + neg_range + last)

    makedirsforfile(scriptfile)
    fout = open(scriptfile, 'w')
    fout.write('\n'.join(newAnnotationNames) + '\n')
    fout.close()

    if skip == len(newAnnotationNames):
        return 0

    for concept in concepts:
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        positivePool = [name for name, lab in zip(names, labels) if lab > 0]
        negativePool = [name for name, lab in zip(names, labels) if lab < 0]

        for idxp in range(n_pos_bags):
            # One positive sample per positive bag, shared by all its
            # negative bags; small pools are used as a whole.
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool

            for idxn in range(n_neg_bags):
                anno_idx = idxp * n_neg_bags + idxn
                resultfile = os.path.join(rootpath, collection, 'Annotations',
                                          'Image',
                                          newAnnotationNames[anno_idx],
                                          '%s.txt' % concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue

                # At least 1000 negatives are requested, capped by the pool.
                wanted = max(len(positiveBag) * npr, 1000)
                negativeBag = random.sample(negativePool,
                                            min(len(negativePool), wanted))

                assert set(positiveBag).isdisjoint(negativeBag)
                message = "anno(%s,%d) %d pos %d neg -> %s" % (
                    concept, anno_idx, len(positiveBag), len(negativeBag),
                    resultfile)
                printStatus(INFO, message)

                bag_labels = [1] * len(positiveBag) + [-1] * len(negativeBag)
                writeAnnotations(positiveBag + negativeBag, bag_labels,
                                 resultfile)
예제 #34
0
def process(options, trainCollection, annotationfile, feature, modelName):
    """Train one averaged SVM ensemble per concept and save it to disk.

    ``annotationfile`` lists annotation-set names (one per line; blank lines
    and lines starting with '#' are ignored).  For every concept a model is
    trained on each annotation set, compressed, and folded into a running
    average (the t-th model enters with weight 1/t).  The result is saved as
    ``<rootpath>/<trainCollection>/Models/<annotationfile basename>/
    <feature>/<modelName>/<concept>.model``.

    Args:
        options: provides ``rootpath``, ``overwrite``, ``numjobs``, ``job``
            and, for the 'fik' model, ``nr_bins``.
        trainCollection: collection holding annotations and features.
        annotationfile: path of the file listing the annotation sets.
        feature: name of the feature directory under ``FeatureData``.
        modelName: 'fik' or 'fastlinear'.

    Returns:
        0 when a listed annotation set is missing or nothing is left to do,
        otherwise None.

    Raises:
        ZeroDivisionError: if the training data of a concept contains no
            positive or no negative example.
    """
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1 #options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        # minmax.txt: first line minimum values, second line maximum values.
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    with open(annotationfile) as f:
        stripped_lines = [line.strip() for line in f]
    trainAnnotationNames = [x for x in stripped_lines if x and not x.startswith('#')]

    # Refuse to start unless every listed annotation set exists.
    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print('%s does not exist' % conceptfile)
            return 0

    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # Round-robin split of the remaining concepts over `numjobs` jobs;
    # `job` is 1-based.
    todo = [concept for i, concept in enumerate(todo) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t, annotationName in enumerate(trainAnnotationNames, 1):
            names,labels = readAnnotationsFrom(trainCollection, annotationName, concept, skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names,labels))
            renamed,vectors = train_feat_file.read(names)
            # Labels re-ordered to line up with the vectors actually read.
            Ys = [name2label[x] for x in renamed]

            if autoweight:
                # Count classes on the labels that are really trained on
                # (Ys), not on the raw annotation list, so the weights match
                # the training data even if `renamed` differs from `names`.
                n_pos = len([1 for lab in Ys if 1 == lab])
                n_neg = len([1 for lab in Ys if -1 == lab])
                wp = float(beta) * (n_pos + n_neg) / n_pos
                wn = (1.0 - beta) * (n_pos + n_neg) / n_neg
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                # Running average: old ensemble weighted (t-1)/t, new 1/t.
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1-1.0/t, 1.0/t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
예제 #35
0
            int,
            open(shapefile).readline().strip().split())
        nr_of_images_list.append(nr_of_images)
        feat_dim_list.append(feat_dim)
        feat_files.append(
            BigFile(os.path.join(rootpath, collection, 'FeatureData',
                                 feature)))

    #assert(nr_of_images_list[0] == nr_of_images_list[1])
    new_feat_dim = sum(feat_dim_list)

    imset = readImageSet(collection, collection, rootpath)
    nr_of_images = len(imset)
    blocksize = 1000

    makedirsforfile(binary_file)
    fw = open(binary_file, 'wb')
    new_imset = []
    start = 0

    while start < nr_of_images:
        end = min(nr_of_images, start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

        renamed_0, vecs_0 = feat_files[0].read(imset[start:end])
        renamed_1, vecs_1 = feat_files[1].read(imset[start:end])

        sorted_idx_0 = np.argsort(renamed_0)
        sorted_idx_1 = np.argsort(renamed_1)

        for x, y in zip(sorted_idx_0, sorted_idx_1):
예제 #36
0
def process(options, feature, srcCollections, newCollection):
    """Merge the binary features of several collections into a new one.

    Reads ``shape.txt``, ``id.txt`` and ``feature.bin`` of every source
    collection and writes the vectors of all images not seen before -- and,
    when ``ImageSets/<newCollection>.txt`` exists and is non-empty, listed
    in it -- to the new collection's ``feature.bin``, ``id.txt`` and
    ``shape.txt``.

    Args:
        options: provides ``rootpath`` and ``overwrite``.
        feature: name of the feature directory under ``FeatureData``.
        srcCollections: list of source collection names; duplicates are
            ignored, first occurrence wins.
        newCollection: name of the collection to create.

    Returns:
        0 when the result already exists and is skipped, otherwise None.

    Raises:
        ValueError: if no source collection is given, or if the source
            collections do not share one feature dimension.  Both are
            detected before any output file is created.
    """
    assert (type(srcCollections) == list)

    # unique source collections, first-seen order preserved
    unique_collections = []
    for name in srcCollections:
        if name not in unique_collections:
            unique_collections.append(name)
    srcCollections = unique_collections

    rootpath = options.rootpath

    resfile = os.path.join(rootpath, newCollection, 'FeatureData', feature,
                           'feature.bin')
    if checkToSkip(resfile, options.overwrite):
        return 0

    if not srcCollections:
        raise ValueError('no source collections to merge into %s' %
                         newCollection)

    querysetfile = os.path.join(rootpath, newCollection, 'ImageSets',
                                '%s.txt' % newCollection)
    try:
        with open(querysetfile) as fr:
            query_set = set(line.strip() for line in fr)
    except IOError:
        printStatus(
            INFO, 'failed to load %s, will merge all features in %s' %
            (querysetfile, ';'.join(srcCollections)))
        query_set = None
    else:
        printStatus(INFO, '%d images wanted' % len(query_set))

    # Validate all shapes up front: a merged feature.bin is described by a
    # single dimension in shape.txt, so mixed dimensions would yield a file
    # that cannot be read back.
    feat_dirs = [
        os.path.join(rootpath, collection, 'FeatureData', feature)
        for collection in srcCollections
    ]
    feat_dims = []
    for feat_dir in feat_dirs:
        with open(os.path.join(feat_dir, 'shape.txt')) as fr:
            _nr_of_images, dim = map(int, fr.readline().strip().split())
        feat_dims.append(dim)
    if len(set(feat_dims)) != 1:
        raise ValueError(
            'feature dimensions differ across source collections: %s' %
            ', '.join('%s=%d' % pair
                      for pair in zip(srcCollections, feat_dims)))
    feat_dim = feat_dims[0]

    makedirsforfile(resfile)
    printStatus(INFO, 'writing results to %s' % resfile)
    seen = set()
    newimset = []
    res = array.array('f')  # reusable buffer holding one feature vector

    with open(resfile, 'wb') as fw:
        for feat_dir in feat_dirs:
            with open(os.path.join(feat_dir, 'id.txt')) as fr:
                srcimset = fr.readline().strip().split()

            with open(os.path.join(feat_dir, 'feature.bin'), 'rb') as fr:
                for i, im in enumerate(srcimset):
                    # Always consume the vector so the file position stays
                    # aligned with the id list, even for skipped images.
                    res.fromfile(fr, feat_dim)
                    if im not in seen:
                        seen.add(im)
                        # An empty or missing query set means "keep all".
                        if not query_set or im in query_set:
                            np.array(res, dtype=np.float32).tofile(fw)
                            newimset.append(im)
                    del res[:]
                    if i % 100000 == 0:
                        printStatus(
                            INFO, '%d parsed, %d obtained' %
                            (len(seen), len(newimset)))
            printStatus(INFO,
                        '%d parsed, %d obtained' % (len(seen), len(newimset)))

    printStatus(INFO, '%d parsed, %d obtained' % (len(seen), len(newimset)))

    result_dir = os.path.split(resfile)[0]
    idfile = os.path.join(result_dir, 'id.txt')
    with open(idfile, 'w') as fw:
        fw.write(' '.join(newimset))

    shapefile = os.path.join(result_dir, 'shape.txt')
    with open(shapefile, 'w') as fw:
        fw.write('%d %d' % (len(newimset), feat_dim))
예제 #37
0
def process(options, workingCollection, feature):
    """Build a k-NN graph Laplacian over a collection and save it as .mat.

    For every image of ``workingCollection`` its nearest neighbours are
    fetched, symmetric (i, j, distance) triplets are collected, duplicate
    (i, j) pairs are dropped, distances are turned into weights
    ``exp(-d / sigma)`` with ``sigma = median(d) ** 2``, and the matrix
    "column sums on the diagonal, negated weights off the diagonal" is
    written to ``laplacianI.mat`` under the keys ``im_similarity`` and
    ``sigma``.

    Args:
        options: provides ``rootpath``, ``kratio``, ``distance`` and
            ``overwrite``.
        workingCollection: name of the collection to process.
        feature: feature name passed on to the neighbour search.

    Returns:
        0 when the result file is skipped, otherwise None.  The process is
        terminated with ``sys.exit(1)`` when the neighbour search raises
        ValueError or yields fewer than the required number of neighbours.
    """
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'LaplacianI',
                              workingCollection,
                              '%s,%s,%f' % (feature, nnName, k_ratio),
                              'laplacianI.mat')

    if checkToSkip(resultfile, overwrite):
        return 0

    # Sorted so that image ids can be mapped to row indices via bisect_index.
    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()

    tot_images = len(workingSet)
    printStatus(INFO, '%d images' % (tot_images))

    # The neighbourhood size is a fraction of the collection size.
    K_neighs = int(math.floor(len(workingSet) * k_ratio))
    printStatus(
        INFO,
        '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    # Triplet (row, column, value) buffers sized for the worst case: every
    # neighbour relation is stored twice, once per direction.
    printStatus(INFO, 'Allocating I,J,V arrays')
    I = np.zeros((K_neighs * tot_images * 2))
    J = np.zeros((K_neighs * tot_images * 2))
    V = np.zeros((K_neighs * tot_images * 2))
    n_entries = 0

    # distances
    printStatus(INFO, 'Starting to fill I,J,V arrays')
    for i in xrange(tot_images):
        try:
            # Ask for twice as many neighbours as needed, since some of them
            # are filtered out below.
            neighbors = _get_neighbors(
                '%s,%s' % (workingCollection, workingSet[i]), rootpath,
                K_neighs * 2, feature, distance)
            # remove images with features but not in the working set
            NNrow = []
            NNDrow = []
            new_neighs = []
            for x in neighbors:
                # x[0] is looked up in workingSet, x[1] becomes the edge
                # value; a ValueError from the lookup drops the neighbour.
                try:
                    NNrow.append(bisect_index(workingSet, x[0]))
                    NNDrow.append(x[1])
                    new_neighs.append(x)
                except ValueError:
                    pass
            #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
            #NNDrow = np.array([x[1] for x in neighbors])
            NNrow = np.array(NNrow)
            NNDrow = np.array(NNDrow)
            neighbors = new_neighs[0:K_neighs]
        except ValueError:
            printStatus(
                INFO,
                'ERROR: id_img %s has non-standard format!' % (workingSet[i]))
            sys.exit(1)

        if len(neighbors) < K_neighs:
            printStatus(
                INFO, 'ERROR: id_img %s has %d < %d neighbors!' %
                (workingSet[i], len(neighbors), K_neighs))
            sys.exit(1)

        if (i + 1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i + 1, tot_images))
        # Store every edge in both directions; self-edges are left out
        # because the diagonal is filled in separately further down.
        for k in xrange(K_neighs):
            if i != int(NNrow[k]):  # -1 zero on the diagonal for a later step
                I[n_entries] = i
                J[n_entries] = int(NNrow[k])  # -1
                V[n_entries] = NNDrow[k]
                n_entries += 1
                I[n_entries] = int(NNrow[k])  # -1
                J[n_entries] = i
                V[n_entries] = NNDrow[k]
                n_entries += 1

    # Trim the buffers to the entries actually written.
    I = I[0:n_entries]
    J = J[0:n_entries]
    V = V[0:n_entries]

    printStatus(INFO, 'Removing duplicates')
    # Order by row, then column, then value.
    ind = np.lexsort((V, J, I))
    I = I[ind]
    J = J[ind]
    V = V[ind]
    # View each (row, column) pair as one opaque item so np.unique can
    # de-duplicate whole pairs instead of single numbers.
    a = np.concatenate([I.reshape(1, len(I)), J.reshape(1, len(J))], axis=0).T
    b = np.ascontiguousarray(a).view(
        np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    del a
    # NOTE(review): presumably this keeps, for each pair, the entry with the
    # smallest value thanks to the ordering above -- confirm that
    # return_index yields first occurrences for this dtype.
    _, idx = np.unique(b, return_index=True)
    del b

    I = I[idx]
    J = J[idx]
    V = V[idx]

    printStatus(INFO, 'Computing the final laplacian matrix')
    # Turn the stored values into similarities with a data-driven bandwidth.
    sigma = np.median(V)**2.
    printStatus(INFO, 'Estimated sigma^2 = %f' % sigma)
    V = np.exp(-V / sigma)

    # Column sums of the similarity matrix become the diagonal entries ...
    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tocsr()
    new_diag = matrix.sum(axis=0).T
    # ... and the off-diagonal entries are the negated similarities.
    V = -V

    I_add = np.zeros((tot_images))
    J_add = np.zeros((tot_images))
    V_add = np.zeros((tot_images))
    for i, v in enumerate(new_diag):
        I_add[i] = i
        J_add[i] = i
        V_add[i] = v

    I = np.append(I, I_add)
    J = np.append(J, J_add)
    V = np.append(V, V_add)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tolil()

    printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'im_similarity': matrix, 'sigma': sigma})
예제 #38
0
# Script: merge the feature data of two collections into a combined
# "<coll1>+<coll2>" collection.  This part prepares the output directories
# and writes the merged shape file.
codepath = "/home/urix/shared/tagrelcodebase"
datapath = ROOT_PATH

if len(sys.argv) < 4:
    print("Usage: merge_datasets.py trainCollection testCollection feature")
    sys.exit(1)

coll1 = sys.argv[1]
coll2 = sys.argv[2]
feature = sys.argv[3]

coll1_features_file = "%s/%s/FeatureData/%s/feature.bin" % (datapath, coll1, feature)
coll2_features_file = "%s/%s/FeatureData/%s/feature.bin" % (datapath, coll2, feature)
new_features_file = "%s/%s+%s/FeatureData/%s/feature.bin" % (datapath, coll1, coll2, feature)
makedirsforfile(new_features_file)

coll1_shape_file = "%s/%s/FeatureData/%s/shape.txt" % (datapath, coll1, feature)
coll2_shape_file = "%s/%s/FeatureData/%s/shape.txt" % (datapath, coll2, feature)
new_shape_file = "%s/%s+%s/FeatureData/%s/shape.txt" % (datapath, coll1, coll2, feature)
makedirsforfile(new_shape_file)

# shape file
# Each shape.txt holds "<number of images> <feature dimension>".  Both inputs
# are read and validated before the output is opened, so a failure no longer
# leaves an empty shape file behind.  The values stay strings, as before.
with open(coll1_shape_file) as fin:
    imA, featA = fin.read().split()
with open(coll2_shape_file) as fin:
    imB, featB = fin.read().split()
# Compare numerically so differing formatting of the same number still matches.
assert int(featA) == int(featB)

with open(new_shape_file, 'w') as fout:
    fout.write('%d %d' % (int(imA) + int(imB), int(featA)))

# copy and concatenate features
예제 #39
0
def process(options, testCollection, trainCollection, annotationName, feature,
            outputpkl):
    """Run TagProp through MATLAB and dump the predictions as a pickle.

    Steps:
      1. Unless a model file already exists (and ``options.trainmodel`` is
         false), learn a TagProp model on ``trainCollection`` with a
         generated MATLAB script.
      2. Predict tag scores for ``testCollection`` with a second MATLAB
         script, stored as ``prediction.mat``.
      3. Re-order the predicted columns to the concepts of
         ``annotationName`` and pickle them to ``outputpkl`` as a dict with
         the keys ``concepts``, ``id_images`` and ``scores``.

    Args:
        options: provides ``rootpath``, ``k``, ``distance``, ``variant``,
            ``overwrite`` and ``trainmodel``.
        testCollection: collection to predict tags for.
        trainCollection: collection the model is learnt from.
        annotationName: annotation set defining the output concepts.
        feature: feature name used in the neighbour-file paths.
        outputpkl: path of the pickle file to write.

    Returns:
        0 when the results are skipped, otherwise None.  The process is
        terminated with ``sys.exit(1)`` when a required input file is
        missing.
    """
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    variant = options.variant
    overwrite = options.overwrite
    testset = testCollection
    forcetrainmodel = options.trainmodel
    modelName = "tagprop"
    nnName = distance + "knn"

    printStatus(
        INFO, "Starting TagProp %s,%s,%s,%s,%s" %
        (variant, trainCollection, testCollection, annotationName, feature))

    resultfile = os.path.join(outputpkl)
    resultfile_tagprop = os.path.join(
        rootpath, testCollection, 'TagProp-Prediction', testset,
        trainCollection, annotationName, modelName,
        '%s,%s,%s,%d' % (feature, nnName, variant, k), 'prediction.mat')
    if checkToSkip(resultfile, overwrite) or checkToSkip(
            resultfile_tagprop, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, trainCollection, 'TextData',
                                  'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tagmatrix_file):
        printStatus(
            INFO,
            "Tag matrix file not found at %s Did you run wordnet_frequency_tags.py?"
            % (tagmatrix_file))
        sys.exit(1)

    train_neighs_file = os.path.join(rootpath, trainCollection, 'TagProp-data',
                                     trainCollection,
                                     '%s,%s,%d' % (feature, nnName, k),
                                     'nn_train.h5')
    if not os.path.exists(train_neighs_file):
        printStatus(
            INFO,
            "Matlab train neighbors file not found at %s Did you run prepare_tagprop_data.py?"
            % (train_neighs_file))
        sys.exit(1)

    # do we need to perform learning?
    train_model_file = os.path.join(
        rootpath, trainCollection, 'TagProp-models',
        '%s,%s,%s,%d' % (feature, nnName, variant, k), 'model.mat')
    if os.path.exists(train_model_file) and not forcetrainmodel:
        printStatus(
            INFO, "model for %s available at %s" %
            (trainCollection, train_model_file))
    else:
        printStatus(INFO, "starting learning model for %s" % (trainCollection))
        makedirsforfile(train_model_file)

        # The first neighbour row is dropped (NN(2:end, :)) before learning.
        script = """
                tagprop_path = 'model_based/tagprop/TagProp/';
                addpath(tagprop_path);
                tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
                tagmatrix = sparse(tagmatrix);
                NN = h5read('%s', '/NN');
                NN = NN(2:end, :);
                NN = double(NN);
        """ % (tagmatrix_file, train_neighs_file)

        # Distance-based variants additionally need the neighbour distances.
        if variant == 'dist' or variant == 'distsigmoids':
            script += """
                NND = h5read('%s', '/NND');
                NND = NND(2:end, :);
                NND = reshape(NND, 1, size(NND,1), size(NND,2));
                NND = double(NND);
            """ % train_neighs_file

        # NOTE(review): any other variant adds no learning call, so the
        # MATLAB script would save an undefined `m` -- confirm that variant
        # is validated by the caller.
        if variant == 'rank':
            script += """
                m = tagprop_learn(NN,[],tagmatrix);
            """
        elif variant == 'ranksigmoids':
            script += """
                m = tagprop_learn(NN,[],tagmatrix,'sigmoids',true);
            """
        elif variant == 'dist':
            script += """
                m = tagprop_learn(NN,NND,tagmatrix,'type','dist');
            """
        elif variant == 'distsigmoids':
            script += """
                m = tagprop_learn(NN,NND,tagmatrix,'type','dist','sigmoids',true);
            """

        script += """
                save('%s', 'm', '-v7.3');
                exit;
        """ % train_model_file

        call_matlab(script)

    # we perform prediction
    printStatus(INFO, "starting prediction")
    test_neighs_file = os.path.join(rootpath, testCollection, 'TagProp-data',
                                    testset, trainCollection, annotationName,
                                    '%s,%s,%d' % (feature, nnName, k),
                                    'nn_test.h5')
    if not os.path.exists(test_neighs_file):
        printStatus(
            INFO,
            "Matlab test neighbors file not found at %s Did you run prepare_tagprop_data.py?"
            % (test_neighs_file))
        sys.exit(1)

    script = """
            tagprop_path = 'model_based/tagprop/TagProp/';
            addpath(tagprop_path);
            load('%s');
            tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
            tagmatrix = sparse(tagmatrix);
            NNT = h5read('%s', '/NNT');
            NNT = double(NNT);

    """ % (train_model_file, tagmatrix_file, test_neighs_file)

    if variant == 'dist' or variant == 'distsigmoids':
        script += """
            NNDT = h5read('%s', '/NNDT');
            NNDT = reshape(NNDT, 1, size(NNDT,1), size(NNDT,2));
            NNDT = double(NNDT);
        """ % test_neighs_file

    # NOTE(review): NNDT is loaded for the distance-based variants but the
    # prediction call below always passes [] -- confirm whether NNDT should
    # be passed for 'dist' / 'distsigmoids'.
    script += """
            P = tagprop_predict(NNT,[],m)';
            save('%s', '-v7.3');
            exit;
    """ % resultfile_tagprop

    makedirsforfile(resultfile_tagprop)
    call_matlab(script)

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)

    concepts = readConcepts(testCollection, annotationName, rootpath)
    id_images = readImageSet(testCollection, testset, rootpath)
    id_images.sort()
    # id_images = map(int, id_images)

    # concepts mapping
    # Everything needed is copied into memory, then both HDF5 files are
    # closed even if the mapping or the slicing fails.
    tagprop_output = h5py.File(resultfile_tagprop, 'r')
    try:
        tagprop_input = h5py.File(tagmatrix_file, 'r')
        try:
            mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)
            final_tagmatrix = tagprop_output['P'][:][:, mapping]
        finally:
            tagprop_input.close()
    finally:
        tagprop_output.close()

    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
    # binary mode; text mode can corrupt it on some platforms.
    with open(resultfile, 'wb') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': id_images,
                'scores': final_tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
예제 #40
0
def process(options, feature, srcCollections, newCollection):
    """Merge one feature of several source collections into a new collection.

    For every collection in ``srcCollections`` the files ``shape.txt``,
    ``id.txt`` and ``feature.bin`` are read from
    ``<rootpath>/<collection>/FeatureData/<feature>/``.  The vectors of the
    selected images are appended to
    ``<rootpath>/<newCollection>/FeatureData/<feature>/feature.bin`` and the
    matching ``id.txt`` and ``shape.txt`` are written next to it.

    An image id is only considered the first time it is met, so when the same
    id occurs in several collections the earliest collection wins.  If
    ``<rootpath>/<newCollection>/ImageSets/<newCollection>.txt`` can be read
    and is not empty, only the ids listed there are kept; otherwise every
    image is merged.

    ``shape.txt`` records the feature dimension of the last collection that
    was read; no check is made that all collections share that dimension.

    Args:
        options: object providing the ``rootpath`` and ``overwrite`` attributes.
        feature: name of the feature sub-directory.
        srcCollections: list of source collection names (duplicates are
            ignored, first occurrence order is kept).
        newCollection: name of the collection to create.

    Returns:
        0 when ``checkToSkip`` decides the result file must not be rebuilt,
        otherwise None.
    """
    assert isinstance(srcCollections, list), 'srcCollections must be a list'

    # Unique source collections, first occurrence first.
    unique_collections = []
    for collection in srcCollections:
        if collection not in unique_collections:
            unique_collections.append(collection)
    srcCollections = unique_collections

    rootpath = options.rootpath

    resfile = os.path.join(rootpath, newCollection, 'FeatureData', feature, 'feature.bin')
    if checkToSkip(resfile, options.overwrite):
        return 0

    querysetfile = os.path.join(rootpath, newCollection, 'ImageSets', '%s.txt' % newCollection)
    try:
        with open(querysetfile) as fr:
            query_set = set(line.strip() for line in fr)
    except IOError:
        printStatus(INFO, 'failed to load %s, will merge all features in %s' % (querysetfile, ';'.join(srcCollections)))
        query_set = None
    else:
        printStatus(INFO, '%d images wanted' % len(query_set))

    makedirsforfile(resfile)
    seen = set()
    newimset = []
    # Stays 0 only when there is no source collection at all, so that
    # shape.txt can still be written ("0 0") instead of failing on an
    # unbound name.
    feat_dim = 0

    with open(resfile, 'wb') as fw:
        printStatus(INFO, 'writing results to %s' % resfile)

        for collection in srcCollections:
            feat_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
            with open(os.path.join(feat_dir, 'shape.txt')) as fr:
                _nr_of_images, feat_dim = map(int, fr.readline().strip().split())
            with open(os.path.join(feat_dir, 'id.txt')) as fr:
                srcimset = fr.readline().strip().split()

            buf = array.array('f')
            with open(os.path.join(feat_dir, 'feature.bin'), 'rb') as fr:
                for i, im in enumerate(srcimset):
                    # Always consume the vector so the file position stays
                    # aligned with id.txt, even when the image is skipped.
                    buf.fromfile(fr, feat_dim)
                    if im not in seen:
                        seen.add(im)
                        # A missing (None) or empty image set means "keep all".
                        if not query_set or im in query_set:
                            np.array(buf, dtype=np.float32).tofile(fw)
                            newimset.append(im)
                    del buf[:]
                    if i % 100000 == 0:
                        printStatus(INFO, '%d parsed, %d obtained' % (len(seen), len(newimset)))
            printStatus(INFO, '%d parsed, %d obtained' % (len(seen), len(newimset)))

    printStatus(INFO, '%d parsed, %d obtained' % (len(seen), len(newimset)))

    result_dir = os.path.dirname(resfile)
    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(newimset))

    with open(os.path.join(result_dir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(newimset), feat_dim))
예제 #41
0
def _write_avs_run(run_file, errors, video_ids, query_numbers, max_rank=1000):
    """Write the ranked video list of each query to ``run_file``.

    For every 1-based ``num`` in ``query_numbers`` the column
    ``errors[:, num - 1]`` is sorted in ascending order and the first
    ``max_rank`` videos are written, one per line, as
    ``15NN 0 <video id> <rank> <max_rank - rank> ITI-CERTH``.
    The file is opened once and closed even if writing fails.
    """
    with open(run_file, 'w') as fw:
        for num in query_numbers:
            ranking = np.argsort(errors[:, num - 1])[:max_rank]
            for rank, ind in enumerate(ranking, start=1):
                fw.write('15%02d 0 %s %d %d ITI-CERTH\n' %
                         (num, video_ids[ind], rank, max_rank - rank))


def _evaluate_avs_run(run_file, qrels_file):
    """Run ``AVS/sample_eval.pl`` on ``run_file`` and store its output.

    The output goes to ``run_file`` with its last four characters replaced
    by ``_results.txt``.  The exit status of the command is ignored.
    """
    result_file = run_file[:-4] + '_results.txt'
    command = "perl AVS/sample_eval.pl -q {} {} > {}".format(
        qrels_file, run_file, result_file)
    os.system(command)


def main():
    """Rank the test videos for the AVS queries and evaluate the rankings.

    Steps:
      1. parse the command line and load the checkpoint
         ``<logger_name>/<checkpoint_name>``;
      2. rebuild the model from the options stored in the checkpoint;
      3. embed every line of ``AVS/tv16_17_18.avs.topics_parsed.txt``;
      4. embed all test videos batch by batch and pass the video and query
         embeddings to ``cosine_calculate``;
      5. write one run file for query lines 1-30, 31-60 and 61-90
         (AVS16 / AVS17 / AVS18) and evaluate each with
         ``AVS/sample_eval.pl``.

    The process exits with status 0 without doing any work when the
    checkpoint is missing, when the AVS17 run file already exists, or when
    ``checkToSkip`` says the output directory is already populated.
    """
    opt = parse_args()
    print(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    evalpath = opt.evalpath
    testCollection = opt.testCollection
    batchsize = opt.batch_size

    resume = os.path.join(opt.logger_name, opt.checkpoint_name)

    if not os.path.exists(resume):
        logging.info(resume + ' not exists.')
        sys.exit(0)

    saveFile_AVS16 = (opt.logger_name + '/AVS16_' + testCollection +
                      '_Dense_Dual_model_bin.txt')
    saveFile_AVS17 = (opt.logger_name + '/AVS17_' + testCollection +
                      '_Dense_Dual_model_bin.txt')
    saveFile_AVS18 = (opt.logger_name + '/AVS18_' + testCollection +
                      '_Dense_Dual_model_bin.txt')

    # Only the AVS17 run file is used as the "already done" marker, and this
    # check does not look at opt.overwrite.
    if os.path.exists(saveFile_AVS17):
        sys.exit(0)

    queriesFile = 'AVS/tv16_17_18.avs.topics_parsed.txt'
    with open(queriesFile) as fr:
        lineList = [line.rstrip('\n') for line in fr]

    checkpoint = torch.load(resume)
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
        resume, start_epoch, best_rsum))
    options = checkpoint['opt']

    # Give checkpoints that lack these options a default value.
    if not hasattr(options, 'do_visual_feas_norm'):
        options.do_visual_feas_norm = 0

    if not hasattr(options, 'concate'):
        options.concate = "full"

    trainCollection = options.trainCollection
    output_dir = resume.replace(trainCollection, testCollection)
    output_dir = output_dir.replace('/%s/' % options.cv_name,
                                    '/results/%s/' % trainCollection)
    pred_error_matrix_file = os.path.join(output_dir,
                                          'pred_errors_matrix.pth.tar')
    if checkToSkip(pred_error_matrix_file, opt.overwrite):
        sys.exit(0)
    makedirsforfile(pred_error_matrix_file)

    # data loader prepare
    img_feat_path = os.path.join(evalpath, testCollection, 'FeatureData',
                                 options.visual_feature)
    visual_feat = BigFile(img_feat_path)
    assert options.visual_feat_dim == visual_feat.ndims
    video2frames = read_dict(
        os.path.join(evalpath, testCollection, 'FeatureData',
                     options.visual_feature, 'video2frames.txt'))

    # set bow vocabulary and encoding
    bow_vocab_file = os.path.join(rootpath, options.trainCollection,
                                  'TextData', 'vocabulary', 'bow',
                                  options.vocab + '.pkl')
    with open(bow_vocab_file, 'rb') as fr:
        bow_vocab = pickle.load(fr)
    bow2vec = get_text_encoder('bow')(bow_vocab)
    options.bow_vocab_size = len(bow_vocab)

    # set rnn vocabulary
    rnn_vocab_file = os.path.join(rootpath, options.trainCollection,
                                  'TextData', 'vocabulary', 'rnn',
                                  options.vocab + '.pkl')
    with open(rnn_vocab_file, 'rb') as fr:
        rnn_vocab = pickle.load(fr)
    options.vocab_size = len(rnn_vocab)

    # initialize word embedding
    options.we_parameter = None
    if options.word_dim == 500:
        w2v_data_path = os.path.join(rootpath, "word2vec", 'flickr',
                                     'vec500flickr30m')
        options.we_parameter = get_we_parameter(rnn_vocab, w2v_data_path)

    # Construct the model
    model = get_model(options.model)(options)
    model.load_state_dict(checkpoint['model'])
    model.Eiters = checkpoint['Eiters']
    # switch to evaluate mode
    model.val_start()

    videoIDs = list(video2frames.keys())

    # Queries embeddings
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    # forward_emb is called with a (video, caption) pair, so every query is
    # paired with the same dummy video; only the caption embedding is kept.
    dummy_video = videoIDs[0]
    queryEmbeddingsTMP = []
    for quer in lineList:
        data = dataLoadedVideoText_one(video2frames, dummy_video,
                                       visual_feat, quer, bow2vec,
                                       rnn_vocab, tokenizer, options)
        videos, captions = collate_frame_gru_fn(data)
        # compute the embeddings
        _, cap_emb = model.forward_emb(videos, captions, True)
        # preserve the embeddings by copying from gpu and converting to numpy
        cap_embs = cap_emb.data.cpu().numpy().copy()
        queryEmbeddingsTMP.append(cap_embs[0])

    queryEmbeddings = np.stack(queryEmbeddingsTMP)

    start = time.time()
    errorlistList = []

    for i in range(0, len(videoIDs), batchsize):
        videBatch = videoIDs[i:i + batchsize]

        # Symmetrically, every video is paired with the first query as a
        # dummy caption; only the video embedding is kept.
        data = []
        for bb in videBatch:
            data.extend(
                dataLoadedVideoText_one(video2frames, bb, visual_feat,
                                        lineList[0], bow2vec, rnn_vocab,
                                        tokenizer, options))
        videos, captions = collate_frame_gru_fn(data)

        # compute the embeddings
        vid_emb, _ = model.forward_emb(videos, captions, True)
        # preserve the embeddings by copying from gpu and converting to numpy
        video_embs = vid_emb.data.cpu().numpy().copy()

        # calculate cosine distance
        errorlistList.extend(cosine_calculate(video_embs, queryEmbeddings))

        if i % 100000 == 0:
            end = time.time()
            print(str(i) + ' in: ' + str(end - start))
            start = time.time()

    errorlist = np.asarray(errorlistList)

    # (run file, qrels file, first query number, last query number)
    n_queries = len(lineList)
    runs = (
        (saveFile_AVS16, 'AVS/avs.qrels.tv16', 1, 30),
        (saveFile_AVS17, 'AVS/avs.qrels.tv17', 31, 60),
        (saveFile_AVS18, 'AVS/avs.qrels.tv18', 61, 90),
    )

    # Rows of errorlist follow the order of videoIDs because the batches are
    # consecutive slices of that list.
    for run_file, _, first, last in runs:
        _write_avs_run(run_file, errorlist, videoIDs,
                       range(first, min(last, n_queries) + 1))

    for run_file, qrels_file, _, _ in runs:
        _evaluate_avs_run(run_file, qrels_file)