Example #1
def process(options, collection, annotationName, simdir, resultfile):
    rootpath = options.rootpath
    
    if checkToSkip(resultfile, options.overwrite):
        return 0
    
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)

    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print ('%d instances, %d concepts to dump -> %s' % (image_num, concept_num, resultfile))
    
    scores = np.zeros((image_num, concept_num)) - 1e4
    
    for c_id,concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        ranklist = readRankingResults(simfile)
        for im,score in ranklist:
            idx = im2index[im]
            scores[idx,c_id] = score

    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'concepts':concepts, 'id_images':map(int,id_images), 'scores':scores}, output, -1)
    output.close()
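For reference, a minimal sketch of reading the dictionary dumped above back in (the file name is hypothetical; the keys match the dump call):

import pickle

with open('scores.pkl', 'rb') as f:       # hypothetical path to a file written by process()
    data = pickle.load(f)
concepts = data['concepts']               # list of concept names
id_images = list(data['id_images'])       # image ids as ints
scores = data['scores']                   # ndarray of shape (image_num, concept_num); -1e4 marks unranked
print(concepts[0], id_images[0], scores[0, 0])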
Example #2
    def __init__(self,
                 collection,
                 annotationName,
                 feature,
                 distance,
                 tpp=DEFAULT_TPP,
                 rootpath=ROOT_PATH,
                 k=DEFAULT_K):
        self.rootpath = rootpath
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(
            zip(self.concepts, range(self.nr_of_concepts)))

        self.imset = readImageSet(collection, collection, rootpath)
        self.nr_of_images = len(self.imset)
        self.knndir = os.path.join(collection,
                                   '%s,%sknn,1500' % (feature, distance))

        self.k = k
        self.noise = 0

        self._load_tag_data(collection, tpp, rootpath)

        printStatus(
            INFO,
            "%s, %d images, %d unique tags, %s %d neighbours for voting" %
            (self.__class__.__name__, self.nr_of_images, len(
                self.tag2freq), distance, self.k))
Example #3
def process(options, collection, annotationName, simdir, resultfile):
    rootpath = options.rootpath

    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)

    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print('%d instances, %d concepts to dump -> %s' %
          (image_num, concept_num, resultfile))

    scores = np.zeros((image_num, concept_num)) - 1e4

    for c_id, concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        ranklist = readRankingResults(simfile)
        for im, score in ranklist:
            idx = im2index[im]
            scores[idx, c_id] = score

    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump(
        {
            'concepts': concepts,
            'id_images': map(int, id_images),
            'scores': scores
        }, output, -1)
    output.close()
Example #4
def process(options, testCollection, trainCollection, tagsimMethod):
    rootpath = options.rootpath
    overwrite = options.overwrite
    testsetName = options.testset if options.testset else testCollection 
    tpp = options.tpp
    numjobs = options.numjobs
    job = options.job
    useWnVob = 1

    outputName = tagsimMethod + '-wn' if useWnVob else tagsimMethod

    if tagsimMethod == 'wns':
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, outputName,'id.tagvotes.txt')
    else:    
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, trainCollection, outputName,'id.tagvotes.txt')
    if numjobs>1:
        resultfile = resultfile.replace("id.tagvotes.txt", "id.tagvotes.%d.%d.txt" % (numjobs,job))

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    makedirsforfile(resultfile)

    try:
        doneset = set([str.split(x)[0] for x in open(options.donefile).readlines()[:-1]])
    except:
        doneset = set()
        
    printStatus(INFO, "done set: %d" % len(doneset))

 
    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [testImageSet[i] for i in range(len(testImageSet)) if (i%numjobs+1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs,job,len(testImageSet),resultfile) )
    
    testreader = TagReader(testCollection, rootpath=rootpath)    

    if tagsimMethod == "wns":
        tagrel = SIM_TO_TAGREL["wns"](trainCollection, useWnVob, "wup", rootpath)
    else:
        tagrel = SIM_TO_TAGREL[tagsimMethod](trainCollection, useWnVob, rootpath)

 
    done = 0
    fw = open(resultfile, "w")
    
    for qry_id in testImageSet:
        qry_tags = testreader.get(qry_id)    
        tagvotes = tagrel.estimate(qry_tags)
        newline = qry_id + " " + " ".join(["%s %s" % (tag, niceNumber(vote,8)) for (tag,vote) in tagvotes])
        fw.write(newline+"\n")
        done += 1
        if done%1000 == 0:
            printStatus(INFO, "%d done" % done)
    # done    
    fw.close()
    printStatus(INFO, "%d done" % done)
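The writer above produces one line per image of the form "<image_id> <tag> <vote> <tag> <vote> ...". A minimal sketch of parsing such a line back (the helper name is illustrative):

def parse_tagvotes_line(line):
    parts = line.strip().split()
    image_id, rest = parts[0], parts[1:]
    # pair up alternating (tag, vote) tokens
    return image_id, [(rest[i], float(rest[i + 1])) for i in range(0, len(rest), 2)]

print(parse_tagvotes_line("12345 dog 0.9 beach 0.4"))
# -> ('12345', [('dog', 0.9), ('beach', 0.4)])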
Example #5
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    assert(modelName.startswith('fastlinear'))
    
    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    topk = options.topk
    
    outputName = '%s,%s' % (feature,modelName)
    
    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs>1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i%numjobs+1 == job]
    test_imset = set(test_imset)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,nr_of_test_images,resultfile))

    ma = ModelArray(trainCollection, trainAnnotationName, feature, modelName, rootpath=rootpath)
        
    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0

    feat_file.open()
    for _id, _vec in feat_file:
        if _id not in test_imset:
            continue
       
        res = ma.predict([_vec],prob=0)
        tagvotes = res[0]
        if topk>0:
            tagvotes = tagvotes[:topk]
        newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote,6)) for (tag,vote) in tagvotes]))
        fw.write(newline)
        done += 1
        if done % 1e4  == 0:
            printStatus(INFO, "%d done" % done)

    feat_file.close()
    fw.close()
    printStatus(INFO, "%d done" % (done))
    return done
Example #6
def process(options, testCollection):
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding_model = options.embedding
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    blocksize = 2000

    embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)
    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, embedding_name, pY0, 'id.tagvotes.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, embedding_name)
    i2v = Image2Vec(label_file, label2vec_dir)

    tagger = ZeroshotTagger(synset_name=Y1, embedding_name=embedding_name, rootpath=rootpath)

    imset = readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)
    

    printStatus(INFO, 'tagging %d images' % len(imset))
    makedirsforfile(resfile)
    fw = open(resfile, 'w')

    start = 0
    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end))
        todo = imset[start:end]
        if not todo:
            break

        renamed, vectors = feat_file.read(todo)
        output = []
        for _id,_vec in zip(renamed, vectors):
            im_vec = i2v.embedding(_vec)
            pred = tagger.predict(im_vec, topk=options.r)
            output.append('%s %s\n' % (_id, ' '.join(['%s %s'%(x[0],x[1]) for x in pred])))
        start = end
        fw.write(''.join(output))

    fw.close()
Example #7
def process(options, collection):
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, collection, "tagrel", collection,
                              'tagpos,%s' % tpp, 'id.tagvotes.txt')
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    imset = readImageSet(collection, collection, rootpath)
    printStatus(INFO,
                'working on %d test images -> %s' % (len(imset), resultfile))

    reader = TagReader(collection, tpp=tpp, rootpath=rootpath)

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")
    output = []
    done = 0

    for im in imset:
        tags = reader.get(im)
        tagSet = set()
        tagSeq = []
        for tag in str.split(tags):
            if tag not in tagSet:
                tagSeq.append(tag)
                tagSet.add(tag)
        assert (len(tagSeq) == len(tagSet))

        nr_tags = len(tagSeq)
        tagvotes = [(tagSeq[i], 1.0 - float(i) / nr_tags)
                    for i in range(nr_tags)]
        newline = "%s %s" % (im, " ".join(
            ["%s %g" % (x[0], x[1]) for x in tagvotes]))
        output.append(newline + "\n")
        done += 1

        if len(output) % 1e4 == 0:
            printStatus(
                INFO, '%d %s %s' % (done, im, ' '.join(
                    ['%s:%g' % (x[0], x[1]) for x in tagvotes[:3]])))
            fw.write("".join(output))
            fw.flush()
            output = []

    if output:
        fw.write("".join(output))
    fw.close()
    printStatus(INFO, 'done')
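A toy illustration of the position-based vote computed above: the i-th unique tag (0-based) of an image gets the score 1 - i/nr_tags, so tags listed earlier score higher (the tag string is assumed for illustration).

tags = "dog dog beach sunset"
seen, seq = set(), []
for tag in tags.split():
    if tag not in seen:
        seq.append(tag)
        seen.add(tag)
votes = [(t, 1.0 - float(i) / len(seq)) for i, t in enumerate(seq)]
print(votes)   # [('dog', 1.0), ('beach', 0.666...), ('sunset', 0.333...)]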
Example #8
 def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
     self.rootpath = rootpath
     self.concepts = readConcepts(collection, annotationName, rootpath)
     self.nr_of_concepts = len(self.concepts)
     self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))
     
     self.imset = readImageSet(collection, collection, rootpath)
     self.nr_of_images = len(self.imset)
     self.knndir = os.path.join(collection, '%s,%sknn,uu,1500' % (feature, distance))
     
     self.k = DEFAULT_K
     self.noise = 0
     
     self._load_tag_data(collection, tpp, rootpath)
     
     printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))
Example #9
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite
    random = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    id_images = []
    tag2idx = dict(zip(concepts, xrange(len(concepts))))
    with open(
            os.path.join(rootpath, workingCollection, 'TextData',
                         'id.userid.lemmtags.txt')) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            tags = tags.split()
            if len(tags) > 0:
                tags = [(tag2idx.get(x, -1), y)
                        for x, y in zip(tags, xrange(len(tags)))]
                idx = np.array([x[0] for x in tags])
                vals = 1. / (1. + np.array([x[1] for x in tags]))
                tagmatrix[cnt, idx] = vals

            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if random:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(
            tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': map(int, id_images),
                'scores': tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
Example #10
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)    
    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts':concepts, 'id_images':map(int, id_images), 'scores':tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example #11
def process(options, collection):
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    
    resultfile = os.path.join(rootpath, collection, "tagrel", collection, 'tagpos,%s'%tpp, 'id.tagvotes.txt')
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)    

    imset = readImageSet(collection, collection, rootpath)
    printStatus(INFO, 'working on %d test images -> %s' % (len(imset),resultfile))
    
    reader = TagReader(collection,tpp=tpp,rootpath=rootpath)   
    
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")
    output = []
    done = 0
    
    for im in imset:
        tags = reader.get(im)
        tagSet = set()
        tagSeq = []
        for tag in str.split(tags):
            if tag not in tagSet:
                tagSeq.append(tag)
                tagSet.add(tag)
        assert(len(tagSeq) == len(tagSet))
        
        nr_tags = len(tagSeq)
        tagvotes = [(tagSeq[i], 1.0-float(i)/nr_tags) for i in range(nr_tags)]
        newline = "%s %s" % (im, " ".join(["%s %g" % (x[0],x[1]) for x in tagvotes]))
        output.append(newline + "\n")
        done += 1
        
        if len(output)%1e4 == 0:
            printStatus(INFO, '%d %s %s' % (done,im,' '.join(['%s:%g' % (x[0],x[1]) for x in tagvotes[:3]] )))
            fw.write("".join(output))
            fw.flush()
            output = []
        
    if output:
        fw.write("".join(output))
    fw.close()
    printStatus(INFO, 'done')
Example #12
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': map(int, id_images),
                'scores': tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
Example #13
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite
    random = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    id_images = []
    tag2idx = dict(zip(concepts, xrange(len(concepts))))
    with open(os.path.join(rootpath, workingCollection, 'TextData', 'id.userid.lemmtags.txt')) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            tags = tags.split()
            if len(tags) > 0:
                tags = [(tag2idx.get(x,-1), y) for x,y in zip(tags, xrange(len(tags)))]
                idx = np.array([x[0] for x in tags])
                vals = 1. / (1. + np.array([x[1] for x in tags]))
                tagmatrix[cnt, idx] = vals

            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if random:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)    
    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts':concepts, 'id_images':map(int, id_images), 'scores':tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example #14
def process(options, workingCollection, feature):
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection, '%s,%s,%f'%(feature,nnName,k_ratio), 'laplacianI.mat')

    if checkToSkip(resultfile, overwrite):
        return 0

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()

    tot_images = len(workingSet)
    printStatus(INFO, '%d images' % (tot_images))

    K_neighs = int(math.floor(len(workingSet) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Allocating I,J,V arrays')
    I = np.zeros((K_neighs * tot_images * 2))
    J = np.zeros((K_neighs * tot_images * 2))
    V = np.zeros((K_neighs * tot_images * 2))
    n_entries = 0

    # distances
    printStatus(INFO, 'Starting to fill I,J,V arrays')
    for i in xrange(tot_images):
        try:
            neighbors = _get_neighbors('%s,%s' % (workingCollection, workingSet[i]), rootpath, K_neighs*2, feature, distance)
            # remove images with features but not in the working set
            NNrow = []
            NNDrow = []
            new_neighs = []
            for x in neighbors:
                try:
                    NNrow.append(bisect_index(workingSet, x[0]))
                    NNDrow.append(x[1])
                    new_neighs.append(x)
                except ValueError:
                    pass
            #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
            #NNDrow = np.array([x[1] for x in neighbors])
            NNrow = np.array(NNrow)
            NNDrow = np.array(NNDrow)
            neighbors = new_neighs[0:K_neighs]
        except ValueError:
            printStatus(INFO, 'ERROR: id_img %s has non-standard format!' % (workingSet[i]))
            sys.exit(1)

        if len(neighbors) < K_neighs:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (workingSet[i], len(neighbors), K_neighs))
            sys.exit(1)

        if (i+1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i+1, tot_images))
        for k in xrange(K_neighs):
            if i != int(NNrow[k]): # -1 zero on the diagonal for a later step
                I[n_entries] = i
                J[n_entries] = int(NNrow[k]) # -1
                V[n_entries] = NNDrow[k]
                n_entries += 1
                I[n_entries] = int(NNrow[k]) # -1
                J[n_entries] = i
                V[n_entries] = NNDrow[k]
                n_entries += 1

    I = I[0:n_entries]
    J = J[0:n_entries]
    V = V[0:n_entries]

    printStatus(INFO, 'Removing duplicates')
    ind = np.lexsort((V,J,I))
    I = I[ind]
    J = J[ind]
    V = V[ind]
    a = np.concatenate([I.reshape(1, len(I)), J.reshape(1, len(J))], axis=0).T
    b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    del a
    _, idx = np.unique(b, return_index=True)
    del b

    I = I[idx]
    J = J[idx]
    V = V[idx]

    printStatus(INFO, 'Computing the final laplacian matrix')
    sigma = np.median(V) ** 2.;
    printStatus(INFO, 'Estimated sigma^2 = %f' % sigma)
    V = np.exp(-V / sigma)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tocsr()
    new_diag = matrix.sum(axis=0).T
    V = -V

    I_add = np.zeros((tot_images))
    J_add = np.zeros((tot_images))
    V_add = np.zeros((tot_images))
    for i,v in enumerate(new_diag):
        I_add[i] = i
        J_add[i] = i
        V_add[i] = v

    I = np.append(I, I_add)
    J = np.append(J, J_add)
    V = np.append(V, V_add)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tolil()

    printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'im_similarity' : matrix, 'sigma' : sigma})
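For intuition, a minimal dense sketch of what the sparse construction above computes (toy distances, names illustrative): Gaussian affinities w_ij = exp(-d_ij / sigma) with sigma estimated as the squared median distance, and the unnormalized graph Laplacian L = D - W.

import numpy as np

# Toy symmetric distance matrix for three images (values assumed for illustration).
d = np.array([[0.0, 0.2, 0.8],
              [0.2, 0.0, 0.5],
              [0.8, 0.5, 0.0]])
sigma = np.median(d[d > 0]) ** 2      # sigma^2 estimated from the pairwise distances
W = np.exp(-d / sigma)                # Gaussian affinities
np.fill_diagonal(W, 0.0)              # no self-edges, as in the loop above
L = np.diag(W.sum(axis=0)) - W        # degree matrix minus affinities
print(L)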
Example #15
def process(options, testCollection, trainCollection, annotationName, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    overwrite = options.overwrite
    testset = testCollection
    onlytest = options.onlytest

    nnName = distance + "knn"
    resultfile_train = os.path.join(rootpath, trainCollection, 'TagProp-data',
                                    trainCollection,
                                    '%s,%s,%d' % (feature, nnName, k),
                                    'nn_train.h5')
    resultfile_test = os.path.join(rootpath, testCollection, 'TagProp-data',
                                   testset, trainCollection, annotationName,
                                   '%s,%s,%d' % (feature, nnName, k),
                                   'nn_test.h5')

    if (not onlytest
            and checkToSkip(resultfile_train, overwrite)) or checkToSkip(
                resultfile_test, overwrite):
        return 0

    testSet = readImageSet(testCollection, testset, rootpath)
    trainSet = readImageSet(trainCollection, trainCollection, rootpath)
    testSet.sort()
    trainSet.sort()

    #train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    #train_feat_file = BigFile(train_feat_dir)

    tagger = NAME_TO_TAGGER["preknn"](trainCollection,
                                      annotationName,
                                      feature,
                                      distance,
                                      rootpath=rootpath,
                                      k=1001)

    printStatus(
        INFO,
        '%d test images, %d train images' % (len(testSet), len(trainSet)))

    # allocate train -> train nearest neighbors
    if not onlytest:
        printStatus(INFO, 'Allocating NN, NND matrices')
        NN = np.zeros((len(trainSet), k + 1), dtype=np.int32)
        NND = np.zeros((len(trainSet), k + 1))

        printStatus(INFO, 'Filling NN, NND matrices')
        for i, id_img in enumerate(trainSet):
            neighbors = tagger._get_neighbors(content=None,
                                              context='%s,%s' %
                                              (trainCollection, id_img))
            if len(neighbors) < k + 1:
                printStatus(
                    INFO, 'ERROR: id_img %s has %d < %d neighbors!' %
                    (id_img, len(neighbors), k + 1))
                sys.exit(1)

            NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
            NNDrow = np.array([x[1] for x in neighbors])

            NN[i, :] = NNrow[0:k + 1]
            NND[i, :] = NNDrow[0:k + 1]

            if i % 1000 == 0:
                printStatus(INFO, '%d / %d images' % (i, len(trainSet)))

        printStatus(INFO,
                    'Saving train matrices to file %s' % (resultfile_train))
        makedirsforfile(resultfile_train)
        fout = h5py.File(resultfile_train, 'w')
        fout['NN'] = NN
        fout['NND'] = NND
        fout['trainSet'] = trainSet
        fout['concepts'] = tagger.concepts
        fout.close()

        del NN
        del NND

    # allocate test -> train nearest neighbors
    printStatus(INFO, 'Allocating NNT, NNDT matrices')
    NNT = np.zeros((len(testSet), k), dtype=np.int32)
    NNDT = np.zeros((len(testSet), k))

    printStatus(INFO, 'Filling NNT, NNDT matrices')
    for i, id_img in enumerate(testSet):
        neighbors = tagger._get_neighbors(content=None,
                                          context='%s,%s' %
                                          (testCollection, id_img))
        if len(neighbors) < k:
            printStatus(
                INFO, 'ERROR: id_img %s has %d < %d neighbors!' %
                (id_img, len(neighbors), k))
            sys.exit(1)

        NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
        NNDrow = np.array([x[1] for x in neighbors])

        NNT[i, :] = NNrow[0:k]
        NNDT[i, :] = NNDrow[0:k]

        if i % 1000 == 0:
            printStatus(INFO, '%d / %d images' % (i, len(testSet)))

    printStatus(INFO, 'Saving test matrices to file %s' % (resultfile_test))
    makedirsforfile(resultfile_test)
    fout = h5py.File(resultfile_test, 'w')
    fout['NNT'] = NNT
    fout['NNDT'] = NNDT
    fout['trainSet'] = trainSet
    fout['testSet'] = testSet
    fout['concepts'] = tagger.concepts
    fout.close()
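A minimal sketch (file path hypothetical) of reading the neighbour matrices written above back with h5py:

import h5py

with h5py.File('nn_test.h5', 'r') as f:   # hypothetical path to a file written above
    NNT = f['NNT'][:]     # (len(testSet), k) indices into the sorted trainSet
    NNDT = f['NNDT'][:]   # corresponding distances
    print(NNT.shape, NNDT.shape)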
Example #16
def process(options, testCollection, trainCollection, annotationName, feature, outputpkl):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    variant = options.variant
    overwrite = options.overwrite
    testset = testCollection
    forcetrainmodel = options.trainmodel
    modelName = "tagprop"
    nnName = distance + "knn"

    printStatus(INFO, "Starting TagProp %s,%s,%s,%s,%s" % (variant, trainCollection, testCollection, annotationName, feature))

    resultfile = os.path.join(outputpkl)
    resultfile_tagprop = os.path.join(rootpath, testCollection, 'TagProp-Prediction', testset, trainCollection, annotationName, modelName, '%s,%s,%s,%d'%(feature,nnName,variant,k), 'prediction.mat')
    # if checkToSkip(resultfile, overwrite) or checkToSkip(resultfile_tagprop, overwrite):
    #     return 0

    tagmatrix_file = os.path.join(rootpath, trainCollection, 'TextData', 'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, "Tag matrix file not found at %s Did you run wordnet_frequency_tags.py?" % (tagmatrix_file))
        sys.exit(1)

    train_neighs_file = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection, '%s,%s,%d'%(feature,nnName,k), 'nn_train.h5')
    if not os.path.exists(train_neighs_file):
        printStatus(INFO, "Matlab train neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (train_neighs_file))
        sys.exit(1)

    # do we need to perform learning?
    train_model_file = os.path.join(rootpath, trainCollection, 'TagProp-models', '%s,%s,%s,%d'%(feature,nnName,variant,k), 'model.mat')
    # if os.path.exists(train_model_file) and not forcetrainmodel:
    if False:
        printStatus(INFO, "model for %s available at %s" % (trainCollection, train_model_file))
    else:
        printStatus(INFO, "starting learning model for %s" % (trainCollection))
        makedirsforfile(train_model_file)

        # print(tagmatrix_file, train_neighs_file)
        # exit()
        script = """
                tagprop_path = '%s/model_based/tagprop/TagProp/';
                addpath(tagprop_path);
                tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
                tagmatrix = sparse(tagmatrix);
                NN = h5read('%s', '/NN');
                NN = NN(2:end, :);
                NN = double(NN);
        """ % (survey_code, tagmatrix_file, train_neighs_file)

        if variant == 'dist' or variant == 'distsigmoids':
            script += """
                NND = h5read('%s', '/NND');
                NND = NND(2:end, :);
                NND = reshape(NND, 1, size(NND,1), size(NND,2));
                NND = double(NND);
            """ % train_neighs_file

        if variant == 'rank':
            script += """
                m = tagprop_learn(NN,[],tagmatrix);
            """
        elif variant == 'ranksigmoids':
            script += """
                m = tagprop_learn(NN,[],tagmatrix,'sigmoids',true);
            """
        elif variant == 'dist':
            script += """
                m = tagprop_learn(NN,NND,tagmatrix,'type','dist');
            """
        elif variant == 'distsigmoids':
            script += """
                m = tagprop_learn(NN,NND,tagmatrix,'type','dist','sigmoids',true);
            """

        script += """
                save('%s', 'm', '-v7.3');
        """ % train_model_file

        # call_matlab(script)

    # print(script)
    # exit()

    # we perform prediction
    printStatus(INFO, "starting prediction")
    test_neighs_file = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection, annotationName, '%s,%s,%d'%(feature,nnName,k), 'nn_test.h5')
    if not os.path.exists(test_neighs_file):
        printStatus(INFO, "Matlab test neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (test_neighs_file))
        sys.exit(1)

    script += """
            tagprop_path = '%s/model_based/tagprop/TagProp/';
            addpath(tagprop_path);
            load('%s');
            tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
            tagmatrix = sparse(tagmatrix);
            NNT = h5read('%s', '/NNT');
            NNT = double(NNT);

    """ % (survey_code, train_model_file, tagmatrix_file, test_neighs_file)

    if variant == 'dist' or variant == 'distsigmoids':
        script += """
            NNDT = h5read('%s', '/NNDT');
            NNDT = reshape(NNDT, 1, size(NNDT,1), size(NNDT,2));
            NNDT = double(NNDT);
        """ % test_neighs_file

    script += """
            P = tagprop_predict(NNT,[],m)';
            save('%s', '-v7.3');
            exit;
    """ % resultfile_tagprop

    # print(script)
    makedirsforfile(resultfile_tagprop)
    call_matlab(script)
    # exit()

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)

    concepts = readConcepts(testCollection, annotationName, rootpath)
    id_images = readImageSet(testCollection, testset, rootpath)
    id_images.sort()
    # id_images = map(int, id_images)

    # concepts mapping
    tagprop_output = h5py.File(resultfile_tagprop, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]),concepts)

    final_tagmatrix = tagprop_output['P'][:][:,mapping]

    with open(resultfile, 'w') as f:
        pickle.dump({'concepts':concepts, 'id_images':id_images, 'scores':final_tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example #17
def process(options, workingCollection, feature):
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'LaplacianI',
                              workingCollection,
                              '%s,%s,%f' % (feature, nnName, k_ratio),
                              'laplacianI.mat')

    if checkToSkip(resultfile, overwrite):
        return 0

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()

    tot_images = len(workingSet)
    printStatus(INFO, '%d images' % (tot_images))

    K_neighs = int(math.floor(len(workingSet) * k_ratio))
    printStatus(
        INFO,
        '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Allocating I,J,V arrays')
    I = np.zeros((K_neighs * tot_images * 2))
    J = np.zeros((K_neighs * tot_images * 2))
    V = np.zeros((K_neighs * tot_images * 2))
    n_entries = 0

    # distances
    printStatus(INFO, 'Starting to fill I,J,V arrays')
    for i in xrange(tot_images):
        try:
            neighbors = _get_neighbors(
                '%s,%s' % (workingCollection, workingSet[i]), rootpath,
                K_neighs * 2, feature, distance)
            # remove images with features but not in the working set
            NNrow = []
            NNDrow = []
            new_neighs = []
            for x in neighbors:
                try:
                    NNrow.append(bisect_index(workingSet, x[0]))
                    NNDrow.append(x[1])
                    new_neighs.append(x)
                except ValueError:
                    pass
            #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
            #NNDrow = np.array([x[1] for x in neighbors])
            NNrow = np.array(NNrow)
            NNDrow = np.array(NNDrow)
            neighbors = new_neighs[0:K_neighs]
        except ValueError:
            printStatus(
                INFO,
                'ERROR: id_img %s has non-standard format!' % (workingSet[i]))
            sys.exit(1)

        if len(neighbors) < K_neighs:
            printStatus(
                INFO, 'ERROR: id_img %s has %d < %d neighbors!' %
                (workingSet[i], len(neighbors), K_neighs))
            sys.exit(1)

        if (i + 1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i + 1, tot_images))
        for k in xrange(K_neighs):
            if i != int(NNrow[k]):  # -1 zero on the diagonal for a later step
                I[n_entries] = i
                J[n_entries] = int(NNrow[k])  # -1
                V[n_entries] = NNDrow[k]
                n_entries += 1
                I[n_entries] = int(NNrow[k])  # -1
                J[n_entries] = i
                V[n_entries] = NNDrow[k]
                n_entries += 1

    I = I[0:n_entries]
    J = J[0:n_entries]
    V = V[0:n_entries]

    printStatus(INFO, 'Removing duplicates')
    ind = np.lexsort((V, J, I))
    I = I[ind]
    J = J[ind]
    V = V[ind]
    a = np.concatenate([I.reshape(1, len(I)), J.reshape(1, len(J))], axis=0).T
    b = np.ascontiguousarray(a).view(
        np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    del a
    _, idx = np.unique(b, return_index=True)
    del b

    I = I[idx]
    J = J[idx]
    V = V[idx]

    printStatus(INFO, 'Computing the final laplacian matrix')
    sigma = np.median(V)**2.
    printStatus(INFO, 'Estimated sigma^2 = %f' % sigma)
    V = np.exp(-V / sigma)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tocsr()
    new_diag = matrix.sum(axis=0).T
    V = -V

    I_add = np.zeros((tot_images))
    J_add = np.zeros((tot_images))
    V_add = np.zeros((tot_images))
    for i, v in enumerate(new_diag):
        I_add[i] = i
        J_add[i] = i
        V_add[i] = v

    I = np.append(I, I_add)
    J = np.append(J, J_add)
    V = np.append(V, V_add)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tolil()

    printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'im_similarity': matrix, 'sigma': sigma})
Example #18
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    #blocksize = options.blocksize
    topk = options.topk
    
    outputName = '%s,%s' % (feature,modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs>1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i%numjobs+1 == job]
    test_imset = set(test_imset)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,nr_of_test_images,resultfile))

    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath,trainCollection,'Models',trainAnnotationName,feature, modelName, '%s.model'%concepts[c])
        models[c] = load_model(model_file_name)
        if models[c] is None:
            return 0
        #(pA,pB) = model.get_probAB()
        

    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0

    feat_file.open()
    for _id, _vec in feat_file:
        if _id not in test_imset:
            continue
        if prob_output:
            scores = [models[c].predict_probability(_vec) for c in range(nr_of_concepts)]
        else:
            scores = [models[c].predict(_vec) for c in range(nr_of_concepts)]

        tagvotes = sorted(zip(concepts, scores), key=lambda v:v[1], reverse=True)
        if topk>0:
            tagvotes = tagvotes[:topk]
        newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote,6)) for (tag,vote) in tagvotes]))
        fw.write(newline)
        done += 1
        if done % 1e4  == 0:
            printStatus(INFO, "%d done" % done)

    feat_file.close()
    fw.close()
    printStatus(INFO, "%d done" % (done))
    return done
Example #19
def process(options, workingCollection, annotationName, feature, outputpkl):
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix
    modelName = "robustpca"
    nnName = distance + "knn"

    printStatus(INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f" % (workingCollection, annotationName, feature, nnName, k_ratio, lambda1, lambda2))

    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = os.path.join(outputpkl)
    resultfile_robustpca = os.path.join(rootpath, workingCollection, 'RobustPCA-Prediction', '%s,%s,%f,%f,%f,%d'%(feature,nnName,lambda1,lambda2,k_ratio,rawtagmatrix), 'prediction.mat')

    if checkToSkip(resultfile_robustpca, overwrite):
        only_dump = True
    else:
        only_dump = False

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'RobustPCA', '%s,%s,%f'%(feature,nnName,DEFAULT_K_PROP), 'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, "Tag matrix file not found at %s Did you run robustpca_preprocessing.py?" % (tagmatrix_file))
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData', "lemm_wordnet_freq_tags.h5")
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, 'Tag matrix file not found in %s Did you run wordnet_frequency_tags.py?' % (tagmatrix_file))
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection, '%s,%s,%f'%(feature,nnName,k_ratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, "LaplacianI file not found at %s Did you run laplacian_images.py?" % (laplacianI_file))
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT', '%f'%(ratio_cs), 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(INFO, "LaplacianT file not found at %s Did you run laplacian_tags.py?" % (laplacianT_file))
        sys.exit(1)

    # being learning
    script = """
        rpca_path = 'transduction_based/robustpca/';
        addpath(rpca_path);
        addpath([rpca_path, 'fast_svd/']);
        tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));
        load('%s');
        load('%s');

        lambda1 = %f;
        lambda2 = %f;
        maxIters = 50;
        precision = 1e-4;
        mu_start = 1.;

        parpool('local', 4);
        [P,E]=robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start);
        """ % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1, lambda2)

    script += """
        delete(gcp);
        save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3');
        exit;
    """ % resultfile_robustpca

    if not only_dump:
        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        testset_id_images = readImageSet(workingCollection.split('+')[1], workingCollection.split('+')[1], rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # concepts mapping
    robustpca_output = h5py.File(resultfile_robustpca, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]),concepts)

    predicted_tagmatrix = robustpca_output['P'][:,mapping]

    if outputonlytest:
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert(final_tagmatrix.shape[0] == idx.shape[0])
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts':concepts, 'id_images': id_images, 'scores':final_tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example #20
import sys, os, random

from basic.common import ROOT_PATH
from basic.util import readImageSet
from simpleknn.bigfile import BigFile

if __name__ == '__main__':
    rootpath = ROOT_PATH
    collection = sys.argv[1]
    feature = sys.argv[2]
    
    imset = readImageSet(collection, collection)
    feat_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)
   
    imset = random.sample(imset, 50) 
    #imset = imset[:5]
     
    renamed,vectors = feat_file.read(imset)
    for name,vec in zip(renamed,vectors):
        print name, sum(vec), sum(vec[:64]), sum(vec[64:])

        
    

Example #21
def process(options, trainCollection, testCollection, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    blocksize = options.blocksize
    uniqueUser = options.uu
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite
    testset = options.testset
    if not testset:
        testset = testCollection

    searchMethod = distance + 'knn'
    if uniqueUser:
        searchMethod += ",uu"
        tagfile = os.path.join(rootpath, trainCollection, 'TextData', 'id.userid.lemmtags.txt')
        im2user = {}
        for line in open(tagfile):
            im,userid,tags = line.split('\t')
            im2user[im] = userid
    
    resultdir = os.path.join(rootpath, testCollection, "SimilarityIndex", testset, trainCollection, "%s,%s,%d" % (feature,searchMethod,k))
    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')
    nr_of_images, feat_dim = map(int, open(shape_file).readline().split())
    nr_of_images = len(open(id_file).readline().strip().split())
    searcher = imagesearch.load_model(os.path.join(feat_dir, 'feature.bin'), feat_dim, nr_of_images, id_file)
    searcher.set_distance(distance)
        
    workingSet = readImageSet(testCollection, testset, rootpath=rootpath)
    workingSet = [workingSet[i] for i in range(len(workingSet)) if (i%numjobs+1) == job]
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,len(workingSet),resultdir))
    
    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
    test_feat_file = BigFile(test_feat_dir)

    read_time = 0
    knn_time = 0
    start = 0
    done = 0
    filtered = 0

    while start < len(workingSet):
        end = min(len(workingSet), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

        s_time = time.time()
        renamed,vectors = test_feat_file.read(workingSet[start:end])
        read_time += time.time() - s_time
        nr_images = len(renamed)
        
        s_time = time.time()
        for i in range(nr_images):
            resultfile = os.path.join(resultdir, renamed[i][-2:], '%s.txt' % renamed[i])
            if checkToSkip(resultfile, overwrite):
                continue
            knn = searcher.search_knn(vectors[i], max_hits=max(3000,k*3))
            if uniqueUser:
                removed, newknn = unique_user_constraint(knn, im2user, k)
                filtered += removed
                knn = newknn
            else:
                knn = knn[:k]
            assert(len(knn) >= k)
            writeRankingResults(knn, resultfile)
            done += 1
        printStatus(INFO, 'job %d-%d: %d done, filtered neighbors %d' % (numjobs, job, done, filtered))
        start = end

    printStatus(INFO, 'job %d-%d: %d done, filtered neighbors %d' % (numjobs, job, done, filtered))
Example #22
    testCollection = 'voc2008val'
    testAnnotationName = 'conceptsvoc2008val.txt'

    feature = 'dsift'
    modelName = 'fastlinear'
    modelName = 'fik50'
    metric = 'AP'
    scorer = getScorer(metric)

    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    test_imset = readImageSet(testCollection,
                              testCollection,
                              rootpath=rootpath)
    test_feat_file = BigFile(
        os.path.join(rootpath, testCollection, 'FeatureData', feature))
    test_renamed, test_vectors = test_feat_file.read(test_imset)

    concepts = readConcepts(testCollection,
                            testAnnotationName,
                            rootpath=rootpath)

    print('### %s' % os.path.join(trainCollection, 'Models',
                                  trainAnnotationName, feature, modelName))
    results = []

    for concept in concepts:
        model_file_name = os.path.join(rootpath, trainCollection, 'Models',
Example #23
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    #blocksize = options.blocksize
    topk = options.topk
    
    outputName = '%s,%s' % (feature,modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs>1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i%numjobs+1 == job]
    test_imset = set(test_imset)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,nr_of_test_images,resultfile))

    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath,trainCollection,'Models',trainAnnotationName,feature, modelName, '%s.model'%concepts[c])
        models[c] = load_model(model_file_name)
        if models[c] is None:
            return 0
        #(pA,pB) = model.get_probAB()
        

    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0

    feat_file.open()
    for _id, _vec in feat_file:
        if _id not in test_imset:
            continue
        if prob_output:
            scores = [models[c].predict_probability(_vec) for c in range(nr_of_concepts)]
        else:
            scores = [models[c].predict(_vec) for c in range(nr_of_concepts)]

        tagvotes = sorted(zip(concepts, scores), key=lambda v:v[1], reverse=True)
        if topk>0:
            tagvotes = tagvotes[:topk]
        newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote,6)) for (tag,vote) in tagvotes]))
        fw.write(newline)
        done += 1
        if done % 1e4  == 0:
            printStatus(INFO, "%d done" % done)

    feat_file.close()
    fw.close()
    printStatus(INFO, "%d done" % (done))
    return done
Example #24
def process(options, testCollection, trainCollection, feature):
    rootpath = options.rootpath
    overwrite = options.overwrite
    tpp = options.tpp
    doRandomwalk = 1  #options.doRandomwalk
    uniqueUser = 0  #options.uniqueUser
    k = 1000  #options.k
    numjobs = options.numjobs
    job = options.job

    #resultfile = os.path.join(rootpath, testCollection, "tagrel", testCollection, trainCollection,
    #                          "%s,tagrank%d%d,%d,%s" % (feature,doRandomwalk,uniqueUser,k,tpp), "id.tagvotes.txt")

    resultfile = os.path.join(rootpath, testCollection, "tagrel",
                              testCollection, trainCollection,
                              '%s,tagrank,%s' % (feature, tpp),
                              'id.tagvotes.txt')

    if numjobs > 1:
        resultfile = resultfile + '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    try:
        doneset = set(
            [str.split(x)[0] for x in open(options.donefile).readlines()[:-1]])
    except:
        doneset = set()

    printStatus(INFO, "done set: %d" % len(doneset))

    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [
        testImageSet[i] for i in range(len(testImageSet))
        if (i % numjobs + 1) == job
    ]
    printStatus(
        INFO, 'working on %d-%d, %d test images -> %s' %
        (numjobs, job, len(testImageSet), resultfile))

    testreader = TagReader(testCollection, rootpath=rootpath)
    test_feat_file = BigFile(
        os.path.join(rootpath, testCollection, 'FeatureData', feature))
    block_size = 100

    tagranking = TagRanking(trainCollection,
                            feature=feature,
                            k=k,
                            rootpath=rootpath)

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0

    nr_of_blocks = len(testImageSet) / block_size
    if nr_of_blocks * block_size < len(testImageSet):
        nr_of_blocks += 1

    for block_index in range(nr_of_blocks):
        start = block_index * block_size
        end = min(len(testImageSet), start + block_size)
        subset = testImageSet[start:end]
        if not subset:
            break
        renamed, features = test_feat_file.read(subset)
        printStatus(INFO, '%d - %d: %d images' % (start, end, len(subset)))

        output = []
        for i in range(len(renamed)):
            qry_id = renamed[i]
            qry_tags = testreader.get(qry_id)
            qry_vec = features[i]
            tagvotes = tagranking.estimate(
                qry_vec,
                qry_tags)  #, uniqueUser=uniqueUser, doRandomwalk=doRandomwalk)
            newline = "%s %s" % (qry_id, " ".join(
                ["%s %g" % (x[0], x[1]) for x in tagvotes]))
            output.append(newline + "\n")
            done += 1

        #printStatus(INFO, '%d %s %s' % (done,qry_id,' '.join(['%s:%g' % (x[0],x[1]) for x in tagvotes[:3]] )))
        fw.write("".join(output))
        fw.flush()

    fw.close()
    printStatus(INFO, 'done')
Example #25
rootpath = ROOT_PATH
conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', trainAnnotationName)

from basic.annotationtable import writeConceptsTo
writeConceptsTo(test_tags, trainCollection, trainAnnotationName)

cmd = '%s/util/imagesearch/obtain_labeled_examples.py %s %s' % (parent_dir, trainCollection, conceptfile)
os.system('python ' + cmd)

train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
from util.simpleknn.bigfile import BigFile
train_feat_file = BigFile(train_feat_dir) 
feat_dim = train_feat_file.ndims

from basic.util import readImageSet
test_imset = readImageSet(testCollection)
test_feat_dir = os.path.join(rootpath, testCollection, 'featureData', feature)
test_feat_file = BigFile(test_feat_dir)
#test_renamed, test_vectors = test_feat_file.read(test_imset)


from model_based.dataengine.positiveengine import PositiveEngine
from model_based.dataengine.negativeengine import NegativeEngine

pe = PositiveEngine(trainCollection)
ne = NegativeEngine(trainCollection)

for tag in test_tags:
    pos_set = pe.sample(tag, 100)
    neg_set = ne.sample(tag, 100)
    names = pos_set + neg_set
Example #26
def process(options, workingCollection, feature):
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite
    laplaciankratio = options.laplaciankratio

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'RobustPCA', '%s,%s,%f'%(feature,nnName,k_ratio), 'tagmatrix.h5')

    if checkToSkip(resultfile, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData', "lemm_wordnet_freq_tags.h5")
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, 'Tagmatrix file not found in %s Did you run wordnet_frequency_tags.py?' % (tagmatrix_file))
        sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection, '%s,%s,%f'%(feature,nnName,laplaciankratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, 'LaplacianI file not found in %s Did you run laplacian_images.py?' % (laplacianI_file))
        sys.exit(1)

    tagmatrix_data = h5py.File(tagmatrix_file, 'r')
    tagmatrix = tagmatrix_data['tagmatrix'][:]
    printStatus(INFO, 'tagmatrix.shape = %s' % (str(tagmatrix.shape)))

    laplacian_data = scipy.io.loadmat(laplacianI_file)
    sigma = laplacian_data['sigma']
    printStatus(INFO, 'Sigma^2 = %f' % (sigma))

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()
    #print map(int, workingSet)[0:10], map(int, list(tagmatrix_data['id_images'][:])[0:10])
    #assert(np.all(map(int, workingSet) == list(tagmatrix_data['id_images'][:])))
    assert(np.all(workingSet == list(tagmatrix_data['id_images'][:])))

    tot_images = len(workingSet)
    printStatus(INFO, '%d images in %s' % (tot_images, workingCollection))

    printStatus(INFO, 'Mean images per tag = %f' % (np.mean(tagmatrix.sum(axis=0))))
    K_neighs = int(math.floor(np.mean(tagmatrix.sum(axis=0)) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Starting the propagation pre-processing')
    tagmatrix_new = np.zeros(tagmatrix.shape)
    for i in xrange(tot_images):
        neighbors = _get_neighbors('%s,%s' % (workingCollection, workingSet[i]), rootpath, K_neighs * 2, feature, distance)

        #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
        #NNDrow = np.array([x[1] for x in neighbors])

        # remove images with features but not in the working set
        NNrow = []
        NNDrow = []
        new_neighs = []
        for x in neighbors:
            try:
                NNrow.append(bisect_index(workingSet, x[0]))
                NNDrow.append(x[1])
                new_neighs.append(x)
            except ValueError:
                pass
        #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
        #NNDrow = np.array([x[1] for x in neighbors])
        NNrow = np.array(NNrow)
        NNDrow = np.array(NNDrow)
        neighbors = new_neighs[0:K_neighs]
        
        C = np.sum(np.exp(-(NNDrow)/sigma))
        tagmatrix_new[i,:] = np.sum((np.exp(-(NNDrow)/sigma).T * tagmatrix[NNrow]) / C, axis=0);

        if (i+1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i+1, tot_images))

    # save output
    printStatus(INFO, 'Saving propagated tagmatrix to %s' % resultfile)
    makedirsforfile(resultfile)
    fout = h5py.File(resultfile, 'w')
    fout['tagmatrix'] = tagmatrix_new
    fout['vocab'] = tagmatrix_data['vocab'][:]
    fout['id_images'] = workingSet
    fout.close()
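For reference, the two np.exp lines in the loop above compute a normalised heat-kernel average of the neighbours' tag vectors. As a sketch in my own notation (not taken from the repository), with T the tag matrix, N(i) the retained neighbours of image i, d_ij the stored neighbour distances and sigma the value loaded from laplacianI.mat:

\[ T^{new}_{i,\cdot} \;=\; \sum_{j \in N(i)} \frac{\exp(-d_{ij}/\sigma)}{\sum_{k \in N(i)} \exp(-d_{ik}/\sigma)} \, T_{j,\cdot} \]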
Beispiel #27
0
render = web.template.render('templates/')

pwd = os.path.dirname(os.path.realpath(__file__))
config = json.load(open(os.path.join(pwd, 'config.json')))

max_hits = config['max_hits']
rootpath = config['rootpath']
collection = config['collection']
rankMethod = config['rankMethod']
annotationName = config['annotationName']
metric = config['metric']
scorer = getScorer(metric)

simdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection,
                      rankMethod)
imset = readImageSet(collection, collection, rootpath)


class index:
    def GET(self):
        input = web.input(query=None)
        resp = {
            'status': 0,
            'hits': 0,
            'random': [],
            'tagrel': [],
            'metric': metric,
            'perf': 0
        }

        if input.query:
Beispiel #28
0
# shape file
with open(new_shape_file, 'w') as fout:
	imA, featA = open(coll1_shape_file).read().strip().split(" ")
	imB, featB = open(coll2_shape_file).read().strip().split(" ")
	assert featA == featB

	fout.write('%d %d' % (int(imA) + int(imB), int(featA)))

# copy and concatenate features
file(new_features_file,'wb').write(file(coll1_features_file,'rb').read() + file(coll2_features_file,'rb').read())

# copy Annotations
shutil.copytree("%s/%s/Annotations" % (datapath, coll1), "%s/%s+%s/Annotations" % (datapath, coll1, coll2))

# read ids
testset_id_images = readImageSet(coll2, coll2, datapath)
testset_id_images = set(map(int, testset_id_images))

train_id_images = readImageSet(coll1, coll1, datapath)
train_id_images = set(map(int, train_id_images))

base_new_id = max(testset_id_images.union(train_id_images)) + 1
duplicates = testset_id_images.intersection(train_id_images)
duplicates = dict([(x, x+base_new_id) for x in duplicates])

print "Found %d duplicates." % len(duplicates)

# read id.txt
coll1_featid_file = "%s/%s/FeatureData/%s/id.txt" % (datapath, coll1, feature)
coll2_featid_file = "%s/%s/FeatureData/%s/id.txt" % (datapath, coll2, feature)
new_featid_file = "%s/%s+%s/FeatureData/%s/id.txt" % (datapath, coll1, coll2, feature)
Beispiel #29
0
def process(options, testCollection, trainCollection, feature):
    rootpath = options.rootpath
    overwrite = options.overwrite
    tpp = options.tpp
    doRandomwalk =  1 #options.doRandomwalk
    uniqueUser = 0 #options.uniqueUser
    k = 1000 #options.k
    numjobs = options.numjobs
    job = options.job
    
    #resultfile = os.path.join(rootpath, testCollection, "tagrel", testCollection, trainCollection, 
    #                          "%s,tagrank%d%d,%d,%s" % (feature,doRandomwalk,uniqueUser,k,tpp), "id.tagvotes.txt")
    
    resultfile = os.path.join(rootpath, testCollection, "tagrel", testCollection, trainCollection, '%s,tagrank,%s' % (feature,tpp), 'id.tagvotes.txt')
        
    if numjobs>1:
        resultfile = resultfile + '.%d.%d' % (numjobs, job)
                              
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)    

    try:
        doneset = set([str.split(x)[0] for x in open(options.donefile).readlines()[:-1]])
    except:
        doneset = set()
        
    printStatus(INFO, "done set: %d" % len(doneset))
    
    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [testImageSet[i] for i in range(len(testImageSet)) if (i%numjobs+1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs,job,len(testImageSet),resultfile) )
    
    testreader = TagReader(testCollection, rootpath=rootpath)   
    test_feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
    block_size = 100

    tagranking = TagRanking(trainCollection, feature=feature, k=k, rootpath=rootpath)
    
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")
    
    done = 0
    
    nr_of_blocks = len(testImageSet) / block_size
    if nr_of_blocks * block_size < len(testImageSet):
        nr_of_blocks += 1

    for block_index in range(nr_of_blocks):
        start = block_index * block_size
        end = min(len(testImageSet), start + block_size)
        subset = testImageSet[start:end]
        if not subset:
            break
        renamed, features = test_feat_file.read(subset)
        printStatus(INFO, '%d - %d: %d images' % (start, end, len(subset)))
        
        output = []
        for i in range(len(renamed)):
            qry_id = renamed[i]
            qry_tags = testreader.get(qry_id)
            qry_vec = features[i]
            tagvotes = tagranking.estimate(qry_vec, qry_tags) #, uniqueUser=uniqueUser, doRandomwalk=doRandomwalk)
            newline = "%s %s" % (qry_id, " ".join(["%s %g" % (x[0],x[1]) for x in tagvotes]))
            output.append(newline + "\n")
            done += 1
        
        #printStatus(INFO, '%d %s %s' % (done,qry_id,' '.join(['%s:%g' % (x[0],x[1]) for x in tagvotes[:3]] )))
        fw.write("".join(output))
        fw.flush()
  
    fw.close()
    printStatus(INFO, 'done')
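Each line written to id.tagvotes.txt above has the form "image_id tag1 score1 tag2 score2 ...". A minimal sketch for reading such a file back into a dictionary (read_tagvotes is a hypothetical helper, not part of the repository):

def read_tagvotes(filename):
    # maps image_id -> [(tag, score), ...], one entry per line of an id.tagvotes.txt file
    result = {}
    for line in open(filename):
        elems = line.strip().split()
        image_id, fields = elems[0], elems[1:]
        result[image_id] = [(fields[i], float(fields[i+1])) for i in range(0, len(fields), 2)]
    return result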
Beispiel #30
0
    def test_tagging(self):
        corpus = 'flickr4m'
        word2vec_model = 'tagvec500'
        testCollection = 'imagenet2hop-random2k'
        imset = readImageSet(testCollection, testCollection, rootpath)
        feature = 'dascaffeprob'

        feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
        blocksize = 1000
        scorers = [HitScorer(n) for n in [1, 2, 5, 10]]

        overwrite = 1

        for embedding_model in str.split('conse conse2 hierse hierse2'):
            embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)

            for synset_name in str.split('imagenet1k imagenet1k2hop'):
                if 'imagenet1k' == synset_name:
                    label_file = 'data/ilsvrc12/synsets.txt'
                else:
                    label_file = 'data/ilsvrc12/synsets2hop.txt'

                params = '%s %s --embedding %s --word2vec %s --corpus %s --overwrite %d' % (label_file, synset_name, embedding_model, word2vec_model, corpus, overwrite)
                os.system('python build_synset_vec.py %s' % params)
                shape_file = os.path.join(rootpath, 'synset2vec', synset_name, embedding_name, 'shape.txt')
                self.assertTrue(os.path.exists(shape_file), msg="%s is not ready" % synset_name)

    
            synset_name = 'imagenet1k'
            label_file = 'data/ilsvrc12/synsets.txt'
            label2vec_dir = os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)
            i2v = Image2Vec(label_file, label2vec_dir)

            tagger = ZeroshotTagger(embedding_name = embedding_name)
            printStatus(INFO, 'tagging %d images' % len(imset))

            start = 0

    
            overall_perf = [0.0] * len(scorers)
            nr_of_images = 0

            while start < len(imset):
                end = min(len(imset), start + blocksize)
                renamed, vectors = feat_file.read(imset[start:end])

                for _id,_vec in zip(renamed, vectors):
                    truth = set([_id.split('_')[0]])
                    im_vec = i2v.embedding(_vec)
                    pred = tagger.predict(im_vec)
                    sorted_labels = [int(x[0] in truth) for x in pred]
                    perf = [scorer.score(sorted_labels) for scorer in scorers]
                    overall_perf = [overall_perf[i] + perf[i] for i in range(len(scorers))]
                    nr_of_images += 1

                start = end
    
            res = [x/nr_of_images for x in overall_perf]
            print '_'*100
            print embedding_name
            print ' '.join([x.name() for x in scorers])
            print ' '.join(['%.3f' % x for x in res])
            print '_'*100
Beispiel #31
0
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize
    
    outputName = '%s,%s' % (feature,modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs>1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i%numjobs+1 == job]
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,nr_of_test_images,resultfile))

    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath,trainCollection,'Models',trainAnnotationName,feature, modelName, '%s.model'%concepts[c])
        models[c] = load_model(model_file_name)
        if models[c] is None:
            return 0
        #(pA,pB) = model.get_probAB()
        

    feat_file = BigFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0
    test_time = 0
    start = 0
    done = 0

    while start < nr_of_test_images:
        end = min(nr_of_test_images, start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

        s_time = time.time()
        renamed, test_X = feat_file.read(test_imset[start:end])
        read_time += time.time() - s_time
        
        s_time = time.time()
        output = [None] * len(renamed)
        for i in xrange(len(renamed)):
            if prob_output:
                scores = [models[c].predict_probability(test_X[i]) for c in range(nr_of_concepts)]
            else:
                scores = [models[c].predict(test_X[i]) for c in range(nr_of_concepts)]
            #dec_value = sigmoid_predict(dec_value, A=pA, B=pB)
            tagvotes = sorted(zip(concepts, scores), key=lambda v:v[1], reverse=True)
            output[i] = '%s %s\n' % (renamed[i], " ".join(["%s %s" % (tag, niceNumber(vote,6)) for (tag,vote) in tagvotes]))
        test_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        fw.flush()
        done += len(output)

    # done    
    printStatus(INFO, "%d done. read time %g seconds, test_time %g seconds" % (done, read_time, test_time))
    fw.close()
    return done
Beispiel #32
0
    imB, featB = open(coll2_shape_file).read().strip().split(" ")
    assert featA == featB

    fout.write('%d %d' % (int(imA) + int(imB), int(featA)))

# copy and concatenate features
file(new_features_file, 'wb').write(
    file(coll1_features_file, 'rb').read() +
    file(coll2_features_file, 'rb').read())

# copy Annotations
shutil.copytree("%s/%s/Annotations" % (datapath, coll1),
                "%s/%s+%s/Annotations" % (datapath, coll1, coll2))

# read ids
testset_id_images = readImageSet(coll2, coll2, datapath)
testset_id_images = set(map(int, testset_id_images))

train_id_images = readImageSet(coll1, coll1, datapath)
train_id_images = set(map(int, train_id_images))

base_new_id = max(testset_id_images.union(train_id_images)) + 1
duplicates = testset_id_images.intersection(train_id_images)
duplicates = dict([(x, x + base_new_id) for x in duplicates])

print "Found %d duplicates." % len(duplicates)

# read id.txt
coll1_featid_file = "%s/%s/FeatureData/%s/id.txt" % (datapath, coll1, feature)
coll2_featid_file = "%s/%s/FeatureData/%s/id.txt" % (datapath, coll2, feature)
new_featid_file = "%s/%s+%s/FeatureData/%s/id.txt" % (datapath, coll1, coll2,
Beispiel #33
0
def process(options, workingCollection, feature):
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite
    laplaciankratio = options.laplaciankratio

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'RobustPCA',
                              '%s,%s,%f' % (feature, nnName, k_ratio),
                              'tagmatrix.h5')

    if checkToSkip(resultfile, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData',
                                  "lemm_wordnet_freq_tags.h5")
    if not os.path.exists(tagmatrix_file):
        printStatus(
            INFO,
            'Tagmatrix file not found in %s Did you run wordnet_frequency_tags.py?'
            % (tagmatrix_file))
        sys.exit(1)

    laplacianI_file = os.path.join(
        rootpath, workingCollection, 'LaplacianI', workingCollection,
        '%s,%s,%f' % (feature, nnName, laplaciankratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(
            INFO,
            'LaplacianI file not found in %s Did you run laplacian_images.py?'
            % (laplacianI_file))
        sys.exit(1)

    tagmatrix_data = h5py.File(tagmatrix_file, 'r')
    tagmatrix = tagmatrix_data['tagmatrix'][:]
    printStatus(INFO, 'tagmatrix.shape = %s' % (str(tagmatrix.shape)))

    laplacian_data = scipy.io.loadmat(laplacianI_file)
    sigma = laplacian_data['sigma']
    printStatus(INFO, 'Sigma^2 = %f' % (sigma))

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()
    #print map(int, workingSet)[0:10], map(int, list(tagmatrix_data['id_images'][:])[0:10])
    #assert(np.all(map(int, workingSet) == list(tagmatrix_data['id_images'][:])))
    assert (np.all(workingSet == list(tagmatrix_data['id_images'][:])))

    tot_images = len(workingSet)
    printStatus(INFO, '%d images in %s' % (tot_images, workingCollection))

    printStatus(INFO,
                'Mean images per tag = %f' % (np.mean(tagmatrix.sum(axis=0))))
    K_neighs = int(math.floor(np.mean(tagmatrix.sum(axis=0)) * k_ratio))
    printStatus(
        INFO,
        '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Starting the propagation pre-processing')
    tagmatrix_new = np.zeros(tagmatrix.shape)
    for i in xrange(tot_images):
        neighbors = _get_neighbors(
            '%s,%s' % (workingCollection, workingSet[i]), rootpath,
            K_neighs * 2, feature, distance)

        #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
        #NNDrow = np.array([x[1] for x in neighbors])

        # remove images with features but not in the working set
        NNrow = []
        NNDrow = []
        new_neighs = []
        for x in neighbors:
            try:
                NNrow.append(bisect_index(workingSet, x[0]))
                NNDrow.append(x[1])
                new_neighs.append(x)
            except ValueError:
                pass
        #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
        #NNDrow = np.array([x[1] for x in neighbors])
        NNrow = np.array(NNrow)
        NNDrow = np.array(NNDrow)
        neighbors = new_neighs[0:K_neighs]

        C = np.sum(np.exp(-(NNDrow) / sigma))
        tagmatrix_new[i, :] = np.sum(
            (np.exp(-(NNDrow) / sigma).T * tagmatrix[NNrow]) / C, axis=0)

        if (i + 1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i + 1, tot_images))

    # save output
    printStatus(INFO, 'Saving propagated tagmatrix to %s' % resultfile)
    makedirsforfile(resultfile)
    fout = h5py.File(resultfile, 'w')
    fout['tagmatrix'] = tagmatrix_new
    fout['vocab'] = tagmatrix_data['vocab'][:]
    fout['id_images'] = workingSet
    fout.close()
Beispiel #34
0
def process(options, workingCollection, annotationName, feature, outputpkl):
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix
    modelName = "robustpca"
    nnName = distance + "knn"

    printStatus(
        INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f" %
        (workingCollection, annotationName, feature, nnName, k_ratio, lambda1,
         lambda2))

    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = os.path.join(outputpkl)
    resultfile_robustpca = os.path.join(
        rootpath, workingCollection, 'RobustPCA-Prediction',
        '%s,%s,%f,%f,%f,%d' %
        (feature, nnName, lambda1, lambda2, k_ratio, rawtagmatrix),
        'prediction.mat')

    if checkToSkip(resultfile_robustpca, overwrite):
        only_dump = True
    else:
        only_dump = False

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(
            rootpath, workingCollection, 'RobustPCA',
            '%s,%s,%f' % (feature, nnName, DEFAULT_K_PROP), 'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(
                INFO,
                "Tag matrix file not found at %s Did you run robustpca_preprocessing.py?"
                % (tagmatrix_file))
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData',
                                      "lemm_wordnet_freq_tags.h5")
        if not os.path.exists(tagmatrix_file):
            printStatus(
                INFO,
                'Tag matrix file not found in %s Did you run wordnet_frequency_tags.py?'
                % (tagmatrix_file))
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI',
                                   workingCollection,
                                   '%s,%s,%f' % (feature, nnName, k_ratio),
                                   'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(
            INFO,
            "LaplacianI file not found at %s Did you run laplacian_images.py?"
            % (laplacianI_file))
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT',
                                   '%f' % (ratio_cs), 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(
            INFO,
            "LaplacianT file not found at %s Did you run laplacian_tags.py?" %
            (laplacianT_file))
        sys.exit(1)

    # begin learning
    script = """
        rpca_path = 'transduction_based/robustpca/';
        addpath(rpca_path);
        addpath([rpca_path, 'fast_svd/']);
        tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));
        load('%s');
        load('%s');

        lambda1 = %f;
        lambda2 = %f;
        maxIters = 50;
        precision = 1e-4;
        mu_start = 1.;

        parpool('local', 4);
        [P,E]=robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start);
        """ % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1,
               lambda2)

    script += """
        delete(gcp);
        save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3');
        exit;
    """ % resultfile_robustpca

    if not only_dump:
        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        testset_id_images = readImageSet(
            workingCollection.split('+')[1],
            workingCollection.split('+')[1], rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # concepts mapping
    robustpca_output = h5py.File(resultfile_robustpca, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)

    predicted_tagmatrix = robustpca_output['P'][:, mapping]

    if outputonlytest:
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert (final_tagmatrix.shape[0] == idx.shape[0])
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': id_images,
                'scores': final_tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
Beispiel #35
0
import sys, os, random

from basic.common import ROOT_PATH
from basic.util import readImageSet
from simpleknn.bigfile import BigFile

if __name__ == '__main__':
    rootpath = ROOT_PATH
    collection = sys.argv[1]
    feature = sys.argv[2]

    imset = readImageSet(collection, collection)
    feat_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)

    imset = random.sample(imset, 50)
    #imset = imset[:5]

    renamed, vectors = feat_file.read(imset)
    for name, vec in zip(renamed, vectors):
        print name, sum(vec), sum(vec[:64]), sum(vec[64:])
Beispiel #36
0
def process(options, testCollection, trainCollection, annotationName, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    blocksize = options.blocksize
    donefile = options.donefile
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite
    taggerType = options.tagger
    noise = options.noise
    testset = options.testset
    if not testset:
        testset = testCollection

    modelName = taggerType
    if 'pretagvote' == taggerType and noise > 1e-3:
        modelName += '-noise%.2f' % noise
    if 'pqtagvote' == taggerType:
        nnName = "l2knn"
    else:
        nnName = distance + "knn"
    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testset,
                              trainCollection, annotationName, modelName,
                              '%s,%s,%d' % (feature, nnName, k),
                              'id.tagvotes.txt')

    if numjobs > 1:
        resultfile += ".%d.%d" % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    if donefile:
        doneset = set([x.split()[0] for x in open(donefile) if x.strip()])
    else:
        doneset = set()
    printStatus(
        INFO, "%d images have been done already, and they will be ignored" %
        len(doneset))

    workingSet = readImageSet(testCollection, testset, rootpath)
    workingSet = [x for x in workingSet if x not in doneset]
    workingSet = [
        workingSet[i] for i in range(len(workingSet))
        if (i % numjobs + 1) == job
    ]

    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData',
                                 feature)
    test_feat_file = BigFile(test_feat_dir)

    tagger = NAME_TO_TAGGER[taggerType](trainCollection,
                                        annotationName,
                                        feature,
                                        distance,
                                        rootpath=rootpath)
    tagger.k = k
    tagger.noise = noise

    printStatus(
        INFO, "working on %d-%d, %d test images -> %s" %
        (numjobs, job, len(workingSet), resultfile))

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0.0
    test_time = 0.0
    start = 0
    done = 0

    while start < len(workingSet):
        end = min(len(workingSet), start + blocksize)
        printStatus(INFO, 'tagging images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, vectors = test_feat_file.read(workingSet[start:end])
        nr_images = len(renamed)
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * nr_images
        for i in range(nr_images):
            tagvotes = tagger.predict(content=vectors[i],
                                      context='%s,%s' %
                                      (testCollection, renamed[i]))
            output[i] = '%s %s\n' % (renamed[i], " ".join([
                "%s %s" % (tag, niceNumber(vote, 6))
                for (tag, vote) in tagvotes
            ]))
        test_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        done += len(output)

    fw.close()
    printStatus(
        INFO, '%d images tagged, read time %g seconds, test time %g seconds' %
        (done, read_time, test_time))
Beispiel #37
0
    for t in count_tags.copy():
        if (
            wn.morphy(t) is None or len(t) < 3 or not validateAnnotation(thing_synsets, wn.synsets(t))
        ):  # or not t in vocabulary_50: #
            del count_tags[t]

    print "N tags post wordnet filter: ", len(count_tags)

    vocab = count_tags.keys()
    # print count_tags

    return vocab, count_tags


##############
workingSet = os.path.split(os.path.realpath(os.path.curdir))[1]
id_images = readImageSet(workingSet, workingSet, ROOT_PATH)
id_images.sort()
# id_images = map(int, id_images)

resultfile = os.path.join("TextData", "lemm_wordnet_freq_tags.h5")
if os.path.exists(resultfile):
    print "File %s already exists. Aborting..." % resultfile
    sys.exit(1)

tags_file = os.path.join("TextData", "id.userid.lemmtags.txt")
if len(sys.argv) > 1:
    print "Getting vocabulary from %s" % sys.argv[1]
    otherCollection = h5py.File(sys.argv[1], "r")
    vocab = list(otherCollection["vocab"])
    otherCollection.close()
else:
Beispiel #38
0
from basic.annotationtable import writeConceptsTo
writeConceptsTo(test_tags, trainCollection, trainAnnotationName)

cmd = '%s/util/imagesearch/obtain_labeled_examples.py %s %s' % (
    parent_dir, trainCollection, conceptfile)
os.system('python ' + cmd)

train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData',
                              feature)
from util.simpleknn.bigfile import BigFile
train_feat_file = BigFile(train_feat_dir)
feat_dim = train_feat_file.ndims

from basic.util import readImageSet
test_imset = readImageSet(testCollection)
test_feat_dir = os.path.join(rootpath, testCollection, 'featureData', feature)
test_feat_file = BigFile(test_feat_dir)
#test_renamed, test_vectors = test_feat_file.read(test_imset)

from model_based.dataengine.positiveengine import PositiveEngine
from model_based.dataengine.negativeengine import NegativeEngine

pe = PositiveEngine(trainCollection)
ne = NegativeEngine(trainCollection)

for tag in test_tags:
    pos_set = pe.sample(tag, 100)
    neg_set = ne.sample(tag, 100)
    names = pos_set + neg_set
    labels = [1] * len(pos_set) + [-1] * len(neg_set)
Beispiel #39
0

if __name__ == '__main__':
    rootpath = ROOT_PATH

    embedding_model = 'hierse2'
    embedding_name = 'flickr4m,tagvec500,%s' % embedding_model
    tagger = ZeroshotTagger(embedding_name = embedding_name)
    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', 'imagenet1k', embedding_name)
    from im2vec import Image2Vec
    i2v = Image2Vec(label_file, label2vec_dir)

    from basic.util import readImageSet
    testCollection = 'imagenet2hop'
    imset = readImageSet(testCollection, 'random100k', rootpath)
    feature = 'dascaffeprob'
    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))

    blocksize = 1000
    start = 0

    from eval import HitScorer

    scorers = [HitScorer(n) for n in [1, 2, 5, 10]]
    overall_perf = [0.0] * len(scorers)
    nr_of_images = 0

    while start < len(imset):
        end = min(len(imset), start + blocksize)
        renamed, vectors = feat_file.read(imset[start:end])
Beispiel #40
0
def process(options, testCollection, trainCollection, annotationName, feature, outputpkl):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    variant = options.variant
    overwrite = options.overwrite
    testset = testCollection
    forcetrainmodel = options.trainmodel
    modelName = "tagprop"
    nnName = distance + "knn"

    printStatus(INFO, "Starting TagProp %s,%s,%s,%s,%s" % (variant, trainCollection, testCollection, annotationName, feature))

    resultfile = os.path.join(outputpkl)
    resultfile_tagprop = os.path.join(rootpath, testCollection, 'TagProp-Prediction', testset, trainCollection, annotationName, modelName, '%s,%s,%s,%d'%(feature,nnName,variant,k), 'prediction.mat')
    if checkToSkip(resultfile, overwrite) or checkToSkip(resultfile_tagprop, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, trainCollection, 'TextData', 'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, "Tag matrix file not found at %s Did you run wordnet_frequency_tags.py?" % (tagmatrix_file))
        sys.exit(1)

    train_neighs_file = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection, '%s,%s,%d'%(feature,nnName,k), 'nn_train.h5')
    if not os.path.exists(train_neighs_file):
        printStatus(INFO, "Matlab train neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (train_neighs_file))
        sys.exit(1)

    # do we need to perform learning?
    train_model_file = os.path.join(rootpath, trainCollection, 'TagProp-models', '%s,%s,%s,%d'%(feature,nnName,variant,k), 'model.mat')
    if os.path.exists(train_model_file) and not forcetrainmodel:
        printStatus(INFO, "model for %s available at %s" % (trainCollection, train_model_file))
    else:
        printStatus(INFO, "starting learning model for %s" % (trainCollection))
        makedirsforfile(train_model_file)

        script = """
                tagprop_path = 'model_based/tagprop/TagProp/';
                addpath(tagprop_path);
                tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
                tagmatrix = sparse(tagmatrix);
                NN = h5read('%s', '/NN');
                NN = NN(2:end, :);
                NN = double(NN);
        """ % (tagmatrix_file, train_neighs_file)

        if variant == 'dist' or variant == 'distsigmoids':
            script += """
                NND = h5read('%s', '/NND');
                NND = NND(2:end, :);
                NND = reshape(NND, 1, size(NND,1), size(NND,2));
                NND = double(NND);
            """ % train_neighs_file

        if variant == 'rank':
            script += """
                m = tagprop_learn(NN,[],tagmatrix);
            """
        elif variant == 'ranksigmoids':
            script += """
                m = tagprop_learn(NN,[],tagmatrix,'sigmoids',true);
            """
        elif variant == 'dist':
            script += """
                m = tagprop_learn(NN,NND,tagmatrix,'type','dist');
            """
        elif variant == 'distsigmoids':
            script += """
                m = tagprop_learn(NN,NND,tagmatrix,'type','dist','sigmoids',true);
            """

        script += """
                save('%s', 'm', '-v7.3');
                exit;
        """ % train_model_file

        call_matlab(script)

    # we perform prediction
    printStatus(INFO, "starting prediction")
    test_neighs_file = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection, annotationName, '%s,%s,%d'%(feature,nnName,k), 'nn_test.h5')
    if not os.path.exists(test_neighs_file):
        printStatus(INFO, "Matlab test neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (test_neighs_file))
        sys.exit(1)

    script = """
            tagprop_path = 'model_based/tagprop/TagProp/';
            addpath(tagprop_path);
            load('%s');
            tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
            tagmatrix = sparse(tagmatrix);
            NNT = h5read('%s', '/NNT');
            NNT = double(NNT);

    """ % (train_model_file, tagmatrix_file, test_neighs_file)

    if variant == 'dist' or variant == 'distsigmoids':
        script += """
            NNDT = h5read('%s', '/NNDT');
            NNDT = reshape(NNDT, 1, size(NNDT,1), size(NNDT,2));
            NNDT = double(NNDT);
        """ % test_neighs_file

    script += """
            P = tagprop_predict(NNT,[],m)';
            save('%s', '-v7.3');
            exit;
    """ % resultfile_tagprop

    makedirsforfile(resultfile_tagprop)
    call_matlab(script)

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)

    concepts = readConcepts(testCollection, annotationName, rootpath)
    id_images = readImageSet(testCollection, testset, rootpath)
    id_images.sort()
    # id_images = map(int, id_images)

    # concepts mapping
    tagprop_output = h5py.File(resultfile_tagprop, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]),concepts)

    final_tagmatrix = tagprop_output['P'][:][:,mapping]

    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images':id_images, 'scores':final_tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
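Like several other examples on this page, the script above dumps a dictionary with the keys 'concepts', 'id_images' and 'scores' (an images-by-concepts matrix). A minimal sketch for ranking images by a single concept from such a pickle file (the file name and the concept 'dog' are placeholders):

import cPickle as pickle

data = pickle.load(open('prediction.pkl', 'rb'))
c_idx = data['concepts'].index('dog')
ranked = sorted(zip(data['id_images'], data['scores'][:, c_idx]),
                key=lambda v: v[1], reverse=True)
print ranked[:10]  # ten highest scored images for the chosen concept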
Beispiel #41
0
def process(options, testCollection, trainCollection, trainAnnotationName,
            feature, modelName):
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize

    outputName = '%s,%s' % (feature, modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging',
                              testCollection, trainCollection,
                              trainAnnotationName, outputName,
                              'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [
        test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job
    ]
    nr_of_test_images = len(test_imset)
    printStatus(
        INFO, "working on %d-%d, %d test images -> %s" %
        (numjobs, job, nr_of_test_images, resultfile))

    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath, trainCollection, 'Models',
                                       trainAnnotationName, feature, modelName,
                                       '%s.model' % concepts[c])
        models[c] = load_model(model_file_name)
        if models[c] is None:
            return 0
        #(pA,pB) = model.get_probAB()

    feat_file = BigFile(
        os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0
    test_time = 0
    start = 0
    done = 0

    while start < nr_of_test_images:
        end = min(nr_of_test_images, start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, test_X = feat_file.read(test_imset[start:end])
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * len(renamed)
        for i in xrange(len(renamed)):
            if prob_output:
                scores = [
                    models[c].predict_probability(test_X[i])
                    for c in range(nr_of_concepts)
                ]
            else:
                scores = [
                    models[c].predict(test_X[i]) for c in range(nr_of_concepts)
                ]
            #dec_value = sigmoid_predict(dec_value, A=pA, B=pB)
            tagvotes = sorted(zip(concepts, scores),
                              key=lambda v: v[1],
                              reverse=True)
            output[i] = '%s %s\n' % (renamed[i], " ".join([
                "%s %s" % (tag, niceNumber(vote, 6))
                for (tag, vote) in tagvotes
            ]))
        test_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        fw.flush()
        done += len(output)

    # done
    printStatus(
        INFO, "%d done. read time %g seconds, test_time %g seconds" %
        (done, read_time, test_time))
    fw.close()
    return done
Beispiel #42
0
    for t in count_tags.copy():
        if wn.morphy(t) is None or len(t) < 3 or not validateAnnotation(
                thing_synsets, wn.synsets(t)):  #or not t in vocabulary_50: #
            del count_tags[t]

    print "N tags post wordnet filter: ", len(count_tags)

    vocab = count_tags.keys()
    #print count_tags

    return vocab, count_tags


##############
workingSet = os.path.split(os.path.realpath(os.path.curdir))[1]
id_images = readImageSet(workingSet, workingSet, ROOT_PATH)
id_images.sort()
#id_images = map(int, id_images)

resultfile = os.path.join('TextData', "lemm_wordnet_freq_tags.h5")
if os.path.exists(resultfile):
    print "File %s already exists. Aborting..." % resultfile
    sys.exit(1)

tags_file = os.path.join('TextData', "id.userid.lemmtags.txt")
if len(sys.argv) > 1:
    print "Getting vocabulary from %s" % sys.argv[1]
    otherCollection = h5py.File(sys.argv[1], 'r')
    vocab = list(otherCollection['vocab'])
    otherCollection.close()
else:
Beispiel #43
0
import numpy as np
import cPickle as pickle

from basic.common import ROOT_PATH
from basic.util import readImageSet, bisect_index, getVocabMap
from basic.annotationtable import readConcepts

tagmatrix_file = h5py.File(sys.argv[1], 'r')
pkl_file = open(sys.argv[2], 'w')
workingCollection = sys.argv[3]
annotationName = sys.argv[4]
rootpath = ROOT_PATH

id_images = tagmatrix_file['id_images']
concepts = readConcepts(workingCollection, annotationName, rootpath)
testset_id_images = readImageSet(workingCollection.split('+')[1], workingCollection.split('+')[1], rootpath)
testset_id_images.sort()

if not type(id_images[0]) is str:
	id_images = map(str, id_images)

if not type(testset_id_images[0]) is str:
	testset_id_images = map(str, testset_id_images)

mapping = getVocabMap(list(tagmatrix_file['vocab'][:]),concepts)
predicted_tagmatrix = tagmatrix_file['tagmatrix'][:,mapping]

print "predicted_tagmatrix.shape = ", predicted_tagmatrix.shape
print "len(id_images) = ", len(id_images)
print "len(testset_id_images) = ", len(testset_id_images)
Beispiel #44
0
    trainCollection = 'voc2008train'
    trainAnnotationName = 'conceptsvoc2008train.txt'
    testCollection = 'voc2008val'
    testset = testCollection
    testAnnotationName = 'conceptsvoc2008val.txt'

    modelName = 'fik50' 
    #modelName = 'fastlinear'
    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model

    scorer = getScorer(metric)
    
    imset = readImageSet(testCollection,testset,rootpath=rootpath)
    concepts = readConcepts(testCollection,testAnnotationName,rootpath=rootpath)
    feat_dir = os.path.join(rootpath, testCollection, "FeatureData", feature)
    feat_file = BigFile(feat_dir)

    _renamed, _vectors = feat_file.read(imset)

    nr_of_images = len(_renamed)
    nr_of_concepts = len(concepts)
    
    mAP = 0.0
    models = [None] * len(concepts)

    stream = StreamFile(feat_dir)

    for i,concept in enumerate(concepts):
Beispiel #45
0
    nr_of_images_list = []
    feat_dim_list = []
    feat_files = []

    for feature in srcfeatures:
        shapefile = os.path.join(rootpath, collection, "FeatureData", feature, "shape.txt")
        nr_of_images, feat_dim = map(int, open(shapefile).readline().strip().split())
        nr_of_images_list.append(nr_of_images)
        feat_dim_list.append(feat_dim)
        feat_files.append(BigFile(os.path.join(rootpath, collection, "FeatureData", feature)))

    # assert(nr_of_images_list[0] == nr_of_images_list[1])
    new_feat_dim = sum(feat_dim_list)

    imset = readImageSet(collection, collection, rootpath)
    nr_of_images = len(imset)
    blocksize = 1000

    makedirsforfile(binary_file)
    fw = open(binary_file, "wb")
    new_imset = []
    start = 0

    while start < nr_of_images:
        end = min(nr_of_images, start + blocksize)
        printStatus(INFO, "processing images from %d to %d" % (start, end - 1))

        renamed_0, vecs_0 = feat_files[0].read(imset[start:end])
        renamed_1, vecs_1 = feat_files[1].read(imset[start:end])
Beispiel #46
0
def process(options, testCollection, trainCollection, annotationName, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    overwrite = options.overwrite
    testset = testCollection
    onlytest = options.onlytest
    
    nnName = distance + "knn"
    resultfile_train = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection, '%s,%s,%d'%(feature,nnName,k), 'nn_train.h5')
    resultfile_test = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection, annotationName, '%s,%s,%d'%(feature,nnName,k), 'nn_test.h5')
    
    if (not onlytest and checkToSkip(resultfile_train, overwrite)) or checkToSkip(resultfile_test, overwrite):
        return 0

    testSet = readImageSet(testCollection, testset, rootpath)
    trainSet = readImageSet(trainCollection, trainCollection, rootpath)
    testSet.sort()
    trainSet.sort()

    #train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    #train_feat_file = BigFile(train_feat_dir)

    tagger = NAME_TO_TAGGER["preknn"](trainCollection, annotationName, feature, distance, rootpath=rootpath, k=1001)

    printStatus(INFO, '%d test images, %d train images' % (len(testSet), len(trainSet)))

    # allocate train -> train nearest neighbors
    if not onlytest:
        printStatus(INFO, 'Allocating NN, NND matrices')    
        NN = np.zeros((len(trainSet), k+1), dtype=np.int32)
        NND = np.zeros((len(trainSet), k+1))

        printStatus(INFO, 'Filling NN, NND matrices')    
        for i,id_img in enumerate(trainSet):
            neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (trainCollection, id_img))
            if len(neighbors) < k+1:
                printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k+1))    
                sys.exit(1)

            NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
            NNDrow = np.array([x[1] for x in neighbors])

            NN[i,:] = NNrow[0:k+1]
            NND[i,:] = NNDrow[0:k+1]

            if i % 1000 == 0:
                printStatus(INFO, '%d / %d images' % (i, len(trainSet)))    

        printStatus(INFO, 'Saving train matrices to file %s' % (resultfile_train))
        makedirsforfile(resultfile_train)
        fout = h5py.File(resultfile_train, 'w')
        fout['NN'] = NN
        fout['NND'] = NND
        fout['trainSet'] = trainSet
        fout['concepts'] = tagger.concepts
        fout.close()

        del NN
        del NND
   
    # allocate test -> train nearest neighbors
    printStatus(INFO, 'Allocating NNT, NNDT matrices')        
    NNT = np.zeros((len(testSet), k), dtype=np.int32)
    NNDT = np.zeros((len(testSet), k))

    printStatus(INFO, 'Filling NNT, NNDT matrices')    
    for i,id_img in enumerate(testSet):
        neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (testCollection, id_img))
        if len(neighbors) < k:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k))    
            sys.exit(1)

        NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
        NNDrow = np.array([x[1] for x in neighbors])

        NNT[i,:] = NNrow[0:k]
        NNDT[i,:] = NNDrow[0:k]

        if i % 1000 == 0:
            printStatus(INFO, '%d / %d images' % (i, len(testSet)))    
   
    printStatus(INFO, 'Saving test matrices to file %s' % (resultfile_test))
    makedirsforfile(resultfile_test)
    fout = h5py.File(resultfile_test, 'w')
    fout['NNT'] = NNT
    fout['NNDT'] = NNDT
    fout['trainSet'] = trainSet
    fout['testSet'] = testSet
    fout['concepts'] = tagger.concepts   
    fout.close()
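The HDF5 matrices written above can be checked directly with h5py; a minimal sketch (the file path is a placeholder):

import h5py

with h5py.File('nn_test.h5', 'r') as f:
    NNT = f['NNT'][:]    # (nr of test images, k) indices into the sorted trainSet
    NNDT = f['NNDT'][:]  # the matching neighbour distances
    print NNT.shape, NNDT.shape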
Beispiel #47
0
def process(options, testCollection, trainCollection, trainAnnotationName,
            feature, modelName):
    assert (modelName.startswith('fastlinear'))

    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    topk = options.topk

    outputName = '%s,%s' % (feature, modelName)

    resultfile = os.path.join(rootpath, testCollection, 'autotagging',
                              testCollection, trainCollection,
                              trainAnnotationName, outputName,
                              'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [
        test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job
    ]
    test_imset = set(test_imset)
    nr_of_test_images = len(test_imset)
    printStatus(
        INFO, "working on %d-%d, %d test images -> %s" %
        (numjobs, job, nr_of_test_images, resultfile))

    ma = ModelArray(trainCollection,
                    trainAnnotationName,
                    feature,
                    modelName,
                    rootpath=rootpath)

    feat_file = StreamFile(
        os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0

    feat_file.open()
    for _id, _vec in feat_file:
        if _id not in test_imset:
            continue

        res = ma.predict([_vec], prob=0)
        tagvotes = res[0]
        if topk > 0:
            tagvotes = tagvotes[:topk]
        newline = '%s %s\n' % (_id, " ".join(
            ["%s %s" % (tag, niceNumber(vote, 6))
             for (tag, vote) in tagvotes]))
        fw.write(newline)
        done += 1
        if done % 1e4 == 0:
            printStatus(INFO, "%d done" % done)

    feat_file.close()
    fw.close()
    printStatus(INFO, "%d done" % (done))
    return done
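Many of the process() functions above shard the image set with the test i % numjobs + 1 == job and append '.%d.%d' % (numjobs, job) to the result file, so the work can be split over independent jobs and the per-job output files concatenated afterwards. A minimal sketch of the partition property on toy data (not from the repository):

imset = ['im%d' % i for i in range(10)]
numjobs = 3
shards = [[imset[i] for i in range(len(imset)) if i % numjobs + 1 == job]
          for job in range(1, numjobs + 1)]
# every image falls into exactly one shard, so the per-job result files can simply be concatenated
assert sorted(sum(shards, [])) == sorted(imset)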