Example #1
0
def process(options, collection, annotationName):
    """Write per-concept annotation files under a new '...social.txt' annotation name.

    For each concept returned by ``readConcepts``, the positives come from
    ``readLabeledImageSet`` (tpp='lemm') and the negatives from the engine
    selected by ``options.neg_filter``. Concepts without labeled examples are
    dropped; concepts whose result file is skipped by ``checkToSkip`` are kept
    in the new concept list without being rewritten.

    Args:
        options: object providing ``rootpath``, ``overwrite`` and ``neg_filter``.
        collection: collection name used to build paths.
        annotationName: source annotation name; its last four characters are
            replaced (presumably a '.txt' suffix -- confirm with callers).
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    neg_filter = options.neg_filter

    concepts = readConcepts(collection, annotationName, rootpath)
    newAnnotationName = annotationName[:-4] + 'social.txt'
    ne = STRING_TO_NEGATIVE_ENGINE[neg_filter](collection, rootpath)

    newConcepts = []
    for concept in concepts:
        resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image', newAnnotationName, '%s.txt'%concept)
        if checkToSkip(resultfile, overwrite):
            newConcepts.append(concept)
            continue

        # Best effort: a concept whose labeled set cannot be read is treated as
        # having no positives. Catch Exception only, so that Ctrl-C and
        # SystemExit are no longer swallowed as they were by a bare except.
        try:
            pos_set = readLabeledImageSet(collection, concept, tpp='lemm', rootpath=rootpath)
        except Exception:
            pos_set = None
        if not pos_set:
            printStatus(INFO, '*** %s has not labeled examples, will be ignored ***' % concept)
            continue

        neg_set = ne.sample(concept, int(1e8))
        # Positives and negatives must be disjoint.
        assert(len(set(pos_set).intersection(set(neg_set))) == 0)
        newlabels = [1] * len(pos_set) + [-1] * len(neg_set)
        newnames = pos_set + neg_set
        printStatus(INFO, "anno(%s) %d pos %d neg -> %s" % (concept,len(pos_set),len(neg_set),resultfile))
        writeAnnotations(newnames, newlabels, resultfile)
        newConcepts.append(concept)

    writeConceptsTo(newConcepts, collection, newAnnotationName, rootpath)
Example #2
0
def process(options, collection, annotationName):
    """Write, for each pending concept, the vocabulary tags ranked by NGD.

    A concept is pending when its '<concept>.txt' file is missing from the
    'SimilarityIndex/ngd' directory or when ``options.overwrite`` is truthy.
    Only tags whose distance is below 10 are written, in ascending distance
    order.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    concepts = readConcepts(collection, annotationName, rootpath)
    resultdir = os.path.join(rootpath, collection, "SimilarityIndex", "ngd")

    todo = []
    for name in concepts:
        if not os.path.exists(os.path.join(resultdir, name + '.txt')) or overwrite:
            todo.append(name)
    if not todo:
        printStatus(INFO, 'nothing to do')
        return

    fcs = FlickrContextSim(collection, rootpath=rootpath)
    vocabulary = fcs.vob
    printStatus(INFO, 'expanding tags for %s-%s -> %s' % (collection, annotationName, resultdir))

    for concept in todo:
        resultfile = os.path.join(resultdir, concept + '.txt')

        distances = ((tag, fcs.computeNGD(concept, tag, img=1)) for tag in vocabulary)
        # sorted() is stable, like list.sort(), so ties keep vocabulary order.
        neighbours = sorted((pair for pair in distances if pair[1] < 10),
                            key=lambda pair: pair[1])
        closest = ' '.join([pair[0] for pair in neighbours[:3]])
        printStatus(INFO, '%s -> %s' % (concept, closest))
        writeRankingResults(neighbours, resultfile)
Example #3
0
    def __init__(self, collection, annotationName, feature, distance,
                 tpp=DEFAULT_TPP, rootpath=ROOT_PATH, k=DEFAULT_K):
        """Load concepts, the image set and tag data, and record voting settings.

        ``knndir`` is a path relative to the collection name, formed from
        ``feature`` and ``distance``; ``k`` is stored as the neighbour count
        reported in the status line.
        """
        self.rootpath = rootpath

        # Concept vocabulary and its name -> position lookup.
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(
            (concept, position)
            for position, concept in enumerate(self.concepts))

        self.imset = readImageSet(collection, collection, rootpath)
        self.nr_of_images = len(self.imset)

        knn_name = '%s,%sknn,1500' % (feature, distance)
        self.knndir = os.path.join(collection, knn_name)

        self.k = k
        self.noise = 0

        self._load_tag_data(collection, tpp, rootpath)

        summary = (self.__class__.__name__, self.nr_of_images,
                   len(self.tag2freq), distance, self.k)
        printStatus(
            INFO,
            "%s, %d images, %d unique tags, %s %d neighbours for voting" %
            summary)
Example #4
0
def process(options, collection, annotationName, simdir, resultfile):
    """Gather per-concept ranking files into one score matrix and pickle it.

    Each '<concept>.txt' file in ``simdir`` is read with
    ``readRankingResults``; its (image, score) pairs fill one column of an
    (images x concepts) matrix. Cells never mentioned by a ranking keep the
    initial value of -1e4.

    Args:
        options: object providing ``rootpath`` and ``overwrite``.
        collection: collection name used to read concepts and the image set.
        annotationName: annotation name passed to ``readConcepts``.
        simdir: directory holding the per-concept ranking files.
        resultfile: path of the pickle file to write.

    Returns:
        0 when ``checkToSkip`` reports the result should be skipped, else None.

    Raises:
        KeyError: if a ranking mentions an image that is not in the image set.
    """
    rootpath = options.rootpath

    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)

    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print ('%d instances, %d concepts to dump -> %s' % (image_num, concept_num, resultfile))

    scores = np.zeros((image_num, concept_num)) - 1e4

    for c_id, concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        for im, score in readRankingResults(simfile):
            scores[im2index[im], c_id] = score

    # list(...) keeps the ids picklable on Python 3, where map() is a lazy
    # iterator; on Python 2 it is a no-op copy of the list map() returned.
    payload = {'concepts': concepts, 'id_images': list(map(int, id_images)), 'scores': scores}

    makedirsforfile(resultfile)
    # The context manager closes the file even if pickling fails.
    with open(resultfile, 'wb') as output:
        pickle.dump(payload, output, -1)
Example #5
0
def evaluateSearchEngines(searchers,
                          collection,
                          annotationName,
                          metric,
                          rootpath=ROOT_PATH):
    """Score every searcher on every concept and print a result table.

    For each concept the ground-truth labels are read with
    ``readAnnotationsFrom``; each searcher's ranked results are mapped to
    those labels (results without a label are ignored) and scored with the
    scorer returned by ``getScorer(metric)``.

    Args:
        searchers: sequence of objects exposing ``scoreCollection(concept)``.
        collection: collection name.
        annotationName: annotation name used for concepts and ground truth.
        metric: metric name passed to ``getScorer`` and echoed in the output.
        rootpath: data root directory.

    Returns:
        (concepts, results), where ``results`` is a numpy array with one row
        per concept and one column per searcher.
    """
    scorer = getScorer(metric)
    concepts = readConcepts(collection, annotationName, rootpath)

    nr_of_runs = len(searchers)
    nr_of_concepts = len(concepts)
    results = np.zeros((nr_of_concepts, nr_of_runs))

    for i, concept in enumerate(concepts):
        names, labels = readAnnotationsFrom(collection, annotationName,
                                            concept, rootpath)
        name2label = dict(zip(names, labels))

        for j, searcher in enumerate(searchers):
            searchresults = searcher.scoreCollection(concept)
            sorted_labels = [
                name2label[name] for (name, score) in searchresults
                if name in name2label
            ]
            results[i, j] = scorer.score(sorted_labels)

    # Single-argument print(...) emits the same text on Python 2 and 3; the
    # former print statements were a syntax error on Python 3.
    for i, concept in enumerate(concepts):
        print('%s %s' % (concept,
                         ' '.join([niceNumber(x, 3) for x in results[i, :]])))
    mean_perf = results.mean(0)
    print('mean%s %s' % (metric,
                         ' '.join([niceNumber(x, 3) for x in mean_perf])))

    return concepts, results
Example #6
0
def process(options, trainCollection, modelAnnotationName, trainAnnotationName, feature):
    """Fit sigmoid parameters (A, B) for stored per-concept models and save them.

    Only the concepts assigned to this job (``options.job`` out of
    ``options.numjobs``, by position) are handled. A model whose current A is
    non-zero (|A| > 1e-8) is skipped unless ``options.overwrite`` is set.
    """
    rootpath = options.rootpath
    modelName = options.model

    # Pick the model I/O backend matching the model type.
    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    all_concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    concepts = [
        concept for position, concept in enumerate(all_concepts)
        if (position % options.numjobs + 1) == options.job
    ]

    feat_file = BigFile(os.path.join(rootpath, trainCollection, "FeatureData", feature))

    for concept in concepts:
        modelfile = os.path.join(rootpath, trainCollection, 'Models', modelAnnotationName, feature, modelName, '%s.model' % concept)
        model = load_model(modelfile)
        old_a, old_b = model.get_probAB()
        already_fitted = abs(old_a) > 1e-8
        if already_fitted and not options.overwrite:
            printStatus(INFO, "old parameters exist as A=%g, B=%g, skip" % (old_a, old_b))
            continue

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        results = classify_large_data(model, names, feat_file, prob_output=False)

        # Re-align labels with the order in which the classifier returned names.
        labels = []
        dec_values = []
        for entry in results:
            labels.append(name2label[entry[0]])
            dec_values.append(entry[1])

        nr_pos = sum(1 for label in labels if label == 1)
        nr_neg = sum(1 for label in labels if label == -1)
        printStatus(INFO, "%s +%d -%d" % (concept, nr_pos, nr_neg))

        new_a, new_b = sigmoid_train(dec_values, labels)
        model.set_probAB(new_a, new_b)
        save_model(modelfile, model)
        saved_a, saved_b = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (old_a, saved_a, old_b, saved_b))
Example #7
0
def process(options, collection, annotationName, simdir, resultfile):
    """Gather per-concept ranking files into one score matrix and pickle it.

    Each '<concept>.txt' file in ``simdir`` is read with
    ``readRankingResults``; its (image, score) pairs fill one column of an
    (images x concepts) matrix. Cells never mentioned by a ranking keep the
    initial value of -1e4.

    Args:
        options: object providing ``rootpath`` and ``overwrite``.
        collection: collection name used to read concepts and the image set.
        annotationName: annotation name passed to ``readConcepts``.
        simdir: directory holding the per-concept ranking files.
        resultfile: path of the pickle file to write.

    Returns:
        0 when ``checkToSkip`` reports the result should be skipped, else None.

    Raises:
        KeyError: if a ranking mentions an image that is not in the image set.
    """
    rootpath = options.rootpath

    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)

    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print('%d instances, %d concepts to dump -> %s' %
          (image_num, concept_num, resultfile))

    scores = np.zeros((image_num, concept_num)) - 1e4

    for c_id, concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        for im, score in readRankingResults(simfile):
            scores[im2index[im], c_id] = score

    # list(...) keeps the ids picklable on Python 3, where map() is a lazy
    # iterator; on Python 2 it is a no-op copy of the list map() returned.
    payload = {
        'concepts': concepts,
        'id_images': list(map(int, id_images)),
        'scores': scores,
    }

    makedirsforfile(resultfile)
    # The context manager closes the file even if pickling fails.
    with open(resultfile, 'wb') as output:
        pickle.dump(payload, output, -1)
Example #8
0
def process(options, collection, annotationName):
    """Rank vocabulary tags by NGD for every concept that still needs a result.

    A concept needs a result when '<concept>.txt' is absent from the
    'SimilarityIndex/ngd' directory or ``options.overwrite`` is truthy. Tags
    with a distance of 10 or more are left out; the rest are written in
    ascending distance order.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    concepts = readConcepts(collection, annotationName, rootpath)
    resultdir = os.path.join(rootpath, collection, "SimilarityIndex", "ngd")

    def needs_work(concept):
        target = os.path.join(resultdir, concept + '.txt')
        return not os.path.exists(target) or overwrite

    todo = [concept for concept in concepts if needs_work(concept)]
    if not todo:
        printStatus(INFO, 'nothing to do')
        return

    fcs = FlickrContextSim(collection, rootpath=rootpath)
    vob = fcs.vob
    printStatus(
        INFO, 'expanding tags for %s-%s -> %s' %
        (collection, annotationName, resultdir))

    for concept in todo:
        resultfile = os.path.join(resultdir, concept + '.txt')

        ranked = []
        for tag in vob:
            distance = fcs.computeNGD(concept, tag, img=1)
            if not dist_too_large(distance):
                ranked.append((tag, distance))
        ranked.sort(key=lambda item: item[1])

        top_tags = [item[0] for item in ranked[:3]]
        printStatus(INFO, '%s -> %s' % (concept, ' '.join(top_tags)))
        writeRankingResults(ranked, resultfile)


def dist_too_large(distance):
    """Return True when a distance fails the ``< 10`` acceptance test."""
    return not (distance < 10)
Example #9
0
    def __init__(self,
                 collection,
                 annotationName,
                 feature,
                 distance,
                 tpp=DEFAULT_TPP,
                 rootpath=ROOT_PATH):
        """Load concepts, the feature searcher and tag data.

        The image count and feature dimension are read from the first line
        of 'shape.txt' in the feature directory and passed to ``load_model``.
        ``distance`` is only used in the status message here.
        """
        self.rootpath = rootpath
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(
            zip(self.concepts, range(self.nr_of_concepts)))

        featuredir = os.path.join(rootpath, collection, 'FeatureData', feature)
        # Fix: the path was built from `feat_dir`, a name never defined in
        # this method (NameError); the feature directory is `featuredir`.
        shape_file = os.path.join(featuredir, 'shape.txt')
        # Close the handle deterministically instead of leaking it.
        with open(shape_file) as shape_in:
            self.nr_of_images, feat_dim = map(int,
                                              shape_in.readline().split())

        self.searcher = load_model(featuredir,
                                   self.nr_of_images,
                                   feat_dim,
                                   nr_of_segments=512,
                                   segmentk=256,
                                   coarsek=4096)
        self.k = DEFAULT_K
        self._load_tag_data(collection, tpp, rootpath)
        printStatus(
            INFO,
            "%s, %d images, %d unique tags, %s %d neighbours for voting" %
            (self.__class__.__name__, self.nr_of_images, len(
                self.tag2freq), distance, self.k))
Example #10
0
    def __init__(self,
                 collection,
                 annotationName,
                 feature,
                 distance,
                 tpp=DEFAULT_TPP,
                 rootpath=ROOT_PATH):
        """Load concepts, a simpleknn searcher and tag data.

        The image count and feature dimension come from the first line of
        'shape.txt' in the feature directory. The searcher is loaded from
        'feature.bin' plus 'id.txt' and configured with ``distance``.
        """
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(
            zip(self.concepts, range(self.nr_of_concepts)))

        feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
        id_file = os.path.join(feat_dir, 'id.txt')
        shape_file = os.path.join(feat_dir, 'shape.txt')
        # Close the handle deterministically instead of leaking it.
        with open(shape_file) as shape_in:
            self.nr_of_images, feat_dim = map(int,
                                              shape_in.readline().split())

        self.searcher = simpleknn.load_model(
            os.path.join(feat_dir, 'feature.bin'), feat_dim, self.nr_of_images,
            id_file)
        self.searcher.set_distance(distance)
        self.k = DEFAULT_K

        self._load_tag_data(collection, tpp, rootpath)

        printStatus(
            INFO,
            "%s, %d images, %d unique tags, %s %d neighbours for voting" %
            (self.__class__.__name__, self.nr_of_images, len(
                self.tag2freq), distance, self.k))
Example #11
0
def process(options, collection, annotationName, pos_num):
    """Create pos_bag_num x neg_bag_num randomly sampled annotation sets.

    Three things are written:
      1. one concept file per (positive bag, negative bag) pair,
      2. a script file under 'annotationfiles' listing all new annotation names,
      3. per concept and per pair, an annotation file holding a random
         positive bag (at most ``pos_num`` images) and a random negative bag.

    Args:
        options: object providing ``rootpath``, ``overwrite``, ``pos_bag_num``,
            ``neg_bag_num`` and ``neg_pos_ratio``.
        collection: collection name used to build paths.
        annotationName: source annotation name; must end with '.txt'.
        pos_num: maximum number of positives per bag.

    Returns:
        0 when every concept file was skipped by ``checkToSkip``, else None.
    """
    assert(annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    # Template with two remaining '%d' slots: positive-bag and negative-bag index.
    annotationNameStr = annotationName[:-4] + ('.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)

    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath,collection,'Annotations',newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts,resultfile)

    first,second,last = annotationNameStr.split('%d')
    scriptfile = os.path.join(rootpath,collection,'annotationfiles',first + '0-%d'%(pos_bag_num-1) + second + '0-%d'%(neg_bag_num-1) + last)

    makedirsforfile(scriptfile)
    # The context manager closes the file even if the write fails.
    with open(scriptfile,'w') as fout:
        fout.write('\n'.join(newAnnotationNames) + '\n')

    if len(newAnnotationNames) == skip:
        return 0

    for concept in concepts:
        names,labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        positivePool = [name for name, label in zip(names, labels) if label > 0]
        negativePool = [name for name, label in zip(names, labels) if label < 0]

        for idxp in range(pos_bag_num):
            # One positive bag is drawn per idxp and shared by all its negative bags.
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool
            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath,collection,'Annotations','Image',newAnnotationName,'%s.txt'%concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue
                # At least 1000 negatives are requested, capped by the pool size.
                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)

                assert(len(set(positiveBag).intersection(set(negativeBag))) == 0)
                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept,anno_idx,len(positiveBag),len(negativeBag),resultfile))
                writeAnnotations(positiveBag + negativeBag, [1]*len(positiveBag) + [-1]*len(negativeBag), resultfile)
Example #12
0
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    """Predict tag votes for this job's test images and write one line per image.

    The test image set is split by position: image ``i`` belongs to this job
    when ``i % numjobs + 1 == job``. Features are streamed from the test
    collection's feature directory; each selected image is scored with a
    ``ModelArray`` and written as '<id> <tag> <vote> ...'. When
    ``options.topk`` is positive, only the first ``topk`` entries are kept.

    Returns:
        0 when ``checkToSkip`` reports the result should be skipped, otherwise
        the number of images written.
    """
    assert(modelName.startswith('fastlinear'))

    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    topk = options.topk

    outputName = '%s,%s' % (feature,modelName)

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs>1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    # The result is not used below; the call is kept so that any error it
    # raises still happens before work starts, as before.
    readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = set(im for i, im in enumerate(test_imset) if i%numjobs+1 == job)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,nr_of_test_images,resultfile))

    ma = ModelArray(trainCollection, trainAnnotationName, feature, modelName, rootpath=rootpath)

    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)

    done = 0

    # Both handles are now released even when prediction or writing fails.
    with open(resultfile, "w") as fw:
        feat_file.open()
        try:
            for _id, _vec in feat_file:
                if _id not in test_imset:
                    continue

                res = ma.predict([_vec],prob=0)
                tagvotes = res[0]
                if topk>0:
                    tagvotes = tagvotes[:topk]
                newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote,6)) for (tag,vote) in tagvotes]))
                fw.write(newline)
                done += 1
                if done % 1e4  == 0:
                    printStatus(INFO, "%d done" % done)
        finally:
            feat_file.close()

    printStatus(INFO, "%d done" % (done))
    return done
Example #13
0
def process(options, trainCollection, baseAnnotationName, startAnnotationName, feature, modelName):
    """Train and save one model per concept with ``NegativeBootstrap.learn``.

    The training backend ('fik' or 'fastlinear') is imported into module
    globals (``train_model``, ``compress_model``, ``save_model``). Concepts
    are split across jobs by position; each active concept whose model file
    is not skipped by ``checkToSkip`` is trained and saved under
    Models/<new annotation>/<feature>/<new model name>.
    """
    global train_model, compress_model, save_model
    assert(modelName in ['fik', 'fastlinear'])
    if 'fik' == modelName:
        from model_based.svms.fiksvm.svmutil import svm_train as train_model
        from model_based.svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from model_based.svms.fiksvm.fiksvm import fiksvm_save_model as save_model
    else:
        from model_based.svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from model_based.svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from model_based.svms.fastlinear.fastlinear import fastlinear_save_model as save_model


    rootpath = options.rootpath
    overwrite = options.overwrite
    params = {'rootpath': rootpath, 'trainCollection': trainCollection, 'baseAnnotationName': baseAnnotationName,
              'startAnnotationName': startAnnotationName, 'feature': feature, 'model': modelName, 'strategy': options.strategy,
              'iterations': options.iterations, 'npr': options.npr, 'nr_bins': options.nr_bins}

    concepts = readConcepts(trainCollection, startAnnotationName, rootpath)
    newAnnotationName = get_new_annotation_name(params)
    newModelName = get_model_name(params)
    modeldir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, newModelName)
    # NOTE(review): this tests for '<concept>.txt' although models are saved
    # as '<concept>.model' below. Left as is, because changing it would alter
    # how concepts are split across jobs -- confirm the intent.
    todo = [concept for concept in concepts if overwrite or not os.path.exists(os.path.join(modeldir,'%s.txt'%concept))]
    activeConcepts = [concept for i, concept in enumerate(todo) if (i%options.numjobs+1) == options.job]

    params['feat_file'] = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))

    if 'fik' == modelName:
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # list(...) so the values can be read more than once on Python 3,
            # where map() returns a one-shot iterator.
            params['min_vals'] = list(map(float, str.split(f.readline())))
            params['max_vals'] = list(map(float, str.split(f.readline())))


    s_time = time.time()

    for concept in activeConcepts:
        printStatus(INFO, 'processing %s' % concept)
        modelfile = os.path.join(modeldir, '%s.model'%concept)
        if checkToSkip(modelfile, overwrite):
            continue
        new_model = NegativeBootstrap.learn(concept, params)
        makedirsforfile(modelfile)
        printStatus(INFO, 'save model to %s' % modelfile)
        save_model(modelfile, new_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(activeConcepts), ' '.join(activeConcepts)))
    printStatus(INFO, 'models stored at %s' % modeldir)
    printStatus(INFO, '%g seconds in total' % timecost)
Example #14
0
def process(options, testCollection, annotationName, tagvotefile):
    """Turn an '<id> <tag> <score> ...' tagvote file into per-concept rankings.

    Only concepts whose result file is not skipped by ``checkToSkip`` are
    processed. When ``options.tagged`` is truthy, an image counts for a
    concept only if it is in that concept's labeled image set. Each ranking
    is sorted by descending score and written to '<concept>.txt' in the
    result directory.

    Returns:
        0 when there is nothing to do, otherwise None.
    """
    rootpath = options.rootpath
    tpp = options.tpp
    tagged = options.tagged
    overwrite = options.overwrite

    resultdir = generate_result_dir(options, testCollection, tagvotefile)

    concepts = readConcepts(testCollection, annotationName, rootpath)
    todo = []
    for concept in concepts:
        resfile = os.path.join(resultdir, '%s.txt'%concept)
        if checkToSkip(resfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        print ('nothing to do')
        return 0

    nr_of_concepts = len(todo)
    labeled_set = [None] * nr_of_concepts
    if tagged:
        for i in range(nr_of_concepts):
            labeled_set[i] = set(readLabeledImageSet(testCollection, todo[i], tpp, rootpath))

    concept2index = dict(zip(todo, range(nr_of_concepts)))
    ranklists = [[] for i in range(nr_of_concepts)]

    # The context manager closes the tagvote file, which was previously leaked.
    with open(tagvotefile) as fin:
        for line in fin:
            elems = line.strip().split()
            imageid = elems[0]
            del elems[0]
            # The remaining fields must come in (tag, score) pairs.
            assert(len(elems)%2==0)

            for i in range(0, len(elems), 2):
                tag = elems[i]
                c = concept2index.get(tag, -1)
                if c >= 0:
                    if tagged and imageid not in labeled_set[c]:
                        continue
                    score = float(elems[i+1])
                    ranklists[c].append((imageid,score))

    for i in range(nr_of_concepts):
        concept = todo[i]
        resfile = os.path.join(resultdir, '%s.txt'%concept)
        ranklist = sorted(ranklists[i], key=lambda v:v[1], reverse=True)
        print ('%s %d -> %s' % (concept, len(ranklist), resfile))
        writeRankingResults(ranklist, resfile)
Example #15
0
def submit(searchers, collection,annotationName, rootpath=ROOT_PATH, overwrite=0):
    """Run every searcher on every concept and write the ranking files.

    The result for a (concept, searcher) pair is written to '<concept>.txt'
    in the searcher's output directory, unless ``checkToSkip`` says to skip it.
    """
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_runs = len(searchers)

    for concept in concepts:
        filename = concept + ".txt"
        for run_index in range(nr_of_runs):
            searcher = searchers[run_index]
            resultfile = os.path.join(searcher.getOutputdir(), filename)
            if not checkToSkip(resultfile, overwrite):
                searchresults = searcher.scoreCollection(concept)
                summary = (searcher.name, concept, len(searchresults), resultfile)
                print ("%s: %s %d -> %s" % summary)
                writeRankingResults(searchresults, resultfile)

    caller = '%s.submit'%os.path.basename(__file__)
    printStatus(caller, "done")
Example #16
0
 def __init__(self, testCollection, trainCollection, annotationName, rootpath=ROOT_PATH):
     """Load concepts, the tag base and the concept-rank base, then set defaults."""
     class_name = self.__class__.__name__
     self.name = '%s-%s-%s' % (class_name, trainCollection, annotationName)

     # Concept vocabulary and its name -> position lookup.
     self.concepts = readConcepts(trainCollection, annotationName, rootpath)
     self.concept_num = len(self.concepts)
     self.concept2index = dict(
         (concept, position) for position, concept in enumerate(self.concepts))

     self.tbase = TagBase(trainCollection, tpp='lemm', rootpath=rootpath)
     rank_file = os.path.join(
         rootpath, trainCollection, 'TextData',
         'tag.concept-rank.%s.pkl' % annotationName)
     self.rbase = ConceptRankBase(rank_file)
     self.DEFAULT_RANK = self.tbase.tag_num()

     # Tunable parameters, initialised from module-level defaults.
     self.m = DEFAULT_M
     self.k_r = DEFAULT_KR
     self.k_s = DEFAULT_KS
     self.k_d = DEFAULT_KD
     self.normalize = True
     self.add_bonus = False
Example #17
0
    def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
        """Load concepts, the feature searcher and tag data.

        The image count and feature dimension are read from the first line
        of 'shape.txt' in the feature directory and passed to ``load_model``.
        ``distance`` is only used in the status message here.
        """
        self.rootpath = rootpath
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))

        featuredir = os.path.join(rootpath,collection,'FeatureData',feature)
        # Fix: the path was built from `feat_dir`, a name never defined in
        # this method (NameError); the feature directory is `featuredir`.
        shape_file = os.path.join(featuredir, 'shape.txt')
        # Close the handle deterministically instead of leaking it.
        with open(shape_file) as shape_in:
            self.nr_of_images, feat_dim = map(int, shape_in.readline().split())

        self.searcher = load_model(featuredir, self.nr_of_images, feat_dim,nr_of_segments=512,segmentk=256,coarsek=4096)
        self.k = DEFAULT_K
        self._load_tag_data(collection, tpp, rootpath)
        printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance,  self.k))
Example #18
0
 def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
     """Load concepts, the image set and tag data, and record voting settings.

     ``knndir`` is a path relative to the collection name, formed from
     ``feature`` and ``distance``.
     """
     self.rootpath = rootpath

     # Concept vocabulary and its name -> position lookup.
     self.concepts = readConcepts(collection, annotationName, rootpath)
     self.nr_of_concepts = len(self.concepts)
     self.concept2index = dict(
         (concept, position) for position, concept in enumerate(self.concepts))

     self.imset = readImageSet(collection, collection, rootpath)
     self.nr_of_images = len(self.imset)

     knn_name = '%s,%sknn,uu,1500' % (feature, distance)
     self.knndir = os.path.join(collection, knn_name)

     self.k = DEFAULT_K
     self.noise = 0

     self._load_tag_data(collection, tpp, rootpath)

     summary = (self.__class__.__name__, self.nr_of_images,
                len(self.tag2freq), distance, self.k)
     printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % summary)
Example #19
0
def process(options, workingCollection, annotationName, outputpkl):
    """Score images by the position of each concept in their tag list, then pickle.

    Each line of 'TextData/id.userid.lemmtags.txt' is split on tabs into an
    image id, a middle field that is ignored, and a space-separated tag list.
    A tag that is a concept and sits at 0-based position ``p`` gives that
    concept the score ``1 / (1 + p)``; all other cells stay 0. Rows follow
    the order of lines in the tag file.

    Args:
        options: object providing ``rootpath``, ``overwrite`` and ``random``.
        workingCollection: collection name used to build paths.
        annotationName: annotation name passed to ``readConcepts``.
        outputpkl: path of the pickle file to write.

    Returns:
        0 when ``checkToSkip`` reports the result should be skipped, else None.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    add_noise = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    # The matrix is sized from the image set; rows are filled in tag-file order.
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    id_images = []
    tag2idx = dict((concept, col) for col, concept in enumerate(concepts))
    tagfile = os.path.join(rootpath, workingCollection, 'TextData',
                           'id.userid.lemmtags.txt')
    with open(tagfile) as f:
        for cnt, line in enumerate(f):
            id_img, _, tags = line.split('\t')
            for position, tag in enumerate(tags.split()):
                col = tag2idx.get(tag)
                # Fix: tags that are not concepts used to map to index -1,
                # which numpy treats as the LAST column, so they overwrote the
                # last concept's score. They are now ignored.
                if col is not None:
                    tagmatrix[cnt, col] = 1. / (1. + position)

            id_images.append(id_img)

    # random rank for untagged images
    if add_noise:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(
            tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # pickle.HIGHEST_PROTOCOL is a binary protocol, so open the file as 'wb'.
    # list(...) keeps the ids picklable on Python 3, where map() is lazy.
    with open(resultfile, 'wb') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': list(map(int, id_images)),
                'scores': tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
Example #20
0
    def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
        """Load concepts, a simpleknn searcher and tag data.

        The image count and feature dimension come from the first line of
        'shape.txt' in the feature directory. The searcher is loaded from
        'feature.bin' plus 'id.txt' and configured with ``distance``.
        """
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))

        feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
        id_file = os.path.join(feat_dir, 'id.txt')
        shape_file = os.path.join(feat_dir, 'shape.txt')
        # Close the handle deterministically instead of leaking it.
        with open(shape_file) as shape_in:
            self.nr_of_images, feat_dim = map(int, shape_in.readline().split())

        self.searcher = simpleknn.load_model(os.path.join(feat_dir, 'feature.bin'), feat_dim, self.nr_of_images, id_file)
        self.searcher.set_distance(distance)
        self.k = DEFAULT_K

        self._load_tag_data(collection, tpp, rootpath)

        printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))
Example #21
0
def process(options, workingCollection, annotationName, outputpkl):
    """Pickle a uniformly random (images x concepts) score matrix.

    Args:
        options: object providing ``rootpath`` and ``overwrite``.
        workingCollection: collection name used to read concepts and images.
        annotationName: annotation name passed to ``readConcepts``.
        outputpkl: path of the pickle file to write.

    Returns:
        0 when ``checkToSkip`` reports the result should be skipped, else None.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # pickle.HIGHEST_PROTOCOL is a binary protocol, so open the file as 'wb'.
    # list(...) keeps the ids picklable on Python 3, where map() is lazy.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images':list(map(int, id_images)), 'scores':tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example #22
0
def process(options, collection, annotationName, runfile, newRunName):
    """Fuse several runs into a new run by a weighted sum of their scores.

    ``runfile`` lists one '<weight> <run>' pair per line; blank lines and
    lines starting with '#' are ignored. For each concept, the image names
    come from the first run's ranking, the score table is built with
    ``readImageScoreTable``, and the fused ranking is written to
    <simdir>/<newRunName>/<concept>.txt sorted by descending (score, name).
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    dataset = options.testset if options.testset else collection

    concepts = readConcepts(collection, annotationName, rootpath)
    simdir = os.path.join(rootpath, collection, "SimilarityIndex", dataset)

    # The context manager closes the run file, which was previously leaked.
    with open(runfile) as fin:
        data = [x.strip() for x in fin if x.strip() and not x.strip().startswith("#")]
    models = []
    for line in data:
        weight,run = str.split(line)
        models.append((run, float(weight), 1))

    for concept in concepts:
        resultfile = os.path.join(simdir, newRunName, concept + ".txt")
        if checkToSkip(resultfile, overwrite):
            continue

        # The first listed run defines which images are fused.
        scorefile = os.path.join(simdir, models[0][0], concept + ".txt")
        if not os.path.exists(scorefile):
            print ("%s does not exist. skip" % scorefile)
            continue

        ranklist = readRankingResults(scorefile)
        names = sorted([x[0] for x in ranklist])

        nr_of_images = len(names)
        name2index = dict(zip(names, range(nr_of_images)))

        print ('%s %d' % (concept, nr_of_images))

        scoreTable = readImageScoreTable(concept, name2index, simdir, models, torank=options.torank)
        assert(scoreTable.shape[1] == nr_of_images)

        weights = [model[1] for model in models]

        # (1 x runs) times (runs x images) gives one fused score per image.
        scores = np.matrix(weights) * scoreTable
        scores = [float(scores[0,k]) for k in range(nr_of_images)]

        newranklist = [(names[i], scores[i]) for i in range(nr_of_images)]
        newranklist.sort(key=lambda v:(v[1],v[0]), reverse=True)

        writeRankingResults(newranklist, resultfile)
Example #23
0
def process(options, collection, annotationName, runfile, newRunName):
    """Combine the rankings of several runs into one weighted-sum ranking.

    Each non-blank, non-``#`` line of ``runfile`` must hold ``<weight> <run>``.
    For every concept, images listed in the first run's ranking are scored by
    the weighted sum of the per-run scores returned by
    ``readImageScoreTable``; the result is written to
    ``<simdir>/<newRunName>/<concept>.txt``. Concepts whose result file is
    skipped by ``checkToSkip`` or whose first-run ranking is missing are
    left untouched.

    Args:
        options: provides ``rootpath``, ``overwrite``, ``testset`` and
            ``torank``.
        collection: collection holding the concepts and the similarity index.
        annotationName: concept list to read.
        runfile: path of the weight/run specification file.
        newRunName: name of the sub-directory receiving the fused rankings.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    dataset = options.testset if options.testset else collection

    concepts = readConcepts(collection, annotationName, rootpath)
    simdir = os.path.join(rootpath, collection, "SimilarityIndex", dataset)

    # Use a context manager so the run file is always closed.
    with open(runfile) as fin:
        data = [x.strip() for x in fin if x.strip() and not x.strip().startswith("#")]

    models = []
    for line in data:
        weight, run = line.split()
        models.append((run, float(weight), 1))
    # Loop-invariant: the per-run weights are the same for every concept.
    weights = [model[1] for model in models]

    for concept in concepts:
        resultfile = os.path.join(simdir, newRunName, concept + ".txt")
        if checkToSkip(resultfile, overwrite):
            continue

        # The image list for a concept comes from the first run only.
        scorefile = os.path.join(simdir, models[0][0], concept + ".txt")
        if not os.path.exists(scorefile):
            print("%s does not exist. skip" % scorefile)
            continue

        ranklist = readRankingResults(scorefile)
        names = sorted([x[0] for x in ranklist])

        nr_of_images = len(names)
        name2index = dict(zip(names, range(nr_of_images)))

        print("%s %d" % (concept, nr_of_images))

        scoreTable = readImageScoreTable(concept, name2index, simdir, models, torank=options.torank)
        assert scoreTable.shape[1] == nr_of_images

        # Row vector of weights times the score table gives one value per image.
        scores = np.matrix(weights) * scoreTable
        scores = [float(scores[0, k]) for k in range(nr_of_images)]

        newranklist = [(names[i], scores[i]) for i in range(nr_of_images)]
        # Highest score first; equal scores are ordered by name, descending.
        newranklist.sort(key=lambda v: (v[1], v[0]), reverse=True)

        writeRankingResults(newranklist, resultfile)
Example #24
0
 def __init__(self, trainCollection, trainAnnotationName, feature, modelName, rootpath=ROOT_PATH):
     """Load all per-concept fastlinear models into two dense arrays.

     After construction ``self.W`` is a ``feat_dim x nr_of_concepts`` matrix
     whose i-th column is the weight vector of concept i, and ``self.AB`` is
     a ``2 x nr_of_concepts`` matrix holding each model's ``get_probAB()``
     pair.

     Args:
         trainCollection: collection the models were trained on.
         trainAnnotationName: concept list the models belong to.
         feature: feature name, part of the model directory path.
         modelName: model directory name; must start with ``fastlinear``.
         rootpath: data root directory.
     """
     assert(modelName.startswith('fastlinear')), modelName
     self.concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
     self.nr_of_concepts = len(self.concepts)
     modeldir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)
     # Peek at the first model only to learn the feature dimensionality.
     model = load_model(os.path.join(modeldir, self.concepts[0]+'.model'))
     self.feat_dim = model.get_feat_dim()

     self.W = np.zeros((self.feat_dim, self.nr_of_concepts))
     self.AB = np.zeros((2, self.nr_of_concepts))
     for i in range(self.nr_of_concepts):
         model_file_name = os.path.join(modeldir, "%s.model" % self.concepts[i])
         model = load_model(model_file_name)
         self.W[:,i] = model.get_w()
         [A,B] = model.get_probAB()
         # A (near-)zero A is replaced by the fixed pair [-1, 0].
         # NOTE(review): presumably A/B are sigmoid calibration parameters
         # and [-1, 0] is the "uncalibrated" default -- confirm.
         self.AB[:,i] = [A,B] if abs(A)>1e-8 else [-1,0]
         printStatus(INFO, '%s, A=%g, B=%g' % (self.concepts[i], A, B))
     # Report collection, annotation and feature; the original formatted
     # trainCollection twice and never showed the annotation name.
     printStatus(INFO, '%s-%s-%s -> %dx%d ModelArray' % (trainCollection,trainAnnotationName,feature,self.feat_dim,self.nr_of_concepts))
Example #25
0
def process(options, trainCollection, annotationName):
    """Build and pickle the tag-to-concept co-occurrence rank matrix.

    For every tag in the ``TagCooccurBase`` vocabulary, the full
    co-occurrence ranking is scanned and the 1-based position of each
    concept is stored in ``rank_matrix[tag_index, concept_index]``. Concepts
    that never show up in a tag's ranking keep the default rank, which is
    the vocabulary size. The result is dumped to
    ``<rootpath>/<trainCollection>/TextData/tag.concept-rank.<annotationName>.pkl``
    as a dict with keys ``tags``, ``concepts`` and ``rank_matrix``.

    Args:
        options: provides ``rootpath`` and ``overwrite``.
        trainCollection: collection whose tag statistics are used.
        annotationName: concept list to rank.

    Returns:
        0 when the result file is skipped by ``checkToSkip``, otherwise None.
    """
    # cPickle only exists on Python 2; fall back to the stdlib module.
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, trainCollection, 'TextData', 'tag.concept-rank.%s.pkl' % annotationName)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    concept_num = len(concepts)
    concept2index = dict(zip(concepts, range(concept_num)))
    tcb = TagCooccurBase(trainCollection, rootpath=rootpath)
    tag_num = tcb.tag_num()
    DEFAULT_RANK = tag_num
    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy; the builtin yields the same dtype.
    rank_matrix = np.zeros((tag_num, concept_num), dtype=int) + DEFAULT_RANK
    tag_list = []

    for i,u in enumerate(tcb.vob):
        ranklist = tcb.top_cooccur(u,-1)

        hit = 0
        for j,x in enumerate(ranklist):
            idx = concept2index.get(x[0], -1)
            if idx>=0:
                rank_matrix[i,idx] = j+1
                hit += 1
                # Every concept has been placed; no need to scan further.
                if hit == concept_num:
                    break
        tag_list.append(u)

        if (i+1) % 10000 == 0:
            printStatus(INFO, '%d done' % (i+1) )

    assert(len(tag_list) == tag_num)

    makedirsforfile(resultfile)
    # The context manager closes the file even if pickling fails.
    with open(resultfile, 'wb') as output:
        pickle.dump({'tags':tag_list, 'concepts':concepts, 'rank_matrix':rank_matrix}, output, -1)
    printStatus(INFO, '%dx%d dumped to %s' % (tag_num, concept_num, resultfile))
def submit(searchers,
           collection,
           annotationName,
           rootpath=ROOT_PATH,
           overwrite=0):
    """Run every searcher on every concept and store the rankings.

    For each (concept, searcher) pair the ranking returned by
    ``scoreCollection`` is written to ``<searcher output dir>/<concept>.txt``
    unless ``checkToSkip`` decides the file must be left alone.

    Args:
        searchers: sequence of objects offering ``name``, ``getOutputdir()``
            and ``scoreCollection(concept)``.
        collection: collection whose concept list is read.
        annotationName: concept list to read.
        rootpath: data root directory.
        overwrite: forwarded to ``checkToSkip``.
    """
    concept_list = readConcepts(collection, annotationName, rootpath=rootpath)

    for concept in concept_list:
        filename = concept + ".txt"
        for searcher in searchers:
            resultfile = os.path.join(searcher.getOutputdir(), filename)
            if not checkToSkip(resultfile, overwrite):
                ranking = searcher.scoreCollection(concept)
                print("%s: %s %d -> %s" %
                      (searcher.name, concept, len(ranking), resultfile))
                writeRankingResults(ranking, resultfile)

    script_name = os.path.basename(__file__)
    printStatus('%s.submit' % script_name, "done")
Example #27
0
 def __init__(self,
              testCollection,
              trainCollection,
              annotationName,
              rootpath=ROOT_PATH):
     """Set up the concept list, tag statistics and rank base.

     The instance name is ``<class name>-<trainCollection>-<annotationName>``.
     Concepts come from ``trainCollection``; the rank base is loaded from
     that collection's ``TextData/tag.concept-rank.<annotationName>.pkl``.
     The tuning attributes start from the module-level ``DEFAULT_*``
     constants, with normalisation switched on and the bonus switched off.
     ``testCollection`` is accepted but not used in this constructor.
     """
     name_fields = (self.__class__.__name__, trainCollection, annotationName)
     self.name = '%s-%s-%s' % name_fields

     # Concept vocabulary and its reverse lookup.
     self.concepts = readConcepts(trainCollection, annotationName, rootpath)
     self.concept_num = len(self.concepts)
     self.concept2index = {
         concept: index for index, concept in enumerate(self.concepts)
     }

     # Tag statistics and the precomputed tag -> concept rank matrix.
     rank_file = os.path.join(rootpath, trainCollection, 'TextData',
                              'tag.concept-rank.%s.pkl' % annotationName)
     self.tbase = TagBase(trainCollection, tpp='lemm', rootpath=rootpath)
     self.rbase = ConceptRankBase(rank_file)
     self.DEFAULT_RANK = self.tbase.tag_num()

     # Tunable parameters and behaviour switches.
     self.m, self.k_r = DEFAULT_M, DEFAULT_KR
     self.k_s, self.k_d = DEFAULT_KS, DEFAULT_KD
     self.normalize, self.add_bonus = True, False
Example #28
0
def process(options, workingCollection, annotationName, outputpkl):
    """Write a random-score baseline for a collection as a pickle file.

    A uniform random score in [0, 1) is drawn for every (image, concept)
    pair and stored, together with the concept list and the integer image
    ids, in ``outputpkl`` as a dict with keys ``concepts``, ``id_images``
    and ``scores``.

    Args:
        options: provides ``rootpath`` and ``overwrite``.
        workingCollection: collection whose images and concepts are used.
        annotationName: concept list to read.
        outputpkl: path of the pickle file to create.

    Returns:
        0 when the result file is skipped by ``checkToSkip``, otherwise None.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
    # binary mode (text mode fails on Python 3 and corrupts on Windows).
    with open(resultfile, 'wb') as f:
        pickle.dump(
            {
                'concepts': concepts,
                # A real list: on Python 3 ``map`` returns an iterator,
                # which cannot be pickled.
                'id_images': [int(x) for x in id_images],
                'scores': tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
Example #29
0
def process(options, workingCollection, annotationName, outputpkl):
    """Score concepts by the position of the matching user tag.

    Reads ``TextData/id.userid.lemmtags.txt`` (tab-separated: image id, user
    id, space-separated tags). A tag at 0-based position ``p`` that is also a
    concept gives that concept the score ``1 / (1 + p)`` for the image; all
    other entries stay 0. With ``options.random`` set, small random noise
    (scaled by the smallest positive score) is added so that otherwise tied
    entries get a random order. The result is pickled to ``outputpkl`` as a
    dict with keys ``concepts``, ``id_images`` and ``scores``.

    Args:
        options: provides ``rootpath``, ``overwrite`` and ``random``.
        workingCollection: collection whose tags and concepts are used.
        annotationName: concept list to read.
        outputpkl: path of the pickle file to create.

    Returns:
        0 when the result file is skipped by ``checkToSkip``, otherwise None.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    add_noise = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    # The image set only fixes the number of rows; the ids stored in the
    # result are the ones found in the tag file, in file order.
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    id_images = []
    tag2idx = dict(zip(concepts, range(len(concepts))))
    with open(os.path.join(rootpath, workingCollection, 'TextData', 'id.userid.lemmtags.txt')) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            # Keep only tags that are concepts. The original mapped unknown
            # tags to index -1, which silently wrote their scores into the
            # LAST concept's column.
            hits = [(tag2idx[tag], pos)
                    for pos, tag in enumerate(tags.split())
                    if tag in tag2idx]
            if hits:
                idx = np.array([h[0] for h in hits])
                vals = 1. / (1. + np.array([h[1] for h in hits]))
                # For a repeated tag the last occurrence wins.
                tagmatrix[cnt, idx] = vals

            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if add_noise:
        positive = tagmatrix[tagmatrix > 0]
        # Without any positive score np.min would raise; fall back to a
        # unit scale so the ranking is still randomised.
        scale = positive.min() if positive.size else 1.0
        tagmatrix += scale * np.random.rand(tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # Binary pickle protocol needs a binary file; ids are materialised as a
    # list because a Python 3 ``map`` object cannot be pickled.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images':[int(x) for x in id_images], 'scores':tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example #30
0
def evaluateSearchEngines(searchers, collection, annotationName, metric, rootpath=ROOT_PATH):
    """Score every searcher on every concept and print a result table.

    For each concept the ground-truth labels are read, each searcher's
    ranking is reduced to the labels of the annotated images (in ranked
    order) and scored with the scorer selected by ``metric``. One line per
    concept and a final ``mean<metric>`` line are printed.

    Args:
        searchers: sequence of objects offering ``scoreCollection(concept)``,
            which returns ``(name, score)`` pairs.
        collection: collection providing concepts and annotations.
        annotationName: concept list / annotation set to read.
        metric: metric name passed to ``getScorer``.
        rootpath: data root directory.

    Returns:
        ``(concepts, results)`` where ``results`` is a
        ``nr_of_concepts x nr_of_runs`` array of scores.
    """
    scorer = getScorer(metric)
    concepts = readConcepts(collection, annotationName, rootpath)

    nr_of_runs = len(searchers)
    nr_of_concepts = len(concepts)
    results = np.zeros((nr_of_concepts,nr_of_runs))

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], rootpath)
        name2label = dict(zip(names,labels))

        for j in range(nr_of_runs):
            searchresults = searchers[j].scoreCollection(concepts[i])
            # Images without a ground-truth label are ignored.
            sorted_labels = [name2label[name] for (name,score) in searchresults if name in name2label]
            results[i,j] = scorer.score(sorted_labels)

    # Single-argument print(...) emits the same text on Python 2 and 3,
    # unlike the print statement, which is a syntax error on Python 3.
    for i in range(nr_of_concepts):
        print('%s %s' % (concepts[i], ' '.join([niceNumber(x,3) for x in results[i,:]])))
    mean_perf = results.mean(0)
    print('mean%s %s' % (metric, ' '.join([niceNumber(x,3) for x in mean_perf])))

    return concepts,results
Example #31
0
def process(options, testCollection, trainCollection, annotationName, feature, outputpkl):
    """Learn a TagProp model in MATLAB, predict, and pickle the scores.

    A single MATLAB script is assembled: the first part learns the model
    from the training tag matrix and neighbour file and saves it, the second
    part loads it, predicts on the test neighbours and saves everything to
    ``prediction.mat``. The script is run once through ``call_matlab``. The
    prediction matrix ``P`` is then read back with h5py, its columns are
    reordered with ``getVocabMap`` to follow the test concepts, and the
    result is pickled to ``outputpkl`` as a dict with keys ``concepts``,
    ``id_images`` and ``scores``.

    The process exits with status 1 when a required input file is missing.

    Args:
        options: provides ``rootpath``, ``k``, ``distance``, ``variant``,
            ``overwrite`` and ``trainmodel``.
        testCollection: collection to predict on.
        trainCollection: collection the model is learnt from.
        annotationName: concept list of the test collection.
        feature: feature name, part of the neighbour-file paths.
        outputpkl: path of the pickle file to create.
    """
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    variant = options.variant
    overwrite = options.overwrite
    testset = testCollection
    forcetrainmodel = options.trainmodel
    modelName = "tagprop"
    nnName = distance + "knn"

    printStatus(INFO, "Starting TagProp %s,%s,%s,%s,%s" % (variant, trainCollection, testCollection, annotationName, feature))

    resultfile = os.path.join(outputpkl)
    resultfile_tagprop = os.path.join(rootpath, testCollection, 'TagProp-Prediction', testset, trainCollection, annotationName, modelName, '%s,%s,%s,%d'%(feature,nnName,variant,k), 'prediction.mat')
    # NOTE: skipping of existing results is disabled, so ``overwrite`` is
    # currently unused:
    # if checkToSkip(resultfile, overwrite) or checkToSkip(resultfile_tagprop, overwrite):
    #     return 0

    tagmatrix_file = os.path.join(rootpath, trainCollection, 'TextData', 'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, "Tag matrix file not found at %s Did you run wordnet_frequency_tags.py?" % (tagmatrix_file))
        sys.exit(1)

    train_neighs_file = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection, '%s,%s,%d'%(feature,nnName,k), 'nn_train.h5')
    if not os.path.exists(train_neighs_file):
        printStatus(INFO, "Matlab train neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (train_neighs_file))
        sys.exit(1)

    # do we need to perform learning?
    train_model_file = os.path.join(rootpath, trainCollection, 'TagProp-models', '%s,%s,%s,%d'%(feature,nnName,variant,k), 'model.mat')
    # NOTE: reuse of an existing model is disabled (``forcetrainmodel`` is
    # therefore unused); the learning part of the script is always built.
    # The prediction part below appends to ``script`` and relies on that.
    # if os.path.exists(train_model_file) and not forcetrainmodel:
    if False:
        printStatus(INFO, "model for %s available at %s" % (trainCollection, train_model_file))
    else:
        printStatus(INFO, "starting learning model for %s" % (trainCollection))
        makedirsforfile(train_model_file)

        script = """
                tagprop_path = '%s/model_based/tagprop/TagProp/';
                addpath(tagprop_path);
                tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
                tagmatrix = sparse(tagmatrix);
                NN = h5read('%s', '/NN');
                NN = NN(2:end, :);
                NN = double(NN);
        """ % (survey_code, tagmatrix_file, train_neighs_file)

        if variant == 'dist' or variant == 'distsigmoids':
            script += """
                NND = h5read('%s', '/NND');
                NND = NND(2:end, :);
                NND = reshape(NND, 1, size(NND,1), size(NND,2));
                NND = double(NND);
            """ % train_neighs_file

        if variant == 'rank':
            script += """
                m = tagprop_learn(NN,[],tagmatrix);
            """
        elif variant == 'ranksigmoids':
            script += """
                m = tagprop_learn(NN,[],tagmatrix,'sigmoids',true);
            """
        elif variant == 'dist':
            script += """
                m = tagprop_learn(NN,NND,tagmatrix,'type','dist');
            """
        elif variant == 'distsigmoids':
            script += """
                m = tagprop_learn(NN,NND,tagmatrix,'type','dist','sigmoids',true);
            """

        script += """
                save('%s', 'm', '-v7.3');
        """ % train_model_file

        # Learning is not run on its own; it is executed together with the
        # prediction part in the single call_matlab() below.
        # call_matlab(script)

    # we perform prediction
    printStatus(INFO, "starting prediction")
    test_neighs_file = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection, annotationName, '%s,%s,%d'%(feature,nnName,k), 'nn_test.h5')
    if not os.path.exists(test_neighs_file):
        printStatus(INFO, "Matlab test neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (test_neighs_file))
        sys.exit(1)

    script += """
            tagprop_path = '%s/model_based/tagprop/TagProp/';
            addpath(tagprop_path);
            load('%s');
            tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
            tagmatrix = sparse(tagmatrix);
            NNT = h5read('%s', '/NNT');
            NNT = double(NNT);

    """ % (survey_code, train_model_file, tagmatrix_file, test_neighs_file)

    if variant == 'dist' or variant == 'distsigmoids':
        script += """
            NNDT = h5read('%s', '/NNDT');
            NNDT = reshape(NNDT, 1, size(NNDT,1), size(NNDT,2));
            NNDT = double(NNDT);
        """ % test_neighs_file

    # NOTE(review): NNDT is loaded for the distance variants but the
    # prediction call below always passes [] -- confirm against the TagProp
    # API whether NNDT should be passed for 'dist'/'distsigmoids'.
    script += """
            P = tagprop_predict(NNT,[],m)';
            save('%s', '-v7.3');
            exit;
    """ % resultfile_tagprop

    makedirsforfile(resultfile_tagprop)
    call_matlab(script)

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)

    concepts = readConcepts(testCollection, annotationName, rootpath)
    id_images = readImageSet(testCollection, testset, rootpath)
    id_images.sort()

    # concepts mapping; both HDF5 files are closed as soon as the scores
    # have been copied into memory.
    with h5py.File(resultfile_tagprop, 'r') as tagprop_output, \
            h5py.File(tagmatrix_file, 'r') as tagprop_input:
        mapping = getVocabMap(list(tagprop_input['vocab'][:]),concepts)
        final_tagmatrix = tagprop_output['P'][:][:,mapping]

    # HIGHEST_PROTOCOL is binary, so the file must be opened in binary mode.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images':id_images, 'scores':final_tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example #32
0
def _print_concept_table(header, concepts, table, mean_label):
    """Print a banner-framed table: one row per concept, then a mean row."""
    print('#' * 100)
    print(header)
    print('#' * 100)

    for c_idx in range(len(concepts)):
        print('%s %s' % (concepts[c_idx],
                         ' '.join(['%.3f' % x for x in table[:, c_idx]])))
    print('%s %s' % (mean_label,
                     ' '.join(['%.3f' % x for x in table.mean(axis=1)])))


def process(options, collection, annotationName, runfile, outDirectory):
    """Evaluate pickled score matrices per concept and per image.

    ``runfile`` lists one pickle file per line (blank lines and ``#`` lines
    are ignored). Each pickle must hold a dict with ``scores`` (images x
    concepts) and ``id_images``. For every run this computes

    * per concept: AP over all annotated images, plus AP, NDCG@20 and
      NDCG2@20 restricted to images listed in ``tagged,lemm/<concept>.txt``;
    * per image: AP, P@1, whether P@5 exceeds 0.1, and the number of
      relevant concepts.

    The per-image averages are printed, the per-run details are written to
    ``<outDirectory>/<pickle name>.h5``, and four per-concept tables are
    printed at the end.

    Args:
        options: provides ``rootpath``.
        collection: collection providing concepts and annotations.
        annotationName: concept list / annotation set to read.
        runfile: path of the file listing the pickled runs.
        outDirectory: directory receiving one ``.h5`` file per run.
    """
    rootpath = options.rootpath

    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    with open(runfile) as fin:
        datafiles = [
            x.strip() for x in fin
            if x.strip() and not x.strip().startswith('#')
        ]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [set() for i in range(nr_of_concepts)]
    # image id -> set of indices of the concepts relevant to that image
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concepts[i],
                                            skip_0=False,
                                            rootpath=rootpath)
        # Must be a list: ``names`` is iterated twice below and a Python 3
        # ``map`` object would be exhausted after the first pass.
        names = [int(x) for x in names]
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        label_file = os.path.join(rootpath, collection, 'tagged,lemm',
                                  '%s.txt' % concepts[i])
        # Best effort: a missing or malformed file means "no tagged images".
        try:
            with open(label_file) as fin:
                hit_imgset[i] = set(int(x) for x in fin)
        except (IOError, OSError, ValueError):
            hit_imgset[i] = set()
        printStatus(
            INFO, 'readLabeledImageSet for %s-%s -> %d hits' %
            (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('#' * 100)
    print('# method miap hit1 hit5')
    print('#' * 100)

    for run_idx in range(nr_of_runs):
        run_name = os.path.split(datafiles[run_idx])[-1]
        # NOTE: pickle can execute arbitrary code; only load trusted runs.
        with open(datafiles[run_idx], 'rb') as fin:
            data = pickle.load(fin)
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        imset = np.array([int(x) for x in imset])
        # Bring images (and their score rows) into ascending id order.
        idx = np.argsort(imset)
        imset = imset[idx]
        scores = scores[idx]
        nr_of_images = len(imset)

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            # sorted() instead of zip(...).sort(): zip is not a list on
            # Python 3.
            ranklist = sorted(zip(imset, scores[:, c_idx]),
                              key=lambda v: (v[1], str(v[0])),
                              reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert (len(sorted_labels) > 0)

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            # Same ranking, restricted to images tagged with the concept.
            sorted_labels = [
                ground_truth[x[0]] for x in ranklist
                if x[0] in hit_imgset[c_idx]
            ]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        # Per-image view: rank the concepts for each image.
        res = np.zeros((nr_of_images, 4))
        gt = np.zeros((nr_of_images, nr_of_concepts))
        for j in range(nr_of_images):
            ranklist = sorted(zip(range(nr_of_concepts), scores[j, :]),
                              key=lambda v: v[1],
                              reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]

            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            # True when at least one of the top five concepts is relevant.
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j, :] = [ap, hit1, hit5, len(rel_set)]
            gt[j, :] = sorted_labels
        avg_perf = res.mean(axis=0)
        print('%s %s' % (run_name, ' '.join(['%.3f' % x for x in avg_perf])))

        # The context manager closes the HDF5 file even if a write fails.
        with h5py.File(os.path.join(outDirectory, run_name + ".h5"),
                       'w') as outMiap:
            outMiap['iap'] = res[:, 0]
            outMiap['ngt'] = res[:, 3]
            outMiap['hit1'] = res[:, 1]
            outMiap['hit5'] = res[:, 2]
            outMiap['gt'] = gt
            outMiap['concepts'] = concepts
            outMiap['ap'] = ap_table[run_idx, :]
            outMiap['ap2'] = ap2_table[run_idx, :]
            outMiap[ndcg.name()] = ndcg_table[run_idx, :]
            outMiap[ndcg2.name()] = ndcg2_table[run_idx, :]

    run_names = ' '.join([os.path.split(x)[-1] for x in datafiles])
    _print_concept_table('# untagged-concept %s' % run_names, concepts,
                         ap_table, 'meanAP')
    _print_concept_table('# tagged-concept', concepts, ap2_table, 'meanAP2')
    _print_concept_table('# tagged-concept', concepts, ndcg_table,
                         'mean%s' % ndcg.name())
    _print_concept_table('# tagged-concept', concepts, ndcg2_table,
                         'mean%s' % ndcg2.name())
Example #33
0
def process(options, testCollection, trainCollection, trainAnnotationName,
            feature, modelName):
    """Apply all per-concept models to the test images and store tag votes.

    Each output line has the form ``<image id> <tag> <score> <tag> <score>
    ...`` with tags sorted by descending score. Images are processed in
    blocks of ``options.blocksize``. With ``options.numjobs > 1`` only every
    ``numjobs``-th image (selected by ``options.job``, 1-based) is handled
    and the result file gets a ``.<numjobs>.<job>`` suffix.

    Args:
        options: provides ``rootpath``, ``overwrite``, ``prob_output``,
            ``numjobs``, ``job`` and ``blocksize``.
        testCollection: collection whose images are tagged.
        trainCollection: collection the models were trained on.
        trainAnnotationName: concept list the models belong to.
        feature: feature name.
        modelName: model directory name; names starting with ``fik`` select
            the fiksvm loader, everything else the fastlinear loader.

    Returns:
        The number of images written, or 0 when the result file is skipped
        or a model could not be loaded.
    """
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize

    outputName = '%s,%s' % (feature, modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging',
                              testCollection, trainCollection,
                              trainAnnotationName, outputName,
                              'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    # Keep the share of images that belongs to this (1-based) job.
    test_imset = [
        test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job
    ]
    nr_of_test_images = len(test_imset)
    printStatus(
        INFO, "working on %d-%d, %d test images -> %s" %
        (numjobs, job, nr_of_test_images, resultfile))

    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath, trainCollection, 'Models',
                                       trainAnnotationName, feature, modelName,
                                       '%s.model' % concepts[c])
        models[c] = load_model(model_file_name)
        # A model that fails to load aborts the whole job before any
        # output file is created.
        if models[c] is None:
            return 0

    feat_file = BigFile(
        os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)

    read_time = 0
    test_time = 0
    start = 0
    done = 0

    # The context manager closes the result file even when reading or
    # predicting a block raises.
    with open(resultfile, "w") as fw:
        while start < nr_of_test_images:
            end = min(nr_of_test_images, start + blocksize)
            printStatus(INFO,
                        'processing images from %d to %d' % (start, end - 1))

            s_time = time.time()
            renamed, test_X = feat_file.read(test_imset[start:end])
            read_time += time.time() - s_time

            s_time = time.time()
            output = [None] * len(renamed)
            for i in range(len(renamed)):
                if prob_output:
                    scores = [
                        model.predict_probability(test_X[i])
                        for model in models
                    ]
                else:
                    scores = [model.predict(test_X[i]) for model in models]
                tagvotes = sorted(zip(concepts, scores),
                                  key=lambda v: v[1],
                                  reverse=True)
                output[i] = '%s %s\n' % (renamed[i], " ".join([
                    "%s %s" % (tag, niceNumber(vote, 6))
                    for (tag, vote) in tagvotes
                ]))
            test_time += time.time() - s_time
            start = end
            fw.write(''.join(output))
            # Flush per block so partial results survive an interruption.
            fw.flush()
            done += len(output)

        # done
        printStatus(
            INFO, "%d done. read time %g seconds, test_time %g seconds" %
            (done, read_time, test_time))
    return done
Example #34
0
#from svm import *
from fastsvm.svmutil import *
from fastsvm.svm import *
from fiksvm import *
from fiksvmutil import *
from fastsvm.fiksvm import svm_to_fiksvm as svm_to_fiksvm0

if __name__ == "__main__":
    # Ad-hoc driver for the voc2008 train/val collections. The visible code
    # only prepares inputs (concept list, scorer, feature value range and a
    # feature file path); nothing here trains or evaluates a model.
    # NOTE(review): ROOT_PATH, readConcepts, getScorer, BigFile,
    # find_min_max_vals and FEATURE_TO_DIM are not imported by name in this
    # snippet -- presumably they come from the star-imports above; confirm.
    rootpath = ROOT_PATH
    trainCollection = "voc2008train"
    testCollection = "voc2008val"
    annotationName = "conceptsvoc2008train.txt"
    #concept = "aeroplane"
    feature = "dsift"

    # Concepts are read from the validation collection's own concept list;
    # ``annotationName`` above is not used in the visible code.
    concepts = readConcepts(testCollection, 'conceptsvoc2008val.txt')
    scorer = getScorer('AP')

    # Value range computed over the training collection's feature file.
    # NOTE(review): presumably per-dimension minima/maxima -- confirm
    # against find_min_max_vals.
    min_vals, max_vals = find_min_max_vals(
        BigFile(
            os.path.join(rootpath, trainCollection, 'FeatureData', feature),
            FEATURE_TO_DIM[feature]))
    # Text feature file of the test collection.
    featurefile = os.path.join(rootpath, testCollection, "FeatureData",
                               feature, "id.feature.txt")

    # Assigned but not used in the visible code.
    feat_dim = 1024
    num_bins = 50

    #fikmodel.set_probAB(-1, 0)

    #print "fik model0", fikmodel0.get_nr_svs(), fikmodel0.get_feat_dim(), fikmodel0.get_probAB()
Example #35
0
def process(options, collection, annotationName, runfile):
    """Print one ``&``-separated result row per pickled run.

    ``runfile`` lists one pickle file per line (blank lines and ``#`` lines
    are ignored). Each pickle must hold a dict with ``scores`` (images x
    concepts) and ``id_images``. For every image the concepts are ranked by
    score and P@3, P@5, R@3, R@5, NDCG2@3, NDCG2@5, AP and RR are computed;
    recall only feeds the F-scores. The printed row contains the run name
    followed by the mean of P@3, P@5, F@3, F@5, AP and RR (the two NDCG
    columns are computed but not printed).

    The run name is the second comma-separated field of the pickle's base
    name (up to its first dot), so the file name must contain a comma.

    Args:
        options: provides ``rootpath``.
        collection: collection providing concepts and annotations.
        annotationName: concept list / annotation set to read.
        runfile: path of the file listing the pickled runs.
    """
    rootpath = options.rootpath

    # Scorers are named after their actual cut-off.
    p3_scorer = getScorer('P@3')
    p5_scorer = getScorer('P@5')
    r3_scorer = getScorer('R@3')
    r5_scorer = getScorer('R@5')
    ndcg3_scorer = getScorer('NDCG2@3')
    ndcg5_scorer = getScorer('NDCG2@5')
    ap_scorer = getScorer('AP')
    rr_scorer = getScorer('RR')

    with open(runfile) as fin:
        datafiles = [
            x.strip() for x in fin
            if x.strip() and not x.strip().startswith('#')
        ]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    # image name -> set of indices of the concepts relevant to that image.
    # Names are kept exactly as read, so they must match ``id_images`` in
    # the pickles verbatim.
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concepts[i],
                                            skip_0=False,
                                            rootpath=rootpath)
        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

    for run_idx in range(nr_of_runs):
        # NOTE: pickle can execute arbitrary code; only load trusted runs.
        with open(datafiles[run_idx], 'rb') as fin:
            data = pickle.load(fin)
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)

        res = np.zeros((nr_of_images, 8))
        for j in range(nr_of_images):
            # sorted() instead of zip(...).sort(): zip is not a list on
            # Python 3.
            ranklist = sorted(zip(range(nr_of_concepts), scores[j, :]),
                              key=lambda v: v[1],
                              reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            assert len(sorted_labels) == nr_of_concepts
            p3 = p3_scorer.score(sorted_labels)
            p5 = p5_scorer.score(sorted_labels)
            r3 = r3_scorer.score(sorted_labels)
            r5 = r5_scorer.score(sorted_labels)
            ndcg3 = ndcg3_scorer.score(sorted_labels)
            ndcg5 = ndcg5_scorer.score(sorted_labels)
            ap = ap_scorer.score(sorted_labels)
            rr = rr_scorer.score(sorted_labels)

            # Harmonic mean of precision and recall; 0 when both are 0.
            f3, f5 = 0.0, 0.0
            if (p3 + r3) != 0.0:
                f3 = 2 * p3 * r3 / (p3 + r3)
            if (p5 + r5) != 0.0:
                f5 = 2 * p5 * r5 / (p5 + r5)
            # The original first stored the recall values here and then
            # immediately overwrote the row; only this row ever took effect.
            res[j, :] = [p3, p5, f3, f5, ndcg3, ndcg5, ap, rr]
        avg_perf = res.mean(axis=0)
        name = path.basename(datafiles[run_idx]).split('.')[0]
        name = name.split(',')[1]
        stdout.write('%s\t' % name)
        for i in range(len(avg_perf)):
            # Columns 4 and 5 (NDCG) are left out of the printed row.
            if i == 4 or i == 5:
                continue
            x = avg_perf[i]
            if x >= 100.0:
                stdout.write('& %.1f ' % x)
            else:
                # Drop the leading zero: 0.1234 is printed as .1234
                stdout.write('& %s' % (('%.4f ' % x).lstrip('0')))
        stdout.write('\n')
Example #36
0
def process(options, trainCollection, trainAnnotationName, feature):
    """Train one class-weighted linear model per concept and save it.

    Concepts whose model file is skipped by ``checkToSkip`` are left out; of
    the rest, this job handles every ``numjobs``-th concept (selected by the
    1-based ``job``). With ``options.best_param_dir`` set, ``C`` and the
    ``a``/``b`` values passed to ``set_probAB`` are parsed from the first
    line of ``<best_param_dir>/<concept>.txt`` and the model directory is
    ``fastlinear-tuned``; otherwise ``C=1, A=0, B=0`` and the directory is
    ``fastlinear``.

    Args:
        options: provides ``rootpath``, ``best_param_dir``, ``overwrite``,
            ``numjobs`` and ``job``.
        trainCollection: collection to train on.
        trainAnnotationName: concept list / annotation set to read.
        feature: feature name.

    Returns:
        The number of concepts handled by this job (0 if none).

    Raises:
        ValueError: if a parameter file's first line does not match the
            expected ``best_C=..., a=..., b=...`` pattern.
    """
    import re
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    best_param_dir = options.best_param_dir
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    # Share of the total class weight given to the positive class.
    beta = 0.5

    modelName = 'fastlinear'
    if best_param_dir:
        modelName += '-tuned'
    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i%numjobs==(job-1)]
    if not todo:
        return 0

    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))

    feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            with open(param_file) as fin:
                first_line = fin.readline().strip()
            m = p.search(first_line)
            if m is None:
                # Fail with a clear message instead of an AttributeError
                # on ``None.group``.
                raise ValueError('cannot parse best parameters from %s: %r'
                                 % (param_file, first_line))
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        # Class sizes are counted over all annotated names (not only those
        # with a feature vector). Named nr_pos/nr_neg so the conventional
        # ``np`` numpy alias is not shadowed.
        nr_pos = len([1 for lab in labels if  1 == lab])
        nr_neg = len([1 for lab in labels if  -1== lab])
        # Weight each class inversely to its size; a concept without
        # positives or without negatives raises ZeroDivisionError here.
        wp = float(beta) * (nr_pos+nr_neg) / nr_pos
        wn = (1.0-beta) * (nr_pos+nr_neg) / nr_neg

        # no bias term added by setting "-B -1"
        svm_params = '-w1 %g -w-1 %g -s 2 -B -1 -q' % (wp*C, wn*C)
        model = liblinear_train(y, vectors, svm_params)
        newmodel = liblinear_to_fastlinear([model], [1.0], feat_file.ndims)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s'%model_file_name)
        fastlinear_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        # NOTE(review): the reloaded model is discarded and the asserts
        # inspect the in-memory ``newmodel``, so this only verifies that
        # loading does not raise -- confirm whether the reloaded values
        # were meant to be compared.
        fastlinear_load_model(model_file_name)
        assert(abs(newmodel.get_probAB()[0]-A)<1e-6)
        assert(abs(newmodel.get_probAB()[1]-B)<1e-6)

    return len(todo)
Example #37
0
def process(options, collection, annotationName, runfile):
    """Score the runs listed in *runfile* with per-concept average precision.

    Every non-blank line of *runfile* that does not start with '#' is the
    path of a pickle holding a dict with 'scores' (indexed as
    ``scores[:, concept_index]``) and 'id_images'.  For each run and each
    concept the images are ranked by descending score, restricted to the
    images that have a ground-truth label, written to
    ``<resultdir>/<runName>/<concept>.txt`` (unless checkToSkip says to skip)
    and scored with AP.  The same ranking restricted to the image ids listed
    in ``<rootpath>/<collection>/tagged,lemm/<concept>.txt`` is written to
    ``<resultdir>/tagged,lemm/<runName>/<concept>.txt`` and scored as well.
    Both AP tables are printed to stdout.

    Args:
        options: object providing ``rootpath`` and ``overwrite``.
        collection: collection whose annotations serve as ground truth.
        annotationName: name of the annotation (concept list) to evaluate.
        runfile: text file listing one pickled run per line.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultdir = os.path.join(rootpath, collection, 'SimilarityIndex',
                             collection)

    apscorer = getScorer('AP')
    with open(runfile) as fin:
        candidates = [line.strip() for line in fin]
    datafiles = [x for x in candidates if x and not x.startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = []  # per concept: image id (int) -> label
    hit_imgset = []  # per concept: set of image ids from the tagged,lemm file

    for concept in concepts:
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concept,
                                            skip_0=False,
                                            rootpath=rootpath)
        name2label.append(dict(zip([int(x) for x in names], labels)))

        label_file = os.path.join(rootpath, collection, 'tagged,lemm',
                                  '%s.txt' % concept)
        try:
            with open(label_file) as fin:
                hits = set(int(line) for line in fin)
        except (IOError, OSError, ValueError):
            # Missing, unreadable or malformed file: treat the concept as
            # having no tagged images instead of aborting the evaluation.
            hits = set()
        hit_imgset.append(hits)
        printStatus(
            INFO, 'readLabeledImageSet for %s-%s -> %d hits' %
            (collection, concept, len(hits)))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))

    for run_idx, datafile in enumerate(datafiles):
        runName = os.path.splitext(os.path.basename(datafile))[0]
        # NOTE: unpickling can execute arbitrary code; only list run files
        # that come from a trusted source.
        with open(datafile, 'rb') as fin:
            data = pickle.load(fin)
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']

        for c_idx, concept in enumerate(concepts):
            ground_truth = name2label[c_idx]
            tagged = hit_imgset[c_idx]

            # Highest score first; ties are broken by the id's string form.
            ranklist = sorted(zip(imset, scores[:, c_idx]),
                              key=lambda v: (v[1], str(v[0])),
                              reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            resfile = os.path.join(resultdir, runName, '%s.txt' % concept)
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(ranklist, resfile)
            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert (len(sorted_labels) > 0)

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            # Same ranking, restricted to the images in the tagged set.
            tagged_ranklist = [x for x in ranklist if x[0] in tagged]
            resfile = os.path.join(resultdir, 'tagged,lemm', runName,
                                   '%s.txt' % concept)
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(tagged_ranklist, resfile)

            sorted_labels = [ground_truth[x[0]] for x in tagged_ranklist]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)

    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and as the Python 3 function.
    print('#' * 100)
    print('# untagged-concept ' +
          ' '.join([os.path.basename(x) for x in datafiles]))
    print('#' * 100)

    for c_idx, concept in enumerate(concepts):
        print('%s %s' % (concept,
                         ' '.join(['%.3f' % x for x in ap_table[:, c_idx]])))
    print('meanAP ' + ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)]))

    print('#' * 100)
    print('# tagged-concept')
    print('#' * 100)

    for c_idx, concept in enumerate(concepts):
        print('%s %s' % (concept,
                         ' '.join(['%.3f' % x for x in ap2_table[:, c_idx]])))
    print('meanAP2 ' + ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)]))
Example #38
0
def process(options, testCollection, trainCollection, annotationName,
            tagrelMethod, tagfeature):
    """Convert an ``id.tagvotes.txt`` file into a binary feature directory.

    Each input line is an image id followed by one ``tag score`` pair per
    concept.  For every image id (first occurrence only) the scores are put
    in concept order, L1-normalised and appended as float32 values to
    ``feature.bin``.  ``id.txt`` (space-separated ids) and ``shape.txt``
    ("<nr_images> <nr_concepts>") are written next to it.

    The process exits via ``sys.exit(0)`` when checkToSkip says the binary
    file should be skipped or when the input file does not exist.

    Args:
        options: object providing ``rootpath`` and ``overwrite``.
        testCollection: collection the tag votes were computed for.
        trainCollection: collection whose concept list defines the dimensions.
        annotationName: name of the concept list in ``trainCollection``.
        tagrelMethod: sub-directory naming the tag relevance method.
        tagfeature: name of the feature directory to create.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    nr_of_concepts = len(concepts)
    mapping = dict(zip(concepts, range(nr_of_concepts)))

    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData',
                            tagfeature)
    binary_file = os.path.join(feat_dir, 'feature.bin')
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')

    if checkToSkip(binary_file, overwrite):
        sys.exit(0)

    inputfile = os.path.join(rootpath, testCollection, 'autotagging',
                             testCollection, trainCollection, tagrelMethod,
                             'id.tagvotes.txt')
    if not os.path.exists(inputfile):
        printError(INFO, '%s does not exist' % inputfile)
        sys.exit(0)

    makedirsforfile(binary_file)
    processed = set()
    imset = []
    count_line = 0

    # Context managers guarantee both handles are closed even if a malformed
    # line aborts the conversion.
    with open(inputfile) as fin, open(binary_file, 'wb') as fw:
        for line in fin:
            count_line += 1
            elems = line.split()
            if not elems:
                # Blank line: nothing to convert.
                continue
            name = elems[0]

            if name in processed:
                continue
            processed.add(name)

            del elems[0]
            assert (len(elems) == 2 * nr_of_concepts)
            vec = [0] * nr_of_concepts

            for i in range(0, len(elems), 2):
                vec[mapping[elems[i]]] = float(elems[i + 1])

            total = float(sum(vec))  # l_1 normalized
            if total:
                vec = [x / total for x in vec]
            # A zero total cannot be normalised; the scores are stored as-is
            # rather than dying on a division by zero half-way through.
            np.array(vec, dtype=np.float32).tofile(fw)
            imset.append(name)

    with open(id_file, 'w') as fw:
        fw.write(' '.join(imset))

    with open(shape_file, 'w') as fw:
        fw.write('%d %d' % (len(imset), nr_of_concepts))

    print('%d lines parsed, %d ids ->  %d unique ids' %
          (count_line, len(processed), len(imset)))
Example #39
0
def process(options, trainCollection, annotationfile, feature, modelName):
    """Train, per concept, one SVM per annotation set and merge them.

    *annotationfile* lists annotation names (one per line; blank lines and
    lines starting with '#' are ignored).  For every concept assigned to this
    job a model is trained on each listed annotation, compressed, and folded
    into a single model through ``add_fastsvm`` with weights
    ``(1 - 1/t, 1/t)`` for the t-th model (presumably a running mean of the
    models; confirm against ``add_fastsvm``).  The merged model is saved as
    ``<rootpath>/<trainCollection>/Models/<annotationfile basename>/<feature>/<modelName>/<concept>.model``.

    Args:
        options: object providing ``rootpath``, ``overwrite``, ``numjobs``,
            ``job`` (1-based) and, for 'fik', ``nr_bins``.
        trainCollection: collection providing features and annotations.
        annotationfile: path of the text file listing annotation names.
        feature: feature name under ``FeatureData``.
        modelName: 'fik' or 'fastlinear'.

    Returns:
        0 when an annotation is missing, the list is empty or nothing is left
        to do; None after a completed training run.
    """
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    with open(annotationfile) as fin:
        listed = [line.strip() for line in fin]
    trainAnnotationNames = [x for x in listed if x and not x.startswith('#')]
    if not trainAnnotationNames:
        # Without this guard the concept lookup below dies with an IndexError.
        print('%s lists no annotations' % annotationfile)
        return 0

    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations',
                                   annotationName)
        if not os.path.exists(conceptfile):
            print('%s does not exist' % conceptfile)
            return 0

    # The concept list is taken from the first annotation only.
    concepts = readConcepts(trainCollection,
                            trainAnnotationNames[0],
                            rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models',
                             newAnnotationName, feature, modelName)
    pending = [
        concept for concept in concepts
        if not checkToSkip(os.path.join(resultdir, concept + '.model'),
                           overwrite)
    ]
    # Round-robin split of the remaining concepts over the parallel jobs.
    todo = [c for i, c in enumerate(pending) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t, trainAnnotationName in enumerate(trainAnnotationNames, 1):
            names, labels = readAnnotationsFrom(trainCollection,
                                                trainAnnotationName,
                                                concept,
                                                skip_0=True,
                                                rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]
            # Class weights: beta of the total weight goes to the positives.
            nr_pos = sum(1 for lab in labels if lab == 1)
            nr_neg = sum(1 for lab in labels if lab == -1)
            wp = float(beta) * (nr_pos + nr_neg) / nr_pos
            wn = (1.0 - beta) * (nr_pos + nr_neg) / nr_neg

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([g_t], [1.0], feat_dim, params)
            if t == 1:
                assemble_model = new_model
            else:
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
Example #40
0
def process(options, workingCollection, annotationName, feature, outputpkl):
    """Run RobustPCA tag refinement through MATLAB and dump scores as a pickle.

    The MATLAB step writes ``P`` and ``E`` to ``prediction.mat``; it is
    skipped when checkToSkip reports that file should be skipped.  Afterwards
    the columns of ``P`` that correspond to the requested concepts are
    pickled to *outputpkl* as a dict with keys 'concepts', 'id_images' and
    'scores'.

    The process exits with status 1 when a required input file (tag matrix,
    image Laplacian, tag Laplacian) is missing.

    Args:
        options: object providing ``rootpath``, ``distance``, ``overwrite``,
            ``kratio``, ``ratiocs``, ``lambda1``, ``lambda2``,
            ``outputonlytest`` and ``rawtagmatrix``.
        workingCollection: collection name; when ``outputonlytest`` is set it
            must contain a '+' and the part after the first '+' names the
            test collection.
        annotationName: concept list to export.
        feature: visual feature name used in the input paths.
        outputpkl: path of the pickle to write.

    Returns:
        0 when checkToSkip says *outputpkl* should be skipped, else None.
    """
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix
    modelName = "robustpca"
    nnName = distance + "knn"

    printStatus(INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f" % (workingCollection, annotationName, feature, nnName, k_ratio, lambda1, lambda2))

    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = outputpkl
    resultfile_robustpca = os.path.join(rootpath, workingCollection, 'RobustPCA-Prediction', '%s,%s,%f,%f,%f,%d'%(feature,nnName,lambda1,lambda2,k_ratio,rawtagmatrix), 'prediction.mat')

    # When the MATLAB output is to be skipped, only the pickle dump is done.
    only_dump = bool(checkToSkip(resultfile_robustpca, overwrite))

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'RobustPCA', '%s,%s,%f'%(feature,nnName,DEFAULT_K_PROP), 'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, "Tag matrix file not found at %s Did you run robustpca_preprocessing.py?" % (tagmatrix_file))
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData', "lemm_wordnet_freq_tags.h5")
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, 'Tag matrix file not found in %s Did you run wordnet_frequency_tags.py?' % (tagmatrix_file))
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection, '%s,%s,%f'%(feature,nnName,k_ratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, "LaplacianI file not found at %s Did you run laplacian_images.py?" % (laplacianI_file))
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT', '%f'%(ratio_cs), 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(INFO, "LaplacianT file not found at %s Did you run laplacian_tags.py?" % (laplacianT_file))
        sys.exit(1)

    # begin learning
    script = """
        rpca_path = 'transduction_based/robustpca/';
        addpath(rpca_path);
        addpath([rpca_path, 'fast_svd/']);
        tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));
        load('%s');
        load('%s');

        lambda1 = %f;
        lambda2 = %f;
        maxIters = 50;
        precision = 1e-4;
        mu_start = 1.;

        parpool('local', 4);
        [P,E]=robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start);
        """ % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1, lambda2)

    script += """
        delete(gcp);
        save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3');
        exit;
    """ % resultfile_robustpca

    if not only_dump:
        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        testCollection = workingCollection.split('+')[1]
        testset_id_images = readImageSet(testCollection, testCollection, rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # concepts mapping; both HDF5 files are closed once the data is in memory
    with h5py.File(resultfile_robustpca, 'r') as robustpca_output, \
            h5py.File(tagmatrix_file, 'r') as tagprop_input:
        mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)
        predicted_tagmatrix = robustpca_output['P'][:, mapping]

    if outputonlytest:
        # Keep only the rows of the test images, located in the sorted id list.
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert(final_tagmatrix.shape[0] == idx.shape[0])
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    # HIGHEST_PROTOCOL is a binary pickle format, so the file must be opened
    # in binary mode; text mode can corrupt the stream on some platforms.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images': id_images, 'scores':final_tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example #41
0
from basic.common import ROOT_PATH,checkToSkip,makedirsforfile
from basic.util import readImageSet
from simpleknn.bigfile import BigFile, StreamFile
from basic.annotationtable import readConcepts,readAnnotationsFrom


# Writes, for every concept, a weight file that assigns equal weight to
# nr_of_models copies of the same model directory.
rootpath = ROOT_PATH

trainCollection = 'voc2008train'
trainAnnotationName = 'conceptsvoc2008train.txt'
# The model name (e.g. 'fik50' or 'fastlinear') comes from the command line.
modelName = sys.argv[1]
feature = 'dsift'
weight_dir = os.path.join(rootpath, trainCollection, 'l2r', modelName)

concepts = readConcepts(trainCollection,trainAnnotationName,rootpath=rootpath)
nr_of_models = 5

# These do not depend on the concept, so they are built once.
weights = [1.0/nr_of_models] * nr_of_models
model = os.path.join(trainCollection, 'Models', trainAnnotationName, feature, modelName)
models = [model] * nr_of_models

for concept in concepts:
    weight_file = os.path.join(weight_dir, '%s.txt' % concept)
    makedirsforfile(weight_file)
    # One "<weight> <model path>" line per model.
    with open(weight_file, 'w') as fw:
        fw.write('\n'.join(['%g %s' % (w,m) for w,m in zip(weights, models)]))

    rootpath = options.rootpath
    nr_pos = options.pos_nr
    collection = argv[0] #'train1m'
    annotationName = argv[1] # 'conceptsmir14social.txt'
    rankMethod = argv[2] #'train1m/fcs-wn_color64+dsift_borda'
    posName = argv[3] #'fcstagrelbc'
    neg_pos_ratio = options.neg_pos_ratio
    nr_neg = neg_pos_ratio * nr_pos
    nr_neg_bags = options.nr_neg_bags # 10
    overwrite = options.overwrite

    assert( annotationName.endswith('social.txt') )
    assert( rankMethod.startswith('tagged,lemm/%s'%collection) )

    newAnnotationTemplate = annotationName[:-4] + '.' + posName + str(nr_pos) + ('.random%d'%nr_neg) + '.%d.txt'
    concepts = readConcepts(collection, annotationName, rootpath)    
    simdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection, rankMethod)

    scriptfile = os.path.join(rootpath,collection,'annotationfiles', annotationName[:-4] + '.' + posName + str(nr_pos) + ('.random%d'%nr_neg) + '.0-%d.txt'%(nr_neg_bags-1))
    makedirsforfile(scriptfile)
    fout = open(scriptfile,'w')
    fout.write('\n'.join([newAnnotationTemplate%t for t in range(nr_neg_bags)]) + '\n')
    fout.close()


    for concept in concepts:
        simfile = os.path.join(simdir, '%s.txt' % concept)
        ranklist = readRankingResults(simfile)
        pos_bag = [x[0] for x in ranklist[:nr_pos]]
        names, labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        negativePool = [x[0] for x in zip(names,labels) if x[1] < 0]
Example #43
0
    overwrite = cmdOpts.getInt('overwrite')
    rootpath = cmdOpts.getString('rootpath')
    collection = cmdOpts.getString('collection')
    annotationName = cmdOpts.getString('annotationName')
    tpp = cmdOpts.getString('tpp')
    nr_pos = cmdOpts.getInt('nr_pos')
    pos_source = cmdOpts.getString('pos_source')
    select_pos = cmdOpts.getString('select_pos')
    neg_filter = cmdOpts.getString('neg_filter')
    neg_pos_ratio = cmdOpts.getInt('neg_pos_ratio')
    nr_pos_bags = cmdOpts.getInt('nr_pos_bags')
    nr_neg_bags = cmdOpts.getInt('nr_neg_bags')
    nr_neg = nr_pos * neg_pos_ratio

    concepts = readConcepts(collection, annotationName)
    annotationNameStr = generate_new_annotation_template(cmdOpts)

    nr_skipped = 0
    newAnnotationNames = [None] * (nr_pos_bags * nr_neg_bags)

    for idxp in range(nr_pos_bags):
        for idxn in range(nr_neg_bags):
            anno_idx = idxp * nr_neg_bags + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath, collection, 'Annotations',
                                      newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, overwrite):
                nr_skipped += 1
                continue
            writeConcepts(concepts, resultfile)
Example #44
0
def process(options, testCollection, trainCollection, annotationName, feature, outputpkl):
    """Train (if needed) and apply a TagProp model through MATLAB.

    A model is learned on *trainCollection* unless ``model.mat`` already
    exists and ``options.trainmodel`` is false.  Predictions for
    *testCollection* are written to ``prediction.mat``; the columns of ``P``
    that correspond to the requested concepts are then pickled to
    *outputpkl* as a dict with keys 'concepts', 'id_images' and 'scores'.

    The process exits with status 1 when a required input file is missing.

    Args:
        options: object providing ``rootpath``, ``k``, ``distance``,
            ``variant`` ('rank', 'ranksigmoids', 'dist' or 'distsigmoids'),
            ``overwrite`` and ``trainmodel``.
        testCollection: collection to predict tags for.
        trainCollection: collection the model is learned on.
        annotationName: concept list to export.
        feature: visual feature name used in the input paths.
        outputpkl: path of the pickle to write.

    Returns:
        0 when checkToSkip says either output should be skipped, else None.
    """
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    variant = options.variant
    overwrite = options.overwrite
    testset = testCollection
    forcetrainmodel = options.trainmodel
    modelName = "tagprop"
    nnName = distance + "knn"
    uses_distances = variant in ('dist', 'distsigmoids')

    printStatus(INFO, "Starting TagProp %s,%s,%s,%s,%s" % (variant, trainCollection, testCollection, annotationName, feature))

    resultfile = outputpkl
    resultfile_tagprop = os.path.join(rootpath, testCollection, 'TagProp-Prediction', testset, trainCollection, annotationName, modelName, '%s,%s,%s,%d'%(feature,nnName,variant,k), 'prediction.mat')
    if checkToSkip(resultfile, overwrite) or checkToSkip(resultfile_tagprop, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, trainCollection, 'TextData', 'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, "Tag matrix file not found at %s Did you run wordnet_frequency_tags.py?" % (tagmatrix_file))
        sys.exit(1)

    train_neighs_file = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection, '%s,%s,%d'%(feature,nnName,k), 'nn_train.h5')
    if not os.path.exists(train_neighs_file):
        printStatus(INFO, "Matlab train neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (train_neighs_file))
        sys.exit(1)

    # do we need to perform learning?
    train_model_file = os.path.join(rootpath, trainCollection, 'TagProp-models', '%s,%s,%s,%d'%(feature,nnName,variant,k), 'model.mat')
    if os.path.exists(train_model_file) and not forcetrainmodel:
        printStatus(INFO, "model for %s available at %s" % (trainCollection, train_model_file))
    else:
        printStatus(INFO, "starting learning model for %s" % (trainCollection))
        makedirsforfile(train_model_file)

        script = """
                tagprop_path = 'model_based/tagprop/TagProp/';
                addpath(tagprop_path);
                tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
                tagmatrix = sparse(tagmatrix);
                NN = h5read('%s', '/NN');
                NN = NN(2:end, :);
                NN = double(NN);
        """ % (tagmatrix_file, train_neighs_file)

        if uses_distances:
            script += """
                NND = h5read('%s', '/NND');
                NND = NND(2:end, :);
                NND = reshape(NND, 1, size(NND,1), size(NND,2));
                NND = double(NND);
            """ % train_neighs_file

        # Any other variant adds no learning call, so the generated script
        # would then try to save an undefined 'm'.
        if variant == 'rank':
            script += """
                m = tagprop_learn(NN,[],tagmatrix);
            """
        elif variant == 'ranksigmoids':
            script += """
                m = tagprop_learn(NN,[],tagmatrix,'sigmoids',true);
            """
        elif variant == 'dist':
            script += """
                m = tagprop_learn(NN,NND,tagmatrix,'type','dist');
            """
        elif variant == 'distsigmoids':
            script += """
                m = tagprop_learn(NN,NND,tagmatrix,'type','dist','sigmoids',true);
            """

        script += """
                save('%s', 'm', '-v7.3');
                exit;
        """ % train_model_file

        call_matlab(script)

    # we perform prediction
    printStatus(INFO, "starting prediction")
    test_neighs_file = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection, annotationName, '%s,%s,%d'%(feature,nnName,k), 'nn_test.h5')
    if not os.path.exists(test_neighs_file):
        printStatus(INFO, "Matlab test neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (test_neighs_file))
        sys.exit(1)

    script = """
            tagprop_path = 'model_based/tagprop/TagProp/';
            addpath(tagprop_path);
            load('%s');
            tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
            tagmatrix = sparse(tagmatrix);
            NNT = h5read('%s', '/NNT');
            NNT = double(NNT);

    """ % (train_model_file, tagmatrix_file, test_neighs_file)

    if uses_distances:
        script += """
            NNDT = h5read('%s', '/NNDT');
            NNDT = reshape(NNDT, 1, size(NNDT,1), size(NNDT,2));
            NNDT = double(NNDT);
        """ % test_neighs_file

    # NOTE(review): NNDT is loaded for the dist variants, yet the call below
    # always passes [] as the distance argument -- confirm this is intended.
    script += """
            P = tagprop_predict(NNT,[],m)';
            save('%s', '-v7.3');
            exit;
    """ % resultfile_tagprop

    makedirsforfile(resultfile_tagprop)
    call_matlab(script)

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)

    concepts = readConcepts(testCollection, annotationName, rootpath)
    id_images = readImageSet(testCollection, testset, rootpath)
    id_images.sort()

    # concepts mapping; both HDF5 files are closed once the data is in memory
    with h5py.File(resultfile_tagprop, 'r') as tagprop_output, \
            h5py.File(tagmatrix_file, 'r') as tagprop_input:
        mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)
        final_tagmatrix = tagprop_output['P'][:][:, mapping]

    # Create the output directory first so the dump cannot fail on a missing
    # directory after the expensive MATLAB steps.
    makedirsforfile(resultfile)
    # HIGHEST_PROTOCOL is a binary pickle format, so the file must be opened
    # in binary mode; text mode can corrupt the stream on some platforms.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images':id_images, 'scores':final_tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example #45
0
    trainAnnotationName = 'conceptsvoc2008train.txt'
    testCollection = 'voc2008val'
    testset = testCollection
    testAnnotationName = 'conceptsvoc2008val.txt'

    modelName = 'fik50' 
    #modelName = 'fastlinear'
    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model

    scorer = getScorer(metric)
    
    imset = readImageSet(testCollection,testset,rootpath=rootpath)
    concepts = readConcepts(testCollection,testAnnotationName,rootpath=rootpath)
    feat_dir = os.path.join(rootpath, testCollection, "FeatureData", feature)
    feat_file = BigFile(feat_dir)

    _renamed, _vectors = feat_file.read(imset)

    nr_of_images = len(_renamed)
    nr_of_concepts = len(concepts)
    
    mAP = 0.0
    models = [None] * len(concepts)

    stream = StreamFile(feat_dir)

    for i,concept in enumerate(concepts):
        model_file_name = os.path.join(rootpath,trainCollection,'Models',trainAnnotationName,feature, modelName, '%s.model'%concept)
def process(options, trainCollection, trainAnnotationName, feature):
    """Train one FIK SVM per concept, optionally with tuned parameters.

    When ``options.best_param_dir`` is set, ``<best_param_dir>/<concept>.txt``
    must start with a line of the form ``best_C=<C>, a=<A>, b=<B>``; C scales
    the class weights and (A, B) are stored on the model via ``set_probAB``.
    Otherwise C=1, A=0 and B=0 are used.  Models are saved as
    ``<rootpath>/<trainCollection>/Models/<trainAnnotationName>/<feature>/<modelName>/<concept>.model``
    where modelName is ``fik<nr_bins>`` plus ``-tuned`` when parameters are
    read from ``best_param_dir``.

    Args:
        options: object providing ``rootpath``, ``overwrite``, ``numjobs``,
            ``job`` (1-based), ``nr_bins`` and ``best_param_dir``.
        trainCollection: collection providing features and annotations.
        trainAnnotationName: annotation (concept list) to train on.
        feature: feature name under ``FeatureData``.

    Returns:
        Number of concepts processed by this job (0 when nothing to do).

    Raises:
        ValueError: if a parameter file's first line cannot be parsed.
    """
    import re
    p = re.compile(
        r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    nr_bins = options.nr_bins
    best_param_dir = options.best_param_dir
    beta = 0.5

    modelName = 'fik%d' % nr_bins
    if best_param_dir:
        modelName += '-tuned'

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models',
                             trainAnnotationName, feature, modelName)
    pending = [
        concept for concept in concepts
        if not checkToSkip(os.path.join(resultdir, concept + '.model'),
                           overwrite)
    ]
    # Round-robin split of the remaining concepts over the parallel jobs.
    todo = [c for i, c in enumerate(pending) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)
    params = {'nr_bins': nr_bins}

    with open(os.path.join(feat_dir, 'minmax.txt'), 'r') as f:
        params['min_vals'] = [float(v) for v in f.readline().split()]
        params['max_vals'] = [float(v) for v in f.readline().split()]

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            with open(param_file) as fin:
                first_line = fin.readline().strip()
            m = p.search(first_line)
            if m is None:
                # Fail with a message naming the file rather than with an
                # AttributeError on None.
                raise ValueError('cannot parse best parameters from %s: %r' %
                                 (param_file, first_line))
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        # Class weights: beta of the total weight goes to the positives.
        nr_pos = sum(1 for lab in labels if lab == 1)
        nr_neg = sum(1 for lab in labels if lab == -1)
        wp = float(beta) * (nr_pos + nr_neg) / nr_pos
        wn = (1.0 - beta) * (nr_pos + nr_neg) / nr_neg

        svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
        model = svm_train(
            y, vectors,
            svm_params + ' -s 0 -t %d -q' % KERNEL_TYPE.index("HI"))
        newmodel = svm_to_fiksvm([model], [1.0], feat_file.ndims, params)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fiksvm_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        # NOTE(review): the reloaded model is discarded and the asserts below
        # inspect the in-memory model, so this only proves the file loads --
        # confirm whether the reloaded model's probAB should be checked.
        fiksvm_load_model(model_file_name)
        assert (abs(newmodel.get_probAB()[0] - A) < 1e-6)
        assert (abs(newmodel.get_probAB()[1] - B) < 1e-6)

    return len(todo)
Example #47
0
def process(options, testCollection, trainCollection, annotationName, tagrelMethod, tagfeature):
    """Turn per-image tag votes into an L1-normalised binary feature file.

    Reads ``id.tagvotes.txt``, whose lines hold an image id followed by one
    ``tag score`` pair per concept.  The first occurrence of each image id is
    converted into a float32 vector in concept order and appended to
    ``feature.bin``; ``id.txt`` (space-separated ids) and ``shape.txt``
    ("<nr_images> <nr_concepts>") are written alongside.

    The process exits via ``sys.exit(0)`` when checkToSkip says the binary
    file should be skipped or when the input file does not exist.

    Args:
        options: object providing ``rootpath`` and ``overwrite``.
        testCollection: collection the tag votes were computed for.
        trainCollection: collection whose concept list defines the dimensions.
        annotationName: name of the concept list in ``trainCollection``.
        tagrelMethod: sub-directory naming the tag relevance method.
        tagfeature: name of the feature directory to create.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    nr_of_concepts = len(concepts)
    mapping = dict(zip(concepts,range(nr_of_concepts)))

    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', tagfeature)
    binary_file = os.path.join(feat_dir, 'feature.bin')
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir,'shape.txt')

    if checkToSkip(binary_file, overwrite):
        sys.exit(0)

    inputfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, tagrelMethod, 'id.tagvotes.txt')
    if not os.path.exists(inputfile):
        printError(INFO, '%s does not exist' % inputfile)
        sys.exit(0)

    makedirsforfile(binary_file)
    processed = set()
    imset = []
    count_line = 0

    # Both handles are released even when a malformed line aborts the run.
    with open(inputfile) as fin, open(binary_file, 'wb') as fw:
        for line in fin:
            count_line += 1
            elems = line.split()
            if not elems:
                # Skip blank lines instead of failing on elems[0].
                continue
            name = elems[0]

            if name in processed:
                continue
            processed.add(name)

            del elems[0]
            assert(len(elems) == 2 * nr_of_concepts)
            vec = [0] * nr_of_concepts

            for i in range(0, len(elems), 2):
                vec[mapping[elems[i]]] = float(elems[i+1])

            total = float(sum(vec)) # l_1 normalized
            if total:
                vec = [x/total for x in vec]
            # With a zero total the scores are stored unnormalised; dividing
            # would abort the run and leave a truncated feature.bin behind.
            np.array(vec, dtype=np.float32).tofile(fw)
            imset.append(name)

    with open(id_file, 'w') as fw:
        fw.write(' '.join(imset))

    with open(shape_file, 'w') as fw:
        fw.write('%d %d' % (len(imset), nr_of_concepts))

    print ('%d lines parsed, %d ids ->  %d unique ids' % (count_line, len(processed), len(imset)))
Example #48
0
def process(options, trainCollection, trainAnnotationName, valCollection,
            valAnnotationName, feature, modelName):
    """Pick the best SVM cost ``C`` per concept on a validation collection.

    For every concept still to be processed, a model is trained on
    ``trainCollection`` for each ``C`` in a fixed grid, the validation images
    are ranked by the compressed model's prediction, and the ranking is scored
    with the metric named by ``options.metric``. The scores of the best run
    are passed to ``sigmoid_train`` and the outcome is written to::

        <rootpath>/<trainCollection>/Models/<trainAnnotationName>/
            <modelName>,best_params/<valCollection>,<valAnnotationName>,<feature>/
            <concept>.txt

    Args:
        options: object providing ``rootpath``, ``metric``, ``overwrite``,
            ``numjobs``, ``job`` and, for the 'fik' model, ``nr_bins``.
        trainCollection: collection used for training.
        trainAnnotationName: annotation name within ``trainCollection``.
        valCollection: collection used for validation.
        valAnnotationName: annotation name within ``valCollection``.
        feature: feature name, shared by both collections.
        modelName: ``'fik'`` or ``'fastlinear'``.

    Returns:
        0 when there is no concept left to process, otherwise ``None``.

    Raises:
        AssertionError: on an unknown ``modelName``, mismatching concept lists
            or mismatching feature dimensionality.
        ZeroDivisionError: when a concept has no positive or no negative
            training labels (the class weights divide by those counts).
    """
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # Real lists, not map objects: params is handed to compress_model
            # once per C value, and a Python 3 map iterator would be empty
            # after its first traversal.
            params['min_vals'] = [float(x) for x in f.readline().split()]
            params['max_vals'] = [float(x) for x in f.readline().split()]
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    valConcepts = readConcepts(valCollection,
                               valAnnotationName,
                               rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert (concepts[i] == valConcepts[i])

    resultdir = os.path.join(
        rootpath, trainCollection, 'Models', trainAnnotationName,
        '%s,best_params' % modelName,
        '%s,%s,%s' % (valCollection, valAnnotationName, feature))
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # job is 1-based: job k handles every numjobs-th pending concept
    todo = [c for i, c in enumerate(todo) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(
        os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert (feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        # Named nr_pos/nr_neg rather than np/nn so the numpy alias is not
        # shadowed inside this function.
        nr_pos = len([1 for lab in labels if 1 == lab])
        nr_neg = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (nr_pos + nr_neg) / nr_pos
        wn = (1.0 - beta) * (nr_pos + nr_neg) / nr_neg

        names, labels = readAnnotationsFrom(valCollection,
                                            valAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '
            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            # rank validation images by decreasing prediction score
            ranklist = [(name, new_model.predict(vec))
                        for name, vec in zip(val_renamed, val_vectors)]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        # NOTE(review): if no C scores strictly above 0.0, best_scores and
        # best_labels are still None here -- confirm sigmoid_train copes.
        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)

        printStatus(
            INFO,
            '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' %
            (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        with open(resultfile, 'w') as fw:
            fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
Example #49
0
def process(options, trainCollection, trainAnnotationName, valCollection, valAnnotationName, feature, modelName):
    """Grid-search the SVM cost ``C`` per concept and store the best setting.

    Each pending concept is trained on ``trainCollection`` once per value of
    ``C`` in a fixed grid. The validation images of ``valCollection`` are
    ranked with the compressed model and scored by the metric named in
    ``options.metric``. The ranking of the best ``C`` is fed to
    ``sigmoid_train`` and a one-line summary
    (``bestAP=..., best_C=..., a=..., b=...``) is written to
    ``<rootpath>/<trainCollection>/Models/<trainAnnotationName>/
    <modelName>,best_params/<valCollection>,<valAnnotationName>,<feature>/<concept>.txt``.

    Args:
        options: object providing ``rootpath``, ``metric``, ``overwrite``,
            ``numjobs``, ``job`` and, for the 'fik' model, ``nr_bins``.
        trainCollection: training collection name.
        trainAnnotationName: annotation name in the training collection.
        valCollection: validation collection name.
        valAnnotationName: annotation name in the validation collection.
        feature: feature name used in both collections.
        modelName: ``'fik'`` or ``'fastlinear'``.

    Returns:
        0 if no concept is left to process, otherwise ``None``.

    Raises:
        AssertionError: for an unsupported ``modelName``, differing concept
            lists, or differing feature dimensions.
        ZeroDivisionError: if a concept has no positive or no negative
            training label.
    """
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1 #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # Store lists: params is reused for every C below, and on
            # Python 3 a bare map() could only be traversed once.
            params['min_vals'] = list(map(float, f.readline().split()))
            params['max_vals'] = list(map(float, f.readline().split()))
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    valConcepts = readConcepts(valCollection,valAnnotationName, rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert(concepts[i] == valConcepts[i])

    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, '%s,best_params'%modelName, '%s,%s,%s' % (valCollection,valAnnotationName,feature))
    todo = [concept for concept in concepts
            if not checkToSkip(os.path.join(resultdir, concept + '.txt'), overwrite)]
    # split the pending concepts round-robin over the jobs (job is 1-based)
    todo = [concept for i, concept in enumerate(todo) if i%numjobs==(job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))
    val_feat_file = BigFile(os.path.join(rootpath,valCollection,'FeatureData',feature))
    feat_dim = train_feat_file.ndims
    assert(feat_dim == val_feat_file.ndims)

    for concept in todo:
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        # num_pos/num_neg instead of np/nn: avoids shadowing the numpy alias
        num_pos = len([1 for lab in labels if  1 == lab])
        num_neg = len([1 for lab in labels if  -1== lab])
        wp = float(beta) * (num_pos+num_neg) / num_pos
        wn = (1.0-beta) * (num_pos+num_neg) / num_neg

        names,labels = readAnnotationsFrom(valCollection, valAnnotationName, concept, skip_0=True, rootpath=rootpath)
        val_name2label = dict(zip(names,labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '
            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            # validation images ordered by decreasing predicted score
            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i])) for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v:v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        # NOTE(review): best_scores/best_labels remain None when no run scores
        # strictly above 0.0 -- confirm sigmoid_train handles that input.
        [A,B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)

        printStatus(INFO, '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' % (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        with open(resultfile, 'w') as fw:
            fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
Example #50
0
def process(options, trainCollection, annotationfile, feature, modelName):
    """Train one averaged model per concept from several annotation sets.

    ``annotationfile`` lists annotation names, one per line; blank lines and
    lines starting with ``#`` are ignored. For each pending concept a model is
    trained on every listed annotation and folded into a running ensemble via
    ``add_fastsvm`` with weights ``1 - 1/t`` and ``1/t`` for the t-th model.
    The ensemble is saved as
    ``<rootpath>/<trainCollection>/Models/<basename of annotationfile>/
    <feature>/<modelName>/<concept>.model``.

    Args:
        options: object providing ``rootpath``, ``overwrite``, ``numjobs``,
            ``job`` and, for the 'fik' model, ``nr_bins``.
        trainCollection: collection to train on.
        annotationfile: path of the text file listing annotation names.
        feature: feature name.
        modelName: ``'fik'`` or ``'fastlinear'``.

    Returns:
        0 when a listed annotation is missing on disk or when nothing is left
        to do; otherwise ``None``.

    Raises:
        AssertionError: for an unsupported ``modelName``.
        IndexError: when ``annotationfile`` lists no annotation at all.
        ZeroDivisionError: when an annotation has no positive or no negative
            label for a concept.
    """
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1 #options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # Lists rather than map objects: params is passed to
            # compress_model repeatedly, and a Python 3 map iterator can be
            # traversed only once.
            params['min_vals'] = [float(x) for x in f.readline().split()]
            params['max_vals'] = [float(x) for x in f.readline().split()]
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    with open(annotationfile) as f:
        stripped_lines = [line.strip() for line in f]
    trainAnnotationNames = [x for x in stripped_lines if x and not x.startswith('#')]

    # refuse to start unless every listed annotation exists
    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print('%s does not exist' % conceptfile)
            return 0

    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # job is 1-based: job k takes every numjobs-th pending concept
    todo = [concept for i, concept in enumerate(todo) if i%numjobs==(job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t, annotationName in enumerate(trainAnnotationNames, 1):
            names,labels = readAnnotationsFrom(trainCollection, annotationName, concept, skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names,labels))
            renamed,vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]
            # nr_pos/nr_neg instead of np/nn to keep the numpy alias usable
            nr_pos = len([1 for lab in labels if  1 == lab])
            nr_neg = len([1 for lab in labels if  -1== lab])
            wp = float(beta) * (nr_pos+nr_neg) / nr_pos
            wn = (1.0-beta) * (nr_pos+nr_neg) / nr_neg

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                # fold the t-th model into the ensemble with weights
                # (1 - 1/t) for the ensemble and 1/t for the new model
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1-1.0/t, 1.0/t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
Example #51
0
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    """Score test images with per-concept models and write ranked tag votes.

    One model per concept is loaded from
    ``<rootpath>/<trainCollection>/Models/<trainAnnotationName>/<feature>/
    <modelName>/<concept>.model``. Every test image assigned to this job is
    scored by all models and one line
    ``<image id> <tag> <score> <tag> <score> ...`` (tags by decreasing score)
    is written to ``id.tagvotes.txt`` under
    ``<rootpath>/<testCollection>/autotagging/<testCollection>/
    <trainCollection>/<trainAnnotationName>/<feature>,<modelName>[,prob]``.
    With more than one job the file name gets the suffix ``.<numjobs>.<job>``.

    Args:
        options: object providing ``rootpath``, ``overwrite``,
            ``prob_output``, ``numjobs``, ``job`` and ``topk``.
        testCollection: collection whose images are tagged.
        trainCollection: collection the models were trained on.
        trainAnnotationName: annotation name of the models.
        feature: feature name.
        modelName: model directory name; names starting with ``'fik'`` use the
            fiksvm loader, anything else the fastlinear loader.

    Returns:
        Number of images written; 0 when the result is skipped or when a
        model fails to load.
    """
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    topk = options.topk

    outputName = '%s,%s' % (feature,modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs>1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    # job is 1-based: keep every numjobs-th image for this job
    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = set(name for i, name in enumerate(test_imset) if i%numjobs+1 == job)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,nr_of_test_images,resultfile))

    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath,trainCollection,'Models',trainAnnotationName,feature, modelName, '%s.model'%concepts[c])
        models[c] = load_model(model_file_name)
        if models[c] is None:
            return 0

    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)

    done = 0

    # with / try-finally guarantee both the result file and the feature
    # stream are closed even if a prediction raises.
    with open(resultfile, "w") as fw:
        feat_file.open()
        try:
            for _id, _vec in feat_file:
                if _id not in test_imset:
                    continue
                if prob_output:
                    scores = [model.predict_probability(_vec) for model in models]
                else:
                    scores = [model.predict(_vec) for model in models]

                tagvotes = sorted(zip(concepts, scores), key=lambda v:v[1], reverse=True)
                if topk>0:
                    tagvotes = tagvotes[:topk]
                newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote,6)) for (tag,vote) in tagvotes]))
                fw.write(newline)
                done += 1
                if done % 10000 == 0:
                    printStatus(INFO, "%d done" % done)
        finally:
            feat_file.close()

    printStatus(INFO, "%d done" % (done))
    return done
Example #52
0
import sys, os
from basic.common import checkToSkip, ROOT_PATH, makedirsforfile
from basic.annotationtable import readConcepts, readAnnotationsFrom, writeAnnotationsTo, writeConceptsTo
from basic.data import readImageSet

if __name__ == '__main__':
    args = sys.argv[1:]
    rootpath = '/var/scratch2/xirong/VisualSearch'
    srcCollection = args[0]
    annotationName = args[1]
    dstCollection = args[2]
    overwrite = 0

    concepts = readConcepts(srcCollection, annotationName, rootpath)
    todo = []
    for concept in concepts:
        resfile = os.path.join(rootpath, dstCollection, 'Annotations', 'Image',
                               annotationName, '%s.txt' % concept)
        if checkToSkip(resfile, overwrite):
            continue
        todo.append(concept)
    if not todo:
        print('nothing to do')
        sys.exit(0)

    imset = set(readImageSet(dstCollection, dstCollection, rootpath))

    for concept in todo:
        names, labels = readAnnotationsFrom(srcCollection,
                                            annotationName,
                                            concept,
Example #53
0
def process(options, workingCollection, annotationName, feature, outputpkl):
    """Run RobustPCA tag completion in MATLAB and dump the result as a pickle.

    The function works in two stages:

    1. Unless ``checkToSkip`` reports the MATLAB output
       (``.../RobustPCA-Prediction/.../prediction.mat``) as skippable, a MATLAB
       script is generated and run via ``call_matlab``. It loads the tag
       matrix and both Laplacian files, calls ``robustpca`` and saves ``P``
       and ``E``.
    2. Unless ``checkToSkip`` reports ``outputpkl`` as skippable, the columns
       of ``P`` matching the requested concepts are extracted and pickled as a
       dict with keys ``'concepts'``, ``'id_images'`` and ``'scores'``.

    Args:
        options: object providing ``rootpath``, ``distance``, ``overwrite``,
            ``kratio``, ``ratiocs``, ``lambda1``, ``lambda2``,
            ``outputonlytest`` and ``rawtagmatrix``.
        workingCollection: collection name; when ``options.outputonlytest`` is
            set it must have the form ``<a>+<b>``, and only images of ``<b>``
            are written.
        annotationName: annotation name providing the concept list.
        feature: visual feature name.
        outputpkl: path of the pickle file to write.

    Returns:
        0 when the pickle is skipped, otherwise ``None``.

    Raises:
        SystemExit: with status 1 when the tag matrix or one of the Laplacian
            files is missing.
    """
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix
    nnName = distance + "knn"

    printStatus(
        INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f" %
        (workingCollection, annotationName, feature, nnName, k_ratio, lambda1,
         lambda2))

    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = os.path.join(outputpkl)
    resultfile_robustpca = os.path.join(
        rootpath, workingCollection, 'RobustPCA-Prediction',
        '%s,%s,%f,%f,%f,%d' %
        (feature, nnName, lambda1, lambda2, k_ratio, rawtagmatrix),
        'prediction.mat')

    # When the MATLAB output can be skipped, only the pickle dump is done.
    only_dump = bool(checkToSkip(resultfile_robustpca, overwrite))

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(
            rootpath, workingCollection, 'RobustPCA',
            '%s,%s,%f' % (feature, nnName, DEFAULT_K_PROP), 'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(
                INFO,
                "Tag matrix file not found at %s Did you run robustpca_preprocessing.py?"
                % (tagmatrix_file))
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData',
                                      "lemm_wordnet_freq_tags.h5")
        if not os.path.exists(tagmatrix_file):
            printStatus(
                INFO,
                'Tag matrix file not found in %s Did you run wordnet_frequency_tags.py?'
                % (tagmatrix_file))
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI',
                                   workingCollection,
                                   '%s,%s,%f' % (feature, nnName, k_ratio),
                                   'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(
            INFO,
            "LaplacianI file not found at %s Did you run laplacian_images.py?"
            % (laplacianI_file))
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT',
                                   '%f' % (ratio_cs), 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(
            INFO,
            "LaplacianT file not found at %s Did you run laplacian_tags.py?" %
            (laplacianT_file))
        sys.exit(1)

    # begin learning: assemble the MATLAB script
    script = """
        rpca_path = 'transduction_based/robustpca/';
        addpath(rpca_path);
        addpath([rpca_path, 'fast_svd/']);
        tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));
        load('%s');
        load('%s');

        lambda1 = %f;
        lambda2 = %f;
        maxIters = 50;
        precision = 1e-4;
        mu_start = 1.;

        parpool('local', 4);
        [P,E]=robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start);
        """ % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1,
               lambda2)

    script += """
        delete(gcp);
        save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3');
        exit;
    """ % resultfile_robustpca

    if not only_dump:
        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        test_collection = workingCollection.split('+')[1]
        testset_id_images = readImageSet(test_collection, test_collection,
                                         rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # concepts mapping: pick the columns of P that belong to the requested
    # concepts. Both HDF5 files are closed as soon as the data is in memory.
    with h5py.File(resultfile_robustpca, 'r') as robustpca_output:
        with h5py.File(tagmatrix_file, 'r') as tagprop_input:
            mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)
        predicted_tagmatrix = robustpca_output['P'][:, mapping]

    if outputonlytest:
        # rows of the test images inside the sorted full image list
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert (final_tagmatrix.shape[0] == idx.shape[0])
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    # Binary mode is required: pickle.HIGHEST_PROTOCOL emits bytes, which a
    # text-mode handle rejects on Python 3 and may mangle on Windows.
    with open(resultfile, 'wb') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': id_images,
                'scores': final_tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
Example #54
0
def process(options, collection, annotationName, runfile):
    """Evaluate pickled tagging runs and print per-run / per-concept scores.

    ``runfile`` lists one pickle path per line (blank lines and lines starting
    with ``#`` are ignored). Each pickle must hold a dict with ``'scores'``
    (2-D array with one column per concept) and ``'id_images'``.

    Printed to stdout:

    * per run: mean over images of AP, P@1 and ``P@5 > 0.1``,
    * per concept and run: AP over all annotated images, then AP, NDCG@20 and
      NDCG2@20 restricted to the images listed in
      ``<rootpath>/<collection>/tagged,lemm/<concept>.txt``.

    Args:
        options: object providing ``rootpath``.
        collection: collection holding the ground-truth annotations.
        annotationName: annotation name providing the concept list.
        runfile: path of the text file listing run pickles.

    Raises:
        AssertionError: when a run has a different number of concept columns
            or when no ranked image of a concept has ground truth.
    """
    rootpath = options.rootpath

    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    with open(runfile) as f:
        stripped_lines = [line.strip() for line in f]
    datafiles = [x for x in stripped_lines if x and not x.startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    # image id -> set of concept indices that are relevant for that image
    rel_conset = {}

    for i in range(nr_of_concepts):
        names,labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        # A real list: names is traversed twice below, which a Python 3
        # map object would not survive.
        names = [int(x) for x in names]
        name2label[i] = dict(zip(names,labels))

        for im,lab in zip(names,labels):
            if lab > 0:
                rel_conset.setdefault(im,set()).add(i)

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt'% concepts[i])
        try:
            with open(label_file) as f:
                hit_imgset[i] = set(int(x) for x in f)
        except (IOError, OSError, ValueError):
            # missing / unreadable / malformed file: treat as "no tagged images"
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print('#'*100)
    print('# method miap hit1 hit5')
    print('#'*100)

    for run_idx in range(nr_of_runs):
        # NOTE: unpickling can execute arbitrary code -- only list trusted
        # files in runfile.
        with open(datafiles[run_idx],'rb') as f:
            data = pickle.load(f)
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)

        # concept-centric evaluation: rank images per concept
        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = sorted(zip(imset, scores[:,c_idx]), key=lambda v:(v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels)>0)

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            # same ranking, restricted to images tagged with the concept
            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        # image-centric evaluation: rank concepts per image
        res = np.zeros((nr_of_images, 3))
        for j in range(nr_of_images):
            ranklist = sorted(zip(range(nr_of_concepts), scores[j,:]), key=lambda v:v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j,:] = [ap, hit1, hit5]
        avg_perf = res.mean(axis=0)
        print('%s %s' % (os.path.split(datafiles[run_idx])[-1], ' '.join(['%.3f' % x for x in avg_perf])))

    print('#'*100)
    print('%s %s' % ('# untagged-concept', ' '.join([os.path.split(x)[-1] for x in datafiles])))
    print('#'*100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:,c_idx]])))
    print('%s %s' % ('meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])))

    print('#'*100)
    print('# tagged-concept')
    print('#'*100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:,c_idx]])))
    print('%s %s' % ('meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])))

    print('#'*100)
    print('# tagged-concept')
    print('#'*100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg_table[:,c_idx]])))
    print('%s %s' % ('mean%s' % ndcg.name(), ' '.join(['%.3f' % x for x in ndcg_table.mean(axis=1)])))

    print('#'*100)
    print('# tagged-concept')
    print('#'*100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg2_table[:,c_idx]])))
    print('%s %s' % ('mean%s'%ndcg2.name(), ' '.join(['%.3f' % x for x in ndcg2_table.mean(axis=1)])))
Example #55
0
def process(options, collection, annotationName, runfile):
    """Export per-concept rankings of pickled runs and print their AP.

    ``runfile`` lists one pickle path per line (blank lines and lines starting
    with ``#`` are ignored). Each pickle must hold a dict with ``'scores'``
    (2-D array with one column per concept) and ``'id_images'``.

    For every run and concept the images with ground truth are ranked by
    decreasing score and, unless ``checkToSkip`` says otherwise, written via
    ``writeRankingResults`` to

    * ``<rootpath>/<collection>/SimilarityIndex/<collection>/<run>/<concept>.txt``
    * ``<rootpath>/<collection>/SimilarityIndex/<collection>/tagged,lemm/<run>/<concept>.txt``
      (restricted to images listed in
      ``<rootpath>/<collection>/tagged,lemm/<concept>.txt``),

    where ``<run>`` is the pickle's base name without extension. AP tables for
    both rankings are printed to stdout.

    Args:
        options: object providing ``rootpath`` and ``overwrite``.
        collection: collection holding the ground-truth annotations.
        annotationName: annotation name providing the concept list.
        runfile: path of the text file listing run pickles.

    Raises:
        AssertionError: when a run has a different number of concept columns
            or when no ranked image of a concept has ground truth.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection)

    apscorer = getScorer('AP')
    with open(runfile) as f:
        stripped_lines = [line.strip() for line in f]
    datafiles = [x for x in stripped_lines if x and not x.startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]

    for i in range(nr_of_concepts):
        names,labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        # materialize the ids so zip() sees a sequence on Python 3 as well
        names = [int(x) for x in names]
        name2label[i] = dict(zip(names,labels))

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt'% concepts[i])
        try:
            with open(label_file) as f:
                hit_imgset[i] = set(int(x) for x in f)
        except (IOError, OSError, ValueError):
            # missing / unreadable / malformed file: treat as "no tagged images"
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))

    for run_idx in range(nr_of_runs):
        runName = os.path.splitext(os.path.basename(datafiles[run_idx]))[0]
        # NOTE: unpickling can execute arbitrary code -- only list trusted
        # files in runfile.
        with open(datafiles[run_idx],'rb') as f:
            data = pickle.load(f)
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = sorted(zip(imset, scores[:,c_idx]), key=lambda v:(v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            resfile = os.path.join(resultdir, runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(ranklist, resfile)
            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels)>0)

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            # same ranking, restricted to images tagged with the concept
            resfile = os.path.join(resultdir, 'tagged,lemm', runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults([x for x in ranklist if x[0] in hit_imgset[c_idx]], resfile)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)

    print('#'*100)
    print('%s %s' % ('# untagged-concept', ' '.join([os.path.basename(x) for x in datafiles])))
    print('#'*100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:,c_idx]])))
    print('%s %s' % ('meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])))

    print('#'*100)
    print('# tagged-concept')
    print('#'*100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:,c_idx]])))
    print('%s %s' % ('meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])))