def process(options, trainCollection, modelAnnotationName, trainAnnotationName,
            feature):
    rootpath = options.rootpath
    modelName = options.model

    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    concepts = [
        concepts[i] for i in range(len(concepts))
        if (i % options.numjobs + 1) == options.job
    ]

    feat_file = None  # opened lazily in the loop below, once a concept needs re-scoring

    for concept in concepts:
        modelfile = os.path.join(rootpath, trainCollection, 'Models',
                                 modelAnnotationName, feature, modelName,
                                 '%s.model' % concept)
        model = load_model(modelfile)
        (A0, B0) = model.get_probAB()
        if abs(A0) > 1e-8 and not options.overwrite:
            printStatus(INFO,
                        "old parameters exist as A=%g, B=%g, skip" % (A0, B0))
            continue
        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))

        if not feat_file:
            feat_file = BigFile(
                os.path.join(rootpath, trainCollection, "FeatureData",
                             feature))

        results = classify_large_data(model,
                                      names,
                                      feat_file,
                                      prob_output=False)
        labels = [name2label[x[0]] for x in results]
        dec_values = [x[1] for x in results]
        printStatus(
            INFO, "%s +%d -%d" % (concept, len([x for x in labels if x == 1]),
                                  len([x for x in labels if x == -1])))
        [A, B] = sigmoid_train(dec_values, labels)
        model.set_probAB(A, B)
        save_model(modelfile, model)
        (A1, B1) = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (A0, A1, B0, B1))
Example #2
def process(options, collection, feature):
    rootpath = options.rootpath
    tpp = options.tpp
    k = 1000  # options.k
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite

    feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
    feat_file = BigFile(feat_dir)
    hitlists = buildHitLists(collection, tpp, rootpath)
    printStatus(INFO, "nr of tags: %d" % len(hitlists))

    vob = sorted(hitlists.keys())
    vob = [vob[i] for i in range(len(vob)) if i % numjobs == job - 1]
    printStatus(INFO, "working on %d-%d: %d tags" % (numjobs, job, len(vob)))

    for tag_idx, tag in enumerate(vob):
        resultdir = os.path.join(rootpath, collection, "FeatureIndex", feature, tag[:2], tag)
        binfile = os.path.join(resultdir, "feature.bin")
        if checkToSkip(binfile, overwrite):
            continue

        hitlist = hitlists[tag]
        hitlist = hitlist[:k]  # keep at most 1000 images per tag
        renamed, vecs = feat_file.read(hitlist)

        makedirsforfile(binfile)
        np.array(vecs).astype(np.float32).tofile(binfile)
        idfile = os.path.join(resultdir, "id.txt")
        fw = open(idfile, "w")
        fw.write(" ".join(renamed))
        fw.close()

        shapefile = os.path.join(resultdir, "shape.txt")
        fw = open(shapefile, "w")
        fw.write("%d %d" % (len(renamed), len(vecs[0])))
        fw.close()

        if tag_idx % 1000 == 0:
            printStatus(INFO, "%d - %s, %d images" % (tag_idx, tag, len(hitlist)))
Example #4
def process(options, testCollection, trainCollection, annotationName, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    blocksize = options.blocksize
    donefile = options.donefile
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite
    taggerType = options.tagger
    noise = options.noise
    testset = options.testset
    if not testset:
        testset = testCollection

    modelName = taggerType
    if 'pretagvote' == taggerType and noise > 1e-3:
        modelName += '-noise%.2f' % noise
    if 'pqtagvote' == taggerType:
        nnName = "l2knn"
    else:
        nnName = distance + "knn"
    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testset,
                              trainCollection, annotationName, modelName,
                              '%s,%s,%d' % (feature, nnName, k),
                              'id.tagvotes.txt')

    if numjobs > 1:
        resultfile += ".%d.%d" % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    if donefile:
        doneset = set([x.split()[0] for x in open(donefile) if x.strip()])
    else:
        doneset = set()
    printStatus(
        INFO, "%d images have been done already, and they will be ignored" %
        len(doneset))

    workingSet = readImageSet(testCollection, testset, rootpath)
    workingSet = [x for x in workingSet if x not in doneset]
    workingSet = [
        workingSet[i] for i in range(len(workingSet))
        if (i % numjobs + 1) == job
    ]

    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData',
                                 feature)
    test_feat_file = BigFile(test_feat_dir)

    tagger = NAME_TO_TAGGER[taggerType](trainCollection,
                                        annotationName,
                                        feature,
                                        distance,
                                        rootpath=rootpath)
    tagger.k = k
    tagger.noise = noise

    printStatus(
        INFO, "working on %d-%d, %d test images -> %s" %
        (numjobs, job, len(workingSet), resultfile))

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0.0
    test_time = 0.0
    start = 0
    done = 0

    while start < len(workingSet):
        end = min(len(workingSet), start + blocksize)
        printStatus(INFO, 'tagging images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, vectors = test_feat_file.read(workingSet[start:end])
        nr_images = len(renamed)
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * nr_images
        for i in range(nr_images):
            tagvotes = tagger.predict(content=vectors[i],
                                      context='%s,%s' %
                                      (testCollection, renamed[i]))
            output[i] = '%s %s\n' % (renamed[i], " ".join([
                "%s %s" % (tag, niceNumber(vote, 6))
                for (tag, vote) in tagvotes
            ]))
        test_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        done += len(output)

    fw.close()
    printStatus(
        INFO, '%d images tagged, read time %g seconds, test time %g seconds' %
        (done, read_time, test_time))
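Each output line written above is the image id followed by alternating tag/score pairs. A small helper for reading that id.tagvotes.txt format back (a sketch, not part of the original codebase):

def parse_tagvotes(line):
    # "<image_id> <tag1> <score1> <tag2> <score2> ..."
    parts = line.strip().split()
    im_id, rest = parts[0], parts[1:]
    return im_id, [(rest[i], float(rest[i + 1])) for i in range(0, len(rest), 2)]

assert parse_tagvotes('im1 cat 0.9 dog 0.1') == ('im1', [('cat', 0.9), ('dog', 0.1)])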
Example #5
def process(options, trainCollection, annotationfile, feature, modelName):
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    trainAnnotationNames = [
        x.strip() for x in open(annotationfile).readlines()
        if x.strip() and not x.strip().startswith('#')
    ]
    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations',
                                   annotationName)
        if not os.path.exists(conceptfile):
            print '%s does not exist' % conceptfile
            return 0

    concepts = readConcepts(trainCollection,
                            trainAnnotationNames[0],
                            rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models',
                             newAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t in range(1, len(trainAnnotationNames) + 1):
            names, labels = readAnnotationsFrom(trainCollection,
                                                trainAnnotationNames[t - 1],
                                                concept,
                                                skip_0=True,
                                                rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]
            np = len([1 for lab in labels if 1 == lab])
            nn = len([1 for lab in labels if -1 == lab])
            wp = float(beta) * (np + nn) / np
            wn = (1.0 - beta) * (np + nn) / nn

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
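The add_fastsvm weights (1 - 1/t, 1/t) used above keep the assembled model equal to the uniform average of the t per-annotation models. A quick scalar check of that recurrence:

def running_average(values):
    # after step t, avg == mean(values[:t]), mirroring
    # assemble_model = (1 - 1/t) * assemble_model + (1/t) * new_model
    avg = values[0]
    for t, v in enumerate(values[1:], start=2):
        avg = (1 - 1.0 / t) * avg + (1.0 / t) * v
    return avg

assert abs(running_average([1.0, 2.0, 6.0]) - 3.0) < 1e-12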
Example #6
def process(options, testCollection, trainCollection, trainAnnotationName,
            feature, modelName):
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize

    outputName = '%s,%s' % (feature, modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging',
                              testCollection, trainCollection,
                              trainAnnotationName, outputName,
                              'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [
        test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job
    ]
    nr_of_test_images = len(test_imset)
    printStatus(
        INFO, "working on %d-%d, %d test images -> %s" %
        (numjobs, job, nr_of_test_images, resultfile))

    models = [None] * nr_of_concepts
    for c in range(nr_of_concepts):
        model_file_name = os.path.join(rootpath, trainCollection, 'Models',
                                       trainAnnotationName, feature, modelName,
                                       '%s.model' % concepts[c])
        models[c] = load_model(model_file_name)
        if models[c] is None:
            return 0
        #(pA,pB) = model.get_probAB()

    feat_file = BigFile(
        os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0
    test_time = 0
    start = 0
    done = 0

    while start < nr_of_test_images:
        end = min(nr_of_test_images, start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, test_X = feat_file.read(test_imset[start:end])
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * len(renamed)
        for i in xrange(len(renamed)):
            if prob_output:
                scores = [
                    models[c].predict_probability(test_X[i])
                    for c in range(nr_of_concepts)
                ]
            else:
                scores = [
                    models[c].predict(test_X[i]) for c in range(nr_of_concepts)
                ]
            #dec_value = sigmoid_predict(dec_value, A=pA, B=pB)
            tagvotes = sorted(zip(concepts, scores),
                              key=lambda v: v[1],
                              reverse=True)
            output[i] = '%s %s\n' % (renamed[i], " ".join([
                "%s %s" % (tag, niceNumber(vote, 6))
                for (tag, vote) in tagvotes
            ]))
        test_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        fw.flush()
        done += len(output)

    # done
    printStatus(
        INFO, "%d done. read time %g seconds, test_time %g seconds" %
        (done, read_time, test_time))
    fw.close()
    return done
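When numjobs > 1, each worker appends a '.numjobs.job' suffix to resultfile, so one id.tagvotes.txt shard is produced per job. A hypothetical post-processing step that concatenates the shards back into a single file, assuming the '.%d.%d' naming convention used above:

import glob

with open('id.tagvotes.txt', 'w') as merged:
    for shard in sorted(glob.glob('id.tagvotes.txt.*.*')):
        merged.write(open(shard).read())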
Example #7
testCollection = 'mirflickr08'

from basic.constant import ROOT_PATH
rootpath = ROOT_PATH
conceptfile = os.path.join(rootpath, trainCollection, 'Annotations',
                           trainAnnotationName)

from basic.annotationtable import writeConceptsTo
writeConceptsTo(test_tags, trainCollection, trainAnnotationName)

cmd = '%s/util/imagesearch/obtain_labeled_examples.py %s %s' % (
    parent_dir, trainCollection, conceptfile)
os.system('python ' + cmd)

train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData',
                              feature)
from util.simpleknn.bigfile import BigFile
train_feat_file = BigFile(train_feat_dir)
feat_dim = train_feat_file.ndims

from basic.util import readImageSet
test_imset = readImageSet(testCollection)
test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
test_feat_file = BigFile(test_feat_dir)
#test_renamed, test_vectors = test_feat_file.read(test_imset)

from model_based.dataengine.positiveengine import PositiveEngine
from model_based.dataengine.negativeengine import NegativeEngine

pe = PositiveEngine(trainCollection)
ne = NegativeEngine(trainCollection)

for tag in test_tags:
Example #9
def process(options, trainCollection, baseAnnotationName, startAnnotationName,
            feature, modelName):
    global train_model, compress_model, save_model
    assert (modelName in ['fik', 'fastlinear'])
    if 'fik' == modelName:
        from model_based.svms.fiksvm.svmutil import svm_train as train_model
        from model_based.svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from model_based.svms.fiksvm.fiksvm import fiksvm_save_model as save_model
    else:
        from model_based.svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from model_based.svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from model_based.svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    params = {
        'rootpath': rootpath,
        'trainCollection': trainCollection,
        'baseAnnotationName': baseAnnotationName,
        'startAnnotationName': startAnnotationName,
        'feature': feature,
        'model': modelName,
        'strategy': options.strategy,
        'iterations': options.iterations,
        'npr': options.npr,
        'nr_bins': options.nr_bins
    }

    concepts = readConcepts(trainCollection, startAnnotationName, rootpath)
    newAnnotationName = get_new_annotation_name(params)
    newModelName = get_model_name(params)
    modeldir = os.path.join(rootpath, trainCollection, 'Models',
                            newAnnotationName, feature, newModelName)
    todo = [
        concept for concept in concepts
        if overwrite or not os.path.exists(os.path.join(modeldir, '%s.txt' % concept))
    ]
    activeConcepts = [
        todo[i] for i in range(len(todo))
        if (i % options.numjobs + 1) == options.job
    ]

    params['feat_file'] = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))

    if 'fik' == modelName:
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))

    s_time = time.time()

    for concept in activeConcepts:
        printStatus(INFO, 'processing %s' % concept)
        modelfile = os.path.join(modeldir, '%s.model' % concept)
        if checkToSkip(modelfile, overwrite):
            continue
        new_model = NegativeBootstrap.learn(concept, params)
        makedirsforfile(modelfile)
        printStatus(INFO, 'save model to %s' % modelfile)
        save_model(modelfile, new_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(
        INFO, 'done for %g concepts: %s' %
        (len(activeConcepts), ' '.join(activeConcepts)))
    printStatus(INFO, 'models stored at %s' % modeldir)
    printStatus(INFO, '%g seconds in total' % timecost)
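The activeConcepts selection above uses the same round-robin partitioning that appears throughout these examples: with 1-based job ids, worker `job` takes every numjobs-th item. A self-contained illustration:

def partition(items, numjobs, job):
    # item i goes to worker (i % numjobs) + 1
    return [x for i, x in enumerate(items) if (i % numjobs) + 1 == job]

assert partition(list(range(10)), 3, 1) == [0, 3, 6, 9]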
Example #11
0
def process(options, testCollection, trainCollection, feature):
    rootpath = options.rootpath
    overwrite = options.overwrite
    tpp = options.tpp
    doRandomwalk = 1  #options.doRandomwalk
    uniqueUser = 0  #options.uniqueUser
    k = 1000  #options.k
    numjobs = options.numjobs
    job = options.job

    #resultfile = os.path.join(rootpath, testCollection, "tagrel", testCollection, trainCollection,
    #                          "%s,tagrank%d%d,%d,%s" % (feature,doRandomwalk,uniqueUser,k,tpp), "id.tagvotes.txt")

    resultfile = os.path.join(rootpath, testCollection, "tagrel",
                              testCollection, trainCollection,
                              '%s,tagrank,%s' % (feature, tpp),
                              'id.tagvotes.txt')

    if numjobs > 1:
        resultfile = resultfile + '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    try:
        doneset = set(
            [str.split(x)[0] for x in open(options.donefile).readlines()[:-1]])
    except:
        doneset = set()

    printStatus(INFO, "done set: %d" % len(doneset))

    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [
        testImageSet[i] for i in range(len(testImageSet))
        if (i % numjobs + 1) == job
    ]
    printStatus(
        INFO, 'working on %d-%d, %d test images -> %s' %
        (numjobs, job, len(testImageSet), resultfile))

    testreader = TagReader(testCollection, rootpath=rootpath)
    test_feat_file = BigFile(
        os.path.join(rootpath, testCollection, 'FeatureData', feature))
    block_size = 100

    tagranking = TagRanking(trainCollection,
                            feature=feature,
                            k=k,
                            rootpath=rootpath)

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0

    nr_of_blocks = len(testImageSet) // block_size
    if nr_of_blocks * block_size < len(testImageSet):
        nr_of_blocks += 1

    for block_index in range(nr_of_blocks):
        start = block_index * block_size
        end = min(len(testImageSet), start + block_size)
        subset = testImageSet[start:end]
        if not subset:
            break
        renamed, features = test_feat_file.read(subset)
        printStatus(INFO, '%d - %d: %d images' % (start, end, len(subset)))

        output = []
        for i in range(len(renamed)):
            qry_id = renamed[i]
            qry_tags = testreader.get(qry_id)
            qry_vec = features[i]
            tagvotes = tagranking.estimate(
                qry_vec,
                qry_tags)  #, uniqueUser=uniqueUser, doRandomwalk=doRandomwalk)
            newline = "%s %s" % (qry_id, " ".join(
                ["%s %g" % (x[0], x[1]) for x in tagvotes]))
            output.append(newline + "\n")
            done += 1

        #printStatus(INFO, '%d %s %s' % (done,qry_id,' '.join(['%s:%g' % (x[0],x[1]) for x in tagvotes[:3]] )))
        fw.write("".join(output))
        fw.flush()

    fw.close()
    printStatus(INFO, 'done')
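The two-step block count above (integer division plus a remainder check) is a ceiling division; a compact equivalent:

def nr_of_blocks(n_items, block_size):
    # ceiling division: same result as divide-then-check-remainder
    return (n_items + block_size - 1) // block_size

assert nr_of_blocks(250, 100) == 3
assert nr_of_blocks(200, 100) == 2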
Example #13
                'tags': ''
            } for x in selected]
        resp['content'] = content[:max_hits]
        return render.index(resp)


class ImageSearch:
    def POST(self):
        input = web.input()
        raise web.seeother('/?query=%s' % input.query)


if __name__ == "__main__":
    app = web.application(urls, globals())

    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData',
                                 feature)
    web.test_feat_file = BigFile(test_feat_dir)

    from knntagger import TagVoteTagger as ImageTagger
    web.tagger = ImageTagger(trainCollection, rootpath)

    train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData',
                                  feature)
    web.searcher = load_model(train_feat_dir)
    web.searcher.set_distance(distance)

    web.imset = readImageSet(testCollection, testCollection, rootpath)

    app.run()
Example #14
def process(options, trainCollection, trainAnnotationName, feature):
    import re
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    best_param_dir = options.best_param_dir
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    beta = 0.5
    
    modelName = 'fastlinear'
    if best_param_dir:
        modelName += '-tuned'
    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models',
                             trainAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    if not todo:
        return 0

    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))

    feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    
    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            m = p.search(open(param_file).readline().strip())
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))
        
        model_file_name = os.path.join(resultdir, concept + '.model')
        
        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName,
                                            concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        # no bias term added by setting "-B -1"
        svm_params = '-w1 %g -w-1 %g -s 2 -B -1 -q' % (wp * C, wn * C)
        model = liblinear_train(y, vectors, svm_params)
        newmodel = liblinear_to_fastlinear([model], [1.0], feat_file.ndims)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fastlinear_save_model(model_file_name, newmodel)

        # reload the model file and check the reloaded copy, not the in-memory one
        reloaded = fastlinear_load_model(model_file_name)
        assert abs(reloaded.get_probAB()[0] - A) < 1e-6
        assert abs(reloaded.get_probAB()[1] - B) < 1e-6

    return len(todo)
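The best_C regex at the top of this example expects lines of the form 'best_C=<C>, a=<a>, b=<b>'. A quick check against a hypothetical parameter line (the real files come from the parameter-tuning step):

import re

p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')
m = p.search('best_C=1, a=-1.25, b=0.5')
assert (m.group('C'), m.group('a'), m.group('b')) == ('1', '-1.25', '0.5')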
Example #15
def process(options, trainCollection, testCollection, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    blocksize = options.blocksize
    uniqueUser = options.uu
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite
    testset = options.testset
    if not testset:
        testset = testCollection

    searchMethod = distance + 'knn'
    if uniqueUser:
        searchMethod += ",uu"
        tagfile = os.path.join(rootpath, trainCollection, 'TextData', 'id.userid.lemmtags.txt')
        im2user = {}
        for line in open(tagfile):
            im, userid, tags = line.split('\t')
            im2user[im] = userid

    resultdir = os.path.join(rootpath, testCollection, "SimilarityIndex", testset,
                             trainCollection, "%s,%s,%d" % (feature, searchMethod, k))
    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')
    nr_of_images, feat_dim = map(int, open(shape_file).readline().split())
    # recount from id.txt, overriding the image count read from shape.txt
    nr_of_images = len(open(id_file).readline().strip().split())
    searcher = imagesearch.load_model(os.path.join(feat_dir, 'feature.bin'),
                                      feat_dim, nr_of_images, id_file)
    searcher.set_distance(distance)

    workingSet = readImageSet(testCollection, testset, rootpath=rootpath)
    workingSet = [workingSet[i] for i in range(len(workingSet)) if (i % numjobs + 1) == job]
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, len(workingSet), resultdir))
    
    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
    test_feat_file = BigFile(test_feat_dir)

    read_time = 0
    knn_time = 0
    start = 0
    done = 0
    filtered = 0

    while start < len(workingSet):
        end = min(len(workingSet), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, vectors = test_feat_file.read(workingSet[start:end])
        read_time += time.time() - s_time
        nr_images = len(renamed)

        s_time = time.time()
        for i in range(nr_images):
            resultfile = os.path.join(resultdir, renamed[i][-2:], '%s.txt' % renamed[i])
            if checkToSkip(resultfile, overwrite):
                continue
            knn = searcher.search_knn(vectors[i], max_hits=max(3000, k * 3))
            if uniqueUser:
                removed, newknn = unique_user_constraint(knn, im2user, k)
                filtered += removed
                knn = newknn
            else:
                knn = knn[:k]
            assert len(knn) >= k
            writeRankingResults(knn, resultfile)
            done += 1
        printStatus(INFO, 'job %d-%d: %d done, filtered neighbors %d' % (numjobs, job, done, filtered))
        start = end

    printStatus(INFO, 'job %d-%d: %d done, filtered neighbors %d' % (numjobs, job, done, filtered))
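unique_user_constraint is imported from the codebase and not shown here; judging only from the call site above, it drops ranked neighbors whose uploader already contributed one, until k survive. A hypothetical re-implementation under that assumption:

def unique_user_constraint(knn, im2user, k):
    # knn: ranked (image_id, score) pairs; keep at most one image per user
    seen, kept, removed = set(), [], 0
    for im, score in knn:
        user = im2user.get(im)
        if user in seen:
            removed += 1
            continue
        seen.add(user)
        kept.append((im, score))
        if len(kept) == k:
            break
    return removed, kept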
Example #17
        sys.exit(0)

    nr_of_images_list = []
    feat_dim_list = []
    feat_files = []

    for feature in srcfeatures:
        shapefile = os.path.join(rootpath, collection, 'FeatureData', feature,
                                 'shape.txt')
        nr_of_images, feat_dim = map(
            int,
            open(shapefile).readline().strip().split())
        nr_of_images_list.append(nr_of_images)
        feat_dim_list.append(feat_dim)
        feat_files.append(
            BigFile(os.path.join(rootpath, collection, 'FeatureData',
                                 feature)))

    #assert(nr_of_images_list[0] == nr_of_images_list[1])
    new_feat_dim = sum(feat_dim_list)

    imset = readImageSet(collection, collection, rootpath)
    nr_of_images = len(imset)
    blocksize = 1000

    makedirsforfile(binary_file)
    fw = open(binary_file, 'wb')
    new_imset = []
    start = 0

    while start < nr_of_images:
        end = min(nr_of_images, start + blocksize)