Example #1
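# For each concept, load the trained model, classify its training annotations, fit Platt-scaling parameters (A, B) with sigmoid_train, and store them back into the model file.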
def process(options, trainCollection, modelAnnotationName, trainAnnotationName, feature):
    rootpath = options.rootpath
    modelName = options.model

    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model


    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    concepts = [concepts[i] for i in range(len(concepts)) if (i%options.numjobs + 1) == options.job]

    feat_file = BigFile(os.path.join(rootpath, trainCollection, "FeatureData", feature))

    for concept in concepts:
        modelfile = os.path.join(rootpath, trainCollection, 'Models', modelAnnotationName, feature, modelName, '%s.model' % concept)
        model = load_model(modelfile)
        (A0, B0) = model.get_probAB()
        if abs(A0) > 1e-8 and not options.overwrite:
            printStatus(INFO, "old parameters exist as A=%g, B=%g, skip" % (A0, B0))
            continue
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        results = classify_large_data(model, names, feat_file, prob_output=False)
        labels = [name2label[x[0]] for x in results]
        dec_values = [x[1] for x in results]
        printStatus(INFO, "%s +%d -%d" % (concept, len([x for x in labels if x==1]), len([x for x in labels if x==-1])))
        [A,B] = sigmoid_train(dec_values, labels)
        model.set_probAB(A, B)
        save_model(modelfile, model)
        (A1, B1) = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (A0, A1, B0, B1))
Example #2
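# Score each searcher on every concept's annotated images and print per-concept and mean performance for the chosen metric.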
def evaluateSearchEngines(searchers,
                          collection,
                          annotationName,
                          metric,
                          rootpath=ROOT_PATH):
    scorer = getScorer(metric)
    concepts = readConcepts(collection, annotationName, rootpath)

    nr_of_runs = len(searchers)
    nr_of_concepts = len(concepts)
    results = np.zeros((nr_of_concepts, nr_of_runs))

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName,
                                            concepts[i], rootpath)
        name2label = dict(zip(names, labels))

        for j in range(nr_of_runs):
            searchresults = searchers[j].scoreCollection(concepts[i])
            sorted_labels = [
                name2label[name] for (name, score) in searchresults
                if name in name2label
            ]
            results[i, j] = scorer.score(sorted_labels)

    for i in range(nr_of_concepts):
        print concepts[i], ' '.join([niceNumber(x, 3) for x in results[i, :]])
    mean_perf = results.mean(0)
    print 'mean%s' % metric, ' '.join([niceNumber(x, 3) for x in mean_perf])

    return concepts, results
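Example #3
# Create pos_bag_num x neg_bag_num annotation bags per concept by randomly sampling pos_num positives and a proportional number of negatives, write each bag as a new annotation file, and list the bag names in a script file.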
def process(options, collection, annotationName, pos_num):
    assert(annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    annotationNameStr = annotationName[:-4] + ('.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    
    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)

    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath,collection,'Annotations',newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts,resultfile)

    first,second,last = annotationNameStr.split('%d')
    scriptfile = os.path.join(rootpath,collection,'annotationfiles',first + '0-%d'%(pos_bag_num-1) + second + '0-%d'%(neg_bag_num-1) + last)

    makedirsforfile(scriptfile)
    fout = open(scriptfile,'w')
    fout.write('\n'.join(newAnnotationNames) + '\n')
    fout.close()

    if len(newAnnotationNames) == skip:
        return 0
        
    for concept in concepts:
        names,labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        positivePool = [x[0] for x in zip(names,labels) if x[1]>0]
        negativePool = [x[0] for x in zip(names,labels) if x[1]<0]
        
        for idxp in range(pos_bag_num):
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool
            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath,collection,'Annotations','Image',newAnnotationName,'%s.txt'%concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue
                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)

                assert(len(set(positiveBag).intersection(set(negativeBag))) == 0)
                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept,anno_idx,len(positiveBag),len(negativeBag),resultfile))
                writeAnnotations(positiveBag + negativeBag, [1]*len(positiveBag) + [-1]*len(negativeBag), resultfile)
Example #4
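# web.py GET handler: look up the query concept's annotations, build a ranked list (labeled set or precomputed similarity file), color each hit by its label, and report the scorer's performance.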
    def GET(self):
        input = web.input(query=None)
        resp = {
            'status': 0,
            'hits': 0,
            'random': [],
            'tagrel': [],
            'metric': metric,
            'perf': 0
        }

        if input.query:
            resp['status'] = 1
            resp['query'] = input.query
            query = input.query.lower()

            if query.isdigit():  # request to view a specific image
                resp['hits'] = 1
                resp['tagrel'] = [{'id': query}]
                return render.index(resp)

            try:
                names, labels = readAnnotationsFrom(collection, annotationName,
                                                    query)
                name2label = dict(zip(names, labels))
            except Exception, e:
                name2label = {}

            content = []
            try:
                if input.tagrel == '0':
                    labeled = readLabeledImageSet(collection,
                                                  query,
                                                  rootpath=rootpath)
                    ranklist = [(x, 0) for x in labeled]
                else:
                    simfile = os.path.join(simdir, '%s.txt' % query)
                    ranklist = readRankingResults(simfile)
                resp['hits'] = len(ranklist)
                for name, score in ranklist:
                    color = 'Chartreuse' if name2label.get(name,
                                                           0) > 0 else 'red'
                    color = 'white' if name not in name2label else color
                    res = {'id': name, 'color': color}
                    content.append(res)
                resp['perf'] = 0 if not name2label else scorer.score(
                    [name2label[x[0]] for x in ranklist if x[0] in name2label])
                resp['tagrel'] = content[:max_hits]
            except:
                pass
Example #5
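# The same GET handler as Example #4, shown here in its original compact formatting.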
    def GET(self):
        input = web.input(query=None)
        resp = {'status':0, 'hits':0, 'random':[], 'tagrel':[], 'metric':metric, 'perf':0}

        if input.query:
            resp['status'] = 1
            resp['query'] = input.query
            query = input.query.lower()

            if query.isdigit(): # request to view a specific image
                resp['hits'] = 1
                resp['tagrel'] = [{'id':query}]
                return  render.index(resp)
            
            try:
                names,labels = readAnnotationsFrom(collection, annotationName, query)
                name2label = dict(zip(names,labels))
            except Exception, e:
                name2label = {}

            content = []
            try:
                if input.tagrel == '0':
                    labeled = readLabeledImageSet(collection, query, rootpath=rootpath)
                    ranklist = [(x,0) for x in labeled]
                else:
                    simfile = os.path.join(simdir, '%s.txt' % query)
                    ranklist = readRankingResults(simfile)
                resp['hits'] = len(ranklist)
                for name,score in ranklist:
                    color = 'Chartreuse' if name2label.get(name,0)>0 else 'red'
                    color = 'white' if name not in name2label else color
                    res = {'id':name, 'color':color}
                    content.append(res)
                resp['perf'] = 0 if not name2label else scorer.score([name2label[x[0]] for x in ranklist if x[0] in name2label])
                resp['tagrel'] = content[:max_hits]
            except:
                pass
Example #6
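# The same evaluateSearchEngines routine as Example #2, shown here in its original compact formatting.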
def evaluateSearchEngines(searchers, collection, annotationName, metric, rootpath=ROOT_PATH):
    scorer = getScorer(metric)
    concepts = readConcepts(collection, annotationName, rootpath)
    
    nr_of_runs = len(searchers)
    nr_of_concepts = len(concepts)
    results = np.zeros((nr_of_concepts,nr_of_runs))


    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], rootpath)
        name2label = dict(zip(names,labels))
        
        for j in range(nr_of_runs):
            searchresults = searchers[j].scoreCollection(concepts[i])
            sorted_labels = [name2label[name] for (name,score) in searchresults if name in name2label]
            results[i,j] = scorer.score(sorted_labels)

    for i in range(nr_of_concepts):
        print concepts[i], ' '.join([niceNumber(x,3) for x in results[i,:]])
    mean_perf = results.mean(0)
    print 'mean%s'%metric, ' '.join([niceNumber(x,3) for x in mean_perf])

    return concepts,results
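Example #7
# Per concept, sweep the SVM cost parameter C, evaluate each trained model on the validation collection, fit sigmoid parameters on the best-scoring run, and write the best settings to a text file.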
def process(options, trainCollection, trainAnnotationName, valCollection,
            valAnnotationName, feature, modelName):
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    valConcepts = readConcepts(valCollection,
                               valAnnotationName,
                               rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert (concepts[i] == valConcepts[i])

    resultdir = os.path.join(
        rootpath, trainCollection, 'Models', trainAnnotationName,
        '%s,best_params' % modelName,
        '%s,%s,%s' % (valCollection, valAnnotationName, feature))
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(
        os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert (feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        names, labels = readAnnotationsFrom(valCollection,
                                            valAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '
            #print modelName, '>'*20, svm_params
            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i]))
                        for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)

        printStatus(
            INFO,
            '%s -> worstAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' %
            (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        fw = open(resultfile, 'w')
        fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
        fw.close()
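Example #8
# Train a fiksvm model per concept, optionally reusing previously tuned parameters (C, A, B parsed from best_param_dir), save it, and reload it as a sanity check.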
def process(options, trainCollection, trainAnnotationName, feature):
    import re
    p = re.compile(
        r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    nr_bins = options.nr_bins
    best_param_dir = options.best_param_dir
    beta = 0.5

    modelName = 'fik%d' % nr_bins
    if best_param_dir:
        modelName += '-tuned'

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models',
                             trainAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)
    params = {'nr_bins': nr_bins}

    with open(os.path.join(feat_dir, 'minmax.txt'), 'r') as f:
        params['min_vals'] = map(float, str.split(f.readline()))
        params['max_vals'] = map(float, str.split(f.readline()))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            m = p.search(open(param_file).readline().strip())
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
        model = svm_train(
            y, vectors,
            svm_params + ' -s 0 -t %d -q' % KERNEL_TYPE.index("HI"))
        newmodel = svm_to_fiksvm([model], [1.0], feat_file.ndims, params)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fiksvm_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        fiksvm_load_model(model_file_name)
        assert (abs(newmodel.get_probAB()[0] - A) < 1e-6)
        assert (abs(newmodel.get_probAB()[1] - B) < 1e-6)

    return len(todo)
Example #9
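# Train an HIK SVM for the concept named on the command line using cross-validation, then convert the result to a fiksvm model.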
    feat_dim = 1000
    scorer = getScorer("AP")
    
    targetConcept = sys.argv[1] #"aeroplane"

    train_feat_file = BigFile(os.path.join(ROOT_PATH, trainCollection, "FeatureData", feature), feat_dim)
    test_feat_file = BigFile(os.path.join(ROOT_PATH, testCollection, "FeatureData", feature), feat_dim)
    testImageSet = test_feat_file.names #random.sample(test_feat_file.names, 10000)
    
    minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
    with open(minmax_file, 'r') as f:
        min_vals = map(float, str.split(f.readline()))
        max_vals = map(float, str.split(f.readline()))


    [names,labels] = readAnnotationsFrom(collection=trainCollection, annotationName=trainAnnotationName, concept=targetConcept, rootpath=rootpath)
    name2label = dict(zip(names,labels))
    (renamed, vectors) = train_feat_file.read(names)
    relabeled = [name2label[x] for x in renamed] #label is either 1 or -1
    
    [names,labels] = readAnnotationsFrom(collection=testCollection, annotationName=testAnnotationName, concept=targetConcept, rootpath=rootpath)
    test2label = dict(zip(names,labels))
    

    for beta in [0.5]: #[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        #model = hiksvm_train(relabeled, vectors, beta=beta)
        cv = 3
        best_beta, cv_score, model = hiksvm_train_cv(relabeled, vectors, cv, scorer, min_vals, max_vals)
        print best_beta, cv_score
        #fikmodel = svm_to_fiksvm([model], [1.0], 1, dim, 50)
        fikmodel = svm_to_fiksvm([model], 1, [1.0], feat_dim=feat_dim, min_vals=min_vals, max_vals=max_vals, num_bins=50)
Example #10
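# Rank the development images for every query with the selected method (se, t2i, or i2t), score each ranking with the chosen metric, and write the ranking and DCG results.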
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    method = options.method
    metric = options.metric

    qrysim = options.qrysim
    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature

    # semantic embedding
    k = options.k
    corpus = options.corpus
    word2vec_model = options.word2vec
    label_source = options.label_source

    # result path
    ranking_result_path = os.path.join(rootpath, devCollection,
                                       'SimilarityIndex', devCollection,
                                       'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, devCollection, 'DCG', method,
                                   feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations',
                                  'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)  #(qid query)
    qid2query = dict(zip(qid_list,
                         [qp.process(query) for query in query_list]))

    # path of image feature
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData',
                                 feature)

    # method selection
    if method == 'se':
        se_searcher = SemanticEmbedding(label_source, corpus, word2vec_model,
                                        dev_feat_path, rootpath)

    elif method == 't2i':
        nnquery_file = os.path.join(rootpath, devCollection, 'TextData',
                                    'querynn', options.nnqueryfile)
        qryClick_file = os.path.join(rootpath, trainCollection, 'TextData',
                                     options.queryclickfile)
        t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path,
                                  train_feat_path, ntopqry)

    elif method == 'i2t':
        nnimage_file = os.path.join(rootpath, devCollection, 'TextData',
                                    'imagenn', feature, options.nnimagefile)
        imgClick_file = os.path.join(rootpath, trainCollection, 'TextData',
                                     options.imageclickfile)
        i2t_searcher = Image2Text(nnimage_file, imgClick_file, qrysim, ntopimg,
                                  ntopqry)

    else:
        print "this model is not supported with %s" % method
        sys.exit(0)

    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for query_id in qid_list:

        iid_list, label_list = readAnnotationsFrom(
            devCollection, 'concepts%s.txt' % devCollection, query_id, False,
            rootpath)

        if method == 'se':
            scorelist = se_searcher.do_search(qid2query[query_id], iid_list, k)

        elif method == 't2i':
            scorelist = t2i_searcher.text2image(query_id, iid_list, qrythres,
                                                mincc)

        elif method == 'i2t':
            scorelist = i2t_searcher.image2text(qid2query[query_id], iid_list,
                                                mincc)

        if len(scorelist) == 0:
            failed_count += 1
            scorelist = [0] * len(iid_list)
            qid2iid_label_score[query_id] = zip(iid_list, label_list,
                                                scorelist)
            random.shuffle(qid2iid_label_score[query_id])
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list,
                                                scorelist)
            qid2iid_label_score[query_id] = sorted(
                qid2iid_label_score[query_id],
                key=lambda v: v[2],
                reverse=True)

        # calculate DCG@25 for the ranking produced by our model
        qid2dcg[query_id] = scorer.score(
            [x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if (done % 20 == 0):
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed query: %d" % failed_count
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) /
                                  len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
Example #11
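# Train one model per annotation file listed in annotationfile and merge the models incrementally (add_fastsvm with weight 1/t) into a single assembled model per concept.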
def process(options, trainCollection, annotationfile, feature, modelName):
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    trainAnnotationNames = [
        x.strip() for x in open(annotationfile).readlines()
        if x.strip() and not x.strip().startswith('#')
    ]
    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations',
                                   annotationName)
        if not os.path.exists(conceptfile):
            print '%s does not exist' % conceptfile
            return 0

    concepts = readConcepts(trainCollection,
                            trainAnnotationNames[0],
                            rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models',
                             newAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t in range(1, len(trainAnnotationNames) + 1):
            names, labels = readAnnotationsFrom(trainCollection,
                                                trainAnnotationNames[t - 1],
                                                concept,
                                                skip_0=True,
                                                rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]
            np = len([1 for lab in labels if 1 == lab])
            nn = len([1 for lab in labels if -1 == lab])
            wp = float(beta) * (np + nn) / np
            wn = (1.0 - beta) * (np + nn) / nn

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
Example #12
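# Replace the random positives of each annotation bag with the top-ranked images from a tag-relevance run (excluding the holdout set), keeping the original negatives.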
    #tagrelMethod = 'flickr1m/ccgd,knn,1000'

    concepts = readConcepts(collection, sourceAnnotationName%0, rootpath)

    holdoutfile = os.path.join(rootpath, collection, "ImageSets", "holdout.txt") 
    holdoutSet = set(map(str.strip, open(holdoutfile).readlines()))
    print ('%s holdout %d' % (collection,len(holdoutSet)))
 
    for concept in concepts:
        simfile = os.path.join(rootpath, collection, 'SimilarityIndex', collection, 'tagged,lemm', tagrelMethod, '%s.txt' % concept)
        searchresults = readRankingResults(simfile)
        searchresults = [x for x in searchresults if x[0] not in holdoutSet]
        positiveSet = [x[0] for x in searchresults[:numPos]]
                
        for t in range(T):
            newAnnotationName = sourceAnnotationName % t
            newAnnotationName = newAnnotationName.replace('rand%d.0'%numPos, posName)
            names,labels = readAnnotationsFrom(collection,sourceAnnotationName%t,concept,rootpath)
            
            negativeSet = [x[0] for x in zip(names,labels) if -1 == x[1]]
            renamed = positiveSet + negativeSet
            relabeled = [1] * len(positiveSet) + [-1] * len(negativeSet)
            print ('[%s] %s +%d, -%d -> %s' % (concept,sourceAnnotationName % t,len(positiveSet),len(negativeSet),newAnnotationName)) 
            writeAnnotationsTo(renamed, relabeled, collection, newAnnotationName, concept, rootpath)
            
    for t in range(T):
        newAnnotationName = sourceAnnotationName % t
        newAnnotationName = newAnnotationName.replace('rand%d.0'%numPos, posName)
        writeConceptsTo(concepts, collection, newAnnotationName, rootpath)
        
Example #13
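# For each concept, load the trained hiksvm model, convert it to fiksvm with two different conversion routines, and combine the converted models for comparison.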
    concepts = readConcepts(testCollection, 'conceptsvoc2008val.txt')
    scorer = getScorer('AP')

    min_vals, max_vals = find_min_max_vals(BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature), FEATURE_TO_DIM[feature]))
    featurefile = os.path.join(rootpath, testCollection, "FeatureData", feature, "id.feature.txt")

    feat_dim = 1024
    num_bins = 50

    #fikmodel.set_probAB(-1, 0)
    
    #print "fik model0", fikmodel0.get_nr_svs(), fikmodel0.get_feat_dim(), fikmodel0.get_probAB()
    #print "fik model", fikmodel.get_nr_svs(), fikmodel.get_feat_dim(), fikmodel.get_probAB()
    mAP = [0]*4
    for concept in concepts:
        names,labels = readAnnotationsFrom(testCollection, 'conceptsvoc2008val.txt', concept)
        name2label = dict(zip(names,labels))
        ranklist = []

        modelfile = os.path.join(rootpath, trainCollection, "Models", annotationName, feature, 'hiksvm', "%s.model" % concept)
        #print modelfile
        model = svm_load_model(modelfile)
        #print model.get_svm_type()
        #print model.get_nr_class()
        svm_models = [model, model]
        num_models = len(svm_models)
        fikmodel0 = svm_to_fiksvm0(svm_models, [1.0/num_models]*num_models, num_models, feat_dim, num_bins)
        fikmodel1 = svm_to_fiksvm(svm_models, num_models, [1.0/num_models]*num_models, feat_dim, min_vals, max_vals, num_bins)
        fikmodel2 = svm_to_fiksvm(svm_models, num_models, [1.0/num_models]*num_models, feat_dim, min_vals, max_vals, num_bins)
        fikmodel2.add_new_fikmodel(fikmodel1, 0.5)
        print concept, fikmodel1.get_nr_svs(), fikmodel1.get_nr_svs() + fikmodel1.get_nr_svs()/2,
Example #14
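# Run text-to-image search for each query, fall back to a precomputed random-performance score when the search returns nothing, and report DCG@25.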
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    metric = options.metric

    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature


    # result path
    ranking_result_path = os.path.join(rootpath, devCollection, 'SimilarityIndex', devCollection, 'MetaData', 'text2image', feature)
    DCG_result_path = os.path.join(rootpath, devCollection, metric, 'text2image', feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)   #(qid query)
    qid2query =  dict(zip(qid_list, [qp.process(query) for query in query_list]))

    # random performance for specific queries
    qid_randomperf_file = os.path.join(rootpath, devCollection, 'Annotations', '*****@*****.**')
    qid2randomperf = {}
    for line in open(qid_randomperf_file):
        qid, random_perf = line.strip().split()
        qid2randomperf[qid] = float(random_perf)

    
    # path of image feature
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData', feature)

    nnquery_file = os.path.join(rootpath, devCollection, 'TextData','querynn', options.nnqueryfile)
    qryClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.queryclickfile)
    t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path, train_feat_path, ntopqry)

    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for query_id in qid_list:

        iid_list, label_list = readAnnotationsFrom(devCollection, 'concepts%s.txt' % devCollection, query_id, False, rootpath)        

        scorelist = t2i_searcher.doSearch( query_id, iid_list, ntopimg, qrythres, mincc)
         

        if len(scorelist) == 0: 
            failed_count += 1
            qid2dcg[query_id] = qid2randomperf[query_id]
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            qid2iid_label_score[query_id] = sorted(qid2iid_label_score[query_id], key=lambda v:v[2], reverse=True)
            # calculate DCG@25 for the ranking produced by our model
            qid2dcg[query_id] = scorer.score([x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if(done % 20 == 0):
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}
    
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed query: %d" % failed_count 
    print "average DCG@25: %f" % (1.0*sum(qid2dcg.values())/ len(qid2dcg.values()))
Example #15
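# The same setup as Example #9 (feature files, min/max values, training and test annotations), shown here in reformatted form.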
    train_feat_file = BigFile(
        os.path.join(ROOT_PATH, trainCollection, "FeatureData", feature),
        feat_dim)
    test_feat_file = BigFile(
        os.path.join(ROOT_PATH, testCollection, "FeatureData", feature),
        feat_dim)
    testImageSet = test_feat_file.names  #random.sample(test_feat_file.names, 10000)

    minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                               feature, 'minmax.txt')
    with open(minmax_file, 'r') as f:
        min_vals = map(float, str.split(f.readline()))
        max_vals = map(float, str.split(f.readline()))

    [names, labels] = readAnnotationsFrom(collection=trainCollection,
                                          annotationName=trainAnnotationName,
                                          concept=targetConcept,
                                          rootpath=rootpath)
    name2label = dict(zip(names, labels))
    (renamed, vectors) = train_feat_file.read(names)
    relabeled = [name2label[x] for x in renamed]  #label is either 1 or -1

    [names, labels] = readAnnotationsFrom(collection=testCollection,
                                          annotationName=testAnnotationName,
                                          concept=targetConcept,
                                          rootpath=rootpath)
    test2label = dict(zip(names, labels))

    for beta in [0.5]:  #[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        #model = hiksvm_train(relabeled, vectors, beta=beta)
        cv = 3
        best_beta, cv_score, model = hiksvm_train_cv(relabeled, vectors, cv,
Example #16
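# Score each query's candidate images with a Parzen-window density estimate over the image features, rank them, and report DCG@25 per query.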
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex',
                                       collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method,
                                   feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations',
                                  'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query = dict(zip(qid_list, query_list))

    # input of image
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")

    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(
            collection, 'concepts%s.txt' % collection, qid, False, rootpath)

        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(
                calParzen(img_feats.read_one(imidx), test_X, sigma))

        # parzen_list_suffle = calParzen_fast(test_X, len(renamed), sigma)
        # parzen_list = []
        # for imidx in iid_list:
        #     parzen_list.append(parzen_list_suffle[renamed.index(imidx)])

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list),
                              key=lambda v: v[2],
                              reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) /
                                  len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
Example #17
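# The same bag-relabeling loop as Example #12, shown here in reformatted form.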
    print('%s holdout %d' % (collection, len(holdoutSet)))

    for concept in concepts:
        simfile = os.path.join(rootpath, collection, 'SimilarityIndex',
                               collection, 'tagged,lemm', tagrelMethod,
                               '%s.txt' % concept)
        searchresults = readRankingResults(simfile)
        searchresults = [x for x in searchresults if x[0] not in holdoutSet]
        positiveSet = [x[0] for x in searchresults[:numPos]]

        for t in range(T):
            newAnnotationName = sourceAnnotationName % t
            newAnnotationName = newAnnotationName.replace(
                'rand%d.0' % numPos, posName)
            names, labels = readAnnotationsFrom(collection,
                                                sourceAnnotationName % t,
                                                concept, rootpath)

            negativeSet = [x[0] for x in zip(names, labels) if -1 == x[1]]
            renamed = positiveSet + negativeSet
            relabeled = [1] * len(positiveSet) + [-1] * len(negativeSet)
            print('[%s] %s +%d, -%d -> %s' %
                  (concept, sourceAnnotationName % t, len(positiveSet),
                   len(negativeSet), newAnnotationName))
            writeAnnotationsTo(renamed, relabeled, collection,
                               newAnnotationName, concept, rootpath)

    for t in range(T):
        newAnnotationName = sourceAnnotationName % t
        newAnnotationName = newAnnotationName.replace('rand%d.0' % numPos,
                                                      posName)
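Example #18
# Evaluate pickled score matrices from multiple runs: per-concept AP and NDCG on all and on tagged images, per-image iAP/hit@1/hit@5, and write the results of each run to an HDF5 file.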
def process(options, collection, annotationName, runfile, outDirectory):
    rootpath = options.rootpath

    apscorer = getScorer("AP")
    ndcg = getScorer("NDCG@20")
    ndcg2 = getScorer("NDCG2@20")
    p1scorer = getScorer("P@1")
    p5scorer = getScorer("P@5")

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith("#")]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, "read annotations from files")

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        label_file = os.path.join(rootpath, collection, "tagged,lemm", "%s.txt" % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, "readLabeledImageSet for %s-%s -> %d hits" % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print "#" * 100
    print "# method miap hit1 hit5"
    print "#" * 100

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], "rb"))
        scores = data["scores"]
        assert scores.shape[1] == nr_of_concepts
        imset = data["id_images"]
        imset = np.array([int(x) for x in imset])
        idx = np.argsort(imset)
        imset = imset[idx]
        scores = scores[idx]
        nr_of_images = len(imset)
        # print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert len(sorted_labels) > 0
            # print concepts[c_idx], ranklist[:5], sorted_labels[:5]

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        res = np.zeros((nr_of_images, 4))
        gt = np.zeros((nr_of_images, nr_of_concepts))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]

            # print rel_set
            # print sorted_labels

            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j, :] = [ap, hit1, hit5, len(rel_set)]
            gt[j, :] = sorted_labels
        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], " ".join(["%.3f" % x for x in avg_perf])

        outMiap = h5py.File(os.path.join(outDirectory, os.path.split(datafiles[run_idx])[-1] + ".h5"), "w")
        outMiap["iap"] = res[:, 0]
        outMiap["ngt"] = res[:, 3]
        outMiap["hit1"] = res[:, 1]
        outMiap["hit5"] = res[:, 2]
        outMiap["gt"] = gt
        outMiap["concepts"] = concepts
        outMiap["ap"] = ap_table[run_idx, :]
        outMiap["ap2"] = ap2_table[run_idx, :]
        outMiap[ndcg.name()] = ndcg_table[run_idx, :]
        outMiap[ndcg2.name()] = ndcg2_table[run_idx, :]
        outMiap.close()

    print "#" * 100
    print "# untagged-concept", " ".join([os.path.split(x)[-1] for x in datafiles])
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ap_table[:, c_idx]])
    print "meanAP", " ".join(["%.3f" % x for x in ap_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ap2_table[:, c_idx]])
    print "meanAP2", " ".join(["%.3f" % x for x in ap2_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ndcg_table[:, c_idx]])
    print "mean%s" % ndcg.name(), " ".join(["%.3f" % x for x in ndcg_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ndcg2_table[:, c_idx]])
    print "mean%s" % ndcg2.name(), " ".join(["%.3f" % x for x in ndcg2_table.mean(axis=1)])
Example #19
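# Negative-bootstrap learning for one concept: iteratively sample hard negatives from the pool, retrain, and merge each new classifier into the assembled model until the error rate drops below MIN_ERROR_RATE.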
    def learn(concept, params):
        rootpath = params['rootpath']
        trainCollection = params['trainCollection']
        baseAnnotationName = params['baseAnnotationName']
        startAnnotationName = params['startAnnotationName']
        strategy = params['strategy']
        feature = params['feature']
        feat_file = params['feat_file']
        feat_dim = feat_file.ndims
        npr = params['npr']
        iterations = params['iterations']
        beta = 0.5

        names, labels = readAnnotationsFrom(trainCollection,
                                            startAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        positive_bag = [x[0] for x in zip(names, labels) if x[1] > 0]
        negative_bag = [x[0] for x in zip(names, labels) if x[1] < 0]

        names, labels = readAnnotationsFrom(trainCollection,
                                            baseAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        negative_pool = [x[0] for x in zip(names, labels) if x[1] < 0]

        Usize = max(5000, len(positive_bag) * npr)
        Usize = min(10000, Usize)
        Usize = min(Usize, len(negative_pool))

        new_model = None

        for t in range(1, iterations + 1):
            printStatus(INFO, 'iter %d (%s)' % (t, concept))
            if t > 1:  # select relevant negative examples
                # check how good at classifying positive training examples
                results = classify_large_data(assemble_model, positive_bag,
                                              feat_file)
                pos_error_rate = len([1 for x in results if x[1] < 0]) / float(
                    len(results))

                U = random.sample(negative_pool, Usize)
                predictions = classify_large_data(assemble_model, U, feat_file)
                neg_error_rate = len([1 for x in predictions if x[1] > 0
                                      ]) / float(len(predictions))

                error_rate = (pos_error_rate + neg_error_rate) / 2.0

                printStatus(
                    INFO,
                    'iter %d: %s %.3f -> %s %.3f, pe=%.3f, ne=%.3f, error=%.3f'
                    % (t, predictions[-1][0], predictions[-1][1],
                       predictions[0][0], predictions[0][1], pos_error_rate,
                       neg_error_rate, error_rate))
                if error_rate < MIN_ERROR_RATE:
                    printStatus(
                        INFO,
                        'hit stop criteria: error (%.3f) < MIN_ERROR_RATE (%.3f)'
                        % (error_rate, MIN_ERROR_RATE))
                    break

                # assume that 1% of the randomly sampled set is truly positive and that
                # the classifier ranks them at the top, so ignore them
                nr_of_estimated_pos = int(len(predictions) * 0.01)
                negative_bag = NegativeBootstrap.sampling(
                    predictions[nr_of_estimated_pos:], strategy,
                    max(1000, len(positive_bag)))

            new_names = positive_bag + negative_bag
            new_labels = [1] * len(positive_bag) + [-1] * len(negative_bag)
            name2label = dict(zip(new_names, new_labels))
            renamed, vectors = feat_file.read(new_names)
            Ys = [name2label[x] for x in renamed]

            np = len([1 for y in Ys if y > 0])
            nn = len([1 for y in Ys if y < 0])
            assert (len(positive_bag) == np)
            assert (len(negative_bag) == nn)
            wp = float(beta) * (np + nn) / np
            wn = (1.0 - beta) * (np + nn) / nn
            C = 1
            svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            if 'fik' == params['model']:
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2'
            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        return assemble_model
Example #20
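# Evaluate pickled score matrices from multiple runs: write per-concept rankings on all and on tagged images, and print AP and AP2 tables.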
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultdir = os.path.join(rootpath, collection, 'SimilarityIndex',
                             collection)

    apscorer = getScorer('AP')
    datafiles = [
        x.strip() for x in open(runfile).readlines()
        if x.strip() and not x.strip().startswith('#')
    ]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concepts[i],
                                            skip_0=False,
                                            rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        label_file = os.path.join(rootpath, collection, 'tagged,lemm',
                                  '%s.txt' % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(
            INFO, 'readLabeledImageSet for %s-%s -> %d hits' %
            (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))

    for run_idx in range(nr_of_runs):
        runName = os.path.splitext(os.path.basename(datafiles[run_idx]))[0]
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            resfile = os.path.join(resultdir, runName,
                                   '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(ranklist, resfile)
            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert (len(sorted_labels) > 0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            resfile = os.path.join(resultdir, 'tagged,lemm', runName,
                                   '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(
                    [x for x in ranklist if x[0] in hit_imgset[c_idx]],
                    resfile)

            sorted_labels = [
                ground_truth[x[0]] for x in ranklist
                if x[0] in hit_imgset[c_idx]
            ]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)

    print '#' * 100
    print '# untagged-concept', ' '.join(
        [os.path.basename(x) for x in datafiles])
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ap_table[:, c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ap2_table[:, c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])
    results = []

    for concept in concepts:
        model_file_name = os.path.join(rootpath, trainCollection, 'Models',
                                       trainAnnotationName, feature, modelName,
                                       '%s.model' % concept)
        model = load_model(model_file_name)

        ranklist = [(test_renamed[i], model.predict(test_vectors[i]))
                    for i in range(len(test_renamed))]
        ranklist.sort(key=lambda v: v[1], reverse=True)

        names, labels = readAnnotationsFrom(testCollection,
                                            testAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        test_name2label = dict(zip(names, labels))
        sorted_labels = [
            test_name2label[x[0]] for x in ranklist if x[0] in test_name2label
        ]
        perf = scorer.score(sorted_labels)

        print('%s %g' % (concept, perf))

        results.append((concept, perf))

    mean_perf = sum([x[1] for x in results]) / len(concepts)
    print('mean%s %g' % (metric, mean_perf))
def process(options, trainCollection, trainAnnotationName, valCollection, valAnnotationName, feature, modelName):
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1 #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}
    
    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))    
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'


    
    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    valConcepts = readConcepts(valCollection,valAnnotationName, rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert(concepts[i] == valConcepts[i])
    
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, '%s,best_params'%modelName, '%s,%s,%s' % (valCollection,valAnnotationName,feature))
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i%numjobs==(job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))
    val_feat_file = BigFile(os.path.join(rootpath,valCollection,'FeatureData',feature))
    feat_dim = train_feat_file.ndims
    assert(feat_dim == val_feat_file.ndims)

    
    for concept in todo:
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
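        # class-balancing weights: with beta=0.5 each class gets a weight inversely
        # proportional to its frequency, so positives and negatives contribute equally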
        np = len([1 for lab in labels if  1 == lab])
        nn = len([1 for lab in labels if  -1== lab])
        wp = float(beta) * (np+nn) / np
        wn = (1.0-beta) * (np+nn) /nn
    
        names,labels = readAnnotationsFrom(valCollection, valAnnotationName, concept, skip_0=True, rootpath=rootpath)
        val_name2label = dict(zip(names,labels))
        val_renamed, val_vectors = val_feat_file.read(names)
        
        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C) 
            else:
                svm_params = '-c %g' % C
            
            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '
            #print modelName, '>'*20, svm_params
            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i])) for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v:v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C
                
        # fit a Platt-style sigmoid (A, B) to the decision scores of the best-C run
        [A,B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)

        printStatus(INFO, '%s -> worstAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' % (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        fw = open(resultfile, 'w')
        fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
        fw.close()
Example #23
0
    overwrite = 0

    concepts = readConcepts(srcCollection, annotationName, rootpath)
    todo = []
    for concept in concepts:
        resfile = os.path.join(rootpath, dstCollection, 'Annotations', 'Image',
                               annotationName, '%s.txt' % concept)
        if checkToSkip(resfile, overwrite):
            continue
        todo.append(concept)
    if not todo:
        print('nothing to do')
        sys.exit(0)

    imset = set(readImageSet(dstCollection, dstCollection, rootpath))

    for concept in todo:
        names, labels = readAnnotationsFrom(srcCollection,
                                            annotationName,
                                            concept,
                                            rootpath=rootpath)
        selected = [x for x in zip(names, labels) if x[0] in imset]
        print concept, len(selected)
        writeAnnotationsTo([x[0] for x in selected], [x[1] for x in selected],
                           dstCollection,
                           annotationName,
                           concept,
                           rootpath=rootpath)

    writeConceptsTo(concepts, dstCollection, annotationName, rootpath)
    newAnnotationTemplate = annotationName[:-4] + '.' + posName + str(nr_pos) + ('.random%d'%nr_neg) + '.%d.txt'
    concepts = readConcepts(collection, annotationName, rootpath)    
    simdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection, rankMethod)

    scriptfile = os.path.join(rootpath,collection,'annotationfiles', annotationName[:-4] + '.' + posName + str(nr_pos) + ('.random%d'%nr_neg) + '.0-%d.txt'%(nr_neg_bags-1))
    makedirsforfile(scriptfile)
    fout = open(scriptfile,'w')
    fout.write('\n'.join([newAnnotationTemplate%t for t in range(nr_neg_bags)]) + '\n')
    fout.close()


    for concept in concepts:
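        # the top nr_pos ranked images form the positive bag; negatives are drawn at random
        # from the annotated negative pool, one bag per value of t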
        simfile = os.path.join(simdir, '%s.txt' % concept)
        ranklist = readRankingResults(simfile)
        pos_bag = [x[0] for x in ranklist[:nr_pos]]
        names, labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        negativePool = [x[0] for x in zip(names,labels) if x[1] < 0]

        for t in range(nr_neg_bags):
            newAnnotationName = newAnnotationTemplate % t
            resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image', newAnnotationName, '%s.txt'%concept)
            if checkToSkip(resultfile, overwrite):
                continue
            true_nr_neg = max(500, len(pos_bag)*neg_pos_ratio)
            neg_bag = random.sample(negativePool, true_nr_neg) #len(pos_bag)*neg_pos_ratio)
            assert(len(set(pos_bag).intersection(set(neg_bag))) == 0)
            printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept,t,len(pos_bag),len(neg_bag),resultfile))
            writeAnnotations(pos_bag + neg_bag, [1]*len(pos_bag) + [-1]*len(neg_bag), resultfile)

    for t in range(nr_neg_bags):
        newAnnotationName = newAnnotationTemplate % t
Example #25
0
        BigFile(
            os.path.join(rootpath, trainCollection, 'FeatureData', feature),
            FEATURE_TO_DIM[feature]))
    featurefile = os.path.join(rootpath, testCollection, "FeatureData",
                               feature, "id.feature.txt")

    feat_dim = 1024
    num_bins = 50

    #fikmodel.set_probAB(-1, 0)

    #print "fik model0", fikmodel0.get_nr_svs(), fikmodel0.get_feat_dim(), fikmodel0.get_probAB()
    #print "fik model", fikmodel.get_nr_svs(), fikmodel.get_feat_dim(), fikmodel.get_probAB()
    mAP = [0] * 4
    for concept in concepts:
        names, labels = readAnnotationsFrom(testCollection,
                                            'conceptsvoc2008val.txt', concept)
        name2label = dict(zip(names, labels))
        ranklist = []

        modelfile = os.path.join(rootpath, trainCollection, "Models",
                                 annotationName, feature, 'hiksvm',
                                 "%s.model" % concept)
        #print modelfile
        model = svm_load_model(modelfile)
        #print model.get_svm_type()
        #print model.get_nr_class()
        svm_models = [model, model]
        num_models = len(svm_models)
        fikmodel0 = svm_to_fiksvm0(svm_models, [1.0 / num_models] * num_models,
                                   num_models, feat_dim, num_bins)
        fikmodel1 = svm_to_fiksvm(svm_models, num_models,
Example #26
0
def process(options, trainCollection, trainAnnotationName, feature):
    import re
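    # regex that extracts best_C and the sigmoid parameters a, b from the per-concept
    # best-parameter files (format: 'bestAP=..., best_C=..., a=..., b=...')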
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    best_param_dir = options.best_param_dir
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    beta = 0.5
    
    modelName = 'fastlinear'
    if best_param_dir:
        modelName += '-tuned'
    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i%numjobs==(job-1)]
    if not todo:
        return 0

    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    
    feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))
    
    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            m = p.search(open(param_file).readline().strip())
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))
        
        model_file_name = os.path.join(resultdir, concept + '.model')
        
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if  1 == lab])
        nn = len([1 for lab in labels if  -1== lab])
        wp = float(beta) * (np+nn) / np
        wn = (1.0-beta) * (np+nn) /nn
    
        # no bias term added by setting "-B -1"
        svm_params = '-w1 %g -w-1 %g -s 2 -B -1 -q' % (wp*C, wn*C) 
        model = liblinear_train(y, vectors, svm_params)
        newmodel = liblinear_to_fastlinear([model], [1.0], feat_file.ndims)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s'%model_file_name)
        fastlinear_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        fastlinear_load_model(model_file_name)
        assert(abs(newmodel.get_probAB()[0]-A)<1e-6)
        assert(abs(newmodel.get_probAB()[1]-B)<1e-6)

    return len(todo)
Example #27
0
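# copy per-concept annotations from srcCollection to dstCollection, keeping only the
# images present in the destination collection's image set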
if __name__ == '__main__':
    args = sys.argv[1:]
    rootpath = '/var/scratch2/xirong/VisualSearch'
    srcCollection = args[0]
    annotationName = args[1]
    dstCollection = args[2]
    overwrite = 0

    concepts = readConcepts(srcCollection, annotationName, rootpath)
    todo = []
    for concept in concepts:
        resfile = os.path.join(rootpath, dstCollection, 'Annotations', 'Image', annotationName, '%s.txt'%concept)
        if checkToSkip(resfile, overwrite):
            continue
        todo.append(concept)
    if not todo:
        print ('nothing to do')
        sys.exit(0)


    imset = set(readImageSet(dstCollection, dstCollection, rootpath))

    for concept in todo:
        names,labels = readAnnotationsFrom(srcCollection, annotationName, concept, rootpath=rootpath)
        selected = [x for x in zip(names,labels) if x[0] in imset]
        print concept, len(selected)
        writeAnnotationsTo([x[0] for x in selected], [x[1] for x in selected], dstCollection, annotationName,  concept, rootpath=rootpath)

    writeConceptsTo(concepts, dstCollection, annotationName, rootpath)
Example #28
0
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath

    p1_scorer = getScorer('P@3')
    p3_scorer = getScorer('P@5')
    r1_scorer = getScorer('R@3')
    r3_scorer = getScorer('R@5')
    ndcg1_scorer = getScorer('NDCG2@3')
    ndcg3_scorer = getScorer('NDCG2@5')
    ap_scorer = getScorer('AP')
    rr_scorer = getScorer('RR')
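    # note: despite the names, the *1 scorers use cutoff 3 and the *3 scorers use cutoff 5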

    datafiles = [
        x.strip() for x in open(runfile).readlines()
        if x.strip() and not x.strip().startswith('#')
    ]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    name2label = [{} for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concepts[i],
                                            skip_0=False,
                                            rootpath=rootpath)
        #names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

    # ('7975436322', set([33]))
    # for im, im_labels in rel_conset.items():
    #   print(im, im_labels)

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        # for im in imset:
        #     print(im)
        #     raw_input()
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        res = np.zeros((nr_of_images, 8))
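        # per-image evaluation: rank all concepts by predicted score and compare the ranking
        # against the image's set of relevant concepts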
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            # print(ranklist)
            # raw_input()
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            # print(sorted_labels)
            # raw_input()
            assert len(sorted_labels) == nr_of_concepts
            p1 = p1_scorer.score(sorted_labels)
            p3 = p3_scorer.score(sorted_labels)
            r1 = r1_scorer.score(sorted_labels)
            r3 = r3_scorer.score(sorted_labels)
            ndcg1 = ndcg1_scorer.score(sorted_labels)
            ndcg3 = ndcg3_scorer.score(sorted_labels)
            ap = ap_scorer.score(sorted_labels)
            rr = rr_scorer.score(sorted_labels)

            f1, f3 = 0.0, 0.0
            if (p1 + r1) != 0.0:
                f1 = 2 * p1 * r1 / (p1 + r1)
            if (p3 + r3) != 0.0:
                f3 = 2 * p3 * r3 / (p3 + r3)
            # h1, h3 = max(p1, r1), max(p3, r3)
            # res[j, :] = [p1, p3, r1, r3, ndcg1, ndcg3, ap, rr]
            res[j, :] = [p1, p3, f1, f3, ndcg1, ndcg3, ap, rr]
            # res[j,:] = [p1, p3, h1, h3, ndcg1, ndcg3, ap, rr]
        avg_perf = res.mean(axis=0)
        name = path.basename(datafiles[run_idx]).split('.')[0]
        name = name.split(',')[1]
        stdout.write('%s\t' % name)
        # for x in avg_perf:
        for i in range(len(avg_perf)):
            if i == 4 or i == 5:
                continue
            # x = avg_perf[i] * 100.0
            x = avg_perf[i]
            if x >= 100.0:
                stdout.write('& %.1f ' % x)
            else:
                # stdout.write('& %.2f ' % x)
                stdout.write('& %s' % (('%.4f ' % x).lstrip('0')))
        stdout.write('\n')
Example #29
0
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    method = options.method
    metric = options.metric

    qrysim = options.qrysim
    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature

    # semantic embedding
    k = options.k
    corpus = options.corpus
    word2vec_model = options.word2vec
    label_source = options.label_source

    # result path
    ranking_result_path = os.path.join(rootpath, devCollection, 'SimilarityIndex', devCollection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, devCollection, metric, method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input: the queries
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)   #(qid query)
    qid2query =  dict(zip(qid_list, [qp.process(query) for query in query_list]))
    
    # path of image feature
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData', feature)


    # method selection
    if method =='conse':
        se_searcher = ConSE(label_source, corpus, word2vec_model, dev_feat_path, rootpath)

    elif method == 't2i' or method == 'ta': 
        nnquery_file = os.path.join(rootpath, devCollection, 'TextData','querynn', options.nnqueryfile)
        qryClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.queryclickfile)
        t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path, train_feat_path, ntopqry)

    elif method == 'i2t' or method == 'ia':
        nnimage_file = os.path.join(rootpath, devCollection, 'TextData','imagenn', feature, options.nnimagefile)
        imgClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.imageclickfile)
        i2t_searcher = Image2Text(nnimage_file, imgClick_file, qrysim, ntopimg, ntopqry)

    else:
        print "this model is not supported with %s" % method
        sys.exit(0)


 
    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for query_id in qid_list:

        iid_list, label_list = readAnnotationsFrom(devCollection, 'concepts%s.txt' % devCollection, query_id, False, rootpath)        

        if method == 'conse':
            scorelist = se_searcher.do_search(qid2query[query_id], iid_list, k)

        elif method == 't2i':
            scorelist = t2i_searcher.text2image(query_id, iid_list, qrythres, mincc )

        elif method == 'ta':
            scorelist = t2i_searcher.textAnnotation( query_id, iid_list, ntopimg, qrythres, mincc)

        elif method == 'i2t': 
            scorelist = i2t_searcher.image2text(qid2query[query_id], iid_list, mincc )

        elif method == 'ia':
            scorelist = i2t_searcher.imageAnnotation( qid2query[query_id], iid_list, mincc )    
         

        if len(scorelist) == 0: 
            failed_count += 1
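            # no scores returned for this query: fall back to zero scores and a random ranking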
            scorelist = [0]*len(iid_list)
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            random.shuffle(qid2iid_label_score[query_id])
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            qid2iid_label_score[query_id] = sorted(qid2iid_label_score[query_id], key=lambda v:v[2], reverse=True)


        # compute DCG@25 for the ranking produced by the model
        qid2dcg[query_id] = scorer.score([x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if(done % 20 == 0):
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}
    
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed query: %d" % failed_count 
    print "average DCG@25: %f" % (1.0*sum(qid2dcg.values())/ len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file,'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
Example #30
0
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma =options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex', collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)
    
    # input: the queries
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query =  dict(zip(qid_list, query_list))
    
    # input: the image features
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the scorer used to calculate DCG@25
    scorer = getScorer("DCG@25")


    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(collection, 'concepts%s.txt' % collection, qid, False, rootpath)

        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
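        # score each candidate image with a Parzen-window estimate computed from the
        # features of all candidates for this query (bandwidth sigma)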
        for imidx in iid_list:
            parzen_list.append(calParzen(img_feats.read_one(imidx), test_X , sigma))

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list), key=lambda v:v[2], reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
             writeRankingResult(ranking_result_path, qid2iid_label_score)
             qid2iid_label_score = {}


    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0*sum(qid2dcg.values())/ len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file,'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
Example #31
0
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection)

    apscorer = getScorer('AP')
    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)
    
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)  
    nr_of_concepts = len(concepts)
    
    printStatus(INFO, 'read annotations from files')
    
    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    
    for i in range(nr_of_concepts):
        names,labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names,labels))

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt'% concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))
        
    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    
    for run_idx in range(nr_of_runs):
        runName = os.path.splitext(os.path.basename(datafiles[run_idx]))[0]
        data = pickle.load(open(datafiles[run_idx],'rb'))
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]
                   
        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist =  zip(imset, scores[:,c_idx])
            ranklist.sort(key=lambda v:(v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            resfile = os.path.join(resultdir, runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(ranklist, resfile)            
            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels)>0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            resfile = os.path.join(resultdir, 'tagged,lemm', runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults([x for x in ranklist if x[0] in hit_imgset[c_idx]], resfile)            
            
            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
     

    print '#'*100
    print '# untagged-concept', ' '.join([os.path.basename(x) for x in datafiles])
    print '#'*100
            
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:,c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])
    
    print '#'*100
    print '# tagged-concept'
    print '#'*100
    
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:,c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])
Example #32
0
def process(options, trainCollection, annotationfile, feature, modelName):
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1 #options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}
    
    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))    
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model
 
    newAnnotationName = os.path.split(annotationfile)[-1]
    trainAnnotationNames = [x.strip() for x in open(annotationfile).readlines() if x.strip() and not x.strip().startswith('#')]
    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print '%s does not exist' % conceptfile
            return 0

    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i%numjobs==(job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t in range(1, len(trainAnnotationNames)+1):
            names,labels = readAnnotationsFrom(trainCollection, trainAnnotationNames[t-1], concept, skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names,labels))
            renamed,vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]
            np = len([1 for lab in labels if  1 == lab])
            nn = len([1 for lab in labels if  -1== lab])
            wp = float(beta) * (np+nn) / np
            wn = (1.0-beta) * (np+nn) /nn
    
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C) 
            else:
                svm_params = '-c %g' % C
            
            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '
           
            g_t = train_model(Ys, vectors, svm_params + ' -q')
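            # incremental averaging: after processing t annotation files the assembled model
            # weights each per-file SVM equally (previous ensemble (t-1)/t, new model 1/t)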
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1-1.0/t, 1.0/t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)            
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

        
    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
Example #33
0
def process(options, collection, annotationName, runfile, outDirectory):
    rootpath = options.rootpath

    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    datafiles = [
        x.strip() for x in open(runfile).readlines()
        if x.strip() and not x.strip().startswith('#')
    ]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concepts[i],
                                            skip_0=False,
                                            rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        label_file = os.path.join(rootpath, collection, 'tagged,lemm',
                                  '%s.txt' % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(
            INFO, 'readLabeledImageSet for %s-%s -> %d hits' %
            (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print '#' * 100
    print '# method miap hit1 hit5'
    print '#' * 100

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        imset = np.array([int(x) for x in imset])
        idx = np.argsort(imset)
        imset = imset[idx]
        scores = scores[idx]
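        # sort images (and their score rows) by numeric image id so ids and scores stay aligned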
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert (len(sorted_labels) > 0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [
                ground_truth[x[0]] for x in ranklist
                if x[0] in hit_imgset[c_idx]
            ]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        res = np.zeros((nr_of_images, 4))
        gt = np.zeros((nr_of_images, nr_of_concepts))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]

            #print rel_set
            #print sorted_labels

            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j, :] = [ap, hit1, hit5, len(rel_set)]
            gt[j, :] = sorted_labels
        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], ' '.join(
            ['%.3f' % x for x in avg_perf])

        outMiap = h5py.File(
            os.path.join(outDirectory,
                         os.path.split(datafiles[run_idx])[-1] + ".h5"), 'w')
        outMiap['iap'] = res[:, 0]
        outMiap['ngt'] = res[:, 3]
        outMiap['hit1'] = res[:, 1]
        outMiap['hit5'] = res[:, 2]
        outMiap['gt'] = gt
        outMiap['concepts'] = concepts
        outMiap['ap'] = ap_table[run_idx, :]
        outMiap['ap2'] = ap2_table[run_idx, :]
        outMiap[ndcg.name()] = ndcg_table[run_idx, :]
        outMiap[ndcg2.name()] = ndcg2_table[run_idx, :]
        outMiap.close()

    print '#' * 100
    print '# untagged-concept', ' '.join(
        [os.path.split(x)[-1] for x in datafiles])
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ap_table[:, c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ap2_table[:, c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ndcg_table[:, c_idx]])
    print 'mean%s' % ndcg.name(), ' '.join(
        ['%.3f' % x for x in ndcg_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ndcg2_table[:, c_idx]])
    print 'mean%s' % ndcg2.name(), ' '.join(
        ['%.3f' % x for x in ndcg2_table.mean(axis=1)])
Example #34
0
    nr_of_images = len(_renamed)
    nr_of_concepts = len(concepts)
    
    mAP = 0.0
    models = [None] * len(concepts)

    stream = StreamFile(feat_dir)

    for i,concept in enumerate(concepts):
        model_file_name = os.path.join(rootpath,trainCollection,'Models',trainAnnotationName,feature, modelName, '%s.model'%concept)
        model = load_model(model_file_name)
        #print model.get_probAB()
        models[i] = model

        names,labels = readAnnotationsFrom(testCollection, testAnnotationName, concept, rootpath=rootpath)
        name2label = dict(zip(names,labels))
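        # rank the test set three ways: batch predictions, streamed predictions, and
        # probability outputs from the same model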

        ranklist1 = [(_id, model.predict(_vec)) for _id,_vec in zip(_renamed, _vectors)]

        stream.open()
        ranklist2 = [(_id, model.predict(_vec)) for _id,_vec in stream]
        stream.close()

        ranklist3 = [(_id, model.predict_probability(_vec)) for _id,_vec in zip(_renamed, _vectors)]

        print concept,

        for ranklist in [ranklist1, ranklist2, ranklist3]:
            ranklist.sort(key=lambda v:v[1], reverse=True)
            sorted_labels = [name2label[x[0]] for x in ranklist if x[0] in name2label]
Example #35
0
    def learn(concept, params):
        rootpath = params['rootpath']
        trainCollection = params['trainCollection']
        baseAnnotationName = params['baseAnnotationName']
        startAnnotationName = params['startAnnotationName']
        strategy = params['strategy']
        feature = params['feature']
        feat_file = params['feat_file']
        feat_dim = feat_file.ndims
        npr = params['npr']
        iterations = params['iterations']
        beta = 0.5
        
        names,labels = readAnnotationsFrom(trainCollection, startAnnotationName, concept, skip_0=True, rootpath=rootpath)
        positive_bag = [x[0] for x in zip(names,labels) if x[1] > 0]
        negative_bag = [x[0] for x in zip(names,labels) if x[1] < 0]

        names,labels = readAnnotationsFrom(trainCollection, baseAnnotationName, concept, skip_0=True, rootpath=rootpath)
        negative_pool = [x[0] for x in zip(names,labels) if x[1] < 0]

        Usize = max(5000, len(positive_bag) * npr)
        Usize = min(10000, Usize)
        Usize = min(Usize, len(negative_pool))
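        # random negative sample per iteration: npr times the positive-bag size, clamped
        # to [5000, 10000] and never more than the available negative pool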

        new_model = None
         
        for t in range(1, iterations+1):
            printStatus(INFO, 'iter %d (%s)' % (t, concept))
            if t > 1: # select relevant negative examples 
                # check how good at classifying positive training examples
                results = classify_large_data(assemble_model, positive_bag, feat_file)
                pos_error_rate = len([1 for x in results if x[1]<0])/float(len(results))
 
                U = random.sample(negative_pool, Usize)
                predictions = classify_large_data(assemble_model, U, feat_file)
                neg_error_rate = len([1 for x in predictions if x[1]>0])/float(len(predictions))               
               
                error_rate = (pos_error_rate + neg_error_rate)/2.0

                printStatus(INFO, 'iter %d: %s %.3f -> %s %.3f, pe=%.3f, ne=%.3f, error=%.3f' % (t, predictions[-1][0], predictions[-1][1], 
                                                                                                    predictions[0][0], predictions[0][1], 
                                                                                                    pos_error_rate, neg_error_rate, error_rate))
                if error_rate < MIN_ERROR_RATE:
                    printStatus(INFO, 'hit stop criteria: error (%.3f) < MIN_ERROR_RATE (%.3f)' % (error_rate, MIN_ERROR_RATE))
                    break

                # assume that 1% of the randomly sampled set is truly positive and that the
                # classifier ranks those examples at the top, so ignore them
                nr_of_estimated_pos = int(len(predictions)*0.01)
                negative_bag = NegativeBootstrap.sampling(predictions[nr_of_estimated_pos:], strategy, max(1000, len(positive_bag)))

            new_names = positive_bag + negative_bag
            new_labels = [1] * len(positive_bag) + [-1] * len(negative_bag)
            name2label = dict(zip(new_names,new_labels))
            renamed, vectors = feat_file.read(new_names)
            Ys = [name2label[x] for x in renamed] 

            np = len([1 for y in Ys if y>0])
            nn = len([1 for y in Ys if y<0])
            assert(len(positive_bag) == np)
            assert(len(negative_bag) == nn) 
            wp = float(beta) * (np+nn) / np
            wn = (1.0-beta) * (np+nn) /nn
            C = 1
            svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C) 
            if 'fik' == params['model']:
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2'
            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1-1.0/t, 1.0/t)

        return assemble_model
Example #36
0
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath
    
    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)
    
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)  
    nr_of_concepts = len(concepts)
    
    printStatus(INFO, 'read annotations from files')
    
    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}
    
    for i in range(nr_of_concepts):
        names,labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names,labels))
        
        for im,lab in zip(names,labels):
            if lab > 0:
                rel_conset.setdefault(im,set()).add(i)

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt'% concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))
        
    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))
    
    print '#'*100
    print '# method miap hit1 hit5'
    print '#'*100
    
    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx],'rb'))
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]
                   
        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist =  zip(imset, scores[:,c_idx])
            ranklist.sort(key=lambda v:(v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]
            
            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels)>0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        res = np.zeros((nr_of_images, 3))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j,:])
            ranklist.sort(key=lambda v:v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j,:] = [ap, hit1, hit5]
        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], ' '.join(['%.3f' % x for x in avg_perf])
            


    print '#'*100
    print '# untagged-concept', ' '.join([os.path.split(x)[-1] for x in datafiles])
    print '#'*100
            
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:,c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])
    
    print '#'*100
    print '# tagged-concept'
    print '#'*100
    
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:,c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])
    
    print '#'*100
    print '# tagged-concept'
    print '#'*100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg_table[:,c_idx]])
    print 'mean%s' % ndcg.name(), ' '.join(['%.3f' % x for x in ndcg_table.mean(axis=1)])

    print '#'*100
    print '# tagged-concept'
    print '#'*100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg2_table[:,c_idx]])
    print 'mean%s'%ndcg2.name(), ' '.join(['%.3f' % x for x in ndcg2_table.mean(axis=1)])
def process(options, collection, annotationName, pos_num):
    assert (annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    annotationNameStr = annotationName[:-4] + (
        '.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)

    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath, collection, 'Annotations',
                                      newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts, resultfile)

    first, second, last = annotationNameStr.split('%d')
    scriptfile = os.path.join(
        rootpath, collection, 'annotationfiles', first + '0-%d' %
        (pos_bag_num - 1) + second + '0-%d' % (neg_bag_num - 1) + last)

    makedirsforfile(scriptfile)
    fout = open(scriptfile, 'w')
    fout.write('\n'.join(newAnnotationNames) + '\n')
    fout.close()

    if len(newAnnotationNames) == skip:
        return 0

    for concept in concepts:
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        positivePool = [x[0] for x in zip(names, labels) if x[1] > 0]
        negativePool = [x[0] for x in zip(names, labels) if x[1] < 0]

        for idxp in range(pos_bag_num):
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool
            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath, collection, 'Annotations',
                                          'Image', newAnnotationName,
                                          '%s.txt' % concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue
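                # negatives per bag: neg_pos_ratio times the positive-bag size, at least 1000,
                # and never more than the available negative pool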
                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)

                assert (len(set(positiveBag).intersection(
                    set(negativeBag))) == 0)
                printStatus(
                    INFO, "anno(%s,%d) %d pos %d neg -> %s" %
                    (concept, anno_idx, len(positiveBag), len(negativeBag),
                     resultfile))
                writeAnnotations(positiveBag + negativeBag,
                                 [1] * len(positiveBag) +
                                 [-1] * len(negativeBag), resultfile)