Example 1
def evaluateSearchEngines(searchers,
                          collection,
                          annotationName,
                          metric,
                          rootpath=ROOT_PATH):
    scorer = getScorer(metric)
    concepts = readConcepts(collection, annotationName, rootpath)

    nr_of_runs = len(searchers)
    nr_of_concepts = len(concepts)
    results = np.zeros((nr_of_concepts, nr_of_runs))

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName,
                                            concepts[i], rootpath)
        name2label = dict(zip(names, labels))

        for j in range(nr_of_runs):
            searchresults = searchers[j].scoreCollection(concepts[i])
            sorted_labels = [
                name2label[name] for (name, score) in searchresults
                if name in name2label
            ]
            results[i, j] = scorer.score(sorted_labels)

    for i in range(nr_of_concepts):
        print concepts[i], ' '.join([niceNumber(x, 3) for x in results[i, :]])
    mean_perf = results.mean(0)
    print 'mean%s' % metric, ' '.join([niceNumber(x, 3) for x in mean_perf])

    return concepts, results
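For reference, a minimal self-contained sketch of what scorers such as getScorer('AP') and getScorer('DCG@25') presumably compute once a label list has been sorted by descending score. These are the standard textbook definitions, not the project's basic.metric implementation, so treat them as an assumption.

import math

def average_precision(sorted_labels):
    # AP over a binary relevance list ordered by decreasing score.
    hits, precision_sum = 0, 0.0
    for rank, label in enumerate(sorted_labels, start=1):
        if label > 0:
            hits += 1
            precision_sum += float(hits) / rank
    return precision_sum / hits if hits else 0.0

def dcg_at_k(sorted_labels, k=25):
    # DCG@k with the (2**rel - 1) / log2(rank + 1) gain/discount convention.
    return sum((2 ** rel - 1) / math.log(rank + 1, 2)
               for rank, rel in enumerate(sorted_labels[:k], start=1))

print(average_precision([1, 0, 1, 0]))  # 0.833...
print(dcg_at_k([2, 1, 0]))              # 3.0 + 1/log2(3) = 3.63...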
Example 2
def eval(data, weights):
    scorer = getScorer('DCG@25')
    feat_dim = data[0][-1].shape[1]
    nr_of_qry = len(data)

    mean_perf = [0] * feat_dim
    avg_perf = 0

    for i in range(nr_of_qry):
        qid, labels, scores = data[i]
        fusion = scores.dot(weights)
        sorted_idx = (0 - fusion).argsort()
        sorted_labels = [labels[x] for x in sorted_idx]
        avg_perf += scorer.score(sorted_labels)
    return avg_perf / nr_of_qry
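For context, a sketch of how eval might be called, assuming each entry of data is a (qid, labels, scores) tuple with scores an (n_items, feat_dim) NumPy array, and assuming getScorer is importable from the project's basic.metric module. The toy values below are made up; note also that the function name shadows Python's built-in eval.

import numpy as np

toy_data = [
    ('q1',                      # query id
     [2, 0, 1, 0],              # graded relevance label per item
     np.array([[0.9, 0.1],      # per-item scores, one column per feature
               [0.2, 0.8],
               [0.7, 0.3],
               [0.1, 0.2]])),
]
weights = [0.5, 0.5]            # equal late-fusion weights over the two features

print(eval(toy_data, weights))  # mean DCG@25 over the (single) toy query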
Example 4
def i2t_map(c2i, n_caption=5):
    """
    Images->Text (Text Search)
    c2i: (5N, N) matrix of caption to image errors
    """
    scorer = getScorer('AP')
    perf_list = []
    for i in range(c2i.shape[1]):
        d_i = c2i[:, i]
        labels = [0] * len(d_i)
        labels[i * n_caption:(i + 1) * n_caption] = [1] * n_caption

        sorted_labels = [labels[x] for x in np.argsort(d_i)]
        current_score = scorer.score(sorted_labels)
        perf_list.append(current_score)

    return np.mean(perf_list)
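To make the expected input concrete, an illustrative call with a tiny error matrix for N=2 images and n_caption=2 captions per image: rows i*n_caption to (i+1)*n_caption-1 hold the captions belonging to image i, and smaller values mean a better match. The numbers are made up, and getScorer('AP') is assumed to be available from the project.

import numpy as np

toy_c2i = np.array([[0.1, 0.9],   # captions of image 0
                    [0.2, 0.8],
                    [0.7, 0.3],   # captions of image 1
                    [0.6, 0.4]])  # entry [r, c] = error of caption r against image c

print(i2t_map(toy_c2i, n_caption=2))  # mean AP over the two image queries (1.0 here)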
Example 5
def t2v_map(c2i, t2v_gts):
    """
    Text->Videos (Text-to-Video Retrieval)
    c2i: (5N, N) matrix of caption to video errors
    """
    scorer = getScorer('AP')
    perf_list = []
    for i in range(c2i.shape[0]):
        d_i = c2i[i, :]
        labels = [0]*len(d_i)

        x = t2v_gts[i][0]
        labels[x] = 1

        sorted_labels = [labels[x] for x in np.argsort(d_i)]

        current_score = scorer.score(sorted_labels)
        perf_list.append(current_score)
    return np.mean(perf_list)
Example 6
def t2i_map(c2i, n_caption=5):
    """
    Text->Videos (Text-to-Video Retrieval)
    c2i: (5N, N) matrix of caption to video errors
    """
    # print("errors matrix shape: ", c2i.shape)
    assert c2i.shape[0] / c2i.shape[1] == n_caption, c2i.shape

    scorer = getScorer('AP')
    perf_list = []
    for i in range(c2i.shape[0]):
        d_i = c2i[i, :]
        labels = [0] * len(d_i)
        labels[i // n_caption] = 1  # floor division keeps an integer index under Python 3 as well

        sorted_labels = [labels[x] for x in np.argsort(d_i)]
        current_score = scorer.score(sorted_labels)
        perf_list.append(current_score)

    return np.mean(perf_list)
Example 7
def i2t_map(c2i, n_caption=5):
    """
    Videos->Text (Video-to-Text Retrieval)
    c2i: (5N, N) matrix of caption to video errors
    """
    # print("errors matrix shape: ", c2i.shape)
    assert c2i.shape[0] / c2i.shape[1] == n_caption, c2i.shape

    scorer = getScorer('AP')
    perf_list = []
    for i in range(c2i.shape[1]):
        d_i = c2i[:, i]
        labels = [0] * len(d_i)
        labels[i * n_caption:(i + 1) * n_caption] = [1] * n_caption

        sorted_labels = [labels[x] for x in np.argsort(d_i)]
        current_score = scorer.score(sorted_labels)
        perf_list.append(current_score)

    return np.mean(perf_list)
Example 8
def v2t_map(c2i, v2t_gts):
    """
    Videos->Text (Video-to-Text Retrieval)
    c2i: (5N, N) matrix of caption to video errors
    """
    scorer = getScorer('AP')
    perf_list = []
    for i in range(c2i.shape[1]):
        d_i = c2i[:, i]

        labels = [0]*len(d_i)
        # labels[i*n_caption:(i+1)*n_caption] = [1]*n_caption
        for x in v2t_gts[i]:
            labels[x] = 1
        sorted_labels = [labels[x] for x in np.argsort(d_i)]

        current_score = scorer.score(sorted_labels)
        perf_list.append(current_score)

    return np.mean(perf_list)
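The two ground-truth arguments appear to be index lists: t2v_gts[i] presumably holds the video index (or indices) relevant to caption i, and v2t_gts[i] the caption indices relevant to video i. A toy call under that assumption, with made-up error values:

import numpy as np

toy_c2i = np.array([[0.1, 0.9],   # caption 0 vs videos 0 and 1
                    [0.8, 0.2],   # caption 1
                    [0.3, 0.7],   # caption 2
                    [0.6, 0.4]])  # caption 3
t2v_gts = [[0], [1], [0], [1]]    # relevant video per caption
v2t_gts = [[0, 2], [1, 3]]        # relevant captions per video

print(t2v_map(toy_c2i, t2v_gts))  # text-to-video mean AP
print(v2t_map(toy_c2i, v2t_gts))  # video-to-text mean AP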
Example 9
def eval_file(data, weights, file_name):
    scorer = getScorer('DCG')
    feat_dim = data[0][-1].shape[1]
    nr_of_qry = len(data)

    mean_perf = [0] * feat_dim
    avg_perf = 0

    fout = open(file_name, 'w')

    for i in range(nr_of_qry):
        qid, labels, scores = data[i]
        fusion = scores.dot(weights)

        print '.' * 20
        print fusion
        print '-' * 20
        sorted_idx = (0 - fusion).argsort()
        sorted_labels = [labels[x] for x in sorted_idx]
        fout.write(qid + " " + str(scorer.score(sorted_labels)) + "\n")
    fout.close()
Example 11
def test():
    scorer = getScorer('DCG@25')
    data = load_data()
    feat_dim = data[0][-1].shape[1]
    nr_of_qry = len(data)


    mean_perf = [0] * feat_dim
    rand_perf = 0
    avg_perf = 0

    weights = [1.0/feat_dim] * feat_dim
        
    for i in range(nr_of_qry):
        qid, labels, scores = data[i]
        nr_of_img = len(labels)
        A = (0-scores).argsort(axis=0)
        for j in range(feat_dim):
            sorted_idx = A[:,j]
            sorted_labels = [labels[x] for x in sorted_idx]
            random_guess = scorer.score(random.sample(labels, len(labels)))
            run = scorer.score(sorted_labels)
            mean_perf[j] += run
            rand_perf += random_guess
        #print int(run > random_guess), run
        
        avg = scores.dot(weights)
        sorted_idx = (0-avg).argsort()
        sorted_labels = [labels[x] for x in sorted_idx]
        avg_perf += scorer.score(sorted_labels)
    
    print 'random guess', rand_perf / (feat_dim * nr_of_qry)
    for j in range(feat_dim):
        print 'DCG@25 of feature_%d: %s' % (j, mean_perf[j] / nr_of_qry)
    # print 'mean DCG@25 of different feature:' , avg_perf / nr_of_qry
        
    print 'DCG@25 of average_fusion:', eval(data, weights)
Example 14
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    method = options.method
    metric = options.metric

    qrysim = options.qrysim
    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature

    # semantic embedding
    k = options.k
    corpus = options.corpus
    word2vec_model = options.word2vec
    label_source = options.label_source

    # result path
    ranking_result_path = os.path.join(rootpath, devCollection, 'SimilarityIndex', devCollection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, devCollection, metric, method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)   #(qid query)
    qid2query =  dict(zip(qid_list, [qp.process(query) for query in query_list]))
    
    # path of image feature
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData', feature)


    # method selection
    if method =='conse':
        se_searcher = ConSE(label_source, corpus, word2vec_model, dev_feat_path, rootpath)

    elif method == 't2i' or method == 'ta': 
        nnquery_file = os.path.join(rootpath, devCollection, 'TextData','querynn', options.nnqueryfile)
        qryClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.queryclickfile)
        t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path, train_feat_path, ntopqry)

    elif method == 'i2t' or method == 'ia':
        nnimage_file = os.path.join(rootpath, devCollection, 'TextData','imagenn', feature, options.nnimagefile)
        imgClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.imageclickfile)
        i2t_searcher = Image2Text(nnimage_file, imgClick_file, qrysim, ntopimg, ntopqry)

    else:
        print "this model is not supported with %s" % method
        sys.exit(0)


 
    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for query_id in qid_list:

        iid_list, label_list = readAnnotationsFrom(devCollection, 'concepts%s.txt' % devCollection, query_id, False, rootpath)        

        if method == 'conse':
            scorelist = se_searcher.do_search(qid2query[query_id], iid_list, k)

        elif method == 't2i':
            scorelist = t2i_searcher.text2image(query_id, iid_list, qrythres, mincc )

        elif method == 'ta':
            scorelist = t2i_searcher.textAnnotation( query_id, iid_list, ntopimg, qrythres, mincc)

        elif method == 'i2t': 
            scorelist = i2t_searcher.image2text(qid2query[query_id], iid_list, mincc )

        elif method == 'ia':
            scorelist = i2t_searcher.imageAnnotation( qid2query[query_id], iid_list, mincc )    
         

        if len(scorelist) == 0: 
            failed_count += 1
            scorelist = [0]*len(iid_list)
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            random.shuffle(qid2iid_label_score[query_id])
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            qid2iid_label_score[query_id] = sorted(qid2iid_label_score[query_id], key=lambda v:v[2], reverse=True)


        # calculate the result ranking of DCG@25 from our model
        qid2dcg[query_id] = scorer.score([x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if(done % 20 == 0):
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}
    
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed query: %d" % failed_count 
    print "average DCG@25: %f" % (1.0*sum(qid2dcg.values())/ len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file,'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
Example 15
urls = ('/', 'index', '/search', 'ImageSearch', '/images/(.*)', 'images',
        '/img/(.*)', 'images', '/images2/(.*)', 'bigimages')

render = web.template.render('templates/')

pwd = os.path.dirname(os.path.realpath(__file__))
config = json.load(open(os.path.join(pwd, 'config.json')))

max_hits = config['max_hits']
rootpath = config['rootpath']
collection = config['collection']
rankMethod = config['rankMethod']
annotationName = config['annotationName']
metric = config['metric']
scorer = getScorer(metric)

simdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection,
                      rankMethod)
imset = readImageSet(collection, collection, rootpath)


class index:
    def GET(self):
        input = web.input(query=None)
        resp = {
            'status': 0,
            'hits': 0,
            'random': [],
            'tagrel': [],
            'metric': metric,
Example 16
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma =options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex', collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)
    
    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query =  dict(zip(qid_list, query_list))
    
    # input of image
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")


    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(collection, 'concepts%s.txt' % collection, qid, False, rootpath)

        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(calParzen(img_feats.read_one(imidx), test_X , sigma))

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list), key=lambda v:v[2], reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
             writeRankingResult(ranking_result_path, qid2iid_label_score)
             qid2iid_label_score = {}


    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0*sum(qid2dcg.values())/ len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file,'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
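calParzen itself is not shown here; judging by its name and the sigma parameter, it presumably scores an image feature with a Parzen-window (kernel density) estimate against the candidate feature matrix. A hypothetical stand-in with a Gaussian kernel, for illustration only:

import numpy as np

def cal_parzen_sketch(x, X, sigma):
    # Hypothetical substitute for calParzen: mean Gaussian kernel response of
    # feature vector x against every row of the feature matrix X.
    diffs = np.asarray(X) - np.asarray(x)
    sq_dists = (diffs ** 2).sum(axis=1)
    return np.exp(-sq_dists / (2.0 * sigma ** 2)).mean()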
Example 17
def process(options, trainCollection, trainAnnotationName, valCollection,
            valAnnotationName, feature, modelName):
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection,
                            trainAnnotationName,
                            rootpath=rootpath)
    valConcepts = readConcepts(valCollection,
                               valAnnotationName,
                               rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert (concepts[i] == valConcepts[i])

    resultdir = os.path.join(
        rootpath, trainCollection, 'Models', trainAnnotationName,
        '%s,best_params' % modelName,
        '%s,%s,%s' % (valCollection, valAnnotationName, feature))
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(
        os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert (feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        names, labels = readAnnotationsFrom(valCollection,
                                            valAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '
            #print modelName, '>'*20, svm_params
            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i]))
                        for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)

        printStatus(
            INFO,
            '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' %
            (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        fw = open(resultfile, 'w')
        fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
        fw.close()
Example 18
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath

    p1_scorer = getScorer('P@3')
    p3_scorer = getScorer('P@5')
    r1_scorer = getScorer('R@3')
    r3_scorer = getScorer('R@5')
    ndcg1_scorer = getScorer('NDCG2@3')
    ndcg3_scorer = getScorer('NDCG2@5')
    ap_scorer = getScorer('AP')
    rr_scorer = getScorer('RR')

    datafiles = [
        x.strip() for x in open(runfile).readlines()
        if x.strip() and not x.strip().startswith('#')
    ]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    name2label = [{} for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concepts[i],
                                            skip_0=False,
                                            rootpath=rootpath)
        #names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

    # ('7975436322', set([33]))
    # for im, im_labels in rel_conset.items():
    #   print(im, im_labels)

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        # for im in imset:
        #     print(im)
        #     raw_input()
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        res = np.zeros((nr_of_images, 8))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            # print(ranklist)
            # raw_input()
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            # print(sorted_labels)
            # raw_input()
            assert len(sorted_labels) == nr_of_concepts
            p1 = p1_scorer.score(sorted_labels)
            p3 = p3_scorer.score(sorted_labels)
            r1 = r1_scorer.score(sorted_labels)
            r3 = r3_scorer.score(sorted_labels)
            ndcg1 = ndcg1_scorer.score(sorted_labels)
            ndcg3 = ndcg3_scorer.score(sorted_labels)
            ap = ap_scorer.score(sorted_labels)
            rr = rr_scorer.score(sorted_labels)

            f1, f3 = 0.0, 0.0
            if (p1 + r1) != 0.0:
                f1 = 2 * p1 * r1 / (p1 + r1)
            if (p3 + r3) != 0.0:
                f3 = 2 * p3 * r3 / (p3 + r3)
            # h1, h3 = max(p1, r1), max(p3, r3)
            res[j, :] = [p1, p3, r1, r3, ndcg1, ndcg3, ap, rr]
            res[j, :] = [p1, p3, f1, f3, ndcg1, ndcg3, ap, rr]
            # res[j,:] = [p1, p3, h1, h3, ndcg1, ndcg3, ap, rr]
        avg_perf = res.mean(axis=0)
        name = path.basename(datafiles[run_idx]).split('.')[0]
        name = name.split(',')[1]
        stdout.write('%s\t' % name)
        # for x in avg_perf:
        for i in range(len(avg_perf)):
            if i == 4 or i == 5:
                continue
            # x = avg_perf[i] * 100.0
            x = avg_perf[i]
            if x >= 100.0:
                stdout.write('& %.1f ' % x)
            else:
                # stdout.write('& %.2f ' % x)
                stdout.write('& %s' % (('%.4f ' % x).lstrip('0')))
        stdout.write('\n')
Example 19
from fastsvm.svm import *
from fiksvm import *
from fiksvmutil import *
from fastsvm.fiksvm import svm_to_fiksvm as svm_to_fiksvm0


if __name__ == "__main__":
    rootpath = ROOT_PATH
    trainCollection = "voc2008train"
    testCollection = "voc2008val"
    annotationName = "conceptsvoc2008train.txt"
    #concept = "aeroplane"
    feature = "dsift"

    concepts = readConcepts(testCollection, 'conceptsvoc2008val.txt')
    scorer = getScorer('AP')

    min_vals, max_vals = find_min_max_vals(BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature), FEATURE_TO_DIM[feature]))
    featurefile = os.path.join(rootpath, testCollection, "FeatureData", feature, "id.feature.txt")

    feat_dim = 1024
    num_bins = 50

    #fikmodel.set_probAB(-1, 0)
    
    #print "fik model0", fikmodel0.get_nr_svs(), fikmodel0.get_feat_dim(), fikmodel0.get_probAB()
    #print "fik model", fikmodel.get_nr_svs(), fikmodel.get_feat_dim(), fikmodel.get_probAB()
    mAP = [0]*4
    for concept in concepts:
        names,labels = readAnnotationsFrom(testCollection, 'conceptsvoc2008val.txt', concept)
        name2label = dict(zip(names,labels))
Example 20
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultdir = os.path.join(rootpath, collection, 'SimilarityIndex',
                             collection)

    apscorer = getScorer('AP')
    datafiles = [
        x.strip() for x in open(runfile).readlines()
        if x.strip() and not x.strip().startswith('#')
    ]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concepts[i],
                                            skip_0=False,
                                            rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        label_file = os.path.join(rootpath, collection, 'tagged,lemm',
                                  '%s.txt' % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(
            INFO, 'readLabeledImageSet for %s-%s -> %d hits' %
            (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))

    for run_idx in range(nr_of_runs):
        runName = os.path.splitext(os.path.basename(datafiles[run_idx]))[0]
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            resfile = os.path.join(resultdir, runName,
                                   '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(ranklist, resfile)
            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert (len(sorted_labels) > 0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            resfile = os.path.join(resultdir, 'tagged,lemm', runName,
                                   '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(
                    [x for x in ranklist if x[0] in hit_imgset[c_idx]],
                    resfile)

            sorted_labels = [
                ground_truth[x[0]] for x in ranklist
                if x[0] in hit_imgset[c_idx]
            ]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)

    print '#' * 100
    print '# untagged-concept', ' '.join(
        [os.path.basename(x) for x in datafiles])
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ap_table[:, c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ap2_table[:, c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])
Example 22
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex',
                                       collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method,
                                   feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations',
                                  'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query = dict(zip(qid_list, query_list))

    # input of image
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")

    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(
            collection, 'concepts%s.txt' % collection, qid, False, rootpath)

        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(
                calParzen(img_feats.read_one(imidx), test_X, sigma))

        # parzen_list_suffle = calParzen_fast(test_X, len(renamed), sigma)
        # parzen_list = []
        # for imidx in iid_list:
        #     parzen_list.append(parzen_list_suffle[renamed.index(imidx)])

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list),
                              key=lambda v: v[2],
                              reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) /
                                  len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
Example 23
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    method = options.method
    metric = options.metric

    qrysim = options.qrysim
    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature

    # semantic embedding
    k = options.k
    corpus = options.corpus
    word2vec_model = options.word2vec
    label_source = options.label_source

    # result path
    ranking_result_path = os.path.join(rootpath, devCollection,
                                       'SimilarityIndex', devCollection,
                                       'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, devCollection, 'DCG', method,
                                   feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # inpute of query
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations',
                                  'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)  #(qid query)
    qid2query = dict(zip(qid_list,
                         [qp.process(query) for query in query_list]))

    # path of image feature
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData',
                                 feature)

    # method selection
    if method == 'se':
        se_searcher = SemanticEmbedding(label_source, corpus, word2vec_model,
                                        dev_feat_path, rootpath)

    elif method == 't2i':
        nnquery_file = os.path.join(rootpath, devCollection, 'TextData',
                                    'querynn', options.nnqueryfile)
        qryClick_file = os.path.join(rootpath, trainCollection, 'TextData',
                                     options.queryclickfile)
        t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path,
                                  train_feat_path, ntopqry)

    elif method == 'i2t':
        nnimage_file = os.path.join(rootpath, devCollection, 'TextData',
                                    'imagenn', feature, options.nnimagefile)
        imgClick_file = os.path.join(rootpath, trainCollection, 'TextData',
                                     options.imageclickfile)
        i2t_searcher = Image2Text(nnimage_file, imgClick_file, qrysim, ntopimg,
                                  ntopqry)

    else:
        print "this model is not supported with %s" % method
        sys.exit(0)

    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for query_id in qid_list:

        iid_list, label_list = readAnnotationsFrom(
            devCollection, 'concepts%s.txt' % devCollection, query_id, False,
            rootpath)

        if method == 'se':
            scorelist = se_searcher.do_search(qid2query[query_id], iid_list, k)

        elif method == 't2i':
            scorelist = t2i_searcher.text2image(query_id, iid_list, qrythres,
                                                mincc)

        elif method == 'i2t':
            scorelist = i2t_searcher.image2text(qid2query[query_id], iid_list,
                                                mincc)

        if len(scorelist) == 0:
            failed_count += 1
            scorelist = [0] * len(iid_list)
            qid2iid_label_score[query_id] = zip(iid_list, label_list,
                                                scorelist)
            random.shuffle(qid2iid_label_score[query_id])
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list,
                                                scorelist)
            qid2iid_label_score[query_id] = sorted(
                qid2iid_label_score[query_id],
                key=lambda v: v[2],
                reverse=True)

        # calculate the result ranking of DCG@25 from our model
        qid2dcg[query_id] = scorer.score(
            [x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if (done % 20 == 0):
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed query: %d" % failed_count
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) /
                                  len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
Example 24
def process(options, collection, annotationName, runfile, outDirectory):
    rootpath = options.rootpath

    apscorer = getScorer("AP")
    ndcg = getScorer("NDCG@20")
    ndcg2 = getScorer("NDCG2@20")
    p1scorer = getScorer("P@1")
    p5scorer = getScorer("P@5")

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith("#")]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, "read annotations from files")

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        label_file = os.path.join(rootpath, collection, "tagged,lemm", "%s.txt" % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, "readLabeledImageSet for %s-%s -> %d hits" % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print "#" * 100
    print "# method miap hit1 hit5"
    print "#" * 100

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], "rb"))
        scores = data["scores"]
        assert scores.shape[1] == nr_of_concepts
        imset = data["id_images"]
        imset = np.array([int(x) for x in imset])
        idx = np.argsort(imset)
        imset = imset[idx]
        scores = scores[idx]
        nr_of_images = len(imset)
        # print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert len(sorted_labels) > 0
            # print concepts[c_idx], ranklist[:5], sorted_labels[:5]

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        res = np.zeros((nr_of_images, 4))
        gt = np.zeros((nr_of_images, nr_of_concepts))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]

            # print rel_set
            # print sorted_labels

            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j, :] = [ap, hit1, hit5, len(rel_set)]
            gt[j, :] = sorted_labels
        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], " ".join(["%.3f" % x for x in avg_perf])

        outMiap = h5py.File(os.path.join(outDirectory, os.path.split(datafiles[run_idx])[-1] + ".h5"), "w")
        outMiap["iap"] = res[:, 0]
        outMiap["ngt"] = res[:, 3]
        outMiap["hit1"] = res[:, 1]
        outMiap["hit5"] = res[:, 2]
        outMiap["gt"] = gt
        outMiap["concepts"] = concepts
        outMiap["ap"] = ap_table[run_idx, :]
        outMiap["ap2"] = ap2_table[run_idx, :]
        outMiap[ndcg.name()] = ndcg_table[run_idx, :]
        outMiap[ndcg2.name()] = ndcg2_table[run_idx, :]
        outMiap.close()

    print "#" * 100
    print "# untagged-concept", " ".join([os.path.split(x)[-1] for x in datafiles])
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ap_table[:, c_idx]])
    print "meanAP", " ".join(["%.3f" % x for x in ap_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ap2_table[:, c_idx]])
    print "meanAP2", " ".join(["%.3f" % x for x in ap2_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ndcg_table[:, c_idx]])
    print "mean%s" % ndcg.name(), " ".join(["%.3f" % x for x in ndcg_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ndcg2_table[:, c_idx]])
    print "mean%s" % ndcg2.name(), " ".join(["%.3f" % x for x in ndcg2_table.mean(axis=1)])
Example 27
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    metric = options.metric

    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature


    # result path
    ranking_result_path = os.path.join(rootpath, devCollection, 'SimilarityIndex', devCollection, 'MetaData', 'text2image', feature)
    DCG_result_path = os.path.join(rootpath, devCollection, metric, 'text2image', feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)   #(qid query)
    qid2query =  dict(zip(qid_list, [qp.process(query) for query in query_list]))

    # random performance for specific queries
    qid_randomperf_file = os.path.join(rootpath, devCollection, 'Annotations', '*****@*****.**')
    qid2randomperf = {}
    for line in open(qid_randomperf_file):
        qid, random_perf = line.strip().split()
        qid2randomperf[qid] = float(random_perf)

    
    # path of image feature
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData', feature)

    nnquery_file = os.path.join(rootpath, devCollection, 'TextData','querynn', options.nnqueryfile)
    qryClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.queryclickfile)
    t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path, train_feat_path, ntopqry)

    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}

    for query_id in qid_list:

        iid_list, label_list = readAnnotationsFrom(devCollection, 'concepts%s.txt' % devCollection, query_id, False, rootpath)        

        scorelist = t2i_searcher.doSearch( query_id, iid_list, ntopimg, qrythres, mincc)
         

        if len(scorelist) == 0: 
            failed_count += 1
            qid2dcg[query_id] = qid2randomperf[query_id]
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            qid2iid_label_score[query_id] = sorted(qid2iid_label_score[query_id], key=lambda v:v[2], reverse=True)
            # compute DCG@25 of the ranking produced by our model
            qid2dcg[query_id] = scorer.score([x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if(done % 20 == 0):
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}
    
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed query: %d" % failed_count 
    print "average DCG@25: %f" % (1.0*sum(qid2dcg.values())/ len(qid2dcg.values()))
Example no. 28
    from basic.constant import ROOT_PATH
    from basic.metric import getScorer
    from basic.common import writeRankingResults
    from basic.annotationtable import readAnnotationsFrom
    from simpleknn.bigfile import BigFile
    
    ROOT_PATH = '/home/root123/xirong/VisualSearch'
    rootpath = ROOT_PATH
    trainCollection = 'flickr81train'
    trainAnnotationName = 'concepts81train.random50.0.random50.0.txt'
    testCollection = "flickr81test"
    testAnnotationName = 'conceptsflickr81test.txt'
    feature = "dascaffeprob"
    feat_dim = 1000
    scorer = getScorer("AP")
    
    targetConcept = sys.argv[1] #"aeroplane"

    train_feat_file = BigFile(os.path.join(ROOT_PATH, trainCollection, "FeatureData", feature), feat_dim)
    test_feat_file = BigFile(os.path.join(ROOT_PATH, testCollection, "FeatureData", feature), feat_dim)
    testImageSet = test_feat_file.names #random.sample(test_feat_file.names, 10000)
    
    minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
    with open(minmax_file, 'r') as f:
        min_vals = map(float, str.split(f.readline()))
        max_vals = map(float, str.split(f.readline()))


    [names,labels] = readAnnotationsFrom(collection=trainCollection, annotationName=trainAnnotationName, concept=targetConcept, rootpath=rootpath)
    name2label = dict(zip(names,labels))
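
The min_vals/max_vals read from minmax.txt above are presumably used to rescale raw features into [0, 1] before training; a minimal per-vector sketch of that scaling (minmax_normalize is an illustrative name, not part of the codebase):

def minmax_normalize(vec, min_vals, max_vals):
    # Rescale each dimension to [0, 1] using the training-set range;
    # dimensions with a constant value are mapped to 0.
    normed = []
    for x, lo, hi in zip(vec, min_vals, max_vals):
        normed.append((x - lo) / (hi - lo) if hi > lo else 0.0)
    return normed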
Example no. 29
def process(options, collection, annotationName, runfile, outDirectory):
    rootpath = options.rootpath

    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    datafiles = [
        x.strip() for x in open(runfile).readlines()
        if x.strip() and not x.strip().startswith('#')
    ]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection,
                                            annotationName,
                                            concepts[i],
                                            skip_0=False,
                                            rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        label_file = os.path.join(rootpath, collection, 'tagged,lemm',
                                  '%s.txt' % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(
            INFO, 'readLabeledImageSet for %s-%s -> %d hits' %
            (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print '#' * 100
    print '# method miap hit1 hit5'
    print '#' * 100

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        imset = np.array([int(x) for x in imset])
        idx = np.argsort(imset)
        imset = imset[idx]
        scores = scores[idx]
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert (len(sorted_labels) > 0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [
                ground_truth[x[0]] for x in ranklist
                if x[0] in hit_imgset[c_idx]
            ]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        res = np.zeros((nr_of_images, 4))
        gt = np.zeros((nr_of_images, nr_of_concepts))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]

            #print rel_set
            #print sorted_labels

            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j, :] = [ap, hit1, hit5, len(rel_set)]
            gt[j, :] = sorted_labels
        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], ' '.join(
            ['%.3f' % x for x in avg_perf])

        outMiap = h5py.File(
            os.path.join(outDirectory,
                         os.path.split(datafiles[run_idx])[-1] + ".h5"), 'w')
        outMiap['iap'] = res[:, 0]
        outMiap['ngt'] = res[:, 3]
        outMiap['hit1'] = res[:, 1]
        outMiap['hit5'] = res[:, 2]
        outMiap['gt'] = gt
        outMiap['concepts'] = concepts
        outMiap['ap'] = ap_table[run_idx, :]
        outMiap['ap2'] = ap2_table[run_idx, :]
        outMiap[ndcg.name()] = ndcg_table[run_idx, :]
        outMiap[ndcg2.name()] = ndcg2_table[run_idx, :]
        outMiap.close()

    print '#' * 100
    print '# untagged-concept', ' '.join(
        [os.path.split(x)[-1] for x in datafiles])
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ap_table[:, c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ap2_table[:, c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ndcg_table[:, c_idx]])
    print 'mean%s' % ndcg.name(), ' '.join(
        ['%.3f' % x for x in ndcg_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(
            ['%.3f' % x for x in ndcg2_table[:, c_idx]])
    print 'mean%s' % ndcg2.name(), ' '.join(
        ['%.3f' % x for x in ndcg2_table.mean(axis=1)])
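
The AP scorer used throughout these evaluation scripts takes a list of labels sorted by the model's ranking. Its implementation is not shown here; a common non-interpolated average precision over such a list, treating labels > 0 as relevant, would look roughly like the sketch below (average_precision is an illustrative name, and the library's normalization may differ):

def average_precision(sorted_labels):
    # Mean of the precision values at the ranks where a relevant item occurs.
    hits = 0
    precision_sum = 0.0
    for i, lab in enumerate(sorted_labels):
        if lab > 0:
            hits += 1
            precision_sum += float(hits) / (i + 1)
    return precision_sum / hits if hits else 0.0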
Example no. 30
    feature = "dsift"
    
    trainCollection = 'voc2008train'
    trainAnnotationName = 'conceptsvoc2008train.txt'
    testCollection = 'voc2008val'
    testset = testCollection
    testAnnotationName = 'conceptsvoc2008val.txt'

    modelName = 'fik50' 
    #modelName = 'fastlinear'
    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model

    scorer = getScorer(metric)
    
    imset = readImageSet(testCollection,testset,rootpath=rootpath)
    concepts = readConcepts(testCollection,testAnnotationName,rootpath=rootpath)
    feat_dir = os.path.join(rootpath, testCollection, "FeatureData", feature)
    feat_file = BigFile(feat_dir)

    _renamed, _vectors = feat_file.read(imset)

    nr_of_images = len(_renamed)
    nr_of_concepts = len(concepts)
    
    mAP = 0.0
    models = [None] * len(concepts)

    stream = StreamFile(feat_dir)
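
The snippet above stops right before the per-concept scoring loop. Under the assumption that each loaded model exposes some way of producing a decision value for one feature vector, the mAP accumulation typically looks like the sketch below; decision_function is a hypothetical placeholder, not the actual fiksvm/fastlinear API:

def evaluate_concepts(concepts, models, renamed, vectors, name2label_list, scorer, decision_function):
    # Rank the test images per concept, look up their ground-truth labels,
    # score the ranked label list with AP, and average over concepts.
    mAP = 0.0
    for i in range(len(concepts)):
        ranklist = [(name, decision_function(models[i], vec))
                    for name, vec in zip(renamed, vectors)]
        ranklist.sort(key=lambda v: v[1], reverse=True)
        name2label = name2label_list[i]
        sorted_labels = [name2label[name] for name, _ in ranklist if name in name2label]
        mAP += scorer.score(sorted_labels)
    return mAP / len(concepts)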
Example no. 31
from fastsvm.svmutil import *
from fastsvm.svm import *
from fiksvm import *
from fiksvmutil import *
from fastsvm.fiksvm import svm_to_fiksvm as svm_to_fiksvm0

if __name__ == "__main__":
    rootpath = ROOT_PATH
    trainCollection = "voc2008train"
    testCollection = "voc2008val"
    annotationName = "conceptsvoc2008train.txt"
    #concept = "aeroplane"
    feature = "dsift"

    concepts = readConcepts(testCollection, 'conceptsvoc2008val.txt')
    scorer = getScorer('AP')

    min_vals, max_vals = find_min_max_vals(
        BigFile(
            os.path.join(rootpath, trainCollection, 'FeatureData', feature),
            FEATURE_TO_DIM[feature]))
    featurefile = os.path.join(rootpath, testCollection, "FeatureData",
                               feature, "id.feature.txt")

    feat_dim = 1024
    num_bins = 50

    #fikmodel.set_probAB(-1, 0)

    #print "fik model0", fikmodel0.get_nr_svs(), fikmodel0.get_feat_dim(), fikmodel0.get_probAB()
    #print "fik model", fikmodel.get_nr_svs(), fikmodel.get_feat_dim(), fikmodel.get_probAB()
Example no. 32
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath
    
    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)
    
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)  
    nr_of_concepts = len(concepts)
    
    printStatus(INFO, 'read annotations from files')
    
    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}
    
    for i in range(nr_of_concepts):
        names,labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names,labels))
        
        for im,lab in zip(names,labels):
            if lab > 0:
                rel_conset.setdefault(im,set()).add(i)

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt'% concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))
        
    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))
    
    print '#'*100
    print '# method miap hit1 hit5'
    print '#'*100
    
    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx],'rb'))
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]
                   
        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist =  zip(imset, scores[:,c_idx])
            ranklist.sort(key=lambda v:(v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]
            
            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels)>0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        res = np.zeros((nr_of_images, 3))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j,:])
            ranklist.sort(key=lambda v:v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j,:] = [ap, hit1, hit5]
        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], ' '.join(['%.3f' % x for x in avg_perf])
            


    print '#'*100
    print '# untagged-concept', ' '.join([os.path.split(x)[-1] for x in datafiles])
    print '#'*100
            
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:,c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])
    
    print '#'*100
    print '# tagged-concept'
    print '#'*100
    
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:,c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])
    
    print '#'*100
    print '# tagged-concept'
    print '#'*100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg_table[:,c_idx]])
    print 'mean%s' % ndcg.name(), ' '.join(['%.3f' % x for x in ndcg_table.mean(axis=1)])

    print '#'*100
    print '# tagged-concept'
    print '#'*100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg2_table[:,c_idx]])
    print 'mean%s'%ndcg2.name(), ' '.join(['%.3f' % x for x in ndcg2_table.mean(axis=1)])
Example no. 33
def process(options, trainCollection, valCollection, testCollection):
    lang = which_language(trainCollection)
    assert(which_language(trainCollection) == which_language(valCollection))
    assert(which_language(trainCollection) == which_language(testCollection))

    rootpath = options.rootpath
    overwrite =  options.overwrite
    checkpoint = options.checkpoint
    init_model_from = options.init_model_from
    unroll = options.unroll
    corpus = options.corpus
    word2vec = options.word2vec
    batch_size = options.batch_size
    
    w2vv_config = options.model_config
    config = load_config('w2vv_configs/%s.py' % w2vv_config)

    img_feature = config.img_feature
    set_style = config.set_style
    # text embedding style (word2vec, bag-of-words, word hashing)
    text_style = config.text_style
    L1_normalize = config.L1_normalize
    L2_normalize = config.L2_normalize
    
    bow_vocab = config.bow_vocab+'.txt'

    l2_p = config.l2_p
    dropout = config.dropout
    
    max_epochs= config.max_epochs
    optimizer = config.optimizer
    loss_fun = config.loss_fun
    lr = config.lr
    clipnorm = config.clipnorm
    activation = config.activation
    sequences = config.sequences

    # lstm
    sent_maxlen = config.sent_maxlen
    embed_size = config.embed_size
    we_trainable = config.we_trainable
    lstm_size = config.lstm_size

    n_layers = map(int, config.n_layers.strip().split('-'))

    if init_model_from != '':
        init_model_name = init_model_from.strip().split("/")[-1]
        train_style = INFO + "_" + init_model_name
    else:
        train_style = INFO

    rnn_style, bow_style, w2v_style = text_style.strip().split('@')
    
    # text embedding style
    model_info = w2vv_config

    if 'lstm' in text_style or 'gru' in text_style:
        if lang == 'zh':
            w2v_data_path = os.path.join(rootpath, 'zh_w2v', 'model', 'zh_jieba.model')
        else:
            w2v_data_path = os.path.join(rootpath, "word2vec", corpus, word2vec)

        # bag-of-words vocabulary file path
        text_data_path = os.path.join(rootpath, trainCollection, "TextData", "vocabulary", "bow", bow_vocab)
        bow_data_path = os.path.join(rootpath, trainCollection, "TextData", "vocabulary", bow_style, bow_vocab)
        
        # text embedding (text representation)
        text2vec = get_text_encoder(rnn_style)(text_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize, maxlen=sent_maxlen)
        bow2vec = get_text_encoder(bow_style)(bow_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize)
        w2v2vec = get_text_encoder(w2v_style)(w2v_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize)
        if n_layers[0] == 0:
            n_layers[0] = bow2vec.ndims + w2v2vec.ndims
        else:
            assert n_layers[0] == bow2vec.ndims + w2v2vec.ndims

        # log file
        checkpoint_dir = os.path.join(rootpath, trainCollection, checkpoint, valCollection, train_style, model_info)

    else:
        logger.info("%s is not supported, please check the 'text_style' parameter", text_style)
        sys.exit(0)

    train_loss_hist_file = os.path.join(checkpoint_dir, 'train_loss_hist.txt')
    val_per_hist_file = os.path.join(checkpoint_dir, 'val_per_hist.txt')
    model_file_name = os.path.join(checkpoint_dir, 'model.json')
    model_img_name = os.path.join(checkpoint_dir, 'model.png')

    logger.info(model_file_name)
    if checkToSkip(model_file_name, overwrite):
        sys.exit(0)

    makedirsforfile(val_per_hist_file)

    # img2vec
    img_feat_path = os.path.join(rootpath, FULL_COLLECTION, 'FeatureData', img_feature)
    img_feats = BigFile(img_feat_path)

    val_img_feat_path = os.path.join(rootpath, FULL_COLLECTION, 'FeatureData', img_feature)
    val_img_feats = BigFile(val_img_feat_path)

    # dataset 
    train_file = os.path.join(rootpath, trainCollection, 'TextData', '%s.caption.txt' % trainCollection)

    # training set
    # print "loss function: ", loss_fun
    dataset_style = 'sent_' + loss_fun
    DataSet = get_dataset(dataset_style)

    # represent text on the fly
    trainData = DataSet(train_file, batch_size, text2vec, bow2vec, w2v2vec, img_feats, flag_maxlen=True, maxlen=sent_maxlen)

    # get pre-trained word embedding
    we_weights = get_we_parameter(text2vec.vocab, w2v_data_path, lang)
    # define word2visualvec model
    w2vv = W2VV_MS( text2vec.nvocab, sent_maxlen, embed_size, we_weights, we_trainable, lstm_size, n_layers, dropout, l2_p, activation=activation, lstm_style=rnn_style, sequences=sequences, unroll=unroll)

    w2vv.save_json_model(model_file_name)
    w2vv.plot(model_img_name)
    w2vv.compile_model(optimizer, loss_fun, learning_rate = lr, clipnorm=clipnorm)
   

    if options.init_model_from != '':
        logger.info('initialize the model from %s', options.init_model_from)
        w2vv.init_model(options.init_model_from)

    # preparation for validation
    val_sent_file = os.path.join(rootpath, valCollection, 'TextData', '%s.caption.txt' % valCollection)
    val_sents_id, val_sents, val_id2sents = readSentsInfo(val_sent_file)
    val_img_list = map(str.strip, open(os.path.join(rootpath, valCollection,  set_style, '%s.txt' % valCollection)).readlines())

    sent_feats_1 = []
    sent_feats_2 = []
    new_val_sents_id = []
    for index, sent in enumerate(val_sents):
        sent_vec = text2vec.mapping(sent)
        bow_vec = bow2vec.mapping(sent)
        w2v_vec = w2v2vec.mapping(sent)
        if sent_vec is not None and bow_vec is not None and w2v_vec is not None:
            sent_feats_1.append(sent_vec)
            sent_feats_2.append(list(bow_vec) + list(w2v_vec))
            new_val_sents_id.append(val_sents_id[index])
    sent_feats_1 = pad_sequences(sent_feats_1, maxlen=sent_maxlen,  truncating='post')

    simer = get_simer('cosine_batch')()
    scorer = getScorer(options.val_metric)

    count = 0
    lr_count = 0
    best_validation_perf = 0
    best_epoch = -1
    train_loss_hist = []
    val_per_hist = []
    n_train_batches = int(np.ceil( 1.0 * trainData.datasize / batch_size ))
    if loss_fun == 'ctl':
        datasize = 2*trainData.datasize
    else:
        datasize = trainData.datasize
    for epoch in range(max_epochs):
        logger.info('Epoch %d', epoch)
        logger.info("Training..., learning rate: %g", w2vv.get_lr())
        
        train_loss_epoch = []
        train_progbar = generic_utils.Progbar(datasize)
        trainBatchIter = trainData.getBatchData()
        for minibatch_index in xrange(n_train_batches):
            train_X_batch, train_Y_batch = trainBatchIter.next()
            loss = w2vv.model.train_on_batch(train_X_batch, train_Y_batch)
            train_progbar.add(train_X_batch[0].shape[0], values=[("train loss", loss)])

            train_loss_epoch.append(loss)

        train_loss_hist.append(np.mean(train_loss_epoch))

        this_validation_perf = do_validation(val_img_list, val_img_feats, new_val_sents_id, sent_feats_1, sent_feats_2, simer, scorer, w2vv)
        val_per_hist.append(this_validation_perf)

        logger.info('previous_best_performance: %g', best_validation_perf)
        logger.info('current_performance: %g', this_validation_perf)

        fout_file = os.path.join(checkpoint_dir, 'epoch_%d.h5' % ( epoch))

        lr_count += 1
        if this_validation_perf > best_validation_perf:
            best_validation_perf = this_validation_perf          
            count = 0

            # save best model
            w2vv.model.save_weights(fout_file)
            if best_epoch != -1:
                os.system('rm '+ os.path.join(checkpoint_dir, 'epoch_%d.h5' % (best_epoch)))
            best_epoch = epoch
        else:
            # when the validation performance has decreased after an epoch,
            # we divide the learning rate by 2 and continue training;
            # but we use each learning rate for at least 3 epochs.
            if lr_count > 2:
                w2vv.decay_lr(0.5)
                lr_count = 0
            count += 1
            if count > 10:
                print ("Early stopping happend")
                break


    sorted_epoch_loss = zip(range(len(train_loss_hist)), train_loss_hist)
    with open(train_loss_hist_file, 'w') as fout:
        for i, loss in sorted_epoch_loss:
            fout.write("epoch_" + str(i) + " " + str(loss) + "\n")


    sorted_epoch_perf = sorted(zip(range(len(val_per_hist)), val_per_hist), key = lambda x: x[1], reverse=True)
    with open(val_per_hist_file, 'w') as fout:
        for i, perf in sorted_epoch_perf:
            fout.write("epoch_" + str(i) + " " + str(perf) + "\n")


    # generate the shell script for test
    template = ''.join(open('TEMPLATE_do_test.sh').readlines())
    scriptStr = template.replace('@@@rootpath@@@', rootpath)
    scriptStr = scriptStr.replace('@@@overwrite@@@', str(overwrite))
    scriptStr = scriptStr.replace('@@@trainCollection@@@', trainCollection)
    scriptStr = scriptStr.replace('@@@testCollection@@@', '%s %s' % (valCollection, testCollection))
    scriptStr = scriptStr.replace('@@@model_config@@@', w2vv_config)
    scriptStr = scriptStr.replace('@@@set_style@@@', set_style)
    scriptStr = scriptStr.replace('@@@model_path@@@', checkpoint_dir)
    scriptStr = scriptStr.replace('@@@model_name@@@', 'model.json')
    scriptStr = scriptStr.replace('@@@weight_name@@@', 'epoch_%d.h5' % sorted_epoch_perf[0][0])
    runfile = 'do_test_%s_%s.sh' % (w2vv_config, testCollection)
    open(runfile, 'w').write(scriptStr + '\n')
    os.system('chmod +x %s' % runfile)
    os.system('./%s' % runfile)
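
The schedule buried in the training loop above (halve the learning rate once validation performance drops, keep each rate for at least three epochs, stop after more than ten epochs without improvement) can be read more easily as a standalone helper. Below is a sketch of the same bookkeeping detached from the Keras model; the class and method names are illustrative, not part of the codebase:

class PatienceLRSchedule(object):
    # Same policy as the loop above: decay the learning rate by 0.5 when
    # validation performance drops, but only after the current rate has been
    # used for at least min_epochs_per_lr epochs; request early stopping after
    # more than `patience` epochs without a new best score.
    def __init__(self, lr, decay=0.5, min_epochs_per_lr=3, patience=10):
        self.lr = lr
        self.decay = decay
        self.min_epochs_per_lr = min_epochs_per_lr
        self.patience = patience
        self.best = None
        self.epochs_at_lr = 0
        self.bad_epochs = 0

    def update(self, val_perf):
        # Call once per epoch; returns True when training should stop early.
        self.epochs_at_lr += 1
        if self.best is None or val_perf > self.best:
            self.best = val_perf
            self.bad_epochs = 0
            return False
        if self.epochs_at_lr >= self.min_epochs_per_lr:
            self.lr *= self.decay
            self.epochs_at_lr = 0
        self.bad_epochs += 1
        return self.bad_epochs > self.patience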