def process(options, trainCollection, modelAnnotationName, trainAnnotationName, feature):
    rootpath = options.rootpath
    modelName = options.model

    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    concepts = [concepts[i] for i in range(len(concepts)) if (i % options.numjobs + 1) == options.job]

    feat_file = BigFile(os.path.join(rootpath, trainCollection, "FeatureData", feature))

    for concept in concepts:
        modelfile = os.path.join(rootpath, trainCollection, 'Models', modelAnnotationName, feature, modelName, '%s.model' % concept)
        model = load_model(modelfile)
        (A0, B0) = model.get_probAB()
        if abs(A0) > 1e-8 and not options.overwrite:
            printStatus(INFO, "old parameters exist as A=%g, B=%g, skip" % (A0, B0))
            continue

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        results = classify_large_data(model, names, feat_file, prob_output=False)
        labels = [name2label[x[0]] for x in results]
        dec_values = [x[1] for x in results]
        printStatus(INFO, "%s +%d -%d" % (concept, len([x for x in labels if x == 1]), len([x for x in labels if x == -1])))

        [A, B] = sigmoid_train(dec_values, labels)
        model.set_probAB(A, B)
        save_model(modelfile, model)
        (A1, B1) = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (A0, A1, B0, B1))
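
# For reference: sigmoid_train fits the two Platt-scaling parameters (A, B) that
# map raw SVM decision values to probabilities. Below is a minimal sketch of how
# such parameters are typically applied at prediction time, assuming the common
# LIBSVM convention p = 1 / (1 + exp(A*f + B)). The helper is hypothetical and
# not part of the fiksvm/fastlinear API.
import math

def platt_probability(dec_value, A, B):
    """Map a decision value to a probability via Platt scaling (sketch)."""
    fApB = A * dec_value + B
    # guard against overflow, as LIBSVM's implementation does
    if fApB >= 0:
        return math.exp(-fApB) / (1.0 + math.exp(-fApB))
    return 1.0 / (1.0 + math.exp(fApB))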
def evaluateSearchEngines(searchers, collection, annotationName, metric, rootpath=ROOT_PATH):
    scorer = getScorer(metric)
    concepts = readConcepts(collection, annotationName, rootpath)

    nr_of_runs = len(searchers)
    nr_of_concepts = len(concepts)
    results = np.zeros((nr_of_concepts, nr_of_runs))

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], rootpath)
        name2label = dict(zip(names, labels))

        for j in range(nr_of_runs):
            searchresults = searchers[j].scoreCollection(concepts[i])
            sorted_labels = [name2label[name] for (name, score) in searchresults if name in name2label]
            results[i, j] = scorer.score(sorted_labels)

    for i in range(nr_of_concepts):
        print concepts[i], ' '.join([niceNumber(x, 3) for x in results[i, :]])

    mean_perf = results.mean(0)
    print 'mean%s' % metric, ' '.join([niceNumber(x, 3) for x in mean_perf])

    return concepts, results
def process(options, collection, annotationName, pos_num):
    assert(annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    annotationNameStr = annotationName[:-4] + ('.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)

    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath, collection, 'Annotations', newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts, resultfile)

    first, second, last = annotationNameStr.split('%d')
    scriptfile = os.path.join(rootpath, collection, 'annotationfiles',
                              first + '0-%d' % (pos_bag_num-1) + second + '0-%d' % (neg_bag_num-1) + last)
    makedirsforfile(scriptfile)
    fout = open(scriptfile, 'w')
    fout.write('\n'.join(newAnnotationNames) + '\n')
    fout.close()

    if len(newAnnotationNames) == skip:
        return 0

    for concept in concepts:
        names, labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        positivePool = [x[0] for x in zip(names, labels) if x[1] > 0]
        negativePool = [x[0] for x in zip(names, labels) if x[1] < 0]

        for idxp in range(pos_bag_num):
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool
            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image', newAnnotationName, '%s.txt' % concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue

                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)

                assert(len(set(positiveBag).intersection(set(negativeBag))) == 0)
                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept, anno_idx, len(positiveBag), len(negativeBag), resultfile))
                writeAnnotations(positiveBag + negativeBag, [1]*len(positiveBag) + [-1]*len(negativeBag), resultfile)
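
# To make the naming template concrete, a short illustration of the annotation
# names it produces. All values below (pos_num=100, neg_pos_ratio=5, 2 positive
# and 2 negative bags) are hypothetical.
annotationName = 'concepts.txt'           # illustrative
pos_num, neg_pos_ratio = 100, 5           # illustrative
annotationNameStr = annotationName[:-4] + ('.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'
for idxp in range(2):
    for idxn in range(2):
        print annotationNameStr % (idxp, idxn)
# concepts.random100.0.npr5.0.txt
# concepts.random100.0.npr5.1.txt
# concepts.random100.1.npr5.0.txt
# concepts.random100.1.npr5.1.txt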
def GET(self):
    input = web.input(query=None)
    resp = {'status': 0, 'hits': 0, 'random': [], 'tagrel': [], 'metric': metric, 'perf': 0}

    if input.query:
        resp['status'] = 1
        resp['query'] = input.query
        query = input.query.lower()

        if query.isdigit():  # request to view a specific image
            resp['hits'] = 1
            resp['tagrel'] = [{'id': query}]
            return render.index(resp)

        try:
            names, labels = readAnnotationsFrom(collection, annotationName, query)
            name2label = dict(zip(names, labels))
        except Exception, e:
            name2label = {}

        content = []
        try:
            if input.tagrel == '0':
                labeled = readLabeledImageSet(collection, query, rootpath=rootpath)
                ranklist = [(x, 0) for x in labeled]
            else:
                simfile = os.path.join(simdir, '%s.txt' % query)
                ranklist = readRankingResults(simfile)
            resp['hits'] = len(ranklist)
            for name, score in ranklist:
                color = 'Chartreuse' if name2label.get(name, 0) > 0 else 'red'
                color = 'white' if name not in name2label else color
                res = {'id': name, 'color': color}
                content.append(res)
            resp['perf'] = 0 if not name2label else scorer.score(
                [name2label[x[0]] for x in ranklist if x[0] in name2label])
            resp['tagrel'] = content[:max_hits]
        except:
            pass
def process(options, trainCollection, trainAnnotationName, valCollection, valAnnotationName, feature, modelName):
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  # options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    valConcepts = readConcepts(valCollection, valAnnotationName, rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert(concepts[i] == valConcepts[i])

    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName,
                             '%s,best_params' % modelName,
                             '%s,%s,%s' % (valCollection, valAnnotationName, feature))

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert(feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        # np/nn: counts of positive/negative examples (note: shadows the common numpy alias locally)
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        names, labels = readAnnotationsFrom(valCollection, valAnnotationName, concept, skip_0=True, rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i])) for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        printStatus(INFO, '%s -> worstAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' % (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        fw = open(resultfile, 'w')
        fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
        fw.close()
def process(options, trainCollection, trainAnnotationName, feature):
    import re
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    nr_bins = options.nr_bins
    best_param_dir = options.best_param_dir
    beta = 0.5

    modelName = 'fik%d' % nr_bins
    if best_param_dir:
        modelName += '-tuned'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)
    params = {'nr_bins': nr_bins}
    with open(os.path.join(feat_dir, 'minmax.txt'), 'r') as f:
        params['min_vals'] = map(float, str.split(f.readline()))
        params['max_vals'] = map(float, str.split(f.readline()))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            m = p.search(open(param_file).readline().strip())
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
        model = svm_train(y, vectors, svm_params + ' -s 0 -t %d -q' % KERNEL_TYPE.index("HI"))
        newmodel = svm_to_fiksvm([model], [1.0], feat_file.ndims, params)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fiksvm_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        fiksvm_load_model(model_file_name)
        assert(abs(newmodel.get_probAB()[0] - A) < 1e-6)
        assert(abs(newmodel.get_probAB()[1] - B) < 1e-6)

    return len(todo)
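
# A quick sanity check of the regex above on a line in the format written by the
# parameter-tuning script ('bestAP=%g, best_C=%g, a=%g, b=%g'); the numbers in
# the test line are illustrative.
import re

p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')
m = p.search('bestAP=0.752, best_C=10, a=-1.53, b=0.27')
print m.group('C'), m.group('a'), m.group('b')  # -> 10 -1.53 0.27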
feat_dim = 1000
scorer = getScorer("AP")
targetConcept = sys.argv[1]  # "aeroplane"

train_feat_file = BigFile(os.path.join(ROOT_PATH, trainCollection, "FeatureData", feature), feat_dim)
test_feat_file = BigFile(os.path.join(ROOT_PATH, testCollection, "FeatureData", feature), feat_dim)
testImageSet = test_feat_file.names  # random.sample(test_feat_file.names, 10000)

minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
with open(minmax_file, 'r') as f:
    min_vals = map(float, str.split(f.readline()))
    max_vals = map(float, str.split(f.readline()))

[names, labels] = readAnnotationsFrom(collection=trainCollection, annotationName=trainAnnotationName,
                                      concept=targetConcept, rootpath=rootpath)
name2label = dict(zip(names, labels))
(renamed, vectors) = train_feat_file.read(names)
relabeled = [name2label[x] for x in renamed]  # label is either 1 or -1

[names, labels] = readAnnotationsFrom(collection=testCollection, annotationName=testAnnotationName,
                                      concept=targetConcept, rootpath=rootpath)
test2label = dict(zip(names, labels))

for beta in [0.5]:  # [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    #model = hiksvm_train(relabeled, vectors, beta=beta)
    cv = 3
    best_beta, cv_score, model = hiksvm_train_cv(relabeled, vectors, cv, scorer, min_vals, max_vals)
    print best_beta, cv_score
    #fikmodel = svm_to_fiksvm([model], [1.0], 1, dim, 50)
    fikmodel = svm_to_fiksvm([model], 1, [1.0], feat_dim=feat_dim, min_vals=min_vals, max_vals=max_vals, num_bins=50)
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    method = options.method
    metric = options.metric

    qrysim = options.qrysim
    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature

    # semantic embedding
    k = options.k
    corpus = options.corpus
    word2vec_model = options.word2vec
    label_source = options.label_source

    # result paths
    ranking_result_path = os.path.join(rootpath, devCollection, 'SimilarityIndex', devCollection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, devCollection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of queries
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)  # (qid, query)
    qid2query = dict(zip(qid_list, [qp.process(query) for query in query_list]))

    # paths of image features
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData', feature)

    # method selection
    if method == 'se':
        se_searcher = SemanticEmbedding(label_source, corpus, word2vec_model, dev_feat_path, rootpath)
    elif method == 't2i':
        nnquery_file = os.path.join(rootpath, devCollection, 'TextData', 'querynn', options.nnqueryfile)
        qryClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.queryclickfile)
        t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path, train_feat_path, ntopqry)
    elif method == 'i2t':
        nnimage_file = os.path.join(rootpath, devCollection, 'TextData', 'imagenn', feature, options.nnimagefile)
        imgClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.imageclickfile)
        i2t_searcher = Image2Text(nnimage_file, imgClick_file, qrysim, ntopimg, ntopqry)
    else:
        print "the method %s is not supported" % method
        sys.exit(0)

    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}
    for query_id in qid_list:
        iid_list, label_list = readAnnotationsFrom(devCollection, 'concepts%s.txt' % devCollection, query_id, False, rootpath)

        if method == 'se':
            scorelist = se_searcher.do_search(qid2query[query_id], iid_list, k)
        elif method == 't2i':
            scorelist = t2i_searcher.text2image(query_id, iid_list, qrythres, mincc)
        elif method == 'i2t':
            scorelist = i2t_searcher.image2text(qid2query[query_id], iid_list, mincc)

        if len(scorelist) == 0:
            failed_count += 1
            scorelist = [0] * len(iid_list)
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            random.shuffle(qid2iid_label_score[query_id])
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            qid2iid_label_score[query_id] = sorted(qid2iid_label_score[query_id], key=lambda v: v[2], reverse=True)

        # calculate the DCG@25 of the ranking produced by our model
        qid2dcg[query_id] = scorer.score([x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if done % 20 == 0:
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed queries: %d" % failed_count
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) / len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
def process(options, trainCollection, annotationfile, feature, modelName):
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  # options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    trainAnnotationNames = [x.strip() for x in open(annotationfile).readlines() if x.strip() and not x.strip().startswith('#')]
    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print '%s does not exist' % conceptfile
            return 0

    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t in range(1, len(trainAnnotationNames)+1):
            names, labels = readAnnotationsFrom(trainCollection, trainAnnotationNames[t-1], concept, skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]
            np = len([1 for lab in labels if 1 == lab])
            nn = len([1 for lab in labels if -1 == lab])
            wp = float(beta) * (np + nn) / np
            wn = (1.0 - beta) * (np + nn) / nn

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                # running average: weight the existing ensemble by (t-1)/t and the new model by 1/t
                assemble_model.add_fastsvm(new_model, 1 - 1.0/t, 1.0/t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %d concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
#tagrelMethod = 'flickr1m/ccgd,knn,1000'
concepts = readConcepts(collection, sourceAnnotationName % 0, rootpath)
holdoutfile = os.path.join(rootpath, collection, "ImageSets", "holdout.txt")
holdoutSet = set(map(str.strip, open(holdoutfile).readlines()))
print ('%s holdout %d' % (collection, len(holdoutSet)))

for concept in concepts:
    simfile = os.path.join(rootpath, collection, 'SimilarityIndex', collection, 'tagged,lemm', tagrelMethod, '%s.txt' % concept)
    searchresults = readRankingResults(simfile)
    searchresults = [x for x in searchresults if x[0] not in holdoutSet]
    positiveSet = [x[0] for x in searchresults[:numPos]]

    for t in range(T):
        newAnnotationName = sourceAnnotationName % t
        newAnnotationName = newAnnotationName.replace('rand%d.0' % numPos, posName)
        names, labels = readAnnotationsFrom(collection, sourceAnnotationName % t, concept, rootpath)
        negativeSet = [x[0] for x in zip(names, labels) if -1 == x[1]]
        renamed = positiveSet + negativeSet
        relabeled = [1] * len(positiveSet) + [-1] * len(negativeSet)
        print ('[%s] %s +%d, -%d -> %s' % (concept, sourceAnnotationName % t, len(positiveSet), len(negativeSet), newAnnotationName))
        writeAnnotationsTo(renamed, relabeled, collection, newAnnotationName, concept, rootpath)

for t in range(T):
    newAnnotationName = sourceAnnotationName % t
    newAnnotationName = newAnnotationName.replace('rand%d.0' % numPos, posName)
    writeConceptsTo(concepts, collection, newAnnotationName, rootpath)
concepts = readConcepts(testCollection, 'conceptsvoc2008val.txt')
scorer = getScorer('AP')
min_vals, max_vals = find_min_max_vals(BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature), FEATURE_TO_DIM[feature]))
featurefile = os.path.join(rootpath, testCollection, "FeatureData", feature, "id.feature.txt")
feat_dim = 1024
num_bins = 50

#fikmodel.set_probAB(-1, 0)
#print "fik model0", fikmodel0.get_nr_svs(), fikmodel0.get_feat_dim(), fikmodel0.get_probAB()
#print "fik model", fikmodel.get_nr_svs(), fikmodel.get_feat_dim(), fikmodel.get_probAB()

mAP = [0] * 4
for concept in concepts:
    names, labels = readAnnotationsFrom(testCollection, 'conceptsvoc2008val.txt', concept)
    name2label = dict(zip(names, labels))
    ranklist = []
    modelfile = os.path.join(rootpath, trainCollection, "Models", annotationName, feature, 'hiksvm', "%s.model" % concept)
    #print modelfile
    model = svm_load_model(modelfile)
    #print model.get_svm_type()
    #print model.get_nr_class()
    svm_models = [model, model]
    num_models = len(svm_models)
    fikmodel0 = svm_to_fiksvm0(svm_models, [1.0/num_models]*num_models, num_models, feat_dim, num_bins)
    fikmodel1 = svm_to_fiksvm(svm_models, num_models, [1.0/num_models]*num_models, feat_dim, min_vals, max_vals, num_bins)
    fikmodel2 = svm_to_fiksvm(svm_models, num_models, [1.0/num_models]*num_models, feat_dim, min_vals, max_vals, num_bins)
    fikmodel2.add_new_fikmodel(fikmodel1, 0.5)
    print concept, fikmodel1.get_nr_svs(), fikmodel1.get_nr_svs() + fikmodel1.get_nr_svs()/2,
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    metric = options.metric

    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature

    # result paths
    ranking_result_path = os.path.join(rootpath, devCollection, 'SimilarityIndex', devCollection, 'MetaData', 'text2image', feature)
    DCG_result_path = os.path.join(rootpath, devCollection, metric, 'text2image', feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of queries
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)  # (qid, query)
    qid2query = dict(zip(qid_list, [qp.process(query) for query in query_list]))

    # random performance for specific queries
    qid_randomperf_file = os.path.join(rootpath, devCollection, 'Annotations', '*****@*****.**')
    qid2randomperf = {}
    for line in open(qid_randomperf_file):
        qid, random_perf = line.strip().split()
        qid2randomperf[qid] = float(random_perf)

    # paths of image features
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData', feature)

    nnquery_file = os.path.join(rootpath, devCollection, 'TextData', 'querynn', options.nnqueryfile)
    qryClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.queryclickfile)
    t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path, train_feat_path, ntopqry)

    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}
    for query_id in qid_list:
        iid_list, label_list = readAnnotationsFrom(devCollection, 'concepts%s.txt' % devCollection, query_id, False, rootpath)

        scorelist = t2i_searcher.doSearch(query_id, iid_list, ntopimg, qrythres, mincc)

        if len(scorelist) == 0:
            failed_count += 1
            qid2dcg[query_id] = qid2randomperf[query_id]
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            qid2iid_label_score[query_id] = sorted(qid2iid_label_score[query_id], key=lambda v: v[2], reverse=True)
            # calculate the DCG@25 of the ranking produced by our model
            qid2dcg[query_id] = scorer.score([x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if done % 20 == 0:
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed queries: %d" % failed_count
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) / len(qid2dcg.values()))
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # result paths
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex', collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of queries
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query = dict(zip(qid_list, query_list))

    # input of image features
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the scorer to calculate DCG@25
    scorer = getScorer("DCG@25")

    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}
    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(collection, 'concepts%s.txt' % collection, qid, False, rootpath)

        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(calParzen(img_feats.read_one(imidx), test_X, sigma))

        # parzen_list_suffle = calParzen_fast(test_X, len(renamed), sigma)
        # parzen_list = []
        # for imidx in iid_list:
        #     parzen_list.append(parzen_list_suffle[renamed.index(imidx)])

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list), key=lambda v: v[2], reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) / len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
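
# calParzen is used above as a Parzen-window density score of each image against
# the full candidate set. A minimal sketch of such a scorer, assuming a Gaussian
# kernel with bandwidth sigma; this is a hypothetical stand-in, the real
# calParzen may use a different kernel.
import numpy as np

def cal_parzen(x, X, sigma):
    """Parzen-window density estimate of vector x given sample matrix X (sketch)."""
    X = np.asarray(X, dtype=np.float64)
    x = np.asarray(x, dtype=np.float64)
    sq_dists = ((X - x) ** 2).sum(axis=1)      # squared distance to every sample
    return np.exp(-sq_dists / (2.0 * sigma ** 2)).mean()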
def process(options, collection, annotationName, runfile, outDirectory):
    rootpath = options.rootpath

    apscorer = getScorer("AP")
    ndcg = getScorer("NDCG@20")
    ndcg2 = getScorer("NDCG2@20")
    p1scorer = getScorer("P@1")
    p5scorer = getScorer("P@5")

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith("#")]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, "read annotations from files")

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        label_file = os.path.join(rootpath, collection, "tagged,lemm", "%s.txt" % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, "readLabeledImageSet for %s-%s -> %d hits" % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print "#" * 100
    print "# method miap hit1 hit5"
    print "#" * 100

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], "rb"))
        scores = data["scores"]
        assert scores.shape[1] == nr_of_concepts
        imset = data["id_images"]
        imset = np.array([int(x) for x in imset])
        idx = np.argsort(imset)
        imset = imset[idx]
        scores = scores[idx]
        nr_of_images = len(imset)
        # print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert len(sorted_labels) > 0
            # print concepts[c_idx], ranklist[:5], sorted_labels[:5]
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        res = np.zeros((nr_of_images, 4))
        gt = np.zeros((nr_of_images, nr_of_concepts))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            # print rel_set
            # print sorted_labels
            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j, :] = [ap, hit1, hit5, len(rel_set)]
            gt[j, :] = sorted_labels

        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], " ".join(["%.3f" % x for x in avg_perf])

        outMiap = h5py.File(os.path.join(outDirectory, os.path.split(datafiles[run_idx])[-1] + ".h5"), "w")
        outMiap["iap"] = res[:, 0]
        outMiap["ngt"] = res[:, 3]
        outMiap["hit1"] = res[:, 1]
        outMiap["hit5"] = res[:, 2]
        outMiap["gt"] = gt
        outMiap["concepts"] = concepts
        outMiap["ap"] = ap_table[run_idx, :]
        outMiap["ap2"] = ap2_table[run_idx, :]
        outMiap[ndcg.name()] = ndcg_table[run_idx, :]
        outMiap[ndcg2.name()] = ndcg2_table[run_idx, :]
        outMiap.close()

    print "#" * 100
    print "# untagged-concept", " ".join([os.path.split(x)[-1] for x in datafiles])
    print "#" * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ap_table[:, c_idx]])
    print "meanAP", " ".join(["%.3f" % x for x in ap_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ap2_table[:, c_idx]])
    print "meanAP2", " ".join(["%.3f" % x for x in ap2_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ndcg_table[:, c_idx]])
    print "mean%s" % ndcg.name(), " ".join(["%.3f" % x for x in ndcg_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ndcg2_table[:, c_idx]])
    print "mean%s" % ndcg2.name(), " ".join(["%.3f" % x for x in ndcg2_table.mean(axis=1)])
def learn(concept, params):
    rootpath = params['rootpath']
    trainCollection = params['trainCollection']
    baseAnnotationName = params['baseAnnotationName']
    startAnnotationName = params['startAnnotationName']
    strategy = params['strategy']
    feature = params['feature']
    feat_file = params['feat_file']
    feat_dim = feat_file.ndims
    npr = params['npr']
    iterations = params['iterations']
    beta = 0.5

    names, labels = readAnnotationsFrom(trainCollection, startAnnotationName, concept, skip_0=True, rootpath=rootpath)
    positive_bag = [x[0] for x in zip(names, labels) if x[1] > 0]
    negative_bag = [x[0] for x in zip(names, labels) if x[1] < 0]

    names, labels = readAnnotationsFrom(trainCollection, baseAnnotationName, concept, skip_0=True, rootpath=rootpath)
    negative_pool = [x[0] for x in zip(names, labels) if x[1] < 0]

    Usize = max(5000, len(positive_bag) * npr)
    Usize = min(10000, Usize)
    Usize = min(Usize, len(negative_pool))

    new_model = None
    for t in range(1, iterations + 1):
        printStatus(INFO, 'iter %d (%s)' % (t, concept))
        if t > 1:
            # select relevant negative examples:
            # first check how well the current model classifies the positive training examples
            results = classify_large_data(assemble_model, positive_bag, feat_file)
            pos_error_rate = len([1 for x in results if x[1] < 0]) / float(len(results))

            U = random.sample(negative_pool, Usize)
            predictions = classify_large_data(assemble_model, U, feat_file)
            neg_error_rate = len([1 for x in predictions if x[1] > 0]) / float(len(predictions))

            error_rate = (pos_error_rate + neg_error_rate) / 2.0
            printStatus(INFO, 'iter %d: %s %.3f -> %s %.3f, pe=%.3f, ne=%.3f, error=%.3f' % (
                t, predictions[-1][0], predictions[-1][1], predictions[0][0], predictions[0][1],
                pos_error_rate, neg_error_rate, error_rate))

            if error_rate < MIN_ERROR_RATE:
                printStatus(INFO, 'hit stop criterion: error (%.3f) < MIN_ERROR_RATE (%.3f)' % (error_rate, MIN_ERROR_RATE))
                break

            # assume that 1% of the randomly sampled set is truly positive and that
            # the classifier ranks them at the top, so ignore them
            nr_of_estimated_pos = int(len(predictions) * 0.01)
            negative_bag = NegativeBootstrap.sampling(predictions[nr_of_estimated_pos:], strategy, max(1000, len(positive_bag)))

        new_names = positive_bag + negative_bag
        new_labels = [1] * len(positive_bag) + [-1] * len(negative_bag)
        name2label = dict(zip(new_names, new_labels))
        renamed, vectors = feat_file.read(new_names)
        Ys = [name2label[x] for x in renamed]
        np = len([1 for y in Ys if y > 0])
        nn = len([1 for y in Ys if y < 0])
        assert(len(positive_bag) == np)
        assert(len(negative_bag) == nn)

        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn
        C = 1
        svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
        if 'fik' == params['model']:
            svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
        else:
            svm_params += ' -s 2'

        g_t = train_model(Ys, vectors, svm_params + ' -q')
        if t == 1:
            assemble_model = compress_model([g_t], [1.0], feat_dim, params)
        else:
            new_model = compress_model([g_t], [1.0], feat_dim, params)
            # running average: weight the existing ensemble by (t-1)/t and the new model by 1/t
            assemble_model.add_fastsvm(new_model, 1 - 1.0/t, 1.0/t)

    return assemble_model
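
# The add_fastsvm weights above implement a running uniform average: after
# iteration t the ensemble equals (1/t) * (g_1 + ... + g_t). A quick numeric
# check of the recurrence, with plain floats standing in for the models:
gs = [3.0, 5.0, 10.0]
ensemble = gs[0]
for t in range(2, len(gs) + 1):
    ensemble = ensemble * (1 - 1.0 / t) + gs[t - 1] * (1.0 / t)
print ensemble, sum(gs) / len(gs)  # both print 6.0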
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection)
    apscorer = getScorer('AP')

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt' % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))

    for run_idx in range(nr_of_runs):
        runName = os.path.splitext(os.path.basename(datafiles[run_idx]))[0]
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            resfile = os.path.join(resultdir, runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(ranklist, resfile)

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels) > 0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            resfile = os.path.join(resultdir, 'tagged,lemm', runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults([x for x in ranklist if x[0] in hit_imgset[c_idx]], resfile)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)

    print '#' * 100
    print '# untagged-concept', ' '.join([os.path.basename(x) for x in datafiles])
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:, c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:, c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])
trainAnnotationName, feature, modelName))

results = []
for concept in concepts:
    model_file_name = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName, '%s.model' % concept)
    model = load_model(model_file_name)

    ranklist = [(test_renamed[i], model.predict(test_vectors[i])) for i in range(len(test_renamed))]
    ranklist.sort(key=lambda v: v[1], reverse=True)

    names, labels = readAnnotationsFrom(testCollection, testAnnotationName, concept, skip_0=True, rootpath=rootpath)
    test_name2label = dict(zip(names, labels))
    sorted_labels = [test_name2label[x[0]] for x in ranklist if x[0] in test_name2label]
    perf = scorer.score(sorted_labels)
    print('%s %g' % (concept, perf))
    results.append((concept, perf))

mean_perf = sum([x[1] for x in results]) / len(concepts)
print('mean%s %g' % (metric, mean_perf))
newAnnotationTemplate = annotationName[:-4] + '.' + posName + str(nr_pos) + ('.random%d' % nr_neg) + '.%d.txt'

concepts = readConcepts(collection, annotationName, rootpath)
simdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection, rankMethod)

scriptfile = os.path.join(rootpath, collection, 'annotationfiles',
                          annotationName[:-4] + '.' + posName + str(nr_pos) + ('.random%d' % nr_neg) + '.0-%d.txt' % (nr_neg_bags-1))
makedirsforfile(scriptfile)
fout = open(scriptfile, 'w')
fout.write('\n'.join([newAnnotationTemplate % t for t in range(nr_neg_bags)]) + '\n')
fout.close()

for concept in concepts:
    simfile = os.path.join(simdir, '%s.txt' % concept)
    ranklist = readRankingResults(simfile)
    pos_bag = [x[0] for x in ranklist[:nr_pos]]

    names, labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
    negativePool = [x[0] for x in zip(names, labels) if x[1] < 0]

    for t in range(nr_neg_bags):
        newAnnotationName = newAnnotationTemplate % t
        resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image', newAnnotationName, '%s.txt' % concept)
        if checkToSkip(resultfile, overwrite):
            continue
        true_nr_neg = max(500, len(pos_bag) * neg_pos_ratio)
        neg_bag = random.sample(negativePool, true_nr_neg)
        assert(len(set(pos_bag).intersection(set(neg_bag))) == 0)
        printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept, t, len(pos_bag), len(neg_bag), resultfile))
        writeAnnotations(pos_bag + neg_bag, [1]*len(pos_bag) + [-1]*len(neg_bag), resultfile)

for t in range(nr_neg_bags):
    newAnnotationName = newAnnotationTemplate % t
def process(options, trainCollection, trainAnnotationName, feature):
    import re
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    best_param_dir = options.best_param_dir
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    beta = 0.5

    modelName = 'fastlinear'
    if best_param_dir:
        modelName += '-tuned'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job-1)]
    if not todo:
        return 0
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))

    feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            m = p.search(open(param_file).readline().strip())
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        # no bias term added, by setting "-B -1"
        svm_params = '-w1 %g -w-1 %g -s 2 -B -1 -q' % (wp*C, wn*C)
        model = liblinear_train(y, vectors, svm_params)
        newmodel = liblinear_to_fastlinear([model], [1.0], feat_file.ndims)
        newmodel.set_probAB(A, B)
        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fastlinear_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        fastlinear_load_model(model_file_name)
        assert(abs(newmodel.get_probAB()[0] - A) < 1e-6)
        assert(abs(newmodel.get_probAB()[1] - B) < 1e-6)

    return len(todo)
if __name__ == '__main__':
    args = sys.argv[1:]
    rootpath = '/var/scratch2/xirong/VisualSearch'
    srcCollection = args[0]
    annotationName = args[1]
    dstCollection = args[2]
    overwrite = 0

    concepts = readConcepts(srcCollection, annotationName, rootpath)
    todo = []
    for concept in concepts:
        resfile = os.path.join(rootpath, dstCollection, 'Annotations', 'Image', annotationName, '%s.txt' % concept)
        if checkToSkip(resfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        print('nothing to do')
        sys.exit(0)

    imset = set(readImageSet(dstCollection, dstCollection, rootpath))

    for concept in todo:
        names, labels = readAnnotationsFrom(srcCollection, annotationName, concept, rootpath=rootpath)
        selected = [x for x in zip(names, labels) if x[0] in imset]
        print concept, len(selected)
        writeAnnotationsTo([x[0] for x in selected], [x[1] for x in selected], dstCollection, annotationName, concept, rootpath=rootpath)

    writeConceptsTo(concepts, dstCollection, annotationName, rootpath)
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath

    # note: despite the *1/*3 variable names, the cutoffs actually used are 3 and 5
    p1_scorer = getScorer('P@3')
    p3_scorer = getScorer('P@5')
    r1_scorer = getScorer('R@3')
    r3_scorer = getScorer('R@5')
    ndcg1_scorer = getScorer('NDCG2@3')
    ndcg3_scorer = getScorer('NDCG2@5')
    ap_scorer = getScorer('AP')
    rr_scorer = getScorer('RR')

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    name2label = [{} for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        #names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)
        # rel_conset entries look like ('7975436322', set([33]))

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        res = np.zeros((nr_of_images, 8))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            assert len(sorted_labels) == nr_of_concepts

            p1 = p1_scorer.score(sorted_labels)
            p3 = p3_scorer.score(sorted_labels)
            r1 = r1_scorer.score(sorted_labels)
            r3 = r3_scorer.score(sorted_labels)
            ndcg1 = ndcg1_scorer.score(sorted_labels)
            ndcg3 = ndcg3_scorer.score(sorted_labels)
            ap = ap_scorer.score(sorted_labels)
            rr = rr_scorer.score(sorted_labels)

            f1, f3 = 0.0, 0.0
            if (p1 + r1) != 0.0:
                f1 = 2 * p1 * r1 / (p1 + r1)
            if (p3 + r3) != 0.0:
                f3 = 2 * p3 * r3 / (p3 + r3)
            # h1, h3 = max(p1, r1), max(p3, r3)

            # the F-score variant is the one in effect; the alternatives are kept for reference
            # res[j, :] = [p1, p3, r1, r3, ndcg1, ndcg3, ap, rr]
            res[j, :] = [p1, p3, f1, f3, ndcg1, ndcg3, ap, rr]
            # res[j, :] = [p1, p3, h1, h3, ndcg1, ndcg3, ap, rr]

        avg_perf = res.mean(axis=0)
        name = path.basename(datafiles[run_idx]).split('.')[0]
        name = name.split(',')[1]
        stdout.write('%s\t' % name)
        for i in range(len(avg_perf)):
            if i == 4 or i == 5:  # skip the NDCG columns in the printed table
                continue
            x = avg_perf[i]
            if x >= 100.0:
                stdout.write('& %.1f ' % x)
            else:
                stdout.write('& %s' % (('%.4f ' % x).lstrip('0')))
        stdout.write('\n')
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    method = options.method
    metric = options.metric

    qrysim = options.qrysim
    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature

    # semantic embedding
    k = options.k
    corpus = options.corpus
    word2vec_model = options.word2vec
    label_source = options.label_source

    # result paths
    ranking_result_path = os.path.join(rootpath, devCollection, 'SimilarityIndex', devCollection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, devCollection, metric, method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of queries
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    # qid -> parsed query
    qid2query = dict(zip(qid_list, [qp.process(query) for query in query_list]))

    # paths of image features
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData', feature)

    # method selection
    if method == 'conse':
        se_searcher = ConSE(label_source, corpus, word2vec_model, dev_feat_path, rootpath)
    elif method == 't2i' or method == 'ta':
        nnquery_file = os.path.join(rootpath, devCollection, 'TextData', 'querynn', options.nnqueryfile)
        qryClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.queryclickfile)
        t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path, train_feat_path, ntopqry)
    elif method == 'i2t' or method == 'ia':
        nnimage_file = os.path.join(rootpath, devCollection, 'TextData', 'imagenn', feature, options.nnimagefile)
        imgClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.imageclickfile)
        i2t_searcher = Image2Text(nnimage_file, imgClick_file, qrysim, ntopimg, ntopqry)
    else:
        print "method %s is not supported" % method
        sys.exit(0)

    # the scorer used for the ranking metric (DCG@25 by default)
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}
    for query_id in qid_list:
        iid_list, label_list = readAnnotationsFrom(devCollection, 'concepts%s.txt' % devCollection, query_id, False, rootpath)

        if method == 'conse':
            scorelist = se_searcher.do_search(qid2query[query_id], iid_list, k)
        elif method == 't2i':
            scorelist = t2i_searcher.text2image(query_id, iid_list, qrythres, mincc)
        elif method == 'ta':
            scorelist = t2i_searcher.textAnnotation(query_id, iid_list, ntopimg, qrythres, mincc)
        elif method == 'i2t':
            scorelist = i2t_searcher.image2text(qid2query[query_id], iid_list, mincc)
        elif method == 'ia':
            scorelist = i2t_searcher.imageAnnotation(qid2query[query_id], iid_list, mincc)

        if len(scorelist) == 0:
            # the searcher returned nothing: fall back to a random ranking
            failed_count += 1
            scorelist = [0] * len(iid_list)
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            random.shuffle(qid2iid_label_score[query_id])
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            qid2iid_label_score[query_id] = sorted(qid2iid_label_score[query_id], key=lambda v: v[2], reverse=True)

        # score the ranking produced by the model
        qid2dcg[query_id] = scorer.score([x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if done % 20 == 0:
            # flush intermediate results to disk
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed queries: %d" % failed_count
    print "average %s: %f" % (metric, 1.0 * sum(qid2dcg.values()) / len(qid2dcg))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # result paths
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex', collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of queries
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query = dict(zip(qid_list, query_list))

    # input of images
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the scorer that calculates DCG@25
    scorer = getScorer("DCG@25")

    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}
    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(collection, 'concepts%s.txt' % collection, qid, False, rootpath)
        renamed, test_X = img_feats.read(iid_list)

        # rank the candidate images of this query by their Parzen-window score
        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(calParzen(img_feats.read_one(imidx), test_X, sigma))

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list), key=lambda v: v[2], reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) / len(qid2dcg))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
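calParzen is not defined in this snippet. A minimal sketch consistent with how it is called above, assuming a Gaussian Parzen-window estimate (the implementation below is an assumption, not the repository's actual code):

def calParzen(feat, test_X, sigma):
    """Hypothetical Parzen-window score: mean Gaussian kernel between one
    feature vector and every vector of the candidate set."""
    import numpy as np
    x = np.asarray(feat, dtype=np.float64)        # one feature vector, shape (d,)
    X = np.asarray(test_X, dtype=np.float64)      # candidate vectors, shape (n, d)
    sq_dist = ((X - x) ** 2).sum(axis=1)          # squared Euclidean distances
    return np.exp(-sq_dist / (2.0 * sigma ** 2)).mean()

Under this definition each image's score is its estimated density among the query's candidate images, so images sitting in dense clusters of visually similar candidates rank high.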
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection)

    apscorer = getScorer('AP')
    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')
    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt' % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))

    for run_idx in range(nr_of_runs):
        runName = os.path.splitext(os.path.basename(datafiles[run_idx]))[0]
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            resfile = os.path.join(resultdir, runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(ranklist, resfile)
            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels) > 0)
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            resfile = os.path.join(resultdir, 'tagged,lemm', runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults([x for x in ranklist if x[0] in hit_imgset[c_idx]], resfile)
            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)

    print '#' * 100
    print '# untagged-concept', ' '.join([os.path.basename(x) for x in datafiles])
    print '#' * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:, c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:, c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])
def process(options, trainCollection, annotationfile, feature, modelName):
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    trainAnnotationNames = [x.strip() for x in open(annotationfile).readlines() if x.strip() and not x.strip().startswith('#')]
    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print '%s does not exist' % conceptfile
            return 0

    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        # train one SVM per annotation bag, then fold it into a running average
        for t in range(1, len(trainAnnotationNames) + 1):
            names, labels = readAnnotationsFrom(trainCollection, trainAnnotationNames[t-1], concept, skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]

            # cost-sensitive class weights, computed on the examples actually
            # read from the feature file
            np = len([1 for y in Ys if 1 == y])
            nn = len([1 for y in Ys if -1 == y])
            wp = float(beta) * (np + nn) / np
            wn = (1.0 - beta) * (np + nn) / nn

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')

            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1 - 1.0/t, 1.0/t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %d concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
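The two weights passed to add_fastsvm implement an incremental average of the per-bag models, so every bag ends up contributing equally. Unrolling the recursion:

    M_t = (1 - 1/t) * M_{t-1} + (1/t) * g_t    =>    M_t = (g_1 + g_2 + ... + g_t) / t

This is why the final assembled model does not depend on keeping all the individual SVMs in memory: each new bag only needs one weighted merge into the accumulator.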
def process(options, collection, annotationName, runfile, outDirectory):
    rootpath = options.rootpath

    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')
    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt' % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print '#' * 100
    print '# method miap hit1 hit5'
    print '#' * 100

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']

        # sort images by id so that rows align across runs
        imset = np.array([int(x) for x in imset])
        idx = np.argsort(imset)
        imset = imset[idx]
        scores = scores[idx]
        nr_of_images = len(imset)

        # concept-oriented evaluation
        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels) > 0)
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        # image-oriented evaluation
        res = np.zeros((nr_of_images, 4))
        gt = np.zeros((nr_of_images, nr_of_concepts))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1  # i.e. at least one hit in the top 5
            res[j, :] = [ap, hit1, hit5, len(rel_set)]
            gt[j, :] = sorted_labels

        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], ' '.join(['%.3f' % x for x in avg_perf])

        outMiap = h5py.File(os.path.join(outDirectory, os.path.split(datafiles[run_idx])[-1] + ".h5"), 'w')
        outMiap['iap'] = res[:, 0]
        outMiap['ngt'] = res[:, 3]
        outMiap['hit1'] = res[:, 1]
        outMiap['hit5'] = res[:, 2]
        outMiap['gt'] = gt
        outMiap['concepts'] = concepts
        outMiap['ap'] = ap_table[run_idx, :]
        outMiap['ap2'] = ap2_table[run_idx, :]
        outMiap[ndcg.name()] = ndcg_table[run_idx, :]
        outMiap[ndcg2.name()] = ndcg2_table[run_idx, :]
        outMiap.close()

    print '#' * 100
    print '# untagged-concept', ' '.join([os.path.split(x)[-1] for x in datafiles])
    print '#' * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:, c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept AP'
    print '#' * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:, c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept %s' % ndcg.name()
    print '#' * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg_table[:, c_idx]])
    print 'mean%s' % ndcg.name(), ' '.join(['%.3f' % x for x in ndcg_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept %s' % ndcg2.name()
    print '#' * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg2_table[:, c_idx]])
    print 'mean%s' % ndcg2.name(), ' '.join(['%.3f' % x for x in ndcg2_table.mean(axis=1)])
    nr_of_images = len(_renamed)
    nr_of_concepts = len(concepts)

    mAP = 0.0
    models = [None] * len(concepts)

    stream = StreamFile(feat_dir)

    for i, concept in enumerate(concepts):
        model_file_name = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName, '%s.model' % concept)
        model = load_model(model_file_name)
        #print model.get_probAB()
        models[i] = model

        names, labels = readAnnotationsFrom(testCollection, testAnnotationName, concept, rootpath=rootpath)
        name2label = dict(zip(names, labels))

        # three ways of scoring the same test set: in-memory features,
        # streamed features, and probabilistic output
        ranklist1 = [(_id, model.predict(_vec)) for _id, _vec in zip(_renamed, _vectors)]

        stream.open()
        ranklist2 = [(_id, model.predict(_vec)) for _id, _vec in stream]
        stream.close()

        ranklist3 = [(_id, model.predict_probability(_vec)) for _id, _vec in zip(_renamed, _vectors)]

        print concept,
        for ranklist in [ranklist1, ranklist2, ranklist3]:
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [name2label[x[0]] for x in ranklist if x[0] in name2label]
            # (the snippet breaks off here in the source; presumably each
            # sorted_labels list is scored, e.g. with an AP scorer, and the
            # per-concept scores are accumulated into mAP)
def learn(concept, params):
    rootpath = params['rootpath']
    trainCollection = params['trainCollection']
    baseAnnotationName = params['baseAnnotationName']
    startAnnotationName = params['startAnnotationName']
    strategy = params['strategy']
    feature = params['feature']
    feat_file = params['feat_file']
    feat_dim = feat_file.ndims
    npr = params['npr']
    iterations = params['iterations']
    beta = 0.5

    names, labels = readAnnotationsFrom(trainCollection, startAnnotationName, concept, skip_0=True, rootpath=rootpath)
    positive_bag = [x[0] for x in zip(names, labels) if x[1] > 0]
    negative_bag = [x[0] for x in zip(names, labels) if x[1] < 0]

    names, labels = readAnnotationsFrom(trainCollection, baseAnnotationName, concept, skip_0=True, rootpath=rootpath)
    negative_pool = [x[0] for x in zip(names, labels) if x[1] < 0]

    Usize = max(5000, len(positive_bag) * npr)
    Usize = min(10000, Usize)
    Usize = min(Usize, len(negative_pool))

    new_model = None
    for t in range(1, iterations + 1):
        printStatus(INFO, 'iter %d (%s)' % (t, concept))
        if t > 1:
            # select relevant negative examples:
            # first check how well the current model classifies the positive training examples
            results = classify_large_data(assemble_model, positive_bag, feat_file)
            pos_error_rate = len([1 for x in results if x[1] < 0]) / float(len(results))

            U = random.sample(negative_pool, Usize)
            predictions = classify_large_data(assemble_model, U, feat_file)
            neg_error_rate = len([1 for x in predictions if x[1] > 0]) / float(len(predictions))

            error_rate = (pos_error_rate + neg_error_rate) / 2.0
            printStatus(INFO, 'iter %d: %s %.3f -> %s %.3f, pe=%.3f, ne=%.3f, error=%.3f' % (t, predictions[-1][0], predictions[-1][1], predictions[0][0], predictions[0][1], pos_error_rate, neg_error_rate, error_rate))
            if error_rate < MIN_ERROR_RATE:
                printStatus(INFO, 'hit stop criterion: error (%.3f) < MIN_ERROR_RATE (%.3f)' % (error_rate, MIN_ERROR_RATE))
                break

            # assume that 1% of the randomly sampled set is truly positive and
            # that the classifier ranks them at the top, so ignore them
            nr_of_estimated_pos = int(len(predictions) * 0.01)
            negative_bag = NegativeBootstrap.sampling(predictions[nr_of_estimated_pos:], strategy, max(1000, len(positive_bag)))

        new_names = positive_bag + negative_bag
        new_labels = [1] * len(positive_bag) + [-1] * len(negative_bag)
        name2label = dict(zip(new_names, new_labels))
        renamed, vectors = feat_file.read(new_names)
        Ys = [name2label[x] for x in renamed]
        np = len([1 for y in Ys if y > 0])
        nn = len([1 for y in Ys if y < 0])
        assert(len(positive_bag) == np)
        assert(len(negative_bag) == nn)

        # cost-sensitive class weights
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn
        C = 1
        svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
        if 'fik' == params['model']:
            svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
        else:
            svm_params += ' -s 2'
        g_t = train_model(Ys, vectors, svm_params + ' -q')

        if t == 1:
            assemble_model = compress_model([g_t], [1.0], feat_dim, params)
        else:
            # incremental average, as in the bagged training above
            new_model = compress_model([g_t], [1.0], feat_dim, params)
            assemble_model.add_fastsvm(new_model, 1 - 1.0/t, 1.0/t)

    return assemble_model
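For reference, a hypothetical driver showing how learn() might be invoked. The dict keys are exactly the ones the function reads; all values (collection names, annotation files, feature, ratios) are placeholders, not settings from this repository:

# hypothetical invocation of learn(); every value below is a placeholder
feat_dir = os.path.join(rootpath, 'train10k', 'FeatureData', 'color64')
params = {
    'rootpath': rootpath,
    'trainCollection': 'train10k',
    'baseAnnotationName': 'concepts.txt',        # pool the negatives are drawn from
    'startAnnotationName': 'concepts.random500.0.npr10.0.txt',  # initial bag
    'strategy': 'selective',                     # forwarded to NegativeBootstrap.sampling
    'feature': 'color64',
    'feat_file': BigFile(feat_dir),
    'npr': 10,                                   # negative/positive ratio for the sampling pool
    'iterations': 10,
    'model': 'fastlinear',                       # 'fik' would additionally need nr_bins/min_vals/max_vals
}
model = learn('airplane', params)

Note that choosing 'fik' as the model requires the extra params keys set up by the bagged-training script above (nr_bins, min_vals, max_vals), since compress_model consults them when quantizing the histogram-intersection SVM.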
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath

    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')
    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt' % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print '#' * 100
    print '# method miap hit1 hit5'
    print '#' * 100

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)

        # concept-oriented evaluation
        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels) > 0)
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        # image-oriented evaluation
        res = np.zeros((nr_of_images, 3))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1  # i.e. at least one hit in the top 5
            res[j, :] = [ap, hit1, hit5]

        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], ' '.join(['%.3f' % x for x in avg_perf])

    print '#' * 100
    print '# untagged-concept', ' '.join([os.path.split(x)[-1] for x in datafiles])
    print '#' * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:, c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept AP'
    print '#' * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:, c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept %s' % ndcg.name()
    print '#' * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg_table[:, c_idx]])
    print 'mean%s' % ndcg.name(), ' '.join(['%.3f' % x for x in ndcg_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept %s' % ndcg2.name()
    print '#' * 100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg2_table[:, c_idx]])
    print 'mean%s' % ndcg2.name(), ' '.join(['%.3f' % x for x in ndcg2_table.mean(axis=1)])