def evaluateSearchEngines(searchers, collection, annotationName, metric, rootpath=ROOT_PATH):
    scorer = getScorer(metric)
    concepts = readConcepts(collection, annotationName, rootpath)
    nr_of_runs = len(searchers)
    nr_of_concepts = len(concepts)
    results = np.zeros((nr_of_concepts, nr_of_runs))

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], rootpath)
        name2label = dict(zip(names, labels))
        for j in range(nr_of_runs):
            searchresults = searchers[j].scoreCollection(concepts[i])
            sorted_labels = [name2label[name] for (name, score) in searchresults if name in name2label]
            results[i, j] = scorer.score(sorted_labels)

    for i in range(nr_of_concepts):
        print concepts[i], ' '.join([niceNumber(x, 3) for x in results[i, :]])
    mean_perf = results.mean(0)
    print 'mean%s' % metric, ' '.join([niceNumber(x, 3) for x in mean_perf])

    return concepts, results
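# The scripts in this file obtain their metric objects from basic.metric.getScorer and only
# call scorer.score(sorted_labels) on a ranked list of relevance labels. A minimal sketch of
# that assumed interface for AP and DCG@k is given below; the real basic.metric implementation
# may differ in details (label grading, normalization), so treat this as illustration only.
import math

class APScorer:
    """Average precision over a ranked list of binary labels (assumed interface)."""
    def score(self, sorted_labels):
        hits, sum_prec = 0, 0.0
        for rank, lab in enumerate(sorted_labels, start=1):
            if lab > 0:
                hits += 1
                sum_prec += float(hits) / rank
        return sum_prec / hits if hits else 0.0

class DCGScorer:
    """DCG@k with the (2^rel - 1) gain, matching the DCG@25 usage above (assumed interface)."""
    def __init__(self, k=25):
        self.k = k
    def score(self, sorted_labels):
        return sum((2 ** lab - 1) / math.log(pos + 2, 2)
                   for pos, lab in enumerate(sorted_labels[:self.k]))

# usage mirroring the evaluation loops above:
# APScorer().score([1, 0, 1, 0]) -> 0.8333...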
def eval(data, weights):
    scorer = getScorer('DCG@25')
    feat_dim = data[0][-1].shape[1]
    nr_of_qry = len(data)
    mean_perf = [0] * feat_dim
    avg_perf = 0

    for i in range(nr_of_qry):
        qid, labels, scores = data[i]
        fusion = scores.dot(weights)
        sorted_idx = (0 - fusion).argsort()
        sorted_labels = [labels[x] for x in sorted_idx]
        avg_perf += scorer.score(sorted_labels)

    return avg_perf / nr_of_qry
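# The eval helper above performs weighted late fusion: each query tuple carries a
# (nr_of_images, feat_dim) score matrix, and `weights` mixes the per-feature columns before
# ranking. A small illustrative call with made-up numbers, assuming getScorer('DCG@25') from
# basic.metric is importable in this environment:
import numpy as np

toy_labels = [3, 0, 1, 0]                  # graded relevance of 4 images
toy_scores = np.array([[0.9, 0.2],         # two score columns, e.g. two features
                       [0.1, 0.8],
                       [0.5, 0.5],
                       [0.2, 0.1]])
toy_data = [('q1', toy_labels, toy_scores)]

dcg_fused = eval(toy_data, [0.5, 0.5])     # DCG@25 of the average-fusion ranking
dcg_first = eval(toy_data, [1.0, 0.0])     # rank by the first score column only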
def i2t_map(c2i, n_caption=5):
    """
    Images->Text (Text Search)
    c2i: (5N, N) matrix of caption to image errors
    """
    scorer = getScorer('AP')
    perf_list = []
    for i in range(c2i.shape[1]):
        d_i = c2i[:, i]
        labels = [0] * len(d_i)
        labels[i * n_caption:(i + 1) * n_caption] = [1] * n_caption
        sorted_labels = [labels[x] for x in np.argsort(d_i)]
        current_score = scorer.score(sorted_labels)
        perf_list.append(current_score)
    return np.mean(perf_list)
def t2v_map(c2i, t2v_gts):
    """
    Text->Videos (Text-to-Video Retrieval)
    c2i: (5N, N) matrix of caption to video errors
    """
    scorer = getScorer('AP')
    perf_list = []
    for i in range(c2i.shape[0]):
        d_i = c2i[i, :]
        labels = [0] * len(d_i)
        x = t2v_gts[i][0]
        labels[x] = 1
        sorted_labels = [labels[x] for x in np.argsort(d_i)]
        current_score = scorer.score(sorted_labels)
        perf_list.append(current_score)
    return np.mean(perf_list)
def t2i_map(c2i, n_caption=5):
    """
    Text->Videos (Text-to-Video Retrieval)
    c2i: (5N, N) matrix of caption to video errors
    """
    # print("errors matrix shape: ", c2i.shape)
    assert c2i.shape[0] / c2i.shape[1] == n_caption, c2i.shape
    scorer = getScorer('AP')
    perf_list = []
    for i in range(c2i.shape[0]):
        d_i = c2i[i, :]
        labels = [0] * len(d_i)
        labels[i / n_caption] = 1  # integer division (Python 2): caption i belongs to video i / n_caption
        sorted_labels = [labels[x] for x in np.argsort(d_i)]
        current_score = scorer.score(sorted_labels)
        perf_list.append(current_score)
    return np.mean(perf_list)
def i2t_map(c2i, n_caption=5): """ Videos->Text (Video-to-Text Retrieval) c2i: (5N, N) matrix of caption to video errors """ # print("errors matrix shape: ", c2i.shape) assert c2i.shape[0] / c2i.shape[1] == n_caption, c2i.shape scorer = getScorer('AP') perf_list = [] for i in range(c2i.shape[1]): d_i = c2i[:, i] labels = [0] * len(d_i) labels[i * n_caption:(i + 1) * n_caption] = [1] * n_caption sorted_labels = [labels[x] for x in np.argsort(d_i)] current_score = scorer.score(sorted_labels) perf_list.append(current_score) return np.mean(perf_list)
def v2t_map(c2i, v2t_gts):
    """
    Videos->Text (Video-to-Text Retrieval)
    c2i: (5N, N) matrix of caption to video errors
    """
    scorer = getScorer('AP')
    perf_list = []
    for i in range(c2i.shape[1]):
        d_i = c2i[:, i]
        labels = [0] * len(d_i)
        # labels[i*n_caption:(i+1)*n_caption] = [1]*n_caption
        for x in v2t_gts[i]:
            labels[x] = 1
        sorted_labels = [labels[x] for x in np.argsort(d_i)]
        current_score = scorer.score(sorted_labels)
        perf_list.append(current_score)
    return np.mean(perf_list)
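# The *_map helpers above all consume a caption-to-item error (distance) matrix with n_caption
# captions per item, rows ordered so that rows i*n_caption:(i+1)*n_caption belong to item i.
# A small synthetic example (values are made up purely for illustration):
import numpy as np

toy_c2i = np.array([[0.1, 0.9],   # caption 0 (item 0)
                    [0.2, 0.8],   # caption 1 (item 0)
                    [0.7, 0.3],   # caption 2 (item 1)
                    [0.6, 0.4]])  # caption 3 (item 1)

t2i_perf = t2i_map(toy_c2i, n_caption=2)  # text-to-video mAP, 1.0 for this toy matrix
i2t_perf = i2t_map(toy_c2i, n_caption=2)  # video-to-text mAP, 1.0 for this toy matrix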
def eval_file(data, weights, file_name):
    scorer = getScorer('DCG')
    feat_dim = data[0][-1].shape[1]
    nr_of_qry = len(data)
    mean_perf = [0] * feat_dim
    avg_perf = 0

    fout = open(file_name, 'w')
    for i in range(nr_of_qry):
        qid, labels, scores = data[i]
        fusion = scores.dot(weights)
        print '.' * 20
        print fusion
        print '-' * 20
        sorted_idx = (0 - fusion).argsort()
        sorted_labels = [labels[x] for x in sorted_idx]
        fout.write(qid + " " + str(scorer.score(sorted_labels)) + "\n")
    fout.close()
def test():
    scorer = getScorer('DCG@25')
    data = load_data()
    feat_dim = data[0][-1].shape[1]
    nr_of_qry = len(data)
    mean_perf = [0] * feat_dim
    rand_perf = 0
    avg_perf = 0
    weights = [1.0 / feat_dim] * feat_dim

    for i in range(nr_of_qry):
        qid, labels, scores = data[i]
        nr_of_img = len(labels)
        A = (0 - scores).argsort(axis=0)
        for j in range(feat_dim):
            sorted_idx = A[:, j]
            sorted_labels = [labels[x] for x in sorted_idx]
            random_guess = scorer.score(random.sample(labels, len(labels)))
            run = scorer.score(sorted_labels)
            mean_perf[j] += run
            rand_perf += random_guess
            #print int(run > random_guess), run
        avg = scores.dot(weights)
        sorted_idx = (0 - avg).argsort()
        sorted_labels = [labels[x] for x in sorted_idx]
        avg_perf += scorer.score(sorted_labels)

    print 'random guess', rand_perf / (feat_dim * nr_of_qry)
    for j in range(feat_dim):
        print 'DCG@25 of feature_%d: %s' % (j, mean_perf[j] / nr_of_qry)
    # print 'mean DCG@25 of different feature:', avg_perf / nr_of_qry
    print 'DCG@25 of average_fusion:', eval(data, weights)
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    method = options.method
    metric = options.metric

    qrysim = options.qrysim
    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature

    # semantic embedding
    k = options.k
    corpus = options.corpus
    word2vec_model = options.word2vec
    label_source = options.label_source

    # result path
    ranking_result_path = os.path.join(rootpath, devCollection, 'SimilarityIndex', devCollection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, devCollection, metric, method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)  # (qid, query)
    qid2query = dict(zip(qid_list, [qp.process(query) for query in query_list]))

    # path of image feature
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData', feature)

    # method selection
    if method == 'conse':
        se_searcher = ConSE(label_source, corpus, word2vec_model, dev_feat_path, rootpath)
    elif method == 't2i' or method == 'ta':
        nnquery_file = os.path.join(rootpath, devCollection, 'TextData', 'querynn', options.nnqueryfile)
        qryClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.queryclickfile)
        t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path, train_feat_path, ntopqry)
    elif method == 'i2t' or method == 'ia':
        nnimage_file = os.path.join(rootpath, devCollection, 'TextData', 'imagenn', feature, options.nnimagefile)
        imgClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.imageclickfile)
        i2t_searcher = Image2Text(nnimage_file, imgClick_file, qrysim, ntopimg, ntopqry)
    else:
        print "this model is not supported with %s" % method
        sys.exit(0)

    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}
    for query_id in qid_list:
        iid_list, label_list = readAnnotationsFrom(devCollection, 'concepts%s.txt' % devCollection, query_id, False, rootpath)

        if method == 'conse':
            scorelist = se_searcher.do_search(qid2query[query_id], iid_list, k)
        elif method == 't2i':
            scorelist = t2i_searcher.text2image(query_id, iid_list, qrythres, mincc)
        elif method == 'ta':
            scorelist = t2i_searcher.textAnnotation(query_id, iid_list, ntopimg, qrythres, mincc)
        elif method == 'i2t':
            scorelist = i2t_searcher.image2text(qid2query[query_id], iid_list, mincc)
        elif method == 'ia':
            scorelist = i2t_searcher.imageAnnotation(qid2query[query_id], iid_list, mincc)

        if len(scorelist) == 0:
            failed_count += 1
            scorelist = [0] * len(iid_list)
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            random.shuffle(qid2iid_label_score[query_id])
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            qid2iid_label_score[query_id] = sorted(qid2iid_label_score[query_id], key=lambda v: v[2], reverse=True)

        # calculate the result ranking of DCG@25 from our model
        qid2dcg[query_id] = scorer.score([x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if (done % 20 == 0):
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed query: %d" % failed_count
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) / len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
urls = ('/', 'index',
        '/search', 'ImageSearch',
        '/images/(.*)', 'images',
        '/img/(.*)', 'images',
        '/images2/(.*)', 'bigimages')

render = web.template.render('templates/')

pwd = os.path.dirname(os.path.realpath(__file__))
config = json.load(open(os.path.join(pwd, 'config.json')))

max_hits = config['max_hits']
rootpath = config['rootpath']
collection = config['collection']
rankMethod = config['rankMethod']
annotationName = config['annotationName']
metric = config['metric']
scorer = getScorer(metric)

simdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection, rankMethod)
imset = readImageSet(collection, collection, rootpath)


class index:
    def GET(self):
        input = web.input(query=None)
        resp = {
            'status': 0,
            'hits': 0,
            'random': [],
            'tagrel': [],
            'metric': metric,
def process(options, trainCollection, trainAnnotationName, valCollection, valAnnotationName, feature, modelName):
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    valConcepts = readConcepts(valCollection, valAnnotationName, rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert (concepts[i] == valConcepts[i])

    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, '%s,best_params' % modelName,
                             '%s,%s,%s' % (valCollection, valAnnotationName, feature))
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert (feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        np = len([1 for lab in labels if 1 == lab])
        nn = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (np + nn) / np
        wn = (1.0 - beta) * (np + nn) / nn

        names, labels = readAnnotationsFrom(valCollection, valAnnotationName, concept, skip_0=True, rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '
            #print modelName, '>'*20, svm_params

            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i])) for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)

        printStatus(INFO, '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' % (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        fw = open(resultfile, 'w')
        fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
        fw.close()
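# sigmoid_train above fits Platt-style calibration parameters (A, B) that map raw SVM decision
# values s to probabilities via 1 / (1 + exp(A*s + B)). It is provided by the SVM tooling and is
# not shown in this file; the sketch below is a hypothetical, simplified fit by gradient descent
# on the logistic log-likelihood (the real routine uses Newton steps and smoothed targets), only
# to illustrate what the two returned numbers mean.
import math

def sigmoid_train_sketch(scores, labels, iters=2000, lr=0.01):
    """Illustrative only: fit A, B so that P(y=1|s) ~ 1 / (1 + exp(A*s + B))."""
    A, B = 0.0, 0.0
    ys = [1.0 if lab > 0 else 0.0 for lab in labels]
    n = float(len(scores))
    for _ in range(iters):
        gA, gB = 0.0, 0.0
        for s, y in zip(scores, ys):
            z = max(min(A * s + B, 35.0), -35.0)   # clamp the exponent to avoid overflow
            p = 1.0 / (1.0 + math.exp(z))
            gA += (y - p) * s                      # gradient of the negative log-likelihood w.r.t. A
            gB += (y - p)                          # gradient w.r.t. B
        A -= lr * gA / n
        B -= lr * gB / n
    return [A, B]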
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath

    p1_scorer = getScorer('P@3')
    p3_scorer = getScorer('P@5')
    r1_scorer = getScorer('R@3')
    r3_scorer = getScorer('R@5')
    ndcg1_scorer = getScorer('NDCG2@3')
    ndcg3_scorer = getScorer('NDCG2@5')
    ap_scorer = getScorer('AP')
    rr_scorer = getScorer('RR')

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    name2label = [{} for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        #names = map(int, names)
        name2label[i] = dict(zip(names, labels))
        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)
    # ('7975436322', set([33]))
    # for im, im_labels in rel_conset.items():
    #     print(im, im_labels)

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        # for im in imset:
        #     print(im)
        # raw_input()
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        res = np.zeros((nr_of_images, 8))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            # print(ranklist)
            # raw_input()

            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            # print(sorted_labels)
            # raw_input()
            assert len(sorted_labels) == nr_of_concepts

            p1 = p1_scorer.score(sorted_labels)
            p3 = p3_scorer.score(sorted_labels)
            r1 = r1_scorer.score(sorted_labels)
            r3 = r3_scorer.score(sorted_labels)
            ndcg1 = ndcg1_scorer.score(sorted_labels)
            ndcg3 = ndcg3_scorer.score(sorted_labels)
            ap = ap_scorer.score(sorted_labels)
            rr = rr_scorer.score(sorted_labels)

            f1, f3 = 0.0, 0.0
            if (p1 + r1) != 0.0:
                f1 = 2 * p1 * r1 / (p1 + r1)
            if (p3 + r3) != 0.0:
                f3 = 2 * p3 * r3 / (p3 + r3)
            # h1, h3 = max(p1, r1), max(p3, r3)

            res[j, :] = [p1, p3, r1, r3, ndcg1, ndcg3, ap, rr]
            res[j, :] = [p1, p3, f1, f3, ndcg1, ndcg3, ap, rr]
            # res[j,:] = [p1, p3, h1, h3, ndcg1, ndcg3, ap, rr]

        avg_perf = res.mean(axis=0)
        name = path.basename(datafiles[run_idx]).split('.')[0]
        name = name.split(',')[1]
        stdout.write('%s\t' % name)
        # for x in avg_perf:
        for i in range(len(avg_perf)):
            if i == 4 or i == 5:
                continue
            # x = avg_perf[i] * 100.0
            x = avg_perf[i]
            if x >= 100.0:
                stdout.write('& %.1f ' % x)
            else:
                # stdout.write('& %.2f ' % x)
                stdout.write('& %s' % (('%.4f ' % x).lstrip('0')))
        stdout.write('\n')
from fastsvm.svmutil import *
from fastsvm.svm import *
from fiksvm import *
from fiksvmutil import *
from fastsvm.fiksvm import svm_to_fiksvm as svm_to_fiksvm0


if __name__ == "__main__":
    rootpath = ROOT_PATH
    trainCollection = "voc2008train"
    testCollection = "voc2008val"
    annotationName = "conceptsvoc2008train.txt"
    #concept = "aeroplane"
    feature = "dsift"

    concepts = readConcepts(testCollection, 'conceptsvoc2008val.txt')
    scorer = getScorer('AP')

    min_vals, max_vals = find_min_max_vals(BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature), FEATURE_TO_DIM[feature]))

    featurefile = os.path.join(rootpath, testCollection, "FeatureData", feature, "id.feature.txt")
    feat_dim = 1024
    num_bins = 50

    #fikmodel.set_probAB(-1, 0)
    #print "fik model0", fikmodel0.get_nr_svs(), fikmodel0.get_feat_dim(), fikmodel0.get_probAB()
    #print "fik model", fikmodel.get_nr_svs(), fikmodel.get_feat_dim(), fikmodel.get_probAB()

    mAP = [0] * 4
    for concept in concepts:
        names, labels = readAnnotationsFrom(testCollection, 'conceptsvoc2008val.txt', concept)
        name2label = dict(zip(names, labels))
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection)

    apscorer = getScorer('AP')
    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')
    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt' % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))

    for run_idx in range(nr_of_runs):
        runName = os.path.splitext(os.path.basename(datafiles[run_idx]))[0]
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            resfile = os.path.join(resultdir, runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(ranklist, resfile)

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert (len(sorted_labels) > 0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            resfile = os.path.join(resultdir, 'tagged,lemm', runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults([x for x in ranklist if x[0] in hit_imgset[c_idx]], resfile)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)

    print '#' * 100
    print '# untagged-concept', ' '.join([os.path.basename(x) for x in datafiles])
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:, c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])

    print '#' * 100
    print '# tagged-concept'
    print '#' * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:, c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])
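# The run files read above are pickles holding a score matrix plus the image ids it is indexed
# by; each non-comment line of `runfile` names one such pickle. A minimal sketch of writing a
# compatible run file (file name and values are illustrative only):
import pickle
import numpy as np

toy_run = {
    'scores': np.array([[0.9, 0.1],    # 3 images scored against 2 concepts
                        [0.4, 0.6],
                        [0.2, 0.8]]),
    'id_images': ['1001', '1002', '1003'],
}
with open('toy_run.pkl', 'wb') as fout:
    pickle.dump(toy_run, fout)
# the evaluation code then does: data = pickle.load(open('toy_run.pkl', 'rb'))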
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # result path
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex', collection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query = dict(zip(qid_list, query_list))

    # input of image
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")

    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}
    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(collection, 'concepts%s.txt' % collection, qid, False, rootpath)

        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(calParzen(img_feats.read_one(imidx), test_X, sigma))

        # parzen_list_suffle = calParzen_fast(test_X, len(renamed), sigma)
        # parzen_list = []
        # for imidx in iid_list:
        #     parzen_list.append(parzen_list_suffle[renamed.index(imidx)])

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list), key=lambda v: v[2], reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) / len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
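# calParzen is not shown in these snippets; from its call site above it takes one image feature
# vector, the feature matrix of all candidate images and a bandwidth sigma, and returns a single
# density-style score used to rank the images. Below is a hypothetical sketch assuming a Gaussian
# (Parzen window) kernel over Euclidean distances; the real helper may scale or normalize
# differently.
import numpy as np

def calParzen_sketch(feat, test_X, sigma):
    """Hypothetical: mean Gaussian kernel between one vector and every row of test_X."""
    feat = np.asarray(feat, dtype=np.float64)
    test_X = np.asarray(test_X, dtype=np.float64)
    sq_dist = ((test_X - feat) ** 2).sum(axis=1)
    return float(np.mean(np.exp(-sq_dist / (2.0 * sigma ** 2))))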
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    method = options.method
    metric = options.metric

    qrysim = options.qrysim
    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature

    # semantic embedding
    k = options.k
    corpus = options.corpus
    word2vec_model = options.word2vec
    label_source = options.label_source

    # result path
    ranking_result_path = os.path.join(rootpath, devCollection, 'SimilarityIndex', devCollection, 'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, devCollection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)  # (qid, query)
    qid2query = dict(zip(qid_list, [qp.process(query) for query in query_list]))

    # path of image feature
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData', feature)

    # method selection
    if method == 'se':
        se_searcher = SemanticEmbedding(label_source, corpus, word2vec_model, dev_feat_path, rootpath)
    elif method == 't2i':
        nnquery_file = os.path.join(rootpath, devCollection, 'TextData', 'querynn', options.nnqueryfile)
        qryClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.queryclickfile)
        t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path, train_feat_path, ntopqry)
    elif method == 'i2t':
        nnimage_file = os.path.join(rootpath, devCollection, 'TextData', 'imagenn', feature, options.nnimagefile)
        imgClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.imageclickfile)
        i2t_searcher = Image2Text(nnimage_file, imgClick_file, qrysim, ntopimg, ntopqry)
    else:
        print "this model is not supported with %s" % method
        sys.exit(0)

    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}
    for query_id in qid_list:
        iid_list, label_list = readAnnotationsFrom(devCollection, 'concepts%s.txt' % devCollection, query_id, False, rootpath)

        if method == 'se':
            scorelist = se_searcher.do_search(qid2query[query_id], iid_list, k)
        elif method == 't2i':
            scorelist = t2i_searcher.text2image(query_id, iid_list, qrythres, mincc)
        elif method == 'i2t':
            scorelist = i2t_searcher.image2text(qid2query[query_id], iid_list, mincc)

        if len(scorelist) == 0:
            failed_count += 1
            scorelist = [0] * len(iid_list)
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            random.shuffle(qid2iid_label_score[query_id])
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            qid2iid_label_score[query_id] = sorted(qid2iid_label_score[query_id], key=lambda v: v[2], reverse=True)

        # calculate the result ranking of DCG@25 from our model
        qid2dcg[query_id] = scorer.score([x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if (done % 20 == 0):
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed query: %d" % failed_count
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) / len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
def process(options, collection, annotationName, runfile, outDirectory):
    rootpath = options.rootpath

    apscorer = getScorer("AP")
    ndcg = getScorer("NDCG@20")
    ndcg2 = getScorer("NDCG2@20")
    p1scorer = getScorer("P@1")
    p5scorer = getScorer("P@5")

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith("#")]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, "read annotations from files")
    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        label_file = os.path.join(rootpath, collection, "tagged,lemm", "%s.txt" % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except:
            hit_imgset[i] = set()
        printStatus(INFO, "readLabeledImageSet for %s-%s -> %d hits" % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print "#" * 100
    print "# method miap hit1 hit5"
    print "#" * 100

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], "rb"))
        scores = data["scores"]
        assert scores.shape[1] == nr_of_concepts
        imset = data["id_images"]
        imset = np.array([int(x) for x in imset])
        idx = np.argsort(imset)
        imset = imset[idx]
        scores = scores[idx]
        nr_of_images = len(imset)
        # print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert len(sorted_labels) > 0
            # print concepts[c_idx], ranklist[:5], sorted_labels[:5]
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        res = np.zeros((nr_of_images, 4))
        gt = np.zeros((nr_of_images, nr_of_concepts))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)

            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            # print rel_set
            # print sorted_labels

            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j, :] = [ap, hit1, hit5, len(rel_set)]
            gt[j, :] = sorted_labels

        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], " ".join(["%.3f" % x for x in avg_perf])

        outMiap = h5py.File(os.path.join(outDirectory, os.path.split(datafiles[run_idx])[-1] + ".h5"), "w")
        outMiap["iap"] = res[:, 0]
        outMiap["ngt"] = res[:, 3]
        outMiap["hit1"] = res[:, 1]
        outMiap["hit5"] = res[:, 2]
        outMiap["gt"] = gt
        outMiap["concepts"] = concepts
        outMiap["ap"] = ap_table[run_idx, :]
        outMiap["ap2"] = ap2_table[run_idx, :]
        outMiap[ndcg.name()] = ndcg_table[run_idx, :]
        outMiap[ndcg2.name()] = ndcg2_table[run_idx, :]
        outMiap.close()

    print "#" * 100
    print "# untagged-concept", " ".join([os.path.split(x)[-1] for x in datafiles])
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ap_table[:, c_idx]])
    print "meanAP", " ".join(["%.3f" % x for x in ap_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ap2_table[:, c_idx]])
    print "meanAP2", " ".join(["%.3f" % x for x in ap2_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ndcg_table[:, c_idx]])
    print "mean%s" % ndcg.name(), " ".join(["%.3f" % x for x in ndcg_table.mean(axis=1)])

    print "#" * 100
    print "# tagged-concept"
    print "#" * 100

    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], " ".join(["%.3f" % x for x in ndcg2_table[:, c_idx]])
    print "mean%s" % ndcg2.name(), " ".join(["%.3f" % x for x in ndcg2_table.mean(axis=1)])
def process(options, trainCollection, devCollection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    metric = options.metric

    qrythres = options.qrythres
    ntopimg = options.ntopimg
    ntopqry = options.ntopqry
    mincc = options.mincc
    feature = options.feature

    # result path
    ranking_result_path = os.path.join(rootpath, devCollection, 'SimilarityIndex', devCollection, 'MetaData', 'text2image', feature)
    DCG_result_path = os.path.join(rootpath, devCollection, metric, 'text2image', feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of query
    qp = SimpleQueryParser()
    qid_query_file = os.path.join(rootpath, devCollection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)  # (qid, query)
    qid2query = dict(zip(qid_list, [qp.process(query) for query in query_list]))

    # random performance for specific queries
    qid_randomperf_file = os.path.join(rootpath, devCollection, 'Annotations', '*****@*****.**')
    qid2randomperf = {}
    for line in open(qid_randomperf_file):
        qid, random_perf = line.strip().split()
        qid2randomperf[qid] = float(random_perf)

    # path of image feature
    train_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    dev_feat_path = os.path.join(rootpath, devCollection, 'FeatureData', feature)

    nnquery_file = os.path.join(rootpath, devCollection, 'TextData', 'querynn', options.nnqueryfile)
    qryClick_file = os.path.join(rootpath, trainCollection, 'TextData', options.queryclickfile)
    t2i_searcher = Text2Image(nnquery_file, qryClick_file, dev_feat_path, train_feat_path, ntopqry)

    # calculate DCG@25
    scorer = getScorer(metric)

    done = 0
    failed_count = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}
    for query_id in qid_list:
        iid_list, label_list = readAnnotationsFrom(devCollection, 'concepts%s.txt' % devCollection, query_id, False, rootpath)

        scorelist = t2i_searcher.doSearch(query_id, iid_list, ntopimg, qrythres, mincc)

        if len(scorelist) == 0:
            failed_count += 1
            qid2dcg[query_id] = qid2randomperf[query_id]
        else:
            qid2iid_label_score[query_id] = zip(iid_list, label_list, scorelist)
            qid2iid_label_score[query_id] = sorted(qid2iid_label_score[query_id], key=lambda v: v[2], reverse=True)
            # calculate the result ranking of DCG@25 from our model
            qid2dcg[query_id] = scorer.score([x[1] for x in qid2iid_label_score[query_id]])
        printMessage("Done", query_id, qid2query[query_id])

        done += 1
        if (done % 20 == 0):
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeRankingResult(ranking_result_path, qid2iid_label_score)
    writeDCGResult(DCG_result_path, qid2dcg)
    print "number of failed query: %d" % failed_count
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) / len(qid2dcg.values()))
import os
import sys

from basic.constant import ROOT_PATH
from basic.metric import getScorer
from basic.common import writeRankingResults
from basic.annotationtable import readAnnotationsFrom
from simpleknn.bigfile import BigFile

ROOT_PATH = '/home/root123/xirong/VisualSearch'

rootpath = ROOT_PATH
trainCollection = 'flickr81train'
trainAnnotationName = 'concepts81train.random50.0.random50.0.txt'
testCollection = "flickr81test"
testAnnotationName = 'conceptsflickr81test.txt'
feature = "dascaffeprob"
feat_dim = 1000
scorer = getScorer("AP")

targetConcept = sys.argv[1]  #"aeroplane"

train_feat_file = BigFile(os.path.join(ROOT_PATH, trainCollection, "FeatureData", feature), feat_dim)
test_feat_file = BigFile(os.path.join(ROOT_PATH, testCollection, "FeatureData", feature), feat_dim)
testImageSet = test_feat_file.names  #random.sample(test_feat_file.names, 10000)

minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
with open(minmax_file, 'r') as f:
    min_vals = map(float, str.split(f.readline()))
    max_vals = map(float, str.split(f.readline()))

[names, labels] = readAnnotationsFrom(collection=trainCollection, annotationName=trainAnnotationName, concept=targetConcept, rootpath=rootpath)
name2label = dict(zip(names, labels))
feature = "dsift"

trainCollection = 'voc2008train'
trainAnnotationName = 'conceptsvoc2008train.txt'
testCollection = 'voc2008val'
testset = testCollection
testAnnotationName = 'conceptsvoc2008val.txt'

modelName = 'fik50'
#modelName = 'fastlinear'

if 'fastlinear' == modelName:
    from fastlinear.fastlinear import fastlinear_load_model as load_model
else:
    from fiksvm.fiksvm import fiksvm_load_model as load_model

scorer = getScorer(metric)
imset = readImageSet(testCollection, testset, rootpath=rootpath)
concepts = readConcepts(testCollection, testAnnotationName, rootpath=rootpath)

feat_dir = os.path.join(rootpath, testCollection, "FeatureData", feature)
feat_file = BigFile(feat_dir)
_renamed, _vectors = feat_file.read(imset)
nr_of_images = len(_renamed)
nr_of_concepts = len(concepts)

mAP = 0.0
models = [None] * len(concepts)

stream = StreamFile(feat_dir)
def process(options, collection, annotationName, runfile):
    rootpath = options.rootpath

    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    datafiles = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = map(int, names)
        name2label[i] = dict(zip(names, labels))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        # images actually tagged with the concept (lemmatized tags)
        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt' % concepts[i])
        try:
            hit_imgset[i] = set(map(int, open(label_file).readlines()))
        except Exception:
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print '#'*100
    print '# method miap hit1 hit5'
    print '#'*100

    for run_idx in range(nr_of_runs):
        data = pickle.load(open(datafiles[run_idx], 'rb'))
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        # concept-centric evaluation: rank images per concept
        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            ranklist = zip(imset, scores[:, c_idx])
            ranklist.sort(key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]
            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels) > 0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            # restricted to images tagged with the concept
            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        # image-centric evaluation: rank concepts per image
        res = np.zeros((nr_of_images, 3))
        for j in range(nr_of_images):
            ranklist = zip(range(nr_of_concepts), scores[j, :])
            ranklist.sort(key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]

            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1  # binary hit-within-top-5 indicator
            res[j, :] = [ap, hit1, hit5]
        avg_perf = res.mean(axis=0)
        print os.path.split(datafiles[run_idx])[-1], ' '.join(['%.3f' % x for x in avg_perf])

    print '#'*100
    print '# untagged-concept', ' '.join([os.path.split(x)[-1] for x in datafiles])
    print '#'*100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:, c_idx]])
    print 'meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])

    print '#'*100
    print '# tagged-concept'
    print '#'*100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:, c_idx]])
    print 'meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])

    print '#'*100
    print '# tagged-concept'
    print '#'*100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg_table[:, c_idx]])
    print 'mean%s' % ndcg.name(), ' '.join(['%.3f' % x for x in ndcg_table.mean(axis=1)])

    print '#'*100
    print '# tagged-concept'
    print '#'*100
    for c_idx in range(nr_of_concepts):
        print concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg2_table[:, c_idx]])
    print 'mean%s' % ndcg2.name(), ' '.join(['%.3f' % x for x in ndcg2_table.mean(axis=1)])
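For reference, the image-centric numbers printed above (miap, hit1, hit5) boil down to: rank the concepts of one image by score, turn the ranking into a 0/1 relevance list, and score that list. The following is a minimal, self-contained sketch of that inner loop; the toy average_precision helper and the example scores are illustrative stand-ins for the repo's getScorer('AP'), P@1 and P@5 scorers.

import numpy as np

def average_precision(sorted_labels):
    # AP over a binary relevance list that is already ranked best-first
    hits, sum_prec = 0, 0.0
    for rank, rel in enumerate(sorted_labels, start=1):
        if rel:
            hits += 1
            sum_prec += hits / float(rank)
    return sum_prec / max(hits, 1)

# toy data: one image scored against 5 concepts; concepts 0 and 3 are relevant
scores = np.array([0.9, 0.1, 0.4, 0.8, 0.2])
rel_set = {0, 3}

ranked_concepts = np.argsort(-scores)                         # best-first concept ranking
sorted_labels = [int(c in rel_set) for c in ranked_concepts]

ap = average_precision(sorted_labels)                         # image-level AP
hit1 = sorted_labels[0]                                       # precision@1 as a 0/1 hit
hit5 = int(sum(sorted_labels[:5]) > 0)                        # binary hit@5, like P@5 > 0.1 above
print ap, hit1, hit5                                          # -> 1.0 1 1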
def process(options, trainCollection, valCollection, testCollection):
    lang = which_language(trainCollection)
    assert(which_language(trainCollection) == which_language(valCollection))
    assert(which_language(trainCollection) == which_language(testCollection))

    rootpath = options.rootpath
    overwrite = options.overwrite
    checkpoint = options.checkpoint
    init_model_from = options.init_model_from
    unroll = options.unroll
    corpus = options.corpus
    word2vec = options.word2vec
    batch_size = options.batch_size

    w2vv_config = options.model_config
    config = load_config('w2vv_configs/%s.py' % w2vv_config)

    img_feature = config.img_feature
    set_style = config.set_style
    # text embedding style (word2vec, bag-of-words, word hashing)
    text_style = config.text_style
    L1_normalize = config.L1_normalize
    L2_normalize = config.L2_normalize
    bow_vocab = config.bow_vocab + '.txt'

    l2_p = config.l2_p
    dropout = config.dropout

    max_epochs = config.max_epochs
    optimizer = config.optimizer
    loss_fun = config.loss_fun
    lr = config.lr
    clipnorm = config.clipnorm
    activation = config.activation
    sequences = config.sequences

    # lstm
    sent_maxlen = config.sent_maxlen
    embed_size = config.embed_size
    we_trainable = config.we_trainable
    lstm_size = config.lstm_size

    n_layers = map(int, config.n_layers.strip().split('-'))

    if init_model_from != '':
        init_model_name = init_model_from.strip().split("/")[-1]
        train_style = INFO + "_" + init_model_name
    else:
        train_style = INFO

    # text embedding style
    rnn_style, bow_style, w2v_style = text_style.strip().split('@')
    model_info = w2vv_config

    if 'lstm' in text_style or 'gru' in text_style:
        if lang == 'zh':
            w2v_data_path = os.path.join(rootpath, 'zh_w2v', 'model', 'zh_jieba.model')
        else:
            w2v_data_path = os.path.join(rootpath, "word2vec", corpus, word2vec)

        # bag-of-words vocabulary file path
        text_data_path = os.path.join(rootpath, trainCollection, "TextData", "vocabulary", "bow", bow_vocab)
        bow_data_path = os.path.join(rootpath, trainCollection, "TextData", "vocabulary", bow_style, bow_vocab)

        # text embedding (text representation)
        text2vec = get_text_encoder(rnn_style)(text_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize, maxlen=sent_maxlen)
        bow2vec = get_text_encoder(bow_style)(bow_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize)
        w2v2vec = get_text_encoder(w2v_style)(w2v_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize)

        # the first layer must match the concatenated bow + w2v dimensionality
        if n_layers[0] == 0:
            n_layers[0] = bow2vec.ndims + w2v2vec.ndims
        else:
            assert n_layers[0] == bow2vec.ndims + w2v2vec.ndims

        # log file
        checkpoint_dir = os.path.join(rootpath, trainCollection, checkpoint, valCollection, train_style, model_info)
    else:
        logger.info("%s is not supported, please check the 'text_style' parameter", text_style)
        sys.exit(0)

    train_loss_hist_file = os.path.join(checkpoint_dir, 'train_loss_hist.txt')
    val_per_hist_file = os.path.join(checkpoint_dir, 'val_per_hist.txt')
    model_file_name = os.path.join(checkpoint_dir, 'model.json')
    model_img_name = os.path.join(checkpoint_dir, 'model.png')
    logger.info(model_file_name)
    if checkToSkip(model_file_name, overwrite):
        sys.exit(0)
    makedirsforfile(val_per_hist_file)

    # img2vec
    img_feat_path = os.path.join(rootpath, FULL_COLLECTION, 'FeatureData', img_feature)
    img_feats = BigFile(img_feat_path)

    val_img_feat_path = os.path.join(rootpath, FULL_COLLECTION, 'FeatureData', img_feature)
    val_img_feats = BigFile(val_img_feat_path)

    # dataset
    train_file = os.path.join(rootpath, trainCollection, 'TextData', '%s.caption.txt' % trainCollection)  # training set

    # print "loss function: ", loss_fun
    dataset_style = 'sent_' + loss_fun
    DataSet = get_dataset(dataset_style)

    # represent text on the fly
    trainData = DataSet(train_file, batch_size, text2vec, bow2vec, w2v2vec, img_feats, flag_maxlen=True, maxlen=sent_maxlen)

    # get pre-trained word embedding
    we_weights = get_we_parameter(text2vec.vocab, w2v_data_path, lang)

    # define word2visualvec model
    w2vv = W2VV_MS(text2vec.nvocab, sent_maxlen, embed_size, we_weights, we_trainable, lstm_size, n_layers, dropout, l2_p,
                   activation=activation, lstm_style=rnn_style, sequences=sequences, unroll=unroll)

    w2vv.save_json_model(model_file_name)
    w2vv.plot(model_img_name)
    w2vv.compile_model(optimizer, loss_fun, learning_rate=lr, clipnorm=clipnorm)

    if options.init_model_from != '':
        logger.info('initialize the model from %s', options.init_model_from)
        w2vv.init_model(options.init_model_from)

    # preparation for validation
    val_sent_file = os.path.join(rootpath, valCollection, 'TextData', '%s.caption.txt' % valCollection)
    val_sents_id, val_sents, val_id2sents = readSentsInfo(val_sent_file)
    val_img_list = map(str.strip, open(os.path.join(rootpath, valCollection, set_style, '%s.txt' % valCollection)).readlines())

    # encode validation sentences once; skip sentences any encoder cannot map
    sent_feats_1 = []
    sent_feats_2 = []
    new_val_sents_id = []
    for index, sent in enumerate(val_sents):
        sent_vec = text2vec.mapping(sent)
        bow_vec = bow2vec.mapping(sent)
        w2v_vec = w2v2vec.mapping(sent)
        if sent_vec is not None and bow_vec is not None and w2v_vec is not None:
            sent_feats_1.append(sent_vec)
            sent_feats_2.append(list(bow_vec) + list(w2v_vec))
            new_val_sents_id.append(val_sents_id[index])
    sent_feats_1 = pad_sequences(sent_feats_1, maxlen=sent_maxlen, truncating='post')

    simer = get_simer('cosine_batch')()
    scorer = getScorer(options.val_metric)

    count = 0
    lr_count = 0
    best_validation_perf = 0
    best_epoch = -1
    train_loss_hist = []
    val_per_hist = []
    n_train_batches = int(np.ceil(1.0 * trainData.datasize / batch_size))
    if loss_fun == 'ctl':
        datasize = 2 * trainData.datasize
    else:
        datasize = trainData.datasize

    for epoch in range(max_epochs):
        logger.info('Epoch %d', epoch)
        logger.info("Training..., learning rate: %g", w2vv.get_lr())

        train_loss_epoch = []
        train_progbar = generic_utils.Progbar(datasize)
        trainBatchIter = trainData.getBatchData()
        for minibatch_index in xrange(n_train_batches):
            train_X_batch, train_Y_batch = trainBatchIter.next()
            loss = w2vv.model.train_on_batch(train_X_batch, train_Y_batch)
            train_progbar.add(train_X_batch[0].shape[0], values=[("train loss", loss)])
            train_loss_epoch.append(loss)

        train_loss_hist.append(np.mean(train_loss_epoch))

        this_validation_perf = do_validation(val_img_list, val_img_feats, new_val_sents_id, sent_feats_1, sent_feats_2, simer, scorer, w2vv)
        val_per_hist.append(this_validation_perf)

        logger.info('previous_best_performance: %g', best_validation_perf)
        logger.info('current_performance: %g', this_validation_perf)

        fout_file = os.path.join(checkpoint_dir, 'epoch_%d.h5' % epoch)

        lr_count += 1
        if this_validation_perf > best_validation_perf:
            best_validation_perf = this_validation_perf
            count = 0

            # save best model and drop the previous best checkpoint
            w2vv.model.save_weights(fout_file)
            if best_epoch != -1:
                os.system('rm ' + os.path.join(checkpoint_dir, 'epoch_%d.h5' % best_epoch))
            best_epoch = epoch
        else:
            # when the validation performance has decreased after an epoch,
            # we divide the learning rate by 2 and continue training;
            # but we use each learning rate for at least 3 epochs.
            if lr_count > 2:
                w2vv.decay_lr(0.5)
                lr_count = 0
            count += 1
            if count > 10:
                print("Early stopping happened")
                break

    # dump the per-epoch training loss and validation performance
    sorted_epoch_loss = zip(range(len(train_loss_hist)), train_loss_hist)
    with open(train_loss_hist_file, 'w') as fout:
        for i, loss in sorted_epoch_loss:
            fout.write("epoch_" + str(i) + " " + str(loss) + "\n")

    sorted_epoch_perf = sorted(zip(range(len(val_per_hist)), val_per_hist), key=lambda x: x[1], reverse=True)
    with open(val_per_hist_file, 'w') as fout:
        for i, perf in sorted_epoch_perf:
            fout.write("epoch_" + str(i) + " " + str(perf) + "\n")

    # generate the shell script for test, pointing at the best epoch's weights
    template = ''.join(open('TEMPLATE_do_test.sh').readlines())
    script_str = template.replace('@@@rootpath@@@', rootpath)
    script_str = script_str.replace('@@@overwrite@@@', str(overwrite))
    script_str = script_str.replace('@@@trainCollection@@@', trainCollection)
    script_str = script_str.replace('@@@testCollection@@@', '%s %s' % (valCollection, testCollection))
    script_str = script_str.replace('@@@model_config@@@', w2vv_config)
    script_str = script_str.replace('@@@set_style@@@', set_style)
    script_str = script_str.replace('@@@model_path@@@', checkpoint_dir)
    script_str = script_str.replace('@@@model_name@@@', 'model.json')
    script_str = script_str.replace('@@@weight_name@@@', 'epoch_%d.h5' % sorted_epoch_perf[0][0])

    runfile = 'do_test_%s_%s.sh' % (w2vv_config, testCollection)
    open(runfile, 'w').write(script_str + '\n')
    os.system('chmod +x %s' % runfile)
    os.system('./%s' % runfile)
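The learning-rate and early-stopping policy in the training loop above can be read in isolation: the rate is halved only after a non-improving epoch and only once the current rate has been used for at least three epochs, and training stops after more than ten epochs without a new best validation score. The following is a hypothetical, self-contained restatement of those counters; the made-up validation scores and the plain lr variable are stand-ins, whereas the actual code calls w2vv.decay_lr(0.5) on the model object.

def run_schedule(val_scores, lr=0.001):
    # mirrors the counters above: lr_count = epochs since the last LR change,
    # count = epochs since the last improvement; halve the LR after >2 stale
    # epochs, stop once more than 10 epochs pass without a new best score.
    best, count, lr_count = 0.0, 0, 0
    for epoch, perf in enumerate(val_scores):
        lr_count += 1
        if perf > best:
            best, count = perf, 0
        else:
            if lr_count > 2:
                lr *= 0.5
                lr_count = 0
            count += 1
            if count > 10:
                print 'early stopping at epoch %d' % epoch
                break
    return best, lr

# a long plateau after epoch 2 halves the LR several times before stopping early
print run_schedule([0.40, 0.45, 0.47] + [0.46] * 12)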