def process(options, collection, annotationName):
    """Write a 'social' annotation set derived from ``annotationName``.

    For each concept read from ``annotationName`` the labeled image set
    (``tpp='lemm'``) provides the positives and the negative engine selected
    by ``options.neg_filter`` provides the negatives. Concepts whose result
    file is skipped by ``checkToSkip`` are still listed in the new concept
    file; concepts without labeled examples are left out.

    Args:
        options: object exposing ``rootpath``, ``overwrite`` and ``neg_filter``.
        collection: name of the collection to process.
        annotationName: source annotation name; its last four characters are
            replaced by ``'social.txt'`` to form the new annotation name.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    neg_filter = options.neg_filter

    concepts = readConcepts(collection, annotationName, rootpath)
    newAnnotationName = annotationName[:-4] + 'social.txt'
    ne = STRING_TO_NEGATIVE_ENGINE[neg_filter](collection, rootpath)

    newConcepts = []
    for concept in concepts:
        resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image',
                                  newAnnotationName, '%s.txt' % concept)
        if checkToSkip(resultfile, overwrite):
            newConcepts.append(concept)
            continue

        # Any failure to read the labeled set is treated as "no positives",
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        try:
            pos_set = readLabeledImageSet(collection, concept, tpp='lemm', rootpath=rootpath)
        except Exception:
            pos_set = None

        if not pos_set:
            printStatus(INFO, '*** %s has not labeled examples, will be ignored ***' % concept)
            continue

        neg_set = ne.sample(concept, int(1e8))
        # Positives and negatives must not overlap.
        assert(len(set(pos_set).intersection(set(neg_set))) == 0)

        newlabels = [1] * len(pos_set) + [-1] * len(neg_set)
        newnames = pos_set + neg_set
        printStatus(INFO, "anno(%s) %d pos %d neg -> %s" % (concept, len(pos_set), len(neg_set), resultfile))
        writeAnnotations(newnames, newlabels, resultfile)
        newConcepts.append(concept)

    writeConceptsTo(newConcepts, collection, newAnnotationName, rootpath)
def process(options, collection, annotationName):
    """Write, per concept, the vocabulary tags whose NGD to the concept is below 10.

    Concepts that already have a result file under
    ``<rootpath>/<collection>/SimilarityIndex/ngd`` are left alone unless
    ``options.overwrite`` is set. Results are sorted by ascending distance.
    """
    root = options.rootpath
    force = options.overwrite
    concept_names = readConcepts(collection, annotationName, root)
    resultdir = os.path.join(root, collection, "SimilarityIndex", "ngd")

    pending = []
    for name in concept_names:
        if not os.path.exists(os.path.join(resultdir, name + '.txt')) or force:
            pending.append(name)

    if not pending:
        printStatus(INFO, 'nothing to do')
        return

    fcs = FlickrContextSim(collection, rootpath=root)
    vocabulary = fcs.vob
    printStatus(INFO, 'expanding tags for %s-%s -> %s' % (collection, annotationName, resultdir))

    for name in pending:
        target = os.path.join(resultdir, name + '.txt')
        close_tags = []
        for tag in vocabulary:
            distance = fcs.computeNGD(name, tag, img=1)
            if distance < 10:
                close_tags.append((tag, distance))
        close_tags.sort(key=lambda pair: pair[1])

        # Log the three nearest tags as a quick sanity check.
        preview = ' '.join([pair[0] for pair in close_tags[:3]])
        printStatus(INFO, '%s -> %s' % (name, preview))
        writeRankingResults(close_tags, target)
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH, k=DEFAULT_K):
    """Set up concepts, image set, neighbour directory and tag data.

    ``k`` is stored as the number of neighbours reported in the status line;
    ``feature`` and ``distance`` are used to build ``self.knndir``.
    """
    self.rootpath = rootpath

    concept_names = readConcepts(collection, annotationName, rootpath)
    self.concepts = concept_names
    self.nr_of_concepts = len(concept_names)
    self.concept2index = dict(
        (name, position) for position, name in enumerate(concept_names))

    images = readImageSet(collection, collection, rootpath)
    self.imset = images
    self.nr_of_images = len(images)

    knn_name = '%s,%sknn,1500' % (feature, distance)
    self.knndir = os.path.join(collection, knn_name)
    self.k = k
    self.noise = 0

    self._load_tag_data(collection, tpp, rootpath)

    summary = "%s, %d images, %d unique tags, %s %d neighbours for voting" % (
        self.__class__.__name__, self.nr_of_images, len(self.tag2freq),
        distance, self.k)
    printStatus(INFO, summary)
def process(options, collection, annotationName, simdir, resultfile):
    """Collect per-concept ranking files from ``simdir`` into one pickled score matrix.

    The pickle holds a dict with keys ``'concepts'``, ``'id_images'`` (image
    ids converted with ``int``) and ``'scores'`` (an images x concepts array).
    Cells for which no ranking entry exists keep the filler value ``-1e4``.

    Args:
        options: object exposing ``rootpath`` and ``overwrite``.
        collection: collection whose image set defines the matrix rows.
        annotationName: annotation whose concepts define the matrix columns.
        simdir: directory containing one ``<concept>.txt`` ranking file each.
        resultfile: path of the pickle to write.

    Returns:
        0 when ``checkToSkip`` asks to skip, otherwise ``None``.
    """
    rootpath = options.rootpath
    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)
    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print ('%d instances, %d concepts to dump -> %s' % (image_num, concept_num, resultfile))

    scores = np.zeros((image_num, concept_num)) - 1e4
    for c_id, concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        for im, score in readRankingResults(simfile):
            scores[im2index[im], c_id] = score

    makedirsforfile(resultfile)
    # 'with' closes the file even if pickling fails; list(...) keeps the ids
    # picklable where map() returns a lazy iterator.
    with open(resultfile, 'wb') as output:
        pickle.dump({'concepts': concepts,
                     'id_images': list(map(int, id_images)),
                     'scores': scores}, output, -1)
def evaluateSearchEngines(searchers, collection, annotationName, metric, rootpath=ROOT_PATH):
    """Score every searcher on every concept and print a results table.

    For each concept, each searcher's ``scoreCollection`` output is mapped to
    ground-truth labels (results without a label are dropped) and scored with
    the scorer returned by ``getScorer(metric)``.

    Args:
        searchers: sequence of objects providing ``scoreCollection(concept)``.
        collection: collection name used to read concepts and annotations.
        annotationName: annotation name used to read concepts and annotations.
        metric: metric name passed to ``getScorer`` and shown in the mean row.
        rootpath: data root directory.

    Returns:
        ``(concepts, results)`` where ``results`` is a concepts x searchers array.
    """
    scorer = getScorer(metric)
    concepts = readConcepts(collection, annotationName, rootpath)
    nr_of_runs = len(searchers)
    nr_of_concepts = len(concepts)
    results = np.zeros((nr_of_concepts, nr_of_runs))

    for i, concept in enumerate(concepts):
        # NOTE(review): rootpath is passed positionally as the 4th argument;
        # other call sites in this file pass ``skip_0=True, rootpath=rootpath``
        # by keyword. Confirm against the readAnnotationsFrom signature.
        names, labels = readAnnotationsFrom(collection, annotationName, concept, rootpath)
        name2label = dict(zip(names, labels))

        for j in range(nr_of_runs):
            searchresults = searchers[j].scoreCollection(concept)
            sorted_labels = [name2label[name] for (name, score) in searchresults if name in name2label]
            results[i, j] = scorer.score(sorted_labels)

    # A single pre-formatted string prints the same text under the Python 2
    # print statement and the Python 3 print function.
    for i, concept in enumerate(concepts):
        print('%s %s' % (concept, ' '.join([niceNumber(x, 3) for x in results[i, :]])))
    mean_perf = results.mean(0)
    print('%s %s' % ('mean%s' % metric, ' '.join([niceNumber(x, 3) for x in mean_perf])))

    return concepts, results
def process(options, trainCollection, modelAnnotationName, trainAnnotationName, feature):
    """Fit sigmoid parameters (A, B) for each stored concept model and save them.

    Only the concepts assigned to this job (``options.job`` out of
    ``options.numjobs``) are handled. A model whose stored ``A`` is already
    non-zero is skipped unless ``options.overwrite`` is set.
    """
    rootpath = options.rootpath
    modelName = options.model

    # The model I/O backend depends on the model type.
    if modelName == 'fastlinear':
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    all_concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    concepts = [name for position, name in enumerate(all_concepts)
                if (position % options.numjobs + 1) == options.job]

    feat_file = BigFile(os.path.join(rootpath, trainCollection, "FeatureData", feature))
    model_root = os.path.join(rootpath, trainCollection, 'Models', modelAnnotationName, feature, modelName)

    for concept in concepts:
        modelfile = os.path.join(model_root, '%s.model' % concept)
        model = load_model(modelfile)
        (A0, B0) = model.get_probAB()
        already_calibrated = abs(A0) > 1e-8
        if already_calibrated and not options.overwrite:
            printStatus(INFO, "old parameters exist as A=%g, B=%g, skip" % (A0, B0))
            continue

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        results = classify_large_data(model, names, feat_file, prob_output=False)

        # Re-derive labels in the order the classifier returned its results.
        labels = []
        dec_values = []
        for item in results:
            labels.append(name2label[item[0]])
            dec_values.append(item[1])

        nr_pos = sum(1 for value in labels if value == 1)
        nr_neg = sum(1 for value in labels if value == -1)
        printStatus(INFO, "%s +%d -%d" % (concept, nr_pos, nr_neg))

        [A, B] = sigmoid_train(dec_values, labels)
        model.set_probAB(A, B)
        save_model(modelfile, model)
        (A1, B1) = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (A0, A1, B0, B1))
def process(options, collection, annotationName, simdir, resultfile):
    """Collect per-concept ranking files from ``simdir`` into one pickled score matrix.

    The pickle holds a dict with keys ``'concepts'``, ``'id_images'`` (image
    ids converted with ``int``) and ``'scores'`` (an images x concepts array).
    Cells for which no ranking entry exists keep the filler value ``-1e4``.

    Args:
        options: object exposing ``rootpath`` and ``overwrite``.
        collection: collection whose image set defines the matrix rows.
        annotationName: annotation whose concepts define the matrix columns.
        simdir: directory containing one ``<concept>.txt`` ranking file each.
        resultfile: path of the pickle to write.

    Returns:
        0 when ``checkToSkip`` asks to skip, otherwise ``None``.
    """
    rootpath = options.rootpath
    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)
    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print('%d instances, %d concepts to dump -> %s' %
          (image_num, concept_num, resultfile))

    scores = np.zeros((image_num, concept_num)) - 1e4
    for c_id, concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        ranklist = readRankingResults(simfile)
        for im, score in ranklist:
            scores[im2index[im], c_id] = score

    makedirsforfile(resultfile)
    payload = {
        'concepts': concepts,
        # list(...) keeps the ids picklable where map() is a lazy iterator.
        'id_images': list(map(int, id_images)),
        'scores': scores,
    }
    # The context manager closes the file even if pickling raises.
    with open(resultfile, 'wb') as output:
        pickle.dump(payload, output, -1)
def process(options, collection, annotationName):
    """Expand each concept into its nearest vocabulary tags by NGD.

    A concept is processed when its ``<concept>.txt`` file is missing from
    ``<rootpath>/<collection>/SimilarityIndex/ngd`` or when
    ``options.overwrite`` is set. Only tags with a distance below 10 are
    written, ordered by ascending distance.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    resultdir = os.path.join(rootpath, collection, "SimilarityIndex", "ngd")

    def needs_work(concept):
        return not os.path.exists(os.path.join(resultdir, concept + '.txt')) or overwrite

    todo = [c for c in readConcepts(collection, annotationName, rootpath) if needs_work(c)]
    if not todo:
        printStatus(INFO, 'nothing to do')
        return

    fcs = FlickrContextSim(collection, rootpath=rootpath)
    vob = fcs.vob
    printStatus(
        INFO, 'expanding tags for %s-%s -> %s' %
        (collection, annotationName, resultdir))

    for concept in todo:
        outfile = os.path.join(resultdir, concept + '.txt')
        neighbours = []
        for tag in vob:
            ngd = fcs.computeNGD(concept, tag, img=1)
            if ngd < 10:
                neighbours.append((tag, ngd))
        neighbours.sort(key=lambda entry: entry[1])

        top3 = [entry[0] for entry in neighbours[:3]]
        printStatus(INFO, '%s -> %s' % (concept, ' '.join(top3)))
        writeRankingResults(neighbours, outfile)
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    """Load concepts, the feature index and the tag data for a collection.

    The image count and feature dimensionality are read from the first line
    of ``shape.txt`` in the feature directory. ``distance`` is only used in
    the status message here.
    """
    self.rootpath = rootpath
    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(
        zip(self.concepts, range(self.nr_of_concepts)))

    featuredir = os.path.join(rootpath, collection, 'FeatureData', feature)
    # Fixed: the original joined on the undefined name ``feat_dir``.
    shape_file = os.path.join(featuredir, 'shape.txt')
    with open(shape_file) as shape_fh:
        self.nr_of_images, feat_dim = map(int, shape_fh.readline().split())

    self.searcher = load_model(featuredir,
                               self.nr_of_images,
                               feat_dim,
                               nr_of_segments=512,
                               segmentk=256,
                               coarsek=4096)
    self.k = DEFAULT_K
    self._load_tag_data(collection, tpp, rootpath)
    printStatus(
        INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" %
        (self.__class__.__name__, self.nr_of_images, len(
            self.tag2freq), distance, self.k))
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    """Load concepts, a simpleknn searcher over ``feature`` and the tag data.

    The image count and feature dimensionality come from the first line of
    ``shape.txt``; the searcher is loaded from ``feature.bin`` with the ids in
    ``id.txt`` and configured with ``distance``.
    """
    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(
        zip(self.concepts, range(self.nr_of_concepts)))

    feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')
    # Close the shape file right after reading instead of leaking the handle.
    with open(shape_file) as shape_fh:
        self.nr_of_images, feat_dim = map(int, shape_fh.readline().split())

    self.searcher = simpleknn.load_model(
        os.path.join(feat_dir, 'feature.bin'), feat_dim, self.nr_of_images,
        id_file)
    self.searcher.set_distance(distance)
    self.k = DEFAULT_K
    self._load_tag_data(collection, tpp, rootpath)
    printStatus(
        INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" %
        (self.__class__.__name__, self.nr_of_images, len(
            self.tag2freq), distance, self.k))
def process(options, collection, annotationName, pos_num):
    """Create ``pos_bag_num * neg_bag_num`` randomly sampled annotation sets.

    New annotation names follow
    ``<base>.random<pos_num>.<idxp>.npr<neg_pos_ratio>.<idxn>.txt``. A file
    listing all new names is written under ``annotationfiles``. For every
    concept each positive bag holds at most ``pos_num`` positives, and each
    negative bag holds ``max(len(positives) * neg_pos_ratio, 1000)`` negatives,
    capped by the number of available negatives.

    Args:
        options: object exposing ``rootpath``, ``pos_bag_num``,
            ``neg_bag_num``, ``neg_pos_ratio`` and ``overwrite``.
        collection: collection name.
        annotationName: source annotation name; must end with ``'.txt'``.
        pos_num: maximum number of positives per bag.

    Returns:
        0 when every concept file was skipped, otherwise ``None``.
    """
    assert(annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    # Template with two remaining '%d' slots: positive and negative bag index.
    annotationNameStr = annotationName[:-4] + ('.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)
    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath, collection, 'Annotations', newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts, resultfile)

    first, second, last = annotationNameStr.split('%d')
    scriptfile = os.path.join(rootpath, collection, 'annotationfiles',
                              first + '0-%d' % (pos_bag_num - 1) + second + '0-%d' % (neg_bag_num - 1) + last)
    makedirsforfile(scriptfile)
    # The context manager closes the listing even if the write fails.
    with open(scriptfile, 'w') as fout:
        fout.write('\n'.join(newAnnotationNames) + '\n')

    if len(newAnnotationNames) == skip:
        return 0

    for concept in concepts:
        names, labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        positivePool = [name for name, label in zip(names, labels) if label > 0]
        negativePool = [name for name, label in zip(names, labels) if label < 0]

        for idxp in range(pos_bag_num):
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool

            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image', newAnnotationName, '%s.txt' % concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue

                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)

                assert(len(set(positiveBag).intersection(set(negativeBag))) == 0)
                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept, anno_idx, len(positiveBag), len(negativeBag), resultfile))
                writeAnnotations(positiveBag + negativeBag, [1] * len(positiveBag) + [-1] * len(negativeBag), resultfile)
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    """Predict tag votes for this job's share of the test images.

    Streams the test feature file, runs ``ModelArray.predict`` on each image
    assigned to this job (``options.job`` of ``options.numjobs``) and writes
    one ``id tag vote tag vote ...`` line per image. When ``options.topk`` is
    positive only the first ``topk`` predictions are kept.

    Returns:
        0 when ``checkToSkip`` asks to skip, otherwise the number of images
        written.
    """
    assert(modelName.startswith('fastlinear'))

    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    topk = options.topk

    outputName = '%s,%s' % (feature, modelName)
    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    all_images = readImageSet(testCollection, testCollection, rootpath)
    test_imset = set(im for i, im in enumerate(all_images) if i % numjobs + 1 == job)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, nr_of_test_images, resultfile))

    ma = ModelArray(trainCollection, trainAnnotationName, feature, modelName, rootpath=rootpath)
    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))

    makedirsforfile(resultfile)
    done = 0

    # Both the output file and the feature stream are released even when a
    # prediction or write raises.
    with open(resultfile, "w") as fw:
        feat_file.open()
        try:
            for _id, _vec in feat_file:
                if _id not in test_imset:
                    continue

                res = ma.predict([_vec], prob=0)
                tagvotes = res[0]
                if topk > 0:
                    tagvotes = tagvotes[:topk]
                newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]))
                fw.write(newline)
                done += 1
                if done % 1e4 == 0:
                    printStatus(INFO, "%d done" % done)
        finally:
            feat_file.close()

    printStatus(INFO, "%d done" % (done))
    return done
def process(options, trainCollection, baseAnnotationName, startAnnotationName, feature, modelName):
    """Train Negative Bootstrap models for this job's share of the concepts.

    The training backend is chosen by ``modelName`` (``'fik'`` or
    ``'fastlinear'``) and bound to the module-level names ``train_model``,
    ``compress_model`` and ``save_model``. Each learned model is saved as
    ``<concept>.model`` under the model directory; the concept list is then
    written for the new annotation name.

    Args:
        options: object exposing ``rootpath``, ``overwrite``, ``strategy``,
            ``iterations``, ``npr``, ``nr_bins``, ``numjobs`` and ``job``.
        trainCollection: training collection name.
        baseAnnotationName: passed through to the learner in ``params``.
        startAnnotationName: annotation whose concepts are trained.
        feature: feature name.
        modelName: ``'fik'`` or ``'fastlinear'``.
    """
    global train_model, compress_model, save_model
    assert(modelName in ['fik', 'fastlinear'])
    if 'fik' == modelName:
        from model_based.svms.fiksvm.svmutil import svm_train as train_model
        from model_based.svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from model_based.svms.fiksvm.fiksvm import fiksvm_save_model as save_model
    else:
        from model_based.svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from model_based.svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from model_based.svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    params = {'rootpath': rootpath, 'trainCollection': trainCollection, 'baseAnnotationName': baseAnnotationName,
              'startAnnotationName': startAnnotationName, 'feature': feature, 'model': modelName,
              'strategy': options.strategy, 'iterations': options.iterations, 'npr': options.npr,
              'nr_bins': options.nr_bins}

    concepts = readConcepts(trainCollection, startAnnotationName, rootpath)
    newAnnotationName = get_new_annotation_name(params)
    newModelName = get_model_name(params)
    modeldir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, newModelName)

    # NOTE(review): this tests for '<concept>.txt' while models are saved as
    # '<concept>.model' below; confirm which file name is intended.
    todo = [concept for concept in concepts
            if overwrite or os.path.exists(os.path.join(modeldir, '%s.txt' % concept)) is False]
    activeConcepts = [concept for i, concept in enumerate(todo) if (i % options.numjobs + 1) == options.job]

    params['feat_file'] = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))

    if 'fik' == modelName:
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # Materialise as lists so the values can be read more than once
            # where map() returns a one-shot iterator.
            params['min_vals'] = list(map(float, str.split(f.readline())))
            params['max_vals'] = list(map(float, str.split(f.readline())))

    s_time = time.time()

    for concept in activeConcepts:
        printStatus(INFO, 'processing %s' % concept)
        modelfile = os.path.join(modeldir, '%s.model' % concept)
        if checkToSkip(modelfile, overwrite):
            continue
        new_model = NegativeBootstrap.learn(concept, params)
        makedirsforfile(modelfile)
        printStatus(INFO, 'save model to %s' % modelfile)
        save_model(modelfile, new_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(activeConcepts), ' '.join(activeConcepts)))
    printStatus(INFO, 'models stored at %s' % modeldir)
    printStatus(INFO, '%g seconds in total' % timecost)
def process(options, testCollection, annotationName, tagvotefile):
    """Turn an ``id tag score tag score ...`` file into per-concept ranking files.

    Concepts whose result file is skipped by ``checkToSkip`` are ignored.
    When ``options.tagged`` is set, an image only contributes to a concept if
    it is in that concept's labeled image set. Each ranking is sorted by
    descending score before being written.

    Returns:
        0 when there is no concept left to process, otherwise ``None``.
    """
    rootpath = options.rootpath
    tpp = options.tpp
    tagged = options.tagged
    overwrite = options.overwrite

    resultdir = generate_result_dir(options, testCollection, tagvotefile)

    concepts = readConcepts(testCollection, annotationName, rootpath)
    todo = []
    for concept in concepts:
        resfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        print ('nothing to do')
        return 0

    nr_of_concepts = len(todo)
    labeled_set = [None] * nr_of_concepts
    if tagged:
        for i in range(nr_of_concepts):
            labeled_set[i] = set(readLabeledImageSet(testCollection, todo[i], tpp, rootpath))

    concept2index = dict(zip(todo, range(nr_of_concepts)))
    ranklists = [[] for i in range(nr_of_concepts)]

    # 'with' closes the vote file; blank lines are skipped instead of raising
    # IndexError on elems[0].
    with open(tagvotefile) as votes:
        for line in votes:
            elems = line.strip().split()
            if not elems:
                continue
            imageid = elems[0]
            del elems[0]
            assert(len(elems) % 2 == 0)

            for i in range(0, len(elems), 2):
                tag = elems[i]
                c = concept2index.get(tag, -1)
                if c >= 0:
                    if tagged and imageid not in labeled_set[c]:
                        continue
                    score = float(elems[i + 1])
                    ranklists[c].append((imageid, score))

    for i in range(nr_of_concepts):
        concept = todo[i]
        resfile = os.path.join(resultdir, '%s.txt' % concept)
        ranklist = sorted(ranklists[i], key=lambda v: v[1], reverse=True)
        print ('%s %d -> %s' % (concept, len(ranklist), resfile))
        writeRankingResults(ranklist, resfile)
def submit(searchers, collection,annotationName, rootpath=ROOT_PATH, overwrite=0):
    """Run every searcher on every concept and write the ranking files.

    A searcher/concept pair is skipped when ``checkToSkip`` says so for the
    target ``<outputdir>/<concept>.txt``.
    """
    concept_names = readConcepts(collection, annotationName, rootpath=rootpath)

    for concept in concept_names:
        for engine in searchers:
            target = os.path.join(engine.getOutputdir(), concept + ".txt")
            if checkToSkip(target, overwrite):
                continue

            ranking = engine.scoreCollection(concept)
            message = "%s: %s %d -> %s" % (engine.name, concept, len(ranking), target)
            print (message)
            writeRankingResults(ranking, target)

    caller = '%s.submit' % os.path.basename(__file__)
    printStatus(caller, "done")
def __init__(self, testCollection, trainCollection, annotationName, rootpath=ROOT_PATH):
    """Load concepts, the tag base and the concept-rank base of the training collection.

    ``testCollection`` is accepted but not used in this constructor.
    """
    self.name = '-'.join([self.__class__.__name__, trainCollection, annotationName])

    concept_names = readConcepts(trainCollection, annotationName, rootpath)
    self.concepts = concept_names
    self.concept_num = len(concept_names)
    self.concept2index = dict((name, position) for position, name in enumerate(concept_names))

    self.tbase = TagBase(trainCollection, tpp='lemm', rootpath=rootpath)

    rank_file = os.path.join(rootpath, trainCollection, 'TextData',
                             'tag.concept-rank.%s.pkl' % annotationName)
    self.rbase = ConceptRankBase(rank_file)

    # Rank assigned by default equals the number of tags in the tag base.
    self.DEFAULT_RANK = self.tbase.tag_num()

    self.m = DEFAULT_M
    self.k_r = DEFAULT_KR
    self.k_s = DEFAULT_KS
    self.k_d = DEFAULT_KD

    self.normalize = True
    self.add_bonus = False
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    """Load concepts, the feature index and the tag data for a collection.

    The image count and feature dimensionality are read from the first line
    of ``shape.txt`` in the feature directory. ``distance`` is only used in
    the status message here.
    """
    self.rootpath = rootpath
    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))

    featuredir = os.path.join(rootpath, collection, 'FeatureData', feature)
    # Fixed: the original joined on the undefined name ``feat_dir``.
    shape_file = os.path.join(featuredir, 'shape.txt')
    with open(shape_file) as shape_fh:
        self.nr_of_images, feat_dim = map(int, shape_fh.readline().split())

    self.searcher = load_model(featuredir, self.nr_of_images, feat_dim, nr_of_segments=512, segmentk=256, coarsek=4096)
    self.k = DEFAULT_K
    self._load_tag_data(collection, tpp, rootpath)

    printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    """Set up concepts, image set, neighbour directory and tag data.

    The neighbour directory name is built from ``feature`` and ``distance``
    with the ``knn,uu,1500`` suffix; ``DEFAULT_K`` neighbours are used.
    """
    self.rootpath = rootpath

    concept_names = readConcepts(collection, annotationName, rootpath)
    self.concepts = concept_names
    self.nr_of_concepts = len(concept_names)
    self.concept2index = dict(
        (name, position) for position, name in enumerate(concept_names))

    images = readImageSet(collection, collection, rootpath)
    self.imset = images
    self.nr_of_images = len(images)

    knn_name = '%s,%sknn,uu,1500' % (feature, distance)
    self.knndir = os.path.join(collection, knn_name)
    self.k = DEFAULT_K
    self.noise = 0

    self._load_tag_data(collection, tpp, rootpath)

    summary = "%s, %d images, %d unique tags, %s %d neighbours for voting" % (
        self.__class__.__name__, self.nr_of_images, len(self.tag2freq),
        distance, self.k)
    printStatus(INFO, summary)
def process(options, workingCollection, annotationName, outputpkl):
    """Score concepts by the position of the matching tag in each image's tag list.

    Reads ``TextData/id.userid.lemmtags.txt`` (tab-separated id, user id and
    tags). A concept found at zero-based position ``p`` of an image's tag
    list gets the score ``1 / (1 + p)``. The result is pickled as a dict with
    keys ``'concepts'``, ``'id_images'`` and ``'scores'``.

    Args:
        options: object exposing ``rootpath``, ``overwrite`` and ``random``.
        workingCollection: collection name.
        annotationName: annotation whose concepts define the matrix columns.
        outputpkl: path of the pickle to write.

    Returns:
        0 when ``checkToSkip`` asks to skip, otherwise ``None``.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    add_random_noise = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    # Image ids are re-collected in the order of the tag file, which is the
    # row order of the matrix.
    id_images = []
    tag2idx = dict(zip(concepts, range(len(concepts))))
    tagfile = os.path.join(rootpath, workingCollection, 'TextData', 'id.userid.lemmtags.txt')
    with open(tagfile) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            tags = tags.split()
            if len(tags) > 0:
                idx = np.array([tag2idx.get(tag, -1) for tag in tags])
                positions = np.arange(len(tags))
                # Tags outside the concept list map to -1; drop them so they
                # no longer overwrite the last concept column.
                known = idx >= 0
                tagmatrix[cnt, idx[known]] = 1. / (1. + positions[known])
            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if add_random_noise:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # Binary pickle protocols need a binary-mode file.
    with open(resultfile, 'wb') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': list(map(int, id_images)),
                'scores': tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    """Load concepts, a simpleknn searcher over ``feature`` and the tag data.

    The image count and feature dimensionality come from the first line of
    ``shape.txt``; the searcher is loaded from ``feature.bin`` with the ids in
    ``id.txt`` and configured with ``distance``.
    """
    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))

    feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')
    # Close the shape file right after reading instead of leaking the handle.
    with open(shape_file) as shape_fh:
        self.nr_of_images, feat_dim = map(int, shape_fh.readline().split())

    self.searcher = simpleknn.load_model(os.path.join(feat_dir, 'feature.bin'), feat_dim, self.nr_of_images, id_file)
    self.searcher.set_distance(distance)
    self.k = DEFAULT_K

    self._load_tag_data(collection, tpp, rootpath)

    printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))
def process(options, workingCollection, annotationName, outputpkl):
    """Pickle a uniformly random images x concepts score matrix.

    The pickle holds a dict with keys ``'concepts'``, ``'id_images'`` (ids
    converted with ``int``) and ``'scores'``.

    Returns:
        0 when ``checkToSkip`` asks to skip, otherwise ``None``.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # Binary pickle protocols need a binary-mode file; list(...) keeps the
    # ids picklable where map() is a lazy iterator.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts': concepts, 'id_images': list(map(int, id_images)), 'scores': tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
def process(options, collection, annotationName, runfile, newRunName):
    """Fuse several runs into a new run by a weighted sum of their scores.

    ``runfile`` lists one ``<weight> <run>`` pair per line; blank lines and
    lines starting with ``#`` are ignored. For each concept the images of the
    first run define the candidate set. A concept is skipped when its result
    file is skipped by ``checkToSkip`` or when the first run has no score
    file for it.

    Args:
        options: object exposing ``rootpath``, ``overwrite``, ``testset`` and
            ``torank``.
        collection: collection name.
        annotationName: annotation whose concepts are fused.
        runfile: path of the weight/run listing.
        newRunName: directory name of the fused run under the similarity index.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    dataset = options.testset if options.testset else collection

    concepts = readConcepts(collection, annotationName, rootpath)
    simdir = os.path.join(rootpath, collection, "SimilarityIndex", dataset)

    # Read the listing inside 'with' so the handle is closed promptly.
    with open(runfile) as listing:
        data = [x.strip() for x in listing if x.strip() and not x.strip().startswith("#")]

    models = []
    for line in data:
        weight, run = line.split()
        models.append((run, float(weight), 1))

    for concept in concepts:
        resultfile = os.path.join(simdir, newRunName, concept + ".txt")
        if checkToSkip(resultfile, overwrite):
            continue

        scorefile = os.path.join(simdir, models[0][0], concept + ".txt")
        if not os.path.exists(scorefile):
            print ("%s does not exist. skip" % scorefile)
            continue

        ranklist = readRankingResults(scorefile)
        names = sorted([x[0] for x in ranklist])
        nr_of_images = len(names)
        name2index = dict(zip(names, range(nr_of_images)))

        print ('%s %d' % (concept, nr_of_images))

        scoreTable = readImageScoreTable(concept, name2index, simdir, models, torank=options.torank)
        assert(scoreTable.shape[1] == nr_of_images)

        weights = [model[1] for model in models]
        # Row vector of weights times the score table gives one fused score
        # per image.
        scores = np.matrix(weights) * scoreTable
        scores = [float(scores[0, k]) for k in range(nr_of_images)]

        newranklist = [(names[i], scores[i]) for i in range(nr_of_images)]
        newranklist.sort(key=lambda v: (v[1], v[0]), reverse=True)

        writeRankingResults(newranklist, resultfile)
def process(options, collection, annotationName, runfile, newRunName):
    """Fuse several runs into a new run by a weighted sum of their scores.

    ``runfile`` lists one ``<weight> <run>`` pair per line; blank lines and
    lines starting with ``#`` are ignored. For each concept the images of the
    first run define the candidate set. A concept is skipped when its result
    file is skipped by ``checkToSkip`` or when the first run has no score
    file for it.

    Args:
        options: object exposing ``rootpath``, ``overwrite``, ``testset`` and
            ``torank``.
        collection: collection name.
        annotationName: annotation whose concepts are fused.
        runfile: path of the weight/run listing.
        newRunName: directory name of the fused run under the similarity index.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    dataset = options.testset if options.testset else collection

    concepts = readConcepts(collection, annotationName, rootpath)
    simdir = os.path.join(rootpath, collection, "SimilarityIndex", dataset)

    # Read the listing inside 'with' so the handle is closed promptly.
    with open(runfile) as listing:
        data = [
            x.strip() for x in listing
            if x.strip() and not x.strip().startswith("#")
        ]

    models = []
    for line in data:
        weight, run = line.split()
        models.append((run, float(weight), 1))

    for concept in concepts:
        resultfile = os.path.join(simdir, newRunName, concept + ".txt")
        if checkToSkip(resultfile, overwrite):
            continue

        scorefile = os.path.join(simdir, models[0][0], concept + ".txt")
        if not os.path.exists(scorefile):
            print("%s does not exist. skip" % scorefile)
            continue

        ranklist = readRankingResults(scorefile)
        names = sorted([x[0] for x in ranklist])
        nr_of_images = len(names)
        name2index = dict(zip(names, range(nr_of_images)))

        print("%s %d" % (concept, nr_of_images))

        scoreTable = readImageScoreTable(concept,
                                         name2index,
                                         simdir,
                                         models,
                                         torank=options.torank)
        assert scoreTable.shape[1] == nr_of_images

        weights = [model[1] for model in models]
        # Row vector of weights times the score table gives one fused score
        # per image.
        scores = np.matrix(weights) * scoreTable
        scores = [float(scores[0, k]) for k in range(nr_of_images)]

        newranklist = [(names[i], scores[i]) for i in range(nr_of_images)]
        newranklist.sort(key=lambda v: (v[1], v[0]), reverse=True)

        writeRankingResults(newranklist, resultfile)
def __init__(self, trainCollection, trainAnnotationName, feature, modelName, rootpath=ROOT_PATH):
    """Stack the per-concept linear models into a weight matrix and a sigmoid table.

    ``self.W`` has one column of weights per concept and ``self.AB`` one
    column of sigmoid parameters (A, B). A concept whose stored ``A`` is
    effectively zero gets the fallback pair ``[-1, 0]``.

    Raises:
        AssertionError: when ``modelName`` does not start with ``'fastlinear'``.
    """
    assert(modelName.startswith('fastlinear')), modelName

    self.concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    self.nr_of_concepts = len(self.concepts)
    modeldir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)

    # The first model is loaded up front only to learn the feature dimension.
    model = load_model(os.path.join(modeldir, self.concepts[0] + '.model'))
    self.feat_dim = model.get_feat_dim()

    self.W = np.zeros((self.feat_dim, self.nr_of_concepts))
    self.AB = np.zeros((2, self.nr_of_concepts))
    for i in range(self.nr_of_concepts):
        model_file_name = os.path.join(modeldir, "%s.model" % self.concepts[i])
        model = load_model(model_file_name)
        self.W[:, i] = model.get_w()
        [A, B] = model.get_probAB()
        self.AB[:, i] = [A, B] if abs(A) > 1e-8 else [-1, 0]
        printStatus(INFO, '%s, A=%g, B=%g' % (self.concepts[i], A, B))

    # Fixed: the summary used to print trainCollection twice; the second
    # field now shows the annotation name.
    printStatus(INFO, '%s-%s-%s -> %dx%d ModelArray' % (trainCollection, trainAnnotationName, feature, self.feat_dim, self.nr_of_concepts))
def process(options, trainCollection, annotationName):
    """Pickle, for every vocabulary tag, the co-occurrence rank of each concept.

    ``rank_matrix[i, j]`` is the 1-based position of concept ``j`` in the
    co-occurrence list of tag ``i``, or the number of tags when the concept
    does not appear in that list. The pickle holds a dict with keys
    ``'tags'``, ``'concepts'`` and ``'rank_matrix'``.

    Returns:
        0 when ``checkToSkip`` asks to skip, otherwise ``None``.
    """
    # cPickle only exists on Python 2; fall back to the standard module.
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, trainCollection, 'TextData', 'tag.concept-rank.%s.pkl' % annotationName)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    concept_num = len(concepts)
    concept2index = dict(zip(concepts, range(concept_num)))
    tcb = TagCooccurBase(trainCollection, rootpath=rootpath)
    tag_num = tcb.tag_num()
    DEFAULT_RANK = tag_num
    # np.int was only an alias of the builtin int and has been removed from
    # NumPy.
    rank_matrix = np.zeros((tag_num, concept_num), dtype=int) + DEFAULT_RANK
    tag_list = []

    for i, u in enumerate(tcb.vob):
        ranklist = tcb.top_cooccur(u, -1)
        hit = 0
        for j, x in enumerate(ranklist):
            idx = concept2index.get(x[0], -1)
            if idx >= 0:
                rank_matrix[i, idx] = j + 1
                hit += 1
                # Stop scanning once every concept has been hit.
                if hit == concept_num:
                    break
        tag_list.append(u)

        if (i + 1) % 1e4 == 0:
            printStatus(INFO, '%d done' % (i + 1))

    assert(len(tag_list) == tag_num)

    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as output:
        pickle.dump({'tags': tag_list, 'concepts': concepts, 'rank_matrix': rank_matrix}, output, -1)
    printStatus(INFO, '%dx%d dumped to %s' % (tag_num, concept_num, resultfile))
def submit(searchers, collection, annotationName, rootpath=ROOT_PATH, overwrite=0):
    """Write one ranking file per (concept, searcher) pair.

    The target is ``<searcher output dir>/<concept>.txt``; pairs for which
    ``checkToSkip`` returns true are left untouched.
    """
    for concept in readConcepts(collection, annotationName, rootpath=rootpath):
        for run in searchers:
            outfile = os.path.join(run.getOutputdir(), concept + ".txt")
            if not checkToSkip(outfile, overwrite):
                scored = run.scoreCollection(concept)
                print("%s: %s %d -> %s" %
                      (run.name, concept, len(scored), outfile))
                writeRankingResults(scored, outfile)

    printStatus('%s.submit' % os.path.basename(__file__), "done")
def __init__(self, testCollection, trainCollection, annotationName, rootpath=ROOT_PATH):
    """Prepare concepts, tag base and concept-rank base from the training collection.

    ``testCollection`` is part of the signature but is not read here.
    """
    class_name = self.__class__.__name__
    self.name = '%s-%s-%s' % (class_name, trainCollection, annotationName)

    self.concepts = readConcepts(trainCollection, annotationName, rootpath)
    self.concept_num = len(self.concepts)
    self.concept2index = {}
    for position, concept in enumerate(self.concepts):
        self.concept2index[concept] = position

    self.tbase = TagBase(trainCollection, tpp='lemm', rootpath=rootpath)
    rank_pkl = 'tag.concept-rank.%s.pkl' % annotationName
    self.rbase = ConceptRankBase(
        os.path.join(rootpath, trainCollection, 'TextData', rank_pkl))

    # The default rank is the size of the tag vocabulary.
    self.DEFAULT_RANK = self.tbase.tag_num()

    self.m, self.k_r, self.k_s, self.k_d = (DEFAULT_M, DEFAULT_KR,
                                            DEFAULT_KS, DEFAULT_KD)
    self.normalize = True
    self.add_bonus = False
def process(options, workingCollection, annotationName, outputpkl):
    """Pickle a uniformly random images x concepts score matrix.

    The pickle holds a dict with keys ``'concepts'``, ``'id_images'`` (ids
    converted with ``int``) and ``'scores'``.

    Returns:
        0 when ``checkToSkip`` asks to skip, otherwise ``None``.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # Binary pickle protocols need a binary-mode file; list(...) keeps the
    # ids picklable where map() is a lazy iterator.
    with open(resultfile, 'wb') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': list(map(int, id_images)),
                'scores': tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
def process(options, workingCollection, annotationName, outputpkl):
    """Score concepts by the position of the matching tag in each image's tag list.

    Reads ``TextData/id.userid.lemmtags.txt`` (tab-separated id, user id and
    tags). A concept found at zero-based position ``p`` of an image's tag
    list gets the score ``1 / (1 + p)``. The result is pickled as a dict with
    keys ``'concepts'``, ``'id_images'`` and ``'scores'``.

    Args:
        options: object exposing ``rootpath``, ``overwrite`` and ``random``.
        workingCollection: collection name.
        annotationName: annotation whose concepts define the matrix columns.
        outputpkl: path of the pickle to write.

    Returns:
        0 when ``checkToSkip`` asks to skip, otherwise ``None``.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    add_random_noise = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    # Image ids are re-collected in the order of the tag file, which is the
    # row order of the matrix.
    id_images = []
    tag2idx = dict(zip(concepts, range(len(concepts))))
    with open(os.path.join(rootpath, workingCollection, 'TextData', 'id.userid.lemmtags.txt')) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            tags = tags.split()
            if len(tags) > 0:
                idx = np.array([tag2idx.get(tag, -1) for tag in tags])
                positions = np.arange(len(tags))
                # Tags outside the concept list map to -1; drop them so they
                # no longer overwrite the last concept column.
                known = idx >= 0
                tagmatrix[cnt, idx[known]] = 1. / (1. + positions[known])
            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if add_random_noise:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # Binary pickle protocols need a binary-mode file.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts': concepts, 'id_images': list(map(int, id_images)), 'scores': tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
def evaluateSearchEngines(searchers, collection, annotationName, metric, rootpath=ROOT_PATH):
    """Score every searcher on every concept and print the result table.

    For each concept the ranked output of ``searcher.scoreCollection(concept)``
    is reduced to the labels of the annotated items, in rank order, and passed
    to the scorer obtained from ``getScorer(metric)``.

    Returns:
        ``(concepts, results)`` where ``results[i, j]`` is the score of
        searcher ``j`` on concept ``i``.
    """
    scorer = getScorer(metric)
    concepts = readConcepts(collection, annotationName, rootpath)
    results = np.zeros((len(concepts), len(searchers)))

    for row, concept in enumerate(concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concept, rootpath)
        name2label = dict(zip(names, labels))
        for col, searcher in enumerate(searchers):
            ranked = searcher.scoreCollection(concept)
            # items without a ground-truth label are left out of the ranking
            ranked_labels = [name2label[item] for (item, _) in ranked if item in name2label]
            results[row, col] = scorer.score(ranked_labels)

    for row, concept in enumerate(concepts):
        row_text = ' '.join([niceNumber(value, 3) for value in results[row, :]])
        print('%s %s' % (concept, row_text))

    mean_text = ' '.join([niceNumber(value, 3) for value in results.mean(0)])
    print('mean%s %s' % (metric, mean_text))

    return concepts, results
def process(options, testCollection, trainCollection, annotationName, feature, outputpkl):
    """Learn a TagProp model and predict tags for the test collection in one MATLAB run.

    The learning and prediction scripts are concatenated and passed to
    ``call_matlab`` once; the prediction matrix ``P`` is then read back,
    restricted to the requested concepts and pickled to ``outputpkl``.

    NOTE: in this variant the skip-if-exists check and the reuse of an existing
    model file are disabled, so the model is relearned on every call.

    Args:
        options: parsed options; ``rootpath``, ``k``, ``distance`` and
            ``variant`` are read.
        testCollection: collection to predict on (also used as the test set).
        trainCollection: collection providing the tag matrix and neighbours.
        annotationName: annotation name whose concepts are kept in the output.
        feature: visual feature name used in the neighbour file paths.
        outputpkl: path of the pickle file to write.

    Exits with status 1 when a required input file is missing.
    """
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    variant = options.variant
    testset = testCollection
    modelName = "tagprop"
    nnName = distance + "knn"
    uses_distances = variant in ('dist', 'distsigmoids')

    printStatus(INFO, "Starting TagProp %s,%s,%s,%s,%s" % (variant, trainCollection, testCollection, annotationName, feature))

    resultfile = os.path.join(outputpkl)
    resultfile_tagprop = os.path.join(rootpath, testCollection, 'TagProp-Prediction', testset, trainCollection, annotationName, modelName, '%s,%s,%s,%d'%(feature,nnName,variant,k), 'prediction.mat')

    tagmatrix_file = os.path.join(rootpath, trainCollection, 'TextData', 'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, "Tag matrix file not found at %s Did you run wordnet_frequency_tags.py?" % (tagmatrix_file))
        sys.exit(1)

    train_neighs_file = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection, '%s,%s,%d'%(feature,nnName,k), 'nn_train.h5')
    if not os.path.exists(train_neighs_file):
        printStatus(INFO, "Matlab train neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (train_neighs_file))
        sys.exit(1)

    # learning part of the MATLAB script
    train_model_file = os.path.join(rootpath, trainCollection, 'TagProp-models', '%s,%s,%s,%d'%(feature,nnName,variant,k), 'model.mat')
    printStatus(INFO, "starting learning model for %s" % (trainCollection))
    makedirsforfile(train_model_file)

    script = (
        " tagprop_path = '%s/model_based/tagprop/TagProp/';"
        " addpath(tagprop_path);"
        " tagmatrix = h5read('%s', '/tagmatrix') > 0.5;"
        " tagmatrix = sparse(tagmatrix);"
        " NN = h5read('%s', '/NN');"
        " NN = NN(2:end, :);"
        " NN = double(NN); "
    ) % (survey_code, tagmatrix_file, train_neighs_file)

    if uses_distances:
        script += (
            " NND = h5read('%s', '/NND');"
            " NND = NND(2:end, :);"
            " NND = reshape(NND, 1, size(NND,1), size(NND,2));"
            " NND = double(NND); "
        ) % train_neighs_file

    # an unknown variant adds no learning call, exactly as before
    learn_calls = {
        'rank': " m = tagprop_learn(NN,[],tagmatrix); ",
        'ranksigmoids': " m = tagprop_learn(NN,[],tagmatrix,'sigmoids',true); ",
        'dist': " m = tagprop_learn(NN,NND,tagmatrix,'type','dist'); ",
        'distsigmoids': " m = tagprop_learn(NN,NND,tagmatrix,'type','dist','sigmoids',true); ",
    }
    script += learn_calls.get(variant, '')
    script += " save('%s', 'm', '-v7.3'); " % train_model_file

    # prediction part of the MATLAB script
    printStatus(INFO, "starting prediction")
    test_neighs_file = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection, annotationName, '%s,%s,%d'%(feature,nnName,k), 'nn_test.h5')
    if not os.path.exists(test_neighs_file):
        printStatus(INFO, "Matlab test neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (test_neighs_file))
        sys.exit(1)

    script += (
        " tagprop_path = '%s/model_based/tagprop/TagProp/';"
        " addpath(tagprop_path);"
        " load('%s');"
        " tagmatrix = h5read('%s', '/tagmatrix') > 0.5;"
        " tagmatrix = sparse(tagmatrix);"
        " NNT = h5read('%s', '/NNT');"
        " NNT = double(NNT); "
    ) % (survey_code, train_model_file, tagmatrix_file, test_neighs_file)

    if uses_distances:
        script += (
            " NNDT = h5read('%s', '/NNDT');"
            " NNDT = reshape(NNDT, 1, size(NNDT,1), size(NNDT,2));"
            " NNDT = double(NNDT); "
        ) % test_neighs_file

    # NOTE(review): NNDT is loaded for the distance variants but
    # tagprop_predict is still called with [] -- confirm this is intended.
    script += " P = tagprop_predict(NNT,[],m)'; save('%s', '-v7.3'); exit; " % resultfile_tagprop

    makedirsforfile(resultfile_tagprop)
    call_matlab(script)

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(testCollection, annotationName, rootpath)
    id_images = readImageSet(testCollection, testset, rootpath)
    id_images.sort()

    # concepts mapping; both datasets are read fully into memory before the
    # HDF5 files are closed
    with h5py.File(resultfile_tagprop, 'r') as tagprop_output:
        with h5py.File(tagmatrix_file, 'r') as tagprop_input:
            mapping = getVocabMap(list(tagprop_input['vocab'][:]),concepts)
            final_tagmatrix = tagprop_output['P'][:][:,mapping]

    makedirsforfile(resultfile)
    # binary mode is required for the binary pickle protocol
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images':id_images, 'scores':final_tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
def _print_score_table(header, concepts, table, mean_label):
    """Print a banner, one row of per-run scores per concept, then the per-run mean."""
    print('#' * 100)
    print(header)
    print('#' * 100)
    for c_idx, concept in enumerate(concepts):
        print('%s %s' % (concept, ' '.join(['%.3f' % x for x in table[:, c_idx]])))
    print('%s %s' % (mean_label, ' '.join(['%.3f' % x for x in table.mean(axis=1)])))


def process(options, collection, annotationName, runfile, outDirectory):
    """Evaluate pickled score matrices per concept and per image.

    ``runfile`` lists one pickle path per line (blank lines and lines starting
    with ``#`` are ignored). Each pickle holds ``scores`` (images x concepts)
    and ``id_images``. For every run the function computes:

    * per concept: AP over all annotated images, and AP / NDCG@20 / NDCG2@20
      over the annotated images that also appear in the concept's
      ``tagged,lemm`` label file;
    * per image: AP, P@1 and whether P@5 exceeds 0.1, ranking concepts by score.

    Per-run results are written to ``<outDirectory>/<run file name>.h5`` and
    summary tables are printed.

    Args:
        options: parsed options; only ``rootpath`` is read.
        collection: collection holding annotations and label files.
        annotationName: annotation name defining the concepts.
        runfile: text file listing the run pickles.
        outDirectory: directory receiving one ``.h5`` file per run.
    """
    rootpath = options.rootpath

    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    with open(runfile) as f:
        datafiles = [x.strip() for x in f if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = []   # per concept: image id -> label
    hit_imgset = []   # per concept: ids listed in the 'tagged,lemm' file
    rel_conset = {}   # image id -> indices of concepts labelled positive

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = [int(x) for x in names]
        name2label.append(dict(zip(names, labels)))

        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt' % concepts[i])
        # best effort: a missing or malformed label file means "no hits"
        try:
            with open(label_file) as f:
                hits = set([int(x) for x in f])
        except (IOError, ValueError):
            hits = set()
        hit_imgset.append(hits)
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hits)))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print('#' * 100)
    print('# method miap hit1 hit5')
    print('#' * 100)

    for run_idx in range(nr_of_runs):
        run_name = os.path.split(datafiles[run_idx])[-1]
        with open(datafiles[run_idx], 'rb') as f:
            data = pickle.load(f)
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)

        # bring images (and their score rows) into ascending id order
        imset = np.array([int(x) for x in data['id_images']])
        idx = np.argsort(imset)
        imset = imset[idx]
        scores = scores[idx]
        nr_of_images = len(imset)

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            # descending score, ties broken by the id compared as a string
            ranklist = sorted(zip(imset, scores[:, c_idx]),
                              key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert (len(sorted_labels) > 0)
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            # same ranking restricted to images listed in the label file
            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        res = np.zeros((nr_of_images, 4))
        gt = np.zeros((nr_of_images, nr_of_concepts))
        for j in range(nr_of_images):
            ranklist = sorted(zip(range(nr_of_concepts), scores[j, :]),
                              key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j, :] = [ap, hit1, hit5, len(rel_set)]
            # NOTE(review): stored in rank order, not concept order -- confirm
            # that consumers of 'gt' expect this.
            gt[j, :] = sorted_labels

        avg_perf = res.mean(axis=0)
        print('%s %s' % (run_name, ' '.join(['%.3f' % x for x in avg_perf])))

        # the context manager closes the HDF5 file even if a write fails
        with h5py.File(os.path.join(outDirectory, run_name + ".h5"), 'w') as outMiap:
            outMiap['iap'] = res[:, 0]
            outMiap['ngt'] = res[:, 3]
            outMiap['hit1'] = res[:, 1]
            outMiap['hit5'] = res[:, 2]
            outMiap['gt'] = gt
            outMiap['concepts'] = concepts
            outMiap['ap'] = ap_table[run_idx, :]
            outMiap['ap2'] = ap2_table[run_idx, :]
            outMiap[ndcg.name()] = ndcg_table[run_idx, :]
            outMiap[ndcg2.name()] = ndcg2_table[run_idx, :]

    run_names = ' '.join([os.path.split(x)[-1] for x in datafiles])
    _print_score_table('# untagged-concept %s' % run_names, concepts, ap_table, 'meanAP')
    _print_score_table('# tagged-concept', concepts, ap2_table, 'meanAP2')
    _print_score_table('# tagged-concept', concepts, ndcg_table, 'mean%s' % ndcg.name())
    _print_score_table('# tagged-concept', concepts, ndcg2_table, 'mean%s' % ndcg2.name())
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    """Apply one trained model per concept to the test images and write tag votes.

    Each output line is ``<image id> <tag> <score> <tag> <score> ...`` with the
    tags ordered by descending score. When ``options.numjobs > 1`` only every
    ``numjobs``-th image (selected by ``options.job``, 1-based) is processed and
    the result file name gets a ``.<numjobs>.<job>`` suffix.

    Args:
        options: parsed options; ``rootpath``, ``overwrite``, ``prob_output``,
            ``numjobs``, ``job`` and ``blocksize`` are read.
        testCollection: collection to tag.
        trainCollection: collection the models were trained on.
        trainAnnotationName: annotation name defining the concepts.
        feature: feature name.
        modelName: model directory name; names starting with ``fik`` are
            loaded with the fiksvm loader, everything else with fastlinear.

    Returns:
        Number of images written, or 0 when the output is skipped or a model
        could not be loaded.
    """
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize

    outputName = '%s,%s' % (feature, modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    # round-robin split of the images over the jobs (job numbers are 1-based)
    test_imset = [name for i, name in enumerate(test_imset) if i % numjobs + 1 == job]
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, nr_of_test_images, resultfile))

    models = []
    for concept in concepts:
        model_file_name = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName, '%s.model' % concept)
        model = load_model(model_file_name)
        if model is None:
            return 0
        models.append(model)

    # choose the scoring method once instead of per image
    if prob_output:
        def score_image(model, vec):
            return model.predict_probability(vec)
    else:
        def score_image(model, vec):
            return model.predict(vec)

    feat_file = BigFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)

    read_time = 0
    test_time = 0
    start = 0
    done = 0

    # the context manager closes the output even if a block fails midway
    with open(resultfile, "w") as fw:
        while start < nr_of_test_images:
            end = min(nr_of_test_images, start + blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

            s_time = time.time()
            renamed, test_X = feat_file.read(test_imset[start:end])
            read_time += time.time() - s_time

            s_time = time.time()
            output = []
            for name, vec in zip(renamed, test_X):
                scores = [score_image(model, vec) for model in models]
                tagvotes = sorted(zip(concepts, scores), key=lambda v: v[1], reverse=True)
                output.append('%s %s\n' % (name, " ".join(["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes])))
            test_time += time.time() - s_time

            start = end
            fw.write(''.join(output))
            fw.flush()
            done += len(output)

        printStatus(INFO, "%d done. read time %g seconds, test_time %g seconds" % (done, read_time, test_time))

    return done
#from svm import * from fastsvm.svmutil import * from fastsvm.svm import * from fiksvm import * from fiksvmutil import * from fastsvm.fiksvm import svm_to_fiksvm as svm_to_fiksvm0 if __name__ == "__main__": rootpath = ROOT_PATH trainCollection = "voc2008train" testCollection = "voc2008val" annotationName = "conceptsvoc2008train.txt" #concept = "aeroplane" feature = "dsift" concepts = readConcepts(testCollection, 'conceptsvoc2008val.txt') scorer = getScorer('AP') min_vals, max_vals = find_min_max_vals( BigFile( os.path.join(rootpath, trainCollection, 'FeatureData', feature), FEATURE_TO_DIM[feature])) featurefile = os.path.join(rootpath, testCollection, "FeatureData", feature, "id.feature.txt") feat_dim = 1024 num_bins = 50 #fikmodel.set_probAB(-1, 0) #print "fik model0", fikmodel0.get_nr_svs(), fikmodel0.get_feat_dim(), fikmodel0.get_probAB()
def process(options, collection, annotationName, runfile):
    """Print a LaTeX-style row of per-image tagging metrics for every run.

    ``runfile`` lists one pickle path per line (blank lines and lines starting
    with ``#`` are ignored). For each image the concepts are ranked by score
    and P@3, P@5, R@3, R@5, NDCG2@3, NDCG2@5, AP and RR are computed. The
    averaged row holds ``[P@3, P@5, F1@3, F1@5, NDCG2@3, NDCG2@5, AP, RR]``;
    the two NDCG columns are computed but not printed.

    Args:
        options: parsed options; only ``rootpath`` is read.
        collection: collection holding the annotations.
        annotationName: annotation name defining the concepts.
        runfile: text file listing the run pickles.
    """
    rootpath = options.rootpath

    p_at3_scorer = getScorer('P@3')
    p_at5_scorer = getScorer('P@5')
    r_at3_scorer = getScorer('R@3')
    r_at5_scorer = getScorer('R@5')
    ndcg_at3_scorer = getScorer('NDCG2@3')
    ndcg_at5_scorer = getScorer('NDCG2@5')
    ap_scorer = getScorer('AP')
    rr_scorer = getScorer('RR')

    with open(runfile) as f:
        datafiles = [x.strip() for x in f if x.strip() and not x.strip().startswith('#')]

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    # image id -> indices of the concepts labelled positive for it; ids are
    # kept exactly as read from the annotation files (no int conversion)
    rel_conset = {}
    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        for im, lab in zip(names, labels):
            if lab > 0:
                rel_conset.setdefault(im, set()).add(i)

    for datafile in datafiles:
        with open(datafile, 'rb') as f:
            data = pickle.load(f)
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)

        res = np.zeros((nr_of_images, 8))
        for j in range(nr_of_images):
            ranklist = sorted(zip(range(nr_of_concepts), scores[j, :]),
                              key=lambda v: v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            assert len(sorted_labels) == nr_of_concepts

            p_at3 = p_at3_scorer.score(sorted_labels)
            p_at5 = p_at5_scorer.score(sorted_labels)
            r_at3 = r_at3_scorer.score(sorted_labels)
            r_at5 = r_at5_scorer.score(sorted_labels)
            ndcg_at3 = ndcg_at3_scorer.score(sorted_labels)
            ndcg_at5 = ndcg_at5_scorer.score(sorted_labels)
            ap = ap_scorer.score(sorted_labels)
            rr = rr_scorer.score(sorted_labels)

            # harmonic mean of precision and recall, defined as 0 when both are 0
            f_at3, f_at5 = 0.0, 0.0
            if (p_at3 + r_at3) != 0.0:
                f_at3 = 2 * p_at3 * r_at3 / (p_at3 + r_at3)
            if (p_at5 + r_at5) != 0.0:
                f_at5 = 2 * p_at5 * r_at5 / (p_at5 + r_at5)

            res[j, :] = [p_at3, p_at5, f_at3, f_at5, ndcg_at3, ndcg_at5, ap, rr]

        avg_perf = res.mean(axis=0)

        # The label is the second comma-separated field of the run file's base
        # name (text before the first '.').
        # NOTE(review): raises IndexError for names without a comma.
        name = path.basename(datafile).split('.')[0]
        name = name.split(',')[1]
        stdout.write('%s\t' % name)

        for i in range(len(avg_perf)):
            if i == 4 or i == 5:
                # NDCG columns are left out of the printed row
                continue
            x = avg_perf[i]
            if x >= 100.0:
                stdout.write('& %.1f ' % x)
            else:
                # drop the leading zero, e.g. 0.1234 -> .1234
                stdout.write('& %s' % (('%.4f ' % x).lstrip('0')))
        stdout.write('\n')
def process(options, trainCollection, trainAnnotationName, feature):
    """Train one class-weighted linear model per concept and save it as fastlinear.

    When ``options.best_param_dir`` is set, ``C`` and the probability
    parameters ``A``/``B`` are read from the first line of
    ``<best_param_dir>/<concept>.txt`` (format ``best_C=..., a=..., b=...``)
    and models go to the ``fastlinear-tuned`` directory; otherwise ``C=1``,
    ``A=B=0`` are used.

    Args:
        options: parsed options; ``rootpath``, ``best_param_dir``,
            ``overwrite``, ``numjobs`` and ``job`` (1-based) are read.
        trainCollection: collection providing annotations and features.
        trainAnnotationName: annotation name defining the concepts.
        feature: feature name.

    Returns:
        Number of concepts trained by this job (0 when nothing is left to do).

    Raises:
        ValueError: if a parameter file does not match the expected format.
    """
    import re
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    best_param_dir = options.best_param_dir
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    beta = 0.5  # share of the total weight given to the positive class

    modelName = 'fastlinear'
    if best_param_dir:
        modelName += '-tuned'

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)

    todo = [concept for concept in concepts
            if not checkToSkip(os.path.join(resultdir, concept + '.model'), overwrite)]
    # round-robin split of the remaining concepts over the jobs
    todo = [concept for i, concept in enumerate(todo) if i%numjobs==(job-1)]
    if not todo:
        return 0

    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))

    feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            with open(param_file) as f:
                first_line = f.readline().strip()
            m = p.search(first_line)
            if m is None:
                raise ValueError('cannot parse best parameters from %s: %r' % (param_file, first_line))
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]

        # class weights chosen so that positives get a beta share and negatives
        # a (1 - beta) share of the total weight
        nr_pos = len([1 for lab in labels if 1 == lab])
        nr_neg = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (nr_pos+nr_neg) / nr_pos
        wn = (1.0-beta) * (nr_pos+nr_neg) / nr_neg

        # no bias term added by setting "-B -1"
        svm_params = '-w1 %g -w-1 %g -s 2 -B -1 -q' % (wp*C, wn*C)
        model = liblinear_train(y, vectors, svm_params)
        newmodel = liblinear_to_fastlinear([model], [1.0], feat_file.ndims)
        newmodel.set_probAB(A, B)

        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s'%model_file_name)
        fastlinear_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        # NOTE(review): the reloaded model is discarded and the assertions
        # inspect the in-memory model, so only loadability is verified.
        fastlinear_load_model(model_file_name)
        assert(abs(newmodel.get_probAB()[0]-A)<1e-6)
        assert(abs(newmodel.get_probAB()[1]-B)<1e-6)

    return len(todo)
def process(options, collection, annotationName, runfile):
    """Write per-concept rankings for every run and print AP tables.

    ``runfile`` lists one pickle path per line (blank lines and lines starting
    with ``#`` are ignored). For each run and concept the images are ranked by
    score and restricted to annotated images. The ranking is written to
    ``SimilarityIndex/<collection>/<run>/<concept>.txt``; the ranking
    restricted to the concept's ``tagged,lemm`` label file is written below
    ``SimilarityIndex/<collection>/tagged,lemm/<run>/``. AP is computed for
    both rankings.

    Args:
        options: parsed options; ``rootpath`` and ``overwrite`` are read.
        collection: collection holding annotations and label files.
        annotationName: annotation name defining the concepts.
        runfile: text file listing the run pickles.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    resultdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection)

    apscorer = getScorer('AP')

    with open(runfile) as f:
        datafiles = [x.strip() for x in f if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = []   # per concept: image id -> label
    hit_imgset = []   # per concept: ids listed in the 'tagged,lemm' file

    for i in range(nr_of_concepts):
        names, labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = [int(x) for x in names]
        name2label.append(dict(zip(names, labels)))

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt' % concepts[i])
        # best effort: a missing or malformed label file means "no hits"
        try:
            with open(label_file) as f:
                hits = set([int(x) for x in f])
        except (IOError, ValueError):
            hits = set()
        hit_imgset.append(hits)
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hits)))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))

    for run_idx in range(nr_of_runs):
        runName = os.path.splitext(os.path.basename(datafiles[run_idx]))[0]
        with open(datafiles[run_idx], 'rb') as f:
            data = pickle.load(f)
        scores = data['scores']
        assert (scores.shape[1] == nr_of_concepts)
        imset = data['id_images']

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            # descending score, ties broken by the id compared as a string
            ranklist = sorted(zip(imset, scores[:, c_idx]),
                              key=lambda v: (v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            resfile = os.path.join(resultdir, runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(ranklist, resfile)

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert (len(sorted_labels) > 0)
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            # same ranking restricted to images listed in the label file
            tagged_ranklist = [x for x in ranklist if x[0] in hit_imgset[c_idx]]
            resfile = os.path.join(resultdir, 'tagged,lemm', runName, '%s.txt' % concepts[c_idx])
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(tagged_ranklist, resfile)

            sorted_labels = [ground_truth[x[0]] for x in tagged_ranklist]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)

    print('#' * 100)
    print('# untagged-concept %s' % ' '.join([os.path.basename(x) for x in datafiles]))
    print('#' * 100)
    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:, c_idx]])))
    print('%s %s' % ('meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])))

    print('#' * 100)
    print('# tagged-concept')
    print('#' * 100)
    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:, c_idx]])))
    print('%s %s' % ('meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])))
def process(options, testCollection, trainCollection, annotationName, tagrelMethod, tagfeature):
    """Convert an ``id.tagvotes.txt`` file into an L1-normalised binary feature.

    Each input line is ``<image id> <tag> <score> <tag> <score> ...`` and must
    contain every concept exactly once. The scores are placed in concept order,
    divided by their sum and appended as float32 to ``feature.bin``; ``id.txt``
    and ``shape.txt`` are written next to it. Only the first line of a repeated
    image id is used.

    Args:
        options: parsed options; ``rootpath`` and ``overwrite`` are read.
        testCollection: collection the tag votes belong to.
        trainCollection: collection defining the concepts.
        annotationName: annotation name defining the concepts.
        tagrelMethod: sub-directory of the tag votes file.
        tagfeature: name of the feature directory to create.

    Exits with status 0 when the output is skipped or the input is missing.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    nr_of_concepts = len(concepts)
    mapping = dict(zip(concepts, range(nr_of_concepts)))

    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', tagfeature)
    binary_file = os.path.join(feat_dir, 'feature.bin')
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')

    if checkToSkip(binary_file, overwrite):
        sys.exit(0)

    inputfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, tagrelMethod, 'id.tagvotes.txt')
    if not os.path.exists(inputfile):
        printError(INFO, '%s does not exist' % inputfile)
        sys.exit(0)

    makedirsforfile(binary_file)

    processed = set()
    imset = []
    count_line = 0

    # context managers close both files even when a line fails to parse
    with open(inputfile) as fin, open(binary_file, 'wb') as fw:
        for line in fin:
            count_line += 1
            elems = line.split()
            if not elems:
                # blank line: counted, but there is nothing to parse
                continue
            name = elems[0]
            if name in processed:
                continue
            processed.add(name)

            votes = elems[1:]
            assert (len(votes) == 2 * nr_of_concepts)
            vec = [0] * nr_of_concepts
            for i in range(0, len(votes), 2):
                vec[mapping[votes[i]]] = float(votes[i + 1])

            s = float(sum(vec))
            if s != 0:
                # l_1 normalized
                vec = [x / s for x in vec]
            # a zero-sum vector is written unnormalised instead of dividing by zero
            np.array(vec, dtype=np.float32).tofile(fw)
            imset.append(name)

    with open(id_file, 'w') as fw:
        fw.write(' '.join(imset))

    with open(shape_file, 'w') as fw:
        fw.write('%d %d' % (len(imset), nr_of_concepts))

    print('%d lines parsed, %d ids -> %d unique ids' % (count_line, len(processed), len(imset)))
def process(options, trainCollection, annotationfile, feature, modelName):
    """Train one model per annotation set and average them into one model per concept.

    ``annotationfile`` lists annotation names, one per line (blank lines and
    lines starting with ``#`` are ignored). For each concept a model is trained
    on every annotation set; the t-th model is merged into the running model
    with weights ``1 - 1/t`` and ``1/t``, i.e. a uniform average. The result is
    saved under ``Models/<annotationfile base name>/<feature>/<modelName>``.

    Args:
        options: parsed options; ``rootpath``, ``overwrite``, ``numjobs``,
            ``job`` (1-based) and, for ``fik``, ``nr_bins`` are read.
        trainCollection: collection providing annotations and features.
        annotationfile: path of the file listing the annotation names.
        feature: feature name.
        modelName: ``'fik'`` or ``'fastlinear'``.

    Returns:
        0 when an annotation is missing or nothing is left to do, otherwise
        None.
    """
    assert (modelName in ['fik', 'fastlinear'])

    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5  # share of the total weight given to the positive class
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    # 'model' keeps the plain name; modelName gets the bin count appended below
    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # first line: per-dimension minima, second line: maxima
            params['min_vals'] = [float(x) for x in f.readline().split()]
            params['max_vals'] = [float(x) for x in f.readline().split()]
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    with open(annotationfile) as f:
        trainAnnotationNames = [x.strip() for x in f if x.strip() and not x.strip().startswith('#')]

    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print('%s does not exist' % conceptfile)
            return 0

    # the concept list is taken from the first annotation set
    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)
    todo = [concept for concept in concepts
            if not checkToSkip(os.path.join(resultdir, concept + '.model'), overwrite)]
    # round-robin split of the remaining concepts over the jobs
    todo = [concept for i, concept in enumerate(todo) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t, trainAnnotationName in enumerate(trainAnnotationNames, 1):
            names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]

            nr_pos = len([1 for lab in labels if 1 == lab])
            nr_neg = len([1 for lab in labels if -1 == lab])
            wp = float(beta) * (nr_pos + nr_neg) / nr_pos
            wn = (1.0 - beta) * (nr_pos + nr_neg) / nr_neg

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                # running average: old models keep (t-1)/t, the new one gets 1/t
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
def process(options, workingCollection, annotationName, feature, outputpkl):
    """Run RobustPCA tag refinement in MATLAB and pickle the refined scores.

    The MATLAB step is run only when ``checkToSkip`` does not report the
    prediction file as skippable; the pickle dump is guarded by its own
    ``checkToSkip`` call, so an existing prediction can be re-dumped without
    recomputation.

    Args:
        options: parsed options; ``rootpath``, ``distance``, ``overwrite``,
            ``kratio``, ``ratiocs``, ``lambda1``, ``lambda2``,
            ``outputonlytest`` and ``rawtagmatrix`` are read.
        workingCollection: collection to process. With ``outputonlytest`` the
            name is split on ``'+'`` and the second part is taken as the test
            collection.
        annotationName: annotation name whose concepts are kept in the output.
        feature: feature name used in the input file paths.
        outputpkl: path of the pickle file to write.

    Returns:
        0 when the pickle is skipped, otherwise None. Exits with status 1 when
        a required input file is missing.
    """
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix
    nnName = distance + "knn"

    printStatus(INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f" % (workingCollection, annotationName, feature, nnName, k_ratio, lambda1, lambda2))
    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = os.path.join(outputpkl)
    resultfile_robustpca = os.path.join(rootpath, workingCollection, 'RobustPCA-Prediction', '%s,%s,%f,%f,%f,%d'%(feature,nnName,lambda1,lambda2,k_ratio,rawtagmatrix), 'prediction.mat')

    # when the prediction is reported as skippable only the pickle is redone
    only_dump = checkToSkip(resultfile_robustpca, overwrite)

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'RobustPCA', '%s,%s,%f'%(feature,nnName,DEFAULT_K_PROP), 'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, "Tag matrix file not found at %s Did you run robustpca_preprocessing.py?" % (tagmatrix_file))
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData', "lemm_wordnet_freq_tags.h5")
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, 'Tag matrix file not found in %s Did you run wordnet_frequency_tags.py?' % (tagmatrix_file))
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection, '%s,%s,%f'%(feature,nnName,k_ratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, "LaplacianI file not found at %s Did you run laplacian_images.py?" % (laplacianI_file))
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT', '%f'%(ratio_cs), 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(INFO, "LaplacianT file not found at %s Did you run laplacian_tags.py?" % (laplacianT_file))
        sys.exit(1)

    if not only_dump:
        # begin learning
        script = (
            " rpca_path = 'transduction_based/robustpca/';"
            " addpath(rpca_path);"
            " addpath([rpca_path, 'fast_svd/']);"
            " tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));"
            " load('%s');"
            " load('%s');"
            " lambda1 = %f;"
            " lambda2 = %f;"
            " maxIters = 50;"
            " precision = 1e-4;"
            " mu_start = 1.;"
            " parpool('local', 4);"
            " [P,E]=robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start); "
        ) % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1, lambda2)
        script += " delete(gcp); save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3'); exit; " % resultfile_robustpca

        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        testCollection = workingCollection.split('+')[1]
        testset_id_images = readImageSet(testCollection, testCollection, rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # Concepts mapping. P is read fully before the columns are selected,
    # because h5py datasets only accept index lists in increasing order and
    # `mapping` follows the concept order.
    with h5py.File(resultfile_robustpca, 'r') as robustpca_output:
        with h5py.File(tagmatrix_file, 'r') as tagprop_input:
            mapping = getVocabMap(list(tagprop_input['vocab'][:]),concepts)
            predicted_tagmatrix = robustpca_output['P'][:][:, mapping]

    if outputonlytest:
        # rows of the test images inside the sorted id list of the full collection
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert(final_tagmatrix.shape[0] == idx.shape[0])
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    # binary mode is required for the binary pickle protocol
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images': id_images, 'scores':final_tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
from basic.common import ROOT_PATH,checkToSkip,makedirsforfile
from basic.util import readImageSet
from simpleknn.bigfile import BigFile, StreamFile
from basic.annotationtable import readConcepts,readAnnotationsFrom

# Script: for every concept write a weight file that lists the same model
# directory nr_of_models times, each with weight 1/nr_of_models.
# Usage: the model name (e.g. 'fik50' or 'fastlinear') is the first argument.

rootpath = ROOT_PATH
trainCollection = 'voc2008train'
trainAnnotationName = 'conceptsvoc2008train.txt'
modelName = sys.argv[1]
feature = 'dsift'

weight_dir = os.path.join(rootpath, trainCollection, 'l2r', modelName)
concepts = readConcepts(trainCollection,trainAnnotationName,rootpath=rootpath)

# the content is identical for every concept, so it is built once
nr_of_models = 5
weights = [1.0/nr_of_models] * nr_of_models
model = os.path.join(trainCollection, 'Models', 'conceptsvoc2008train.txt', feature, modelName)
models = [model] * nr_of_models
weight_lines = '\n'.join(['%g %s' % (w,m) for w,m in zip(weights, models)])

for concept in concepts:
    weight_file = os.path.join(weight_dir, '%s.txt' % concept)
    makedirsforfile(weight_file)
    with open(weight_file, 'w') as fw:
        fw.write(weight_lines)
# Fragment of a routine whose header is outside this view; `options` and `argv`
# are presumably its parsed options and positional arguments -- TODO confirm.
# It writes a script file listing one annotation name per negative bag, then
# loads, for every concept, the top-ranked positives and the pool of negatives.
rootpath = options.rootpath
nr_pos = options.pos_nr

collection = argv[0] #'train1m'
annotationName = argv[1] # 'conceptsmir14social.txt'
rankMethod = argv[2] #'train1m/fcs-wn_color64+dsift_borda'
posName = argv[3] #'fcstagrelbc'

neg_pos_ratio = options.neg_pos_ratio
# number of negatives per bag
nr_neg = neg_pos_ratio * nr_pos
nr_neg_bags = options.nr_neg_bags # 10
overwrite = options.overwrite

assert( annotationName.endswith('social.txt') )
assert( rankMethod.startswith('tagged,lemm/%s'%collection) )

# the trailing '%d' is filled with the index of the negative bag
newAnnotationTemplate = annotationName[:-4] + '.' + posName + str(nr_pos) + ('.random%d'%nr_neg) + '.%d.txt'

concepts = readConcepts(collection, annotationName, rootpath)
simdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection, rankMethod)

# the script file lists the annotation names of all bags, one per line
scriptfile = os.path.join(rootpath,collection,'annotationfiles', annotationName[:-4] + '.' + posName + str(nr_pos) + ('.random%d'%nr_neg) + '.0-%d.txt'%(nr_neg_bags-1))
makedirsforfile(scriptfile)
fout = open(scriptfile,'w')
fout.write('\n'.join([newAnnotationTemplate%t for t in range(nr_neg_bags)]) + '\n')
fout.close()

for concept in concepts:
    simfile = os.path.join(simdir, '%s.txt' % concept)
    ranklist = readRankingResults(simfile)
    # positives: the first nr_pos entries of the ranking
    pos_bag = [x[0] for x in ranklist[:nr_pos]]
    names, labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
    # negatives are drawn from the examples with a negative label
    negativePool = [x[0] for x in zip(names,labels) if x[1] < 0]
# Fragment of a routine whose header is outside this view; `cmdOpts` is
# presumably its parsed command-line options object -- TODO confirm.
# It writes the concept list once per (positive bag, negative bag) pair.
overwrite = cmdOpts.getInt('overwrite')
rootpath = cmdOpts.getString('rootpath')
collection = cmdOpts.getString('collection')
annotationName = cmdOpts.getString('annotationName')
tpp = cmdOpts.getString('tpp')
nr_pos = cmdOpts.getInt('nr_pos')
pos_source = cmdOpts.getString('pos_source')
select_pos = cmdOpts.getString('select_pos')
neg_filter = cmdOpts.getString('neg_filter')
neg_pos_ratio = cmdOpts.getInt('neg_pos_ratio')
nr_pos_bags = cmdOpts.getInt('nr_pos_bags')
nr_neg_bags = cmdOpts.getInt('nr_neg_bags')
nr_neg = nr_pos * neg_pos_ratio

# NOTE(review): rootpath is not passed here, unlike the result path below.
concepts = readConcepts(collection, annotationName)

# template taking (positive bag index, negative bag index)
annotationNameStr = generate_new_annotation_template(cmdOpts)

nr_skipped = 0
newAnnotationNames = [None] * (nr_pos_bags * nr_neg_bags)

for idxp in range(nr_pos_bags):
    for idxn in range(nr_neg_bags):
        # flat index of the (idxp, idxn) pair, row-major over positive bags
        anno_idx = idxp * nr_neg_bags + idxn
        newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
        resultfile = os.path.join(rootpath, collection, 'Annotations', newAnnotationNames[anno_idx])
        if checkToSkip(resultfile, overwrite):
            nr_skipped += 1
            continue
        writeConcepts(concepts, resultfile)
def process(options, testCollection, trainCollection, annotationName, feature, outputpkl):
    """Learn (if needed) a TagProp model in Matlab, predict on the test set, dump a pickle.

    Reads ``rootpath``, ``k``, ``distance``, ``variant``, ``overwrite`` and
    ``trainmodel`` from *options*.  Returns 0 when either output already exists
    and must not be overwritten; exits with status 1 when a required input file
    is missing.  On success writes ``prediction.mat`` and the pickle *outputpkl*
    holding ``concepts``, ``id_images`` and ``scores``.
    """
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    variant = options.variant
    overwrite = options.overwrite
    testset = testCollection
    forcetrainmodel = options.trainmodel

    modelName = "tagprop"
    nnName = distance + "knn"

    printStatus(INFO, "Starting TagProp %s,%s,%s,%s,%s" % (variant, trainCollection, testCollection, annotationName, feature))

    resultfile = os.path.join(outputpkl)
    resultfile_tagprop = os.path.join(rootpath, testCollection, 'TagProp-Prediction', testset, trainCollection, annotationName, modelName, '%s,%s,%s,%d'%(feature,nnName,variant,k), 'prediction.mat')

    if checkToSkip(resultfile, overwrite) or checkToSkip(resultfile_tagprop, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, trainCollection, 'TextData', 'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, "Tag matrix file not found at %s Did you run wordnet_frequency_tags.py?" % (tagmatrix_file))
        sys.exit(1)

    train_neighs_file = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection, '%s,%s,%d'%(feature,nnName,k), 'nn_train.h5')
    if not os.path.exists(train_neighs_file):
        printStatus(INFO, "Matlab train neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (train_neighs_file))
        sys.exit(1)

    # do we need to perform learning?
    train_model_file = os.path.join(rootpath, trainCollection, 'TagProp-models', '%s,%s,%s,%d'%(feature,nnName,variant,k), 'model.mat')
    if os.path.exists(train_model_file) and not forcetrainmodel:
        printStatus(INFO, "model for %s available at %s" % (trainCollection, train_model_file))
    else:
        printStatus(INFO, "starting learning model for %s" % (trainCollection))
        makedirsforfile(train_model_file)

        script = """
        tagprop_path = 'model_based/tagprop/TagProp/';
        addpath(tagprop_path);
        tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
        tagmatrix = sparse(tagmatrix);
        NN = h5read('%s', '/NN');
        NN = NN(2:end, :);
        NN = double(NN);
        """ % (tagmatrix_file, train_neighs_file)

        if variant == 'dist' or variant == 'distsigmoids':
            script += """
            NND = h5read('%s', '/NND');
            NND = NND(2:end, :);
            NND = reshape(NND, 1, size(NND,1), size(NND,2));
            NND = double(NND);
            """ % train_neighs_file

        if variant == 'rank':
            script += """
            m = tagprop_learn(NN,[],tagmatrix);
            """
        elif variant == 'ranksigmoids':
            script += """
            m = tagprop_learn(NN,[],tagmatrix,'sigmoids',true);
            """
        elif variant == 'dist':
            script += """
            m = tagprop_learn(NN,NND,tagmatrix,'type','dist');
            """
        elif variant == 'distsigmoids':
            script += """
            m = tagprop_learn(NN,NND,tagmatrix,'type','dist','sigmoids',true);
            """

        script += """
        save('%s', 'm', '-v7.3');
        exit;
        """ % train_model_file

        call_matlab(script)

    # we perform prediction
    printStatus(INFO, "starting prediction")
    test_neighs_file = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection, annotationName, '%s,%s,%d'%(feature,nnName,k), 'nn_test.h5')
    if not os.path.exists(test_neighs_file):
        printStatus(INFO, "Matlab test neighbors file not found at %s Did you run prepare_tagprop_data.py?" % (test_neighs_file))
        sys.exit(1)

    script = """
    tagprop_path = 'model_based/tagprop/TagProp/';
    addpath(tagprop_path);
    load('%s');
    tagmatrix = h5read('%s', '/tagmatrix') > 0.5;
    tagmatrix = sparse(tagmatrix);
    NNT = h5read('%s', '/NNT');
    NNT = double(NNT);
    """ % (train_model_file, tagmatrix_file, test_neighs_file)

    if variant == 'dist' or variant == 'distsigmoids':
        script += """
        NNDT = h5read('%s', '/NNDT');
        NNDT = reshape(NNDT, 1, size(NNDT,1), size(NNDT,2));
        NNDT = double(NNDT);
        """ % test_neighs_file

    # NOTE(review): NNDT is loaded for the distance variants but the prediction
    # call below still passes [] -- confirm against the TagProp API.
    script += """
    P = tagprop_predict(NNT,[],m)';
    save('%s', '-v7.3');
    exit;
    """ % resultfile_tagprop

    makedirsforfile(resultfile_tagprop)
    call_matlab(script)

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(testCollection, annotationName, rootpath)
    id_images = readImageSet(testCollection, testset, rootpath)
    id_images.sort()
    # id_images = map(int, id_images)

    # concepts mapping: select the columns of P that correspond to `concepts`
    tagprop_output = h5py.File(resultfile_tagprop, 'r')
    try:
        tagprop_input = h5py.File(tagmatrix_file, 'r')
        try:
            mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)
            # [:] materialises the dataset in memory before the files are closed
            final_tagmatrix = tagprop_output['P'][:][:, mapping]
        finally:
            tagprop_input.close()
    finally:
        tagprop_output.close()

    # binary mode is required for pickle.HIGHEST_PROTOCOL
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts':concepts, 'id_images':id_images, 'scores':final_tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
trainAnnotationName = 'conceptsvoc2008train.txt' testCollection = 'voc2008val' testset = testCollection testAnnotationName = 'conceptsvoc2008val.txt' modelName = 'fik50' #modelName = 'fastlinear' if 'fastlinear' == modelName: from fastlinear.fastlinear import fastlinear_load_model as load_model else: from fiksvm.fiksvm import fiksvm_load_model as load_model scorer = getScorer(metric) imset = readImageSet(testCollection,testset,rootpath=rootpath) concepts = readConcepts(testCollection,testAnnotationName,rootpath=rootpath) feat_dir = os.path.join(rootpath, testCollection, "FeatureData", feature) feat_file = BigFile(feat_dir) _renamed, _vectors = feat_file.read(imset) nr_of_images = len(_renamed) nr_of_concepts = len(concepts) mAP = 0.0 models = [None] * len(concepts) stream = StreamFile(feat_dir) for i,concept in enumerate(concepts): model_file_name = os.path.join(rootpath,trainCollection,'Models',trainAnnotationName,feature, modelName, '%s.model'%concept)
def process(options, trainCollection, trainAnnotationName, feature):
    """Train one histogram-intersection SVM per concept and save it as a fiksvm model.

    Reads ``rootpath``, ``overwrite``, ``numjobs``, ``job``, ``nr_bins`` and
    ``best_param_dir`` from *options*.  When ``best_param_dir`` is set, C and the
    sigmoid parameters A/B are parsed from ``<best_param_dir>/<concept>.txt``;
    otherwise C=1, A=0, B=0.  Concepts are distributed over jobs by index modulo
    ``numjobs``.

    Returns the number of concepts handled by this job (0 if nothing to do).
    Raises ValueError when a parameter file cannot be parsed.
    """
    import re
    # Values are written with '%g', which may use exponent notation
    # (e.g. a=-1.5e-05), so digits, sign, dot and e/E must all be accepted.
    p = re.compile(
        r'best_C=(?P<C>[-+.\deE]+),\sa=(?P<a>[-+.\deE]+),\sb=(?P<b>[-+.\deE]+)')

    rootpath = options.rootpath
    overwrite = options.overwrite
    #autoweight = options.autoweight
    numjobs = options.numjobs
    job = options.job
    nr_bins = options.nr_bins
    best_param_dir = options.best_param_dir
    beta = 0.5

    modelName = 'fik%d' % nr_bins
    if best_param_dir:
        modelName += '-tuned'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # keep only the share of work assigned to this job (job is 1-based)
    todo = [concept for i, concept in enumerate(todo) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    feat_file = BigFile(feat_dir)
    params = {'nr_bins': nr_bins}
    with open(os.path.join(feat_dir, 'minmax.txt'), 'r') as f:
        params['min_vals'] = [float(v) for v in f.readline().split()]
        params['max_vals'] = [float(v) for v in f.readline().split()]

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            with open(param_file) as f:
                first_line = f.readline().strip()
            m = p.search(first_line)
            if m is None:
                raise ValueError('cannot parse best parameters from %s: %r' % (param_file, first_line))
            C = float(m.group('C'))
            A = float(m.group('a'))
            B = float(m.group('b'))
        else:
            C = 1
            A = 0
            B = 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[x] for x in renamed]

        # class weights: with beta = 0.5 both classes get equal total weight
        nr_pos = len([1 for lab in labels if 1 == lab])
        nr_neg = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (nr_pos + nr_neg) / nr_pos
        wn = (1.0 - beta) * (nr_pos + nr_neg) / nr_neg

        svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
        model = svm_train(y, vectors, svm_params + ' -s 0 -t %d -q' % KERNEL_TYPE.index("HI"))
        newmodel = svm_to_fiksvm([model], [1.0], feat_file.ndims, params)
        newmodel.set_probAB(A, B)

        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fiksvm_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check on what was actually stored
        reloaded = fiksvm_load_model(model_file_name)
        assert (reloaded is not None)
        assert (abs(reloaded.get_probAB()[0] - A) < 1e-6)
        assert (abs(reloaded.get_probAB()[1] - B) < 1e-6)

    return len(todo)
def process(options, testCollection, trainCollection, annotationName, tagrelMethod, tagfeature):
    """Convert an ``id.tagvotes.txt`` file into an l1-normalised binary feature.

    Each input line is ``<id> <tag> <score> <tag> <score> ...`` and must carry
    one score for every concept of *annotationName*.  The first occurrence of
    an id wins; later duplicates are ignored.  Writes ``feature.bin``
    (float32), ``id.txt`` and ``shape.txt`` under
    ``<rootpath>/<testCollection>/FeatureData/<tagfeature>``.

    Exits with status 0 when the output already exists (and ``overwrite`` is
    off) or when the input file is missing.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    nr_of_concepts = len(concepts)
    mapping = dict(zip(concepts, range(nr_of_concepts)))

    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', tagfeature)
    binary_file = os.path.join(feat_dir, 'feature.bin')
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')

    if checkToSkip(binary_file, overwrite):
        sys.exit(0)

    inputfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, tagrelMethod, 'id.tagvotes.txt')
    if not os.path.exists(inputfile):
        printError(INFO, '%s does not exist' % inputfile)
        sys.exit(0)

    makedirsforfile(binary_file)

    processed = set()
    imset = []
    count_line = 0

    with open(inputfile) as fin, open(binary_file, 'wb') as fw:
        for line in fin:
            count_line += 1
            elems = line.strip().split()
            if not elems:
                continue  # tolerate blank lines
            name = elems[0]
            if name in processed:
                continue
            processed.add(name)

            del elems[0]
            assert(len(elems) == 2 * nr_of_concepts)
            vec = [0] * nr_of_concepts

            for i in range(0, len(elems), 2):
                tag = elems[i]
                idx = mapping[tag]
                score = float(elems[i+1])
                vec[idx] = score

            s = float(sum(vec))
            if s != 0:
                # l_1 normalized
                vec = [x/s for x in vec]
            # an all-zero vector cannot be normalised; it is stored as zeros
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
            imset.append(name)

    with open(id_file, 'w') as fw:
        fw.write(' '.join(imset))

    with open(shape_file, 'w') as fw:
        fw.write('%d %d' % (len(imset), nr_of_concepts))

    print ('%d lines parsed, %d ids ->  %d unique ids' % (count_line, len(processed), len(imset)))
def process(options, trainCollection, trainAnnotationName, valCollection,
            valAnnotationName, feature, modelName):
    """Grid-search the SVM cost C per concept on a validation set.

    For every concept assigned to this job, a model is trained for each C in
    {1e-2 .. 1e4}, the validation images are ranked by its predictions and the
    ranking is scored with ``options.metric``.  The best C, together with the
    sigmoid parameters fitted by ``sigmoid_train`` on the best ranking, is
    written to ``<resultdir>/<concept>.txt``.

    *modelName* must be ``'fik'`` or ``'fastlinear'``.  Returns 0 when there is
    nothing to do; otherwise returns None.
    """
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection, trainAnnotationName,
                            rootpath=rootpath)
    valConcepts = readConcepts(valCollection, valAnnotationName,
                               rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert (concepts[i] == valConcepts[i])

    resultdir = os.path.join(
        rootpath, trainCollection, 'Models', trainAnnotationName,
        '%s,best_params' % modelName,
        '%s,%s,%s' % (valCollection, valAnnotationName, feature))

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # keep only the share of work assigned to this job (job is 1-based)
    todo = [concept for i, concept in enumerate(todo)
            if i % numjobs == (job - 1)]
    printStatus(INFO,
                'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(
        os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(
        os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert (feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection,
                                            trainAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        # class weights: with beta = 0.5 both classes get equal total weight
        nr_pos = len([1 for lab in labels if 1 == lab])
        nr_neg = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (nr_pos + nr_neg) / nr_pos
        wn = (1.0 - beta) * (nr_pos + nr_neg) / nr_neg

        names, labels = readAnnotationsFrom(valCollection,
                                            valAnnotationName,
                                            concept,
                                            skip_0=True,
                                            rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            #print modelName, '>'*20, svm_params
            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i]))
                        for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            # Always keep the first ranking, so that a concept whose score is
            # 0.0 for every C still has scores/labels for sigmoid_train.
            if best_scores is None or max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        printStatus(
            INFO,
            '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' %
            (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        with open(resultfile, 'w') as fw:
            fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' %
                     (max_perf, best_C, A, B))
def process(options, trainCollection, trainAnnotationName, valCollection, valAnnotationName, feature, modelName):
    """Pick the best SVM cost C per concept using a validation collection.

    Trains one model per C in {1e-2 .. 1e4}, ranks the validation images by
    predicted score, evaluates the ranking with ``options.metric`` and stores
    the best C plus the sigmoid parameters returned by ``sigmoid_train`` in
    ``<resultdir>/<concept>.txt``.

    *modelName* must be ``'fik'`` or ``'fastlinear'``.  Returns 0 when no
    concept is left to process; otherwise returns None.
    """
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1 #options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}

    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection,trainAnnotationName, rootpath=rootpath)
    valConcepts = readConcepts(valCollection,valAnnotationName, rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert(concepts[i] == valConcepts[i])

    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, '%s,best_params'%modelName, '%s,%s,%s' % (valCollection,valAnnotationName,feature))

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # job is 1-based: job j handles every todo entry whose index % numjobs == j-1
    todo = [concept for i, concept in enumerate(todo) if i%numjobs==(job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))
    val_feat_file = BigFile(os.path.join(rootpath,valCollection,'FeatureData',feature))
    feat_dim = train_feat_file.ndims
    assert(feat_dim == val_feat_file.ndims)

    for concept in todo:
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names,labels))
        renamed,vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]
        # class weights: with beta = 0.5 both classes get equal total weight
        nr_pos = len([1 for lab in labels if 1 == lab])
        nr_neg = len([1 for lab in labels if -1== lab])
        wp = float(beta) * (nr_pos+nr_neg) / nr_pos
        wn = (1.0-beta) * (nr_pos+nr_neg) / nr_neg

        names,labels = readAnnotationsFrom(valCollection, valAnnotationName, concept, skip_0=True, rootpath=rootpath)
        val_name2label = dict(zip(names,labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            #print modelName, '>'*20, svm_params
            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i])) for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v:v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            # The first ranking is always recorded; otherwise a concept scoring
            # 0.0 for every C would hand None to sigmoid_train below.
            if best_scores is None or max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        [A,B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        printStatus(INFO, '%s -> worseAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' % (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        with open(resultfile, 'w') as fw:
            fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
def process(options, trainCollection, annotationfile, feature, modelName):
    """Train one model per annotation listed in *annotationfile* and average them.

    *annotationfile* lists annotation names, one per line; blank lines and
    lines starting with ``#`` are ignored.  For each concept, a model is
    trained on every listed annotation and folded into a running average via
    ``add_fastsvm(new_model, 1-1/t, 1/t)``.  The averaged model is saved as
    ``<resultdir>/<concept>.model``.

    *modelName* must be ``'fik'`` or ``'fastlinear'``.  Returns 0 when the
    list is empty, a listed annotation is missing, or nothing is left to do;
    otherwise returns None.
    """
    assert(modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1 #options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    with open(annotationfile) as f:
        trainAnnotationNames = [x.strip() for x in f if x.strip() and not x.strip().startswith('#')]
    if not trainAnnotationNames:
        # nothing listed: the concept list below could not be read
        printStatus(INFO, 'no annotation listed in %s' % annotationfile)
        return 0

    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print('%s does not exist' % conceptfile)
            return 0

    # the concept list is taken from the first listed annotation
    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # keep only the share of work assigned to this job (job is 1-based)
    todo = [concept for i, concept in enumerate(todo) if i%numjobs==(job-1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath,trainCollection,'FeatureData',feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t in range(1, len(trainAnnotationNames)+1):
            names,labels = readAnnotationsFrom(trainCollection, trainAnnotationNames[t-1], concept, skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names,labels))
            renamed,vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]
            # class weights: with beta = 0.5 both classes get equal total weight
            nr_pos = len([1 for lab in labels if 1 == lab])
            nr_neg = len([1 for lab in labels if -1== lab])
            wp = float(beta) * (nr_pos+nr_neg) / nr_pos
            wn = (1.0-beta) * (nr_pos+nr_neg) / nr_neg

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp*C, wn*C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                # running average: old weight (t-1)/t, new model weight 1/t
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1-1.0/t, 1.0/t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    """Score every test image against all concept models and write tag votes.

    Loads one model per concept of *trainAnnotationName*, streams the test
    features and writes one line per image, ``<id> <tag> <score> ...``, with
    tags sorted by descending score (truncated to ``options.topk`` when that
    is positive).  With ``numjobs`` > 1 the output name gets a
    ``.<numjobs>.<job>`` suffix and only this job's share of images is scored.

    Returns the number of images written, or 0 when the output is skipped or a
    model fails to load.
    """
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    #blocksize = options.blocksize
    topk = options.topk

    run_name = '%s,%s' % (feature, modelName)
    if prob_output:
        run_name += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection,
                              trainCollection, trainAnnotationName, run_name, 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    # this job's share of the test images (job is 1-based)
    all_images = readImageSet(testCollection, testCollection, rootpath)
    test_imset = set(name for pos, name in enumerate(all_images) if pos % numjobs + 1 == job)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, nr_of_test_images, resultfile))

    model_dir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)
    models = []
    for concept in concepts:
        loaded = load_model(os.path.join(model_dir, '%s.model' % concept))
        if loaded is None:
            return 0
        models.append(loaded)
        #(pA,pB) = model.get_probAB()

    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0
    feat_file.open()
    for _id, _vec in feat_file:
        if _id not in test_imset:
            continue

        scores = []
        for model in models:
            if prob_output:
                scores.append(model.predict_probability(_vec))
            else:
                scores.append(model.predict(_vec))

        tagvotes = list(zip(concepts, scores))
        tagvotes.sort(key=lambda v: v[1], reverse=True)
        if topk > 0:
            tagvotes = tagvotes[:topk]

        votes_text = " ".join(["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes])
        fw.write('%s %s\n' % (_id, votes_text))

        done += 1
        if done % 1e4 == 0:
            printStatus(INFO, "%d done" % done)

    feat_file.close()
    fw.close()
    printStatus(INFO, "%d done" % (done))
    return done
import sys, os from basic.common import checkToSkip, ROOT_PATH, makedirsforfile from basic.annotationtable import readConcepts, readAnnotationsFrom, writeAnnotationsTo, writeConceptsTo from basic.data import readImageSet if __name__ == '__main__': args = sys.argv[1:] rootpath = '/var/scratch2/xirong/VisualSearch' srcCollection = args[0] annotationName = args[1] dstCollection = args[2] overwrite = 0 concepts = readConcepts(srcCollection, annotationName, rootpath) todo = [] for concept in concepts: resfile = os.path.join(rootpath, dstCollection, 'Annotations', 'Image', annotationName, '%s.txt' % concept) if checkToSkip(resfile, overwrite): continue todo.append(concept) if not todo: print('nothing to do') sys.exit(0) imset = set(readImageSet(dstCollection, dstCollection, rootpath)) for concept in todo: names, labels = readAnnotationsFrom(srcCollection, annotationName, concept,
def process(options, workingCollection, annotationName, feature, outputpkl):
    """Run RobustPCA tag refinement in Matlab and dump the result as a pickle.

    Reads ``rootpath``, ``distance``, ``overwrite``, ``kratio``, ``ratiocs``,
    ``lambda1``, ``lambda2``, ``outputonlytest`` and ``rawtagmatrix`` from
    *options*.  The Matlab step is skipped when ``prediction.mat`` already
    exists and must not be overwritten; the pickle is then still produced
    unless it exists as well (in which case 0 is returned).  Exits with status
    1 when a required input file is missing.

    With ``outputonlytest`` set, only the rows of the images in the second
    ``+``-separated part of *workingCollection* are written.
    """
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix

    modelName = "robustpca"
    nnName = distance + "knn"

    printStatus(
        INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f" %
        (workingCollection, annotationName, feature, nnName, k_ratio, lambda1,
         lambda2))

    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = os.path.join(outputpkl)
    resultfile_robustpca = os.path.join(
        rootpath, workingCollection, 'RobustPCA-Prediction',
        '%s,%s,%f,%f,%f,%d' %
        (feature, nnName, lambda1, lambda2, k_ratio, rawtagmatrix),
        'prediction.mat')

    # when the Matlab output is already there, only the pickle dump is redone
    only_dump = bool(checkToSkip(resultfile_robustpca, overwrite))

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(
            rootpath, workingCollection, 'RobustPCA',
            '%s,%s,%f' % (feature, nnName, DEFAULT_K_PROP), 'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(
                INFO,
                "Tag matrix file not found at %s Did you run robustpca_preprocessing.py?"
                % (tagmatrix_file))
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData',
                                      "lemm_wordnet_freq_tags.h5")
        if not os.path.exists(tagmatrix_file):
            printStatus(
                INFO,
                'Tag matrix file not found in %s Did you run wordnet_frequency_tags.py?'
                % (tagmatrix_file))
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI',
                                   workingCollection,
                                   '%s,%s,%f' % (feature, nnName, k_ratio),
                                   'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(
            INFO,
            "LaplacianI file not found at %s Did you run laplacian_images.py?"
            % (laplacianI_file))
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT',
                                   '%f' % (ratio_cs), 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(
            INFO,
            "LaplacianT file not found at %s Did you run laplacian_tags.py?" %
            (laplacianT_file))
        sys.exit(1)

    # being learning
    script = """
    rpca_path = 'transduction_based/robustpca/';
    addpath(rpca_path);
    addpath([rpca_path, 'fast_svd/']);
    tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));
    load('%s');
    load('%s');

    lambda1 = %f;
    lambda2 = %f;
    maxIters = 50;
    precision = 1e-4;
    mu_start = 1.;

    parpool('local', 4);
    [P,E]=robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start);
    """ % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1, lambda2)

    script += """
    delete(gcp);
    save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3');
    exit;
    """ % resultfile_robustpca

    if not only_dump:
        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        testset_id_images = readImageSet(
            workingCollection.split('+')[1],
            workingCollection.split('+')[1], rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # concepts mapping: select the columns of P that correspond to `concepts`
    robustpca_output = h5py.File(resultfile_robustpca, 'r')
    try:
        tagprop_input = h5py.File(tagmatrix_file, 'r')
        try:
            mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)
            # indexing the dataset reads the values before the files are closed
            predicted_tagmatrix = robustpca_output['P'][:, mapping]
        finally:
            tagprop_input.close()
    finally:
        robustpca_output.close()

    if outputonlytest:
        # rows of the test images inside the sorted id list of the working set
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert (final_tagmatrix.shape[0] == idx.shape[0])
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    # binary mode is required for pickle.HIGHEST_PROTOCOL
    with open(resultfile, 'wb') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': id_images,
                'scores': final_tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
def process(options, collection, annotationName, runfile):
    """Evaluate pickled score matrices listed in *runfile* and print result tables.

    *runfile* lists one pickle path per line; blank lines and lines starting
    with ``#`` are ignored.  Each pickle must hold ``scores`` (one column per
    concept of *annotationName*) and ``id_images``.  For every run this prints
    the image-centric mean AP / P@1 / hit@5, followed by per-concept tables:
    AP over all annotated images, then AP, NDCG@20 and NDCG2@20 restricted to
    the images listed in ``<collection>/tagged,lemm/<concept>.txt``.
    """
    rootpath = options.rootpath

    apscorer = getScorer('AP')
    ndcg = getScorer('NDCG@20')
    ndcg2 = getScorer('NDCG2@20')
    p1scorer = getScorer('P@1')
    p5scorer = getScorer('P@5')

    with open(runfile) as f:
        datafiles = [x.strip() for x in f if x.strip() and not x.strip().startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = [{} for i in range(nr_of_concepts)]
    hit_imgset = [[] for i in range(nr_of_concepts)]
    rel_conset = {}  # image id -> set of indices of concepts relevant to it

    for i in range(nr_of_concepts):
        names,labels = readAnnotationsFrom(collection, annotationName, concepts[i], skip_0=False, rootpath=rootpath)
        names = list(map(int, names))  # materialised: iterated twice below
        name2label[i] = dict(zip(names,labels))

        for im,lab in zip(names,labels):
            if lab > 0:
                rel_conset.setdefault(im,set()).add(i)

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt'% concepts[i])
        try:
            with open(label_file) as f:
                hit_imgset[i] = set(map(int, f))
        except (IOError, OSError, ValueError):
            # missing or malformed label file: treat the concept as having no hits
            hit_imgset[i] = set()
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concepts[i], len(hit_imgset[i])))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg_table = np.zeros((nr_of_runs, nr_of_concepts))
    ndcg2_table = np.zeros((nr_of_runs, nr_of_concepts))

    print('#'*100)
    print('# method miap hit1 hit5')
    print('#'*100)

    for run_idx in range(nr_of_runs):
        # NOTE(review): pickle.load executes arbitrary code; only list trusted files in runfile
        with open(datafiles[run_idx],'rb') as f:
            data = pickle.load(f)
        scores = data['scores']
        assert(scores.shape[1] == nr_of_concepts)
        imset = data['id_images']
        nr_of_images = len(imset)
        #print datafiles[run_idx], imset[:5], imset[-5:]

        for c_idx in range(nr_of_concepts):
            ground_truth = name2label[c_idx]
            # descending score, ties broken by descending str(id)
            ranklist = sorted(zip(imset, scores[:,c_idx]), key=lambda v:(v[1], str(v[0])), reverse=True)
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert(len(sorted_labels)>0)
            #print concepts[c_idx], ranklist[:5], sorted_labels[:5]

            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            sorted_labels = [ground_truth[x[0]] for x in ranklist if x[0] in hit_imgset[c_idx]]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)
            ndcg_table[run_idx, c_idx] = ndcg.score(sorted_labels)
            ndcg2_table[run_idx, c_idx] = ndcg2.score(sorted_labels)

        # image-centric evaluation: rank the concepts for every image
        res = np.zeros((nr_of_images, 3))
        for j in range(nr_of_images):
            ranklist = sorted(zip(range(nr_of_concepts), scores[j,:]), key=lambda v:v[1], reverse=True)
            rel_set = rel_conset.get(imset[j], set())
            sorted_labels = [int(x[0] in rel_set) for x in ranklist]
            ap = apscorer.score(sorted_labels)
            hit1 = p1scorer.score(sorted_labels)
            hit5 = p5scorer.score(sorted_labels) > 0.1
            res[j,:] = [ap, hit1, hit5]
        avg_perf = res.mean(axis=0)
        print('%s %s' % (os.path.split(datafiles[run_idx])[-1], ' '.join(['%.3f' % x for x in avg_perf])))

    print('#'*100)
    print('%s %s' % ('# untagged-concept', ' '.join([os.path.split(x)[-1] for x in datafiles])))
    print('#'*100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ap_table[:,c_idx]])))
    print('%s %s' % ('meanAP', ' '.join(['%.3f' % x for x in ap_table.mean(axis=1)])))

    print('#'*100)
    print('# tagged-concept')
    print('#'*100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ap2_table[:,c_idx]])))
    print('%s %s' % ('meanAP2', ' '.join(['%.3f' % x for x in ap2_table.mean(axis=1)])))

    print('#'*100)
    print('# tagged-concept')
    print('#'*100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg_table[:,c_idx]])))
    print('%s %s' % ('mean%s' % ndcg.name(), ' '.join(['%.3f' % x for x in ndcg_table.mean(axis=1)])))

    print('#'*100)
    print('# tagged-concept')
    print('#'*100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], ' '.join(['%.3f' % x for x in ndcg2_table[:,c_idx]])))
    print('%s %s' % ('mean%s'%ndcg2.name(), ' '.join(['%.3f' % x for x in ndcg2_table.mean(axis=1)])))
def process(options, collection, annotationName, runfile):
    """Score tagging runs by per-concept AP and save their ranking files.

    ``runfile`` is a text file listing one pickled run per line; blank lines
    and lines starting with ``#`` are ignored. Each pickle must hold a dict
    with ``'scores'`` (2-D, indexed ``[image, concept]``, second dimension
    equal to the number of concepts) and ``'id_images'`` (image ids aligned
    with the rows of ``'scores'``).

    For every run and concept the images are ranked by descending score and
    limited to images that carry an annotation for the concept. The ranking
    is written to ``<resultdir>/<run>/<concept>.txt`` and its restriction to
    the concept's ``tagged,lemm`` images to
    ``<resultdir>/tagged,lemm/<run>/<concept>.txt``; each write happens only
    when ``checkToSkip`` returns a false value for that file. Two AP tables
    (all annotated images, tagged images only) are printed at the end.

    Args:
        options: object providing ``rootpath`` and ``overwrite``.
        collection: name of the collection under ``rootpath``.
        annotationName: annotation set used to read concepts and labels.
        runfile: path of the text file listing the pickled runs.

    Raises:
        AssertionError: if a run's score matrix does not have one column per
            concept, or a concept has no annotated image in a run.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    resultdir = os.path.join(rootpath, collection, 'SimilarityIndex', collection)

    apscorer = getScorer('AP')

    def fmt(values):
        # Space-separated, three-decimal rendering shared by every table row.
        return ' '.join(['%.3f' % x for x in values])

    # Read the run list inside a context manager so the handle is closed.
    with open(runfile) as fin:
        stripped = [line.strip() for line in fin]
    datafiles = [x for x in stripped if x and not x.startswith('#')]
    nr_of_runs = len(datafiles)

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    printStatus(INFO, 'read annotations from files')

    name2label = []   # per concept: image id -> label
    hit_imgset = []   # per concept: image ids listed in its 'tagged,lemm' file

    for concept in concepts:
        names, labels = readAnnotationsFrom(collection, annotationName, concept,
                                            skip_0=False, rootpath=rootpath)
        names = [int(x) for x in names]
        name2label.append(dict(zip(names, labels)))

        label_file = os.path.join(rootpath, collection, 'tagged,lemm', '%s.txt' % concept)
        # Best effort: a missing or malformed file means "no tagged images".
        try:
            with open(label_file) as fin:
                hits = set(int(line) for line in fin)
        except (IOError, OSError, ValueError):
            hits = set()
        hit_imgset.append(hits)
        printStatus(INFO, 'readLabeledImageSet for %s-%s -> %d hits' % (collection, concept, len(hits)))

    ap_table = np.zeros((nr_of_runs, nr_of_concepts))
    ap2_table = np.zeros((nr_of_runs, nr_of_concepts))

    for run_idx, datafile in enumerate(datafiles):
        runName = os.path.splitext(os.path.basename(datafile))[0]
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only feed run files from trusted sources.
        with open(datafile, 'rb') as fin:
            data = pickle.load(fin)
        scores = data['scores']
        assert scores.shape[1] == nr_of_concepts, \
            'score matrix of %s does not have one column per concept' % datafile
        imset = data['id_images']

        for c_idx in range(nr_of_concepts):
            concept = concepts[c_idx]
            ground_truth = name2label[c_idx]
            tagged = hit_imgset[c_idx]
            # Descending score; ties broken by the image id rendered as a string.
            ranklist = sorted(zip(imset, scores[:, c_idx]),
                              key=lambda v: (v[1], str(v[0])), reverse=True)
            # Only images that carry an annotation for this concept are kept.
            ranklist = [x for x in ranklist if x[0] in ground_truth]

            resfile = os.path.join(resultdir, runName, '%s.txt' % concept)
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(ranklist, resfile)

            sorted_labels = [ground_truth[x[0]] for x in ranklist]
            assert len(sorted_labels) > 0, \
                'no annotated image for concept %s in %s' % (concept, datafile)
            ap_table[run_idx, c_idx] = apscorer.score(sorted_labels)

            # Restrict the ranking to tagged images once; the result feeds
            # both the saved file and the second AP table.
            tagged_ranklist = [x for x in ranklist if x[0] in tagged]

            resfile = os.path.join(resultdir, 'tagged,lemm', runName, '%s.txt' % concept)
            if not checkToSkip(resfile, overwrite):
                writeRankingResults(tagged_ranklist, resfile)

            sorted_labels = [ground_truth[x[0]] for x in tagged_ranklist]
            ap2_table[run_idx, c_idx] = apscorer.score(sorted_labels)

    # Single-argument print calls emit the same text as the Python 2
    # print statement they replace.
    print('#' * 100)
    print('%s %s' % ('# untagged-concept', ' '.join([os.path.basename(x) for x in datafiles])))
    print('#' * 100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], fmt(ap_table[:, c_idx])))
    print('%s %s' % ('meanAP', fmt(ap_table.mean(axis=1))))

    print('#' * 100)
    print('# tagged-concept')
    print('#' * 100)

    for c_idx in range(nr_of_concepts):
        print('%s %s' % (concepts[c_idx], fmt(ap2_table[:, c_idx])))
    print('%s %s' % ('meanAP2', fmt(ap2_table.mean(axis=1))))