def precompute_annotator(self, concept):
    """Grow the virtual annotator with expansions of every part of `concept`.

    The base-class annotator is computed first. Then each hyphen-separated
    part of `concept`, together with the tags `wn_expand` returns for it,
    is merged into `self.annotator`.
    """
    NegativeEngine.precompute_annotator(self, concept)
    for part in concept.split('-'):
        related_tags = [part] + wn_expand(part)
        self.annotator = self.annotator.union(related_tags)

    log_name = 'dataengine.%s' % self.__class__.__name__
    message = 'precomputing the virtual annotator for %s: %d tags' % (
        concept, len(self.annotator))
    printStatus(log_name, message)
def process(options, source_dir, feat_dim, imsetfile, result_dir):
    """Copy the feature vectors of a given image set into a new feature file.

    Image ids are read from `imsetfile` (one per line), their vectors are
    fetched from the BigFile at `source_dir` in blocks of
    `options.blocksize`, and written as float32 to `feature.bin` in
    `result_dir`. The ids actually obtained are written, space-separated,
    to `id.txt` in the same directory.

    Exits the interpreter with status 0 when checkToSkip says the result
    file should not be regenerated.
    """
    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    with open(imsetfile) as fin:
        imset = [line.strip() for line in fin]
    print('requested %d' % len(imset))

    featurefile = BigFile(source_dir, feat_dim)
    makedirsforfile(resultfile)

    done = []
    # The context manager guarantees the binary file is closed even if a
    # block read fails halfway through.
    with open(resultfile, 'wb') as fw:
        start = 0
        while start < len(imset):
            end = min(len(imset), start + options.blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))
            renamed, vectors = featurefile.read(imset[start:end])
            for vec in vectors:
                np.array(vec, dtype=np.float32).tofile(fw)
            done += renamed
            start = end

    # Every id must have been written exactly once.
    assert(len(done) == len(set(done)))

    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(done))

    print('%d requested, %d obtained' % (len(imset), len(done)))
def process(options, collection, annotationName):
    """Create a new annotation from labeled positives and sampled negatives.

    For each concept of `annotationName`, positives are read with
    readLabeledImageSet and negatives are sampled by the negative engine
    chosen through `options.neg_filter`. Concepts without positives are
    reported and left out. The new annotation name is `annotationName`
    with its last four characters replaced by 'social.txt'.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    neg_filter = options.neg_filter

    concepts = readConcepts(collection, annotationName, rootpath)
    newAnnotationName = annotationName[:-4] + 'social.txt'
    ne = STRING_TO_NEGATIVE_ENGINE[neg_filter](collection, rootpath)

    newConcepts = []
    for concept in concepts:
        resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image',
                                  newAnnotationName, '%s.txt' % concept)
        if checkToSkip(resultfile, overwrite):
            # An existing result still counts as an available concept.
            newConcepts.append(concept)
            continue

        # Best effort: a concept whose labeled set cannot be read is treated
        # as having no positives. Only ordinary errors are swallowed, so
        # Ctrl-C and SystemExit still propagate.
        try:
            pos_set = readLabeledImageSet(collection, concept, tpp='lemm', rootpath=rootpath)
        except Exception:
            pos_set = None

        if not pos_set:
            printStatus(INFO, '*** %s has not labeled examples, will be ignored ***' % concept)
            continue

        neg_set = ne.sample(concept, int(1e8))
        assert(len(set(pos_set).intersection(set(neg_set))) == 0)
        newlabels = [1] * len(pos_set) + [-1] * len(neg_set)
        newnames = pos_set + neg_set
        printStatus(INFO, "anno(%s) %d pos %d neg -> %s" % (concept, len(pos_set), len(neg_set), resultfile))
        writeAnnotations(newnames, newlabels, resultfile)
        newConcepts.append(concept)

    writeConceptsTo(newConcepts, collection, newAnnotationName, rootpath)
def __init__(self, datafile, k=5, ndims=0, language='en', L1_normalize=0, L2_normalize=0):
    """Load the word vocabulary and the per-word similarity lists.

    `datafile` holds one vocabulary word per line. The similarity file
    `word_vocab_soft_5.txt` is looked up in the same directory; each of its
    lines is `word w1 s1 w2 s2 ...` and is stored in `self.words_simi` as a
    list of `(word, float score)` pairs.

    When `ndims` is non-zero it must equal the vocabulary size; otherwise
    `self.ndims` is set to the vocabulary size.
    """
    Text2Vec.__init__(self, datafile, k, ndims, language, L1_normalize, L2_normalize)

    with open(datafile) as fin:
        word_vob = [line.strip() for line in fin]
    self.word2index = dict(zip(word_vob, range(len(word_vob))))

    # os.path.dirname copes with a bare file name (no '/'), where the
    # previous rsplit-based code returned the file name itself.
    self.soft_file = os.path.join(os.path.dirname(datafile), 'word_vocab_soft_5.txt')

    self.words_simi = {}
    with open(self.soft_file) as fin:
        for line in fin:
            word, s_sim = line.strip().split(' ', 1)
            w_s = s_sim.split(' ')
            assert len(w_s) % 2 == 0
            self.words_simi[word] = [(w_s[i], float(w_s[i + 1]))
                                     for i in range(0, len(w_s), 2)]

    if ndims != 0:
        assert len(word_vob) == self.ndims, \
            "feat dimension is not match %d != %d" % (len(word_vob), self.ndims)
    else:
        self.ndims = len(word_vob)

    printStatus(INFO + '.' + self.__class__.__name__, "%d words" % self.ndims)
def precompute(self, concept):
    """Collect the candidate images for `concept`.

    An image from `self.data` is kept when it is not in `self.tabooImset`,
    none of its tags is in the annotator computed for `concept`, and it is
    a member of `self.imset`. Each data line is `imageid tag tag ...`.
    """
    self.precompute_annotator(concept)

    untagged = []
    for line in self.data:
        elems = str.split(line)
        imageid = elems[0]
        if imageid in self.tabooImset:
            continue
        if any(tag in self.annotator for tag in elems[1:]):
            continue
        untagged.append(imageid)

    self.candidateset = [imageid for imageid in untagged if imageid in self.imset]
    self.target = concept

    log_name = 'dataengine.%s' % self.__class__.__name__
    printStatus(log_name, "%d candidates for %s" % (self.getCount(concept), concept))
def __init__(self, model_path, weight_path):
    """Restore a trained model from its JSON architecture and weight file.

    model_path:  file containing the JSON passed to model_from_json.
    weight_path: file passed to the model's load_weights.
    """
    # Read the architecture through a context manager so the handle is closed.
    with open(model_path) as fin:
        self.model = model_from_json(fin.read())
    self.model.load_weights(weight_path)
    # Any loss and optimizer are OK here.
    self.model.compile(loss='mse', optimizer='sgd')
    printStatus(INFO + '.' + self.__class__.__name__,
                'loaded a trained Word2VisualVec model successfully')
def __init__(self, trainCollection, tpp="lemm", feature="color64+dsift", k=1000, rootpath=ROOT_PATH):
    """Set up the vocabulary, kernel parameter and tag-similarity helper.

    The vocabulary is read from `TextData/wn.<trainCollection>.txt` under
    `rootpath/trainCollection`. `self.gamma` is the squared inverse of
    `MEDIAN_DISTANCE[feature]`, and `self.dim` comes from `FEATURE_TO_DIM`.
    """
    self.trainCollection = trainCollection
    self.k = k
    self.name = "%s(%s,%s,%s,%d)" % (self.__class__.__name__, self.trainCollection, tpp, feature, k)

    vobfile = os.path.join(rootpath, trainCollection, "TextData", "wn.%s.txt" % trainCollection)
    # Context manager closes the vocabulary file deterministically.
    with open(vobfile) as fin:
        self.vob = set(line.strip() for line in fin)
    printStatus(INFO, 'the vocabulary of %s contains %d tags' % (trainCollection, len(self.vob)))

    self.gamma = (1.0 / MEDIAN_DISTANCE[feature]) ** 2
    self.feat_dir = os.path.join(rootpath, trainCollection, 'FeatureIndex', feature)
    self.dim = FEATURE_TO_DIM[feature]
    self.fcs = FlickrContextSim(trainCollection, rootpath)
    printStatus(INFO, self.name + ' okay')
def classify_large_data(model, imset, feat_file, prob_output=False, blocksize=DEFAULT_BLOCK_SIZE):
    """Score the images of `imset` block by block and return them ranked.

    Features are read from `feat_file` in chunks of `blocksize` ids. Each
    vector is scored with `model.predict_probability` when `prob_output`
    is true, else with `model.predict`. Returns a list of `(id, score)`
    pairs sorted in descending order of `(score, id)`.
    """
    results = []
    # Timings are accumulated but not reported.
    read_time = 0.0
    test_time = 0.0

    total = len(imset)
    start = 0
    while start < total:
        end = min(total, start + blocksize)
        printStatus(INFO, 'classifying images from %d to %d' % (start, end - 1))

        tick = time.time()
        renamed, vectors = feat_file.read(imset[start:end])
        read_time += time.time() - tick

        tick = time.time()
        scores = []
        for i in range(len(renamed)):
            if prob_output:
                scores.append(model.predict_probability(vectors[i]))
            else:
                scores.append(model.predict(vectors[i]))
        test_time += time.time() - tick

        results.extend(zip(renamed, scores))
        start = end

    results.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return results
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH, k=DEFAULT_K):
    """Prepare a tagger that votes over `k` neighbours from a knn directory.

    Loads the concepts and the image set of `collection`, records the
    relative knn directory `<collection>/<feature>,<distance>knn,1500`,
    and loads the tag data through `_load_tag_data`.
    """
    self.rootpath = rootpath
    self.k = k
    self.noise = 0

    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(
        (concept, pos) for pos, concept in enumerate(self.concepts))

    self.imset = readImageSet(collection, collection, rootpath)
    self.nr_of_images = len(self.imset)

    knn_name = '%s,%sknn,1500' % (feature, distance)
    self.knndir = os.path.join(collection, knn_name)

    self._load_tag_data(collection, tpp, rootpath)

    summary = "%s, %d images, %d unique tags, %s %d neighbours for voting" % (
        self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k)
    printStatus(INFO, summary)
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    """Prepare a tagger backed by a simpleknn searcher.

    Reads `<nr_of_images> <feat_dim>` from `shape.txt` in the feature
    directory, loads `feature.bin` with `simpleknn.load_model`, sets the
    requested `distance`, and loads the tag data through `_load_tag_data`.
    """
    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))

    feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')
    # Close the shape file as soon as its single line has been parsed.
    with open(shape_file) as fin:
        self.nr_of_images, feat_dim = map(int, fin.readline().split())

    self.searcher = simpleknn.load_model(os.path.join(feat_dir, 'feature.bin'),
                                         feat_dim, self.nr_of_images, id_file)
    self.searcher.set_distance(distance)
    self.k = DEFAULT_K

    self._load_tag_data(collection, tpp, rootpath)
    printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (
        self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    """Prepare a tagger backed by the searcher returned by `load_model`.

    Reads `<nr_of_images> <feat_dim>` from `shape.txt` in the feature
    directory and loads the searcher with 512 segments, segmentk=256 and
    coarsek=4096. Tag data is loaded through `_load_tag_data`.
    """
    self.rootpath = rootpath
    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))

    featuredir = os.path.join(rootpath, collection, 'FeatureData', feature)
    # The shape file lives in the feature directory. The previous code
    # referenced an undefined name `feat_dir` here.
    shape_file = os.path.join(featuredir, 'shape.txt')
    with open(shape_file) as fin:
        self.nr_of_images, feat_dim = map(int, fin.readline().split())

    self.searcher = load_model(featuredir, self.nr_of_images, feat_dim,
                               nr_of_segments=512, segmentk=256, coarsek=4096)
    self.k = DEFAULT_K

    self._load_tag_data(collection, tpp, rootpath)
    printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (
        self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))
def cal_perf_t2i(prediction_file, verbose=1):
    """Evaluate text-to-image retrieval from a prediction file.

    Each line is `sentid imageid score imageid score ...`. A ranked image
    counts as a hit when `sentid` starts with its id. Recall at 1, 5 and 10
    is scored on the first 20 ranks of every line, and the rank of the
    first hit is recorded for every line that has one.

    Returns `(recall_name, recall_score, med_r, mean_r, mean_invert_r)`;
    the same values are printed when `verbose` is 1.
    """
    scorers = [RecallScorer(k) for k in [1, 5, 10]]
    res = [0] * len(scorers)
    nr_of_sents = 0
    nr_of_images = 0
    first_matched_idexs = []

    with open(prediction_file) as fin:
        for line in fin:
            elems = line.strip().split()
            sentid = elems[0]
            del elems[0]
            assert (len(elems) % 2 == 0)
            imageids = elems[::2]
            nr_of_images = len(imageids)
            # Count the sentence; the counter used to stay at 0.
            nr_of_sents += 1

            hit_list = []
            matched = False
            for rank, imageid in enumerate(imageids, 1):
                if sentid.startswith(imageid):
                    hit_list.append(1)
                    if not matched:
                        first_matched_idexs.append(rank)
                        matched = True
                else:
                    hit_list.append(0)
                # Keep scanning past rank 20 only until the first hit is known.
                if len(hit_list) > 20 and matched:
                    break

            hit_list = hit_list[:20]  # consider at most the first 20 predictions
            perf = [scorer.score(hit_list) for scorer in scorers]
            res = [res[i] + perf[i] for i in range(len(scorers))]

    printStatus(INFO, 'nr of sentences: %d' % nr_of_sents)
    printStatus(INFO, 'nr of images: %d' % nr_of_images)

    res = [x / nr_of_images for x in res]
    recall_name = ' '.join([x.name() for x in scorers])
    recall_score = ' '.join(['%.3f' % x for x in res])

    assert len(first_matched_idexs) == nr_of_images
    # Floor division keeps the list index an integer.
    med_r = sorted(first_matched_idexs)[nr_of_images // 2 - 1]
    mean_r = np.mean(first_matched_idexs)
    mean_invert_r = np.mean([1.0 / rank for rank in first_matched_idexs])

    if verbose == 1:
        print(recall_name)
        print(recall_score)
        print('Med r:  %s' % (med_r,))
        print('Mean r:  %s' % (mean_r,))
        print('mean inverted r:  %s' % (round(mean_invert_r, 3),))

    return (recall_name, recall_score, med_r, mean_r, mean_invert_r)
def sampling(predictions, strategy, n):
    """Pick the ids of `n` instances from `predictions`.

    `predictions` is a sequence whose items carry an id at index 0 and a
    score at index 1. With the 'toprand' strategy every score is multiplied
    by a random factor drawn from [0.9, 1] before taking the top `n` by
    score. Any other strategy returns the first `n` ids in the given order.
    """
    printStatus(INFO, '%s sampling: %d out of %d instances' % (strategy, n, len(predictions)))

    if strategy != 'toprand':
        return [item[0] for item in predictions[:n]]

    jittered = [(item[0], item[1] * random.uniform(0.9, 1)) for item in predictions]
    ranked = sorted(jittered, key=lambda pair: pair[1], reverse=True)
    return [pair[0] for pair in ranked[:n]]
def process(options, collection, annotationName, pos_num):
    """Generate randomly sampled positive/negative bags as new annotations.

    For every combination of `options.pos_bag_num` positive bags and
    `options.neg_bag_num` negative bags, a new annotation named
    `<base>.random<pos_num>.<idxp>.npr<neg_pos_ratio>.<idxn>.txt` is
    created, and the list of all new names is written to a file under
    `annotationfiles`. Per concept, each positive bag holds at most
    `pos_num` random positives. Each negative bag holds
    `max(len(positiveBag) * neg_pos_ratio, 1000)` random negatives, capped
    at the number of available negatives.

    Returns 0 when every annotation was skipped, otherwise None.
    """
    assert(annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    # The two remaining '%d' slots are filled with (idxp, idxn) below.
    annotationNameStr = (annotationName[:-4] + ('.random%d' % pos_num) + '.%d'
                         + ('.npr%d' % neg_pos_ratio) + '.%d.txt')

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)
    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath, collection, 'Annotations', newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts, resultfile)

    first, second, last = annotationNameStr.split('%d')
    scriptfile = os.path.join(rootpath, collection, 'annotationfiles',
                              first + '0-%d' % (pos_bag_num - 1) + second + '0-%d' % (neg_bag_num - 1) + last)
    makedirsforfile(scriptfile)
    with open(scriptfile, 'w') as fout:
        fout.write('\n'.join(newAnnotationNames) + '\n')

    if len(newAnnotationNames) == skip:
        return 0

    for concept in concepts:
        names, labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        positivePool = [name for name, label in zip(names, labels) if label > 0]
        negativePool = [name for name, label in zip(names, labels) if label < 0]

        for idxp in range(pos_bag_num):
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool

            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image',
                                          newAnnotationName, '%s.txt' % concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue

                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)

                assert(len(set(positiveBag).intersection(set(negativeBag))) == 0)

                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (
                    concept, anno_idx, len(positiveBag), len(negativeBag), resultfile))
                writeAnnotations(positiveBag + negativeBag,
                                 [1] * len(positiveBag) + [-1] * len(negativeBag),
                                 resultfile)
def __init__(self, corpus, modelName, wnid2words_file='data/wnid2words.pkl', rootpath=ROOT_PATH):
    """Load the wnid-to-words mapping and open the word2vec feature file.

    `wnid2words_file` is a pickle that is loaded into `self.wnid2words`.
    The word2vec BigFile is opened from `rootpath/corpus/word2vec/modelName`.
    """
    printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')
    word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', modelName)
    # NOTE(review): pickle.load executes arbitrary code from the file;
    # only point this at trusted data.
    with open(wnid2words_file, 'rb') as fin:
        self.wnid2words = pickle.load(fin)
    self.word2vec = BigFile(word2vec_dir)
def precompute_annotator(self, concept):
    """Build the annotator for `concept`, including expanded tags.

    Starts from the annotator of the base class and adds, for each piece
    obtained by splitting `concept` on '-', the piece itself and the tags
    `wn_expand` yields for it.
    """
    NegativeEngine.precompute_annotator(self, concept)

    for piece in concept.split('-'):
        extra_tags = set([piece] + wn_expand(piece))
        self.annotator = self.annotator.union(extra_tags)

    tag_count = len(self.annotator)
    source = 'dataengine.%s' % self.__class__.__name__
    printStatus(source,
                'precomputing the virtual annotator for %s: %d tags' % (concept, tag_count))
def __init__(self, datafile, ndims=0, L1_normalize=0, L2_normalize=0):
    """Store the common text-to-vector settings.

    `L1_normalize` and `L2_normalize` must be plain ints and at most one
    of them may be switched on (their sum must not exceed 1).
    """
    printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')

    self.datafile, self.ndims = datafile, ndims
    self.L1_normalize, self.L2_normalize = L1_normalize, L2_normalize

    for flag in (L1_normalize, L2_normalize):
        assert type(flag) == int
    assert (L1_normalize + L2_normalize) <= 1
def sampling(predictions, strategy, n):
    """Return the ids of `n` sampled instances.

    Items of `predictions` hold an id at index 0 and a score at index 1.
    For 'toprand', scores are scaled by a random factor in [0.9, 1] and the
    `n` highest are taken. For any other strategy the first `n` items are
    taken as they come.
    """
    printStatus(
        INFO,
        '%s sampling: %d out of %d instances' % (strategy, n, len(predictions)))

    if 'toprand' == strategy:
        perturbed = []
        for entry in predictions:
            perturbed.append((entry[0], entry[1] * random.uniform(0.9, 1)))
        perturbed.sort(key=lambda entry: entry[1], reverse=True)
        chosen = perturbed[:n]
    else:
        chosen = predictions[:n]

    return [entry[0] for entry in chosen]
def process(options, testCollection):
    """Tag every image of `testCollection` with a zero-shot tagger.

    Features named `options.pY0` are embedded with an Image2Vec built on
    the `options.Y0` synset vectors, then labelled by a ZeroshotTagger
    built on the `options.Y1` synset vectors. Each image gets the top
    `options.r` predictions, written as `id tag score tag score ...`
    lines to `id.tagvotes.txt`.

    Returns 0 when the result file is skipped, otherwise None.
    """
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding_model = options.embedding
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    blocksize = 2000

    embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)
    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection,
                           embedding_name, pY0, 'id.tagvotes.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, embedding_name)
    i2v = Image2Vec(label_file, label2vec_dir)

    tagger = ZeroshotTagger(synset_name=Y1, embedding_name=embedding_name, rootpath=rootpath)

    imset = readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)

    printStatus(INFO, 'tagging %d images' % len(imset))
    makedirsforfile(resfile)

    # The context manager closes the result file even if tagging fails.
    with open(resfile, 'w') as fw:
        start = 0
        while start < len(imset):
            end = min(len(imset), start + blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end))
            renamed, vectors = feat_file.read(imset[start:end])

            output = []
            for _id, _vec in zip(renamed, vectors):
                im_vec = i2v.embedding(_vec)
                pred = tagger.predict(im_vec, topk=r)
                output.append('%s %s\n' % (_id, ' '.join(['%s %s' % (x[0], x[1]) for x in pred])))

            fw.write(''.join(output))
            start = end
def __init__(self, datafile, ndims=0, L1_normalize=0, L2_normalize=0):
    """Load the word vocabulary from `datafile` (one word per line).

    Builds `self.word2index`, mapping each word to its line position. When
    `ndims` is non-zero it must equal the vocabulary size; otherwise
    `self.ndims` is set to the vocabulary size.
    """
    Text2Vec.__init__(self, datafile, ndims, L1_normalize, L2_normalize)

    # Read through a context manager so the vocabulary file is closed.
    with open(datafile) as fin:
        word_vob = [line.strip() for line in fin]
    self.word2index = dict(zip(word_vob, range(len(word_vob))))

    if ndims != 0:
        assert len(word_vob) == self.ndims, \
            "feat dimension is not match %d != %d" % (len(word_vob), self.ndims)
    else:
        self.ndims = len(word_vob)

    printStatus(INFO + '.' + self.__class__.__name__, "%d words" % self.ndims)
def __init__(self, collection, tpp="lemm", rootpath=ROOT_PATH):
    """Load the per-photo tags of `collection`.

    The data file `TextData/id.userid.<tpp>tags.txt` has three
    tab-separated fields per line: photo id, user id and tags.
    `self.photoid2tags` maps the photo id to the raw tag field, which
    still carries its line terminator. `self.vob` is the set of all
    whitespace-separated tags.
    """
    self.name = "%s(%s,%s)" % (self.__class__.__name__, collection, tpp)
    self.photoid2tags = {}
    # Collect tags straight into a set instead of first building a list
    # of every tag occurrence in the collection.
    self.vob = set()

    datafile = os.path.join(rootpath, collection, "TextData", "id.userid.%stags.txt" % tpp)
    with open(datafile) as fin:
        for line in fin:
            photoid, userid, tags = line.split("\t")
            self.photoid2tags[photoid] = tags
            self.vob.update(str.split(tags))

    printStatus(self.name, "%d images, %d unique tags" % (len(self.photoid2tags), len(self.vob)))
def submit(searchers, collection, annotationName, rootpath=ROOT_PATH, overwrite=0):
    """Run every searcher on every concept and store the ranking results.

    For each concept of `annotationName` and each searcher, the result goes
    to `<searcher output dir>/<concept>.txt`, unless checkToSkip says that
    file should be left alone.
    """
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    for concept in concepts:
        for searcher in searchers:
            resultfile = os.path.join(searcher.getOutputdir(), concept + ".txt")
            if checkToSkip(resultfile, overwrite):
                continue
            ranking = searcher.scoreCollection(concept)
            print("%s: %s %d -> %s" % (searcher.name, concept, len(ranking), resultfile))
            writeRankingResults(ranking, resultfile)

    printStatus('%s.submit' % os.path.basename(__file__), "done")
def __init__(self, collection, rootpath=ROOT_PATH):
    """Load the image set of `collection`, excluding the holdout images.

    Image ids come from `ImageSets/<collection>.txt` and holdout ids from
    `ImageSets/holdout.txt`. `self.candidateset` is the sorted list of the
    remaining ids. The status line reports the sizes before the holdout
    set is removed.
    """
    self.name = '%s.%s' % (self.__class__.__name__, collection)

    imsetfile = os.path.join(rootpath, collection, "ImageSets", "%s.txt" % collection)
    with open(imsetfile) as fin:
        self.imset = set(line.strip() for line in fin)

    holdoutfile = os.path.join(rootpath, collection, "ImageSets", "holdout.txt")
    with open(holdoutfile) as fin:
        holdoutSet = set(line.strip() for line in fin)

    printStatus(self.name, '%d examples, %d holdout' % (len(self.imset), len(holdoutSet)))

    self.collection = collection
    self.target = None
    self.imset = self.imset - holdoutSet
    self.candidateset = sorted(self.imset)
    self.datadir = os.path.join(rootpath, collection)
def __init__(self, trainCollection, tpp="lemm", feature="color64+dsift", k=1000, rootpath=ROOT_PATH):
    """Initialise the vocabulary, kernel bandwidth and tag-similarity helper.

    The vocabulary is the set of stripped lines of
    `TextData/wn.<trainCollection>.txt`. `self.gamma` equals
    `(1 / MEDIAN_DISTANCE[feature]) ** 2`, and the feature dimensionality
    is looked up in `FEATURE_TO_DIM`.
    """
    self.trainCollection = trainCollection
    self.k = k
    self.name = "%s(%s,%s,%s,%d)" % (self.__class__.__name__, self.trainCollection, tpp, feature, k)

    vobfile = os.path.join(rootpath, trainCollection, "TextData", "wn.%s.txt" % trainCollection)
    # Use a context manager so the vocabulary file does not stay open.
    with open(vobfile) as fin:
        self.vob = set(line.strip() for line in fin)
    printStatus(INFO, 'the vocabulary of %s contains %d tags' % (trainCollection, len(self.vob)))

    self.gamma = (1.0 / MEDIAN_DISTANCE[feature]) ** 2
    self.feat_dir = os.path.join(rootpath, trainCollection, 'FeatureIndex', feature)
    self.dim = FEATURE_TO_DIM[feature]
    self.fcs = FlickrContextSim(trainCollection, rootpath)
    printStatus(INFO, self.name + ' okay')
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    """Initialise a tagger whose neighbours come from `load_model`.

    The number of images and the feature dimensionality are parsed from
    the first line of `shape.txt` in the feature directory. The searcher
    is loaded with nr_of_segments=512, segmentk=256 and coarsek=4096.
    """
    self.rootpath = rootpath
    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))

    featuredir = os.path.join(rootpath, collection, 'FeatureData', feature)
    # Fixed: the shape file path was built from `feat_dir`, a name that
    # does not exist in this scope.
    shape_file = os.path.join(featuredir, 'shape.txt')
    with open(shape_file) as fin:
        self.nr_of_images, feat_dim = map(int, fin.readline().split())

    self.searcher = load_model(featuredir, self.nr_of_images, feat_dim,
                               nr_of_segments=512, segmentk=256, coarsek=4096)
    self.k = DEFAULT_K

    self._load_tag_data(collection, tpp, rootpath)
    printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (
        self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))
def __init__(self, label_file, label2vec_dir):
    """Load the label list and the embedding vector of every label.

    `label_file` holds one label per line. Vectors are read from the
    BigFile at `label2vec_dir`. Labels the BigFile does not return get
    `None` in `self.label_vectors` and count as inactive.
    """
    # A real list is needed for len() and indexing; the file is closed
    # as soon as it has been read.
    with open(label_file) as fin:
        self.labels = [line.strip() for line in fin]
    self.nr_of_labels = len(self.labels)

    feat_file = BigFile(label2vec_dir)
    renamed, vectors = feat_file.read(self.labels)
    name2index = dict(zip(renamed, range(len(renamed))))
    self.feat_dim = feat_file.ndims

    self.label_vectors = [None] * self.nr_of_labels
    for i, label in enumerate(self.labels):
        idx = name2index.get(label, -1)
        self.label_vectors[i] = np.array(vectors[idx]) if idx >= 0 else None

    nr_of_inactive_labels = len([x for x in self.label_vectors if x is None])
    printStatus(INFO, '#active_labels=%d, embedding_size=%d' % (
        self.nr_of_labels - nr_of_inactive_labels, self.feat_dim))
def precompute_annotator(self, concept):
    """Extend the annotator with expanded tags and top-ranked similar tags.

    For each hyphen-separated part of `concept`, the part itself, the tags
    from `wn_expand`, and the first 100 tags of
    `SimilarityIndex/ngd/<part>.txt` are merged into `self.annotator`.
    A ranking file that cannot be loaded is reported and skipped.
    """
    INFO = 'dataengine.%s.precompute_annotator' % self.__class__.__name__
    topn = 100

    NegativeEngine.precompute_annotator(self, concept)

    for subconcept in concept.split('-'):
        expandedTagSet = set([subconcept] + wn_expand(subconcept))
        # Best effort: the ranking file is optional. Catch ordinary errors
        # only, so Ctrl-C and SystemExit are not swallowed.
        try:
            datafile = os.path.join(ROOT_PATH, self.collection, 'SimilarityIndex', 'ngd',
                                    '%s.txt' % subconcept)
            rankedtags = readRankingResults(datafile)
            expandedTagSet = expandedTagSet.union(set([x[0] for x in rankedtags[:topn]]))
        except Exception:
            printError(INFO, 'failed to load ranktag file for %s' % subconcept)
        self.annotator = self.annotator.union(expandedTagSet)

    printStatus(INFO, 'precomputing the virtual annotator for %s: %d tags' % (concept, len(self.annotator)))
def process(options, testCollection, trainCollection, tagsimMethod):
    """Estimate tag relevance for the test images with a tag-similarity method.

    The work can be split over `options.numjobs` jobs; job `options.job`
    handles the images whose position satisfies `i % numjobs + 1 == job`.
    Images listed in `options.donefile` are skipped. Each output line is
    `id tag vote tag vote ...`.

    Exits the interpreter with status 0 when the result file is skipped.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    testsetName = options.testset if options.testset else testCollection
    tpp = options.tpp
    numjobs = options.numjobs
    job = options.job
    useWnVob = 1

    outputName = tagsimMethod + '-wn' if useWnVob else tagsimMethod
    if tagsimMethod == 'wns':
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName,
                                  outputName, 'id.tagvotes.txt')
    else:
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName,
                                  trainCollection, outputName, 'id.tagvotes.txt')

    if numjobs > 1:
        resultfile = resultfile.replace("id.tagvotes.txt", "id.tagvotes.%d.%d.txt" % (numjobs, job))

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    makedirsforfile(resultfile)

    # Best effort: a missing or unreadable done file means nothing is done.
    # The last line is ignored, presumably because it may be incomplete.
    try:
        with open(options.donefile) as fin:
            doneset = set([str.split(x)[0] for x in fin.readlines()[:-1]])
    except Exception:
        doneset = set()
    printStatus(INFO, "done set: %d" % len(doneset))

    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [x for i, x in enumerate(testImageSet) if (i % numjobs + 1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs, job, len(testImageSet), resultfile))

    testreader = TagReader(testCollection, rootpath=rootpath)

    if tagsimMethod == "wns":
        tagrel = SIM_TO_TAGREL["wns"](trainCollection, useWnVob, "wup", rootpath)
    else:
        tagrel = SIM_TO_TAGREL[tagsimMethod](trainCollection, useWnVob, rootpath)

    done = 0
    # The context manager closes the result file even if estimation fails.
    with open(resultfile, "w") as fw:
        for qry_id in testImageSet:
            qry_tags = testreader.get(qry_id)
            tagvotes = tagrel.estimate(qry_tags)
            newline = qry_id + " " + " ".join(["%s %s" % (tag, niceNumber(vote, 8)) for (tag, vote) in tagvotes])
            fw.write(newline + "\n")
            done += 1
            if done % 1000 == 0:
                printStatus(INFO, "%d done" % done)

    printStatus(INFO, "%d done" % done)
def buildHitLists(collection, tpp='lemm', rootpath=ROOT_PATH):
    """Map every vocabulary tag to the images labelled with it.

    The vocabulary is read from `TextData/wn.<collection>.txt`. Each line
    of `TextData/id.userid.<tpp>tags.txt` is split on whitespace: field 0
    is the image name, field 1 is skipped, and the rest are tags.

    Returns a dict `tag -> list of image names`, restricted to vocabulary
    tags that occur at least once.
    """
    vobfile = os.path.join(rootpath, collection, 'TextData', 'wn.%s.txt' % collection)
    with open(vobfile) as fin:
        vob = set(line.strip() for line in fin)
    printStatus(INFO, '%s, %d unique tags' % (collection, len(vob)))

    tagfile = os.path.join(rootpath, collection, 'TextData', 'id.userid.%stags.txt' % tpp)
    hitlists = {}
    # Stream the tag file line by line instead of loading it whole.
    with open(tagfile) as fin:
        for line in fin:
            elems = line.strip().split()
            name = elems[0]
            tagset = set(elems[2:]).intersection(vob)
            for tag in tagset:
                hitlists.setdefault(tag, []).append(name)

    assert(len(hitlists) <= len(vob))
    return hitlists
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    """Prepare a tagger that reads neighbours from the 'uu' knn directory.

    Loads the concepts and image set of `collection`, records the relative
    knn directory `<collection>/<feature>,<distance>knn,uu,1500`, uses
    `DEFAULT_K` neighbours, and loads tag data through `_load_tag_data`.
    """
    self.rootpath = rootpath
    self.k = DEFAULT_K
    self.noise = 0

    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(
        (concept, pos) for pos, concept in enumerate(self.concepts))

    self.imset = readImageSet(collection, collection, rootpath)
    self.nr_of_images = len(self.imset)

    knn_name = '%s,%sknn,uu,1500' % (feature, distance)
    self.knndir = os.path.join(collection, knn_name)

    self._load_tag_data(collection, tpp, rootpath)

    summary = "%s, %d images, %d unique tags, %s %d neighbours for voting" % (
        self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k)
    printStatus(INFO, summary)
def __init__(self, collection, tpp='lemm', rootpath=ROOT_PATH):
    """Read the tags of every photo in `collection`.

    Lines of `TextData/id.userid.<tpp>tags.txt` carry three tab-separated
    fields: photo id, user id and tags. The raw tag field, line terminator
    included, is stored per photo id in `self.photoid2tags`. `self.vob` is
    the set of distinct whitespace-separated tags.
    """
    self.name = '%s(%s,%s)' % (self.__class__.__name__, collection, tpp)
    self.photoid2tags = {}
    # Accumulate unique tags directly; a list of all occurrences would
    # grow with the full size of the collection.
    self.vob = set()

    datafile = os.path.join(rootpath, collection, "TextData", "id.userid.%stags.txt" % tpp)
    with open(datafile) as fin:
        for line in fin:
            photoid, userid, tags = line.split("\t")
            self.photoid2tags[photoid] = tags
            self.vob.update(str.split(tags))

    printStatus(self.name, "%d images, %d unique tags" % (len(self.photoid2tags), len(self.vob)))
def buildHitLists(collection, tpp="lemm", rootpath=ROOT_PATH):
    """Build the inverted index from vocabulary tags to image names.

    Vocabulary tags are the stripped lines of `TextData/wn.<collection>.txt`.
    In `TextData/id.userid.<tpp>tags.txt`, the first whitespace-separated
    field of a line is the image name, the second is ignored and the
    remaining fields are its tags.

    Returns a dict mapping each vocabulary tag that occurs to the list of
    image names carrying it.
    """
    vobfile = os.path.join(rootpath, collection, "TextData", "wn.%s.txt" % collection)
    with open(vobfile) as fin:
        vob = set(line.strip() for line in fin)
    printStatus(INFO, "%s, %d unique tags" % (collection, len(vob)))

    tagfile = os.path.join(rootpath, collection, "TextData", "id.userid.%stags.txt" % tpp)
    hitlists = {}
    # Iterate the open file directly so it is neither fully loaded into
    # memory nor left open afterwards.
    with open(tagfile) as fin:
        for line in fin:
            elems = line.strip().split()
            name = elems[0]
            for tag in set(elems[2:]).intersection(vob):
                hitlists.setdefault(tag, []).append(name)

    assert len(hitlists) <= len(vob)
    return hitlists
def process(options, pklfile, hdf5file):
    """Convert a pickled dict into an HDF5 file, one dataset per key.

    Returns 0 when checkToSkip says `hdf5file` should not be regenerated,
    otherwise None.
    """
    if checkToSkip(hdf5file, options.overwrite):
        return 0

    printStatus(INFO, 'Loading pkl file %s' % pklfile)
    # Pickle data is binary: text mode can corrupt it on some platforms
    # and is rejected outright by Python 3.
    # NOTE(review): unpickling executes code from the file; use trusted
    # input only.
    with open(pklfile, 'rb') as f:
        data = pkl.load(f)

    printStatus(INFO, 'Found %d elements.' % len(data))
    printStatus(INFO, 'Saving hdf5 file %s' % hdf5file)

    with h5py.File(hdf5file, 'w') as f:
        for k, v in data.items():
            printStatus(INFO, 'Dumping %s' % k)
            f[k] = v

    printStatus(INFO, 'Done.')
def process(options, pklfile, hdf5file):
    """Dump every entry of a pickled dict into an HDF5 file.

    Each key of the unpickled mapping becomes a dataset of that name.
    Returns 0 when the output is skipped by checkToSkip, otherwise None.
    """
    if checkToSkip(hdf5file, options.overwrite):
        return 0

    printStatus(INFO, 'Loading pkl file %s' % pklfile)
    # Open in binary mode: pickle streams are bytes, and text mode may
    # translate newlines or fail to decode them.
    # NOTE(review): only unpickle files from a trusted source.
    with open(pklfile, 'rb') as f:
        data = pkl.load(f)

    printStatus(INFO, 'Found %d elements.' % len(data))
    printStatus(INFO, 'Saving hdf5 file %s' % hdf5file)

    with h5py.File(hdf5file, 'w') as f:
        for k, v in data.items():
            printStatus(INFO, 'Dumping %s' % k)
            f[k] = v

    printStatus(INFO, 'Done.')
def process(options, workingCollection, annotationName, outputpkl):
    """Score concepts per image from the position of user tags.

    For every line of `TextData/id.userid.lemmtags.txt`, a tag that is one
    of the concepts scores `1 / (1 + position)`, where position is its
    0-based index in the image's tag list. Tags that are not concepts are
    ignored. When `options.random` is set, small random noise scaled by
    the smallest positive score is added so untagged images get a random
    ranking.

    The result is pickled to `outputpkl` as a dict with keys 'concepts',
    'id_images' (ints) and 'scores'. Returns 0 when the output is skipped.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    add_random = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    # The image set only determines the number of rows. The ids are
    # re-collected in tag-file order below.
    nr_of_images = len(readImageSet(workingCollection, workingCollection, rootpath))
    tagmatrix = np.zeros((nr_of_images, len(concepts)))
    tag2idx = dict(zip(concepts, range(len(concepts))))

    id_images = []
    tagfile = os.path.join(rootpath, workingCollection, 'TextData', 'id.userid.lemmtags.txt')
    with open(tagfile) as f:
        for cnt, line in enumerate(f):
            id_img, _, tags = line.split('\t')
            # Keep only concept tags. Unknown tags used to be mapped to
            # index -1, which silently wrote into the last concept column.
            hits = [(tag2idx[tag], pos)
                    for pos, tag in enumerate(tags.split()) if tag in tag2idx]
            if hits:
                idx = np.array([h[0] for h in hits])
                vals = 1. / (1. + np.array([h[1] for h in hits]))
                tagmatrix[cnt, idx] = vals
            id_images.append(id_img)

    # random rank for untagged images
    if add_random:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(
            tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened 'wb'.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts': concepts,
                     'id_images': [int(x) for x in id_images],
                     'scores': tagmatrix},
                    f, pickle.HIGHEST_PROTOCOL)
def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
    """Initialise a tagger that searches neighbours with simpleknn.

    The first line of `shape.txt` in the feature directory supplies the
    number of images and the feature dimensionality. `feature.bin` and
    `id.txt` from the same directory are handed to `simpleknn.load_model`,
    and the searcher is switched to the requested `distance`.
    """
    self.concepts = readConcepts(collection, annotationName, rootpath)
    self.nr_of_concepts = len(self.concepts)
    self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))

    feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
    id_file = os.path.join(feat_dir, 'id.txt')
    shape_file = os.path.join(feat_dir, 'shape.txt')
    # Read the shape inside a context manager so the handle is released.
    with open(shape_file) as fin:
        self.nr_of_images, feat_dim = map(int, fin.readline().split())

    self.searcher = simpleknn.load_model(os.path.join(feat_dir, 'feature.bin'),
                                         feat_dim, self.nr_of_images, id_file)
    self.searcher.set_distance(distance)
    self.k = DEFAULT_K

    self._load_tag_data(collection, tpp, rootpath)
    printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (
        self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))
def process(options, workingCollection, annotationName, outputpkl):
    """Write a random-score baseline for every image/concept pair.

    The score matrix has one row per image of `workingCollection` and one
    column per concept, filled by `np.random.rand`. The result is pickled
    to `outputpkl` as a dict with keys 'concepts', 'id_images' (ints) and
    'scores'. Returns 0 when the output is skipped by checkToSkip.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # HIGHEST_PROTOCOL writes binary data, so the file must be opened 'wb'.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts': concepts,
                     'id_images': [int(x) for x in id_images],
                     'scores': tagmatrix},
                    f, pickle.HIGHEST_PROTOCOL)
def process(options, model_name, concept_file, weight_dir, result_dir):
    """Fuse several per-concept models into one weighted model.

    `concept_file` lists concepts, one per line; blank lines and lines
    starting with '#' are ignored. For each concept still to be done,
    `<weight_dir>/<concept>.txt` holds at least two `weight model_dir`
    lines. The models are loaded and merged with `add_fastsvm`, and the
    result is saved as `<result_dir>/<concept>.model`.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    if 'fastlinear' == model_name:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    with open(concept_file) as fin:
        concepts = [x.strip() for x in fin
                    if x.strip() and not x.strip().startswith('#')]
    todo = [x for x in concepts
            if overwrite or not os.path.exists(os.path.join(result_dir, '%s.model' % x))]
    printStatus(INFO, '%d concepts to do' % len(todo))

    for concept in todo:
        weight_file = os.path.join(weight_dir, '%s.txt' % concept)
        with open(weight_file) as fin:
            weight_data = [line.strip() for line in fin]
        nr_of_models = len(weight_data)
        assert (nr_of_models >= 2)

        weights = [0] * nr_of_models
        models = [None] * nr_of_models
        for i, line in enumerate(weight_data):
            w, model_dir = line.split()
            weights[i] = float(w)
            # Relative model directories are resolved against rootpath.
            if not model_dir.startswith(rootpath):
                model_dir = os.path.join(rootpath, model_dir)
            assert (model_dir.find(model_name) > 0)
            model_file_name = os.path.join(model_dir, '%s.model' % concept)
            models[i] = load_model(model_file_name)

        # The first merge applies both weights. Later merges keep the
        # accumulated model at weight 1.
        new_model = models[0]
        new_model.add_fastsvm(models[1], weights[0], weights[1])
        for i in range(2, len(models)):
            new_model.add_fastsvm(models[i], 1, weights[i])

        new_model_file = os.path.join(result_dir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        save_model(new_model_file, new_model)
def __init__(self, synset_name='imagenet1k2hop', embedding_name='flickr4m,tagvec500,hierse2', rootpath=ROOT_PATH):
    """Load the label embeddings stored under `synset2vec`.

    Labels are the names of the BigFile at
    `rootpath/synset2vec/<synset_name>/<embedding_name>`. Each label's
    vector is kept in `self.label_vectors`; labels the BigFile does not
    return get `None` and count as inactive.
    """
    feat_dir = os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)
    feat_file = BigFile(feat_dir)

    self.labels = feat_file.names
    self.nr_of_labels = len(self.labels)
    self.feat_dim = feat_file.ndims

    renamed, vectors = feat_file.read(self.labels)
    name2index = dict(zip(renamed, range(len(renamed))))

    self.label_vectors = [None] * self.nr_of_labels
    for pos in range(self.nr_of_labels):
        hit = name2index.get(self.labels[pos], -1)
        if hit >= 0:
            self.label_vectors[pos] = np.array(vectors[hit])
        else:
            self.label_vectors[pos] = None

    nr_of_inactive_labels = sum(1 for vec in self.label_vectors if vec is None)
    nr_of_active_labels = self.nr_of_labels - nr_of_inactive_labels
    printStatus(INFO + '.' + self.__class__.__name__,
                '#active_labels=%d, embedding_size=%d' % (nr_of_active_labels, self.feat_dim))
def __init__(self, collection, rootpath=ROOT_PATH):
    """Set up the candidate images: the image set minus the holdout set.

    Ids are the stripped lines of `ImageSets/<collection>.txt`; ids listed
    in `ImageSets/holdout.txt` are removed. The reported example count is
    taken before the removal. `self.candidateset` holds the remaining ids
    in sorted order.
    """
    self.name = '%s.%s' % (self.__class__.__name__, collection)

    imsetfile = os.path.join(rootpath, collection, "ImageSets", "%s.txt" % collection)
    with open(imsetfile) as fin:
        self.imset = set(line.strip() for line in fin)

    holdoutfile = os.path.join(rootpath, collection, "ImageSets", "holdout.txt")
    with open(holdoutfile) as fin:
        holdoutSet = set(line.strip() for line in fin)

    printStatus(self.name, '%d examples, %d holdout' % (len(self.imset), len(holdoutSet)))

    self.collection = collection
    self.target = None
    self.imset = self.imset - holdoutSet
    self.candidateset = sorted(self.imset)
    self.datadir = os.path.join(rootpath, collection)
def __init__(self, trainCollection, trainAnnotationName, feature, modelName, rootpath=ROOT_PATH):
    """Stack the per-concept fastlinear models into two matrices.

    Models are read from
    ``<rootpath>/<trainCollection>/Models/<trainAnnotationName>/<feature>/<modelName>/<concept>.model``.

    After construction:
      * ``self.concepts`` / ``self.nr_of_concepts``: the concept list,
      * ``self.feat_dim``: feature dimension reported by the first model,
      * ``self.W``: ``feat_dim x nr_of_concepts``, column i is ``get_w()`` of
        concept i,
      * ``self.AB``: ``2 x nr_of_concepts``, column i is ``get_probAB()`` of
        concept i, or ``[-1, 0]`` when ``abs(A)`` is not above 1e-8.

    Raises:
        AssertionError: if ``modelName`` does not start with ``fastlinear``.
    """
    assert(modelName.startswith('fastlinear')), modelName
    self.concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    self.nr_of_concepts = len(self.concepts)
    modeldir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)

    # The first model is loaded up front only to learn the feature dimension.
    model = load_model(os.path.join(modeldir, self.concepts[0] + '.model'))
    self.feat_dim = model.get_feat_dim()

    self.W = np.zeros((self.feat_dim, self.nr_of_concepts))
    self.AB = np.zeros((2, self.nr_of_concepts))
    for i in range(self.nr_of_concepts):
        model_file_name = os.path.join(modeldir, "%s.model" % self.concepts[i])
        model = load_model(model_file_name)
        self.W[:, i] = model.get_w()
        [A, B] = model.get_probAB()
        # A (near) zero A is replaced by the fixed pair [-1, 0]
        # (presumably "no sigmoid trained yet" -- confirm).
        self.AB[:, i] = [A, B] if abs(A) > 1e-8 else [-1, 0]
        # Logs the values as stored in the model, before any replacement.
        printStatus(INFO, '%s, A=%g, B=%g' % (self.concepts[i], A, B))

    # Fixed: the second field used to repeat trainCollection instead of
    # naming the annotation set.
    printStatus(INFO, '%s-%s-%s -> %dx%d ModelArray' % (
        trainCollection, trainAnnotationName, feature, self.feat_dim, self.nr_of_concepts))
def process(options, trainCollection, annotationName):
    """Build and pickle the tag-by-concept co-occurrence rank matrix.

    For every tag ``u`` in ``tcb.vob`` the list ``tcb.top_cooccur(u, -1)`` is
    walked; when the first element of an entry is a concept, the 1-based
    position of that entry is stored in ``rank_matrix[tag, concept]``.
    Cells that are never hit keep the default rank ``tcb.tag_num()``.

    The result, a dict with keys ``tags``, ``concepts`` and ``rank_matrix``,
    is pickled to
    ``<rootpath>/<trainCollection>/TextData/tag.concept-rank.<annotationName>.pkl``.

    Returns:
        0 when the result file already exists and is not overwritten,
        otherwise None.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, trainCollection, 'TextData',
                              'tag.concept-rank.%s.pkl' % annotationName)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    concept_num = len(concepts)
    concept2index = dict(zip(concepts, range(concept_num)))

    tcb = TagCooccurBase(trainCollection, rootpath=rootpath)
    tag_num = tcb.tag_num()
    DEFAULT_RANK = tag_num
    # np.int was only an alias of the builtin int and no longer exists in
    # current NumPy; dtype=int yields the very same array.
    rank_matrix = np.zeros((tag_num, concept_num), dtype=int) + DEFAULT_RANK
    tag_list = []

    for i, u in enumerate(tcb.vob):
        ranklist = tcb.top_cooccur(u, -1)
        hit = 0
        for j, x in enumerate(ranklist):
            idx = concept2index.get(x[0], -1)
            if idx >= 0:
                rank_matrix[i, idx] = j + 1
                hit += 1
                # Stop scanning once as many hits as concepts were recorded.
                if hit == concept_num:
                    break
        tag_list.append(u)

        if (i + 1) % 1e4 == 0:
            printStatus(INFO, '%d done' % (i + 1))

    assert(len(tag_list) == tag_num)

    try:
        import cPickle as pickle
    except ImportError:  # Python 3 has no cPickle module
        import pickle

    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as output:
        pickle.dump({'tags': tag_list, 'concepts': concepts, 'rank_matrix': rank_matrix},
                    output, -1)
    printStatus(INFO, '%dx%d dumped to %s' % (tag_num, concept_num, resultfile))
def process(options, collection, conceptfile):
    """Write, per concept, the ids of the images tagged with that concept.

    Concepts come from ``conceptfile`` (blank lines and ``#`` lines are
    ignored).  A concept such as ``a-b`` is split on ``-`` and an image must
    appear in the hit list of every part.  Ids listed in
    ``ImageSets/holdout.txt`` are removed; if that file cannot be read the
    holdout set is empty.  Non-empty results go to
    ``<rootpath>/<collection>/tagged,<tpp>/<concept>.txt``, one id per line.

    Returns:
        0 when every concept is skipped by ``checkToSkip``, otherwise None.
    """
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    concepts = []
    for raw in open(conceptfile).readlines():
        name = raw.strip()
        if name and not name.startswith('#'):
            concepts.append(name)

    resultdir = os.path.join(rootpath, collection, 'tagged,%s' % tpp)
    todo = [name for name in concepts
            if not checkToSkip(os.path.join(resultdir, '%s.txt' % name), overwrite)]

    if not todo:
        printStatus(INFO, 'nothing to do')
        return 0

    # Best effort: any failure while loading the holdout list means "no holdout".
    try:
        holdoutfile = os.path.join(rootpath, collection, 'ImageSets', 'holdout.txt')
        held_out = set(map(str.strip, open(holdoutfile).readlines()))
    except:
        held_out = set()

    hitlists = buildHitlists(collection, todo, tpp, rootpath)
    min_hit = 1e6
    max_hit = 0

    for concept in todo:
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resultfile, overwrite):
            continue

        parts = concept.split('-')
        hits = set(hitlists[parts[0]])
        for part in parts[1:]:
            hits = hits.intersection(hitlists[part])
        hits = hits.difference(held_out)
        nr_of_hits = len(hits)

        if nr_of_hits == 0:
            printStatus(INFO, '%s has ZERO hit' % concept)
        else:
            printStatus(INFO, '%s, %d hits -> %s' % (concept, nr_of_hits, resultfile))
            makedirsforfile(resultfile)
            fw = open(resultfile, 'w')
            fw.write('\n'.join(hits) + '\n')
            fw.close()

        max_hit = max(max_hit, nr_of_hits)
        min_hit = min(min_hit, nr_of_hits)

    printStatus(INFO, 'max hits: %d, min hits: %d' % (max_hit, min_hit))
def process(options, feat_dir):
    """Scan ``feature.bin`` and write per-dimension minima and maxima.

    The feature dimension is the second number in ``<feat_dir>/shape.txt``;
    the number of vectors is the number of whitespace-separated ids on the
    first line of ``<feat_dir>/id.txt``.  Vectors are read from
    ``feature.bin`` as consecutive runs of ``feat_dim`` values of array type
    ``"f"``.  The result ``<feat_dir>/minmax.txt`` has two lines: the minima,
    then the maxima, space separated.

    Exits the process with status 0 when the result exists and
    ``options.overwrite`` is not set.
    """
    resultfile = os.path.join(feat_dir, "minmax.txt")
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    with open(os.path.join(feat_dir, "shape.txt")) as fr:
        # Only the dimension is used; the image count comes from id.txt.
        _, feat_dim = map(int, fr.readline().split())

    # Infinite sentinels: with the former +/-1e6 a dimension whose values all
    # lie beyond that range would have been reported with the sentinel itself.
    min_vals = [float("inf")] * feat_dim
    max_vals = [float("-inf")] * feat_dim

    feat_file = os.path.join(feat_dir, "feature.bin")
    id_file = os.path.join(feat_dir, "id.txt")
    with open(id_file) as fr:
        nr_of_images = len(fr.readline().strip().split())

    printStatus(INFO, "parsing %s" % feat_file)
    res = array.array("f")
    s_time = time.time()
    with open(feat_file, "rb") as fr:
        for _ in xrange(nr_of_images):
            res.fromfile(fr, feat_dim)
            for d, val in enumerate(res):
                if val > max_vals[d]:
                    max_vals[d] = val
                if val < min_vals[d]:
                    min_vals[d] = val
            # Reuse the same buffer for the next vector.
            del res[:]
    timecost = time.time() - s_time

    printStatus(
        INFO,
        "%g seconds to find min [%g,%g] and max [%g,%g]"
        % (timecost, min(min_vals), max(min_vals), min(max_vals), max(max_vals)),
    )

    with open(resultfile, "w") as f:
        f.write("%s\n" % " ".join(map(str, min_vals)))
        f.write("%s\n" % " ".join(map(str, max_vals)))
def submit(searchers, collection, annotationName, rootpath=ROOT_PATH, overwrite=0):
    """Run every searcher on every concept and store the rankings.

    For each concept of the annotation set, and for each searcher in order,
    the ranking returned by ``scoreCollection(concept)`` is written to
    ``<searcher.getOutputdir()>/<concept>.txt``.  Pairs for which
    ``checkToSkip`` returns true are left untouched.
    """
    for concept in readConcepts(collection, annotationName, rootpath=rootpath):
        for searcher in searchers:
            resultfile = os.path.join(searcher.getOutputdir(), concept + ".txt")
            if not checkToSkip(resultfile, overwrite):
                searchresults = searcher.scoreCollection(concept)
                message = "%s: %s %d -> %s" % (
                    searcher.name, concept, len(searchresults), resultfile)
                print(message)
                writeRankingResults(searchresults, resultfile)

    tag = '%s.submit' % os.path.basename(__file__)
    printStatus(tag, "done")
def process(options, synset_file, synset_name):
    """Encode every synset of ``synset_file`` and store the vectors.

    Output goes to
    ``<rootpath>/synset2vec/<synset_name>/<corpus>,<word2vec>,<embedding>/``:
      * ``feature.bin``: the float32 vectors of all synsets for which
        ``embedding(wnid)`` did not return None, in input order,
      * ``id.txt``: the ids of those synsets, space separated,
      * ``shape.txt``: ``<number of vectors> <s2v.get_feat_dim()>``.

    Returns:
        0 when ``feature.bin`` exists and is not overwritten, else None.
    """
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding = options.embedding

    resdir = os.path.join(rootpath, 'synset2vec', synset_name,
                          '%s,%s,%s' % (corpus, word2vec_model, embedding))
    resfile = os.path.join(resdir, 'feature.bin')
    if checkToSkip(resfile, overwrite):
        return 0

    with open(synset_file) as fr:
        synsets = [line.strip() for line in fr]
    s2v = get_synset_encoder(embedding)(corpus, word2vec_model, rootpath=rootpath)
    makedirsforfile(resfile)

    good = []
    with open(resfile, 'wb') as fw:
        for wnid in synsets:
            vec = s2v.embedding(wnid)
            if vec is not None:
                np.array(vec, dtype=np.float32).tofile(fw)
                good.append(wnid)

    # len(synsets) instead of the loop index: the index is unbound (NameError)
    # when the synset file is empty.
    printStatus(INFO, '%d done, %d okay' % (len(synsets), len(good)))

    with open(os.path.join(resdir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(good))

    with open(os.path.join(resdir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(good), s2v.get_feat_dim()))
def precompute_annotator(self, concept):
    """Extend ``self.annotator`` with tags related to ``concept``.

    After the base-class step, ``concept`` is split on ``-`` and for every
    part the following tags are added to ``self.annotator``:
      * the part itself plus whatever ``wn_expand`` returns for it,
      * the first ``topn`` (100) tags of
        ``SimilarityIndex/ngd/<part>.txt`` of this collection, when that
        ranking can be loaded; a failure is reported and otherwise ignored.
    """
    INFO = 'dataengine.%s.precompute_annotator' % self.__class__.__name__
    topn = 100

    NegativeEngine.precompute_annotator(self, concept)

    for subconcept in concept.split('-'):
        expandedTagSet = set([subconcept] + wn_expand(subconcept))
        try:
            # NOTE(review): uses the module-level ROOT_PATH -- confirm this is
            # intended for engines created with a different rootpath.
            datafile = os.path.join(ROOT_PATH, self.collection, 'SimilarityIndex',
                                    'ngd', '%s.txt' % subconcept)
            rankedtags = readRankingResults(datafile)
            expandedTagSet = expandedTagSet.union(
                set([x[0] for x in rankedtags[:topn]]))
        except Exception:
            # Still best effort, but no longer swallows KeyboardInterrupt or
            # SystemExit the way a bare "except:" did.
            printError(INFO, 'failed to load ranktag file for %s' % subconcept)

        self.annotator = self.annotator.union(expandedTagSet)

    printStatus(
        INFO,
        'precomputing the virtual annotator for %s: %d tags'
        % (concept, len(self.annotator)))
def process(options, collection, annotationName):
    """Rank the vocabulary tags by NGD for every concept.

    For each concept without a result file (or for all of them when
    ``options.overwrite`` is set), every tag of ``fcs.vob`` whose
    ``computeNGD(concept, tag, img=1)`` is below 10 is kept, sorted by
    ascending distance and written to
    ``<rootpath>/<collection>/SimilarityIndex/ngd/<concept>.txt``.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    concepts = readConcepts(collection, annotationName, rootpath)
    resultdir = os.path.join(rootpath, collection, "SimilarityIndex", "ngd")

    def _is_pending(concept):
        return overwrite or not os.path.exists(os.path.join(resultdir, concept + '.txt'))

    todo = [concept for concept in concepts if _is_pending(concept)]
    if not todo:
        printStatus(INFO, 'nothing to do')
        return

    fcs = FlickrContextSim(collection, rootpath=rootpath)
    vob = fcs.vob
    printStatus(
        INFO,
        'expanding tags for %s-%s -> %s' % (collection, annotationName, resultdir))

    for concept in todo:
        resultfile = os.path.join(resultdir, concept + '.txt')
        scored = ((tag, fcs.computeNGD(concept, tag, img=1)) for tag in vob)
        vals = sorted((pair for pair in scored if pair[1] < 10),
                      key=lambda pair: pair[1])
        closest = ' '.join([pair[0] for pair in vals[:3]])
        printStatus(INFO, '%s -> %s' % (concept, closest))
        writeRankingResults(vals, resultfile)
def process(options, model_name, concept_file, weight_dir, result_dir):
    """Merge several per-concept models into a single weighted model.

    For every concept listed in ``concept_file`` (blank lines and lines
    starting with ``#`` are ignored) the file ``<weight_dir>/<concept>.txt``
    is read.  Each non-blank line of that file holds ``<weight> <model_dir>``;
    a ``model_dir`` that does not start with ``options.rootpath`` is taken
    relative to it.  The listed models are loaded, folded into the first one
    through ``add_fastsvm`` and the result is saved as
    ``<result_dir>/<concept>.model``.

    Concepts whose output file already exists are skipped unless
    ``options.overwrite`` is set.

    Raises:
        AssertionError: if a weight file lists fewer than two models, or a
            model directory does not contain ``model_name`` past position 0.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    # Pick the load/save pair that matches the model family.
    if 'fastlinear' == model_name:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    with open(concept_file) as fr:
        concepts = [x.strip() for x in fr
                    if x.strip() and not x.strip().startswith('#')]
    todo = [x for x in concepts
            if overwrite
            or not os.path.exists(os.path.join(result_dir, '%s.model' % x))]
    printStatus(INFO, '%d concepts to do' % len(todo))

    for concept in todo:
        weight_file = os.path.join(weight_dir, '%s.txt' % concept)
        # Blank lines (e.g. a trailing newline) carry no model and are skipped
        # instead of breaking the "<weight> <model_dir>" unpacking below.
        with open(weight_file) as fr:
            weight_data = [line.strip() for line in fr if line.strip()]
        nr_of_models = len(weight_data)
        assert (nr_of_models >= 2)

        weights = []
        models = []
        for line in weight_data:
            w, model_dir = line.split()
            weights.append(float(w))
            if not model_dir.startswith(rootpath):
                model_dir = os.path.join(rootpath, model_dir)
            assert (model_dir.find(model_name) > 0)
            model_file_name = os.path.join(model_dir, '%s.model' % concept)
            models.append(load_model(model_file_name))

        # The first model accumulates the others in place.  The first merge
        # passes both weights; later merges pass 1 for the accumulated model
        # (presumably add_fastsvm computes a*self + b*other -- confirm).
        new_model = models[0]
        new_model.add_fastsvm(models[1], weights[0], weights[1])
        for model, weight in zip(models[2:], weights[2:]):
            new_model.add_fastsvm(model, 1, weight)

        new_model_file = os.path.join(result_dir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        save_model(new_model_file, new_model)
def process(options, trainCollection, modelAnnotationName, trainAnnotationName, feature):
    """Fit and store the (A, B) parameters of already trained models.

    Only the concepts whose index satisfies
    ``index % options.numjobs + 1 == options.job`` are handled.  For each one
    the model is loaded, the annotated images are scored with
    ``classify_large_data``, ``sigmoid_train`` is run on the decision values
    and labels, and the resulting pair is stored in the model file.

    A model whose current ``A`` has an absolute value above 1e-8 is skipped
    unless ``options.overwrite`` is set.
    """
    rootpath = options.rootpath
    modelName = options.model

    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    all_concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    concepts = [name for pos, name in enumerate(all_concepts)
                if (pos % options.numjobs + 1) == options.job]

    feat_file = BigFile(os.path.join(rootpath, trainCollection, "FeatureData", feature))
    model_dir = os.path.join(rootpath, trainCollection, 'Models',
                             modelAnnotationName, feature, modelName)

    for concept in concepts:
        modelfile = os.path.join(model_dir, '%s.model' % concept)
        model = load_model(modelfile)
        (A0, B0) = model.get_probAB()

        already_fitted = abs(A0) > 1e-8
        if already_fitted and not options.overwrite:
            printStatus(INFO, "old parameters exist as A=%g, B=%g, skip" % (A0, B0))
            continue

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept,
                                            skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))

        # Labels are re-ordered to follow the order of the returned results.
        results = classify_large_data(model, names, feat_file, prob_output=False)
        labels = [name2label[item[0]] for item in results]
        dec_values = [item[1] for item in results]

        nr_pos = sum(1 for y in labels if y == 1)
        nr_neg = sum(1 for y in labels if y == -1)
        printStatus(INFO, "%s +%d -%d" % (concept, nr_pos, nr_neg))

        [A, B] = sigmoid_train(dec_values, labels)
        model.set_probAB(A, B)
        save_model(modelfile, model)

        (A1, B1) = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (A0, A1, B0, B1))
def process(options, collection, annotationName):
    """Write, per concept, the vocabulary tags ordered by NGD.

    Concepts that already have
    ``<rootpath>/<collection>/SimilarityIndex/ngd/<concept>.txt`` are skipped
    unless ``options.overwrite`` is set.  For the others, all tags of
    ``fcs.vob`` with ``computeNGD(concept, tag, img=1) < 10`` are stored in
    ascending order of that distance.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultdir = os.path.join(rootpath, collection, "SimilarityIndex", "ngd")
    pending = []
    for name in readConcepts(collection, annotationName, rootpath):
        target = os.path.join(resultdir, name + '.txt')
        if overwrite or not os.path.exists(target):
            pending.append(name)

    if len(pending) == 0:
        printStatus(INFO, 'nothing to do')
        return

    fcs = FlickrContextSim(collection, rootpath=rootpath)
    vocabulary = fcs.vob
    printStatus(INFO, 'expanding tags for %s-%s -> %s' % (collection, annotationName, resultdir))

    for concept in pending:
        resultfile = os.path.join(resultdir, concept + '.txt')

        neighbours = []
        for tag in vocabulary:
            distance = fcs.computeNGD(concept, tag, img=1)
            if not distance < 10:
                continue
            neighbours.append((tag, distance))
        neighbours.sort(key=lambda entry: entry[1])

        preview = [entry[0] for entry in neighbours[:3]]
        printStatus(INFO, '%s -> %s' % (concept, ' '.join(preview)))
        writeRankingResults(neighbours, resultfile)
def process(options, testCollection, method):
    """Evaluate predicted tags with ``HitScorer`` at k = 1, 2, 5 and 10.

    Predictions are read from
    ``<rootpath>/<testCollection>/autotagging/<testCollection>/<method>/id.tagvotes.txt``.
    Each line starts with an image id followed by an even number of fields;
    the fields at even positions are taken as the predicted tags in rank
    order (the others are presumably their votes -- confirm).  Images
    without ground truth are ignored.  The mean score of every scorer is
    printed on one line below the scorer names.
    """
    rootpath = options.rootpath
    scorers = [HitScorer(k) for k in [1, 2, 5, 10]]

    im2truth = load_ground_truth(testCollection, imset=None, rootpath=rootpath)
    printStatus(INFO, 'nr of ground-truthed images: %d' % len(im2truth))

    tag_prediction_file = os.path.join(rootpath, testCollection, 'autotagging',
                                       testCollection, method, 'id.tagvotes.txt')
    printStatus(INFO, 'evaluating %s' % tag_prediction_file)

    res = [0] * len(scorers)
    nr_of_images = 0

    with open(tag_prediction_file) as fr:
        for line in fr:
            elems = line.strip().split()
            if not elems:
                # A blank line carries no prediction.
                continue
            imageid = elems[0]
            del elems[0]
            assert(len(elems) % 2 == 0)
            # Consider at most the first 10 predicted tags.
            pred_labels = elems[0:20:2]

            truth = im2truth.get(imageid, None)
            if not truth:
                continue

            sorted_labels = [int(x in truth) for x in pred_labels]
            perf = [scorer.score(sorted_labels) for scorer in scorers]
            res = [res[i] + perf[i] for i in range(len(scorers))]
            nr_of_images += 1

    printStatus(INFO, 'nr of images: %d' % nr_of_images)

    if nr_of_images > 0:
        # float() keeps Python 2 from truncating when the scores are ints.
        res = [x / float(nr_of_images) for x in res]
    # With no evaluated image the zero totals are reported as they are
    # instead of dividing by zero.

    print(' '.join([x.name() for x in scorers]))
    print(' '.join(['%.3f' % x for x in res]))
def process(options, feat_dir, imsetfile, result_dir):
    """Copy the features of the requested images into ``result_dir``.

    The image ids are the stripped lines of ``imsetfile``.  They are read
    from the ``BigFile`` at ``feat_dir`` in blocks of ``options.blocksize``
    and written as float32 to ``<result_dir>/feature.bin``.  The ids
    returned by the reads go to ``id.txt`` (space separated) and
    ``<count> <feat_file.ndims>`` goes to ``shape.txt``.

    Exits the process with status 0 when ``feature.bin`` exists and
    ``options.overwrite`` is not set.

    Raises:
        AssertionError: if an id was obtained more than once.
    """
    resultfile = os.path.join(result_dir, "feature.bin")
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    with open(imsetfile) as fr:
        imset = [line.strip() for line in fr]
    # Single pre-formatted argument: same output under Python 2 and 3.
    print("requested %d" % len(imset))

    feat_file = BigFile(feat_dir)
    makedirsforfile(resultfile)

    done = []
    start = 0
    # The with-block closes feature.bin even when a read fails half-way.
    with open(resultfile, "wb") as fw:
        while start < len(imset):
            end = min(len(imset), start + options.blocksize)
            printStatus(INFO, "processing images from %d to %d" % (start, end - 1))
            toread = imset[start:end]
            if not toread:
                # Only reachable with a non-positive blocksize; without this
                # guard the loop would never advance.
                break
            renamed, vectors = feat_file.read(toread)
            for vec in vectors:
                np.array(vec, dtype=np.float32).tofile(fw)
            done.extend(renamed)
            start = end

    assert len(done) == len(set(done))

    with open(os.path.join(result_dir, "id.txt"), "w") as fw:
        fw.write(" ".join(done))

    with open(os.path.join(result_dir, "shape.txt"), "w") as fw:
        fw.write("%d %d" % (len(done), feat_file.ndims))

    print("%d requested, %d obtained" % (len(imset), len(done)))