def process(options):
    """Tune combination weights with coordinate ascent and store the best ones.

    Reads ``options.overwrite``, ``options.inputeFile`` and
    ``options.weightFile``; both file names are resolved inside the
    ``result`` directory.  The interpreter exits with status 0 when
    ``checkToSkip`` reports that the weight file must not be regenerated.
    The winning weights are written space-separated to the weight file.
    """
    overwrite = options.overwrite
    inputeFile = options.inputeFile
    weightFile = os.path.join('result', options.weightFile)

    if checkToSkip(weightFile, overwrite):
        sys.exit(0)
    makedirsforfile(weightFile)

    test()
    print('-' * 70)

    best_perf = -10
    best_alpha = None
    sigma = 0.001
    data = load_data(os.path.join('result', inputeFile))
    # The loop currently performs a single run; widen the range to keep the
    # best of several runs.
    for _ in range(1):
        perf, alpha = coordinate_ascent(data, sigma)
        if perf > best_perf:
            best_perf = perf
            best_alpha = alpha

    print('*' * 70)
    print('optimized wights: %s' % ' '.join(['%g' % x for x in best_alpha]))
    print('best tuned performance: %s' % (best_perf,))
    # Use a context manager so the weight file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(weightFile, 'w') as fw:
        fw.write(' '.join(map(str, best_alpha)))
    print('optimized wight parameters have written into %s' % weightFile)
def process(options, source_dir, feat_dim, imsetfile, result_dir):
    """Extract the feature vectors of a given image set into ``result_dir``.

    The image ids listed (one per line) in ``imsetfile`` are read from the
    ``BigFile`` opened on ``source_dir`` in blocks of ``options.blocksize``
    and appended as float32 to ``<result_dir>/feature.bin``.  The ids that
    were actually obtained are written space-separated to
    ``<result_dir>/id.txt``.  Exits with status 0 when ``checkToSkip`` says
    the result already exists.
    """
    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    # Context managers guarantee the handles are released even on errors.
    with open(imsetfile) as fr:
        imset = [line.strip() for line in fr.readlines()]
    print('requested %d' % len(imset))

    featurefile = BigFile(source_dir, feat_dim)
    makedirsforfile(resultfile)

    done = []
    start = 0
    with open(resultfile, 'wb') as fw:
        while start < len(imset):
            end = min(len(imset), start + options.blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
            renamed, vectors = featurefile.read(imset[start:end])
            for vec in vectors:
                np.array(vec, dtype=np.float32).tofile(fw)
            done += renamed
            start = end

    # Every id must have been emitted exactly once.
    assert(len(done) == len(set(done)))

    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(done))

    print('%d requested, %d obtained' % (len(imset), len(done)))
def process(options, tagfile, tpp):
    """Stem or lemmatize the tags of every record in ``tagfile``.

    Each input line is split on whitespace: the first two fields are copied
    through unchanged and the remaining fields are treated as tags, which
    are lower-cased and passed through ``stemming`` (when ``tpp`` is
    ``"stem"``) or ``lemmatize`` (otherwise).  Lines with fewer than three
    fields are counted as parsed but produce no record.  Output goes to
    ``id.userid.<tpp>tags.txt`` next to ``tagfile``, encoded as UTF-8.

    Returns 0 when ``checkToSkip`` says the result file should be skipped,
    otherwise ``None``.
    """
    if "stem" == tpp:
        worker = nltk.PorterStemmer()
        func = stemming
    else:
        worker = nltk.WordNetLemmatizer()
        func = lemmatize

    resultfile = os.path.join(os.path.split(tagfile)[0], 'id.userid.%stags.txt' % tpp)
    if checkToSkip(resultfile, options.overwrite):
        return 0
    makedirsforfile(resultfile)

    parsed = 0
    obtained = 0
    with codecs.open(resultfile, "w", encoding='utf8') as fw:
        with open(tagfile) as fr:
            for line in fr:
                elems = line.strip().split()
                parsed += 1
                if len(elems) <= 2:
                    continue
                newtags = []
                for tag in elems[2:]:
                    # Best effort: fall back to the raw tag when it cannot be
                    # normalized.  ``Exception`` (not a bare except) keeps
                    # Ctrl-C and SystemExit working.
                    try:
                        newtag = func(worker, tag.lower())
                    except Exception:
                        newtag = tag
                    newtags.append(newtag.decode('utf-8'))
                newline = "\t".join([elems[0], elems[1], " ".join(newtags)])
                fw.write('%s\n' % newline)
                obtained += 1

    print('%d lines parsed, %d records obtained' % (parsed, obtained))
def process(options, conceptfile, tagvotesfile, resultfile):
    """Convert a textual tagvotes file into a pickled score matrix.

    ``conceptfile`` lists one concept per line.  Each line of
    ``tagvotesfile`` is ``<image id> <tag> <score> <tag> <score> ...``; the
    image id is converted to ``int``.  Scores of tags that are not in the
    concept list are ignored and missing entries keep the default -1e4.
    The dictionary ``{'concepts', 'id_images', 'scores'}`` is pickled to
    ``resultfile`` with the highest protocol.

    Returns 0 when ``checkToSkip`` says the result should be skipped,
    otherwise ``None``.
    """
    if checkToSkip(resultfile, options.overwrite):
        return 0

    # Read both inputs through context managers so the handles are closed.
    with open(conceptfile) as fr:
        concepts = [line.strip() for line in fr.readlines()]
    concept2index = dict(zip(concepts, range(len(concepts))))

    with open(tagvotesfile) as fr:
        data = fr.readlines()
    print('%d instances to dump' % len(data))

    concept_num = len(concepts)
    image_num = len(data)
    scores = np.zeros((image_num, concept_num)) - 1e4
    id_images = [None] * image_num

    for i, line in enumerate(data):
        elems = line.split()
        id_images[i] = int(elems[0])
        del elems[0]
        # The remaining tokens are (tag, score) pairs.
        for k in range(0, len(elems), 2):
            tag = elems[k]
            score = float(elems[k+1])
            j = concept2index.get(tag, -1)
            if j >= 0:
                scores[i, j] = score

    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as output:
        pickle.dump({'concepts': concepts, 'id_images': id_images, 'scores': scores}, output, -1)
def process(options, collection, annotationName, simdir, resultfile):
    """Gather per-concept ranking files into one pickled score matrix.

    For every concept of ``annotationName`` the ranking stored in
    ``<simdir>/<concept>.txt`` is loaded with ``readRankingResults`` and
    its scores are placed in the column of that concept; rows follow the
    order of ``readImageSet``.  Entries without a score stay at -1e4.

    Returns 0 when ``checkToSkip`` says the result should be skipped,
    otherwise ``None``.
    """
    rootpath = options.rootpath
    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    id_images = readImageSet(collection, collection, rootpath)
    nr_concepts = len(concepts)
    nr_images = len(id_images)
    row_of = dict(zip(id_images, range(nr_images)))
    print('%d instances, %d concepts to dump -> %s' % (nr_images, nr_concepts, resultfile))

    scores = np.zeros((nr_images, nr_concepts)) - 1e4
    for col, concept in enumerate(concepts):
        ranking = readRankingResults(os.path.join(simdir, '%s.txt' % concept))
        for image_id, value in ranking:
            scores[row_of[image_id], col] = value

    makedirsforfile(resultfile)
    payload = {
        'concepts': concepts,
        'id_images': [int(x) for x in id_images],
        'scores': scores,
    }
    output = open(resultfile, 'wb')
    pickle.dump(payload, output, -1)
    output.close()
def process(options, collection, annotationName, pos_num):
    """Create randomly sampled positive/negative annotation bags.

    For every (positive bag, negative bag) index pair a new annotation name
    is derived from ``annotationName`` and the concept list is written under
    ``<rootpath>/<collection>/Annotations``.  The list of all new annotation
    names is written to a file under ``annotationfiles``.  Then, per concept,
    positives and negatives are sampled from the labels returned by
    ``readAnnotationsFrom`` and written with ``writeAnnotations``.

    Reads ``rootpath``, ``pos_bag_num``, ``neg_bag_num``, ``neg_pos_ratio``
    and ``overwrite`` from ``options``.  Returns 0 when every annotation
    file was skipped by ``checkToSkip``, otherwise ``None``.
    """
    assert(annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    # Template with two remaining '%d' slots: positive bag index and
    # negative bag index.
    annotationNameStr = annotationName[:-4] + ('.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)

    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            # Flatten the (idxp, idxn) pair into a single list position.
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath,collection,'Annotations',newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts,resultfile)

    # Build the name of the file listing all generated annotation names by
    # replacing each '%d' slot with its index range "0-<max>".
    first,second,last = annotationNameStr.split('%d')
    scriptfile = os.path.join(rootpath,collection,'annotationfiles',first + '0-%d'%(pos_bag_num-1) + second + '0-%d'%(neg_bag_num-1) + last)

    makedirsforfile(scriptfile)
    fout = open(scriptfile,'w')
    fout.write('\n'.join(newAnnotationNames) + '\n')
    fout.close()

    # Nothing left to generate when every concept list was skipped.
    if len(newAnnotationNames) == skip:
        return 0

    for concept in concepts:
        names,labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        positivePool = [x[0] for x in zip(names,labels) if x[1]>0]
        negativePool = [x[0] for x in zip(names,labels) if x[1]<0]

        for idxp in range(pos_bag_num):
            # One positive bag per idxp, shared by all its negative bags.
            # When the pool is not larger than pos_num the whole pool is used
            # (the bag is then the pool object itself, not a copy).
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool

            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath,collection,'Annotations','Image',newAnnotationName,'%s.txt'%concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue
                # At least 1000 negatives are requested, capped by the number
                # of negatives actually available.
                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)

                assert(len(set(positiveBag).intersection(set(negativeBag))) == 0)

                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept,anno_idx,len(positiveBag),len(negativeBag),resultfile))
                writeAnnotations(positiveBag + negativeBag, [1]*len(positiveBag) + [-1]*len(negativeBag), resultfile)
def process(options, testCollection, trainCollection, tagsimMethod):
    """Estimate tag relevance for the test images with a tag-similarity model.

    The result is written to ``id.tagvotes.txt`` (or
    ``id.tagvotes.<numjobs>.<job>.txt`` when ``options.numjobs`` > 1) below
    ``<rootpath>/<testCollection>/tagrel``.  Images listed in
    ``options.donefile`` are excluded and the remaining images are split
    round-robin over the jobs.  Exits with status 0 when ``checkToSkip``
    says the result already exists.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    testsetName = options.testset if options.testset else testCollection
    numjobs = options.numjobs
    job = options.job

    useWnVob = 1
    outputName = tagsimMethod + '-wn' if useWnVob else tagsimMethod

    if tagsimMethod == 'wns':
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, outputName, 'id.tagvotes.txt')
    else:
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, trainCollection, outputName, 'id.tagvotes.txt')

    if numjobs > 1:
        resultfile = resultfile.replace("id.tagvotes.txt", "id.tagvotes.%d.%d.txt" % (numjobs, job))

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)
    makedirsforfile(resultfile)

    # Best effort: a missing or unreadable done-file simply means nothing is
    # excluded.  ``Exception`` (not a bare except) lets Ctrl-C/SystemExit
    # through.  The last line is dropped, as in the original code.
    try:
        with open(options.donefile) as fr:
            doneset = set([str.split(x)[0] for x in fr.readlines()[:-1]])
    except Exception:
        doneset = set()
    printStatus(INFO, "done set: %d" % len(doneset))

    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    # Round-robin split: job numbers are 1-based.
    testImageSet = [x for i, x in enumerate(testImageSet) if (i % numjobs + 1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs, job, len(testImageSet), resultfile))

    testreader = TagReader(testCollection, rootpath=rootpath)

    if tagsimMethod == "wns":
        tagrel = SIM_TO_TAGREL["wns"](trainCollection, useWnVob, "wup", rootpath)
    else:
        tagrel = SIM_TO_TAGREL[tagsimMethod](trainCollection, useWnVob, rootpath)

    done = 0
    with open(resultfile, "w") as fw:
        for qry_id in testImageSet:
            qry_tags = testreader.get(qry_id)
            tagvotes = tagrel.estimate(qry_tags)
            newline = qry_id + " " + " ".join(["%s %s" % (tag, niceNumber(vote, 8)) for (tag, vote) in tagvotes])
            fw.write(newline + "\n")
            done += 1
            if done % 1000 == 0:
                printStatus(INFO, "%d done" % done)

    # done
    printStatus(INFO, "%d done" % done)
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    """Tag the test images with a ``fastlinear`` model array.

    Features are streamed from ``<rootpath>/<testCollection>/FeatureData/<feature>``;
    only images assigned to this job (round-robin, 1-based ``options.job``)
    are predicted.  At most ``options.topk`` tag votes are kept per image
    when ``topk`` is positive.

    Returns 0 when ``checkToSkip`` says the result should be skipped,
    otherwise the number of images written.
    """
    assert(modelName.startswith('fastlinear'))

    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    topk = options.topk

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection,
                              trainCollection, trainAnnotationName,
                              '%s,%s' % (feature, modelName), 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile = resultfile + '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    # The concept list is loaded (and counted) but not used further below.
    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    all_images = readImageSet(testCollection, testCollection, rootpath)
    test_imset = set([im for pos, im in enumerate(all_images) if pos % numjobs + 1 == job])
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, len(test_imset), resultfile))

    ma = ModelArray(trainCollection, trainAnnotationName, feature, modelName, rootpath=rootpath)
    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")
    done = 0

    feat_file.open()
    for _id, _vec in feat_file:
        if _id in test_imset:
            tagvotes = ma.predict([_vec], prob=0)[0]
            if topk > 0:
                tagvotes = tagvotes[:topk]
            pairs = ["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]
            fw.write('%s %s\n' % (_id, " ".join(pairs)))
            done += 1
            if done % 1e4 == 0:
                printStatus(INFO, "%d done" % done)
    feat_file.close()
    fw.close()

    printStatus(INFO, "%d done" % (done))
    return done
def process(options, testCollection):
    """Zero-shot tagging of a test collection via a semantic embedding.

    Image features stored under ``FeatureData/<options.pY0>`` are embedded
    with ``Image2Vec`` (label set ``options.Y0``) and tagged with
    ``ZeroshotTagger`` (label set ``options.Y1``), keeping the top
    ``options.r`` predictions.  Images are handled in blocks of 2000.

    Returns 0 when ``checkToSkip`` says the result should be skipped,
    otherwise ``None``.
    """
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding_model = options.embedding
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    blocksize = 2000

    embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)
    # Both label sets must already have their synset vectors on disk.
    for synset_name in [Y0, Y1]:
        synset_dir = os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)
        assert(os.path.exists(synset_dir))

    resfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection,
                           embedding_name, pY0, 'id.tagvotes.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, embedding_name)
    i2v = Image2Vec(label_file, label2vec_dir)

    tagger = ZeroshotTagger(synset_name=Y1, embedding_name=embedding_name, rootpath=rootpath)

    imset = readImageSet(testCollection, testCollection, rootpath)
    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', pY0))
    nr_images = len(imset)
    printStatus(INFO, 'tagging %d images' % nr_images)

    makedirsforfile(resfile)
    fw = open(resfile, 'w')

    start = 0
    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end))

        todo = imset[start:end]
        if len(todo) == 0:
            break

        renamed, vectors = feat_file.read(todo)
        lines = []
        for _id, _vec in zip(renamed, vectors):
            pred = tagger.predict(i2v.embedding(_vec), topk=options.r)
            votes = ' '.join(['%s %s' % (x[0], x[1]) for x in pred])
            lines.append('%s %s\n' % (_id, votes))

        start = end
        fw.write(''.join(lines))

    fw.close()
def process(options, trainCollection, baseAnnotationName, startAnnotationName, feature, modelName):
    """Train one model per concept with ``NegativeBootstrap.learn``.

    ``modelName`` must be ``'fik'`` or ``'fastlinear'``; the matching
    train/compress/save functions are imported into module-level globals.
    Concepts are read from ``startAnnotationName``, split round-robin over
    ``options.numjobs`` jobs (1-based ``options.job``) and each resulting
    model is saved as ``<modeldir>/<concept>.model``.  Finally the concept
    list is written for the new annotation name and timing is logged.
    """
    # The selected implementations are published as module globals.
    global train_model, compress_model, save_model
    assert(modelName in ['fik', 'fastlinear'])
    if 'fik' == modelName:
        from model_based.svms.fiksvm.svmutil import svm_train as train_model
        from model_based.svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from model_based.svms.fiksvm.fiksvm import fiksvm_save_model as save_model
    else:
        from model_based.svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from model_based.svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from model_based.svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    # Parameter bundle handed to the naming helpers and to the learner.
    params = {'rootpath': rootpath, 'trainCollection': trainCollection, 'baseAnnotationName': baseAnnotationName, 'startAnnotationName': startAnnotationName, 'feature': feature, 'model': modelName, 'strategy': options.strategy, 'iterations': options.iterations, 'npr': options.npr, 'nr_bins': options.nr_bins}

    concepts = readConcepts(trainCollection, startAnnotationName, rootpath)
    newAnnotationName = get_new_annotation_name(params)
    newModelName = get_model_name(params)
    modeldir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, newModelName)
    # NOTE(review): this filter tests for '<concept>.txt' while models are
    # saved as '<concept>.model' below -- confirm which one is intended.
    # Changing it would also change how concepts are distributed over jobs.
    todo = [concept for concept in concepts if overwrite or os.path.exists(os.path.join(modeldir,'%s.txt'%concept)) is False]
    # Round-robin split of the remaining concepts; job numbers are 1-based.
    activeConcepts = [todo[i] for i in range(len(todo)) if (i%options.numjobs+1) == options.job]

    params['feat_file'] = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))

    if 'fik' == modelName:
        # minmax.txt: first line is read as the minimum values, second line
        # as the maximum values.
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))

    s_time = time.time()

    for concept in activeConcepts:
        printStatus(INFO, 'processing %s' % concept)
        modelfile = os.path.join(modeldir, '%s.model'%concept)
        if checkToSkip(modelfile, overwrite):
            continue
        new_model = NegativeBootstrap.learn(concept, params)
        makedirsforfile(modelfile)
        printStatus(INFO, 'save model to %s' % modelfile)
        save_model(modelfile, new_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    # The full concept list (not only this job's share) is written out.
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(activeConcepts), ' '.join(activeConcepts)))
    printStatus(INFO, 'models stored at %s' % modeldir)
    printStatus(INFO, '%g seconds in total' % timecost)
def process(options, trainCollection, annotationName, testCollection):
    """Tag test images with a tag co-occurrence tagger.

    When ``options.kc`` is below 1e-6 a ``TagCooccurTagger`` is used,
    otherwise a ``TagCooccurPlusTagger`` that also receives
    ``options.feature``.  The tagger parameters ``m``, ``k_r``, ``k_d``,
    ``k_s``, ``k_c`` and ``add_bonus`` are copied from ``options``.  Votes
    are buffered and flushed every 1e4 images.  Exits with status 0 when
    ``checkToSkip`` says the result already exists.
    """
    rootpath = options.rootpath
    m = options.m
    k_r = options.kr
    k_d = options.kd
    k_s = options.ks
    k_c = options.kc
    feature = options.feature
    add_bonus = options.bonus
    overwrite = options.overwrite

    #outputName = 'cotag,m%d,kr%d,kd%d,ks%d,kc%d,bonus%d'%(m,k_r,k_d,k_s,k_c,add_bonus)
    # simplify the outputName to reduce the length of the result filename
    outputName = 'cotag'
    if k_c > 1e-6:
        outputName = os.path.join(outputName, feature)

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection,
                              trainCollection, annotationName, outputName, 'id.tagvotes.txt')
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    testImageSet = readImageSet(testCollection, testCollection, rootpath=rootpath)
    test_tag_reader = TagReader(testCollection, rootpath=rootpath)

    if k_c < 1e-6:
        tagger = TagCooccurTagger(testCollection, trainCollection, annotationName, rootpath=rootpath)
    else:
        tagger = TagCooccurPlusTagger(testCollection, trainCollection, annotationName,
                                      feature=feature, rootpath=rootpath)

    tagger.m = m
    tagger.k_r = k_r
    tagger.k_d = k_d
    tagger.k_s = k_s
    tagger.k_c = k_c
    tagger.add_bonus = add_bonus

    makedirsforfile(resultfile)
    fw = open(resultfile, 'w')

    buffered = []
    done = 0
    for im in testImageSet:
        user_tags = test_tag_reader.get(im)
        tagvotes = tagger.predict(content=im, context=user_tags)
        votes = ' '.join(['%s %s' % (x[0], niceNumber(x[1], 6)) for x in tagvotes])
        buffered.append('%s %s' % (im, votes))
        done += 1

        if len(buffered) % 1e4 == 0:
            fw.write('\n'.join(buffered) + '\n')
            buffered = []
            printStatus(INFO, '%d done' % done)

    # Flush whatever is left in the buffer.
    if buffered:
        fw.write('\n'.join(buffered) + '\n')
    fw.close()
    printStatus(INFO, '%d done' % done)
def process(options, collection, conceptfile):
    """Collect, per concept, the images tagged with that concept.

    ``conceptfile`` lists one concept per line; blank lines and lines
    starting with ``#`` are ignored.  A concept of the form ``a-b-c`` is
    matched by images present in the hit lists of all its parts.  Images
    listed in ``ImageSets/holdout.txt`` (if that file can be read) are
    removed.  Non-empty results are written one image per line to
    ``<rootpath>/<collection>/tagged,<tpp>/<concept>.txt``.

    Returns 0 when there is nothing to do, otherwise ``None``.
    """
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    with open(conceptfile) as fr:
        concepts = [x.strip() for x in fr.readlines() if x.strip() and not x.strip().startswith('#')]
    resultdir = os.path.join(rootpath, collection, 'tagged,%s' % tpp)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resultfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        printStatus(INFO, 'nothing to do')
        return 0

    # The holdout list is optional: a missing/unreadable file means that no
    # image is held out.  Only I/O errors are tolerated so that real bugs and
    # Ctrl-C are not silently swallowed.
    holdoutfile = os.path.join(rootpath, collection, 'ImageSets', 'holdout.txt')
    try:
        with open(holdoutfile) as fr:
            holdoutSet = set([line.strip() for line in fr.readlines()])
    except (IOError, OSError):
        holdoutSet = set()

    hitlists = buildHitlists(collection, todo, tpp, rootpath)
    min_hit = 1e6
    max_hit = 0

    for concept in todo:
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resultfile, overwrite):
            continue

        # Intersect the hit lists of all '-'-separated parts.
        subconcepts = concept.split('-')
        labeledSet = set(hitlists[subconcepts[0]])
        for sub in subconcepts[1:]:
            labeledSet = labeledSet.intersection(hitlists[sub])
        labeledSet = labeledSet.difference(holdoutSet)
        nr_hits = len(labeledSet)

        if nr_hits == 0:
            printStatus(INFO, '%s has ZERO hit' % concept)
        else:
            printStatus(INFO, '%s, %d hits -> %s' % (concept, nr_hits, resultfile))
            makedirsforfile(resultfile)
            with open(resultfile, 'w') as fw:
                fw.write('\n'.join(labeledSet) + '\n')

        max_hit = max(max_hit, nr_hits)
        min_hit = min(min_hit, nr_hits)

    printStatus(INFO, 'max hits: %d, min hits: %d' % (max_hit, min_hit))
def process(options, label_file, label2vec_dir, testCollection, feature, new_feature):
    """Embed image features with ``Image2Vec`` and store them as text.

    Images of ``ImageSets/<subset>.txt`` (``options.subset`` or, when that
    is empty, the collection itself) are read from ``FeatureData/<feature>``
    in blocks of ``options.blocksize``, embedded with
    ``im2vec.embedding(vec, options.k)`` and written, one image per line, to
    ``FeatureData/<new_feature>/id.feature.txt``.

    Returns 0 when ``checkToSkip`` says the result should be skipped,
    otherwise the number of images written.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    k = options.k
    blocksize = options.blocksize
    subset = options.subset if options.subset else testCollection

    resfile = os.path.join(rootpath, testCollection, 'FeatureData', new_feature, 'id.feature.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    imsetfile = os.path.join(rootpath, testCollection, 'ImageSets', '%s.txt' % subset)
    # Close the image-set file as soon as it has been read.
    with open(imsetfile) as fr:
        imset = [line.strip() for line in fr.readlines()]
    printStatus(INFO, '%d images to do' % len(imset))

    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
    im2vec = Image2Vec(label_file, label2vec_dir)

    makedirsforfile(resfile)

    read_time = 0
    run_time = 0
    start = 0
    done = 0

    with open(resfile, 'w') as fw:
        while start < len(imset):
            end = min(len(imset), start + blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

            s_time = time.time()
            renamed, test_X = feat_file.read(imset[start:end])
            read_time += time.time() - s_time

            s_time = time.time()
            output = []
            for name, feat in zip(renamed, test_X):
                vec = im2vec.embedding(feat, k)
                output.append('%s %s\n' % (name, " ".join([niceNumber(x, 6) for x in vec])))
            run_time += time.time() - s_time

            start = end
            fw.write(''.join(output))
            done += len(output)

        # done
        printStatus(INFO, "%d done. read time %g seconds, run_time %g seconds" % (done, read_time, run_time))

    return done
def process(options, collection):
    """Score every tag of an image by its position in the tag sequence.

    Duplicate tags are removed keeping the first occurrence; the i-th of n
    distinct tags receives the vote ``1 - i/n``.  Lines are buffered and
    flushed every 1e4 images to ``tagrel/<collection>/tagpos,<tpp>/id.tagvotes.txt``.
    Exits with status 0 when ``checkToSkip`` says the result already exists.
    """
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, collection, "tagrel", collection,
                              'tagpos,%s' % tpp, 'id.tagvotes.txt')
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    imset = readImageSet(collection, collection, rootpath)
    printStatus(INFO, 'working on %d test images -> %s' % (len(imset), resultfile))

    reader = TagReader(collection, tpp=tpp, rootpath=rootpath)

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    pending = []
    done = 0
    for im in imset:
        tags = reader.get(im)

        # Order-preserving de-duplication of the tag sequence.
        seen = set()
        ordered = []
        for tag in str.split(tags):
            if tag in seen:
                continue
            ordered.append(tag)
            seen.add(tag)
        assert(len(ordered) == len(seen))

        nr_tags = len(ordered)
        tagvotes = [(tag, 1.0 - float(pos) / nr_tags) for pos, tag in enumerate(ordered)]
        votes = " ".join(["%s %g" % (x[0], x[1]) for x in tagvotes])
        pending.append("%s %s" % (im, votes) + "\n")
        done += 1

        if len(pending) % 1e4 == 0:
            preview = ' '.join(['%s:%g' % (x[0], x[1]) for x in tagvotes[:3]])
            printStatus(INFO, '%d %s %s' % (done, im, preview))
            fw.write("".join(pending))
            fw.flush()
            pending = []

    if pending:
        fw.write("".join(pending))
    fw.close()
    printStatus(INFO, 'done')
def process(options, workingCollection, annotationName, outputpkl):
    """Dump a random score matrix (random baseline) in pickle format.

    The matrix has one row per image of ``readImageSet`` and one column per
    concept of ``annotationName``, filled with ``np.random.rand`` values.
    The dictionary ``{'concepts', 'id_images', 'scores'}`` is pickled to
    ``outputpkl`` with ``pickle.HIGHEST_PROTOCOL``; image ids are converted
    to ``int``.

    Returns 0 when ``checkToSkip`` says the result should be skipped,
    otherwise ``None``.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # HIGHEST_PROTOCOL is a binary protocol, so the file must be opened in
    # binary mode; text mode corrupts the stream on platforms that translate
    # newlines and is rejected outright by Python 3.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts': concepts,
                     'id_images': [int(x) for x in id_images],
                     'scores': tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
def process(options, feat_dir):
    """Write an L1/L2-normalized copy of a binary feature directory.

    The new directory is ``feat_dir`` with suffix ``[ssr]l<p>``.  Each
    vector of ``feature.bin`` is optionally signed-square-rooted
    (``options.ssr``) and divided by its L1 norm when ``options.p == 1`` or
    by its L2 norm for any other ``p``; a small epsilon avoids division by
    zero.  ``id.txt`` is copied and ``shape.txt`` rewritten.

    Returns 0 when ``checkToSkip`` says the result should be skipped,
    otherwise ``None``.
    """
    newname = 'ssr' if options.ssr else ''
    newname += 'l%d' % options.p

    resfile = os.path.join(feat_dir.rstrip('/\\') + newname, 'feature.bin')
    if checkToSkip(resfile, options.overwrite):
        return 0

    with open(os.path.join(feat_dir, 'shape.txt')) as fr:
        nr_of_images, feat_dim = map(int, fr.readline().strip().split())

    res = array.array('f')

    # Both streams are managed by ``with`` so they are closed on any error.
    with open(os.path.join(feat_dir, 'feature.bin'), 'rb') as fr:
        makedirsforfile(resfile)
        with open(resfile, 'wb') as fw:
            print('>>> writing results to %s' % resfile)

            for i in range(nr_of_images):
                res.fromfile(fr, feat_dim)
                vec = res
                if options.ssr:
                    vec = [np.sign(x) * np.sqrt(abs(x)) for x in vec]
                if options.p == 1:
                    Z = sum(abs(x) for x in vec) + 1e-9
                else:
                    Z = np.sqrt(sum([x**2 for x in vec])) + 1e-9
                if i % 1e4 == 0:
                    print('image_%d, norm_%d=%g' % (i, options.p, Z))
                vec = [x/Z for x in vec]
                # Empty the reusable buffer before the next read appends to it.
                del res[:]
                np.array(vec, dtype=np.float32).tofile(fw)

    print('>>> %d lines parsed' % nr_of_images)

    resdir = os.path.split(resfile)[0]
    shutil.copyfile(os.path.join(feat_dir, 'id.txt'), os.path.join(resdir, 'id.txt'))

    with open(os.path.join(resdir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (nr_of_images, feat_dim))
def process(options, collection):
    """Combine the per-query results of several methods into one file.

    The first source is the ground-truth annotation directory
    ``Annotations/Image/concepts<collection>.txt``; further source
    directories are listed one per line in ``result/<options.inputfile>``.
    For every query id the annotations of all sources are loaded from
    ``<source>/<qid>.txt``, sorted by name in descending order and checked
    (``checkSameList``) to cover the same names.  Each output line is
    ``<qid> <image> <value of source 0> <value of source 1> ...``.
    Exits with status 0 when ``checkToSkip`` says the result already exists.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    inputfile = options.inputfile
    resultname = options.resultname

    result_file = os.path.join('result', resultname)
    if checkToSkip(result_file, overwrite):
        sys.exit(0)
    makedirsforfile(result_file)

    # inpute of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)

    source_dirs = [os.path.join(rootpath, collection, 'Annotations', 'Image', 'concepts%s.txt' % collection)]
    with open(os.path.join('result', inputfile)) as fr:
        # Blank lines would otherwise become an empty directory name.
        source_dirs.extend([line.strip() for line in fr if line.strip()])

    with open(result_file, "w") as fout:
        for qid in qid_list:
            name2feature = {}
            key_names = []
            for fnum, source_dir in enumerate(source_dirs):
                data = readAnnotations(os.path.join(source_dir, '%s.txt' % qid))
                data.sort(key=lambda v: v[0], reverse=True)
                names = [x[0] for x in data]
                labels = [x[1] for x in data]

                if fnum == 0:
                    key_names = names
                    for name, label in zip(names, labels):
                        name2feature[name] = [label]
                else:
                    assert(checkSameList(key_names, names))
                    for name, label in zip(names, labels):
                        name2feature[name].append(label)

            for img in key_names:
                # str() keeps the join working when labels are numeric.
                values = ' '.join([str(v) for v in name2feature[img]])
                fout.write('%s %s %s\n' % (qid, img, values))

    print('Combined result of different written into %s' % result_file)
def process(options, trainCollection, annotationName):
    """Pre-compute, for every tag, the co-occurrence rank of each concept.

    ``rank_matrix[i, j]`` is the 1-based position of concept ``j`` in the
    list returned by ``tcb.top_cooccur`` for tag ``i``, or the number of
    tags (the default rank) when the concept does not appear in that list.
    The dictionary ``{'tags', 'concepts', 'rank_matrix'}`` is pickled to
    ``TextData/tag.concept-rank.<annotationName>.pkl``.

    Returns 0 when ``checkToSkip`` says the result should be skipped,
    otherwise ``None``.
    """
    # Prefer the C implementation where it exists, fall back to ``pickle``.
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, trainCollection, 'TextData', 'tag.concept-rank.%s.pkl' % annotationName)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    concept_num = len(concepts)
    concept2index = dict(zip(concepts, range(concept_num)))

    tcb = TagCooccurBase(trainCollection, rootpath=rootpath)
    tag_num = tcb.tag_num()
    DEFAULT_RANK = tag_num
    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy; the builtin yields the same dtype.
    rank_matrix = np.zeros((tag_num, concept_num), dtype=int) + DEFAULT_RANK

    tag_list = []
    for i, u in enumerate(tcb.vob):
        ranklist = tcb.top_cooccur(u, -1)
        hit = 0
        for j, x in enumerate(ranklist):
            idx = concept2index.get(x[0], -1)
            if idx >= 0:
                rank_matrix[i, idx] = j + 1
                hit += 1
                # Stop scanning once every concept has been ranked.
                if hit == concept_num:
                    break
        tag_list.append(u)

        if (i+1) % 1e4 == 0:
            printStatus(INFO, '%d done' % (i+1))

    assert(len(tag_list) == tag_num)

    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as output:
        pickle.dump({'tags': tag_list, 'concepts': concepts, 'rank_matrix': rank_matrix}, output, -1)
    printStatus(INFO, '%dx%d dumped to %s' % (tag_num, concept_num, resultfile))
def process(options, collection, feature):
    """Build a per-tag feature index below ``FeatureIndex/<feature>``.

    For every tag handled by this job (tags are sorted and split round-robin
    over ``options.numjobs`` jobs, 1-based ``options.job``) the features of
    at most 1000 images from its hit list are read and written as float32
    to ``<tag[:2]>/<tag>/feature.bin`` together with ``id.txt`` and
    ``shape.txt``.  Tags whose ``feature.bin`` is skipped by ``checkToSkip``
    are left alone.
    """
    rootpath = options.rootpath
    tpp = options.tpp
    k = 1000 # options.k
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite

    feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
    feat_file = BigFile(feat_dir)

    hitlists = buildHitLists(collection, tpp, rootpath)
    printStatus(INFO, "nr of tags: %d" % len(hitlists))

    vob = sorted(hitlists.keys())
    vob = [tag for i, tag in enumerate(vob) if i % numjobs == job - 1]
    printStatus(INFO, "working on %d-%d: %d tags" % (numjobs, job, len(vob)))

    for tag_idx, tag in enumerate(vob):
        resultdir = os.path.join(rootpath, collection, "FeatureIndex", feature, tag[:2], tag)
        binfile = os.path.join(resultdir, "feature.bin")
        if checkToSkip(binfile, overwrite):
            continue

        hitlist = hitlists[tag]
        hitlist = hitlist[:k] # keep at most 1000 images per tag
        renamed, vecs = feat_file.read(hitlist)

        # Without any vector there is no dimensionality to report
        # (``vecs[0]`` would raise) and nothing worth indexing, so skip the
        # tag instead of aborting the whole job after a partial write.
        if len(renamed) == 0:
            printStatus(INFO, "%d - %s, no feature found, skipped" % (tag_idx, tag))
            continue

        makedirsforfile(binfile)
        np.array(vecs).astype(np.float32).tofile(binfile)

        with open(os.path.join(resultdir, "id.txt"), "w") as fw:
            fw.write(" ".join(renamed))

        with open(os.path.join(resultdir, "shape.txt"), "w") as fw:
            fw.write("%d %d" % (len(renamed), len(vecs[0])))

        if tag_idx % 1e3 == 0:
            printStatus(INFO, "%d - %s, %d images" % (tag_idx, tag, len(hitlist)))
def process(options, model_name, concept_file, weight_dir, result_dir):
    """Merge several per-concept models into one weighted model.

    ``concept_file`` lists one concept per line (blank lines and ``#``
    comments ignored).  For each concept ``<weight_dir>/<concept>.txt``
    holds lines ``<weight> <model_dir>``; at least two models are required.
    Model directories not starting with ``options.rootpath`` are taken
    relative to it.  The models are combined with ``add_fastsvm`` and the
    result is saved as ``<result_dir>/<concept>.model``.  Concepts whose
    result exists are skipped unless ``options.overwrite`` is set.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite

    if 'fastlinear' == model_name:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    with open(concept_file) as fr:
        concepts = [x.strip() for x in fr.readlines() if x.strip() and not x.strip().startswith('#')]
    todo = [x for x in concepts if overwrite or not os.path.exists(os.path.join(result_dir, '%s.model' % x))]
    printStatus(INFO, '%d concepts to do' % len(todo))

    for concept in todo:
        weight_file = os.path.join(weight_dir, '%s.txt' % concept)
        # Close each weight file right away (one is opened per concept) and
        # ignore blank lines, which cannot be unpacked into (weight, dir).
        with open(weight_file) as fr:
            weight_data = [line.strip() for line in fr if line.strip()]
        nr_of_models = len(weight_data)
        assert(nr_of_models >= 2)

        weights = []
        models = []
        for line in weight_data:
            w, model_dir = line.split()
            weights.append(float(w))
            if not model_dir.startswith(rootpath):
                model_dir = os.path.join(rootpath, model_dir)
            assert (model_dir.find(model_name) > 0)
            models.append(load_model(os.path.join(model_dir, '%s.model' % concept)))

        # The first model is updated in place: the first merge applies both
        # weights, later merges keep the accumulated model at weight 1.
        new_model = models[0]
        new_model.add_fastsvm(models[1], weights[0], weights[1])
        for i in range(2, len(models)):
            new_model.add_fastsvm(models[i], 1, weights[i])

        new_model_file = os.path.join(result_dir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        save_model(new_model_file, new_model)
def process(options, feat_dir, imsetfile, result_dir):
    """Extract the features of a given image set into ``result_dir``.

    The image ids listed (one per line) in ``imsetfile`` are read from the
    ``BigFile`` opened on ``feat_dir`` in blocks of ``options.blocksize``
    and appended as float32 to ``feature.bin``.  ``id.txt`` receives the
    obtained ids (space-separated) and ``shape.txt`` the number of obtained
    vectors and ``feat_file.ndims``.  Exits with status 0 when
    ``checkToSkip`` says the result already exists.
    """
    resultfile = os.path.join(result_dir, "feature.bin")
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    # Close the image-set file as soon as it has been read.
    with open(imsetfile) as fr:
        imset = [line.strip() for line in fr.readlines()]
    print("requested %d" % len(imset))

    feat_file = BigFile(feat_dir)

    makedirsforfile(resultfile)

    done = []
    start = 0
    with open(resultfile, "wb") as fw:
        while start < len(imset):
            end = min(len(imset), start + options.blocksize)
            printStatus(INFO, "processing images from %d to %d" % (start, end - 1))
            toread = imset[start:end]
            # An empty block (e.g. non-positive blocksize) would loop forever.
            if len(toread) == 0:
                break
            renamed, vectors = feat_file.read(toread)
            for vec in vectors:
                np.array(vec, dtype=np.float32).tofile(fw)
            done += renamed
            start = end

    # Every id must have been emitted exactly once.
    assert len(done) == len(set(done))

    with open(os.path.join(result_dir, "id.txt"), "w") as fw:
        fw.write(" ".join(done))

    with open(os.path.join(result_dir, "shape.txt"), "w") as fw:
        fw.write("%d %d" % (len(done), feat_file.ndims))

    print("%d requested, %d obtained" % (len(imset), len(done)))
def process(options, synset_file, synset_name):
    """Encode every synset of ``synset_file`` as a vector and store them.

    The encoder class is selected with ``get_synset_encoder(options.embedding)``.
    Synsets for which ``embedding`` returns ``None`` are left out.  Output
    goes to ``<rootpath>/synset2vec/<synset_name>/<corpus>,<word2vec>,<embedding>``:
    ``feature.bin`` (float32), ``id.txt`` (encoded synsets, space-separated)
    and ``shape.txt`` (count and ``s2v.get_feat_dim()``).

    Returns 0 when ``checkToSkip`` says the result should be skipped,
    otherwise ``None``.
    """
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding = options.embedding

    resdir = os.path.join(rootpath, 'synset2vec', synset_name, '%s,%s,%s' % (corpus, word2vec_model, embedding))
    resfile = os.path.join(resdir, 'feature.bin')
    if checkToSkip(resfile, overwrite):
        return 0

    with open(synset_file) as fr:
        synsets = [line.strip() for line in fr.readlines()]
    s2v = get_synset_encoder(embedding)(corpus, word2vec_model, rootpath=rootpath)
    makedirsforfile(resfile)

    good = []
    with open(resfile, 'wb') as fw:
        for wnid in synsets:
            vec = s2v.embedding(wnid)
            if vec is not None:
                np.array(vec, dtype=np.float32).tofile(fw)
                good.append(wnid)

    # Report the list length rather than the loop index, which would be
    # undefined (NameError) for an empty synset file.
    printStatus(INFO, '%d done, %d okay' % (len(synsets), len(good)))

    with open(os.path.join(resdir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(good))

    with open(os.path.join(resdir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(good), s2v.get_feat_dim()))
def process(options, workingCollection, annotationName, outputpkl):
    """Score concepts by the position of the matching tag (tag-position baseline).

    Every line of ``TextData/id.userid.lemmtags.txt`` is
    ``<image id>\\t<user id>\\t<tags>``.  A tag at 0-based position ``p``
    that is also a concept receives the score ``1 / (1 + p)``; tags that are
    not concepts are ignored.  When ``options.random`` is set, noise scaled
    by the smallest positive score is added to every entry.  The dictionary
    ``{'concepts', 'id_images', 'scores'}`` is pickled to ``outputpkl``.

    Returns 0 when ``checkToSkip`` says the result should be skipped,
    otherwise ``None``.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    add_noise = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    # The image set only fixes the number of rows; the ids that are stored
    # are the ones read from the tag file below.
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    id_images = []
    tag2idx = dict((concept, col) for col, concept in enumerate(concepts))
    tagfile = os.path.join(rootpath, workingCollection, 'TextData', 'id.userid.lemmtags.txt')
    with open(tagfile) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            for pos, tag in enumerate(tags.split()):
                col = tag2idx.get(tag)
                # Tags outside the concept vocabulary must be skipped: using
                # -1 as a column index would silently overwrite the score of
                # the LAST concept.
                if col is not None:
                    tagmatrix[cnt, col] = 1. / (1. + pos)
            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if add_noise:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    # HIGHEST_PROTOCOL is binary, so the file must be opened in binary mode.
    with open(resultfile, 'wb') as f:
        pickle.dump({'concepts': concepts,
                     'id_images': [int(x) for x in id_images],
                     'scores': tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
def process(options, collection):
    """Rank the annotated images of every query with calParzen and report DCG@25.

    For each query id, every annotated image is scored by ``calParzen`` against
    the features of all images annotated for that query. The images are sorted
    by decreasing score and the label sequence is scored with the "DCG@25"
    scorer. Rankings are flushed to disk every 20 queries. Finally the ranking
    result path is appended to ``result/individual_result_pathes.txt``.

    Args:
        options: object exposing ``rootpath``, ``overwrite``, ``feature``,
            ``method`` and ``sigma``.
        collection: name of the collection to process.

    Exits the interpreter with status 0 when either result path is skipped.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # output locations
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex', collection,
                                       'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    for target in (ranking_result_path, DCG_result_path):
        if checkToSkip(target, overwrite):
            sys.exit(0)

    # queries
    qid_list, query_list = readQidQuery(
        os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt'))
    qid2query = dict(zip(qid_list, query_list))

    # image features
    img_feats = BigFile(os.path.join(rootpath, collection, 'FeatureData', feature))

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")

    processed = 0
    qid2dcg = collections.OrderedDict()
    pending = {}  # rankings not yet flushed to disk
    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(
            collection, 'concepts%s.txt' % collection, qid, False, rootpath)
        renamed, test_X = img_feats.read(iid_list)

        parzen_list = [calParzen(img_feats.read_one(imidx), test_X, sigma)
                       for imidx in iid_list]
        ranked = sorted(zip(iid_list, label_list, parzen_list),
                        key=lambda v: v[2], reverse=True)
        pending[qid] = ranked

        # calculate DCG@25 on the labels in ranked order
        qid2dcg[qid] = scorer.score([entry[1] for entry in ranked])
        printMessage("Done", qid, qid2query[qid])

        processed += 1
        if processed % 20 == 0:
            writeRankingResult(ranking_result_path, pending)
            pending = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, pending)

    print("average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) / len(qid2dcg.values())))

    # Append to the list of result paths, creating the file on first use.
    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        mode = 'a'
    else:
        makedirsforfile(result_path_file)
        mode = 'w'
    fout = open(result_path_file, mode)
    fout.write(ranking_result_path + '\n')
    fout.close()
def process(options, workingCollection, feature):
    """Build the image-similarity graph Laplacian of a collection and save it as .mat.

    For every image of the working set, its nearest neighbours (restricted to
    images that are themselves in the working set) are collected as symmetric
    (row, column, distance) triplets. Duplicated (row, column) pairs are
    removed, distances are turned into weights ``exp(-d / sigma)`` with
    ``sigma = median(d) ** 2``, and the matrix with the column sums on the
    diagonal and the negated weights elsewhere is stored under the key
    ``'im_similarity'`` together with ``'sigma'``.

    Args:
        options: object exposing ``rootpath``, ``kratio``, ``distance`` and
            ``overwrite``.
        workingCollection: name of the collection to process.
        feature: feature name handed to ``_get_neighbors``.

    Returns:
        0 when ``checkToSkip`` reports that the result must be skipped,
        otherwise None. Calls ``sys.exit(1)`` when an image id raises
        ValueError during neighbour retrieval or has fewer than the required
        number of in-set neighbours.
    """
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection, '%s,%s,%f'%(feature,nnName,k_ratio), 'laplacianI.mat')

    if checkToSkip(resultfile, overwrite):
        return 0

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    # sorted so that bisect_index can map an image id to its row number
    # (presumably a binary search -- confirm against bisect_index)
    workingSet.sort()

    tot_images = len(workingSet)
    printStatus(INFO, '%d images' % (tot_images))

    # number of neighbours kept per image, as a fraction of the set size
    K_neighs = int(math.floor(len(workingSet) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Allocating I,J,V arrays')
    # Triplet storage: each kept neighbour adds two entries, (i, j) and (j, i).
    I = np.zeros((K_neighs * tot_images * 2))
    J = np.zeros((K_neighs * tot_images * 2))
    V = np.zeros((K_neighs * tot_images * 2))
    n_entries = 0

    # distances
    printStatus(INFO, 'Starting to fill I,J,V arrays')
    for i in xrange(tot_images):
        try:
            # ask for twice as many neighbours, since some may be filtered out below
            neighbors = _get_neighbors('%s,%s' % (workingCollection, workingSet[i]), rootpath, K_neighs*2, feature, distance)
            # remove images with features but not in the working set
            NNrow = []
            NNDrow = []
            new_neighs = []
            for x in neighbors:
                try:
                    NNrow.append(bisect_index(workingSet, x[0]))
                    NNDrow.append(x[1])
                    new_neighs.append(x)
                except ValueError:
                    # neighbour is not part of the working set: drop it
                    pass
            #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
            #NNDrow = np.array([x[1] for x in neighbors])
            NNrow = np.array(NNrow)
            NNDrow = np.array(NNDrow)
            neighbors = new_neighs[0:K_neighs]
        except ValueError:
            printStatus(INFO, 'ERROR: id_img %s has non-standard format!' % (workingSet[i]))
            sys.exit(1)

        if len(neighbors) < K_neighs:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (workingSet[i], len(neighbors), K_neighs))
            sys.exit(1)

        if (i+1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i+1, tot_images))
        for k in xrange(K_neighs):
            if i != int(NNrow[k]): # -1 zero on the diagonal for a later step
                # store the edge in both directions to keep the matrix symmetric
                I[n_entries] = i
                J[n_entries] = int(NNrow[k]) # -1
                V[n_entries] = NNDrow[k]
                n_entries += 1
                I[n_entries] = int(NNrow[k]) # -1
                J[n_entries] = i
                V[n_entries] = NNDrow[k]
                n_entries += 1

    # trim the unused tail (self-neighbours were skipped)
    I = I[0:n_entries]
    J = J[0:n_entries]
    V = V[0:n_entries]

    printStatus(INFO, 'Removing duplicates')
    # lexsort uses the LAST key as primary: order by I, then J, then V
    ind = np.lexsort((V,J,I))
    I = I[ind]
    J = J[ind]
    V = V[ind]
    # View each (I, J) row as one opaque item so np.unique compares whole rows.
    a = np.concatenate([I.reshape(1, len(I)), J.reshape(1, len(J))], axis=0).T
    b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    del a
    # idx holds the first occurrence of every distinct (I, J) pair
    _, idx = np.unique(b, return_index=True)
    del b

    I = I[idx]
    J = J[idx]
    V = V[idx]

    printStatus(INFO, 'Computing the final laplacian matrix')
    sigma = np.median(V) ** 2.;
    printStatus(INFO, 'Estimated sigma^2 = %f' % sigma)
    # distances -> similarity weights
    V = np.exp(-V / sigma)
    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tocsr()
    # column sums of the weight matrix become the diagonal entries
    new_diag = matrix.sum(axis=0).T
    # off-diagonal entries of the final matrix are the negated weights
    V = -V
    I_add = np.zeros((tot_images))
    J_add = np.zeros((tot_images))
    V_add = np.zeros((tot_images))
    for i,v in enumerate(new_diag):
        I_add[i] = i
        J_add[i] = i
        V_add[i] = v
    I = np.append(I, I_add)
    J = np.append(J, J_add)
    V = np.append(V, V_add)
    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tolil()

    printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'im_similarity' : matrix, 'sigma' : sigma})
def process(options, trainCollection, trainAnnotationName, feature):
    """Train one class-weighted liblinear model per concept and save it as fastlinear.

    Concepts whose model file is not skipped by ``checkToSkip`` are split
    round-robin over ``options.numjobs`` jobs; this call handles the share of
    job ``options.job`` (1-based). When ``options.best_param_dir`` is set,
    ``C`` and the probability parameters ``a``/``b`` are parsed from the first
    line of ``<best_param_dir>/<concept>.txt``; otherwise ``C=1, A=0, B=0``.

    Args:
        options: object exposing ``rootpath``, ``best_param_dir``,
            ``overwrite``, ``numjobs`` and ``job``.
        trainCollection: collection providing annotations and features.
        trainAnnotationName: annotation name of the concept list.
        feature: feature name under ``FeatureData``.

    Returns:
        0 when there is nothing to do, otherwise the number of concepts handled.
    """
    import re
    p = re.compile(r'best_C=(?P<C>[\.\d]+),\sa=(?P<a>[\.\-\d]+),\sb=(?P<b>[\.\-\d]+)')

    rootpath = options.rootpath
    best_param_dir = options.best_param_dir
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    beta = 0.5  # share of the total weight given to the positive class

    modelName = 'fastlinear-tuned' if best_param_dir else 'fastlinear'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName,
                             feature, modelName)

    pending = [concept for concept in concepts
               if not checkToSkip(os.path.join(resultdir, concept + '.model'), overwrite)]
    # round-robin split: job j (1-based) takes positions j-1, j-1+numjobs, ...
    todo = [concept for pos, concept in enumerate(pending) if pos % numjobs == (job - 1)]
    if not todo:
        return 0
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))

    feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))

    for concept in todo:
        if best_param_dir:
            param_file = os.path.join(best_param_dir, '%s.txt' % concept)
            m = p.search(open(param_file).readline().strip())
            C, A, B = [float(m.group(key)) for key in ('C', 'a', 'b')]
        else:
            C, A, B = 1, 0, 0
        printStatus(INFO, '%s, C=%g, A=%g, B=%g' % (concept, C, A, B))

        model_file_name = os.path.join(resultdir, concept + '.model')

        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept,
                                            skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = feat_file.read(names)
        y = [name2label[name] for name in renamed]

        # Class weights come from the annotation counts (not from the vectors
        # actually read), scaled so positives get `beta` of the total weight.
        n_pos = sum(1 for lab in labels if 1 == lab)
        n_neg = sum(1 for lab in labels if -1 == lab)
        wp = float(beta) * (n_pos + n_neg) / n_pos
        wn = (1.0 - beta) * (n_pos + n_neg) / n_neg

        # no bias term added by setting "-B -1"
        svm_params = '-w1 %g -w-1 %g -s 2 -B -1 -q' % (wp * C, wn * C)
        model = liblinear_train(y, vectors, svm_params)
        newmodel = liblinear_to_fastlinear([model], [1.0], feat_file.ndims)
        newmodel.set_probAB(A, B)

        makedirsforfile(model_file_name)
        printStatus(INFO, '-> %s' % model_file_name)
        fastlinear_save_model(model_file_name, newmodel)

        # reload the model file to do a simple check
        # NOTE(review): the reloaded model is discarded; the asserts below
        # inspect the in-memory `newmodel`, not what was read back from disk.
        fastlinear_load_model(model_file_name)
        probA, probB = newmodel.get_probAB()[0], newmodel.get_probAB()[1]
        assert(abs(probA - A) < 1e-6)
        assert(abs(probB - B) < 1e-6)

    return len(todo)
def process(options, workingCollection):
    """Build the tag-similarity matrix (or one chunk of it) and save it as .mat.

    The similarity of two tags is ``context + semantic`` where

    * ``context  = ratio_cs * (#images carrying both tags) / (freq_i + freq_k)``
    * ``semantic = max(0, (1 - ratio_cs) * tag_semantic_similarity(tag_i, tag_k, ic))``

    With a single chunk the matrix is turned into a Laplacian (column sums on
    the diagonal, negated similarities elsewhere). With several chunks only
    the rows of chunk ``options.chunk`` (1-based) are filled and the partial
    similarity matrix is saved as is. The matrix is stored under the key
    ``'tag_similarity'``.

    Args:
        options: object exposing ``rootpath``, ``overwrite``, ``chunk``,
            ``nchunks`` and ``ratiocs``.
        workingCollection: name of the collection to process.

    Returns:
        0 when ``checkToSkip`` reports that the result must be skipped,
        otherwise None. Calls ``sys.exit(1)`` when the tags file is missing.

    Raises:
        AssertionError: on an invalid chunk specification, or when the tag
            matrix and the vocabulary disagree in size.
    """
    rootpath = options.rootpath
    overwrite = options.overwrite
    chunk = options.chunk - 1  # options.chunk is 1-based
    n_chunks = options.nchunks
    ratio_cs = options.ratiocs
    assert chunk < n_chunks and chunk >= 0 and n_chunks > 0

    printStatus(INFO, 'RatioCS = %f' % ratio_cs)
    printStatus(INFO, 'Using Brown Corpus for the ic')
    brown_ic = wordnet_ic.ic('ic-brown.dat')

    tags_file = os.path.join(rootpath, workingCollection, 'TextData', 'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tags_file):
        printStatus(INFO, 'Tags file not found at %s Did you run wordnet_frequency_tags.py ?' % tags_file)
        sys.exit(1)

    result_dir = os.path.join(rootpath, workingCollection, 'LaplacianT', '%f' % (ratio_cs))
    if n_chunks > 1:
        resultfile = os.path.join(result_dir, 'laplacianT_%d.mat' % chunk)
    else:
        resultfile = os.path.join(result_dir, 'laplacianT.mat')

    if checkToSkip(resultfile, overwrite):
        return 0

    # Both datasets are copied into memory, so the file can be closed right away.
    tags_data = h5py.File(tags_file, 'r')
    try:
        vocab = list(tags_data['vocab'][:])
        tagmatrix = tags_data['tagmatrix'][:]
    finally:
        tags_data.close()
    N_tags = len(vocab)

    # single tag frequency
    frequency = tagmatrix.sum(axis=0)
    # The old message was `"%s " % len(frequency) == len(vocab)`, which
    # evaluates to the boolean False because of operator precedence.
    assert len(frequency) == len(vocab), \
        "%d tag frequencies for %d vocabulary entries" % (len(frequency), len(vocab))

    final_matrix = np.zeros((N_tags, N_tags))

    # similarity matrix
    printStatus(INFO, 'Building the similarity matrix')
    chunk_size = int(math.floor(N_tags / n_chunks))
    start_chunk = chunk * chunk_size
    # the last chunk also takes the rows left over by the integer division
    end_chunk = N_tags if chunk == (n_chunks - 1) else (chunk + 1) * chunk_size

    for i in xrange(start_chunk, end_chunk):
        if i % 100 == 0:
            printStatus(INFO, '%d / %d done' % (i+1, end_chunk))
        # only the upper triangle is computed; the value is mirrored below
        for k in xrange(i+1, N_tags):
            # rows whose two entries sum to more than 1.5 carry both tags
            context = ratio_cs * np.sum(tagmatrix[:, [i, k]].sum(axis=1) > 1.5) / (frequency[i] + frequency[k])
            semantic = max(0, (1. - ratio_cs) * tag_semantic_similarity(vocab[i], vocab[k], brown_ic))
            final_matrix[i, k] = context + semantic
            final_matrix[k, i] = final_matrix[i, k]

    # laplacian
    if n_chunks < 2:
        printStatus(INFO, 'Computing the laplacian matrix')
        new_diag = final_matrix.sum(axis=0).T
        final_matrix = - final_matrix
        np.fill_diagonal(final_matrix, new_diag)
        printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    else:
        printStatus(INFO, 'Saving partial matrix to %s' % resultfile)

    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'tag_similarity' : final_matrix})
def process(options, testCollection, trainCollection, annotationName, feature):
    """Precompute the nearest-neighbour tables used by TagProp and store them as HDF5.

    Two files are produced:

    * ``nn_train.h5`` (unless ``options.onlytest``): for every training image
      the row indices (``NN``) and distances (``NND``) of its ``k + 1`` nearest
      training images;
    * ``nn_test.h5``: for every test image the indices (``NNT``) and distances
      (``NNDT``) of its ``k`` nearest training images.

    Indices refer to positions in the sorted training set.

    Args:
        options: object exposing ``rootpath``, ``k``, ``distance``,
            ``overwrite`` and ``onlytest``.
        testCollection: collection (and image set) to find neighbours for.
        trainCollection: collection the neighbours are taken from.
        annotationName: annotation name handed to the tagger.
        feature: feature name.

    Returns:
        0 when an output is skipped by ``checkToSkip``, otherwise None.
        Calls ``sys.exit(1)`` when an image has too few neighbours.
    """
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    overwrite = options.overwrite
    testset = testCollection
    onlytest = options.onlytest

    nnName = distance + "knn"
    knn_tag = '%s,%s,%d' % (feature, nnName, k)
    resultfile_train = os.path.join(rootpath, trainCollection, 'TagProp-data',
                                    trainCollection, knn_tag, 'nn_train.h5')
    resultfile_test = os.path.join(rootpath, testCollection, 'TagProp-data', testset,
                                   trainCollection, annotationName, knn_tag, 'nn_test.h5')

    skip_train = (not onlytest) and checkToSkip(resultfile_train, overwrite)
    if skip_train or checkToSkip(resultfile_test, overwrite):
        return 0

    testSet = readImageSet(testCollection, testset, rootpath)
    trainSet = readImageSet(trainCollection, trainCollection, rootpath)
    testSet.sort()
    trainSet.sort()

    tagger = NAME_TO_TAGGER["preknn"](trainCollection, annotationName, feature, distance,
                                      rootpath=rootpath, k=1001)

    printStatus(INFO, '%d test images, %d train images' % (len(testSet), len(trainSet)))

    def fill_rows(image_ids, collection, width, index_table, dist_table):
        # Fill one row per image with its first `width` neighbours.
        total = len(image_ids)
        for row, id_img in enumerate(image_ids):
            neighbors = tagger._get_neighbors(content=None,
                                              context='%s,%s' % (collection, id_img))
            if len(neighbors) < width:
                printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), width))
                sys.exit(1)
            all_indices = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
            all_dists = np.array([x[1] for x in neighbors])
            index_table[row, :] = all_indices[0:width]
            dist_table[row, :] = all_dists[0:width]
            if row % 1000 == 0:
                printStatus(INFO, '%d / %d images' % (row, total))

    # allocate train -> train nearest neighbors
    if not onlytest:
        printStatus(INFO, 'Allocating NN, NND matrices')
        NN = np.zeros((len(trainSet), k+1), dtype=np.int32)
        NND = np.zeros((len(trainSet), k+1))

        printStatus(INFO, 'Filling NN, NND matrices')
        fill_rows(trainSet, trainCollection, k+1, NN, NND)

        printStatus(INFO, 'Saving train matrices to file %s' % (resultfile_train))
        makedirsforfile(resultfile_train)
        fout = h5py.File(resultfile_train, 'w')
        fout['NN'] = NN
        fout['NND'] = NND
        fout['trainSet'] = trainSet
        fout['concepts'] = tagger.concepts
        fout.close()

        # free the train tables before allocating the test ones
        del NN
        del NND

    # allocate test -> train nearest neighbors
    printStatus(INFO, 'Allocating NNT, NNDT matrices')
    NNT = np.zeros((len(testSet), k), dtype=np.int32)
    NNDT = np.zeros((len(testSet), k))

    printStatus(INFO, 'Filling NNT, NNDT matrices')
    fill_rows(testSet, testCollection, k, NNT, NNDT)

    printStatus(INFO, 'Saving test matrices to file %s' % (resultfile_test))
    makedirsforfile(resultfile_test)
    fout = h5py.File(resultfile_test, 'w')
    fout['NNT'] = NNT
    fout['NNDT'] = NNDT
    fout['trainSet'] = trainSet
    fout['testSet'] = testSet
    fout['concepts'] = tagger.concepts
    fout.close()
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    """Apply the per-concept models to the test images and write the tag votes.

    Each output line is ``<image id> <tag> <score> <tag> <score> ...`` with the
    tags sorted by decreasing score. The test set is split round-robin over
    ``options.numjobs`` jobs; this call handles the share of job
    ``options.job`` (1-based) and, for more than one job, appends
    ``.<numjobs>.<job>`` to the result file name.

    Args:
        options: object exposing ``rootpath``, ``overwrite``, ``prob_output``,
            ``numjobs``, ``job`` and ``blocksize``.
        testCollection: collection whose images are tagged.
        trainCollection: collection the models were trained on.
        trainAnnotationName: annotation name of the concept list.
        feature: feature name.
        modelName: model directory name; names starting with ``'fik'`` are
            loaded with the fiksvm loader, everything else with fastlinear.

    Returns:
        0 when the result is skipped or a model fails to load, otherwise the
        number of images written.
    """
    if modelName.startswith('fik'):
        from fiksvm.fiksvm import fiksvm_load_model as load_model
    else:
        from fastlinear.fastlinear import fastlinear_load_model as load_model

    rootpath = options.rootpath
    overwrite = options.overwrite
    prob_output = options.prob_output
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize

    outputName = '%s,%s' % (feature, modelName)
    if prob_output:
        outputName += ',prob'

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection,
                              trainCollection, trainAnnotationName, outputName,
                              'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    all_images = readImageSet(testCollection, testCollection, rootpath)
    # round-robin split of the test set over the jobs
    test_imset = [im for pos, im in enumerate(all_images) if pos % numjobs + 1 == job]
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs,job,nr_of_test_images,resultfile))

    model_dir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName,
                             feature, modelName)
    models = []
    for concept in concepts:
        loaded = load_model(os.path.join(model_dir, '%s.model' % concept))
        if loaded is None:
            # a missing model aborts the whole run
            return 0
        models.append(loaded)

    feat_file = BigFile(os.path.join(rootpath, testCollection, "FeatureData", feature))
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    read_time = 0
    test_time = 0
    done = 0

    start = 0
    while start < nr_of_test_images:
        end = min(nr_of_test_images, start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

        tic = time.time()
        renamed, test_X = feat_file.read(test_imset[start:end])
        read_time += time.time() - tic

        tic = time.time()
        output = [None] * len(renamed)
        for row, image_id in enumerate(renamed):
            vector = test_X[row]
            if prob_output:
                scores = [model.predict_probability(vector) for model in models]
            else:
                scores = [model.predict(vector) for model in models]
            tagvotes = sorted(zip(concepts, scores), key=lambda v: v[1], reverse=True)
            votes_text = " ".join(["%s %s" % (tag, niceNumber(vote,6)) for (tag,vote) in tagvotes])
            output[row] = '%s %s\n' % (image_id, votes_text)
        test_time += time.time() - tic

        start = end
        fw.write(''.join(output))
        fw.flush()
        done += len(output)

    # done
    printStatus(INFO, "%d done. read time %g seconds, test_time %g seconds" % (done, read_time, test_time))
    fw.close()
    return done
def process(options, trainCollection, valCollection, testCollection):
    """Train a W2VV_MS model, keep the best epoch and launch the generated test script.

    The model configuration is loaded from ``w2vv_configs/<model_config>.py``.
    After every epoch the model is validated; the weights are saved only when
    the validation performance improves (the previous best weight file is
    removed). Otherwise the learning rate is halved, at most once every three
    epochs, and training stops after more than 10 consecutive epochs without
    improvement. Loss and validation histories are written to the checkpoint
    directory, and ``do_test_<config>_<testCollection>.sh`` is generated from
    ``TEMPLATE_do_test.sh`` and executed.

    Args:
        options: object exposing ``rootpath``, ``overwrite``, ``checkpoint``,
            ``init_model_from``, ``unroll``, ``corpus``, ``word2vec``,
            ``batch_size``, ``model_config`` and ``val_metric``.
        trainCollection: collection providing the training captions.
        valCollection: collection used for validation.
        testCollection: collection named in the generated test script.

    Calls ``sys.exit(0)`` when the text style is unsupported or when the model
    file is skipped by ``checkToSkip``. All three collections must be in the
    same language (asserted).
    """
    lang = which_language(trainCollection)
    assert(which_language(trainCollection) == which_language(valCollection))
    assert(which_language(trainCollection) == which_language(testCollection))

    rootpath = options.rootpath
    overwrite = options.overwrite
    checkpoint = options.checkpoint
    init_model_from = options.init_model_from
    unroll = options.unroll
    corpus = options.corpus
    word2vec = options.word2vec
    batch_size = options.batch_size

    w2vv_config = options.model_config
    config = load_config('w2vv_configs/%s.py' % w2vv_config)

    img_feature = config.img_feature
    set_style = config.set_style
    # text embedding style (word2vec, bag-of-words, word hashing)
    text_style = config.text_style
    L1_normalize = config.L1_normalize
    L2_normalize = config.L2_normalize
    bow_vocab = config.bow_vocab+'.txt'

    l2_p = config.l2_p
    dropout = config.dropout

    max_epochs= config.max_epochs
    optimizer = config.optimizer
    loss_fun = config.loss_fun
    lr = config.lr
    clipnorm = config.clipnorm
    activation = config.activation
    sequences = config.sequences

    # lstm
    sent_maxlen = config.sent_maxlen
    embed_size = config.embed_size
    we_trainable = config.we_trainable
    lstm_size = config.lstm_size

    # layer sizes are given as a '-'-separated string in the config
    n_layers = map(int, config.n_layers.strip().split('-'))

    if init_model_from != '':
        init_model_name = init_model_from.strip().split("/")[-1]
        train_style = INFO + "_" + init_model_name
    else:
        train_style = INFO

    # text_style has the form '<rnn>@<bow>@<w2v>'
    rnn_style, bow_style, w2v_style = text_style.strip().split('@')
    # text embedding style
    model_info = w2vv_config

    if 'lstm' in text_style or 'gru' in text_style:
        if lang == 'zh':
            w2v_data_path = os.path.join(rootpath, 'zh_w2v', 'model', 'zh_jieba.model')
        else:
            w2v_data_path = os.path.join(rootpath, "word2vec", corpus, word2vec)

        # bag-of-words vocabulary file path
        text_data_path = os.path.join(rootpath, trainCollection, "TextData", "vocabulary", "bow", bow_vocab)
        bow_data_path = os.path.join(rootpath, trainCollection, "TextData", "vocabulary", bow_style, bow_vocab)

        # text embedding (text representation)
        text2vec = get_text_encoder(rnn_style)(text_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize, maxlen=sent_maxlen)
        bow2vec = get_text_encoder(bow_style)(bow_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize)
        w2v2vec = get_text_encoder(w2v_style)(w2v_data_path, ndims=0, language=lang, L1_normalize=L1_normalize, L2_normalize=L2_normalize)
        # a first layer size of 0 means "derive it from the encoders"
        if n_layers[0] == 0:
            n_layers[0] = bow2vec.ndims + w2v2vec.ndims
        else:
            assert n_layers[0] == bow2vec.ndims + w2v2vec.ndims

        # log file
        checkpoint_dir = os.path.join(rootpath, trainCollection, checkpoint, valCollection, train_style, model_info)
    else:
        logger.info("%s is not supported, please check the 'text_style' parameter", text_style)
        sys.exit(0)

    train_loss_hist_file = os.path.join(checkpoint_dir, 'train_loss_hist.txt')
    val_per_hist_file = os.path.join(checkpoint_dir, 'val_per_hist.txt')
    model_file_name = os.path.join(checkpoint_dir, 'model.json')
    model_img_name = os.path.join(checkpoint_dir, 'model.png')

    logger.info(model_file_name)
    if checkToSkip(model_file_name, overwrite):
        sys.exit(0)

    makedirsforfile(val_per_hist_file)

    # img2vec
    img_feat_path = os.path.join(rootpath, FULL_COLLECTION, 'FeatureData', img_feature)
    img_feats = BigFile(img_feat_path)

    # validation features are read from the same FULL_COLLECTION path as training
    val_img_feat_path = os.path.join(rootpath, FULL_COLLECTION, 'FeatureData', img_feature)
    val_img_feats = BigFile(val_img_feat_path)

    # dataset
    train_file = os.path.join(rootpath, trainCollection, 'TextData', '%s.caption.txt' % trainCollection)

    # training set
    # print "loss function: ", loss_fun
    dataset_style = 'sent_' + loss_fun
    DataSet = get_dataset(dataset_style)

    # represent text on the fly
    trainData = DataSet(train_file, batch_size, text2vec, bow2vec, w2v2vec, img_feats, flag_maxlen=True, maxlen=sent_maxlen)

    # get pre-trained word embedding
    we_weights = get_we_parameter(text2vec.vocab, w2v_data_path, lang)
    # define word2visualvec model
    w2vv = W2VV_MS( text2vec.nvocab, sent_maxlen, embed_size, we_weights, we_trainable, lstm_size, n_layers, dropout, l2_p, activation=activation, lstm_style=rnn_style, sequences=sequences, unroll=unroll)

    w2vv.save_json_model(model_file_name)
    w2vv.plot(model_img_name)
    w2vv.compile_model(optimizer, loss_fun, learning_rate = lr, clipnorm=clipnorm)

    if options.init_model_from != '':
        logger.info('initialize the model from %s', options.init_model_from)
        w2vv.init_model(options.init_model_from)

    # preparation for validation
    val_sent_file = os.path.join(rootpath, valCollection, 'TextData', '%s.caption.txt' % valCollection)
    val_sents_id, val_sents, val_id2sents = readSentsInfo(val_sent_file)
    val_img_list = map(str.strip, open(os.path.join(rootpath, valCollection, set_style, '%s.txt' % valCollection)).readlines())

    # Encode the validation sentences once; sentences that any of the three
    # encoders cannot map (None) are dropped together with their id.
    sent_feats_1 = []
    sent_feats_2 = []
    new_val_sents_id = []
    for index, sent in enumerate(val_sents):
        sent_vec = text2vec.mapping(sent)
        bow_vec = bow2vec.mapping(sent)
        w2v_vec = w2v2vec.mapping(sent)
        if sent_vec is not None and bow_vec is not None and w2v_vec is not None:
            sent_feats_1.append(sent_vec)
            sent_feats_2.append(list(bow_vec) + list(w2v_vec))
            new_val_sents_id.append(val_sents_id[index])
    sent_feats_1 = pad_sequences(sent_feats_1, maxlen=sent_maxlen, truncating='post')

    simer = get_simer('cosine_batch')()
    scorer = getScorer(options.val_metric)

    count = 0       # consecutive epochs without improvement
    lr_count = 0    # epochs since the learning rate was last decayed
    best_validation_perf = 0
    best_epoch = -1
    train_loss_hist = []
    val_per_hist = []
    n_train_batches = int(np.ceil( 1.0 * trainData.datasize / batch_size ))
    # the progress bar total is doubled for the 'ctl' loss
    if loss_fun == 'ctl':
        datasize = 2*trainData.datasize
    else:
        datasize = trainData.datasize
    for epoch in range(max_epochs):
        logger.info('Epoch %d', epoch)
        logger.info("Training..., learning rate: %g", w2vv.get_lr())

        train_loss_epoch = []
        train_progbar = generic_utils.Progbar(datasize)
        trainBatchIter = trainData.getBatchData()
        for minibatch_index in xrange(n_train_batches):
            train_X_batch, train_Y_batch = trainBatchIter.next()
            loss = w2vv.model.train_on_batch(train_X_batch, train_Y_batch)
            train_progbar.add(train_X_batch[0].shape[0], values=[("train loss", loss)])

            train_loss_epoch.append(loss)

        train_loss_hist.append(np.mean(train_loss_epoch))

        this_validation_perf = do_validation(val_img_list, val_img_feats, new_val_sents_id, sent_feats_1, sent_feats_2, simer, scorer, w2vv)
        val_per_hist.append(this_validation_perf)

        logger.info('previous_best_performance: %g', best_validation_perf)
        logger.info('current_performance: %g', this_validation_perf)

        fout_file = os.path.join(checkpoint_dir, 'epoch_%d.h5' % ( epoch))

        lr_count += 1
        if this_validation_perf > best_validation_perf:
            best_validation_perf = this_validation_perf
            count = 0

            # save best model
            w2vv.model.save_weights(fout_file)
            # only the weights of the best epoch are kept on disk
            if best_epoch != -1:
                os.system('rm '+ os.path.join(checkpoint_dir, 'epoch_%d.h5' % (best_epoch)))
            best_epoch = epoch
        else:
            # when the validation performance has decreased after an epoch,
            # we divide the learning rate by 2 and continue training;
            # but we use each learning rate for at least 3 epochs.
            if lr_count > 2:
                w2vv.decay_lr(0.5)
                lr_count = 0
            count += 1
            if count > 10:
                print ("Early stopping happend")
                break

    sorted_epoch_loss = zip(range(len(train_loss_hist)), train_loss_hist)
    with open(train_loss_hist_file, 'w') as fout:
        for i, loss in sorted_epoch_loss:
            fout.write("epoch_" + str(i) + " " + str(loss) + "\n")

    # validation history, best epoch first
    sorted_epoch_perf = sorted(zip(range(len(val_per_hist)), val_per_hist), key = lambda x: x[1], reverse=True)
    with open(val_per_hist_file, 'w') as fout:
        for i, perf in sorted_epoch_perf:
            fout.write("epoch_" + str(i) + " " + str(perf) + "\n")

    # generate the shell script for test
    templete = ''.join(open( 'TEMPLATE_do_test.sh').readlines())
    striptStr = templete.replace('@@@rootpath@@@', rootpath)
    striptStr = striptStr.replace('@@@overwrite@@@', str(overwrite))
    striptStr = striptStr.replace('@@@trainCollection@@@', trainCollection)
    striptStr = striptStr.replace('@@@testCollection@@@', '%s %s'%(valCollection, testCollection))
    striptStr = striptStr.replace('@@@model_config@@@', w2vv_config)
    striptStr = striptStr.replace('@@@set_style@@@', set_style)
    striptStr = striptStr.replace('@@@model_path@@@', checkpoint_dir)
    striptStr = striptStr.replace('@@@model_name@@@', 'model.json')
    # NOTE(review): if no epoch ever beats the initial best of 0, no weight
    # file was saved and the name substituted here may not exist -- confirm.
    striptStr = striptStr.replace('@@@weight_name@@@', 'epoch_%d.h5' % sorted_epoch_perf[0][0])

    runfile = 'do_test_%s_%s.sh' % (w2vv_config, testCollection)
    open( runfile, 'w' ).write(striptStr+'\n')
    os.system('chmod +x %s' % runfile)
    os.system('./%s' % runfile)
def process(opt, trainCollection, valCollection, testCollection):
    """Train a multi-scale ('_ms') w2vv model and generate its test script.

    The checkpoint directory is
    ``<rootpath>/<trainCollection>/train_results/<sha1 of the model id>``,
    where the model id concatenates the relevant option values. After every
    epoch the model is validated; weights are saved only when the validation
    performance improves (the previous best weight file is removed). Otherwise
    the learning rate is halved, at most once every three epochs, and training
    stops after more than 10 consecutive epochs without improvement. The
    validation history, ``do_visual.sh``, ``do_test_<testCollection>.sh`` and
    a ``best_model.h5`` copy of the best epoch are written.

    Args:
        opt: option object; it is MUTATED here (``n_text_layers`` is parsed
            into a list of ints, and ``rnn_style``, ``vocab_size`` and
            ``embed_size`` are set) and then pickled to ``option.pkl``.
        trainCollection: collection providing training captions and features.
        valCollection: collection used for validation.
        testCollection: collection named in the generated test script.

    Calls ``sys.exit(0)`` when the text style / model name combination is not
    supported or when the validation history file is skipped by ``checkToSkip``.
    """
    rootpath = opt.rootpath
    overwrite = opt.overwrite

    # layer sizes arrive as a '-'-separated string
    opt.n_text_layers = map(int, opt.n_text_layers.strip().split('-'))

    if opt.init_model_from != '':
        init_model_name = opt.init_model_from.strip().split("/")[-1]
        train_style = opt.model_name + "_" + INFO + "_ft_" + init_model_name
    else:
        train_style = opt.model_name + "_" + INFO

    # text embedding style
    if '@' in opt.text_style and opt.model_name.endswith('_ms'):
        # text_style has the form '<rnn>@<bow>@<w2v>'
        rnn_style, bow_style, w2v_style = opt.text_style.strip().split('@')
        opt.rnn_style = rnn_style
        text_data_path = os.path.join(rootpath, trainCollection, "TextData", "vocabulary", "bow", opt.rnn_vocab)
        bow_data_path = os.path.join(rootpath, trainCollection, "TextData", "vocabulary", bow_style, opt.bow_vocab)
        w2v_data_path = os.path.join(rootpath, "word2vec", opt.corpus, opt.word2vec)
        text_name = opt.bow_vocab + "_rnn_%d_%s_sent_%d" % ( opt.rnn_size, opt.rnn_vocab, opt.sent_maxlen)
    else:
        print opt.text_style + " is not supported, please check the 'text_style' parameter"
        sys.exit(0)

    optm_style = opt.optimizer + '_clipnorm_%.1f_lr_%.5f_dp_%.2f_l2_%.5f_%s_bs_%d' % \
        (opt.clipnorm, opt.lr, opt.dropout, opt.l2_p, opt.loss_fun, opt.batch_size)
    model_style = "-".join(map( str, opt.n_text_layers)) + '_' + opt.hidden_act + '_' + opt.simi_fun
    # The parts are concatenated without a separator; only the SHA-1 digest of
    # the result is used, as the checkpoint directory name.
    model_id = "".join([ opt.checkpoint, 'w2vv', valCollection, train_style, opt.text_style + '_' + text_name, opt.img_feature, optm_style, model_style, opt.postfix ])
    checkpoint_dir = os.path.join(rootpath, trainCollection, "train_results", hashlib.sha1(model_id).hexdigest())

    # output visualization script
    runfile_vis = 'do_visual.sh'
    open(runfile_vis, 'w').write( 'port=$1\ntensorboard --logdir %s --port $port' % checkpoint_dir)
    os.system('chmod +x %s' % runfile_vis)

    val_per_hist_file = os.path.join(checkpoint_dir, 'val_per_hist.txt')
    if checkToSkip(val_per_hist_file, overwrite):
        sys.exit(0)
    # else:
    #     if os.path.exists(checkpoint_dir):
    #         os.system("rm -r " + checkpoint_dir)
    makedirsforfile(val_per_hist_file)
    model_file_name = os.path.join(checkpoint_dir, 'model.json')
    model_img_name = os.path.join(checkpoint_dir, 'model.png')
    tb_logger.configure(checkpoint_dir, flush_secs=5)

    # text embedding (text representation)
    # NOTE(review): this repeats the condition checked above, whose else-branch
    # exits, so the else-branch below looks unreachable -- confirm.
    if '@' in opt.text_style and opt.model_name.endswith('_ms'):
        text2vec = get_text_encoder(rnn_style)(text_data_path)
        bow2vec = get_text_encoder(bow_style)(bow_data_path)
        w2v2vec = get_text_encoder(w2v_style)(w2v_data_path)
        # a first layer size of 0 means "derive it from the encoders"
        if opt.n_text_layers[0] == 0:
            opt.n_text_layers[0] = bow2vec.ndims + w2v2vec.ndims
        else:
            assert opt.n_text_layers[0] == bow2vec.ndims + w2v2vec.ndims
        opt.vocab_size = text2vec.n_vocab
        opt.embed_size = w2v2vec.ndims
    else:
        text2vec = get_text_encoder(opt.text_style)(text_data_path, ndims=opt.n_text_layers[0])
        if opt.n_text_layers[0] == 0:
            opt.n_text_layers[0] = text2vec.ndims

    # img2vec
    img_feat_path = os.path.join(rootpath, trainCollection, 'FeatureData', opt.img_feature)
    img_feats = BigFile(img_feat_path)

    val_img_feat_path = os.path.join(rootpath, valCollection, 'FeatureData', opt.img_feature)
    val_img_feats = BigFile(val_img_feat_path)

    # write out options for evaluation
    pkl_file = os.path.join(checkpoint_dir, 'option.pkl')
    writePkl(opt, pkl_file)

    # define word2visualvec model
    if opt.model_name.endswith('_ms'):
        we_weights = get_we_parameter(text2vec.vocab, w2v_data_path)
        print we_weights.shape
        model = get_model(opt.model_name)(opt, we_weights=we_weights)
    else:
        model = get_model(opt.model_name)(opt)
    model.save_json_model(model_file_name)
    model.plot(model_img_name)
    model.compile_model(opt.loss_fun, opt=opt)
    if opt.init_model_from != '':
        print '*' * 20
        print 'initialize the model form ' + opt.init_model_from
        print '*' * 20
        model.init_model(opt.init_model_from)

    # training set
    caption_file = os.path.join(rootpath, trainCollection, 'TextData', '%s.caption.txt' % trainCollection)
    trainData = PairDataSet_MS(caption_file, opt.batch_size, text2vec, bow2vec, w2v2vec, img_feats, flag_maxlen=True, maxlen=opt.sent_maxlen)

    val_sent_file = os.path.join(rootpath, valCollection, 'TextData', '%s.caption.txt' % valCollection)
    val_img_list, val_sents_id, val_sents = readImgSents(val_sent_file)

    losser = get_losser(opt.simi_fun)()

    best_validation_perf = 0
    n_step = 0      # global mini-batch counter, used as the tensorboard step
    count = 0       # consecutive epochs without improvement
    lr_count = 0    # epochs since the learning rate was last decayed
    best_epoch = -1
    val_per_hist = []
    for epoch in range(opt.max_epochs):
        print '\nEpoch', epoch
        print "Training..."
        print "learning rate: ", model.get_lr()
        tb_logger.log_value('lr', model.get_lr(), step=n_step)

        train_progbar = generic_utils.Progbar(trainData.datasize)
        trainBatchIter = trainData.getBatchData()
        for minibatch_index in xrange(trainData.max_batch_size):
            n_step += 1
            img_X_batch, text_X_batch = trainBatchIter.next()
            # the text batch is the model input, the image batch the target
            loss_batch = model.model.train_on_batch(text_X_batch, img_X_batch)
            train_progbar.add(img_X_batch.shape[0], values=[("loss", loss_batch)])

            tb_logger.log_value('loss', loss_batch, step=n_step)
            tb_logger.log_value('n_step', n_step, step=n_step)

        print "\nValidating..."
        all_errors = pred_mutual_error_ms(val_img_list, val_sents, model, text2vec, bow2vec, w2v2vec, val_img_feats, losser, opt=opt)

        this_validation_perf = cal_val_perf(all_errors, opt=opt)
        tb_logger.log_value('val_accuracy', this_validation_perf, step=n_step)

        val_per_hist.append(this_validation_perf)

        print 'previous_best_performance: %.3f' % best_validation_perf
        print 'current_performance: %.3f' % this_validation_perf

        fout_file = os.path.join(checkpoint_dir, 'epoch_%d.h5' % (epoch))

        lr_count += 1
        if this_validation_perf > best_validation_perf:
            best_validation_perf = this_validation_perf
            count = 0

            # save model
            model.model.save_weights(fout_file)
            # only the weights of the best epoch are kept on disk
            if best_epoch != -1:
                os.system('rm ' + os.path.join(checkpoint_dir, 'epoch_%d.h5' % (best_epoch)))
            best_epoch = epoch
        else:
            # when the validation performance has decreased after an epoch,
            # we divide the learning rate by 2 and continue training;
            # but we use each learning rate for at least 3 epochs.
            if lr_count > 2:
                model.decay_lr(0.5)
                lr_count = 0
            count += 1
            if count > 10:
                print("Early stopping happened")
                break

    # validation history, best epoch first
    sorted_epoch_perf = sorted(zip(range(len(val_per_hist)), val_per_hist), key=lambda x: x[1], reverse=True)
    with open(val_per_hist_file, 'w') as fout:
        for i, perf in sorted_epoch_perf:
            fout.write("epoch_" + str(i) + " " + str(perf) + "\n")

    # generate the shell script for test
    templete = ''.join(open('TEMPLATE_do_test.sh').readlines())
    striptStr = templete.replace('@@@rootpath@@@', rootpath)
    striptStr = striptStr.replace('@@@trainCollection@@@', trainCollection)
    striptStr = striptStr.replace('@@@valCollection@@@', valCollection)
    striptStr = striptStr.replace('@@@testCollection@@@', testCollection)
    striptStr = striptStr.replace('@@@model_path@@@', checkpoint_dir)
    striptStr = striptStr.replace('@@@weight_name@@@', 'epoch_%d.h5' % sorted_epoch_perf[0][0])
    striptStr = striptStr.replace('@@@n_caption@@@', str(opt.n_caption))

    print os.path.join(checkpoint_dir, 'epoch_%d.h5' % sorted_epoch_perf[0][0])
    runfile = 'do_test_%s.sh' % (testCollection)
    open(runfile, 'w').write(striptStr + '\n')
    os.system('chmod +x %s' % runfile)
    # os.system('./'+runfile)
    # keep a stable name for the best weights
    os.system('cp %s/epoch_%d.h5 %s/best_model.h5' % (checkpoint_dir, sorted_epoch_perf[0][0], checkpoint_dir))
# Script body: prepare the output locations for merging the features of two
# collections into "<coll1>+<coll2>" and check that both inputs have the same
# feature dimension.
# NOTE(review): `datapath` is not defined in the statements visible here;
# presumably it is set earlier in the file -- confirm.
if len(sys.argv) < 4:
    print "Usage: merge_datasets.py trainCollection testCollection feature"
    sys.exit(1)

coll1 = sys.argv[1]
coll2 = sys.argv[2]
feature = sys.argv[3]

# binary feature files: the two inputs and the merged output
coll1_features_file = "%s/%s/FeatureData/%s/feature.bin" % (datapath, coll1, feature)
coll2_features_file = "%s/%s/FeatureData/%s/feature.bin" % (datapath, coll2, feature)
new_features_file = "%s/%s+%s/FeatureData/%s/feature.bin" % (datapath, coll1, coll2, feature)
makedirsforfile(new_features_file)

# shape files: each holds two space-separated fields, unpacked below as
# (image count, feature dimension)
coll1_shape_file = "%s/%s/FeatureData/%s/shape.txt" % (datapath, coll1, feature)
coll2_shape_file = "%s/%s/FeatureData/%s/shape.txt" % (datapath, coll2, feature)
new_shape_file = "%s/%s+%s/FeatureData/%s/shape.txt" % (datapath, coll1, coll2, feature)
makedirsforfile(new_shape_file)

# shape file
# NOTE(review): nothing is written to `fout` in the statements visible here,
# so the new shape file is created empty; the write of the merged shape is
# presumably missing or located elsewhere -- confirm.
with open(new_shape_file, 'w') as fout:
    imA, featA = open(coll1_shape_file).read().strip().split(" ")
    imB, featB = open(coll2_shape_file).read().strip().split(" ")
    # both collections must use the same feature dimension (compared as strings)
    assert featA == featB
def process(options, collection, annotationName, pos_num):
    """Create randomly sampled positive/negative annotation bags.

    For every (positive bag, negative bag) index pair a new annotation name is
    derived from ``annotationName``; the concept list is written for each new
    name, the list of new names is stored under ``annotationfiles``, and for
    every concept a sampled set of positive and negative examples is written.

    Returns 0 when every new concept file was skipped by ``checkToSkip``;
    otherwise returns None after writing the per-concept annotations.
    """
    assert (annotationName.endswith('.txt'))

    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    # Template with two remaining '%d' slots: positive-bag and negative-bag index.
    name_template = ''.join([
        annotationName[:-4],
        '.random%d' % pos_num,
        '.%d',
        '.npr%d' % neg_pos_ratio,
        '.%d.txt',
    ])

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    annotation_dir = os.path.join(rootpath, collection, 'Annotations')
    nr_skipped = 0
    newAnnotationNames = []
    # Appending in (idxp, idxn) order places each name at idxp * neg_bag_num + idxn.
    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            new_name = name_template % (idxp, idxn)
            newAnnotationNames.append(new_name)
            concept_file = os.path.join(annotation_dir, new_name)
            if checkToSkip(concept_file, options.overwrite):
                nr_skipped += 1
            else:
                writeConcepts(concepts, concept_file)

    # The list file is named after the template with each index slot replaced
    # by the covered index range, e.g. "0-4".
    first, second, last = name_template.split('%d')
    list_name = '%s0-%d%s0-%d%s' % (first, pos_bag_num - 1,
                                     second, neg_bag_num - 1, last)
    scriptfile = os.path.join(rootpath, collection, 'annotationfiles', list_name)
    makedirsforfile(scriptfile)
    with open(scriptfile, 'w') as fout:
        fout.write('\n'.join(newAnnotationNames) + '\n')

    if nr_skipped == len(newAnnotationNames):
        return 0

    for concept in concepts:
        names, labels = readAnnotationsFrom(collection, annotationName, concept,
                                            skip_0=True, rootpath=rootpath)
        positivePool = [name for name, label in zip(names, labels) if label > 0]
        negativePool = [name for name, label in zip(names, labels) if label < 0]

        for idxp in range(pos_bag_num):
            # Sample positives only when the pool is larger than requested.
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool

            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                resultfile = os.path.join(annotation_dir, 'Image',
                                          newAnnotationNames[anno_idx],
                                          '%s.txt' % concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue

                # At least 1000 negatives are requested, capped by the pool size.
                wanted = max(len(positiveBag) * neg_pos_ratio, 1000)
                negativeBag = random.sample(negativePool,
                                            min(len(negativePool), wanted))
                assert (len(set(positiveBag).intersection(set(negativeBag))) == 0)

                nr_pos = len(positiveBag)
                nr_neg = len(negativeBag)
                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" %
                            (concept, anno_idx, nr_pos, nr_neg, resultfile))
                writeAnnotations(positiveBag + negativeBag,
                                 [1] * nr_pos + [-1] * nr_neg,
                                 resultfile)
def process(options, trainCollection, annotationfile, feature, modelName):
    """Train one averaged SVM model per concept and save it to disk.

    ``annotationfile`` lists annotation names, one per line; blank lines and
    lines starting with '#' are ignored. For every concept a model is trained
    on each listed annotation and the models are combined as a running
    average. Concepts are distributed over ``options.numjobs`` jobs; this
    call handles those belonging to ``options.job`` (1-based).

    Parameters:
        options: needs ``rootpath``, ``overwrite``, ``numjobs``, ``job`` and,
            for the 'fik' model, ``nr_bins``.
        trainCollection: name of the training collection.
        annotationfile: path of the file listing the annotation names.
        feature: name of the feature under ``FeatureData``.
        modelName: 'fik' or 'fastlinear'.

    Returns:
        0 when the annotation list is empty, a listed concept file is
        missing, or there is nothing to train; None otherwise.

    Raises:
        ZeroDivisionError: if an annotation has no positive or no negative
            example for a concept (the class weights divide by these counts).
    """
    assert(modelName in ['fik', 'fastlinear'])

    rootpath = options.rootpath
    autoweight = 1  # options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {'rootpath': rootpath, 'model': modelName}

    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData',
                                   feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            # Real lists (not lazy map objects) so the values can be read
            # more than once on any Python version.
            params['min_vals'] = [float(x) for x in f.readline().split()]
            params['max_vals'] = [float(x) for x in f.readline().split()]
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    newAnnotationName = os.path.split(annotationfile)[-1]
    with open(annotationfile) as fin:
        stripped = [line.strip() for line in fin]
    trainAnnotationNames = [x for x in stripped if x and not x.startswith('#')]
    if not trainAnnotationNames:
        # Previously this fell through to an IndexError on the first name.
        print('%s lists no annotation names' % annotationfile)
        return 0

    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations',
                                   annotationName)
        if not os.path.exists(conceptfile):
            print('%s does not exist' % conceptfile)
            return 0

    # The concept list is taken from the first annotation.
    concepts = readConcepts(trainCollection, trainAnnotationNames[0],
                            rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models',
                             newAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # Round-robin split of the remaining concepts over the jobs.
    todo = [concept for i, concept in enumerate(todo) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection,
                                           'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t, annotationName in enumerate(trainAnnotationNames, 1):
            names, labels = readAnnotationsFrom(trainCollection, annotationName,
                                                concept, skip_0=True,
                                                rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]

            # Named n_pos/n_neg so the conventional numpy alias ``np`` is not
            # shadowed inside this function.
            # NOTE(review): the counts come from ``labels`` while training uses
            # ``Ys``; they differ if the feature file lacks some names -- confirm.
            n_pos = len([1 for lab in labels if 1 == lab])
            n_neg = len([1 for lab in labels if -1 == lab])
            wp = float(beta) * (n_pos + n_neg) / n_pos
            wn = (1.0 - beta) * (n_pos + n_neg) / n_neg

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                # Running average: the old ensemble is weighted (t-1)/t and
                # the new model 1/t.
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
int, open(shapefile).readline().strip().split()) nr_of_images_list.append(nr_of_images) feat_dim_list.append(feat_dim) feat_files.append( BigFile(os.path.join(rootpath, collection, 'FeatureData', feature))) #assert(nr_of_images_list[0] == nr_of_images_list[1]) new_feat_dim = sum(feat_dim_list) imset = readImageSet(collection, collection, rootpath) nr_of_images = len(imset) blocksize = 1000 makedirsforfile(binary_file) fw = open(binary_file, 'wb') new_imset = [] start = 0 while start < nr_of_images: end = min(nr_of_images, start + blocksize) printStatus(INFO, 'processing images from %d to %d' % (start, end - 1)) renamed_0, vecs_0 = feat_files[0].read(imset[start:end]) renamed_1, vecs_1 = feat_files[1].read(imset[start:end]) sorted_idx_0 = np.argsort(renamed_0) sorted_idx_1 = np.argsort(renamed_1) for x, y in zip(sorted_idx_0, sorted_idx_1):
def process(options, feature, srcCollections, newCollection):
    """Merge the feature files of several collections into ``newCollection``.

    Writes ``feature.bin``, ``id.txt`` and ``shape.txt`` under
    ``<rootpath>/<newCollection>/FeatureData/<feature>``. An image id is taken
    from the first source collection in which it occurs. If the image set
    file of ``newCollection`` can be read, only the ids it lists are kept;
    if it cannot be read, all ids are merged.

    Parameters:
        options: needs ``rootpath`` and ``overwrite``.
        feature: feature name under ``FeatureData``.
        srcCollections: list of source collection names; duplicates are
            ignored, keeping first occurrences.
        newCollection: name of the collection receiving the merged features.

    Returns:
        0 if the result already exists and is skipped, otherwise None.

    Raises:
        ValueError: if no source collection is given or the source
            collections disagree on the feature dimension.
    """
    assert isinstance(srcCollections, list)

    # unique source collections, order preserved
    unique_collections = []
    for name in srcCollections:
        if name not in unique_collections:
            unique_collections.append(name)
    srcCollections = unique_collections

    rootpath = options.rootpath
    resfile = os.path.join(rootpath, newCollection, 'FeatureData', feature,
                           'feature.bin')
    if checkToSkip(resfile, options.overwrite):
        return 0

    querysetfile = os.path.join(rootpath, newCollection, 'ImageSets',
                                '%s.txt' % newCollection)
    try:
        with open(querysetfile) as fr:
            query_set = set(line.strip() for line in fr)
        printStatus(INFO, '%d images wanted' % len(query_set))
    except IOError:
        printStatus(INFO, 'failed to load %s, will merge all features in %s' %
                    (querysetfile, ';'.join(srcCollections)))
        query_set = None

    # Validate the feature dimensions before the output file is created, so
    # that vectors of different length are never mixed in one feature.bin.
    feat_dirs = [os.path.join(rootpath, collection, 'FeatureData', feature)
                 for collection in srcCollections]
    feat_dim = None
    for feat_dir in feat_dirs:
        with open(os.path.join(feat_dir, 'shape.txt')) as fr:
            _, dim = [int(x) for x in fr.readline().strip().split()]
        if feat_dim is None:
            feat_dim = dim
        elif dim != feat_dim:
            raise ValueError('feature dimension mismatch in %s: %d != %d' %
                             (feat_dir, dim, feat_dim))
    if feat_dim is None:
        raise ValueError('no source collections to merge')

    makedirsforfile(resfile)
    printStatus(INFO, 'writing results to %s' % resfile)

    seen = set()
    newimset = []
    with open(resfile, 'wb') as fw:
        for feat_dir in feat_dirs:
            with open(os.path.join(feat_dir, 'id.txt')) as fr:
                srcimset = fr.readline().strip().split()

            res = array.array('f')
            with open(os.path.join(feat_dir, 'feature.bin'), 'rb') as fr:
                for i, im in enumerate(srcimset):
                    # Vectors are stored in id.txt order, feat_dim floats
                    # each, so every vector is read even when not kept.
                    res.fromfile(fr, feat_dim)
                    if im not in seen:
                        seen.add(im)
                        # ``is None`` (not falsiness): an image set file
                        # that exists but is empty selects nothing.
                        if query_set is None or im in query_set:
                            np.array(res, dtype=np.float32).tofile(fw)
                            newimset.append(im)
                    del res[:]
                    if i % 100000 == 0:
                        printStatus(INFO, '%d parsed, %d obtained' %
                                    (len(seen), len(newimset)))
            printStatus(INFO, '%d parsed, %d obtained' %
                        (len(seen), len(newimset)))
    printStatus(INFO, '%d parsed, %d obtained' % (len(seen), len(newimset)))

    result_dir = os.path.split(resfile)[0]
    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(newimset))
    with open(os.path.join(result_dir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(newimset), feat_dim))
def process(options, workingCollection, feature):
    """Build a Laplacian matrix over a collection from its nearest neighbours.

    For every image the nearest neighbours inside the working set are looked
    up; their distances become symmetric entries, are turned into similarities
    with ``exp(-d / sigma)`` (``sigma`` being the squared median distance),
    and the matrix with negated similarities off the diagonal and the column
    sums on the diagonal is saved to ``laplacianI.mat`` together with sigma.

    Returns 0 if the result file already exists and is skipped. The process
    exits with status 1 when the neighbours of an image cannot be obtained or
    fewer than the required number lie inside the working set.
    """
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite

    nnName = distance + "knn"
    setting = '%s,%s,%f' % (feature, nnName, k_ratio)
    resultfile = os.path.join(rootpath, workingCollection, 'LaplacianI',
                              workingCollection, setting, 'laplacianI.mat')
    if checkToSkip(resultfile, overwrite):
        return 0

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()
    tot_images = len(workingSet)
    printStatus(INFO, '%d images' % (tot_images))

    K_neighs = int(math.floor(len(workingSet) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' %
                (K_neighs, k_ratio))

    printStatus(INFO, 'Allocating I,J,V arrays')
    # Every (image, neighbour) pair contributes at most two entries.
    capacity = K_neighs * tot_images * 2
    I = np.zeros((capacity))
    J = np.zeros((capacity))
    V = np.zeros((capacity))
    n_entries = 0

    # distances
    printStatus(INFO, 'Starting to fill I,J,V arrays')
    for i in xrange(tot_images):
        image_id = workingSet[i]
        try:
            # Twice as many neighbours are requested as needed, since some
            # may fall outside the working set.
            candidates = _get_neighbors('%s,%s' % (workingCollection, image_id),
                                        rootpath, K_neighs * 2, feature,
                                        distance)
            # remove images with features but not in the working set
            row_idx = []
            row_dist = []
            kept = []
            for cand in candidates:
                try:
                    row_idx.append(bisect_index(workingSet, cand[0]))
                    row_dist.append(cand[1])
                    kept.append(cand)
                except ValueError:
                    pass
            NNrow = np.array(row_idx)
            NNDrow = np.array(row_dist)
            neighbors = kept[0:K_neighs]
        except ValueError:
            printStatus(INFO, 'ERROR: id_img %s has non-standard format!' %
                        (image_id))
            sys.exit(1)

        if len(neighbors) < K_neighs:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' %
                        (image_id, len(neighbors), K_neighs))
            sys.exit(1)

        if (i + 1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i + 1, tot_images))

        for k in xrange(K_neighs):
            j = int(NNrow[k])
            if i == j:
                # zero on the diagonal for a later step
                continue
            dist = NNDrow[k]
            # Store the pair in both directions.
            I[n_entries] = i
            J[n_entries] = j
            V[n_entries] = dist
            I[n_entries + 1] = j
            J[n_entries + 1] = i
            V[n_entries + 1] = dist
            n_entries += 2

    I, J, V = I[0:n_entries], J[0:n_entries], V[0:n_entries]

    printStatus(INFO, 'Removing duplicates')
    order = np.lexsort((V, J, I))
    I, J, V = I[order], J[order], V[order]

    # View each (row, column) pair as one opaque item so np.unique can find
    # the first occurrence of every pair.
    pairs = np.vstack((I, J)).T
    packed = np.ascontiguousarray(pairs).view(
        np.dtype((np.void, pairs.dtype.itemsize * pairs.shape[1])))
    del pairs
    _, idx = np.unique(packed, return_index=True)
    del packed
    I, J, V = I[idx], J[idx], V[idx]

    printStatus(INFO, 'Computing the final laplacian matrix')
    sigma = np.median(V)**2.
    printStatus(INFO, 'Estimated sigma^2 = %f' % sigma)
    V = np.exp(-V / sigma)

    similarity = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tocsr()
    new_diag = similarity.sum(axis=0).T

    # Negated similarities off the diagonal, column sums on the diagonal.
    diag_idx = np.arange(tot_images, dtype=np.float64)
    diag_val = np.asarray(new_diag, dtype=np.float64).ravel()
    I = np.append(I, diag_idx)
    J = np.append(J, diag_idx)
    V = np.append(-V, diag_val)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tolil()

    printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'im_similarity': matrix, 'sigma': sigma})
codepath = "/home/urix/shared/tagrelcodebase"
datapath = ROOT_PATH

if len(sys.argv) < 4:
    print("Usage: merge_datasets.py trainCollection testCollection feature")
    sys.exit(1)

coll1 = sys.argv[1]
coll2 = sys.argv[2]
feature = sys.argv[3]

coll1_features_file = "%s/%s/FeatureData/%s/feature.bin" % (datapath, coll1, feature)
coll2_features_file = "%s/%s/FeatureData/%s/feature.bin" % (datapath, coll2, feature)
new_features_file = "%s/%s+%s/FeatureData/%s/feature.bin" % (datapath, coll1, coll2, feature)
makedirsforfile(new_features_file)

coll1_shape_file = "%s/%s/FeatureData/%s/shape.txt" % (datapath, coll1, feature)
coll2_shape_file = "%s/%s/FeatureData/%s/shape.txt" % (datapath, coll2, feature)
new_shape_file = "%s/%s+%s/FeatureData/%s/shape.txt" % (datapath, coll1, coll2, feature)
makedirsforfile(new_shape_file)

# shape file
# Each shape file holds "<number of images> <feature dimension>". Both inputs
# are read and checked before the output is opened, so a mismatch no longer
# leaves a truncated shape file behind.
with open(coll1_shape_file) as fin:
    imA, featA = fin.read().strip().split()
with open(coll2_shape_file) as fin:
    imB, featB = fin.read().strip().split()
if int(featA) != int(featB):
    sys.exit("feature dimensions differ: %s (%s) vs %s (%s)" %
             (featA, coll1, featB, coll2))
with open(new_shape_file, 'w') as fout:
    # The merged collection holds the images of both inputs.
    fout.write('%d %d' % (int(imA) + int(imB), int(featA)))

# copy and concatenate features
def process(options, testCollection, trainCollection, annotationName, feature, outputpkl):
    """Run TagProp learning (if needed) and prediction through Matlab.

    A model is learned on ``trainCollection`` unless a model file already
    exists and ``options.trainmodel`` is false. Predictions for
    ``testCollection`` are saved by Matlab to ``prediction.mat``; the score
    columns for the concepts of ``annotationName`` are then pickled to
    ``outputpkl`` as a dict with keys 'concepts', 'id_images' and 'scores'.

    Parameters:
        options: needs ``rootpath``, ``k``, ``distance``, ``variant``,
            ``overwrite`` and ``trainmodel``.
        testCollection, trainCollection: collection names.
        annotationName: annotation whose concepts are exported.
        feature: feature name used in the neighbour file paths.
        outputpkl: path of the pickle file to write.

    Returns:
        0 if either result file already exists and is skipped, else None.
        Exits with status 1 when a required input file is missing.
    """
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    variant = options.variant
    overwrite = options.overwrite
    testset = testCollection
    forcetrainmodel = options.trainmodel

    modelName = "tagprop"
    nnName = distance + "knn"
    uses_distances = variant in ('dist', 'distsigmoids')

    printStatus(INFO, "Starting TagProp %s,%s,%s,%s,%s" %
                (variant, trainCollection, testCollection, annotationName, feature))

    resultfile = os.path.join(outputpkl)
    resultfile_tagprop = os.path.join(
        rootpath, testCollection, 'TagProp-Prediction', testset,
        trainCollection, annotationName, modelName,
        '%s,%s,%s,%d' % (feature, nnName, variant, k), 'prediction.mat')

    if checkToSkip(resultfile, overwrite) or checkToSkip(resultfile_tagprop, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, trainCollection, 'TextData',
                                  'lemm_wordnet_freq_tags.h5')
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, "Tag matrix file not found at %s Did you run wordnet_frequency_tags.py?" %
                    (tagmatrix_file))
        sys.exit(1)

    train_neighs_file = os.path.join(rootpath, trainCollection, 'TagProp-data',
                                     trainCollection,
                                     '%s,%s,%d' % (feature, nnName, k),
                                     'nn_train.h5')
    if not os.path.exists(train_neighs_file):
        printStatus(INFO, "Matlab train neighbors file not found at %s Did you run prepare_tagprop_data.py?" %
                    (train_neighs_file))
        sys.exit(1)

    # do we need to perform learning?
    train_model_file = os.path.join(rootpath, trainCollection, 'TagProp-models',
                                    '%s,%s,%s,%d' % (feature, nnName, variant, k),
                                    'model.mat')
    if os.path.exists(train_model_file) and not forcetrainmodel:
        printStatus(INFO, "model for %s available at %s" %
                    (trainCollection, train_model_file))
    else:
        printStatus(INFO, "starting learning model for %s" % (trainCollection))
        makedirsforfile(train_model_file)

        script = (" tagprop_path = 'model_based/tagprop/TagProp/';"
                  " addpath(tagprop_path);"
                  " tagmatrix = h5read('%s', '/tagmatrix') > 0.5;"
                  " tagmatrix = sparse(tagmatrix);"
                  " NN = h5read('%s', '/NN');"
                  " NN = NN(2:end, :);"
                  " NN = double(NN); ") % (tagmatrix_file, train_neighs_file)

        if uses_distances:
            script += (" NND = h5read('%s', '/NND');"
                       " NND = NND(2:end, :);"
                       " NND = reshape(NND, 1, size(NND,1), size(NND,2));"
                       " NND = double(NND); ") % train_neighs_file

        if variant == 'rank':
            script += " m = tagprop_learn(NN,[],tagmatrix); "
        elif variant == 'ranksigmoids':
            script += " m = tagprop_learn(NN,[],tagmatrix,'sigmoids',true); "
        elif variant == 'dist':
            script += " m = tagprop_learn(NN,NND,tagmatrix,'type','dist'); "
        elif variant == 'distsigmoids':
            script += " m = tagprop_learn(NN,NND,tagmatrix,'type','dist','sigmoids',true); "

        script += " save('%s', 'm', '-v7.3'); exit; " % train_model_file

        call_matlab(script)

    # we perform prediction
    printStatus(INFO, "starting prediction")
    test_neighs_file = os.path.join(rootpath, testCollection, 'TagProp-data',
                                    testset, trainCollection, annotationName,
                                    '%s,%s,%d' % (feature, nnName, k),
                                    'nn_test.h5')
    if not os.path.exists(test_neighs_file):
        printStatus(INFO, "Matlab test neighbors file not found at %s Did you run prepare_tagprop_data.py?" %
                    (test_neighs_file))
        sys.exit(1)

    script = (" tagprop_path = 'model_based/tagprop/TagProp/';"
              " addpath(tagprop_path);"
              " load('%s');"
              " tagmatrix = h5read('%s', '/tagmatrix') > 0.5;"
              " tagmatrix = sparse(tagmatrix);"
              " NNT = h5read('%s', '/NNT');"
              " NNT = double(NNT); ") % (train_model_file, tagmatrix_file, test_neighs_file)

    if uses_distances:
        script += (" NNDT = h5read('%s', '/NNDT');"
                   " NNDT = reshape(NNDT, 1, size(NNDT,1), size(NNDT,2));"
                   " NNDT = double(NNDT); ") % test_neighs_file

    # NOTE(review): NNDT is loaded for the distance variants but the call
    # below always passes [] as the second argument -- confirm whether
    # tagprop_predict should receive NNDT for 'dist'/'distsigmoids'.
    script += " P = tagprop_predict(NNT,[],m)'; save('%s', '-v7.3'); exit; " % resultfile_tagprop

    makedirsforfile(resultfile_tagprop)
    call_matlab(script)

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(testCollection, annotationName, rootpath)
    id_images = readImageSet(testCollection, testset, rootpath)
    id_images.sort()
    # id_images = map(int, id_images)

    # concepts mapping: pick the prediction columns of the requested concepts.
    # Both HDF5 files are closed once their data has been copied into memory.
    with h5py.File(tagmatrix_file, 'r') as tagprop_input:
        vocab = list(tagprop_input['vocab'][:])
    mapping = getVocabMap(vocab, concepts)
    with h5py.File(resultfile_tagprop, 'r') as tagprop_output:
        final_tagmatrix = tagprop_output['P'][:][:, mapping]

    # HIGHEST_PROTOCOL is a binary format, so the file is opened in binary
    # mode; text mode breaks on Python 3 and on platforms that translate
    # line endings.
    with open(resultfile, 'wb') as f:
        pickle.dump({
            'concepts': concepts,
            'id_images': id_images,
            'scores': final_tagmatrix
        }, f, pickle.HIGHEST_PROTOCOL)
def process(options, feature, srcCollections, newCollection):
    """Merge the feature files of several collections into ``newCollection``.

    Writes ``feature.bin``, ``id.txt`` and ``shape.txt`` under
    ``<rootpath>/<newCollection>/FeatureData/<feature>``. An image id is taken
    from the first source collection in which it occurs. If the image set
    file of ``newCollection`` can be read, only the ids it lists are kept;
    if it cannot be read, all ids are merged.

    Parameters:
        options: needs ``rootpath`` and ``overwrite``.
        feature: feature name under ``FeatureData``.
        srcCollections: list of source collection names; duplicates are
            ignored, keeping first occurrences.
        newCollection: name of the collection receiving the merged features.

    Returns:
        0 if the result already exists and is skipped, otherwise None.

    Raises:
        ValueError: if no source collection is given or the source
            collections disagree on the feature dimension.
    """
    assert isinstance(srcCollections, list)

    # unique source collections, order preserved
    unique_collections = []
    for name in srcCollections:
        if name not in unique_collections:
            unique_collections.append(name)
    srcCollections = unique_collections

    rootpath = options.rootpath
    resfile = os.path.join(rootpath, newCollection, 'FeatureData', feature,
                           'feature.bin')
    if checkToSkip(resfile, options.overwrite):
        return 0

    querysetfile = os.path.join(rootpath, newCollection, 'ImageSets',
                                '%s.txt' % newCollection)
    try:
        with open(querysetfile) as fr:
            query_set = set(line.strip() for line in fr)
        printStatus(INFO, '%d images wanted' % len(query_set))
    except IOError:
        printStatus(INFO, 'failed to load %s, will merge all features in %s' %
                    (querysetfile, ';'.join(srcCollections)))
        query_set = None

    # Validate the feature dimensions before the output file is created, so
    # that vectors of different length are never mixed in one feature.bin.
    feat_dirs = [os.path.join(rootpath, collection, 'FeatureData', feature)
                 for collection in srcCollections]
    feat_dim = None
    for feat_dir in feat_dirs:
        with open(os.path.join(feat_dir, 'shape.txt')) as fr:
            _, dim = [int(x) for x in fr.readline().strip().split()]
        if feat_dim is None:
            feat_dim = dim
        elif dim != feat_dim:
            raise ValueError('feature dimension mismatch in %s: %d != %d' %
                             (feat_dir, dim, feat_dim))
    if feat_dim is None:
        raise ValueError('no source collections to merge')

    makedirsforfile(resfile)
    printStatus(INFO, 'writing results to %s' % resfile)

    seen = set()
    newimset = []
    with open(resfile, 'wb') as fw:
        for feat_dir in feat_dirs:
            with open(os.path.join(feat_dir, 'id.txt')) as fr:
                srcimset = fr.readline().strip().split()

            res = array.array('f')
            with open(os.path.join(feat_dir, 'feature.bin'), 'rb') as fr:
                for i, im in enumerate(srcimset):
                    # Vectors are stored in id.txt order, feat_dim floats
                    # each, so every vector is read even when not kept.
                    res.fromfile(fr, feat_dim)
                    if im not in seen:
                        seen.add(im)
                        # ``is None`` (not falsiness): an image set file
                        # that exists but is empty selects nothing.
                        if query_set is None or im in query_set:
                            np.array(res, dtype=np.float32).tofile(fw)
                            newimset.append(im)
                    del res[:]
                    if i % 100000 == 0:
                        printStatus(INFO, '%d parsed, %d obtained' %
                                    (len(seen), len(newimset)))
            printStatus(INFO, '%d parsed, %d obtained' %
                        (len(seen), len(newimset)))
    printStatus(INFO, '%d parsed, %d obtained' % (len(seen), len(newimset)))

    result_dir = os.path.split(resfile)[0]
    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(newimset))
    with open(os.path.join(result_dir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(newimset), feat_dim))
def _write_avs_run(run_file, query_numbers, errorlist, video_ids, max_rank=1000):
    """Write, for each query number, its lowest-error videos to ``run_file``.

    Column ``num - 1`` of ``errorlist`` holds the errors of query ``num``.
    One line is written per ranked video:
    ``15<num> 0 <video id> <rank> <max_rank - rank> ITI-CERTH``.
    """
    with open(run_file, 'w') as fout:
        for num in query_numbers:
            ranking = np.argsort(errorlist[:, num - 1])
            for rank, ind in enumerate(ranking[:max_rank], start=1):
                fout.write('15%02d 0 %s %d %d ITI-CERTH\n' %
                           (num, video_ids[ind], rank, max_rank - rank))


def _evaluate_avs_run(run_file, qrels_file):
    """Run AVS/sample_eval.pl on ``run_file``; output goes to '<run>_results.txt'."""
    import subprocess

    result_file = run_file[:-4] + '_results.txt'
    with open(result_file, 'w') as fout:
        try:
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact.
            subprocess.call(['perl', 'AVS/sample_eval.pl', '-q', qrels_file,
                             run_file], stdout=fout)
        except OSError as err:
            # Evaluation stays best-effort: the run file is already written.
            logging.warning('could not run sample_eval.pl: %s', err)
    return result_file


def main():
    """Embed the AVS queries and all test videos, rank videos per query and
    write/evaluate one run file per AVS year (16, 17, 18).

    Exits with status 0 when the checkpoint is missing, when the AVS17 run
    file already exists, or when ``checkToSkip`` reports the prediction
    matrix file as already present.
    """
    opt = parse_args()
    print(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    evalpath = opt.evalpath
    testCollection = opt.testCollection
    batchsize = opt.batch_size
    # n_caption = opt.n_caption

    resume = os.path.join(opt.logger_name, opt.checkpoint_name)
    if not os.path.exists(resume):
        logging.info(resume + ' not exists.')
        sys.exit(0)

    saveFile_AVS16 = (opt.logger_name + '/AVS16_' + testCollection +
                      '_Dense_Dual_model_bin.txt')
    saveFile_AVS17 = (opt.logger_name + '/AVS17_' + testCollection +
                      '_Dense_Dual_model_bin.txt')
    saveFile_AVS18 = (opt.logger_name + '/AVS18_' + testCollection +
                      '_Dense_Dual_model_bin.txt')
    if os.path.exists(saveFile_AVS17):
        sys.exit(0)

    queriesFile = 'AVS/tv16_17_18.avs.topics_parsed.txt'
    with open(queriesFile) as fin:
        lineList = [line.rstrip('\n') for line in fin]

    checkpoint = torch.load(resume)
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
        resume, start_epoch, best_rsum))
    options = checkpoint['opt']
    # Options missing from the stored checkpoint get these values.
    if not hasattr(options, 'do_visual_feas_norm'):
        setattr(options, "do_visual_feas_norm", 0)
    if not hasattr(options, 'concate'):
        setattr(options, "concate", "full")

    trainCollection = options.trainCollection
    output_dir = resume.replace(trainCollection, testCollection)
    output_dir = output_dir.replace('/%s/' % options.cv_name,
                                    '/results/%s/' % trainCollection)
    pred_error_matrix_file = os.path.join(output_dir,
                                          'pred_errors_matrix.pth.tar')
    if checkToSkip(pred_error_matrix_file, opt.overwrite):
        sys.exit(0)
    makedirsforfile(pred_error_matrix_file)

    # data loader prepare
    caption_files = {
        'test': os.path.join(evalpath, testCollection, 'TextData',
                             '%s.caption.txt' % testCollection)
    }
    img_feat_path = os.path.join(evalpath, testCollection, 'FeatureData',
                                 options.visual_feature)
    visual_feats = {'test': BigFile(img_feat_path)}
    assert options.visual_feat_dim == visual_feats['test'].ndims
    video2frames = {
        'test': read_dict(
            os.path.join(evalpath, testCollection, 'FeatureData',
                         options.visual_feature, 'video2frames.txt'))
    }
    # video2frames = None

    # set bow vocabulary and encoding
    bow_vocab_file = os.path.join(rootpath, options.trainCollection, 'TextData',
                                  'vocabulary', 'bow', options.vocab + '.pkl')
    with open(bow_vocab_file, 'rb') as fin:
        bow_vocab = pickle.load(fin)
    bow2vec = get_text_encoder('bow')(bow_vocab)
    options.bow_vocab_size = len(bow_vocab)

    # set rnn vocabulary
    rnn_vocab_file = os.path.join(rootpath, options.trainCollection, 'TextData',
                                  'vocabulary', 'rnn', options.vocab + '.pkl')
    with open(rnn_vocab_file, 'rb') as fin:
        rnn_vocab = pickle.load(fin)
    options.vocab_size = len(rnn_vocab)

    # initialize word embedding
    options.we_parameter = None
    if options.word_dim == 500:
        w2v_data_path = os.path.join(rootpath, "word2vec", 'flickr',
                                     'vec500flickr30m')
        options.we_parameter = get_we_parameter(rnn_vocab, w2v_data_path)

    # Construct the model
    model = get_model(options.model)(options)
    model.load_state_dict(checkpoint['model'])
    model.Eiters = checkpoint['Eiters']
    # switch to evaluate mode
    model.val_start()

    video2frames = video2frames['test']
    videoIDs = list(video2frames.keys())

    # Queries embeddings
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    queryEmbeddingsTMP = []
    for quer in lineList:
        videBatch = videoIDs[0]  # a dummy video
        data = dataLoadedVideoText_one(video2frames, videBatch,
                                       visual_feats['test'], quer, bow2vec,
                                       rnn_vocab, tokenizer, options)
        videos, captions = collate_frame_gru_fn(data)
        # compute the embeddings
        vid_emb, cap_emb = model.forward_emb(videos, captions, True)
        # preserve the embeddings by copying from gpu and converting to numpy
        cap_embs = cap_emb.data.cpu().numpy().copy()
        queryEmbeddingsTMP.append(cap_embs[0])
    queryEmbeddings = np.stack(queryEmbeddingsTMP)

    start = time.time()
    VideoIDS = []
    errorlistList = []
    for i in range(0, len(videoIDs), batchsize):
        videBatch = videoIDs[i:i + batchsize]
        VideoIDS.extend(videBatch)
        data = []
        for bb in videBatch:
            # Only the video embedding of this pass is used below; the first
            # query is passed as the caption argument.
            data.extend(
                dataLoadedVideoText_one(video2frames, bb, visual_feats['test'],
                                        lineList[0], bow2vec, rnn_vocab,
                                        tokenizer, options))
        videos, captions = collate_frame_gru_fn(data)
        # compute the embeddings
        vid_emb, cap_emb = model.forward_emb(videos, captions, True)
        # preserve the embeddings by copying from gpu and converting to numpy
        video_embs = vid_emb.data.cpu().numpy().copy()
        # calculate cosine distance
        errorlistList.extend(cosine_calculate(video_embs, queryEmbeddings))
        if i % 100000 == 0:
            end = time.time()
            print(str(i) + ' in: ' + str(end - start))
            start = time.time()

    errorlist = np.asarray(errorlistList)

    # (run file, qrels file, first query index, one-past-last query index)
    avs_runs = [
        (saveFile_AVS16, 'AVS/avs.qrels.tv16', 0, 30),
        (saveFile_AVS17, 'AVS/avs.qrels.tv17', 30, 60),
        (saveFile_AVS18, 'AVS/avs.qrels.tv18', 60, 90),
    ]
    for run_file, _, first, last in avs_runs:
        # Query numbers are 1-based positions in the topics file.
        n_queries = len(lineList[first:last])
        _write_avs_run(run_file, range(first + 1, first + 1 + n_queries),
                       errorlist, VideoIDS)

    for run_file, qrels_file, _, _ in avs_runs:
        _evaluate_avs_run(run_file, qrels_file)