def process(options, collection, annotationName, pos_num):
    assert(annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio
    annotationNameStr = annotationName[:-4] + ('.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)

    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)

    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath, collection, 'Annotations', newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts, resultfile)

    first, second, last = annotationNameStr.split('%d')
    scriptfile = os.path.join(rootpath, collection, 'annotationfiles',
                              first + '0-%d' % (pos_bag_num-1) + second + '0-%d' % (neg_bag_num-1) + last)
    makedirsforfile(scriptfile)
    fout = open(scriptfile, 'w')
    fout.write('\n'.join(newAnnotationNames) + '\n')
    fout.close()

    if len(newAnnotationNames) == skip:
        return 0

    for concept in concepts:
        names, labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        positivePool = [x[0] for x in zip(names, labels) if x[1] > 0]
        negativePool = [x[0] for x in zip(names, labels) if x[1] < 0]

        for idxp in range(pos_bag_num):
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool
            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image', newAnnotationName, '%s.txt' % concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue
                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)
                assert(len(set(positiveBag).intersection(set(negativeBag))) == 0)
                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept, anno_idx, len(positiveBag), len(negativeBag), resultfile))
                writeAnnotations(positiveBag + negativeBag, [1]*len(positiveBag) + [-1]*len(negativeBag), resultfile)
def process(options, collection, conceptfile):
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    concepts = [x.strip() for x in open(conceptfile).readlines() if x.strip() and not x.strip().startswith('#')]
    resultdir = os.path.join(rootpath, collection, 'tagged,%s' % tpp)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resultfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        printStatus(INFO, 'nothing to do')
        return 0

    try:
        holdoutfile = os.path.join(rootpath, collection, 'ImageSets', 'holdout.txt')
        holdoutSet = set(map(str.strip, open(holdoutfile).readlines()))
    except:
        holdoutSet = set()

    hitlists = buildHitlists(collection, todo, tpp, rootpath)
    min_hit = 1e6
    max_hit = 0

    for concept in todo:
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resultfile, overwrite):
            continue
        subconcepts = concept.split('-')
        labeledSet = set(hitlists[subconcepts[0]])
        for i in range(1, len(subconcepts)):
            labeledSet = labeledSet.intersection(hitlists[subconcepts[i]])
        labeledSet = labeledSet.difference(holdoutSet)
        if len(labeledSet) == 0:
            printStatus(INFO, '%s has ZERO hit' % concept)
        else:
            printStatus(INFO, '%s, %d hits -> %s' % (concept, len(labeledSet), resultfile))
            makedirsforfile(resultfile)
            fw = open(resultfile, 'w')
            fw.write('\n'.join(labeledSet) + '\n')
            fw.close()
            if len(labeledSet) > max_hit:
                max_hit = len(labeledSet)
            if len(labeledSet) < min_hit:
                min_hit = len(labeledSet)

    printStatus(INFO, 'max hits: %d, min hits: %d' % (max_hit, min_hit))
def process(options):
    overwrite = options.overwrite
    inputeFile = options.inputeFile
    weightFile = options.weightFile

    weightFile = os.path.join('result', weightFile)
    if checkToSkip(weightFile, overwrite):
        sys.exit(0)
    makedirsforfile(weightFile)

    test()
    print '-' * 70

    best_perf = -10
    best_alpha = None
    sigma = 0.001
    data = load_data(os.path.join('result', inputeFile))
    for i in range(1):
        perf, alpha = coordinate_ascent(data, sigma)
        if perf > best_perf:
            best_perf = perf
            best_alpha = alpha

    print '*' * 70
    print 'optimized weights:', ' '.join(['%g' % x for x in best_alpha])
    print 'best tuned performance:', best_perf
    open(weightFile, 'w').write(' '.join(map(str, best_alpha)))
    print 'optimized weight parameters have been written into %s' % weightFile
def process(options, tagfile, tpp):
    if "stem" == tpp:
        worker = nltk.PorterStemmer()
        func = stemming
    else:
        worker = nltk.WordNetLemmatizer()
        func = lemmatize

    resultfile = os.path.join(os.path.split(tagfile)[0], 'id.userid.%stags.txt' % tpp)
    if checkToSkip(resultfile, options.overwrite):
        return 0
    makedirsforfile(resultfile)

    fw = codecs.open(resultfile, "w", encoding='utf8')
    parsed = 0
    obtained = 0
    for line in open(tagfile):
        elems = line.strip().split()
        parsed += 1
        if len(elems) > 2:
            newtags = []
            for tag in elems[2:]:
                try:
                    newtag = func(worker, tag.lower())
                except:
                    newtag = tag
                newtags.append(newtag.decode('utf-8'))
            newline = "\t".join([elems[0], elems[1], " ".join(newtags)])
            fw.write('%s\n' % newline)
            obtained += 1
    fw.close()
    print ('%d lines parsed, %d records obtained' % (parsed, obtained))
def process(options, conceptfile, tagvotesfile, resultfile):
    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = map(str.strip, open(conceptfile).readlines())
    concept2index = dict(zip(concepts, range(len(concepts))))
    data = open(tagvotesfile).readlines()
    print ('%d instances to dump' % len(data))

    concept_num = len(concepts)
    image_num = len(data)
    scores = np.zeros((image_num, concept_num)) - 1e4
    id_images = [None] * image_num

    for i in xrange(image_num):
        elems = str.split(data[i])
        id_images[i] = int(elems[0])
        del elems[0]
        for k in range(0, len(elems), 2):
            tag = elems[k]
            score = float(elems[k+1])
            j = concept2index.get(tag, -1)
            if j >= 0:
                scores[i, j] = score

    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'concepts': concepts, 'id_images': id_images, 'scores': scores}, output, -1)
    output.close()
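# Input format assumed by the parser above: each line of the tagvotes file is
# "<image_id> <tag_1> <score_1> <tag_2> <score_2> ...", for example:
#
#   123456 dog 0.93 grass 0.41 sky 0.12
#
# Tags absent from the concept list are skipped, and concepts that receive no
# vote keep the -1e4 sentinel score.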
def process(opt):
    rootpath = opt.rootpath
    collection = opt.collection
    feature = opt.feature
    overwrite = opt.overwrite

    feat_path = os.path.join(rootpath, collection, "FeatureData", feature)
    result_file = os.path.join(feat_path, "video2frames.txt")
    if checkToSkip(result_file, overwrite):
        sys.exit(0)
    makedirsforfile(result_file)

    feat_data = BigFile(feat_path)
    video2fmnos = {}
    int2str = {}
    for frame_id in feat_data.names:
        data = frame_id.strip().split("_")
        video_id = '_'.join(data[:-1])
        fm_no = data[-1]
        video2fmnos.setdefault(video_id, []).append(int(fm_no))
        if int(fm_no) not in int2str:
            int2str[int(fm_no)] = fm_no

    video2frames = {}
    for video_id, fmnos in video2fmnos.iteritems():
        for fm_no in sorted(fmnos):
            video2frames.setdefault(video_id, []).append(video_id + "_" + int2str[fm_no])

    write_dict(result_file, video2frames)
    print "written out to:", result_file
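# Frame ids are assumed to follow the "<video_id>_<frame_no>" convention, e.g.
# "video1_0" and "video1_30" both belong to video "video1". The resulting
# video2frames.txt maps each video id to its frame ids sorted by numeric
# frame number, e.g. video1 -> [video1_0, video1_30].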
def process(options, collection, annotationName, simdir, resultfile):
    rootpath = options.rootpath
    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    concept_num = len(concepts)
    id_images = readImageSet(collection, collection, rootpath)
    image_num = len(id_images)
    im2index = dict(zip(id_images, range(image_num)))
    print('%d instances, %d concepts to dump -> %s' % (image_num, concept_num, resultfile))

    scores = np.zeros((image_num, concept_num)) - 1e4
    for c_id, concept in enumerate(concepts):
        simfile = os.path.join(simdir, '%s.txt' % concept)
        ranklist = readRankingResults(simfile)
        for im, score in ranklist:
            idx = im2index[im]
            scores[idx, c_id] = score

    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'concepts': concepts, 'id_images': map(int, id_images), 'scores': scores}, output, -1)
    output.close()
def process(options, conceptfile, tagvotesfile, resultfile):
    if checkToSkip(resultfile, options.overwrite):
        return 0

    concepts = map(str.strip, open(conceptfile).readlines())
    concept2index = dict(zip(concepts, range(len(concepts))))
    data = open(tagvotesfile).readlines()
    print('%d instances to dump' % len(data))

    concept_num = len(concepts)
    image_num = len(data)
    scores = np.zeros((image_num, concept_num)) - 1e4
    id_images = [None] * image_num

    for i in xrange(image_num):
        elems = str.split(data[i])
        id_images[i] = elems[0]  # ids kept as strings, not int(elems[0])
        del elems[0]
        for k in range(0, len(elems), 2):
            tag = elems[k]
            score = float(elems[k + 1])
            j = concept2index.get(tag, -1)
            if j >= 0:
                scores[i, j] = score

    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'concepts': concepts, 'id_images': id_images, 'scores': scores}, output, -1)
    output.close()
def process(options, source_dir, feat_dim, imsetfile, result_dir):
    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    featurefile = BigFile(source_dir, feat_dim)

    makedirsforfile(resultfile)
    fw = open(resultfile, 'wb')

    done = []
    start = 0
    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
        renamed, vectors = featurefile.read(imset[start:end])
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert(len(done) == len(set(done)))
    resultfile = os.path.join(result_dir, 'id.txt')
    fw = open(resultfile, 'w')
    fw.write(' '.join(done))
    fw.close()
    print '%d requested, %d obtained' % (len(imset), len(done))
def process(options):
    overwrite = options.overwrite
    inputeFile = options.inputeFile
    weightFile = options.weightFile
    resultFile = options.resultFile

    weightFile = os.path.join('result', weightFile)
    weight = open(weightFile).readline().strip().split()
    weight = np.array(weight, dtype=np.float)

    resultFile = os.path.join('result', resultFile)
    if checkToSkip(resultFile, overwrite):
        sys.exit(0)

    fout = open(resultFile, 'w')
    done = 0
    for line in open(os.path.join('result', inputeFile)):
        elems = line.strip().split()
        vecs = map(float, elems[3:])
        vecs = np.array(vecs, dtype=np.float)
        assert(len(weight) == len(vecs))
        fout.write(" ".join(elems[:2]) + " " + str(np.dot(weight, vecs)) + '\n')
        done += 1
        if done % 10000 == 0:
            print done, 'Done'
    fout.close()
    print "final score results after relevance fusion have been written to %s" % resultFile
def main(option):
    rootpath = option.rootpath
    collection = option.collection
    threshold = option.threshold
    text_style = option.text_style

    vocab_file = os.path.join(rootpath, collection, 'TextData', 'vocabulary', text_style, 'word_vocab_%d.pkl' % threshold)
    counter_file = os.path.join(os.path.dirname(vocab_file), 'word_vocab_counter_%s.txt' % threshold)
    if checkToSkip(vocab_file, option.overwrite):
        sys.exit(0)
    makedirsforfile(vocab_file)

    vocab, word_counter = build_vocab(collection, text_style, threshold=threshold, rootpath=rootpath)
    with open(vocab_file, 'wb') as writer:
        pickle.dump(vocab, writer, pickle.HIGHEST_PROTOCOL)
    logger.info("Saved vocabulary file to %s", vocab_file)

    word_counter = [(word, cnt) for word, cnt in word_counter.items() if cnt >= threshold]
    word_counter.sort(key=lambda x: x[1], reverse=True)
    with open(counter_file, 'w') as writer:
        writer.write('\n'.join(map(lambda x: x[0] + ' %d' % x[1], word_counter)))
    logger.info("Saved vocabulary counter file to %s", counter_file)
def process(options, collection, annotationName):
    rootpath = options.rootpath
    overwrite = options.overwrite
    neg_filter = options.neg_filter

    concepts = readConcepts(collection, annotationName, rootpath)
    newAnnotationName = annotationName[:-4] + 'social.txt'
    ne = STRING_TO_NEGATIVE_ENGINE[neg_filter](collection, rootpath)

    newConcepts = []
    for concept in concepts:
        resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image', newAnnotationName, '%s.txt' % concept)
        if checkToSkip(resultfile, overwrite):
            newConcepts.append(concept)
            continue

        try:
            pos_set = readLabeledImageSet(collection, concept, tpp='lemm', rootpath=rootpath)
        except:
            pos_set = None
        if not pos_set:
            printStatus(INFO, '*** %s has no labeled examples, will be ignored ***' % concept)
            continue

        neg_set = ne.sample(concept, int(1e8))
        assert(len(set(pos_set).intersection(set(neg_set))) == 0)
        newlabels = [1] * len(pos_set) + [-1] * len(neg_set)
        newnames = pos_set + neg_set
        printStatus(INFO, "anno(%s) %d pos %d neg -> %s" % (concept, len(pos_set), len(neg_set), resultfile))
        writeAnnotations(newnames, newlabels, resultfile)
        newConcepts.append(concept)

    writeConceptsTo(newConcepts, collection, newAnnotationName, rootpath)
def process(options, testCollection, trainCollection, tagsimMethod):
    rootpath = options.rootpath
    overwrite = options.overwrite
    testsetName = options.testset if options.testset else testCollection
    tpp = options.tpp
    numjobs = options.numjobs
    job = options.job
    useWnVob = 1

    outputName = tagsimMethod + '-wn' if useWnVob else tagsimMethod

    if tagsimMethod == 'wns':
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, outputName, 'id.tagvotes.txt')
    else:
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, trainCollection, outputName, 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile = resultfile.replace("id.tagvotes.txt", "id.tagvotes.%d.%d.txt" % (numjobs, job))
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)
    makedirsforfile(resultfile)

    try:
        doneset = set([str.split(x)[0] for x in open(options.donefile).readlines()[:-1]])
    except:
        doneset = set()
    printStatus(INFO, "done set: %d" % len(doneset))

    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [testImageSet[i] for i in range(len(testImageSet)) if (i % numjobs + 1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs, job, len(testImageSet), resultfile))

    testreader = TagReader(testCollection, rootpath=rootpath)

    if tagsimMethod == "wns":
        tagrel = SIM_TO_TAGREL["wns"](trainCollection, useWnVob, "wup", rootpath)
    else:
        tagrel = SIM_TO_TAGREL[tagsimMethod](trainCollection, useWnVob, rootpath)

    done = 0
    fw = open(resultfile, "w")
    for qry_id in testImageSet:
        qry_tags = testreader.get(qry_id)
        tagvotes = tagrel.estimate(qry_tags)
        newline = qry_id + " " + " ".join(["%s %s" % (tag, niceNumber(vote, 8)) for (tag, vote) in tagvotes])
        fw.write(newline + "\n")
        done += 1
        if done % 1000 == 0:
            printStatus(INFO, "%d done" % done)
    fw.close()
    printStatus(INFO, "%d done" % done)
def process(options, testCollection, trainCollection, trainAnnotationName, feature, modelName):
    assert(modelName.startswith('fastlinear'))
    rootpath = options.rootpath
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    topk = options.topk
    outputName = '%s,%s' % (feature, modelName)

    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, trainAnnotationName, outputName, 'id.tagvotes.txt')
    if numjobs > 1:
        resultfile += '.%d.%d' % (numjobs, job)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    nr_of_concepts = len(concepts)

    test_imset = readImageSet(testCollection, testCollection, rootpath)
    test_imset = [test_imset[i] for i in range(len(test_imset)) if i % numjobs + 1 == job]
    test_imset = set(test_imset)
    nr_of_test_images = len(test_imset)
    printStatus(INFO, "working on %d-%d, %d test images -> %s" % (numjobs, job, nr_of_test_images, resultfile))

    ma = ModelArray(trainCollection, trainAnnotationName, feature, modelName, rootpath=rootpath)
    feat_file = StreamFile(os.path.join(rootpath, testCollection, "FeatureData", feature))

    makedirsforfile(resultfile)
    fw = open(resultfile, "w")

    done = 0
    feat_file.open()
    for _id, _vec in feat_file:
        if _id not in test_imset:
            continue
        res = ma.predict([_vec], prob=0)
        tagvotes = res[0]
        if topk > 0:
            tagvotes = tagvotes[:topk]
        newline = '%s %s\n' % (_id, " ".join(["%s %s" % (tag, niceNumber(vote, 6)) for (tag, vote) in tagvotes]))
        fw.write(newline)
        done += 1
        if done % 1e4 == 0:
            printStatus(INFO, "%d done" % done)
    feat_file.close()
    fw.close()
    printStatus(INFO, "%d done" % done)
    return done
def process(options, testCollection):
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding_model = options.embedding
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    blocksize = 2000

    embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)
    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, embedding_name, pY0, 'id.tagvotes.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, embedding_name)
    i2v = Image2Vec(label_file, label2vec_dir)

    tagger = ZeroshotTagger(synset_name=Y1, embedding_name=embedding_name, rootpath=rootpath)

    imset = readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)

    printStatus(INFO, 'tagging %d images' % len(imset))
    makedirsforfile(resfile)
    fw = open(resfile, 'w')

    start = 0
    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end))
        todo = imset[start:end]
        if not todo:
            break
        renamed, vectors = feat_file.read(todo)
        output = []
        for _id, _vec in zip(renamed, vectors):
            im_vec = i2v.embedding(_vec)
            pred = tagger.predict(im_vec, topk=r)
            output.append('%s %s\n' % (_id, ' '.join(['%s %s' % (x[0], x[1]) for x in pred])))
        start = end
        fw.write(''.join(output))
    fw.close()
def process(options, trainCollection, baseAnnotationName, startAnnotationName, feature, modelName):
    global train_model, compress_model, save_model
    assert(modelName in ['fik', 'fastlinear'])

    if 'fik' == modelName:
        from model_based.svms.fiksvm.svmutil import svm_train as train_model
        from model_based.svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from model_based.svms.fiksvm.fiksvm import fiksvm_save_model as save_model
    else:
        from model_based.svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from model_based.svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from model_based.svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    rootpath = options.rootpath
    overwrite = options.overwrite

    params = {'rootpath': rootpath,
              'trainCollection': trainCollection,
              'baseAnnotationName': baseAnnotationName,
              'startAnnotationName': startAnnotationName,
              'feature': feature,
              'model': modelName,
              'strategy': options.strategy,
              'iterations': options.iterations,
              'npr': options.npr,
              'nr_bins': options.nr_bins}

    concepts = readConcepts(trainCollection, startAnnotationName, rootpath)

    newAnnotationName = get_new_annotation_name(params)
    newModelName = get_model_name(params)
    modeldir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, newModelName)
    todo = [concept for concept in concepts if overwrite or os.path.exists(os.path.join(modeldir, '%s.txt' % concept)) is False]
    activeConcepts = [todo[i] for i in range(len(todo)) if (i % options.numjobs + 1) == options.job]

    params['feat_file'] = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))

    if 'fik' == modelName:
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))

    s_time = time.time()
    for concept in activeConcepts:
        printStatus(INFO, 'processing %s' % concept)
        modelfile = os.path.join(modeldir, '%s.model' % concept)
        if checkToSkip(modelfile, overwrite):
            continue
        new_model = NegativeBootstrap.learn(concept, params)
        makedirsforfile(modelfile)
        printStatus(INFO, 'save model to %s' % modelfile)
        save_model(modelfile, new_model)
        printStatus(INFO, '%s done' % concept)
    timecost = time.time() - s_time

    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(activeConcepts), ' '.join(activeConcepts)))
    printStatus(INFO, 'models stored at %s' % modeldir)
    printStatus(INFO, '%g seconds in total' % timecost)
def process(options, trainCollection, annotationName, testCollection):
    rootpath = options.rootpath
    m = options.m
    k_r = options.kr
    k_d = options.kd
    k_s = options.ks
    k_c = options.kc
    feature = options.feature
    add_bonus = options.bonus
    overwrite = options.overwrite

    #outputName = 'cotag,m%d,kr%d,kd%d,ks%d,kc%d,bonus%d' % (m, k_r, k_d, k_s, k_c, add_bonus)
    outputName = 'cotag'  # simplify the outputName to reduce the length of the result filename
    outputName = os.path.join(outputName, feature) if (k_c > 1e-6) else outputName
    resultfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, trainCollection, annotationName, outputName, 'id.tagvotes.txt')
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    testImageSet = readImageSet(testCollection, testCollection, rootpath=rootpath)
    test_tag_reader = TagReader(testCollection, rootpath=rootpath)

    if k_c < 1e-6:
        tagger = TagCooccurTagger(testCollection, trainCollection, annotationName, rootpath=rootpath)
    else:
        tagger = TagCooccurPlusTagger(testCollection, trainCollection, annotationName, feature=feature, rootpath=rootpath)
    tagger.m = m
    tagger.k_r = k_r
    tagger.k_d = k_d
    tagger.k_s = k_s
    tagger.k_c = k_c
    tagger.add_bonus = add_bonus

    makedirsforfile(resultfile)
    fw = open(resultfile, 'w')
    output = []
    done = 0
    for im in testImageSet:
        user_tags = test_tag_reader.get(im)
        tagvotes = tagger.predict(content=im, context=user_tags)
        newline = '%s %s' % (im, ' '.join(['%s %s' % (x[0], niceNumber(x[1], 6)) for x in tagvotes]))
        output.append(newline)
        done += 1
        if len(output) % 1e4 == 0:
            fw.write('\n'.join(output) + '\n')
            output = []
            printStatus(INFO, '%d done' % done)
    if output:
        fw.write('\n'.join(output) + '\n')
    fw.close()
    printStatus(INFO, '%d done' % done)
def process(options, collection):
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, collection, "tagrel", collection, 'tagpos,%s' % tpp, 'id.tagvotes.txt')
    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    imset = readImageSet(collection, collection, rootpath)
    printStatus(INFO, 'working on %d test images -> %s' % (len(imset), resultfile))

    reader = TagReader(collection, tpp=tpp, rootpath=rootpath)
    makedirsforfile(resultfile)
    fw = open(resultfile, "w")
    output = []
    done = 0
    for im in imset:
        tags = reader.get(im)
        tagSet = set()
        tagSeq = []
        for tag in str.split(tags):
            if tag not in tagSet:
                tagSeq.append(tag)
                tagSet.add(tag)
        assert(len(tagSeq) == len(tagSet))
        nr_tags = len(tagSeq)
        tagvotes = [(tagSeq[i], 1.0 - float(i) / nr_tags) for i in range(nr_tags)]
        newline = "%s %s" % (im, " ".join(["%s %g" % (x[0], x[1]) for x in tagvotes]))
        output.append(newline + "\n")
        done += 1
        if len(output) % 1e4 == 0:
            printStatus(INFO, '%d %s %s' % (done, im, ' '.join(['%s:%g' % (x[0], x[1]) for x in tagvotes[:3]])))
            fw.write("".join(output))
            fw.flush()
            output = []
    if output:
        fw.write("".join(output))
    fw.close()
    printStatus(INFO, 'done')
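# Worked example of the position-based vote above: for an image tagged
# "dog dog grass sky", the deduplicated sequence is [dog, grass, sky] and the
# votes are dog:1.0, grass:0.666667, sky:0.333333, i.e. 1 - i/nr_tags for the
# tag at 0-based position i.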
def submit(searchers, collection, annotationName, rootpath=ROOT_PATH, overwrite=0):
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_runs = len(searchers)

    for concept in concepts:
        for j in range(nr_of_runs):
            resultfile = os.path.join(searchers[j].getOutputdir(), concept + ".txt")
            if checkToSkip(resultfile, overwrite):
                continue
            searchresults = searchers[j].scoreCollection(concept)
            print ("%s: %s %d -> %s" % (searchers[j].name, concept, len(searchresults), resultfile))
            writeRankingResults(searchresults, resultfile)

    printStatus('%s.submit' % os.path.basename(__file__), "done")
def process(options, testCollection, annotationName, tagvotefile):
    rootpath = options.rootpath
    tpp = options.tpp
    tagged = options.tagged
    overwrite = options.overwrite

    resultdir = generate_result_dir(options, testCollection, tagvotefile)
    concepts = readConcepts(testCollection, annotationName, rootpath)
    todo = []
    for concept in concepts:
        resfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        print ('nothing to do')
        return 0

    nr_of_concepts = len(todo)
    labeled_set = [None] * nr_of_concepts
    if tagged:
        for i in range(nr_of_concepts):
            labeled_set[i] = set(readLabeledImageSet(testCollection, todo[i], tpp, rootpath))

    concept2index = dict(zip(todo, range(nr_of_concepts)))
    ranklists = [[] for i in range(nr_of_concepts)]

    for line in open(tagvotefile):
        elems = line.strip().split()
        imageid = elems[0]
        del elems[0]
        assert(len(elems) % 2 == 0)
        for i in range(0, len(elems), 2):
            tag = elems[i]
            c = concept2index.get(tag, -1)
            if c >= 0:
                if tagged and imageid not in labeled_set[c]:
                    continue
                score = float(elems[i+1])
                ranklists[c].append((imageid, score))

    for i in range(nr_of_concepts):
        concept = todo[i]
        resfile = os.path.join(resultdir, '%s.txt' % concept)
        ranklist = sorted(ranklists[i], key=lambda v: v[1], reverse=True)
        print ('%s %d -> %s' % (concept, len(ranklist), resfile))
        writeRankingResults(ranklist, resfile)
def process(options, label_file, label2vec_dir, testCollection, feature, new_feature):
    rootpath = options.rootpath
    overwrite = options.overwrite
    k = options.k
    blocksize = options.blocksize
    subset = options.subset if options.subset else testCollection

    resfile = os.path.join(rootpath, testCollection, 'FeatureData', new_feature, 'id.feature.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    imsetfile = os.path.join(rootpath, testCollection, 'ImageSets', '%s.txt' % subset)
    imset = map(str.strip, open(imsetfile).readlines())
    printStatus(INFO, '%d images to do' % len(imset))

    feat_file = BigFile(os.path.join(rootpath, testCollection, 'FeatureData', feature))
    im2vec = Image2Vec(label_file, label2vec_dir)

    makedirsforfile(resfile)
    fw = open(resfile, 'w')

    read_time = 0
    run_time = 0
    start = 0
    done = 0

    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

        s_time = time.time()
        renamed, test_X = feat_file.read(imset[start:end])
        read_time += time.time() - s_time

        s_time = time.time()
        output = [None] * len(renamed)
        for i in xrange(len(renamed)):
            vec = im2vec.embedding(test_X[i], k)
            output[i] = '%s %s\n' % (renamed[i], " ".join([niceNumber(x, 6) for x in vec]))
        run_time += time.time() - s_time

        start = end
        fw.write(''.join(output))
        done += len(output)

    printStatus(INFO, "%d done. read time %g seconds, run_time %g seconds" % (done, read_time, run_time))
    fw.close()
    return done
def process(options, pklfile, hdf5file):
    if checkToSkip(hdf5file, options.overwrite):
        return 0

    printStatus(INFO, 'Loading pkl file %s' % pklfile)
    with open(pklfile, 'r') as f:
        data = pkl.load(f)
    printStatus(INFO, 'Found %d elements.' % len(data))

    printStatus(INFO, 'Saving hdf5 file %s' % hdf5file)
    with h5py.File(hdf5file, 'w') as f:
        for k, v in data.items():
            printStatus(INFO, 'Dumping %s' % k)
            f[k] = v
    printStatus(INFO, 'Done.')
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite
    random = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    id_images = []
    tag2idx = dict(zip(concepts, xrange(len(concepts))))
    with open(os.path.join(rootpath, workingCollection, 'TextData', 'id.userid.lemmtags.txt')) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            tags = tags.split()
            if len(tags) > 0:
                tags = [(tag2idx.get(x, -1), y) for x, y in zip(tags, xrange(len(tags)))]
                # drop tags outside the concept vocabulary; the original -1
                # index would otherwise overwrite the last concept column
                tags = [t for t in tags if t[0] >= 0]
                if len(tags) > 0:
                    idx = np.array([x[0] for x in tags])
                    vals = 1. / (1. + np.array([x[1] for x in tags]))
                    tagmatrix[cnt, idx] = vals
            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if random:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts': concepts, 'id_images': map(int, id_images), 'scores': tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
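# Scoring note for the function above: a tag at 0-based position p in an
# image's tag list receives 1/(1+p), so the first tag scores 1.0, the second
# 0.5, the third 0.333, and so on. With the random option set, every cell gets
# an additional uniform perturbation bounded by the smallest positive score,
# so untagged images receive a random rather than all-zero ranking.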
def process(options, collection, text_style, threshold):
    logger.info("processing %s ...", collection)
    rootpath = options.rootpath
    overwrite = options.overwrite
    threshold = int(threshold)
    lang = which_language(collection)

    input_file = os.path.join(rootpath, collection, 'TextData', '%s.caption.txt' % collection)
    output_vocab_file = os.path.join(rootpath, collection, 'TextData/vocabulary', text_style, 'word_vocab_%d.txt' % threshold)
    output_vocab_counter_file = os.path.join(rootpath, collection, 'TextData/vocabulary', text_style, 'word_vocab_counter_%d.txt' % threshold)
    if checkToSkip(output_vocab_file, overwrite):
        sys.exit(0)
    makedirsforfile(output_vocab_file)

    word2counter = {}
    for index, line in enumerate(open(input_file)):
        sid, sent = line.strip().split(" ", 1)
        if text_style == "bow":
            sent = clean_str(sent, lang)
        elif text_style == "bow_filterstop":
            sent = clean_str_filter_stop(sent, lang)
        if index == 0:
            logger.info(line.strip())
            logger.info('After processing: %s %s', sid, ' '.join(sent))
        for word in sent:
            word2counter[word] = word2counter.get(word, 0) + 1

    sorted_wordCounter = sorted(word2counter.iteritems(), key=lambda a: a[1], reverse=True)
    output_line_vocab = [x[0] for x in sorted_wordCounter if x[1] >= threshold]
    output_line_vocab_counter = [x[0] + ' ' + str(x[1]) for x in sorted_wordCounter if x[1] >= threshold]

    open(output_vocab_file, 'w').write('\n'.join(output_line_vocab))
    open(output_vocab_counter_file, 'w').write('\n'.join(output_line_vocab_counter))
    logger.info('A vocabulary of %d words has been built for %s', len(output_line_vocab), collection)
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts': concepts, 'id_images': map(int, id_images), 'scores': tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    inputfile = options.inputfile
    resultname = options.resultname

    result_file = os.path.join('result', resultname)
    if checkToSkip(result_file, overwrite):
        sys.exit(0)
    makedirsforfile(result_file)

    # input of query
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)

    num2file = {}
    num2file[0] = os.path.join(rootpath, collection, 'Annotations', 'Image', 'concepts%s.txt' % collection)
    method_count = 1
    for line in open(os.path.join('result', inputfile)).readlines():
        num2file[method_count] = line.strip()
        method_count += 1

    fout = open(result_file, "w")
    for qid in qid_list:
        name2feature = {}
        for fnum in xrange(method_count):
            data_file = os.path.join(num2file[fnum], '%s.txt' % qid)
            data = readAnnotations(data_file)
            data.sort(key=lambda v: v[0], reverse=True)
            names = [x[0] for x in data]
            labels = [x[1] for x in data]
            if fnum == 0:
                key_names = names
                for i in xrange(len(names)):
                    name2feature[names[i]] = [labels[i]]
            else:
                assert(checkSameList(key_names, names))
                for i in xrange(len(names)):
                    name2feature[names[i]].append(labels[i])
        for img in key_names:
            fout.write('%s ' % qid + img + ' ' + ' '.join(name2feature[img]) + '\n')
    fout.close()
    print 'Combined results of the different methods have been written into %s' % result_file
def process(options, feat_dir):
    newname = ''
    if options.ssr:
        newname = 'ssr'
    newname += 'l%d' % options.p

    resfile = os.path.join(feat_dir.rstrip('/\\') + newname, 'feature.bin')
    if checkToSkip(resfile, options.overwrite):
        return 0

    with open(os.path.join(feat_dir, 'shape.txt')) as fr:
        nr_of_images, feat_dim = map(int, fr.readline().strip().split())

    offset = np.float32(1).nbytes * feat_dim
    res = array.array('f')
    fr = open(os.path.join(feat_dir, 'feature.bin'), 'rb')

    makedirsforfile(resfile)
    fw = open(resfile, 'wb')
    print ('>>> writing results to %s' % resfile)

    for i in xrange(nr_of_images):
        res.fromfile(fr, feat_dim)
        vec = res
        if options.ssr:
            vec = [np.sign(x) * np.sqrt(abs(x)) for x in vec]
        if options.p == 1:
            Z = sum(abs(x) for x in vec) + 1e-9
        else:
            Z = np.sqrt(sum([x**2 for x in vec])) + 1e-9
        if i % 1e4 == 0:
            print ('image_%d, norm_%d=%g' % (i, options.p, Z))
        vec = [x/Z for x in vec]
        del res[:]
        vec = np.array(vec, dtype=np.float32)
        vec.tofile(fw)

    fr.close()
    fw.close()
    print ('>>> %d lines parsed' % nr_of_images)

    shutil.copyfile(os.path.join(feat_dir, 'id.txt'), os.path.join(os.path.split(resfile)[0], 'id.txt'))
    shapefile = os.path.join(os.path.split(resfile)[0], 'shape.txt')
    with open(shapefile, 'w') as fw:
        fw.write('%d %d' % (nr_of_images, feat_dim))
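# Normalization sketch for the transform above: with the ssr option each
# component x is first mapped to sign(x)*sqrt(|x|), then the whole vector is
# divided by its L1 norm (p=1) or L2 norm (p!=1). For example,
# [4, -1] --ssr--> [2, -1] --L1--> [0.667, -0.333].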
def process(options, collection, annotationName, runfile, newRunName):
    rootpath = options.rootpath
    overwrite = options.overwrite
    dataset = options.testset if options.testset else collection

    concepts = readConcepts(collection, annotationName, rootpath)
    simdir = os.path.join(rootpath, collection, "SimilarityIndex", dataset)

    data = [x.strip() for x in open(runfile).readlines() if x.strip() and not x.strip().startswith("#")]
    models = []
    for line in data:
        weight, run = str.split(line)
        models.append((run, float(weight), 1))

    for concept in concepts:
        resultfile = os.path.join(simdir, newRunName, concept + ".txt")
        if checkToSkip(resultfile, overwrite):
            continue

        scorefile = os.path.join(simdir, models[0][0], concept + ".txt")
        if not os.path.exists(scorefile):
            print("%s does not exist. skip" % scorefile)
            continue

        ranklist = readRankingResults(scorefile)
        names = sorted([x[0] for x in ranklist])
        nr_of_images = len(names)
        name2index = dict(zip(names, range(nr_of_images)))
        print("%s %d" % (concept, nr_of_images))

        scoreTable = readImageScoreTable(concept, name2index, simdir, models, torank=options.torank)
        assert scoreTable.shape[1] == nr_of_images

        weights = [model[1] for model in models]
        scores = np.matrix(weights) * scoreTable
        scores = [float(scores[0, k]) for k in range(nr_of_images)]
        newranklist = [(names[i], scores[i]) for i in range(nr_of_images)]
        newranklist.sort(key=lambda v: (v[1], v[0]), reverse=True)
        writeRankingResults(newranklist, resultfile)
def process(options, trainCollection, annotationName):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, trainCollection, 'TextData', 'tag.concept-rank.%s.pkl' % annotationName)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(trainCollection, annotationName, rootpath)
    concept_num = len(concepts)
    concept2index = dict(zip(concepts, range(concept_num)))

    tcb = TagCooccurBase(trainCollection, rootpath=rootpath)
    tag_num = tcb.tag_num()
    DEFAULT_RANK = tag_num
    rank_matrix = np.zeros((tag_num, concept_num), dtype=np.int) + DEFAULT_RANK
    tag_list = []

    for i, u in enumerate(tcb.vob):
        ranklist = tcb.top_cooccur(u, -1)
        hit = 0
        for j, x in enumerate(ranklist):
            idx = concept2index.get(x[0], -1)
            if idx >= 0:
                rank_matrix[i, idx] = j + 1
                hit += 1
                if hit == concept_num:
                    break
        tag_list.append(u)
        if (i+1) % 1e4 == 0:
            printStatus(INFO, '%d done' % (i+1))

    assert(len(tag_list) == tag_num)

    import cPickle as pickle
    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'tags': tag_list, 'concepts': concepts, 'rank_matrix': rank_matrix}, output, -1)
    output.close()
    printStatus(INFO, '%dx%d dumped to %s' % (tag_num, concept_num, resultfile))
def process(options, collection):
    rootpath = options.rootpath
    tpp = options.tpp

    tagfile = os.path.join(rootpath, collection, "TextData", "id.userid.%stags.txt" % tpp)
    resultfile = os.path.join(rootpath, collection, "TextData", "%stag.userfreq.imagefreq.txt" % tpp)
    if checkToSkip(resultfile, options.overwrite):
        return 0

    printStatus(INFO, "parsing " + tagfile)
    tag2imfreq = {}
    tag2users = {}

    for line in open(tagfile):
        elems = str.split(line.strip())
        photoid = elems[0]
        userid = elems[1]
        tagset = set(elems[2:])
        for tag in tagset:
            tag2imfreq[tag] = tag2imfreq.get(tag, 0) + 1
            tag2users.setdefault(tag, []).append(userid)

    printStatus(INFO, "collecting user-freq and image-freq")
    results = []
    for tag, users in tag2users.iteritems():
        userfreq = len(set(users))
        imfreq = tag2imfreq[tag]
        results.append((tag, userfreq, imfreq))

    printStatus(INFO, "sorting in descending order (user-freq as primary key)")
    results.sort(key=lambda v: (v[1], v[2]), reverse=True)

    printStatus(INFO, "-> %s" % resultfile)
    with open(resultfile, 'w') as fw:
        fw.write(''.join(['%s %d %d\n' % (tag, userfreq, imfreq) for (tag, userfreq, imfreq) in results]))
def process(options, feat_dir):
    resultfile = os.path.join(feat_dir, "minmax.txt")
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    nr_of_images, feat_dim = map(int, open(os.path.join(feat_dir, "shape.txt")).readline().split())
    min_vals = [1e6] * feat_dim
    max_vals = [-1e6] * feat_dim

    offset = np.float32(1).nbytes * feat_dim
    res = array.array("f")

    feat_file = os.path.join(feat_dir, "feature.bin")
    id_file = os.path.join(feat_dir, "id.txt")
    nr_of_images = len(open(id_file).readline().strip().split())

    printStatus(INFO, "parsing %s" % feat_file)
    fr = open(feat_file, "rb")

    s_time = time.time()
    for i in xrange(nr_of_images):
        res.fromfile(fr, feat_dim)
        vec = res
        for d in xrange(feat_dim):
            if vec[d] > max_vals[d]:
                max_vals[d] = vec[d]
            if vec[d] < min_vals[d]:
                min_vals[d] = vec[d]
        del res[:]
    fr.close()
    timecost = time.time() - s_time
    printStatus(INFO, "%g seconds to find min [%g,%g] and max [%g,%g]" % (timecost, min(min_vals), max(min_vals), min(max_vals), max(max_vals)))

    with open(resultfile, "w") as f:
        f.write("%s\n" % " ".join(map(str, min_vals)))
        f.write("%s\n" % " ".join(map(str, max_vals)))
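# minmax.txt layout, as written above: two whitespace-separated lines, the
# first holding the per-dimension minima and the second the per-dimension
# maxima (feat_dim values each). A minimal read-back sketch:
#
#   with open('minmax.txt') as f:
#       min_vals = map(float, f.readline().split())
#       max_vals = map(float, f.readline().split())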
def process(options, collection, feature):
    rootpath = options.rootpath
    tpp = options.tpp
    k = 1000  # options.k
    numjobs = options.numjobs
    job = options.job
    overwrite = options.overwrite

    feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
    feat_file = BigFile(feat_dir)

    hitlists = buildHitLists(collection, tpp, rootpath)
    printStatus(INFO, "nr of tags: %d" % len(hitlists))

    vob = sorted(hitlists.keys())
    vob = [vob[i] for i in range(len(vob)) if i % numjobs == job - 1]
    printStatus(INFO, "working on %d-%d: %d tags" % (numjobs, job, len(vob)))

    for tag_idx, tag in enumerate(vob):
        resultdir = os.path.join(rootpath, collection, "FeatureIndex", feature, tag[:2], tag)
        binfile = os.path.join(resultdir, "feature.bin")
        if checkToSkip(binfile, overwrite):
            continue

        hitlist = hitlists[tag]
        hitlist = hitlist[:k]  # keep at most 1000 images per tag
        renamed, vecs = feat_file.read(hitlist)

        makedirsforfile(binfile)
        np.array(vecs).astype(np.float32).tofile(binfile)

        idfile = os.path.join(resultdir, "id.txt")
        fw = open(idfile, "w")
        fw.write(" ".join(renamed))
        fw.close()

        shapefile = os.path.join(resultdir, "shape.txt")
        fw = open(shapefile, "w")
        fw.write("%d %d" % (len(renamed), len(vecs[0])))
        fw.close()

        if tag_idx % 1e3 == 0:
            printStatus(INFO, "%d - %s, %d images" % (tag_idx, tag, len(hitlist)))
def process(options, synset_file, synset_name):
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding = options.embedding

    resdir = os.path.join(rootpath, 'synset2vec', synset_name, '%s,%s,%s' % (corpus, word2vec_model, embedding))
    resfile = os.path.join(resdir, 'feature.bin')
    if checkToSkip(resfile, overwrite):
        return 0

    synsets = map(str.strip, open(synset_file).readlines())
    s2v = get_synset_encoder(embedding)(corpus, word2vec_model, rootpath=rootpath)

    makedirsforfile(resfile)
    good = []
    with open(resfile, 'wb') as fw:
        for i, wnid in enumerate(synsets):
            vec = s2v.embedding(wnid)
            if vec is not None:
                vec = np.array(vec, dtype=np.float32)
                vec.tofile(fw)
                good.append(wnid)

    printStatus(INFO, '%d done, %d okay' % (i+1, len(good)))

    with open(os.path.join(resdir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(good))

    with open(os.path.join(resdir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(good), s2v.get_feat_dim()))
def process(options, feat_dir, imsetfile, result_dir):
    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    feat_file = BigFile(feat_dir)

    makedirsforfile(resultfile)
    fw = open(resultfile, 'wb')

    done = []
    start = 0
    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
        toread = imset[start:end]
        if len(toread) == 0:
            break
        renamed, vectors = feat_file.read(toread)
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert(len(done) == len(set(done)))

    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(done))

    with open(os.path.join(result_dir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(done), feat_file.ndims))

    print '%d requested, %d obtained' % (len(imset), len(done))
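# A minimal read-back sketch for the three files written above (illustrative;
# the result_dir path is an assumption). feature.bin is a row-major dump of
# float32 vectors, id.txt holds space-separated ids, shape.txt holds "n d":
#
#   import numpy as np
#   ids = open('result_dir/id.txt').read().split()
#   n, d = map(int, open('result_dir/shape.txt').read().split())
#   feats = np.fromfile('result_dir/feature.bin', dtype=np.float32).reshape(n, d)
#   assert len(ids) == n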
def process(options, inputfile, resultfile):
    assert(inputfile.endswith('.pkl'))
    if checkToSkip(resultfile, options.overwrite):
        return 0

    import cPickle as pickle
    data = pickle.load(open(inputfile, 'rb'))
    scores = data['scores']
    id_images = data['id_images']
    concepts = data['concepts']

    nr_of_images = len(id_images)
    nr_of_concepts = len(concepts)
    assert(scores.shape[0] == nr_of_images)
    assert(scores.shape[1] == nr_of_concepts)

    DEFAULT_RANK = nr_of_concepts
    rank_matrix = np.zeros((nr_of_images, nr_of_concepts), dtype=np.int) + DEFAULT_RANK

    for i in xrange(nr_of_images):
        sorted_index = np.argsort(scores[i, :])  # in ascending order
        for j in range(nr_of_concepts):
            c_idx = sorted_index[j]
            rank = nr_of_concepts - j
            rank_matrix[i, c_idx] = rank
        if (i+1) % 1e5 == 0:
            printStatus(INFO, '%d done' % (i+1))

    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'id_images': id_images, 'concepts': concepts, 'rank_matrix': rank_matrix}, output, -1)
    output.close()
    printStatus(INFO, '%dx%d dumped to %s' % (nr_of_images, nr_of_concepts, resultfile))
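# Rank semantics of the matrix built above: per image, the concept with the
# highest score gets rank 1 and the lowest-scoring concept gets rank
# nr_of_concepts, since argsort is ascending and rank = nr_of_concepts - j.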
def main():
    opt = parse_args()
    print(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    trainCollection = opt.trainCollection
    valCollection = opt.valCollection
    testCollection = opt.testCollection

    if opt.loss_fun == "mrl" and opt.measure == "cosine":
        assert opt.text_norm is True
        assert opt.visual_norm is True

    # checkpoint path
    model_info = '%s_concate_%s_dp_%.1f_measure_%s' % (opt.model, opt.concate, opt.dropout, opt.measure)

    # text-side multi-level encoding info
    text_encode_info = 'vocab_%s_word_dim_%s_text_rnn_size_%s_text_norm_%s' % \
        (opt.vocab, opt.word_dim, opt.text_rnn_size, opt.text_norm)
    text_encode_info += "_kernel_sizes_%s_num_%s" % (opt.text_kernel_sizes, opt.text_kernel_num)

    # video-side multi-level encoding info
    visual_encode_info = 'visual_feature_%s_visual_rnn_size_%d_visual_norm_%s' % \
        (opt.visual_feature, opt.visual_rnn_size, opt.visual_norm)
    visual_encode_info += "_kernel_sizes_%s_num_%s" % (opt.visual_kernel_sizes, opt.visual_kernel_num)

    # common space learning info
    mapping_info = "mapping_text_%s_img_%s" % (opt.text_mapping_layers, opt.visual_mapping_layers)
    loss_info = 'loss_func_%s_margin_%s_direction_%s_max_violation_%s_cost_style_%s' % \
        (opt.loss_fun, opt.margin, opt.direction, opt.max_violation, opt.cost_style)
    optimizer_info = 'optimizer_%s_lr_%s_decay_%.2f_grad_clip_%.1f_val_metric_%s' % \
        (opt.optimizer, opt.learning_rate, opt.lr_decay_rate, opt.grad_clip, opt.val_metric)

    opt.logger_name = os.path.join(rootpath, trainCollection, opt.cv_name, valCollection,
                                   model_info, text_encode_info, visual_encode_info,
                                   mapping_info, loss_info, optimizer_info, opt.postfix)
    print(opt.logger_name)

    if checkToSkip(os.path.join(opt.logger_name, 'model_best.pth.tar'), opt.overwrite):
        sys.exit(0)
    if checkToSkip(os.path.join(opt.logger_name, 'val_metric.txt'), opt.overwrite):
        sys.exit(0)
    makedirsforfile(os.path.join(opt.logger_name, 'val_metric.txt'))
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    opt.text_kernel_sizes = map(int, opt.text_kernel_sizes.split('-'))
    opt.visual_kernel_sizes = map(int, opt.visual_kernel_sizes.split('-'))

    # collections: train, val
    collections = {'train': trainCollection, 'val': valCollection}
    cap_file = {'train': '%s.caption.txt' % trainCollection,
                'val': '%s.caption.txt' % valCollection}

    # caption
    caption_files = {x: os.path.join(rootpath, collections[x], 'TextData', cap_file[x])
                     for x in collections}

    # Load visual features
    visual_feat_path = {x: os.path.join(rootpath, collections[x], 'FeatureData', opt.visual_feature)
                        for x in collections}
    visual_feats = {x: BigFile(visual_feat_path[x]) for x in visual_feat_path}
    opt.visual_feat_dim = visual_feats['train'].ndims

    # set bow vocabulary and encoding
    bow_vocab_file = os.path.join(rootpath, opt.trainCollection, 'TextData', 'vocabulary', 'bow', opt.vocab + '.pkl')
    bow_vocab = pickle.load(open(bow_vocab_file, 'rb'))
    bow2vec = get_text_encoder('bow')(bow_vocab)
    opt.bow_vocab_size = len(bow_vocab)

    # set rnn vocabulary
    rnn_vocab_file = os.path.join(rootpath, opt.trainCollection, 'TextData', 'vocabulary', 'rnn', opt.vocab + '.pkl')
    rnn_vocab = pickle.load(open(rnn_vocab_file, 'rb'))
    opt.vocab_size = len(rnn_vocab)

    # initialize word embedding
    opt.we_parameter = None
    if opt.word_dim == 500:
        w2v_data_path = os.path.join(rootpath, "word2vec", 'flickr', 'vec500flickr30m')
        opt.we_parameter = get_we_parameter(rnn_vocab, w2v_data_path)

    # mapping layer structure
    opt.text_mapping_layers = map(int, opt.text_mapping_layers.split('-'))
    opt.visual_mapping_layers = map(int, opt.visual_mapping_layers.split('-'))
    if opt.concate == 'full':
        opt.text_mapping_layers[0] = opt.bow_vocab_size + opt.text_rnn_size * 2 + \
            opt.text_kernel_num * len(opt.text_kernel_sizes)
        opt.visual_mapping_layers[0] = opt.visual_feat_dim + opt.visual_rnn_size * 2 + \
            opt.visual_kernel_num * len(opt.visual_kernel_sizes)
    elif opt.concate == 'reduced':
        opt.text_mapping_layers[0] = opt.text_rnn_size * 2 + \
            opt.text_kernel_num * len(opt.text_kernel_sizes)
        opt.visual_mapping_layers[0] = opt.visual_rnn_size * 2 + \
            opt.visual_kernel_num * len(opt.visual_kernel_sizes)
    else:
        raise NotImplementedError('Model %s not implemented' % opt.model)

    # set data loader
    video2frames = {x: read_dict(os.path.join(rootpath, collections[x], 'FeatureData',
                                              opt.visual_feature, 'video2frames.txt'))
                    for x in collections}
    data_loaders = data.get_data_loaders(caption_files, visual_feats, rnn_vocab, bow2vec,
                                         opt.batch_size, opt.workers, opt.n_caption,
                                         video2frames=video2frames)

    # Construct the model
    model = get_model(opt.model)(opt)
    opt.we_parameter = None  # drop the embedding copy once the model has consumed it

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, data_loaders['val'], model, measure=opt.measure)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    no_impr_counter = 0
    lr_counter = 0
    best_epoch = None
    fout_val_metric_hist = open(os.path.join(opt.logger_name, 'val_metric_hist.txt'), 'w')
    for epoch in range(opt.num_epochs):
        print('Epoch[{0} / {1}] LR: {2}'.format(
            epoch, opt.num_epochs, get_learning_rate(model.optimizer)[0]))
        print('-' * 10)

        # train for one epoch
        train(opt, data_loaders['train'], model, epoch)

        # evaluate on validation set
        rsum = validate(opt, data_loaders['val'], model, measure=opt.measure)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        print(' * Current perf: {}'.format(rsum))
        print(' * Best perf: {}'.format(best_rsum))
        print('')
        fout_val_metric_hist.write('epoch_%d: %f\n' % (epoch, rsum))
        fout_val_metric_hist.flush()

        if is_best:
            save_checkpoint({
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            }, is_best, filename='checkpoint_epoch_%s.pth.tar' % epoch,
                prefix=opt.logger_name + '/', best_epoch=best_epoch)
            best_epoch = epoch

        lr_counter += 1
        decay_learning_rate(opt, model.optimizer, opt.lr_decay_rate)
        if not is_best:
            # Early stop occurs if the validation performance does not improve in ten consecutive epochs
            no_impr_counter += 1
            if no_impr_counter > 10:
                print('Early stopping happened.\n')
                break

            # When the validation performance decreases after an epoch,
            # we divide the learning rate by 2 and continue training,
            # but we use each learning rate for at least 3 epochs.
            if lr_counter > 2:
                decay_learning_rate(opt, model.optimizer, 0.5)
                lr_counter = 0
        else:
            no_impr_counter = 0

    fout_val_metric_hist.close()

    print('best performance on validation: {}\n'.format(best_rsum))
    with open(os.path.join(opt.logger_name, 'val_metric.txt'), 'w') as fout:
        fout.write('best performance on validation: ' + str(best_rsum))

    # generate evaluation shell script
    if testCollection == 'iacc.3':
        template = ''.join(open('util/TEMPLATE_do_predict.sh').readlines())
        scriptStr = template.replace('@@@query_sets@@@', 'tv16.avs.txt,tv17.avs.txt,tv18.avs.txt')
    else:
        template = ''.join(open('util/TEMPLATE_do_test.sh').readlines())
        scriptStr = template.replace('@@@n_caption@@@', str(opt.n_caption))
    scriptStr = scriptStr.replace('@@@rootpath@@@', rootpath)
    scriptStr = scriptStr.replace('@@@testCollection@@@', testCollection)
    scriptStr = scriptStr.replace('@@@logger_name@@@', opt.logger_name)
    scriptStr = scriptStr.replace('@@@overwrite@@@', str(opt.overwrite))

    # perform evaluation on test set
    runfile = 'do_test_%s_%s.sh' % (opt.model, testCollection)
    open(runfile, 'w').write(scriptStr + '\n')
    os.system('chmod +x %s' % runfile)
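# get_learning_rate and decay_learning_rate are helpers defined elsewhere in
# the repo; minimal sketches of what they presumably do, assuming a standard
# PyTorch optimizer (illustrative, not the originals):
def get_learning_rate(optimizer):
    """Return the learning rate of each parameter group."""
    return [param_group['lr'] for param_group in optimizer.param_groups]

def decay_learning_rate(opt, optimizer, decay):
    """Multiply the learning rate of each parameter group by decay."""
    for param_group in optimizer.param_groups:
        param_group['lr'] *= decay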
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    feature = options.feature
    method = options.method
    sigma = options.sigma

    # result paths
    ranking_result_path = os.path.join(rootpath, collection, 'SimilarityIndex', collection,
                                       'MetaData', method, feature)
    DCG_result_path = os.path.join(rootpath, collection, 'DCG', method, feature)
    if checkToSkip(ranking_result_path, overwrite):
        sys.exit(0)
    if checkToSkip(DCG_result_path, overwrite):
        sys.exit(0)

    # input of queries
    qid_query_file = os.path.join(rootpath, collection, 'Annotations', 'qid.text.txt')
    qid_list, query_list = readQidQuery(qid_query_file)
    qid2query = dict(zip(qid_list, query_list))

    # input of images
    img_feat_path = os.path.join(rootpath, collection, 'FeatureData', feature)
    img_feats = BigFile(img_feat_path)

    # the model to calculate DCG@25
    scorer = getScorer("DCG@25")

    done = 0
    qid2dcg = collections.OrderedDict()
    qid2iid_label_score = {}
    for qid in qid_list:
        iid_list, label_list = readAnnotationsFrom(collection, 'concepts%s.txt' % collection,
                                                   qid, False, rootpath)
        renamed, test_X = img_feats.read(iid_list)

        parzen_list = []
        for imidx in iid_list:
            parzen_list.append(calParzen(img_feats.read_one(imidx), test_X, sigma))

        # a faster, shuffled variant kept for reference:
        # parzen_list_suffle = calParzen_fast(test_X, len(renamed), sigma)
        # parzen_list = []
        # for imidx in iid_list:
        #     parzen_list.append(parzen_list_suffle[renamed.index(imidx)])

        sorted_tuple = sorted(zip(iid_list, label_list, parzen_list),
                              key=lambda v: v[2], reverse=True)
        qid2iid_label_score[qid] = sorted_tuple

        # calculate DCG@25
        sorted_label = [x[1] for x in sorted_tuple]
        qid2dcg[qid] = scorer.score(sorted_label)
        printMessage("Done", qid, qid2query[qid])

        done += 1
        if done % 20 == 0:
            writeRankingResult(ranking_result_path, qid2iid_label_score)
            qid2iid_label_score = {}

    writeDCGResult(DCG_result_path, qid2dcg)
    writeRankingResult(ranking_result_path, qid2iid_label_score)
    print "average DCG@25: %f" % (1.0 * sum(qid2dcg.values()) / len(qid2dcg.values()))

    result_path_file = "result/individual_result_pathes.txt"
    if os.path.exists(result_path_file):
        fout = open(result_path_file, 'a')
    else:
        makedirsforfile(result_path_file)
        fout = open(result_path_file, 'w')
    fout.write(ranking_result_path + '\n')
    fout.close()
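# calParzen is defined elsewhere in the repo. A hedged sketch of a Parzen
# window score, assuming a Gaussian kernel with bandwidth sigma (the actual
# kernel used by the repo may differ):
import numpy as np

def calParzen(x, X, sigma):
    """Average Gaussian-kernel similarity of point x to all rows of X."""
    x = np.asarray(x, dtype=float)
    X = np.asarray(X, dtype=float).reshape(len(X), -1)
    dist2 = ((X - x) ** 2).sum(axis=1)
    return np.exp(-dist2 / (2.0 * sigma ** 2)).mean()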
nr_neg_bags = cmdOpts.getInt('nr_neg_bags')
nr_neg = nr_pos * neg_pos_ratio

concepts = readConcepts(collection, annotationName)
annotationNameStr = generate_new_annotation_template(cmdOpts)

nr_skipped = 0
newAnnotationNames = [None] * (nr_pos_bags * nr_neg_bags)
for idxp in range(nr_pos_bags):
    for idxn in range(nr_neg_bags):
        anno_idx = idxp * nr_neg_bags + idxn
        newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
        resultfile = os.path.join(rootpath, collection, 'Annotations', newAnnotationNames[anno_idx])
        if checkToSkip(resultfile, overwrite):
            nr_skipped += 1
            continue
        writeConcepts(concepts, resultfile)

first, second, last = annotationNameStr.split('%d')
scriptfile = os.path.join(rootpath, collection, 'annotationfiles',
                          first + '0-%d' % (nr_pos_bags - 1) + second + '0-%d' % (nr_neg_bags - 1) + last)
makedirsforfile(scriptfile)
fout = open(scriptfile, 'w')
fout.write('\n'.join(newAnnotationNames) + '\n')
fout.close()

if nr_skipped == (nr_pos_bags * nr_neg_bags):
    sys.exit(0)
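# Illustrative only: how the two '%d' placeholders in the annotation-name
# template are filled with the positive/negative bag indices. The template
# string below is hypothetical; the real one comes from
# generate_new_annotation_template(cmdOpts).
annotationNameStr = 'concepts.random100.%d.npr1.%d.txt'
print(annotationNameStr % (0, 2))   # -> concepts.random100.0.npr1.2.txt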
freq_threshold = int(sys.argv[3])
overwrite = int(sys.argv[4])

for collection in collection_list:
    print "processing %s ..." % collection

    input_file = os.path.join(rootpath, "%s/TextData/%s.caption.txt" % (collection, collection))
    output_vocab_file = os.path.join(
        rootpath, "%s/TextData/vocabulary/%s/word_vocab_%d.txt" % (collection, text_style, freq_threshold))
    output_vocab_counter_file = os.path.join(
        rootpath, "%s/TextData/vocabulary/%s/word_vocab_counter_%d.txt" % (collection, text_style, freq_threshold))
    if checkToSkip(output_vocab_file, overwrite):
        sys.exit(0)
    if checkToSkip(output_vocab_counter_file, overwrite):
        sys.exit(0)
    makedirsforfile(output_vocab_file)

    word2counter = {}
    len2counter = {}
    for index, line in enumerate(open(input_file)):
        sid, sent = line.strip().split(" ", 1)
        if text_style == "bow":
            sent = clean_str(sent)
        elif text_style == "bow_filterstop":
            sent = clean_str_filter_stop(sent)
        length = len(sent)
        len2counter[length] = len2counter.get(length, 0) + 1
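# The script fragment above is truncated mid-loop. A hedged, self-contained
# sketch of the presumable continuation, based on the word2counter and
# freq_threshold names already introduced: count word frequencies, then keep
# words whose count reaches the threshold, sorted by count. All names here
# are illustrative.
def build_vocab(sentences, freq_threshold):
    word2counter = {}
    for sent in sentences:
        for word in sent.split():
            word2counter[word] = word2counter.get(word, 0) + 1
    vocab = [(w, c) for w, c in word2counter.items() if c >= freq_threshold]
    vocab.sort(key=lambda v: v[1], reverse=True)
    return vocab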
def process(options, trainCollection, trainAnnotationName, valCollection, valAnnotationName,
            feature, modelName):
    assert (modelName in ['fik', 'fastlinear'])
    rootpath = options.rootpath
    autoweight = 1  # options.autoweight
    beta = 0.5
    metric = options.metric
    scorer = getScorer(metric)
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    params = {}
    if 'fik' == modelName:
        from fiksvm.svmutil import svm_train as train_model
        from fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from fiksvm.svm import KERNEL_TYPE
        nr_bins = options.nr_bins
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        with open(minmax_file, 'r') as f:
            params['min_vals'] = map(float, str.split(f.readline()))
            params['max_vals'] = map(float, str.split(f.readline()))
    else:
        from fastlinear.liblinear193.python.liblinearutil import train as train_model
        from fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        modelName = 'fastlinear'

    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
    valConcepts = readConcepts(valCollection, valAnnotationName, rootpath=rootpath)
    concept_num = len(concepts)
    for i in range(concept_num):
        assert (concepts[i] == valConcepts[i])

    resultdir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName,
                             '%s,best_params' % modelName,
                             '%s,%s,%s' % (valCollection, valAnnotationName, feature))

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.txt')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    todo = [todo[i] for i in range(len(todo)) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    val_feat_file = BigFile(os.path.join(rootpath, valCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims
    assert (feat_dim == val_feat_file.ndims)

    for concept in todo:
        names, labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept,
                                            skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        renamed, vectors = train_feat_file.read(names)
        Ys = [name2label[x] for x in renamed]

        # class weights: beta balances positive against negative examples
        # (renamed from np/nn to avoid shadowing the conventional numpy alias)
        nr_pos = len([1 for lab in labels if 1 == lab])
        nr_neg = len([1 for lab in labels if -1 == lab])
        wp = float(beta) * (nr_pos + nr_neg) / nr_pos
        wn = (1.0 - beta) * (nr_pos + nr_neg) / nr_neg

        names, labels = readAnnotationsFrom(valCollection, valAnnotationName, concept,
                                            skip_0=True, rootpath=rootpath)
        val_name2label = dict(zip(names, labels))
        val_renamed, val_vectors = val_feat_file.read(names)

        min_perf = 2.0
        worst_C = 1.0
        max_perf = 0.0
        best_C = 1.0
        best_scores = None
        best_labels = None
        for C in [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]:
            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            model = train_model(Ys, vectors, svm_params + ' -q')
            new_model = compress_model([model], [1.0], feat_dim, params)

            # rank validation images by predicted score and evaluate
            ranklist = [(val_renamed[i], new_model.predict(val_vectors[i]))
                        for i in range(len(val_renamed))]
            ranklist.sort(key=lambda v: v[1], reverse=True)
            sorted_labels = [val_name2label[x[0]] for x in ranklist]
            perf = scorer.score(sorted_labels)
            if max_perf < perf:
                max_perf = perf
                best_C = C
                best_scores = [x[1] for x in ranklist]
                best_labels = list(sorted_labels)
            if min_perf > perf:
                min_perf = perf
                worst_C = C

        # fit a sigmoid (Platt scaling) on the best validation run
        [A, B] = sigmoid_train(best_scores, best_labels)
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        printStatus(INFO, '%s -> worstAP=%g, worst_C=%g, bestAP=%g, best_C=%g, a=%g, b=%g' %
                    (concept, min_perf, worst_C, max_perf, best_C, A, B))
        makedirsforfile(resultfile)
        fw = open(resultfile, 'w')
        fw.write('bestAP=%g, best_C=%g, a=%g, b=%g' % (max_perf, best_C, A, B))
        fw.close()
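# sigmoid_train is defined elsewhere in the repo (Platt scaling): it fits A
# and B so that P(y=1 | score) = 1 / (1 + exp(A*score + B)). A hedged sketch
# of how the fitted parameters would be applied at prediction time, following
# the numerically stable form used by LIBSVM:
import math

def sigmoid_predict(score, A, B):
    fApB = A * score + B
    if fApB >= 0:
        return math.exp(-fApB) / (1.0 + math.exp(-fApB))
    return 1.0 / (1.0 + math.exp(fApB))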