def merge_dbs(input_dbs, output_db, shuffle=True):
    map_size = 2**40
    total_size = sum([db.size(input_db) for input_db in input_dbs])
    # assign every entry a new id; shuffle to interleave the input databases
    ids = range(total_size)
    if shuffle:
        random.shuffle(ids)
    env_out = lmdb.open(output_db, readonly=False, map_size=map_size)
    idx = 0
    for input_db in input_dbs:
        env_in = lmdb.open(input_db, readonly=True, map_size=map_size)
        with env_out.begin(write=True) as txn_out:
            with env_in.begin() as txn_in:
                # copy every entry under its new (possibly shuffled) key
                for key, data in txn_in.cursor():
                    txn_out.put(db.str_id(ids[idx]), data)
                    idx += 1
                    if idx % db.LOG_EVERY == 0:
                        print 'Processed {0} samples'.format(idx)
        env_in.close()
    env_out.close()
    if idx % db.LOG_EVERY != 0:
        print 'Processed {0} samples'.format(idx)
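# Hedged usage sketch for merge_dbs: the LMDB paths below are hypothetical
# examples, and `lmdb`, `random` and the project's `db` helper (size, str_id,
# LOG_EVERY) are assumed to be imported at the top of this module.
if __name__ == '__main__':
    merge_dbs(['train_part0_lmdb', 'train_part1_lmdb'],
              'train_merged_lmdb', shuffle=True)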
def init(self, db_path=None, batch_size=None):
    if self.is_opened():
        self.close()
    if db_path is not None:
        self.db_path = db_path
    self.size = db.size(self.db_path)
    self.entry_shape = db.entry_shape(self.db_path)
    self.shape = tuple([self.size] + list(self.entry_shape))
    if batch_size is not None:
        self.batch_size = batch_size
print usage
sys.exit(0)

if ('--verbose' in sys.argv):
    verbose = 1

dirArgIndex = sys.argv.index('--dir')
try:
    dir = sys.argv[dirArgIndex + 1]
except IndexError:
    print 'Invalid directory argument'
    sys.exit(2)  # exit here as well, otherwise `dir` is unbound below

if (not os.path.exists(dir)) or (not os.path.isdir(dir)):
    print 'Invalid directory argument'
    sys.exit(2)

images = listFiles(
    dir,
    '*.JPG;*.jpg;*.png;*.jpeg;*.bmp;*.dcx;*.gif;*.pcx;*.ppm;*.psd;*.tga;*.tif;*.tiff;*.xpm'
)

index = db.size()
for image in images:
    imat = preprocess.ProcessImage(image)
    if (numpy.count_nonzero(imat) > 0):
        iid = str(index).zfill(6)
        db.addimg(iid, image)
        db.addsig(iid, imat[0], 0)
        db.addsig(iid, imat[1], 1)
        db.addsig(iid, imat[2], 2)
        if (verbose):
            print 'added ', iid, ':', image
        index += 1
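# Hedged sketch of the listFiles helper assumed above (the project's real
# implementation may differ): it expands the semicolon-separated glob patterns
# against a single directory. The name listFiles_sketch is hypothetical.
import fnmatch
import os

def listFiles_sketch(directory, patterns):
    matches = []
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if os.path.isfile(path) and any(fnmatch.fnmatch(name, pattern)
                                        for pattern in patterns.split(';')):
            matches.append(path)
    return matches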
""" # Logger logger = log.file_console_log() # Connect to the MongoDB database stored_tweets = db.twitter_collection() # Retrieve tweets from user timeline and store new ones in database # If any error occurs, log it in log file try: timeline_tweets = twitter.home_timeline() except Exception, error: logger.error(traceback.format_exc()[:-1]) # log error raise error else: before = db.size(stored_tweets) # initial collection size # Insert each tweet in database # If any error occurs, log it in log file for tweet in timeline_tweets: try: db.insert(tweet, stored_tweets) # insert in mongoDB collection # Note: if tweet already in DB, the insertion will fail silently except db.DBError, error: logger.error(traceback.format_exc()[:-1]) # log error raise error after = db.size(stored_tweets) # new collection size # log insertion information message = "[%s] +%d new, %d stored" % (db.name(stored_tweets), after - before,
def ScoreQuery(query):
    """
    generate scores for the query image using m wavelet coefficients
    query is a path to the query image file
    returns a list of image scores
    the number of wavelet coefficients is defined in the preprocess module
    """
    qmat = preprocess.ProcessImage(query)
    if (numpy.count_nonzero(qmat) == 0):
        raise AssertionError, 'Cannot preprocess query image'

    scores = numpy.zeros(db.size(), float)  # initialize scores for each image

    # open image database for reading
    try:
        idb = shelve.open(db.idbpath, flag='r')
    except:
        raise AssertionError, 'Cannot open image database'

    for c in range(0, 3):  # for each color channel
        # open search arrays for this color channel
        print 'color channel: ', c
        try:
            sa_plus = shelve.open(db.sadbpaths[c][0], flag='r')
            sa_minus = shelve.open(db.sadbpaths[c][1], flag='r')
        except:
            raise IOError, 'Cannot open search array(s) for colorplane ' + str(c)

        # score the difference in average color for every image
        for iid in idb.keys():
            if (len(idb[iid]) == 4):  # otherwise corrupt data in db
                scores[int(iid)] += w[0, c] * math.fabs(
                    qmat[c, 0, 0] - idb[iid][c + 1])

        # set this color channel's avg color val to 0 so that it does not take
        # part in further scoring of images
        qmat[c, 0, 0] = 0

        for row in range(0, len(qmat[c])):
            indices = numpy.nonzero(qmat[c, row])
            indices = indices[0]  # bug-fixed
            for col in indices:  # for each non-zero coefficient
                rowcolid = str(row).zfill(3) + str(col).zfill(3)
                imglist = []
                if (qmat[c, row, col] > 0):
                    # positive search array
                    try:
                        iList = sa_plus[rowcolid]
                        imglist.extend(iList)
                    except KeyError:
                        continue
                else:
                    # negative search array
                    try:
                        iList = sa_minus[rowcolid]
                        imglist.extend(iList)
                    except KeyError:
                        continue
                for iid in imglist:  # for each image in the matched image list
                    scores[int(iid)] -= w[bin(row, col), c]  # update the scores

        sa_plus.close()
        sa_minus.close()

    idb.close()
    return scores
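# Hedged usage sketch for ScoreQuery: 'query.jpg' is a hypothetical path. Since
# matching wavelet coefficients subtract weight from a score, lower scores mean
# better matches, so numpy.argsort ranks the best images first.
if __name__ == '__main__':
    query_scores = ScoreQuery('query.jpg')
    ranking = numpy.argsort(query_scores)  # ascending: best matches first
    for iid in ranking[:10]:
        print str(iid).zfill(6), query_scores[iid]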
def test_insertion_mongo(self):
    """
    Tests that fetched tweets are correctly inserted in mongo.
    """
    timeline = twitter.home_timeline()
    for tweet in timeline:
        db.insert(tweet, self.tweets_collection)
    self.assertEqual(db.size(self.tweets_collection), len(timeline))
def learn(solver_path, snapshot_path, iters_to_init, max_samples_to_use):
    net_path = proto.get_net_from_solver(solver_path)
    train_db_path = proto.get_db_from_net(net_path)
    train_db_len = db.size(train_db_path)

    # prepare path to the temporary model files
    active_solver_path = utils.create_temp_path(solver_path + config.POSTFIX)
    active_net_path = utils.create_temp_path(net_path + config.POSTFIX)
    active_db_path = utils.create_temp_path(train_db_path + config.POSTFIX)

    # prepare temporary model files
    proto.prepare_net(net_path, active_net_path, active_db_path)
    snapshot_prefix, snapshot_iter = proto.prepare_solver(
        solver_path, active_solver_path, active_net_path,
        snapshot_path, iters_to_init
    )
    print snapshot_prefix

    # recover the snapshot folder
    snapshot_path = '/'.join(snapshot_prefix.split('/')[:-1])
    epoch_file = os.path.join(snapshot_path, config.EPOCH_FILE)

    # deploy net
    # deploy_net = net.Net(active_net_path, output_layers=config.OUTPUT_LAYERS)
    deploy_net = net.DropoutNet(active_net_path, config.DROPOUT_ITERS,
                                aggregate='mean',
                                output_layers=config.OUTPUT_LAYERS,
                                output_processor=utils.softmax)

    epoch_used_samples = set()
    dataset = samples.Dataset(train_db_path, deploy_net.batch_size,
                              epoch_used_samples)

    # initialize net
    # solverstate_path = proto.solverstate_path(snapshot_prefix, iters_to_init)
    # if not os.path.exists(solverstate_path):
    if os.path.exists(active_db_path):
        shutil.rmtree(active_db_path)
    # shutil.copytree(train_db_path, active_db_path)
    used_samples = db.extract_samples(train_db_path, active_db_path,
                                      iters_to_init * deploy_net.batch_size)
    init_network(active_solver_path)

    # do the real learning
    print 'train samples:', train_db_len
    for epoch in xrange(config.MAX_EPOCHS):
        print 'Epoch #{0}'.format(epoch)
        epoch_used_samples.clear()
        epoch_used_samples.update(used_samples)
        while len(epoch_used_samples) < train_db_len:
            if snapshot_iter > config.MAX_ITER:
                break
            solverstate_path = proto.solverstate_path(snapshot_prefix, snapshot_iter)
            caffemodel_path = proto.caffemodel_path(snapshot_prefix, snapshot_iter)
            print 'Using snapshot iter #{0}'.format(snapshot_iter)
            deploy_net.load_model(caffemodel_path)

            # active_samples = samples.choose_active(deploy_net, dataset, config.BATCHES_PER_RUN)
            # epoch_used_samples.update(active_samples)
            # assert len(active_samples) <= int(max(active_samples)), \
            #     'Index of the highest sample is lower than the number of used samples'
            # # check if it makes sense to continue
            # iters_to_do = len(active_samples) / deploy_net.batch_size
            # if iters_to_do == 0:
            #     break

            num_samples_to_choose = min(max_samples_to_use - len(epoch_used_samples),
                                        config.NEW_SAMPLES_PER_ITER)
            batches_to_choose = num_samples_to_choose / deploy_net.batch_size
            chosen_samples = samples.choose_active(deploy_net, dataset,
                                                   batches_to_choose)
            active_samples = chosen_samples + list(epoch_used_samples)
            epoch_used_samples.update(chosen_samples)
            print 'Used {} samples'.format(len(epoch_used_samples))

            # check if it makes sense to continue
            iters_to_do = len(active_samples) / deploy_net.batch_size
            if iters_to_do == 0:
                break

            db.extract_samples(train_db_path, active_db_path, active_samples)
            proto.increase_max_iters(active_solver_path, iters_to_do)
            train_network(active_solver_path, solverstate_path)
            snapshot_iter += iters_to_do

            utils.append_to_file(epoch_file,
                                 '{}:{}'.format(snapshot_iter, len(epoch_used_samples)))
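# Hedged usage sketch for learn(): the paths and numbers below are hypothetical
# examples. learn() drives a Caffe-style active-learning loop: it first trains
# for iters_to_init iterations on samples extracted from the training LMDB,
# then repeatedly scores data with the deploy net and adds the samples chosen
# by samples.choose_active().
if __name__ == '__main__':
    learn(solver_path='models/solver.prototxt',
          snapshot_path='snapshots/active',
          iters_to_init=100,
          max_samples_to_use=50000)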