# Shared imports for the snippets below. Datum and BlobProto come from Caffe's
# generated protobuf module; project-local helpers (utils, siamese_utils,
# constants, args, DEF_IDX, W2V_LEN) are assumed to be defined elsewhere in
# each snippet's repository.
import os
import shutil
import subprocess
import sys
import time

import lmdb
import numpy as np
import plyvel
import scipy.io as sio
from caffe.proto.caffe_pb2 import Datum, BlobProto


def make_data_layer_2():
    data_set = {}
    data_set['train'] = 'data/imagernn/train_indices.npy'
    data_set['val'] = 'data/imagernn/valid_indices.npy'
    for phase in ['train', 'val']:
        print 'Starting %s' % phase
        db_name = 'data/imagernn/fc7_%s_lmdb' % phase
        # Remove any stale database before rebuilding it.
        subprocess.call(['rm', '-rf', db_name])
        env = lmdb.open(db_name, map_size=2147483648 * 8)
        data = np.load(data_set[phase])
        with env.begin(write=True) as txn:
            for var in xrange(data.shape[0]):
                if var % 1000 == 0:
                    print "%08d %08d %02.2f %s" % (var, data.shape[0], float(var) / data.shape[0], time.ctime())
                # Each fc7 feature vector becomes a 4096x1x1 Datum.
                datum = Datum()
                datum.channels = 4096
                datum.width = 1
                datum.height = 1
                for j in xrange(4096):
                    datum.float_data.append(data[var, j])
                key = "%010d" % var
                txn.put(key, datum.SerializeToString())
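# A minimal read-back sketch for sanity-checking the LMDB written above. The
# database path and ten-digit key format come from make_data_layer_2; the rest
# is standard py-lmdb / caffe.proto usage.
def check_fc7_lmdb():
    env = lmdb.open('data/imagernn/fc7_train_lmdb', readonly=True)
    with env.begin() as txn:
        raw = txn.get("%010d" % 0)  # First record.
        datum = Datum()
        datum.ParseFromString(raw)
        # Expect a 4096-dim fc7 feature vector stored in float_data.
        print datum.channels, len(datum.float_data)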
def make_data(param):
    for phase in ['train', 'valid']:
        print 'Starting %s' % phase
        db_name = 'data/imagernn/lm_%s_db' % phase
        subprocess.call(['rm', '-rf', db_name])
        env = lmdb.open(db_name, map_size=2147483648 * 8)

        def vocab_transform(target_input):
            # Clip out-of-vocabulary indices to the unknown symbol.
            def t_foo(x):
                return x if x < param['unknown_symbol'] else param['unknown_symbol']
            target_line = [t_foo(int(x)) for x in target_input.split(' ')[:param['maximum_length']]]
            # Add the end symbol, then truncate and pad with the zero symbol
            # up to maximum_length.
            target_line.append(param['end_symbol'])
            target_line = target_line[:param['maximum_length']] + \
                [param['zero_symbol']] * (param['maximum_length'] - len(target_line[:param['maximum_length']]))
            assert len(target_line) == param['maximum_length']
            return target_line

        allX = []
        with open('data/imagernn/%s_indices.txt' % phase, 'r') as f1:
            for en in f1.readlines():
                allX.append(vocab_transform(en))

        print 'Writing %s sentences' % len(allX)
        feature_np_in = 'data/imagernn/%s_indices.npy' % phase
        feature_np = np.load(feature_np_in)
        print "dim %d" % (2 * param['maximum_length'] + param['image_feature_length'])
        with env.begin(write=True) as txn:
            for i, target_line in enumerate(allX):
                if i % 1000 == 0:
                    print "%08d %08d %02.2f %s" % (i, len(allX), float(i) / len(allX), time.ctime())
                # Layout: [image features | input tokens (shifted right, start
                # symbol first) | target tokens].
                datum = Datum()
                datum.channels = 2 * param['maximum_length'] + param['image_feature_length']
                datum.width = 1
                datum.height = 1
                for j in range(param['image_feature_length']):
                    datum.float_data.append(feature_np[i, j])
                for j in range(param['maximum_length']):
                    if j == 0:
                        datum.float_data.append(param['start_symbol'])
                    else:
                        datum.float_data.append(target_line[j - 1])
                for j in range(param['maximum_length']):
                    datum.float_data.append(target_line[j])
                assert len(datum.float_data) == datum.channels
                key = "%08d" % i
                txn.put(key, datum.SerializeToString())
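# Example invocation for make_data above. All of these values are illustrative
# assumptions: the real vocabulary size, special-symbol indices, and feature
# length depend on how the indices files were generated.
example_param = {
    'maximum_length': 20,           # Tokens kept per sentence.
    'image_feature_length': 4096,   # fc7 feature dimensionality.
    'start_symbol': 1,              # Hypothetical special-token indices.
    'end_symbol': 2,
    'zero_symbol': 0,
    'unknown_symbol': 10000,        # Indices at or above this map to <unk>.
}
# make_data(example_param)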
def _generate_leveldb(file_path, image_paths, targets, width, height):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation
    data; this method writes out face images in an efficient way into this format.
    """
    print "\t\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 10000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(image_paths)):
        # Each image is a top level key with a keyname like 00000000011, in increasing
        # order starting from 00000000000.
        key = utils.get_key(idx)

        # Do common normalization that might happen across both testing and validation.
        try:
            image = _preprocess_data(_load_numpy_image(image_paths[idx], width, height))
        except Exception:
            print "\t\t\tWarning: Unable to process leveldb image %s" % image_paths[idx]
            continue

        # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
        datum = Datum()
        datum.channels = 3  # RGB
        datum.height = height
        datum.width = width
        datum.data = image.tostring()
        datum.label = targets[idx]
        value = datum.SerializeToString()
        wb.put(key, value)

        if (idx + 1) % commit_every == 0:
            wb.write()
            del wb
            wb = db.write_batch()
            end_time = int(round(time.time() * 1000))
            total_time = end_time - start_time
            print "\t\t\tWrote batch, key: %s, time for batch: %d ms" % (key, total_time)
            start_time = int(round(time.time() * 1000))

    end_time = int(round(time.time() * 1000))
    total_time = end_time - start_time
    print "\t\t\tWriting final batch, time for batch: %d ms" % total_time
    wb.write()
    db.close()
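# A minimal read-back sketch for the LevelDB written above, useful for
# verifying the encoding. The eleven-digit key follows the comment in
# _generate_leveldb (so utils.get_key(0) is assumed to produce "00000000000"),
# and the uint8 dtype is an assumption about what _preprocess_data returns.
def check_leveldb(file_path, width, height):
    db = plyvel.DB(file_path)
    raw = db.get("00000000000")  # Hypothetical first key.
    datum = Datum()
    datum.ParseFromString(raw)
    # Raw bytes were written with tostring(), so decode in (C, H, W) order.
    image = np.fromstring(datum.data, dtype=np.uint8).reshape(datum.channels, height, width)
    print image.shape, datum.label
    db.close()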
def _generate_leveldb(self, file_path, pairs, target, single_data):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation
    data; this method writes out paired faces in an efficient way into this format.
    """
    print "\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 250000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(pairs)):
        # Each image pair is a top level key with a keyname like 00000000011, in increasing
        # order starting from 00000000000.
        key = siamese_utils.get_key(idx)

        # Actually expand our images now, taking the index reference and turning it into real
        # image pairs; we delay doing this until now for efficiency reasons, as we will probably
        # have more pairs of images than actual computer memory.
        image_1 = single_data[pairs[idx][0]]
        image_2 = single_data[pairs[idx][1]]
        paired_image = np.concatenate([image_1, image_2])

        # Do things like mean normalization that happen across both testing and validation.
        paired_image = self._preprocess_data(paired_image)

        # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
        datum = Datum()
        datum.channels = 2  # One channel for each image in the pair.
        datum.height = constants.HEIGHT
        datum.width = constants.WIDTH
        datum.data = paired_image.tostring()
        datum.label = target[idx]
        value = datum.SerializeToString()
        wb.put(key, value)

        if (idx + 1) % commit_every == 0:
            wb.write()
            del wb
            wb = db.write_batch()
            end_time = int(round(time.time() * 1000))
            total_time = end_time - start_time
            print "Wrote batch, key: %s, time for batch: %d ms" % (key, total_time)
            start_time = int(round(time.time() * 1000))

    wb.write()
    db.close()
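# Note on the record layout above: the two face images are stacked along the
# channel axis, so each value is a 2 x HEIGHT x WIDTH volume. In a typical
# Caffe siamese setup the pair is split back into its two images at training
# time (for example with a slice layer); that downstream detail is an
# assumption here, not something this snippet enforces.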
def make_data(param):
    for phase in ['train', 'valid', 'test']:
        print 'Starting %s' % phase
        db_name = './examples/language_model/lm_%s_db' % phase
        subprocess.call(['rm', '-rf', db_name])
        env = lmdb.open(db_name, map_size=2147483648 * 8)

        def vocab_transform(target_input):
            # Clip out-of-vocabulary indices to the unknown symbol.
            def t_foo(x):
                return x if x < param['unknown_symbol'] else param['unknown_symbol']
            target_line = [t_foo(int(x)) for x in target_input.split(' ')[:param['maximum_length']]]
            # Pad with the zero symbol up to maximum_length.
            target_line = target_line[:param['maximum_length']] + \
                [param['zero_symbol']] * (param['maximum_length'] - len(target_line[:param['maximum_length']]))
            assert len(target_line) == param['maximum_length']
            return target_line

        allX = []
        with open('./data/language_model/%s_indices.txt' % phase, 'r') as f1:
            for en in f1.readlines():
                allX.append(vocab_transform(en))

        print 'Writing %s sentences' % len(allX)
        with env.begin(write=True) as txn:
            for i, target_line in enumerate(allX):
                # Layout: [input tokens (shifted right, start symbol first) | target tokens].
                datum = Datum()
                datum.channels = 2 * param['maximum_length']
                datum.width = 1
                datum.height = 1
                for j in range(param['maximum_length']):
                    if j == 0:
                        datum.float_data.append(param['start_symbol'])
                    else:
                        datum.float_data.append(target_line[j - 1])
                for j in range(param['maximum_length']):
                    datum.float_data.append(target_line[j])
                key = str(i)
                txn.put(key, datum.SerializeToString())
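# Illustration of what this version of vocab_transform produces, using made-up
# values: with maximum_length=5, zero_symbol=0, and unknown_symbol=100, the
# input line "4 9 250 7" becomes [4, 9, 100, 7, 0] -- index 250 is clipped to
# the unknown symbol and the sequence is padded with the zero symbol. Unlike
# the imagernn variant above, no end symbol is appended here.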
def _generate_leveldb(self, file_path, image, target, single_data):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation
    data; this method writes out face images in an efficient way into this format.
    """
    print "\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 250000
    start_time = int(round(time.time() * 1000))
    # Iterate over the individual images in single_data.
    for idx in range(len(single_data)):
        # Each image is a top level key with a keyname like 00000000011, in increasing
        # order starting from 00000000000.
        key = utils.get_key(idx)

        # Do things like mean normalization that happen across both testing and validation.
        processed_image = self._preprocess_data(single_data[idx])

        # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
        datum = Datum()
        # TODO(neuberg): Confirm that this is the correct way to set up RGB images for
        # Caffe for our dataset.
        datum.channels = 3
        datum.height = constants.HEIGHT
        datum.width = constants.WIDTH
        datum.data = processed_image.tostring()
        datum.label = target[idx]
        value = datum.SerializeToString()
        wb.put(key, value)

        if (idx + 1) % commit_every == 0:
            wb.write()
            del wb
            wb = db.write_batch()
            end_time = int(round(time.time() * 1000))
            total_time = end_time - start_time
            print "Wrote batch, key: %s, time for batch: %d ms" % (key, total_time)
            start_time = int(round(time.time() * 1000))

    wb.write()
    db.close()
def make_data(param):
    for phase in ['train', 'valid', 'test']:
        print 'Starting %s' % phase
        db_name = './examples/language_model/lm_%s_db' % phase
        # LMDB databases are directories, so check with os.path.exists rather
        # than os.path.isfile before removing any stale copy.
        if os.path.exists(db_name):
            subprocess.call(['rm', '-rf', db_name])
        env = lmdb.open(db_name, map_size=10485760 * 64)

        def vocab_transform(target_input):
            # Clip out-of-vocabulary indices to the unknown symbol.
            def t_foo(x):
                return x if x < param['unknown_symbol'] else param['unknown_symbol']
            target_line = [t_foo(int(x)) for x in target_input.split(' ')[:param['maximum_length']]]
            # Pad with the zero symbol up to maximum_length.
            target_line = target_line[:param['maximum_length']] + \
                [param['zero_symbol']] * (param['maximum_length'] - len(target_line[:param['maximum_length']]))
            assert len(target_line) == param['maximum_length']
            return target_line

        allX = []
        with open('./data/language_model/%s_indices.txt' % phase, 'r') as f1:
            for en in f1.readlines():
                allX.append(vocab_transform(en))

        print 'Writing %s sentences' % len(allX)
        with env.begin(write=True) as txn:
            for i, target_line in enumerate(allX):
                datum = Datum()
                datum.channels = 2 * param['maximum_length']
                datum.width = 1
                datum.height = 1
                # Inputs are the targets shifted right by one, starting with the start symbol.
                for j in range(param['maximum_length']):
                    if j == 0:
                        datum.float_data.append(param['start_symbol'])
                    else:
                        datum.float_data.append(target_line[j - 1])
                for j in range(param['maximum_length']):
                    datum.float_data.append(target_line[j])
                key = str(i)
                txn.put(key, datum.SerializeToString())
def create_lmdb_file(dataset, phase, w2v_dict):
    print 'Starting %s' % phase
    db_name = './examples/language_model/lm_%s_db' % phase
    subprocess.call(['rm', '-rf', db_name])
    env = lmdb.open(db_name, map_size=2147483648 * 8)
    print 'Writing %s sentences, %s' % (len(dataset), phase)
    last = None
    index = 0
    with env.begin(write=True) as txn:
        for i in range(len(dataset)):
            # Each sentence is a list of word indices; the last element is the label.
            sentence = dataset[i]
            datum = Datum()
            datum.channels = 1
            datum.width = 300
            # The label is not embedded, so only len(sentence) - 1 word vectors
            # are stored; height must match that count.
            datum.height = len(sentence) - 1
            datum.label = int(sentence[-1])
            current = datum.height
            # All sentences in a database must share the same length.
            if last and last != current:
                print sys._getframe().f_lineno, "length not equal"
                sys.exit(-1)
            for j in range(0, len(sentence) - 1):
                word_idx = sentence[j]
                # Fall back to the default vector for out-of-vocabulary words.
                if word_idx in w2v_dict:
                    elem_vector = w2v_dict[word_idx]
                else:
                    elem_vector = w2v_dict[DEF_IDX]
                if len(elem_vector) != W2V_LEN:
                    print sys._getframe().f_lineno, "w2v length not equal 300"
                    sys.exit(-1)
                for elem in elem_vector:
                    datum.float_data.append(elem)
            key = str(i)
            txn.put(key, datum.SerializeToString())
            index += 1
            if index % 100 == 0:
                print "finished num:", index
                sys.stdout.flush()
            last = current
    print 'Writing %s sentences, %s. End' % (len(dataset), phase)
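# A hypothetical sketch of how a w2v_dict for create_lmdb_file might be built:
# a mapping from word index to a 300-dimensional embedding, loaded here from
# an assumed .npy matrix. The file name, shape, and the convention that row
# DEF_IDX holds the fallback vector are all illustrative assumptions.
def load_w2v_dict(path='w2v_vectors.npy'):
    vectors = np.load(path)  # Assumed shape: (vocabulary_size, 300).
    w2v_dict = {}
    for idx in range(vectors.shape[0]):
        w2v_dict[idx] = vectors[idx]
    return w2v_dict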
for cur_map in maps:
    key = os.path.splitext(cur_map)[0]

    # Build the data blob for this submap.
    datum = Datum()
    submap = sio.loadmat(os.path.join(args.submap_dir, cur_map))
    submap = submap[args.variable].astype('float')
    if submap.ndim == 3:
        # Reorder from (H, W, C) to Caffe's (C, H, W) layout.
        submap = submap.swapaxes(1, 2).swapaxes(0, 1).astype('float')
        datum.channels, datum.height, datum.width = submap.shape
    else:
        datum.height, datum.width = submap.shape
        datum.channels = 1
    datum.float_data.extend(list(submap.flatten()))

    # Accumulate a running sum for the mean blob.
    if mean_blob is None:
        mean_blob = BlobProto()
        mean_blob.height = datum.height
        mean_blob.width = datum.width
        mean_blob.channels = datum.channels
        mean_blob.num = 1
        img_mean = submap
    else:
        img_mean += submap

    datum.label = 0
    if not txn.put(key, datum.SerializeToString(), dupdata=False):
        # put() returns False when the key already exists and duplicates are
        # disallowed; skipping is an assumed completion of the truncated source.
        print "Key %s already exists, skipping" % key
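# The loop above accumulates img_mean but never finalizes it. A plausible
# completion, assuming the standard caffe.proto BlobProto fields and a
# hypothetical output file name, divides by the number of maps and serializes:
img_mean /= len(maps)
mean_blob.data.extend(img_mean.flatten())
with open('submap_mean.binaryproto', 'wb') as f:
    f.write(mean_blob.SerializeToString())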