def get_input_fn(name, batch_size=32):
    image_size = 32
    is_training = name == 'train'
    ds = df.dataset.Cifar10(name, shuffle=is_training)
    # pad 32x32 images to 40x40 with reflection so random crops keep full borders
    ds = df.MapDataComponent(
        ds, lambda x: np.pad(x, [(4, 4), (4, 4), (0, 0)], mode='reflect'), index=0)
    augmentors = [
        tp.imgaug.RandomCrop((32, 32)),
        tp.imgaug.Flip(horiz=True),
        # tp.imgaug.MapImage(lambda x: (x - pp_mean) / 128.0),
    ]
    if is_training:
        ds = df.RepeatedData(ds, -1)
        ds = tp.AugmentImageComponent(ds, augmentors)
    else:
        ds = tp.AugmentImageComponent(ds, [tp.imgaug.CenterCrop((32, 32))])
    ds = tp.AugmentImageComponent(ds, [tp.imgaug.Resize((image_size, image_size))])
    ds = df.MapData(ds, tuple)  # tf.data.Dataset.from_generator expects tuples
    ds.reset_state()

    def input_fn():
        with tf.name_scope('dataset'):
            dataset = tf.data.Dataset.from_generator(
                ds.get_data,
                output_types=(tf.float32, tf.int64),
                output_shapes=(tf.TensorShape([image_size, image_size, 3]),
                               tf.TensorShape([]))).batch(batch_size)
        return dataset
    return input_fn
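# How the returned input_fn might be consumed: a minimal sketch assuming
# TF 1.x graph mode (the make_one_shot_iterator-era API this snippet targets).
input_fn = get_input_fn('train', batch_size=32)
images, labels = input_fn().make_one_shot_iterator().get_next()
with tf.Session() as sess:
    batch_images, batch_labels = sess.run([images, labels])
    print(batch_images.shape, batch_labels.shape)  # (32, 32, 32, 3) (32,)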
def lmdb_dataflow(lmdb_path, batch_size, sample_size, is_training,
                  test_speed=False, train_perturb_list=None,
                  valid_perturb_list=None, so3_perturb=False, use_partial=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = PreprocessData(df, sample_size, is_training,
                        train_perturb_list=train_perturb_list,
                        valid_perturb_list=valid_perturb_list,
                        so3_perturb=so3_perturb, use_partial=use_partial)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size,
                  is_training, test_speed=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, num_prefetch=500, num_proc=1)
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, num_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size,
                  is_training, test_speed=False):
    """Load an LMDB file and generate batches from it."""
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)  # prefetch in a worker process
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
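# A minimal consumption sketch for the dataflow above. The path and point
# counts are placeholders, and the per-batch component layout depends on
# the custom BatchData used here.
train_df, num_train = lmdb_dataflow('data/train.lmdb', batch_size=32,
                                    input_size=2048, output_size=16384,
                                    is_training=True)
train_gen = train_df.get_data()  # infinite generator, thanks to RepeatedData(df, -1)
for step in range(num_train // 32):
    batch = next(train_gen)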
def build_eval_dataflow(self, policy=None, repeats=None):
    """
    :param policy: policy to evaluate when mode == eval
    :param repeats: repeat evaluation multiple times when mode == eval
    :return: dataflow with evaluation results
    """
    df = dataflow.DataFromList(self.filtered_samples, shuffle=False)
    df = dataflow.RepeatedData(df, 1000000)
    df = EvalDataFeed(df, self.get_db, self.domain, policy=policy, repeats=repeats)
    return df
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size,
                  is_training, test_speed=False):
    df = dataflow.LMDBData(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = dataflow.LMDBDataPoint(df)
    df = PreprocessData(df, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
def __init__(self, ds, placeholders, repeat_infinite=False, queue_size=50):
    super(QueueInput, self).__init__()
    self.daemon = True
    self.ds = df.RepeatedData(ds, -1) if repeat_infinite else ds
    self.placeholders = placeholders
    self.queue = tf.FIFOQueue(
        queue_size,
        [ph.dtype for ph in placeholders],
        shapes=[ph.get_shape() for ph in placeholders])
    self.op = self.queue.enqueue(self.placeholders)
    self.close_op = self.queue.close(cancel_pending_enqueues=True)
    self._itr = None
    self._sess = None
    self._lock = threading.Lock()
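# For context, a plausible run() loop for this enqueue thread. This is a
# sketch, not the original implementation; it assumes TF 1.x sessions and
# that self._sess is set by the owner before data should flow.
def run(self):
    self.ds.reset_state()
    self._itr = self.ds.get_data()
    try:
        for dp in self._itr:
            with self._lock:
                sess = self._sess
            if sess is None:
                break
            sess.run(self.op, feed_dict=dict(zip(self.placeholders, dp)))
    except tf.errors.CancelledError:
        pass  # close_op cancelled a pending enqueue during shutdown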
def build_dataflow(self, batch_size, step_size, restart_limit=None, cache=None):
    """
    :param batch_size: batch size
    :param step_size: number of steps for BPTT
    :param restart_limit: restart after this many batches. Used for validation.
        If 0 (but not None) or larger than an epoch, it is set to one epoch.
    :param cache: preloaded cache
    :return: dataflow with input data
    """
    # update db wrapper function with shared cache
    if cache is not None:
        self.get_db = (lambda: Database(filename=self.filename, cache=cache))

    df = dataflow.DataFromList(self.filtered_samples, shuffle=(self.mode == 'train'))
    if restart_limit is None:
        df = dataflow.RepeatedData(df, 1000000)  # reshuffles on every repeat

    df = DynamicTrajBatch(df, batch_size=batch_size, step_size=step_size,
                          traj_lens=self.filtered_samples[:, 4])
    self.steps_in_epoch = df.steps_in_epoch()

    if restart_limit is not None:
        if restart_limit == 0 or restart_limit >= self.steps_in_epoch:
            restart_limit = self.steps_in_epoch - 1
        self.steps_in_epoch = restart_limit
        df = OneShotData(df, size=restart_limit)

    df = TrajDataFeed(df, self.get_db, self.domain,
                      batch_size=batch_size, step_size=step_size)
    # uncomment to test dataflow speed
    # dataflow.TestDataSpeed(df, size=1000).start()
    return df
def dataflow4(num_reference=3, order=1, shuffle=True):
    kinetics_path = "/media/engin/63c43c7a-cb63-4c43-b70c-f3cb4d68762a/datasets/kinetics/kinetics700"
    ds = KineticsClustered(order, kinetics_path, num_frames=num_reference + 1,
                           skips=[0, 4, 4, 4][:num_reference + 1], shuffle=False)
    # stack frames into tensors
    ds = df.MapData(ds, lambda dp: [np.stack(dp[1], axis=0), np.stack(dp[2], axis=0)])
    ds = df.MapData(ds, tuple)  # for tf.data.Dataset
    ds = df.RepeatedData(ds, -1)
    return ds
def __init__(self, split, batch_size, set_size):
    if split == 'train':
        lmdb_path = f'{data_path}/ModelNet40_train_1024_middle.lmdb'
    else:
        lmdb_path = f'{data_path}/ModelNet40_test_1024_middle.lmdb'
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    self.size = df.size()
    self.num_batches = self.size // batch_size
    if split == 'train':
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, num_prefetch=500, num_proc=1)
    df = BatchData(df, batch_size, set_size // 8, set_size - set_size // 8)
    if split == 'train':
        df = dataflow.PrefetchDataZMQ(df, num_proc=8)
    df = dataflow.RepeatedData(df, -1)
    df.reset_state()
    self.generator = df.get_data()
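# A plausible companion method for this wrapper (hypothetical, not part of
# the original snippet): iterate the tensorpack generator one epoch at a time.
def __iter__(self):
    for _ in range(self.num_batches):
        yield next(self.generator)  # never exhausts: RepeatedData(df, -1) above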
def read_data(files=None, batch_size=1, window=2, random_rotation=False,
              repeat=False, shuffle_buffer=None, num_workers=1, cache_data=False):
    print(files[0:20], '...' if len(files) > 20 else '')

    # caching only makes sense if the data is finite
    if cache_data:
        if repeat:
            raise Exception("repeat must be False if cache_data==True")
        if random_rotation:
            raise Exception("random_rotation must be False if cache_data==True")
        if num_workers != 1:
            raise Exception("num_workers must be 1 if cache_data==True")

    df = PhysicsSimDataFlow(
        files=files,
        random_rotation=random_rotation,
        shuffle=bool(shuffle_buffer),
        window=window,
    )

    if repeat:
        df = dataflow.RepeatedData(df, -1)
    if shuffle_buffer:
        df = dataflow.LocallyShuffleData(df, shuffle_buffer)
    if num_workers > 1:
        df = dataflow.MultiProcessRunnerZMQ(df, num_proc=num_workers)
    df = dataflow.BatchData(df, batch_size=batch_size, use_list=True)
    if cache_data:
        df = dataflow.CacheData(df)
    df.reset_state()
    return df
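import glob

# Hypothetical call; the glob pattern stands in for whatever files
# PhysicsSimDataFlow expects.
train_df = read_data(files=sorted(glob.glob('datasets/train/*')),
                     batch_size=16, window=2, random_rotation=True,
                     repeat=True, shuffle_buffer=512, num_workers=2)
for batch in train_df.get_data():  # endless because repeat=True
    break  # with use_list=True, each batch component is a Python list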
def lmdb_dataflow(lmdb_path, batch_size, num_points, shuffle, task, render=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if render:
        df = VirtualRenderData(df)
    if num_points is not None:
        df = ResampleData(df, num_points, task)
    if shuffle:
        df = dataflow.LocallyShuffleData(df, 1000)
        df = dataflow.PrefetchDataZMQ(df, 8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    df.reset_state()
    return df, size
def dataflow3(num_reference=3, num_sets=1, shuffle=True):
    kinetics_path = "/media/engin/63c43c7a-cb63-4c43-b70c-f3cb4d68762a/datasets/kinetics/kinetics700"
    ds_list = []
    for i in range(num_sets):
        ds1 = KineticsClustered(i, kinetics_path, num_frames=num_reference + 1,
                                skips=[0, 4, 4, 4][:num_reference + 1], shuffle=False)
        ds1 = df.RepeatedData(ds1, -1)
        ds_list.append(ds1)
    ds = df.JoinData(ds_list)
    # group every three joined components into one set
    ds = df.MapData(
        ds, lambda dp: [[dp[i], dp[i + 1], dp[i + 2]]
                        for i in range(0, num_sets * 3, 3)])
    # stack frames into tensors, per set
    for idx in range(num_sets):
        ds = df.MapDataComponent(
            ds, lambda dp: [np.stack(dp[1], axis=0), np.stack(dp[2], axis=0)],
            index=idx)
    ds = df.MapData(ds, tuple)  # for tf.data.Dataset
    return ds
def __init__(self, ds, placeholders, repeat_infinite=False, queue_size=50):
    super(QueueInputMulti, self).__init__()
    self.daemon = True
    self.ds = df.RepeatedData(ds, -1) if repeat_infinite else ds
    self.placeholders = placeholders
    self.queue = [
        tf.FIFOQueue(queue_size,
                     [ph.dtype for ph in phs],
                     shapes=[ph.get_shape() for ph in phs])
        for phs in placeholders
    ]
    self.op = [
        self.queue[idx].enqueue(phs) for idx, phs in enumerate(placeholders)
    ]
    self.close_op = [
        q.close(cancel_pending_enqueues=True) for q in self.queue
    ]
    self._itr = None
    self._sess = None
    self._lock = threading.Lock()
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size,
                  is_training, test_speed=False, filter_rate=0):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    # append a random number to each datapoint (presumably compared
    # against filter_rate downstream)
    df = dataflow.MapData(df, lambda dp: [item for item in dp] + [random.random()])
    size = df.size()
    print(size)
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
if use_kinetics:  # guard condition assumed; the excerpt begins mid-branch
    # get every frame of
    ds = Kinetics(kinetics_dirpath, num_frames=1, skips=[0], shuffle=False)
    # keep only the first frame of each sub-video
    # [sub_video_idx, frames[]] -> [first_frame, sub_video_idx]
    ds = df.MapData(ds, lambda dp: [dp[1][0], dp[0]])
else:
    ds = df.dataset.Cifar10('train', shuffle=False)

logging.info('Downsampling frames to 32x32 resolution')
ds = df.MapDataComponent(ds, lambda image: cv2.resize(image, (32, 32)))
logging.info('Converting RGB to Lab color space')
ds = df.MapDataComponent(ds, lambda image: cv2.cvtColor(np.float32(image / 255.0), cv2.COLOR_RGB2Lab))
ds = df.MapDataComponent(ds, lambda image: image[:, :, 1:])  # keep only the (a, b) channels
ds = df.MapDataComponent(ds, lambda image: image.reshape((-1, 2)))
ds = df.RepeatedData(ds, -1)
ds.reset_state()

generator = ds.get_data()
samples = []
for _ in range(args.num_samples):
    samples.append(next(generator)[0])

vectors = np.array(samples).reshape((-1, 2))
logging.info('Vectorized images in the shape: %s', vectors.shape)
kmeans = KMeans(args.num_clusters).fit(vectors)
logging.info('Fitted kmeans clustering')
centroids = np.array(kmeans.cluster_centers_)
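# The fitted centroids can then quantize the (a, b) channels of new images
# into discrete color classes, e.g. as colorization targets. A minimal
# sketch (assumes an RGB uint8 `image` and the fitted `kmeans` from above):
def quantize_ab(image, kmeans):
    lab = cv2.cvtColor(np.float32(image / 255.0), cv2.COLOR_RGB2Lab)
    ab = lab[:, :, 1:].reshape((-1, 2))
    labels = kmeans.predict(ab)             # nearest centroid per pixel
    return labels.reshape(image.shape[:2])  # (H, W) map of color-class ids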
import tensorpack.dataflow as df

if __name__ == '__main__':
    ds = df.dataset.Mnist('train')
    augmentors = [
        df.imgaug.RandomApplyAug(df.imgaug.RandomResize((0.8, 1.2), (0.8, 1.2)), 0.3),
        df.imgaug.RandomApplyAug(df.imgaug.RotationAndCropValid(15), 0.5),
        df.imgaug.RandomApplyAug(
            df.imgaug.SaltPepperNoise(white_prob=0.01, black_prob=0.01), 0.25),
        df.imgaug.Resize((28, 28)),
        df.imgaug.CenterPaste((32, 32)),
        df.imgaug.RandomCrop((28, 28)),
        df.imgaug.MapImage(lambda x: x.reshape(28, 28, 1)),
    ]
    ds = df.AugmentImageComponent(ds, augmentors)
    ds = df.BatchData(ds, batch_size=32, remainder=False)
    ds = df.PrefetchData(ds, nr_prefetch=12, nr_proc=2)
    ds = df.PrintData(ds)
    ds = df.RepeatedData(ds, nr=-1)
    df.TestDataSpeed(ds, size=50000).start()
def get_dataflow(files, params, is_training):
    """
    Build a tensorflow Dataset from appropriate tfrecords files.
    :param files: list of file paths corresponding to appropriate tfrecords data
    :param params: parsed arguments
    :param is_training: bool, true for training.
    :return: (nextdata, num_samples).
        nextdata: list of tensorflow ops that produce the next input with the
        following elements: true_states, global_map, init_particles,
        observations, odometries, is_first_step.
        See House3DTrajData.get_data for definitions.
        num_samples: number of samples that make an epoch
    """
    mapmode = params.mapmode
    obsmode = params.obsmode
    batchsize = params.batchsize
    num_particles = params.num_particles
    trajlen = params.trajlen
    bptt_steps = params.bptt_steps

    # build initial covariance matrix of particles, in pixels and radians
    particle_std = params.init_particles_std.copy()
    particle_std[0] = particle_std[0] / params.map_pixel_in_meters  # convert meters to pixels
    particle_std2 = np.square(particle_std)  # element-wise variance
    init_particles_cov = np.diag(particle_std2[(0, 0, 1), ])  # index is (0, 0, 1)

    df = House3DTrajData(
        files, mapmode, obsmode, trajlen, num_particles,
        params.init_particles_distr, init_particles_cov,
        seed=(params.seed if params.seed is not None and params.seed > 0
              else (params.validseed if not is_training else None)))
    # data: true_states, global_map, init_particles, observation, odometry

    # make the size a multiple of batchsize
    df = dataflow.FixedSizeData(df, size=(df.size() // batchsize) * batchsize,
                                keep_state=False)

    # shuffle
    if is_training:
        df = dataflow.LocallyShuffleData(df, 100 * batchsize)  # buffer_size = 100 * batchsize

    # repeat data for the number of epochs
    df = dataflow.RepeatedData(df, params.epochs)

    # batch
    df = BatchDataWithPad(df, batchsize, padded_indices=(1,))

    # break trajectory into multiple segments for BPTT training.
    # Augment df with an is_first_step indicator
    df = BreakForBPTT(df, timed_indices=(0, 3, 4), trajlen=trajlen, bptt_steps=bptt_steps)
    # data: true_states, global_map, init_particles, observation, odometry, is_first_step

    num_samples = df.size() // params.epochs
    df.reset_state()

    # # test dataflow
    # df = dataflow.TestDataSpeed(dataflow.PrintData(df), 100)
    # df.start()

    obs_ch = {'rgb': 3, 'depth': 1, 'rgb-depth': 4}
    map_ch = {'wall': 1, 'wall-door': 2, 'wall-roomtype': 10,
              'wall-door-roomtype': 11}  # every semantic class is a channel
    types = [tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.bool]
    sizes = [
        (batchsize, bptt_steps, 3),
        (batchsize, None, None, map_ch[mapmode]),
        (batchsize, num_particles, 3),
        (batchsize, bptt_steps, 56, 56, obs_ch[obsmode]),
        (batchsize, bptt_steps, 3),
        (),
    ]

    # turn it into a tf dataset
    def tuplegen():
        for dp in df.get_data():
            yield tuple(dp)

    dataset = tf.data.Dataset.from_generator(tuplegen, tuple(types), tuple(sizes))
    iterator = dataset.make_one_shot_iterator()  # only read once
    nextdata = iterator.get_next()
    return nextdata, num_samples
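# Sketch of consuming the returned ops in a TF 1.x session loop
# (train_files and params are assumed to be prepared by the caller).
nextdata, num_samples = get_dataflow(train_files, params, is_training=True)
with tf.Session() as sess:
    for _ in range(num_samples):
        (true_states, global_map, init_particles,
         observations, odometries, is_first_step) = sess.run(nextdata)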