Example #1
def get_input_fn(name, batch_size=32):
    image_size = 32

    is_training = name == 'train'
    ds = df.dataset.Cifar10(name, shuffle=is_training)
    ds = df.MapDataComponent(
        ds,
        lambda x: np.pad(x, [(4, 4), (4, 4), (0, 0)], mode='reflect'),
        index=0)
    augmentors = [
        tp.imgaug.RandomCrop((32, 32)),
        tp.imgaug.Flip(horiz=True),
        #tp.imgaug.MapImage(lambda x: (x - pp_mean)/128.0),
    ]
    if is_training:
        ds = df.RepeatedData(ds, -1)
        ds = tp.AugmentImageComponent(ds, augmentors)
    else:
        ds = tp.AugmentImageComponent(ds, [tp.imgaug.CenterCrop((32, 32))])

    ds = tp.AugmentImageComponent(ds,
                                  [tp.imgaug.Resize((image_size, image_size))])
    ds = df.MapData(ds, tuple)  # for tensorflow.data.dataset
    ds.reset_state()

    def input_fn():
        with tf.name_scope('dataset'):
            dataset = tf.data.Dataset.from_generator(
                ds.get_data,
                output_types=(tf.float32, tf.int64),
                output_shapes=(tf.TensorShape([image_size, image_size, 3]),
                               tf.TensorShape([]))).batch(batch_size)
        return dataset

    return input_fn
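
The returned input_fn matches what the TF 1.x Estimator API expects. A minimal usage sketch, assuming a hypothetical model function (cifar_model_fn and the model_dir are placeholders, not part of the example):

# Hypothetical usage; `cifar_model_fn` and the model_dir are placeholders.
estimator = tf.estimator.Estimator(model_fn=cifar_model_fn, model_dir='/tmp/cifar10')
estimator.train(input_fn=get_input_fn('train', batch_size=32), max_steps=10000)
estimator.evaluate(input_fn=get_input_fn('test', batch_size=32))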
Example #2
def lmdb_dataflow(lmdb_path,
                  batch_size,
                  sample_size,
                  is_training,
                  test_speed=False,
                  train_perturb_list=None,
                  valid_perturb_list=None,
                  so3_perturb=False,
                  use_partial=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
    df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = PreprocessData(df,
                        sample_size,
                        is_training,
                        train_perturb_list=train_perturb_list,
                        valid_perturb_list=valid_perturb_list,
                        so3_perturb=so3_perturb,
                        use_partial=use_partial)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
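
Because the flow ends with RepeatedData(df, -1), the returned generator never terminates; the caller bounds each epoch with the returned size. A hedged consumption sketch (the LMDB path and sizes are placeholders):

# Hypothetical usage; the path and sizes below are placeholders.
train_df, train_size = lmdb_dataflow('data/train.lmdb', batch_size=32,
                                     sample_size=1024, is_training=True)
gen = train_df.get_data()              # reset_state() was already called above
for step in range(train_size // 32):   # roughly one pass over the data
    batch = next(gen)                  # list of batched components (use_list=True)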
Example #3
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, num_prefetch=500, num_proc=1)
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, num_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
Example #4
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False):
    """load LMDB files, then generate batches??"""
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)  # shuffle within a 2000-sample buffer
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)  # prefetch in one background process
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
Example #5
    def build_eval_dataflow(self, policy=None, repeats=None):
        """
        :param policy: policy to evaluate when mode == eval
        :param repeats: repeat evaluation multiple times when mode == eval
        :return: dataflow with evaluation results
        """
        df = dataflow.DataFromList(self.filtered_samples, shuffle=False)
        df = dataflow.RepeatedData(df, 1000000)

        df = EvalDataFeed(df,
                          self.get_db,
                          self.domain,
                          policy=policy,
                          repeats=repeats)

        return df
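
The method repeats the sample list 1,000,000 times and does not call reset_state(), so the caller is expected to bound and initialize the flow. A sketch under those assumptions (the instance, policy, and sample count below are hypothetical):

# Hypothetical usage; `eval_builder`, `my_policy` and `num_eval_samples` are placeholders.
eval_df = eval_builder.build_eval_dataflow(policy=my_policy, repeats=3)
eval_df.reset_state()
gen = eval_df.get_data()
results = [next(gen) for _ in range(num_eval_samples)]  # bound the otherwise endless flow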
Example #6
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False):
    df = dataflow.LMDBData(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
    df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = dataflow.LMDBDataPoint(df)
    df = PreprocessData(df, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
Example #7
    def __init__(self, ds, placeholders, repeat_infinite=False, queue_size=50):
        super(QueueInput, self).__init__()
        self.daemon = True

        self.ds = df.RepeatedData(ds, -1) if repeat_infinite else ds
        self.placeholders = placeholders
        self.queue = tf.FIFOQueue(
            queue_size, [ph.dtype for ph in placeholders],
            shapes=[ph.get_shape() for ph in placeholders])

        self.op = self.queue.enqueue(self.placeholders)
        self.close_op = self.queue.close(cancel_pending_enqueues=True)

        self._itr = None
        self._sess = None
        self._lock = threading.Lock()
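
The thread body that feeds the queue is not shown, but the constructor already defines everything the graph side needs. A hedged wiring sketch (the placeholder shapes and the dataflow `ds` are assumptions):

# Hypothetical usage; placeholder shapes and the dataflow `ds` are assumptions.
image_ph = tf.placeholder(tf.float32, [None, 32, 32, 3])
label_ph = tf.placeholder(tf.int64, [None])
queue_input = QueueInput(ds, [image_ph, label_ph], repeat_infinite=True)
images, labels = queue_input.queue.dequeue()  # tensors the model graph consumes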
Example #8
    def build_dataflow(self,
                       batch_size,
                       step_size,
                       restart_limit=None,
                       cache=None):
        """
        :param batch_size: batch size
        :param step_size: number of steps for BPTT
        :param restart_limit: restart after this many batches; used for validation. If 0 (but not None) or larger
         than an epoch, it is set to one epoch.
        :param cache: preloaded cache
        :return: dataflow with input data
        """
        # update db wrapper function with shared cache
        if cache is not None:
            self.get_db = (
                lambda: Database(filename=self.filename, cache=cache))

        df = dataflow.DataFromList(self.filtered_samples,
                                   shuffle=(self.mode == 'train'))

        if restart_limit is None:
            df = dataflow.RepeatedData(df,
                                       1000000)  # reshuffles on every repeat

        df = DynamicTrajBatch(df,
                              batch_size=batch_size,
                              step_size=step_size,
                              traj_lens=self.filtered_samples[:, 4])

        self.steps_in_epoch = df.steps_in_epoch()
        if restart_limit is not None:
            if restart_limit == 0 or restart_limit >= self.steps_in_epoch:
                restart_limit = self.steps_in_epoch - 1
            self.steps_in_epoch = restart_limit
            df = OneShotData(df, size=restart_limit)

        df = TrajDataFeed(df,
                          self.get_db,
                          self.domain,
                          batch_size=batch_size,
                          step_size=step_size)

        # uncomment to test dataflow speed
        # dataflow.TestDataSpeed(df, size=1000).start()

        return df
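
For validation, the restart_limit=0 case described in the docstring clamps the flow to one epoch via OneShotData. A hedged sketch of that path (the instance name and sizes are placeholders):

# Hypothetical usage; `valid_data` and the sizes are placeholders.
valid_df = valid_data.build_dataflow(batch_size=16, step_size=4, restart_limit=0)
valid_df.reset_state()                      # not called inside build_dataflow
gen = valid_df.get_data()
for _ in range(valid_data.steps_in_epoch):  # set by build_dataflow above
    batch = next(gen)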
Example #9
def dataflow4(num_reference=3, order=1, shuffle=True):
    kinetics_path = "/media/engin/63c43c7a-cb63-4c43-b70c-f3cb4d68762a/datasets/kinetics/kinetics700"

    ds = KineticsClustered(order,
                           kinetics_path,
                           num_frames=num_reference + 1,
                           skips=[0, 4, 4, 4][:num_reference + 1],
                           shuffle=False)

    # stack frames into arrays for conversion to tensors
    ds = df.MapData(
        ds, lambda dp: [np.stack(dp[1], axis=0),
                        np.stack(dp[2], axis=0)])

    ds = df.MapData(ds, tuple)  # for tensorflow.data.dataset

    ds = df.RepeatedData(ds, -1)
    return ds
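
The `tuple` mapping suggests the flow is meant to feed tf.data.Dataset.from_generator, as in Example #1. A hedged sketch (the output dtypes below are assumptions, since KineticsClustered's component types are not shown):

# Hypothetical sketch; the output dtypes are assumptions.
ds = dataflow4(num_reference=3, order=1)
ds.reset_state()
dataset = tf.data.Dataset.from_generator(
    ds.get_data, output_types=(tf.float32, tf.float32)).batch(4)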
Example #10
    def __init__(self, split, batch_size, set_size):
        if split == 'train':
            lmdb_path = f'{data_path}/ModelNet40_train_1024_middle.lmdb'
        else:
            lmdb_path = f'{data_path}/ModelNet40_test_1024_middle.lmdb'
        df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
        self.size = df.size()
        self.num_batches = self.size // batch_size
        if split == 'train':
            df = dataflow.LocallyShuffleData(df,
                                             buffer_size=2000)  # buffer_size
            df = dataflow.PrefetchData(df, num_prefetch=500, num_proc=1)
        df = BatchData(df, batch_size, set_size // 8, set_size - set_size // 8)
        if split == 'train':
            df = dataflow.PrefetchDataZMQ(df, num_proc=8)
        df = dataflow.RepeatedData(df, -1)
        df.reset_state()
        self.generator = df.get_data()
Example #11
def read_data(files=None,
              batch_size=1,
              window=2,
              random_rotation=False,
              repeat=False,
              shuffle_buffer=None,
              num_workers=1,
              cache_data=False):
    print(files[0:20], '...' if len(files) > 20 else '')

    # caching only makes sense if the data is finite
    if cache_data:
        if repeat:
            raise Exception("repeat must be False if cache_data==True")
        if random_rotation:
            raise Exception(
                "random_rotation must be False if cache_data==True")
        if num_workers != 1:
            raise Exception("num_workers must be 1 if cache_data==True")

    df = PhysicsSimDataFlow(
        files=files,
        random_rotation=random_rotation,
        shuffle=bool(shuffle_buffer),
        window=window,
    )

    if repeat:
        df = dataflow.RepeatedData(df, -1)

    if shuffle_buffer:
        df = dataflow.LocallyShuffleData(df, shuffle_buffer)

    if num_workers > 1:
        df = dataflow.MultiProcessRunnerZMQ(df, num_proc=num_workers)

    df = dataflow.BatchData(df, batch_size=batch_size, use_list=True)

    if cache_data:
        df = dataflow.CacheData(df)

    df.reset_state()
    return df
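
Given the guards above, caching is only valid for a finite, single-worker, non-augmented flow. A usage sketch with placeholder file lists:

# Hypothetical usage; `train_files` and `valid_files` are placeholder path lists.
train_df = read_data(files=train_files, batch_size=16, window=2,
                     random_rotation=True, repeat=True,
                     shuffle_buffer=512, num_workers=4)
# cache_data=True requires repeat=False, random_rotation=False and num_workers=1
valid_df = read_data(files=valid_files, batch_size=16, window=2, cache_data=True)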
Example #12
def lmdb_dataflow(lmdb_path,
                  batch_size,
                  num_points,
                  shuffle,
                  task,
                  render=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if render:
        df = VirtualRenderData(df)
    if num_points is not None:
        df = ResampleData(df, num_points, task)
    if shuffle:
        df = dataflow.LocallyShuffleData(df, 1000)
        df = dataflow.PrefetchDataZMQ(df, 8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    df.reset_state()
    return df, size
Example #13
def dataflow3(num_reference=3, num_sets=1, shuffle=True):
    kinetics_path = "/media/engin/63c43c7a-cb63-4c43-b70c-f3cb4d68762a/datasets/kinetics/kinetics700"

    ds_list = []
    for i in range(num_sets):
        ds1 = KineticsClustered(i,
                                kinetics_path,
                                num_frames=num_reference + 1,
                                skips=[0, 4, 4, 4][:num_reference + 1],
                                shuffle=False)
        ds1 = df.RepeatedData(ds1, -1)
        ds_list.append(ds1)

    # ds2 = KineticsClustered(1, kinetics_path, num_frames=num_reference + 1,
    #                        skips=[0, 4, 4, 4][:num_reference + 1], shuffle=False)
    # ds2 = df.RepeatedData(ds2, -1)

    ds = df.JoinData(ds_list)

    # ds = df.MapData(ds, lambda dp: [ [dp[0], dp[1], dp[2]] ])
    ds = df.MapData(
        ds, lambda dp: [[dp[i], dp[i + 1], dp[i + 2]]
                        for i in range(0, num_sets * 3, 3)])

    # for idx in [0, 1]:
    #     ds = df.MapDataComponent(ds, lambda dp: [dp[1][:num_reference], dp[2][:num_reference], dp[1][num_reference:], dp[2][num_reference:]], index=idx)

    # stack frames into arrays for conversion to tensors
    for idx in range(num_sets):
        ds = df.MapDataComponent(
            ds,
            lambda dp: [np.stack(dp[1], axis=0),
                        np.stack(dp[2], axis=0)],
            index=idx)

    ds = df.MapData(ds, tuple)  # for tensorflow.data.dataset

    # ds = df.BatchData(ds, 2, use_list=True)

    #Prepare epochs
    # ds = df.RepeatedData(ds, total_num_epoch)
    # ds = df.RepeatedData(ds, -1)
    return ds
Example #14
    def __init__(self, ds, placeholders, repeat_infinite=False, queue_size=50):
        super(QueueInputMulti, self).__init__()
        self.daemon = True

        self.ds = df.RepeatedData(ds, -1) if repeat_infinite else ds
        self.placeholders = placeholders
        self.queue = [
            tf.FIFOQueue(queue_size, [ph.dtype for ph in phs],
                         shapes=[ph.get_shape() for ph in phs])
            for phs in placeholders
        ]

        self.op = [
            self.queue[idx].enqueue(phs)
            for idx, phs in enumerate(placeholders)
        ]
        self.close_op = [
            q.close(cancel_pending_enqueues=True) for q in self.queue
        ]

        self._itr = None
        self._sess = None
        self._lock = threading.Lock()
Example #15
def lmdb_dataflow(lmdb_path,
                  batch_size,
                  input_size,
                  output_size,
                  is_training,
                  test_speed=False,
                  filter_rate=0):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    df = dataflow.MapData(df,
                          lambda dp: [item for item in dp] + [random.random()])

    size = df.size()
    print(size)
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
Example #16
        # get every frame of
        ds = Kinetics(kinetics_dirpath, num_frames=1, skips=[0], shuffle=False)

        # keep only first frame of each sub-video
        # [sub_video_idx, frames[]] -> [first_frame, sub_video_idx]
        ds = df.MapData(ds, lambda dp: [dp[1][0], dp[0]])
    else:
        ds = df.dataset.Cifar10('train', shuffle=False)

    logging.info('Downsampling frames to 32x32 resolution')
    ds = df.MapDataComponent(ds, lambda image: cv2.resize(image, (32, 32)))
    logging.info('Converting RGB to Lab color space')
    ds = df.MapDataComponent(ds, lambda image: cv2.cvtColor(np.float32(image / 255.0), cv2.COLOR_RGB2Lab))
    ds = df.MapDataComponent(ds, lambda image: image[:, :, 1:])
    ds = df.MapDataComponent(ds, lambda image: image.reshape((-1, 2)))
    ds = df.RepeatedData(ds, -1)
    ds.reset_state()

    generator = ds.get_data()

    samples = []
    for _ in range(args.num_samples):
        samples.append(next(generator)[0])
    vectors = np.array(samples).reshape((-1, 2))
    logging.info('Vectorized images in the shape: %s', vectors.shape)

    kmeans = KMeans(args.num_clusters).fit(vectors)
    logging.info('Fitted kmeans clustering')

    centroids = np.array(kmeans.cluster_centers_)
Example #17
import tensorpack.dataflow as df

if __name__ == '__main__':
    ds = df.dataset.Mnist('train')
    augmentors = [
        df.imgaug.RandomApplyAug(
            df.imgaug.RandomResize((0.8, 1.2), (0.8, 1.2)), 0.3),
        df.imgaug.RandomApplyAug(df.imgaug.RotationAndCropValid(15), 0.5),
        df.imgaug.RandomApplyAug(
            df.imgaug.SaltPepperNoise(white_prob=0.01, black_prob=0.01), 0.25),
        df.imgaug.Resize((28, 28)),
        df.imgaug.CenterPaste((32, 32)),
        df.imgaug.RandomCrop((28, 28)),
        df.imgaug.MapImage(lambda x: x.reshape(28, 28, 1))
    ]
    ds = df.AugmentImageComponent(ds, augmentors)
    ds = df.BatchData(ds, batch_size=32, remainder=False)
    ds = df.PrefetchData(ds, nr_prefetch=12, nr_proc=2)
    ds = df.PrintData(ds)
    ds = df.RepeatedData(ds, nr=-1)

    df.TestDataSpeed(ds, size=50000).start()
Example #18
def get_dataflow(files, params, is_training):
    """
    Build a tensorflow Dataset from appropriate tfrecords files.
    :param files: list of file paths to the appropriate tfrecords data
    :param params: parsed arguments
    :param is_training: bool, true for training.
    :return: (nextdata, num_samples).
    nextdata: list of tensorflow ops that produce the next input with the following elements:
    true_states, global_map, init_particles, observations, odometries, is_first_step.
    See House3DTrajData.get_data for definitions.
    num_samples: number of samples that make an epoch
    """

    mapmode = params.mapmode
    obsmode = params.obsmode
    batchsize = params.batchsize
    num_particles = params.num_particles
    trajlen = params.trajlen
    bptt_steps = params.bptt_steps

    # build initial covariance matrix of particles, in pixels and radians
    particle_std = params.init_particles_std.copy()
    particle_std[0] = particle_std[0] / params.map_pixel_in_meters  # convert meters to pixels
    particle_std2 = np.square(particle_std)  # element-wise variance
    init_particles_cov = np.diag(particle_std2[(0, 0, 1), ])  # index is (0, 0, 1)

    df = House3DTrajData(
        files,
        mapmode,
        obsmode,
        trajlen,
        num_particles,
        params.init_particles_distr,
        init_particles_cov,
        seed=(params.seed if params.seed is not None and params.seed > 0 else
              (params.validseed if not is_training else None)))
    # data: true_states, global_map, init_particles, observation, odometry

    # make it a multiple of batchsize
    df = dataflow.FixedSizeData(df,
                                size=(df.size() // batchsize) * batchsize,
                                keep_state=False)

    # shuffle
    if is_training:
        df = dataflow.LocallyShuffleData(
            df, 100 * batchsize)  # buffer_size = 100 * batchsize

    # repeat data for the number of epochs
    df = dataflow.RepeatedData(df, params.epochs)

    # batch
    df = BatchDataWithPad(df, batchsize, padded_indices=(1, ))

    # break trajectory into multiple segments for BPTT training. Augment df with is_first_step indicator
    df = BreakForBPTT(df,
                      timed_indices=(0, 3, 4),
                      trajlen=trajlen,
                      bptt_steps=bptt_steps)
    # data: true_states, global_map, init_particles, observation, odometry, is_first_step

    num_samples = df.size() // params.epochs

    df.reset_state()

    # # test dataflow
    # df = dataflow.TestDataSpeed(dataflow.PrintData(df), 100)
    # df.start()

    obs_ch = {'rgb': 3, 'depth': 1, 'rgb-depth': 4}
    map_ch = {
        'wall': 1,
        'wall-door': 2,
        'wall-roomtype': 10,
        'wall-door-roomtype': 11
    }  # every semantic is a channel
    types = [
        tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.bool
    ]
    sizes = [
        (batchsize, bptt_steps, 3),
        (batchsize, None, None, map_ch[mapmode]),
        (batchsize, num_particles, 3),
        (batchsize, bptt_steps, 56, 56, obs_ch[obsmode]),
        (batchsize, bptt_steps, 3),
        (),
    ]

    # turn it into a tf dataset
    def tuplegen():
        for dp in df.get_data():
            yield tuple(dp)

    dataset = tf.data.Dataset.from_generator(tuplegen, tuple(types),
                                             tuple(sizes))
    iterator = dataset.make_one_shot_iterator()  # only read once
    nextdata = iterator.get_next()

    return nextdata, num_samples
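
Because the iterator is one-shot, the returned ops are typically run once per training step inside a TF 1.x session. A minimal sketch (train_files and params are assumed to come from the surrounding training script):

# Hypothetical usage; `train_files` and `params` come from the surrounding script.
nextdata, num_samples = get_dataflow(train_files, params, is_training=True)
with tf.Session() as sess:
    for _ in range(num_samples * params.epochs):
        (true_states, global_map, init_particles,
         observations, odometries, is_first_step) = sess.run(nextdata)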