def __init__(self, mode, batch_size=256, shuffle=False, num_workers=25, cache=50000,
             collate_fn=default_collate, drop_last=False, cuda=False):
    """Build the LMDB-backed ImageNet batch pipeline for ``mode``.

    The database is read from ``$IMAGENET/ILSVRC-<mode>.lmdb``. The
    ``shuffle``, ``collate_fn`` and ``drop_last`` arguments are accepted
    but not consulted by this constructor.
    """
    # Standard ImageNet augmentors; the training variant only for 'train'.
    augmentors = fbresnet_augmentor(mode == 'train')

    # Raises KeyError when the IMAGENET environment variable is not set.
    lmdb_path = os.path.join(os.environ['IMAGENET'], 'ILSVRC-%s.lmdb' % mode)

    def decode_image(raw):
        return cv2.imdecode(raw, cv2.IMREAD_COLOR)

    flow = td.LMDBData(lmdb_path, shuffle=False)
    flow = td.LocallyShuffleData(flow, cache)
    flow = td.PrefetchData(flow, 5000, 1)
    flow = td.LMDBDataPoint(flow)
    flow = td.MapDataComponent(flow, decode_image, 0)
    flow = td.AugmentImageComponent(flow, augmentors)
    flow = td.PrefetchDataZMQ(flow, num_workers)

    self.ds = td.BatchData(flow, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
    self.cuda = cuda
def test_davis_tensorpack_dataflow():
    """Check batch shapes of Davis frames and annotations after resize/stack."""
    frame_size = (256, 256)
    component_indices = (1, 2)

    def resize_all(images):
        return [cv2.resize(image, frame_size) for image in images]

    def stack_frames(images):
        return np.stack(images, axis=0)

    flow = Davis('/data/public/rw/datasets/videos/davis/trainval', num_frames=4)
    # Resize both components first, then stack both, as separate stages.
    for index in component_indices:
        flow = df.MapDataComponent(flow, resize_all, index=index)
    for index in component_indices:
        flow = df.MapDataComponent(flow, stack_frames, index=index)
    flow = df.BatchData(flow, 6)
    flow.reset_state()

    batches = flow.get_data()
    expected_shape = (6, 4, 256, 256, 3)
    for _ in range(10):
        _, images, annotations = next(batches)
        assert images.shape == expected_shape
        assert annotations.shape == expected_shape
def __init__(self, mode, batch_size=256, shuffle=False, num_workers=25, cache=50000,
             collate_fn=default_collate, remainder=False, cuda=False, transform=None):
    """Build the LMDB-backed ImageNet pipeline using ``transform`` as augmentor.

    Images are decoded with PIL into RGB arrays and passed through
    ``ImgAugTVCompose(transform)``. Local shuffling is applied only when
    ``mode == 'train'``. ``shuffle`` and ``collate_fn`` are accepted but not
    consulted by this constructor.
    """
    augmentors = [ImgAugTVCompose(transform)]

    # Raises KeyError when the IMAGENET environment variable is not set.
    lmdb_path = os.path.join(os.environ['IMAGENET'], 'ILSVRC-%s.lmdb' % mode)

    def decode_rgb(raw):
        return np.asarray(Image.open(io.BytesIO(raw)).convert('RGB'))

    flow = td.LMDBData(lmdb_path, shuffle=False)
    if mode == 'train':
        flow = td.LocallyShuffleData(flow, cache)
    flow = td.PrefetchData(flow, 5000, 1)
    flow = td.LMDBDataPoint(flow)
    flow = td.MapDataComponent(flow, decode_rgb, 0)
    flow = td.AugmentImageComponent(flow, augmentors)
    flow = td.PrefetchDataZMQ(flow, num_workers)

    self.ds = td.BatchData(flow, batch_size, remainder=remainder)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
    self.cuda = cuda
def lmdb_dataflow(lmdb_path, batch_size, sample_size, is_training, test_speed=False,
                  train_perturb_list=None, valid_perturb_list=None, so3_perturb=False,
                  use_partial=False):
    """Create an endlessly repeating batched dataflow from a serialized LMDB.

    Returns a ``(dataflow, size)`` tuple, where ``size`` is the value of
    ``size()`` on the freshly loaded LMDB flow, taken before any wrapping.
    Local shuffling and ZMQ prefetching are enabled only when training.
    """
    flow = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    num_samples = flow.size()

    if is_training:
        flow = dataflow.LocallyShuffleData(flow, buffer_size=2000)
    flow = dataflow.PrefetchData(flow, nr_prefetch=500, nr_proc=1)
    flow = PreprocessData(
        flow,
        sample_size,
        is_training,
        train_perturb_list=train_perturb_list,
        valid_perturb_list=valid_perturb_list,
        so3_perturb=so3_perturb,
        use_partial=use_partial,
    )
    if is_training:
        flow = dataflow.PrefetchDataZMQ(flow, nr_proc=8)

    flow = dataflow.BatchData(flow, batch_size, use_list=True)
    flow = dataflow.RepeatedData(flow, -1)  # -1: repeat without end

    if test_speed:
        dataflow.TestDataSpeed(flow, size=1000).start()

    flow.reset_state()
    return flow, num_samples
def data_pipe(fmri_files, confound_files, label_matrix, target_name=None, batch_size=32,
              data_type='train', train_percent=0.8, nr_thread=nr_thread,
              buffer_size=buffer_size):
    """Yield ``(inputs, one_hot_labels)`` batches built from fMRI files.

    Args:
        fmri_files: sequence of fMRI file entries; must not be None.
        confound_files: forwarded to ``gen_fmri_file``.
        label_matrix: forwarded to ``gen_fmri_file``; also the source of the
            default ``target_name`` (its unique values).
        target_name: class names; ``len(target_name)`` is the one-hot width.
        batch_size: number of samples per yielded batch.
        data_type: one of ``'train'``, ``'val'``, ``'test'``.
        train_percent: forwarded to ``gen_fmri_file``.
        nr_thread: loader thread count, capped at ``len(fmri_files)``.
        buffer_size: loader buffer size, capped at ``len(fmri_files)``.

    Yields:
        Tuples of the float32 data with an extra axis inserted at position 3
        and the int32 labels passed through ``to_categorical``.
    """
    assert data_type in ['train', 'val', 'test']
    assert fmri_files is not None

    print('\n\nGenerating dataflow for %s datasets \n' % data_type)

    # Never use more threads or buffer slots than there are input files.
    buffer_size = min(len(fmri_files), buffer_size)
    nr_thread = min(len(fmri_files), nr_thread)

    ds0 = gen_fmri_file(fmri_files, confound_files, label_matrix,
                        data_type=data_type, train_percent=train_percent)
    print('dataflowSize is ' + str(ds0.size()))
    print('Loading data using %d threads with %d buffer_size ... \n' % (nr_thread, buffer_size))

    if target_name is None:
        target_name = np.unique(label_matrix)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # supported clock for measuring elapsed time.
    start_time = time.perf_counter()

    ds1 = dataflow.MultiThreadMapData(
        ds0,
        nr_thread=nr_thread,
        map_func=lambda dp: map_load_fmri_image(dp, target_name),
        buffer_size=buffer_size,
        strict=True)
    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)
    ds1 = split_samples(ds1)
    print('prefetch dataflowSize is ' + str(ds1.size()))

    ds1 = dataflow.LocallyShuffleData(ds1, buffer_size=ds1.size() * buffer_size)
    ds1 = dataflow.BatchData(ds1, batch_size=batch_size)
    print('Time Usage of loading data in seconds: {} \n'.format(time.perf_counter() - start_time))

    ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    ds1._reset_once()

    num_classes = len(target_name)
    for df in ds1.get_data():
        yield (np.expand_dims(df[0].astype('float32'), axis=3),
               to_categorical(df[1].astype('int32'), num_classes))
def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", predict_feature=False,
             batch_size=512, shuffle=False, num_workers=25, cache=10000, drop_last=False,
             cuda=False, distributed=False, visualization=False, span_mask=False,
             cond_mask=False, region_len=36):
    """Build the training dataflow from LMDB features under ``corpus_path``.

    In distributed mode (``distributed`` is true and ``torch.distributed``
    is available) the per-rank shard ``training_feat_part_<rank>.lmdb`` is
    read; otherwise ``training_feat_all.lmdb``. Captions come from
    ``caption_train.json`` in the same directory.

    ``encoding`` is forwarded to ``BertPreprocessBatch`` (it was previously
    ignored in favour of a hard-coded "utf-8"). ``shuffle``, ``cache``,
    ``drop_last``, ``cuda`` and ``visualization`` are accepted but not
    consulted by this constructor.
    """
    if dist.is_available() and distributed:
        rank = dist.get_rank()
        lmdb_file = os.path.join(
            corpus_path, "training_feat_part_" + str(rank) + ".lmdb")
    else:
        lmdb_file = os.path.join(corpus_path, "training_feat_all.lmdb")
    caption_path = os.path.join(corpus_path, "caption_train.json")

    print("Loading from %s" % lmdb_file)
    # NOTE(review): the listing result is unused; presumably this call is
    # kept for its filesystem side effect -- confirm before removing.
    os.listdir(corpus_path)

    ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(ds)
    self.cond_mask = cond_mask

    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        seq_len,
        region_len,
        self.num_dataset,
        encoding=encoding,
        predict_feature=predict_feature,
        span_mask=span_mask,
        cond_mask=cond_mask)

    ds = td.PrefetchData(ds, 5000, 1)
    ds = td.MapData(ds, preprocess_function)
    ds = td.PrefetchDataZMQ(ds, num_workers)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
def __init__(
    self,
    corpus_path,
    tokenizer,
    seq_len,
    encoding='utf-8',
    predict_feature=False,
    hard_negative=False,
    batch_size=512,
    shuffle=False,
    num_workers=25,
    cache=50000,
    drop_last=False,
    cuda=False,
    distributed=False,
    visualization=False,
):
    """Build the training dataflow from the hard-coded Conceptual Captions LMDB.

    In distributed mode the shard for the current rank is read; otherwise
    shard 0 is used. ``corpus_path`` is not consulted: all paths are
    hard-coded below.

    ``encoding`` is forwarded to ``BertPreprocessBatch`` (it was previously
    ignored in favour of a hard-coded 'utf-8'). ``hard_negative``,
    ``shuffle``, ``cache``, ``drop_last``, ``cuda`` and ``visualization``
    are accepted but not consulted by this constructor.
    """
    if dist.is_available() and distributed:
        rank = dist.get_rank()
        lmdb_file = '/mnt3/xuesheng/features_lmdb/CC/training_feat_part_' + str(
            rank) + '.lmdb'
    else:
        lmdb_file = '/mnt3/xuesheng/features_lmdb/CC/training_feat_part_0.lmdb'
    caption_path = '/mnt3/xuesheng/features_lmdb/CC/caption_train.json'

    print('Loading from %s' % lmdb_file)

    # Shuffling is requested from the LMDB loader itself (shuffle=True).
    ds = td.LMDBSerializer.load(lmdb_file, shuffle=True)
    self.num_dataset = len(ds)

    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        seq_len,
        36,
        self.num_dataset,
        encoding=encoding,
        predict_feature=predict_feature,
    )

    ds = td.MapData(ds, preprocess_function)
    ds = td.PrefetchDataZMQ(ds, num_workers)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
def create_dataflow(data_dir: Path, kind: str, batch_size: int,
                    shuffle: bool = True) -> td.DataFlow:
    """Return a batched dataflow reading ``<data_dir>/<kind>.mdb``.

    Each record is decoded with ``_decode_data`` and grouped into batches
    of ``batch_size`` (``remainder=False``). Component 1 of every batch is
    then passed through ``_squeeze_last``.
    """
    lmdb_path = data_dir / "{}.mdb".format(kind)
    raw_records = td.LMDBData(str(lmdb_path), shuffle=shuffle)
    decoded = td.MapData(raw_records, _decode_data)
    batched = td.BatchData(decoded, batch_size, remainder=False)
    return td.MapDataComponent(batched, _squeeze_last, index=1)
def __init__(self, datafile, batch_size, num_workers=1, nviews=12, reset=True,
             augment=False, filter_classes=None, filter_views=None,
             polarmode='cartesian', shuffle=True, filter_ids=None,
             label_to0idx=False, rgb=False, force_res=0, autocrop=False,
             keep_aspect_ratio=False):
    """Create one batched dataflow per LMDB file and store the list in ``self.ds``.

    ``datafile`` may be a single path or a list of paths. ``nviews`` is
    accepted but not consulted by this constructor.
    """
    # Stored first: self.load is used as the map function below.
    self.filter_classes = filter_classes
    self.filter_views = filter_views
    self.filter_ids = filter_ids
    self.polarmode = polarmode
    self.label_to0idx = label_to0idx
    self.rgb = rgb
    self.force_res = force_res
    self.autocrop = autocrop
    self.keep_aspect_ratio = keep_aspect_ratio

    sources = datafile if isinstance(datafile, list) else [datafile]

    pipelines = []
    for path in sources:
        flow = df.LMDBSerializer.load(path, shuffle=shuffle)
        if shuffle:
            flow = df.LocallyShuffleData(flow, 100)
        flow = df.PrefetchData(flow, 20, 1)
        flow = df.MapData(flow, self.load)
        if augment:
            flow = df.MapDataComponent(flow, LMDBMultiView._augment, 0)
        if (not filter_classes and not filter_ids and num_workers > 1):
            # warning: skipping this is slower when filtering datasets
            # but epoch counting will be wrong otherwise
            flow = df.PrefetchDataZMQ(flow, num_workers)
        flow = df.BatchData(flow, batch_size)
        if reset:
            flow.reset_state()
        pipelines.append(flow)

    self.ds = pipelines
def get_dataflows(config):
    """Build and initialise the batched expert dataflow described by ``config``.

    The batch size is read from ``config['batch_size']``; 16 ZMQ prefetch
    processes are used and batching runs with ``remainder=False``.
    """
    expert_flow = ExpertDataflow(config)
    parallel_flow = tp_dataflow.PrefetchDataZMQ(expert_flow, nr_proc=16)
    batched_flow = tp_dataflow.BatchData(
        parallel_flow, config['batch_size'], remainder=False)

    # initialize random number generator in child processes to unique values
    batched_flow.reset_state()
    return batched_flow
def __init__(
    self,
    annotations_path,
    features_path,
    tokenizer,
    bert_model,
    seq_len,
    batch_size=512,
    num_workers=25,
    cache=10000,
    local_rank=-1,
    objective=0,
    num_locs=5,
    add_global_imgfeat=None,
):
    """Build the training dataflow from LMDB features and caption annotations.

    When ``torch.distributed`` is available and ``local_rank`` is not -1,
    the per-rank shard ``training_feat_part_<rank>.lmdb`` is read;
    otherwise ``training_feat_all.lmdb``. Records are locally shuffled
    with a buffer of ``cache`` entries.
    """
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.add_global_imgfeat = add_global_imgfeat
    self.num_locs = num_locs

    use_shard = dist.is_available() and local_rank != -1
    if use_shard:
        shard_name = "training_feat_part_" + str(dist.get_rank()) + ".lmdb"
    else:
        shard_name = "training_feat_all.lmdb"
    lmdb_file = os.path.join(features_path, shard_name)

    print("Loading from %s" % lmdb_file)

    flow = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(flow)
    flow = td.LocallyShuffleData(flow, cache)

    caption_path = os.path.join(annotations_path, "caption_train.json")
    preprocess = BertPreprocessBatch(
        caption_path,
        tokenizer,
        bert_model,
        seq_len,
        36,
        self.num_dataset,
        objective=objective,
        num_locs=num_locs,
    )

    flow = td.PrefetchData(flow, 5000, 1)
    flow = td.MapData(flow, preprocess)
    flow = td.PrefetchDataZMQ(flow, num_workers)

    self.ds = td.BatchData(flow, batch_size)
    self.ds.reset_state()
def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", predict_feature=False,
             batch_size=512, shuffle=False, num_workers=25, cache=50000, drop_last=False,
             cuda=False, distributed=False, visualization=False, span_mask=False,
             cond_mask=False, region_len=36):
    """Build the validation dataflow from ``validation_all.lmdb`` in ``corpus_path``.

    Captions come from ``caption_val.json`` in the same directory. Records
    are read in order, preprocessed and batched with no prefetching.

    ``encoding`` is forwarded to ``BertPreprocessBatch`` (it was previously
    ignored in favour of a hard-coded "utf-8"). ``shuffle``, ``cache``,
    ``drop_last``, ``cuda`` and ``distributed`` are accepted but not
    consulted by this constructor.
    """
    lmdb_file = os.path.join(corpus_path, "validation_all.lmdb")
    caption_path = os.path.join(corpus_path, "caption_val.json")
    print("Loading from %s" % lmdb_file)

    ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(ds)
    self.cond_mask = cond_mask

    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        seq_len,
        region_len,
        self.num_dataset,
        encoding=encoding,
        predict_feature=predict_feature,
        visualization=visualization,
        span_mask=span_mask,
        cond_mask=cond_mask,
    )

    ds = td.MapData(ds, preprocess_function)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training,
                  test_speed=False):
    """Create an endlessly repeating batched dataflow from a raw LMDB file.

    Returns a ``(dataflow, size)`` tuple, where ``size`` is the value of
    ``size()`` on the freshly opened LMDB flow, taken before any wrapping.
    Local shuffling and ZMQ prefetching are enabled only when training.
    """
    flow = dataflow.LMDBData(lmdb_path, shuffle=False)
    num_samples = flow.size()

    if is_training:
        flow = dataflow.LocallyShuffleData(flow, buffer_size=2000)
    flow = dataflow.PrefetchData(flow, nr_prefetch=500, nr_proc=1)
    flow = dataflow.LMDBDataPoint(flow)
    flow = PreprocessData(flow, input_size, output_size)
    if is_training:
        flow = dataflow.PrefetchDataZMQ(flow, nr_proc=8)

    flow = dataflow.BatchData(flow, batch_size, use_list=True)
    flow = dataflow.RepeatedData(flow, -1)  # -1: repeat without end

    if test_speed:
        dataflow.TestDataSpeed(flow, size=1000).start()

    flow.reset_state()
    return flow, num_samples
def init_dataflow(ctfstar,batch_size):
    '''
    Build a batched dataflow that reads and preprocesses micrographs in
    parallel. Returns the dataflow together with the shape reported by
    MicrosGenerator.create_partition.
    '''
    normalizer = df.imgaug.AugmentorList([df.imgaug.MeanVarianceNormalize()])

    def preprocess(dp):
        # psize and bn are read from module scope.
        return [normalizer.augment(preprocess_micro(dp[0], dp[1], psize, bn)),
                np.array(dp[0])]

    # one partitioned generator per element of a batch
    partitions, shape = MicrosGenerator.create_partition(ctfstar, batch_size)

    mapped = [df.MapData(partition, preprocess) for partition in partitions]

    # each generator is prefetched in its own process, 4 images buffered
    prefetched = [df.PrefetchData(flow, nr_prefetch=4, nr_proc=1) for flow in mapped]

    # mix all partitions into one stream, then batch
    mixed = df.RandomMixData(prefetched)
    batched = df.BatchData(mixed, batch_size)
    batched.reset_state()
    return batched, shape
def lmdb_dataflow(lmdb_path, batch_size, num_points, shuffle, task, render=False):
    """Create an endlessly repeating batched dataflow from a serialized LMDB.

    Optional stages, in order: virtual rendering (``render``), resampling
    to ``num_points`` (skipped when it is None), and local shuffling with
    ZMQ prefetching (``shuffle``). Returns ``(dataflow, size)`` where
    ``size`` is taken from the freshly loaded LMDB flow.
    """
    flow = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    num_samples = flow.size()

    if render:
        flow = VirtualRenderData(flow)
    if num_points is not None:
        flow = ResampleData(flow, num_points, task)
    if shuffle:
        flow = dataflow.LocallyShuffleData(flow, 1000)
        flow = dataflow.PrefetchDataZMQ(flow, 8)

    flow = dataflow.BatchData(flow, batch_size, use_list=True)
    flow = dataflow.RepeatedData(flow, -1)  # -1: repeat without end
    flow.reset_state()
    return flow, num_samples
def __init__(
    self,
    corpus_path,
    tokenizer,
    bert_model,
    seq_len,
    encoding="utf-8",
    visual_target=0,
    batch_size=512,
    shuffle=False,
    num_workers=25,
    cache=5000,
    drop_last=False,
    cuda=False,
    objective=0,
    visualization=False,
):
    """Build the validation dataflow from ``validation_feat_all.lmdb``.

    Both the LMDB file and ``caption_val.json`` are looked up under
    ``corpus_path``. Records are read in order, preprocessed and batched
    with no prefetching.

    ``encoding`` is forwarded to ``BertPreprocessBatch`` (it was previously
    ignored in favour of a hard-coded "utf-8"). ``shuffle``, ``cache``,
    ``drop_last`` and ``cuda`` are accepted but not consulted by this
    constructor.
    """
    lmdb_file = os.path.join(corpus_path, "validation_feat_all.lmdb")
    caption_path = os.path.join(corpus_path, "caption_val.json")
    print("Loading from %s" % lmdb_file)

    ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(ds)

    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        bert_model,
        seq_len,
        36,
        self.num_dataset,
        encoding=encoding,
        visual_target=visual_target,
        visualization=visualization,
        objective=objective,
    )

    ds = td.MapData(ds, preprocess_function)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
def read_data(files=None, batch_size=1, window=2, random_rotation=False, repeat=False,
              shuffle_buffer=None, num_workers=1, cache_data=False):
    """Build a batched dataflow over physics simulation files.

    Args:
        files: sequence of input files; required despite the ``None`` default.
        batch_size: number of items per batch (batches are lists).
        window: forwarded to ``PhysicsSimDataFlow``.
        random_rotation: forwarded to ``PhysicsSimDataFlow``.
        repeat: repeat the data without end.
        shuffle_buffer: local shuffle buffer size; a falsy value disables
            shuffling, both in the source flow and locally.
        num_workers: number of ZMQ runner processes, used when > 1.
        cache_data: cache the batched data; only valid for finite,
            unrotated, single-process flows.

    Returns:
        The dataflow, with ``reset_state()`` already called.

    Raises:
        ValueError: if ``files`` is None, or if ``cache_data`` is combined
            with ``repeat``, ``random_rotation`` or ``num_workers != 1``.
    """
    if files is None:
        # Previously this failed with an opaque TypeError on files[0:20].
        raise ValueError("files must be provided")

    print(files[0:20], '...' if len(files) > 20 else '')

    # caching makes only sense if the data is finite
    if cache_data:
        # Truthiness checks: `== True` let values such as repeat=2 slip
        # through and then cache an endless flow.
        if repeat:
            raise ValueError("repeat must be False if cache_data==True")
        if random_rotation:
            raise ValueError(
                "random_rotation must be False if cache_data==True")
        if num_workers != 1:
            raise ValueError("num_workers must be 1 if cache_data==True")

    df = PhysicsSimDataFlow(
        files=files,
        random_rotation=random_rotation,
        shuffle=bool(shuffle_buffer),
        window=window,
    )

    if repeat:
        df = dataflow.RepeatedData(df, -1)

    if shuffle_buffer:
        df = dataflow.LocallyShuffleData(df, shuffle_buffer)

    if num_workers > 1:
        df = dataflow.MultiProcessRunnerZMQ(df, num_proc=num_workers)

    df = dataflow.BatchData(df, batch_size=batch_size, use_list=True)

    if cache_data:
        df = dataflow.CacheData(df)

    df.reset_state()
    return df
def __init__(
    self,
    corpus_path,
    tokenizer,
    seq_len,
    encoding='utf-8',
    predict_feature=False,
    batch_size=512,
    shuffle=False,
    num_workers=25,
    cache=50000,
    drop_last=False,
    cuda=False,
    distributed=False,
    visualization=False,
):
    """Build the validation dataflow from the hard-coded CC_val LMDB.

    ``corpus_path`` is not consulted: the LMDB and caption paths are
    hard-coded below. Records are read in order, preprocessed and batched
    with no prefetching.

    ``encoding`` is forwarded to ``BertPreprocessBatch`` (it was previously
    ignored in favour of a hard-coded 'utf-8'). ``shuffle``, ``cache``,
    ``drop_last``, ``cuda`` and ``distributed`` are accepted but not
    consulted by this constructor.
    """
    # The former os.path.exists() fallback assigned this same path again,
    # so it had no effect and has been dropped.
    lmdb_file = '/mnt3/xuesheng/features_lmdb/CC_val/validation_all.lmdb'
    caption_path = '/mnt3/xuesheng/features_lmdb/CC_val/caption_val.json'
    print('Loading from %s' % lmdb_file)

    ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(ds)

    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        seq_len,
        36,
        self.num_dataset,
        encoding=encoding,
        predict_feature=predict_feature,
        visualization=visualization,
    )

    ds = td.MapData(ds, preprocess_function)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
def test_kinetics_tensorpack_dataflow():
    """Check batch shape of Kinetics frames after resize/stack."""
    frame_size = (256, 256)

    def resize_all(images):
        return [cv2.resize(image, frame_size) for image in images]

    def stack_frames(images):
        return np.stack(images, axis=0)

    flow = Kinetics('/data/public/rw/datasets/videos/kinetics',
                    num_frames=4, skips=[0, 4, 4, 8])
    for transform in (resize_all, stack_frames):
        flow = df.MapDataComponent(flow, transform, index=1)
    flow = df.BatchData(flow, 6)
    flow.reset_state()

    batches = flow.get_data()
    for _ in range(10):
        _, images = next(batches)
        assert images.shape == (6, 4, 256, 256, 3)
def __init__(
    self,
    corpus_path,
    tokenizer,
    seq_len,
    encoding="utf-8",
    predict_feature=False,
    batch_size=512,
    shuffle=False,
    num_workers=25,
    cache=50000,
    drop_last=False,
    cuda=False,
    distributed=False,
    visualization=False,
):
    """Build the validation dataflow from the hard-coded Conceptual Captions LMDB.

    The primary LMDB location is tried first, with a second hard-coded
    location as fallback when it does not exist. ``corpus_path`` is not
    consulted.

    ``encoding`` is forwarded to ``BertPreprocessBatch`` (it was previously
    ignored in favour of a hard-coded "utf-8"). ``shuffle``, ``cache``,
    ``drop_last``, ``cuda`` and ``distributed`` are accepted but not
    consulted by this constructor.
    """
    lmdb_file = "/coc/dataset/conceptual_caption/validation_feat_all.lmdb"
    if not os.path.exists(lmdb_file):
        lmdb_file = "/coc/pskynet2/jlu347/multi-modal-bert/data/conceptual_caption/validation_feat_all.lmdb"
    caption_path = "/coc/pskynet2/jlu347/multi-modal-bert/data/conceptual_caption/caption_val.json"
    print("Loading from %s" % lmdb_file)

    ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(ds)

    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        seq_len,
        36,
        self.num_dataset,
        encoding=encoding,
        predict_feature=predict_feature,
        visualization=visualization,
    )

    ds = td.MapData(ds, preprocess_function)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
def __init__(
    self,
    annotations_path,
    features_path,
    tokenizer,
    bert_model,
    seq_len,
    batch_size=512,
    num_workers=25,
    cache=5000,
    objective=0,
    num_locs=5,
    add_global_imgfeat=True,
    visualization=False,
):
    """Build the validation dataflow from LMDB features and caption annotations.

    Features are read in order from ``validation_feat_all.lmdb`` under
    ``features_path``; captions from ``caption_valid.json`` under
    ``annotations_path``. ``cache`` is accepted but not consulted here.
    """
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.add_global_imgfeat = add_global_imgfeat
    self.num_locs = num_locs

    lmdb_file = os.path.join(features_path, "validation_feat_all.lmdb")
    caption_path = os.path.join(annotations_path, "caption_valid.json")
    print("Loading from %s" % lmdb_file)

    flow = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(flow)

    preprocess = BertPreprocessBatch(
        caption_path,
        tokenizer,
        bert_model,
        seq_len,
        36,
        self.num_dataset,
        visualization=visualization,
        objective=objective,
        num_locs=num_locs,
    )

    flow = td.MapData(flow, preprocess)
    self.ds = td.BatchData(flow, batch_size)
    self.ds.reset_state()
def __init__(self, mode, do_aug, batch_size=256, shuffle=False, num_workers=25,
             cache=50000, cuda=False, out_tensor=True, data_transforms=None):
    """Build the serialized-LMDB ImageNet batch pipeline for ``mode``.

    The database is read from ``$IMAGENET/ILSVRC-<mode>.lmdb`` with the
    loader's own ``shuffle`` option. Decoded images have their channel
    axis reversed before augmentation. ``cache`` is accepted but not
    consulted by this constructor.
    """
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.cuda = cuda
    self.out_tensor = out_tensor
    # data_transforms should be present only when out_tensor=True.
    # It typically consists of PIL Image transforms, ToTensor(), Normalize():
    #   normalize = transforms.Compose([
    #       transforms.ToTensor(),
    #       transforms.Normalize(mean=[0.485, 0.456, 0.406],
    #                            std=[0.229, 0.224, 0.225])])
    self.data_transforms = data_transforms

    # Standard ImageNet augmentors, switched by do_aug.
    augmentors = fbresnet_augmentor(do_aug)

    # Raises KeyError when the IMAGENET environment variable is not set.
    lmdb_path = os.path.join(os.environ['IMAGENET'], 'ILSVRC-%s.lmdb' % mode)

    def decode_reversed_channels(raw):
        return cv2.imdecode(raw, cv2.IMREAD_COLOR)[:, :, ::-1]

    flow = td.LMDBSerializer.load(lmdb_path, shuffle=shuffle)
    flow = td.MapDataComponent(flow, decode_reversed_channels, 0)
    flow = td.AugmentImageComponent(flow, augmentors)
    flow = td.MultiProcessRunnerZMQ(flow, num_workers)

    self.ds = td.BatchData(flow, batch_size)
    self.ds.reset_state()

    print("Loaded '%s'." % lmdb_path)
import tensorpack.dataflow as df


def main():
    """Augment MNIST training images, batch them, and send them over ZMQ."""
    imgaug = df.imgaug
    augmentors = [
        imgaug.RandomApplyAug(
            imgaug.RandomResize((0.8, 1.2), (0.8, 1.2)), 0.3),
        imgaug.RandomApplyAug(imgaug.RotationAndCropValid(15), 0.5),
        imgaug.RandomApplyAug(
            imgaug.SaltPepperNoise(white_prob=0.01, black_prob=0.01), 0.25),
        imgaug.Resize((28, 28)),
        imgaug.CenterPaste((32, 32)),
        imgaug.RandomCrop((28, 28)),
        imgaug.MapImage(lambda x: x.reshape(28, 28, 1)),
    ]

    flow = df.dataset.Mnist('train')
    flow = df.AugmentImageComponent(flow, augmentors)
    flow = df.BatchData(flow, batch_size=32, remainder=False)
    flow = df.PrefetchData(flow, nr_prefetch=12, nr_proc=2)
    flow = df.PrintData(flow)

    df.send_dataflow_zmq(flow, 'tcp://localhost:2222')


if __name__ == '__main__':
    main()
import tensorflow as tf import tensorpack.dataflow as df if __name__ == '__main__': # prepare dataset ds = df.dataset.Mnist('train') augmentors_variation = [ df.imgaug.Resize((28, 28)), df.imgaug.CenterPaste((32, 32)), df.imgaug.RandomCrop((28, 28)), df.imgaug.MapImage(lambda v: v.reshape(784)) ] ds = df.AugmentImageComponent(ds, augmentors_variation) ds = df.PrefetchData(ds, nr_prefetch=12, nr_proc=4) ds = df.BatchData(ds, batch_size=128, remainder=False, use_list=False) # create the model x = tf.placeholder(tf.float32, [None, 784]) W = tf.Variable(tf.ones([784, 10])) b = tf.Variable(tf.zeros([10])) y = tf.matmul(x, W) + b y_ = tf.placeholder(tf.int64, [None]) cross_entropy = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_, logits=y)) global_step = tf.train.get_or_create_global_step() train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize( cross_entropy, global_step=global_step) correct_prediction = tf.equal(tf.argmax(y, 1), y_) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Create a dataflow of training and validation # TODO ds_train = CustomDataflow(size=100, datadir=args.data) augs = [ # imgaug.ResizeShortestEdge(270), imgaug.RandomCrop(SHAPE), imgaug.Flip(horiz=True), imgaug.Flip(vert=True), imgaug.Transpose() ] ds_train = AugmentImageComponents(ds_train, augs, (0, 1)) ds_train = MapData(ds_train, lambda dp: [np.expand_dims(dp[0], axis=0), np.expand_dims(dp[1], axis=0), ]) ds_train = df.BatchData(ds_train, batch_size=BATCH) ds_train = df.PrintData(ds_train) # ds_train = df.PrefetchDataZMQ(ds_train, nr_proc=4) ds_valid= CustomDataflow(size=100, datadir=args.data) # # Training loop # max_step = 10000000 for epoch in range(EPOCH): for mb_train in ds_train.get_data(): step = step+1 if step > max_step: exit() # print("Step: {}, Epoch {}".format(step, epoch))
# imgaug.RandomCrop(SHAPE), # imgaug.Resize(int(SHAPE)), imgaug.Flip(horiz=True), imgaug.Flip(vert=True), imgaug.Albumentations(AB.RandomRotate90(p=1)) ] # ds_train = df.AugmentImageComponent(ds_train, ag_image, 0) # Apply for image only ds_train = df.AugmentImageComponents(ds_train, ag_train, (0, 1)) ds_train = df.MapData( ds_train, lambda dp: [ np.expand_dims(dp[0], axis=0), np.expand_dims(dp[1], axis=0), ]) ds_train = df.MultiProcessRunner(ds_train, num_proc=8, num_prefetch=4) ds_train = df.BatchData(ds_train, batch_size=BATCH) ds_train = df.PrintData(ds_train) ds_train = df.MapData( ds_train, lambda dp: [ torch.tensor(dp[0]), torch.tensor(dp[1]), ]) # ds_valid ds_valid = CustomDataFlow(size=500, datadir=args.data, istrain=False) ag_valid = [ imgaug.Flip(horiz=True), imgaug.Flip(vert=True), imgaug.Albumentations(AB.RandomRotate90(p=1)) ] ds_valid = df.AugmentImageComponents(ds_valid, ag_valid, (0, 1))
df.imgaug.Contrast((0.8, 1.2), clip=False), 0.5), # df.imgaug.RandomApplyAug(df.imgaug.Saturation(0.4, rgb=False), 0.5), ]), ] augmentors_default = [ df.imgaug.Resize((32, 32)), df.imgaug.MapImage(lambda x: x.reshape(32, 32, 1)) ] # keep original image at index 1 ds = df.MapData( ds, lambda datapoint: [datapoint[0], datapoint[0]] + datapoint[1:]) ds = df.AugmentImageComponent(ds, augmentors_variation + augmentors_default) ds = df.AugmentImageComponent(ds, augmentors_default, index=1) ds = df.PrefetchData(ds, nr_prefetch=12, nr_proc=4) ds = df.PrintData(ds) ds = df.BatchData(ds, batch_size=32, remainder=False, use_list=True) ds = df.PrintData(ds) for minibatch in ds.get_data(): images, originals, labels = minibatch image, original, label = images[0], originals[0], labels[0] name = '{:02d}'.format(label) cv2.namedWindow(name) cv2.moveWindow(name, 0, 25 + 128 * label) display_image = np.concatenate((image, original), axis=1) cv2.imshow(name, display_image) cv2.waitKey(1)
def data_pipe_3dcnn_block(fmri_files, confound_files, label_matrix, target_name=None,
                          flag_cnn='3d', block_dura=1, batch_size=32, data_type='train',
                          nr_thread=nr_thread, buffer_size=buffer_size):
    """Build a batched, ZMQ-prefetched dataflow of fMRI blocks.

    Args:
        fmri_files: sequence of fMRI file entries; must not be None.
        confound_files: forwarded to ``gen_fmri_file``.
        label_matrix: forwarded to ``gen_fmri_file``; also the source of the
            default ``target_name`` (its unique values).
        target_name: class names handed to the block loader.
        flag_cnn: ``'2d'`` or ``'3d'``; selects the block loader.
        block_dura: forwarded to the block loader.
        batch_size: samples per batch; the final partial batch is kept.
        data_type: one of ``'train'``, ``'val'``, ``'test'``. Local
            shuffling is applied only for ``'train'``.
        nr_thread: loader thread count, capped at ``len(fmri_files)``.
        buffer_size: loader buffer size, capped at ``len(fmri_files)``.

    Returns:
        The dataflow. It is not reset here, and labels are not converted
        to categorical form.
    """
    assert data_type in ['train', 'val', 'test']
    assert flag_cnn in ['3d', '2d']
    assert fmri_files is not None
    isTrain = data_type == 'train'

    print('\n\nGenerating dataflow for %s datasets \n' % data_type)

    # Never use more threads or buffer slots than there are input files.
    buffer_size = int(min(len(fmri_files), buffer_size))
    nr_thread = int(min(len(fmri_files), nr_thread))

    ds0 = gen_fmri_file(fmri_files, confound_files, label_matrix, data_type=data_type)
    print('dataflowSize is ' + str(ds0.size()))
    print('Loading data using %d threads with %d buffer_size ... \n' % (nr_thread, buffer_size))

    if target_name is None:
        target_name = np.unique(label_matrix)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # supported clock for measuring elapsed time.
    start_time = time.perf_counter()

    # flag_cnn was validated above, so exactly one loader applies.
    if flag_cnn == '2d':
        load_block = map_load_fmri_image_block
    else:
        load_block = map_load_fmri_image_3d_block

    ds1 = dataflow.MultiThreadMapData(
        ds0,
        nr_thread=nr_thread,
        map_func=lambda dp: load_block(dp, target_name, block_dura=block_dura),
        buffer_size=buffer_size,
        strict=True)

    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)

    ds1 = split_samples(ds1)
    print('prefetch dataflowSize is ' + str(ds1.size()))

    if isTrain:
        print('%d #Trials/Samples per subject with %d channels in tc' % (ds1.Trial_Num, ds1.Block_dura))
        Trial_Num = ds1.Trial_Num
        # Shuffle buffer and interval both cover buffer_size subjects.
        ds1 = dataflow.LocallyShuffleData(ds1,
                                          buffer_size=Trial_Num * buffer_size,
                                          shuffle_interval=Trial_Num * buffer_size)

    ds1 = dataflow.BatchData(ds1, batch_size=batch_size, remainder=True)
    print('Time Usage of loading data in seconds: {} \n'.format(time.perf_counter() - start_time))

    ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    return ds1
label.append(self.df.iloc[idx]['Cavitation']) label.append(self.df.iloc[idx]['Fibrosis']) label.append(self.df.iloc[idx]['Widening_Mediastinum']) label.append(self.df.iloc[idx]['Medical_device']) label.append(self.df.iloc[idx]['Fracture']) label.append(self.df.iloc[idx]['No_Finding']) elif self.types == 1: assert self.pathology is not None label.append(self.df.iloc[idx][self.pathology]) else: pass # Try catch exception label = np.nan_to_num(label, copy=True, nan=0) label = np.array(label, dtype=np.float32) types = label.copy() yield [image, types] elif self.is_train == 'test': yield [image] # , np.array([-1, -1, -1, -1, -1]) else: pass if __name__ == '__main__': ds = Vinmec(folder='/u01/data/Vimmec_Data_small/', train_or_valid='train', resize=256) ds.reset_state() # ds = df.MultiProcessRunnerZMQ(ds, num_proc=8) ds = df.BatchData(ds, 32) # ds = df.PrintData(ds) df.TestDataSpeed(ds).start()
def __init__(
    self,
    corpus_path,
    tokenizer,
    seq_len,
    encoding='utf-8',
    predict_feature=False,
    batch_size=512,
    shuffle=False,
    num_workers=10,
    cache=50000,
    drop_last=False,
    cuda=False,
):
    """Load the first 1000 validation entries into memory for retrieval.

    The dataflow is batched with size 1 regardless of the ``batch_size``
    argument, and ``self.batch_size`` is set to 1. For each of the first
    1000 batches a global image feature, location and mask are prepended
    to the region features, and the results are stored in
    ``features_all``, ``spatials_all``, ``image_mask_all``, ``image_ids``
    and ``_entry``.

    ``encoding`` is forwarded to ``BertPreprocessRetrieval`` (it was
    previously ignored in favour of a hard-coded 'utf-8').
    ``corpus_path``, ``batch_size``, ``shuffle``, ``cache``, ``drop_last``
    and ``cuda`` are accepted but not consulted by this constructor.
    """
    # Number of entries preloaded; also passed to the preprocessor.
    num_entries = 1000

    lmdb_file = '/coc/dataset/conceptual_caption/validation_feat_all.lmdb'
    if not os.path.exists(lmdb_file):
        lmdb_file = '/coc/pskynet2/jlu347/multi-modal-bert/data/conceptual_caption/validation_feat_all.lmdb'
    caption_path = '/coc/pskynet2/jlu347/multi-modal-bert/data/conceptual_caption/caption_val.json'
    print('Loading from %s' % lmdb_file)

    ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(ds)

    preprocess_function = BertPreprocessRetrieval(
        caption_path,
        tokenizer,
        seq_len,
        36,
        num_entries,
        encoding=encoding,
        predict_feature=predict_feature,
    )

    ds = td.MapData(ds, preprocess_function)
    self.ds = td.BatchData(ds, 1)
    self.ds.reset_state()

    self.batch_size = 1
    self.num_workers = num_workers
    self._entry = []

    # 37 = 36 region slots plus the prepended global slot.
    self.features_all = np.zeros((num_entries, 37, 2048), dtype=np.float32)
    self.spatials_all = np.zeros((num_entries, 37, 5), dtype=np.float32)
    self.image_mask_all = np.zeros((num_entries, 37), dtype=np.float32)
    self.image_ids = []

    # load first 1000 file here.
    for i, batch in enumerate(self.ds.get_data()):
        if i >= num_entries:
            break
        (input_ids, input_mask, segment_ids, is_next, image_feat, image_loc,
         image_mask, image_id, caption) = batch

        # Named to avoid shadowing the batch_size parameter.
        rows = input_ids.shape[0]

        # Global feature: feature sum over axis 1 divided by the mask sum.
        g_image_feat = np.sum(image_feat, axis=1) / np.sum(
            image_mask, axis=1, keepdims=True)
        image_feat = np.concatenate(
            [np.expand_dims(g_image_feat, axis=1), image_feat], axis=1)
        image_feat = np.array(image_feat, dtype=np.float32)

        # Fixed location row prepended for the global slot.
        g_image_loc = np.repeat(
            np.array([[0, 0, 1, 1, 1]], dtype=np.float32), rows, axis=0)
        image_loc = np.concatenate(
            [np.expand_dims(g_image_loc, axis=1), image_loc], axis=1)
        image_loc = np.array(image_loc, dtype=np.float32)

        # The global slot is always unmasked.
        g_image_mask = np.repeat(np.array([[1]]), rows, axis=0)
        image_mask = np.concatenate([g_image_mask, image_mask], axis=1)

        batch = (input_ids, input_mask, segment_ids, image_id, caption)
        self._entry.append(batch)

        self.features_all[i] = image_feat
        self.image_mask_all[i] = np.array(image_mask)
        self.spatials_all[i] = image_loc
        self.image_ids.append(image_id)

        sys.stdout.write('%d/%d\r' % (i, num_entries))
        sys.stdout.flush()