Example #1
    def __init__(self,
                 mode,
                 batch_size=256,
                 shuffle=False,
                 num_workers=25,
                 cache=50000,
                 collate_fn=default_collate,
                 drop_last=False,
                 cuda=False):
        # enumerate standard imagenet augmentors
        imagenet_augmentors = fbresnet_augmentor(mode == 'train')

        # load the lmdb if we can find it
        lmdb_loc = os.path.join(os.environ['IMAGENET'],
                                'ILSVRC-%s.lmdb' % mode)
        ds = td.LMDBData(lmdb_loc, shuffle=False)
        ds = td.LocallyShuffleData(ds, cache)
        ds = td.PrefetchData(ds, 5000, 1)
        ds = td.LMDBDataPoint(ds)
        ds = td.MapDataComponent(ds,
                                 lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR),
                                 0)
        ds = td.AugmentImageComponent(ds, imagenet_augmentors)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.cuda = cuda
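
The wrapper above only builds self.ds; a minimal consumption sketch (not part of the original class, helper name is ours) showing how the batched dataflow could be turned into torch tensors, which is what the batch_size/cuda attributes suggest the surrounding class does:

import torch

def iterate_as_tensors(batched_ds, cuda=False):
    # batched_ds: a tensorpack DataFlow after td.BatchData, yielding [images, labels]
    # as produced by the two-component ImageNet LMDB datapoints above
    for images, labels in batched_ds.get_data():
        images = torch.from_numpy(images)   # NHWC uint8 batch
        labels = torch.from_numpy(labels)
        if cuda:
            images, labels = images.cuda(), labels.cuda()
        yield images, labels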
Example #2
def test_davis_tensorpack_dataflow():
    ds = Davis('/data/public/rw/datasets/videos/davis/trainval', num_frames=4)

    ds = df.MapDataComponent(
        ds,
        lambda images: [cv2.resize(image, (256, 256)) for image in images],
        index=1)
    ds = df.MapDataComponent(
        ds,
        lambda images: [cv2.resize(image, (256, 256)) for image in images],
        index=2)
    ds = df.MapDataComponent(ds,
                             lambda images: np.stack(images, axis=0),
                             index=1)
    ds = df.MapDataComponent(ds,
                             lambda images: np.stack(images, axis=0),
                             index=2)
    ds = df.BatchData(ds, 6)

    ds.reset_state()
    generator = ds.get_data()
    for _ in range(10):
        _, images, annotations = next(generator)
        assert images.shape == (6, 4, 256, 256, 3)
        assert annotations.shape == (6, 4, 256, 256, 3)
Example #3
    def __init__(self,
                 mode,
                 batch_size=256,
                 shuffle=False,
                 num_workers=25,
                 cache=50000,
                 collate_fn=default_collate,
                 remainder=False,
                 cuda=False,
                 transform=None):
        # enumerate standard imagenet augmentors
        #imagenet_augmentors = fbresnet_augmentor(mode == 'train')
        imagenet_augmentors = [ImgAugTVCompose(transform)]

        # load the lmdb if we can find it
        lmdb_loc = os.path.join(os.environ['IMAGENET'],
                                'ILSVRC-%s.lmdb' % mode)
        ds = td.LMDBData(lmdb_loc, shuffle=False)
        if mode == 'train':
            ds = td.LocallyShuffleData(ds, cache)
        ds = td.PrefetchData(ds, 5000, 1)
        ds = td.LMDBDataPoint(ds)
        #ds = td.MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
        ds = td.MapDataComponent(
            ds, lambda x: np.asarray(Image.open(io.BytesIO(x)).convert('RGB')),
            0)
        ds = td.AugmentImageComponent(ds, imagenet_augmentors)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size, remainder=remainder)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.cuda = cuda
Example #4
def lmdb_dataflow(lmdb_path,
                  batch_size,
                  sample_size,
                  is_training,
                  test_speed=False,
                  train_perturb_list=None,
                  valid_perturb_list=None,
                  so3_perturb=False,
                  use_partial=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
    df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = PreprocessData(df,
                        sample_size,
                        is_training,
                        train_perturb_list=train_perturb_list,
                        valid_perturb_list=valid_perturb_list,
                        so3_perturb=so3_perturb,
                        use_partial=use_partial)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
Example #5
def data_pipe(fmri_files,
              confound_files,
              label_matrix,
              target_name=None,
              batch_size=32,
              data_type='train',
              train_percent=0.8,
              nr_thread=nr_thread,
              buffer_size=buffer_size):
    assert data_type in ['train', 'val', 'test']
    assert fmri_files is not None

    print('\n\nGenerating dataflow for %s datasets \n' % data_type)

    buffer_size = min(len(fmri_files), buffer_size)
    nr_thread = min(len(fmri_files), nr_thread)

    ds0 = gen_fmri_file(fmri_files,
                        confound_files,
                        label_matrix,
                        data_type=data_type,
                        train_percent=train_percent)
    print('dataflowSize is ' + str(ds0.size()))
    print('Loading data using %d threads with %d buffer_size ... \n' %
          (nr_thread, buffer_size))

    if target_name is None:
        target_name = np.unique(label_matrix)

    ####running the model
    start_time = time.clock()
    ds1 = dataflow.MultiThreadMapData(
        ds0,
        nr_thread=nr_thread,
        map_func=lambda dp: map_load_fmri_image(dp, target_name),
        buffer_size=buffer_size,
        strict=True)

    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)

    ds1 = split_samples(ds1)
    print('prefetch dataflowSize is ' + str(ds1.size()))

    ds1 = dataflow.LocallyShuffleData(ds1,
                                      buffer_size=ds1.size() * buffer_size)

    ds1 = dataflow.BatchData(ds1, batch_size=batch_size)
    print('Time Usage of loading data in seconds: {} \n'.format(time.clock() -
                                                                start_time))

    ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    ds1._reset_once()
    ##ds1.reset_state()

    #return ds1.get_data()
    for df in ds1.get_data():
        ##print(np.expand_dims(df[0].astype('float32'),axis=3).shape)
        yield (np.expand_dims(df[0].astype('float32'), axis=3),
               to_categorical(df[1].astype('int32'), len(target_name)))
Example #6
    def __init__(self,
                 corpus_path,
                 tokenizer,
                 seq_len,
                 encoding="utf-8",
                 predict_feature=False,
                 batch_size=512,
                 shuffle=False,
                 num_workers=25,
                 cache=10000,
                 drop_last=False,
                 cuda=False,
                 distributed=False,
                 visualization=False,
                 span_mask=False,
                 cond_mask=False,
                 region_len=36):

        if dist.is_available() and distributed:
            rank = dist.get_rank()
            lmdb_file = os.path.join(
                corpus_path, "training_feat_part_" + str(rank) + ".lmdb")
        else:
            lmdb_file = os.path.join(corpus_path, "training_feat_all.lmdb")

        caption_path = os.path.join(corpus_path, "caption_train.json")

        print("Loading from %s" % lmdb_file)

        os.listdir(corpus_path)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)

        self.cond_mask = cond_mask

        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            seq_len,
            region_len,
            self.num_dataset,
            encoding="utf-8",
            predict_feature=predict_feature,
            span_mask=span_mask,
            cond_mask=cond_mask)

        # ds = td.LocallyShuffleData(ds, cache)
        ds = td.PrefetchData(ds, 5000, 1)
        ds = td.MapData(ds, preprocess_function)
        # self.ds = td.PrefetchData(ds, 1)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        # self.ds = ds
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
Example #7
    def __init__(
        self,
        corpus_path,
        tokenizer,
        seq_len,
        encoding='utf-8',
        predict_feature=False,
        hard_negative=False,
        batch_size=512,
        shuffle=False,
        num_workers=25,
        cache=50000,
        drop_last=False,
        cuda=False,
        distributed=False,
        visualization=False,
    ):

        if dist.is_available() and distributed:
            # num_replicas = dist.get_world_size()
            # assert num_replicas == 8
            rank = dist.get_rank()
            lmdb_file = '/mnt3/xuesheng/features_lmdb/CC/training_feat_part_' + str(
                rank) + '.lmdb'
            # if not os.path.exists(lmdb_file):
            # lmdb_file = "/srv/share/datasets/conceptual_caption/training_feat_part_" + str(rank) + ".lmdb"
        else:
            # lmdb_file = "/coc/dataset/conceptual_caption/training_feat_all.lmdb"
            # if not os.path.exists(lmdb_file):
            lmdb_file = '/mnt3/xuesheng/features_lmdb/CC/training_feat_part_0.lmdb'

        caption_path = '/mnt3/xuesheng/features_lmdb/CC/caption_train.json'
        print('Loading from %s' % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=True)
        self.num_dataset = len(ds)

        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            seq_len,
            36,
            self.num_dataset,
            encoding='utf-8',
            predict_feature=predict_feature,
        )

        # ds = td.LocallyShuffleData(ds, cache)
        # ds = td.PrefetchData(ds, 5000, 1)
        ds = td.MapData(ds, preprocess_function)
        # self.ds = td.PrefetchData(ds, 1)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        # self.ds = ds
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
Example #8
def create_dataflow(data_dir: Path,
                    kind: str,
                    batch_size: int,
                    shuffle: bool = True) -> td.DataFlow:

    path = data_dir / "{}.mdb".format(kind)
    ds = td.LMDBData(str(path), shuffle=shuffle)
    ds = td.MapData(ds, _decode_data)
    ds = td.BatchData(ds, batch_size, remainder=False)
    ds = td.MapDataComponent(ds, _squeeze_last, index=1)
    return ds
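
A hedged usage sketch for create_dataflow; the directory and kind are placeholders, and reset_state() is assumed to be the caller's responsibility since the function does not call it:

from pathlib import Path

ds = create_dataflow(Path('/data/lmdb'), kind='train', batch_size=64, shuffle=True)
ds.reset_state()                        # caller is responsible for reset_state()
first_batch = next(ds.get_data())       # component layout depends on _decode_data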
Example #9
    def __init__(self,
                 datafile,
                 batch_size,
                 num_workers=1,
                 nviews=12,
                 reset=True,
                 augment=False,
                 filter_classes=None,
                 filter_views=None,
                 polarmode='cartesian',
                 shuffle=True,
                 filter_ids=None,
                 label_to0idx=False,
                 rgb=False,
                 force_res=0,
                 autocrop=False,
                 keep_aspect_ratio=False):
        self.filter_classes = filter_classes
        self.filter_views = filter_views
        self.filter_ids = filter_ids
        self.polarmode = polarmode
        self.label_to0idx = label_to0idx
        self.rgb = rgb
        self.force_res = force_res
        self.autocrop = autocrop
        self.keep_aspect_ratio = keep_aspect_ratio

        if not isinstance(datafile, list):
            datafile = [datafile]

        ds = []
        for d in datafile:

            ds.append(df.LMDBSerializer.load(d, shuffle=shuffle))

            if shuffle:
                ds[-1] = df.LocallyShuffleData(ds[-1], 100)
            ds[-1] = df.PrefetchData(ds[-1], 20, 1)

            ds[-1] = df.MapData(ds[-1], self.load)
            if augment:
                ds[-1] = df.MapDataComponent(ds[-1], LMDBMultiView._augment, 0)

            if (not filter_classes and not filter_ids and num_workers > 1):
                # warning: skipping this is slower when filtering datasets
                #          but epoch counting will be wrong otherwise
                ds[-1] = df.PrefetchDataZMQ(ds[-1], num_workers)
            ds[-1] = df.BatchData(ds[-1], batch_size)

            if reset:
                ds[-1].reset_state()

        self.ds = ds
Example #10
def get_dataflows(config):
    """
    construct and initialize dataflows based on config.
    """

    df = ExpertDataflow(config)
    df = tp_dataflow.PrefetchDataZMQ(df, nr_proc=16)
    df = tp_dataflow.BatchData(df, config['batch_size'], remainder=False)

    # initialize random number generator in child processes to unique values
    df.reset_state()

    return df
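
A hedged usage sketch; the real config presumably carries more keys than batch_size, which ExpertDataflow would consume:

config = {'batch_size': 64}          # placeholder config
flow = get_dataflows(config)         # reset_state() already called inside
for batch in flow.get_data():
    # each batch groups config['batch_size'] expert datapoints
    break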
Example #11
    def __init__(
        self,
        annotations_path,
        features_path,
        tokenizer,
        bert_model,
        seq_len,
        batch_size=512,
        num_workers=25,
        cache=10000,
        local_rank=-1,
        objective=0,
        num_locs=5,
        add_global_imgfeat=None,
    ):

        if dist.is_available() and local_rank != -1:
            rank = dist.get_rank()
            lmdb_file = os.path.join(
                features_path, "training_feat_part_" + str(rank) + ".lmdb")
        else:
            lmdb_file = os.path.join(features_path, "training_feat_all.lmdb")

            print("Loading from %s" % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)
        ds = td.LocallyShuffleData(ds, cache)
        caption_path = os.path.join(annotations_path, "caption_train.json")

        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            bert_model,
            seq_len,
            36,
            self.num_dataset,
            objective=objective,
            num_locs=num_locs,
        )

        ds = td.PrefetchData(ds, 5000, 1)
        ds = td.MapData(ds, preprocess_function)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.add_global_imgfeat = add_global_imgfeat
        self.num_locs = num_locs
Example #12
    def __init__(self,
                 corpus_path,
                 tokenizer,
                 seq_len,
                 encoding="utf-8",
                 predict_feature=False,
                 batch_size=512,
                 shuffle=False,
                 num_workers=25,
                 cache=50000,
                 drop_last=False,
                 cuda=False,
                 distributed=False,
                 visualization=False,
                 span_mask=False,
                 cond_mask=False,
                 region_len=36):

        lmdb_file = os.path.join(corpus_path, "validation_all.lmdb")

        caption_path = os.path.join(corpus_path, "caption_val.json")

        print("Loading from %s" % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)

        self.cond_mask = cond_mask

        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            seq_len,
            region_len,
            self.num_dataset,
            encoding="utf-8",
            predict_feature=predict_feature,
            visualization=visualization,
            span_mask=span_mask,
            cond_mask=cond_mask,
        )

        ds = td.MapData(ds, preprocess_function)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
Example #13
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False):
    df = dataflow.LMDBData(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
    df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = dataflow.LMDBDataPoint(df)
    df = PreprocessData(df, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
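
Since the returned dataflow is wrapped in RepeatedData(df, -1), it never stops on its own; a hedged usage sketch that derives steps per epoch from the returned size (path and sizes are placeholders):

train_df, train_size = lmdb_dataflow('/path/to/train.lmdb', batch_size=32,
                                     input_size=2048, output_size=16384,
                                     is_training=True)
steps_per_epoch = train_size // 32
it = train_df.get_data()             # infinite iterator; bound it by steps_per_epoch
for _ in range(steps_per_epoch):
    batch = next(it)                 # a list of components, per BatchData(use_list=True)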
Example #14
def init_dataflow(ctfstar, batch_size):
    ''' This function creates dataflow that reads and preprocesses data in parallel '''
    augm = df.imgaug.AugmentorList([df.imgaug.MeanVarianceNormalize()])
    # create partitioned generators, one for each element in a batch
    dss0, shape = MicrosGenerator.create_partition(ctfstar, batch_size)
    # preprocess input
    dss1 = [df.MapData(ds0,
                       lambda dp: [augm.augment(preprocess_micro(dp[0], dp[1], psize, bn)),
                                   np.array(dp[0])])
            for ds0 in dss0]
    # prefetch each generator in a separate process with buffer of 4 images per process
    # dss1 = [df.PrefetchDataZMQ(ds1, nr_proc=1, hwm=2) for ds1 in dss1]
    dss1 = [df.PrefetchData(ds1, nr_prefetch=4, nr_proc=1) for ds1 in dss1]
    # join all dataflows
    ds1 = df.RandomMixData(dss1)
    # ds1 = df.JoinData(dss1)
    ds = df.BatchData(ds1, batch_size)
    ds.reset_state()
    return ds, shape
Example #15
def lmdb_dataflow(lmdb_path,
                  batch_size,
                  num_points,
                  shuffle,
                  task,
                  render=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if render:
        df = VirtualRenderData(df)
    if num_points is not None:
        df = ResampleData(df, num_points, task)
    if shuffle:
        df = dataflow.LocallyShuffleData(df, 1000)
        df = dataflow.PrefetchDataZMQ(df, 8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    df.reset_state()
    return df, size
Example #16
    def __init__(
        self,
        corpus_path,
        tokenizer,
        bert_model,
        seq_len,
        encoding="utf-8",
        visual_target=0,
        batch_size=512,
        shuffle=False,
        num_workers=25,
        cache=5000,
        drop_last=False,
        cuda=False,
        objective=0,
        visualization=False,
    ):

        lmdb_file = os.path.join(corpus_path, "validation_feat_all.lmdb")
        caption_path = os.path.join(corpus_path, "caption_val.json")
        print("Loading from %s" % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)
        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            bert_model,
            seq_len,
            36,
            self.num_dataset,
            encoding="utf-8",
            visual_target=visual_target,
            visualization=visualization,
            objective=objective,
        )

        ds = td.MapData(ds, preprocess_function)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
Example #17
def read_data(files=None,
              batch_size=1,
              window=2,
              random_rotation=False,
              repeat=False,
              shuffle_buffer=None,
              num_workers=1,
              cache_data=False):
    print(files[0:20], '...' if len(files) > 20 else '')

    # caching makes only sense if the data is finite
    if cache_data:
        if repeat == True:
            raise Exception("repeat must be False if cache_data==True")
        if random_rotation == True:
            raise Exception(
                "random_rotation must be False if cache_data==True")
        if num_workers != 1:
            raise Exception("num_workers must be 1 if cache_data==True")

    df = PhysicsSimDataFlow(
        files=files,
        random_rotation=random_rotation,
        shuffle=True if shuffle_buffer else False,
        window=window,
    )

    if repeat:
        df = dataflow.RepeatedData(df, -1)

    if shuffle_buffer:
        df = dataflow.LocallyShuffleData(df, shuffle_buffer)

    if num_workers > 1:
        df = dataflow.MultiProcessRunnerZMQ(df, num_proc=num_workers)

    df = dataflow.BatchData(df, batch_size=batch_size, use_list=True)

    if cache_data:
        df = dataflow.CacheData(df)

    df.reset_state()
    return df
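
A hedged usage sketch for read_data; the file pattern is a placeholder and the simulation files are assumed to exist:

import glob

files = sorted(glob.glob('data/sim_*.msgpack'))     # placeholder pattern
train_df = read_data(files=files, batch_size=4, window=2,
                     random_rotation=True, repeat=True,
                     shuffle_buffer=512, num_workers=2)
for batch in train_df.get_data():                   # effectively infinite because repeat=True
    break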
Example #18
    def __init__(
        self,
        corpus_path,
        tokenizer,
        seq_len,
        encoding='utf-8',
        predict_feature=False,
        batch_size=512,
        shuffle=False,
        num_workers=25,
        cache=50000,
        drop_last=False,
        cuda=False,
        distributed=False,
        visualization=False,
    ):

        lmdb_file = '/mnt3/xuesheng/features_lmdb/CC_val/validation_all.lmdb'
        if not os.path.exists(lmdb_file):
            lmdb_file = '/mnt3/xuesheng/features_lmdb/CC_val/validation_all.lmdb'
        caption_path = '/mnt3/xuesheng/features_lmdb/CC_val/caption_val.json'

        print('Loading from %s' % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)
        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            seq_len,
            36,
            self.num_dataset,
            encoding='utf-8',
            predict_feature=predict_feature,
            visualization=visualization,
        )

        ds = td.MapData(ds, preprocess_function)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
Example #19
def test_kinetics_tensorpack_dataflow():
    ds = Kinetics('/data/public/rw/datasets/videos/kinetics',
                  num_frames=4,
                  skips=[0, 4, 4, 8])

    ds = df.MapDataComponent(
        ds,
        lambda images: [cv2.resize(image, (256, 256)) for image in images],
        index=1)
    ds = df.MapDataComponent(ds,
                             lambda images: np.stack(images, axis=0),
                             index=1)
    ds = df.BatchData(ds, 6)

    ds.reset_state()
    generator = ds.get_data()
    for _ in range(10):
        _, images = next(generator)
        assert images.shape == (6, 4, 256, 256, 3)
Example #20
    def __init__(
        self,
        corpus_path,
        tokenizer,
        seq_len,
        encoding="utf-8",
        predict_feature=False,
        batch_size=512,
        shuffle=False,
        num_workers=25,
        cache=50000,
        drop_last=False,
        cuda=False,
        distributed=False,
        visualization=False,
    ):
    
        lmdb_file = "/coc/dataset/conceptual_caption/validation_feat_all.lmdb"
        if not os.path.exists(lmdb_file):
            lmdb_file = "/coc/pskynet2/jlu347/multi-modal-bert/data/conceptual_caption/validation_feat_all.lmdb"
        caption_path = "/coc/pskynet2/jlu347/multi-modal-bert/data/conceptual_caption/caption_val.json"

        print("Loading from %s" % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)
        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            seq_len,
            36,
            self.num_dataset,
            encoding="utf-8",
            predict_feature=predict_feature,
            visualization=visualization,
        )

        ds = td.MapData(ds, preprocess_function)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
Example #21
    def __init__(
        self,
        annotations_path,
        features_path,
        tokenizer,
        bert_model,
        seq_len,
        batch_size=512,
        num_workers=25,
        cache=5000,
        objective=0,
        num_locs=5,
        add_global_imgfeat=True,
        visualization=False,
    ):
        lmdb_file = os.path.join(features_path, "validation_feat_all.lmdb")
        caption_path = os.path.join(annotations_path, "caption_valid.json")
        print("Loading from %s" % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)
        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            bert_model,
            seq_len,
            36,
            self.num_dataset,
            visualization=visualization,
            objective=objective,
            num_locs=num_locs,
        )

        ds = td.MapData(ds, preprocess_function)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.add_global_imgfeat = add_global_imgfeat
        self.num_locs = num_locs
Example #22
    def __init__(self,
                 mode,
                 do_aug,
                 batch_size=256,
                 shuffle=False,
                 num_workers=25,
                 cache=50000,
                 cuda=False,
                 out_tensor=True,
                 data_transforms=None):
        # enumerate standard imagenet augmentors
        imagenet_augmentors = fbresnet_augmentor(do_aug)

        # load the lmdb if we can find it
        lmdb_loc = os.path.join(os.environ['IMAGENET'],
                                'ILSVRC-%s.lmdb' % mode)
        ds = td.LMDBSerializer.load(lmdb_loc, shuffle=shuffle)
        #ds = td.LMDBData(lmdb_loc, shuffle=False)
        #ds = td.LocallyShuffleData(ds, cache)
        #ds = td.PrefetchData(ds, 5000, 1)
        #ds = td.LMDBDataPoint(ds)
        ds = td.MapDataComponent(
            ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR)[:, :, ::-1], 0)
        ds = td.AugmentImageComponent(ds, imagenet_augmentors)
        ds = td.MultiProcessRunnerZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.cuda = cuda
        self.out_tensor = out_tensor
        # data_transforms should be present only when out_tensor=True
        # data_transforms typically consists of
        # PIL Image transforms, ToTensor(), Normalize():
        #    normalize = transforms.Compose( [
        #          transforms.ToTensor(),
        #          transforms.Normalize(mean=[0.485, 0.456, 0.406],
        #                                std=[0.229, 0.224, 0.225]) ] )
        self.data_transforms = data_transforms
        print("Loaded '%s'." % lmdb_loc)
Example #23
import tensorpack.dataflow as df

if __name__ == '__main__':
    ds = df.dataset.Mnist('train')
    augmentors = [
        df.imgaug.RandomApplyAug(
            df.imgaug.RandomResize((0.8, 1.2), (0.8, 1.2)), 0.3),
        df.imgaug.RandomApplyAug(df.imgaug.RotationAndCropValid(15), 0.5),
        df.imgaug.RandomApplyAug(
            df.imgaug.SaltPepperNoise(white_prob=0.01, black_prob=0.01), 0.25),
        df.imgaug.Resize((28, 28)),
        df.imgaug.CenterPaste((32, 32)),
        df.imgaug.RandomCrop((28, 28)),
        df.imgaug.MapImage(lambda x: x.reshape(28, 28, 1))
    ]
    ds = df.AugmentImageComponent(ds, augmentors)
    ds = df.BatchData(ds, batch_size=32, remainder=False)
    ds = df.PrefetchData(ds, nr_prefetch=12, nr_proc=2)
    ds = df.PrintData(ds)

    df.send_dataflow_zmq(ds, 'tcp://localhost:2222')
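
send_dataflow_zmq only covers the producer side; the consumer would typically be tensorpack's RemoteDataZMQ, roughly as below (a sketch, assuming the default PUSH/PULL pairing and that the receiver binds the port the sender connects to):

import tensorpack.dataflow as df

receiver = df.RemoteDataZMQ('tcp://0.0.0.0:2222')   # PULL side, paired with send_dataflow_zmq
receiver.reset_state()
for images, labels in receiver.get_data():          # batches of 32, as sent above
    pass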
Example #24
import tensorflow as tf
import tensorpack.dataflow as df

if __name__ == '__main__':
    # prepare dataset
    ds = df.dataset.Mnist('train')
    augmentors_variation = [
        df.imgaug.Resize((28, 28)),
        df.imgaug.CenterPaste((32, 32)),
        df.imgaug.RandomCrop((28, 28)),
        df.imgaug.MapImage(lambda v: v.reshape(784))
    ]
    ds = df.AugmentImageComponent(ds, augmentors_variation)
    ds = df.PrefetchData(ds, nr_prefetch=12, nr_proc=4)
    ds = df.BatchData(ds, batch_size=128, remainder=False, use_list=False)

    # create the model
    x = tf.placeholder(tf.float32, [None, 784])
    W = tf.Variable(tf.ones([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b
    y_ = tf.placeholder(tf.int64, [None])
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_, logits=y))
    global_step = tf.train.get_or_create_global_step()
    train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(
        cross_entropy, global_step=global_step)

    correct_prediction = tf.equal(tf.argmax(y, 1), y_)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
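
The script stops right after defining the graph; a hedged continuation sketch under TF1 session semantics, feeding dataflow batches into the placeholders (step count and logging cadence are arbitrary):

    # continuation sketch: run one epoch of the dataflow through the graph
    ds.reset_state()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step, (batch_x, batch_y) in enumerate(ds.get_data()):
            _, loss, acc = sess.run([train_op, cross_entropy, accuracy],
                                    feed_dict={x: batch_x, y_: batch_y})
            if step % 100 == 0:
                print('step %d  loss %.4f  acc %.3f' % (step, loss, acc))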
Example #25
        # Create a dataflow of training and validation
        # TODO
        ds_train = CustomDataflow(size=100, datadir=args.data) 
        augs = [
                # imgaug.ResizeShortestEdge(270),
                imgaug.RandomCrop(SHAPE), 
                imgaug.Flip(horiz=True), 
                imgaug.Flip(vert=True), 
                imgaug.Transpose()
                ]
        ds_train = AugmentImageComponents(ds_train, augs, (0, 1))
        ds_train = MapData(ds_train, lambda dp: [np.expand_dims(dp[0], axis=0), 
                                                 np.expand_dims(dp[1], axis=0), 
                                                 ])
        ds_train = df.BatchData(ds_train, batch_size=BATCH)
        ds_train = df.PrintData(ds_train)
        # ds_train = df.PrefetchDataZMQ(ds_train, nr_proc=4)

        ds_valid= CustomDataflow(size=100, datadir=args.data)

        #
        # Training loop
        #
        max_step = 10000000
        for epoch in range(EPOCH):
            for mb_train in ds_train.get_data():
                step = step+1
                if step > max_step:
                    exit()
                # print("Step: {}, Epoch {}".format(step, epoch))
Example #26
            # imgaug.RandomCrop(SHAPE),
            # imgaug.Resize(int(SHAPE)),
            imgaug.Flip(horiz=True),
            imgaug.Flip(vert=True),
            imgaug.Albumentations(AB.RandomRotate90(p=1))
        ]
        # ds_train = df.AugmentImageComponent(ds_train, ag_image, 0) # Apply for image only
        ds_train = df.AugmentImageComponents(ds_train, ag_train, (0, 1))
        ds_train = df.MapData(
            ds_train, lambda dp: [
                np.expand_dims(dp[0], axis=0),
                np.expand_dims(dp[1], axis=0),
            ])

        ds_train = df.MultiProcessRunner(ds_train, num_proc=8, num_prefetch=4)
        ds_train = df.BatchData(ds_train, batch_size=BATCH)
        ds_train = df.PrintData(ds_train)
        ds_train = df.MapData(
            ds_train, lambda dp: [
                torch.tensor(dp[0]),
                torch.tensor(dp[1]),
            ])

        # ds_valid
        ds_valid = CustomDataFlow(size=500, datadir=args.data, istrain=False)
        ag_valid = [
            imgaug.Flip(horiz=True),
            imgaug.Flip(vert=True),
            imgaug.Albumentations(AB.RandomRotate90(p=1))
        ]
        ds_valid = df.AugmentImageComponents(ds_valid, ag_valid, (0, 1))
Example #27
                df.imgaug.Contrast((0.8, 1.2), clip=False), 0.5),
            # df.imgaug.RandomApplyAug(df.imgaug.Saturation(0.4, rgb=False), 0.5),
        ]),
    ]
    augmentors_default = [
        df.imgaug.Resize((32, 32)),
        df.imgaug.MapImage(lambda x: x.reshape(32, 32, 1))
    ]
    # keep original image at index 1
    ds = df.MapData(
        ds, lambda datapoint: [datapoint[0], datapoint[0]] + datapoint[1:])
    ds = df.AugmentImageComponent(ds,
                                  augmentors_variation + augmentors_default)
    ds = df.AugmentImageComponent(ds, augmentors_default, index=1)
    ds = df.PrefetchData(ds, nr_prefetch=12, nr_proc=4)
    ds = df.PrintData(ds)
    ds = df.BatchData(ds, batch_size=32, remainder=False, use_list=True)
    ds = df.PrintData(ds)

    for minibatch in ds.get_data():
        images, originals, labels = minibatch
        image, original, label = images[0], originals[0], labels[0]
        name = '{:02d}'.format(label)

        cv2.namedWindow(name)
        cv2.moveWindow(name, 0, 25 + 128 * label)

        display_image = np.concatenate((image, original), axis=1)
        cv2.imshow(name, display_image)
        cv2.waitKey(1)
Example #28
def data_pipe_3dcnn_block(fmri_files,
                          confound_files,
                          label_matrix,
                          target_name=None,
                          flag_cnn='3d',
                          block_dura=1,
                          batch_size=32,
                          data_type='train',
                          nr_thread=nr_thread,
                          buffer_size=buffer_size):
    assert data_type in ['train', 'val', 'test']
    assert flag_cnn in ['3d', '2d']
    assert fmri_files is not None
    isTrain = data_type == 'train'
    isVal = data_type == 'val'

    print('\n\nGenerating dataflow for %s datasets \n' % data_type)

    buffer_size = int(min(len(fmri_files), buffer_size))
    nr_thread = int(min(len(fmri_files), nr_thread))

    ds0 = gen_fmri_file(fmri_files,
                        confound_files,
                        label_matrix,
                        data_type=data_type)
    print('dataflowSize is ' + str(ds0.size()))
    print('Loading data using %d threads with %d buffer_size ... \n' %
          (nr_thread, buffer_size))

    if target_name is None:
        target_name = np.unique(label_matrix)
    ##Subject_Num, Trial_Num = np.array(label_matrix).shape

    ####running the model
    start_time = time.clock()
    if flag_cnn == '2d':
        ds1 = dataflow.MultiThreadMapData(
            ds0,
            nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_block(
                dp, target_name, block_dura=block_dura),
            buffer_size=buffer_size,
            strict=True)
    elif flag_cnn == '3d':
        ds1 = dataflow.MultiThreadMapData(
            ds0,
            nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_3d_block(
                dp, target_name, block_dura=block_dura),
            buffer_size=buffer_size,
            strict=True)

    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)

    ds1 = split_samples(ds1)
    print('prefetch dataflowSize is ' + str(ds1.size()))

    if isTrain:
        print('%d #Trials/Samples per subject with %d channels in tc' %
              (ds1.Trial_Num, ds1.Block_dura))
        Trial_Num = ds1.Trial_Num
        #ds1 = dataflow.LocallyShuffleData(ds1, buffer_size=ds1.size() * buffer_size)
        ds1 = dataflow.LocallyShuffleData(ds1,
                                          buffer_size=Trial_Num * buffer_size,
                                          shuffle_interval=Trial_Num *
                                          buffer_size)  #//2

    ds1 = dataflow.BatchData(ds1, batch_size=batch_size, remainder=True)
    print('Time Usage of loading data in seconds: {} \n'.format(time.clock() -
                                                                start_time))

    ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    #ds1._reset_once()
    ##ds1.reset_state()
    '''
    for df in ds1.get_data():
        if flag_cnn == '2d':
            yield (df[0].astype('float32'),to_categorical(df[1].astype('int32'), len(target_name)))
        elif flag_cnn == '3d':
            yield (df[0].astype('float32'),to_categorical(df[1].astype('int32'), len(target_name)))
    '''
    return ds1
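
Unlike data_pipe, which yields Keras-ready batches itself, data_pipe_3dcnn_block returns the dataflow, so the caller must call reset_state() and do the cast/one-hot conversion (mirroring the commented-out block above); a hedged sketch with placeholder inputs:

ds = data_pipe_3dcnn_block(fmri_files, confound_files, label_matrix,
                           flag_cnn='3d', block_dura=1, batch_size=32,
                           data_type='train')       # inputs assumed prepared upstream
ds.reset_state()
for x, y in ds.get_data():
    x = x.astype('float32')
    y = to_categorical(y.astype('int32'), len(np.unique(label_matrix)))
    break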
Example #29
                    label.append(self.df.iloc[idx]['Cavitation'])
                    label.append(self.df.iloc[idx]['Fibrosis'])
                    label.append(self.df.iloc[idx]['Widening_Mediastinum'])
                    label.append(self.df.iloc[idx]['Medical_device'])
                    label.append(self.df.iloc[idx]['Fracture'])
                    label.append(self.df.iloc[idx]['No_Finding'])
                elif self.types == 1:
                    assert self.pathology is not None
                    label.append(self.df.iloc[idx][self.pathology])
                else:
                    pass
                # Try catch exception
                label = np.nan_to_num(label, copy=True, nan=0)
                label = np.array(label, dtype=np.float32)
                types = label.copy()
                yield [image, types]
            elif self.is_train == 'test':
                yield [image]  # , np.array([-1, -1, -1, -1, -1])
            else:
                pass


if __name__ == '__main__':
    ds = Vinmec(folder='/u01/data/Vimmec_Data_small/',
                train_or_valid='train',
                resize=256)
    ds.reset_state()
    # ds = df.MultiProcessRunnerZMQ(ds, num_proc=8)
    ds = df.BatchData(ds, 32)
    # ds = df.PrintData(ds)
    df.TestDataSpeed(ds).start()
Example #30
    def __init__(
        self,
        corpus_path,
        tokenizer,
        seq_len,
        encoding='utf-8',
        predict_feature=False,
        batch_size=512,
        shuffle=False,
        num_workers=10,
        cache=50000,
        drop_last=False,
        cuda=False,
    ):

        lmdb_file = '/coc/dataset/conceptual_caption/validation_feat_all.lmdb'
        if not os.path.exists(lmdb_file):
            lmdb_file = '/coc/pskynet2/jlu347/multi-modal-bert/data/conceptual_caption/validation_feat_all.lmdb'
        caption_path = '/coc/pskynet2/jlu347/multi-modal-bert/data/conceptual_caption/caption_val.json'

        print('Loading from %s' % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)
        preprocess_function = BertPreprocessRetrieval(
            caption_path,
            tokenizer,
            seq_len,
            36,
            1000,
            encoding='utf-8',
            predict_feature=predict_feature,
        )

        ds = td.MapData(ds, preprocess_function)
        self.ds = td.BatchData(ds, 1)
        self.ds.reset_state()

        self.batch_size = 1
        self.num_workers = num_workers
        self._entry = []

        self.features_all = np.zeros((1000, 37, 2048), dtype=np.float32)
        self.spatials_all = np.zeros((1000, 37, 5), dtype=np.float32)
        self.image_mask_all = np.zeros((1000, 37), dtype=np.float32)
        self.image_ids = []
        # load first 1000 file here.
        for i, batch in enumerate(self.ds.get_data()):
            if i >= 1000:
                break
            input_ids, input_mask, segment_ids, is_next, image_feat, image_loc, image_mask, image_id, caption = batch

            batch_size = input_ids.shape[0]
            g_image_feat = np.sum(image_feat, axis=1) / np.sum(
                image_mask, axis=1, keepdims=True)
            image_feat = np.concatenate(
                [np.expand_dims(g_image_feat, axis=1), image_feat], axis=1)
            image_feat = np.array(image_feat, dtype=np.float32)

            g_image_loc = np.repeat(np.array([[0, 0, 1, 1, 1]],
                                             dtype=np.float32),
                                    batch_size,
                                    axis=0)
            image_loc = np.concatenate(
                [np.expand_dims(g_image_loc, axis=1), image_loc], axis=1)

            image_loc = np.array(image_loc, dtype=np.float32)
            g_image_mask = np.repeat(np.array([[1]]), batch_size, axis=0)
            image_mask = np.concatenate([g_image_mask, image_mask], axis=1)

            batch = (input_ids, input_mask, segment_ids, image_id, caption)
            self._entry.append(batch)

            self.features_all[i] = image_feat
            self.image_mask_all[i] = np.array(image_mask)
            self.spatials_all[i] = image_loc
            self.image_ids.append(image_id)
            sys.stdout.write('%d/%d\r' % (i, 1000))
            sys.stdout.flush()