Code example #1
def dataflow(centroids, num_reference=3, num_process=16, shuffle=True):
    ds = Kinetics('/data/public/rw/datasets/videos/kinetics',
                  num_frames=num_reference + 1,
                  skips=[0, 4, 4, 8][:num_reference + 1],
                  shuffle=shuffle)

    ds = df.MapDataComponent(ds, ImageProcess.resize(small_axis=256), index=1)
    ds = df.MapDataComponent(ds, ImageProcess.crop(shape=(256, 256)), index=1)
    #ds = df.MapDataComponent(ds, lambda images: [cv2.resize(image, (256, 256)) for image in images], index=1)

    ds = df.MapData(
        ds, lambda dp: [
            dp[1][:num_reference],
            copy.deepcopy(dp[1][:num_reference]), dp[1][num_reference:],
            copy.deepcopy(dp[1][num_reference:])
        ])

    # for images (ref, target)
    for idx in [0, 2]:
        ds = df.MapDataComponent(
            ds,
            lambda images: [
                cv2.cvtColor(image, cv2.COLOR_BGR2GRAY).reshape(256, 256, 1)
                for image in images
            ],
            index=idx)

    # for labels (ref, target)
    for idx in [1, 3]:
        ds = df.MapDataComponent(
            ds,
            lambda images: [cv2.resize(image, (32, 32)) for image in images],
            index=idx)
        ds = df.MapDataComponent(ds,
                                 lambda images: [
                                     cv2.cvtColor(np.float32(image / 255.0),
                                                  cv2.COLOR_BGR2Lab)[:, :, 1:]
                                     for image in images
                                 ],
                                 index=idx)
        ds = df.MapDataComponent(
            ds,
            lambda images: [
                np.array([
                    np.argmin(np.linalg.norm(centroids - v, axis=1))
                    for v in image.reshape((-1, 2))
                ]).reshape((32, 32, 1)) for image in images
            ],
            index=idx)

    # stack for tensor
    ds = df.MapData(
        ds, lambda dp:
        [np.stack(dp[0] + dp[2], axis=0),
         np.stack(dp[1] + dp[3], axis=0)])

    ds = df.MapData(ds, tuple)  # for tensorflow.data.dataset
    ds = df.MultiProcessPrefetchData(ds, nr_prefetch=256, nr_proc=num_process)
    ds = df.PrefetchDataZMQ(ds, nr_proc=1)
    return ds
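
All of these snippets follow the same tensorpack recipe: build a DataFlow, map each datapoint, parallelize with PrefetchDataZMQ, then batch. For orientation, here is a minimal self-contained sketch of that recipe; the toy data and sizes are illustrative, not from any of the projects listed here:

import numpy as np
from tensorpack import dataflow as df

# toy datapoints of the form [image, label]
samples = [[np.random.rand(32, 32, 3).astype('float32'), i % 10]
           for i in range(1000)]

ds = df.DataFromList(samples, shuffle=True)
ds = df.MapDataComponent(ds, lambda img: img * 2.0 - 1.0, index=0)  # rescale to [-1, 1)
# caveat: with nr_proc > 1 every worker runs a full copy of the pipeline,
# so use multiple processes only on randomized or infinite streams
ds = df.PrefetchDataZMQ(ds, nr_proc=4)
ds = df.BatchData(ds, batch_size=64)
ds.reset_state()  # required once before iterating; re-seeds RNGs after the fork

images, labels = next(ds.get_data())  # images: (64, 32, 32, 3), labels: (64,)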
Code example #2
File: data_util_nbv.py Project: AlexPop104/PC_NBV
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False):
    #df = dataflow.LMDBSerializer.load("/home/cuda/Alex/PC-NBV/data/train.lmdb", shuffle=False)

    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    #df = dataflow.LMDBSerializer.load("/home/cuda/Alex/PC-NBV/data/", shuffle=False)

    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, num_prefetch=500, num_proc=1)
        # df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, num_proc=8)
        #df = dataflow.PrefetchData(df,num_prefetch=500, num_proc=1)
        #df = dataflow.PrefetchDataZMQ(df, num_proc=1)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()

    return df, size
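
The commented-out variants in this example exist because tensorpack renamed these APIs over time: nr_prefetch/nr_proc became num_prefetch/num_proc, and PrefetchData/PrefetchDataZMQ became MultiProcessRunner/MultiProcessRunnerZMQ. A hedged compatibility sketch (verify the names against your installed version):

from tensorpack import dataflow

# prefer the new class name when it exists, fall back to the old alias
RunnerZMQ = getattr(dataflow, 'MultiProcessRunnerZMQ', dataflow.PrefetchDataZMQ)

ds = dataflow.DataFromList([[0], [1], [2]], shuffle=False)
ds = RunnerZMQ(ds, 1)  # passing the count positionally sidesteps the nr_proc/num_proc rename
ds.reset_state()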
Code example #3
File: data.py Project: hhjung1202/Prob_network
    def __init__(self,
                 mode,
                 batch_size=256,
                 shuffle=False,
                 num_workers=25,
                 cache=50000,
                 collate_fn=default_collate,
                 drop_last=False,
                 cuda=False):
        # enumerate standard imagenet augmentors
        imagenet_augmentors = fbresnet_augmentor(mode == 'train')

        # load the lmdb if we can find it
        lmdb_loc = os.path.join(os.environ['IMAGENET'],
                                'ILSVRC-%s.lmdb' % mode)
        ds = td.LMDBData(lmdb_loc, shuffle=False)
        ds = td.LocallyShuffleData(ds, cache)
        ds = td.PrefetchData(ds, 5000, 1)
        ds = td.LMDBDataPoint(ds)
        ds = td.MapDataComponent(ds,
                                 lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR),
                                 0)
        ds = td.AugmentImageComponent(ds, imagenet_augmentors)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.cuda = cuda
Code example #4
File: seqimagenet.py Project: ziqi-zhang/Renofeation
    def __init__(self,
                 mode,
                 batch_size=256,
                 shuffle=False,
                 num_workers=25,
                 cache=50000,
                 collate_fn=default_collate,
                 remainder=False,
                 cuda=False,
                 transform=None):
        # enumerate standard imagenet augmentors
        #imagenet_augmentors = fbresnet_augmentor(mode == 'train')
        imagenet_augmentors = [ImgAugTVCompose(transform)]

        # load the lmdb if we can find it
        lmdb_loc = os.path.join(os.environ['IMAGENET'],
                                'ILSVRC-%s.lmdb' % mode)
        ds = td.LMDBData(lmdb_loc, shuffle=False)
        if mode == 'train':
            ds = td.LocallyShuffleData(ds, cache)
        ds = td.PrefetchData(ds, 5000, 1)
        ds = td.LMDBDataPoint(ds)
        #ds = td.MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
        ds = td.MapDataComponent(
            ds, lambda x: np.asarray(Image.open(io.BytesIO(x)).convert('RGB')),
            0)
        ds = td.AugmentImageComponent(ds, imagenet_augmentors)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size, remainder=remainder)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.cuda = cuda
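
Examples #3 and #4 differ mainly in how the raw JPEG buffer from the LMDB is decoded. Both decoders below return an HWC uint8 array, but note the channel-order gotcha: cv2 yields BGR while PIL yields RGB. A sketch, assuming the component is a raw byte buffer:

import io

import cv2
import numpy as np
from PIL import Image

def decode_cv2(buf):
    # OpenCV decodes into BGR channel order
    return cv2.imdecode(np.frombuffer(buf, np.uint8), cv2.IMREAD_COLOR)

def decode_pil(buf):
    # PIL decodes into RGB channel order
    return np.asarray(Image.open(io.BytesIO(buf)).convert('RGB'))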
Code example #5
def lmdb_dataflow(lmdb_path,
                  batch_size,
                  sample_size,
                  is_training,
                  test_speed=False,
                  train_perturb_list=None,
                  valid_perturb_list=None,
                  so3_perturb=False,
                  use_partial=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
    df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = PreprocessData(df,
                        sample_size,
                        is_training,
                        train_perturb_list=train_perturb_list,
                        valid_perturb_list=valid_perturb_list,
                        so3_perturb=so3_perturb,
                        use_partial=use_partial)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
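
use_list=True here (and in examples #14, #16 and #18) makes BatchData return each component as a plain Python list instead of np.stack-ing it into one array, which is necessary when datapoints in a batch can have different shapes, e.g. partial point clouds. A toy illustration:

import numpy as np
from tensorpack import dataflow

# point clouds of varying size cannot be stacked into a single array
clouds = [[np.random.rand(np.random.randint(500, 1500), 3)] for _ in range(32)]
ds = dataflow.DataFromList(clouds, shuffle=False)
ds = dataflow.BatchData(ds, 8, use_list=True)
ds.reset_state()
batch = next(ds.get_data())  # batch[0] is a list of 8 differently-sized arrays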
Code example #6
def data_pipe(fmri_files,
              confound_files,
              label_matrix,
              target_name=None,
              batch_size=32,
              data_type='train',
              train_percent=0.8,
              nr_thread=nr_thread,
              buffer_size=buffer_size):
    assert data_type in ['train', 'val', 'test']
    assert fmri_files is not None

    print('\n\nGenerating dataflow for %s datasets \n' % data_type)

    buffer_size = min(len(fmri_files), buffer_size)
    nr_thread = min(len(fmri_files), nr_thread)

    ds0 = gen_fmri_file(fmri_files,
                        confound_files,
                        label_matrix,
                        data_type=data_type,
                        train_percent=train_percent)
    print('dataflowSize is ' + str(ds0.size()))
    print('Loading data using %d threads with %d buffer_size ... \n' %
          (nr_thread, buffer_size))

    if target_name is None:
        target_name = np.unique(label_matrix)

    ####running the model
    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    ds1 = dataflow.MultiThreadMapData(
        ds0,
        nr_thread=nr_thread,
        map_func=lambda dp: map_load_fmri_image(dp, target_name),
        buffer_size=buffer_size,
        strict=True)

    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)

    ds1 = split_samples(ds1)
    print('prefetch dataflowSize is ' + str(ds1.size()))

    ds1 = dataflow.LocallyShuffleData(ds1,
                                      buffer_size=ds1.size() * buffer_size)

    ds1 = dataflow.BatchData(ds1, batch_size=batch_size)
    print('Time Usage of loading data in seconds: {} \n'.format(
        time.perf_counter() - start_time))

    ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    ds1.reset_state()  # public equivalent of the private ds1._reset_once() used originally

    #return ds1.get_data()
    for df in ds1.get_data():
        ##print(np.expand_dims(df[0].astype('float32'),axis=3).shape)
        yield (np.expand_dims(df[0].astype('float32'), axis=3),
               to_categorical(df[1].astype('int32'), len(target_name)))
Code example #7
    def __init__(
        self,
        corpus_path,
        tokenizer,
        seq_len,
        encoding='utf-8',
        predict_feature=False,
        hard_negative=False,
        batch_size=512,
        shuffle=False,
        num_workers=25,
        cache=50000,
        drop_last=False,
        cuda=False,
        distributed=False,
        visualization=False,
    ):

        if dist.is_available() and distributed:
            # num_replicas = dist.get_world_size()
            # assert num_replicas == 8
            rank = dist.get_rank()
            lmdb_file = '/mnt3/xuesheng/features_lmdb/CC/training_feat_part_' + str(
                rank) + '.lmdb'
            # if not os.path.exists(lmdb_file):
            # lmdb_file = "/srv/share/datasets/conceptual_caption/training_feat_part_" + str(rank) + ".lmdb"
        else:
            # lmdb_file = "/coc/dataset/conceptual_caption/training_feat_all.lmdb"
            # if not os.path.exists(lmdb_file):
            lmdb_file = '/mnt3/xuesheng/features_lmdb/CC/training_feat_part_0.lmdb'

        caption_path = '/mnt3/xuesheng/features_lmdb/CC/caption_train.json'
        print('Loading from %s' % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=True)
        self.num_dataset = len(ds)

        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            seq_len,
            36,
            self.num_dataset,
            encoding='utf-8',
            predict_feature=predict_feature,
        )

        # ds = td.LocallyShuffleData(ds, cache)
        # ds = td.PrefetchData(ds, 5000, 1)
        ds = td.MapData(ds, preprocess_function)
        # self.ds = td.PrefetchData(ds, 1)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        # self.ds = ds
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
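
Examples #7, #8, #12 and #20 are variations of the same Conceptual Captions loader: an LMDB source, a BertPreprocessBatch map, ZMQ prefetching, then batching; only the preprocessing flags and paths differ. The shared skeleton, distilled with a trivial identity map standing in for the project-specific BertPreprocessBatch:

import tensorpack.dataflow as td

ds = td.DataFromList([[b'features', 'a caption']] * 100, shuffle=False)
ds = td.MapData(ds, lambda dp: dp)        # stand-in for BertPreprocessBatch
ds = td.PrefetchDataZMQ(ds, 2)
ds = td.BatchData(ds, 16, use_list=True)  # keep raw components as lists
ds.reset_state()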
Code example #8
    def __init__(self,
                 corpus_path,
                 tokenizer,
                 seq_len,
                 encoding="utf-8",
                 predict_feature=False,
                 batch_size=512,
                 shuffle=False,
                 num_workers=25,
                 cache=10000,
                 drop_last=False,
                 cuda=False,
                 distributed=False,
                 visualization=False,
                 span_mask=False,
                 cond_mask=False,
                 region_len=36):

        if dist.is_available() and distributed:
            rank = dist.get_rank()
            lmdb_file = os.path.join(
                corpus_path, "training_feat_part_" + str(rank) + ".lmdb")
        else:
            lmdb_file = os.path.join(corpus_path, "training_feat_all.lmdb")

        caption_path = os.path.join(corpus_path, "caption_train.json")

        print("Loading from %s" % lmdb_file)

        os.listdir(corpus_path)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)

        self.cond_mask = cond_mask

        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            seq_len,
            region_len,
            self.num_dataset,
            encoding="utf-8",
            predict_feature=predict_feature,
            span_mask=span_mask,
            cond_mask=cond_mask)

        # ds = td.LocallyShuffleData(ds, cache)
        ds = td.PrefetchData(ds, 5000, 1)
        ds = td.MapData(ds, preprocess_function)
        # self.ds = td.PrefetchData(ds, 1)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        # self.ds = ds
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
Code example #9
    def __init__(self,
                 datafile,
                 batch_size,
                 num_workers=1,
                 nviews=12,
                 reset=True,
                 augment=False,
                 filter_classes=None,
                 filter_views=None,
                 polarmode='cartesian',
                 shuffle=True,
                 filter_ids=None,
                 label_to0idx=False,
                 rgb=False,
                 force_res=0,
                 autocrop=False,
                 keep_aspect_ratio=False):
        self.filter_classes = filter_classes
        self.filter_views = filter_views
        self.filter_ids = filter_ids
        self.polarmode = polarmode
        self.label_to0idx = label_to0idx
        self.rgb = rgb
        self.force_res = force_res
        self.autocrop = autocrop
        self.keep_aspect_ratio = keep_aspect_ratio

        if not isinstance(datafile, list):
            datafile = [datafile]

        ds = []
        for d in datafile:

            ds.append(df.LMDBSerializer.load(d, shuffle=shuffle))

            if shuffle:
                ds[-1] = df.LocallyShuffleData(ds[-1], 100)
            ds[-1] = df.PrefetchData(ds[-1], 20, 1)

            ds[-1] = df.MapData(ds[-1], self.load)
            if augment:
                ds[-1] = df.MapDataComponent(ds[-1], LMDBMultiView._augment, 0)

            if (not filter_classes and not filter_ids and num_workers > 1):
                # warning: skipping ZMQ prefetch here is slower when filtering
                #          datasets, but epoch counting would be wrong otherwise
                ds[-1] = df.PrefetchDataZMQ(ds[-1], num_workers)
            ds[-1] = df.BatchData(ds[-1], batch_size)

            if reset:
                ds[-1].reset_state()

        self.ds = ds
Code example #10
    def __init__(self, num_split):

        lmdb_file = "/srv/share/vgoswami8/conceptual_captions/training_feat_all.lmdb"

        caption_path = "/srv/share/vgoswami8/conceptual_captions/caption_train.json"
        print("Loading from %s" % lmdb_file)
        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = int(len(ds) / num_split) + 1
        ds = td.PrefetchDataZMQ(ds, nr_proc=1)
        ds = td.FixedSizeData(ds, self.num_dataset, keep_state=True)
        self.ds = ds
        self.ds.reset_state()
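
FixedSizeData with keep_state=True is what carves one big LMDB stream into num_split chunks here: each pass yields num_dataset points, and because the underlying iterator is kept alive between passes, the next pass resumes where the previous one stopped instead of rewinding. A toy illustration:

from tensorpack import dataflow

ds = dataflow.DataFromList([[i] for i in range(10)], shuffle=False)
ds = dataflow.FixedSizeData(ds, 4, keep_state=True)
ds.reset_state()
print([dp[0] for dp in ds.get_data()])  # [0, 1, 2, 3]
print([dp[0] for dp in ds.get_data()])  # [4, 5, 6, 7] -- resumes, not restarts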
Code example #11
def get_dataflows(config):
    """
    construct and initialize dataflows based on config.
    """

    df = ExpertDataflow(config)
    df = tp_dataflow.PrefetchDataZMQ(df, nr_proc=16)
    df = tp_dataflow.BatchData(df, config['batch_size'], remainder=False)

    # initialize random number generator in child processes to unique values
    df.reset_state()

    return df
Code example #12
    def __init__(
        self,
        annotations_path,
        features_path,
        tokenizer,
        bert_model,
        seq_len,
        batch_size=512,
        num_workers=25,
        cache=10000,
        local_rank=-1,
        objective=0,
        num_locs=5,
        add_global_imgfeat=None,
    ):

        if dist.is_available() and local_rank != -1:
            rank = dist.get_rank()
            lmdb_file = os.path.join(
                features_path, "training_feat_part_" + str(rank) + ".lmdb")
        else:
            lmdb_file = os.path.join(features_path, "training_feat_all.lmdb")

            print("Loading from %s" % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)
        ds = td.LocallyShuffleData(ds, cache)
        caption_path = os.path.join(annotations_path, "caption_train.json")

        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            bert_model,
            seq_len,
            36,
            self.num_dataset,
            objective=objective,
            num_locs=num_locs,
        )

        ds = td.PrefetchData(ds, 5000, 1)
        ds = td.MapData(ds, preprocess_function)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.add_global_imgfeat = add_global_imgfeat
        self.num_locs = num_locs
Code example #13
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False):
    """load LMDB files, then generate batches??"""
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)  # prefetch in one background process
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
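
The LMDB files these loaders consume are produced by LMDBSerializer.save. An end-to-end sketch pairing that save step with the load pattern above (the path and toy data are illustrative):

import numpy as np
from tensorpack import dataflow

# write a small LMDB once...
points = [[np.random.rand(1024, 3).astype('float32')] for _ in range(100)]
dataflow.LMDBSerializer.save(
    dataflow.DataFromList(points, shuffle=False), '/tmp/toy.lmdb')

# ...then load it the way the examples above do
df = dataflow.LMDBSerializer.load('/tmp/toy.lmdb', shuffle=False)
df = dataflow.LocallyShuffleData(df, buffer_size=50)
df = dataflow.BatchData(df, 8, use_list=True)
df = dataflow.RepeatedData(df, -1)  # infinite stream, as in the training branches
df.reset_state()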
Code example #14
File: data_util.py Project: mihaibujanca/pcn
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False):
    df = dataflow.LMDBData(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
    df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = dataflow.LMDBDataPoint(df)
    df = PreprocessData(df, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
Code example #15
    def __init__(self, split, batch_size, set_size):
        if split == 'train':
            lmdb_path = f'{data_path}/ModelNet40_train_1024_middle.lmdb'
        else:
            lmdb_path = f'{data_path}/ModelNet40_test_1024_middle.lmdb'
        df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
        self.size = df.size()
        self.num_batches = self.size // batch_size
        if split == 'train':
            df = dataflow.LocallyShuffleData(df, buffer_size=2000)
            df = dataflow.PrefetchData(df, num_prefetch=500, num_proc=1)
        df = BatchData(df, batch_size, set_size // 8, set_size - set_size // 8)
        if split == 'train':
            df = dataflow.PrefetchDataZMQ(df, num_proc=8)
        df = dataflow.RepeatedData(df, -1)
        df.reset_state()
        self.generator = df.get_data()
Code example #16
def lmdb_dataflow(lmdb_path,
                  batch_size,
                  num_points,
                  shuffle,
                  task,
                  render=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if render:
        df = VirtualRenderData(df)
    if num_points is not None:
        df = ResampleData(df, num_points, task)
    if shuffle:
        df = dataflow.LocallyShuffleData(df, 1000)
        df = dataflow.PrefetchDataZMQ(df, 8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    df.reset_state()
    return df, size
Code example #17
File: write_lmdb.py Project: wentaoyuan/pcd_seg
def process_dataset(args):
    sample_num = args.m
    split = args.dataset
    args.is_test = split == 'test'
    for cat_name in args.c:
        synset_id = synset_ids[cat_name]
        print('Processing', synset_id, cat_name)
        data_dir = os.path.join('../data', split + '_data', synset_id)
        label_dir = os.path.join('../data', split + '_label', synset_id)
        file_names = [
            os.path.splitext(entry)[0] for entry in os.listdir(data_dir)
            if entry.endswith('.pts')
        ]
        df = graph_df(file_names, data_dir, label_dir, args)
        output_path = os.path.join(
            '../data/lmdb', '%s_%d_%s.lmdb' % (cat_name, sample_num, split))
        if os.path.exists(output_path):
            os.system('rm -f %s' % output_path)
        df = dataflow.PrefetchDataZMQ(df, nr_proc=1)
        dftools.dump_dataflow_to_lmdb(df, output_path, write_frequency=args.w)
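
dftools.dump_dataflow_to_lmdb is the older spelling of the serialization step; recent tensorpack releases expose the same functionality as LMDBSerializer.save. A hedged drop-in for the last line above, keeping its arguments:

from tensorpack.dataflow import LMDBSerializer

LMDBSerializer.save(df, output_path, write_frequency=args.w)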
Code example #18
def lmdb_dataflow(lmdb_path,
                  batch_size,
                  input_size,
                  output_size,
                  is_training,
                  test_speed=False,
                  filter_rate=0):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    df = dataflow.MapData(df,
                          lambda dp: [item for item in dp] + [random.random()])

    size = df.size()
    print(size)
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size
Code example #19
            ]
        elif train_or_valid in ['valid', 'validation']:
            synsets = [
                line.strip()
                for line in open(data_path+'/imagenet_2012_validation_synset_labels.txt').readlines()
            ]
            self.datapoints = [
                [base_path + 'Data/CLS-LOC/val/ILSVRC2012_val_%08d.JPEG' % (i+1), int(self.maps['synset2idx'][synset])]
                for i, synset in enumerate(synsets)
            ]
        else:
            raise ValueError("train_or_valid=%s is invalid; must be 'train' or 'valid'" % train_or_valid)


if __name__ == '__main__':
    import argparse
    import tensorpack.dataflow as df

    parser = argparse.ArgumentParser(description='Imagenet Dataset on Kakao Example')
    parser.add_argument('--service-code', type=str, required=True,
                        help='licence key')
    parser.add_argument('--name', type=str, default='train',
                        help='train or valid')
    args = parser.parse_args()

    ds = ILSVRC12(args.service_code, args.name).parallel(num_threads=32)
    if args.name in ['train', 'training']:
        ds = df.PrefetchDataZMQ(ds, nr_proc=2)
    
    df.TestDataSpeed(ds, size=5000).start()
Code example #20
    def __init__(
        self,
        corpus_path,
        tokenizer,
        bert_model,
        seq_len,
        encoding="utf-8",
        visual_target=0,
        hard_negative=False,
        batch_size=512,
        shuffle=False,
        num_workers=25,
        cache=10000,
        drop_last=False,
        cuda=False,
        local_rank=-1,
        objective=0,
        visualization=False,
    ):
        TRAIN_DATASET_SIZE = 3119449

        if dist.is_available() and local_rank != -1:

            num_replicas = dist.get_world_size()
            rank = dist.get_rank()

            lmdb_file = os.path.join(
                corpus_path, "training_feat_part_" + str(rank) + ".lmdb")
        else:
            lmdb_file = os.path.join(corpus_path,
                                     "gqa_resnext152_faster_rcnn_genome.lmdb")
            # lmdb_file = os.path.join(corpus_path, "validation_feat_all.lmdb")

            print("Loading from %s" % lmdb_file)

        ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
        self.num_dataset = len(ds)
        ds = td.LocallyShuffleData(ds, cache)
        caption_path = os.path.join(corpus_path, "caption_train.json")
        # caption_path = os.path.join(corpus_path, "caption_val.json")

        preprocess_function = BertPreprocessBatch(
            caption_path,
            tokenizer,
            bert_model,
            seq_len,
            36,
            self.num_dataset,
            encoding="utf-8",
            visual_target=visual_target,
            objective=objective,
        )

        ds = td.PrefetchData(ds, 5000, 1)
        ds = td.MapData(ds, preprocess_function)
        # self.ds = td.PrefetchData(ds, 1)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        # self.ds = ds
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
Code example #21
def dataflow(centroids, num_refs=3, num_process=16, shuffle=False):
    """
    Compute graph to retrieve 3 reference and 1 target frames from Kinetics.

    Downsample grayscale frames to 256x256 and colorized frames to 32x32
    feature maps in Lab colorspace. Cluster colors in colorized frames.

    Returned tensors are of shape (num_refs + 1, 256, 256, 1)
    and (num_refs + 1, 32, 32, 1) each. Instead of colorized output,
    cluster centroid index is returned.

    :return: (grayscale input, cluster indices for colorized output)
    """
    config = Config.get_instance()
    kinetics_dirpath = config['data_dir']['kinetics']

    # get frame and 3 prior reference frames with certain number of skips
    data = Kinetics(kinetics_dirpath,
                    num_frames=num_refs + 1,
                    skips=[0, 4, 4, 8][:num_refs + 1],
                    shuffle=shuffle)

    # downsample frames to 256x256
    data = df.MapDataComponent(data,
                               ImageProcessor.resize(small_axis=256),
                               index=1)
    data = df.MapDataComponent(data,
                               ImageProcessor.crop(shape=(256, 256)),
                               index=1)
    # data = df.MapDataComponent(
    #    data, lambda images: [cv2.resize(image, (256, 256)) for image in images], index=1)

    # split frames into 3 references and 1 target frame
    # create deep copies of each at odd indices
    data = df.MapData(
        data, lambda dp: [
            dp[1][:num_refs],
            copy.deepcopy(dp[1][:num_refs]), dp[1][num_refs:],
            copy.deepcopy(dp[1][num_refs:])
        ])

    # decolorize first set of reference and target frames as (256, 256, 1)
    for idx in [0, 2]:
        data = df.MapDataComponent(
            data,
            lambda images: [
                np.int32(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)).reshape(
                    256, 256, 1) for image in images
            ],
            index=idx)

    for idx in [1, 3]:
        # downsample to 32x32 feature map
        data = df.MapDataComponent(
            data,
            lambda images: [cv2.resize(image, (32, 32)) for image in images],
            index=idx)

        # discard grayscale L space, keep only 'ab' from Lab color space
        # scale from 0-255 to 0-1 for clustering in next step
        data = df.MapDataComponent(
            data,
            lambda images: [
                cv2.cvtColor(np.float32(image / 255.0), cv2.COLOR_BGR2Lab)
                [:, :, 1:] for image in images
            ],
            index=idx)

        # find nearest color cluster index for every pixel in ref and target
        data = df.MapDataComponent(
            data,
            lambda images:
            [get_cluster_labels(image, centroids) for image in images],
            index=idx)

    # combine ref and target frames into (num_refs + 1, dim, dim, 1) tensor
    # for both grayscale and colorized feature maps respectively
    # generates [input tensor, output tensor]
    data = df.MapData(
        data, lambda dp:
        [np.stack(dp[0] + dp[2], axis=0),
         np.stack(dp[1] + dp[3], axis=0)])

    # important for tensorflow.data.dataset
    # does not do what it is supposed to do
    data = df.MapData(data, tuple)

    # prefetch 256 datapoints
    data = df.MultiProcessPrefetchData(data,
                                       nr_prefetch=256,
                                       nr_proc=num_process)
    data = df.PrefetchDataZMQ(data, nr_proc=1)

    return data
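
Example #21 factors the nearest-centroid lookup that example #1 inlines into a get_cluster_labels helper, which the excerpt does not show. A vectorized sketch consistent with the call site (the name and shapes are inferred from the surrounding code, not taken from the project):

import numpy as np

def get_cluster_labels(image, centroids):
    # image: (32, 32, 2) 'ab' feature map; centroids: (k, 2) color clusters
    flat = image.reshape(-1, 2)                                    # (1024, 2)
    dists = np.linalg.norm(flat[:, None, :] - centroids[None, :, :], axis=2)
    return np.argmin(dists, axis=1).reshape(32, 32, 1)             # cluster ids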
Code example #22
def data_pipe_3dcnn_block(fmri_files,
                          confound_files,
                          label_matrix,
                          target_name=None,
                          flag_cnn='3d',
                          block_dura=1,
                          hrf_delay=0,
                          batch_size=32,
                          data_type='train',
                          nr_thread=4,
                          buffer_size=10,
                          dataselect_percent=1.0,
                          seed=814,
                          verbose=0):
    assert data_type in ['train', 'val', 'test']
    assert flag_cnn in ['3d', '2d']
    assert fmri_files is not None
    isTrain = data_type == 'train'
    isVal = data_type == 'val'
    isTest = data_type == 'test'

    buffer_size = int(min(len(fmri_files), buffer_size))
    nr_thread = int(min(len(fmri_files), nr_thread))

    ds0 = gen_fmri_file(fmri_files,
                        confound_files,
                        label_matrix,
                        data_type=data_type,
                        seed=seed)

    if target_name is None:
        target_name = np.unique(label_matrix)
    ##Subject_Num, Trial_Num = np.array(label_matrix).shape

    ####running the model
    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    if flag_cnn == '2d':
        ds1 = dataflow.MultiThreadMapData(
            ds0,
            nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_block(
                dp, target_name, block_dura=block_dura, hrf_delay=hrf_delay),
            buffer_size=buffer_size,
            strict=True)
    elif flag_cnn == '3d':
        ds1 = dataflow.MultiThreadMapData(
            ds0,
            nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_3d_block(
                dp, target_name, block_dura=block_dura, hrf_delay=hrf_delay),
            buffer_size=buffer_size,
            strict=True)

    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)  ##1

    ds1 = split_samples(ds1,
                        subject_num=len(fmri_files),
                        batch_size=batch_size,
                        dataselect_percent=dataselect_percent)
    dataflowSize = ds1.size()

    if isTrain:
        if verbose:
            print('%d #Trials/Samples per subject with %d channels in tc' %
                  (ds1.Trial_Num, ds1.Block_dura))
        Trial_Num = ds1.Trial_Num
        ds1 = dataflow.LocallyShuffleData(ds1,
                                          buffer_size=Trial_Num * buffer_size,
                                          shuffle_interval=Trial_Num *
                                          buffer_size // 2)  #//2

    ds1 = dataflow.BatchData(ds1, batch_size=batch_size)

    if verbose:
        print('\n\nGenerating dataflow for %s datasets \n' % data_type)
        print('dataflowSize is ' + str(ds0.size()))
        print('Loading data using %d threads with %d buffer_size ... \n' %
              (nr_thread, buffer_size))
        print('prefetch dataflowSize is ' + str(dataflowSize))

        print('Time Usage of loading data in seconds: {} \n'.format(
            time.perf_counter() - start_time))

    if isTrain:
        ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=nr_thread)  ##1
    else:
        ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)  ##1
    ##ds1._reset_once()
    ds1.reset_state()

    for df in ds1.get_data():
        yield (df[0].astype('float32'),
               one_hot(df[1],
                       len(target_name) + 1).astype('uint8'))


###end of tensorpack: multithread
##############################################################
def data_pipe_3dcnn_block(fmri_files,
                          confound_files,
                          label_matrix,
                          target_name=None,
                          flag_cnn='3d',
                          block_dura=1,
                          batch_size=32,
                          data_type='train',
                          nr_thread=nr_thread,
                          buffer_size=buffer_size):
    assert data_type in ['train', 'val', 'test']
    assert flag_cnn in ['3d', '2d']
    assert fmri_files is not None
    isTrain = data_type == 'train'
    isVal = data_type == 'val'

    print('\n\nGenerating dataflow for %s datasets \n' % data_type)

    buffer_size = int(min(len(fmri_files), buffer_size))
    nr_thread = int(min(len(fmri_files), nr_thread))

    ds0 = gen_fmri_file(fmri_files,
                        confound_files,
                        label_matrix,
                        data_type=data_type)
    print('dataflowSize is ' + str(ds0.size()))
    print('Loading data using %d threads with %d buffer_size ... \n' %
          (nr_thread, buffer_size))

    if target_name is None:
        target_name = np.unique(label_matrix)
    ##Subject_Num, Trial_Num = np.array(label_matrix).shape

    ####running the model
    start_time = time.perf_counter()
    if flag_cnn == '2d':
        ds1 = dataflow.MultiThreadMapData(
            ds0,
            nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_block(
                dp, target_name, block_dura=block_dura),
            buffer_size=buffer_size,
            strict=True)
    elif flag_cnn == '3d':
        ds1 = dataflow.MultiThreadMapData(
            ds0,
            nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_3d_block(
                dp, target_name, block_dura=block_dura),
            buffer_size=buffer_size,
            strict=True)

    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)

    ds1 = split_samples(ds1)
    print('prefetch dataflowSize is ' + str(ds1.size()))

    if isTrain:
        print('%d #Trials/Samples per subject with %d channels in tc' %
              (ds1.Trial_Num, ds1.Block_dura))
        Trial_Num = ds1.Trial_Num
        #ds1 = dataflow.LocallyShuffleData(ds1, buffer_size=ds1.size() * buffer_size)
        ds1 = dataflow.LocallyShuffleData(ds1,
                                          buffer_size=Trial_Num * buffer_size,
                                          shuffle_interval=Trial_Num *
                                          buffer_size)  #//2

    ds1 = dataflow.BatchData(ds1, batch_size=batch_size, remainder=True)
    print('Time Usage of loading data in seconds: {} \n'.format(
        time.perf_counter() - start_time))

    ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    #ds1._reset_once()
    ##ds1.reset_state()
    '''
    for df in ds1.get_data():
        if flag_cnn == '2d':
            yield (df[0].astype('float32'),to_categorical(df[1].astype('int32'), len(target_name)))
        elif flag_cnn == '3d':
            yield (df[0].astype('float32'),to_categorical(df[1].astype('int32'), len(target_name)))
    '''
    return ds1
Code example #24
File: ilsvrc.py Project: zyclarkcheng/dataflow
        logging.basicConfig(level=logging.INFO,
                            format='[%(asctime)s %(levelname)s] %(message)s',
                            filename=args.log_filename)

    if args.augment:
        augmentors = fbresnet_augmentor(isTrain=True)
    else:
        augmentors = [
            df.imgaug.Resize((128, 128)),
        ]

    ds = dataflow.dataset.ILSVRC12(
        args.service_code, 'train',
        shuffle=True).parallel(num_threads=args.threads)
    ds = df.AugmentImageComponent(ds, augmentors, copy=False)
    ds = df.PrefetchDataZMQ(ds, nr_proc=args.process)
    if args.view:
        ds = dataflow.utils.image.Viewer(ds,
                                         lambda x: x[1] == 4,
                                         'label-4',
                                         prob=1.0,
                                         pos=(0, (128 + 64) * 0))
        ds = dataflow.utils.image.Viewer(ds,
                                         lambda x: x[1] == 16,
                                         'label-16',
                                         prob=1.0,
                                         pos=(0, (128 + 64) * 1))
        ds = dataflow.utils.image.Viewer(ds,
                                         lambda x: x[1] == 32,
                                         'label-32',
                                         prob=1.0,