Example #1
def get_imagenet_dataflow(datadir, name, batch_size, augmentors):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    cpu = min(30, multiprocessing.cpu_count())
    meta_dir = './ilsvrc_metadata'
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, meta_dir=meta_dir, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        ds = PrefetchDataZMQ(ds, cpu)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir,
                                   name,
                                   meta_dir=meta_dir,
                                   shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, cpu, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
Example #2
def get_imagenet_dataflow(
        datadir, name, batch_size,
        augmentors=None, parallel=None):
    """
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns: A DataFlow which produces BGR images and labels.
    """
    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)
    augmentors = AugmentorList(augmentors)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 2)  # assuming hyperthreading

    def mapf(dp):
        fname, label = dp
        img = cv2.imread(fname)
        img = augmentors.augment(img)
        return img, label

    if isTrain:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=True)
        ds = MultiProcessMapDataZMQ(ds, parallel, mapf, buffer_size=2000)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
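A dataflow built this way typically has to have reset_state() called once before manual iteration. A minimal usage sketch for the function above (the dataset path and batch size are hypothetical):

# Usage sketch (hypothetical path and batch size).
df = get_imagenet_dataflow('/path/to/ILSVRC12', 'train', 64)
df.reset_state()             # required once before iterating a tensorpack DataFlow by hand
for images, labels in df:    # images: a batch of BGR uint8 images, labels: a batch of ints
    break                    # hand the batch to the training loop instead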
Example #3
def get_iNaturalist_dataflow(
        datadir, name, batch_size,
        augmentors, parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 2)  # assuming hyperthreading
    if isTrain:
        ds = dataset.iNaturalist(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn("DataFlow may become the bottleneck when too few processes are used.")
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.iNaturalistFiles(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls
        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
Example #4
def get_data():
    def f(dp):
        im = dp[0][:, :, None]
        onehot = np.eye(10)[dp[1]]
        return [im, onehot]

    train = BatchData(MapData(dataset.Mnist('train'), f), 128)
    test = BatchData(MapData(dataset.Mnist('test'), f), 256)
    return train, test
Example #5
def get_data():
    def f(dp):
        im = dp[0][:, :, None]
        onehot = np.zeros(10, dtype='int32')
        onehot[dp[1]] = 1
        return [im, onehot]

    train = BatchData(MapData(dataset.Mnist('train'), f), 128)
    test = BatchData(MapData(dataset.Mnist('test'), f), 256)
    return train, test
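A short usage sketch for these MNIST helpers (illustrative only):

# Usage sketch (illustrative).
train_df, test_df = get_data()
train_df.reset_state()       # required before manual iteration
for im, onehot in train_df:
    # im: (128, 28, 28, 1) images, onehot: (128, 10) one-hot labels
    break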
Example #6
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors,
                          parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp

            jpeg_filename = os.path.basename(fname)
            jpeg_dirname = os.path.basename(os.path.dirname(fname))
            zip_filepath = os.path.dirname(fname) + '.zip'

            # read the JPEG bytes out of the per-class zip archive
            with zipfile.ZipFile(zip_filepath, 'r') as f:
                compress_jpeg = np.frombuffer(
                    f.read(os.path.join(jpeg_dirname, jpeg_filename)),
                    dtype=np.uint8)

            im = cv2.imdecode(compress_jpeg, cv2.IMREAD_COLOR)
            #im = cv2.imread(fname, cv2.IMREAD_COLOR)

            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
Example #7
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors=None,
                          parallel=None):
    """
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns: A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
Example #8
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors,
                          parallel=None):  # build the ImageNet training/validation dataflow
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:  # no explicit parallelism level given
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading: use half of the logical CPUs

    if isTrain:
        # ILSVRC12 yields decoded training images together with their labels
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors,
                                   copy=False)  # apply the augmentors to the image component in place
        if parallel < 16:  # warn when few worker processes are used
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)  # run the dataflow in parallel worker processes over ZMQ
        ds = BatchData(ds, batch_size, remainder=False)  # group datapoints into fixed-size batches
    else:
        # at evaluation time, read the image files and augment them on the fly;
        # ILSVRC12Files is like ILSVRC12 but yields filenames instead of decoded arrays
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)  # IMREAD_COLOR: load as a color image, ignoring any alpha channel
            im = aug.augment(im)  # apply the augmentors
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)  # map with multiple threads
        ds = BatchData(ds, batch_size, remainder=True)  # batch, keeping the last incomplete batch
        ds = PrefetchDataZMQ(ds, 1)
    return ds
Example #9
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors,
                          parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 6)
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )

        ds = PrefetchData(ds, 1000, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = np.zeros((256, 256, 3), dtype=np.uint8)
            for _ in range(30):
                try:
                    im = cv2.imread(fname, cv2.IMREAD_COLOR)
                    im = aug.augment(im)
                    break
                except Exception as e:
                    logger.warning("{} file={}".format(e, fname))
                    time.sleep(1)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchData(ds, 100, 1)
    return ds
Example #10
    def make_data(self, ds, is_training=True):

        if is_training:
            ds = MultiThreadMapData(ds, 10, self.train_map_func, buffer_size=200, strict=True)
        else:
            ds = MultiThreadMapData(ds, 5, self.val_map_func, buffer_size=200, strict=True)
        ds = BatchData(ds, cfg.TRAIN.num_gpu * cfg.TRAIN.batch_size, remainder=True, use_list=False)
        ds = MultiProcessPrefetchData(ds, 100,2)
        ds.reset_state()
        ds = ds.get_data()

        ###########
        # ds = data_set.shuffle(buffer_size=512)  # shuffle before loading images
        # ds = ds.repeat(cfg.TRAIN.epoch)
        # if is_training:
        #     ds = ds.map(self.train_map_func, num_parallel_calls=multiprocessing.cpu_count())  # decouple the heavy map_fn
        # else:
        #     ds = ds.map(self.val_map_func, num_parallel_calls=multiprocessing.cpu_count())  # decouple the heavy map_fn
        # ds = ds.batch(
        #     cfg.TRAIN.num_gpu * cfg.TRAIN.batch_size)  # TODO: consider using tf.contrib.map_and_batch
        #
        # ds = ds.prefetch(5 * cfg.TRAIN.num_gpu)
        # iterator = ds.make_one_shot_iterator()
        # one_element = iterator.get_next()
        # images, labels = one_element
        return ds
Example #11
def get_imagenet_dataflow(datadir, name, batch_size, parallel=None):
    """
    Get a standard imagenet training/evaluation dataflow, for linear classifier tuning.
    """
    assert name in ['train', 'val']
    isTrain = name == 'train'
    assert datadir is not None
    augmentors = get_basic_augmentor(isTrain)
    augmentors = imgaug.AugmentorList(augmentors)
    if parallel is None:
        parallel = min(50, mp.cpu_count())

    def mapper(dp):
        fname, label = dp
        img = cv2.imread(fname)
        img = augmentors.augment(img)
        return img, label

    if isTrain:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=True)
        ds = MultiProcessMapAndBatchDataZMQ(ds,
                                            parallel,
                                            mapper,
                                            batch_size,
                                            buffer_size=7000)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        ds = MultiThreadMapData(ds,
                                parallel,
                                mapper,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
Example #12
def get_val_dataflow(
        datadir, batch_size,
        augmentors, parallel=None,
        num_splits=None, split_index=None):
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())

    if num_splits is None:
        ds = dataset.ILSVRC12Files(datadir, 'val', shuffle=False)
    else:
        assert split_index < num_splits
        files = dataset.ILSVRC12Files(datadir, 'val', shuffle=False)
        files.reset_state()
        files = list(files.get_data())
        logger.info("#ValidationData = {}".format(len(files)))
        split_size = len(files) // num_splits
        start, end = split_size * split_index, split_size * (split_index + 1)
        end = min(end, len(files))
        logger.info("#ValidationSplit = {} - {}".format(start, end))
        files = files[start: end]
        ds = DataFromList(files, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def mapf(dp):
        fname, cls = dp
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        im = aug.augment(im)
        return im, cls
    ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
    ds = BatchData(ds, batch_size, remainder=True)
    # ds = PrefetchDataZMQ(ds, 1)
    # do not fork() under MPI
    return ds
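When evaluating under MPI, num_splits/split_index shard the validation files across ranks, which is also why the final PrefetchDataZMQ (a fork) is left commented out. A hedged sketch (the path is hypothetical; rank/world_size come from your MPI or horovod setup):

# Sharded validation sketch (hypothetical values).
rank, world_size = 0, 4
val_df = get_val_dataflow('/path/to/ILSVRC12', batch_size=50,
                          augmentors=fbresnet_augmentor(False),
                          num_splits=world_size, split_index=rank)
val_df.reset_state()
for images, labels in val_df:    # each rank only sees its slice of the files
    pass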
Example #13
    def val_dataloader(self):
        """Summary

        Returns:
            TYPE: Description
        """
        ds_valid = MultiLabelDataset(
            folder=self.hparams.data,
            is_train='valid',
            fname='covid_test_v5.csv',
            types=self.hparams.types,
            pathology=self.hparams.pathology,
            resize=int(self.hparams.shape),
        )

        ds_valid.reset_state()
        ag_valid = [
            imgaug.Resize(self.hparams.shape, interp=cv2.INTER_AREA),
            imgaug.ToFloat32(),
        ]
        ds_valid = AugmentImageComponent(ds_valid, ag_valid, 0)
        ds_valid = BatchData(ds_valid, self.hparams.batch, remainder=True)
        ds_valid = MultiProcessRunner(ds_valid, num_proc=4, num_prefetch=16)
        ds_valid = PrintData(ds_valid)
        ds_valid = MapData(
            ds_valid, lambda dp: [
                torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                torch.tensor(dp[1]).float()
            ])
        return ds_valid
Example #14
    def test_dataloader(self):
        """Summary

        Returns:
            TYPE: Description
        """
        ds_test = MultiLabelDataset(folder=self.hparams.data_path,
                                    is_train='test',
                                    fname='test.csv',
                                    types=self.hparams.types,
                                    pathology=self.hparams.pathology,
                                    resize=int(self.hparams.shape))

        ds_test.reset_state()
        ag_test = [
            imgaug.Albumentations(
                AB.SmallestMaxSize(self.hparams.shape, p=1.0)),
            iimgaug.ColorSpace(mode=cv2.COLOR_GRAY2RGB),
            imgaug.Albumentations(AB.CLAHE(p=1)),
            imgaug.ToFloat32(),
        ]
        ds_test = AugmentImageComponent(ds_test, ag_test, 0)
        ds_test = BatchData(ds_test, self.hparams.batch, remainder=True)
        # ds_test = MultiProcessRunner(ds_test, num_proc=4, num_prefetch=16)
        ds_test = PrintData(ds_test)
        ds_test = MapData(ds_test,
                          lambda dp: [torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                                      torch.tensor(dp[1]).float()])
        return ds_test
Example #15
    def test_dataloader(self):
        ds_test = MultiLabelDataset(folder=self.hparams.data,
                                    is_train='valid',
                                    fname='covid_test_v5.csv',
                                    types=self.hparams.types,
                                    pathology=self.hparams.pathology,
                                    resize=int(self.hparams.shape),
                                    fold_idx=None,
                                    n_folds=1)

        ds_test.reset_state()
        ag_test = [
            imgaug.Resize(self.hparams.shape, interp=cv2.INTER_AREA),
            imgaug.ToFloat32(),
        ]
        ds_test = AugmentImageComponent(ds_test, ag_test, 0)
        ds_test = BatchData(ds_test, self.hparams.batch, remainder=True)
        ds_test = MultiProcessRunner(ds_test, num_proc=4, num_prefetch=16)
        ds_test = PrintData(ds_test)
        ds_test = MapData(
            ds_test, lambda dp: [
                torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                torch.tensor(dp[1]).float()
            ])
        return ds_test
Example #16
def get_downsampled_imagenet_augmented_data(subset, options,
        do_multiprocess=True, do_validation=False, shuffle=None):
    isTrain = subset == 'train' and do_multiprocess
    shuffle = shuffle if shuffle is not None else isTrain

    reret = re.search(r'^imagenet([0-9]*)$', options.ds_name)
    input_size = int(reret.group(1))

    ds = DownsampledImageNet(_data_batch_dir(options.data_dir, input_size),\
         subset, shuffle, input_size, do_validation=do_validation)

    pp_mean = ds.mean_img
    paste_size = ds.input_size * 5 // 4
    crop_size = ds.input_size
    if isTrain:
        augmentors = [
            imgaug.CenterPaste((paste_size, paste_size)),
            imgaug.RandomCrop((crop_size, crop_size)),
            imgaug.Flip(horiz=True),
            imgaug.MapImage(lambda x: (x - pp_mean)/128.0),
        ]
    else:
        augmentors = [
            imgaug.MapImage(lambda x: (x - pp_mean)/128.0)
        ]
    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, options.batch_size // options.nr_gpu, remainder=not isTrain)
    if do_multiprocess:
        ds = PrefetchData(ds, 4, 2)
    return ds
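The options object above only needs a few fields; ds_name encodes the input resolution (e.g. 'imagenet32' or 'imagenet64'). A hypothetical sketch of those fields (an argparse.Namespace is just one convenient stand-in):

# Hypothetical options for the function above (field names taken from its body, values illustrative).
from argparse import Namespace
options = Namespace(ds_name='imagenet32', data_dir='/path/to/downsampled_imagenet',
                    batch_size=256, nr_gpu=1)
train_df = get_downsampled_imagenet_augmented_data('train', options, do_multiprocess=True)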
Example #17
def data_to_dataflow(x, y, config: DotMap) -> DataFlow:
    dataflow = data_to_generator(x, y, True)
    dataflow = GeneratorToDataFlow(dataflow)
    dataflow = BatchData(dataflow, config.trainer.batch_size)
    dataflow.reset_state()

    return dataflow
Example #18
    def test_dataloader(self):
        """Summary

        Returns:
            TYPE: Description
        """
        ds_test = CustomDataSet(folder=self.hparams.data,
                                train_or_valid='test',
                                size=np.inf,
                                hparams=self.hparams)

        ds_test.reset_state()
        ag_test = [
            imgaug.Resize(self.hparams.shape, interp=cv2.INTER_NEAREST),
            imgaug.ToFloat32(),
        ]
        # ds_test = AugmentImageComponent(ds_test, [imgaug.Albumentations(AB.CLAHE(tile_grid_size=(32, 32), always_apply=True, p=1)),], 0)
        ds_test = AugmentImageComponents(ds_test, ag_test, [0, 1])
        ds_test = BatchData(ds_test, self.hparams.batch, remainder=True)
        ds_test = MultiProcessRunner(ds_test, num_proc=4, num_prefetch=16)
        ds_test = PrintData(ds_test)
        ds_test = MapData(
            ds_test, lambda dp: [
                torch.tensor(dp[0][:, np.newaxis, :, :]).float(),
                torch.tensor(dp[1][:, np.newaxis, :, :]).float(),
            ])
        return ds_test
Example #19
def get_simple_val_dataflow(src,
                            batch_size=32,
                            augmentors=None,
                            parallel=None):
    if augmentors is None:
        augmentors = fbresnet_augmentor(False)
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())

    aug = imgaug.AugmentorList(augmentors)

    def mapf(dp):
        im, cls = dp
        im_new = im * 255
        im_new = im_new[:, :, ::-1]
        im_new = im_new.astype('uint8')
        im_new = aug.augment(im_new)
        return im_new, cls

    ds = MultiThreadMapData(src,
                            parallel,
                            mapf,
                            buffer_size=min(2000, src.size()),
                            strict=True)
    ds = BatchData(ds, batch_size, remainder=True)
    # do not fork() under MPI
    return ds
Example #20
def get_resnet_train_dataflow():
    imgs = ResnetDetection.load_many(
        config.BASEDIR, config.TRAIN_DATASET)
    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    imgs = list(imgs)

    ds = DataFromList(imgs, shuffle=True)
    augmentors = get_resnet_augmentor()
    def preprocess(img):
        im, fname, label = img['image_data'], img['id'], img['with_ship']
        im = cv2.imread(im)
        #============Aug================
        im = cv2.resize(im, (config.RESNET_SIZE, config.RESNET_SIZE))
        augmented = strong_aug()(image=im)
        im = augmented['image']
        # im, multi_mask = do_flip_transpose2(im, multi_mask, type=random.randint(0,7))
        #============================
        ret = [im, label]
        return ret
    ds = MapData(ds, preprocess)
    ds = AugmentImageComponent(ds, augmentors, copy=False)
    ds = BatchData(ds, config.RESNET_BATCH)
    ds = PrefetchDataZMQ(ds, 6)
    return ds
Example #21
def data_orig(num_readers=16, batch_size=16, is_vis=False, is_print=False):

    # we may not need to process
    t1_start = time()
    dr = data_raw(video_dir, data_dir, is_print=True)
    df = MyDataFlow(raw_data=dr, num_steps=5, is_training=True)
    # df = MultiProcessMapDataZMQ(df, 4, preprocess)
    df = BatchData(df, batch_size)
    df = PrefetchDataZMQ(df, num_readers)
    # df = PlasmaGetData(df)
    df.reset_state()
    t1_end = time()
    print("data loader preparation costs {} seconds".format(t1_end - t1_start))
    step = 0
    time_start = time()
    for datapoint in df:
        if is_print is True:
            for j in range(len(datapoint)):
                print(datapoint[j].shape)
        if is_vis is True:
            array_show(datapoint)
        step = step + 1
        print("now passed {} seconds".format(time() - t1_end))
    average_time = (time() - time_start) / 10
    return average_time
Example #22
def get_imagenet_dataflow(datadir,
                          is_train,
                          batch_size,
                          augmentors,
                          parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading
    if is_train:
        ds = dataset.ILSVRC12(datadir, "train", shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logging.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, "val", shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = np.flip(im, axis=2)
            # print("fname={}".format(fname))
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        # ds = MapData(ds, mapf)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
        # ds = PrefetchData(ds, 1)
    return ds
Example #23
    def build_iter(self):

        ds = DataFromGenerator(self.generator)
        ds = BatchData(ds, self.batch_size)
        ds = MultiProcessPrefetchData(ds, self.prefetch_size, self.process_num)
        ds.reset_state()
        ds = ds.get_data()
        return ds
Example #24
    def get_data(self, name, num_gpu):
        gpu_batch = self.batch_size // num_gpu

        assert name in ['train', 'val', 'test']
        isTrain = name == 'train'

        augmentors = fbresnet_augmentor(isTrain)
        assert isinstance(augmentors, list)

        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

        if isTrain:
            ds = dataset.ILSVRC12(self.datadir,
                                  name,
                                  shuffle=True,
                                  dir_structure='train')
            ds = AugmentImageComponent(ds, augmentors, copy=False)
            ds = MultiProcessRunnerZMQ(ds, parallel)
            ds = BatchData(ds, gpu_batch, remainder=False)
            #ds = QueueInput(ds)
        else:
            ds = dataset.ILSVRC12Files(self.datadir,
                                       name,
                                       shuffle=False,
                                       dir_structure='train')
            aug = imgaug.AugmentorList(augmentors)

            def mapf(dp):
                fname, cls = dp
                im = cv2.imread(fname, cv2.IMREAD_COLOR)
                im = aug.augment(im)
                return im, cls

            ds = MultiThreadMapData(ds,
                                    parallel,
                                    mapf,
                                    buffer_size=2000,
                                    strict=True)
            ds = BatchData(ds, gpu_batch, remainder=True)
            ds = MultiProcessRunnerZMQ(ds, 1)

            if num_gpu == 1:
                ds = QueueInput(ds)
        return ds
Example #25
    def prepared(self, num_gpu, batch_size, eval=False):
        # use a single process version to debug if needed
        if self.min_num_workers == 0:
            ds = MapData(self, self.ex_process.train_process)
        else:
            ds = MultiProcessMapData(self, max(num_gpu, self.min_num_workers),
                                     self.ex_process.train_process)
        return BatchData(ds, batch_size)
Example #26
    def train_dataloader(self):
        ds_train = MultiLabelDataset(folder=self.hparams.data,
                                     is_train='train',
                                     fname='covid_train_v5.csv',
                                     types=self.hparams.types,
                                     pathology=self.hparams.pathology,
                                     resize=int(self.hparams.shape),
                                     balancing=None)

        ds_train.reset_state()
        ag_train = [
            # imgaug.Albumentations(
            #     AB.SmallestMaxSize(self.hparams.shape, p=1.0)),
            imgaug.ColorSpace(mode=cv2.COLOR_GRAY2RGB),
            # imgaug.Affine(shear=10),
            imgaug.RandomChooseAug([
                imgaug.Albumentations(AB.Blur(blur_limit=4, p=0.25)),
                imgaug.Albumentations(AB.MotionBlur(blur_limit=4, p=0.25)),
                imgaug.Albumentations(AB.MedianBlur(blur_limit=4, p=0.25)),
            ]),
            imgaug.Albumentations(AB.CLAHE(tile_grid_size=(32, 32), p=0.5)),
            imgaug.RandomOrderAug([
                imgaug.Affine(shear=10,
                              border=cv2.BORDER_CONSTANT,
                              interp=cv2.INTER_AREA),
                imgaug.Affine(translate_frac=(0.01, 0.02),
                              border=cv2.BORDER_CONSTANT,
                              interp=cv2.INTER_AREA),
                imgaug.Affine(scale=(0.5, 1.0),
                              border=cv2.BORDER_CONSTANT,
                              interp=cv2.INTER_AREA),
            ]),
            imgaug.RotationAndCropValid(max_deg=10, interp=cv2.INTER_AREA),
            imgaug.GoogleNetRandomCropAndResize(
                crop_area_fraction=(0.8, 1.0),
                aspect_ratio_range=(0.8, 1.2),
                interp=cv2.INTER_AREA,
                target_shape=self.hparams.shape),
            imgaug.ColorSpace(mode=cv2.COLOR_RGB2GRAY),
            imgaug.ToFloat32(),
        ]
        ds_train = AugmentImageComponent(ds_train, ag_train, 0)
        # Label smoothing
        ag_label = [
            imgaug.BrightnessScale((0.8, 1.2), clip=False),
        ]
        # ds_train = AugmentImageComponent(ds_train, ag_label, 1)
        ds_train = BatchData(ds_train, self.hparams.batch, remainder=True)
        if self.hparams.debug:
            ds_train = FixedSizeData(ds_train, 2)
        ds_train = MultiProcessRunner(ds_train, num_proc=4, num_prefetch=16)
        ds_train = PrintData(ds_train)
        ds_train = MapData(
            ds_train, lambda dp: [
                torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                torch.tensor(dp[1]).float()
            ])
        return ds_train
Example #27
def preprocess_data_flow(ds, options, is_train, do_multiprocess=False):
    ds_size = ds.size()
    while options.batch_size > ds_size:
        options.batch_size //= 2
    ds = BatchData(ds, max(1, options.batch_size // options.nr_gpu),
        remainder=not is_train)
    if do_multiprocess:
        ds = PrefetchData(ds, 5, 5)
    return ds
Example #28
    def build_iter(self):

        map_func = partial(self._map_func, is_training=self.training_flag)
        ds = DataFromGenerator(self.generator)
        ds = BatchData(ds, self.num_gpu * self.batch_size)
        ds = MultiProcessPrefetchData(ds, self.prefetch_size, self.process_num)
        ds.reset_state()
        ds = ds.get_data()
        return ds
Example #29
def get_sequential_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """ Load a Single-File LMDB (Sequential Read)
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns: A LMDBData which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)

    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

    if isTrain:
        ds = LocallyShuffleData(ds, 50000)
        ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR),
                              0)
        ds = AugmentImageComponent(ds, aug, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = BatchData(ds, batch_size, remainder=False, use_list=True)
        ds = MultiProcessRunnerZMQ(ds, parallel)
    else:

        def mapper(data):
            im, label = data
            im = cv2.imdecode(im, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, label

        ds = MultiProcessMapDataZMQ(ds,
                                    parallel,
                                    mapper,
                                    buffer_size=2000,
                                    strict=True)
        ds = BatchData(ds, batch_size, remainder=True, use_list=True)
    return ds
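This loader expects a dataflow that yields (encoded JPEG bytes, label) pairs, typically a single LMDB file written with tensorpack's LMDBSerializer. A sketch under that assumption (the LMDB path is hypothetical; LMDBSerializer and fbresnet_augmentor are assumed importable):

# Usage sketch (hypothetical path).
lmdb = LMDBSerializer.load('/path/to/ILSVRC12-train.lmdb', shuffle=False)
df = get_sequential_loader(lmdb, isTrain=True, batch_size=64,
                           augmentors=fbresnet_augmentor(True))
df.reset_state()
for images, labels in df:    # use_list=True above, so these are Python lists of arrays
    break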
Example #30
def get_data(split, option):
    is_training = split == 'train'
    parallel = multiprocessing.cpu_count() // 2
    ds = get_data_flow(split, is_training, option)
    augmentors = fbresnet_augmentor(is_training, option)
    ds = AugmentImageCoordinates(ds, augmentors, coords_index=2, copy=False)
    if is_training:
        ds = PrefetchDataZMQ(ds, parallel)
    ds = BatchData(ds, option.batch_size, remainder=not is_training)
    return ds