Ejemplo n.º 1
0
    def test_dataloader(self):
        """Assemble the dataflow for the 'test' split of ``CustomDataSet``.

        Component 0 goes through CLAHE and then a resize + ``ToFloat32``
        stage. Each batch is finally converted into two float tensors, with a
        singleton axis inserted at position 1 of both components.

        Returns:
            The ``MapData`` dataflow yielding ``[tensor, tensor]`` per batch.
        """
        hp = self.hparams
        flow = CustomDataSet(folder=hp.data,
                             train_or_valid='test',
                             size=np.inf,
                             hparams=hp)
        flow.reset_state()

        resize_and_cast = [
            imgaug.Resize(hp.shape, interp=cv2.INTER_NEAREST),
            imgaug.ToFloat32(),
        ]
        contrast = [imgaug.Albumentations(AB.CLAHE(p=1))]

        # Both augmentation stages touch component 0 only: CLAHE first,
        # then the resize/cast stage.
        flow = AugmentImageComponent(flow, contrast, 0)
        flow = AugmentImageComponent(flow, resize_and_cast, 0)
        flow = BatchData(flow, hp.batch, remainder=True)
        # flow = MultiProcessRunner(flow, num_proc=4, num_prefetch=16)
        flow = PrintData(flow)

        def to_tensors(dp):
            # (N, H, W) -> (N, 1, H, W) for the first two components.
            return [
                torch.tensor(part[:, np.newaxis, :, :]).float()
                for part in (dp[0], dp[1])
            ]

        return MapData(flow, to_tensors)
Ejemplo n.º 2
0
def train_generator(ds, shape_aug=None, input_aug=None, label_aug=None, batch_size=16, nr_procs=8):
    """Wrap ``ds`` with optional augmentations, shape-based batching and prefetching.

    Args:
        ds: Source dataflow.
        shape_aug: Augmentors applied to components 0 and 1 together; skipped when None.
        input_aug: Augmentors applied to component 0 only; skipped when None.
        label_aug: Augmentors applied to component 1 only; skipped when None.
        batch_size: Forwarded to ``BatchDataByShape``.
        nr_procs: Forwarded to ``PrefetchDataZMQ``.

    Returns:
        The ``PrefetchDataZMQ`` dataflow wrapping the batched stream.
    """
    if shape_aug is not None:
        # Input and label go through the same augmentor list.
        ds = AugmentImageComponents(ds, shape_aug, (0, 1), copy=True)
    if input_aug is not None:
        # Input only (component 0).
        ds = AugmentImageComponent(ds, input_aug, index=0, copy=False)
    if label_aug is not None:
        # Label only (component 1).
        ds = AugmentImageComponent(ds, label_aug, index=1, copy=True)

    batched = BatchDataByShape(ds, batch_size, idx=0)
    return PrefetchDataZMQ(batched, nr_procs)
Ejemplo n.º 3
0
def train_generator_class(ds, shape_aug=None, input_aug=None, batch_size=16, nr_procs=8):
    """Wrap ``ds`` with optional input augmentations, shape-based batching and prefetching.

    Args:
        ds: Source dataflow.
        shape_aug: First augmentor list for component 0 (``copy=True``); skipped when None.
        input_aug: Second augmentor list for component 0 (``copy=False``); skipped when None.
        batch_size: Forwarded to ``BatchDataByShape``.
        nr_procs: Forwarded to ``PrefetchDataZMQ``.

    Returns:
        The ``PrefetchDataZMQ`` dataflow wrapping the batched stream.
    """
    # Both optional stages act on component 0; shape_aug runs before input_aug.
    for augmentors, copy_flag in ((shape_aug, True), (input_aug, False)):
        if augmentors is not None:
            ds = AugmentImageComponent(ds, augmentors, index=0, copy=copy_flag)

    return PrefetchDataZMQ(BatchDataByShape(ds, batch_size, idx=0), nr_procs)
Ejemplo n.º 4
0
def valid_generator(ds, shape_aug=None, input_aug=None, label_aug=None, batch_size=16, nr_procs=1):
    """Wrap ``ds`` with optional augmentations, fixed-size batching and caching.

    Args:
        ds: Source dataflow.
        shape_aug: Augmentors applied to components 0 and 1 together; skipped when None.
        input_aug: Augmentors applied to component 0 only; skipped when None.
        label_aug: Augmentors applied to component 1 only; skipped when None.
        batch_size: Forwarded to ``BatchData`` (called with ``remainder=True``).
        nr_procs: Accepted for signature parity; not used by this function.

    Returns:
        The ``CacheData`` dataflow wrapping the batched stream.
    """
    if shape_aug is not None:
        # Input and label go through the same augmentor list.
        ds = AugmentImageComponents(ds, shape_aug, (0, 1), copy=True)
    if input_aug is not None:
        # Input only (component 0).
        ds = AugmentImageComponent(ds, input_aug, index=0, copy=False)
    if label_aug is not None:
        # Label only (component 1).
        ds = AugmentImageComponent(ds, label_aug, index=1, copy=True)

    batched = BatchData(ds, batch_size, remainder=True)
    # Cache all inference batches.
    return CacheData(batched)
Ejemplo n.º 5
0
def valid_generator_class(ds, shape_aug=None, input_aug=None,
                          batch_size=16, nr_procs=1):
    """Wrap ``ds`` with optional input augmentations, fixed-size batching and caching.

    Args:
        ds: Source dataflow.
        shape_aug: First augmentor list for component 0 (``copy=True``); skipped when None.
        input_aug: Second augmentor list for component 0 (``copy=False``); skipped when None.
        batch_size: Forwarded to ``BatchData`` (called with ``remainder=True``).
        nr_procs: Accepted for signature parity; not used by this function.

    Returns:
        The ``CacheData`` dataflow wrapping the batched stream.
    """
    # Both optional stages act on component 0; shape_aug runs before input_aug.
    for augmentors, copy_flag in ((shape_aug, True), (input_aug, False)):
        if augmentors is not None:
            ds = AugmentImageComponent(ds, augmentors, index=0, copy=copy_flag)

    # Cache all inference batches.
    return CacheData(BatchData(ds, batch_size, remainder=True))
Ejemplo n.º 6
0
    def val_dataloader(self):
        """Assemble the validation dataflow from ``covid_test_v5.csv``.

        Component 0 is resized and passed through ``ToFloat32``. Batches are
        prefetched by four worker processes and finally converted to tensors,
        with axis 3 of component 0 moved to position 1.

        Returns:
            The ``MapData`` dataflow yielding ``[tensor, float tensor]`` per batch.
        """
        hp = self.hparams
        flow = MultiLabelDataset(
            folder=hp.data,
            is_train='valid',
            fname='covid_test_v5.csv',
            types=hp.types,
            pathology=hp.pathology,
            resize=int(hp.shape),
        )
        flow.reset_state()

        resize_and_cast = [
            imgaug.Resize(hp.shape, interp=cv2.INTER_AREA),
            imgaug.ToFloat32(),
        ]
        flow = AugmentImageComponent(flow, resize_and_cast, 0)
        flow = BatchData(flow, hp.batch, remainder=True)
        flow = MultiProcessRunner(flow, num_proc=4, num_prefetch=16)
        flow = PrintData(flow)

        def to_tensors(dp):
            # Axis order (0, 1, 2, 3) -> (0, 3, 1, 2) for component 0.
            images = np.transpose(dp[0], (0, 3, 1, 2))
            return [torch.tensor(images), torch.tensor(dp[1]).float()]

        return MapData(flow, to_tensors)
Ejemplo n.º 7
0
    def val_dataloader(self):
        """Assemble the dataflow for the 'valid' split of ``CustomDataSet``.

        Component 0 goes through CLAHE; components 0 and 1 are then both
        resized and passed through ``ToFloat32``. Batches are prefetched by
        four worker processes and converted into two float tensors, with a
        singleton axis inserted at position 1 of both components.

        Returns:
            The ``MapData`` dataflow yielding ``[tensor, tensor]`` per batch.
        """
        hp = self.hparams
        flow = CustomDataSet(folder=hp.data,
                             train_or_valid='valid',
                             size=np.inf,
                             hparams=hp)
        flow.reset_state()

        resize_and_cast = [
            imgaug.Resize(hp.shape, interp=cv2.INTER_NEAREST),
            imgaug.ToFloat32(),
        ]
        contrast = [
            imgaug.Albumentations(
                AB.CLAHE(tile_grid_size=(32, 32), always_apply=True, p=1)),
        ]

        # CLAHE touches component 0 only; resize/cast is shared by 0 and 1.
        flow = AugmentImageComponent(flow, contrast, 0)
        flow = AugmentImageComponents(flow, resize_and_cast, [0, 1])
        flow = BatchData(flow, hp.batch, remainder=True)
        flow = MultiProcessRunner(flow, num_proc=4, num_prefetch=16)
        flow = PrintData(flow)

        def to_tensors(dp):
            # (N, H, W) -> (N, 1, H, W) for the first two components.
            return [
                torch.tensor(part[:, np.newaxis, :, :]).float()
                for part in (dp[0], dp[1])
            ]

        return MapData(flow, to_tensors)
Ejemplo n.º 8
0
    def test_dataloader(self):
        """Assemble the test dataflow from ``test.csv``.

        Component 0 is rescaled with ``SmallestMaxSize``, converted with
        ``cv2.COLOR_GRAY2RGB``, passed through CLAHE and ``ToFloat32``.
        Batches are converted to tensors with axis 3 of component 0 moved to
        position 1.

        Returns:
            The ``MapData`` dataflow yielding ``[tensor, float tensor]`` per batch.
        """
        ds_test = MultiLabelDataset(folder=self.hparams.data_path,
                                    is_train='test',
                                    fname='test.csv',
                                    types=self.hparams.types,
                                    pathology=self.hparams.pathology,
                                    resize=int(self.hparams.shape))

        ds_test.reset_state()
        ag_test = [
            imgaug.Albumentations(
                AB.SmallestMaxSize(self.hparams.shape, p=1.0)),
            # Was spelled `iimgaug`, an undefined name that raised NameError
            # as soon as this loader was built.
            imgaug.ColorSpace(mode=cv2.COLOR_GRAY2RGB),
            imgaug.Albumentations(AB.CLAHE(p=1)),
            imgaug.ToFloat32(),
        ]
        ds_test = AugmentImageComponent(ds_test, ag_test, 0)
        ds_test = BatchData(ds_test, self.hparams.batch, remainder=True)
        # ds_test = MultiProcessRunner(ds_test, num_proc=4, num_prefetch=16)
        ds_test = PrintData(ds_test)
        ds_test = MapData(ds_test,
                          # Axis order (0, 1, 2, 3) -> (0, 3, 1, 2) for component 0.
                          lambda dp: [torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                                      torch.tensor(dp[1]).float()])
        return ds_test
Ejemplo n.º 9
0
def get_resnet_train_dataflow():
    """Build the shuffled, augmented and batched training dataflow for the ResNet model.

    Records come from ``ResnetDetection.load_many``. Each record's image is
    read from disk, resized to a ``config.RESNET_SIZE`` square and passed
    through ``strong_aug`` before the ``get_resnet_augmentor`` augmentors run.

    Returns:
        The ``PrefetchDataZMQ`` dataflow (6 processes) of ``[image, label]`` batches.
    """
    records = list(ResnetDetection.load_many(config.BASEDIR, config.TRAIN_DATASET))

    ds = DataFromList(records, shuffle=True)
    augmentors = get_resnet_augmentor()

    def preprocess(img):
        # 'id' is looked up alongside the other fields but not used further.
        path, _image_id, label = img['image_data'], img['id'], img['with_ship']
        pixels = cv2.imread(path)
        side = config.RESNET_SIZE
        pixels = cv2.resize(pixels, (side, side))
        pixels = strong_aug()(image=pixels)['image']
        return [pixels, label]

    ds = MapData(ds, preprocess)
    ds = AugmentImageComponent(ds, augmentors, copy=False)
    ds = BatchData(ds, config.RESNET_BATCH)
    return PrefetchDataZMQ(ds, 6)
def get_downsampled_imagenet_augmented_data(subset, options,
        do_multiprocess=True, do_validation=False, shuffle=None):
    """Build the augmented, batched dataflow for a downsampled ImageNet variant.

    Args:
        subset: Split name; training augmentation is enabled only when this is
            ``'train'`` and ``do_multiprocess`` is truthy.
        options: Object providing ``ds_name`` (``'imagenet<size>'``),
            ``data_dir``, ``batch_size`` and ``nr_gpu``.
        do_multiprocess: When truthy, wrap the result in ``PrefetchData``.
        do_validation: Forwarded to ``DownsampledImageNet``.
        shuffle: Explicit shuffle flag; defaults to the training flag when None.

    Returns:
        The batched (and optionally prefetched) dataflow.

    Raises:
        ValueError: If ``options.ds_name`` is not ``'imagenet'`` followed by
            at least one digit.
    """
    isTrain = subset == 'train' and do_multiprocess
    shuffle = shuffle if shuffle is not None else isTrain

    # Require at least one digit: the previous `[0-9]*` pattern let a bare
    # 'imagenet' through to int(''), and a non-matching name crashed with
    # AttributeError on `None.group`.
    size_match = re.search(r'^imagenet([0-9]+)$', options.ds_name)
    if size_match is None:
        raise ValueError(
            "ds_name must look like 'imagenet<size>' (e.g. 'imagenet32'), "
            "got {!r}".format(options.ds_name))
    input_size = int(size_match.group(1))

    ds = DownsampledImageNet(_data_batch_dir(options.data_dir, input_size),
                             subset, shuffle, input_size,
                             do_validation=do_validation)

    pp_mean = ds.mean_img
    paste_size = ds.input_size * 5 // 4
    crop_size = ds.input_size
    if isTrain:
        augmentors = [
            # Paste onto a canvas 5/4 the input size, then crop back down.
            imgaug.CenterPaste((paste_size, paste_size)),
            imgaug.RandomCrop((crop_size, crop_size)),
            imgaug.Flip(horiz=True),
            imgaug.MapImage(lambda x: (x - pp_mean)/128.0),
        ]
    else:
        augmentors = [
            imgaug.MapImage(lambda x: (x - pp_mean)/128.0)
        ]
    ds = AugmentImageComponent(ds, augmentors)
    # The batch is split across GPUs; the last partial batch is kept only
    # outside training.
    ds = BatchData(ds, options.batch_size // options.nr_gpu, remainder=not isTrain)
    if do_multiprocess:
        ds = PrefetchData(ds, 4, 2)
    return ds
Ejemplo n.º 11
0
def get_imagenet_dataflow(datadir, name, batch_size, augmentors):
    """Build the ILSVRC12 dataflow for the given split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: Dataset root; must not be None.
        name: One of ``'train'``, ``'val'`` or ``'test'``.
        batch_size: Forwarded to ``BatchData``.
        augmentors: List of augmentors applied to every image.

    Returns:
        The batched dataflow for the requested split.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    workers = min(30, multiprocessing.cpu_count())
    meta_dir = './ilsvrc_metadata'

    if name == 'train':
        # Training: augment inside the worker processes, batch afterwards.
        stream = dataset.ILSVRC12(datadir, name, meta_dir=meta_dir, shuffle=True)
        stream = AugmentImageComponent(stream, augmentors, copy=False)
        stream = PrefetchDataZMQ(stream, workers)
        return BatchData(stream, batch_size, remainder=False)

    # Evaluation: iterate file names and decode/augment in worker threads.
    stream = dataset.ILSVRC12Files(datadir,
                                   name,
                                   meta_dir=meta_dir,
                                   shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        fname, cls = dp
        return aug.augment(cv2.imread(fname, cv2.IMREAD_COLOR)), cls

    stream = MultiThreadMapData(stream, workers, load_and_augment,
                                buffer_size=2000, strict=True)
    stream = BatchData(stream, batch_size, remainder=True)
    return PrefetchDataZMQ(stream, 1)
Ejemplo n.º 12
0
    def test_dataloader(self):
        """Assemble the test dataflow from ``covid_test_v5.csv``.

        Component 0 is resized and passed through ``ToFloat32``. Batches are
        prefetched by four worker processes and converted to tensors, with
        axis 3 of component 0 moved to position 1.

        Returns:
            The ``MapData`` dataflow yielding ``[tensor, float tensor]`` per batch.
        """
        hp = self.hparams
        flow = MultiLabelDataset(folder=hp.data,
                                 is_train='valid',
                                 fname='covid_test_v5.csv',
                                 types=hp.types,
                                 pathology=hp.pathology,
                                 resize=int(hp.shape),
                                 fold_idx=None,
                                 n_folds=1)
        flow.reset_state()

        resize_and_cast = [
            imgaug.Resize(hp.shape, interp=cv2.INTER_AREA),
            imgaug.ToFloat32(),
        ]
        flow = AugmentImageComponent(flow, resize_and_cast, 0)
        flow = BatchData(flow, hp.batch, remainder=True)
        flow = MultiProcessRunner(flow, num_proc=4, num_prefetch=16)
        flow = PrintData(flow)

        def to_tensors(dp):
            # Axis order (0, 1, 2, 3) -> (0, 3, 1, 2) for component 0.
            images = np.transpose(dp[0], (0, 3, 1, 2))
            return [torch.tensor(images), torch.tensor(dp[1]).float()]

        return MapData(flow, to_tensors)
Ejemplo n.º 13
0
def get_iNaturalist_dataflow(
        datadir, name, batch_size,
        augmentors, parallel=None):
    """Build the iNaturalist dataflow for the given split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: Dataset root; must not be None.
        name: One of ``'train'``, ``'val'`` or ``'test'``.
        batch_size: Forwarded to ``BatchData``.
        augmentors: List of augmentors applied to every image.
        parallel: Worker count; defaults to ``min(40, cpu_count() // 2)``.

    Returns:
        The batched dataflow for the requested split.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        # Halve the logical core count, assuming hyperthreading.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if name == 'train':
        stream = dataset.iNaturalist(datadir, name, shuffle=True)
        stream = AugmentImageComponent(stream, augmentors, copy=False)
        if parallel < 16:
            logger.warn("DataFlow may become the bottleneck when too few processes are used.")
        stream = PrefetchDataZMQ(stream, parallel)
        return BatchData(stream, batch_size, remainder=False)

    # Evaluation: iterate file names and decode/augment in worker threads.
    stream = dataset.iNaturalistFiles(datadir, name, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        fname, cls = dp
        return aug.augment(cv2.imread(fname, cv2.IMREAD_COLOR)), cls

    stream = MultiThreadMapData(stream, parallel, load_and_augment,
                                buffer_size=2000, strict=True)
    stream = BatchData(stream, batch_size, remainder=True)
    return PrefetchDataZMQ(stream, 1)
Ejemplo n.º 14
0
    def train_dataloader(self):
        """Assemble the training dataflow from ``covid_train_v5.csv``.

        Component 0 is converted with ``cv2.COLOR_GRAY2RGB``, randomly
        blurred, contrast-equalised, affinely perturbed, rotated, randomly
        cropped and resized to ``hparams.shape``, converted back with
        ``cv2.COLOR_RGB2GRAY`` and passed through ``ToFloat32``. Batches are
        prefetched by four worker processes and converted to tensors, with
        axis 3 of component 0 moved to position 1.

        Returns:
            The ``MapData`` dataflow yielding ``[tensor, float tensor]`` per batch.
        """
        ds_train = MultiLabelDataset(folder=self.hparams.data,
                                     is_train='train',
                                     fname='covid_train_v5.csv',
                                     types=self.hparams.types,
                                     pathology=self.hparams.pathology,
                                     resize=int(self.hparams.shape),
                                     balancing=None)

        ds_train.reset_state()
        ag_train = [
            imgaug.ColorSpace(mode=cv2.COLOR_GRAY2RGB),
            # One of the three blur variants is chosen per image.
            imgaug.RandomChooseAug([
                imgaug.Albumentations(AB.Blur(blur_limit=4, p=0.25)),
                imgaug.Albumentations(AB.MotionBlur(blur_limit=4, p=0.25)),
                imgaug.Albumentations(AB.MedianBlur(blur_limit=4, p=0.25)),
            ]),
            imgaug.Albumentations(AB.CLAHE(tile_grid_size=(32, 32), p=0.5)),
            # Shear, translation and scale are applied in random order.
            imgaug.RandomOrderAug([
                imgaug.Affine(shear=10,
                              border=cv2.BORDER_CONSTANT,
                              interp=cv2.INTER_AREA),
                imgaug.Affine(translate_frac=(0.01, 0.02),
                              border=cv2.BORDER_CONSTANT,
                              interp=cv2.INTER_AREA),
                imgaug.Affine(scale=(0.5, 1.0),
                              border=cv2.BORDER_CONSTANT,
                              interp=cv2.INTER_AREA),
            ]),
            imgaug.RotationAndCropValid(max_deg=10, interp=cv2.INTER_AREA),
            imgaug.GoogleNetRandomCropAndResize(
                crop_area_fraction=(0.8, 1.0),
                aspect_ratio_range=(0.8, 1.2),
                interp=cv2.INTER_AREA,
                target_shape=self.hparams.shape),
            imgaug.ColorSpace(mode=cv2.COLOR_RGB2GRAY),
            imgaug.ToFloat32(),
        ]
        ds_train = AugmentImageComponent(ds_train, ag_train, 0)
        # The unused `ag_label` list (a BrightnessScale augmentor labelled
        # "Label smoothing") was dropped: it was built but never applied.
        ds_train = BatchData(ds_train, self.hparams.batch, remainder=True)
        if self.hparams.debug:
            # Debug runs are limited to two batches.
            ds_train = FixedSizeData(ds_train, 2)
        ds_train = MultiProcessRunner(ds_train, num_proc=4, num_prefetch=16)
        ds_train = PrintData(ds_train)
        ds_train = MapData(
            ds_train, lambda dp: [
                # Axis order (0, 1, 2, 3) -> (0, 3, 1, 2) for component 0.
                torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                torch.tensor(dp[1]).float()
            ])
        return ds_train
Ejemplo n.º 15
0
def get_data(batch, augmentors):
    """Build the ILSVRC12 training dataflow rooted at ``args.data``.

    Sec 3, Remark 4:
    Use a single random shuffling of the training data (per epoch) that is
    divided amongst all k workers. Here we do not follow the paper because
    it does not seem to make a difference.

    Args:
        batch: Batch size forwarded to ``BatchData``.
        augmentors: Augmentors applied to every image.

    Returns:
        The ``PrefetchDataZMQ`` dataflow of full batches.
    """
    stream = dataset.ILSVRC12(args.data, 'train', shuffle=True)
    stream = AugmentImageComponent(stream, augmentors, copy=False)
    # Batching happens before prefetching, so each worker emits whole batches.
    stream = BatchData(stream, batch, remainder=False)
    worker_count = min(50, mp.cpu_count())
    return PrefetchDataZMQ(stream, worker_count)
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors,
                          parallel=None):
    """Build the ILSVRC12 dataflow; evaluation images are read from zip archives.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: Dataset root; must not be None.
        name: One of ``'train'``, ``'val'`` or ``'test'``.
        batch_size: Forwarded to ``BatchData``.
        augmentors: List of augmentors applied to every image.
        parallel: Worker count; defaults to ``min(40, cpu_count())``.

    Returns:
        The batched dataflow for the requested split.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp

            # The image `<dir>/<file>` is stored in the archive `<dir>.zip`
            # under the member name `<basename(dir)>/<file>`.
            image_dir = os.path.dirname(fname)
            # Zip member names always use '/', so os.path.join is avoided.
            member = '{}/{}'.format(os.path.basename(image_dir),
                                    os.path.basename(fname))

            # The context manager closes the archive after every read; the
            # previous code leaked one open file handle per image.
            with zipfile.ZipFile(image_dir + '.zip', 'r') as archive:
                raw = archive.read(member)

            # np.frombuffer replaces np.fromstring, which is deprecated for
            # binary input.
            compress_jpeg = np.frombuffer(raw, dtype=np.uint8)
            im = cv2.imdecode(compress_jpeg, cv2.IMREAD_COLOR)

            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
def get_data(batch, augmentors, workers):
    """Build the ILSVRC12 training dataflow rooted at ``args.data``.

    Sec 3, Remark 4:
    Use a single random shuffling of the training data (per epoch) that is
    divided amongst all k workers.

    NOTE: Here we do not follow the paper, but it makes little differences.

    Args:
        batch: Batch size forwarded to ``BatchData``.
        augmentors: Augmentors applied to every image.
        workers: Process count forwarded to ``PrefetchDataZMQ``.

    Returns:
        The ``BatchData`` dataflow of full batches.
    """
    stream = dataset.ILSVRC12(args.data, 'train', shuffle=True)
    augmented = AugmentImageComponent(stream, augmentors, copy=False)
    # Prefetching happens before batching, so workers emit single datapoints.
    prefetched = PrefetchDataZMQ(augmented, workers)
    return BatchData(prefetched, batch, remainder=False)
Ejemplo n.º 18
0
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors=None,
                          parallel=None):
    """Build the ILSVRC12 dataflow for the given split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html

    Args:
        datadir: Dataset root; must not be None.
        name: One of ``'train'``, ``'val'`` or ``'test'``.
        batch_size: Forwarded to ``BatchData``.
        augmentors (list[imgaug.Augmentor]): Defaults to
            ``fbresnet_augmentor(isTrain)`` when None.
        parallel: Worker count; defaults to ``min(40, cpu_count() // 2)``.

    Returns:
        The batched dataflow of images and labels for the requested split.
    """
    assert name in ['train', 'val', 'test']
    training = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(training)
    assert isinstance(augmentors, list)
    if parallel is None:
        # Halve the logical core count, assuming hyperthreading.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if training:
        stream = dataset.ILSVRC12(datadir, name, shuffle=True)
        stream = AugmentImageComponent(stream, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        stream = MultiProcessRunnerZMQ(stream, parallel)
        return BatchData(stream, batch_size, remainder=False)

    # Evaluation: iterate file names and decode/augment in worker threads.
    stream = dataset.ILSVRC12Files(datadir, name, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        fname, cls = dp
        return aug.augment(cv2.imread(fname, cv2.IMREAD_COLOR)), cls

    stream = MultiThreadMapData(stream, parallel, load_and_augment,
                                buffer_size=2000, strict=True)
    stream = BatchData(stream, batch_size, remainder=True)
    return MultiProcessRunnerZMQ(stream, 1)
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors,
                          parallel=None):
    """Build the ILSVRC12 (ImageNet) dataflow for the given split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: Dataset root; must not be None.
        name: One of ``'train'``, ``'val'`` or ``'test'``.
        batch_size: Forwarded to ``BatchData``.
        augmentors: List of augmentors applied to every image.
        parallel: Worker count; defaults to ``min(40, cpu_count() // 2)``.

    Returns:
        The batched dataflow for the requested split.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        # No worker count given: use half the reported CPU count
        # (assuming hyperthreading), capped at 40.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if name == 'train':
        stream = dataset.ILSVRC12(datadir, name, shuffle=True)
        # Apply the augmentors to the image component of each datapoint.
        stream = AugmentImageComponent(stream, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        # Run the pipeline in `parallel` processes, then group into batches.
        stream = PrefetchDataZMQ(stream, parallel)
        return BatchData(stream, batch_size, remainder=False)

    # Evaluation: ILSVRC12Files is used in place of ILSVRC12, and the image
    # is read from `fname` inside the mapper below.
    stream = dataset.ILSVRC12Files(datadir, name, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        fname, cls = dp
        # cv2.IMREAD_COLOR loads a colour image and ignores transparency.
        pixels = cv2.imread(fname, cv2.IMREAD_COLOR)
        return aug.augment(pixels), cls

    # Decode and augment in `parallel` threads.
    stream = MultiThreadMapData(stream, parallel, load_and_augment,
                                buffer_size=2000, strict=True)
    stream = BatchData(stream, batch_size, remainder=True)
    return PrefetchDataZMQ(stream, 1)
Ejemplo n.º 20
0
def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors,
                          parallel=None):
    """Build the ILSVRC12 dataflow; evaluation reads retry on failure.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: Dataset root; must not be None.
        name: One of ``'train'``, ``'val'`` or ``'test'``.
        batch_size: Forwarded to ``BatchData``.
        augmentors: List of augmentors applied to every image.
        parallel: Worker count; defaults to ``min(40, cpu_count() // 6)``,
            but never less than 1.

    Returns:
        The batched dataflow for the requested split.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        # Clamp to one worker: on machines with fewer than six cores the
        # integer division would otherwise ask for zero workers.
        parallel = max(1, min(40, multiprocessing.cpu_count() // 6))
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )

        ds = PrefetchData(ds, 1000, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            # Black placeholder returned only when every attempt fails.
            im = np.zeros((256, 256, 3), dtype=np.uint8)
            for _ in range(30):
                try:
                    # Read into a temporary so a failed attempt cannot
                    # overwrite the placeholder with None.
                    loaded = cv2.imread(fname, cv2.IMREAD_COLOR)
                    if loaded is None:
                        # cv2.imread reports failure by returning None.
                        raise IOError('cv2.imread returned None')
                    im = aug.augment(loaded)
                    break
                except Exception as e:
                    # Best-effort retry. The old call passed extra positional
                    # args with no placeholders, so logging could not format
                    # the record.
                    logger.warning("%s file=%s", e, fname)
                    time.sleep(1)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchData(ds, 100, 1)
    return ds
Ejemplo n.º 21
0
def generate_dataflow(dataset, option):
    """Build an augmented, prefetched and batched dataflow from ``dataset``.

    Args:
        dataset: Forwarded to ``DataFlow``.
        option: Dict read for ``'number_of_cores'``, ``'augmentors'``,
            ``'num_prefetch_for_dataset'``, ``'batch_size'``, ``'remainder'``
            and ``'num_prefetch_for_batch'``. A ``'number_of_cores'`` of -1
            is replaced in place by the CPU count.

    Returns:
        The outer ``PrefetchData`` dataflow of batches.
    """
    # -1 means "use every core"; the resolved value is written back to `option`.
    if option['number_of_cores'] == -1:
        option['number_of_cores'] = mp.cpu_count()

    stream = DataFlow(dataset, option)
    stream = AugmentImageComponent(stream, option['augmentors'], copy = False)

    cores = option['number_of_cores']
    if cores < 16:
        print('[!} Warning = DataFlow may become the bottleneck when too few processes are used.')

    # First prefetch stage works on single datapoints, the second on batches.
    stream = PrefetchData(stream, option['num_prefetch_for_dataset'], cores)
    stream = BatchData(stream, option['batch_size'], remainder = option['remainder'])
    return PrefetchData(stream, option['num_prefetch_for_batch'], 2)
Ejemplo n.º 22
0
def get_train_dataflow(datadir, batch, augmentors=None):
    """Build the ILSVRC12 training dataflow.

    Sec 3, Remark 4:
    Use a single random shuffling of the training data (per epoch)
    that is divided amongst all k workers.

    NOTE: Here we do not follow the paper which makes some differences.
    Here, each machine shuffles independently.

    Args:
        datadir: Dataset root forwarded to ``dataset.ILSVRC12``.
        batch: Batch size forwarded to ``BatchData``.
        augmentors: Augmentors to apply; defaults to ``fbresnet_augmentor(True)``.

    Returns:
        The ``MultiProcessRunnerZMQ`` dataflow of full batches.
    """
    chosen = fbresnet_augmentor(True) if augmentors is None else augmentors
    stream = dataset.ILSVRC12(datadir, 'train', shuffle=True)
    stream = AugmentImageComponent(stream, chosen, copy=False)
    # Batching happens before the runner, so each worker emits whole batches.
    stream = BatchData(stream, batch, remainder=False)
    worker_count = min(50, mp.cpu_count())
    return MultiProcessRunnerZMQ(stream, worker_count)
Ejemplo n.º 23
0
def get_imagenet_dataflow(datadir,
                          is_train,
                          batch_size,
                          augmentors,
                          parallel=None):
    """Build the ILSVRC12 dataflow for training or validation.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: Dataset root; must not be None.
        is_train: Truthy for the ``"train"`` split, falsy for ``"val"``.
        batch_size: Forwarded to ``BatchData``.
        augmentors: List of augmentors applied to every image.
        parallel: Worker count; defaults to ``min(40, cpu_count() // 2)``.

    Returns:
        The batched dataflow for the selected split.
    """
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        # Halve the logical core count, assuming hyperthreading.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if is_train:
        stream = dataset.ILSVRC12(datadir, "train", shuffle=True)
        stream = AugmentImageComponent(stream, augmentors, copy=False)
        if parallel < 16:
            logging.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        stream = PrefetchDataZMQ(stream, parallel)
        return BatchData(stream, batch_size, remainder=False)

    # Validation: iterate file names and decode/augment in worker threads.
    stream = dataset.ILSVRC12Files(datadir, "val", shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        fname, cls = dp
        pixels = cv2.imread(fname, cv2.IMREAD_COLOR)
        # Reverse the order of axis 2 (the channel axis of the decoded image).
        pixels = np.flip(pixels, axis=2)
        return aug.augment(pixels), cls

    stream = MultiThreadMapData(stream, parallel, load_and_augment,
                                buffer_size=2000, strict=True)
    stream = BatchData(stream, batch_size, remainder=True)
    return PrefetchDataZMQ(stream, 1)
Ejemplo n.º 24
0
    def get_data(self, name, num_gpu):
        """Build the ILSVRC12 dataflow for ``name`` with a per-GPU batch size.

        Args:
            name: One of ``'train'``, ``'val'`` or ``'test'``.
            num_gpu: Number of GPUs; ``self.batch_size`` is integer-divided by it.

        Returns:
            The batched dataflow. For non-training splits with a single GPU
            it is additionally wrapped in ``QueueInput``.
        """
        per_gpu_batch = self.batch_size // num_gpu

        assert name in ['train', 'val', 'test']
        training = name == 'train'

        augmentors = fbresnet_augmentor(training)
        assert isinstance(augmentors, list)

        # Halve the logical core count, assuming hyperthreading.
        parallel = min(40, multiprocessing.cpu_count() // 2)

        if training:
            stream = dataset.ILSVRC12(self.datadir,
                                      name,
                                      shuffle=True,
                                      dir_structure='train')
            stream = AugmentImageComponent(stream, augmentors, copy=False)
            stream = MultiProcessRunnerZMQ(stream, parallel)
            return BatchData(stream, per_gpu_batch, remainder=False)

        # Evaluation: iterate file names and decode/augment in worker threads.
        stream = dataset.ILSVRC12Files(self.datadir,
                                       name,
                                       shuffle=False,
                                       dir_structure='train')
        aug = imgaug.AugmentorList(augmentors)

        def load_and_augment(dp):
            fname, cls = dp
            return aug.augment(cv2.imread(fname, cv2.IMREAD_COLOR)), cls

        stream = MultiThreadMapData(stream, parallel, load_and_augment,
                                    buffer_size=2000, strict=True)
        stream = BatchData(stream, per_gpu_batch, remainder=True)
        stream = MultiProcessRunnerZMQ(stream, 1)

        if num_gpu == 1:
            stream = QueueInput(stream)
        return stream
Ejemplo n.º 25
0
def get_data(train_or_test, option):
    """Build the augmented, batched dataflow selected by ``option.final_size``.

    Args:
        train_or_test: Split name; ``'train'`` enables shuffling and prefetching.
        option: Object providing ``data`` (dataset root), ``final_size``
            (64 selects ``tinyImagenetHaS``, 224 selects ``ILSVRC12``) and
            ``batch``.

    Returns:
        The ``BatchData`` dataflow for the requested split.

    Raises:
        ValueError: If ``option.final_size`` is neither 64 nor 224.
    """
    isTrain = train_or_test == 'train'

    datadir = option.data
    if option.final_size == 64:
        ds = dataset.tinyImagenetHaS(datadir,
                                     train_or_test,
                                     'all',
                                     shuffle=isTrain)
    elif option.final_size == 224:
        ds = dataset.ILSVRC12(datadir, train_or_test, shuffle=isTrain)
    else:
        # `ds` used to stay unbound here, so the failure only surfaced later
        # as an UnboundLocalError.
        raise ValueError(
            'Unsupported final_size {!r}; expected 64 or 224'.format(
                option.final_size))
    augmentors = fbresnet_augmentor(isTrain, option=option)
    augmentors.append(imgaug.ToUint8())
    ds = AugmentImageComponent(ds, augmentors, copy=False)
    if isTrain:
        ds = PrefetchDataZMQ(ds, min(25, multiprocessing.cpu_count()))
    # The last partial batch is kept only outside training.
    ds = BatchData(ds, int(option.batch), remainder=not isTrain)
    return ds
Ejemplo n.º 26
0
def get_sequential_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """Decode, augment and batch a dataflow of encoded images (sequential read).

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html

    Args:
        ds: Source dataflow whose component 0 is an encoded image buffer.
        isTrain: Truthy to shuffle locally and run the training pipeline.
        batch_size: Forwarded to ``BatchData``.
        augmentors (list[imgaug.Augmentor]): Required list of augmentors.
        parallel: Worker count; defaults to ``min(40, cpu_count() // 2)``.

    Returns:
        The batched dataflow of decoded, augmented images and labels.
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)

    if parallel is None:
        # Halve the logical core count, assuming hyperthreading.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if isTrain:
        stream = LocallyShuffleData(ds, 50000)
        stream = MapDataComponent(
            stream, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
        stream = AugmentImageComponent(stream, aug, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        stream = BatchData(stream, batch_size, remainder=False, use_list=True)
        return MultiProcessRunnerZMQ(stream, parallel)

    def decode_and_augment(data):
        encoded, label = data
        return aug.augment(cv2.imdecode(encoded, cv2.IMREAD_COLOR)), label

    stream = MultiProcessMapDataZMQ(ds, parallel, decode_and_augment,
                                    buffer_size=2000, strict=True)
    return BatchData(stream, batch_size, remainder=True, use_list=True)
Ejemplo n.º 27
0
def get_cifar_augmented_data(subset,
                             options,
                             do_multiprocess=True,
                             do_validation=False,
                             shuffle=None):
    """Build the augmented, batched CIFAR-10 / CIFAR-100 DataFlow.

    Args:
        subset (str): dataset split; ``'train'`` combined with
            ``do_multiprocess`` enables the training augmentations.
        options: object exposing ``num_classes``, ``ds_name``, ``batch_size``
            and ``nr_gpu``.
        do_multiprocess (bool): wrap the result in ``PrefetchData``; also a
            precondition for training mode.
        do_validation (bool): forwarded to the dataset constructor.
        shuffle (bool or None): forwarded to the dataset constructor; when
            ``None`` it follows the training flag.

    Returns:
        A DataFlow batched to ``options.batch_size // options.nr_gpu``.

    Raises:
        ValueError: if ``options.num_classes`` / ``options.ds_name`` do not
            describe CIFAR-10 or CIFAR-100.
    """
    training = subset == 'train' and do_multiprocess
    if shuffle is None:
        shuffle = training

    # Validate the requested dataset up front, before anything is loaded.
    want_cifar10 = options.num_classes == 10 and options.ds_name == 'cifar10'
    want_cifar100 = (options.num_classes == 100
                     and options.ds_name == 'cifar100')
    if not (want_cifar10 or want_cifar100):
        raise ValueError(
            'Number of classes must be set to 10(default) or 100 for CIFAR')

    if want_cifar10:
        dataset_cls, cutout_length = dataset.Cifar10, 16
    else:
        dataset_cls, cutout_length = dataset.Cifar100, 8
    n_holes = 1

    ds = dataset_cls(subset, shuffle=shuffle, do_validation=do_validation)
    logger.info('{} set has n_samples: {}'.format(subset, len(ds.data)))
    pp_mean = ds.get_per_pixel_mean()

    def normalize(img):
        # Subtract the dataset's per-pixel mean and rescale by 128.
        return (img - pp_mean) / 128.0

    if not training:
        augmentors = [imgaug.MapImage(normalize)]
    else:
        logger.info('Will do cut-out with length={} n_holes={}'.format(
            cutout_length, n_holes))
        augmentors = [
            imgaug.CenterPaste((40, 40)),
            imgaug.RandomCrop((32, 32)),
            imgaug.Flip(horiz=True),
            imgaug.MapImage(normalize),
            Cutout(length=cutout_length, n_holes=n_holes),
        ]

    per_gpu_batch = options.batch_size // options.nr_gpu
    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, per_gpu_batch, remainder=not training)
    return PrefetchData(ds, 3, 2) if do_multiprocess else ds
Ejemplo n.º 28
0
def get_random_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """Build a random-read DataFlow that augments and batches ``ds``.

    Args:
        ds: source DataFlow. In training mode the augmentors are applied to
            it directly (so it presumably already yields decoded images --
            confirm against the caller). In evaluation mode every datapoint
            is unpacked as ``(filename, class)`` and the file is read with
            ``cv2.imread``.
        isTrain (bool): training mode runs the augmented flow in ``parallel``
            processes and drops the incomplete last batch; evaluation mode
            maps over ``parallel`` threads in strict mode, keeps the
            remainder batch and runs the result in one background process.
        batch_size (int): number of datapoints per batch.
        augmentors (list[imgaug.Augmentor]): augmentors to apply. Required;
            must be a ``list``.
        parallel (int or None): number of workers. When ``None``, half of the
            CPU count is used (assuming hyperthreading), capped at 40 and
            never less than 1.

    Returns:
        A DataFlow which produces batches of BGR images and labels.

    Raises:
        IOError: (evaluation mode, raised from the mapping worker) when an
            image file cannot be read.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)

    if parallel is None:
        # Half of the logical cores, assuming hyperthreading. Clamp to 1 so
        # that a single-core host does not end up requesting zero workers.
        parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    if isTrain:
        ds = AugmentImageComponent(ds, aug, copy=False)
        if parallel < 16:
            logger.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            if im is None:
                # cv2.imread does not raise on a missing/corrupt file; it
                # returns None. Report the offending path explicitly.
                raise IOError("Failed to read image: {}".format(fname))
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
    return ds
Ejemplo n.º 29
0
 def get_data(self, train_or_test):
     """Build the batched CIFAR-10 DataFlow for the requested split.

     Args:
         train_or_test (str): split name handed to ``dataset.Cifar10``;
             the value ``'train'`` additionally enables the random
             paste / crop / flip augmentations and prefetching.

     Returns:
         A DataFlow batched to ``self.batch_size``. The last incomplete
         batch is kept only when not training.
     """
     training = train_or_test == 'train'
     ds = dataset.Cifar10(train_or_test, dir='.')
     pp_mean = ds.get_per_pixel_mean()

     def subtract_mean(img):
         # Centre the image using the mean reported by the dataset.
         return img - pp_mean

     augmentors = []
     if training:
         augmentors += [
             imgaug.CenterPaste((40, 40)),
             imgaug.RandomCrop((32, 32)),
             imgaug.Flip(horiz=True),
         ]
     augmentors.append(imgaug.MapImage(subtract_mean))

     ds = AugmentImageComponent(ds, augmentors)
     ds = BatchData(ds, self.batch_size, remainder=not training)
     if not training:
         return ds
     return PrefetchData(ds, 3, 2)
Ejemplo n.º 30
0
def get_inat_augmented_data(subset,
                            options,
                            lmdb_dir=None,
                            year='2018',
                            do_multiprocess=True,
                            do_validation=False,
                            is_train=None,
                            shuffle=None,
                            n_allow=None):
    """Build the augmented, batched iNaturalist DataFlow from an LMDB file.

    Args:
        subset (str): split name used in the LMDB file name. Forced to
            ``'train'`` when reading the cross-validation "val" split.
        options: object exposing ``input_size`` (falls back to 224 when
            falsy), ``data_dir``, ``batch_size`` and ``nr_gpu``.
        lmdb_dir (str or None): sub-directory of ``options.data_dir`` holding
            the LMDB. When ``None`` the ``'inat_lmdb'`` directory and an
            ``inat2018_...`` file name are used and ``year`` is ignored.
        year (str): year used in the file name when ``lmdb_dir`` is given.
        do_multiprocess (bool): run augmentation in multiple processes; also
            part of the default training-mode decision.
        do_validation (bool): when not training, read the ``_cv_val_`` split.
        is_train (bool or None): explicit training flag. When ``None`` it is
            ``subset == 'train' and do_multiprocess``.
        shuffle (bool or None): locally shuffle the data; defaults to the
            training flag.
        n_allow: when not ``None``, ``"_allow_{n_allow}"`` is inserted into
            the LMDB file name.

    Returns:
        A DataFlow batched to ``options.batch_size // options.nr_gpu``; the
        incomplete last batch is kept only when not training.
    """
    input_size = options.input_size if options.input_size else 224
    isTrain = is_train if is_train is not None else (subset == 'train'
                                                     and do_multiprocess)
    shuffle = shuffle if shuffle is not None else isTrain
    postfix = "" if n_allow is None else "_allow_{}".format(n_allow)

    # TODO: Parameterize the cv split to be considered.
    # Currently hard-coded to 1.
    cv = 1

    # When do_validation is True it will expect *cv_train and *cv_val lmdbs
    # Currently the cv_train split is always used
    if isTrain:
        postfix += '_cv_train_{}'.format(cv)
    elif do_validation:
        subset = 'train'
        postfix += '_cv_val_{}'.format(cv)

    if lmdb_dir is None:
        # Default location; note the file name is fixed to the 2018 data.
        lmdb_path = os.path.join(options.data_dir, 'inat_lmdb',
                                 'inat2018_{}{}.lmdb'.format(subset, postfix))
    else:
        lmdb_path = os.path.join(
            options.data_dir, lmdb_dir,
            'inat{}_{}{}.lmdb'.format(year, subset, postfix))

    ds = LMDBData(lmdb_path, shuffle=False)
    if shuffle:
        ds = LocallyShuffleData(ds,
                                1024 * 80)  # This is 64G~80G in memory images
    ds = PrefetchData(ds, 1024 * 8, 1)  # prefetch around 8 G
    ds = LMDBDataPoint(ds)
    ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR),
                          0)  # BGR uint8 data
    if isTrain:

        class Resize(imgaug.ImageAugmentor):
            """
            Crop 8%~100% of the original image, with a random aspect ratio
            in [0.75, 1.333], and resize it to input_size x input_size.
            See `Going Deeper with Convolutions` by Google.
            """
            def _augment(self, img, _):
                h, w = img.shape[:2]
                area = h * w
                # Try up to 10 random crops that fit inside the image.
                for _attempt in range(10):
                    targetArea = self.rng.uniform(0.08, 1.0) * area
                    aspectR = self.rng.uniform(0.75, 1.333)
                    ww = int(np.sqrt(targetArea * aspectR))
                    hh = int(np.sqrt(targetArea / aspectR))
                    if self.rng.uniform() < 0.5:
                        ww, hh = hh, ww
                    if hh <= h and ww <= w:
                        # randint's upper bound is exclusive, so use +1 to
                        # let the crop reach the right / bottom edge too.
                        x1 = 0 if w == ww else self.rng.randint(
                            0, w - ww + 1)
                        y1 = 0 if h == hh else self.rng.randint(
                            0, h - hh + 1)
                        out = img[y1:y1 + hh, x1:x1 + ww]
                        out = cv2.resize(out, (input_size, input_size),
                                         interpolation=cv2.INTER_CUBIC)
                        return out
                # No valid crop found: fall back to resizing the full image.
                out = cv2.resize(img, (input_size, input_size),
                                 interpolation=cv2.INTER_CUBIC)
                return out

        augmentors = [
            Resize(),
            imgaug.RandomOrderAug([
                imgaug.Brightness(30, clip=False),
                imgaug.Contrast((0.8, 1.2), clip=False),
                imgaug.Saturation(0.4),
                # rgb-bgr conversion: the eigen values/vectors are reversed
                # with [::-1] to match the BGR channel order of the images.
                imgaug.Lighting(0.1,
                                eigval=[0.2175, 0.0188, 0.0045][::-1],
                                eigvec=np.array([[-0.5675, 0.7192, 0.4009],
                                                 [-0.5808, -0.0045, -0.8140],
                                                 [-0.5836, -0.6948, 0.4203]],
                                                dtype='float32')[::-1, ::-1])
            ]),
            imgaug.Clip(),
            imgaug.Flip(horiz=True),
            imgaug.ToUint8()
        ]
    else:
        augmentors = [
            imgaug.ResizeShortestEdge(256),
            imgaug.CenterCrop((input_size, input_size)),
            imgaug.ToUint8()
        ]
    ds = AugmentImageComponent(ds, augmentors, copy=False)
    if do_multiprocess:
        ds = PrefetchDataZMQ(ds, min(24, multiprocessing.cpu_count()))
    ds = BatchData(ds,
                   options.batch_size // options.nr_gpu,
                   remainder=not isTrain)
    return ds