Beispiel #1
0
 def get_images(dir):
     """Build a dataflow joining random z-data with augmented JPEG images.

     Collects every ``*.jpg`` file directly inside ``dir``.  The file list is
     shuffled when ``shuffle_read`` is truthy and sorted otherwise.
     ``shuffle_read``, ``isTrain``, ``augs``, ``size``, ``zmin`` and ``zmax``
     are not parameters; they are read from the surrounding scope.

     Args:
         dir: directory that is searched for ``*.jpg`` files.

     Returns:
         A ``JoinData`` of ``[RandomZData, augmented ImageFromFile]``.
     """
     jpg_paths = glob.glob(os.path.join(dir, "*.jpg"))
     if not shuffle_read:
         # Deterministic ordering when shuffled reading is disabled.
         jpg_paths.sort()
     else:
         import random
         random.shuffle(jpg_paths)
     images = AugmentImageComponent(
         ImageFromFile(jpg_paths, channel=3, shuffle=isTrain), augs)
     noise = RandomZData([size, size, 3], zmin, zmax)
     return JoinData([noise, images])
Beispiel #2
0
def get_data(phase):
    """Return a batched, augmented Cifar10 dataflow for ``phase``.

    For ``phase == "train"`` the images go through CenterPaste to 40x40,
    RandomCrop to 32x32 and a horizontal Flip augmentor before the per-pixel
    mean is subtracted.  For any other phase only the mean subtraction is
    applied.

    Args:
        phase: dataset split name passed to ``dataset.Cifar10``.

    Returns:
        A ``BatchData`` of size ``BATCH_SIZE``; the ``remainder`` flag is set
        for every phase except "train".
    """
    training = (phase == "train")
    ds = dataset.Cifar10(phase)
    # The mean is always computed over the "train" split, whatever the phase.
    pp_mean = ds.get_per_pixel_mean(("train", ))

    augmentors = []
    if training:
        augmentors.extend([
            imgaug.CenterPaste((40, 40)),
            imgaug.RandomCrop((32, 32)),
            imgaug.Flip(horiz=True),
        ])
    # Mean subtraction is the final step for both training and evaluation.
    augmentors.append(imgaug.MapImage(lambda x: x - pp_mean))

    ds = AugmentImageComponent(ds, augmentors)
    return BatchData(ds, BATCH_SIZE, remainder=not training)
 def get_images(dir):
     """Build a dataflow joining random z-data with augmented JPEG images.

     Every ``*.jpg`` file directly inside ``dir`` is used, in sorted order.
     ``isTrain``, ``augs``, ``size``, ``zmin`` and ``zmax`` are read from the
     surrounding scope.

     Args:
         dir: directory that is searched for ``*.jpg`` files.

     Returns:
         A ``JoinData`` of ``[RandomZData, augmented ImageFromFile]``.
     """
     jpg_paths = glob.glob(os.path.join(dir, "*.jpg"))
     jpg_paths.sort()
     images = ImageFromFile(jpg_paths, channel=3, shuffle=isTrain)
     noise = RandomZData([size, size, 3], zmin, zmax)
     augmented = AugmentImageComponent(images, augs)
     return JoinData([noise, augmented])
Beispiel #4
0
def create_data_loader(split_dir,
                       dataset_index,
                       is_sobel,
                       sobel_normalized=False,
                       aug='central_crop',
                       batch_size=args.batch_size,
                       shuffle=None,
                       num_workers=2,
                       return_index=False,
                       use_fast_dataflow=False,
                       overwrite_labels=None,
                       buffer_size=5000):
    """Create a batch loader over an image split.

    Two back-ends are available: a regular ``torch.utils.data.DataLoader``
    over an ``IndexedDataset`` (default), or a Tensorpack-based LMDB stream
    when ``use_fast_dataflow=True``.

    Args:
        split_dir: directory of the split. In fast-dataflow mode the LMDB file
            is looked up at ``split_dir`` (trailing '/' stripped) + '.lmdb'.
        dataset_index: passed to ``IndexedDataset``; used only if
            use_fast_dataflow=False.
        is_sobel: if true, ImageNet normalization is skipped unless
            ``sobel_normalized`` is also true.
        sobel_normalized: apply ImageNet normalization even when ``is_sobel``
            is true.
        aug: one of 'random_crop_flip', 'central_crop', '10_crop'.
        batch_size: number of samples per batch.
        shuffle: one of:
            - None: no shuffle
            - 'shuffle': normal shuffle
            - 'shuffle_buffer': shuffle only locally using a buffer. Cannot be
              used if use_fast_dataflow=False
        num_workers: number of DataLoader workers, or number of
            ``PrefetchDataZMQ`` processes in fast-dataflow mode.
        return_index: forwarded to the dataset / LMDB stream. Not supported
            together with aug='10_crop'.
        use_fast_dataflow: use fast dataFlow based on Tensorpack?
            If num_workers > 1 can produce duplicates of the datapoints.
        overwrite_labels (np.array): list of labels which will be used instead
            of stored in lmdb. Has no effect if use_fast_dataflow=False.
        buffer_size: fast-dataflow only. Size of the local shuffle buffer when
            shuffle='shuffle_buffer'; the prefetch queue holds ``buffer_size``
            items in that case and ``1.5 * buffer_size`` otherwise.

    Returns:
        A ``torch.utils.data.DataLoader`` or, in fast-dataflow mode, a
        ``TorchBatchData``.

    Raises:
        ValueError: on an unknown ``shuffle`` or ``aug`` value, or when
            'shuffle_buffer' is requested without fast dataflow.
        NotImplementedError: for aug='10_crop' with use_fast_dataflow=True.
        AssertionError: for aug='10_crop' with return_index=True.
    """
    if shuffle not in [None, 'shuffle', 'shuffle_buffer']:
        raise ValueError('Unknown shuffle value: {}'.format(shuffle))
    if shuffle == 'shuffle_buffer' and not use_fast_dataflow:
        raise ValueError(
            'Cannot use shuffle="shuffle_buffer" when use_fast_dataflow=False.'
        )

    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print('Creating dataset... ({})'.format(aug))
    target_transform = None
    if aug == 'random_crop_flip':
        transf = [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor()
        ]
        collate_fn = torch.utils.data.dataloader.default_collate
    elif aug == 'central_crop':
        transf = [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor()
        ]
        collate_fn = torch.utils.data.dataloader.default_collate
    elif aug == '10_crop':
        assert not return_index, 'Not implemented for several crops per image'
        to_tensor = transforms.ToTensor()
        transf = [
            transforms.Resize(256),
            transforms.TenCrop(224),
            transforms.Lambda(lambda crops: torch.stack(
                [to_tensor(crop) for crop in crops])),  # returns a 4D tensor
        ]
        # Each image yields 10 crops, so its label is repeated 10 times.
        target_transform = transforms.Lambda(lambda x: np.array([x] * 10))
        collate_fn = collate_concat
    else:
        # format() keeps this a ValueError even if aug is not a string.
        raise ValueError('Unknown aug:{}'.format(aug))

    if not use_fast_dataflow:
        if not is_sobel or sobel_normalized:
            if aug != '10_crop':
                transf.append(IMAGENET_NORMALIZE)
            else:
                # Normalize every crop separately, then re-stack them.
                transf.append(
                    transforms.Lambda(lambda crops: torch.stack(
                        [IMAGENET_NORMALIZE(crop) for crop in crops])))

        dataset = IndexedDataset(split_dir,
                                 dataset_index,
                                 transform=transforms.Compose(transf),
                                 target_transform=target_transform,
                                 return_index=return_index)
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=shuffle == 'shuffle',
                                             num_workers=num_workers,
                                             pin_memory=True,
                                             collate_fn=collate_fn)
    else:
        # FIXME: not sure, but sometimes training without -fdf and then enabling it results in drop in accuracy.
        # Maybe data preprocessing is not entirely equivalent for these 2 data-loading techniques. Should be further debugged.
        if aug == '10_crop':
            raise NotImplementedError(
                'FastDataFlow for aug=10_crop is not implemented yet!')
        import cv2
        from data_utils.fast_dataflow import create_lmdb_stream, TorchAugmentorList, TorchBatchData
        from tensorpack import LocallyShuffleData, PrefetchData, \
            MapDataComponent, AugmentImageComponent, PrefetchDataZMQ
        lmdb_path = split_dir.rstrip('/') + '.lmdb'
        ds = create_lmdb_stream(lmdb_path,
                                new_labels=overwrite_labels,
                                shuffle=(shuffle == 'shuffle'),
                                return_index=return_index)
        nr_prefetch = int(buffer_size * 1.5)
        if shuffle == 'shuffle_buffer':
            ds = LocallyShuffleData(ds, buffer_size=buffer_size)
            nr_prefetch = buffer_size
        ds = PrefetchData(
            ds, nr_prefetch=nr_prefetch,
            nr_proc=1)  # will ensure that LMDB Flow is not forked.
        # This will decode images with BGR channel order
        # NOTE(review): the decoded array is then fed to ToPILImage below
        # without a visible BGR->RGB conversion; confirm that
        # IMAGENET_NORMALIZE_NP / pil_to_np_array account for the channel
        # order (possibly related to the FIXME above).
        ds = MapDataComponent(ds,
                              lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR),
                              index=0)

        # Swap the trailing ToTensor for a numpy conversion and wrap the
        # torchvision pipeline so that it can start from a decoded array.
        assert isinstance(transf[-1], transforms.ToTensor)
        transform_list = [transforms.ToPILImage()
                          ] + transf[:-1] + [pil_to_np_array]
        if not is_sobel or sobel_normalized:
            transform_list.append(IMAGENET_NORMALIZE_NP)

        ds = AugmentImageComponent(ds,
                                   TorchAugmentorList(
                                       transforms.Compose(transform_list)),
                                   index=0,
                                   copy=False)
        ds = PrefetchDataZMQ(ds, nr_proc=num_workers)
        loader = TorchBatchData(ds, batch_size=batch_size, remainder=True)

    return loader
Beispiel #5
0

# Example script: build a training dataflow from image files, resize, batch,
# prefetch in worker processes, then benchmark it and print batch shapes.
path = '/home/dan/prj/datasets'
photo_filenames, class_names = _get_filenames_and_classes(path)

# Fixed seed so the train/validation split is reproducible across runs.
random.seed(0)
_NUM_VALIDATION = 350
random.shuffle(photo_filenames)
# The first _NUM_VALIDATION shuffled files form the validation set and the
# remainder is used for training. (The two slices were previously swapped, so
# training received only _NUM_VALIDATION files.)
# NOTE: the misspelled name `validataion_filenames` is kept on purpose in case
# code outside this snippet refers to it.
validataion_filenames = photo_filenames[:_NUM_VALIDATION]
training_filenames = photo_filenames[_NUM_VALIDATION:]
class_names_to_ids = dict(zip(class_names, range(len(class_names))))

train_dataset = my_dataset_flow(training_filenames, 'train',
                                class_names_to_ids)

ds = AugmentImageComponent(train_dataset, [imgaug.Resize((299, 299))])
#ds = PrefetchData(ds, 1000, multiprocessing.cpu_count())
# The important point: once the data-reading step and augmentations such as
# rotation, flip and crop are defined and handed to PrefetchData, the required
# work is run in several processes for you.

batchsize = 256
ds = BatchData(ds, batchsize, use_list=True)

nr_prefetch = 10
nr_proc = 2
ds = PrefetchData(ds, nr_prefetch, nr_proc)

# Benchmark the throughput of the assembled dataflow.
TestDataSpeed(ds).start()
j = 0
for i in ds.get_data():
    # i[0] and i[1] are the first two components of each batch.
    print(np.array(i[0]).shape)
    print(np.array(i[1]).shape)