Example #1

# Shared imports for the snippets below. CachedDatasetFolder, HDF5Dataset,
# IMAGENET_NUM_CLASSES, ImageNetPolicy, ProgressiveRandomResizedCrop, and
# datasets_stats are assumed to come from the surrounding project.
import os
from time import time

import h5py
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torchvision import transforms
from torchvision.transforms import RandomResizedCrop
    def load_dataset(self):
        """
        Overrides base dataset loading
        Fixes path to data
        Fixes all transformations to be identical
        Preprocessing from: https://github.com/pytorch/vision/issues/39
        """

        train_path = os.path.expanduser("~/nta/data/imagenet/train")
        val_path = os.path.expanduser("~/nta/data/imagenet/val")

        stats_mean, stats_std = datasets_stats["ImageNet"]
        train_transform = transforms.Compose(
            [
                transforms.RandomResizedCrop(224),  # formerly RandomSizedCrop (deprecated)
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(stats_mean, stats_std),
            ]
        )

        val_transform = transforms.Compose(
            [
                transforms.Resize(256),  # formerly Scale (deprecated)
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(stats_mean, stats_std),
            ]
        )

        # load datasets
        train_dataset = CachedDatasetFolder(
            train_path, transform=train_transform, num_classes=self.num_classes
        )
        test_dataset = CachedDatasetFolder(
            val_path, transform=val_transform, num_classes=self.num_classes
        )

        # load dataloaders
        # pin_memory=True speeds up host-to-GPU data transfer
        self.train_loader = DataLoader(
            train_dataset,
            shuffle=True,
            batch_size=self.batch_size_train,
            pin_memory=True,
            num_workers=56,
        )
        self.test_loader = DataLoader(
            test_dataset,
            shuffle=False,
            batch_size=self.batch_size_test,
            pin_memory=True,
            num_workers=56,
        )
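
For reference, the later snippets hard-code the standard ImageNet channel
statistics; datasets_stats["ImageNet"] presumably resolves to the same pair,
along these lines:

# Assumed contents of datasets_stats["ImageNet"] (per-channel mean and std),
# matching the values hard-coded in the examples further down.
datasets_stats = {
    "ImageNet": ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
}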
def create_validation_dataloader(data_dir,
                                 val_dir,
                                 batch_size,
                                 workers,
                                 num_classes=1000):
    """
    Configure Imagenet validation dataloader

    Creates :class:`torch.utils.data.DataLoader` using :class:`CachedDatasetFolder`
    or :class:`HDF5Dataset` pre-configured for the validation cycle.

    :param data_dir: The directory or hdf5 file containing the dataset
    :param val_dir: The directory or hdf5 group containing the validation data
    :param batch_size: Images per batch
    :param workers: How many data loading subprocesses to use
    :param num_classes: Limit the dataset size to the given number of classes
    :return: torch.utils.data.DataLoader
    """

    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225],
                             inplace=True),
    ])
    if h5py.is_hdf5(data_dir):
        if num_classes in IMAGENET_NUM_CLASSES:
            classes = IMAGENET_NUM_CLASSES[num_classes]
            dataset = HDF5Dataset(hdf5_file=data_dir,
                                  root=val_dir,
                                  classes=classes,
                                  transform=transform)
        else:
            dataset = HDF5Dataset(hdf5_file=data_dir,
                                  root=val_dir,
                                  num_classes=num_classes,
                                  transform=transform)
    else:
        dataset = CachedDatasetFolder(root=os.path.join(data_dir, val_dir),
                                      num_classes=num_classes,
                                      transform=transform)
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=workers,
        pin_memory=False,
    )
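
A minimal usage sketch; the paths, batch size, and worker count below are
illustrative assumptions, not values from the source:

# Hypothetical call -- directory layout follows the ~/nta/data/imagenet
# convention from Example #1.
val_loader = create_validation_dataloader(
    data_dir=os.path.expanduser("~/nta/data/imagenet"),
    val_dir="val",
    batch_size=128,
    workers=8,
    num_classes=100,
)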
def create_validation_dataset(data_dir, val_dir, num_classes=1000):
    """
    Configure Imagenet validation dataset

    Creates :class:`CachedDatasetFolder` or :class:`HDF5Dataset` pre-configured
    for the validation cycle.

    :param data_dir: The directory or hdf5 file containing the dataset
    :param val_dir: The directory or hdf5 group containing the validation data
    :param num_classes: Limit the dataset size to the given number of classes
    :return: CachedDatasetFolder or HDF5Dataset
    """

    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225],
                             inplace=True),
    ])
    if h5py.is_hdf5(data_dir):
        if num_classes in IMAGENET_NUM_CLASSES:
            classes = IMAGENET_NUM_CLASSES[num_classes]
            dataset = HDF5Dataset(hdf5_file=data_dir,
                                  root=val_dir,
                                  classes=classes,
                                  transform=transform)
        else:
            dataset = HDF5Dataset(hdf5_file=data_dir,
                                  root=val_dir,
                                  num_classes=num_classes,
                                  transform=transform)
    else:
        dataset = CachedDatasetFolder(root=os.path.join(data_dir, val_dir),
                                      num_classes=num_classes,
                                      transform=transform)
    return dataset
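
This variant returns the bare dataset, so batching is left to the caller; a
sketch with illustrative arguments:

# Hypothetical usage: build the dataset, then wrap it in a DataLoader.
val_dataset = create_validation_dataset(
    data_dir=os.path.expanduser("~/nta/data/imagenet"),
    val_dir="val",
    num_classes=100,
)
val_loader = torch.utils.data.DataLoader(val_dataset,
                                         batch_size=128,
                                         shuffle=False)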
Example #4
train_path = os.path.expanduser("~/nta/data/imagenet/train")
val_path = os.path.expanduser("~/nta/data/imagenet/val")

stats_mean, stats_std = datasets_stats["ImageNet"]
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(stats_mean, stats_std),
])

val_transform = transforms.Compose([
    transforms.Resize(256),  # formerly Scale (deprecated)
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(stats_mean, stats_std),
])

# load train dataset
t0 = time()
train_dataset = CachedDatasetFolder(train_path,
                                    transform=train_transform,
                                    num_classes=num_classes)
print("Loaded train dataset")
t1 = time()
print("Time spent to load train dataset: {:.2f}".format(t1 - t0))

# load test dataset
t0 = time()
test_dataset = CachedDatasetFolder(val_path,
                                   transform=val_transform,
                                   num_classes=num_classes)
print("Loaded test dataset")
t1 = time()
print("Time spent to load test dataset: {:.2f}".format(t1 - t0))

# load dataloaders
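The snippet breaks off at this point; by analogy with Example #1, the missing
tail presumably builds the loaders roughly as follows (batch sizes and worker
count are assumptions):

# Assumed continuation mirroring Example #1 -- not part of the original snippet.
train_loader = DataLoader(train_dataset,
                          shuffle=True,
                          batch_size=128,
                          pin_memory=True,
                          num_workers=8)
test_loader = DataLoader(test_dataset,
                         shuffle=False,
                         batch_size=128,
                         pin_memory=True,
                         num_workers=8)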
def create_train_dataloader(
    data_dir,
    train_dir,
    batch_size,
    workers,
    distributed,
    num_classes=1000,
    use_auto_augment=False,
):
    """
    Configure Imagenet training dataloader

    Creates :class:`torch.utils.data.DataLoader` using :class:`CachedDatasetFolder`
    or :class:`HDF5Dataset` pre-configured for the training cycle

    :param data_dir: The directory or hdf5 file containing the dataset
    :param train_dir: The directory or hdf5 group containing the training data
    :param batch_size: Images per batch
    :param workers: How many data loading subprocesses to use
    :param distributed: Whether or not to use `DistributedSampler`
    :param num_classes: Limit the dataset size to the given number of classes
    :param use_auto_augment: Whether to apply the AutoAugment ImageNet policy
    :return: torch.utils.data.DataLoader
    """
    if use_auto_augment:
        transform = transforms.Compose([
            RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            ImageNetPolicy(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225],
                                 inplace=True),
        ])
    else:
        transform = transforms.Compose([
            RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225],
                                 inplace=True),
        ])
    if h5py.is_hdf5(data_dir):
        # Use fixed Imagenet classes if mapping is available
        if num_classes in IMAGENET_NUM_CLASSES:
            classes = IMAGENET_NUM_CLASSES[num_classes]
            dataset = HDF5Dataset(hdf5_file=data_dir,
                                  root=train_dir,
                                  classes=classes,
                                  transform=transform)
        else:
            dataset = HDF5Dataset(hdf5_file=data_dir,
                                  root=train_dir,
                                  num_classes=num_classes,
                                  transform=transform)
    else:
        dataset = CachedDatasetFolder(root=os.path.join(data_dir, train_dir),
                                      num_classes=num_classes,
                                      transform=transform)
    if distributed:
        train_sampler = DistributedSampler(dataset)
    else:
        train_sampler = None

    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=train_sampler is None,
        num_workers=workers,
        sampler=train_sampler,
        pin_memory=torch.cuda.is_available(),
    )
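
A usage sketch for the training loader; all argument values are illustrative:

# Hypothetical single-process call. With distributed=False no
# DistributedSampler is created, so the loader shuffles on its own
# (shuffle=train_sampler is None evaluates to True).
train_loader = create_train_dataloader(
    data_dir=os.path.expanduser("~/nta/data/imagenet"),
    train_dir="train",
    batch_size=128,
    workers=8,
    distributed=False,
    num_classes=100,
    use_auto_augment=False,
)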
def create_train_dataset(data_dir,
                         train_dir,
                         num_classes=1000,
                         use_auto_augment=False,
                         sample_transform=None,
                         target_transform=None,
                         replicas_per_sample=1):
    """
    Configure Imagenet training dataset

    Creates :class:`CachedDatasetFolder` or :class:`HDF5Dataset` pre-configured
    for the training cycle

    :param data_dir: The directory or hdf5 file containing the dataset
    :param train_dir: The directory or hdf5 group containing the training data
    :param num_classes: Limit the dataset size to the given number of classes
    :param use_auto_augment: Whether to apply the AutoAugment ImageNet policy
    :param sample_transform: List of transforms acting on the samples
                             to be added to the defaults below
    :param target_transform: List of transforms acting on the targets
    :param replicas_per_sample: Number of replicas to create per sample
                                in the batch (each replica is transformed
                                independently). Used in maxup.

    :return: CachedDatasetFolder or HDF5Dataset
    """
    if use_auto_augment:
        transform = transforms.Compose([
            RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            ImageNetPolicy(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225],
                                 inplace=True),
        ])
    else:
        transform = transforms.Compose([
            RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225],
                                 inplace=True),
        ])

    # Append any caller-supplied sample transforms after the defaults
    transform = transforms.Compose([transform] + (sample_transform or []))

    if h5py.is_hdf5(data_dir):
        # Use fixed Imagenet classes if mapping is available
        if num_classes in IMAGENET_NUM_CLASSES:
            classes = IMAGENET_NUM_CLASSES[num_classes]
            dataset = HDF5Dataset(hdf5_file=data_dir,
                                  root=train_dir,
                                  classes=classes,
                                  transform=transform,
                                  target_transform=target_transform,
                                  replicas_per_sample=replicas_per_sample)
        else:
            dataset = HDF5Dataset(hdf5_file=data_dir,
                                  root=train_dir,
                                  num_classes=num_classes,
                                  transform=transform,
                                  target_transform=target_transform,
                                  replicas_per_sample=replicas_per_sample)
    else:
        dataset = CachedDatasetFolder(root=os.path.join(data_dir, train_dir),
                                      num_classes=num_classes,
                                      transform=transform,
                                      target_transform=target_transform)
    return dataset
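
A sketch of the maxup-style configuration the docstring mentions; the replica
count and path are assumptions:

# Hypothetical maxup setup: each sample appears twice per batch, transformed
# independently. Note that replicas_per_sample is only forwarded on the HDF5
# path above, so this assumes an hdf5 dataset file.
train_dataset = create_train_dataset(
    data_dir=os.path.expanduser("~/nta/data/imagenet.hdf5"),
    train_dir="train",
    num_classes=100,
    replicas_per_sample=2,
)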
Example #7
def _create_train_dataloader(data_dir,
                             train_dir,
                             batch_size,
                             workers,
                             distributed,
                             progressive_resize,
                             num_classes=1000):
    """
    Configure Imagenet training dataloader

    Creates :class:`torch.utils.data.DataLoader` using :class:`CachedDatasetFolder`
    or :class:`HDF5Dataset` pre-configured for the training cycle with an
    optional :class:`ProgressiveRandomResizedCrop` schedule where the image
    sizes can vary at different epochs during the cycle.

    :param data_dir: The directory or hdf5 file containing the dataset
    :param train_dir: The directory or hdf5 group containing the training data
    :param batch_size: Images per batch
    :param workers: How many data loading subprocesses to use
    :param distributed: Whether or not to use `DistributedSampler`
    :param progressive_resize: Dictionary containing the progressive resize schedule
    :param num_classes: Limit the dataset size to the given number of classes
    :return: torch.utils.data.DataLoader
    """
    if progressive_resize is None:
        # Standard size for all epochs
        resize_transform = RandomResizedCrop(224)
    else:
        # Convert progressive_resize dict from {str:int} to {int:int}
        progressive_resize = {int(k): v for k, v in progressive_resize.items()}
        resize_transform = ProgressiveRandomResizedCrop(
            progressive_resize=progressive_resize)

    transform = transforms.Compose([
        resize_transform,
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    if h5py.is_hdf5(data_dir):
        if num_classes in IMAGENET_NUM_CLASSES:
            classes = IMAGENET_NUM_CLASSES[num_classes]
            dataset = HDF5Dataset(hdf5_file=data_dir,
                                  root=train_dir,
                                  classes=classes,
                                  transform=transform)
        else:
            dataset = HDF5Dataset(hdf5_file=data_dir,
                                  root=train_dir,
                                  num_classes=num_classes,
                                  transform=transform)
    else:
        dataset = CachedDatasetFolder(root=os.path.join(data_dir, train_dir),
                                      num_classes=num_classes,
                                      transform=transform)
    if distributed:
        train_sampler = DistributedSampler(dataset)
    else:
        train_sampler = None

    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=train_sampler is None,
        num_workers=workers,
        sampler=train_sampler,
        pin_memory=torch.cuda.is_available(),
    )
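
The progressive-resize schedule maps a starting epoch to a crop size; keys may
arrive as strings (e.g. from a JSON config), which the function converts to
ints. An illustrative schedule:

# Hypothetical schedule: 160px crops from epoch 0, 224px from epoch 14,
# 288px from epoch 30. All values here are illustrative assumptions.
train_loader = _create_train_dataloader(
    data_dir=os.path.expanduser("~/nta/data/imagenet"),
    train_dir="train",
    batch_size=128,
    workers=8,
    distributed=False,
    progressive_resize={"0": 160, "14": 224, "30": 288},
    num_classes=100,
)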