Example #1
def torchvision_dataset(transform=None, train=True, subset=None):
    """Creates a dataset from torchvision, configured using Command Line Arguments.

    Args:
        transform (callable, optional): A function that transforms an image (default None).
        train (bool, optional): Whether to load the training or the
            validation set, if applicable (default True).
        subset (string, optional): Specifies the subset of the relevant
            categories, if any of them were split (default None).

    Relevant Command Line Arguments:

        - **dataset**: `--data`, `--torchvision_dataset`.

    Note:
        Settings are automatically acquired from a call to :func:`dlt.config.parse`
        using the built-in arguments. If :func:`dlt.config.parse` was not called
        in the main script, this function will call it.

    Warning:
        Unlike the torchvision datasets, this function returns a dataset that
        uses NumPy arrays instead of PIL Images.
    """
    opts = fetch_opts(['dataset'], subset)

    if opts.torchvision_dataset is None:
        if subset is not None:
            apnd = '_' + subset
        else:
            apnd = ''
        raise ValueError('No value given for --torchvision_dataset{0}.'.format(apnd))

    if opts.torchvision_dataset == 'mnist':
        from torchvision.datasets import MNIST
        MNIST.__getitem__ = _custom_get_item
        ret_dataset = MNIST(opts.data, train=train, download=True, transform=transform)
        # Add channel dimension for consistency
        if train:
            ret_dataset.train_data = ret_dataset.train_data.unsqueeze(3)
        else:
            ret_dataset.test_data = ret_dataset.test_data.unsqueeze(3)
    elif opts.torchvision_dataset == 'fashionmnist':
        from torchvision.datasets import FashionMNIST
        FashionMNIST.__getitem__ = _custom_get_item
        ret_dataset = FashionMNIST(opts.data, train=train, download=True, transform=transform)
        if train:
            ret_dataset.train_data = ret_dataset.train_data.unsqueeze(3)
        else:
            ret_dataset.test_data = ret_dataset.test_data.unsqueeze(3)
    elif opts.torchvision_dataset == 'cifar10':
        from torchvision.datasets import CIFAR10
        CIFAR10.__getitem__ = _custom_get_item
        ret_dataset = CIFAR10(opts.data, train=train, download=True, transform=transform)
    elif opts.torchvision_dataset == 'cifar100':
        from torchvision.datasets import CIFAR100
        CIFAR100.__getitem__ = _custom_get_item
        ret_dataset = CIFAR100(opts.data, train=train, download=True, transform=transform)
    else:
        raise ValueError('Unknown --torchvision_dataset: {0}'.format(opts.torchvision_dataset))
    return ret_dataset
Example #2
def get_dataloaders(data='mnist',
                    train_bs=128,
                    test_bs=500,
                    root='./data',
                    ohe_labels=False,
                    train_fraction=1.):
    to_tensor = transforms.ToTensor()
    if data == 'mnist':
        trainset = MNIST(root, train=True, download=True, transform=to_tensor)
        if train_fraction < 1.:
            data, _, labels, _ = train_test_split(
                trainset.train_data.numpy(),
                trainset.train_labels.numpy(),
                stratify=trainset.train_labels.numpy(),
                train_size=train_fraction)
            trainset.train_data, trainset.train_labels = torch.ByteTensor(
                data), torch.LongTensor(labels)

        # keep only the digit-0 samples
        idx = torch.LongTensor(np.where(trainset.train_labels.numpy() == 0)[0])
        trainset.train_data = trainset.train_data[idx]
        trainset.train_labels = trainset.train_labels[idx]

        if ohe_labels:
            x = trainset.train_labels.numpy()
            ohe = np.zeros((len(x), 10))
            ohe[np.arange(ohe.shape[0]), x] = 1
            trainset.train_labels = torch.from_numpy(ohe.astype(np.float32))

        testset = MNIST(root, train=False, download=True, transform=to_tensor)
        if ohe_labels:
            x = testset.test_labels.numpy()
            ohe = np.zeros((len(x), 10))
            ohe[np.arange(ohe.shape[0]), x] = 1
            testset.test_labels = torch.from_numpy(ohe.astype(np.float32))
    elif data == 'not-mnist':
        # NotMNIST stored in MNIST's on-disk format; both loaders read its test split
        trainset = MNIST(root=os.path.join(root, 'not-mnist'),
                         train=False,
                         download=True,
                         transform=to_tensor)
        testset = MNIST(root=os.path.join(root, 'not-mnist'),
                        train=False,
                        download=True,
                        transform=to_tensor)
    else:
        raise NotImplementedError

    trainloader = torch.utils.data.DataLoader(trainset, batch_size=train_bs)
    testloader = torch.utils.data.DataLoader(testset, batch_size=test_bs)

    return trainloader, testloader
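
The snippets on this page mutate the old `train_data`/`train_labels` and `test_data`/`test_labels` attributes. Newer torchvision releases consolidated these into `data` and `targets`, so the code above only runs on old versions as written. A small compatibility shim, if you need to port it:

def get_raw(dataset, train=True):
    # Return (images, labels) regardless of torchvision version.
    if hasattr(dataset, 'data'):  # newer torchvision: unified attributes
        return dataset.data, dataset.targets
    if train:                     # older torchvision: split-specific attributes
        return dataset.train_data, dataset.train_labels
    return dataset.test_data, dataset.test_labels
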
Example #3
def fetch_dataloader(args, train=True, download=True, mini_size=128):
    # load dataset and init in the dataloader

    transforms = T.Compose([T.ToTensor()])
    dataset = MNIST(root=args.data_dir,
                    train=train,
                    download=download,
                    transform=transforms)

    # optionally trim the dataset to a small debugging subset
    if args.mini_data:
        if train:
            dataset.train_data = dataset.train_data[:mini_size]
            dataset.train_labels = dataset.train_labels[:mini_size]
        else:
            dataset.test_data = dataset.test_data[:mini_size]
            dataset.test_labels = dataset.test_labels[:mini_size]

    kwargs = {
        'num_workers': 1,
        'pin_memory': True
    } if args.device.type == 'cuda' else {}

    dl = DataLoader(dataset,
                    batch_size=args.batch_size,
                    shuffle=train,
                    drop_last=True,
                    **kwargs)

    return dl
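
A minimal driver for `fetch_dataloader`, assuming `args` carries the four fields the function reads (`data_dir`, `mini_data`, `device`, `batch_size`); the values here are illustrative only:

from argparse import Namespace
import torch

args = Namespace(data_dir='./data',
                 mini_data=True,  # trim the dataset to `mini_size` samples
                 batch_size=32,
                 device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
dl = fetch_dataloader(args, train=True, mini_size=128)
images, labels = next(iter(dl))  # images: (32, 1, 28, 28) after ToTensor
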
Example #4
def get_loaders(nb_labelled, batch_size, unlab_rat, lab_inds=None):
    transform_train, transform_test = augment()

    trainset_l = MNIST(root='./data',
                       train=True,
                       download=True,
                       transform=transform_train)
    test_set = MNIST(root='./data',
                     train=False,
                     download=True,
                     transform=transform_test)
    nb_class = 10

    print('total training dataset - ')
    print(trainset_l.train_data.shape, len(trainset_l.train_labels))
    if lab_inds is None or len(lab_inds) == 0:
        lab_inds = []
        for i in range(nb_class):
            labels = np.array(trainset_l.train_labels)
            inds_i = np.where(labels == i)[0]
            inds_i = np.random.permutation(inds_i)
            lab_inds.extend(inds_i[0:int(nb_labelled / nb_class)].tolist())
        lab_inds = np.array(lab_inds)

    all_inds = np.arange(len(trainset_l.train_labels))
    unlab_inds = np.setdiff1d(all_inds, lab_inds)

    trainset_u = copy.deepcopy(trainset_l)
    trainset_u.train_data = torch.tensor(
        np.array(trainset_u.train_data)[unlab_inds])
    trainset_u.train_labels = torch.tensor(
        np.array(trainset_u.train_labels)[unlab_inds])
    trainloader_u = DataLoader(trainset_u,
                               batch_size=batch_size,
                               shuffle=False)
    print('unlabelled part of training dataset - ')
    print(trainset_u.train_data.shape, len(trainset_u.train_labels))

    trainset_l.train_data = torch.tensor(
        np.array(trainset_l.train_data)[lab_inds])
    trainset_l.train_labels = torch.tensor(
        np.array(trainset_l.train_labels)[lab_inds])
    print('labelled part of training dataset - ')
    print(trainset_l.train_data.shape, len(trainset_l.train_labels))
    trainloader_l = DataLoader(trainset_l, batch_size=batch_size, shuffle=True)

    testloader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

    loaders = {
        "trainloader_l": trainloader_l,
        "testloader": testloader,
        "trainloader_u": trainloader_u,
        "trainset_l": trainset_l,
        "test_set": test_set,
        "trainset_u": trainset_u,
        "lab_inds": lab_inds
    }
    return loaders
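
For reference, a sketch of how the returned dict might be used; `augment` is assumed to exist in the surrounding module, and `unlab_rat` is accepted but unused by the function above:

loaders = get_loaders(nb_labelled=1000, batch_size=64, unlab_rat=1.0)
for x_l, y_l in loaders['trainloader_l']:  # ~100 labelled samples per class
    pass
# reuse the same labelled indices for a comparable second run
loaders2 = get_loaders(1000, 64, 1.0, lab_inds=loaders['lab_inds'])
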
Example #5
def fetch_dataloader(params, train=True, mini_size=128):

    # load dataset and init in the dataloader
    transforms = T.Compose([T.ToTensor()])
    dataset = MNIST(root=params.data_dir,
                    train=train,
                    download=True,
                    transform=transforms)

    if params.dict.get('mini_data'):
        if train:
            dataset.train_data = dataset.train_data[:mini_size]
            dataset.train_labels = dataset.train_labels[:mini_size]
        else:
            dataset.test_data = dataset.test_data[:mini_size]
            dataset.test_labels = dataset.test_labels[:mini_size]

    if params.dict.get('mini_ones'):
        # keep only samples of digit 1 from the first 2000
        if train:
            labels = dataset.train_labels[:2000]
            mask = labels == 1
            dataset.train_labels = labels[mask][:mini_size]
            dataset.train_data = dataset.train_data[:2000][mask][:mini_size]
        else:
            labels = dataset.test_labels[:2000]
            mask = labels == 1
            dataset.test_labels = labels[mask][:mini_size]
            dataset.test_data = dataset.test_data[:2000][mask][:mini_size]

    kwargs = {
        'num_workers': 1,
        'pin_memory': True
    } if torch.cuda.is_available() and params.device.type == 'cuda' else {}

    return DataLoader(dataset,
                      batch_size=params.batch_size,
                      shuffle=True,
                      drop_last=True,
                      **kwargs)
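
`params` here looks like the JSON-backed config object used in common PyTorch project templates, where `params.dict` mirrors the attribute dictionary. A minimal stand-in sufficient for this function, purely for illustration:

import torch

class Params:
    # Minimal stand-in: attribute access plus a `.dict` view, as the code expects.
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    @property
    def dict(self):
        return self.__dict__

params = Params(data_dir='./data', batch_size=32, mini_data=True,
                device=torch.device('cpu'))
dl = fetch_dataloader(params, train=True)
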
Example #6
def load_data(opt):
    """ Load Data

    Args:
        opt (Namespace): parsed command line options

    Raises:
        IOError: Cannot Load Dataset

    Returns:
        dict: a DataLoader per split
    """

    ##
    # LOAD DATA SET
    if opt.dataroot == '':
        opt.dataroot = './data/{}'.format(opt.dataset)

    if opt.dataset in ['cifar10']:
        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': False}

        transform = transforms.Compose([
            transforms.Resize(opt.isize),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        classes = {
            'plane': 0,
            'car': 1,
            'bird': 2,
            'cat': 3,
            'deer': 4,
            'dog': 5,
            'frog': 6,
            'horse': 7,
            'ship': 8,
            'truck': 9
        }

        dataset = {}
        dataset['train'] = CIFAR10(root='./data',
                                   train=True,
                                   download=True,
                                   transform=transform)
        dataset['test'] = CIFAR10(root='./data',
                                  train=False,
                                  download=True,
                                  transform=transform)

        dataset['train'].train_data, dataset['train'].train_labels, \
        dataset['test'].test_data, dataset['test'].test_labels = get_cifar_anomaly_dataset(
            trn_img=dataset['train'].train_data,
            trn_lbl=dataset['train'].train_labels,
            tst_img=dataset['test'].test_data,
            tst_lbl=dataset['test'].test_labels,
            abn_cls_idx=classes[opt.anomaly_class]
        )

        dataloader = {
            x: torch.utils.data.DataLoader(dataset=dataset[x],
                                           batch_size=opt.batchsize,
                                           shuffle=shuffle[x],
                                           num_workers=int(opt.workers),
                                           drop_last=drop_last_batch[x])
            for x in splits
        }
        return dataloader

    elif opt.dataset in ['mnist']:
        opt.anomaly_class = int(opt.anomaly_class)

        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': True}

        transform = transforms.Compose([
            transforms.Resize(opt.isize),  # transforms.Scale was renamed to Resize
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])

        dataset = {}
        dataset['train'] = MNIST(root='./data',
                                 train=True,
                                 download=True,
                                 transform=transform)
        dataset['test'] = MNIST(root='./data',
                                train=False,
                                download=True,
                                transform=transform)

        dataset['train'].train_data, dataset['train'].train_labels, \
        dataset['test'].test_data, dataset['test'].test_labels = get_mnist_anomaly_dataset(
            trn_img=dataset['train'].train_data,
            trn_lbl=dataset['train'].train_labels,
            tst_img=dataset['test'].test_data,
            tst_lbl=dataset['test'].test_labels,
            abn_cls_idx=opt.anomaly_class
        )

        dataloader = {
            x: torch.utils.data.DataLoader(dataset=dataset[x],
                                           batch_size=opt.batchsize,
                                           shuffle=shuffle[x],
                                           num_workers=int(opt.workers),
                                           drop_last=drop_last_batch[x])
            for x in splits
        }
        return dataloader

    elif opt.dataset in ['mnist2']:
        opt.anomaly_class = int(opt.anomaly_class)

        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': True}

        transform = transforms.Compose([
            transforms.Resize(opt.isize),
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])

        dataset = {}
        dataset['train'] = MNIST(root='./data',
                                 train=True,
                                 download=True,
                                 transform=transform)
        dataset['test'] = MNIST(root='./data',
                                train=False,
                                download=True,
                                transform=transform)

        dataset['train'].train_data, dataset['train'].train_labels, \
        dataset['test'].test_data, dataset['test'].test_labels = get_mnist2_anomaly_dataset(
            trn_img=dataset['train'].train_data,
            trn_lbl=dataset['train'].train_labels,
            tst_img=dataset['test'].test_data,
            tst_lbl=dataset['test'].test_labels,
            nrm_cls_idx=opt.anomaly_class,
            proportion=opt.proportion
        )

        dataloader = {
            x: torch.utils.data.DataLoader(dataset=dataset[x],
                                           batch_size=opt.batchsize,
                                           shuffle=shuffle[x],
                                           num_workers=int(opt.workers),
                                           drop_last=drop_last_batch[x])
            for x in splits
        }
        return dataloader

    else:
        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': True}
        transform = transforms.Compose([
            transforms.Resize(opt.isize),
            transforms.CenterCrop(opt.isize),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ])

        dataset = {
            x: ImageFolder(os.path.join(opt.dataroot, x), transform)
            for x in splits
        }
        dataloader = {
            x: torch.utils.data.DataLoader(dataset=dataset[x],
                                           batch_size=opt.batchsize,
                                           shuffle=shuffle[x],
                                           num_workers=int(opt.workers),
                                           drop_last=drop_last_batch[x])
            for x in splits
        }
        return dataloader
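
A sketch of the `opt` namespace this function expects, using only attribute names read above (`dataset`, `dataroot`, `isize`, `batchsize`, `workers`, `anomaly_class`; `proportion` is needed for the 'mnist2' branch); values are illustrative:

from argparse import Namespace

opt = Namespace(dataset='mnist', dataroot='', isize=32, batchsize=64,
                workers=2, anomaly_class='1', proportion=0.1)
dataloader = load_data(opt)
images, labels = next(iter(dataloader['train']))  # relabelled for anomaly detection
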
Example #7
def load_data(opt):
    """ 
    Args:
        opt (Namespace): parsed command line options
    Raises:
        IOError: Cannot Load Dataset
    Returns:
        dict: a DataLoader per split
    """

    ##
    # LOAD DATA SET
    if opt.dataroot == '':
        opt.dataroot = './data/{}'.format(opt.dataset)

    if opt.dataset in ['disk']:
        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': True}

        # transforms.Compose chains the individual transforms together
        transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(32),
            transforms.ToTensor(),  # range [0, 255] -> [0.0,1.0]
        ])

        good_sample = LoadDiskDataset(opt, 'good_sample', transform)
        failed_sample = LoadDiskDataset(opt, 'failed_sample', transform)

        dataset = {}
        dataset['train'], dataset['test'] = get_disk_anomaly_dataset(
            good_sample, failed_sample, 1)

        # build a dict of loaders
        # DataLoader's `shuffle` (default False) controls whether the data is reshuffled at every epoch.
        # Shuffling makes samples more independent, but should not be used when the data has sequential structure.
        # For disk failure prediction the order must not be shuffled, because SMART records are time-ordered.
        dataloader = {
            x: torch.utils.data.DataLoader(dataset=dataset[x],
                                           batch_size=opt.batchsize,
                                           shuffle=shuffle[x],
                                           num_workers=int(opt.workers),
                                           drop_last=drop_last_batch[x])
            for x in splits
        }
        return dataloader
    ##
    elif opt.dataset in ['mnist']:
        opt.anomaly_class = int(opt.anomaly_class)

        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': True}

        transform = transforms.Compose([
            transforms.Resize(opt.isize),
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])

        dataset = {}
        dataset['train'] = MNIST(root='./data',
                                 train=True,
                                 download=True,
                                 transform=transform)
        dataset['test'] = MNIST(root='./data',
                                train=False,
                                download=True,
                                transform=transform)

        dataset['train'].train_data, dataset['train'].train_labels, \
        dataset['test'].test_data, dataset['test'].test_labels = get_mnist_anomaly_dataset(
            trn_img=dataset['train'].train_data,
            trn_lbl=dataset['train'].train_labels,
            tst_img=dataset['test'].test_data,
            tst_lbl=dataset['test'].test_labels,
            abn_cls_idx=opt.anomaly_class
        )

        dataloader = {
            x: torch.utils.data.DataLoader(dataset=dataset[x],
                                           batch_size=opt.batchsize,
                                           shuffle=shuffle[x],
                                           num_workers=int(opt.workers),
                                           drop_last=drop_last_batch[x])
            for x in splits
        }
        return dataloader
Example #8
def load_data(opt):
    """ Load Data

    Args:
        opt (Namespace): parsed command line options

    Raises:
        IOError: Cannot Load Dataset

    Returns:
        dict: a DataLoader per split (a single loader for celebA)
    """

    ##
    # LOAD DATA SET
    if opt.dataroot == '':
        opt.dataroot = './data/{}'.format(opt.dataset)

    if opt.dataset in ['cifar10']:
        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': False}

        transform = transforms.Compose(
            [
                transforms.Resize(opt.isize),
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
            ]
        )

        classes = {
            'plane': 0, 'car': 1, 'bird': 2, 'cat': 3, 'deer': 4,
            'dog': 5, 'frog': 6, 'horse': 7, 'ship': 8, 'truck': 9
        }

        dataset = {}
        dataset['train'] = CIFAR10(root='./data', train=True, download=True, transform=transform)
        dataset['test'] = CIFAR10(root='./data', train=False, download=True, transform=transform)

        if opt.task == 'anomaly_detect':
            dataset['train'].train_data, dataset['train'].train_labels, \
            dataset['test'].test_data, dataset['test'].test_labels = get_cifar_anomaly_dataset(
                trn_img=dataset['train'].train_data,
                trn_lbl=dataset['train'].train_labels,
                tst_img=dataset['test'].test_data,
                tst_lbl=dataset['test'].test_labels,
                abn_cls_idx=classes[opt.anomaly_class]
            )
        elif opt.task == 'random_walk':
            dataset['train'].train_data, dataset['train'].train_labels, \
            dataset['test'].test_data, dataset['test'].test_labels = get_sub_cifar10_dataset(
                trn_img=dataset['train'].train_data,
                trn_lbl=dataset['train'].train_labels,
                tst_img=dataset['test'].test_data,
                tst_lbl=dataset['test'].test_labels,
            )
        elif opt.task == 'llk_trend':
            dataset['train'].train_data, dataset['train'].train_labels, \
            dataset['test'].test_data, dataset['test'].test_labels = get_sub_cifar10_dataset(
                trn_img=dataset['train'].train_data,
                trn_lbl=dataset['train'].train_labels,
                tst_img=dataset['test'].test_data,
                tst_lbl=dataset['test'].test_labels,
                abn_cls_idx=[0]
            )
        elif opt.task == 'rw_llk': ## for simplicity, handle both together
            dataset['train'].train_data, dataset['train'].train_labels, \
            dataset['test'].test_data, dataset['test'].test_labels = get_cifar_rwllk_dataset(
                trn_img=dataset['train'].train_data,
                trn_lbl=dataset['train'].train_labels,
                tst_img=dataset['test'].test_data,
                tst_lbl=dataset['test'].test_labels,
                abn_cls_idx=classes[opt.anomaly_class]
            )

        dataloader = {x: torch.utils.data.DataLoader(dataset=dataset[x],
                                                     batch_size=opt.batch_size,
                                                     shuffle=shuffle[x],
                                                     num_workers=int(opt.workers),
                                                     drop_last=drop_last_batch[x]) for x in splits}
        return dataloader
    elif opt.dataset in ['mnist']:
        opt.anomaly_class = int(opt.anomaly_class)

        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': True}
        ## this Resize + Normalize(0.5) transform works well for mnist
        transform = transforms.Compose([transforms.Resize((opt.isize, opt.isize)), transforms.ToTensor(),
                                        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])

        dataset = {}
        dataset['train'] = MNIST(root='./data', train=True, download=True, transform=transform)
        dataset['test'] = MNIST(root='./data', train=False, download=True, transform=transform)

        if opt.task == 'anomaly_detect':
            dataset['train'].train_data, dataset['train'].train_labels, \
            dataset['test'].test_data, dataset['test'].test_labels = get_mnist_anomaly_dataset(
                trn_img=dataset['train'].train_data,
                trn_lbl=dataset['train'].train_labels,
                tst_img=dataset['test'].test_data,
                tst_lbl=dataset['test'].test_labels,
                abn_cls_idx=opt.anomaly_class
            )
        elif opt.task == 'random_walk':
            dataset['train'].train_data, dataset['train'].train_labels, \
            dataset['test'].test_data, dataset['test'].test_labels = get_sub_mnist_dataset(
                trn_img=dataset['train'].train_data,
                trn_lbl=dataset['train'].train_labels,
                tst_img=dataset['test'].test_data,
                tst_lbl=dataset['test'].test_labels,
            )

        elif opt.task == 'llk_trend':
            dataset['train'].train_data, dataset['train'].train_labels, \
            dataset['test'].test_data, dataset['test'].test_labels = get_sub_mnist_dataset(
                trn_img=dataset['train'].train_data,
                trn_lbl=dataset['train'].train_labels,
                tst_img=dataset['test'].test_data,
                tst_lbl=dataset['test'].test_labels,
                abn_cls_idx=[0]
            )
        elif opt.task == 'rw_llk':  ## for simplicity, handle both together
            dataset['train'].train_data, dataset['train'].train_labels, \
            dataset['test'].test_data, dataset['test'].test_labels = get_mnist_rwllk_dataset(
                trn_img=dataset['train'].train_data,
                trn_lbl=dataset['train'].train_labels,
                tst_img=dataset['test'].test_data,
                tst_lbl=dataset['test'].test_labels,
                abn_cls_idx=opt.anomaly_class
            )

        dataloader = {x: torch.utils.data.DataLoader(dataset=dataset[x],
                                                     batch_size=opt.batch_size,
                                                     shuffle=shuffle[x],
                                                     num_workers=int(opt.workers),
                                                     drop_last=drop_last_batch[x]) for x in splits}

        return dataloader

    elif opt.dataset in ['svhn']:
        opt.anomaly_class = int(opt.anomaly_class)

        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': True}

        ## it seems we can use an identical transform for both cifar10 and mnist in our GAN
        ## the following did not work well at all:
        # transform = transforms.Compose(
        #     [
        #         transforms.Scale(opt.isize),
        #         transforms.ToTensor(),
        #         transforms.Normalize((0.1307,), (0.3081,))
        #     ]
        # )
        ##the second one is good for mnist
        transform = transforms.Compose([transforms.Resize((opt.isize, opt.isize)), transforms.ToTensor(),
                                        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])

        dataset = {}
        dataset['train'] = SVHN(root='./data', split='train', download=True, transform=transform)
        dataset['test'] = SVHN(root='./data', split='test', download=True, transform=transform)

        if opt.task == 'anomaly_detect':
            # SVHN stores its arrays in `.data`/`.labels` (there are no
            # `train_data`/`test_data` attributes), so write the filtered
            # results back to the attributes the dataset actually reads
            dataset['train'].data, dataset['train'].labels, \
            dataset['test'].data, dataset['test'].labels = get_cifar_anomaly_dataset( ## not sure if we need a dedicated get_svhn_anomaly_dataset yet
                trn_img=dataset['train'].data,
                trn_lbl=dataset['train'].labels,
                tst_img=dataset['test'].data,
                tst_lbl=dataset['test'].labels,
                abn_cls_idx=opt.anomaly_class
            )


        dataloader = {x: torch.utils.data.DataLoader(dataset=dataset[x],
                                                     batch_size=opt.batch_size,
                                                     shuffle=shuffle[x],
                                                     num_workers=int(opt.workers),
                                                     drop_last=drop_last_batch[x]) for x in splits}
        return dataloader

    elif opt.dataset in ['mnist2']:
        opt.anomaly_class = int(opt.anomaly_class)

        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': True}

        transform = transforms.Compose(
            [
                transforms.Resize(opt.isize),
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
            ]
        )

        dataset = {}
        dataset['train'] = MNIST(root='./data', train=True, download=True, transform=transform)
        dataset['test'] = MNIST(root='./data', train=False, download=True, transform=transform)

        if opt.task == 'anomaly_detect':
            dataset['train'].train_data, dataset['train'].train_labels, \
            dataset['test'].test_data, dataset['test'].test_labels = get_mnist2_anomaly_dataset(
                trn_img=dataset['train'].train_data,
                trn_lbl=dataset['train'].train_labels,
                tst_img=dataset['test'].test_data,
                tst_lbl=dataset['test'].test_labels,
                nrm_cls_idx=opt.anomaly_class,
                proportion=opt.proportion
            )

        dataloader = {x: torch.utils.data.DataLoader(dataset=dataset[x],
                                                     batch_size=opt.batch_size,
                                                     shuffle=shuffle[x],
                                                     num_workers=int(opt.workers),
                                                     drop_last=drop_last_batch[x]) for x in splits}
        return dataloader

    # elif opt.dataset in ['celebA']:
    #     ##not for abnormal detection but not for classification either
    #     splits = ['train', 'test']
    #     drop_last_batch = {'train': True, 'test': False}
    #     shuffle = {'train': True, 'test': True}
    #     transform = transforms.Compose([transforms.Scale(opt.isize),
    #                                     transforms.CenterCrop(opt.isize),
    #                                     transforms.ToTensor(),
    #                                     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ])
    #     print(os.path.abspath('./data/celebA'))
    #     dataset = datasets.ImageFolder(os.path.abspath('./data/celebA'), transform)
    #     dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=opt.batch_size, shuffle=True)
    #     return dataloader
    elif opt.dataset in ['celebA']:
        ## not for anomaly detection, nor for classification
        # import helper
        # helper.download_extract('celeba', opt.dataroot)
        # splits = ['train', 'test']
        # drop_last_batch = {'train': True, 'test': False}
        # shuffle = {'train': True, 'test': True}
        # transform = transforms.Compose([transforms.Scale(opt.isize),
        #                                 transforms.CenterCrop(opt.isize),
        #                                 transforms.ToTensor(),
        #                                 transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ])
        #
        # # transform = transforms.Compose([
        # #     transforms.CenterCrop(160),
        # #     transforms.Scale(opt.isize),
        # #     transforms.ToTensor(),)
        #
        # dataset = ImageFolder(root=image_root, transform=transforms.Compose([
        #     transforms.CenterCrop(160),
        #     transforms.Scale(scale_size),
        #     transforms.ToTensor(),
        #     #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        # ]))
        #
        # dataset = {x: ImageFolder(os.path.join(opt.dataroot, x), transform) for x in splits}
        # dataloader = {x: torch.utils.data.DataLoader(dataset=dataset[x],
        #                                              batch_size=opt.batch_size,
        #                                              shuffle=shuffle[x],
        #                                              num_workers=int(opt.workers),
        #                                              drop_last=drop_last_batch[x]) for x in splits}

        dataloader = get_loader(
            './data/celebA', 'train', opt.batch_size, opt.isize)
        return dataloader
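
A note on the SVHN branch above: torchvision's `SVHN` exposes `.data` (an `(N, 3, 32, 32)` channels-first uint8 array) and `.labels`, and never had `train_data`/`test_data` attributes, which is why the filtered arrays are read from and written back to `.data`/`.labels`. A quick check for orientation:

from torchvision.datasets import SVHN

svhn = SVHN(root='./data', split='train', download=True)
print(svhn.data.shape)   # (73257, 3, 32, 32), channels-first uint8
print(svhn.labels[:5])   # integer labels 0-9
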
Example #9
def load_data(opt):
    """ Load Data

    Args:
        opt (Namespace): parsed command line options

    Raises:
        IOError: Cannot Load Dataset

    Returns:
        dict: a DataLoader per split
    """

    ##
    # LOAD DATA SET
    if opt.dataroot == '':
        opt.dataroot = './data/{}'.format(opt.dataset)

    if opt.dataset in ['cifar10']:
        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': False}

        transform = transforms.Compose([
            transforms.Resize(opt.isize),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        classes = {
            'plane': 0,
            'car': 1,
            'bird': 2,
            'cat': 3,
            'deer': 4,
            'dog': 5,
            'frog': 6,
            'horse': 7,
            'ship': 8,
            'truck': 9
        }

        dataset = {}
        dataset['train'] = CIFAR10(root='./data',
                                   train=True,
                                   download=True,
                                   transform=transform)
        dataset['test'] = CIFAR10(root='./data',
                                  train=False,
                                  download=True,
                                  transform=transform)

        dataset['train'].train_data, dataset['train'].train_labels, \
        dataset['test'].test_data, dataset['test'].test_labels = get_cifar_anomaly_dataset(
            trn_img=dataset['train'].train_data,
            trn_lbl=dataset['train'].train_labels,
            tst_img=dataset['test'].test_data,
            tst_lbl=dataset['test'].test_labels,
            abn_cls_idx=classes[opt.anomaly_class]
        )

        dataloader = {
            x: torch.utils.data.DataLoader(dataset=dataset[x],
                                           batch_size=opt.batchsize,
                                           shuffle=shuffle[x],
                                           num_workers=int(opt.workers),
                                           drop_last=drop_last_batch[x])
            for x in splits
        }
        return dataloader

    elif opt.dataset in ['mnist']:
        opt.anomaly_class = int(opt.anomaly_class)

        splits = ['train', 'test']
        drop_last_batch = {'train': True, 'test': False}
        shuffle = {'train': True, 'test': True}

        transform = transforms.Compose([
            transforms.Resize(opt.isize),
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])

        dataset = {}
        dataset['train'] = MNIST(root='./data',
                                 train=True,
                                 download=True,
                                 transform=transform)
        dataset['test'] = MNIST(root='./data',
                                train=False,
                                download=True,
                                transform=transform)

        dataset['train'].train_data, dataset['train'].train_labels, \
        dataset['test'].test_data, dataset['test'].test_labels = get_mnist_anomaly_dataset(
            trn_img=dataset['train'].train_data,
            trn_lbl=dataset['train'].train_labels,
            tst_img=dataset['test'].test_data,
            tst_lbl=dataset['test'].test_labels,
            abn_cls_idx=opt.anomaly_class
        )

        dataloader = {
            x: torch.utils.data.DataLoader(dataset=dataset[x],
                                           batch_size=opt.batchsize,
                                           shuffle=shuffle[x],
                                           num_workers=int(opt.workers),
                                           drop_last=drop_last_batch[x])
            for x in splits
        }
        return dataloader

    else:
        splits = ['train', 'val', 'test']
        drop_last_batch = {'train': True, 'val': False, 'test': False}
        shuffle = {'train': True, 'val': True, 'test': True}

        # transform = transforms.Compose([transforms.Resize((opt.iheight, opt.iwidth)),
        # the PIL image tensor is in [0, 1];
        # normalizing it maps the values to [-1, +1]
        transform = transforms.Compose([
            # transforms.Crop((opt.iheight, opt.iwidth*2)),
            # MyCrop(0,0, opt.iheight, opt.iwidth*2),

            # resize or grayscale
            #transforms.Resize((opt.iheight, opt.iwidth)),
            #transforms.Grayscale(num_output_channels=3),

            #transforms.RandomHorizontalFlip(),
            #transforms.RandomApply((
            #    transforms.RandomRotation(15),
            #    transforms.RandomAffine(degrees=0, shear=15),
            #)),
            #transforms.RandomAffine(degrees=15, translate=(0.2, 0.2), shear=15),
            #transforms.RandomAffine(degrees=0, translate=(0.2, 0.2)),
            transforms.ToTensor(),  # convert a PIL image in [0, 255] to a tensor in [0, 1]
            # MyTransformation(chunks=2, splitDim=2, mergeDim=0),
            #transforms.Normalize((0.5,), (0.5,))  # for grayscale, 1 channel
            transforms.Normalize(
                (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
            )  # normalize the tensor: subtract the mean, divide by the std
        ])

        #dataset = {x: ImageFolder(os.path.join(opt.dataroot, x), transform) for x in splits}
        dataset = {
            x: ImageFolderWithPaths(os.path.join(opt.dataroot, x), transform)
            for x in splits
        }
        dataloader = {
            x: torch.utils.data.DataLoader(dataset=dataset[x],
                                           batch_size=opt.batchsize,
                                           shuffle=shuffle[x],
                                           num_workers=int(opt.workers),
                                           drop_last=drop_last_batch[x])
            for x in splits
        }
        return dataloader
Example #10
def get_datasets(args):
    if args.dataset == 'mnist':
        biased_weights = [(args.bias, 1 - args.bias)
                          for _ in range(args.n_classes)]
        balanced_weights = [(1, 1) for _ in range(args.n_classes)]
        override = sample_n(
            range(10), [10 // args.n_classes for _ in range(args.n_classes)])
        train_dataset = MNIST(root='./data/mnist', train=True, download=True)
        test_dataset = MNIST(root='./data/mnist', train=False, download=True)
        train_dataset.train_data = train_dataset.train_data.unsqueeze(
            -1).numpy()
        test_dataset.test_data = test_dataset.test_data.unsqueeze(-1).numpy()
        transform = Compose(
            [ToPILImage(), RandomCrop(28, padding=4),
             ToTensor()])

    elif args.dataset == 'cifar10':
        biased_weights = [(args.bias, 1 - args.bias)
                          for _ in range(args.n_classes)]
        balanced_weights = [(1, 1) for _ in range(args.n_classes)]
        override = sample_n(
            range(10), [10 // args.n_classes for _ in range(args.n_classes)])
        train_dataset = CIFAR10(root='./data/cifar10',
                                train=True,
                                download=True)
        test_dataset = CIFAR10(root='./data/cifar10',
                               train=False,
                               download=True)
        transform = Compose([
            ToPILImage(),
            RandomCrop(32, padding=4),
            RandomHorizontalFlip(),
            ToTensor()
        ])

    elif args.dataset == 'cifar100':  # uses predefined coarse labels
        coarse_labels = [(4, 30, 55, 72, 95), (1, 32, 67, 73, 91),
                         (54, 62, 70, 82, 92), (9, 10, 16, 28, 61),
                         (0, 51, 53, 57, 83), (22, 39, 40, 86, 87),
                         (5, 20, 25, 84, 94), (6, 7, 14, 18, 24),
                         (3, 42, 43, 88, 97), (12, 17, 37, 68, 76),
                         (23, 33, 49, 60, 71), (15, 19, 21, 31, 38),
                         (34, 63, 64, 66, 75), (26, 45, 77, 79, 99),
                         (2, 11, 35, 46, 98), (27, 29, 44, 78, 93),
                         (36, 50, 65, 74, 80), (47, 52, 56, 59, 96),
                         (8, 13, 48, 58, 90), (41, 69, 81, 85, 89)]
        biased_weights = [(args.bias, (1 - args.bias) / 4, (1 - args.bias) / 4,
                           (1 - args.bias) / 4, (1 - args.bias) / 4)
                          for _ in range(args.n_classes)]
        balanced_weights = [(1, 1, 1, 1, 1) for _ in range(args.n_classes)]
        override = random.sample(
            [random.sample(coarse_label, 5) for coarse_label in coarse_labels],
            args.n_classes)
        train_dataset = CIFAR100(root='./data/cifar100',
                                 train=True,
                                 download=True)
        test_dataset = CIFAR100(root='./data/cifar100',
                                train=False,
                                download=True)
        transform = Compose([
            ToPILImage(),
            RandomCrop(32, padding=4),
            RandomHorizontalFlip(),
            ToTensor()
        ])

    else:
        raise ValueError('unknown dataset: {}'.format(args.dataset))

    train_mixture = MixtureDataset(train_dataset.train_data[:-args.n_valid],
                                   train_dataset.train_labels[:-args.n_valid],
                                   mixture_weights=biased_weights,
                                   mixture_override=override,
                                   transform=transform)

    valid_mixture = MixtureDataset(train_dataset.train_data[-args.n_valid:],
                                   train_dataset.train_labels[-args.n_valid:],
                                   mixture_weights=balanced_weights,
                                   mixture_override=override,
                                   transform=transform)

    test_mixture = MixtureDataset(test_dataset.test_data,
                                  test_dataset.test_labels,
                                  mixture_weights=balanced_weights,
                                  mixture_override=override,
                                  transform=transform)

    return train_mixture, valid_mixture, test_mixture
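
Wiring the three mixtures into loaders, assuming an `args` namespace with the fields used above (`dataset`, `bias`, `n_classes`, `n_valid`); `MixtureDataset` and `sample_n` come from the surrounding module:

from argparse import Namespace
from torch.utils.data import DataLoader

args = Namespace(dataset='mnist', bias=0.9, n_classes=5, n_valid=5000)
train_mix, valid_mix, test_mix = get_datasets(args)
train_loader = DataLoader(train_mix, batch_size=128, shuffle=True)
valid_loader = DataLoader(valid_mix, batch_size=500)
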