Example #1
import numpy as np
import torch
from skimage.filters import gaussian as gblur  # the multichannel kwarg used below exists only in older scikit-image releases


def blobs(data_dir,
          batch_size,
          mode="base",
          normalize=True,
          norm_layer=None,
          size=32):
    """
    Minimal version since we use this dataset only for OOD evaluation.
    """
    data = np.float32(
        np.random.binomial(n=1, p=0.7, size=(10000, size, size, 3)))
    for i in range(10000):
        data[i] = gblur(data[i], sigma=1.5, multichannel=False)
        data[i][data[i] < 0.75] = 0.0

    dummy_targets = torch.ones(10000)
    data = torch.cat([
        norm_layer(x).unsqueeze(0)
        for x in torch.from_numpy(data.transpose((0, 3, 1, 2)))
    ])
    dataset = torch.utils.data.TensorDataset(data, dummy_targets)
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=4,
                                         pin_memory=True)
    return 0, loader, 0
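
For reference, a hedged usage sketch (not part of the original snippet): `norm_layer` can be any callable per-image normaliser, for example `torchvision.transforms.Normalize`; the unused arguments can be anything.

import torchvision.transforms as T

norm = T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
_, blob_loader, _ = blobs(data_dir=None, batch_size=128, norm_layer=norm)

images, targets = next(iter(blob_loader))
print(images.shape)   # torch.Size([128, 3, 32, 32])
print(targets[:5])    # all-ones dummy labels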
Example #2
    def _DataGenerator(self, dataName, num_examples):
        """
        create synthetic data using numpy

        """
        if dataName == "gaussian":
            # prepare gaussian data
            data = np.clip(
                np.random.normal(size=(num_examples, self.size, self.size, 3),
                                 scale=0.5), -1, 1).astype(np.float32)

        elif dataName == "rademacher":
            # prepare rademacher data
            data = np.random.binomial(
                n=1, p=0.5, size=(num_examples, self.size, self.size,
                                  3)).astype(np.float32) * 2 - 1

        elif dataName == "blob":
            data = np.random.binomial(n=1,
                                      p=0.7,
                                      size=(num_examples, self.size, self.size,
                                            3)).astype(np.float32)
            for i in range(num_examples):
                data[i] = gblur(data[i], sigma=1.5, multichannel=False)
                data[i][data[i] < 0.75] = 0.0
        else:
            # an unknown name would otherwise fail below with a NameError
            raise ValueError("unknown dataName: {}".format(dataName))

        label = np.random.randint(low=0,
                                  high=10,
                                  size=num_examples,
                                  dtype=np.int64)

        return data, label
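
The arrays come back channels-last (NHWC). A small hedged sketch, not from the original code, of how they would typically be wrapped into a PyTorch loader; the random arrays below merely stand in for the method's output.

import numpy as np
import torch

data = np.random.rand(16, 32, 32, 3).astype(np.float32)      # stand-in for the generated images
label = np.random.randint(0, 10, size=16).astype(np.int64)    # stand-in for the random labels

images = torch.from_numpy(data.transpose((0, 3, 1, 2)))       # NHWC -> NCHW for PyTorch models
dataset = torch.utils.data.TensorDataset(images, torch.from_numpy(label))
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False)

batch, targets = next(iter(loader))
print(batch.shape, targets.shape)    # torch.Size([8, 3, 32, 32]) torch.Size([8])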
Example #3
    def __init__(self,
                 size=32,
                 batch_sizes={'ood': 100},
                 noise_type='Gaussian',
                 dataset_len=2000):
        super(Noise, self).__init__(None, size, batch_sizes, {'ood': None})
        if isinstance(size, tuple):
            h, w = size
        else:
            h = w = size
        dummy_targets = torch.ones(dataset_len)
        # Original source of the noise formulation:
        # https://github.com/hendrycks/outlier-exposure
        if noise_type == 'Gaussian':
            noise_data = torch.from_numpy(
                np.float32(
                    np.clip(
                        np.random.normal(size=(dataset_len, 3, h, w),
                                         scale=0.5), -1, 1)))
        elif noise_type == 'Rademacher':
            noise_data = torch.from_numpy(
                np.random.binomial(n=1, p=0.5, size=(
                    dataset_len, 3, h, w)).astype(np.float32)) * 2 - 1
        elif noise_type == 'Blob':
            from skimage.filters import gaussian as gblur
            noise_data = np.float32(
                np.random.binomial(n=1, p=0.7, size=(dataset_len, h, w, 3)))
            for i in range(dataset_len):
                noise_data[i] = gblur(noise_data[i],
                                      sigma=1.5,
                                      multichannel=False)
                noise_data[i][noise_data[i] < 0.75] = 0.0

            noise_data = torch.from_numpy(noise_data.transpose(
                (0, 3, 1, 2))) * 2 - 1
        else:
            # an unknown noise_type would otherwise fail here with a NameError
            raise ValueError("unknown noise_type: {}".format(noise_type))
        noise_data = torch.utils.data.TensorDataset(noise_data, dummy_targets)
        self.loader = torch.utils.data.DataLoader(
            noise_data,
            batch_size=batch_sizes['ood'],
            shuffle=True,
            num_workers=12,
            pin_memory=True)
        self.noise_type = noise_type
        self.name = "{}_{}".format(self.name, noise_type)
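
A hedged usage sketch for the wrapper above. It is illustrative only: the parent class that receives `super().__init__(None, size, batch_sizes, {'ood': None})` is not shown here and is assumed to set `self.name`.

ood = Noise(size=32, batch_sizes={'ood': 100}, noise_type='Blob', dataset_len=2000)
for images, _ in ood.loader:        # targets are all-ones dummy labels
    print(images.shape)             # torch.Size([100, 3, 32, 32]), values in [-1, 1]
    break
print(ood.name)                     # "<parent-assigned name>_Blob"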
Example #4
# Excerpt from an OOD-evaluation script; `ood_data` (Rademacher noise at this
# point), `dummy_targets`, `args`, `ood_num_examples`, `gblur` and
# `get_and_print_results` are defined earlier in the full script.
ood_data = torch.utils.data.TensorDataset(ood_data, dummy_targets)
ood_loader = torch.utils.data.DataLoader(ood_data,
                                         batch_size=args.test_bs,
                                         shuffle=True)

print('\n\nRademacher Noise Calibration')
get_and_print_results(ood_loader)

# /////////////// Blob ///////////////

ood_data = np.float32(
    np.random.binomial(n=1,
                       p=0.7,
                       size=(ood_num_examples * args.num_to_avg, 32, 32, 3)))
for i in range(ood_num_examples * args.num_to_avg):
    ood_data[i] = gblur(ood_data[i], sigma=1.5, multichannel=False)
    ood_data[i][ood_data[i] < 0.75] = 0.0

dummy_targets = torch.ones(ood_num_examples * args.num_to_avg)
ood_data = torch.from_numpy(ood_data.transpose((0, 3, 1, 2))) * 2 - 1
ood_data = torch.utils.data.TensorDataset(ood_data, dummy_targets)
ood_loader = torch.utils.data.DataLoader(ood_data,
                                         batch_size=args.test_bs,
                                         shuffle=True,
                                         num_workers=args.prefetch,
                                         pin_memory=True)

print('\n\nBlob Calibration')
get_and_print_results(ood_loader)

# /////////////// Textures ///////////////
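
As a quick aside (not in the original script): the `* 2 - 1` remapping above puts the blob images on the same [-1, 1] scale as the Rademacher noise. A minimal check:

import numpy as np
import torch
from skimage.filters import gaussian as gblur

sample = np.float32(np.random.binomial(n=1, p=0.7, size=(4, 32, 32, 3)))
for i in range(4):
    sample[i] = gblur(sample[i], sigma=1.5, multichannel=False)
    sample[i][sample[i] < 0.75] = 0.0
sample = torch.from_numpy(sample.transpose((0, 3, 1, 2))) * 2 - 1
print(sample.min().item(), sample.max().item())   # approximately -1.0 and just under 1.0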
Example #5
import os

import numpy as np
import torch
import torchvision as tv
from torchvision import datasets

# Module-level helpers assumed from the original file: __DATASETS_DEFAULT_PATH,
# limit_ds, CIFAR10_1, RandomDatasetGenerator, _DATASET_META_DATA,
# _IMAGINE_CONFIGS, _imagenet_dogs and _imagenet_cats.


def get_dataset(name,
                split='train',
                transform=None,
                target_transform=None,
                download=True,
                datasets_path=__DATASETS_DEFAULT_PATH,
                limit=None,
                shuffle_before_limit=False,
                limit_shuffle_seed=None,
                class_ids=None,
                per_class_limit=True,
                split_start=0):
    train = (split == 'train')
    if '+' in name:
        ds = None
        for ds_name in name.split('+'):
            ds_ = get_dataset(ds_name,
                              split,
                              transform,
                              target_transform,
                              download,
                              limit=limit,
                              shuffle_before_limit=shuffle_before_limit,
                              datasets_path=__DATASETS_DEFAULT_PATH,
                              split_start=split_start)
            if ds is None:
                ds = ds_
            else:
                ds += ds_
                ds.targets = ds.datasets[0].targets + ds.datasets[1].targets

        if limit or class_ids:
            ds = limit_ds(ds,
                          limit,
                          per_class=per_class_limit,
                          shuffle=shuffle_before_limit,
                          seed=limit_shuffle_seed,
                          allowed_classes=class_ids,
                          split_start=split_start)
        return ds

    if name.endswith('_3c'):
        transform = tv.transforms.Compose([
            tv.transforms.ToTensor(), lambda x: x.repeat(3, 1, 1),
            tv.transforms.ToPILImage(), transform
        ])
        name = name[:-3]
    if name.endswith('-raw'):
        ds_dir_name = name[:-4]
    elif name.startswith('folder-'):
        ds_dir_name = name[7:]
    elif name == 'places365_standard-lsun':
        ds_dir_name = 'places365_standard'
        name = ds_dir_name
        class_ids = filter(
            lambda x: x not in [52, 66, 91, 92, 102, 121, 203, 215, 284, 334],
            range(365))
    elif name.startswith('DomainNet-'):
        parts = name.split('-')
        domain = parts[1]
        if name.endswith('-measure') and train:
            ds_dir_name = os.path.join('DomainNet', 'measure', domain)
        else:
            ds_dir_name = os.path.join('DomainNet',
                                       'train' if train else 'test', domain)
        name = name.replace('-measure', '')
        if name.endswith('-A') or name.endswith('-B'):
            subset = parts[2]
            class_ids = range(173) if subset == 'A' else range(173, 345)
    elif name.endswith('-dogs') or name.endswith('-cats'):
        if name.startswith('imagenet-'):
            if name.endswith('dogs'):
                _ids = _imagenet_dogs.keys()
            else:
                _ids = _imagenet_cats.keys()
        else:
            _ids = [1] if name.endswith('dogs') else [0]

        return get_dataset(name[:-5],
                           split,
                           transform,
                           target_transform,
                           download,
                           limit=limit,
                           shuffle_before_limit=True,
                           datasets_path=__DATASETS_DEFAULT_PATH,
                           class_ids=_ids,
                           per_class_limit=False,
                           limit_shuffle_seed=0,
                           split_start=split_start)
    elif name.startswith('imagine-'):
        if train:
            ds_dir_name = None
            for i_cfg in _IMAGINE_CONFIGS:
                idx = name.find(i_cfg)
                if idx > 0:
                    ds_dir_name = os.path.join(name[:idx - 1], i_cfg,
                                               name[idx + len(i_cfg) + 1:])
                    #print(ds_dir_name)
                    break
            assert ds_dir_name is not None
        else:
            return get_dataset(name.split('-')[1],
                               split,
                               transform,
                               target_transform,
                               download,
                               limit=limit,
                               shuffle_before_limit=shuffle_before_limit,
                               datasets_path=__DATASETS_DEFAULT_PATH,
                               split_start=split_start)
    else:
        ds_dir_name = name

    root = os.path.join(datasets_path, ds_dir_name)
    if name == 'cifar10':
        return datasets.CIFAR10(root=root,
                                train=train,
                                transform=transform,
                                target_transform=target_transform,
                                download=download)
    elif name == 'cifar100':
        return datasets.CIFAR100(root=root,
                                 train=train,
                                 transform=transform,
                                 target_transform=target_transform,
                                 download=download)
    elif name == 'cifar10.1':
        return CIFAR10_1(root=root,
                         transform=transform,
                         target_transform=target_transform,
                         download=download)
    elif name.startswith('cifar10_custom'):
        ds_ = []
        if 'val' in split:
            ds_.append(
                get_dataset('cifar10',
                            split='val',
                            transform=transform,
                            download=download,
                            target_transform=target_transform))
        if '10.1' in split:
            ds_.append(
                get_dataset('cifar10.1',
                            split='',
                            transform=transform,
                            download=download,
                            target_transform=target_transform))
        if 'train' in split:
            ds_.append(
                get_dataset('cifar10',
                            split='train',
                            transform=transform,
                            download=download,
                            target_transform=target_transform))
        if 'ext' in split:
            ds_.append(
                get_dataset('folder-cifar10_ext',
                            split='val',
                            transform=transform,
                            download=download,
                            target_transform=target_transform))
        ds = ds_[0]
        for d in ds_[1:]:
            ds += d

        return limit_ds(ds,
                        limit=limit,
                        split_start=split_start,
                        per_class=per_class_limit,
                        shuffle=shuffle_before_limit,
                        seed=limit_shuffle_seed,
                        allowed_classes=class_ids)
    elif name.lower().startswith('svhn_custom'):
        ds_ = []
        if 'val' in split or 'test' in split:
            ds_.append(
                get_dataset('SVHN',
                            split='test',
                            transform=transform,
                            download=download,
                            target_transform=target_transform))
        if 'train' in split:
            ds_.append(
                get_dataset('SVHN',
                            split='train',
                            transform=transform,
                            download=download,
                            target_transform=target_transform))

        ds = ds_[0]
        for d in ds_[1:]:
            ds += d
        return limit_ds(ds,
                        limit=limit,
                        split_start=split_start,
                        per_class=per_class_limit,
                        shuffle=shuffle_before_limit,
                        seed=limit_shuffle_seed,
                        allowed_classes=class_ids)
    elif name.startswith('cifar100_custom'):
        ds_ = []
        if 'val' in split or 'test' in split:
            ds_.append(
                get_dataset('cifar100',
                            split='test',
                            transform=transform,
                            download=download,
                            target_transform=target_transform))
        if 'train' in split:
            ds_.append(
                get_dataset('cifar100',
                            split='train',
                            transform=transform,
                            download=download,
                            target_transform=target_transform))

        ds = ds_[0]
        for d in ds_[1:]:
            ds += d
        return limit_ds(ds,
                        limit=limit,
                        split_start=split_start,
                        per_class=per_class_limit,
                        shuffle=shuffle_before_limit,
                        seed=limit_shuffle_seed,
                        allowed_classes=class_ids)

    elif name == 'mnist' or name == 'mnist_3c':
        return datasets.MNIST(root=root,
                              train=train,
                              transform=transform,
                              target_transform=target_transform,
                              download=download)
    elif name == 'SVHN':
        return datasets.SVHN(root=root,
                             split='test' if not train else 'train',
                             transform=transform,
                             target_transform=target_transform,
                             download=download)
    elif 'stl10' in name:
        if train and name.endswith('train_test'):
            return datasets.STL10(root=root,
                                  split='train',
                                  transform=transform,
                                  target_transform=target_transform,
                                  download=download) + datasets.STL10(
                                      root=root,
                                      split='test',
                                      transform=transform,
                                      target_transform=target_transform,
                                      download=download)
        return datasets.STL10(root=root,
                              split=split,
                              transform=transform,
                              target_transform=target_transform,
                              download=download)
    elif name == 'LSUN':
        return datasets.LSUN(root=root,
                             classes=split,
                             transform=transform,
                             target_transform=target_transform)
    elif name.startswith('folder'):
        ds = datasets.ImageFolder(root=root,
                                  transform=transform,
                                  target_transform=target_transform)
        if limit or class_ids:
            ds = limit_ds(ds,
                          limit,
                          per_class=per_class_limit,
                          shuffle=shuffle_before_limit,
                          seed=limit_shuffle_seed,
                          allowed_classes=class_ids,
                          split_start=split_start)
        return ds
    elif name in ['imagenet', 'cats_vs_dogs', 'places365_standard'] or any(
            i in name for i in ['imagine-', '-raw']):
        if train:
            root = os.path.join(root, 'train')
        else:
            root = os.path.join(root, 'val')
        ds = datasets.ImageFolder(root=root,
                                  transform=transform,
                                  target_transform=target_transform)
        if limit or class_ids:
            if 'no_dd' in name:
                ds = limit_ds(ds,
                              limit * len(ds.classes),
                              per_class=False,
                              shuffle=shuffle_before_limit,
                              seed=limit_shuffle_seed,
                              split_start=split_start)
            else:
                ds = limit_ds(ds,
                              limit,
                              per_class=per_class_limit,
                              shuffle=shuffle_before_limit,
                              seed=limit_shuffle_seed,
                              allowed_classes=class_ids,
                              split_start=split_start)
        return ds
    elif name.startswith('DomainNet-'):
        ds = datasets.ImageFolder(root=root,
                                  transform=transform,
                                  target_transform=target_transform)
        if limit or class_ids:
            return limit_ds(ds,
                            limit=limit,
                            per_class=per_class_limit,
                            shuffle=shuffle_before_limit,
                            seed=limit_shuffle_seed,
                            allowed_classes=class_ids)
        return ds

    elif name.startswith('random-'):
        np.random.seed(limit_shuffle_seed)
        n_samples = limit or 10000
        dummy_targets = torch.ones(n_samples)

        ds_name = name[7:]
        if name.endswith('-normal'):
            mean, std = [0., 0., 0.], [1., 1., 1.]
            return RandomDatasetGenerator([3, 512, 512],
                                          mean,
                                          std,
                                          limit=n_samples,
                                          transform=transform,
                                          train=train)
        use_random_test = False
        if name.endswith('-rt'):
            use_random_test = True
            ds_name = ds_name[:-3]

        if ds_name in _DATASET_META_DATA:
            meta = _DATASET_META_DATA[ds_name]
            nclasses, data_shape, mean, std = meta.get_attrs().values()

        ## borrowed from https://github.com/hendrycks/outlier-exposure/
        elif ds_name == 'gaussian':
            samples = torch.from_numpy(
                np.clip(
                    np.random.normal(size=(n_samples, 3, 32, 32),
                                     loc=0.5,
                                     scale=0.5).astype(np.float32), 0, 1))
            return torch.utils.data.TensorDataset(samples, dummy_targets)
        elif ds_name == 'rademacher' or ds_name == 'bernoulli':
            samples = torch.from_numpy(
                np.random.binomial(n=1, p=0.5, size=(n_samples, 3, 32,
                                                     32)).astype(np.float32))
            if ds_name == 'rademacher':
                samples = samples * 2 - 1
            return torch.utils.data.TensorDataset(samples, dummy_targets)
        elif ds_name == 'blob':
            from skimage.filters import gaussian as gblur
            samples = np.float32(
                np.random.binomial(n=1, p=0.7, size=(n_samples, 32, 32, 3)))
            for i in range(n_samples):
                samples[i] = gblur(samples[i], sigma=1.5, multichannel=False)
                samples[i][samples[i] < 0.75] = 0.0

            samples = torch.from_numpy(samples.transpose((0, 3, 1, 2)))
            return torch.utils.data.TensorDataset(samples, dummy_targets)
        else:
            raise NotImplementedError

        limit = limit or 1000
        if per_class_limit:
            limit = limit * nclasses

        if train or use_random_test:
            return RandomDatasetGenerator(data_shape,
                                          mean,
                                          std,
                                          limit=limit,
                                          transform=transform,
                                          train=train)
        else:
            return get_dataset(ds_name,
                               split,
                               transform,
                               target_transform,
                               download,
                               limit=limit,
                               shuffle_before_limit=shuffle_before_limit,
                               datasets_path=__DATASETS_DEFAULT_PATH,
                               split_start=split_start)

    elif hasattr(datasets, name):
        return getattr(datasets, name)(root=datasets_path,
                                       train=train,
                                       transform=transform,
                                       target_transform=target_transform,
                                       download=download)
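
Finally, a hedged usage sketch for `get_dataset` (illustrative only; it assumes the module-level helpers listed above are available and that 'blob' is not a key of `_DATASET_META_DATA`, so the synthetic branch is taken):

import torch
import torchvision.transforms as T

# In-distribution data: the plain CIFAR-10 training split.
train_ds = get_dataset('cifar10', split='train', transform=T.ToTensor())

# Synthetic OOD data: 1000 blob-noise images with dummy labels.
blob_ds = get_dataset('random-blob', limit=1000)
loader = torch.utils.data.DataLoader(blob_ds, batch_size=100, shuffle=False)
images, targets = next(iter(loader))
print(len(train_ds), images.shape)    # 50000 torch.Size([100, 3, 32, 32])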