def setup_data(p, test_set=False):
    """Load the dataset named by ``p.dataset`` and split it into subsets.

    Args:
        p: Parameter object supporting attribute access and ``.get()``.
            Reads ``dataset``, ``valid_set_size``, ``whiten_zca`` and
            ``contrast_norm``, plus the optional ``unlabeled_samples``
            and ``dseed``.
        test_set: When true, the "test" split is loaded as well.

    Returns:
        Tuple ``(in_dim, d, whiten, cnorm)`` where ``in_dim`` is the shape
        of one 'features' example, ``d`` is an ``AttributeDict`` holding
        ``train``/``valid`` (and optionally ``test``) datasets together
        with their ``*_ind`` index arrays, ``whiten`` is a fitted ``ZCA``
        or ``None``, and ``cnorm`` is a ``ContrastNorm`` or ``None``.

    Raises:
        KeyError: If ``p.dataset`` is not a known dataset name.
        AssertionError: If whitening is requested for multi-dimensional
            inputs and ``p.whiten_zca`` differs from the number of input
            dimensions.
    """
    dataset_class, training_set_size = {
        'cifar10': (CIFAR10, 40000),
        'mnist': (MNIST, 50000),
        'reddit': (SubredditTopPhotosFeatures22, 20000)
    }[p.dataset]

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class(("train", ))

    # Take all indices and permute them. A falsy dseed (missing, None or 0)
    # leaves the indices in their natural order.
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices.
    # numpy.setdiff1d returns sorted values, so the validation indices are
    # the smallest indices not used for training.
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(("test", ))
        d.test_ind = numpy.arange(d.test.num_examples)

    # Setup optional whitening, only used for Cifar-10
    features_pos = train_set.sources.index('features')
    in_dim = train_set.data_sources[features_pos].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        # numpy.prod replaces numpy.product, which NumPy 2.0 removed.
        n_dims = numpy.prod(in_dim)
        assert n_dims == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (n_dims, p.whiten_zca)

    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(dataset, indices):
        """Fetch 'features' rows as float32, contrast-normalized if set."""
        data = dataset.get_data(
            request=indices)[dataset.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm
def setup_data(p, test_set=False):
    """Load the dataset named by ``p.dataset`` and split it into subsets.

    Progress information is printed to stdout along the way.

    Args:
        p: Parameter object supporting attribute access and ``.get()``.
            Reads ``dataset`` and ``valid_set_size``, the optional
            ``unlabeled_samples`` and ``dseed``, and (for 'cifar10' only)
            ``whiten_zca`` and ``contrast_norm``.
        test_set: When true, the "test" split is loaded as well.

    Returns:
        Tuple ``(in_dim, d, whiten, cnorm)`` where ``in_dim`` is the shape
        of one 'features' example, ``d`` is an ``AttributeDict`` holding
        ``train``/``valid`` (and optionally ``test``) datasets together
        with their ``*_ind`` index arrays, ``whiten`` is a fitted ``ZCA``
        or ``None``, and ``cnorm`` is a ``ContrastNorm`` or ``None``.
        Both ``whiten`` and ``cnorm`` are ``None`` for every dataset other
        than 'cifar10'.

    Raises:
        KeyError: If ``p.dataset`` is not a known dataset name.
        AssertionError: If MNIST features fall outside [0, 1], or if
            whitening is requested for multi-dimensional inputs and
            ``p.whiten_zca`` differs from the number of input dimensions.
    """
    dataset_class, training_set_size = {
        'cifar10': (CIFAR10, 40000),
        'mnist': (MNIST, 50000),
        'conll': (EMBOOT_CONLL, 13900),
        'ontonotes': (EMBOOT_ONTO, 67000)
    }[p.dataset]
    print("p.dataset = ", p.dataset)

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples
    print("Training set size : ", training_set_size)

    train_set = dataset_class(["train"])
    print("train_set.num_examples : ", train_set.num_examples)

    features_pos = train_set.sources.index('features')

    # Make sure the MNIST data is in right format
    if p.dataset == 'mnist':
        features = train_set.data_sources[features_pos]
        assert numpy.all(features <= 1.0) and numpy.all(features >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permute them. A falsy dseed (missing, None or 0)
    # leaves the indices in their natural order.
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices.
    # numpy.setdiff1d returns sorted values, so the validation indices are
    # the smallest indices not used for training.
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(["test"])
        d.test_ind = numpy.arange(d.test.num_examples)
        print("d.test.num_examples = ", d.test.num_examples)

    in_dim = train_set.data_sources[features_pos].shape[1:]

    # Default to no preprocessing. Without these defaults any dataset that
    # is neither a text corpus nor 'cifar10' (i.e. 'mnist') would reach the
    # return statement with both names unbound.
    whiten = None
    cnorm = None

    # Setup optional whitening, only used for Cifar-10
    if p.dataset == 'cifar10':
        if len(in_dim) > 1 and p.whiten_zca > 0:
            # numpy.prod replaces numpy.product, which NumPy 2.0 removed.
            n_dims = numpy.prod(in_dim)
            assert n_dims == p.whiten_zca, \
                'Need %d whitening dimensions, not %d' % (n_dims,
                                                          p.whiten_zca)

        if p.contrast_norm != 0:
            cnorm = ContrastNorm(p.contrast_norm)

        def get_data(dataset, indices):
            """Fetch 'features' rows as float32, contrast-normalized."""
            data = dataset.get_data(
                request=list(indices))[dataset.sources.index('features')]
            # Fuel provides Cifar in uint8, convert to float32
            data = numpy.require(data, dtype=numpy.float32)
            return data if cnorm is None else cnorm.apply(data)

        if p.whiten_zca > 0:
            logger.info('Whitening using %d ZCA components' % p.whiten_zca)
            whiten = ZCA()
            whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))

    return in_dim, d, whiten, cnorm
def setup_data(p, test_set=False):
    """Load the dataset named by ``p.dataset`` and split it into subsets.

    'cifar10' and 'mnist' use their dedicated dataset classes. Any other
    name is treated as an HDF5 dataset located at ``<name>/<name>.hdf5``
    relative to Fuel's data path and is loaded fully into memory.

    Args:
        p: Parameter object supporting attribute access and ``.get()``.
            Reads ``dataset``, ``valid_set_size``, ``whiten_zca`` and
            ``contrast_norm``, plus the optional ``unlabeled_samples``
            and ``dseed``.
        test_set: When true, the "test" split is loaded as well.

    Returns:
        Tuple ``(in_dim, d, whiten, cnorm)`` where ``in_dim`` is the shape
        of one 'features' example, ``d`` is an ``AttributeDict`` holding
        ``train``/``valid`` (and optionally ``test``) datasets together
        with their ``*_ind`` index arrays, ``whiten`` is a fitted ``ZCA``
        or ``None``, and ``cnorm`` is a ``ContrastNorm`` or ``None``.

    Raises:
        AssertionError: If MNIST features fall outside [0, 1], or if
            whitening is requested for multi-dimensional inputs and
            ``p.whiten_zca`` differs from the number of input dimensions.
    """
    if p.dataset in ['cifar10', 'mnist']:
        dataset_class, training_set_size = {
            'cifar10': (CIFAR10, 40000),
            'mnist': (MNIST, 50000),
        }[p.dataset]
    else:
        # Imported lazily so the built-in datasets do not need these names.
        from fuel.datasets import H5PYDataset
        from fuel.utils import find_in_data_path

        fn = os.path.join(p.dataset, p.dataset + '.hdf5')

        def dataset_class(which_sets):
            """Open the requested splits of the HDF5 file in memory."""
            return H5PYDataset(file_or_path=find_in_data_path(fn),
                               which_sets=which_sets,
                               load_in_memory=True)

        # No built-in default; resolved below once the data is loaded.
        training_set_size = None

    train_set = dataset_class(["train"])

    # Allow overriding the default from command line; a negative value
    # means "keep the default".
    if p.get('unlabeled_samples') is not None and p.unlabeled_samples >= 0:
        training_set_size = p.unlabeled_samples
    elif training_set_size is None:
        training_set_size = train_set.num_examples

    features_pos = train_set.sources.index('features')

    # Make sure the MNIST data is in right format
    if p.dataset == 'mnist':
        features = train_set.data_sources[features_pos]
        assert numpy.all(features <= 1.0) and numpy.all(features >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permute them. A falsy dseed (missing, None or 0)
    # leaves the indices in their natural order.
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices.
    # numpy.setdiff1d returns sorted values, so the validation indices are
    # the smallest indices not used for training.
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(["test"])
        d.test_ind = numpy.arange(d.test.num_examples)

    # Setup optional whitening, only used for Cifar-10
    in_dim = train_set.data_sources[features_pos].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        # numpy.prod replaces numpy.product, which NumPy 2.0 removed.
        n_dims = numpy.prod(in_dim)
        assert n_dims == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (n_dims, p.whiten_zca)

    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(dataset, indices):
        """Fetch 'features' rows as float32, contrast-normalized if set."""
        data = dataset.get_data(
            request=indices)[dataset.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm