Beispiel #1
0
def setup_data(p, test_set=False):
    """Load the dataset selected by ``p.dataset`` and split it into index sets.

    Parameters
    ----------
    p
        Parameter container supporting ``p.get(name)`` and attribute access.
        Fields read here: ``dataset`` (one of 'cifar10', 'mnist', 'reddit'),
        ``unlabeled_samples`` (optional override of the training set size),
        ``dseed`` (optional shuffle seed), ``valid_set_size``,
        ``whiten_zca`` and ``contrast_norm``.
    test_set
        When true, the "test" split is loaded as well and exposed as
        ``d.test`` / ``d.test_ind``.

    Returns
    -------
    tuple
        ``(in_dim, d, whiten, cnorm)`` where ``in_dim`` is the per-example
        shape of the 'features' source, ``d`` holds the datasets and their
        index arrays, ``whiten`` is a fitted ``ZCA`` or ``None`` and
        ``cnorm`` is a ``ContrastNorm`` or ``None``.

    Raises
    ------
    KeyError
        If ``p.dataset`` is not one of the known dataset names.
    AssertionError
        If whitening is requested for multi-dimensional input and
        ``p.whiten_zca`` does not equal the number of input dimensions.
    """
    dataset_class, training_set_size = {
        'cifar10': (CIFAR10, 40000),
        'mnist': (MNIST, 50000),
        'reddit': (SubredditTopPhotosFeatures22, 20000)
    }[p.dataset]

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class(("train", ))

    # Take all indices and permute them. The shuffle only happens for a
    # truthy seed, so a missing seed (or 0) keeps the natural order.
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices. Training and
    # validation share the same underlying dataset object; only the index
    # arrays differ.
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(("test", ))
        d.test_ind = numpy.arange(d.test.num_examples)

    # Setup optional whitening, only used for Cifar-10
    in_dim = train_set.data_sources[train_set.sources.index(
        'features')].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        # numpy.prod instead of numpy.product: the latter alias was removed
        # in NumPy 2.0.
        n_inputs = numpy.prod(in_dim)
        assert n_inputs == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (n_inputs,
                                                      p.whiten_zca)
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        # Fetch the 'features' source for the requested indices.
        data = d.get_data(request=i)[d.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        # Fit on the (optionally contrast-normalized) training examples only.
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm
def setup_data(p, test_set=False):
    """Load the dataset selected by ``p.dataset`` and split it into index sets.

    Parameters
    ----------
    p
        Parameter container supporting ``p.get(name)`` and attribute access.
        Fields read here: ``dataset`` (one of 'cifar10', 'mnist', 'conll',
        'ontonotes'), ``unlabeled_samples`` (optional override of the
        training set size), ``dseed`` (optional shuffle seed),
        ``valid_set_size`` and, for 'cifar10' only, ``whiten_zca`` and
        ``contrast_norm``.
    test_set
        When true, the "test" split is loaded as well and exposed as
        ``d.test`` / ``d.test_ind``.

    Returns
    -------
    tuple
        ``(in_dim, d, whiten, cnorm)`` where ``in_dim`` is the per-example
        shape of the 'features' source and ``d`` holds the datasets and
        their index arrays. ``whiten`` and ``cnorm`` are ``None`` for every
        dataset except 'cifar10', where they are a fitted ``ZCA`` /
        a ``ContrastNorm`` when the corresponding option is enabled.

    Raises
    ------
    KeyError
        If ``p.dataset`` is not one of the known dataset names.
    AssertionError
        If MNIST features are outside [0, 1], or if Cifar-10 whitening is
        requested with ``p.whiten_zca`` different from the number of input
        dimensions.
    """
    dataset_class, training_set_size = {
        'cifar10': (CIFAR10, 40000),
        'mnist': (MNIST, 50000),
        'conll': (EMBOOT_CONLL, 13900),
        'ontonotes': (EMBOOT_ONTO, 67000)
    }[p.dataset]

    print("p.dataset = ", p.dataset)

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples
        print("Training set size : ", training_set_size)

    train_set = dataset_class(["train"])
    print("train_set.num_examples : ", train_set.num_examples)

    # Make sure the MNIST data is in right format
    if p.dataset == 'mnist':
        features = train_set.data_sources[
            train_set.sources.index('features')]
        assert numpy.all(features <= 1.0) and numpy.all(features >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permute them. The shuffle only happens for a
    # truthy seed, so a missing seed (or 0) keeps the natural order.
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices. Training and
    # validation share the same underlying dataset object; only the index
    # arrays differ.
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(["test"])
        d.test_ind = numpy.arange(d.test.num_examples)
        print("d.test.num_examples = ", d.test.num_examples)

    in_dim = train_set.data_sources[train_set.sources.index(
        'features')].shape[1:]

    # No preprocessing by default. Previously only the 'conll'/'ontonotes'
    # and 'cifar10' branches assigned these names, so 'mnist' reached the
    # return statement with both unbound and raised UnboundLocalError.
    whiten = None
    cnorm = None

    # Setup optional whitening, only used for Cifar-10
    if p.dataset == 'cifar10':
        if len(in_dim) > 1 and p.whiten_zca > 0:
            # numpy.prod instead of numpy.product: the latter alias was
            # removed in NumPy 2.0.
            n_inputs = numpy.prod(in_dim)
            assert n_inputs == p.whiten_zca, \
                'Need %d whitening dimensions, not %d' % (n_inputs,
                                                          p.whiten_zca)
        if p.contrast_norm != 0:
            cnorm = ContrastNorm(p.contrast_norm)

        def get_data(d, i):
            # Fetch the 'features' source; the index array is passed on as
            # a plain list.
            data = d.get_data(request=list(i))[d.sources.index('features')]
            # Fuel provides Cifar in uint8, convert to float32
            data = numpy.require(data, dtype=numpy.float32)
            return data if cnorm is None else cnorm.apply(data)

        if p.whiten_zca > 0:
            logger.info('Whitening using %d ZCA components' % p.whiten_zca)
            whiten = ZCA()
            # Fit on the (optionally contrast-normalized) training examples
            # only.
            whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))

    return in_dim, d, whiten, cnorm
Beispiel #3
0
def setup_data(p, test_set=False):
    """Load the dataset selected by ``p.dataset`` and split it into index sets.

    'cifar10' and 'mnist' use their dedicated dataset classes with default
    training set sizes. Any other name is treated as an HDF5 dataset located
    at ``<name>/<name>.hdf5`` in the Fuel data path, and by default all of
    its training examples are used.

    Parameters
    ----------
    p
        Parameter container supporting ``p.get(name)`` and attribute access.
        Fields read here: ``dataset``, ``unlabeled_samples`` (optional
        override of the training set size; ignored when negative),
        ``dseed`` (optional shuffle seed), ``valid_set_size``,
        ``whiten_zca`` and ``contrast_norm``.
    test_set
        When true, the "test" split is loaded as well and exposed as
        ``d.test`` / ``d.test_ind``.

    Returns
    -------
    tuple
        ``(in_dim, d, whiten, cnorm)`` where ``in_dim`` is the per-example
        shape of the 'features' source, ``d`` holds the datasets and their
        index arrays, ``whiten`` is a fitted ``ZCA`` or ``None`` and
        ``cnorm`` is a ``ContrastNorm`` or ``None``.

    Raises
    ------
    AssertionError
        If MNIST features are outside [0, 1], or if whitening is requested
        for multi-dimensional input and ``p.whiten_zca`` does not equal the
        number of input dimensions.
    """
    if p.dataset in ['cifar10', 'mnist']:
        dataset_class, training_set_size = {
            'cifar10': (CIFAR10, 40000),
            'mnist': (MNIST, 50000),
        }[p.dataset]
    else:
        # Imported lazily so the built-in datasets do not need these names.
        from fuel.datasets import H5PYDataset
        from fuel.utils import find_in_data_path
        fn = p.dataset
        fn = os.path.join(fn, fn + '.hdf5')

        def dataset_class(which_sets):
            # Same call shape as the CIFAR10/MNIST classes: takes the list
            # of splits and returns the loaded dataset.
            return H5PYDataset(file_or_path=find_in_data_path(fn),
                               which_sets=which_sets,
                               load_in_memory=True)

        # No built-in default; resolved below once the data is loaded.
        training_set_size = None

    train_set = dataset_class(["train"])

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None and p.unlabeled_samples >= 0:
        training_set_size = p.unlabeled_samples
    elif training_set_size is None:
        training_set_size = train_set.num_examples

    # Make sure the MNIST data is in right format
    if p.dataset == 'mnist':
        features = train_set.data_sources[
            train_set.sources.index('features')]
        assert numpy.all(features <= 1.0) and numpy.all(features >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permute them. The shuffle only happens for a
    # truthy seed, so a missing seed (or 0) keeps the natural order.
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices. Training and
    # validation share the same underlying dataset object; only the index
    # arrays differ.
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(["test"])
        d.test_ind = numpy.arange(d.test.num_examples)

    # Setup optional whitening, only used for Cifar-10
    in_dim = train_set.data_sources[train_set.sources.index(
        'features')].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        # numpy.prod instead of numpy.product: the latter alias was removed
        # in NumPy 2.0.
        n_inputs = numpy.prod(in_dim)
        assert n_inputs == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (n_inputs,
                                                      p.whiten_zca)
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        # Fetch the 'features' source for the requested indices.
        data = d.get_data(request=i)[d.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        # Fit on the (optionally contrast-normalized) training examples only.
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm