Esempio n. 1
0
    def test_classification_train_valid_test(self):

        dataset = larochelle_etal_2007.Rectangles() # smallest one with splits
        assert not hasattr(dataset, 'classification_train_valid_test_task')

        train, valid, test = tasks.classification_train_valid_test(dataset)
        tasks.assert_classification(*train)
        tasks.assert_classification(*valid)
        tasks.assert_classification(*test)

        assert len(train[0]) == dataset.descr['n_train']
        assert len(valid[0]) == dataset.descr['n_valid']
        assert len(test[0]) == dataset.descr['n_test']

        tasks.assert_classification_train_valid_test(train, valid, test)
Esempio n. 2
0
    def test_classification_train_valid_test(self):

        dataset = larochelle_etal_2007.Rectangles()  # smallest one with splits
        assert not hasattr(dataset, 'classification_train_valid_test_task')

        train, valid, test = tasks.classification_train_valid_test(dataset)
        tasks.assert_classification(*train)
        tasks.assert_classification(*valid)
        tasks.assert_classification(*test)

        assert len(train[0]) == dataset.descr['n_train']
        assert len(valid[0]) == dataset.descr['n_valid']
        assert len(test[0]) == dataset.descr['n_test']

        tasks.assert_classification_train_valid_test(train, valid, test)
Esempio n. 3
0
def preprocess_data(config, ctrl):
    dataset = json_call(config['dataset_name'])
    train, valid, test = classification_train_valid_test(dataset)
    X_train, y_train = numpy.asarray(train[0]), numpy.asarray(train[1])
    X_valid, y_valid = numpy.asarray(valid[0]), numpy.asarray(valid[1])
    X_test, y_test = numpy.asarray(test[0]), numpy.asarray(test[1])

    if config['preprocessing']['kind'] == 'pca':
        # compute pca of input (TODO: retrieve only pca_whitened input)
        raise NotImplementedError('rewrite since cut and paste')
        (eigvals,eigvecs), centered_trainset = pylearn_pca.pca_from_examples(
                X=dataset['inputs'][:dataset['n_train']],
                max_energy_fraction=config['pca_energy'])
        eigmean = dataset['inputs'][0] - centered_trainset[0]

        whitened_inputs = pylearn_pca.pca_whiten((eigvals,eigvecs),
                dataset['inputs']-eigmean)
        ctrl.info('PCA kept %i of %i components'%(whitened_inputs.shape[1],
            dataset['n_inputs']))
    elif config['preprocessing']['kind'] == 'zca':
        (eigvals,eigvecs), centered_trainset = pylearn_pca.pca_from_examples(
                X=X_train,
                max_energy_fraction=config['preprocessing']['energy'])
        eigmean = X_train[0] - centered_trainset[0]

        def whiten(X):
            X = pylearn_pca.pca_whiten((eigvals,eigvecs),
                    X - eigmean)
            X = pylearn_pca.pca_whiten_inverse((eigvals, eigvecs),
                    X) + eigmean
            X = X.astype('float32')
            X_min = X.min()
            X_max = X.max()
            ctrl.info('ZCA min:%f max:%f' % (X_min, X_max))
            if X_min < 0 or X_max > 1.0:
                ctrl.info('ZCA clamping return value to (0, 1) interval')
                X = numpy.clip(X, 0, 1, out=X)
            return X

        X_train, X_valid, X_test = [whiten(X)
                for X in [X_train, X_valid, X_test]]

    elif config['preprocessing']['kind'] == 'normalize':
        raise NotImplementedError('rewrite since cut and paste')
        n_train=dataset['n_train']
        whitened_inputs = dataset['inputs']
        whitened_inputs = whitened_inputs - whitened_inputs[:n_train].mean(axis=0)
        whitened_inputs /= whitened_inputs[:n_train].std(axis=0)+1e-7
    elif config['preprocessing']['kind'] == 'raw':
        pass
    else:
        raise ValueError(
                'unrecognized preprocessing',
                config['preprocessing']['kind'])

    for Xy in 'X', 'y':
        for suffix in 'train', 'valid', 'test':
            varname = '%s_%s'%(Xy, suffix)
            var = locals()[varname]
            ctrl.info('%s shape=%s max=%f min=%f' % (
                varname,
                var.shape,
                var.max(),
                var.min()))

    s_X_train = theano.shared(X_train)
    s_y_train = theano.shared(y_train)
    s_X_valid = theano.shared(X_valid)
    s_y_valid = theano.shared(y_valid)
    s_X_test = theano.shared(X_test)
    s_y_test = theano.shared(y_test)

    return (dataset,
            (s_X_train, s_y_train),
            (s_X_valid, s_y_valid),
            (s_X_test, s_y_test))
def preprocess_data(config, ctrl):
    dataset = json_call(config['dataset_name'])
    train, valid, test = classification_train_valid_test(dataset)
    X_train, y_train = numpy.asarray(train[0]), numpy.asarray(train[1])
    X_valid, y_valid = numpy.asarray(valid[0]), numpy.asarray(valid[1])
    X_test, y_test = numpy.asarray(test[0]), numpy.asarray(test[1])

    if config['preprocessing']['kind'] == 'pca':
        # compute pca of input (TODO: retrieve only pca_whitened input)
        raise NotImplementedError('rewrite since cut and paste')
        (eigvals, eigvecs), centered_trainset = pylearn_pca.pca_from_examples(
            X=dataset['inputs'][:dataset['n_train']],
            max_energy_fraction=config['pca_energy'])
        eigmean = dataset['inputs'][0] - centered_trainset[0]

        whitened_inputs = pylearn_pca.pca_whiten((eigvals, eigvecs),
                                                 dataset['inputs'] - eigmean)
        ctrl.info('PCA kept %i of %i components' %
                  (whitened_inputs.shape[1], dataset['n_inputs']))
    elif config['preprocessing']['kind'] == 'zca':
        (eigvals, eigvecs), centered_trainset = pylearn_pca.pca_from_examples(
            X=X_train, max_energy_fraction=config['preprocessing']['energy'])
        eigmean = X_train[0] - centered_trainset[0]

        def whiten(X):
            X = pylearn_pca.pca_whiten((eigvals, eigvecs), X - eigmean)
            X = pylearn_pca.pca_whiten_inverse((eigvals, eigvecs), X) + eigmean
            X = X.astype('float32')
            X_min = X.min()
            X_max = X.max()
            ctrl.info('ZCA min:%f max:%f' % (X_min, X_max))
            if X_min < 0 or X_max > 1.0:
                ctrl.info('ZCA clamping return value to (0, 1) interval')
                X = numpy.clip(X, 0, 1, out=X)
            return X

        X_train, X_valid, X_test = [
            whiten(X) for X in [X_train, X_valid, X_test]
        ]

    elif config['preprocessing']['kind'] == 'normalize':
        raise NotImplementedError('rewrite since cut and paste')
        n_train = dataset['n_train']
        whitened_inputs = dataset['inputs']
        whitened_inputs = whitened_inputs - whitened_inputs[:n_train].mean(
            axis=0)
        whitened_inputs /= whitened_inputs[:n_train].std(axis=0) + 1e-7
    elif config['preprocessing']['kind'] == 'raw':
        pass
    else:
        raise ValueError('unrecognized preprocessing',
                         config['preprocessing']['kind'])

    for Xy in 'X', 'y':
        for suffix in 'train', 'valid', 'test':
            varname = '%s_%s' % (Xy, suffix)
            var = locals()[varname]
            ctrl.info('%s shape=%s max=%f min=%f' %
                      (varname, var.shape, var.max(), var.min()))

    s_X_train = theano.shared(X_train)
    s_y_train = theano.shared(y_train)
    s_X_valid = theano.shared(X_valid)
    s_y_valid = theano.shared(y_valid)
    s_X_test = theano.shared(X_test)
    s_y_test = theano.shared(y_test)

    return (dataset, (s_X_train, s_y_train), (s_X_valid, s_y_valid),
            (s_X_test, s_y_test))