def pca_layer(X, energy, eps):
    """
    Return an affine layer whose output when filtering X will be X's
    whitened PCA representation.

    energy - retain at least this much energy with the principal components
    eps - add this to the eigenvalues when computing PCA responses to
          prevent division-by-zero and suppress weak components in the PCA
          representation.
    """
    import pylearn_pca
    (eigvals, eigvecs), centered_trainset = pylearn_pca.pca_from_examples(
        X=X, max_energy_fraction=energy)
    centering_offset = centered_trainset[0] - X[0]
    W = eigvecs / np.sqrt(eigvals + eps)
    print('PCA kept %i of %i components' % (W.shape[1], X.shape[1]))
    return AffineLayerPre(W.astype(X.dtype),
                          centering_offset.astype(X.dtype))
def zca_layer(X, energy, eps):
    """
    Return a pair of layers whose output when filtering X will be X's ZCA.

    energy - retain at least this much energy with the principal components
    eps - add this to the eigenvalues when computing PCA responses to
          prevent division-by-zero and suppress weak components in the PCA
          representation.
    """
    import pylearn_pca
    (eigvals, eigvecs), centered_trainset = pylearn_pca.pca_from_examples(
        X=X, max_energy_fraction=energy)
    centering_offset = centered_trainset[0] - X[0]
    W = eigvecs / np.sqrt(eigvals + eps)
    print('ZCA kept %i of %i components' % (W.shape[1], X.shape[1]))
    # l0 centers the input and projects onto the whitened principal
    # components; l1 maps that representation back via eigvecs.T.
    l0 = AffineLayerPre(W.astype(X.dtype),
                        centering_offset.astype(X.dtype))
    l1 = ClipLayer(eigvecs.T.copy().astype(X.dtype),
                   np.asarray(0, dtype=X.dtype))
    return [l0, l1]
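# Minimal usage sketch for the two constructors above, assuming pylearn_pca
# is importable and AffineLayerPre / ClipLayer are the layer classes defined
# elsewhere in this module; the random data is illustrative only.
def _whitening_layer_example():
    rng = np.random.RandomState(123)
    X = rng.rand(500, 64).astype('float32')
    # A single affine layer computing the whitened PCA projection of X.
    pca = pca_layer(X, energy=0.99, eps=1e-8)
    # A [project, back-project] pair whose composition yields X's ZCA.
    zca_pair = zca_layer(X, energy=0.99, eps=1e-8)
    return pca, zca_pair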
def preprocess_data(config, ctrl):
    """
    Load the dataset named in config, apply the configured preprocessing
    ('pca', 'zca', 'normalize', or 'raw') to the inputs, and return the
    dataset together with Theano shared variables for each split.
    """
    dataset = json_call(config['dataset_name'])
    train, valid, test = classification_train_valid_test(dataset)
    X_train, y_train = numpy.asarray(train[0]), numpy.asarray(train[1])
    X_valid, y_valid = numpy.asarray(valid[0]), numpy.asarray(valid[1])
    X_test, y_test = numpy.asarray(test[0]), numpy.asarray(test[1])

    if config['preprocessing']['kind'] == 'pca':
        # compute pca of input (TODO: retrieve only pca_whitened input)
        raise NotImplementedError('rewrite since cut and paste')
        (eigvals, eigvecs), centered_trainset = pylearn_pca.pca_from_examples(
            X=dataset['inputs'][:dataset['n_train']],
            max_energy_fraction=config['pca_energy'])
        eigmean = dataset['inputs'][0] - centered_trainset[0]
        whitened_inputs = pylearn_pca.pca_whiten((eigvals, eigvecs),
                                                 dataset['inputs'] - eigmean)
        ctrl.info('PCA kept %i of %i components' % (
            whitened_inputs.shape[1], dataset['n_inputs']))
    elif config['preprocessing']['kind'] == 'zca':
        (eigvals, eigvecs), centered_trainset = pylearn_pca.pca_from_examples(
            X=X_train,
            max_energy_fraction=config['preprocessing']['energy'])
        eigmean = X_train[0] - centered_trainset[0]

        def whiten(X):
            X = pylearn_pca.pca_whiten((eigvals, eigvecs), X - eigmean)
            X = pylearn_pca.pca_whiten_inverse((eigvals, eigvecs), X) + eigmean
            X = X.astype('float32')
            X_min = X.min()
            X_max = X.max()
            ctrl.info('ZCA min:%f max:%f' % (X_min, X_max))
            if X_min < 0 or X_max > 1.0:
                ctrl.info('ZCA clamping return value to (0, 1) interval')
                X = numpy.clip(X, 0, 1, out=X)
            return X

        X_train, X_valid, X_test = [
            whiten(X) for X in [X_train, X_valid, X_test]]
    elif config['preprocessing']['kind'] == 'normalize':
        raise NotImplementedError('rewrite since cut and paste')
        n_train = dataset['n_train']
        whitened_inputs = dataset['inputs']
        whitened_inputs = whitened_inputs - whitened_inputs[:n_train].mean(
            axis=0)
        whitened_inputs /= whitened_inputs[:n_train].std(axis=0) + 1e-7
    elif config['preprocessing']['kind'] == 'raw':
        pass
    else:
        raise ValueError('unrecognized preprocessing',
                         config['preprocessing']['kind'])

    for Xy in 'X', 'y':
        for suffix in 'train', 'valid', 'test':
            varname = '%s_%s' % (Xy, suffix)
            var = locals()[varname]
            ctrl.info('%s shape=%s max=%f min=%f' % (
                varname, var.shape, var.max(), var.min()))

    s_X_train = theano.shared(X_train)
    s_y_train = theano.shared(y_train)
    s_X_valid = theano.shared(X_valid)
    s_y_valid = theano.shared(y_valid)
    s_X_test = theano.shared(X_test)
    s_y_test = theano.shared(y_test)

    return (dataset,
            (s_X_train, s_y_train),
            (s_X_valid, s_y_valid),
            (s_X_test, s_y_test))
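# Sketch of the inputs preprocess_data expects, based on the keys read above;
# the dataset name is a hypothetical placeholder that must be resolvable by
# json_call, and ctrl only needs an .info() method for this function.
class _PrintCtrl(object):
    def info(self, msg):
        print(msg)


_example_config = {
    'dataset_name': 'datasets.mnist.MNIST',  # hypothetical json_call target
    'preprocessing': {
        'kind': 'zca',     # one of 'pca', 'zca', 'normalize', 'raw'
        'energy': 0.99,    # energy fraction used by the 'zca' branch
    },
}
# dataset, train_s, valid_s, test_s = preprocess_data(_example_config,
#                                                     _PrintCtrl())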