def test_pca_dim():
    """Check that PCA with n_comp='mle' infers the right dimensionality"""
    n, p = 100, 5
    X = randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    pca = PCA(n_comp='mle')
    pca.fit(X)
    assert_true(pca.n_comp == 1)
def test_infer_dim_2():
    """Check that _infer_dimension_ detects more than one component"""
    n, p = 1000, 5
    X = randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_comp=p)
    pca.fit(X)
    spect = pca.explained_variance_
    assert_true(_infer_dimension_(spect, n, p) > 1)
def test_pca_check_projection():
    """Test that the projection of data is correct"""
    n, p = 100, 3
    X = randn(n, p) * .1
    X[:10] += np.array([3, 4, 5])
    pca = PCA(n_comp=2)
    pca.fit(X)

    Xt = 0.1 * randn(1, p) + np.array([3, 4, 5])
    Yt = pca.transform(Xt)
    Yt /= np.sqrt((Yt ** 2).sum())

    np.testing.assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
def test_pca():
    """PCA"""
    pca = PCA(n_comp=2)
    X_r = pca.fit(X).transform(X)
    np.testing.assert_equal(X_r.shape[1], 2)

    pca = PCA()
    pca.fit(X)
    np.testing.assert_almost_equal(pca.explained_variance_ratio_.sum(),
                                   1.0, 3)
def test_infer_dim_1():
    """Check that _assess_dimension_ is (nearly) maximal at the true dimension"""
    n, p = 1000, 5
    X = randn(n, p) * 0.1 + randn(n, 1) * np.array([3, 4, 5, 1, 2]) \
        + np.array([1, 0, 7, 4, 6])
    pca = PCA(n_comp=p)
    pca.fit(X)
    spect = pca.explained_variance_
    ll = []
    for k in range(p):
        ll.append(_assess_dimension_(spect, k, n, p))
    ll = np.array(ll)
    assert_true(ll[1] > ll.max() - .01 * n)
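# Illustration (not from the original test module): a numpy-only sketch of the
# low-rank-plus-noise data the dimension tests above build.  The sample
# spectrum has one dominant eigenvalue, which is what the MLE-based dimension
# estimates key on.
import numpy as np

rng = np.random.RandomState(0)
n, p = 1000, 5
# rank-one signal plus isotropic noise, as in test_infer_dim_1 above
X = rng.randn(n, p) * 0.1 + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2]) \
    + np.array([1, 0, 7, 4, 6])
# eigenvalues of the sample covariance -- essentially what explained_variance_
# holds, up to the normalization convention
spect = np.linalg.svd(X - X.mean(axis=0), compute_uv=False) ** 2 / n
print spect  # one large eigenvalue followed by four small, comparable ones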
""" print __doc__ import pylab as pl from scikits.learn import datasets from scikits.learn.pca import PCA from scikits.learn.lda import LDA iris = datasets.load_iris() X = iris.data y = iris.target target_names = iris.target_names pca = PCA(n_components=2) X_r = pca.fit(X).transform(X) lda = LDA(n_components=2) X_r2 = lda.fit(X, y).transform(X) # Percentage of variance explained for each components print 'explained variance ratio (first two components):', \ pca.explained_variance_ratio_ pl.figure() pl.subplot(2, 1, 1) for c, i, target_name in zip("rgb", [0, 1, 2], target_names): pl.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name) pl.legend() pl.title('PCA of IRIS dataset')
print "n_digits: %d" % n_digits print "n_features: %d" % n_features print "n_samples: %d" % n_samples print print "Raw k-means with k-means++ init..." t0 = time() km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data) print "done in %0.3fs" % (time() - t0) print "inertia: %f" % km.inertia_ print print "Raw k-means with random centroid init..." t0 = time() km = KMeans(init='random', k=n_digits, n_init=10).fit(data) print "done in %0.3fs" % (time() - t0) print "inertia: %f" % km.inertia_ print print "Raw k-means with PCA-based centroid init..." # in this case the seeding of the centers is deterministic, hence we run the # kmeans algorithm only once with n_init=1 t0 = time() pca = PCA(n_components=n_digits).fit(data) km = KMeans(init=pca.components_.T, k=n_digits, n_init=1).fit(data) print "done in %0.3fs" % (time() - t0) print "inertia: %f" % km.inertia_ print
print "Dataset size:" print "n_samples: %d" % n_samples print "n_features: %d" % n_features split = n_samples * 3 / 4 X_train, X_test = X[:split], X[split:] y_train, y_test = y[:split], y[split:] ################################################################################ # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled # dataset): unsupervised feature extraction / dimensionality reduction n_components = 150 print "Extracting the top %d eigenfaces" % n_components pca = PCA(n_comp=n_components, whiten=True, do_fast_svd=True).fit(X_train) eigenfaces = pca.components_.T.reshape((n_components, 64, 64)) # project the input data on the eigenfaces orthonormal basis X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) ################################################################################ # Train a SVM classification model print "Fitting the classifier to the training set" param_grid = { 'C': [1, 5, 10, 50, 100], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
from scikits.learn.pca import PCA
from scikits.learn.fastica import FastICA

if __name__ == '__main__':
    ###########################################################################
    # Generate sample data
    S = np.random.standard_t(1.5, size=(2, 10000))
    S[0] *= 2.

    # Mix data
    A = [[1, 1], [0, 2]]      # Mixing matrix
    X = np.dot(A, S)          # Generate observations

    pca = PCA()
    S_pca_ = pca.fit(X.T).transform(X.T).T

    ica = FastICA()
    S_ica_ = ica.fit(X).transform(X)  # Estimate the sources

    S_ica_ /= S_ica_.std(axis=1)[:, np.newaxis]

    ###########################################################################
    # Plot results

    def plot_samples(S, axis_list=None):
        pl.scatter(S[0], S[1], s=2, marker='o', linewidths=0, zorder=10)
        if axis_list is not None:
            colors = [(0, 0.6, 0), (0.6, 0, 0)]
            for color, axis in zip(colors, axis_list):
feature space) that account for the most variance in the data.

Here we plot the different samples on the 2 first principal components.
"""
print __doc__

import pylab as pl

from scikits.learn import datasets
from scikits.learn.pca import PCA

iris = datasets.load_iris()

X = iris.data
y = iris.target
target_names = iris.target_names

pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)

# Percentage of variance explained for each components
print pca.explained_variance_ratio_

pl.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    pl.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
pl.legend()
pl.title('PCA of IRIS dataset')
pl.show()
def main_train(dataset, save_dir, n_hidden, tied_weights, act_enc, act_dec,
               learning_rate, batch_size, epochs, cost_type, noise_type,
               corruption_level, reg, normalize_on_the_fly=False,
               do_pca=False, num_components=numpy.inf, min_variance=.0,
               do_create_submission=False, submission_dir=None):
    ''' main function used for training '''
    datasets = load_data(dataset, not normalize_on_the_fly,
                         normalize_on_the_fly)

    train_set_x = datasets[0]
    valid_set_x = datasets[1]
    test_set_x = datasets[2]

    ###############
    # First block #
    ###############
    from scikits.learn.pca import PCA
    pca = PCA(n_components=75, whiten=True)
    print '... train PCA'
    pca.fit(train_set_x.value)
    print '... explained variance'
    print pca.explained_variance_
    print '... transform valid/test'
    train_r = pca.transform(train_set_x.value)
    valid_r = pca.transform(valid_set_x.value)
    test_r = pca.transform(test_set_x.value)

    train_set_x.value = train_r
    valid_set_x.value = valid_r
    test_set_x.value = test_r
    del PCA, train_r, valid_r, test_r

    ################
    # Second block #
    ################
    da1 = dA()
    save_dir1 = '/data/lisa/exp/mesnilgr/ift6266h11/GREGAVI2_/55'
    da1.load(save_dir1)

    index = T.lscalar()    # index to a [mini]batch
    x = theano.tensor.matrix('input')

    get_rep_train = theano.function([index], da1.get_hidden_values(x),
                                    updates={}, givens={x: train_set_x},
                                    name='get_rep_train')
    get_rep_valid = theano.function([index], da1.get_hidden_values(x),
                                    updates={}, givens={x: valid_set_x},
                                    name='get_rep_valid')
    get_rep_test = theano.function([index], da1.get_hidden_values(x),
                                   updates={}, givens={x: test_set_x},
                                   name='get_rep_test')

    # train, valid and test representations
    train_r = get_rep_train(0)
    valid_r = get_rep_valid(0)
    test_r = get_rep_test(0)

    train_set_x.value = train_r
    valid_set_x.value = valid_r
    test_set_x.value = test_r
    del train_r, valid_r, test_r

    d = get_constant(train_set_x.shape[1])
    da = dA(n_visible=d, n_hidden=n_hidden, tied_weights=tied_weights,
            act_enc=act_enc, act_dec=act_dec)

    time_spent, loss = da.fit(train_set_x, learning_rate, batch_size, epochs,
                              cost_type, noise_type, corruption_level, reg)

    if save_dir:
        da.save(save_dir)

    denoising_error = da.get_denoising_error(valid_set_x, cost_type,
                                             noise_type, corruption_level)
    print 'Training complete in %f (min) with final denoising error %f' \
        % (time_spent, denoising_error)

    if do_pca:
        print "... computing PCA"
        x = theano.tensor.matrix('input')
        get_rep_train = theano.function([], da.get_hidden_values(x),
                                        updates={}, givens={x: train_set_x},
                                        name='get_rep_train')
        pca_trainer = pca.PCATrainer(get_rep_train(),
                                     num_components=num_components,
                                     min_variance=min_variance)
        pca_trainer.updates()
        pca_trainer.save(save_dir)

    if do_create_submission:
        print "... creating submission"
        if submission_dir is None:
            submission_dir = save_dir
        create_submission(dataset, save_dir, submission_dir,
                          normalize_on_the_fly, do_pca)

    return denoising_error, time_spent, loss
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

from scikits.learn.pca import PCA

from src.data_interface import d, L_clean, L
from src.utils import get_path, bool_to_color

path = get_path(__file__) + '/..'
L = list(L)

# Remove trial_id, obsnum and is alert
# I change notation here from D to X
X = d.view()[:, 3:]

pca = PCA(n_components=30)
pca.fit(X)

plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o')
ax = plt.gca()
plt.title('Cumulative percentage of total variation explained by principal components')
ax.set_xlabel('Principal component')
ax.set_ylabel('% of total variation')
plt.savefig('{0}/plots/pca-variation-explained.pdf'.format(path),
            papertype='a4', format='pdf')
plt.cla()

W = pca.components_[:, 0:3]
X_p = np.dot(X, W)

rnd_rows = np.random.random_integers(0, X.shape[0], 120)
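# Caveat (not in the original script): pca.transform also subtracts the fitted
# mean before projecting, so np.dot(X, W) above differs from
# pca.transform(X)[:, :3] by a constant offset.  Assuming the fitted
# per-feature mean is exposed as pca.mean_, a centered variant would be:
X_p_centered = np.dot(X - pca.mean_, pca.components_[:, 0:3])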
import numpy as np
import pylab as pl

from scikits.learn.pca import PCA
from scikits.learn.fastica import FastICA

###############################################################################
# Generate sample data
S = np.random.standard_t(1.5, size=(2, 10000))
S[0] *= 2.

# Mix data
A = [[1, 1], [0, 2]]      # Mixing matrix
X = np.dot(A, S)          # Generate observations

pca = PCA()
S_pca_ = pca.fit(X.T).transform(X.T).T

ica = FastICA()
S_ica_ = ica.fit(X).transform(X)  # Estimate the sources

S_ica_ /= S_ica_.std(axis=1)[:, np.newaxis]

###############################################################################
# Plot results

def plot_samples(S, axis_list=None):
    pl.scatter(S[0], S[1], s=2, marker='o', linewidths=0, zorder=10)
    if axis_list is not None:
        colors = [(0, 0.6, 0), (0.6, 0, 0)]