Example #1
import numpy as np
from numpy.random import randn
from nose.tools import assert_true
from scikits.learn.pca import PCA


def test_pca_dim():
    """Check that n_comp='mle' recovers the true effective dimensionality."""
    n, p = 100, 5
    X = randn(n, p)*.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    pca = PCA(n_comp='mle')
    pca.fit(X)
    assert_true(pca.n_comp == 1)
Example #2
import numpy as np
from numpy.random import randn
from nose.tools import assert_true
from scikits.learn.pca import PCA, _infer_dimension_


def test_infer_dim_2():
    """Check that _infer_dimension_ finds more than one component."""
    n, p = 1000, 5
    X = randn(n, p)*.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_comp=p)
    pca.fit(X)
    spect = pca.explained_variance_
    assert_true(_infer_dimension_(spect, n, p) > 1)
Example #3
import numpy as np
from numpy.random import randn
from scikits.learn.pca import PCA


def test_pca_check_projection():
    """Test that the projection of data is correct: a point lying along the
    dominant direction should map, up to sign, onto the first component.
    """
    n, p = 100, 3
    X = randn(n, p) * .1
    X[:10] += np.array([3, 4, 5])
    pca = PCA(n_comp=2)
    pca.fit(X)
    Xt = 0.1 * randn(1, p) + np.array([3, 4, 5])
    Yt = pca.transform(Xt)
    Yt /= np.sqrt((Yt**2).sum())
    np.testing.assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
Example #4
def test_pca():
    """Check the shape of the PCA output and that the explained variance
    ratios of a full PCA sum to one. Relies on a module-level X.
    """
    pca = PCA(n_comp=2)
    X_r = pca.fit(X).transform(X)
    np.testing.assert_equal(X_r.shape[1], 2)

    pca = PCA()
    pca.fit(X)
    np.testing.assert_almost_equal(pca.explained_variance_ratio_.sum(),
                                   1.0, 3)
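
test_pca relies on a module-level X that is not shown in this excerpt; a
minimal hedged setup, borrowing the iris features used by the plotting
examples below:

# Hedged setup: the iris features are only an illustrative stand-in for the
# module-level X that test_pca expects.
from scikits.learn import datasets
X = datasets.load_iris().data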
Example #5
import numpy as np
from numpy.random import randn
from nose.tools import assert_true
from scikits.learn.pca import PCA, _assess_dimension_


def test_infer_dim_1():
    """Check that the spectrum's log-likelihood peaks near dimension 1 for
    data with a single dominant direction.
    """
    n, p = 1000, 5
    X = randn(n, p)*0.1 + randn(n, 1)*np.array([3, 4, 5, 1, 2]) + np.array(
        [1, 0, 7, 4, 6])
    pca = PCA(n_comp=p)
    pca.fit(X)
    spect = pca.explained_variance_
    ll = []
    for k in range(p):
        ll.append(_assess_dimension_(spect, k, n, p))
    ll = np.array(ll)
    assert_true(ll[1] > ll.max() - .01 * n)
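
For intuition, the same spectrum can be summarized by the candidate dimension
with the highest log-likelihood; a one-line sketch reusing the names above:

# The rank-one structure injected into X should make dimension 1 the
# (near-)maximal candidate.
print 'most likely dimension:', ll.argmax()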
"""
print __doc__

import pylab as pl

from scikits.learn import datasets
from scikits.learn.pca import PCA
from scikits.learn.lda import LDA

iris = datasets.load_iris()

X = iris.data
y = iris.target
target_names = iris.target_names

pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)

lda = LDA(n_components=2)
X_r2 = lda.fit(X, y).transform(X)

# Percentage of variance explained by each component
print 'explained variance ratio (first two components):', \
    pca.explained_variance_ratio_

pl.figure()
pl.subplot(2, 1, 1)
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    pl.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
pl.legend()
pl.title('PCA of IRIS dataset')
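
The snippet stops after the PCA panel even though X_r2, the LDA projection,
was computed above; a plausible continuation for the second subplot,
mirroring the PCA panel:

pl.subplot(2, 1, 2)
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    pl.scatter(X_r2[y == i, 0], X_r2[y == i, 1], c=c, label=target_name)
pl.legend()
pl.title('LDA of IRIS dataset')

pl.show()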
Example #7
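The snippet assumes n_digits, n_features, n_samples, data, KMeans, PCA and
time are already defined; a minimal hedged setup, where loading the digits
dataset is an assumption rather than part of the original excerpt:

# Hedged setup for the names used below; the digits dataset is only an
# illustrative choice.
import numpy as np
from time import time
from scikits.learn.cluster import KMeans
from scikits.learn.pca import PCA
from scikits.learn.datasets import load_digits

digits = load_digits()
data = digits.data
n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
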
print "n_digits: %d" % n_digits
print "n_features: %d" % n_features
print "n_samples: %d" % n_samples
print

print "Raw k-means with k-means++ init..."
t0 = time()
km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data)
print "done in %0.3fs" % (time() - t0)
print "inertia: %f" % km.inertia_
print

print "Raw k-means with random centroid init..."
t0 = time()
km = KMeans(init='random', k=n_digits, n_init=10).fit(data)
print "done in %0.3fs" % (time() - t0)
print "inertia: %f" % km.inertia_
print

print "Raw k-means with PCA-based centroid init..."
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
t0 = time()
pca = PCA(n_components=n_digits).fit(data)
km = KMeans(init=pca.components_.T, k=n_digits, n_init=1).fit(data)
print "done in %0.3fs" % (time() - t0)
print "inertia: %f" % km.inertia_
print

print "Dataset size:"
print "n_samples: %d" % n_samples
print "n_features: %d" % n_features

split = n_samples * 3 / 4

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

################################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

print "Extracting the top %d eigenfaces" % n_components
pca = PCA(n_comp=n_components, whiten=True, do_fast_svd=True).fit(X_train)

eigenfaces = pca.components_.T.reshape((n_components, 64, 64))

# project the input data on the eigenfaces orthonormal basis
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
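
A quick sanity check on the extracted basis is to display one eigenface; a
minimal sketch, assuming pylab is available as in the other examples here:

import pylab as pl

# eigenfaces has shape (n_components, 64, 64) after the reshape above.
pl.imshow(eigenfaces[0], cmap=pl.cm.gray)
pl.title('First eigenface')
pl.show()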


################################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
param_grid = {
 'C': [1, 5, 10, 50, 100],
 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
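
The grid definition is cut off above; a hedged continuation, assuming the
GridSearchCV and SVC classes that shipped with scikits.learn at the time
(the closing brace and every choice below are illustrative, not from the
original):

}  # closes the truncated param_grid

from scikits.learn.svm import SVC
from scikits.learn.grid_search import GridSearchCV

# Fit the grid search on the PCA-reduced training data and predict on the
# held-out split.
clf = GridSearchCV(SVC(kernel='rbf'), param_grid).fit(X_train_pca, y_train)
y_pred = clf.predict(X_test_pca)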
Example #9
import numpy as np
import pylab as pl

from scikits.learn.pca import PCA
from scikits.learn.fastica import FastICA

if __name__ == '__main__':

    ###############################################################################
    # Generate sample data
    S = np.random.standard_t(1.5, size=(2, 10000))
    S[0] *= 2.

    # Mix data
    A = [[1, 1], [0, 2]]  # Mixing matrix

    X = np.dot(A, S)  # Generate observations

    pca = PCA()
    S_pca_ = pca.fit(X.T).transform(X.T).T

    ica = FastICA()
    S_ica_ = ica.fit(X).transform(X)  # Estimate the sources

    S_ica_ /= S_ica_.std(axis=1)[:, np.newaxis]

    ###############################################################################
    # Plot results

    def plot_samples(S, axis_list=None):
        pl.scatter(S[0], S[1], s=2, marker='o', linewidths=0, zorder=10)
        if axis_list is not None:
            colors = [(0, 0.6, 0), (0.6, 0, 0)]
            for color, axis in zip(colors, axis_list):
Example #10
feature space) that account for the most variance in the data. Here we
plot the different samples on the first two principal components.
"""
print __doc__

import pylab as pl

from scikits.learn import datasets
from scikits.learn.pca import PCA

iris = datasets.load_iris()

X = iris.data
y = iris.target
target_names = iris.target_names

pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)

# Variance explained by each of the selected components
print pca.explained_variance_

pl.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    pl.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
pl.legend()
pl.title('PCA of IRIS dataset')

pl.show()

Example #11
def main_train(dataset,
               save_dir,
               n_hidden,
               tied_weights,
               act_enc,
               act_dec,
               learning_rate,
               batch_size,
               epochs,
               cost_type,
               noise_type,
               corruption_level,
               reg,
               normalize_on_the_fly=False,
               do_pca=False,
               num_components=numpy.inf,
               min_variance=.0,
               do_create_submission=False,
               submission_dir=None):
    ''' main function used for training '''

    datasets = load_data(dataset, not normalize_on_the_fly,
                         normalize_on_the_fly)

    train_set_x = datasets[0]
    valid_set_x = datasets[1]
    test_set_x = datasets[2]

    ###############
    # First block #
    ###############
    from scikits.learn.pca import PCA
    pca = PCA(n_components=75, whiten=True)

    print '... train PCA'
    pca.fit(train_set_x.value)
    print '... explained variance'
    print pca.explained_variance_
    print '... transform valid/test'
    train_r = pca.transform(train_set_x.value)
    valid_r = pca.transform(valid_set_x.value)
    test_r = pca.transform(test_set_x.value)

    train_set_x.value = train_r
    valid_set_x.value = valid_r
    test_set_x.value = test_r

    del PCA, train_r, valid_r, test_r

    ################
    # Second block #
    ################

    da1 = dA()
    save_dir1 = '/data/lisa/exp/mesnilgr/ift6266h11/GREGAVI2_/55'
    da1.load(save_dir1)

    index = T.lscalar()  # index to a [mini]batch
    x = theano.tensor.matrix('input')
    get_rep_train = theano.function([index],
                                    da1.get_hidden_values(x),
                                    updates={},
                                    givens={x: train_set_x},
                                    name='get_rep_train')
    get_rep_valid = theano.function([index],
                                    da1.get_hidden_values(x),
                                    updates={},
                                    givens={x: valid_set_x},
                                    name='get_rep_valid')
    get_rep_test = theano.function([index],
                                   da1.get_hidden_values(x),
                                   updates={},
                                   givens={x: test_set_x},
                                   name='get_rep_test')

    # valid and test representations
    train_r = get_rep_train(0)
    valid_r = get_rep_valid(0)
    test_r = get_rep_test(0)

    train_set_x.value = train_r
    valid_set_x.value = valid_r
    test_set_x.value = test_r

    del train_r, valid_r, test_r

    d = get_constant(train_set_x.shape[1])

    da = dA(n_visible=d,
            n_hidden=n_hidden,
            tied_weights=tied_weights,
            act_enc=act_enc,
            act_dec=act_dec)

    time_spent, loss = da.fit(train_set_x, learning_rate, batch_size, epochs,
                              cost_type, noise_type, corruption_level, reg)

    if save_dir:
        da.save(save_dir)

    denoising_error = da.get_denoising_error(valid_set_x, cost_type,
                                             noise_type, corruption_level)
    print 'Training complete in %f (min) with final denoising error %f' \
        % (time_spent, denoising_error)

    if do_pca:
        print "... computing PCA"
        x = theano.tensor.matrix('input')
        get_rep_train = theano.function([],
                                        da.get_hidden_values(x),
                                        updates={},
                                        givens={x: train_set_x},
                                        name='get_rep_train')
        pca_trainer = pca.PCATrainer(get_rep_train(),
                                     num_components=num_components,
                                     min_variance=min_variance)
        pca_trainer.updates()
        pca_trainer.save(save_dir)

    if do_create_submission:
        print "... creating submission"
        if submission_dir is None:
            submission_dir = save_dir
        create_submission(dataset, save_dir, submission_dir,
                          normalize_on_the_fly, do_pca)

    return denoising_error, time_spent, loss
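
For reference, a hypothetical invocation of main_train; the dataset name,
paths and hyperparameter values below are illustrative only, not taken from
the original project:

# Hypothetical call; all values are made up for illustration.
err, t_spent, loss = main_train(dataset='avicenna',
                                save_dir='/tmp/da_run',
                                n_hidden=500,
                                tied_weights=True,
                                act_enc='sigmoid',
                                act_dec='sigmoid',
                                learning_rate=0.01,
                                batch_size=20,
                                epochs=50,
                                cost_type='MSE',
                                noise_type='gaussian',
                                corruption_level=0.3,
                                reg=0.0)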
Example #12
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
from scikits.learn.pca import PCA

from src.data_interface import d, L_clean, L
from src.utils import get_path, bool_to_color


path = get_path(__file__) + '/..'
L = list(L)

# Remove trial_id, obsnum and is alert
# I change notation here from D to X
X = d.view()[:, 3:]

pca = PCA(n_components=30)
pca.fit(X)

plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o')
ax = plt.gca()
plt.title('Cumulative percentage of total variation explained by principal components')
ax.set_xlabel('Principal component')
ax.set_ylabel('% of total variation')
plt.savefig('{0}/plots/pca-variation-explained.pdf'.format(path), papertype='a4', format='pdf')
plt.cla()

W = pca.components_[:, 0:3]
X_p = np.dot(X, W)

rnd_rows = np.random.random_integers(0, X.shape[0] - 1, 120)  # upper bound is inclusive
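
The snippet breaks off after sampling rnd_rows; given the Axes3D import, a
hedged sketch of a likely 3-D scatter follows, where coloring the points via
bool_to_color over the labels L is an assumption based on the imports:

# Hedged 3-D scatter of the sampled rows on the first three components.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
colors = [bool_to_color(l) for l in np.array(L)[rnd_rows]]
ax.scatter(X_p[rnd_rows, 0], X_p[rnd_rows, 1], X_p[rnd_rows, 2], c=colors)
ax.set_title('Samples on the first three principal components')
plt.savefig('{0}/plots/pca-3d-projection.pdf'.format(path), format='pdf')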
Example #13
import numpy as np
import pylab as pl

from scikits.learn.pca import PCA
from scikits.learn.fastica import FastICA

###############################################################################
# Generate sample data
S = np.random.standard_t(1.5, size=(2, 10000))
S[0] *= 2.

# Mix data
A = [[1, 1], [0, 2]]  # Mixing matrix

X = np.dot(A, S)  # Generate observations

pca = PCA()
S_pca_ = pca.fit(X.T).transform(X.T).T

ica = FastICA()
S_ica_ = ica.fit(X).transform(X)  # Estimate the sources

S_ica_ /= S_ica_.std(axis=1)[:, np.newaxis]

###############################################################################
# Plot results


def plot_samples(S, axis_list=None):
    pl.scatter(S[0], S[1], s=2, marker='o', linewidths=0, zorder=10)
    if axis_list is not None:
        colors = [(0, 0.6, 0), (0.6, 0, 0)]
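
The function is truncated here in the original; a hedged completion of the
axis-drawing loop, based on how these mixing/unmixing bases are typically
drawn through the origin (the quiver options are illustrative):

        for color, axis in zip(colors, axis_list):
            axis /= axis.std()
            x_axis, y_axis = axis
            # draw the two column vectors of this basis from the origin
            pl.quiver((0, 0), (0, 0), x_axis, y_axis, zorder=11,
                      width=0.01, scale=6, color=color)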