Example #1
from sklearn.decomposition import PCA  # import assumed; the original excerpt omits it


class PCAMODEL(object):
    n_components = None
    trainX = None
    trainY = None
    testX = None
    model = None

    def __init__(self, n='mle', X=None, Y=None):
        self.n_components = n
        self.trainX = X
        self.trainY = Y

    def build_model(self):
        # fit a PCA model with the requested number of components
        self.model = PCA(self.n_components)
        self.model.fit(self.trainX)

    def reduce_dim(self, data):
        # project data onto the fitted principal components
        return self.model.transform(data)
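A minimal usage sketch of this wrapper, with synthetic data standing in for real features (the array shape and component count are illustrative, not from the original project):

import numpy as np

X_train = np.random.rand(100, 10)        # hypothetical feature matrix
wrapper = PCAMODEL(n=3, X=X_train)       # keep 3 principal components
wrapper.build_model()
X_reduced = wrapper.reduce_dim(X_train)
print(X_reduced.shape)                   # (100, 3)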
Example #2
def PDBpca(pdblist_file, npcs=5, refPDB_file=None):

  # read pdblist file and fit each structure to refPDB
  pdbdata, miscs, rmsds = pynumpdb.readPDBlist(pdblist_file, refPDB_file)

  # run PCA
  #v,P,PC = pynumpdb._pca.pca_train(pdbdata,npcs,do_norm=0)
  pca = PCA()
  pca.fit(pdbdata)
  v = pca.explained_variance_   # eigenvalues
  P = pca.components_           # principal axes
  PC = pca.transform(pdbdata)   # projection of each structure
  print v
  print P
  print len(PC), len(PC[0])
  #print PC.T

  return v, P, PC
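A usage sketch (the file names below are placeholders, and pynumpdb with its readPDBlist helper is assumed to be importable, as in the excerpt):

v, P, PC = PDBpca('pdblist.txt', npcs=5, refPDB_file='reference.pdb')  # hypothetical inputs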
Example #3
import numpy as np
import pylab as pl

from scikits.learn.decomposition import PCA, FastICA

###############################################################################
# Generate sample data
S = np.random.standard_t(1.5, size=(2, 10000))
S[0] *= 2.

# Mix data
A = [[1, 1], [0, 2]]  # Mixing matrix

X = np.dot(A, S)  # Generate observations

pca = PCA()
S_pca_ = pca.fit(X.T).transform(X.T).T

ica = FastICA()
S_ica_ = ica.fit(X).transform(X)  # Estimate the sources

S_ica_ /= S_ica_.std(axis=1)[:, np.newaxis]


###############################################################################
# Plot results

def plot_samples(S, axis_list=None):
    pl.scatter(S[0], S[1], s=2, marker='o', linewidths=0, zorder=10)
    if axis_list is not None:
        colors = [(0, 0.6, 0), (0.6, 0, 0)]
Example #4
    def build_model(self):
        self.model = PCA(self.n_components)
        self.model.fit(self.trainX)
Example #5
#!/usr/bin/env python

import os, numpy
from scikits.learn.decomposition import PCA

from ift6266h12.utils.ift6266h12_io import load_train_input, load_test_input, load_valid_input

dest_path = '/data/lisa/data/UTLC/pca'

trainset = load_train_input('sylvester', normalize=True)
testset = load_test_input('sylvester', normalize=True)
validset = load_valid_input('sylvester', normalize=True)

pca = PCA(32)
pca.fit(trainset)

numpy.save(os.path.join(dest_path, 'sylvester_train_x_pca32.npy'),
           pca.transform(trainset))
numpy.save(os.path.join(dest_path, 'sylvester_valid_x_pca32.npy'),
           pca.transform(validset))
numpy.save(os.path.join(dest_path, 'sylvester_test_x_pca32.npy'),
           pca.transform(testset))
Example #6
def generate_clusters(n_samples=200):
    mean1 = np.array([0, 2])
    mean2 = np.array([2, 0])
    cov = np.array([[2.0, 1.0], [1.0, 2.0]])
    X_red = np.random.multivariate_normal(mean1, cov, n_samples)
    X_blue = np.random.multivariate_normal(mean2, cov, n_samples)
    return np.vstack((X_red, X_blue))

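# generate_rings() is called below but not defined in this excerpt; a minimal
# numpy-only stand-in (ring radii and noise level are assumptions), ordered so
# the first n_samples points form the outer ring and the rest the inner ring,
# matching the red/blue split in the plotting code further down:
def generate_rings(n_samples=200):
    angles_outer = np.random.uniform(0, 2 * np.pi, n_samples)
    angles_inner = np.random.uniform(0, 2 * np.pi, n_samples)
    outer = 4.0 * np.column_stack((np.cos(angles_outer), np.sin(angles_outer)))
    inner = 1.0 * np.column_stack((np.cos(angles_inner), np.sin(angles_inner)))
    return np.vstack((outer, inner)) + 0.2 * np.random.randn(2 * n_samples, 2)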
X = generate_rings()
#X = generate_clusters()

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=0.5)
X_kpca = kpca.fit_transform(X)
X_back = kpca.inverse_transform(X_kpca)
pca = PCA()
X_pca = pca.fit_transform(X)

# Plot results

pl.figure()
pl.subplot(2, 2, 1, aspect='equal')
pl.title("Original space")
pl.plot(X[:200, 0], X[:200, 1], "ro")
pl.plot(X[200:, 0], X[200:, 1], "bo")
pl.xlabel("$x_1$")
pl.ylabel("$x_2$")

X1, X2 = np.meshgrid(np.linspace(-6, 6, 50), np.linspace(-6, 6, 50))
X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
# projection on the first principal component (in the phi space)
Example #7
import pylab as pl

from scikits.learn import datasets
from scikits.learn.decomposition import PCA
from scikits.learn.lda import LDA

iris = datasets.load_iris()

X = iris.data

y = iris.target

target_names = iris.target_names
print target_names
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)

lda = LDA(n_components=2)
X_r2 = lda.fit(X, y).transform(X)

# Percentage of variance explained for each component
print 'explained variance ratio (first two components):', \
    pca.explained_variance_ratio_

pl.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    pl.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
pl.legend()
pl.title('PCA of IRIS dataset')
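The LDA projection computed above (X_r2) is not plotted in this excerpt; a sketch of the analogous scatter plot, mirroring the PCA figure:

pl.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    pl.scatter(X_r2[y == i, 0], X_r2[y == i, 1], c=c, label=target_name)
pl.legend()
pl.title('LDA of IRIS dataset')
pl.show()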
Example #8
    'box1.npy': 4,
    'box2.npy': 4,
    'box3.npy': 4,
    'box4.npy': 4,
    'box5.npy': 4,
    'bottle1.npy': 3,
    'bottle2.npy': 3,
    'bottle3.npy': 3,
    'bottle4.npy': 3,
    'bottle5.npy': 3
}

X, Y = load_and_pack_data(dnames, 20, 20)

# PCA and Kernel PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)
print 'done simple pca'

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True)
X_kpca = kpca.fit_transform(X)
print 'fitted kernel pca'
X_back = kpca.inverse_transform(X_kpca)
print 'done back transforming with kpca'

# plots
reds = Y == 1
blues = Y == 2
greens = Y == 3
magentas = Y == 4
yellows = Y == 5
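The excerpt stops before any plotting; a sketch of how the class masks above might be used with the PCA projection (pylab assumed imported as pl, as in the other examples on this page; the colors are arbitrary):

import pylab as pl
for mask, color in [(reds, 'r'), (blues, 'b'), (greens, 'g'),
                    (magentas, 'm'), (yellows, 'y')]:
    pl.scatter(X_pca[mask, 0], X_pca[mask, 1], c=color, s=10)
pl.title('PCA projection (first two of three components) by class')
pl.show()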
Example #9
print "n_digits: %d" % n_digits
print "n_features: %d" % n_features
print "n_samples: %d" % n_samples
print

print "Raw k-means with k-means++ init..."
t0 = time()
km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data)
print "done in %0.3fs" % (time() - t0)
print "inertia: %f" % km.inertia_
print

print "Raw k-means with random centroid init..."
t0 = time()
km = KMeans(init='random', k=n_digits, n_init=10).fit(data)
print "done in %0.3fs" % (time() - t0)
print "inertia: %f" % km.inertia_
print

print "Raw k-means with PCA-based centroid init..."
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
t0 = time()
pca = PCA(n_components=n_digits).fit(data)
km = KMeans(init=pca.components_, k=n_digits, n_init=1).fit(data)
print "done in %0.3fs" % (time() - t0)
print "inertia: %f" % km.inertia_
print

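This excerpt assumes the digits data and the counts printed above were set up earlier in the script; a minimal sketch of that setup, using the old scikits.learn package layout seen elsewhere on this page (on current releases use sklearn and KMeans(n_clusters=...)):

from time import time
import numpy as np
from scikits.learn.cluster import KMeans
from scikits.learn.datasets import load_digits
from scikits.learn.decomposition import PCA

digits = load_digits()
data = digits.data
n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))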
Example #10
"""
print __doc__

import pylab as pl

from scikits.learn import datasets
from scikits.learn.decomposition import PCA
from scikits.learn.lda import LDA

iris = datasets.load_iris()

X = iris.data
y = iris.target
target_names = iris.target_names

pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)

lda = LDA(n_components=2)
X_r2 = lda.fit(X, y).transform(X)

# Percentage of variance explained for each component
print 'explained variance ratio (first two components):', \
    pca.explained_variance_ratio_

pl.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    pl.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
pl.legend()
pl.title('PCA of IRIS dataset')
Example #11
import numpy as np
import pylab as pl

from scikits.learn.decomposition import PCA, FastICA

###############################################################################
# Generate sample data
S = np.random.standard_t(1.5, size=(10000, 2))
S[:, 0] *= 2.

# Mix data
A = np.array([[1, 1], [0, 2]])  # Mixing matrix

X = np.dot(S, A.T)  # Generate observations

pca = PCA()
S_pca_ = pca.fit(X).transform(X)

ica = FastICA()
S_ica_ = ica.fit(X).transform(X)  # Estimate the sources

S_ica_ /= S_ica_.std(axis=0)


###############################################################################
# Plot results

def plot_samples(S, axis_list=None):
    pl.scatter(S[:,0], S[:,1], s=2, marker='o', linewidths=0, zorder=10)
    if axis_list is not None:
        colors = [(0, 0.6, 0), (0.6, 0, 0)]
Example #12
def generate_clusters(n_samples=200):
    mean1 = np.array([0, 2])
    mean2 = np.array([2, 0])
    cov = np.array([[2.0, 1.0], [1.0, 2.0]])
    X_red = np.random.multivariate_normal(mean1, cov, n_samples)
    X_blue = np.random.multivariate_normal(mean2, cov, n_samples)
    return np.vstack((X_red, X_blue))

X = generate_rings()
#X = generate_clusters()

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True)
X_kpca = kpca.fit_transform(X)
X_back = kpca.inverse_transform(X_kpca)
pca = PCA()
X_pca = pca.fit_transform(X)

# Plot results

pl.figure()
pl.subplot(2, 2, 1, aspect='equal')
pl.title("Original space")
pl.plot(X[:200, 0], X[:200, 1], "ro")
pl.plot(X[200:, 0], X[200:, 1], "bo")
pl.xlabel("$x_1$")
pl.ylabel("$x_2$")

X1, X2 = np.meshgrid(np.linspace(-6, 6, 50), np.linspace(-6, 6, 50))
X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
# projection on the first principal component (in the phi space)
Example #13
def main_train(work_dir="../results/avicenna/",
               corruption_level=0.3,
               nvis=75,
               nhid=600,
               tied_weights=True,
               act_enc="sigmoid",
               act_dec=None,
               max_epochs=2,
               learning_rate=0.001,
               batch_size=20,
               monitoring_batches=5,
               save_freq=1,
               n_components_trans_pca=7):

    conf = {
        'corruption_level': corruption_level,
        'nvis': nvis,
        'nhid': nhid,
        'tied_weights': tied_weights,
        'act_enc': act_enc,
        'act_dec': act_dec,
        'max_epochs': max_epochs,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'monitoring_batches': monitoring_batches,
        'save_freq': save_freq,
        'n_components_trans_pca': n_components_trans_pca
    }

    start = time.clock()

    ###############   TRAIN THE DAE
    train_file = work_dir + "train_pca" + str(conf['nvis']) + ".npy"
    save_path = work_dir + "train_pca" + str(conf['nvis']) + "_dae" + str(
        conf['nhid']) + "_model.pkl"

    trainset = NpyDataset(file=train_file)
    trainset.yaml_src = 'script'
    corruptor = BinomialCorruptor(corruption_level=conf['corruption_level'])
    dae = DenoisingAutoencoder(nvis=conf['nvis'],
                               nhid=conf['nhid'],
                               tied_weights=conf['tied_weights'],
                               corruptor=corruptor,
                               act_enc=conf['act_enc'],
                               act_dec=conf['act_dec'])
    cost = MeanSquaredReconstructionError()
    termination_criterion = EpochCounter(max_epochs=conf['max_epochs'])
    algorithm = UnsupervisedExhaustiveSGD(
        learning_rate=conf['learning_rate'],
        batch_size=conf['batch_size'],
        monitoring_batches=conf['monitoring_batches'],
        monitoring_dataset=trainset,
        cost=cost,
        termination_criterion=termination_criterion)

    train_obj = Train(dataset=trainset,
                      model=dae,
                      algorithm=algorithm,
                      save_freq=conf['save_freq'],
                      save_path=save_path)
    train_obj.main_loop()

    ###############   APPLY THE MODEL ON THE TRAIN DATASET
    print("Applying the model on the train dataset...")
    model = load(save_path)
    save_train_path = work_dir + "train_pca" + str(
        conf['nvis']) + "_dae" + str(conf['nhid']) + ".npy"
    dump_obj = FeatureDump(encoder=model,
                           dataset=trainset,
                           path=save_train_path)
    dump_obj.main_loop()

    ###############   APPLY THE MODEL ON THE VALID DATASET
    print("Applying the model on the valid dataset...")
    valid_file = work_dir + "valid_pca" + str(conf['nvis']) + ".npy"

    validset = NpyDataset(file=valid_file)
    validset.yaml_src = 'script'
    save_valid_path = work_dir + "valid_pca" + str(
        conf['nvis']) + "_dae" + str(conf['nhid']) + ".npy"
    dump_obj = FeatureDump(encoder=model,
                           dataset=validset,
                           path=save_valid_path)
    dump_obj.main_loop()

    ###############   APPLY THE MODEL ON THE TEST DATASET
    print("Applying the model on the test dataset...")
    test_file = work_dir + "test_pca" + str(conf['nvis']) + ".npy"

    testset = NpyDataset(file=test_file)
    testset.yaml_src = 'script'
    save_test_path = work_dir + "test_pca" + str(conf['nvis']) + "_dae" + str(
        conf['nhid']) + ".npy"
    dump_obj = FeatureDump(encoder=model, dataset=testset, path=save_test_path)
    dump_obj.main_loop()

    ###############   COMPUTE THE ALC SCORE ON VALIDATION SET
    valid_data = ift6266h12.load_npy(save_valid_path)
    label_data = ift6266h12.load_npy(
        '/data/lisa/data/UTLC/numpy_data/avicenna_valid_y.npy')
    alc_1 = score(valid_data, label_data)

    ###############   APPLY THE TRANSDUCTIVE PCA
    test_data = ift6266h12.load_npy(save_test_path)
    trans_pca = PCA(n_components=conf['n_components_trans_pca'])
    final_valid = trans_pca.fit_transform(valid_data)
    final_test = trans_pca.fit_transform(test_data)

    save_valid_path = work_dir + "valid_pca" + str(
        conf['nvis']) + "_dae" + str(conf['nhid']) + "_tpca" + str(
            conf['n_components_trans_pca']) + ".npy"
    save_test_path = work_dir + "test_pca" + str(conf['nvis']) + "_dae" + str(
        conf['nhid']) + "_tpca" + str(conf['n_components_trans_pca']) + ".npy"

    np.save(save_valid_path, final_valid)
    np.save(save_test_path, final_test)

    ###############   COMPUTE THE NEW ALC SCORE ON VALIDATION SET
    alc_2 = score(final_valid, label_data)

    ###############   OUTPUT AND RETURN THE RESULTS
    timeSpent = ((time.clock() - start) / 60.)
    print('FINAL RESULTS (PCA-%s DAE-%s TransPCA-%s) ALC after DAE: %s FINAL ALC: %s '
          'Computed in %5.2f min'
          % (conf['nvis'], conf['nhid'], conf['n_components_trans_pca'],
             alc_1, alc_2, timeSpent))

    return timeSpent, alc_1, alc_2
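A usage sketch for running the pipeline above (assuming the pylearn2-era dependencies are installed and the preprocessed train/valid/test *_pca<nvis>.npy files already exist under work_dir):

if __name__ == '__main__':
    time_spent, alc_after_dae, alc_final = main_train(
        work_dir="../results/avicenna/",
        nvis=75,
        nhid=600,
        max_epochs=2,
        n_components_trans_pca=7)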