Example #1
 def __init__(self,
              n_clusters=50,
              pca_n_components=20,
              kmpca_n_components=3,
              kernel_n_components=30):
     self.counter = text.CountVectorizer(stop_words='english',
                                         ngram_range=(1, 2),
                                         min_df=30,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     self.X_names = [
         'Title_CounterX', 'Title_ClusterdX', 'Title_KmX', 'Title_PCAX',
         'Title_PCAClusterdX', 'Title_RbfX', 'Title_TreeX'
     ]
     self.linear_feature_selector = None
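Note that RandomizedPCA was deprecated in scikit-learn 0.18 and removed in later releases; on current versions the same randomized solver is reached through PCA itself. A minimal, hedged sketch of the drop-in replacement (parameter value mirrors the example above):

from sklearn import decomposition

# On scikit-learn >= 0.18 the randomized solver is an option of PCA;
# intended as the modern equivalent of RandomizedPCA(n_components=20).
pca = decomposition.PCA(n_components=20, svd_solver='randomized', random_state=0)
# pca.fit_transform(X) then behaves like the old class's fit_transform(X).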
Example #2
def pca_docs(docs, client, collectionname=None, filename=None, scalerfilename=None):
    if collectionname is None:
        collectionname = "doc2vec"
    if filename is None:
        filename = "doc2vec_pca"
    if scalerfilename is None:
        scalerfilename = "doc2vec_pca_scaler"
    modelstore = GridFS(client.models,collection=collectionname)
    try:
        pca_model = pickle.loads(modelstore.get_version(filename=filename).read())
    except NoFile:
        pca_model = decomposition.RandomizedPCA(n_components=PCAVECTORSIZE)
    if pca_model.n_components != PCAVECTORSIZE:
        pca_model = decomposition.RandomizedPCA(n_components=PCAVECTORSIZE)
    training_data = []
    for doc in docs:
        try:
            doc_result = get_vector(ObjectId(doc.tags[0]),client)
            if doc_result is not None:
                training_data.append(doc_result)
        except Exception:
            pass
    try:
        scaler = pickle.loads(modelstore.get_version(filename=scalerfilename).read())
    except NoFile:
        scaler = StandardScaler()
    scaler.fit(training_data)
    pca_model.fit(scaler.transform(training_data))
    modelstore.put(pickle.dumps(pca_model),filename=filename)
    modelstore.put(pickle.dumps(scaler),filename=scalerfilename)
    update_pcavecs(docs,pca_model,scaler,client)
Example #3
 def __init__(self,
              n_clusters=50,
              pca_n_components=30,
              kmpca_n_components=3,
              kernel_n_components=30):
     ## use (min_df = 30, max_df = 0.5) to generate a lot of features - more choices for feature selection
     ## use (min_df = 0.001, max_df = 0.05) to generate fewer features - better clustering (see the sketch after this example)
     self.counter = text.CountVectorizer(stop_words='english',
                                         charset='utf-8',
                                         charset_error='ignore',
                                         ngram_range=(1, 1),
                                         min_df=0.001,
                                         max_df=0.05,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     self.X_names = [
         'Desc_CounterX', 'Desc_ClusterdX', 'Desc_KmX', 'Desc_PCAX',
         'Desc_PCAClusterdX', 'Desc_RbfX', 'Desc_TreeX'
     ]
     self.linear_feature_selector = None
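The min_df/max_df comments above trade vocabulary size against cluster quality. A self-contained, hedged sketch on a toy corpus (the documents and the scaled-down thresholds are illustrative only) showing how tightening the document-frequency bounds shrinks the feature set:

from sklearn.feature_extraction import text

corpus = ["python developer wanted", "senior python engineer",
          "data engineer", "marketing manager", "sales manager"]

wide = text.CountVectorizer(stop_words='english', min_df=1, max_df=1.0, binary=True)
narrow = text.CountVectorizer(stop_words='english', min_df=2, max_df=0.8, binary=True)

wide.fit(corpus)
narrow.fit(corpus)
print(len(wide.vocabulary_))    # every term kept -> many features
print(len(narrow.vocabulary_))  # only terms in 2+ docs (and < 80% of docs) survive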
Example #4
def show_PCA_training(digits):
    # Create a Randomized PCA model that takes two components
    from sklearn import decomposition
    
    randomized_pca = decomposition.RandomizedPCA(n_components=2)


    # Fit and transform the data to the model
    reduced_data_rpca = randomized_pca.fit_transform(digits.data)

    # Create a regular PCA model 
    pca = decomposition.PCA(n_components=2)

    # Fit and transform the data to the model
    reduced_data_pca = pca.fit_transform(digits.data)

    # Inspect the shape
    print(reduced_data_pca.shape)

    # Print out the data
    print(reduced_data_rpca)
    print(reduced_data_pca)    
    
    colors = ['black', 'blue', 'purple', 'yellow', 'white', 'red', 'lime', 'cyan', 'orange', 'gray']
    for i in range(len(colors)):
        x = reduced_data_rpca[:, 0][digits.target == i]
        y = reduced_data_rpca[:, 1][digits.target == i]
        plt.scatter(x, y, c=colors[i])
    plt.legend(digits.target_names, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title("PCA Scatter Plot")
    plt.show()
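A hedged usage sketch for show_PCA_training: it expects the scikit-learn digits bunch (with .data, .target and .target_names), assumes matplotlib.pyplot is available as plt in the defining module, and needs a scikit-learn old enough to still provide RandomizedPCA:

from sklearn import datasets
import matplotlib.pyplot as plt

digits = datasets.load_digits()
show_PCA_training(digits)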
Example #5
    def decompose(self,features,labels=None):
        if self.pca_components == 0 \
           or self.pca_components+self.start_component > features.shape[1]:
            print ('WARNING: no or too many pca-components given, taking all'
                    ' (={} dimensions)'.format(features.shape[1]))
            self.pca_components = features.shape[1]
            if self.start_component:
                self.pca_components -= self.start_component

        print ('run {} w. {} components and reg of '
               '{} [start component:{}]'.format(self.decomp_method,
                                                self.pca_components,
                                                self.reg,
                                                self.start_component)\
               + (' + whiten' if self.pca_whiten else ''))

        if 'pca' in self.decomp_method and not 'rpca' in self.decomp_method:
            if features.shape[1] > 500 and features.shape[0] > 500:
                print ('your data seems too large ({}) for plain PCA --> '
                       'using RandomizedPCA; consider fitting it on a subset and'
                       ' then transforming the rest'.format(features.shape))
                self.pca = decomposition.RandomizedPCA(self.pca_components,
                                                       iterated_power=5,
                                                       whiten=self.pca_whiten)
            else:
                self.pca = decomposition.PCA(self.pca_components,
                                             whiten=self.pca_whiten)
        elif 'rpca' in self.decomp_method:
            self.pca = RegularizedPCA(self.pca_components,
                                      whiten=self.pca_whiten,
                                      regularization=self.reg,
                                      start_component=self.start_component)
Example #6
    def __init__(self, texts, labels, weights):
        self.texts = texts
        self.classifier = linear_model.LogisticRegression()

        # Vectorize
        self.vectorizer = CountVectorizer(
            ngram_range=(1, 1),
            min_df=0.0,
            max_df=0.9,
            strip_accents='unicode',
            stop_words=stopwords.words('spanish'),
            binary=False)
        self.vectorizer.fit(texts)
        word_counts = self.vectorizer.transform(texts)
        # Normalize
        self.tf_transformer = TfidfTransformer(use_idf=True)
        data = self.tf_transformer.fit_transform(word_counts)

        n_eigenfaces = 200
        self.pca = decomposition.RandomizedPCA(n_components=n_eigenfaces,
                                               whiten=True)
        pca_features = self.pca.fit_transform(data.toarray())

        #with plt.style.context('cev_plot'):
        # checking that the number of eigenfaces makes sense..
        plt.figure(figsize=(8, 6))
        plt.title('cev vs eigenFace')
        plt.plot(self.pca.explained_variance_ratio_.cumsum())
        #plt.show()

        self.classifier.fit(pca_features, labels)
Example #7
    def PreprocessingRandomizedPCA(self,
                                   PCA_coefficients,
                                   MNE_coefficients,
                                   N_neighbors,
                                   whiten=True):
        """
        :type MNE_coefficients: int
        :type PCA_coefficients: int
        :param MNE_coefficients: number of coefficients for the MNE projection
        :param PCA_coefficients: number of n_coefficients for PCA transform
        :param N_neighbors: number of neighbors for embedding
        """
        self.MNE_coefficients = MNE_coefficients
        self.PCA_coefficients = PCA_coefficients
        self.N_neighbors = N_neighbors

        self.pca = decomposition.RandomizedPCA(
            n_components=self.PCA_coefficients, whiten=whiten)

        self.Embedding = manifold.SpectralEmbedding(
            n_components=self.MNE_coefficients,
            affinity='nearest_neighbors',
            gamma=None,
            random_state=0,
            n_neighbors=self.N_neighbors)
        self.X_pca = self.pca.fit_transform(self.Waves_Coefficients)
        self.X_red = self.Embedding.fit_transform(self.X_pca)
        return self.X_red
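The method above chains a randomized PCA with a spectral embedding. A standalone, hedged sketch of the same idea on synthetic data, written against the modern PCA(svd_solver='randomized') API (an assumption, since the example itself targets the removed RandomizedPCA class):

import numpy as np
from sklearn import decomposition, manifold

X = np.random.RandomState(0).rand(200, 50)            # stand-in for Waves_Coefficients
pca = decomposition.PCA(n_components=10, svd_solver='randomized', whiten=True)
embedding = manifold.SpectralEmbedding(n_components=3,
                                       affinity='nearest_neighbors',
                                       random_state=0,
                                       n_neighbors=15)
X_pca = pca.fit_transform(X)
X_red = embedding.fit_transform(X_pca)
print(X_red.shape)                                     # (200, 3)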
Example #8
    def PCA(self, X, Y=None, ncomp=2, method='PCA'):
        """ decompose a multivariate dataset in an orthogonal
            set that explain a maximum amount of the variance

        @param X: Input dataset

        Keyword Arguments:
        ncomp  -- number of components to be kept (Default: 2)
        method -- method to be used
                  PCA(default)/Randomized/Sparse

        """
        from sklearn import decomposition
        from sklearn import cross_decomposition
        if method == 'Randomized':
            pca = decomposition.RandomizedPCA(n_components=ncomp)
        elif method == 'Sparse':
            pca = decomposition.SparsePCA(n_components=ncomp)
        elif method == 'rbf':
            pca = decomposition.KernelPCA(n_components=ncomp,
                                          fit_inverse_transform=True,
                                          gamma=10,
                                          kernel="rbf")
        elif method == 'linear':
            pca = decomposition.KernelPCA(n_components=ncomp, kernel="linear")
        elif method == 'sigmoid':
            pca = decomposition.KernelPCA(n_components=ncomp, kernel="sigmoid")
        elif method == 'SVD':
            pca = decomposition.TruncatedSVD(n_components=ncomp)
        else:
            pca = decomposition.PCA(n_components=ncomp)
            method = 'PCA'
        print('[ML] Using %s method' % method)
        pca.fit(X)
        return pca.transform(X)
Example #9
    def __init__(self, data, training_movie_ids, rounded_rating=False, run_pca=True, sparse_matrix=True):
        logging.info("Initializing DataTransformer...")
        self.rounded_rating = rounded_rating
        self.run_pca = run_pca
        self.sparse_matrix = sparse_matrix

        # compute_cast_experience(data)

        # Maps feature name to its index in the feature vector
        feature_name_to_count = {}
        cast_to_count = {}
        for movie_id in training_movie_ids:
            if str(movie_id) not in data:
                continue
            movie_data = data[str(movie_id)]
            for feature_name in movie_data['features']:
                if feature_name not in feature_name_to_count:
                    feature_name_to_count[feature_name] = 1
                else:
                    feature_name_to_count[feature_name] += 1

                # Keeps track of cast appearance.
                if len(feature_name) >= 5 and feature_name[0:5] == "cast_":
                    if feature_name not in cast_to_count:
                        cast_to_count[feature_name] = 1
                    else:
                        cast_to_count[feature_name] += 1

        # Drop features
        self.feature_name_to_index = {}
        logging.info("Number of features before drop: %s" % len(feature_name_to_count))
        for feature_name, feature_count in feature_name_to_count.items():
            if feature_count >= MINIMUM_FEATURE_COUNT:
                self.feature_name_to_index[feature_name] = len(self.feature_name_to_index)
        logging.info("Number of features after drop: %s" % len(self.feature_name_to_index))

        # num_movies * num_features matrix.
        self.feature_matrix = []
        # num_movies array.
        self.labels = []
        for movie_id in training_movie_ids:
            if str(movie_id) not in data:
                continue
            movie_data = data[str(movie_id)]
            self.feature_matrix.append(self.transform_features(movie_data['features']))
            if self.rounded_rating:
                self.labels.append(movie_data['rating_rounded'])
            else:
                self.labels.append(movie_data['rating'])

        if self.sparse_matrix:
            self.feature_matrix = sparse.csr_matrix(self.feature_matrix)
        if self.run_pca:
            logging.info("Fitting pca...")
            self.pca = decomposition.RandomizedPCA(copy=False, n_components=5000)
            self.feature_matrix = self.pca.fit_transform(self.feature_matrix)
            logging.info("PCA fit")

        logging.info("Initializing DataTransformer done!")
Example #10
def pca_reduce(data,
               n=None,
               copy=True,
               method='random',
               whiten=False,
               cutoff=1000):
    """
    Principal component analysis dimensionality reduction using Scikit Learn.

    Inputs:
        data = timepoints x voxels matrix.
        n = None -- return all components
          = int -- return int components
        copy = if False, do pca in place.
        whiten = pre-whiten (decorrelate) data.
        cutoff = maximum number of input features before we move to an
                 efficient method.

    This mean-centers the data (in-place).

    Returns:
        pcs from the input data
        % variance explained by each of them

    methods
    -------
    normal -- standard PCA
    random -- randomized PCA (for large matrices [1])

    [1] Halko, N., Martinsson, P. G., Shkolnisky, Y., & Tygert, M. (2010).
        An algorithm for the principal component analysis of large data sets.
    """

    import sklearn.decomposition as dec

    data = data.astype(float)
    data -= np.mean(data)  # mean-center entire dataset

    # set n to be the cutoff if the dimensionality of the data is large
    if n is None and method == 'random':
        n = cutoff

    if method == 'random':
        pcmodel = dec.RandomizedPCA(n_components=n, copy=copy, whiten=whiten)
    elif method == 'normal':
        pcmodel = dec.pca.PCA(n_components=n, copy=copy, whiten=whiten)

    try:
        pcmodel.fit(data)
    except:
        print(
            'ERROR: failed to find the top principal components of input data:\n{}'
            .format(data))

    data = pcmodel.transform(data)
    #components = pcmodel.components_
    exp_var = pcmodel.explained_variance_ratio_

    return data, exp_var
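A hedged usage sketch for pca_reduce above, on random data (it assumes numpy and a scikit-learn old enough to still ship RandomizedPCA):

import numpy as np

data = np.random.RandomState(0).randn(120, 400)   # timepoints x voxels
pcs, exp_var = pca_reduce(data, n=10, method='random')
print(pcs.shape)      # (120, 10)
print(exp_var.sum())  # fraction of variance captured by the 10 components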
Example #11
def search_outliers(X, m=6., mode=1, verbose=1):
    """ Search outliers in X matrix with mode:
        1. Select outliers in every column, than select rows-outliers 
            with too much columns-outlier  
        2. Select rows-outliers of sum of its all columns
        3. Select rows-outliers of max value of its all columns
        4. make PCA of the matrix X, than select rows-outliers of its 
            first four principal components
        parameter m - Z-score in std to select outliers
    """
    nrows, ncols = X.shape
    mode_search_outliers_array = int(mode / 10)
    if mode_search_outliers_array == 0:
        s_o_a = search_outliers_array
    else:
        s_o_a = search_outliers_array2
    mode_mode = mode % 10
    if mode_mode == 1:
        outliers = np.array([0.0] * nrows)
        for j in range(ncols):
            isout = s_o_a(X[:, j], m)
            if np.any(isout):
                bad = np.where(isout)[0]
                outliers[bad] += 1.0
                if verbose > 1:
                    print("outliers col:%d row_vals:%r" %
                          (j, zip(bad, X[bad, j]))),
                    print "data: ", np.mean(X[:, j]), "+-", np.std(X[:, j])
        sel_outliers = s_o_a(outliers, m=m)
    elif mode_mode == 2:
        outliers = np.sum(X, axis=1)
        sel_outliers = s_o_a(outliers, m=m)
    elif mode_mode == 3:
        outliers = np.max(X, axis=1)
        sel_outliers = s_o_a(outliers, m=m)
    elif mode_mode == 4:
        from feasel import VarSel
        pline = [
            ("varsel", VarSel(k=4000)),
            #("scaler", preprocessing.StandardScaler(with_mean=True)),
            ("pca",
             decomposition.RandomizedPCA(n_components=20,
                                         whiten=True,
                                         random_state=1))
        ]
        X1 = Pipeline(pline).fit_transform(X)
        #print "X1:",X1.shape,X1[:,:4]
        sel_outliers = np.array([False] * nrows)
        for j in range(4):
            outliers = X1[:, j]
            sel_outliers = sel_outliers | s_o_a(outliers, m=m)
            if np.any(sel_outliers): break
    else:
        raise ValueError("bad search_outliers mode: %r" % mode)
    if verbose > 0:
        #print "sel_outliers:",sel_outliers
        if type(sel_outliers) != bool:
            print "outliers:", outliers[sel_outliers]
    return np.where(sel_outliers)[0]
Example #12
def reduce_randomizedPCA(x):
    '''
        Reduce the dimensions using Randomized PCA algorithm
    '''
    # create the randomized PCA object
    randomPCA = dc.RandomizedPCA(n_components=2, whiten=True, copy=False)

    # learn the principal components from all the features
    return randomPCA.fit(x)
Example #13
def RPca_base_final(iX_train, iX_test, iy_train, iy_test, n_components=3):
    dX_train = copy.copy(iX_train)
    dX_test = copy.copy(iX_test)
    dy_train = copy.copy(iy_train)
    dy_test = copy.copy(iy_test)
    pca = decomposition.RandomizedPCA(n_components=n_components)
    pca.fit(dX_train)
    dX_train = pca.transform(dX_train)
    dX_test = pca.transform(dX_test)
    return dX_train, dX_test, dy_train, dy_test
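A hedged usage sketch for RPca_base_final; train_test_split is just one way to produce the four inputs and is not part of the example itself (the example's own imports, copy and sklearn.decomposition, are assumed to be in scope):

import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X, y = rng.rand(100, 20), rng.randint(0, 2, size=100)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
X_tr3, X_te3, _, _ = RPca_base_final(X_tr, X_te, y_tr, y_te, n_components=3)
print(X_tr3.shape, X_te3.shape)   # (..., 3) for both splits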
Example #14
def pca_lr(params, n_classes):
    C = np.exp(params['log_C'])
    n_components = params['n_components']
    mclass = 'multinomial' if n_classes > 2 else 'ovr'
    solver = 'lbfgs' if n_classes > 2 else 'liblinear'

    logistic = linear_model.LogisticRegression(C=C, multi_class=mclass, solver=solver, penalty='l2')

    pca = decomposition.RandomizedPCA(n_components=n_components)
    pca_lr_classifier = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
    return pca_lr_classifier, 'PCA Logistic Regression'
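A hedged usage sketch for pca_lr: the params dict carries log_C and n_components, and the returned Pipeline is fit like any other estimator (assuming the example's own imports, numpy, linear_model, decomposition and Pipeline, are in scope):

from sklearn import datasets

digits = datasets.load_digits()
clf, name = pca_lr({'log_C': 0.0, 'n_components': 20}, n_classes=10)
clf.fit(digits.data, digits.target)
print(name, clf.score(digits.data, digits.target))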
Example #15
def RPca_base(iX_train, iX_test, iy_train, iy_test, n_components=3):
    dX_train = copy.copy(iX_train)
    dX_test = copy.copy(iX_test)
    dy_train = copy.copy(iy_train)
    dy_test = copy.copy(iy_test)
    for i in range(0, len(iX_train)):
        pca = decomposition.RandomizedPCA(n_components=n_components)
        pca.fit(dX_train[i])
        dX_train[i] = pca.transform(dX_train[i])
        dX_test[i] = pca.transform(dX_test[i])
    return dX_train, dX_test, dy_train, dy_test
Example #16
def bench_skl(X, y, T, valid):
    #
    #       .. scikits.learn ..
    #
    from sklearn import decomposition
    start = datetime.now()
    clf = decomposition.RandomizedPCA(n_components=n_components)
    clf.fit(X)
    delta = datetime.now() - start
    ev = explained_variance(X, clf.components_).sum()
    return ev, delta
Example #17
 def RandomizedPCA(self, source):
     min_max_scaler = preprocessing.MinMaxScaler()
     data_source = min_max_scaler.fit_transform(source)
     pca = decomposition.RandomizedPCA(n_components=2)
     result = {}
     result['data'] = pca.fit_transform(data_source)
     params = 0.0
     for j in pca.explained_variance_ratio_:
         params = params + j
     result['params'] = params
     return result
Example #18
def RandomizedPCA(array, percent_samples):
    print "Randomized PCA", percent_samples * 100, "% of training data."
    print "Features\tTime"

    array = array[:int(percent_samples * len(array))]
    for pct in pct_features_list:
        num_features = int(pct * len(array[0]))
        start = time()
        Y = decomposition.RandomizedPCA(
            n_components=num_features).fit_transform(array)
        end = time()
        print num_features, "\t", (end - start)
Example #19
 def __init__(self,
              n_clusters=100,
              pca_n_components=10,
              kmpca_n_components=7,
              kernel_n_components=30):
     self.counter = text.CountVectorizer(stop_words='english',
                                         ngram_range=(1, 1),
                                         min_df=2,
                                         max_df=0.8,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     self.X_names = [
         'Loc_CounterX', 'Loc_ClusterdX', 'Loc_KmX', 'Loc_PCAX',
         'Loc_PCAClusterdX', 'Loc_RbfX', 'Loc_TreeX'
     ]
     self.linear_feature_selector = None
     ## BUILD dictionary based on location_tree - faster for search
     location_tree = [
         row[0].lower().split('~')[::-1]
         for row in csv.reader(open(LOCATION_TREE_FILE))
     ]
     self.location_dict = {}
     for locs in location_tree:
         for i in range(len(locs)):
             if locs[i] not in self.location_dict:
                 self.location_dict[locs[i]] = locs[i:]
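A hedged mini-demo of the location_dict construction above: each '~'-separated row is reversed, and every suffix from a location up to the root is stored (the two rows here are made up for illustration):

location_tree = [row.lower().split('~')[::-1]
                 for row in ["UK~London~Camden", "UK~Manchester"]]
location_dict = {}
for locs in location_tree:
    for i in range(len(locs)):
        if locs[i] not in location_dict:
            location_dict[locs[i]] = locs[i:]
print(location_dict['camden'])   # ['camden', 'london', 'uk']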
Example #20
    def compute(self):
        matrix = self.getInputFromPort('matrix')

        pca = decomposition.RandomizedPCA(
            n_components=self.forceGetInputFromPort('n_components', 2),
            copy=self.forceGetInputFromPort('copy', True),
            iterated_power=self.forceGetInputFromPort('iterated_power', 3),
            whiten=self.forceGetInputFromPort('whiten', False),
            random_state=self.forceGetInputFromPort('random_state', None))
        Y = pca.fit_transform(matrix.values)

        proj_matrix = copy.deepcopy(matrix)
        proj_matrix.values = Y
        self.setResult('proj_matrix', proj_matrix)
Example #21
def choose_decomposition_method(method, n_components):
    """Return the decomposition corresponding to `method`."""
    if method == 'PCA':
        return decomposition.PCA(n_components)
    elif method == 'Randomized PCA':
        return decomposition.RandomizedPCA(n_components)
    elif method == 'Kernel PCA':
        return decomposition.KernelPCA(n_components, kernel='rbf')
    elif method == 'Sparse PCA':
        return decomposition.SparsePCA(n_components, n_jobs=1)
    elif method == 'SVD':
        return decomposition.TruncatedSVD(n_components)
    elif method == 'Factor Analysis':
        return decomposition.FactorAnalysis(n_components)
    elif method == 'ICA':
        return decomposition.FastICA(n_components)
    raise ValueError('{} is not a known method'.format(method))
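A hedged usage sketch for choose_decomposition_method; every estimator it returns supports fit_transform, so dispatching by name looks like this:

import numpy as np

X = np.random.RandomState(0).rand(50, 8)
model = choose_decomposition_method('Randomized PCA', 2)
X2 = model.fit_transform(X)
print(X2.shape)   # (50, 2)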
Example #22
def pca():
    #pca = PCA()
    pca = decomposition.RandomizedPCA(n_components=150, whiten=True)

    # Input
    train_data = np.load('train_data.npy')
    train_data = train_data.reshape(7000, 128 * 128)
    #test_data = np.load('test_data.npy')

    #train_data = train_data.reshape(train_data.shape[0], -1)
    #test_data = test_data.reshape(test_data.shape[0], -1)

    pca.fit(train_data)
    X_train_pca = pca.transform(train_data)
    #train_data = pca.fit_transform(train_data)
    #test_data = pca.transform(test_data)

    np.save("train_data_pca", X_train_pca)
Example #23
def gen_estimators():
    '''
    List of the different estimators, whether to center and transpose the problem, and whether the transformer uses the clustering API.
    '''
    rng = RandomState(0)
    estimators = [
        ('Eigenfaces - RandomizedPCA',
         decomposition.RandomizedPCA(n_components=n_components,
                                     whiten=True), True),
        ('Non-negative components - NMF tol=1e-4',
         decomposition.NMF(n_components=n_components,
                           init='nndsvda',
                           tol=1e-4,
                           solver='cd'), False),
        ('Non-negative components - NMF tol=1e-6',
         decomposition.NMF(
             n_components=n_components,
             init='nndsvd',
         ), False),
        ('Independent components - FastICA',
         decomposition.FastICA(n_components=n_components, whiten=True), True),
        ('Sparse comp. - MiniBatchSparsePCA',
         decomposition.MiniBatchSparsePCA(n_components=n_components,
                                          alpha=0.8,
                                          n_iter=100,
                                          batch_size=3,
                                          random_state=rng), True),
        ('MiniBatchDictionaryLearning',
         decomposition.MiniBatchDictionaryLearning(n_components=15,
                                                   alpha=0.1,
                                                   n_iter=50,
                                                   batch_size=3,
                                                   random_state=rng), True),
        ('Cluster centers - MiniBatchKMeans',
         MiniBatchKMeans(n_clusters=n_components,
                         tol=1e-3,
                         batch_size=20,
                         max_iter=50,
                         random_state=rng), True),
        ('Factor Analysis components - FA',
         decomposition.FactorAnalysis(n_components=n_components,
                                      max_iter=2), True),
    ]
    return estimators
Example #24
def pca_reduce(data, **kwargs):

    #extract parameters
    n_components = kwargs.get('n_components', 'mle')
    copy = kwargs.get('copy', True)
    whiten = kwargs.get('whiten', True)

    #set up PCA function
    pca = decomposition.RandomizedPCA(n_components=n_components,
                                      copy=copy,
                                      whiten=whiten)

    #fit the data
    pca.fit(data)

    #run the reduction
    reduced_data = pca.transform(data)

    return reduced_data
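A hedged usage sketch for this pca_reduce variant; an integer n_components is passed explicitly because RandomizedPCA does not accept the 'mle' default used above:

import numpy as np

data = np.random.RandomState(0).rand(100, 30)
reduced = pca_reduce(data, n_components=5, whiten=True)
print(reduced.shape)   # (100, 5)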
Example #25
def deriveBasisSetsRandomizedPCA(data,
                                 cut,
                                 outfolder,
                                 components=10,
                                 whiten=False):
    """
    Derives a basis set from input data using Randomized Principal component analysis (PCA).
    Saves the basis sets to a FITS file for further processing.

    Information about PCA can be found from the scikit-learn website:
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.RandomizedPCA.html#sklearn.decomposition.RandomizedPCA

    :param data: input data from which the basis set are derived from. The input data must be an array of arrays.
                 Each array should describe an independent data set that has been flatted to 1D.
    :type data: ndarray
    :param cut: size of the cutout region that has been used
    :type cut: int
    :param outfolder: name of the output folder e.g. 'output'
    :type outfolder: str
    :param components: the number of basis set function components to derive
    :type components: int
    :param whiten: When True (False by default) the components_ vectors are divided by n_samples times
                   singular values to ensure uncorrelated outputs with unit component-wise variances.
    :type whiten: bool

    :return: Randomized PCA components
    """
    pca = decomposition.RandomizedPCA(n_components=components, whiten=whiten)
    pca.fit(data)
    image = pca.components_

    #output the variance ratio
    print 'Variance Ratio:', pca.explained_variance_ratio_ * 100.

    #save each component to a FITS file
    for i, img in enumerate(image):
        image = img.reshape(cut, cut)
        #to compare IDL results
        #image = -image
        fileIO.writeFITS(image,
                         outfolder + '/RandomPCAbasis%03d.fits' % (i + 1),
                         int=False)
    return image
Example #26
    def execute_analysis(self):
        """
        function to do randomized PCA on openMSI data via sklearn.decomposition.RandomizedPCA()
        The "copy" parameter of RandomizedPCA() is not supported (is always False)
        The "whiten" parameter of RandomizedPCA() is not currently supported (assumed False)
        """
        # extract input parameters

        start = time.time()

        msidata = self['msidata']
        n_components = self['numComponents']
        iterated_power = self['iteratedPower']
        random_state = self['randomState']

        nx, ny, nmz = msidata.shape

        # Randomized PCA
        # # reshape msidata from 3D (x by y by mz) to 2D (xy by mz)
        flatdata = np.array(
            [np.array(msidata[:, :, i]).flatten() for i in range(nmz)]).T

        # # do randomized PCA
        pca = decomposition.RandomizedPCA(n_components=n_components,
                                          iterated_power=iterated_power,
                                          random_state=random_state)
        pca.fit(flatdata)

        # # make new image and reshape to expected size
        newImageCubeFlat = pca.transform(flatdata)
        newImageCube = newImageCubeFlat.reshape(nx, ny, n_components)

        # # return other pca data (redundant with returning full pca object but not sure which way is best right now)
        components = pca.components_
        explainedVariance = pca.explained_variance_ratio_
        mean = pca.mean_

        # # return analysis time
        stop = time.time()
        analysisTime = stop - start

        # # return results
        return newImageCube, components, explainedVariance, mean, analysisTime
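The per-channel flatten used above turns the (nx, ny, nmz) cube into an (nx*ny, nmz) matrix with one spectrum per row. A small hedged sketch checking that it is equivalent to a plain numpy reshape:

import numpy as np

nx, ny, nmz = 4, 5, 6
cube = np.arange(nx * ny * nmz).reshape(nx, ny, nmz)
flat_loop = np.array([cube[:, :, i].flatten() for i in range(nmz)]).T
flat_reshape = cube.reshape(nx * ny, nmz)
print(np.array_equal(flat_loop, flat_reshape))   # True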
Example #27
def embededParams(
        data,
        title,
        n_neighbors=2,
        method='pca',
        filename=None,
        labels=["First Principal Component", "Second Principal Component"]):
    """Plot performance against values of two-dimensional embedding of all the parameters.

    Args:
        data: Two dimensional numpy array. First column gives performance values.
        title: Title for the resulting plot
        (n_neighbors=2): Number of neighbors to use for embeddings requiring it.
        (method=pca): Dimensionality reduction method to use. Defaults to PCA.
        (filename=None): Filename to save the figure out as, if None (default) will show the figure.
        (labels=[...]): List of labels for the two axes.
    """

    if method == 'pca':
        X_pca = decomposition.RandomizedPCA(n_components=2).fit_transform(
            data[:, 1:])
    elif method == 'isomap':
        X_pca = manifold.Isomap(n_neighbors=2,
                                n_components=2).fit_transform(data[:, 1:])
    elif method == 'lle':
        X_pca = manifold.LocallyLinearEmbedding(
            n_neighbors, n_components=2,
            method='standard').fit_transform(data[:, 1:])
    elif method == 'mds':
        X_pca = manifold.MDS(n_components=2, n_init=1,
                             max_iter=100).fit_transform(data[:, 1:])
    else:
        print "Error unknown method"
        return
    plotTwoParams(numpy.array([data[:, 0].tolist()] + X_pca.T.tolist()).T,
                  title,
                  filename=filename,
                  labels=labels)
Example #28
def do_RandomizedPCA(armadillo):
  #
  # TODO: Write code to import the libraries required for
  # RandomizedPCA. Then, train your RandomizedPCA on the armadillo
  # dataframe. Finally, drop one dimension (reduce it down to 2D)
  # and project the armadillo down to the 2D principal component
  # feature space.
  #
  # NOTE: Be sure to RETURN your projected armadillo! 
  # (This projection is actually stored in a NumPy NDArray and
  # not a Pandas dataframe, which is something Pandas does for
  # you automatically. =)
  #
  # .. your code here ..

  from sklearn import decomposition
  from sklearn import datasets
  
  pca = decomposition.RandomizedPCA(n_components=2)
  pca.fit(armadillo)
  X = pca.transform(armadillo)

  return X
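A hedged usage sketch for do_RandomizedPCA with a stand-in DataFrame of 3D points (the course's actual armadillo PLY point cloud is not reproduced here):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
armadillo = pd.DataFrame(rng.rand(500, 3), columns=['x', 'y', 'z'])
projected = do_RandomizedPCA(armadillo)
print(projected.shape)   # (500, 2)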
Example #29
        vmax = max(comp.max(), -comp.min())
        plt.imshow(comp.reshape(image_shape),
                   cmap=plt.cm.gray,
                   interpolation='nearest',
                   vmin=-vmax,
                   vmax=vmax)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)


###############################################################################
# List of the different estimators, whether to center and transpose the
# problem, and whether the transformer uses the clustering API.
estimatorname = 'Eigenfaces - RandomizedPCA'
estimator = decomposition.RandomizedPCA(n_components=n_components, whiten=True)
center = True

###############################################################################
# Plot a sample of the input data

plot_gallery("First centered B faces", faces_centered[:n_components])

###############################################################################
# Do the estimation and plot it

print("Extracting the top %d %s..." % (n_components, estimatorname))
t0 = time()
data = faces
if center:
    data = faces_centered
Example #30
pl.title('A selection from the 64-dimensional digits dataset')

#----------------------------------------------------------------------
# Random 2D projection using a random unitary matrix
print "Computing random projection"
rng = np.random.RandomState(42)
Q, _ = qr_economic(rng.normal(size=(n_features, 2)))
X_projected = np.dot(Q.T, X.T).T
plot_embedding(X_projected, "Random Projection of the digits")

#----------------------------------------------------------------------
# Projection on to the first 2 principal components

print "Computing PCA projection"
t0 = time()
X_pca = decomposition.RandomizedPCA(n_components=2).fit_transform(X)
plot_embedding(
    X_pca, "Principal Components projection of the digits (time %.2fs)" %
    (time() - t0))

#----------------------------------------------------------------------
# Projection on to the first 2 linear discriminant components

print "Computing LDA projection"
X2 = X.copy()
X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
t0 = time()
X_lda = lda.LDA(n_components=2).fit_transform(X2, y)
plot_embedding(
    X_lda, "Linear Discriminant projection of the digits (time %.2fs)" %
    (time() - t0))