def __init__(self, n_clusters=50, pca_n_components=20, kmpca_n_components=3,
             kernel_n_components=30):
    self.counter = text.CountVectorizer(stop_words='english',
                                        ngram_range=(1, 2),
                                        min_df=30,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters, n_init=10,
                                      batch_size=10000, verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5, n_jobs=4)
    self.X_names = [
        'Title_CounterX', 'Title_ClusterdX', 'Title_KmX', 'Title_PCAX',
        'Title_PCAClusterdX', 'Title_RbfX', 'Title_TreeX'
    ]
    self.linear_feature_selector = None
def pca_docs(docs, client, collectionname=None, filename=None, scalerfilename=None):
    if collectionname is None:
        collectionname = "doc2vec"
    if filename is None:
        filename = "doc2vec_pca"
    if scalerfilename is None:
        scalerfilename = "doc2vec_pca_scaler"
    modelstore = GridFS(client.models, collection=collectionname)
    try:
        pca_model = pickle.loads(modelstore.get_version(filename=filename).read())
    except NoFile:
        pca_model = decomposition.RandomizedPCA(n_components=PCAVECTORSIZE)
    if pca_model.n_components != PCAVECTORSIZE:
        pca_model = decomposition.RandomizedPCA(n_components=PCAVECTORSIZE)
    training_data = []
    for doc in docs:
        try:
            doc_result = get_vector(ObjectId(doc.tags[0]), client)
            if doc_result is not None:
                training_data.append(doc_result)
        except Exception:
            pass
    try:
        scaler = pickle.loads(modelstore.get_version(filename=scalerfilename).read())
    except NoFile:
        scaler = StandardScaler()
    scaler.fit(training_data)
    pca_model.fit(scaler.transform(training_data))
    modelstore.put(pickle.dumps(pca_model), filename=filename)
    modelstore.put(pickle.dumps(scaler), filename=scalerfilename)
    update_pcavecs(docs, pca_model, scaler, client)
def __init__(self, n_clusters=50, pca_n_components=30, kmpca_n_components=3,
             kernel_n_components=30):
    ## use (min_df = 30, max_df = 0.5) to generate a lot of features - more choice for feature selection
    ## use (min_df = 0.001, max_df = 0.05) to generate fewer features - better clustering
    self.counter = text.CountVectorizer(stop_words='english',
                                        charset='utf-8',
                                        charset_error='ignore',
                                        ngram_range=(1, 1),
                                        min_df=0.001,
                                        max_df=0.05,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters, n_init=10,
                                      batch_size=10000, verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5, n_jobs=4)
    self.X_names = [
        'Desc_CounterX', 'Desc_ClusterdX', 'Desc_KmX', 'Desc_PCAX',
        'Desc_PCAClusterdX', 'Desc_RbfX', 'Desc_TreeX'
    ]
    self.linear_feature_selector = None
def show_PCA_training(digits):
    # Create a Randomized PCA model that keeps two components
    from sklearn import decomposition
    randomized_pca = decomposition.RandomizedPCA(n_components=2)
    # Fit and transform the data with the model
    reduced_data_rpca = randomized_pca.fit_transform(digits.data)

    # Create a regular PCA model
    pca = decomposition.PCA(n_components=2)
    # Fit and transform the data with the model
    reduced_data_pca = pca.fit_transform(digits.data)

    # Inspect the shape
    print(reduced_data_pca.shape)
    # Print out the data
    print(reduced_data_rpca)
    print(reduced_data_pca)

    colors = ['black', 'blue', 'purple', 'yellow', 'white', 'red', 'lime',
              'cyan', 'orange', 'gray']
    for i in range(len(colors)):
        x = reduced_data_rpca[:, 0][digits.target == i]
        y = reduced_data_rpca[:, 1][digits.target == i]
        plt.scatter(x, y, c=colors[i])
    plt.legend(digits.target_names, bbox_to_anchor=(1.05, 1), loc=2,
               borderaxespad=0.)
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title("PCA Scatter Plot")
    plt.show()
def decompose(self, features, labels=None):
    if self.pca_components == 0 \
            or self.pca_components + self.start_component > features.shape[1]:
        print('WARNING no / too many pca-components given, take all'
              ' (={} dimensions)'.format(features.shape[1]))
        self.pca_components = features.shape[1]
        if self.start_component:
            self.pca_components -= self.start_component

    print('run {} w. {} components and reg of '
          '{} [start component:{}]'.format(self.decomp_method,
                                           self.pca_components,
                                           self.reg,
                                           self.start_component)
          + (' + whiten' if self.pca_whiten else ''))

    if 'pca' in self.decomp_method and 'rpca' not in self.decomp_method:
        if features.shape[1] > 500 and features.shape[0] > 500:
            print('your data seems to be too much ({}) for simple PCA --> '
                  'try RandomizedPCA; however, you should run it on a subset '
                  'and then transform the rest'.format(features.shape))
            self.pca = decomposition.RandomizedPCA(self.pca_components,
                                                   iterated_power=5,
                                                   whiten=self.pca_whiten)
        else:
            self.pca = decomposition.PCA(self.pca_components,
                                         whiten=self.pca_whiten)
    elif 'rpca' in self.decomp_method:
        self.pca = RegularizedPCA(self.pca_components,
                                  whiten=self.pca_whiten,
                                  regularization=self.reg,
                                  start_component=self.start_component)
def __init__(self, texts, labels, weights):
    self.texts = texts
    self.classifier = linear_model.LogisticRegression()

    # Vectorize
    self.vectorizer = CountVectorizer(ngram_range=(1, 1),
                                      min_df=0.0,
                                      max_df=0.9,
                                      strip_accents='unicode',
                                      stop_words=stopwords.words('spanish'),
                                      binary=False)
    self.vectorizer.fit(texts)
    word_counts = self.vectorizer.transform(texts)

    # Normalize
    self.tf_transformer = TfidfTransformer(use_idf=True)
    data = self.tf_transformer.fit_transform(word_counts)

    n_eigenfaces = 200
    self.pca = decomposition.RandomizedPCA(n_components=n_eigenfaces,
                                           whiten=True)
    pca_features = self.pca.fit_transform(data.toarray())

    # with plt.style.context('cev_plot'):
    # Check that the number of eigenfaces makes sense by plotting the
    # cumulative explained variance.
    plt.figure(figsize=(8, 6))
    plt.title('cev vs eigenFace')
    plt.plot(self.pca.explained_variance_ratio_.cumsum())
    # plt.show()

    self.classifier.fit(pca_features, labels)
def PreprocessingRandomizedPCA(self, PCA_coefficients, MNE_coefficients,
                               N_neighbors, whiten=True):
    """
    :type MNE_coefficients: int
    :type PCA_coefficients: int
    :param MNE_coefficients: number of components for the manifold embedding
    :param PCA_coefficients: number of components for the PCA transform
    :param N_neighbors: number of neighbors for the embedding
    """
    self.MNE_coefficients = MNE_coefficients
    self.PCA_coefficients = PCA_coefficients
    self.N_neighbors = N_neighbors
    self.pca = decomposition.RandomizedPCA(n_components=self.PCA_coefficients,
                                           whiten=whiten)
    self.Embedding = manifold.SpectralEmbedding(
        n_components=self.MNE_coefficients,
        affinity='nearest_neighbors',
        gamma=None,
        random_state=0,
        n_neighbors=self.N_neighbors)
    self.X_pca = self.pca.fit_transform(self.Waves_Coefficients)
    self.X_red = self.Embedding.fit_transform(self.X_pca)
    return self.X_red
def PCA(self, X, Y=None, ncomp=2, method='PCA'):
    """
    Decompose a multivariate dataset into an orthogonal set of components
    that explains a maximum amount of the variance.

    @param X: input dataset

    Keyword Arguments:
        ncomp  -- number of components to keep (default: 2)
        method -- method to use: PCA (default) / Randomized / Sparse /
                  rbf / linear / sigmoid / SVD
    """
    from sklearn import decomposition
    from sklearn import cross_decomposition

    if method == 'Randomized':
        pca = decomposition.RandomizedPCA(n_components=ncomp)
    elif method == 'Sparse':
        pca = decomposition.SparsePCA(n_components=ncomp)
    elif method == 'rbf':
        pca = decomposition.KernelPCA(n_components=ncomp,
                                      fit_inverse_transform=True,
                                      gamma=10, kernel="rbf")
    elif method == 'linear':
        pca = decomposition.KernelPCA(n_components=ncomp, kernel="linear")
    elif method == 'sigmoid':
        pca = decomposition.KernelPCA(n_components=ncomp, kernel="sigmoid")
    elif method == 'SVD':
        pca = decomposition.TruncatedSVD(n_components=ncomp)
    else:
        pca = decomposition.PCA(n_components=ncomp)
        method = 'PCA'
    print('[ML] Using %s method' % method)
    pca.fit(X)
    return pca.transform(X)
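# A minimal usage sketch (not part of the original snippet) for the PCA
# dispatcher above. It assumes the method lives on some object `model`
# (hypothetical name) and uses synthetic NumPy data for illustration only.
import numpy as np

X = np.random.rand(100, 50)                          # 100 samples, 50 features
X_red = model.PCA(X, ncomp=2, method='Randomized')   # randomized-PCA projection
print(X_red.shape)                                   # -> (100, 2)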
def __init__(self, data, training_movie_ids, rounded_rating=False,
             run_pca=True, sparse_matrix=True):
    logging.info("Initializing DataTransformer...")
    self.rounded_rating = rounded_rating
    self.run_pca = run_pca
    self.sparse_matrix = sparse_matrix
    # compute_cast_experience(data)

    # Maps feature name to its index in the feature vector.
    feature_name_to_count = {}
    cast_to_count = {}
    for movie_id in training_movie_ids:
        if str(movie_id) not in data:
            continue
        movie_data = data[str(movie_id)]
        for feature_name in movie_data['features']:
            if feature_name not in feature_name_to_count:
                feature_name_to_count[feature_name] = 1
            else:
                feature_name_to_count[feature_name] += 1
            # Keep track of cast appearances.
            if len(feature_name) >= 5 and feature_name[0:5] == "cast_":
                if feature_name not in cast_to_count:
                    cast_to_count[feature_name] = 1
                else:
                    cast_to_count[feature_name] += 1

    # Drop rare features.
    self.feature_name_to_index = {}
    logging.info("Number of features before drop: %s" % len(feature_name_to_count))
    for feature_name, feature_count in feature_name_to_count.items():
        if feature_count >= MINIMUM_FEATURE_COUNT:
            self.feature_name_to_index[feature_name] = len(self.feature_name_to_index)
    logging.info("Number of features after drop: %s" % len(self.feature_name_to_index))

    # num_movies x num_features matrix.
    self.feature_matrix = []
    # num_movies array.
    self.labels = []
    for movie_id in training_movie_ids:
        if str(movie_id) not in data:
            continue
        movie_data = data[str(movie_id)]
        self.feature_matrix.append(self.transform_features(movie_data['features']))
        if self.rounded_rating:
            self.labels.append(movie_data['rating_rounded'])
        else:
            self.labels.append(movie_data['rating'])

    if self.sparse_matrix:
        self.feature_matrix = sparse.csr_matrix(self.feature_matrix)

    if self.run_pca:
        logging.info("Fitting pca...")
        self.pca = decomposition.RandomizedPCA(copy=False, n_components=5000)
        self.feature_matrix = self.pca.fit_transform(self.feature_matrix)
        logging.info("PCA fit")

    logging.info("Initializing DataTransformer done!")
def pca_reduce(data, n=None, copy=True, method='random', whiten=False,
               cutoff=1000):
    """
    Principal component analysis dimensionality reduction using scikit-learn.

    Inputs:
        data   = timepoints x voxels matrix.
        n      = None -- return all components
               = int  -- return int components
        copy   = if False, do pca in place.
        whiten = pre-whiten (decorrelate) data.
        cutoff = maximum number of input features before we move to an
                 efficient method.

    This mean-centers and auto-scales the data (in-place).

    Returns:
        pcs from the input data
        % variance explained by each of them

    Methods
    -------
    normal -- standard PCA
    random -- randomized PCA (for large matrices [1])

    [1] Halko, N., Martinsson, P. G., Shkolnisky, Y., & Tygert, M. (2010).
        An algorithm for the principal component analysis of large data sets.
    """
    import sklearn.decomposition as dec

    data = data.astype(np.float64)
    data -= np.mean(data)  # mean-center entire dataset

    # set n to the cutoff if the dimensionality of the data is large
    if n is None and method == 'random':
        n = cutoff

    if method == 'random':
        pcmodel = dec.RandomizedPCA(n_components=n, copy=copy, whiten=whiten)
    elif method == 'normal':
        pcmodel = dec.PCA(n_components=n, copy=copy, whiten=whiten)

    try:
        pcmodel.fit(data)
    except Exception:
        print('ERROR: failed to find the top principal components of input '
              'data:\n{}'.format(data))

    data = pcmodel.transform(data)
    # components = pcmodel.components_
    exp_var = pcmodel.explained_variance_ratio_

    return data, exp_var
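# A minimal usage sketch (not part of the original snippet) for pca_reduce,
# assuming a timepoints x voxels NumPy array; the data below is synthetic and
# for illustration only.
import numpy as np

ts = np.random.rand(200, 5000)                   # 200 timepoints, 5000 voxels
pcs, exp_var = pca_reduce(ts, n=10, method='random', whiten=False)
print(pcs.shape)                                 # -> (200, 10)
print(exp_var.sum())                             # variance captured by the 10 PCs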
def search_outliers(X, m=6., mode=1, verbose=1):
    """
    Search for outliers in the matrix X with mode:
      1. Select outliers in every column, then select row-outliers with too
         many column-outliers
      2. Select row-outliers of the sum over all columns
      3. Select row-outliers of the max value over all columns
      4. Run PCA on the matrix X, then select row-outliers of its first four
         principal components

    The parameter m is the Z-score (in std) used to select outliers.
    """
    nrows, ncols = X.shape
    mode_search_outliers_array = int(mode / 10)
    if mode_search_outliers_array == 0:
        s_o_a = search_outliers_array
    else:
        s_o_a = search_outliers_array2
    mode_mode = mode % 10
    if mode_mode == 1:
        outliers = np.array([0.0] * nrows)
        for j in range(ncols):
            isout = s_o_a(X[:, j], m)
            if np.any(isout):
                bad = np.where(isout)[0]
                outliers[bad] += 1.0
                if verbose > 1:
                    print("outliers col:%d row_vals:%r" %
                          (j, list(zip(bad, X[bad, j]))))
                    print("data: ", np.mean(X[:, j]), "+-", np.std(X[:, j]))
        sel_outliers = s_o_a(outliers, m=m)
    elif mode_mode == 2:
        outliers = np.sum(X, axis=1)
        sel_outliers = s_o_a(outliers, m=m)
    elif mode_mode == 3:
        outliers = np.max(X, axis=1)
        sel_outliers = s_o_a(outliers, m=m)
    elif mode_mode == 4:
        from feasel import VarSel
        pline = [
            ("varsel", VarSel(k=4000)),
            # ("scaler", preprocessing.StandardScaler(with_mean=True)),
            ("pca", decomposition.RandomizedPCA(n_components=20, whiten=True,
                                                random_state=1))
        ]
        X1 = Pipeline(pline).fit_transform(X)
        # print("X1:", X1.shape, X1[:, :4])
        sel_outliers = np.array([False] * nrows)
        for j in range(4):
            outliers = X1[:, j]
            sel_outliers = sel_outliers | s_o_a(outliers, m=m)
            if np.any(sel_outliers):
                break
    else:
        raise ValueError("bad search_outliers mode: %r" % mode)
    if verbose > 0:
        # print("sel_outliers:", sel_outliers)
        if type(sel_outliers) != bool:
            print("outliers:", outliers[sel_outliers])
    return np.where(sel_outliers)[0]
def reduce_randomizedPCA(x):
    '''
    Reduce the dimensions using the Randomized PCA algorithm
    '''
    # create the RandomizedPCA object
    randomPCA = dc.RandomizedPCA(n_components=2, whiten=True, copy=False)

    # learn the principal components from all the features and return the
    # fitted model
    return randomPCA.fit(x)
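# A minimal usage sketch (not part of the original snippet) for
# reduce_randomizedPCA; note that it returns the fitted estimator, so the
# projection itself requires a follow-up transform(). Synthetic data only.
import numpy as np

x = np.random.rand(100, 20)
fitted = reduce_randomizedPCA(x)                 # fitted RandomizedPCA model
x_2d = fitted.transform(x)                       # project onto 2 components
print(x_2d.shape)                                # -> (100, 2)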
def RPca_base_final(iX_train, iX_test, iy_train, iy_test, n_components=3):
    dX_train = copy.copy(iX_train)
    dX_test = copy.copy(iX_test)
    dy_train = copy.copy(iy_train)
    dy_test = copy.copy(iy_test)

    # Fit randomized PCA on the training split and apply the same projection
    # to the test split.
    pca = decomposition.RandomizedPCA(n_components=n_components)
    pca.fit(dX_train)
    dX_train = pca.transform(dX_train)
    dX_test = pca.transform(dX_test)

    return dX_train, dX_test, dy_train, dy_test
def pca_lr(params, n_classes):
    C = np.exp(params['log_C'])
    n_components = params['n_components']
    mclass = 'multinomial' if n_classes > 2 else 'ovr'
    solver = 'lbfgs' if n_classes > 2 else 'liblinear'
    logistic = linear_model.LogisticRegression(C=C,
                                               multi_class=mclass,
                                               solver=solver,
                                               penalty='l2')
    pca = decomposition.RandomizedPCA(n_components=n_components)
    pca_lr_classifier = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
    return pca_lr_classifier, 'PCA Logistic Regression'
def RPca_base(iX_train, iX_test, iy_train, iy_test, n_components=3):
    dX_train = copy.copy(iX_train)
    dX_test = copy.copy(iX_test)
    dy_train = copy.copy(iy_train)
    dy_test = copy.copy(iy_test)

    # Fit a separate randomized PCA per dataset in the list and project both
    # splits with it.
    for i in range(0, len(iX_train)):
        pca = decomposition.RandomizedPCA(n_components=n_components)
        pca.fit(dX_train[i])
        dX_train[i] = pca.transform(dX_train[i])
        dX_test[i] = pca.transform(dX_test[i])

    return dX_train, dX_test, dy_train, dy_test
def bench_skl(X, y, T, valid):
    #
    # .. scikit-learn ..
    #
    from sklearn import decomposition
    start = datetime.now()
    clf = decomposition.RandomizedPCA(n_components=n_components)
    clf.fit(X)
    delta = datetime.now() - start
    ev = explained_variance(X, clf.components_).sum()
    return ev, delta
def RandomizedPCA(self, source):
    min_max_scaler = preprocessing.MinMaxScaler()
    data_source = min_max_scaler.fit_transform(source)
    pca = decomposition.RandomizedPCA(n_components=2)
    result = {}
    result['data'] = pca.fit_transform(data_source)
    # Sum the explained variance ratio over the kept components.
    params = 0.0
    for j in pca.explained_variance_ratio_:
        params = params + j
    result['params'] = params
    return result
def RandomizedPCA(array, percent_samples):
    print("Randomized PCA", percent_samples * 100, "% of training data.")
    print("Features\tTime")
    array = array[:int(percent_samples * len(array))]
    for pct in pct_features_list:
        num_features = int(pct * len(array[0]))
        start = time()
        Y = decomposition.RandomizedPCA(
            n_components=num_features).fit_transform(array)
        end = time()
        print(num_features, "\t", (end - start))
def __init__(self, n_clusters=100, pca_n_components=10, kmpca_n_components=7,
             kernel_n_components=30):
    self.counter = text.CountVectorizer(stop_words='english',
                                        ngram_range=(1, 1),
                                        min_df=2,
                                        max_df=0.8,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters, n_init=10,
                                      batch_size=10000, verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5, n_jobs=4)
    self.X_names = [
        'Loc_CounterX', 'Loc_ClusterdX', 'Loc_KmX', 'Loc_PCAX',
        'Loc_PCAClusterdX', 'Loc_RbfX', 'Loc_TreeX'
    ]
    self.linear_feature_selector = None

    ## BUILD dictionary based on location_tree - faster for search
    location_tree = [
        row[0].lower().split('~')[::-1]
        for row in csv.reader(open(LOCATION_TREE_FILE))
    ]
    self.location_dict = {}
    for locs in location_tree:
        for i in range(len(locs)):
            if locs[i] not in self.location_dict:
                self.location_dict[locs[i]] = locs[i:]
def compute(self):
    matrix = self.getInputFromPort('matrix')
    pca = decomposition.RandomizedPCA(
        n_components=self.forceGetInputFromPort('n_components', 2),
        copy=self.forceGetInputFromPort('copy', True),
        iterated_power=self.forceGetInputFromPort('iterated_power', 3),
        whiten=self.forceGetInputFromPort('whiten', False),
        random_state=self.forceGetInputFromPort('random_state', None))
    Y = pca.fit_transform(matrix.values)
    proj_matrix = copy.deepcopy(matrix)
    proj_matrix.values = Y
    self.setResult('proj_matrix', proj_matrix)
def choose_decomposition_method(method, n_components):
    """Return the decomposition corresponding to `method`."""
    if method == 'PCA':
        return decomposition.PCA(n_components)
    elif method == 'Randomized PCA':
        return decomposition.RandomizedPCA(n_components)
    elif method == 'Kernel PCA':
        return decomposition.KernelPCA(n_components, kernel='rbf')
    elif method == 'Sparse PCA':
        return decomposition.SparsePCA(n_components, n_jobs=1)
    elif method == 'SVD':
        return decomposition.TruncatedSVD(n_components)
    elif method == 'Factor Analysis':
        return decomposition.FactorAnalysis(n_components)
    elif method == 'ICA':
        return decomposition.FastICA(n_components)
    raise ValueError('{} is not a known method'.format(method))
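# A minimal usage sketch (not part of the original snippet) for
# choose_decomposition_method; synthetic data, for illustration only.
import numpy as np

X = np.random.rand(300, 40)
reducer = choose_decomposition_method('Randomized PCA', 2)
X_2d = reducer.fit_transform(X)
print(X_2d.shape)                                # -> (300, 2)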
def pca():
    # pca = PCA()
    pca = decomposition.RandomizedPCA(n_components=150, whiten=True)

    # Input
    train_data = np.load('train_data.npy')
    train_data = train_data.reshape(7000, 128 * 128)
    # test_data = np.load('test_data.npy')
    # train_data = train_data.reshape(train_data.shape[0], -1)
    # test_data = test_data.reshape(test_data.shape[0], -1)

    pca.fit(train_data)
    X_train_pca = pca.transform(train_data)
    # train_data = pca.fit_transform(train_data)
    # test_data = pca.transform(test_data)
    np.save("train_data_pca", X_train_pca)
def gen_estimators():
    '''
    List of the different estimators, whether to center and transpose the
    problem, and whether the transformer uses the clustering API.
    '''
    rng = RandomState(0)
    estimators = [
        ('Eigenfaces - RandomizedPCA',
         decomposition.RandomizedPCA(n_components=n_components, whiten=True),
         True),
        ('Non-negative components - NMF tol=1e-4',
         decomposition.NMF(n_components=n_components, init='nndsvda',
                           tol=1e-4, solver='cd'),
         False),
        ('Non-negative components - NMF tol=1e-6',
         decomposition.NMF(n_components=n_components, init='nndsvd'),
         False),
        ('Independent components - FastICA',
         decomposition.FastICA(n_components=n_components, whiten=True),
         True),
        ('Sparse comp. - MiniBatchSparsePCA',
         decomposition.MiniBatchSparsePCA(n_components=n_components,
                                          alpha=0.8, n_iter=100,
                                          batch_size=3, random_state=rng),
         True),
        ('MiniBatchDictionaryLearning',
         decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                                   n_iter=50, batch_size=3,
                                                   random_state=rng),
         True),
        ('Cluster centers - MiniBatchKMeans',
         MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20,
                         max_iter=50, random_state=rng),
         True),
        ('Factor Analysis components - FA',
         decomposition.FactorAnalysis(n_components=n_components, max_iter=2),
         True),
    ]
    return estimators
def pca_reduce(data, **kwargs):
    # extract parameters
    # (note: RandomizedPCA expects an integer n_components; the 'mle' option
    # is only supported by the full PCA estimator, so pass n_components
    # explicitly when calling this function)
    n_components = kwargs.get('n_components', 'mle')
    copy = kwargs.get('copy', True)
    whiten = kwargs.get('whiten', True)

    # set up the PCA transformer
    pca = decomposition.RandomizedPCA(n_components=n_components, copy=copy,
                                      whiten=whiten)

    # fit the data
    pca.fit(data)

    # run the reduction
    reduced_data = pca.transform(data)

    return reduced_data
def deriveBasisSetsRandomizedPCA(data, cut, outfolder, components=10, whiten=False):
    """
    Derives a basis set from input data using Randomized Principal Component
    Analysis (PCA). Saves the basis sets to a FITS file for further processing.

    Information about PCA can be found on the scikit-learn website:
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.RandomizedPCA.html#sklearn.decomposition.RandomizedPCA

    :param data: input data from which the basis set is derived. The input
                 data must be an array of arrays. Each array should describe
                 an independent data set that has been flattened to 1D.
    :type data: ndarray
    :param cut: size of the cutout region that has been used
    :type cut: int
    :param outfolder: name of the output folder e.g. 'output'
    :type outfolder: str
    :param components: the number of basis set function components to derive
    :type components: int
    :param whiten: When True (False by default) the components_ vectors are
                   divided by n_samples times singular values to ensure
                   uncorrelated outputs with unit component-wise variances.
    :type whiten: bool

    :return: Randomized PCA components
    """
    pca = decomposition.RandomizedPCA(n_components=components, whiten=whiten)
    pca.fit(data)
    image = pca.components_

    # output the variance ratio
    print('Variance Ratio:', pca.explained_variance_ratio_ * 100.)

    # save each component to a FITS file
    for i, img in enumerate(image):
        image = img.reshape(cut, cut)
        # to compare IDL results
        # image = -image
        fileIO.writeFITS(image, outfolder + '/RandomPCAbasis%03d.fits' % (i + 1),
                         int=False)
    return image
def execute_analysis(self):
    """
    Run randomized PCA on openMSI data via sklearn.decomposition.RandomizedPCA().

    The "copy" parameter of RandomizedPCA() is not supported (it is always False).
    The "whiten" parameter of RandomizedPCA() is not currently supported (assumed False).
    """
    # extract input parameters
    start = time.time()
    msidata = self['msidata']
    n_components = self['numComponents']
    iterated_power = self['iteratedPower']
    random_state = self['randomState']
    nx, ny, nmz = msidata.shape

    # Randomized PCA
    #
    # reshape msidata from 3D (x by y by mz) to 2D (xy by mz)
    flatdata = np.array(
        [np.array(msidata[:, :, i]).flatten() for i in range(nmz)]).T
    #
    # do randomized PCA
    pca = decomposition.RandomizedPCA(n_components=n_components,
                                      iterated_power=iterated_power,
                                      random_state=random_state)
    pca.fit(flatdata)
    #
    # make a new image and reshape it to the expected size
    newImageCubeFlat = pca.transform(flatdata)
    newImageCube = newImageCubeFlat.reshape(nx, ny, n_components)
    #
    # return other pca data (redundant with returning the full pca object,
    # but not sure which way is best right now)
    components = pca.components_
    explainedVariance = pca.explained_variance_ratio_
    mean = pca.mean_
    #
    # return analysis time
    stop = time.time()
    analysisTime = stop - start
    #
    # return results
    return newImageCube, components, explainedVariance, mean, analysisTime
def embededParams(
        data, title, n_neighbors=2, method='pca', filename=None,
        labels=["First Principal Component", "Second Principal Component"]):
    """Plot performance against values of a two-dimensional embedding of all
    the parameters.

    Args:
        data: Two-dimensional numpy array. The first column gives performance
            values.
        title: Title for the resulting plot.
        (n_neighbors=2): Number of neighbors to use for embeddings requiring it.
        (method=pca): Dimensionality reduction method to use. Defaults to PCA.
        (filename=None): Filename to save the figure as; if None (default),
            the figure will be shown.
        (labels=[...]): List of labels for the two axes.
    """
    if method == 'pca':
        X_pca = decomposition.RandomizedPCA(n_components=2).fit_transform(
            data[:, 1:])
    elif method == 'isomap':
        X_pca = manifold.Isomap(n_neighbors=2,
                                n_components=2).fit_transform(data[:, 1:])
    elif method == 'lle':
        X_pca = manifold.LocallyLinearEmbedding(
            n_neighbors, n_components=2,
            method='standard').fit_transform(data[:, 1:])
    elif method == 'mds':
        X_pca = manifold.MDS(n_components=2, n_init=1,
                             max_iter=100).fit_transform(data[:, 1:])
    else:
        print("Error: unknown method")
        return
    plotTwoParams(numpy.array([data[:, 0].tolist()] + X_pca.T.tolist()).T,
                  title,
                  filename=filename,
                  labels=labels)
def do_RandomizedPCA(armadillo):
    #
    # TODO: Write code to import the libraries required for
    # RandomizedPCA. Then, train your RandomizedPCA on the armadillo
    # dataframe. Finally, drop one dimension (reduce it down to 2D)
    # and project the armadillo down to the 2D principal component
    # feature space.
    #
    # NOTE: Be sure to RETURN your projected armadillo!
    # (This projection is actually stored in a NumPy NDArray and
    # not a Pandas dataframe, which is something Pandas does for
    # you automatically. =)
    #
    # .. your code here ..
    from sklearn import decomposition

    # Keep two components so the 3D armadillo is projected down to 2D,
    # as the instructions above require.
    pca = decomposition.RandomizedPCA(n_components=2)
    pca.fit(armadillo)
    X = pca.transform(armadillo)
    return X
        vmax = max(comp.max(), -comp.min())
        plt.imshow(comp.reshape(image_shape), cmap=plt.cm.gray,
                   interpolation='nearest',
                   vmin=-vmax, vmax=vmax)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)


###############################################################################
# The estimator to use, whether to center the problem, and whether the
# transformer uses the clustering API.
estimatorname = 'Eigenfaces - RandomizedPCA'
estimator = decomposition.RandomizedPCA(n_components=n_components, whiten=True)
center = True

###############################################################################
# Plot a sample of the input data
plot_gallery("First centered B faces", faces_centered[:n_components])

###############################################################################
# Do the estimation and plot it
print("Extracting the top %d %s..." % (n_components, estimatorname))
t0 = time()
data = faces
if center:
    data = faces_centered
pl.title('A selection from the 64-dimensional digits dataset')

#----------------------------------------------------------------------
# Random 2D projection using a random unitary matrix
print("Computing random projection")
rng = np.random.RandomState(42)
Q, _ = qr_economic(rng.normal(size=(n_features, 2)))
X_projected = np.dot(Q.T, X.T).T
plot_embedding(X_projected, "Random Projection of the digits")

#----------------------------------------------------------------------
# Projection onto the first 2 principal components
print("Computing PCA projection")
t0 = time()
X_pca = decomposition.RandomizedPCA(n_components=2).fit_transform(X)
plot_embedding(
    X_pca,
    "Principal Components projection of the digits (time %.2fs)" %
    (time() - t0))

#----------------------------------------------------------------------
# Projection onto the first 2 linear discriminant components
print("Computing LDA projection")
X2 = X.copy()
X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
t0 = time()
X_lda = lda.LDA(n_components=2).fit_transform(X2, y)
plot_embedding(
    X_lda,
    "Linear Discriminant projection of the digits (time %.2fs)" %
    (time() - t0))