def pca_incremental(df, n_c=7):
    """Reduce the feature columns of ``df`` with incremental PCA.

    Parameters
    ----------
    df
        Table-like object exposing ``drop(labels, axis=1)`` (presumably a
        pandas DataFrame -- confirm against callers). Its ``'class'`` column
        is removed before the decomposition is fitted.
    n_c : int, optional
        Number of principal components to keep. Defaults to 7.

    Returns
    -------
    The result of ``IncrementalPCA.fit_transform`` on the remaining columns.
    """
    X = df.drop(['class'], axis=1)
    # Honour the caller's request: the component count used to be hard-coded
    # to 7, so any non-default ``n_c`` was silently ignored.
    transformer = IncrementalPCA(n_components=n_c)
    return transformer.fit_transform(X)
align='center', alpha=0.5) plt.xticks(np.arange(len(MCAcolumns)), MCAcolumns) plt.ylabel('Percentage') plt.title('Explained Variance by Factor (%): Multiple Correspondence Analysis') plt.show() fig.savefig(''.join([ 'C:/Users/Jairo F Gudiño R/Desktop/Balance Sheet Commonality/', 'MCA', '.pdf' ])) ft = mca_ben.fs_r(N=F) # PCA Explained Variance # MCAFactorScores = pd.DataFrame(ft, columns=MCAcolumns) PCADataframe = pd.concat( [df_norm.iloc[:, range(df_norm.shape[1] - 3)], MCAFactorScores], axis=1) PCAModel = IncrementalPCA(n_components=3) reduced_data = PCAModel.fit_transform(PCADataframe) explained_variancePCA = PCAModel.explained_variance_ratio_ * 100 PCAcolumns = [("F" + str(i + 1)) for i in range(3)] fig, Graph = plt.subplots() Graph = plt.bar(np.arange(len(PCAcolumns)), explained_variancePCA, align='center', alpha=0.5) plt.xticks(np.arange(len(PCAcolumns)), PCAcolumns) plt.ylabel('Percentage') plt.title('Explained Variance by Factor (%): Principal Component Analysis') plt.show() fig.savefig(''.join([ 'C:/Users/Jairo F Gudiño R/Desktop/Balance Sheet Commonality/', 'PCA', '.pdf'
def pca_transform(X, n_components):
    """Fit an incremental PCA with ``n_components`` on ``X`` and return the
    projection of ``X`` onto the fitted components."""
    model = IncrementalPCA(n_components=n_components)
    # ``fit`` returns the estimator itself, so the projection can be chained.
    return model.fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    dt_features,
    dt_target,
    test_size=0.3,
    random_state=42,
)

# To verify that train_test_split took the values proportionally, print the
# shapes of the training features and targets; both must have the same
# dimensions.
print(X_train.shape)
print(y_train.shape)

# Default value would be n_components = min(n_samples, n_features).
pca = PCA(n_components=3)
ipca = IncrementalPCA(n_components=3, batch_size=10)
pca.fit(X_train)
ipca.fit(X_train)

# Explained-variance ratio of each retained component of the regular PCA.
component_axis = range(len(pca.explained_variance_))
plt.plot(component_axis, pca.explained_variance_ratio_)
plt.show()

logistic = LogisticRegression(solver='lbfgs')

# Classifier trained and scored on the regular-PCA projection.
dt_train = pca.transform(X_train)
dt_test = pca.transform(X_test)
logistic.fit(dt_train, y_train)
print("Score PCA: ", logistic.score(dt_test, y_test))

# Switch the training features to the incremental-PCA projection.
dt_train = ipca.transform(X_train)
def pca(
    data: Union[AnnData, np.ndarray, spmatrix],
    n_comps: int = N_PCS,
    zero_center: Optional[bool] = True,
    svd_solver: str = 'auto',
    random_state: int = 0,
    return_info: bool = False,
    use_highly_variable: Optional[bool] = None,
    dtype: str = 'float32',
    copy: bool = False,
    chunked: bool = False,
    chunk_size: Optional[int] = None,
) -> Union[AnnData, np.ndarray, spmatrix]:
    """Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition. Uses the
    implementation of *scikit-learn* [Pedregosa11]_.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` × ``n_vars``.
        Rows correspond to cells and columns to genes.
    n_comps
        Number of principal components to compute. If it exceeds the number
        of variables it is reduced to ``n_vars - 1``.
    zero_center
        If `True`, compute standard PCA from covariance matrix.
        If ``False``, omit zero-centering variables
        (uses :class:`~sklearn.decomposition.TruncatedSVD`),
        which allows to handle sparse input efficiently.
        Passing ``None`` decides automatically based on sparseness of the data.
    svd_solver
        SVD solver to use:

        ``'arpack'``
          for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`)

        ``'randomized'``
          for the randomized algorithm due to Halko (2009).

        ``'auto'`` (the default)
          chooses automatically depending on the size of the problem.

    random_state
        Change to use different initial states for the optimization.
    return_info
        Only relevant when not passing an :class:`~anndata.AnnData`:
        see “**Returns**”.
    use_highly_variable
        Whether to use highly variable genes only, stored in
        ``.var['highly_variable']``.
        By default uses them if they have been determined beforehand.
    dtype
        Numpy data type string to which to convert the result.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked
        If ``True``, perform an incremental PCA on segments of ``chunk_size``.
        The incremental PCA automatically zero centers and ignores settings of
        ``random_state`` and ``svd_solver``. If ``False``, perform a full PCA.
    chunk_size
        Number of observations to include in each chunk.
        Required if ``chunked=True`` was passed.

    Returns
    -------
    X_pca : :class:`scipy.sparse.spmatrix` or :class:`numpy.ndarray`
        If `data` is array-like and ``return_info=False`` was passed,
        this function only returns `X_pca`…
    adata : anndata.AnnData
        …otherwise if ``copy=True`` it returns or else adds fields to ``adata``:

        ``.obsm['X_pca']``
             PCA representation of data.
        ``.varm['PCs']``
             The principal components containing the loadings.
        ``.uns['pca']['variance_ratio']``
             Ratio of explained variance.
        ``.uns['pca']['variance']``
             Explained variance, equivalent to the eigenvalues of the covariance matrix.

    Raises
    ------
    ValueError
        If ``use_highly_variable=True`` but ``.var['highly_variable']`` is
        missing.
    """
    # chunked calculation is not randomized, anyways
    if svd_solver in {'auto', 'randomized'} and not chunked:
        logg.info(
            'Note that scikit-learn\'s randomized PCA might not be exactly '
            'reproducible across different computational platforms. For exact '
            'reproducibility, choose `svd_solver=\'arpack\'.` This will likely '
            'become the Scanpy default in the future.')

    # Plain arrays are wrapped in an AnnData so both input kinds share one
    # code path; the flag decides what is returned at the end.
    data_is_AnnData = isinstance(data, AnnData)
    if data_is_AnnData:
        adata = data.copy() if copy else data
    else:
        adata = AnnData(data)

    logg.info('computing PCA with n_comps =', n_comps, r=True)

    # Cap the requested number of components by the number of variables.
    if adata.n_vars < n_comps:
        n_comps = adata.n_vars - 1
        logg.msg('reducing number of computed PCs to', n_comps,
                 'as dim of data is only', adata.n_vars, v=4)

    # An explicit request for highly variable genes needs the annotation.
    if use_highly_variable is True and 'highly_variable' not in adata.var.keys(
    ):
        raise ValueError(
            'Did not find adata.var[\'highly_variable\']. '
            'Either your data already only consists of highly-variable genes '
            'or consider running `pp.filter_genes_dispersion` first.')
    # With no explicit choice, use the annotation whenever it is present.
    if use_highly_variable is None:
        use_highly_variable = True if 'highly_variable' in adata.var.keys(
        ) else False
    if use_highly_variable:
        logg.info('computing PCA on highly variable genes')
    # ``adata_comp`` is the (possibly gene-subsetted) object the
    # decomposition is actually computed on.
    adata_comp = adata[:, adata.
                       var['highly_variable']] if use_highly_variable else adata

    if chunked:
        # NOTE: with the default ``zero_center=True`` this message is only
        # emitted for a truthy ``random_state`` or a non-'auto' solver.
        if not zero_center or random_state or svd_solver != 'auto':
            logg.msg('Ignoring zero_center, random_state, svd_solver', v=4)

        from sklearn.decomposition import IncrementalPCA

        # Output buffer is allocated with the dtype of the input matrix.
        X_pca = np.zeros((adata_comp.X.shape[0], n_comps), adata_comp.X.dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        # First pass over the chunks: fit the model incrementally
        # (sparse chunks are densified one at a time).
        for chunk, _, _ in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        # Second pass: project every chunk with the fitted model.
        for chunk, start, end in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    else:
        # ``None`` means: zero-center only when the input is dense.
        if zero_center is None:
            zero_center = not issparse(adata_comp.X)
        if zero_center:
            from sklearn.decomposition import PCA
            if issparse(adata_comp.X):
                logg.msg(
                    ' as `zero_center=True`, '
                    'sparse input is densified and may '
                    'lead to huge memory consumption',
                    v=4)
                X = adata_comp.X.toarray(
                )  # Copying the whole adata_comp.X here, could cause memory problems
            else:
                X = adata_comp.X
            pca_ = PCA(n_components=n_comps,
                       svd_solver=svd_solver,
                       random_state=random_state)
        else:
            from sklearn.decomposition import TruncatedSVD
            logg.msg(
                ' without zero-centering: \n'
                ' the explained variance does not correspond to the exact statistical defintion\n'
                ' the first component, e.g., might be heavily influenced by different means\n'
                ' the following components often resemble the exact PCA very closely',
                v=4)
            pca_ = TruncatedSVD(n_components=n_comps,
                                random_state=random_state)
            X = adata_comp.X
        X_pca = pca_.fit_transform(X)

    # Convert the coordinates to the requested dtype only if they differ.
    if X_pca.dtype.descr != np.dtype(dtype).descr:
        X_pca = X_pca.astype(dtype)

    if data_is_AnnData:
        adata.obsm['X_pca'] = X_pca
        if use_highly_variable:
            # Loadings are stored for all variables; rows of variables not
            # flagged as highly variable stay at zero.
            adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
            adata.varm['PCs'][
                adata.var['highly_variable']] = pca_.components_.T
        else:
            adata.varm['PCs'] = pca_.components_.T
        adata.uns['pca'] = {}
        adata.uns['pca']['variance'] = pca_.explained_variance_
        adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
        logg.info(' finished', t=True)
        logg.msg(
            'and added\n'
            ' \'X_pca\', the PCA coordinates (adata.obs)\n'
            ' \'PC1\', \'PC2\', ..., the loadings (adata.var)\n'
            ' \'pca_variance\', the variance / eigenvalues (adata.uns)\n'
            ' \'pca_variance_ratio\', the variance ratio (adata.uns)',
            v=4)
        # In-place use returns None; only ``copy=True`` hands back the object.
        return adata if copy else None
    else:
        logg.info(' finished', t=True)
        if return_info:
            return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_
        else:
            return X_pca
self.X = X def chunked(self, chunks): start = 0 for i in range(chunks): stop = start + len(self.X[i::chunks]) yield self.X[start:stop] start = stop D = LikeAnnData(np.random.rand(100000, 1000)) n_comp = 80 n_chunks = 100 ipca = IncrementalPCA(n_components=n_comp) print('Training IPCA') for chunk in D.chunked(n_chunks): ipca.partial_fit(chunk) OutIPCA = np.array([]) print('Fitting IPCA') for chunk in D.chunked(n_chunks): Tr = ipca.transform(chunk) OutIPCA = np.vstack([OutIPCA, Tr]) if OutIPCA.size else Tr print('Training and fitting PCA')
# Scatter of the first two principal components from the regular PCA,
# coloured by the labels in ``y``.
plt.scatter(
    X_scaled_pca.values[:, 0],
    X_scaled_pca.values[:, 1],
    c=y,
    s=50,
    cmap='viridis',
)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA sklearn clusters')
#plt.savefig(fname='numerical_pca2_sklearn')
plt.show()

# In[24]:

# Incremental PCA: same projection, wrapped straight into a DataFrame.
n_components = 2
X_scaled_ipca = pd.DataFrame(
    IncrementalPCA(n_components=n_components).fit_transform(X_num_scaled)
)
X_scaled_ipca.head()

# In[25]:

# Same scatter for the incremental PCA projection.
plt.scatter(
    X_scaled_ipca.values[:, 0],
    X_scaled_ipca.values[:, 1],
    c=y,
    s=50,
    cmap='viridis',
)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Incremental PCA sklearn clusters')
#plt.savefig(fname='numerical_pca2_sklearn_incremental')
plt.show()
def test_incremental_pca_validation():
    """Fitting must raise ValueError unless 1 <= n_components <= n_features."""
    X = [[0, 1], [1, 0]]
    invalid_counts = (-1, 0, .99, 3)
    for bad_count in invalid_counts:
        estimator = IncrementalPCA(bad_count, batch_size=10)
        assert_raises(ValueError, estimator.fit, X)
import numpy as np from sklearn.decomposition import IncrementalPCA from basic_classes.helper import get_data_from_csv from basic_classes.get_input import Input from basic_classes.constants import preciser_csv_path, pca_dir from datetime import datetime import joblib # Create and save model dt = datetime.now() current_day = str(dt.day) current_month = str(dt.month) pca_transformer = IncrementalPCA(n_components=512, batch_size=100) filename = 'pca' + str(current_day) + '_' + str(current_month) + '.joblib' joblib.dump(pca_transformer, pca_dir + filename) """ Get Input-names """ input_names, output_pois = get_data_from_csv(preciser_csv_path) n_samples = input_names.shape[0] n_train = n_samples input_names = input_names[:n_train] in_obj = Input('PCA', 500, 512, 'melspectrogram') x_train = [] batch_count = 0 for i in range(n_train): sample_name = input_names[i] print( '\n==========================================================================' ) print("Reading: " + sample_name) try:
def fast_pca(*x,
             n_components=None,
             algo='pca',
             y=None,
             batch_size=1024,
             return_model=False,
             random_state=1234):
    r""" A shortcut for many different PCA algorithms

    Arguments:
      x : {list, tuple}
        list of matrices for transformation, the first matrix will
        be used for training
      n_components : {None, int}
        number of PCA components
      algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
        different PCA algorithm:
          'ipca' - IncrementalPCA,
          'ppca' - Probabilistic PCA,
          'sppca' - Supervised Probabilistic PCA,
          'plda' - Probabilistic LDA,
          'rpca' - randomized PCA using randomized SVD
          'pca' - Normal PCA
      y : {numpy.ndarray, None}
        required for labels in case of `sppca` or `plda`
      batch_size : int (default: 1024)
        batch size, only used for IncrementalPCA
      return_model : bool (default: False)
        if True, return the trained PCA model as the FIRST return
      random_state : int (default: 1234)
        forwarded to every model that accepts a `random_state`

    Returns:
      The transformed training matrix when a single matrix was given,
      otherwise a tuple of all transformed matrices in input order; when
      `return_model=True` the fitted model is prepended.

    Raises:
      ValueError : `algo` is not one of the supported names
      RuntimeError : `algo` is 'sppca' or 'plda' but `y` is None
    """
    # The GPU implementation is optional; fall back silently when missing.
    try:
        from cuml.decomposition import PCA as cuPCA
    except ImportError:
        cuPCA = None
    batch_size = int(batch_size)
    algo = str(algo).lower()
    if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
        # 'ipca' is a valid choice and is now listed in the message as well.
        raise ValueError("`algo` must be one of the following: 'pca', 'ipca', "
                         "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" %
                         algo)
    if algo in ('sppca', 'plda') and y is None:
        # Both supervised algorithms need labels, not only 'sppca'.
        raise RuntimeError(
            "`y` must be not None if `algo` is 'sppca' or 'plda'")
    x = flatten_list(x, level=None)
    # ====== check input ====== #
    x_train = x[0]
    x_test = x[1:]
    input_shape = None
    if x_train.ndim > 2:  # only 2D for PCA
        input_shape = (-1,) + x_train.shape[1:]
        new_shape = (-1, np.prod(input_shape[1:]))
        x_train = np.reshape(x_train, new_shape)
        x_test = [np.reshape(mat, new_shape) for mat in x_test]
        if n_components is not None:  # no need to reshape back
            input_shape = None
    # ====== train PCA ====== #
    if algo == 'sppca':
        pca = SupervisedPPCA(n_components=n_components,
                             random_state=random_state)
        pca.fit(x_train, y)
    elif algo == 'plda':
        from odin.ml import PLDA
        pca = PLDA(n_phi=n_components, random_state=random_state)
        pca.fit(x_train, y)
    elif algo == 'pca':
        # Only large problems are sent to the GPU implementation.
        if x_train.shape[1] > 1000 and x_train.shape[0] > 1e5 and \
            cuPCA is not None:
            pca = cuPCA(n_components=n_components, random_state=random_state)
        else:
            pca = PCA(n_components=n_components, random_state=random_state)
        pca.fit(x_train)
    elif algo == 'rpca':
        # we copy the implementation of RandomizedPCA because
        # it is significantly faster than PCA(svd_solver='randomize')
        pca = RandomizedPCA(n_components=n_components,
                            iterated_power=2,
                            random_state=random_state)
        pca.fit(x_train)
    elif algo == 'ipca':
        pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
        prog = Progbar(target=x_train.shape[0],
                       print_report=False,
                       print_summary=False,
                       name="Fitting PCA")
        for start, end in batching(batch_size=batch_size,
                                   n=x_train.shape[0],
                                   seed=1234):
            pca.partial_fit(x_train[start:end], check_input=False)
            prog.add(end - start)
    elif algo == 'ppca':
        pca = PPCA(n_components=n_components, random_state=random_state)
        pca.fit(x_train)
    # ====== transform ====== #
    x_train = pca.transform(x_train)
    x_test = [pca.transform(mat) for mat in x_test]
    # reshape back to original shape if necessary
    if input_shape is not None:
        x_train = np.reshape(x_train, input_shape)
        x_test = [np.reshape(mat, input_shape) for mat in x_test]
    # return the results
    if len(x_test) == 0:
        return x_train if not return_model else (pca, x_train)
    return tuple([x_train] + x_test) if not return_model else \
        tuple([pca, x_train] + x_test)
# Customers below the 80th percentile of review counts are dropped.
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.8), 0)
drop_cust_list = df_cust_summary[
    df_cust_summary['count'] < cust_benchmark
].index

print('Customer minimum times of review: {}'.format(cust_benchmark))
print('Original Shape: {}'.format(df.shape))

# Remove the rarely reviewed movies first, then the infrequent reviewers.
df = (
    df[~df['MovieId'].isin(drop_movie_list)]
    .loc[lambda frame: ~frame['User'].isin(drop_cust_list)]
)
print('After Trim Shape: {}'.format(df.shape))

# Convert the dataset to a user-movie matrix (one row per user, one column
# per movie); only the last rating of a (user, movie) pair is kept and
# unrated cells are filled with 0.
df = (
    df.drop_duplicates(subset=['User', 'MovieId'], keep="last")
    .pivot(index='User', columns='MovieId', values='Rating')
    .fillna(0)
)

# Reduce every user vector to 2 floating point numbers with PCA so it can be
# plotted on a graph and fed to k-means clustering.
pca = IncrementalPCA(n_components=2, batch_size=2)
transformed_matrix = pca.fit_transform(df)

# Cluster label of each user in the reduced space.
kmeans = KMeans(n_clusters=4).fit_predict(transformed_matrix)

# Scatter trace: one marker per user, coloured by cluster label.
data = [
    go.Scatter(
        x=transformed_matrix[:, 0],
        y=transformed_matrix[:, 1],
        text=list(df.index),
        hoverinfo="text",
        mode="markers",
        marker={'color': kmeans},
    )
]
def pca_incremental(cube, angle_list, batch=0.25, ncomp=1, imlib='opencv',
                    interpolation='lanczos4', collapse='median', verbose=True,
                    full_output=False, return_residuals=False,
                    start_time=None):
    """ Computes the full-frame PCA-ADI algorithm in batches, for processing
    fits files larger than the available system memory. It uses the
    incremental PCA algorithm from Sklearn. There is no ``scaling`` parameter
    as in other PCA algorithms in ``VIP``, but by default this implementation
    returns a temporally mean-centered frame ("temp-mean").

    Parameters
    ----------
    cube : str or numpy ndarray
        Input cube as numpy array or string with the path to the fits file to
        be opened in memmap mode.
    angle_list : str or numpy ndarray
        Corresponding parallactic angle for each frame.
    batch : int or float, optional
        When int it corresponds to the number of frames in each batch (must
        be at least 1). If a float in (0, 1] is passed then the size of the
        batch is computed with respect to the available memory in the system.
    ncomp : int, optional
        How many PCs are used as a lower-dimensional subspace to project the
        target frames.
    imlib : str, optional
        See the documentation of the ``vip_hci.preproc.frame_rotate`` function.
    interpolation : str, optional
        See the documentation of the ``vip_hci.preproc.frame_rotate`` function.
    collapse : {'median', 'mean', 'sum', 'trimmean'}, str optional
        Sets the way of collapsing the frames for producing a final image.
    verbose : {True, False}, bool optional
        If True prints intermediate info and timing.
    full_output : boolean, optional
        Whether to return the final median combined image only or with other
        intermediate arrays.
    return_residuals : bool, optional
        If True, only the cube of residuals is returned (before de-rotating).
    start_time : None or datetime.datetime, optional
        Used when embedding this function in the main ``pca`` function. The
        object datetime.datetime is the global starting time. If None, it
        initiates its own counter.

    Returns
    -------
    frame : numpy ndarray
        [return_residuals=False] Final frame (2d array).
    ipca : scikit-learn model
        [full_output=True, return_residuals=False] The incremental PCA model of
        scikit-learn.
    pcs : numpy ndarray
        [full_output=True, return_residuals=False] Principal components reshaped
        into images.
    medians : numpy ndarray
        [full_output=True, return_residuals=False] The median of the derotated
        residuals for each batch.
    cube_residuals : numpy ndarray
        [return_residuals=True] Cube of residuals.

    Raises
    ------
    TypeError
        If ``cube``, ``angle_list``, ``batch`` or ``ncomp`` have an
        unsupported type, if the cube is not 3d, or if ``angle_list`` does
        not match the number of frames.
    ValueError
        If ``batch`` is an int smaller than 1 or a float outside (0, 1].

    """
    if start_time is None:
        start_time = time_ini(verbose)
        verbose_memcheck = True
    else:
        verbose_memcheck = False

    # checking cube and angle_list data types
    if not isinstance(cube, (np.ndarray, str)):
        raise TypeError('`cube` must be a str (full path on disk) or a numpy '
                        'array')
    if not isinstance(angle_list, (np.ndarray, str)):
        raise TypeError('`angle_list` must be a str (full path on disk) or a '
                        'numpy array')

    # checking `batch` before any file is opened or memory is probed, so an
    # invalid value fails with a clear message instead of an unbound
    # `batch_size` or a division by zero further down
    if isinstance(batch, int):
        if batch < 1:
            raise ValueError("`batch` as an int must be at least 1 frame")
    elif isinstance(batch, float):
        if not 0 < batch <= 1:
            raise ValueError("`batch` as a float must be in the interval "
                             "(0, 1]")
    else:
        raise TypeError("`batch` must be an int or float")

    # opening data
    if isinstance(cube, str):
        # assuming the first HDULIST contains the datacube
        hdulist = open_fits(cube, n=0, return_memmap=True)
        cube = hdulist.data
    # exactly three axes are unpacked below, so anything else is rejected here
    if cube.ndim != 3:
        raise TypeError('Input array is not a 3d array')
    n_frames, y, x = cube.shape

    # checking angles length and ncomp
    if isinstance(angle_list, str):
        angle_list = open_fits(angle_list)
    angle_list = check_pa_vector(angle_list)
    if not n_frames == angle_list.shape[0] and not return_residuals:
        raise TypeError('`angle_list` vector has wrong length. It must be the '
                        'same as the number of frames in the cube')
    if not isinstance(ncomp, (int, float)):
        raise TypeError("`ncomp` must be an int or a float in the ADI case")
    if ncomp > n_frames:
        ncomp = min(ncomp, n_frames)
        msg = 'Number of PCs too high (max PCs={}), using {} PCs instead.'
        print(msg.format(n_frames, ncomp))

    # checking memory and determining batch size
    cube_size = cube.nbytes
    aval_mem = get_available_memory(verbose_memcheck)
    if isinstance(batch, int):      # the batch size in n_fr
        batch_size = batch
    else:                           # the batch ratio wrt available memory
        batch_size = min(int(n_frames * (batch * aval_mem) / cube_size),
                         n_frames)
        # with very little free memory the ratio can round down to zero
        batch_size = max(batch_size, 1)

    if verbose:
        msg1 = "Cube size = {:.3f} GB ({} frames)"
        print(msg1.format(cube_size / 1e9, n_frames))
        msg2 = "Batch size = {} frames ({:.3f} GB)\n"
        print(msg2.format(batch_size, cube[:batch_size].nbytes / 1e9))

    n_batches = n_frames // batch_size      # floor/int division
    remaining_frames = n_frames % batch_size
    if remaining_frames > 0:
        n_batches += 1

    # computing the PCA model for each batch
    ipca = IncrementalPCA(n_components=ncomp)

    for i in range(n_batches):
        intini = i * batch_size
        intfin = (i + 1) * batch_size
        frames = cube[intini:min(n_frames, intfin)]
        msg = 'Batch {}/{}\tshape: {}\tsize: {:.1f} MB'
        if verbose:
            print(msg.format(i + 1, n_batches, frames.shape,
                             frames.nbytes / 1e6))
        matrix = prepare_matrix(frames, verbose=False)
        ipca.partial_fit(matrix)

    if verbose:
        timing(start_time)

    # getting PCs and the mean in order to center each batch
    V = ipca.components_
    mean = ipca.mean_.reshape(y, x)

    if verbose:
        print('\nReconstructing and obtaining residuals')

    if return_residuals:
        cube_residuals = np.empty((n_frames, y, x))
    else:
        medians = []

    for i in range(n_batches):
        intini = i * batch_size
        intfin = (i + 1) * batch_size
        frames = cube[intini:min(n_frames, intfin)] - mean
        matrix = prepare_matrix(frames, verbose=False)
        # projecting onto the PCs and back gives the low-rank model that is
        # subtracted from the centered frames
        reconst = np.dot(np.dot(matrix, V.T), V)
        resid = matrix - reconst
        resid_reshaped = resid.reshape(frames.shape)
        if return_residuals:
            cube_residuals[intini:intfin] = resid_reshaped
        else:
            resid_der = cube_derotate(resid_reshaped,
                                      angle_list[intini:intfin],
                                      imlib=imlib, interpolation=interpolation)
            medians.append(cube_collapse(resid_der, mode=collapse))

    # dropping the references to the last batch's intermediate arrays
    del matrix
    del frames

    if return_residuals:
        return cube_residuals
    else:
        medians = np.array(medians)
        frame = np.median(medians, axis=0)

        if verbose:
            timing(start_time)

        if full_output:
            pcs = reshape_matrix(V, y, x)
            return frame, ipca, pcs, medians
        else:
            return frame
def btnConvert_click(self): msgBox = QMessageBox() totalTime = 0 # Batch try: Batch = np.int32(ui.txtBatch.text()) except: msgBox.setText("Size of batch is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if Batch == 0: Batch = None # Kernel Kernel = ui.cbKernel.currentText() # Method Method = ui.cbMethod.currentText() # Gamma try: Gamma = np.float(ui.txtGamma.text()) except: msgBox.setText("Gamma is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Degree try: Degree = np.int32(ui.txtDegree.text()) except: msgBox.setText("Degree is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Coef0 try: Coef0 = np.float(ui.txtCoef0.text()) except: msgBox.setText("Coef0 is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Alpha try: Alpha = np.int32(ui.txtAlpha.text()) except: msgBox.setText("Alpha is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Tol try: Tol = np.float(ui.txtTole.text()) except: msgBox.setText("Tolerance is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # MaxIte try: MaxIter = np.int32(ui.txtMaxIter.text()) except: msgBox.setText("Maximum number of iterations is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if MaxIter <= 0: MaxIter = None # Number of Job try: NJob = np.int32(ui.txtJobs.text()) except: msgBox.setText("The number of parallel jobs is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if NJob < -1 or NJob == 0: msgBox.setText( "The number of parallel jobs must be -1 or greater than 0!") 
msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: FoldFrom = np.int32(ui.txtFoldFrom.text()) FoldTo = np.int32(ui.txtFoldTo.text()) except: print("Please check fold parameters!") return if FoldTo < FoldFrom: print("Please check fold parameters!") return for fold_all in range(FoldFrom, FoldTo + 1): tic = time.time() # OutFile OutFile = ui.txtOutFile.text() OutFile = OutFile.replace("$FOLD$", str(fold_all)) if not len(OutFile): msgBox.setText("Please enter out file!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # InFile InFile = ui.txtInFile.text() InFile = InFile.replace("$FOLD$", str(fold_all)) if not len(InFile): msgBox.setText("Please enter input file!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not os.path.isfile(InFile): msgBox.setText("Input file not found!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False InData = io.loadmat(InFile) OutData = dict() OutData["imgShape"] = InData["imgShape"] # Data if not len(ui.txtITrData.currentText()): msgBox.setText("Please enter Input Train Data variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeData.currentText()): msgBox.setText("Please enter Input Test Data variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrData.text()): msgBox.setText("Please enter Output Train Data variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeData.text()): msgBox.setText("Please enter Output Test Data variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False 
try: XTr = InData[ui.txtITrData.currentText()] XTe = InData[ui.txtITeData.currentText()] if ui.cbScale.isChecked(): XTr = preprocessing.scale(XTr) XTe = preprocessing.scale(XTe) print("Whole of data is scaled X~N(0,1).") except: print("Cannot load data") return try: NumFea = np.int32(ui.txtNumFea.text()) except: msgBox.setText("Number of features is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if NumFea < 0: msgBox.setText("Number of features must be greater than zero!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if NumFea > np.shape(XTr)[1]: msgBox.setText("Number of features is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if NumFea > np.shape(XTe)[1]: msgBox.setText("Number of features is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Label if not len(ui.txtITrLabel.currentText()): msgBox.setText("Please enter Train Input Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeLabel.currentText()): msgBox.setText("Please enter Test Input Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrLabel.text()): msgBox.setText( "Please enter Train Output Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeLabel.text()): msgBox.setText("Please enter Test Output Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOTrLabel.text()] = InData[ ui.txtITrLabel.currentText()] OutData[ui.txtOTeLabel.text()] = InData[ 
ui.txtITeLabel.currentText()] except: print("Cannot load labels!") # Subject if not len(ui.txtITrSubject.currentText()): msgBox.setText( "Please enter Train Input Subject variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeSubject.currentText()): msgBox.setText( "Please enter Test Input Subject variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrSubject.text()): msgBox.setText( "Please enter Train Output Subject variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeSubject.text()): msgBox.setText( "Please enter Test Output Subject variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: TrSubject = InData[ui.txtITrSubject.currentText()] OutData[ui.txtOTrSubject.text()] = TrSubject TeSubject = InData[ui.txtITeSubject.currentText()] OutData[ui.txtOTeSubject.text()] = TeSubject except: print("Cannot load Subject IDs") return # Task if ui.cbTask.isChecked(): if not len(ui.txtITrTask.currentText()): msgBox.setText( "Please enter Input Train Task variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeTask.currentText()): msgBox.setText( "Please enter Input Test Task variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrTask.text()): msgBox.setText( "Please enter Output Train Task variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeTask.text()): msgBox.setText( "Please enter Output Test Task variable name!") msgBox.setIcon(QMessageBox.Critical) 
msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: TrTask = InData[ui.txtITrTask.currentText()] OutData[ui.txtOTrTask.text()] = TrTask TeTask = InData[ui.txtITeTask.currentText()] OutData[ui.txtOTeTask.text()] = TeTask TrTaskIndex = TrTask.copy() for tasindx, tas in enumerate(np.unique(TrTask)): TrTaskIndex[TrTask == tas] = tasindx + 1 TeTaskIndex = TeTask.copy() for tasindx, tas in enumerate(np.unique(TeTask)): TeTaskIndex[TeTask == tas] = tasindx + 1 except: print("Cannot load Tasks!") return # Run if ui.cbRun.isChecked(): if not len(ui.txtITrRun.currentText()): msgBox.setText( "Please enter Train Input Run variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeRun.currentText()): msgBox.setText( "Please enter Test Input Run variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrRun.text()): msgBox.setText( "Please enter Train Output Run variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeRun.text()): msgBox.setText( "Please enter Test Output Run variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: TrRun = InData[ui.txtITrRun.currentText()] OutData[ui.txtOTrRun.text()] = TrRun TeRun = InData[ui.txtITeRun.currentText()] OutData[ui.txtOTeRun.text()] = TeRun except: print("Cannot load Runs!") return # Counter if ui.cbCounter.isChecked(): if not len(ui.txtITrCounter.currentText()): msgBox.setText( "Please enter Train Input Counter variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeCounter.currentText()): msgBox.setText( "Please enter Test Input Counter variable name!") msgBox.setIcon(QMessageBox.Critical) 
msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrCounter.text()): msgBox.setText( "Please enter Train Output Counter variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeCounter.text()): msgBox.setText( "Please enter Test Output Counter variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: TrCounter = InData[ui.txtITrCounter.currentText()] OutData[ui.txtOTrCounter.text()] = TrCounter TeCounter = InData[ui.txtITeCounter.currentText()] OutData[ui.txtOTeCounter.text()] = TeCounter except: print("Cannot load Counters!") return # Matrix Label if ui.cbmLabel.isChecked(): if not len(ui.txtITrmLabel.currentText()): msgBox.setText( "Please enter Train Input Matrix Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITemLabel.currentText()): msgBox.setText( "Please enter Test Input Matrix Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrmLabel.text()): msgBox.setText( "Please enter Train Output Matrix Label variable name!" ) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTemLabel.text()): msgBox.setText( "Please enter Test Output Matrix Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOTrmLabel.text()] = InData[ ui.txtITrmLabel.currentText()] OutData[ui.txtOTemLabel.text()] = InData[ ui.txtITemLabel.currentText()] except: print("Cannot load matrix lables!") return # Design if ui.cbDM.isChecked(): if not len(ui.txtITrDM.currentText()): msgBox.setText( "Please enter Train Input Design Matrix variable name!" 
) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeDM.currentText()): msgBox.setText( "Please enter Test Input Design Matrix variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrDM.text()): msgBox.setText( "Please enter Train Output Design Matrix variable name!" ) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeDM.text()): msgBox.setText( "Please enter Test Output Design Matrix variable name!" ) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOTrDM.text()] = InData[ ui.txtITrDM.currentText()] OutData[ui.txtOTeDM.text()] = InData[ ui.txtITeDM.currentText()] except: print("Cannot load design matrices!") return # Coordinate if ui.cbCol.isChecked(): if not len(ui.txtCol.currentText()): msgBox.setText("Please enter Coordinator variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOCol.text()): msgBox.setText("Please enter Coordinator variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOCol.text()] = InData[ ui.txtCol.currentText()] except: print("Cannot load coordinator!") return # Condition if ui.cbCond.isChecked(): if not len(ui.txtCond.currentText()): msgBox.setText("Please enter Condition variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOCond.text()): msgBox.setText("Please enter Condition variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOCond.text()] = InData[ ui.txtCond.currentText()] 
except: print("Cannot load conditions!") return # FoldID if ui.cbFoldID.isChecked(): if not len(ui.txtFoldID.currentText()): msgBox.setText("Please enter FoldID variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOFoldID.text()): msgBox.setText("Please enter FoldID variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOFoldID.text()] = InData[ ui.txtFoldID.currentText()] except: print("Cannot load Fold ID!") return # FoldInfo if ui.cbFoldInfo.isChecked(): if not len(ui.txtFoldInfo.currentText()): msgBox.setText("Please enter FoldInfo variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOFoldInfo.text()): msgBox.setText("Please enter FoldInfo variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOFoldInfo.text()] = InData[ ui.txtFoldInfo.currentText()] except: print("Cannot load Fold Info!") return pass # Number of Scan if ui.cbNScan.isChecked(): if not len(ui.txtITrScan.currentText()): msgBox.setText( "Please enter Number of Scan variable name for Input Train!" ) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeScan.currentText()): msgBox.setText( "Please enter Number of Scan variable name for Input Test!" ) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrScan.text()): msgBox.setText( "Please enter Number of Scan variable name for Output Train!" ) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeScan.text()): msgBox.setText( "Please enter Number of Scan variable name for Output Test!" 
) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOTrScan.text()] = InData[ ui.txtITrScan.currentText()] OutData[ui.txtOTeScan.text()] = InData[ ui.txtITeScan.currentText()] except: print("Cannot load NScan!") return if NumFea == 0: NumFea = np.min(np.shape(XTr)) print("Number of features are automatically selected as ", NumFea) try: if Method == "PCA": model = PCA(n_components=NumFea, copy=False, tol=Tol) elif Method == "Kernel PCA": model = KernelPCA(n_components=NumFea,kernel=Kernel,gamma=Gamma,degree=Degree,\ coef0=Coef0, alpha=Alpha, tol=Tol, max_iter=MaxIter, n_jobs=NJob,copy_X=False) else: model = IncrementalPCA(n_components=NumFea, copy=False, batch_size=Batch) print("Running PCA Functional Alignment on Training Data ...") OutData[ui.txtOTrData.text()] = model.fit_transform(XTr) print("Running PCA Functional Alignment on Testing Data ...") OutData[ui.txtOTeData.text()] = model.fit_transform(XTe) except Exception as e: print(str(e)) HAParam = dict() HAParam["Method"] = Method HAParam["NumFea"] = NumFea HAParam["Kernel"] = Kernel OutData["FunctionalAlignment"] = HAParam OutData["Runtime"] = time.time() - tic totalTime += OutData["Runtime"] print("Saving ...") io.savemat(OutFile, mdict=OutData) print("Fold " + str(fold_all) + " is DONE: " + OutFile) print("Runtime: ", totalTime) print("PCA Functional Alignment is done.") msgBox.setText("PCA Functional Alignment is done.") msgBox.setIcon(QMessageBox.Information) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_()
# Finish the two-panel comparison figure: title the current panel, then draw
# every 2100th row of X_recovered in the right-hand panel (subplot 122).
plt.title("Original", fontsize=16)
plt.subplot(122)
plot_digits(X_recovered[::2100])
plt.title("Compressed", fontsize=16)
save_fig("mnist_compression_plot")
plt.show()
# Keep a reference to the current X_reduced under a second name, because
# X_reduced is reassigned further down by the IncrementalPCA projection.
X_reduced_pca = X_reduced
# Section banner printed to stdout.
print('------------------------------------------------------------------------------------------------------\n'
      ' 8.3.8 Incremental PCA \n'
      '------------------------------------------------------------------------------------------------------\n')
# Fit IncrementalPCA piecewise: X_train is split into n_batches chunks and
# each chunk is handed to partial_fit in turn.
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train, n_batches):
    print(".", end="") # not shown in the book
    inc_pca.partial_fit(X_batch)
# Project the whole training set with the fitted model, then map the
# projection back to the original feature space.
X_reduced = inc_pca.transform(X_train)
X_recovered_inc_pca = inc_pca.inverse_transform(X_reduced)
# Side-by-side figure: every 2100th training row (left) next to the same rows
# after the IncrementalPCA transform / inverse_transform round trip (right).
plt.figure(figsize=(7, 4))
plt.subplot(121)
plot_digits(X_train[::2100])
plt.subplot(122)
plot_digits(X_recovered_inc_pca[::2100])
plt.tight_layout()
plt.show()
def plot_at_k(k):
    """Display ``image_bw`` after a round trip through a k-component IncrementalPCA.

    An ``IncrementalPCA`` with ``n_components=k`` is fitted on the
    module-level ``image_bw``; the image is projected, mapped back with
    ``inverse_transform`` and drawn with ``plt.imshow`` using the gray
    colormap. Nothing is returned.
    """
    reducer = IncrementalPCA(n_components=k)
    projected = reducer.fit_transform(image_bw)
    reconstruction = reducer.inverse_transform(projected)
    plt.imshow(reconstruction, cmap=plt.cm.gray)
if pca_fraction_variance_to_retain < 1 or cluster_algorithm == "pca": print("reducing dimensionality with PCA...") activations_scaled = [None] * nlayers for ilayer in range(nlayers): if ilayer not in these_layers: continue mu = np.mean(activations_flattened[ilayer], axis=0) sigma = np.std(activations_flattened[ilayer], axis=0) activations_scaled[ilayer] = (activations_flattened[ilayer] - mu) / sigma if pca_batch_size == 0: pca = PCA() else: nfeatures = np.shape(activations_scaled[ilayer])[1] pca = IncrementalPCA(batch_size=pca_batch_size * nfeatures) fits_pca[ilayer] = pca.fit(activations_scaled[ilayer]) print(np.shape(fits_pca[ilayer])) import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt #plt.ion() activations_kept = [None] * nlayers fig = plt.figure() ax = fig.add_subplot(111) for ilayer in range(nlayers): if ilayer not in these_layers: continue cumsum = np.cumsum(fits_pca[ilayer].explained_variance_ratio_)
def calc_pca(self, h5_fname, n_comps=20, batch_size=1000, norm=''):
    """
    Perform Incremental Principal Component Analysis on the spectra stored
    in the provided h5 datafile.

    The rows of ``MALDI_001/Intensities`` are divided by their normalization
    factors and fed to an ``IncrementalPCA`` model ``batch_size`` rows at a
    time. The same rows are then projected onto the fitted components, again
    in batches. For every ROI found under ``Filtered_data`` the projected
    spectra are placed into an image using the index grid stored in
    ``Squared_data/ROI_01/Indices``; the image, the model components and the
    explained variances are written to ``PCA/<roi>`` in the same file.

    Parameters
    ----------
    h5_fname : str
        Path of the HDF5 file; it is opened in 'r+' mode and always closed
        before this method returns.
    n_comps : int
        Number of principal components to keep.
    batch_size : int
        Number of spectra per training / projection batch.
    norm : str
        Name of the normalization to apply (capitalized before it is looked
        up under ``MALDI_001/Normalization Factors``). The empty string means
        every factor is 1.

    Returns
    -------
    None
        A message is printed and the method returns early when the requested
        normalization factors are missing or the target group already exists.
    """
    from sklearn.decomposition import IncrementalPCA

    # The context manager closes the file on every exit path, including the
    # early returns below (the original left it open on those paths).
    with h5py.File(h5_fname, 'r+') as hf:
        raw_data = hf['MALDI_001']['Intensities']
        filtered = hf['Filtered_data']
        height, _ = raw_data.shape

        # Apply relevant normalization
        norm = norm.capitalize()
        if norm == '':
            norm_factors = np.ones(height)
            norm = None
        else:
            try:
                norm_factors = np.array(
                    hf['MALDI_001']['Normalization Factors'][norm])
            except KeyError:
                print('Desired normalization factors not yet calculated.')
                print('Calculate normalization factors, then try again.')
                return

        def normalized_rows(low, high):
            """Return rows ``low:high`` of the raw data divided by their factors."""
            subset = raw_data[low:high]
            factors = np.asarray(norm_factors[low:high]).reshape(-1, 1)
            scaled = np.empty_like(subset)
            # Writing into an array of the source dtype keeps the casting the
            # original row-by-row assignment performed.
            scaled[...] = subset / factors
            return scaled

        # Reuse the PCA group when it already exists instead of relying on a
        # bare except around create_group.
        pca = hf.require_group('PCA')
        try:
            # NOTE(review): when no normalization was requested, norm is None
            # at this point, so no named group is asked for - confirm intent.
            pca.create_group(norm)
        except (KeyError, ValueError):
            # h5py reports an already existing name as ValueError.
            print('Selected PCA has already been calculated, aborting.')
            return

        # Row ranges of every batch, derived from batch_size (the original
        # hard-coded 1000 here while counting steps with batch_size). The
        # last range holds the remainder and is never empty.
        bounds = [(low, min(low + batch_size, height))
                  for low in range(0, height, batch_size)]

        # Some scikit-learn versions reject a partial_fit batch with fewer
        # rows than n_components, so a short remainder is trained together
        # with the batch before it.
        train_bounds = list(bounds)
        if len(train_bounds) > 1:
            tail_low, tail_high = train_bounds[-1]
            if tail_high - tail_low < n_comps:
                prev_low, _ = train_bounds[-2]
                train_bounds[-2:] = [(prev_low, tail_high)]

        print('Calculating PCA')
        for roi in list(filtered.keys()):
            roi_grp = pca.create_group(roi)
            if norm is None:
                roi_grp.create_group('Unnormalized')
            else:
                try:
                    roi_grp.create_group(norm)
                except (KeyError, ValueError):
                    print('Selected PCA has already been calculated, aborting.')
                    return

            (ptsx, ptsy) = filtered[roi].attrs.get('Image Dimensions')

            # Perform PCA, store results as MALDIdata attributes
            # NOTE(review): the model is trained on raw_data / norm_factors,
            # neither of which is indexed by roi, so every ROI repeats the
            # same fit - consider hoisting it out of this loop.
            model = IncrementalPCA(n_components=n_comps, batch_size=batch_size)
            print('Training PCA....')
            for low, high in train_bounds:
                # The remainder is normalized like every other batch (the
                # original passed it to the model unnormalized).
                model.partial_fit(normalized_rows(low, high))
                print(high)

            fitted = np.empty((height, n_comps))
            print('Mapping spectra to PCA...')
            for low, high in bounds:
                fitted[low:high] = model.transform(normalized_rows(low, high))
                print(high)

            # NOTE(review): the index grid is always read from ROI_01, not
            # from the ROI being processed - confirm this is intended.
            squared_indices = np.array(hf['Squared_data']['ROI_01']['Indices'])
            keep_indices = squared_indices.flatten()
            pca_maps = np.empty((ptsx, ptsy, n_comps))
            for index in keep_indices:
                # Position of the first grid cell holding this spectrum index.
                coordinates = np.where(squared_indices == index)
                row, column = (coordinates[0][0], coordinates[1][0])
                pca_maps[row, column, :] = fitted[index]

            roi_grp.create_dataset('Maps', data=pca_maps, dtype='float64')
            roi_grp.create_dataset('Endmembers', data=model.components_,
                                   dtype='float64')
            roi_grp.create_dataset('Eigenvalues',
                                   data=model.explained_variance_,
                                   dtype='float64')
            roi_grp.attrs['Number_of_components'] = n_comps
    return
def pipe_main(pipe=None):
    '''Build a sklearn ``Pipeline`` from a string of step keys.

    Every call instantiates the full catalogue of available steps (cleaning,
    encoding, resampling, scaling, feature construction, feature selection
    and classifiers) and then either lists the catalogue or assembles the
    requested pipeline from it.

    parameter
    ----
    pipe - str
        step keys joined by '_', e.g. 'xx_xx'; each key must name an entry of
        the catalogue. Default None.

    return
    ----
    1) if pipe is a str, a Pipeline whose steps are ``(key, instance)`` pairs
       in the order given
    2) if pipe is None, a dict mapping each step category to the keys
       available in it

    raise
    ----
    KeyError - a key in pipe is not in the catalogue
    ValueError - pipe is neither None nor a str
    '''
    # --- cleaning
    cleaners = dict(
        clean=Split_cls(dtype_filter='not_datetime', na1='null', na2=-999),
        cleanNA=Split_cls(dtype_filter='not_datetime', na1=None, na2=None),
        cleanMean=Split_cls(dtype_filter='not_datetime',
                            na1='most_frequent',
                            na2='mean'),
    )

    # --- encoding
    encoders = dict(
        woe=Woe_encoder(max_leaf_nodes=5),
        oht=Oht_encoder(),
        ordi=Ordi_encoder(),
    )

    # --- resampling
    resamplers = dict(
        # over sampling
        rover=RandomOverSampler(),
        smote=SMOTE(),
        bsmote=BorderlineSMOTE(),
        adasyn=ADASYN(),
        # under sampling, controlled methods
        runder=RandomUnderSampler(),
        nearmiss=NearMiss(version=3),
        pcart=InstanceHardnessThreshold(),
        # under sampling, cleaning methods
        tlinks=TomekLinks(n_jobs=-1),
        oside=OneSidedSelection(n_jobs=-1),
        cleanNN=NeighbourhoodCleaningRule(n_jobs=-1),
        enn=EditedNearestNeighbours(n_jobs=-1),
        ann=AllKNN(n_jobs=-1),
        cnn=CondensedNearestNeighbour(n_jobs=-1),
        # outlier removal
        inlierForest=FunctionSampler(outlier_rejection,
                                     kw_args={'method': 'IsolationForest'}),
        inlierLocal=FunctionSampler(outlier_rejection,
                                    kw_args={'method': 'LocalOutlierFactor'}),
        inlierEllip=FunctionSampler(outlier_rejection,
                                    kw_args={'method': 'EllipticEnvelope'}),
        inlierOsvm=FunctionSampler(outlier_rejection,
                                   kw_args={'method': 'OneClassSVM'}),
        # combined over + under sampling
        smoteenn=SMOTEENN(),
        smotelink=SMOTETomek(),
    )

    # --- scaling
    scalers = dict(
        stdscale=StandardScaler(),
        maxscale=MinMaxScaler(),
        rscale=RobustScaler(quantile_range=(10, 90)),
        qauntile=QuantileTransformer(),  # uniform distribution
        power=PowerTransformer(),  # Gaussian distribution
        norm=Normalizer(),  # default L2 norm
        # scalers usable on sparse data
        maxabs=MaxAbsScaler(),
        stdscalesp=StandardScaler(with_mean=False),
    )

    # --- feature construction
    constructors = dict(
        pca=PCA(whiten=True),
        spca=SparsePCA(normalize_components=True, n_jobs=-1),
        ipca=IncrementalPCA(whiten=True),
        kpca=KernelPCA(kernel='rbf', n_jobs=-1),
        poly=PolynomialFeatures(degree=2),
        rtembedding=RandomTreesEmbedding(n_estimators=10),
        LDA=LinearDiscriminantAnalysis(),
        QDA=QuadraticDiscriminantAnalysis(),
    )

    # --- feature selection driven by a fitted model
    model_selectors = dict(
        fwoe=SelectFromModel(Woe_encoder(max_leaf_nodes=5)),
        flog=SelectFromModel(
            LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc')),
        fsgd=SelectFromModel(SGDClassifier(penalty="l1")),
        fsvm=SelectFromModel(LinearSVC('l1', dual=False, C=1e-2)),
        fxgb=SelectFromModel(XGBClassifier(n_jobs=-1)),
        frf=SelectFromModel(
            ExtraTreesClassifier(n_estimators=100, max_depth=5)),
        fRFExgb=RFE(XGBClassifier(n_jobs=-1),
                    step=0.1,
                    n_features_to_select=20),
        fRFErf=RFE(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                   step=0.3,
                   n_features_to_select=20),
        fRFElog=RFE(LogisticRegressionCV(penalty='l1',
                                         solver='saga',
                                         scoring='roc_auc'),
                    step=0.3,
                    n_features_to_select=20),
    )

    # --- univariate feature selection
    univariate_selectors = dict(
        fchi2=GenericUnivariateSelect(chi2, 'percentile', 25),
        fMutualclf=GenericUnivariateSelect(mutual_info_classif, 'percentile',
                                           25),
        fFclf=GenericUnivariateSelect(f_classif, 'percentile', 25),
    )

    # --- classifiers: every sklearn classifier that can be built without
    # arguments, then the explicitly configured ones (which override any
    # default instance registered under the same name)
    classifiers = {}
    for cls_name, cls in all_estimators(type_filter=['classifier']):
        try:
            classifiers[cls_name] = cls()
        except Exception:
            continue
    classifiers.update({
        'dummy': DummyClassifier(),
        'XGBClassifier': XGBClassifier(n_jobs=-1),
        'LogisticRegressionCV': LogisticRegressionCV(scoring='roc_auc'),
        'EasyEnsembleClassifier': EasyEnsembleClassifier(),
        'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
        'RUSBoostClassifier': RUSBoostClassifier(),
        'SVC': SVC(C=0.01, gamma='auto'),
    })

    # No request: report which keys exist in each category.
    if pipe is None:
        selectors = dict(**model_selectors, **univariate_selectors)
        return {
            'clean': cleaners.keys(),
            'encoding': encoders.keys(),
            'resample': resamplers.keys(),
            'scale': scalers.keys(),
            'feature_c': constructors.keys(),
            'feature_s': selectors.keys(),
            'classifier': classifiers.keys()
        }

    if not isinstance(pipe, str):
        raise ValueError("input pipe must be a string in format 'xx[_xx]'")

    # One flat lookup table over every category, merged in the same order as
    # the keyword expansion used by the original implementation.
    catalogue = dict(**cleaners, **encoders, **scalers, **constructors,
                     **model_selectors, **univariate_selectors, **classifiers,
                     **resamplers)
    steps = []
    for key in pipe.split('_'):
        step = catalogue.get(key)
        if step is None:
            raise KeyError(
                "'{}' invalid key for sklearn estimators".format(key))
        steps.append((key, step))
    return Pipeline(steps)
train_features = np.load('train_features_joined.npz') img_features = train_features['img_features'] tag_features = train_features['tag_features'] else: assert os.path.isfile('train_features.npz') logging.info('Loading features file') train_features = np.load('train_features.npz') img_features = train_features['img_features'] tag_features = train_features['tag_features'] if args.perform_PCA == True: N_PCA = img_features.shape[0] if args.npca == -1 else args.npca logging.info('Training: PCA of image features, N_PCA = %d', N_PCA) start = time.time() # pca = IncrementalPCA(n_components=100, batch_size=512) pca = IncrementalPCA(n_components=500, batch_size=512) pca.fit(img_features[:N_PCA, :]) end = time.time() logging.info('Time: %.4fm', (end - start) / 60) logging.info('Apply PCA to image features') start = time.time() X = pca.transform(img_features) end = time.time() logging.info('Time: %.4fm', (end - start) / 60) logging.info('Training: fit CCA') start = time.time() if args.perform_PCA == True: W_img, W_tag = cca.fit(X, tag_features, numCC=args.numCC, useGPU=args.gpu)
def btnConvert_click(self): totalTime = 0 msgBox = QMessageBox() # Batch try: Batch = np.int32(ui.txtBatch.text()) except: msgBox.setText("Size of batch is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if Batch == 0: Batch = None # Kernel Kernel = ui.cbKernel.currentText() # Method Method = ui.cbMethod.currentText() # Gamma try: Gamma = np.float(ui.txtGamma.text()) except: msgBox.setText("Gamma is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Degree try: Degree = np.int32(ui.txtDegree.text()) except: msgBox.setText("Degree is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Coef0 try: Coef0 = np.float(ui.txtCoef0.text()) except: msgBox.setText("Coef0 is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Alpha try: Alpha = np.int32(ui.txtAlpha.text()) except: msgBox.setText("Alpha is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Tol try: Tol = np.float(ui.txtTole.text()) except: msgBox.setText("Tolerance is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # MaxIte try: MaxIter = np.int32(ui.txtMaxIter.text()) except: msgBox.setText("Maximum number of iterations is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if MaxIter <= 0: MaxIter = None # Number of Job try: NJob = np.int32(ui.txtJobs.text()) except: msgBox.setText("The number of parallel jobs is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if NJob < -1 or NJob == 0: msgBox.setText("The number of parallel jobs must be -1 or greater than 0!") 
msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False TrFoldErr = list() TeFoldErr = list() try: FoldFrom = np.int32(ui.txtFoldFrom.text()) FoldTo = np.int32(ui.txtFoldTo.text()) except: print("Please check fold parameters!") return if FoldTo < FoldFrom: print("Please check fold parameters!") return for fold_all in range(FoldFrom, FoldTo+1): tic = time.time() # Regularization try: Regularization = np.float(ui.txtRegularization.text()) except: msgBox.setText("Regularization value is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # OutFile OutFile = ui.txtOutFile.text() OutFile = OutFile.replace("$FOLD$", str(fold_all)) if not len(OutFile): msgBox.setText("Please enter out file!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # InFile InFile = ui.txtInFile.text() InFile = InFile.replace("$FOLD$", str(fold_all)) if not len(InFile): msgBox.setText("Please enter input file!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not os.path.isfile(InFile): msgBox.setText("Input file not found!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False InData = io.loadmat(InFile) OutData = dict() OutData["imgShape"] = InData["imgShape"] # Data if not len(ui.txtITrData.currentText()): msgBox.setText("Please enter Input Train Data variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeData.currentText()): msgBox.setText("Please enter Input Test Data variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrData.text()): msgBox.setText("Please enter Output Train Data variable name!") 
msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeData.text()): msgBox.setText("Please enter Output Test Data variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: XTr = InData[ui.txtITrData.currentText()] XTe = InData[ui.txtITeData.currentText()] if ui.cbScale.isChecked() and not ui.rbScale.isChecked(): XTr = preprocessing.scale(XTr) XTe = preprocessing.scale(XTe) print("Whole of data is scaled X~N(0,1).") except: print("Cannot load data") return # NComponent try: NumFea = np.int32(ui.txtNumFea.text()) except: msgBox.setText("Number of features is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if NumFea < 1: msgBox.setText("Number of features must be greater than zero!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if NumFea > np.shape(XTr)[1]: msgBox.setText("Number of features is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Label if not len(ui.txtITrLabel.currentText()): msgBox.setText("Please enter Train Input Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeLabel.currentText()): msgBox.setText("Please enter Test Input Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrLabel.text()): msgBox.setText("Please enter Train Output Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeLabel.text()): msgBox.setText("Please enter Test Output Label variable name!") msgBox.setIcon(QMessageBox.Critical) 
msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOTrLabel.text()] = InData[ui.txtITrLabel.currentText()] OutData[ui.txtOTeLabel.text()] = InData[ui.txtITeLabel.currentText()] except: print("Cannot load labels!") # Subject if not len(ui.txtITrSubject.currentText()): msgBox.setText("Please enter Train Input Subject variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeSubject.currentText()): msgBox.setText("Please enter Test Input Subject variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrSubject.text()): msgBox.setText("Please enter Train Output Subject variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeSubject.text()): msgBox.setText("Please enter Test Output Subject variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: TrSubject = InData[ui.txtITrSubject.currentText()] OutData[ui.txtOTrSubject.text()] = TrSubject TeSubject = InData[ui.txtITeSubject.currentText()] OutData[ui.txtOTeSubject.text()] = TeSubject except: print("Cannot load Subject IDs") return # Task if ui.cbTask.isChecked(): if not len(ui.txtITrTask.currentText()): msgBox.setText("Please enter Input Train Task variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeTask.currentText()): msgBox.setText("Please enter Input Test Task variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrTask.text()): msgBox.setText("Please enter Output Train Task variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) 
msgBox.exec_() return False if not len(ui.txtOTeTask.text()): msgBox.setText("Please enter Output Test Task variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: TrTask = InData[ui.txtITrTask.currentText()] OutData[ui.txtOTrTask.text()] = TrTask TeTask = InData[ui.txtITeTask.currentText()] OutData[ui.txtOTeTask.text()] = TeTask TrTaskIndex = TrTask.copy() for tasindx, tas in enumerate(np.unique(TrTask)): TrTaskIndex[TrTask == tas] = tasindx + 1 TeTaskIndex = TeTask.copy() for tasindx, tas in enumerate(np.unique(TeTask)): TeTaskIndex[TeTask == tas] = tasindx + 1 except: print("Cannot load Tasks!") return # Run if ui.cbRun.isChecked(): if not len(ui.txtITrRun.currentText()): msgBox.setText("Please enter Train Input Run variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeRun.currentText()): msgBox.setText("Please enter Test Input Run variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrRun.text()): msgBox.setText("Please enter Train Output Run variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeRun.text()): msgBox.setText("Please enter Test Output Run variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: TrRun = InData[ui.txtITrRun.currentText()] OutData[ui.txtOTrRun.text()] = TrRun TeRun = InData[ui.txtITeRun.currentText()] OutData[ui.txtOTeRun.text()] = TeRun except: print("Cannot load Runs!") return # Counter if ui.cbCounter.isChecked(): if not len(ui.txtITrCounter.currentText()): msgBox.setText("Please enter Train Input Counter variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return 
False if not len(ui.txtITeCounter.currentText()): msgBox.setText("Please enter Test Input Counter variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrCounter.text()): msgBox.setText("Please enter Train Output Counter variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeCounter.text()): msgBox.setText("Please enter Test Output Counter variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: TrCounter = InData[ui.txtITrCounter.currentText()] OutData[ui.txtOTrCounter.text()] = TrCounter TeCounter = InData[ui.txtITeCounter.currentText()] OutData[ui.txtOTeCounter.text()] = TeCounter except: print("Cannot load Counters!") return # Matrix Label if ui.cbmLabel.isChecked(): if not len(ui.txtITrmLabel.currentText()): msgBox.setText("Please enter Train Input Matrix Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITemLabel.currentText()): msgBox.setText("Please enter Test Input Matrix Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrmLabel.text()): msgBox.setText("Please enter Train Output Matrix Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTemLabel.text()): msgBox.setText("Please enter Test Output Matrix Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOTrmLabel.text()] = InData[ui.txtITrmLabel.currentText()] OutData[ui.txtOTemLabel.text()] = InData[ui.txtITemLabel.currentText()] except: print("Cannot load matrix lables!") return 
# Design if ui.cbDM.isChecked(): if not len(ui.txtITrDM.currentText()): msgBox.setText("Please enter Train Input Design Matrix variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeDM.currentText()): msgBox.setText("Please enter Test Input Design Matrix variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrDM.text()): msgBox.setText("Please enter Train Output Design Matrix variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTeDM.text()): msgBox.setText("Please enter Test Output Design Matrix variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOTrDM.text()] = InData[ui.txtITrDM.currentText()] OutData[ui.txtOTeDM.text()] = InData[ui.txtITeDM.currentText()] except: print("Cannot load design matrices!") return # Coordinate if ui.cbCol.isChecked(): if not len(ui.txtCol.currentText()): msgBox.setText("Please enter Coordinator variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOCol.text()): msgBox.setText("Please enter Coordinator variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOCol.text()] = InData[ui.txtCol.currentText()] except: print("Cannot load coordinator!") return # Condition if ui.cbCond.isChecked(): if not len(ui.txtCond.currentText()): msgBox.setText("Please enter Condition variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOCond.text()): msgBox.setText("Please enter Condition variable name!") msgBox.setIcon(QMessageBox.Critical) 
msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOCond.text()] = InData[ui.txtCond.currentText()] except: print("Cannot load conditions!") return # FoldID if ui.cbFoldID.isChecked(): if not len(ui.txtFoldID.currentText()): msgBox.setText("Please enter FoldID variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOFoldID.text()): msgBox.setText("Please enter FoldID variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOFoldID.text()] = InData[ui.txtFoldID.currentText()] except: print("Cannot load Fold ID!") return # FoldInfo if ui.cbFoldInfo.isChecked(): if not len(ui.txtFoldInfo.currentText()): msgBox.setText("Please enter FoldInfo variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOFoldInfo.text()): msgBox.setText("Please enter FoldInfo variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOFoldInfo.text()] = InData[ui.txtFoldInfo.currentText()] except: print("Cannot load Fold Info!") return pass # Number of Scan if ui.cbNScan.isChecked(): if not len(ui.txtITrScan.currentText()): msgBox.setText("Please enter Number of Scan variable name for Input Train!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeScan.currentText()): msgBox.setText("Please enter Number of Scan variable name for Input Test!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtOTrScan.text()): msgBox.setText("Please enter Number of Scan variable name for Output Train!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) 
msgBox.exec_() return False if not len(ui.txtOTeScan.text()): msgBox.setText("Please enter Number of Scan variable name for Output Test!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: OutData[ui.txtOTrScan.text()] = InData[ui.txtITrScan.currentText()] OutData[ui.txtOTeScan.text()] = InData[ui.txtITeScan.currentText()] except: print("Cannot load NScan!") return # Train Analysis Level print("Calculating Analysis Level for Training Set ...") TrGroupFold = None FoldStr = "" if ui.cbFSubject.isChecked(): if not ui.rbFRun.isChecked(): TrGroupFold = TrSubject FoldStr = "Subject" else: TrGroupFold = np.concatenate((TrSubject,TrRun)) FoldStr = "Subject+Run" if ui.cbFTask.isChecked(): TrGroupFold = np.concatenate((TrGroupFold,TrTaskIndex)) if TrGroupFold is not None else TrTaskIndex FoldStr = FoldStr + "+Task" if ui.cbFCounter.isChecked(): TrGroupFold = np.concatenate((TrGroupFold,TrCounter)) if TrGroupFold is not None else TrCounter FoldStr = FoldStr + "+Counter" TrGroupFold = np.transpose(TrGroupFold) TrUniqFold = np.array(list(set(tuple(i) for i in TrGroupFold.tolist()))) TrFoldIDs = np.arange(len(TrUniqFold)) + 1 TrListFold = list() for gfold in TrGroupFold: for ufoldindx, ufold in enumerate(TrUniqFold): if (ufold == gfold).all(): currentID = TrFoldIDs[ufoldindx] break TrListFold.append(currentID) TrListFold = np.int32(TrListFold) TrListFoldUniq = np.unique(TrListFold) # Test Analysis Level print("Calculating Analysis Level for Testing Set ...") TeGroupFold = None if ui.cbFSubject.isChecked(): if not ui.rbFRun.isChecked(): TeGroupFold = TeSubject else: TeGroupFold = np.concatenate((TeSubject,TeRun)) if ui.cbFTask.isChecked(): TeGroupFold = np.concatenate((TeGroupFold,TeTaskIndex)) if TeGroupFold is not None else TeTaskIndex if ui.cbFCounter.isChecked(): TeGroupFold = np.concatenate((TeGroupFold,TeCounter)) if TeGroupFold is not None else TeCounter TeGroupFold = np.transpose(TeGroupFold) TeUniqFold = 
np.array(list(set(tuple(i) for i in TeGroupFold.tolist()))) TeFoldIDs = np.arange(len(TeUniqFold)) + 1 TeListFold = list() for gfold in TeGroupFold: for ufoldindx, ufold in enumerate(TeUniqFold): if (ufold == gfold).all(): currentID = TeFoldIDs[ufoldindx] break TeListFold.append(currentID) TeListFold = np.int32(TeListFold) TeListFoldUniq = np.unique(TeListFold) # Train Partition print("Partitioning Training Data ...") TrX = list() TrShape = None if Method == "PCA": svdmodel = PCA(n_components=NumFea,copy=False,tol=Tol) elif Method == "Kernel PCA": svdmodel = KernelPCA(n_components=NumFea,kernel=Kernel,gamma=Gamma,degree=Degree,\ coef0=Coef0, alpha=Alpha, tol=Tol, max_iter=MaxIter, n_jobs=NJob,copy_X=False) else: svdmodel = IncrementalPCA(n_components=NumFea,copy=False,batch_size=Batch) for foldindx, fold in enumerate(TrListFoldUniq): dat = XTr[np.where(TrListFold == fold)] if ui.cbScale.isChecked() and ui.rbScale.isChecked(): dat = preprocessing.scale(dat) print("Data belong to View " + str(foldindx + 1) + " is scaled X~N(0,1).") dat = svdmodel.fit_transform(dat) TrX.append(dat) if TrShape is None: TrShape = np.shape(dat) else: if not(TrShape == np.shape(dat)): print("ERROR: Train, Reshape problem for Fold " + str(foldindx + 1) + ", Shape: " + str(np.shape(dat))) return print("Train: View " + str(foldindx + 1) + " is extracted. 
Shape: " + str(np.shape(dat))) print("Training Shape: " + str(np.shape(TrX))) # Test Partition print("Partitioning Testing Data ...") TeX = list() TeShape = None for foldindx, fold in enumerate(TeListFoldUniq): dat = XTe[np.where(TeListFold == fold)] if ui.cbScale.isChecked() and ui.rbScale.isChecked(): dat = preprocessing.scale(dat) print("Data belong to View " + str(foldindx + 1) + " is scaled X~N(0,1).") dat = svdmodel.fit_transform(dat) TeX.append(dat) if TeShape is None: TeShape = np.shape(dat) else: if not(TeShape == np.shape(dat)): print("Test: Reshape problem for Fold " + str(foldindx + 1)) return print("Test: View " + str(foldindx + 1) + " is extracted.") print("Testing Shape: " + str(np.shape(TeX))) model = RHA(Dim=NumFea,regularization=Regularization) print("Running Hyperalignment on Training Data ...") MappedXtr, G = model.train(TrX) print("Running Hyperalignment on Testing Data ...") MappedXte = model.test(TeX) # Train Dot Product print("Producting Training Data ...") TrHX = None TrErr = None for foldindx, fold in enumerate(TrListFoldUniq): TrErr = TrErr + (G - MappedXtr[foldindx]) if TrErr is not None else G - MappedXtr[foldindx] TrHX = np.concatenate((TrHX, MappedXtr[foldindx])) if TrHX is not None else MappedXtr[foldindx] OutData[ui.txtOTrData.text()] = TrHX foldindx = foldindx + 1 TrErr = TrErr / foldindx print("Train: alignment error ", np.linalg.norm(TrErr)) TrFoldErr.append(np.linalg.norm(TrErr)) # Train Dot Product print("Producting Testing Data ...") TeHX = None TeErr = None for foldindx, fold in enumerate(TeListFoldUniq): TeErr = TeErr + (G - MappedXte[foldindx]) if TeErr is not None else G - MappedXte[foldindx] TeHX = np.concatenate((TeHX, MappedXte[foldindx])) if TeHX is not None else MappedXte[foldindx] OutData[ui.txtOTeData.text()] = TeHX foldindx = foldindx + 1 TeErr = TeErr / foldindx print("Test: alignment error ", np.linalg.norm(TeErr)) TeFoldErr.append(np.linalg.norm(TeErr)) HAParam = dict() HAParam["Method"]= Method 
HAParam["Kernel"]= Kernel HAParam["Share"] = G HAParam["Level"] = FoldStr OutData["FunctionalAlignment"] = HAParam OutData["Runtime"] = time.time() - tic totalTime += OutData["Runtime"] print("Saving ...") io.savemat(OutFile, mdict=OutData) print("Fold " + str(fold_all) + " is DONE: " + OutFile) print("Training -> Alignment Error: mean " + str(np.mean(TrFoldErr)) + " std " + str(np.std(TrFoldErr))) print("Testing -> Alignment Error: mean " + str(np.mean(TeFoldErr)) + " std " + str(np.std(TeFoldErr))) print("Runtime: ", totalTime) print("Kernel/SVD Hyperalignment is done.") msgBox.setText("Kernel/SVD Hyperalignment is done.") msgBox.setIcon(QMessageBox.Information) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_()
def reduce(config, components, uuids=None, x_train=None, x_dev=None,
           x_test=None):
    """
    Apply Incremental Principal Components Analysis to the tf-idf vectors.

    The model is fitted on a shuffled copy of ``uuids`` when that list is
    given (and non-empty), otherwise on a shuffled copy of ``x_train``.
    The transformed matrices, the fitted model and, for every component, the
    vocabulary entry with the largest absolute loading are written under
    ``constants.dir_d``.

    :param config: configuration dictionary; ``config['batch_size']`` is
        used as the mini-batch size
    :param components: number of desired components
    :param uuids: list of selected uuids
    :param x_train: List of train set uuids
    :param x_dev: List of dev set uuids
    :param x_test: List of test set uuids
    :return: tuple ``(data, i_pca)`` where ``data`` is the transformed
        matrix for ``uuids``, or the triple ``(t_train, t_dev, t_test)`` when
        ``uuids`` is not given, and ``i_pca`` is the fitted IncrementalPCA
    """
    print('Performing feature extraction using PCA')
    mini_batch_size = config['batch_size']

    # Use a context manager so the vocabulary file is closed deterministically.
    words_path = os.path.join(constants.dir_d, constants.json_words)
    with open(words_path, 'r') as words_file:
        words = json.load(words_file)

    matrix_dir = os.path.join(constants.dir_d, constants.dir_mat)
    model_dir = os.path.join(constants.dir_d, constants.dir_mod)

    def save_matrix(file_name, matrix):
        """Write ``matrix`` as text into the matrix directory."""
        with open(os.path.join(matrix_dir, file_name), 'wb') as matrix_file:
            np.savetxt(matrix_file, matrix)

    i_pca = IncrementalPCA(n_components=components,
                           batch_size=mini_batch_size)

    # Fit on the explicit selection when present, else on the train split.
    fit_ids = uuids if uuids else x_train
    rand_uuids = random.sample(fit_ids, len(fit_ids))
    rows = len(fit_ids)

    train_pca(config, i_pca, len(rand_uuids), rand_uuids, mini_batch_size)
    print('Explained Variance Ratio {}:'.format(
        sum(i_pca.explained_variance_ratio_)))

    if uuids:
        data = transform_vectors(config, i_pca, len(uuids), uuids,
                                 mini_batch_size)
        save_matrix('pca_{}_{}.txt'.format(components, len(uuids)), data)
    else:
        t_train = transform_vectors(config, i_pca, len(x_train), x_train,
                                    mini_batch_size)
        save_matrix('pca_{}_{}_tr.txt'.format(components, len(t_train)),
                    t_train)

        t_dev = transform_vectors(config, i_pca, len(x_dev), x_dev,
                                  mini_batch_size)
        save_matrix('pca_{}_{}_dv.txt'.format(components, len(t_dev)), t_dev)

        t_test = transform_vectors(config, i_pca, len(x_test), x_test,
                                   mini_batch_size)
        save_matrix('pca_{}_{}_te.txt'.format(components, len(t_test)),
                    t_test)

        data = (t_train, t_dev, t_test)

    model_file = os.path.join(
        model_dir, 'pca_{}_{}.pkl'.format(components, rows))
    joblib.dump(i_pca, model_file)

    components_file = os.path.join(
        model_dir, "components_pca_{}_{}.txt".format(components, rows))
    # NOTE(review): the index assumes the sorted vocabulary matches the
    # feature order of the tf-idf vectors - confirm against the vectorizer.
    to_inspect = pd.DataFrame(np.absolute(i_pca.components_.T),
                              index=sorted(words),
                              columns=range(components))
    to_inspect.idxmax(axis=0, skipna=True).to_csv(components_file)

    return data, i_pca
I_gt = np.load(
    '/home/abhi/Documents/Hyper/Dataset_Hyperspectral/Ground_truths/Indian_pines_gt.npy'
)
igt = np.ravel(I_gt)

# Expand the array for scale: lay the slices I_vect[:, i, :] side by side.
# A single reshape produces the same (shape[0], shape[1] * shape[2]) layout
# as stacking the slices one by one, without re-copying the growing array.
array_expand = np.reshape(
    I_vect, (I_vect.shape[0], I_vect.shape[1] * I_vect.shape[2]))

# Data normalization
array_expand_scaled = sp.scale(array_expand.T)

pca = IncrementalPCA(n_components=num_com,
                     whiten=True,
                     copy=True,
                     batch_size=None)
array_pca = pca.fit_transform(array_expand_scaled)

# NOTE(review): the hard-coded (145, 145, 8) shape only works when
# num_com == 8 and the scene has 145 * 145 pixels - confirm.
x = array_pca.reshape(145, 145, 8)
y0 = x[:, :, 0]
y1 = x[:, :, 1]
y2 = x[:, :, 2]
y3 = x[:, :, 3]
y4 = x[:, :, 4]
y5 = x[:, :, 5]
y6 = x[:, :, 6]
y7 = x[:, :, 7]

# most prominent is the first component
plt.imshow(y0)
def test_incremental_pca_feature_names_out():
    """IncrementalPCA output features are named ``incrementalpca<i>``."""
    n_components = 2
    estimator = IncrementalPCA(n_components=n_components)
    estimator = estimator.fit(iris.data)

    observed = estimator.get_feature_names_out()
    expected = ["incrementalpca%d" % idx for idx in range(n_components)]

    assert_array_equal(expected, observed)
def whitening(X, n_components, svd_solver, chunked, chunk_size, zero_center,
              random_state=None):
    """ Whiten data (i.e. transform variables into a set of new uncorrelated
    and unit-variance variables) and reduce dimension through a PCA-like
    approach. This function handles array-like formats as well as sparse
    matrices.

    Parameters
    ----------
    X : 2D ndarray or spmatrix, shape (n_observations , n_variables)

    n_components : int
        Number of principal components to compute. If None,
        n_components = min(X.shape)

    svd_solver : str {'auto', 'full', 'arpack', 'randomized' , 'lobpcg'}
        Solver for the different PCA methods. Please note that some solvers
        may not be compatible with some of the PCA methods. See PCA,
        TruncatedSVD and IncrementalPCA from sklearn.decomposition or
        scipy.sparse.linalg.svds.

    chunked : boolean
        If True, perform an incremental PCA on segments of chunk_size. The
        incremental PCA automatically zero centers and ignores the settings
        of random_state and svd_solver.

    chunk_size : int
        Number of observations to include in each chunk. Required if
        chunked=True was passed.

    zero_center : boolean
        If True, compute standard PCA from covariance matrix. If False, omit
        zero-centering variables (uses TruncatedSVD), which allows to handle
        sparse input efficiently. Only consulted for sparse input.

    random_state : int, RandomState, optional
        Change to use different initial states for the optimization. The
        default is None.

    Returns
    -------
    X_w : 2D ndarray, shape (n_observations , n_components)
    """
    random_state = check_random_state(random_state)

    if n_components is None:
        n_components = min(X.shape)

    if chunked:
        pca = IncrementalPCA(n_components=n_components,
                             whiten=True,
                             batch_size=chunk_size)
        return pca.fit_transform(X)

    if issparse(X):
        if zero_center:
            return _pca_with_sparse(X,
                                    n_components,
                                    solver=svd_solver,
                                    random_state=random_state)

        warnings.warn(
            'TruncatedSVD is very similar to PCA, but differs in that the matrix is not centered first.'
            ' The following components still often resemble the exact PCA very closely'
        )
        pca = TruncatedSVD(n_components=n_components,
                           random_state=random_state,
                           algorithm=svd_solver)
        X_w = pca.fit_transform(X)
        # TruncatedSVD does not whiten: rescale every component to unit
        # variance, then centre the scores.
        X_w = (X_w / pca.singular_values_) * np.sqrt(X.shape[0] - 1)
        X_w -= X_w.mean(axis=0)
        return X_w

    # Forward random_state so the stochastic solvers ('randomized', 'arpack')
    # honour the seed requested by the caller, as in the sparse branches.
    pca = PCA(n_components=n_components,
              whiten=True,
              svd_solver=svd_solver,
              random_state=random_state)
    return pca.fit_transform(X)
col = [] for word in clean_graph.keys(): for wd in clean_graph[word].keys(): row.append(index[word]) col.append(index[wd]) data.append(clean_graph[word][wd]) matrix = csr_matrix((data, (row, col))) return matrix matrix = create_sparse_matrix(clean_graph, index) # In[20]: from sklearn.decomposition import IncrementalPCA import pickle chunk_size = 100 n = matrix.shape[0] pca = IncrementalPCA(n_components=15, batch_size=100) for i in range(0, n // chunk_size): rows = matrix[i * chunk_size:(i + 1) * chunk_size].toarray() pca.partial_fit(rows) pca.fit(matrix) pickle.dump(file=open('pca.pickle', 'wb'), obj=pca) pickle.dump(file=open('sparce_matrix.pickle', 'wb'), obj=matrix)
def pca_incremental(cubepath, angle_list=None, n=0, batch_size=None,
                    batch_ratio=0.1, ncomp=10, imlib='opencv',
                    interpolation='lanczos4', collapse='median', verbose=True,
                    full_output=False):
    """ Computes the full-frame PCA-ADI algorithm in batches, for processing
    fits files larger than the available system memory. It uses the
    incremental PCA algorithm from scikit-learn.

    Parameters
    ----------
    cubepath : str
        String with the path to the fits file to be opened in memmap mode.
    angle_list : array_like, 1d
        Corresponding parallactic angle for each frame. If None the
        parallactic angles are obtained from the same fits file (extension
        ``n + 1``).
    n : int optional
        The index of the HDULIST containing the data/cube.
    batch_size : int optional
        The number of frames in each batch. If None the size of the batch is
        computed wrt the available memory in the system. The value is
        clamped to the range [1, number of frames].
    batch_ratio : float
        If batch_size is None, batch_ratio indicates the % of the available
        memory that should be used by every batch.
    ncomp : int, optional
        How many PCs are used as a lower-dimensional subspace to project the
        target frames.
    imlib : str, optional
        See the documentation of the ``vip_hci.preproc.frame_rotate``
        function.
    interpolation : str, optional
        See the documentation of the ``vip_hci.preproc.frame_rotate``
        function.
    collapse : {'median', 'mean', 'sum', 'trimmean'}, str optional
        Sets the way of collapsing the frames for producing a final image.
    verbose : {True, False}, bool optional
        If True prints intermediate info and timing.
    full_output: boolean, optional
        Whether to return the final median combined image only or with other
        intermediate arrays.

    Returns
    -------
    If full_output is True the algorithm returns the incremental PCA model
    of scikit-learn, the PCs reshaped into images, the median of the
    derotated residuals for each batch, and the final frame. If full_output
    is False then the final frame is returned.

    Raises
    ------
    TypeError
        If ``cubepath`` is not a string, the data has fewer than 3
        dimensions, or the angle list length differs from the frame count.
    RuntimeError
        If ``angle_list`` is None and cannot be read from the fits file.
    """
    if verbose:
        start = time_ini()
    if not isinstance(cubepath, str):
        raise TypeError('Cubepath must be a string with the full path of your '
                        'fits file')

    # The context manager guarantees the file is closed on every exit path.
    with fits.open(cubepath, memmap=True) as hdulist:
        cube = hdulist[n].data
        if not cube.ndim > 2:
            raise TypeError('Input array is not a 3d or 4d array')

        n_frames = cube.shape[0]
        y = cube.shape[1]
        x = cube.shape[2]

        if angle_list is None:
            try:
                angle_list = hdulist[n + 1].data
            except Exception:
                raise RuntimeError('Parallactic angles were not provided')
            if angle_list is None:
                raise RuntimeError('Parallactic angles were not provided')
        if not n_frames == angle_list.shape[0]:
            raise TypeError(
                'Angle list vector has wrong length. It must equal the '
                'number of frames in the cube.')

        ipca = IncrementalPCA(n_components=ncomp)

        if batch_size is None:
            aval_mem = get_available_memory(verbose)
            total_size = cube.nbytes
            batch_size = int(n_frames /
                             (total_size / (batch_ratio * aval_mem)))
        # A batch of 0 frames would divide by zero, and a batch larger than
        # the cube is just the whole cube.
        batch_size = max(1, min(batch_size, n_frames))

        if verbose:
            msg1 = "Cube with {} frames ({:.3f} GB)"
            print(msg1.format(n_frames, cube.nbytes / 1e9))
            msg2 = "Batch size set to {} frames ({:.3f} GB)\n"
            print(msg2.format(batch_size, cube[:batch_size].nbytes / 1e9))

        # [start, stop) frame indices of every batch; the last batch holds
        # the remainder when n_frames is not a multiple of batch_size.
        bounds = [(ini, min(ini + batch_size, n_frames))
                  for ini in range(0, n_frames, batch_size)]

        # First pass: fit the principal components batch by batch.
        for intini, intfin in bounds:
            batch = cube[intini:intfin]
            if verbose:
                msg = 'Processing batch [{},{}] with shape {}'
                print(msg.format(intini, intfin, batch.shape))
                print('Batch size in memory = {:.3f} MB'.format(
                    batch.nbytes / 1e6))
            matrix = prepare_matrix(batch, verbose=False)
            ipca.partial_fit(matrix)

        if verbose:
            timing(start)

        V = ipca.components_
        mean = ipca.mean_.reshape(y, x)

        if verbose:
            print('\nReconstructing and obtaining residuals')

        # Second pass: project every batch on the PCs, derotate the
        # residuals and collapse them into one image per batch.
        medians = []
        for intini, intfin in bounds:
            batch = cube[intini:intfin] - mean
            matrix = prepare_matrix(batch, verbose=False)
            reconst = np.dot(np.dot(matrix, V.T), V)
            resid = matrix - reconst
            resid_der = cube_derotate(resid.reshape(batch.shape[0],
                                                    batch.shape[1],
                                                    batch.shape[2]),
                                      angle_list[intini:intfin],
                                      imlib=imlib,
                                      interpolation=interpolation)
            medians.append(cube_collapse(resid_der, mode=collapse))

    medians = np.array(medians)
    frame = np.median(medians, axis=0)

    if verbose:
        timing(start)

    if full_output:
        pcs = reshape_matrix(V, y, x)
        return ipca, pcs, medians, frame
    else:
        return frame
def incremental_pca(args):
    """Build an unfitted IncrementalPCA from an options mapping.

    ``args`` must provide the keys ``'n_components'`` and ``'whiten'``; the
    estimator is always created with ``copy=True``.
    """
    # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html
    n_components = args['n_components']
    whiten = args['whiten']
    estimator = IncrementalPCA(n_components=n_components,
                               whiten=whiten,
                               copy=True)
    return estimator
'kernel': ['rbf'], 'C': range(1, 100, 10), 'gamma': np.arange(0.05, 0.55, .05) } model = SVC() conf_matrix_list_of_arrays = [] scores = [] for i in range(10): for fold_ind, (train_index, test_index) in enumerate( stratified_group_k_fold(X, y, ids, k=8)): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] train_groups, test_groups = ids[train_index], ids[test_index] ipca = IncrementalPCA(n_components=X_train.shape[1] // 5, batch_size=120) ipca.fit(X_train) X_train = ipca.transform(X_train) X_test = ipca.transform(X_test) X_train, y_train = pipeline.fit_resample(X_train, y_train) #Smote clf = GridSearchCV(model, parameters, cv=5, n_jobs=4) clf.fit(X_train, y_train) pred = clf.predict(X_test) conf_matrix = confusion_matrix(y_test, pred) conf_matrix_list_of_arrays.append(conf_matrix) score = accuracy_score(y_test, pred) scores.append(score) mean_of_conf_matrix_arrays = np.mean(conf_matrix_list_of_arrays, axis=0)
vectors.append(model[word]) labels.append(word) print ('- found ' + str(len(labels)) + ' entities x ' + str(len(vectors[0])) + ' dimensions') # convert both lists into numpy vectors for reduction vectors = np.asarray(vectors) labels = np.asarray(labels) print ('- done') # if specified, reduce using IncrementalPCA first (down # to a smaller number of dimensions before the final reduction) if run_init_reduction: print ('reducing to ' + str(init_dimensions) + 'D using IncrementalPCA...') ipca = IncrementalPCA(n_components=init_dimensions) vectors = ipca.fit_transform(vectors) print ('- done') # save reduced vector space to file print ('- saving as csv...') with open(''+model_name + '-' + str(init_dimensions) + 'D.csv', 'w') as f: for i in range(len(labels)): f.write(labels[i] + ',' + ','.join(map(str, vectors[i])) + '\n') # reduce using t-SNE print ('reducing to ' + str(num_dimensions) + 'D using t-SNE...') print ('- may take a really, really (really) long time :)') vectors = np.asarray(vectors) tsne = TSNE(n_components=num_dimensions, random_state=0)
def flatten_image(img_array):
    """Return ``img_array`` reshaped to one dimension.

    The target length is ``shape[0] * shape[1]``, so the array must hold
    exactly that many elements (i.e. be two-dimensional).
    """
    n_pixels = img_array.shape[0] * img_array.shape[1]
    return img_array.reshape(n_pixels)


# %%
dataset = []
for path in paths:
    img = Image.open(str(path.resolve()))
    img = image_to_matrix(img)
    # img = flatten_image(img)
    dataset.append(img)

dataset = np.array(dataset)
print('dataset shape: {}'.format(dataset.shape))

# %%
n = dataset.shape[0]
pca = IncrementalPCA(n_components=100)
# Feeding partial_fit one sample at a time cannot work with 100 components:
# scikit-learn requires the first batch to hold at least n_components
# samples. fit() splits the data into suitably sized batches itself.
pca.fit(dataset)
r_dataset = pca.transform(dataset)
print('r_dataset.shape: {}'.format(r_dataset.shape))