Exemple #1
0
def pca_incremental(df, n_c=7):
    """Reduce the feature columns of ``df`` with an incremental PCA.

    The ``'class'`` column is dropped before fitting, so only the remaining
    columns are used as features.

    Parameters
    ----------
    df
        Object exposing ``drop(labels, axis=1)`` and containing a ``'class'``
        column (presumably a pandas DataFrame -- confirm against callers).
    n_c : int, optional
        Number of principal components to keep. Defaults to 7.

    Returns
    -------
    The value returned by ``IncrementalPCA.fit_transform`` for the features.
    """
    X = df.drop(['class'], axis=1)
    # Honour the caller-supplied component count; it used to be hard-coded
    # to 7, which silently ignored ``n_c``.
    transformer = IncrementalPCA(n_components=n_c)
    return transformer.fit_transform(X)
Exemple #2
0
                align='center',
                alpha=0.5)
plt.xticks(np.arange(len(MCAcolumns)), MCAcolumns)
plt.ylabel('Percentage')
plt.title('Explained Variance by Factor (%): Multiple Correspondence Analysis')
plt.show()
fig.savefig(''.join([
    'C:/Users/Jairo F Gudiño R/Desktop/Balance Sheet Commonality/', 'MCA',
    '.pdf'
]))
ft = mca_ben.fs_r(N=F)
# PCA Explained Variance #
MCAFactorScores = pd.DataFrame(ft, columns=MCAcolumns)
PCADataframe = pd.concat(
    [df_norm.iloc[:, range(df_norm.shape[1] - 3)], MCAFactorScores], axis=1)
PCAModel = IncrementalPCA(n_components=3)
reduced_data = PCAModel.fit_transform(PCADataframe)
explained_variancePCA = PCAModel.explained_variance_ratio_ * 100
PCAcolumns = [("F" + str(i + 1)) for i in range(3)]
fig, Graph = plt.subplots()
Graph = plt.bar(np.arange(len(PCAcolumns)),
                explained_variancePCA,
                align='center',
                alpha=0.5)
plt.xticks(np.arange(len(PCAcolumns)), PCAcolumns)
plt.ylabel('Percentage')
plt.title('Explained Variance by Factor (%): Principal Component Analysis')
plt.show()
fig.savefig(''.join([
    'C:/Users/Jairo F Gudiño R/Desktop/Balance Sheet Commonality/', 'PCA',
    '.pdf'
def pca_transform(X, n_components):
    """Fit an ``IncrementalPCA`` on ``X`` and return ``X`` projected onto it.

    Parameters
    ----------
    X
        Data used both to fit the model and as input to the projection.
    n_components
        Forwarded unchanged as ``IncrementalPCA(n_components=...)``.

    Returns
    -------
    The result of ``transform(X)`` on the fitted model.
    """
    model = IncrementalPCA(n_components=n_components)
    # ``fit`` returns the estimator itself, so the calls can be chained.
    return model.fit(X).transform(X)
Exemple #4
0
    X_train, X_test, y_train, y_test = train_test_split(dt_features,
                                                        dt_target,
                                                        test_size=0.3,
                                                        random_state=42)

    # To verify that train_test_split partitioned the features and the target
    # consistently, print the shapes of the training sets; both must have the
    # same number of rows.
    print(X_train.shape)
    print(y_train.shape)

    # By default, n_components = min(n_samples, n_features)
    pca = PCA(n_components=3)
    pca.fit(X_train)

    ipca = IncrementalPCA(n_components=3, batch_size=10)
    ipca.fit(X_train)

    plt.plot(range(len(pca.explained_variance_)),
             pca.explained_variance_ratio_)
    plt.show()

    logistic = LogisticRegression(solver='lbfgs')

    dt_train = pca.transform(X_train)
    dt_test = pca.transform(X_test)

    logistic.fit(dt_train, y_train)
    print("Score PCA: ", logistic.score(dt_test, y_test))

    dt_train = ipca.transform(X_train)
Exemple #5
0
def pca(
    data: Union[AnnData, np.ndarray, spmatrix],
    n_comps: int = N_PCS,
    zero_center: Optional[bool] = True,
    svd_solver: str = 'auto',
    random_state: int = 0,
    return_info: bool = False,
    use_highly_variable: Optional[bool] = None,
    dtype: str = 'float32',
    copy: bool = False,
    chunked: bool = False,
    chunk_size: Optional[int] = None,
) -> Union[AnnData, np.ndarray, spmatrix]:
    """Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition. Uses the
    implementation of *scikit-learn* [Pedregosa11]_.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` × ``n_vars``.
        Rows correspond to cells and columns to genes.
    n_comps
        Number of principal components to compute. If it exceeds the number
        of variables of the data it is reduced to ``n_vars - 1``.
    zero_center
        If ``True``, compute standard PCA from covariance matrix
        (uses :class:`~sklearn.decomposition.PCA`; sparse input is densified).
        If ``False``, omit zero-centering variables
        (uses :class:`~sklearn.decomposition.TruncatedSVD`),
        which allows to handle sparse input efficiently.
        Passing ``None`` decides automatically based on sparseness of the data.
        Ignored when ``chunked=True``.
    svd_solver
        SVD solver to use:

        ``'arpack'``
          for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`)

        ``'randomized'``
          for the randomized algorithm due to Halko (2009).

        ``'auto'`` (the default)
          chooses automatically depending on the size of the problem.

        Only forwarded to :class:`~sklearn.decomposition.PCA`, i.e. only used
        when ``zero_center`` is true and ``chunked=False``.
    random_state
        Change to use different initial states for the optimization.
    return_info
        Only relevant when not passing an :class:`~anndata.AnnData`:
        see “**Returns**”.
    use_highly_variable
        Whether to use highly variable genes only, stored in
        ``.var['highly_variable']``.
        By default uses them if they have been determined beforehand.
        Passing ``True`` without that column raises a :class:`ValueError`.
    dtype
        Numpy data type string to which to convert the result.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked
        If ``True``, perform an incremental PCA on segments of ``chunk_size``.
        The incremental PCA automatically zero centers and ignores settings of
        ``random_state`` and ``svd_solver``. If ``False``, perform a full PCA.
    chunk_size
        Number of observations to include in each chunk.
        Passed unchanged to ``chunked_X`` when ``chunked=True``; it is not
        validated here.

    Returns
    -------
    X_pca : :class:`scipy.sparse.spmatrix` or :class:`numpy.ndarray`
        If `data` is array-like and ``return_info=False`` was passed,
        this function only returns `X_pca`…
    (X_pca, components, variance_ratio, variance) : tuple
        …if `data` is array-like and ``return_info=True`` was passed, the
        coordinates are returned together with the fitted model's
        ``components_``, ``explained_variance_ratio_`` and
        ``explained_variance_``…
    adata : anndata.AnnData
        …otherwise if ``copy=True`` it returns or else adds fields to ``adata``
        (and returns ``None``):

        ``.obsm['X_pca']``
             PCA representation of data.

        ``.varm['PCs']``
             The principal components containing the loadings.

        ``.uns['pca']['variance_ratio']``
             Ratio of explained variance.

        ``.uns['pca']['variance']``
             Explained variance, equivalent to the eigenvalues of the covariance matrix.

    Raises
    ------
    ValueError
        If ``use_highly_variable=True`` but ``.var['highly_variable']`` is
        missing.
    """
    # chunked calculation is not randomized, anyways
    if svd_solver in {'auto', 'randomized'} and not chunked:
        logg.info(
            'Note that scikit-learn\'s randomized PCA might not be exactly '
            'reproducible across different computational platforms. For exact '
            'reproducibility, choose `svd_solver=\'arpack\'.` This will likely '
            'become the Scanpy default in the future.')

    # Always work on an AnnData: wrap raw arrays, copy only when requested.
    data_is_AnnData = isinstance(data, AnnData)
    if data_is_AnnData:
        adata = data.copy() if copy else data
    else:
        adata = AnnData(data)

    logg.info('computing PCA with n_comps =', n_comps, r=True)

    # NOTE(review): the clamp uses the full ``adata.n_vars``; it runs before
    # the highly-variable subset below is taken and does not look at the
    # number of observations.
    if adata.n_vars < n_comps:
        n_comps = adata.n_vars - 1
        logg.msg('reducing number of computed PCs to',
                 n_comps,
                 'as dim of data is only',
                 adata.n_vars,
                 v=4)

    if use_highly_variable is True and 'highly_variable' not in adata.var.keys(
    ):
        raise ValueError(
            'Did not find adata.var[\'highly_variable\']. '
            'Either your data already only consists of highly-variable genes '
            'or consider running `pp.filter_genes_dispersion` first.')
    # ``None`` means: use the highly variable genes iff they are annotated.
    if use_highly_variable is None:
        use_highly_variable = True if 'highly_variable' in adata.var.keys(
        ) else False
    if use_highly_variable:
        logg.info('computing PCA on highly variable genes')
    # ``adata_comp`` is the data actually decomposed (possibly a gene subset).
    adata_comp = adata[:, adata.
                       var['highly_variable']] if use_highly_variable else adata

    if chunked:
        if not zero_center or random_state or svd_solver != 'auto':
            logg.msg('Ignoring zero_center, random_state, svd_solver', v=4)

        from sklearn.decomposition import IncrementalPCA

        X_pca = np.zeros((adata_comp.X.shape[0], n_comps), adata_comp.X.dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        # First pass over the chunks: fit the model incrementally.
        for chunk, _, _ in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        # Second pass: project every chunk into its rows of the output.
        for chunk, start, end in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    else:
        # ``None`` selects zero-centering for dense input only.
        if zero_center is None:
            zero_center = not issparse(adata_comp.X)
        if zero_center:
            from sklearn.decomposition import PCA
            if issparse(adata_comp.X):
                logg.msg(
                    '    as `zero_center=True`, '
                    'sparse input is densified and may '
                    'lead to huge memory consumption',
                    v=4)
                X = adata_comp.X.toarray(
                )  # Copying the whole adata_comp.X here, could cause memory problems
            else:
                X = adata_comp.X
            pca_ = PCA(n_components=n_comps,
                       svd_solver=svd_solver,
                       random_state=random_state)
        else:
            from sklearn.decomposition import TruncatedSVD
            logg.msg(
                '    without zero-centering: \n'
                '    the explained variance does not correspond to the exact statistical defintion\n'
                '    the first component, e.g., might be heavily influenced by different means\n'
                '    the following components often resemble the exact PCA very closely',
                v=4)
            pca_ = TruncatedSVD(n_components=n_comps,
                                random_state=random_state)
            X = adata_comp.X
        X_pca = pca_.fit_transform(X)

    # Convert the coordinates only when their dtype differs from ``dtype``.
    if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype)

    if data_is_AnnData:
        adata.obsm['X_pca'] = X_pca
        if use_highly_variable:
            # Loadings are stored for all genes; rows of genes that were not
            # part of the decomposition stay zero.
            adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
            adata.varm['PCs'][
                adata.var['highly_variable']] = pca_.components_.T
        else:
            adata.varm['PCs'] = pca_.components_.T
        adata.uns['pca'] = {}
        adata.uns['pca']['variance'] = pca_.explained_variance_
        adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
        logg.info('    finished', t=True)
        logg.msg(
            'and added\n'
            '    \'X_pca\', the PCA coordinates (adata.obs)\n'
            '    \'PC1\', \'PC2\', ..., the loadings (adata.var)\n'
            '    \'pca_variance\', the variance / eigenvalues (adata.uns)\n'
            '    \'pca_variance_ratio\', the variance ratio (adata.uns)',
            v=4)
        # In-place mode (``copy=False``) signals success by returning None.
        return adata if copy else None
    else:
        logg.info('    finished', t=True)
        if return_info:
            return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_
        else:
            return X_pca
Exemple #6
0
        self.X = X

    def chunked(self, chunks):
        """Yield ``self.X`` as ``chunks`` consecutive slices.

        The length of the k-th slice equals ``len(self.X[k::chunks])``, so
        the slice lengths differ by at most one and together cover all of
        ``self.X`` in order.
        """
        offset = 0
        for k in range(chunks):
            # The strided view is only used to obtain this slice's length.
            size = len(self.X[k::chunks])
            yield self.X[offset:offset + size]
            offset += size


# Random data wrapped in the chunk-yielding helper class.
D = LikeAnnData(np.random.rand(100000, 1000))

n_comp = 80
n_chunks = 100

ipca = IncrementalPCA(n_components=n_comp)

print('Training IPCA')

# First pass: update the model with one chunk at a time.
for chunk in D.chunked(n_chunks):
    ipca.partial_fit(chunk)

print('Fitting IPCA')

# Second pass: project every chunk. The pieces are collected and stacked
# once at the end; calling np.vstack inside the loop would copy the growing
# result on every iteration.
transformed_chunks = []
for chunk in D.chunked(n_chunks):
    Tr = ipca.transform(chunk)
    transformed_chunks.append(Tr)

# Keep the previous result for the degenerate "no chunks" case.
OutIPCA = np.vstack(transformed_chunks) if transformed_chunks else np.array([])

print('Training and fitting PCA')
# Scatter plot of the first two PCA columns, colored by ``y``.
pca_points = X_scaled_pca.values
plt.scatter(pca_points[:, 0], pca_points[:, 1], c=y, s=50, cmap='viridis')
plt.title('PCA sklearn clusters')
plt.xlabel('PC1')
plt.ylabel('PC2')
#plt.savefig(fname='numerical_pca2_sklearn')
plt.show()

# In[24]:

# Incremental PCA: fit and project in one step, then wrap the result in a
# DataFrame.
n_components = 2
ipca_model = IncrementalPCA(n_components=n_components)
X_scaled_ipca = pd.DataFrame(ipca_model.fit_transform(X_num_scaled))
X_scaled_ipca.head()

# In[25]:

# Same scatter plot for the incremental PCA projection.
ipca_points = X_scaled_ipca.values
plt.scatter(ipca_points[:, 0], ipca_points[:, 1], c=y, s=50, cmap='viridis')
plt.title('Incremental PCA sklearn clusters')
plt.xlabel('PC1')
plt.ylabel('PC2')
#plt.savefig(fname='numerical_pca2_sklearn_incremental')
plt.show()
def test_incremental_pca_validation():
    """Check that ``fit`` raises ``ValueError`` for invalid ``n_components``.

    The data has two features; the values tried are -1, 0, .99 and 3.
    """
    X = [[0, 1], [1, 0]]
    invalid_values = (-1, 0, .99, 3)
    for n_components in invalid_values:
        # Constructing the estimator happens outside the checked call; only
        # ``fit`` is expected to raise.
        estimator = IncrementalPCA(n_components, batch_size=10)
        assert_raises(ValueError, estimator.fit, X)
import numpy as np
from sklearn.decomposition import IncrementalPCA
from basic_classes.helper import get_data_from_csv
from basic_classes.get_input import Input
from basic_classes.constants import preciser_csv_path, pca_dir
from datetime import datetime
import joblib

# Create the IncrementalPCA model and save it (no fit has been called on it
# at this point) under a file name built from today's day and month.
dt = datetime.now()
current_day, current_month = str(dt.day), str(dt.month)
pca_transformer = IncrementalPCA(n_components=512, batch_size=100)
filename = 'pca' + current_day + '_' + current_month + '.joblib'
joblib.dump(pca_transformer, pca_dir + filename)

# Get input names
input_names, output_pois = get_data_from_csv(preciser_csv_path)
n_train = n_samples = input_names.shape[0]
# All samples are used for training, so this slice keeps every name.
input_names = input_names[:n_train]
in_obj = Input('PCA', 500, 512, 'melspectrogram')

x_train, batch_count = [], 0
for i in range(n_train):
    sample_name = input_names[i]
    print(
        '\n=========================================================================='
    )
    print("Reading: " + sample_name)
    try:
Exemple #10
0
def fast_pca(*x,
             n_components=None,
             algo='pca',
             y=None,
             batch_size=1024,
             return_model=False,
             random_state=1234):
  r""" A shortcut for many different PCA algorithms

  Arguments:
    x : {list, tuple}
      list of matrices for transformation, the first matrix will
      be used for training
    n_components : {None, int}
      number of PCA components
    algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
      different PCA algorithm:
        'ipca' - IncrementalPCA,
        'ppca' - Probabilistic PCA,
        'sppca' - Supervised Probabilistic PCA,
        'plda' - Probabilistic LDA,
        'rpca' - randomized PCA using randomized SVD
        'pca'  - Normal PCA
    y : {numpy.ndarray, None}
      required for labels in case of `sppca` and `plda`
    batch_size : int (default: 1024)
      batch size, only used for IncrementalPCA
    return_model : bool (default: False)
      if True, return the trained PCA model as the FIRST return
    random_state : int (default: 1234)
      forwarded to every model except IncrementalPCA

  Returns:
    the transformed training matrix when only one matrix is given,
    otherwise a tuple of all transformed matrices in input order; when
    `return_model=True` the trained model is prepended as the first element.

  Raises:
    ValueError : if `algo` is not one of the supported names
    RuntimeError : if `algo` is 'sppca' or 'plda' and `y` is None

  Note:
    inputs with more than 2 dimensions are flattened to 2D before training;
    the outputs are reshaped back to the input shape only when
    `n_components` is None.
  """
  try:
    from cuml.decomposition import PCA as cuPCA
  except ImportError:
    cuPCA = None

  batch_size = int(batch_size)
  algo = str(algo).lower()
  if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
    # the message must list every accepted value, including 'ipca'
    raise ValueError("`algo` must be one of the following: 'pca', 'ipca', "
                     "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" %
                     algo)
  if algo in ('sppca', 'plda') and y is None:
    raise RuntimeError(
        "`y` must be not None if `algo='sppca'` or `algo='plda'`")
  x = flatten_list(x, level=None)
  # ====== check input ====== #
  x_train = x[0]
  x_test = x[1:]
  input_shape = None
  if x_train.ndim > 2:  # only 2D for PCA
    input_shape = (-1,) + x_train.shape[1:]
    new_shape = (-1, np.prod(input_shape[1:]))
    x_train = np.reshape(x_train, new_shape)
    x_test = [np.reshape(mat, new_shape) for mat in x_test]
    if n_components is not None:  # no need to reshape back
      input_shape = None
  # ====== train PCA ====== #
  if algo == 'sppca':
    pca = SupervisedPPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'plda':
    from odin.ml import PLDA
    pca = PLDA(n_phi=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'pca':
    # the cuml implementation is only used for large inputs, when available
    if x_train.shape[1] > 1000 and x_train.shape[0] > 1e5 and cuPCA is not None:
      pca = cuPCA(n_components=n_components, random_state=random_state)
    else:
      pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  elif algo == 'rpca':
    # we copy the implementation of RandomizedPCA because
    # it is significantly faster than PCA(svd_solver='randomize')
    pca = RandomizedPCA(n_components=n_components,
                        iterated_power=2,
                        random_state=random_state)
    pca.fit(x_train)
  elif algo == 'ipca':
    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    prog = Progbar(target=x_train.shape[0],
                   print_report=False,
                   print_summary=False,
                   name="Fitting PCA")
    # train on one batch of rows at a time
    for start, end in batching(batch_size=batch_size,
                               n=x_train.shape[0],
                               seed=1234):
      pca.partial_fit(x_train[start:end], check_input=False)
      prog.add(end - start)
  elif algo == 'ppca':
    pca = PPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  # ====== transform ====== #
  x_train = pca.transform(x_train)
  x_test = [pca.transform(mat) for mat in x_test]
  # reshape back to original shape if necessary
  if input_shape is not None:
    x_train = np.reshape(x_train, input_shape)
    x_test = [np.reshape(mat, input_shape) for mat in x_test]
  # return the results
  if len(x_test) == 0:
    return x_train if not return_model else (pca, x_train)
  return tuple([x_train] +
               x_test) if not return_model else tuple([pca, x_train] + x_test)
Exemple #11
0
# Customers whose review count is below the (rounded) 80% quantile are
# collected for removal.
df_cust_summary.index = df_cust_summary.index.map(int)
review_counts = df_cust_summary['count']
cust_benchmark = round(review_counts.quantile(0.8), 0)
below_benchmark = df_cust_summary['count'] < cust_benchmark
drop_cust_list = df_cust_summary[below_benchmark].index
print('Customer minimum times of review: {}'.format(cust_benchmark))

# Remove the listed movies first, then the listed users.
print('Original Shape: {}'.format(df.shape))
keep_movie = ~df['MovieId'].isin(drop_movie_list)
df = df[keep_movie]
keep_user = ~df['User'].isin(drop_cust_list)
df = df[keep_user]
print('After Trim Shape: {}'.format(df.shape))

# Build the user-movie matrix: one row per user, one column per movie,
# keeping the last rating of duplicated pairs and 0 for missing ratings.
df = df.drop_duplicates(subset=['User', 'MovieId'], keep="last")
ratings = df.pivot(index='User', columns='MovieId', values='Rating')
df = ratings.fillna(0)

# Reduce every user vector to 2 numbers so it can be plotted and clustered.
pca = IncrementalPCA(n_components=2, batch_size=2)
transformed_matrix = pca.fit_transform(df)

# K-means cluster label of every user (4 clusters).
kmeans = KMeans(n_clusters=4).fit_predict(transformed_matrix)

# Scatter trace: one marker per user, colored by cluster, hover shows the
# user id.
user_trace = go.Scatter(
    x=transformed_matrix[:, 0],
    y=transformed_matrix[:, 1],
    text=list(df.index),
    hoverinfo="text",
    mode="markers",
    marker=dict(color=kmeans),
)
data = [user_trace]
Exemple #12
0
def pca_incremental(cube,
                    angle_list,
                    batch=0.25,
                    ncomp=1,
                    imlib='opencv',
                    interpolation='lanczos4',
                    collapse='median',
                    verbose=True,
                    full_output=False,
                    return_residuals=False,
                    start_time=None):
    """ Computes the full-frame PCA-ADI algorithm in batches, for processing
    fits files larger than the available system memory. It uses the incremental
    PCA algorithm from Sklearn. There is no ``scaling`` parameter as in other
    PCA algorithms in ``VIP``, but by default this implementation returns a
    temporally mean-centered frame ("temp-mean").

    Parameters
    ----------
    cube : str or numpy ndarray
        Input cube as numpy array or string with the path to the fits file to be
        opened in memmap mode.
    angle_list : str or numpy ndarray
        Corresponding parallactic angle for each frame.
    batch : int or float, optional
        When int it corresponds to the number of frames in each batch. If a
        float in the interval (0, 1] is passed then the size of the batch is
        computed wrt the available memory in the system.
    ncomp : int, optional
        How many PCs are used as a lower-dimensional subspace to project the
        target frames.
    imlib : str, optional
        See the documentation of the ``vip_hci.preproc.frame_rotate`` function.
    interpolation : str, optional
        See the documentation of the ``vip_hci.preproc.frame_rotate`` function.
    collapse : {'median', 'mean', 'sum', 'trimmean'}, str optional
        Sets the way of collapsing the frames for producing a final image.
    verbose : {True, False}, bool optional
        If True prints intermediate info and timing.
    full_output : boolean, optional
        Whether to return the final median combined image only or with other
        intermediate arrays.
    return_residuals : bool, optional
        If True, only the cube of residuals is returned (before de-rotating).
    start_time : None or datetime.datetime, optional
        Used when embedding this function in the main ``pca`` function. The
        object datetime.datetime is the global starting time. If None, it
        initiates its own counter.

    Returns
    -------
    frame : numpy ndarray
        [return_residuals=False] Final frame (2d array).
    ipca : scikit-learn model
        [full_output=True, return_residuals=False] The incremental PCA model of
        scikit-learn.
    pcs : numpy ndarray
        [full_output=True, return_residuals=False] Principal components reshaped
        into images.
    medians : numpy ndarray
        [full_output=True, return_residuals=False] The median of the derotated
        residuals for each batch.
    cube_residuals : numpy ndarray
        [return_residuals=True] Cube of residuals.

    Raises
    ------
    TypeError
        If ``cube``, ``angle_list``, ``ncomp`` or ``batch`` have an
        unsupported type, if the cube is not 3d, or if ``angle_list`` does not
        have one entry per frame (unless ``return_residuals=True``).
    ValueError
        If a float ``batch`` lies outside (0, 1] or the resulting batch size
        is smaller than one frame.

    """
    if start_time is None:
        start_time = time_ini(verbose)
        verbose_memcheck = True
    else:
        verbose_memcheck = False

    # checking cube and angle_list data types
    if not isinstance(cube, (np.ndarray, str)):
        raise TypeError('`cube` must be a str (full path on disk) or a numpy '
                        'array')
    if not isinstance(angle_list, (np.ndarray, str)):
        raise TypeError('`angle_list` must be a str (full path on disk) or a '
                        'numpy array')

    # opening data
    if isinstance(cube, str):
        # assuming the first HDULIST contains the datacube
        hdulist = open_fits(cube, n=0, return_memmap=True)
        cube = hdulist.data
    # Exactly three dimensions are required: the shape is unpacked into
    # three values right below, so a 4d input must be rejected here as well.
    if cube.ndim != 3:
        raise TypeError('Input array is not a 3d array')
    n_frames, y, x = cube.shape

    # checking angles length and ncomp
    if isinstance(angle_list, str):
        angle_list = open_fits(angle_list)
    angle_list = check_pa_vector(angle_list)
    if not n_frames == angle_list.shape[0] and not return_residuals:
        raise TypeError('`angle_list` vector has wrong length. It must be the '
                        'same as the number of frames in the cube')
    if not isinstance(ncomp, (int, float)):
        raise TypeError("`ncomp` must be an int or a float in the ADI case")
    if ncomp > n_frames:
        ncomp = n_frames
        msg = 'Number of PCs too high (max PCs={}), using {} PCs instead.'
        print(msg.format(n_frames, ncomp))

    # checking memory and determining batch size
    cube_size = cube.nbytes
    aval_mem = get_available_memory(verbose_memcheck)
    if isinstance(batch, int):  # the batch size in n_fr
        batch_size = batch
    elif isinstance(batch, float):  # the batch ratio wrt available memory
        # Previously a float outside (0, 1) -- including the documented
        # value 1.0 -- left ``batch_size`` undefined and crashed later.
        if not 0 < batch <= 1:
            raise ValueError("when `batch` is a float it must be in the "
                             "interval (0, 1]")
        batch_size = min(int(n_frames * (batch * aval_mem) / cube_size),
                         n_frames)
    else:
        raise TypeError("`batch` must be an int or float")
    # A batch of zero (or fewer) frames cannot be processed and would make
    # the batch-count division below fail.
    if batch_size < 1:
        raise ValueError("the batch size must be at least one frame; got "
                         "{}".format(batch_size))

    if verbose:
        msg1 = "Cube size = {:.3f} GB ({} frames)"
        print(msg1.format(cube_size / 1e9, n_frames))
        msg2 = "Batch size = {} frames ({:.3f} GB)\n"
        print(msg2.format(batch_size, cube[:batch_size].nbytes / 1e9))

    n_batches = n_frames // batch_size  # floor/int division
    remaining_frames = n_frames % batch_size
    if remaining_frames > 0:
        n_batches += 1  # one extra, shorter batch for the leftover frames

    # computing the PCA model for each batch
    ipca = IncrementalPCA(n_components=ncomp)

    for i in range(n_batches):
        intini = i * batch_size
        intfin = (i + 1) * batch_size
        frames = cube[intini:min(n_frames, intfin)]
        msg = 'Batch {}/{}\tshape: {}\tsize: {:.1f} MB'
        if verbose:
            print(msg.format(i + 1, n_batches, frames.shape,
                             frames.nbytes / 1e6))
        matrix = prepare_matrix(frames, verbose=False)
        ipca.partial_fit(matrix)

    if verbose:
        timing(start_time)

    # getting PCs and the mean in order to center each batch
    V = ipca.components_
    mean = ipca.mean_.reshape(y, x)

    if verbose:
        print('\nReconstructing and obtaining residuals')

    if return_residuals:
        cube_residuals = np.empty((n_frames, y, x))
    else:
        medians = []

    # Second pass: project every mean-subtracted batch onto the PCs and keep
    # what the low-rank reconstruction does not explain.
    for i in range(n_batches):
        intini = i * batch_size
        intfin = (i + 1) * batch_size
        frames = cube[intini:min(n_frames, intfin)] - mean
        matrix = prepare_matrix(frames, verbose=False)
        reconst = np.dot(np.dot(matrix, V.T), V)
        resid = matrix - reconst
        resid_reshaped = resid.reshape(frames.shape)
        if return_residuals:
            # slicing past ``n_frames`` is clamped, so the last (shorter)
            # batch fits as well
            cube_residuals[intini:intfin] = resid_reshaped
        else:
            resid_der = cube_derotate(resid_reshaped,
                                      angle_list[intini:intfin],
                                      imlib=imlib,
                                      interpolation=interpolation)
            medians.append(cube_collapse(resid_der, mode=collapse))

    # drop the references to the last batch before building the result
    del matrix
    del frames

    if return_residuals:
        return cube_residuals

    medians = np.array(medians)
    frame = np.median(medians, axis=0)

    if verbose:
        timing(start_time)

    if full_output:
        pcs = reshape_matrix(V, y, x)
        return frame, ipca, pcs, medians
    return frame
Exemple #13
0
    def btnConvert_click(self):
        msgBox = QMessageBox()
        totalTime = 0
        # Batch
        try:
            Batch = np.int32(ui.txtBatch.text())
        except:
            msgBox.setText("Size of batch is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        if Batch == 0:
            Batch = None

        # Kernel
        Kernel = ui.cbKernel.currentText()
        # Method
        Method = ui.cbMethod.currentText()

        # Gamma
        try:
            Gamma = np.float(ui.txtGamma.text())
        except:
            msgBox.setText("Gamma is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Degree
        try:
            Degree = np.int32(ui.txtDegree.text())
        except:
            msgBox.setText("Degree is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Coef0
        try:
            Coef0 = np.float(ui.txtCoef0.text())
        except:
            msgBox.setText("Coef0 is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Alpha
        try:
            Alpha = np.int32(ui.txtAlpha.text())
        except:
            msgBox.setText("Alpha is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Tol
        try:
            Tol = np.float(ui.txtTole.text())
        except:
            msgBox.setText("Tolerance is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # MaxIte
        try:
            MaxIter = np.int32(ui.txtMaxIter.text())
        except:
            msgBox.setText("Maximum number of iterations is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        if MaxIter <= 0:
            MaxIter = None

        # Number of Job
        try:
            NJob = np.int32(ui.txtJobs.text())
        except:
            msgBox.setText("The number of parallel jobs is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        if NJob < -1 or NJob == 0:
            msgBox.setText(
                "The number of parallel jobs must be -1 or greater than 0!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        try:
            FoldFrom = np.int32(ui.txtFoldFrom.text())
            FoldTo = np.int32(ui.txtFoldTo.text())
        except:
            print("Please check fold parameters!")
            return

        if FoldTo < FoldFrom:
            print("Please check fold parameters!")
            return

        for fold_all in range(FoldFrom, FoldTo + 1):
            tic = time.time()
            # OutFile
            OutFile = ui.txtOutFile.text()
            OutFile = OutFile.replace("$FOLD$", str(fold_all))
            if not len(OutFile):
                msgBox.setText("Please enter out file!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            # InFile
            InFile = ui.txtInFile.text()
            InFile = InFile.replace("$FOLD$", str(fold_all))
            if not len(InFile):
                msgBox.setText("Please enter input file!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not os.path.isfile(InFile):
                msgBox.setText("Input file not found!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            InData = io.loadmat(InFile)
            OutData = dict()
            OutData["imgShape"] = InData["imgShape"]

            # Data
            if not len(ui.txtITrData.currentText()):
                msgBox.setText("Please enter Input Train Data variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtITeData.currentText()):
                msgBox.setText("Please enter Input Test Data variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtOTrData.text()):
                msgBox.setText("Please enter Output Train Data variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtOTeData.text()):
                msgBox.setText("Please enter Output Test Data variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            try:
                XTr = InData[ui.txtITrData.currentText()]
                XTe = InData[ui.txtITeData.currentText()]

                if ui.cbScale.isChecked():
                    XTr = preprocessing.scale(XTr)
                    XTe = preprocessing.scale(XTe)
                    print("Whole of data is scaled X~N(0,1).")
            except:
                print("Cannot load data")
                return

            try:
                NumFea = np.int32(ui.txtNumFea.text())
            except:
                msgBox.setText("Number of features is wrong!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if NumFea < 0:
                msgBox.setText("Number of features must be greater than zero!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            if NumFea > np.shape(XTr)[1]:
                msgBox.setText("Number of features is wrong!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            if NumFea > np.shape(XTe)[1]:
                msgBox.setText("Number of features is wrong!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            # Label
            if not len(ui.txtITrLabel.currentText()):
                msgBox.setText("Please enter Train Input Label variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtITeLabel.currentText()):
                msgBox.setText("Please enter Test Input Label variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtOTrLabel.text()):
                msgBox.setText(
                    "Please enter Train Output Label variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtOTeLabel.text()):
                msgBox.setText("Please enter Test Output Label variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            try:
                OutData[ui.txtOTrLabel.text()] = InData[
                    ui.txtITrLabel.currentText()]
                OutData[ui.txtOTeLabel.text()] = InData[
                    ui.txtITeLabel.currentText()]
            except:
                print("Cannot load labels!")

            # Subject
            if not len(ui.txtITrSubject.currentText()):
                msgBox.setText(
                    "Please enter Train Input Subject variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtITeSubject.currentText()):
                msgBox.setText(
                    "Please enter Test Input Subject variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtOTrSubject.text()):
                msgBox.setText(
                    "Please enter Train Output Subject variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtOTeSubject.text()):
                msgBox.setText(
                    "Please enter Test Output Subject variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            try:
                TrSubject = InData[ui.txtITrSubject.currentText()]
                OutData[ui.txtOTrSubject.text()] = TrSubject
                TeSubject = InData[ui.txtITeSubject.currentText()]
                OutData[ui.txtOTeSubject.text()] = TeSubject
            except:
                print("Cannot load Subject IDs")
                return

            # Task
            if ui.cbTask.isChecked():
                if not len(ui.txtITrTask.currentText()):
                    msgBox.setText(
                        "Please enter Input Train Task variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITeTask.currentText()):
                    msgBox.setText(
                        "Please enter Input Test Task variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrTask.text()):
                    msgBox.setText(
                        "Please enter Output Train Task variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTeTask.text()):
                    msgBox.setText(
                        "Please enter Output Test Task variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    TrTask = InData[ui.txtITrTask.currentText()]
                    OutData[ui.txtOTrTask.text()] = TrTask
                    TeTask = InData[ui.txtITeTask.currentText()]
                    OutData[ui.txtOTeTask.text()] = TeTask
                    TrTaskIndex = TrTask.copy()
                    for tasindx, tas in enumerate(np.unique(TrTask)):
                        TrTaskIndex[TrTask == tas] = tasindx + 1
                    TeTaskIndex = TeTask.copy()
                    for tasindx, tas in enumerate(np.unique(TeTask)):
                        TeTaskIndex[TeTask == tas] = tasindx + 1
                except:
                    print("Cannot load Tasks!")
                    return

            # Run
            if ui.cbRun.isChecked():
                if not len(ui.txtITrRun.currentText()):
                    msgBox.setText(
                        "Please enter Train Input Run variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITeRun.currentText()):
                    msgBox.setText(
                        "Please enter Test Input Run variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrRun.text()):
                    msgBox.setText(
                        "Please enter Train Output Run variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTeRun.text()):
                    msgBox.setText(
                        "Please enter Test Output Run variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    TrRun = InData[ui.txtITrRun.currentText()]
                    OutData[ui.txtOTrRun.text()] = TrRun
                    TeRun = InData[ui.txtITeRun.currentText()]
                    OutData[ui.txtOTeRun.text()] = TeRun
                except:
                    print("Cannot load Runs!")
                    return

            # Counter
            if ui.cbCounter.isChecked():
                if not len(ui.txtITrCounter.currentText()):
                    msgBox.setText(
                        "Please enter Train Input Counter variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITeCounter.currentText()):
                    msgBox.setText(
                        "Please enter Test Input Counter variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrCounter.text()):
                    msgBox.setText(
                        "Please enter Train Output Counter variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTeCounter.text()):
                    msgBox.setText(
                        "Please enter Test Output Counter variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    TrCounter = InData[ui.txtITrCounter.currentText()]
                    OutData[ui.txtOTrCounter.text()] = TrCounter
                    TeCounter = InData[ui.txtITeCounter.currentText()]
                    OutData[ui.txtOTeCounter.text()] = TeCounter
                except:
                    print("Cannot load Counters!")
                    return

            # Matrix Label
            if ui.cbmLabel.isChecked():
                if not len(ui.txtITrmLabel.currentText()):
                    msgBox.setText(
                        "Please enter Train Input Matrix Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITemLabel.currentText()):
                    msgBox.setText(
                        "Please enter Test Input Matrix Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrmLabel.text()):
                    msgBox.setText(
                        "Please enter Train Output Matrix Label variable name!"
                    )
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTemLabel.text()):
                    msgBox.setText(
                        "Please enter Test Output Matrix Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOTrmLabel.text()] = InData[
                        ui.txtITrmLabel.currentText()]
                    OutData[ui.txtOTemLabel.text()] = InData[
                        ui.txtITemLabel.currentText()]
                except:
                    print("Cannot load matrix lables!")
                    return

            # Design
            if ui.cbDM.isChecked():
                if not len(ui.txtITrDM.currentText()):
                    msgBox.setText(
                        "Please enter Train Input Design Matrix variable name!"
                    )
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITeDM.currentText()):
                    msgBox.setText(
                        "Please enter Test Input Design Matrix variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrDM.text()):
                    msgBox.setText(
                        "Please enter Train Output Design Matrix variable name!"
                    )
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTeDM.text()):
                    msgBox.setText(
                        "Please enter Test Output Design Matrix variable name!"
                    )
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOTrDM.text()] = InData[
                        ui.txtITrDM.currentText()]
                    OutData[ui.txtOTeDM.text()] = InData[
                        ui.txtITeDM.currentText()]
                except:
                    print("Cannot load design matrices!")
                    return

            # Coordinate
            if ui.cbCol.isChecked():
                if not len(ui.txtCol.currentText()):
                    msgBox.setText("Please enter Coordinator variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOCol.text()):
                    msgBox.setText("Please enter Coordinator variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOCol.text()] = InData[
                        ui.txtCol.currentText()]
                except:
                    print("Cannot load coordinator!")
                    return

            # Condition
            if ui.cbCond.isChecked():
                if not len(ui.txtCond.currentText()):
                    msgBox.setText("Please enter Condition variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOCond.text()):
                    msgBox.setText("Please enter Condition variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOCond.text()] = InData[
                        ui.txtCond.currentText()]
                except:
                    print("Cannot load conditions!")
                    return

            # FoldID
            if ui.cbFoldID.isChecked():
                if not len(ui.txtFoldID.currentText()):
                    msgBox.setText("Please enter FoldID variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOFoldID.text()):
                    msgBox.setText("Please enter FoldID variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOFoldID.text()] = InData[
                        ui.txtFoldID.currentText()]
                except:
                    print("Cannot load Fold ID!")
                    return

            # FoldInfo
            if ui.cbFoldInfo.isChecked():
                if not len(ui.txtFoldInfo.currentText()):
                    msgBox.setText("Please enter FoldInfo variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOFoldInfo.text()):
                    msgBox.setText("Please enter FoldInfo variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOFoldInfo.text()] = InData[
                        ui.txtFoldInfo.currentText()]
                except:
                    print("Cannot load Fold Info!")
                    return
                pass

            # Number of Scan
            if ui.cbNScan.isChecked():
                if not len(ui.txtITrScan.currentText()):
                    msgBox.setText(
                        "Please enter Number of Scan variable name for Input Train!"
                    )
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITeScan.currentText()):
                    msgBox.setText(
                        "Please enter Number of Scan variable name for Input Test!"
                    )
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrScan.text()):
                    msgBox.setText(
                        "Please enter Number of Scan variable name for Output Train!"
                    )
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTeScan.text()):
                    msgBox.setText(
                        "Please enter Number of Scan variable name for Output Test!"
                    )
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOTrScan.text()] = InData[
                        ui.txtITrScan.currentText()]
                    OutData[ui.txtOTeScan.text()] = InData[
                        ui.txtITeScan.currentText()]
                except:
                    print("Cannot load NScan!")
                    return

            if NumFea == 0:
                NumFea = np.min(np.shape(XTr))
                print("Number of features are automatically selected as ",
                      NumFea)

            try:
                if Method == "PCA":
                    model = PCA(n_components=NumFea, copy=False, tol=Tol)
                elif Method == "Kernel PCA":
                    model = KernelPCA(n_components=NumFea,kernel=Kernel,gamma=Gamma,degree=Degree,\
                                  coef0=Coef0, alpha=Alpha, tol=Tol, max_iter=MaxIter, n_jobs=NJob,copy_X=False)
                else:
                    model = IncrementalPCA(n_components=NumFea,
                                           copy=False,
                                           batch_size=Batch)

                print("Running PCA Functional Alignment on Training Data ...")
                OutData[ui.txtOTrData.text()] = model.fit_transform(XTr)
                print("Running PCA Functional Alignment on Testing Data ...")
                OutData[ui.txtOTeData.text()] = model.fit_transform(XTe)
            except Exception as e:
                print(str(e))

            HAParam = dict()
            HAParam["Method"] = Method
            HAParam["NumFea"] = NumFea
            HAParam["Kernel"] = Kernel
            OutData["FunctionalAlignment"] = HAParam
            OutData["Runtime"] = time.time() - tic
            totalTime += OutData["Runtime"]

            print("Saving ...")
            io.savemat(OutFile, mdict=OutData)
            print("Fold " + str(fold_all) + " is DONE: " + OutFile)
        print("Runtime: ", totalTime)
        print("PCA Functional Alignment is done.")
        msgBox.setText("PCA Functional Alignment is done.")
        msgBox.setIcon(QMessageBox.Information)
        msgBox.setStandardButtons(QMessageBox.Ok)
        msgBox.exec_()
Exemple #14
0
# Finish the side-by-side comparison figure: title the current subplot, then
# draw every 2100th row of X_recovered in the right-hand subplot (122).
# NOTE(review): the figure and left-hand subplot are created before this
# excerpt; plot_digits/save_fig are helpers defined elsewhere in the file.
plt.title("Original", fontsize=16)
plt.subplot(122)
plot_digits(X_recovered[::2100])
plt.title("Compressed", fontsize=16)

save_fig("mnist_compression_plot")
plt.show()

# Keep a reference to the current X_reduced under a second name, because
# X_reduced is reassigned by the incremental PCA run below.
X_reduced_pca = X_reduced

print('------------------------------------------------------------------------------------------------------\n'
      '          8.3.8 Incremental PCA                                                                       \n'
      '------------------------------------------------------------------------------------------------------\n')

# Fit IncrementalPCA (154 components) by feeding X_train in n_batches pieces
# through partial_fit, so the estimator only sees one piece per call.
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train, n_batches):
    print(".", end="") # progress marker, one dot per batch (not shown in the book)
    inc_pca.partial_fit(X_batch)

# Project the full training set with the fitted model.
X_reduced = inc_pca.transform(X_train)

# Map the projection back to the original feature space for visual comparison.
X_recovered_inc_pca = inc_pca.inverse_transform(X_reduced)

# Left (121): every 2100th original row; right (122): the same rows after the
# transform / inverse_transform round trip.
plt.figure(figsize=(7, 4))
plt.subplot(121)
plot_digits(X_train[::2100])
plt.subplot(122)
plot_digits(X_recovered_inc_pca[::2100])
plt.tight_layout()
plt.show()
Exemple #15
0
def plot_at_k(k):
    """Display ``image_bw`` after a round trip through a k-component IncrementalPCA.

    The module-level ``image_bw`` array is fitted and projected with
    ``IncrementalPCA(n_components=k)``, mapped back with ``inverse_transform``
    and drawn on the current axes with the gray colormap.
    """
    reducer = IncrementalPCA(n_components=k)
    projected = reducer.fit_transform(image_bw)
    restored = reducer.inverse_transform(projected)
    plt.imshow(restored, cmap=plt.cm.gray)
Exemple #16
0
# Standardize the flattened activations of every selected layer and fit a PCA
# on them. Runs when less than all of the variance is to be retained or when
# PCA itself is the chosen clustering algorithm.
if pca_fraction_variance_to_retain < 1 or cluster_algorithm == "pca":
    print("reducing dimensionality with PCA...")

    # One slot per layer; layers not in these_layers keep None.
    activations_scaled = [None] * nlayers
    for ilayer in range(nlayers):
        if ilayer not in these_layers:
            continue
        # Column-wise (axis=0) z-scoring.
        # NOTE(review): a column with zero standard deviation makes this a
        # division by zero (inf/nan in the output) - confirm inputs exclude it.
        mu = np.mean(activations_flattened[ilayer], axis=0)
        sigma = np.std(activations_flattened[ilayer], axis=0)
        activations_scaled[ilayer] = (activations_flattened[ilayer] -
                                      mu) / sigma
        # pca_batch_size == 0 selects the full PCA; any other value selects
        # IncrementalPCA with a batch size of pca_batch_size * nfeatures.
        if pca_batch_size == 0:
            pca = PCA()
        else:
            nfeatures = np.shape(activations_scaled[ilayer])[1]
            pca = IncrementalPCA(batch_size=pca_batch_size * nfeatures)
        fits_pca[ilayer] = pca.fit(activations_scaled[ilayer])
        # NOTE(review): fits_pca[ilayer] holds what fit() returned (presumably
        # the estimator itself), so np.shape of it is likely () - the shape of
        # the scaled activations may have been intended; confirm.
        print(np.shape(fits_pca[ilayer]))

    # Select the Agg backend before pyplot is imported here.
    # NOTE(review): presumably chosen so figures render without a display.
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt
    #plt.ion()

    activations_kept = [None] * nlayers
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for ilayer in range(nlayers):
        if ilayer not in these_layers:
            continue
        # Running total of the explained-variance ratios of this layer's fit.
        cumsum = np.cumsum(fits_pca[ilayer].explained_variance_ratio_)
Exemple #17
0
    def calc_pca(self, h5_fname, n_comps=20, batch_size=1000, norm=''):
        """
        Perform Incremental Principal Component Analysis on the spectra stored
        in the provided h5 datafile and write the results back to that file.

        The 'MALDI_001/Intensities' dataset is read in row batches of
        ``batch_size`` spectra. Each batch is divided by its normalization
        factors and fed to an IncrementalPCA model with ``partial_fit``; the
        same batches are then projected with ``transform``. For every ROI
        found under 'Filtered_data', a group 'PCA/<roi>' receives the
        datasets 'Maps', 'Endmembers' and 'Eigenvalues' plus the attribute
        'Number_of_components'.

        Args:
            h5_fname: path of the HDF5 file, opened in 'r+' mode.
            n_comps: number of principal components to keep.
            batch_size: number of spectra per training / projection batch.
            norm: name of the normalization to apply; it is capitalized and
                looked up under 'MALDI_001/Normalization Factors'. The empty
                string means no normalization (all factors are 1).

        Returns:
            None. The method also returns early, after printing a message,
            when the requested normalization factors are missing or when a
            group for this PCA already exists.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        from sklearn.decomposition import IncrementalPCA

        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer')

        # The context manager closes the file on every exit path, including
        # the early returns below and any exception raised while computing.
        with h5py.File(h5_fname, 'r+') as hf:
            raw_data = hf['MALDI_001']['Intensities']
            filtered = hf['Filtered_data']
            height, _ = raw_data.shape

            # Apply relevant normalization
            norm = norm.capitalize()
            if norm == '':
                norm_factors = np.ones(height)
                norm = None
            else:
                try:
                    norm_factors = np.array(
                        hf['MALDI_001']['Normalization Factors'][norm])
                except KeyError:
                    print('Desired normalization factors not yet calculated.')
                    print('Calculate normalization factors, then try again.')
                    return

            if 'PCA' in hf:
                pca = hf['PCA']
                # NOTE(review): this PCA-level group is only created when
                # 'PCA' already existed, while results are written to the
                # per-ROI groups below - confirm the intended layout.
                try:
                    pca.create_group(norm)
                except (KeyError, ValueError):
                    # h5py reports an already existing name as ValueError.
                    print('Selected PCA has already been calculated, aborting.')
                    return
            else:
                pca = hf.create_group('PCA')
            roi_list = list(filtered.keys())

            def normalized(low, high):
                """Return rows low:high of raw_data divided by their factors."""
                chunk = raw_data[low:high]
                factors = norm_factors[low:high].reshape(len(chunk), -1)
                # Cast back to the source dtype, as the original per-row
                # copy into an np.empty_like buffer did.
                return (chunk / factors).astype(chunk.dtype, copy=False)

            def fit_and_project():
                """Train the model batch by batch, then project every spectrum."""
                # Row ranges of batch_size spectra. A tail shorter than
                # n_comps is folded into the preceding batch because
                # partial_fit can reject batches with fewer samples than
                # components; an empty tail is never produced.
                bounds = []
                low = 0
                while low < height:
                    high = low + batch_size
                    if height - high < n_comps:
                        high = height
                    bounds.append((low, high))
                    low = high

                model = IncrementalPCA(n_components=n_comps,
                                       batch_size=batch_size)
                print('Training PCA....')
                for low, high in bounds:
                    model.partial_fit(normalized(low, high))
                    print(high)

                fitted = np.empty((height, n_comps))
                print('Mapping spectra to PCA...')
                for low, high in bounds:
                    fitted[low:high] = model.transform(normalized(low, high))
                    print(high)
                return model, fitted

            print('Calculating PCA')
            # The fit does not depend on the ROI, so it is computed once, on
            # the first ROI, and reused for the remaining ones.
            fit_result = None
            for roi in roi_list:
                # NOTE(review): create_group fails if 'PCA/<roi>' already
                # exists, so a second run on the same file stops here.
                roi_grp = pca.create_group(roi)
                if norm is None:
                    roi_grp.create_group('Unnormalized')
                else:
                    try:
                        roi_grp.create_group(norm)
                    except (KeyError, ValueError):
                        print('Selected PCA has already been calculated, aborting.')
                        return
                (ptsx, ptsy) = filtered[roi].attrs.get('Image Dimensions')

                if fit_result is None:
                    fit_result = fit_and_project()
                model, fitted = fit_result

                # NOTE(review): the pixel index map is always read from
                # 'ROI_01', whatever the current roi is - confirm.
                squared_indices = np.array(
                    hf['Squared_data']['ROI_01']['Indices'])

                # Place each spectrum at the first pixel (row-major order)
                # that carries its index; np.unique returns exactly those
                # first occurrences, replacing one full-array search per pixel.
                values, first_flat = np.unique(squared_indices,
                                               return_index=True)
                coords = np.unravel_index(first_flat, squared_indices.shape)
                # Pixels not covered by any index keep np.empty's
                # uninitialized contents.
                pca_maps = np.empty((ptsx, ptsy, n_comps))
                pca_maps[coords[0], coords[1], :] = fitted[values]

                roi_grp.create_dataset('Maps', data=pca_maps, dtype='float64')
                roi_grp.create_dataset('Endmembers', data=model.components_,
                                       dtype='float64')
                roi_grp.create_dataset('Eigenvalues',
                                       data=model.explained_variance_,
                                       dtype='float64')
                roi_grp.attrs['Number_of_components'] = n_comps
        return
def pipe_main(pipe=None):
    '''Build a pipeline from a '_'-separated string of step keys, or list
    the available step keys.

    .. note::
        data is meant to flow through the pipeline in this order:
            raw data --> clean --> encoding --> scaling --> feature construction
            --> feature selection --> resampling --> final estimator
        The order actually used is the order of the keys in ``pipe``; this
        function does not reorder or validate it. Only classifiers are
        registered as final estimators.

    parameter
    ----
    pipe - str or None
        - keys joined by '_' (format 'xx_xx'), each key naming one step;
          default None

    return
    ----
        1) if pipe is a str, ``Pipeline`` instance built from the chosen steps,
           each step named after its key
        2) if pipe is None, a dict mapping each step category ('clean',
           'encoding', 'resample', 'scale', 'feature_c', 'feature_s',
           'classifier') to the keys available for it

    raise
    ----
    KeyError
        - a key in ``pipe`` is not a registered step
    ValueError
        - ``pipe`` is neither None nor a str
    '''
    # data cleaning
    clean = {
        'clean':
        Split_cls(dtype_filter='not_datetime', na1='null', na2=-999),
        'cleanNA':
        Split_cls(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Split_cls(dtype_filter='not_datetime', na1='most_frequent',
                  na2='mean'),
    }
    # categorical encoding
    encode = {
        'woe': Woe_encoder(max_leaf_nodes=5),
        'oht': Oht_encoder(),
        'ordi': Ordi_encoder(),
    }

    resample = {

        # over_sampling
        'rover':
        RandomOverSampler(),
        'smote':
        SMOTE(),
        'bsmote':
        BorderlineSMOTE(),
        'adasyn':
        ADASYN(),

        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),

        # under sampling cleaning methods
        'tlinks':
        TomekLinks(n_jobs=-1),
        'oside':
        OneSidedSelection(n_jobs=-1),
        'cleanNN':
        NeighbourhoodCleaningRule(n_jobs=-1),
        'enn':
        EditedNearestNeighbours(n_jobs=-1),
        'ann':
        AllKNN(n_jobs=-1),
        'cnn':
        CondensedNearestNeighbour(n_jobs=-1),

        # clean outliers
        # `func` is passed by keyword so the call does not depend on the
        # sampler accepting it positionally.
        'inlierForest':
        FunctionSampler(func=outlier_rejection,
                        kw_args={'method': 'IsolationForest'}),
        'inlierLocal':
        FunctionSampler(func=outlier_rejection,
                        kw_args={'method': 'LocalOutlierFactor'}),
        'inlierEllip':
        FunctionSampler(func=outlier_rejection,
                        kw_args={'method': 'EllipticEnvelope'}),
        'inlierOsvm':
        FunctionSampler(func=outlier_rejection,
                        kw_args={'method': 'OneClassSVM'}),
        # combine
        'smoteenn':
        SMOTEENN(),
        'smotelink':
        SMOTETomek(),
    }

    scale = {
        'stdscale': StandardScaler(),
        'maxscale': MinMaxScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        # NOTE: the key is misspelled ('qauntile'); it is kept as-is because
        # callers select steps by this exact string.
        'qauntile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        # NOTE(review): `normalize_components` appears to have been removed
        # from later scikit-learn releases -- confirm against the pinned
        # version before upgrading.
        'spca': SparsePCA(normalize_components=True, n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        'rtembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(Woe_encoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(
            LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc')),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fsvm':
        SelectFromModel(LinearSVC(penalty='l1', dual=False, C=1e-2)),
        'fxgb':
        SelectFromModel(XGBClassifier(n_jobs=-1)),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5)),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1), step=0.1, n_features_to_select=20),
        'fRFErf':
        RFE(ExtraTreesClassifier(n_estimators=100, max_depth=5),
            step=0.3,
            n_features_to_select=20),
        'fRFElog':
        RFE(LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc'),
            step=0.3,
            n_features_to_select=20)
    }
    # Univariate feature selection
    # `mode` / `param` are given by keyword: they are keyword-only in recent
    # scikit-learn releases, and the keyword form is valid on older ones too.
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(chi2, mode='percentile', param=25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif,
                                mode='percentile',
                                param=25),
        'fFclf':
        GenericUnivariateSelect(f_classif, mode='percentile', param=25),
    }
    # sklearn estimator: register every classifier that can be built with
    # its default arguments
    estimator = {}
    for est_name, est_class in all_estimators(type_filter=['classifier']):
        try:
            estimator[est_name] = est_class()
        except Exception:
            # deliberately best-effort: classifiers whose constructor fails
            # without arguments are simply not offered
            continue

    # explicit entries override / extend the auto-discovered defaults
    estimator.update(
        dummy=DummyClassifier(),
        XGBClassifier=XGBClassifier(n_jobs=-1),
        LogisticRegressionCV=LogisticRegressionCV(scoring='roc_auc'),
        EasyEnsembleClassifier=EasyEnsembleClassifier(),
        BalancedRandomForestClassifier=BalancedRandomForestClassifier(),
        RUSBoostClassifier=RUSBoostClassifier(),
        SVC=SVC(C=0.01, gamma='auto'))

    if pipe is None:
        # model-based and univariate selectors are reported together
        feature_s = {}
        feature_s.update(**feature_m, **feature_u)
        return {
            'clean': clean.keys(),
            'encoding': encode.keys(),
            'resample': resample.keys(),
            'scale': scale.keys(),
            'feature_c': feature_c.keys(),
            'feature_s': feature_s.keys(),
            'classifier': estimator.keys()
        }

    if not isinstance(pipe, str):
        raise ValueError("input pipe must be a string in format 'xx[_xx]'")

    # single lookup table over every category; `update(**...)` raises
    # TypeError if two categories ever define the same key
    all_keys_dict = {}
    all_keys_dict.update(**clean, **encode, **scale, **feature_c,
                         **feature_m, **feature_u, **estimator, **resample)
    steps = []
    for key in pipe.split('_'):
        step = all_keys_dict.get(key)
        if step is None:
            raise KeyError(
                "'{}' invalid key for sklearn estimators".format(key))
        steps.append((key, step))
    return Pipeline(steps)
Exemple #19
0
    train_features = np.load('train_features_joined.npz')
    img_features = train_features['img_features']
    tag_features = train_features['tag_features']
else:
    assert os.path.isfile('train_features.npz')
    logging.info('Loading features file')
    train_features = np.load('train_features.npz')
    img_features = train_features['img_features']
    tag_features = train_features['tag_features']

# Optional dimensionality reduction of the image features, enabled by the
# `perform_PCA` argument.
if args.perform_PCA == True:
    # N_PCA is the number of leading rows of img_features used to FIT the
    # PCA; an `npca` of -1 means "use every row".
    N_PCA = img_features.shape[0] if args.npca == -1 else args.npca
    logging.info('Training: PCA of image features, N_PCA = %d', N_PCA)
    start = time.time()
    # NOTE: the number of components is hard-coded to 500 here; N_PCA only
    # limits how many rows are used for fitting, not the output dimension.
    # pca = IncrementalPCA(n_components=100, batch_size=512)
    pca = IncrementalPCA(n_components=500, batch_size=512)
    pca.fit(img_features[:N_PCA, :])
    end = time.time()
    # elapsed wall-clock time is reported in minutes
    logging.info('Time: %.4fm', (end - start) / 60)

    logging.info('Apply PCA to image features')
    start = time.time()
    # the fitted projection is applied to ALL rows, not just the first N_PCA
    X = pca.transform(img_features)
    end = time.time()
    logging.info('Time: %.4fm', (end - start) / 60)

logging.info('Training: fit CCA')
start = time.time()

if args.perform_PCA == True:
    W_img, W_tag = cca.fit(X, tag_features, numCC=args.numCC, useGPU=args.gpu)
Exemple #20
0
    def btnConvert_click(self):
        totalTime = 0
        msgBox = QMessageBox()

        # Batch
        try:
            Batch = np.int32(ui.txtBatch.text())
        except:
            msgBox.setText("Size of batch is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        if Batch == 0:
            Batch = None

        # Kernel
        Kernel = ui.cbKernel.currentText()
        # Method
        Method = ui.cbMethod.currentText()

        # Gamma
        try:
            Gamma = np.float(ui.txtGamma.text())
        except:
            msgBox.setText("Gamma is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Degree
        try:
            Degree = np.int32(ui.txtDegree.text())
        except:
            msgBox.setText("Degree is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Coef0
        try:
            Coef0 = np.float(ui.txtCoef0.text())
        except:
            msgBox.setText("Coef0 is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Alpha
        try:
            Alpha = np.int32(ui.txtAlpha.text())
        except:
            msgBox.setText("Alpha is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Tol
        try:
            Tol = np.float(ui.txtTole.text())
        except:
            msgBox.setText("Tolerance is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # MaxIte
        try:
            MaxIter = np.int32(ui.txtMaxIter.text())
        except:
            msgBox.setText("Maximum number of iterations is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        if MaxIter <= 0:
            MaxIter = None

        # Number of Job
        try:
            NJob = np.int32(ui.txtJobs.text())
        except:
            msgBox.setText("The number of parallel jobs is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        if NJob < -1 or NJob == 0:
            msgBox.setText("The number of parallel jobs must be -1 or greater than 0!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False



        TrFoldErr = list()
        TeFoldErr = list()

        try:
            FoldFrom = np.int32(ui.txtFoldFrom.text())
            FoldTo   = np.int32(ui.txtFoldTo.text())
        except:
            print("Please check fold parameters!")
            return

        if FoldTo < FoldFrom:
            print("Please check fold parameters!")
            return

        for fold_all in range(FoldFrom, FoldTo+1):
            tic = time.time()
            # Regularization
            try:
                Regularization = np.float(ui.txtRegularization.text())
            except:
                msgBox.setText("Regularization value is wrong!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            # OutFile
            OutFile = ui.txtOutFile.text()
            OutFile = OutFile.replace("$FOLD$", str(fold_all))
            if not len(OutFile):
                msgBox.setText("Please enter out file!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            # InFile
            InFile = ui.txtInFile.text()
            InFile = InFile.replace("$FOLD$", str(fold_all))
            if not len(InFile):
                msgBox.setText("Please enter input file!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not os.path.isfile(InFile):
                msgBox.setText("Input file not found!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            InData = io.loadmat(InFile)
            OutData = dict()
            OutData["imgShape"] = InData["imgShape"]

            # Data
            if not len(ui.txtITrData.currentText()):
                msgBox.setText("Please enter Input Train Data variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtITeData.currentText()):
                msgBox.setText("Please enter Input Test Data variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtOTrData.text()):
                msgBox.setText("Please enter Output Train Data variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtOTeData.text()):
                msgBox.setText("Please enter Output Test Data variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            try:
                XTr = InData[ui.txtITrData.currentText()]
                XTe = InData[ui.txtITeData.currentText()]

                if ui.cbScale.isChecked() and not ui.rbScale.isChecked():
                    XTr = preprocessing.scale(XTr)
                    XTe = preprocessing.scale(XTe)
                    print("Whole of data is scaled X~N(0,1).")
            except:
                print("Cannot load data")
                return

            # NComponent
            try:
                NumFea = np.int32(ui.txtNumFea.text())
            except:
                msgBox.setText("Number of features is wrong!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if NumFea < 1:
                msgBox.setText("Number of features must be greater than zero!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if NumFea > np.shape(XTr)[1]:
                msgBox.setText("Number of features is wrong!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False

            # Label
            if not len(ui.txtITrLabel.currentText()):
                    msgBox.setText("Please enter Train Input Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
            if not len(ui.txtITeLabel.currentText()):
                    msgBox.setText("Please enter Test Input Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
            if not len(ui.txtOTrLabel.text()):
                    msgBox.setText("Please enter Train Output Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
            if not len(ui.txtOTeLabel.text()):
                    msgBox.setText("Please enter Test Output Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
            try:
                OutData[ui.txtOTrLabel.text()] = InData[ui.txtITrLabel.currentText()]
                OutData[ui.txtOTeLabel.text()] = InData[ui.txtITeLabel.currentText()]
            except:
                print("Cannot load labels!")

            # Subject
            if not len(ui.txtITrSubject.currentText()):
                msgBox.setText("Please enter Train Input Subject variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtITeSubject.currentText()):
                msgBox.setText("Please enter Test Input Subject variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtOTrSubject.text()):
                msgBox.setText("Please enter Train Output Subject variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            if not len(ui.txtOTeSubject.text()):
                msgBox.setText("Please enter Test Output Subject variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            try:
                TrSubject = InData[ui.txtITrSubject.currentText()]
                OutData[ui.txtOTrSubject.text()] = TrSubject
                TeSubject = InData[ui.txtITeSubject.currentText()]
                OutData[ui.txtOTeSubject.text()] = TeSubject
            except:
                print("Cannot load Subject IDs")
                return

            # Task
            if ui.cbTask.isChecked():
                if not len(ui.txtITrTask.currentText()):
                    msgBox.setText("Please enter Input Train Task variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITeTask.currentText()):
                    msgBox.setText("Please enter Input Test Task variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrTask.text()):
                    msgBox.setText("Please enter Output Train Task variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTeTask.text()):
                    msgBox.setText("Please enter Output Test Task variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    TrTask = InData[ui.txtITrTask.currentText()]
                    OutData[ui.txtOTrTask.text()] = TrTask
                    TeTask = InData[ui.txtITeTask.currentText()]
                    OutData[ui.txtOTeTask.text()] = TeTask
                    TrTaskIndex = TrTask.copy()
                    for tasindx, tas in enumerate(np.unique(TrTask)):
                        TrTaskIndex[TrTask == tas] = tasindx + 1
                    TeTaskIndex = TeTask.copy()
                    for tasindx, tas in enumerate(np.unique(TeTask)):
                        TeTaskIndex[TeTask == tas] = tasindx + 1
                except:
                    print("Cannot load Tasks!")
                    return

            # Run
            if ui.cbRun.isChecked():
                if not len(ui.txtITrRun.currentText()):
                    msgBox.setText("Please enter Train Input Run variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITeRun.currentText()):
                    msgBox.setText("Please enter Test Input Run variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrRun.text()):
                    msgBox.setText("Please enter Train Output Run variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTeRun.text()):
                    msgBox.setText("Please enter Test Output Run variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    TrRun = InData[ui.txtITrRun.currentText()]
                    OutData[ui.txtOTrRun.text()] = TrRun
                    TeRun = InData[ui.txtITeRun.currentText()]
                    OutData[ui.txtOTeRun.text()] = TeRun
                except:
                    print("Cannot load Runs!")
                    return

            # Counter
            if ui.cbCounter.isChecked():
                if not len(ui.txtITrCounter.currentText()):
                    msgBox.setText("Please enter Train Input Counter variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITeCounter.currentText()):
                    msgBox.setText("Please enter Test Input Counter variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrCounter.text()):
                    msgBox.setText("Please enter Train Output Counter variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTeCounter.text()):
                    msgBox.setText("Please enter Test Output Counter variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    TrCounter = InData[ui.txtITrCounter.currentText()]
                    OutData[ui.txtOTrCounter.text()] = TrCounter
                    TeCounter = InData[ui.txtITeCounter.currentText()]
                    OutData[ui.txtOTeCounter.text()] = TeCounter
                except:
                    print("Cannot load Counters!")
                    return

            # Matrix Label
            if ui.cbmLabel.isChecked():
                if not len(ui.txtITrmLabel.currentText()):
                    msgBox.setText("Please enter Train Input Matrix Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITemLabel.currentText()):
                    msgBox.setText("Please enter Test Input Matrix Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrmLabel.text()):
                    msgBox.setText("Please enter Train Output Matrix Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTemLabel.text()):
                    msgBox.setText("Please enter Test Output Matrix Label variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOTrmLabel.text()] = InData[ui.txtITrmLabel.currentText()]
                    OutData[ui.txtOTemLabel.text()] = InData[ui.txtITemLabel.currentText()]
                except:
                    print("Cannot load matrix lables!")
                    return

            # Design
            if ui.cbDM.isChecked():
                if not len(ui.txtITrDM.currentText()):
                    msgBox.setText("Please enter Train Input Design Matrix variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITeDM.currentText()):
                    msgBox.setText("Please enter Test Input Design Matrix variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrDM.text()):
                    msgBox.setText("Please enter Train Output Design Matrix variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTeDM.text()):
                    msgBox.setText("Please enter Test Output Design Matrix variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOTrDM.text()] = InData[ui.txtITrDM.currentText()]
                    OutData[ui.txtOTeDM.text()] = InData[ui.txtITeDM.currentText()]
                except:
                    print("Cannot load design matrices!")
                    return

            # Coordinate
            if ui.cbCol.isChecked():
                if not len(ui.txtCol.currentText()):
                    msgBox.setText("Please enter Coordinator variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOCol.text()):
                    msgBox.setText("Please enter Coordinator variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOCol.text()] = InData[ui.txtCol.currentText()]
                except:
                    print("Cannot load coordinator!")
                    return

            # Condition
            if ui.cbCond.isChecked():
                if not len(ui.txtCond.currentText()):
                    msgBox.setText("Please enter Condition variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOCond.text()):
                    msgBox.setText("Please enter Condition variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOCond.text()] = InData[ui.txtCond.currentText()]
                except:
                    print("Cannot load conditions!")
                    return

            # FoldID
            if ui.cbFoldID.isChecked():
                if not len(ui.txtFoldID.currentText()):
                    msgBox.setText("Please enter FoldID variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOFoldID.text()):
                    msgBox.setText("Please enter FoldID variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOFoldID.text()] = InData[ui.txtFoldID.currentText()]
                except:
                    print("Cannot load Fold ID!")
                    return

            # FoldInfo
            if ui.cbFoldInfo.isChecked():
                if not len(ui.txtFoldInfo.currentText()):
                    msgBox.setText("Please enter FoldInfo variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOFoldInfo.text()):
                    msgBox.setText("Please enter FoldInfo variable name!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOFoldInfo.text()] = InData[ui.txtFoldInfo.currentText()]
                except:
                    print("Cannot load Fold Info!")
                    return
                pass

            # Number of Scan
            if ui.cbNScan.isChecked():
                if not len(ui.txtITrScan.currentText()):
                    msgBox.setText("Please enter Number of Scan variable name for Input Train!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtITeScan.currentText()):
                    msgBox.setText("Please enter Number of Scan variable name for Input Test!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTrScan.text()):
                    msgBox.setText("Please enter Number of Scan variable name for Output Train!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                if not len(ui.txtOTeScan.text()):
                    msgBox.setText("Please enter Number of Scan variable name for Output Test!")
                    msgBox.setIcon(QMessageBox.Critical)
                    msgBox.setStandardButtons(QMessageBox.Ok)
                    msgBox.exec_()
                    return False
                try:
                    OutData[ui.txtOTrScan.text()] = InData[ui.txtITrScan.currentText()]
                    OutData[ui.txtOTeScan.text()] = InData[ui.txtITeScan.currentText()]
                except:
                    print("Cannot load NScan!")
                    return

            # Train Analysis Level
            print("Calculating Analysis Level for Training Set ...")
            TrGroupFold = None
            FoldStr = ""
            if ui.cbFSubject.isChecked():
                if not ui.rbFRun.isChecked():
                    TrGroupFold = TrSubject
                    FoldStr = "Subject"
                else:
                    TrGroupFold = np.concatenate((TrSubject,TrRun))
                    FoldStr = "Subject+Run"

            if ui.cbFTask.isChecked():
                TrGroupFold = np.concatenate((TrGroupFold,TrTaskIndex)) if TrGroupFold is not None else TrTaskIndex
                FoldStr = FoldStr + "+Task"

            if ui.cbFCounter.isChecked():
                TrGroupFold = np.concatenate((TrGroupFold,TrCounter)) if TrGroupFold is not None else TrCounter
                FoldStr = FoldStr + "+Counter"

            TrGroupFold = np.transpose(TrGroupFold)

            TrUniqFold = np.array(list(set(tuple(i) for i in TrGroupFold.tolist())))

            TrFoldIDs = np.arange(len(TrUniqFold)) + 1

            TrListFold = list()
            for gfold in TrGroupFold:
                for ufoldindx, ufold in enumerate(TrUniqFold):
                    if (ufold == gfold).all():
                        currentID = TrFoldIDs[ufoldindx]
                        break
                TrListFold.append(currentID)
            TrListFold = np.int32(TrListFold)
            TrListFoldUniq = np.unique(TrListFold)


            # Test Analysis Level
            print("Calculating Analysis Level for Testing Set ...")
            TeGroupFold = None
            if ui.cbFSubject.isChecked():
                if not ui.rbFRun.isChecked():
                    TeGroupFold = TeSubject
                else:
                    TeGroupFold = np.concatenate((TeSubject,TeRun))

            if ui.cbFTask.isChecked():
                TeGroupFold = np.concatenate((TeGroupFold,TeTaskIndex)) if TeGroupFold is not None else TeTaskIndex

            if ui.cbFCounter.isChecked():
                TeGroupFold = np.concatenate((TeGroupFold,TeCounter)) if TeGroupFold is not None else TeCounter

            TeGroupFold = np.transpose(TeGroupFold)

            TeUniqFold = np.array(list(set(tuple(i) for i in TeGroupFold.tolist())))

            TeFoldIDs = np.arange(len(TeUniqFold)) + 1

            TeListFold = list()
            for gfold in TeGroupFold:
                for ufoldindx, ufold in enumerate(TeUniqFold):
                    if (ufold == gfold).all():
                        currentID = TeFoldIDs[ufoldindx]
                        break
                TeListFold.append(currentID)
            TeListFold = np.int32(TeListFold)
            TeListFoldUniq = np.unique(TeListFold)

            # Train Partition
            print("Partitioning Training Data ...")
            TrX = list()
            TrShape = None

            if Method == "PCA":
                svdmodel = PCA(n_components=NumFea,copy=False,tol=Tol)
            elif Method == "Kernel PCA":
                svdmodel = KernelPCA(n_components=NumFea,kernel=Kernel,gamma=Gamma,degree=Degree,\
                              coef0=Coef0, alpha=Alpha, tol=Tol, max_iter=MaxIter, n_jobs=NJob,copy_X=False)
            else:
                svdmodel = IncrementalPCA(n_components=NumFea,copy=False,batch_size=Batch)

            for foldindx, fold in enumerate(TrListFoldUniq):
                dat = XTr[np.where(TrListFold == fold)]
                if ui.cbScale.isChecked() and ui.rbScale.isChecked():
                    dat = preprocessing.scale(dat)
                    print("Data belong to View " + str(foldindx + 1) + " is scaled X~N(0,1).")

                dat = svdmodel.fit_transform(dat)
                TrX.append(dat)
                if TrShape is None:
                    TrShape = np.shape(dat)
                else:
                    if not(TrShape == np.shape(dat)):
                        print("ERROR: Train, Reshape problem for Fold " + str(foldindx + 1) + ", Shape: " + str(np.shape(dat)))
                        return
                print("Train: View " + str(foldindx + 1) + " is extracted. Shape: " + str(np.shape(dat)))

            print("Training Shape: " + str(np.shape(TrX)))

            # Test Partition
            print("Partitioning Testing Data ...")
            TeX = list()
            TeShape = None
            for foldindx, fold in enumerate(TeListFoldUniq):
                dat = XTe[np.where(TeListFold == fold)]
                if ui.cbScale.isChecked() and ui.rbScale.isChecked():
                    dat = preprocessing.scale(dat)
                    print("Data belong to View " + str(foldindx + 1) + " is scaled X~N(0,1).")

                dat = svdmodel.fit_transform(dat)
                TeX.append(dat)
                if TeShape is None:
                    TeShape = np.shape(dat)
                else:
                    if not(TeShape == np.shape(dat)):
                        print("Test: Reshape problem for Fold " + str(foldindx + 1))
                        return
                print("Test: View " + str(foldindx + 1) + " is extracted.")

            print("Testing Shape: " + str(np.shape(TeX)))

            model = RHA(Dim=NumFea,regularization=Regularization)

            print("Running Hyperalignment on Training Data ...")
            MappedXtr, G = model.train(TrX)

            print("Running Hyperalignment on Testing Data ...")
            MappedXte =  model.test(TeX)

            # Train Dot Product
            print("Producting Training Data ...")
            TrHX = None
            TrErr = None
            for foldindx, fold in enumerate(TrListFoldUniq):
                TrErr = TrErr + (G - MappedXtr[foldindx]) if TrErr is not None else G - MappedXtr[foldindx]
                TrHX = np.concatenate((TrHX, MappedXtr[foldindx])) if TrHX is not None else MappedXtr[foldindx]
            OutData[ui.txtOTrData.text()] = TrHX
            foldindx = foldindx + 1
            TrErr = TrErr / foldindx
            print("Train: alignment error ", np.linalg.norm(TrErr))
            TrFoldErr.append(np.linalg.norm(TrErr))

            # Train Dot Product
            print("Producting Testing Data ...")
            TeHX = None
            TeErr = None
            for foldindx, fold in enumerate(TeListFoldUniq):
                TeErr = TeErr + (G - MappedXte[foldindx]) if TeErr is not None else G - MappedXte[foldindx]
                TeHX = np.concatenate((TeHX, MappedXte[foldindx])) if TeHX is not None else MappedXte[foldindx]
            OutData[ui.txtOTeData.text()] = TeHX
            foldindx = foldindx + 1
            TeErr = TeErr / foldindx
            print("Test: alignment error ", np.linalg.norm(TeErr))
            TeFoldErr.append(np.linalg.norm(TeErr))

            HAParam = dict()
            HAParam["Method"]= Method
            HAParam["Kernel"]= Kernel
            HAParam["Share"] = G
            HAParam["Level"] = FoldStr
            OutData["FunctionalAlignment"] = HAParam
            OutData["Runtime"] = time.time() - tic
            totalTime += OutData["Runtime"]

            print("Saving ...")
            io.savemat(OutFile, mdict=OutData)
            print("Fold " + str(fold_all) + " is DONE: " + OutFile)

        print("Training -> Alignment Error: mean " + str(np.mean(TrFoldErr)) + " std " + str(np.std(TrFoldErr)))
        print("Testing  -> Alignment Error: mean " + str(np.mean(TeFoldErr)) + " std " + str(np.std(TeFoldErr)))
        print("Runtime: ", totalTime)
        print("Kernel/SVD Hyperalignment is done.")
        msgBox.setText("Kernel/SVD Hyperalignment is done.")
        msgBox.setIcon(QMessageBox.Information)
        msgBox.setStandardButtons(QMessageBox.Ok)
        msgBox.exec_()
Exemple #21
0
def reduce(config,
           components,
           uuids=None,
           x_train=None,
           x_dev=None,
           x_test=None):
    """
    Apply Incremental Principal Components Analysis to the tf-idf vectors.

    The PCA model is fitted on a shuffled copy of ``uuids`` when that list is
    given (and non-empty), otherwise on a shuffled copy of ``x_train``. The
    transformed matrices, the fitted model and a per-component "strongest
    word" table are written below ``constants.dir_d``.

    :param config: configuration dictionary (``config['batch_size']`` is read)
    :param components: number of desired components
    :param uuids: list of selected uuids
    :param x_train: List of train set uuids (used when ``uuids`` is falsy)
    :param x_dev: List of dev set uuids (used when ``uuids`` is falsy)
    :param x_test: List of test set uuids (used when ``uuids`` is falsy)
    :return: ``(data, i_pca)`` where ``data`` is the transformed matrix for
        ``uuids``, or the tuple ``(t_train, t_dev, t_test)`` otherwise, and
        ``i_pca`` is the fitted IncrementalPCA model
    """

    print('Performing feature extraction using PCA')

    mini_batch_size = config['batch_size']
    # Context manager so the vocabulary file handle is released immediately.
    with open(os.path.join(constants.dir_d, constants.json_words),
              'r') as words_file:
        words = json.load(words_file)

    i_pca = IncrementalPCA(n_components=components, batch_size=mini_batch_size)

    def save_matrix(matrix, file_name):
        """Write ``matrix`` as text into the matrices directory."""
        matrix_file = os.path.join(constants.dir_d, constants.dir_mat,
                                   file_name)
        with open(matrix_file, 'wb') as matrix_out:
            np.savetxt(matrix_out, matrix)

    # The model is trained on a random permutation of the fitting set.
    fit_uuids = uuids if uuids else x_train
    rand_uuids = random.sample(fit_uuids, len(fit_uuids))
    rows = len(fit_uuids)

    train_pca(config, i_pca, len(rand_uuids), rand_uuids, mini_batch_size)

    print('Explained Variance Ratio {}:'.format(
        sum(i_pca.explained_variance_ratio_)))

    if uuids:
        data = transform_vectors(config, i_pca, len(uuids), uuids,
                                 mini_batch_size)
        save_matrix(data, 'pca_{}_{}.txt'.format(components, len(uuids)))

    else:
        t_train = transform_vectors(config, i_pca, len(x_train), x_train,
                                    mini_batch_size)
        save_matrix(t_train,
                    'pca_{}_{}_tr.txt'.format(components, len(t_train)))

        t_dev = transform_vectors(config, i_pca, len(x_dev), x_dev,
                                  mini_batch_size)
        save_matrix(t_dev, 'pca_{}_{}_dv.txt'.format(components, len(t_dev)))

        t_test = transform_vectors(config, i_pca, len(x_test), x_test,
                                   mini_batch_size)
        save_matrix(t_test,
                    'pca_{}_{}_te.txt'.format(components, len(t_test)))

        data = (t_train, t_dev, t_test)

    model_file = os.path.join(constants.dir_d, constants.dir_mod,
                              'pca_{}_{}.pkl'.format(components, rows))
    joblib.dump(i_pca, model_file)

    # For every component, record the vocabulary entry with the largest
    # absolute loading.
    components_file = os.path.join(
        constants.dir_d, constants.dir_mod,
        "components_pca_{}_{}.txt".format(components, rows))
    to_inspect = pd.DataFrame(np.absolute(i_pca.components_.T),
                              index=sorted(set(words.keys())),
                              columns=range(components))
    to_inspect.idxmax(axis=0, skipna=True).to_csv(components_file)

    return data, i_pca
# Ground-truth label map, also kept as a flat vector.
I_gt = np.load(
    '/home/abhi/Documents/Hyper/Dataset_Hyperspectral/Ground_truths/Indian_pines_gt.npy'
)
igt = I_gt.ravel()

# Lay the slices taken along axis 1 of I_vect side by side.
array_expand = np.hstack(
    [I_vect[:, i_row, :] for i_row in range(I_vect.shape[1])])

# Standardise the transposed matrix before the decomposition.
array_expand_scaled = sp.scale(array_expand.T)

pca = IncrementalPCA(n_components=num_com,
                     whiten=True,
                     copy=True,
                     batch_size=None)
array_pca = pca.fit_transform(array_expand_scaled)

# NOTE(review): the reshape hard-codes a 145 x 145 image with 8 components,
# so it presumably requires num_com == 8 -- confirm.
x = array_pca.reshape(145, 145, 8)
y0, y1, y2, y3, y4, y5, y6, y7 = (x[:, :, band] for band in range(8))

# the first component is the most prominent one
plt.imshow(y0)
def test_incremental_pca_feature_names_out():
    """IncrementalPCA must name its outputs ``incrementalpca<i>``."""
    n_components = 2
    fitted = IncrementalPCA(n_components=n_components).fit(iris.data)

    expected = [f"incrementalpca{idx}" for idx in range(n_components)]
    assert_array_equal(expected, fitted.get_feature_names_out())
Exemple #24
0
def whitening(X,
              n_components,
              svd_solver,
              chunked,
              chunk_size,
              zero_center,
              random_state=None):
    """ Whiten data (i.e. transform variables into a set of new uncorrelated and unit-variance
    variables) and reduce dimension through a PCA-like approach.
    This function handles array-like formats as well as sparse matrices.

    Parameters
    ----------
    X : 2D ndarray or spmatrix, shape (n_observations , n_variables)

    n_components : int
        number of principal components to compute. If None, n_components = min(X.shape)

    svd_solver : str {'auto', 'full', 'arpack', 'randomized' , 'lobpcg'}
        solver for the different PCA methods. Please note that some solvers may not be compatible with
        some of the PCA methods. See PCA, TruncatedSVD and IncrementalPCA from sklearn.decomposition or
        scipy.sparse.linalg.svds.

    chunked : boolean
        if True, perform an incremental PCA on segments of chunk_size. The incremental PCA automatically
        zero centers and ignores settings of random_state and svd_solver.

    chunk_size : int
        Number of observations to include in each chunk. Required if chunked=True was passed.

    zero_center : boolean
        Only consulted for sparse input. If True, compute standard PCA from covariance matrix.
        If False, omit zero-centering variables (uses TruncatedSVD), which allows to handle
        sparse input efficiently.

    random_state : int, RandomState, optional
        Change to use different initial states for the optimization. The default is None.

    Returns
    -------
    X_w : 2D ndarray, shape (n_observations , n_components)

    """
    random_state = check_random_state(random_state)

    if n_components is None:
        n_components = min(X.shape)

    if chunked:

        pca = IncrementalPCA(n_components=n_components,
                             whiten=True,
                             batch_size=chunk_size)
        X_w = pca.fit_transform(X)

    elif issparse(X):

        if not zero_center:

            warnings.warn(
                'TruncatedSVD is very similar to PCA, but differs in that the matrix is not centered first.'
                ' The following components still often resemble the exact PCA very closely'
            )

            pca = TruncatedSVD(n_components=n_components,
                               random_state=random_state,
                               algorithm=svd_solver)
            X_w = pca.fit_transform(X)
            # TruncatedSVD has no ``whiten`` option: rescale every component
            # by its singular value and sqrt(n_observations - 1), then remove
            # the column means.
            X_w = (X_w / pca.singular_values_) * np.sqrt(X.shape[0] - 1)
            X_w -= X_w.mean(axis=0)
        else:
            X_w = _pca_with_sparse(X,
                                   n_components,
                                   solver=svd_solver,
                                   random_state=random_state)

    else:

        # Forward random_state so that the stochastic solvers ('arpack',
        # 'randomized') honour the caller's seed, as the other branches do.
        pca = PCA(n_components=n_components,
                  whiten=True,
                  svd_solver=svd_solver,
                  random_state=random_state)
        X_w = pca.fit_transform(X)

    return X_w
Exemple #25
0
    col = []
    for word in clean_graph.keys():
        for wd in clean_graph[word].keys():
            row.append(index[word])
            col.append(index[wd])
            data.append(clean_graph[word][wd])
    matrix = csr_matrix((data, (row, col)))
    return matrix


matrix = create_sparse_matrix(clean_graph, index)

# In[20]:

from sklearn.decomposition import IncrementalPCA
import pickle

chunk_size = 100
n = matrix.shape[0]

pca = IncrementalPCA(n_components=15, batch_size=chunk_size)

# fit() resets the estimator and walks the rows in batches of `batch_size`
# on its own, so a manual partial_fit pass before it would be thrown away.
pca.fit(matrix)

# Context managers make sure both pickle files are flushed and closed.
with open('pca.pickle', 'wb') as pca_file:
    pickle.dump(pca, pca_file)
with open('sparce_matrix.pickle', 'wb') as matrix_file:
    pickle.dump(matrix, matrix_file)
Exemple #26
0
def _frame_batches(n_frames, batch_size):
    """Return the ``(start, stop)`` index pairs that cover ``n_frames`` frames
    in consecutive chunks of ``batch_size`` (the last chunk may be shorter)."""
    return [(ini, min(ini + batch_size, n_frames))
            for ini in range(0, n_frames, batch_size)]


def pca_incremental(cubepath,
                    angle_list=None,
                    n=0,
                    batch_size=None,
                    batch_ratio=0.1,
                    ncomp=10,
                    imlib='opencv',
                    interpolation='lanczos4',
                    collapse='median',
                    verbose=True,
                    full_output=False):
    """ Computes the full-frame PCA-ADI algorithm in batches, for processing
    fits files larger than the available system memory. It uses the incremental
    PCA algorithm from scikit-learn.

    Parameters
    ----------
    cubepath : str
        String with the path to the fits file to be opened in memmap mode.
    angle_list : array_like, 1d
        Corresponding parallactic angle for each frame. If None the parallactic
        angles are obtained from the same fits file (extension ``n + 1``).
    n : int optional
        The index of the HDULIST containing the data/cube.
    batch_size : int optional
        The number of frames in each batch. If None the size of the batch is
        computed wrt the available memory in the system.
    batch_ratio : float
        If batch_size is None, batch_ratio indicates the fraction of the
        available memory that should be used by every batch.
    ncomp : int, optional
        How many PCs are used as a lower-dimensional subspace to project the
        target frames.
    imlib : str, optional
        See the documentation of the ``vip_hci.preproc.frame_rotate`` function.
    interpolation : str, optional
        See the documentation of the ``vip_hci.preproc.frame_rotate`` function.
    collapse : {'median', 'mean', 'sum', 'trimmean'}, str optional
        Sets the way of collapsing the frames for producing a final image.
    verbose : {True, False}, bool optional
        If True prints intermediate info and timing.
    full_output: boolean, optional
        Whether to return the final median combined image only or with other
        intermediate arrays.

    Returns
    -------
    If full_output is True the algorithm returns the incremental PCA model of
    scikit-learn, the PCs reshaped into images, the median of the derotated
    residuals for each batch, and the final frame. If full_output is False then
    the final frame is returned.

    Raises
    ------
    TypeError
        If ``cubepath`` is not a string, the data has fewer than 3 dimensions
        or the number of angles differs from the number of frames.
    RuntimeError
        If no angles were given and they cannot be read from the fits file.
    ValueError
        If the (given or computed) batch size is smaller than one frame.

    """
    if verbose:
        start = time_ini()
    if not isinstance(cubepath, str):
        raise TypeError('Cubepath must be a string with the full path of your '
                        'fits file')

    hdulist = fits.open(cubepath, memmap=True)
    try:
        cube = hdulist[n].data
        if not cube.ndim > 2:
            raise TypeError('Input array is not a 3d or 4d array')

        n_frames = cube.shape[0]
        y = cube.shape[1]
        x = cube.shape[2]
        if angle_list is None:
            try:
                angle_list = hdulist[n + 1].data
            except Exception:
                raise RuntimeError('Parallactic angles were not provided')
        if not n_frames == angle_list.shape[0]:
            raise TypeError(
                'Angle list vector has wrong length. It must equal the '
                'number of frames in the cube.')

        ipca = IncrementalPCA(n_components=ncomp)

        if batch_size is None:
            aval_mem = get_available_memory(verbose)
            total_size = cube.nbytes
            batch_size = int(n_frames /
                             (total_size / (batch_ratio * aval_mem)))
        if batch_size < 1:
            raise ValueError('The batch size must be at least one frame, '
                             'got {}'.format(batch_size))

        if verbose:
            msg1 = "Cube with {} frames ({:.3f} GB)"
            print(msg1.format(n_frames, cube.nbytes / 1e9))
            msg2 = "Batch size set to {} frames ({:.3f} GB)\n"
            print(msg2.format(batch_size, cube[:batch_size].nbytes / 1e9))

        # Full batches followed by the (possibly shorter) remainder. This also
        # covers batch_size > n_frames, which becomes a single batch.
        batches = _frame_batches(n_frames, batch_size)

        # First pass: learn the principal components batch by batch.
        msg = 'Processing batch [{},{}] with shape {}'
        for intini, intfin in batches:
            batch = cube[intini:intfin]
            if verbose:
                print(msg.format(intini, intfin, batch.shape))
                print('Batch size in memory = {:.3f} MB'.format(batch.nbytes /
                                                                1e6))
            matrix = prepare_matrix(batch, verbose=False)
            ipca.partial_fit(matrix)

        if verbose:
            timing(start)

        V = ipca.components_
        mean = ipca.mean_.reshape(y, x)

        # Second pass: project every batch on the PCs, derotate the residuals
        # and collapse them into one image per batch.
        if verbose:
            print('\nReconstructing and obtaining residuals')
        medians = []
        for intini, intfin in batches:
            batch = cube[intini:intfin]
            batch = batch - mean
            matrix = prepare_matrix(batch, verbose=False)
            reconst = np.dot(np.dot(matrix, V.T), V)
            resid = matrix - reconst
            resid_der = cube_derotate(resid.reshape(batch.shape[0],
                                                    batch.shape[1],
                                                    batch.shape[2]),
                                      angle_list[intini:intfin],
                                      imlib=imlib,
                                      interpolation=interpolation)
            medians.append(cube_collapse(resid_der, mode=collapse))
        del matrix
        del batch

        medians = np.array(medians)
        frame = np.median(medians, axis=0)

        if verbose:
            timing(start)

        if full_output:
            pcs = reshape_matrix(V, y, x)
            return ipca, pcs, medians, frame
        else:
            return frame
    finally:
        # Nothing that is returned references the memory-mapped data, so the
        # fits file can always be released here.
        hdulist.close()
Exemple #27
0
def incremental_pca(args):
    """Build an unfitted IncrementalPCA from ``args['n_components']`` and
    ``args['whiten']``; ``copy`` is always True."""
    # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html
    n_components = args['n_components']
    whiten = args['whiten']
    return IncrementalPCA(n_components=n_components, whiten=whiten, copy=True)
Exemple #28
0
        'kernel': ['rbf'],
        'C': range(1, 100, 10),
        'gamma': np.arange(0.05, 0.55, .05)
    }
    model = SVC()

    conf_matrix_list_of_arrays = []
    scores = []
    for i in range(10):
        for fold_ind, (train_index, test_index) in enumerate(
                stratified_group_k_fold(X, y, ids, k=8)):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            train_groups, test_groups = ids[train_index], ids[test_index]

            ipca = IncrementalPCA(n_components=X_train.shape[1] // 5,
                                  batch_size=120)
            ipca.fit(X_train)
            X_train = ipca.transform(X_train)
            X_test = ipca.transform(X_test)

            X_train, y_train = pipeline.fit_resample(X_train, y_train)  #Smote

            clf = GridSearchCV(model, parameters, cv=5, n_jobs=4)
            clf.fit(X_train, y_train)
            pred = clf.predict(X_test)
            conf_matrix = confusion_matrix(y_test, pred)
            conf_matrix_list_of_arrays.append(conf_matrix)
            score = accuracy_score(y_test, pred)
            scores.append(score)

    mean_of_conf_matrix_arrays = np.mean(conf_matrix_list_of_arrays, axis=0)
		vectors.append(model[word])
		labels.append(word)
print('- found {} entities x {} dimensions'.format(len(labels), len(vectors[0])))


# both lists become numpy arrays ahead of the reduction steps
vectors, labels = np.asarray(vectors), np.asarray(labels)
print('- done')


# optional first pass: IncrementalPCA brings the vectors down to a
# smaller number of dimensions before the final reduction
if run_init_reduction:
	print('reducing to {}D using IncrementalPCA...'.format(init_dimensions))
	ipca = IncrementalPCA(n_components=init_dimensions)
	vectors = ipca.fit_transform(vectors)
	print('- done')

	# dump the reduced vector space, one "label,v1,v2,..." row per entity
	print('- saving as csv...')
	with open(model_name + '-' + str(init_dimensions) + 'D.csv', 'w') as f:
		for label, vector in zip(labels, vectors):
			f.write(label + ',' + ','.join(map(str, vector)) + '\n')


# final reduction with t-SNE
print('reducing to {}D using t-SNE...'.format(num_dimensions))
print('- may take a really, really (really) long time :)')
vectors = np.asarray(vectors)
tsne = TSNE(n_components=num_dimensions, random_state=0)
Exemple #30
0
def flatten_image(img_array):
    """Return the pixels of ``img_array`` as one flat row.

    The array is reshaped to a single row holding
    ``shape[0] * shape[1]`` values and that row is returned.
    """
    height = img_array.shape[0]
    width = img_array.shape[1]
    return img_array.reshape((1, height * width))[0]


# %%
dataset = []
for path in paths:
    img = Image.open(str(path.resolve()))

    img = image_to_matrix(img)

    # NOTE(review): IncrementalPCA expects one flat row per sample; if
    # image_to_matrix returns 2-D arrays the flattening step below is
    # presumably needed -- confirm against image_to_matrix.
    # img = flatten_image(img)

    dataset.append(img)

dataset = np.array(dataset)
print('dataset shape: {}'.format(dataset.shape))

# %%
n = dataset.shape[0]
pca = IncrementalPCA(n_components=100)

# partial_fit() refuses a first batch with fewer samples than n_components,
# so feeding the images one row at a time cannot work. fit() splits the
# in-memory dataset into suitably sized batches by itself.
pca.fit(dataset)

r_dataset = pca.transform(dataset)
print('r_dataset.shape: {}'.format(r_dataset.shape))