def ipca():
    train_features, test_features = gf.get_tfidf()
    vectorizer = gf.get_tfidf()
    n_components = 250
    ipca = IncrementalPCA(n_components=n_components, batch_size=1250)
    start_time = time.time()
    print('start ipca on train')
    X_ipca = ipca.fit_transform(train_features)
    runtime = time.time() - start_time
    print('-----')
    print('%.2f seconds to ipca on train' % runtime)
    print('-----')
    train_features = None

    print('ipca train done')
    np.savetxt('train_features.csv', X_ipca, fmt='%.8e', delimiter=",")
    X_ipca = None
    print('ipca train file done')
    test_features = gf.get_tfidf(vectorizer, False)
    # project the test set with the PCA fitted on the train set
    # (refitting here would put the test features in a different basis)
    Y_ipca = ipca.transform(test_features)
    test_features, vectorizer = None, None
    print('ipca test done')
    np.savetxt('test_features.csv', Y_ipca, fmt='%.8e', delimiter=",")
    Y_ipca = None
    print('ipca test file done')
 def reduceDataset(self,nr=3,method='PCA'):
     '''It reduces the dimensionality of a given dataset using different techniques provided by the scikit-learn library.
      Methods available:
                         'PCA'
                         'FactorAnalysis'
                         'KPCArbf','KPCApoly'
                         'KPCAcosine','KPCAsigmoid'
                         'IPCA'
                         'FastICADeflation'
                         'FastICAParallel'
                         'Isomap'
                         'LLE'
                         'LLEmodified'
                         'LLEltsa'
     '''
     dataset=self.ModelInputs['Dataset']
     #dataset=self.dataset[Model.in_columns]
     #dataset=self.dataset[['Humidity','TemperatureF','Sea Level PressureIn','PrecipitationIn','Dew PointF','Value']]
     #PCA
     if method=='PCA':
         sklearn_pca = sklearnPCA(n_components=nr)
         reduced = sklearn_pca.fit_transform(dataset)
     #Factor Analysis
     elif method=='FactorAnalysis':
         fa=FactorAnalysis(n_components=nr)
         reduced=fa.fit_transform(dataset)
     #kernel pca with rbf kernel
     elif method=='KPCArbf':
         kpca=KernelPCA(nr,kernel='rbf')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with poly kernel
     elif method=='KPCApoly':
         kpca=KernelPCA(nr,kernel='poly')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with cosine kernel
     elif method=='KPCAcosine':
         kpca=KernelPCA(nr,kernel='cosine')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with sigmoid kernel
     elif method=='KPCAsigmoid':
         kpca=KernelPCA(nr,kernel='sigmoid')
         reduced=kpca.fit_transform(dataset)
     #Incremental PCA
     elif method=='IPCA':
         ipca=IncrementalPCA(nr)
         reduced=ipca.fit_transform(dataset)
     #Fast ICA
     elif method=='FastICAParallel':
         fip=FastICA(nr,algorithm='parallel')
         reduced=fip.fit_transform(dataset)
     elif method=='FastICADeflation':
         fid=FastICA(nr,algorithm='deflation')
         reduced=fid.fit_transform(dataset)
     elif method == 'All':
         self.dimensionalityReduction(nr=nr)
         return self
     
     self.ModelInputs.update({method:reduced})
     self.datasetsAvailable.append(method)
     return self
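# A hedged usage sketch (not part of the original): `model` is a hypothetical
# instance of the class defining reduceDataset, with ModelInputs['Dataset']
# already populated. The method returns self and stores the reduced matrix
# under the chosen method name.
model.reduceDataset(nr=3, method='KPCArbf')
reduced_rbf = model.ModelInputs['KPCArbf']   # array of shape (n_samples, 3)
print(model.datasetsAvailable)               # now includes 'KPCArbf'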
Example #3
def get_pca_array(list_chunks, topology):
    """
    Takes a list of mdtraj.Trajectory objects, featurizes them into
    alpha-carbon (CA) pairwise distances, and performs 2-component
    Incremental PCA on the featurized trajectory.

    Parameters
    ----------
    list_chunks: list of mdtraj.Trajectory objects
    topology: str
            Name of the topology file

    Returns
    -------
    Y: np.array, shape (n_frames, 2)

    """
    pca = IncrementalPCA(n_components=2)
    top = md.load_prmtop(topology)
    ca_backbone = top.select("name CA")
    pairs = top.select_pairs(ca_backbone, ca_backbone)
    pair_distances = []
    for chunk in list_chunks:
        X = md.compute_distances(chunk, pairs)
        pair_distances.append(X)
    distance_array = np.concatenate(pair_distances)
    print("No. of data points: %d" % distance_array.shape[0])
    print("No. of features (pairwise distances): %d" % distance_array.shape[1])
    Y = pca.fit_transform(distance_array)
    return Y
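# A hedged usage sketch (not from the original): the file names below are
# hypothetical placeholders for a trajectory and its AMBER prmtop topology,
# loaded in chunks so the full trajectory never has to sit in memory at once.
import mdtraj as md

chunks = list(md.iterload("trajectory.nc", chunk=500, top="system.prmtop"))
Y = get_pca_array(chunks, "system.prmtop")
print(Y.shape)   # (n_frames, 2)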
 def dimensionalityReduction(self,nr=5):
     '''It applies all the dimensionality reduction techniques available in this class:
     Techniques available:
                         'PCA'
                         'FactorAnalysis'
                         'KPCArbf','KPCApoly'
                         'KPCAcosine','KPCAsigmoid'
                         'IPCA'
                         'FastICADeflation'
                         'FastICAParallel'
                         'Isomap'
                         'LLE'
                         'LLEmodified'
                         'LLEltsa'
     '''
     dataset=self.ModelInputs['Dataset']
     sklearn_pca = sklearnPCA(n_components=nr)
     p_components = sklearn_pca.fit_transform(dataset)
     fa=FactorAnalysis(n_components=nr)
     factors=fa.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='rbf')
     rbf=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='poly')
     poly=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='cosine')
     cosine=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='sigmoid')
     sigmoid=kpca.fit_transform(dataset)
     ipca=IncrementalPCA(nr)
     i_components=ipca.fit_transform(dataset)
     fip=FastICA(nr,algorithm='parallel')
     fid=FastICA(nr,algorithm='deflation')
     # deflation results go to ficaD and parallel results to ficaP, matching the keys below
     ficaD=fid.fit_transform(dataset)
     ficaP=fip.fit_transform(dataset)
     '''isomap=Isomap(n_components=nr).fit_transform(dataset)
     try:
         lle1=LocallyLinearEmbedding(n_components=nr).fit_transform(dataset)
     except ValueError:
         lle1=LocallyLinearEmbedding(n_components=nr,eigen_solver='dense').fit_transform(dataset)
     try:
         
         lle2=LocallyLinearEmbedding(n_components=nr,method='modified').fit_transform(dataset)
     except ValueError:
         lle2=LocallyLinearEmbedding(n_components=nr,method='modified',eigen_solver='dense').fit_transform(dataset) 
     try:
         lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa').fit_transform(dataset)
     except ValueError:
         lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa',eigen_solver='dense').fit_transform(dataset)'''
     values=[p_components,factors,rbf,poly,cosine,sigmoid,i_components,ficaD,ficaP]#,isomap,lle1,lle2,lle3]
     keys=['PCA','FactorAnalysis','KPCArbf','KPCApoly','KPCAcosine','KPCAsigmoid','IPCA','FastICADeflation','FastICAParallel']#,'Isomap','LLE','LLEmodified','LLEltsa']
     self.ModelInputs.update(dict(zip(keys, values)))
     self.datasetsAvailable.extend(keys)
     
     #debug
     #dataset=pd.DataFrame(self.ModelInputs['Dataset'])
     #dataset['Output']=self.ModelOutput
     #self.debug['Dimensionalityreduction']=dataset
     ###
     return self
Example #5
def reduce_data(features, out_dir, dim=10, first_column=True):
    array = np.load(features)
    subarray = array
    if not first_column:
        subarray = array[:, 1:]

    ipca = IncrementalPCA(n_components=dim, copy=False, batch_size=500000)
    # keep the projected data rather than the untouched input
    new_array = ipca.fit_transform(subarray)
    # when it cannot fit into memory, do it incrementally
    # (see the partial_fit sketch after this function); refitting a separate
    # reducer per slice, as in the commented lines below, would put each slice
    # in a different basis
    # new_array_1 = tsvd.fit_transform(subarray[:1500000, :])
    # new_array_2 = tsvd.fit_transform(subarray[1500000:3400000, :])
    # new_array_3 = tsvd.fit_transform(subarray[3400000:, :])
    # new_array = np.vstack([new_array_1, new_array_2, new_array_3])
    if not first_column:
        new_array = np.c_[array[:, 0], new_array]

    assert new_array.shape[0] == array.shape[0]
    np.save(os.path.join(out_dir, os.path.basename(features) + "_pca"), new_array)
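# A hedged out-of-core sketch (not from the original): fit ONE IncrementalPCA over
# row slices with partial_fit, then transform slice by slice, so every slice is
# projected onto the same components. `dim` and the slice size are illustrative,
# and each slice is assumed to contain at least `dim` rows.
import numpy as np
from sklearn.decomposition import IncrementalPCA


def reduce_in_slices(subarray, dim=10, rows_per_slice=1_500_000):
    ipca = IncrementalPCA(n_components=dim)
    n = subarray.shape[0]
    # first pass: accumulate the principal components
    for start in range(0, n, rows_per_slice):
        ipca.partial_fit(subarray[start:start + rows_per_slice])
    # second pass: project every slice with the same fitted model
    parts = [ipca.transform(subarray[start:start + rows_per_slice])
             for start in range(0, n, rows_per_slice)]
    return np.vstack(parts)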
def ipca(data, labels, new_dimension):
    print("start incremental pca...")

    if hasattr(data, "todense"):
        data = np.array(data.todense())

    start = time.time()
    pca = IncrementalPCA(n_components=new_dimension)
    reduced = pca.fit_transform(data)
    end = time.time()
    return (reduced, end-start)
def run_pca(n_components, n_sites, order_dict, sim_mat):

    output_file = open('pca_100000_100', 'w')

    ipca = IncrementalPCA(n_components=n_components, batch_size=8000)
    sim_mat_ipca = ipca.fit_transform(sim_mat)
    var_sim_ipca = ipca.explained_variance_ratio_

    output_file.write(",".join(str(x) for x in var_sim_ipca) + '\n')

    for siteid in order_dict:
        stringa = ' '.join(
            [siteid,
             str(sim_mat_ipca[order_dict[siteid], 0]),
             str(sim_mat_ipca[order_dict[siteid], 1]),
             str(sim_mat_ipca[order_dict[siteid], 2]),
             str(sim_mat_ipca[order_dict[siteid], 3]),
             str(sim_mat_ipca[order_dict[siteid], 4]),
             str(sim_mat_ipca[order_dict[siteid], 5]),
             str(sim_mat_ipca[order_dict[siteid], 6])
             ])
        output_file.write(stringa + '\n')
    output_file.close()

    # bin the first five components into 1000 equal-width bins
    n_bins = 1000.
    binned = np.empty((n_sites, 5)).astype(np.int32)
    for k in range(5):
        delta = (sim_mat_ipca[:, k].max() - sim_mat_ipca[:, k].min()) / n_bins
        min_k = sim_mat_ipca[:, k].min()
        for i in range(n_sites):
            binned[i, k] = int((sim_mat_ipca[i, k] - min_k) / delta)

    f = open('pc_100000_100.csv', 'w')
    for siteid in order_dict:
        stringa = ' '.join(
            [siteid,
             str(binned[order_dict[siteid], 0]),
             str(binned[order_dict[siteid], 1]),
             str(binned[order_dict[siteid], 2]),
             str(binned[order_dict[siteid], 3]),
             str(binned[order_dict[siteid], 4])
             ])
        f.write(stringa + '\n')
    f.close()
def test_incremental_pca():
    """Incremental PCA on dense arrays."""
    X = iris.data
    batch_size = X.shape[0] // 3
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    pca = PCA(n_components=2)
    pca.fit_transform(X)

    X_transformed = ipca.fit_transform(X)

    np.testing.assert_equal(X_transformed.shape, (X.shape[0], 2))
    assert_almost_equal(ipca.explained_variance_ratio_.sum(),
                        pca.explained_variance_ratio_.sum(), 1)

    for n_components in [1, 2, X.shape[1]]:
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(X)
        cov = ipca.get_covariance()
        precision = ipca.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]))
Example #9
class PCASK(AbstractFeature):
    def __init__(self, n_components):
        AbstractFeature.__init__(self)
        self.n_components = n_components
        #for key in options:
            #setattr(self,key,options[key])

    def compute(self,X,y):
        if X.ndim == 3:
            X = X.reshape((X.shape[0],X.shape[1]*X.shape[2]))
        self.ipca = IncrementalPCA(n_components=self.n_components, batch_size=None)
        return self.ipca.fit_transform(X)


    def extract(self,X):
        if X.ndim == 2:
            X = X.reshape((X.shape[0]*X.shape[1]))
        return list(self.ipca.transform([X])[0])

    def __repr__(self):
        return "PCASK"
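# A hedged usage sketch (not from the original): hypothetical grayscale images of
# shape (n_samples, height, width). compute() fits the IncrementalPCA and returns
# the projected training set, extract() projects a single image afterwards.
import numpy as np

images = np.random.rand(40, 16, 16)
labels = np.arange(40)
feat = PCASK(n_components=10)
train_proj = feat.compute(images, labels)   # shape (40, 10)
single_proj = feat.extract(images[0])       # list of 10 values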
Example #10
def pca(
    data: Union[AnnData, np.ndarray, spmatrix],
    n_comps: int = N_PCS,
    zero_center: Optional[bool] = True,
    svd_solver: str = 'auto',
    random_state: int = 0,
    return_info: bool = False,
    use_highly_variable: Optional[bool] = None,
    dtype: str = 'float32',
    copy: bool = False,
    chunked: bool = False,
    chunk_size: Optional[int] = None,
) -> Union[AnnData, np.ndarray, spmatrix]:
    """Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition. Uses the
    implementation of *scikit-learn* [Pedregosa11]_.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` × ``n_vars``.
        Rows correspond to cells and columns to genes.
    n_comps
        Number of principal components to compute.
    zero_center
        If `True`, compute standard PCA from covariance matrix.
        If ``False``, omit zero-centering variables
        (uses :class:`~sklearn.decomposition.TruncatedSVD`),
        which allows handling sparse input efficiently.
        Passing ``None`` decides automatically based on sparseness of the data.
    svd_solver
        SVD solver to use:

        ``'arpack'``
          for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`)

        ``'randomized'``
          for the randomized algorithm due to Halko (2009).

        ``'auto'`` (the default)
          chooses automatically depending on the size of the problem.

    random_state
        Change to use different initial states for the optimization.
    return_info
        Only relevant when not passing an :class:`~anndata.AnnData`:
        see “**Returns**”.
    use_highly_variable
        Whether to use highly variable genes only, stored in
        ``.var['highly_variable']``.
        By default uses them if they have been determined beforehand.
    dtype
        Numpy data type string to which to convert the result.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked
        If ``True``, perform an incremental PCA on segments of ``chunk_size``.
        The incremental PCA automatically zero centers and ignores settings of
        ``random_state`` and ``svd_solver``. If ``False``, perform a full PCA.
    chunk_size
        Number of observations to include in each chunk.
        Required if ``chunked=True`` was passed.

    Returns
    -------

    X_pca : :class:`scipy.sparse.spmatrix` or :class:`numpy.ndarray`
        If `data` is array-like and ``return_info=False`` was passed,
        this function only returns `X_pca`…
    adata : :class:`~anndata.AnnData`
        …otherwise if ``copy=True`` it returns or else adds fields to ``adata``:

        ``.obsm['X_pca']``
             PCA representation of data.

        ``.varm['PCs']``
             The principal components containing the loadings.

        ``.uns['pca']['variance_ratio']``
             Ratio of explained variance.

        ``.uns['pca']['variance']``
             Explained variance, equivalent to the eigenvalues of the covariance matrix.
    """
    # chunked calculation is not randomized, anyway
    if svd_solver in {'auto', 'randomized'} and not chunked:
        logg.info(
            'Note that scikit-learn\'s randomized PCA might not be exactly '
            'reproducible across different computational platforms. For exact '
            'reproducibility, choose `svd_solver=\'arpack\'.` This will likely '
            'become the Scanpy default in the future.')

    data_is_AnnData = isinstance(data, AnnData)
    if data_is_AnnData:
        adata = data.copy() if copy else data
    else:
        adata = AnnData(data)

    logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4)

    if adata.n_vars < n_comps:
        n_comps = adata.n_vars - 1
        logg.msg('reducing number of computed PCs to',
                 n_comps,
                 'as dim of data is only',
                 adata.n_vars,
                 v=4)

    if use_highly_variable is True and 'highly_variable' not in adata.var.keys(
    ):
        raise ValueError(
            'Did not find adata.var[\'highly_variable\']. '
            'Either your data already only consists of highly-variable genes '
            'or consider running `pp.filter_genes_dispersion` first.')
    if use_highly_variable is None:
        use_highly_variable = True if 'highly_variable' in adata.var.keys(
        ) else False
    adata_comp = adata[:, adata.
                       var['highly_variable']] if use_highly_variable else adata

    if chunked:
        if not zero_center or random_state or svd_solver != 'auto':
            logg.msg('Ignoring zero_center, random_state, svd_solver', v=4)

        from sklearn.decomposition import IncrementalPCA

        X_pca = np.zeros((adata_comp.X.shape[0], n_comps), adata_comp.X.dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        for chunk, _, _ in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        for chunk, start, end in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    else:
        if zero_center is None:
            zero_center = not issparse(adata_comp.X)
        if zero_center:
            from sklearn.decomposition import PCA
            if issparse(adata_comp.X):
                logg.msg(
                    '    as `zero_center=True`, '
                    'sparse input is densified and may '
                    'lead to huge memory consumption',
                    v=4)
                X = adata_comp.X.toarray(
                )  # Copying the whole adata_comp.X here, could cause memory problems
            else:
                X = adata_comp.X
            pca_ = PCA(n_components=n_comps,
                       svd_solver=svd_solver,
                       random_state=random_state)
        else:
            from sklearn.decomposition import TruncatedSVD
            logg.msg(
                '    without zero-centering: \n'
                '    the explained variance does not correspond to the exact statistical definition\n'
                '    the first component, e.g., might be heavily influenced by different means\n'
                '    the following components often resemble the exact PCA very closely',
                v=4)
            pca_ = TruncatedSVD(n_components=n_comps,
                                random_state=random_state)
            X = adata_comp.X
        X_pca = pca_.fit_transform(X)

    if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype)

    if data_is_AnnData:
        adata.obsm['X_pca'] = X_pca
        if use_highly_variable:
            adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
            adata.varm['PCs'][
                adata.var['highly_variable']] = pca_.components_.T
        else:
            adata.varm['PCs'] = pca_.components_.T
        adata.uns['pca'] = {}
        adata.uns['pca']['variance'] = pca_.explained_variance_
        adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
        logg.msg('    finished', t=True, end=' ', v=4)
        logg.msg(
            'and added\n'
            '    \'X_pca\', the PCA coordinates (adata.obs)\n'
            '    \'PC1\', \'PC2\', ..., the loadings (adata.var)\n'
            '    \'pca_variance\', the variance / eigenvalues (adata.uns)\n'
            '    \'pca_variance_ratio\', the variance ratio (adata.uns)',
            v=4)
        return adata if copy else None
    else:
        if return_info:
            return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_
        else:
            return X_pca
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, IncrementalPCA

iris = load_iris()
x = iris.data
y = iris.target
target_names = iris.target_names
n_components = 2

pca = PCA(n_components=n_components)
x_pca = pca.fit_transform(x)

ipca = IncrementalPCA(n_components=n_components, batch_size=10)  # process in batches of 10 samples
x_ipca = ipca.fit_transform(x)

colors = ['navy', 'turquoise', 'darkorange']  # navy / turquoise / dark orange

for x_transformed, title, number in [(x_pca, 'PCA', 1),
                                     (x_ipca, 'Incremental PCA', 2)]:
    plt.figure(number, figsize=(8, 8))
    for color, i, target_name in zip(colors, [0, 1, 2], target_names):
        plt.scatter(x_transformed[y == i, 0],
                    x_transformed[y == i, 1],
                    color=color,
                    lw=2,
                    label=target_name)

    if 'Incremental' in title:
        # PCA and IPCA agree only up to the sign of each component, hence the abs() comparison
        err = np.abs(np.abs(x_pca) - np.abs(x_ipca)).mean()
        plt.title(title + ' of iris dataset\nMean absolute unsigned error %.6f' % err)
    else:
        plt.title(title + ' of iris dataset')

    plt.legend(loc='best', shadow=False, scatterpoints=1)

plt.show()
Example #12
def feature_extraction_pca(raw_data_features, raw_data_labels, timestamps):
  """
  Args:
    raw_data_features: Sensor readings; columns 0-2 are the accelerometer axes
                       and the fourth column (index 3) is the barometer data.

  Returns:
    features: Features extracted per 128-sample segment and then reduced to 5
              components with IncrementalPCA. Before reduction:
              features[:, 0] is the mean magnitude of acceleration;
              features[:, 1] is the variance of acceleration;
              features[:, 2] and features[:, 3] are its skewness and kurtosis;
              features[:, 4:8] are the FFT band powers of equally-spaced bands;
              features[:, 8:15] are the FFT band powers of bands on a logarithmic scale;
              features[:, 15] is the slope of the pressure signal.
    labels: Majority label within each segment.
  """
  features = None
  labels = None

  accel_magnitudes = np.sqrt((raw_data_features[:, 0]**2).reshape(-1, 1)+
                             (raw_data_features[:, 1]**2).reshape(-1, 1)+
                             (raw_data_features[:, 2]**2).reshape(-1, 1))

  # The window size for feature extraction
  segment_size = 128

  for i in range(0, accel_magnitudes.shape[0]-segment_size, 64):

    # Compute mean, variance, skewness and kurtosis of acceleration for each segment

    segment = accel_magnitudes[i:i+segment_size]
    accel_mean = np.mean(segment)
    accel_var = np.var(segment)
    accel_var_skew = skew(segment)
    accel_var_kurt = kurtosis(segment)
    

    segment_fft_powers = np.abs(np.fft.fft(segment))**2
    #print(segment_fft_powers)

    # Aggregate band power within the frequency range, with equal spacing (window size=32) or logarithmic scale
    # Band power of equally-spaced bands: 4 features
    equal_band_power = list()
    window_size = 32
    for j in range(0, len(segment_fft_powers), window_size):
      equal_band_power.append(sum(segment_fft_powers[j: j+32]).tolist()[0])

    # Band power of bands in logarithmic scale: 7 features
    log_band_power = list()
    freqs = [0, 2, 4, 8, 16, 32, 64, 128]
    for j in range(len(freqs)-1):
      log_band_power.append(sum(segment_fft_powers[freqs[j]: freqs[j+1]]).tolist()[0])

    # Slope of barometer data
    # bar_slope = raw_data_features[i+segment_size-1, 3] - raw_data_features[i, 3]
    bar_slope = np.polyfit(timestamps[i:i+segment_size], raw_data_features[i:i+segment_size, 3], 1)[0]
    # bar_slope = np.polyfit([x*0.1 for x in range(segment_size)], raw_data_features[i:i+segment_size, 3], 1)[0]

    feature = [accel_mean, accel_var, accel_var_skew, accel_var_kurt] + equal_band_power + log_band_power + [bar_slope]

    if features is None:
      features = np.array([feature])
    else:
      features = np.append(features, [feature], axis=0)

    label = Counter(raw_data_labels[i:i+segment_size][:, 0].tolist()).most_common(1)[0][0]

    if labels is None:
      labels = np.array([label])
    else:
      labels = np.append(labels, [label], axis=0)
      
  pca = IncrementalPCA(n_components=5)
  features = pca.fit_transform(features)

  return features, labels
pca.fit(X_train)

print(pca.components_)

colnames = list(X_train.columns)

pcs_df = pd.DataFrame({'PC1': pca.components_[0], \
                       'PC2': pca.components_[1], \
                       'Feature': colnames})

pcs_df.head()

explained_variance_ratio_ = np.around(pca.explained_variance_ratio_,
                                      decimals=3)
explained_variance_ratio_

fig = plt.figure(figsize=(12, 8))
plt.plot(np.cumsum(pca.explained_variance_ratio_), 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal components')
plt.ylabel('Cumulative explained variance')
plt.show()

from sklearn.decomposition import IncrementalPCA

pca_final = IncrementalPCA(n_components=2)

df_pca = pca_final.fit_transform(
    cleaned_master_data.drop(['Hospital overall rating'], axis=1))
df_pca.shape
# Every entry has 13 features and one binary label. In order to avoid "out-of-memory" errors, we need to preprocess our dataset.
# For this purpose I use StandardScaler for scaling and IncrementalPCA for dimensionality reduction.

import numpy as np
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler

# I select 10000 rows per chunk to be sure that it will finish (I have only 2 GB of RAM)

chunk_size = 10000
components = 3

# Only three components is not a good idea, but I have a huge dataset and even after preprocessing the data can still be large.

for chunk in pd.read_csv('train.tar.gz', compression='gzip', sep=';', header=0, quotechar='"', chunksize=chunk_size):
    labels = chunk.iloc[:, 2].reset_index(drop=True)   # reset the index so concat aligns with the PCA output
    selected_features = chunk.iloc[:, 3:16]

    scaled_features = StandardScaler().fit_transform(selected_features)

    # note: fitting a fresh scaler and IPCA per chunk projects each chunk onto
    # different components; see the partial_fit sketch after this example
    ipca = IncrementalPCA(n_components=components)
    principalComponents = ipca.fit_transform(scaled_features)
    preprocessed_data = pd.DataFrame(data=principalComponents)

    merged_data = pd.concat([preprocessed_data, labels], axis=1)

    merged_data.to_csv('preprocessed_data.csv', mode='a', sep=';', header=False)

# P.S. Of course the chunk size and the number of components depend on your available computer resources.
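# A hedged alternative sketch (not from the original): fit ONE StandardScaler and
# ONE IncrementalPCA across all chunks with partial_fit, then transform chunk by
# chunk, so every row is scaled and projected with the same statistics/components.
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler


def read_chunks():
    return pd.read_csv('train.tar.gz', compression='gzip', sep=';',
                       header=0, quotechar='"', chunksize=chunk_size)


scaler = StandardScaler()
ipca = IncrementalPCA(n_components=components)

for chunk in read_chunks():                      # pass 1: scaling statistics
    scaler.partial_fit(chunk.iloc[:, 3:16])

for chunk in read_chunks():                      # pass 2: principal components
    ipca.partial_fit(scaler.transform(chunk.iloc[:, 3:16]))

first = True
for chunk in read_chunks():                      # pass 3: project and append
    labels = chunk.iloc[:, 2].reset_index(drop=True)
    reduced = pd.DataFrame(ipca.transform(scaler.transform(chunk.iloc[:, 3:16])))
    pd.concat([reduced, labels], axis=1).to_csv(
        'preprocessed_data.csv', mode='w' if first else 'a',
        sep=';', header=False, index=False)
    first = False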
Example #15
 def fit_pca(self, matrix):
     """Fit pca matrix and save sklearn model """
     reducer = IncrementalPCA(n_components=800, batch_size=2500)
     reduced_matrix = reducer.fit_transform(matrix)
     self.rev_matrix_pca = reduced_matrix
     self.pca_model = reducer
Example #16
def pca(
    data: Union[AnnData, np.ndarray, spmatrix],
    n_comps: Optional[int] = None,
    zero_center: Optional[bool] = True,
    svd_solver: str = 'arpack',
    random_state: AnyRandom = 0,
    return_info: bool = False,
    use_highly_variable: Optional[bool] = None,
    dtype: str = 'float32',
    copy: bool = False,
    chunked: bool = False,
    chunk_size: Optional[int] = None,
) -> Union[AnnData, np.ndarray, spmatrix]:
    """\
    Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition.
    Uses the implementation of *scikit-learn* [Pedregosa11]_.

    .. versionchanged:: 1.5.0

        In previous versions, computing a PCA on a sparse matrix would make a dense copy of
        the array for mean centering.
        As of scanpy 1.5.0, mean centering is implicit.
        While results are extremely similar, they are not exactly the same.
        If you would like to reproduce the old results, pass a dense array.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    n_comps
        Number of principal components to compute. Defaults to 50, or to the
        minimum dimension size of the selected representation minus 1,
        whichever is smaller.
    zero_center
        If `True`, compute standard PCA from covariance matrix.
        If `False`, omit zero-centering variables
        (uses :class:`~sklearn.decomposition.TruncatedSVD`),
        which allows handling sparse input efficiently.
        Passing `None` decides automatically based on sparseness of the data.
    svd_solver
        SVD solver to use:

        `'arpack'` (the default)
          for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`)
        `'randomized'`
          for the randomized algorithm due to Halko (2009).
        `'auto'`
          chooses automatically depending on the size of the problem.
        `'lobpcg'`
          An alternative SciPy solver.

        .. versionchanged:: 1.4.5
           Default value changed from `'auto'` to `'arpack'`.

        Efficient computation of the principal components of a sparse matrix
        currently only works with the `'arpack`' or `'lobpcg'` solvers.

    random_state
        Change to use different initial states for the optimization.
    return_info
        Only relevant when not passing an :class:`~anndata.AnnData`:
        see “**Returns**”.
    use_highly_variable
        Whether to use highly variable genes only, stored in
        `.var['highly_variable']`.
        By default uses them if they have been determined beforehand.
    dtype
        Numpy data type string to which to convert the result.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked
        If `True`, perform an incremental PCA on segments of `chunk_size`.
        The incremental PCA automatically zero centers and ignores settings of
        `random_state` and `svd_solver`. If `False`, perform a full PCA.
    chunk_size
        Number of observations to include in each chunk.
        Required if `chunked=True` was passed.

    Returns
    -------
    X_pca : :class:`~scipy.sparse.spmatrix`, :class:`~numpy.ndarray`
        If `data` is array-like and `return_info=False` was passed,
        this function only returns `X_pca`…
    adata : anndata.AnnData
        …otherwise if `copy=True` it returns or else adds fields to `adata`:

        `.obsm['X_pca']`
             PCA representation of data.
        `.varm['PCs']`
             The principal components containing the loadings.
        `.uns['pca']['variance_ratio']`
             Ratio of explained variance.
        `.uns['pca']['variance']`
             Explained variance, equivalent to the eigenvalues of the
             covariance matrix.
    """
    logg_start = logg.info(f'computing PCA')

    # chunked calculation is not randomized, anyway
    if svd_solver in {'auto', 'randomized'} and not chunked:
        logg.info(
            'Note that scikit-learn\'s randomized PCA might not be exactly '
            'reproducible across different computational platforms. For exact '
            'reproducibility, choose `svd_solver=\'arpack\'.`')
    data_is_AnnData = isinstance(data, AnnData)
    if data_is_AnnData:
        adata = data.copy() if copy else data
    else:
        adata = AnnData(data)

    if use_highly_variable is True and 'highly_variable' not in adata.var.keys(
    ):
        raise ValueError(
            'Did not find adata.var[\'highly_variable\']. '
            'Either your data already only consists of highly-variable genes '
            'or consider running `pp.highly_variable_genes` first.')
    if use_highly_variable is None:
        use_highly_variable = True if 'highly_variable' in adata.var.keys(
        ) else False
    if use_highly_variable:
        logg.info('    on highly variable genes')
    adata_comp = (adata[:, adata.var['highly_variable']]
                  if use_highly_variable else adata)

    if n_comps is None:
        min_dim = min(adata_comp.n_vars, adata_comp.n_obs)
        if settings.N_PCS >= min_dim:
            n_comps = min_dim - 1
        else:
            n_comps = settings.N_PCS

    logg.info(f'    with n_comps={n_comps}')

    random_state = check_random_state(random_state)

    X = adata_comp.X

    if chunked:
        if not zero_center or random_state or svd_solver != 'arpack':
            logg.debug('Ignoring zero_center, random_state, svd_solver')

        from sklearn.decomposition import IncrementalPCA

        X_pca = np.zeros((X.shape[0], n_comps), X.dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        for chunk, _, _ in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        for chunk, start, end in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    elif (not issparse(X) or svd_solver == "randomized") and zero_center:
        from sklearn.decomposition import PCA

        if issparse(X) and svd_solver == "randomized":
            # This  is for backwards compat. Better behaviour would be to either error or use arpack.
            logg.warning(
                "svd_solver 'randomized' does not work with sparse input. Densifying the array. "
                "This may take a very large amount of memory.")
            X = X.toarray()
        pca_ = PCA(n_components=n_comps,
                   svd_solver=svd_solver,
                   random_state=random_state)
        X_pca = pca_.fit_transform(X)
    elif issparse(X) and zero_center:
        from sklearn.decomposition import PCA

        if svd_solver == "auto":
            svd_solver = "arpack"
        if svd_solver not in {'lobpcg', 'arpack'}:
            raise ValueError(
                f'svd_solver: {svd_solver} cannot be used with sparse input.\n'
                'Use "arpack" (the default) or "lobpcg" instead.')

        output = _pca_with_sparse(X,
                                  n_comps,
                                  solver=svd_solver,
                                  random_state=random_state)
        # this is just a wrapper for the results
        X_pca = output['X_pca']
        pca_ = PCA(n_components=n_comps, svd_solver=svd_solver)
        pca_.components_ = output['components']
        pca_.explained_variance_ = output['variance']
        pca_.explained_variance_ratio_ = output['variance_ratio']
    elif not zero_center:
        from sklearn.decomposition import TruncatedSVD

        logg.debug(
            '    without zero-centering: \n'
            '    the explained variance does not correspond to the exact statistical definition\n'
            '    the first component, e.g., might be heavily influenced by different means\n'
            '    the following components often resemble the exact PCA very closely'
        )
        pca_ = TruncatedSVD(n_components=n_comps,
                            random_state=random_state,
                            algorithm=svd_solver)
        X_pca = pca_.fit_transform(X)
    else:
        raise Exception("This shouldn't happen. Please open a bug report.")

    if X_pca.dtype.descr != np.dtype(dtype).descr:
        X_pca = X_pca.astype(dtype)

    if data_is_AnnData:
        adata.obsm['X_pca'] = X_pca
        adata.uns['pca'] = {}
        adata.uns['pca']['params'] = {
            'zero_center': zero_center,
            'use_highly_variable': use_highly_variable,
        }
        if use_highly_variable:
            adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
            adata.varm['PCs'][
                adata.var['highly_variable']] = pca_.components_.T
        else:
            adata.varm['PCs'] = pca_.components_.T
        adata.uns['pca']['variance'] = pca_.explained_variance_
        adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
        logg.info('    finished', time=logg_start)
        logg.debug(
            'and added\n'
            '    \'X_pca\', the PCA coordinates (adata.obs)\n'
            '    \'PC1\', \'PC2\', ..., the loadings (adata.var)\n'
            '    \'pca_variance\', the variance / eigenvalues (adata.uns)\n'
            '    \'pca_variance_ratio\', the variance ratio (adata.uns)')
        return adata if copy else None
    else:
        logg.info('    finished', time=logg_start)
        if return_info:
            return (
                X_pca,
                pca_.components_,
                pca_.explained_variance_ratio_,
                pca_.explained_variance_,
            )
        else:
            return X_pca
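# A hedged usage sketch (not from the original source): run the function above on
# an AnnData object with chunked incremental PCA; the array shape is illustrative.
import numpy as np
from anndata import AnnData

adata = AnnData(np.random.rand(5000, 2000).astype(np.float32))
pca(adata, n_comps=50, chunked=True, chunk_size=1000)
print(adata.obsm['X_pca'].shape)              # (5000, 50)
print(adata.uns['pca']['variance_ratio'][:5])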
Example #17
def main():
    print('\033[1m' + 'Loading all the datasets...' + '\033[0m')
    arffs_dic = obtain_arffs('./datasets/')

    # Extract a specific dataset
    dataset_name = 'breast-w'  # possible datasets ('hypothyroid', 'breast-w', 'waveform')
    dat1 = arffs_dic[dataset_name]
    df1 = pd.DataFrame(dat1[0])  # original data in pandas dataframe
    groundtruth_labels = df1[df1.columns[
        len(df1.columns) - 1]].values  # original labels in a numpy array
    df1 = df1.drop(df1.columns[len(df1.columns) - 1], 1)
    if dataset_name == 'hypothyroid':
        df1 = df1.drop(
            'TBG', 1
        )  # This column only contains NaNs so does not add any value to the clustering
    data1 = df1.values  # original data in a numpy array without labels
    load = Preprocess()
    data_x = load.preprocess_method(data1)
    data_x = data_x.astype(np.float64)
    le = LabelEncoder()
    le.fit(np.unique(groundtruth_labels))
    groundtruth_labels = le.transform(groundtruth_labels)

    num_clusters = len(
        np.unique(groundtruth_labels))  # Number of different labels

    # -------------------------------------------------------------------------------Compute covariance and eigenvectors
    original_mean = np.mean(data_x, axis=0)

    cov_m = compute_covariance(data_x, original_mean)
    eig_vals, eig_vect = np.linalg.eig(cov_m)

    idxsort = eig_vals.argsort()[::-1]
    eig_vals = eig_vals[idxsort].real
    eig_vect = eig_vect[:, idxsort].real

    # ---------------------------------------------------------------------Decide the number of features we want to keep
    prop_variance = 0.9
    k = proportion_of_variance(eig_vals, prop_variance)
    print('\nThe value of K selected to obtain a proportion of variance = ' +
          str(prop_variance) + ' is: ' + str(k) + '\n')

    eig_vals_red = eig_vals[:k]
    eig_vect_red = eig_vect[:, :k]  # Eigenvectors are in columns (8xk)

    # ---------------------------------------------------------------------------------Reduce dimensionality of the data
    # A1) Using our implementation of PCA
    transf_data_x = np.dot((eig_vect_red.T), (data_x - original_mean).T).T

    # B1) Using the PCA implementation of sklearn
    pca = PCA(n_components=k)
    transf_data_x_sklearn = pca.fit_transform(data_x)

    # C1) Using the incremental PCA implementation of sklearn
    incrementalpca = IncrementalPCA(n_components=k)
    transf_data_x_sklearn2 = incrementalpca.fit_transform(data_x)

    # --------------------------------------------------------------------------------------------------Reconstruct data
    # A2) Reconstruct data with our method
    reconstruct_data_x = np.dot(eig_vect_red, transf_data_x.T)
    reconstruct_data_x = reconstruct_data_x.T + original_mean

    # B2) Reconstruct data with PCA sklearn
    reconstruct_data_x1 = np.dot(pca.components_.T, transf_data_x_sklearn.T)
    reconstruct_data_x1 = reconstruct_data_x1.T + original_mean

    # C2) Reconstruct data with incremental PCA sklearn
    reconstruct_data_x2 = np.dot(incrementalpca.components_.T,
                                 transf_data_x_sklearn2.T)
    reconstruct_data_x2 = reconstruct_data_x2.T + original_mean

    # ----------------------------------------------------------------Error between original data and reconstructed data
    # A3) Error between original data and reconstruct data
    error = reconstruct_data_x - data_x
    total_error = (np.sum(abs(error)) / np.sum(abs(data_x))) * 100
    print(
        'The relative error after reconstructing the original matrix with K = '
        + str(k) + ' is ' + '\033[1m' + '\033['
        '94m' + str(round(total_error, 2)) + '%' + '\033[0m' +
        ' [using our implementation of PCA]')

    # B3) Error between original data and reconstruct data 1
    error1 = reconstruct_data_x1 - data_x
    total_error1 = (np.sum(abs(error1)) / np.sum(abs(data_x))) * 100
    print(
        'The relative error after reconstructing the original matrix with K = '
        + str(k) + ' is ' + '\033[1m' + '\033['
        '94m' + str(round(total_error1, 2)) + '%' + '\033[0m' +
        ' [using pca.fit_transform of Sklearn]')

    # C3) Error between original data and reconstruct data 2
    error2 = reconstruct_data_x2 - data_x
    total_error2 = (np.sum(abs(error2)) / np.sum(abs(data_x))) * 100
    print(
        'The relative error after reconstructing the original matrix with K = '
        + str(k) + ' is ' + '\033[1m' + '\033['
        '94m' + str(round(total_error2, 2)) + '%' + '\033[0m' +
        ' [using incrementalpca.fit_transform of Sklearn]')

    # ------------------------------------------------------------------------------Kmeans with dimensionality reduction
    print(
        '\n---------------------------------------------------------------------------------------------------------'
    )
    print('K-MEANS APPLIED TO THE ORIGINAL DATA')
    tester_kmeans(data_x, groundtruth_labels)
    print(
        '\n---------------------------------------------------------------------------------------------------------'
    )
    print(
        'K-MEANS APPLIED TO THE TRANSFORMED DATA USING OUR IMPLEMENTATION OF PCA'
    )
    labels = tester_kmeans(transf_data_x, groundtruth_labels)
    print(
        '\n---------------------------------------------------------------------------------------------------------'
    )
    print(
        'K-MEANS APPLIED TO THE TRANSFORMED DATA USING pca.fit_transform OF SKLEARN'
    )
    tester_kmeans(transf_data_x_sklearn, groundtruth_labels)
    print(
        '\n---------------------------------------------------------------------------------------------------------'
    )
    print(
        'K-MEANS APPLIED TO THE TRANSFORMED DATA USING incrementalpca.fit_transform OF SKLEARN'
    )
    tester_kmeans(transf_data_x_sklearn2, groundtruth_labels)
    print(
        '\n---------------------------------------------------------------------------------------------------------'
    )

    # -----------------------------------------------------------------------------------------------------Scatter plots
    ploting_boolean = False
    plot_scatters = False  # only change to True for a database with not too many features (like breast-w)

    if ploting_boolean:
        # Plot eigenvector
        plt.plot(eig_vals, 'ro-', linewidth=2, markersize=6)
        plt.title('Magnitude of the eigenvalues')
        plt.show()

        if plot_scatters:
            # Plottings: scatter plots
            # Original data with groundtruth labels
            ploting_v(data_x, num_clusters, groundtruth_labels,
                      'original data with groundtruth labels')
            # Transfomed data with our implementation of PCA and with groundtruth labels
            ploting_v(transf_data_x, num_clusters, groundtruth_labels,
                      'transformed data (our PCA) with groundtruth '
                      'labels')
            # Transfomed data with pca.fit_transform and with groundtruth labels
            ploting_v(
                transf_data_x_sklearn, num_clusters, groundtruth_labels,
                'transformed data (Sklearn PCA v1) '
                'with groundtruth labels')
            # Transfomed data with incrementalpca.fit_transform and with groundtruth labels
            ploting_v(
                transf_data_x_sklearn2, num_clusters, groundtruth_labels,
                'transformed data (Sklearn PCA v2) '
                'with groundtruth labels')

        # ------------------------------------------------------------------------------------------------------3D plots
        # Plottings: 3D plots
        # Original data without labels
        ploting_v3d(data_x, 1, np.zeros(len(groundtruth_labels)),
                    'original data without labels')
        # Original data with groundtruth labels
        ploting_v3d(data_x, num_clusters, groundtruth_labels,
                    'original data with groundtruth labels')
        # Reconstructed data without labels
        ploting_v3d(reconstruct_data_x, 1, np.zeros(len(groundtruth_labels)),
                    'reconstructed data without labels')
        # Transfomed data with our implementation of PCA and without labels
        ploting_v3d(transf_data_x, 1, np.zeros(len(groundtruth_labels)),
                    'transformed data without labels')
        # Transfomed data with our implementation of PCA and with groundtruth_labels
        ploting_v3d(transf_data_x, num_clusters, groundtruth_labels,
                    'transformed data with groundtruth labels')
        # Transfomed data with our implementation of PCA and with the labels obtained with our K-means
        ploting_v3d(transf_data_x, num_clusters, labels,
                    'transformed data with labels from our K-means')
        # Plot of the correlation matrix of the dataset
        plot_corr_matrix(data_x, legend=False)

# ## 2. Dimensionality Reduction
# 
# We use Principal Component Analysis to reduce the dimension of the remaining 176 bands to just 4 principal components. This is done using the IncrementalPCA class imported from the scikit-learn library. Incremental PCA is chosen because of the large number of features in the input data, and it works well when run in small batches.
# 

# In[15]:


#Using the IncrementalPCA method from the scikit-learn library

from sklearn.decomposition import IncrementalPCA

IPCA = IncrementalPCA(n_components=4, batch_size = 5)
X_IPCA = IPCA.fit_transform(X_flat_transposed)

print(X_IPCA.shape)


# In[16]:


#Array is transposed to be consistent with the original format

X_IPCA = np.transpose(X_IPCA)
X_IPCA.shape
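# A hedged check (not in the original notebook): inspect how much variance the
# 4 components retain, using the IPCA object fitted above.
print(IPCA.explained_variance_ratio_)
print("Total variance retained:", IPCA.explained_variance_ratio_.sum())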


# 
# 
# Time = 
 
 
 
 
# Data decomposition
print("Now Decomposing Data")
start_time = time.perf_counter()
 
 
#from sklearn.decomposition import TruncatedSVD 
 

#decomp = TruncatedSVD(n_components=1000,n_iter=5) 
#decomp.fit(train_data)  
train_data = pca.fit_transform(train_data)
 
 
end_time = time.perf_counter()
print("Decomposition Complete \nTime =", end_time - start_time)
# Time = 
print(train_data) 
 
 
 

# Saving decomposed data as csv 
csv_decomp_train_path = 'csv_pca900decomp_alphabets_train.csv' 
 
 
with open( csv_decomp_train_path, 'w') as f: 
Example #20
    print(stanames)
    AttSta.setInPaths(InPath)
    Files = AttSta.getatt(stanames, 'InPath')

    net = LCB_net()
    net.AddFilesSta(Files)
    X = net.getvarallsta(var=var, by='H', how='mean', From=From, To=To)

    #     X = X[0:100]
    #     for i in range(17):
    #         X = pd.concat([X,X], axis=1)
    #     print X.shape

    tic = time.perf_counter()
    pca = IncrementalPCA(n_components=2, batch_size=3)
    K = pca.fit_transform(X)
    toc = time.perf_counter()
    print(toc - tic)

    #For comparison, compute PCA
    tic = time.perf_counter()
    pca = PCA(n_components=2)
    H = pca.fit_transform(X)
    toc = time.perf_counter()
    print(toc - tic)

    plt.figure()
    plt.plot(K)

    plt.figure()
    plt.plot(H)
vec.fit(description)
features = vec.transform(description)

US_df['category_id'].nunique()
cls = MiniBatchKMeans(n_clusters=16, random_state=0)
cls.fit(features)
cls.predict(features)

from sklearn.metrics import homogeneity_score
homogeneity_score(US_df.category_id, cls.predict(features))
from sklearn.metrics import completeness_score
completeness_score(US_df.category_id, cls.predict(features))

# reduce the features to 2D
ipca = IncrementalPCA(n_components=2, batch_size=100)
reduced_features = ipca.fit_transform(features.toarray())
# reduce the cluster centers to 2D
reduced_cluster_centers = ipca.transform(cls.cluster_centers_)

plt.scatter(reduced_features[:, 0],
            reduced_features[:, 1],
            c=cls.predict(features))
plt.scatter(reduced_cluster_centers[:, 0],
            reduced_cluster_centers[:, 1],
            marker='x',
            s=150,
            c='b')

#CLUSTERING TITLE

#TF-IDF
vec = TfidfVectorizer(stop_words="english")
vec.fit(US_df["title"])
def prep_data(
    train,
    test,
    feature_groups,
    scale_data=False,
):
    """Feature selection and data preprocessing.

    Args:
        feature_groups (string): Which features to use.
                                 Options:
                                 "all" - All features
                                 "submodels" - Only features from submodels
                                 "basic_text" - Only basic text features like word counts, etc.
                                 "non_linguistic" - All main features not including pos,dep,ent features from spacy
                                 "other" - review_stars, grade_level, polarity, subjectivity
                                 "spacy_linguistic" - Only pos,ent.dep features from spacy
                                 "top_features" - Top 15 features chosen from feature selection steps
                                 "pca" - Top 20 PCA Features
        scale_data (bool, optional): Whether or not to standard scale data. Defaults to False.
    """
    # Feature Selection
    feature_options = {}
    feature_options["submodels"] = [
        "nb_prob",
        "svm_pred",
        "ft_prob",
        "lda_t1",
        "lda_t2",
        "lda_t3",
        "lda_t4",
        "lda_t5",
    ]
    feature_options["other"] = [
        "review_stars",
        "grade_level",
        "polarity",
        "subjectivity",
    ]
    feature_options["basic_text"] = [
        "word_cnt",
        "character_cnt",
        "num_cnt",
        "uppercase_cnt",
        "#@_cnt",
        "sentence_cnt",
        "lexicon_cnt",
        "syllable_cnt",
        "avg_word_len",
        "token_cnt",
        "stopword_cnt",
        "stopword_pct",
        "ent_cnt",
        "ent_pct",
    ]
    feature_options["spacy_linguistic"] = [
        "pos_adj_pct",
        "pos_adj_cnt",
        "pos_adp_pct",
        "pos_adp_cnt",
        "pos_adv_pct",
        "pos_adv_cnt",
        "pos_aux_pct",
        "pos_aux_cnt",
        "pos_conj_pct",
        "pos_conj_cnt",
        "pos_det_pct",
        "pos_det_cnt",
        "pos_intj_pct",
        "pos_intj_cnt",
        "pos_noun_pct",
        "pos_noun_cnt",
        "pos_num_pct",
        "pos_num_cnt",
        "pos_part_pct",
        "pos_part_cnt",
        "pos_pron_pct",
        "pos_pron_cnt",
        "pos_propn_pct",
        "pos_propn_cnt",
        "pos_punct_pct",
        "pos_punct_cnt",
        "pos_sconj_pct",
        "pos_sconj_cnt",
        "pos_sym_pct",
        "pos_sym_cnt",
        "pos_verb_pct",
        "pos_verb_cnt",
        "pos_x_pct",
        "pos_x_cnt",
        "dep_root_pct",
        "dep_root_cnt",
        "dep_acl_pct",
        "dep_acl_cnt",
        "dep_acomp_pct",
        "dep_acomp_cnt",
        "dep_advcl_pct",
        "dep_advcl_cnt",
        "dep_advmod_pct",
        "dep_advmod_cnt",
        "dep_agent_pct",
        "dep_agent_cnt",
        "dep_amod_pct",
        "dep_amod_cnt",
        "dep_appos_pct",
        "dep_appos_cnt",
        "dep_attr_pct",
        "dep_attr_cnt",
        "dep_aux_pct",
        "dep_aux_cnt",
        "dep_auxpass_pct",
        "dep_auxpass_cnt",
        "dep_case_pct",
        "dep_case_cnt",
        "dep_cc_pct",
        "dep_cc_cnt",
        "dep_ccomp_pct",
        "dep_ccomp_cnt",
        "dep_compound_pct",
        "dep_compound_cnt",
        "dep_conj_pct",
        "dep_conj_cnt",
        "dep_csubj_pct",
        "dep_csubj_cnt",
        "dep_csubjpass_pct",
        "dep_csubjpass_cnt",
        "dep_dative_pct",
        "dep_dative_cnt",
        "dep_dep_pct",
        "dep_dep_cnt",
        "dep_det_pct",
        "dep_det_cnt",
        "dep_dobj_pct",
        "dep_dobj_cnt",
        "dep_expl_pct",
        "dep_expl_cnt",
        "dep_intj_pct",
        "dep_intj_cnt",
        "dep_mark_pct",
        "dep_mark_cnt",
        "dep_meta_pct",
        "dep_meta_cnt",
        "dep_neg_pct",
        "dep_neg_cnt",
        "dep_nmod_pct",
        "dep_nmod_cnt",
        "dep_npadvmod_pct",
        "dep_npadvmod_cnt",
        "dep_nsubj_pct",
        "dep_nsubj_cnt",
        "dep_nsubjpass_pct",
        "dep_nsubjpass_cnt",
        "dep_nummod_pct",
        "dep_nummod_cnt",
        "dep_oprd_pct",
        "dep_oprd_cnt",
        "dep_parataxis_pct",
        "dep_parataxis_cnt",
        "dep_pcomp_pct",
        "dep_pcomp_cnt",
        "dep_pobj_pct",
        "dep_pobj_cnt",
        "dep_poss_pct",
        "dep_poss_cnt",
        "dep_preconj_pct",
        "dep_preconj_cnt",
        "dep_predet_pct",
        "dep_predet_cnt",
        "dep_prep_pct",
        "dep_prep_cnt",
        "dep_prt_pct",
        "dep_prt_cnt",
        "dep_punct_pct",
        "dep_punct_cnt",
        "dep_quantmod_pct",
        "dep_quantmod_cnt",
        "dep_relcl_pct",
        "dep_relcl_cnt",
        "dep_xcomp_pct",
        "dep_xcomp_cnt",
        "ent_cardinal_pct",
        "ent_cardinal_cnt",
        "ent_date_pct",
        "ent_date_cnt",
        "ent_event_pct",
        "ent_event_cnt",
        "ent_fac_pct",
        "ent_fac_cnt",
        "ent_gpe_pct",
        "ent_gpe_cnt",
        "ent_language_pct",
        "ent_language_cnt",
        "ent_law_pct",
        "ent_law_cnt",
        "ent_loc_pct",
        "ent_loc_cnt",
        "ent_money_pct",
        "ent_money_cnt",
        "ent_norp_pct",
        "ent_norp_cnt",
        "ent_ordinal_pct",
        "ent_ordinal_cnt",
        "ent_org_pct",
        "ent_org_cnt",
        "ent_percent_pct",
        "ent_percent_cnt",
        "ent_person_pct",
        "ent_person_cnt",
        "ent_product_pct",
        "ent_product_cnt",
        "ent_quantity_pct",
        "ent_quantity_cnt",
        "ent_time_pct",
        "ent_time_cnt",
        "ent_work_of_art_pct",
        "ent_work_of_art_cnt",
    ]

    feature_options["top_features"] = [
        "svm_pred",
        "ft_prob",
        "nb_prob",
        "token_cnt",
        "review_stars",
        "polarity",
        "subjectivity",
        "grade_level",
        "character_cnt",
        "avg_word_len",
        "lda_t1",
        "lda_t2",
        "lda_t3",
        "lda_t4",
        "lda_t5",
    ]

    feature_options["all"] = (
        feature_options["submodels"]
        + feature_options["other"]
        + feature_options["basic_text"]
        + feature_options["spacy_linguistic"]
    )
    feature_options["non_linguistic"] = (
        feature_options["submodels"]
        + feature_options["other"]
        + feature_options["basic_text"]
    )
    feature_options["pca"] = feature_options["all"]

    features = feature_options[feature_groups] + [
        "review_id",
        "target_clf",
        "target_reg",
    ]

    train = train[features]
    test = test[features]

    # Data Split (Train/Test)
    X_train = train.drop(columns=["review_id", "target_clf", "target_reg"])
    X_test = test.drop(columns=["review_id", "target_clf", "target_reg"])
    y_train = train["target_clf"]
    y_test = test["target_clf"]

    print("\nData Split Complete")
    print(f"X_train Shape: {X_train.shape}")
    print(f"X_test Shape: {X_test.shape}")
    print(f"y_train Shape: {y_train.shape}")
    print(f"y_test Shape: {y_test.shape}")

    # Preprocessing Options
    if scale_data and feature_groups != "pca":
        start = time.perf_counter()
        standard_scaler = StandardScaler()
        X_train_scaled = standard_scaler.fit_transform(X_train)
        X_test_scaled = standard_scaler.transform(X_test)
        end = time.perf_counter()
        print("\nTrain and Test Data Scaled")
        print(f"Preprocessing took {(end-start):.2f} seconds.")
        print(f"X_train Shape: {X_train_scaled.shape}")
        print(f"X_test Shape: {X_test_scaled.shape}")
        print(f"y_train Shape: {y_train.shape}")
        print(f"y_test Shape: {y_test.shape}")
        return (X_train_scaled, X_test_scaled, y_train, y_test)
    elif feature_groups == "pca":
        start = time.perf_counter()
        standard_scaler = StandardScaler()
        X_train_scaled = standard_scaler.fit_transform(X_train)
        X_test_scaled = standard_scaler.transform(X_test)
        end = time.perf_counter()
        print("\nTrain and Test Data Scaled")
        print(f"Feature Scaling took {(end-start):.2f} seconds.")

        start = time.perf_counter()
        pca = IncrementalPCA(n_components=20)
        X_train_pca = pca.fit_transform(X_train_scaled)
        X_test_pca = pca.transform(X_test_scaled)
        end = time.perf_counter()
        print("\nTrain and Test Data PCA Complete")
        print(f"PCA took {(end-start):.2f} seconds.")
        print(f"X_train Shape: {X_train_pca.shape}")
        print(f"X_test Shape: {X_test_pca.shape}")
        print(f"y_train Shape: {y_train.shape}")
        print(f"y_test Shape: {y_test.shape}")
        return (X_train_pca, X_test_pca, y_train, y_test)
    else:
        return (X_train, X_test, y_train, y_test)
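# A hedged usage sketch (not from the original): `train_df` and `test_df` are
# hypothetical DataFrames containing the feature columns listed above plus
# 'review_id', 'target_clf' and 'target_reg'. The "pca" option scales the
# features and keeps the top 20 IncrementalPCA components.
X_train_pca, X_test_pca, y_train, y_test = prep_data(
    train_df, test_df, feature_groups="pca")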
Example #23
from PIL import Image
import numpy as np
from pybooru import Danbooru
from pathlib import Path
import urllib.request
import os

import joblib  # sklearn.externals.joblib has been removed from scikit-learn; use joblib directly
from sklearn.decomposition import IncrementalPCA

classfier = joblib.load("model.pkl")
ipca = IncrementalPCA(n_components=20)

hoge = [0.5] * 200
hoge = np.array(hoge)
#hoge.reshape(1,-1)
#print(hoge.shape)
data = []
data.append(hoge)
data = np.array(data)
print(data.shape)
data = ipca.fit_transform(data)  # note: fitting 20 components requires at least 20 samples; a single-row input cannot be fitted here
print(data.shape)
pr_label = classfier.predict(data)
print(pr_label)
Example #24
        print (i.shape)
        i = np.pad(i,pad_width=(0,max-i.shape[0]), mode='constant', constant_values = 0 ).flatten()
        print(i.shape)
    return array_list,max

def switch_list_to_ndarray(array_list, max):
    # collect the (already padded, equal-length) arrays and stack them into one ndarray
    new_array = []
    for i in array_list:
        new_array.append(i)
    return np.array(new_array)



spectrograms,max_length= clean_and_pad_mfcc(spectrograms)
spectrograms = np.array(spectrograms)
print(type(spectrograms[0]))
print(spectrograms[0].shape)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(spectrograms, labels, test_size=0.4, random_state=0)


from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA(n_components=100)  # keep the default batch_size: each batch must hold at least n_components samples
X_train = ipca.fit_transform(X_train)
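
A sketch of the same reduction applied consistently to both splits, on random stand-in data: the reducer is fitted on the training split only and then reused for the test split, and batch_size is kept at least as large as n_components (IncrementalPCA raises an error otherwise).

import numpy as np
from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import train_test_split

X_demo = np.random.rand(300, 400)   # stand-in for the flattened spectrograms
y_demo = np.random.randint(0, 5, size=300)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.4, random_state=0)

ipca_demo = IncrementalPCA(n_components=100, batch_size=120)  # batch_size >= n_components
X_tr_red = ipca_demo.fit_transform(X_tr)  # fit on the training split only
X_te_red = ipca_demo.transform(X_te)      # project the test split with the same components
print(X_tr_red.shape, X_te_red.shape)     # (180, 100) (120, 100)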



Exemple #25
def parse_args():
    parser = argparse.ArgumentParser(
        description='Tag wikipedia article names with artist names')
    parser.add_argument('--infile', default='apsp.npy', help='APSP matrix')
    parser.add_argument('--outfile',
                        default='model.pkl',
                        help='Model save file')
    parser.add_argument('--num_components',
                        default=100,
                        type=int,
                        help='Number latent topics in the model')

    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()

    model = IncrementalPCA(n_components=args.num_components)

    apsp = np.load(args.infile)
    # apsp_gpu = gpuarray.GPUArray(np.shape(apsp), np.float32, order="F")
    # apsp_gpu.set(apsp)

    print('Fitting model')
    # model = model.fit_transform(apsp_gpu)
    model = model.fit_transform(apsp)

    print('Saving model')
    joblib.dump(model, args.outfile)
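
One thing to keep in mind with the script above: fit_transform returns the projected coordinates, so model.pkl ends up holding an array rather than the fitted IncrementalPCA. A sketch, with a random stand-in matrix and hypothetical file names, that keeps the fitted reducer and the embedding separate so new rows can be projected later:

import numpy as np
import joblib
from sklearn.decomposition import IncrementalPCA

apsp_demo = np.random.rand(500, 500)          # stand-in for the APSP matrix
ipca_demo = IncrementalPCA(n_components=100)
embedding = ipca_demo.fit_transform(apsp_demo)

joblib.dump(ipca_demo, "ipca_model.pkl")      # the fitted reducer (hypothetical name)
np.save("apsp_embedding.npy", embedding)      # the low-dimensional coordinates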
Exemple #26
def main():
    """
    Get data from db and save it as csv
    """

    bq = BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io=io)

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.model == 'rf':
        model = RandomForestRegressor(
            n_estimators=options.n_estimators,
            n_jobs=-1,
            min_samples_leaf=options.min_samples_leaf,
            min_samples_split=options.min_samples_split,
            max_features=options.max_features,
            max_depth=options.max_depth,
            bootstrap=options.bootstrap)
    elif options.model == 'lr':
        model = SGDRegressor(warm_start=True,
                             max_iter=options.n_loops,
                             shuffle=options.shuffle,
                             power_t=options.power_t,
                             penalty=options.regularizer,
                             learning_rate=options.learning_rate,
                             eta0=options.eta0,
                             alpha=options.alpha,
                             tol=0.0001)
    elif options.model == 'svr':
        model = SVR()
    elif options.model == 'ard':
        model = ARDRegression(n_iter=options.n_loops,
                              alpha_1=options.alpha_1,
                              alpha_2=options.alpha_2,
                              lambda_1=options.lambda_1,
                              lambda_2=options.lambda_2,
                              threshold_lambda=options.threshold_lambda,
                              fit_intercept=options.fit_intercept,
                              copy_X=options.copy_X)
    elif options.model == 'gp':
        k_long_term = 66.0**2 * RBF(length_scale=67.0)
        k_seasonal = 2.4**2 * RBF(length_scale=90.0) * ExpSineSquared(
            length_scale=150, periodicity=1.0, periodicity_bounds=(0, 10000))
        k_medium_term = 0.66**2 * RationalQuadratic(length_scale=1.2,
                                                    alpha=0.78)
        k_noise = 0.18**2 * RBF(length_scale=0.134) + WhiteKernel(
            noise_level=0.19**2)
        #kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise
        kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise

        model = GaussianProcessRegressor(
            kernel=kernel_gpml,  #alpha=0,
            optimizer=None,
            normalize_y=True)
    elif options.model == 'llasso':
        model = LocalizedLasso(num_iter=options.n_loops,
                               batch_size=options.batch_size)
    elif options.model == 'nlasso':
        model = NetworkLasso(num_iter=options.n_loops,
                             batch_size=options.batch_size)

        graph_data = pd.read_csv(options.graph_data,
                                 names=[
                                     'date', 'start_hour', 'src', 'dst',
                                     'type', 'sum_delay', 'sum_ahead',
                                     'add_delay', 'add_ahead', 'train_count'
                                 ])

        #stations_to_pick = options.stations_to_pick.split(',')
        #graph = model.fetch_connections(graph_data, stations_to_pick)
        model.fetch_connections(graph_data)

    if options.pca:
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    rmses, maes, r2s, skills, start_times, end_times, end_times_obj = [], [], [], [], [], [], []
    X_complete = []  # Used for feature selection

    start = starttime
    end = start + timedelta(days=int(options.day_step),
                            hours=int(options.hour_step))
    if end > endtime: end = endtime

    while end <= endtime and start < end:
        logging.info('Processing time range {} - {}'.format(
            start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M')))

        # Load data ############################################################
        try:
            logging.info('Reading data...')
            data = bq.get_rows(start,
                               end,
                               loc_col='trainstation',
                               project=options.project,
                               dataset=options.feature_dataset,
                               table=options.feature_table,
                               parameters=all_param_names,
                               only_winters=options.only_winters)
            data = io.filter_train_type(labels_df=data,
                                        train_types=options.train_types,
                                        sum_types=True,
                                        train_type_column='train_type',
                                        location_column='trainstation',
                                        time_column='time',
                                        sum_columns=['train_count', 'delay'],
                                        aggs=aggs)

            # Filter only timesteps with large distribution in the whole network
            if options.filter_delay_limit is not None:
                data = io.filter_delay_with_limit(data,
                                                  options.filter_delay_limit)

            if options.y_avg_hours is not None:
                data = io.calc_running_delay_avg(data, options.y_avg_hours)

            if options.y_avg:
                data = io.calc_delay_avg(data)

            data.sort_values(by=['time', 'trainstation'], inplace=True)

            if options.impute:
                logging.info('Imputing missing values...')
                data.drop(columns=['train_type'], inplace=True)
                data = imputer.fit_transform(data)
                data.loc[:, 'train_type'] = None

            if options.month:
                logging.info('Adding month to the dataset...')
                data['month'] = data['time'].map(lambda x: x.month)
                if 'month' not in options.feature_params:
                    options.feature_params.append('month')

            if options.model == 'ard' and len(data) > options.n_samples:
                logging.info('Sampling {} values from data...'.format(
                    options.n_samples))
                data = data.sample(options.n_samples)

            l_data = data.loc[:, options.label_params]
            f_data = data.loc[:, options.feature_params]

        except ValueError as e:
            f_data, l_data = [], []

        if len(f_data) < 2 or len(l_data) < 2:
            start = end
            end = start + timedelta(days=int(options.day_step),
                                    hours=int(options.hour_step))
            continue

        logging.info('Processing {} rows...'.format(len(f_data)))

        train, test = train_test_split(data, test_size=0.1)
        X_train = train.loc[:,
                            options.feature_params].astype(np.float32).values
        y_train = train.loc[:, options.label_params].astype(
            np.float32).values.ravel()
        X_test = test.loc[:, options.feature_params].astype(np.float32).values
        y_test = test.loc[:, options.label_params].astype(
            np.float32).values.ravel()

        logging.debug('Features shape: {}'.format(X_train.shape))

        if options.normalize:
            logging.info('Normalizing data...')
            xscaler, yscaler = StandardScaler(), StandardScaler()

            X_train = xscaler.fit_transform(X_train)
            X_test = xscaler.transform(X_test)

            if len(options.label_params) == 1:
                y_train = yscaler.fit_transform(y_train.reshape(-1, 1)).ravel()
                #y_test = yscaler.transform(y_test.reshape(-1, 1)).ravel()
            else:
                y_train = yscaler.fit_transform(y_train)
                #y_test = yscaler.transform(y_test)

        if options.pca:
            logging.info('Doing PCA analysis for the data...')
            X_train = ipca.fit_transform(X_train)
            fname = options.output_path + '/ipca_explained_variance.png'
            viz.explained_variance(ipca, fname)
            #io._upload_to_bucket(filename=fname, ext_filename=fname)
            X_test = ipca.transform(X_test)  # project the test set with the components fitted on the training set

        if options.model == 'llasso':
            graph_data = pd.read_csv(options.graph_data,
                                     names=[
                                         'date', 'start_hour', 'src', 'dst',
                                         'type', 'sum_delay', 'sum_ahead',
                                         'add_delay', 'add_ahead',
                                         'train_count'
                                     ])
            graph = model.fetch_connections(graph_data)

        logging.debug('Features shape after pre-processing: {}'.format(
            X_train.shape))

        # FIT ##################################################################

        if options.cv:
            logging.info('Doing random search for hyper parameters...')

            if options.model == 'rf':
                param_grid = {
                    "n_estimators": [10, 100, 200, 800],
                    "max_depth": [3, 20, None],
                    "max_features": ["auto", "sqrt", "log2", None],
                    "min_samples_split": [2, 5, 10],
                    "min_samples_leaf": [1, 2, 4, 10],
                    "bootstrap": [True, False]
                }
            elif options.model == 'lr':
                param_grid = {
                    "penalty": [None, 'l2', 'l1'],
                    "alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1],
                    "l1_ratio": [0.1, 0.15, 0.2, 0.5],
                    "shuffle": [True, False],
                    "learning_rate": ['constant', 'optimal', 'invscaling'],
                    "eta0": [0.001, 0.01, 0.1],
                    "power_t": [0.1, 0.25, 0.5]
                }
            elif options.model == 'svr':
                param_grid = {
                    "C": [0.001, 0.01, 0.1, 1, 10],
                    "epsilon": [0.01, 0.1, 0.5],
                    "kernel":
                    ['rbf', 'linear', 'poly', 'sigmoid', 'precomputed'],
                    "degree": [2, 3, 4],
                    "shrinking": [True, False],
                    "gamma": [0.001, 0.01, 0.1],
                    "coef0": [0, 0.1, 1]
                }
            else:
                raise ValueError("No param_grid set for given model ({})".format(
                    options.model))

            random_search = RandomizedSearchCV(model,
                                               param_distributions=param_grid,
                                               n_iter=int(
                                                   options.n_iter_search),
                                               n_jobs=-1)

            random_search.fit(X_train, y_train)
            logging.info("RandomizedSearchCV done.")
            fname = options.output_path + '/random_search_cv_results.txt'
            io.report_cv_results(random_search.cv_results_, fname)
            #io._upload_to_bucket(filename=fname, ext_filename=fname)
            sys.exit()
        else:
            logging.info('Training...')
            if options.model in ['rf', 'svr', 'ard', 'gp']:
                model.fit(X_train, y_train)
                if options.feature_selection:
                    X_complete = X_train
                    y_complete = y_train
                    meta_complete = data.loc[:, options.meta_params]
            elif options.model in ['llasso']:
                model.fit(X_train,
                          y_train,
                          stations=train.loc[:, 'trainstation'].values)
            elif options.model in ['nlasso']:
                model.partial_fit(X_train,
                                  y_train,
                                  stations=train.loc[:, 'trainstation'].values)
            else:
                model.partial_fit(X_train, y_train)
                if options.feature_selection:
                    try:
                        # stack the new window's rows onto the accumulated arrays
                        X_complete = np.append(X_complete, X_train, axis=0)
                        y_complete = np.append(y_complete, y_train)
                        meta_complete = meta_complete.append(
                            data.loc[:, options.meta_params])
                    except (ValueError, NameError):
                        X_complete = X_train
                        y_complete = y_train
                        meta_complete = data.loc[:, options.meta_params]

        # EVALUATE #############################################################

        # Check training score to estimate amount of overfitting
        # Here we assume that we have a datetime index (from time columns)
        y_pred_train = model.predict(X_train)
        rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
        mae_train = mean_absolute_error(y_train, y_pred_train)
        logging.info('Training data RMSE: {} and MAE: {}'.format(
            rmse_train, mae_train))

        #try:
        if True:
            print(train)
            #range = ('2013-02-01','2013-02-28')
            range = ('2010-01-01', '2010-01-02')
            X_train_sample = train.loc[range[0]:range[1],
                                       options.feature_params].astype(
                                           np.float32).values

            target = train.loc[range[0]:range[1], options.label_params].astype(
                np.float32).values.ravel()
            y_pred_sample = model.predict(X_train_sample)

            times = train.loc[range[0]:range[1], 'time'].values
            df = pd.DataFrame(times + y_pred_sample)
            print(df)
            sys.exit()

            # Draw visualisation
            fname = '{}/timeseries_training_data.png'.format(
                options.output_path)
            viz.plot_delay(times, target, y_pred,
                           'Delay for station {}'.format(stationName), fname)

            fname = '{}/scatter_all_stations.png'.format(options.vis_path)
            viz.scatter_predictions(times,
                                    target,
                                    y_pred,
                                    savepath=options.vis_path,
                                    filename='scatter_{}'.format(station))
        #except KeyError:
        #    pass

        # Mean delay over the whole dataset (both train and validation),
        # used to calculate Brier Skill
        if options.y_avg:
            mean_delay = 3.375953418071136
        else:
            mean_delay = 6.011229358531166

        if options.model == 'llasso':
            print('X_test shape: {}'.format(X_test.shape))
            y_pred, weights = model.predict(X_test,
                                            test.loc[:, 'trainstation'].values)
        else:
            y_pred = model.predict(X_test)

        if options.normalize:
            y_pred = yscaler.inverse_transform(y_pred)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse_stat = math.sqrt(
            mean_squared_error(y_test, np.full_like(y_test, mean_delay)))
        skill = 1 - rmse / rmse_stat

        rmses.append(rmse)
        maes.append(mae)
        r2s.append(r2)
        skills.append(skill)
        start_times.append(start.strftime('%Y-%m-%dT%H:%M:%S'))
        end_times.append(end.strftime('%Y-%m-%dT%H:%M:%S'))
        end_times_obj.append(end)

        if options.model in ['rf', 'lr', 'ard', 'gp']:
            logging.info('R2 score for training: {}'.format(
                model.score(X_train, y_train)))

        logging.info('RMSE: {}'.format(rmse))
        logging.info('MAE: {}'.format(mae))
        logging.info('R2 score: {}'.format(r2))
        logging.info('Brier Skill Score score: {}'.format(skill))

        start = end
        end = start + timedelta(days=int(options.day_step),
                                hours=int(options.hour_step))
        if end > endtime: end = endtime

    # SAVE #####################################################################
    io.save_scikit_model(model,
                         filename=options.save_file,
                         ext_filename=options.save_file)
    if options.normalize:
        fname = options.save_path + '/xscaler.pkl'
        io.save_scikit_model(xscaler, filename=fname, ext_filename=fname)
        fname = options.save_path + '/yscaler.pkl'
        io.save_scikit_model(yscaler, filename=fname, ext_filename=fname)

    if options.model == 'rf':
        fname = options.output_path + '/rfc_feature_importance.png'
        viz.rfc_feature_importance(model.feature_importances_,
                                   fname,
                                   feature_names=options.feature_params)
        #io._upload_to_bucket(filename=fname, ext_filename=fname)

    try:
        fname = options.output_path + '/learning_over_time.png'
        viz.plot_learning_over_time(end_times_obj,
                                    rmses,
                                    maes,
                                    r2s,
                                    filename=fname)
        #io._upload_to_bucket(filename=fname, ext_filename=fname)
    except Exception as e:
        logging.error(e)

    error_data = {
        'start_times': start_times,
        'end_times': end_times,
        'rmse': rmses,
        'mae': maes,
        'r2': r2s,
        'skill': skills
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    # FEATURE SELECTION ########################################################
    if options.feature_selection:
        logging.info('Doing feature selection...')
        selector = SelectFromModel(model, prefit=True)
        print(pd.DataFrame(data=X_complete))
        X_selected = selector.transform(X_complete)

        selected_columns = f_data.columns.values[selector.get_support()]
        logging.info(
            'Selected following parameters: {}'.format(selected_columns))
        data_sel = meta_complete.join(
            pd.DataFrame(data=y_complete, columns=options.label_params)).join(
                pd.DataFrame(data=X_selected, columns=selected_columns))

        print(pd.DataFrame(data=X_selected, columns=selected_columns))
        print(data_sel)
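
In the loop above the IncrementalPCA is refitted from scratch on every time range; when all windows are meant to share a single projection, the incremental pattern is to accumulate them with partial_fit and call transform afterwards. A self-contained sketch with random stand-in windows:

import numpy as np
from sklearn.decomposition import IncrementalPCA

windows = [np.random.rand(500, 40) for _ in range(5)]   # illustrative stream of feature chunks

ipca_demo = IncrementalPCA(n_components=10)
for chunk in windows:
    ipca_demo.partial_fit(chunk)            # update the components incrementally

# after all windows have been seen, project each chunk with the shared components
reduced = [ipca_demo.transform(chunk) for chunk in windows]
print(reduced[0].shape)                     # (500, 10)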
Exemple #27
# print(train_data)
# print(train_label)

end_time = time.clock()
print("Loading Complete \nTime =", end_time - start_time)
# Time =

# Data decomposition
print("Now Decomposing Data")
start_time = time.clock()

#from sklearn.decomposition import TruncatedSVD

#decomp = TruncatedSVD(n_components=1000,n_iter=5)
#decomp.fit(train_data)
train_data = pca.fit_transform(train_data)

end_time = time.clock()
print("Decomposing Complete \nTime =", end_time - start_time)
# Time =
print(train_data)

# Saving decomposed data as csv
csv_decomp_train_path = 'csv_pca900decomp_alphabets_train.csv'

with open(csv_decomp_train_path, 'w') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerows(train_data)  # one CSV row per decomposed sample
    #writer.writerow('\n')

########## Learning ###################################
Exemple #28
    def projectPCA(self):
        return np.dot(self.data, self.eigenvector)[:, :2]


if __name__ == '__main__':
    rng = np.random.RandomState(1)  # renamed from "range" to avoid shadowing the built-in
    # Load Data
    data = np.dot(rng.rand(2, 2), rng.randn(2, 200)).T
    plt.scatter(data[:, 0], data[:, 1])
    # Show Data
    plt.axis('equal')
    plt.savefig('PCAData.png')


    # Numpy PCA
    pca = PCANumpy(data=data)
    # Print projected PCA
    print('Numpy PCA: ')
    print(pca.projectPCA())
    print('\n============\n')

    # Scikitlearn PCA
    pcaSklearn = IncrementalPCA(n_components=2, batch_size=10)
    newSklearnPCA = pcaSklearn.fit_transform(data)
    print('Scikit-learn PCA: ')
    print(newSklearnPCA)

    # Distance between matrices
    print('\n============\n')
    print("Distance Between Matrices:")
    print(np.linalg.norm(pca.projectPCA() - newSklearnPCA))
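
A caveat on the distance printed above: principal components are only defined up to a per-component sign flip, so a large norm does not necessarily mean the two projections disagree. A small sketch of a sign-aligned comparison, assuming two projection matrices of the same shape (for example, sign_aligned_distance(pca.projectPCA(), newSklearnPCA)):

import numpy as np

def sign_aligned_distance(A, B):
    # flip each column of B to whichever sign correlates best with the matching
    # column of A before measuring the distance between the two projections
    signs = np.sign(np.sum(A * B, axis=0))
    signs[signs == 0] = 1.0
    return np.linalg.norm(A - B * signs)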
Exemple #29
def post_eval(data, params, replot_runtimeplots=False):
    fontsize = 18

    if replot_runtimeplots:
        print('Plotting runtime plots...')
        try:
            les = np.loadtxt(params['saveto'] + 'les.out')
            plot_les(les, params, fontsize)
        except:
            print('LE plotting failed.')

        try:
            pws = np.loadtxt(params['saveto'] + 'pws.out')
            plot_traj(pws, params, fontsize)
        except:
            print('Projected trajectory plotting failed.')

    d_rng = intersection(params['freeze_d_its'],
                         (params['start_lam_it'], params['max_iter'] - 1))
    g_rng = intersection(params['freeze_g_its'],
                         (params['start_lam_it'], params['max_iter'] - 1))
    if d_rng is not None and g_rng is not None:
        both_rng = intersection(list(d_rng), list(g_rng))
    else:
        both_rng = None

    print('Plotting gradient norms...')
    try:
        ds = np.loadtxt(params['saveto'] + 'd_norm.out')
        fig = plt.figure()
        ax = plt.subplot(111)
        plt.plot(range(len(ds)), ds, 'k-')
        if d_rng is not None:
            plt.plot(d_rng, ds[d_rng], '-', color='dodgerblue')
        if g_rng is not None: plt.plot(g_rng, ds[g_rng], 'r-')
        if both_rng is not None:
            plt.plot(both_rng, ds[both_rng], '-', color='lime')
        ax.set_ylabel('Discriminator Gradient L2 Norm', fontsize=fontsize)
        ax.set_xlabel('Iteration', fontsize=fontsize)
        plt.tick_params(axis='both', which='major', labelsize=fontsize)
        plt.title('Final Norm: {:.3e}'.format(ds[-1]), fontsize=fontsize)
        locs, _ = plt.xticks()
        if locs[-1] >= 10000:
            newlocs = [loc for loc in locs if loc >= 0 and loc < len(ds)]
            xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs]
            plt.xticks(newlocs, xlabels)
        plt.tight_layout()
        fig.savefig(params['saveto'] + 'd_norm.pdf')
    except:
        print('d_norm.out not found.')

    try:
        gs = np.loadtxt(params['saveto'] + 'g_norm.out')
        fig = plt.figure()
        ax = plt.subplot(111)
        plt.plot(range(len(gs)), gs, 'k-')
        if d_rng is not None:
            plt.plot(d_rng, gs[d_rng], '-', color='dodgerblue')
        if g_rng is not None: plt.plot(g_rng, gs[g_rng], 'r-')
        if both_rng is not None:
            plt.plot(both_rng, gs[both_rng], '-', color='lime')
        ax.set_ylabel('Generator Gradient L2 Norm', fontsize=fontsize)
        ax.set_xlabel('Iteration', fontsize=fontsize)
        plt.tick_params(axis='both', which='major', labelsize=fontsize)
        plt.title('Final Norm: {:.3e}'.format(gs[-1]), fontsize=fontsize)
        locs, _ = plt.xticks()
        if locs[-1] >= 10000:
            newlocs = [loc for loc in locs if loc >= 0 and loc < len(gs)]
            xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs]
            plt.xticks(newlocs, xlabels)
        plt.tight_layout()
        fig.savefig(params['saveto'] + 'g_norm.pdf')
    except:
        print('g_norm.out not found.')

    print('Plotting loss...')
    try:
        fs = np.loadtxt(params['saveto'] + 'loss.out')
        fig = plt.figure()
        ax = plt.subplot(111)
        plt.plot(range(len(fs)), np.array(fs), 'k-')
        if d_rng is not None:
            plt.plot(d_rng, fs[d_rng], '-', color='dodgerblue')
        if g_rng is not None: plt.plot(g_rng, fs[g_rng], 'r-')
        if both_rng is not None:
            plt.plot(both_rng, fs[both_rng], '-', color='lime')
        ax.set_ylabel('Minimax Loss', fontsize=fontsize)
        ax.set_xlabel('Iteration', fontsize=fontsize)
        plt.tick_params(axis='both', which='major', labelsize=fontsize)
        plt.title('Final Loss: {:.3e}'.format(fs[-1]), fontsize=fontsize)
        locs, _ = plt.xticks()
        if locs[-1] >= 10000:
            newlocs = [loc for loc in locs if loc >= 0 and loc < len(fs)]
            xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs]
            plt.xticks(newlocs, xlabels)
        plt.tight_layout()
        fig.savefig(params['saveto'] + 'loss.pdf')
    except:
        print('loss.out not found.')

    print('Loading weights from saved files...')
    weights = []
    for w_i in range(params['start_lam_it'], params['max_iter'],
                     params['weights_every']):
        w_D = flatten_nested(
            pickle.load(
                open(params['saveto'] + 'weights/D_' + str(w_i) + '.pkl',
                     'rb')))
        w_G = flatten_nested(
            pickle.load(
                open(params['saveto'] + 'weights/G_' + str(w_i) + '.pkl',
                     'rb')))
        weights.append(np.hstack([w_D, w_G]))
    weights = np.vstack(weights)

    d_rng = shift_range(d_rng,
                        shift=-params['start_lam_it'],
                        keep_every=params['weights_every'])
    g_rng = shift_range(g_rng,
                        shift=-params['start_lam_it'],
                        keep_every=params['weights_every'])
    both_rng = shift_range(both_rng,
                           shift=-params['start_lam_it'],
                           keep_every=params['weights_every'])

    print('Plotting PCA of trajectory...')
    ipca = IncrementalPCA(n_components=2, batch_size=10)
    X_ipca = ipca.fit_transform(weights)
    fig, ax = plt.subplots()
    path = mpath.Path(X_ipca)
    verts = path.interpolated(steps=1).vertices
    x, y = verts[:, 0], verts[:, 1]
    z = np.linspace(0, 1, len(x))
    colorline(x, y, z, cmap=plt.get_cmap('Greys'), linewidth=1.0)
    if d_rng is not None:
        plt.plot(X_ipca[d_rng, 0],
                 X_ipca[d_rng, 1],
                 '-',
                 color='dodgerblue',
                 lw=0.5)
    if g_rng is not None:
        plt.plot(X_ipca[g_rng, 0], X_ipca[g_rng, 1], 'r-', lw=0.5)
    if both_rng is not None:
        plt.plot(X_ipca[both_rng, 0],
                 X_ipca[both_rng, 1],
                 '-',
                 color='lime',
                 lw=0.5)
    ax.set_xlim([X_ipca[:, 0].min(), X_ipca[:, 0].max()])
    ax.set_ylim([X_ipca[:, 1].min(), X_ipca[:, 1].max()])
    plt.title('Weights Trajectory Projected onto Top-2 PCs\n' +
              r'($p2p_x,p2p_y$)' +
              ' = ({:.3f},{:.3f})'.format(np.ptp(x), np.ptp(y)),
              fontsize=fontsize)
    plt.tick_params(axis='both',
                    which='major',
                    bottom=False,
                    top=False,
                    left=False,
                    right=False)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    fig.tight_layout()
    fig.savefig(params['saveto'] + 'weights_pca.pdf')
    plt.close(fig)

    print('Plotting PCA of normalized trajectory...')
    ipca2 = IncrementalPCA(n_components=2, batch_size=10)
    weights_normalized = (weights - weights.min(axis=0)) / (
        np.ptp(weights, axis=0) + 1e-10)
    X_ipca2 = ipca2.fit_transform(weights_normalized)
    fig, ax = plt.subplots()
    path2 = mpath.Path(X_ipca2)
    verts2 = path2.interpolated(steps=1).vertices
    x2, y2 = verts2[:, 0], verts2[:, 1]
    z2 = np.linspace(0, 1, len(x2))
    colorline(x2, y2, z2, cmap=plt.get_cmap('Greys'), linewidth=1.0)
    if d_rng is not None:
        plt.plot(X_ipca2[d_rng, 0],
                 X_ipca2[d_rng, 1],
                 '-',
                 color='dodgerblue',
                 lw=0.5)
    if g_rng is not None:
        plt.plot(X_ipca2[g_rng, 0], X_ipca2[g_rng, 1], 'r-', lw=0.5)
    if both_rng is not None:
        plt.plot(X_ipca2[both_rng, 0],
                 X_ipca2[both_rng, 1],
                 '-',
                 color='lime',
                 lw=0.5)
    plt.title('Normalized Weights Trajectory\nProjected onto Top-2 PCs\n' +
              r'($p2p_x,p2p_y$)' +
              ' = ({:.3f},{:.3f})'.format(np.ptp(x2), np.ptp(y2)),
              fontsize=fontsize)
    plt.tick_params(axis='both',
                    which='major',
                    bottom=False,
                    top=False,
                    left=False,
                    right=False)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    fig.tight_layout()
    fig.savefig(params['saveto'] + 'weights_pca2.pdf')
    plt.close(fig)

    print('Plotting norm of weights over trajectory...')
    w_norms = np.linalg.norm(weights, axis=1)
    fig = plt.figure()
    plt.plot(range(len(w_norms)), w_norms, 'k-')
    if d_rng is not None:
        plt.plot(d_rng, w_norms[d_rng], '-', color='dodgerblue')
    if g_rng is not None: plt.plot(g_rng, w_norms[g_rng], 'r-')
    if both_rng is not None:
        plt.plot(both_rng, w_norms[both_rng], '-', color='lime')
    plt.xlabel('Iteration', fontsize=fontsize)
    plt.ylabel(r'Norm of Weights ($||w||$)', fontsize=fontsize)
    plt.title('Norm of Weights Over Trajectory\n' + r'($p2p$' +
              '={:.3f})'.format(np.ptp(w_norms)),
              fontsize=fontsize)
    plt.tick_params(axis='both', which='major', labelsize=fontsize)
    locs, _ = plt.xticks()
    if locs[-1] >= 10000:
        newlocs = [loc for loc in locs if loc >= 0 and loc < len(w_norms)]
        xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs]
        plt.xticks(newlocs, xlabels)
    plt.tight_layout()
    fig.savefig(params['saveto'] + 'weight_norms.pdf')
    plt.close(fig)

    print('Plotting distance of weights from mean over trajectory...')
    weights_mean = weights.mean(axis=0)
    w_mean_norms = np.linalg.norm(weights - weights_mean, axis=1)
    fig = plt.figure()
    plt.plot(range(len(w_mean_norms)), w_mean_norms, 'k-')
    if d_rng is not None:
        plt.plot(d_rng, w_mean_norms[d_rng], '-', color='dodgerblue')
    if g_rng is not None: plt.plot(g_rng, w_mean_norms[g_rng], 'r-')
    if both_rng is not None:
        plt.plot(both_rng, w_mean_norms[both_rng], '-', color='lime')
    plt.xlabel('Iteration', fontsize=fontsize)
    plt.ylabel(r'Norm of Weights ($||w||$)', fontsize=fontsize)
    plt.title('Norm of Weights Over Trajectory\n' + r'($p2p$' +
              '={:.3f})'.format(np.ptp(w_mean_norms)),
              fontsize=fontsize)
    plt.tick_params(axis='both', which='major', labelsize=fontsize)
    locs, _ = plt.xticks()
    if locs[-1] >= 10000:
        newlocs = [loc for loc in locs if loc >= 0 and loc < len(w_mean_norms)]
        xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs]
        plt.xticks(newlocs, xlabels)
    plt.tight_layout()
    fig.savefig(params['saveto'] + 'weight_mean_norms.pdf')
    plt.close(fig)

    print('Plotting angular distance of weights from mean over trajectory...')
    w_mean_angles = 180 / np.pi * np.arccos(
        np.sum(weights * weights_mean, axis=1) / w_norms /
        np.linalg.norm(weights_mean))
    fig = plt.figure()
    plt.plot(range(len(w_mean_angles)), w_mean_angles, 'k-')
    if d_rng is not None:
        plt.plot(d_rng, w_mean_angles[d_rng], '-', color='dodgerblue')
    if g_rng is not None: plt.plot(g_rng, w_mean_angles[g_rng], 'r-')
    if both_rng is not None:
        plt.plot(both_rng, w_mean_angles[both_rng], '-', color='lime')
    plt.title('Angular Deviation of Weights\nfrom Mean Over Trajectory\n' +
              r'($p2p$' + '={:.3f})'.format(np.ptp(w_mean_angles)),
              fontsize=fontsize)
    plt.tick_params(axis='both', which='major', labelsize=fontsize)
    plt.ylabel('Angles in degrees', fontsize=fontsize)
    plt.xlabel('Iteration', fontsize=fontsize)
    locs, _ = plt.xticks()
    if locs[-1] >= 10000:
        newlocs = [
            loc for loc in locs if loc >= 0 and loc < len(w_mean_angles)
        ]
        xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs]
        plt.xticks(newlocs, xlabels)
    plt.tight_layout()
    fig.savefig(params['saveto'] + 'weight_mean_angles.pdf')
    plt.close(fig)

    print(
        'Plotting distance of weights from closest vector over trajectory...')
    D = pairwise_distances(weights)
    closest = weights[D.sum(axis=1).argmin()]
    w_closest_norms = np.linalg.norm(weights - closest, axis=1)
    fig = plt.figure()
    plt.plot(range(len(w_closest_norms)), w_closest_norms, 'k-')
    if d_rng is not None:
        plt.plot(d_rng, w_closest_norms[d_rng], '-', color='dodgerblue')
    if g_rng is not None: plt.plot(g_rng, w_closest_norms[g_rng], 'r-')
    if both_rng is not None:
        plt.plot(both_rng, w_closest_norms[both_rng], '-', color='lime')
    plt.title('Norm of Weights Over Trajectory\n' + r'($p2p$' +
              '={:.3f})'.format(np.ptp(w_closest_norms)),
              fontsize=fontsize)
    plt.tick_params(axis='both', which='major', labelsize=fontsize)
    plt.xlabel('Iteration', fontsize=fontsize)
    plt.ylabel(r'Norm of Weights ($||w||$)', fontsize=fontsize)
    locs, _ = plt.xticks()
    if locs[-1] >= 10000:
        newlocs = [
            loc for loc in locs if loc >= 0 and loc < len(w_closest_norms)
        ]
        xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs]
        plt.xticks(newlocs, xlabels)
    plt.tight_layout()
    fig.savefig(params['saveto'] + 'weight_closest_norms.pdf')
    plt.close(fig)

    print('Plotting sample series over epochs...')
    if params['n_viz'] > 0:
        np_samples = []
        for viz_i in range(0, params['max_iter'], params['viz_every']):
            np_samples.append(
                np.load(params['saveto'] + 'samples/' + str(viz_i) + '.npy'))
        data.plot_series(np_samples, params)

    print('Complete.')
Exemple #30
def train_cluster(data_type=0,
                  dimension_reduction=0,
                  cluster_way=0,
                  n_components=50,
                  threshold=2,
                  n_clusters=210,
                  branching_factor=50,
                  linkage=0,
                  max_iter=500,
                  eps=1.0):
    if data_type == 0:
        train_data = load_stage2_tf_idf("")
    elif data_type == 1:
        train_data = load_stage2_tf_idf("")
        nn_data = load_nn_stage2_features()
        train_data = pd.merge(train_data, nn_data, 'left', on="file_name")
    elif data_type == 2:
        train_data = load_nn_stage2_features()
    elif data_type == 3:
        train_data = load_stage2_tf_idf("1000")
        nn_data = load_nn_stage2_features()
        train_data = pd.merge(train_data, nn_data, 'left', on="file_name")
        dll = load_stage2_tf_idf("_dll")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "first")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "last")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        train_data.fillna(0, inplace=True)
    elif data_type == 4:
        train_data = load_stage2_tf_idf("1000")
        nn_data = load_nn_stage2_features()
        train_data = pd.merge(train_data, nn_data, 'left', on="file_name")
        dll = load_stage2_tf_idf("_dll")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "first")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "last")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_clustering_statics_files()
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        train_data.fillna(0, inplace=True)

    file_name = train_data["file_name"]
    train_data.drop(columns=["file_name"], inplace=True)
    X = StandardScaler(with_mean=False).fit_transform(train_data)
    origin_data = X

    if dimension_reduction == 0:
        pass
    elif dimension_reduction == 1:
        model = IncrementalPCA(n_components=n_components)
        X = model.fit_transform(X)
    elif dimension_reduction == 2:
        model = NMF(n_components=n_components,
                    init='random',
                    random_state=0,
                    max_iter=max_iter)
        X = model.fit_transform(X)
    elif dimension_reduction == 3:
        model = PCA(n_components=n_components)
        X = model.fit_transform(X)

    print(len(X[0]))
    if cluster_way == 0:
        mode = ["ward", "complete", "average", "single"]
        db = AgglomerativeClustering(n_clusters=n_clusters,
                                     linkage=mode[linkage]).fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join(
            "predictions",
            "aggcl" + "_" + str(n_clusters) + "_" + str(data_type) + "_" +
            str(dimension_reduction) + "_" + str(n_components) + ".csv"),
                  index=False)
        print(len(set(labels)))
    elif cluster_way == 1:
        db = Birch(branching_factor=branching_factor,
                   n_clusters=n_clusters,
                   threshold=threshold).fit(X)
        labels = db.predict(X)
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join("predictions", "birch" + ".csv"), index=False)
        print(len(set(labels)))
    elif cluster_way == 2:
        db = hdbscan.HDBSCAN(min_cluster_size=40)
        db.fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join("predictions", "hdb_40" + ".csv"), index=False)
        print(len(set(labels)))
    elif cluster_way == 3:
        db = DBSCAN(eps=eps, n_jobs=-1).fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join(
            "predictions",
            "db" + "_" + str(eps) + "_" + str(dimension_reduction) + ".csv"),
                  index=False)
        print(len(set(labels)))
    elif cluster_way == 4:
        labels = np.zeros((len(file_name), ))
        pd.DataFrame(data={
            "id": file_name,
            "family_id": np.zeros((len(file_name), ))
        }).to_csv(os.path.join("predictions", "zeros" + ".csv"), index=False)
    elif cluster_way == 5:
        db = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join("predictions",
                               "kmeans" + str(n_clusters) + ".csv"),
                  index=False)
        print(len(set(labels)))
    elif cluster_way == 6:
        db = AffinityPropagation().fit(X)
        labels = db.labels_  # needed below when counting clusters and noise points

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)

    scores = evaluate_cluster_performance(origin_data, labels)
    evaluate_cluster_performance(X, labels)
    return scores
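
evaluate_cluster_performance above is project-specific; as one common stand-in, the silhouette score gives a label-free quality measure that works for any of the label vectors produced by the branches above. A self-contained sketch on random data:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X_demo = np.random.rand(400, 20)
labels_demo = KMeans(n_clusters=5, random_state=0).fit_predict(X_demo)

# silhouette ranges from -1 to 1; higher means tighter, better-separated clusters
print(silhouette_score(X_demo, labels_demo))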
Exemple #31
# In[5]:

if __name__ == "__main__":
    # Load in data file
    df = pd.read_table(_ROOT_DIR + "data/2018_07_11_pca_te_enhancers/test.tsv")

    # Get features used in PCA
    features_df = df.loc[:, "aaaaaa":"tttttt"]

    # Get labels
    labels_df = df.loc[:, "label"]

    # Create the PCA
    ipca = IncrementalPCA(n_components=N_COMPONENTS)
    features_transformed = ipca.fit_transform(features_df)

    # Label the transformed coordinates
    transformed_df = pca.label_coordinates(
        transformed_coordinates=features_transformed, labels=labels_df)

    # Get a list of all unique labels
    labels_list = list(set(labels_df))

    # Create combinations of principal components to plot
    components = (1, 2, 3, 4, 5)
    combinations_list = generate_combinations(components)

    # Plot different combinations of principal components
    for combination in combinations_list:
Exemple #32
from sklearn.decomposition import IncrementalPCA
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

# Read the data
f = open(r"breast.txt")
line = f.readline()
data_list = []
while line:
    num = list(map(float, line.split()))
    data_list.append(num)
    line = f.readline()
f.close()
data_array = np.array(data_list)
X = data_array[:, :-1]
y = data_array[:, -1]

# Dimensionality reduction
pca = IncrementalPCA(n_components=1)
X = pca.fit_transform(X)

# Clustering
clst = AgglomerativeClustering(n_clusters=2)
clst.fit(X)

# Compute NMI
result_NMI = metrics.normalized_mutual_info_score(y, clst.labels_)
print("result_NMI:", result_NMI)
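
The line-by-line reader above can usually be collapsed into a single call, under the assumption (implied by the parsing) that breast.txt holds only whitespace-separated numeric rows:

import numpy as np

data_array = np.loadtxt("breast.txt")   # one row per line, whitespace-delimited floats
X = data_array[:, :-1]
y = data_array[:, -1]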
        vectors.append(model[word])
        labels.append(word)
print('- found ' + str(len(labels)) + ' entities x ' + str(len(vectors[0])) +
      ' dimensions')

# convert both lists into numpy vectors for reduction
vectors = np.asarray(vectors)
labels = np.asarray(labels)
print('- done')

# if specified, reduce using IncrementalPCA first (down
# to a smaller number of dimensions before the final reduction)
if run_init_reduction:
    print('reducing to ' + str(init_dimensions) + 'D using IncrementalPCA...')
    ipca = IncrementalPCA(n_components=init_dimensions)
    vectors = ipca.fit_transform(vectors)
    print('- done')

    # save reduced vector space to file
    print('- saving as csv...')
    with open(
            'ModelsAndData/' + model_name + '-' + str(init_dimensions) +
            'D.csv', 'w') as f:
        for i in range(len(labels)):
            f.write(labels[i] + ',' + ','.join(map(str, vectors[i])) + '\n')

# reduce using t-SNE
print('reducing to ' + str(num_dimensions) + 'D using t-SNE...')
print('- may take a really, really (really) long time :)')
vectors = np.asarray(vectors)
tsne = TSNE(n_components=num_dimensions, random_state=0)
Exemple #34
def pca(data,
        n_comps=None,
        zero_center=True,
        svd_solver='auto',
        random_state=0,
        return_info=False,
        dtype='float32',
        copy=False,
        chunked=False,
        chunk_size=None):
    """Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition. Uses the
    implementation of *scikit-learn* [Pedregosa11]_.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    n_comps : `int`, optional (default: 50)
        Number of principal components to compute.
    zero_center : `bool` or `None`, optional (default: `True`)
        If `True`, compute standard PCA from covariance matrix. If `False`, omit
        zero-centering variables (uses *TruncatedSVD* from scikit-learn), which
        allows handling sparse input efficiently.
    svd_solver : `str`, optional (default: 'auto')
        SVD solver to use. Either 'arpack' for the ARPACK wrapper in SciPy
        (scipy.sparse.linalg.svds), or 'randomized' for the randomized algorithm
        due to Halko (2009). 'auto' chooses automatically depending on the size
        of the problem.
    random_state : `int`, optional (default: 0)
        Change to use different initial states for the optimization.
    return_info : `bool`, optional (default: `False`)
        Only relevant when not passing an :class:`~anndata.AnnData`: see
        "Returns".
    dtype : `str` (default: 'float32')
        Numpy data type string to which to convert the result.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked : `bool`, optional (default: `False`)
        If `True`, perform an incremental PCA on segments of `chunk_size`. The
        incremental PCA automatically zero centers and ignores settings of
        `random_state` and `svd_solver`. If `False`, perform a full PCA.
    chunk_size : `int`, optional (default: `None`)
        Number of observations to include in each chunk. Required if `chunked`
        is `True`.

    Returns
    -------
    If `data` is array-like and `return_info == False`, only returns `X_pca`,\
    otherwise returns or adds to `adata`:
    X_pca : `.obsm`
         PCA representation of data.
    PCs : `.varm`
         The principal components containing the loadings.
    variance_ratio : `.uns['pca']`
         Ratio of explained variance.
    variance : `.uns['pca']`
         Explained variance, equivalent to the eigenvalues of the covariance matrix.
    """
    # chunked calculation is not randomized, anyways
    if svd_solver in {'auto', 'randomized'} and not chunked:
        logg.info(
            'Note that scikit-learn\'s randomized PCA might not be exactly '
            'reproducible across different computational platforms. For exact '
            'reproducibility, choose `svd_solver=\'arpack\'.` This will likely '
            'become the Scanpy default in the future.')

    if n_comps is None: n_comps = N_PCS

    if isinstance(data, AnnData):
        data_is_AnnData = True
        adata = data.copy() if copy else data
    else:
        data_is_AnnData = False
        adata = AnnData(data)

    logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4)

    if adata.n_vars < n_comps:
        n_comps = adata.n_vars - 1
        logg.msg('reducing number of computed PCs to',
                 n_comps,
                 'as dim of data is only',
                 adata.n_vars,
                 v=4)

    if chunked:
        if not zero_center or random_state or svd_solver != 'auto':
            logg.msg('Ignoring zero_center, random_state, svd_solver', v=4)

        from sklearn.decomposition import IncrementalPCA

        X_pca = np.zeros((adata.X.shape[0], n_comps), adata.X.dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        for chunk, _, _ in adata.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        for chunk, start, end in adata.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    else:
        zero_center = zero_center if zero_center is not None else False if issparse(
            adata.X) else True
        if zero_center:
            from sklearn.decomposition import PCA
            if issparse(adata.X):
                logg.msg(
                    '    as `zero_center=True`, '
                    'sparse input is densified and may '
                    'lead to huge memory consumption',
                    v=4)
                X = adata.X.toarray(
                )  # Copying the whole adata.X here, could cause memory problems
            else:
                X = adata.X
            pca_ = PCA(n_components=n_comps,
                       svd_solver=svd_solver,
                       random_state=random_state)
        else:
            from sklearn.decomposition import TruncatedSVD
            logg.msg(
                '    without zero-centering: \n'
                '    the explained variance does not correspond to the exact statistical definition\n'
                '    the first component, e.g., might be heavily influenced by different means\n'
                '    the following components often resemble the exact PCA very closely',
                v=4)
            pca_ = TruncatedSVD(n_components=n_comps,
                                random_state=random_state)
            X = adata.X
        X_pca = pca_.fit_transform(X)

    if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype)

    if data_is_AnnData:
        adata.obsm['X_pca'] = X_pca
        adata.varm['PCs'] = pca_.components_.T
        adata.uns['pca'] = {}
        adata.uns['pca']['variance'] = pca_.explained_variance_
        adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
        logg.msg('    finished', t=True, end=' ', v=4)
        logg.msg(
            'and added\n'
            '    \'X_pca\', the PCA coordinates (adata.obs)\n'
            '    \'PC1\', \'PC2\', ..., the loadings (adata.var)\n'
            '    \'pca_variance\', the variance / eigenvalues (adata.uns)\n'
            '    \'pca_variance_ratio\', the variance ratio (adata.uns)',
            v=4)
        return adata if copy else None
    else:
        if return_info:
            return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_
        else:
            return X_pca
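
An illustrative usage sketch of the chunked path described in the docstring above, assuming the anndata imports and logging helpers the function relies on are available; the matrix size and chunk size here are stand-in choices. With a plain array and return_info left at False, only the coordinates come back:

import numpy as np

X_demo = np.random.rand(10000, 500).astype('float32')
# stream the data through IncrementalPCA in blocks of 1000 observations
X_pca_demo = pca(X_demo, n_comps=50, chunked=True, chunk_size=1000)
print(X_pca_demo.shape)   # (10000, 50)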
    print("SparsePCA transforming...")
    pca = SparsePCA(n_components=args.pl_dim)
    Train = pca.fit_transform(Train)
    Devel = pca.transform(Devel)  # project Devel with the SparsePCA fitted on Train
elif args.kpca:
    print("KernelPCA transforming...")
    pca = KernelPCA(n_components=args.pl_dim)
    Train = pca.fit_transform(Train)
    Devel = pca.transform(Devel)  # project Devel with the KernelPCA fitted on Train
elif args.ipca:
    print("i-PCA transforming...")
    ipca = IncrementalPCA(batch_size=args.ipca_batch,
                          copy=True,
                          n_components=args.pl_dim,
                          whiten=True)
    Train = ipca.fit_transform(Train)
    Devel = ipca.transform(Devel)  # reuse the IncrementalPCA fitted on Train
elif args.lda:
    print("LDA transforming...")
    lda = LDA(n_components=args.pl_dim)

    if args.arousal:
        labels = Train_L[:, 0]
    elif args.valence:
        labels = Train_L[:, 1]
    elif args.liking:
        labels = Train_L[:, 2]

    lda = lda.fit(Train, labels)  #learning the projection matrix
    Train = lda.transform(Train)
    Devel = lda.transform(Devel)
def main(date, takeSubset=False):
    """
    Reduces the dimensionality of the training data to 3 dimensions, 
    plots the transformed data in 3d space. The idea is to bring
    out separability between the resistance classes which may be 
    hidden in the dimensionality of the data.

    :param date: (string) Data collection date YYYY_MMDD
    :param takeSubset: (boolean) Transform and plot a random subset of
                                 the training data?

    :return: (None)
    """

    mkl.set_num_threads(8)

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)

    if takeSubset:
        indices = np.random.choice(range(0, len(trainY)), size=NUM_SAMPLES, replace=False)
        X = trainX[indices,:]
        y = trainY[indices]
    else:
        X = trainX
        y = trainY

    X = np.nan_to_num(X)

    # Break the data into resistance classes
    susIndex = Constants.LABEL_TO_INDEX[Constants.SUSCEPTIBLE]
    drIndex = Constants.LABEL_TO_INDEX[Constants.DR_RESISTANT]
    grIndex = Constants.LABEL_TO_INDEX[Constants.GR_RESISTANT]

    susX = X[y==susIndex, :]
    drX = X[y==drIndex, :]
    grX = X[y==grIndex, :]

    # Transform the data using PCA: fit once on the full dataset so all three
    # resistance classes are projected with the same principal components
    pca = IncrementalPCA(n_components=6)
    pca.fit(X)

    pointsSUS = pca.transform(susX)
    pointsGR = pca.transform(grX)
    pointsDR = pca.transform(drX)

    # Plot the transformed data in 3D space
    traceSUS = go.Scatter3d(
        x=pointsSUS[:, 0],
        y=pointsSUS[:, 1],
        z=pointsSUS[:, 2],
        mode='markers',
        marker=dict(
            size=5,
            line=dict(
                color='rgba(255, 0, 0, 0)',
                width=0.1
            ),
            opacity=0
        )
    )

    traceDR = go.Scatter3d(
        x=pointsDR[:, 0],
        y=pointsDR[:, 1],
        z=pointsDR[:, 2],
        mode='markers',
        marker=dict(
            size=5,
            line=dict(
                color='rgba(0, 255, 0, 0)',
                width=0.1
            ),
            opacity=0
        )
    )

    traceGR = go.Scatter3d(
        x=pointsGR[:, 0],
        y=pointsGR[:, 1],
        z=pointsGR[:, 2],
        mode='markers',
        marker=dict(
            size=5,
            line=dict(
                color='rgba(0, 0, 255, 0)',
                width=0.1
            ),
            opacity=0
        )
    )

    data = [traceSUS, traceDR, traceGR]
    fig = go.Figure(data=data)
    py.iplot(fig, filename='3D PCA Wavelength Plot')

    # Plot the principle components
    eigenSpectra = pca.components_

    plt.subplot(3,1,1)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[0, :])
    plt.title("Principal Components 1 - 3")
    plt.subplot(3,1,2)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[1, :])
    plt.subplot(3,1,3)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[2, :])
    plt.xlabel("Wavelength (nm)")
    plt.show()

    plt.clf()
    plt.subplot(3,1,1)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[3, :])
    plt.title("Principal Components 4 - 6")
    plt.subplot(3,1,2)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[4, :])
    plt.subplot(3,1,3)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[5, :])
    plt.xlabel("Wavelength (nm)")
    plt.show()
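
Before trusting a 3-D scatter of the first components, it can help to check how much variance they actually capture; a self-contained sketch on random stand-in data:

import numpy as np
from sklearn.decomposition import IncrementalPCA

X_demo = np.random.rand(1000, 240)
pca_demo = IncrementalPCA(n_components=6).fit(X_demo)
cumulative = np.cumsum(pca_demo.explained_variance_ratio_)
print(cumulative[:3])   # fraction of variance covered by the first three components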
def get_image_features(data_type, block):
    """
    Compute and return the feature data extracted from the given image block
    for the requested data_type.
    """

    if data_type == 'lab':

        block_file_path = '/tmp/lab_img.png'
        block.save(block_file_path)
        data = transform.get_LAB_L_SVD_s(Image.open(block_file_path))

    if data_type == 'mscn':

        img_mscn_revisited = transform.rgb_to_mscn(block)

        # save tmp as img
        img_output = Image.fromarray(img_mscn_revisited.astype('uint8'), 'L')
        mscn_revisited_file_path = '/tmp/mscn_revisited_img.png'
        img_output.save(mscn_revisited_file_path)
        img_block = Image.open(mscn_revisited_file_path)

        # extract from temp image
        data = compression.get_SVD_s(img_block)
    """if data_type == 'mscn':

        img_gray = np.array(color.rgb2gray(np.asarray(block))*255, 'uint8')
        img_mscn = transform.calculate_mscn_coefficients(img_gray, 7)
        img_mscn_norm = transform.normalize_2D_arr(img_mscn)

        img_mscn_gray = np.array(img_mscn_norm*255, 'uint8')

        data = compression.get_SVD_s(img_mscn_gray)
    """

    if data_type == 'low_bits_6':

        low_bits_6 = transform.rgb_to_LAB_L_low_bits(block, 6)
        data = compression.get_SVD_s(low_bits_6)

    if data_type == 'low_bits_5':

        low_bits_5 = transform.rgb_to_LAB_L_low_bits(block, 5)
        data = compression.get_SVD_s(low_bits_5)

    if data_type == 'low_bits_4':

        low_bits_4 = transform.rgb_to_LAB_L_low_bits(block, 4)
        data = compression.get_SVD_s(low_bits_4)

    if data_type == 'low_bits_3':

        low_bits_3 = transform.rgb_to_LAB_L_low_bits(block, 3)
        data = compression.get_SVD_s(low_bits_3)

    if data_type == 'low_bits_2':

        low_bits_2 = transform.rgb_to_LAB_L_low_bits(block, 2)
        data = compression.get_SVD_s(low_bits_2)

    if data_type == 'low_bits_4_shifted_2':

        data = compression.get_SVD_s(transform.rgb_to_LAB_L_bits(
            block, (3, 6)))

    if data_type == 'sub_blocks_stats':

        block = np.asarray(block)
        width, height, _ = block.shape
        sub_width, sub_height = int(width / 4), int(height / 4)

        sub_blocks = segmentation.divide_in_blocks(block,
                                                   (sub_width, sub_height))

        data = []

        for sub_b in sub_blocks:

            # by default use the whole lab L canal
            l_svd_data = np.array(transform.get_LAB_L_SVD_s(sub_b))

            # extract the statistics we want from the SVD
            data.append(np.mean(l_svd_data))
            data.append(np.median(l_svd_data))
            data.append(np.percentile(l_svd_data, 25))
            data.append(np.percentile(l_svd_data, 75))
            data.append(np.var(l_svd_data))

            area_under_curve = utils.integral_area_trapz(l_svd_data, dx=100)
            data.append(area_under_curve)

        # convert to a numpy array after computing all stats
        data = np.asarray(data)

    if data_type == 'sub_blocks_stats_reduced':

        block = np.asarray(block)
        width, height, _ = block.shape
        sub_width, sub_height = int(width / 4), int(height / 4)

        sub_blocks = segmentation.divide_in_blocks(block,
                                                   (sub_width, sub_height))

        data = []

        for sub_b in sub_blocks:

            # by default use the whole LAB L channel
            l_svd_data = np.array(transform.get_LAB_L_SVD_s(sub_b))

            # extract the statistics we want from the SVD
            data.append(np.mean(l_svd_data))
            data.append(np.median(l_svd_data))
            data.append(np.percentile(l_svd_data, 25))
            data.append(np.percentile(l_svd_data, 75))
            data.append(np.var(l_svd_data))

        # convert to a numpy array after computing all stats
        data = np.asarray(data)

    if data_type == 'sub_blocks_area':

        block = np.asarray(block)
        width, height, _ = block.shape
        sub_width, sub_height = int(width / 8), int(height / 8)

        sub_blocks = segmentation.divide_in_blocks(block,
                                                   (sub_width, sub_height))

        data = []

        for sub_b in sub_blocks:

            # by default use the whole LAB L channel
            l_svd_data = np.array(transform.get_LAB_L_SVD_s(sub_b))

            area_under_curve = utils.integral_area_trapz(l_svd_data, dx=50)
            data.append(area_under_curve)

        # convert to a numpy array after computing all areas
        data = np.asarray(data)

    if data_type == 'sub_blocks_area_normed':

        block = np.asarray(block)
        width, height, _ = block.shape
        sub_width, sub_height = int(width / 8), int(height / 8)

        sub_blocks = segmentation.divide_in_blocks(block,
                                                   (sub_width, sub_height))

        data = []

        for sub_b in sub_blocks:

            # by default use the whole LAB L channel
            l_svd_data = np.array(transform.get_LAB_L_SVD_s(sub_b))
            l_svd_data = utils.normalize_arr(l_svd_data)

            area_under_curve = utils.integral_area_trapz(l_svd_data, dx=50)
            data.append(area_under_curve)

        # convert to a numpy array after computing all areas
        data = np.asarray(data)

    if data_type == 'mscn_var_4':

        data = _get_mscn_variance(block, (100, 100))

    if data_type == 'mscn_var_16':

        data = _get_mscn_variance(block, (50, 50))

    if data_type == 'mscn_var_64':

        data = _get_mscn_variance(block, (25, 25))

    if data_type == 'mscn_var_16_max':

        data = _get_mscn_variance(block, (50, 50))
        data = np.asarray(data)
        size = int(len(data) / 4)
        indices = data.argsort()[-size:][::-1]
        data = data[indices]

    if data_type == 'mscn_var_64_max':

        data = _get_mscn_variance(block, (25, 25))
        data = np.asarray(data)
        size = int(len(data) / 4)
        indices = data.argsort()[-size:][::-1]
        data = data[indices]

    if data_type == 'ica_diff':
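        # FastICA round trip: compare the normalized singular value spectrum of
        # the LAB L channel before and after a 50-component ICA projection and
        # reconstruction; the feature is the absolute difference of the two.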
        current_image = transform.get_LAB_L(block)

        # fit_transform both fits the ICA model and projects the image,
        # so a separate fit() call is not needed
        ica = FastICA(n_components=50)
        image_ica = ica.fit_transform(current_image)
        image_restored = ica.inverse_transform(image_ica)

        final_image = utils.normalize_2D_arr(image_restored)
        final_image = np.array(final_image * 255, 'uint8')

        sv_values = utils.normalize_arr(compression.get_SVD_s(current_image))
        ica_sv_values = utils.normalize_arr(compression.get_SVD_s(final_image))

        data = abs(np.array(sv_values) - np.array(ica_sv_values))

    if data_type == 'svd_trunc_diff':
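        # TruncatedSVD reconstruction residual: reduce the L channel to 30
        # components, reconstruct it, and keep the singular values of the
        # difference between the original and the reconstruction.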

        current_image = transform.get_LAB_L(block)

        svd = TruncatedSVD(n_components=30, n_iter=100, random_state=42)
        transformed_image = svd.fit_transform(current_image)
        restored_image = svd.inverse_transform(transformed_image)

        reduced_image = (current_image - restored_image)

        U, s, V = compression.get_SVD(reduced_image)
        data = s

    if data_type == 'ipca_diff':
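        # IncrementalPCA reconstruction residual: same scheme as above with a
        # 20-component incremental PCA fitted on the L channel.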

        current_image = transform.get_LAB_L(block)

        transformer = IncrementalPCA(n_components=20, batch_size=25)
        transformed_image = transformer.fit_transform(current_image)
        restored_image = transformer.inverse_transform(transformed_image)

        reduced_image = (current_image - restored_image)

        U, s, V = compression.get_SVD(reduced_image)
        data = s

    if data_type == 'svd_reconstruct':
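        # Rebuild the L channel from singular values 90..199 only and use the
        # singular value spectrum of that band-limited reconstruction as the
        # feature vector.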

        reconstructed_interval = (90, 200)
        begin, end = reconstructed_interval

        lab_img = transform.get_LAB_L(block)
        lab_img = np.array(lab_img, 'uint8')

        U, s, V = lin_svd(lab_img, full_matrices=True)

        smat = np.zeros((end - begin, end - begin), dtype=complex)
        smat[:, :] = np.diag(s[begin:end])
        output_img = np.dot(U[:, begin:end], np.dot(smat, V[begin:end, :]))

        output_img = np.array(output_img, 'uint8')

        data = compression.get_SVD_s(output_img)

    if 'sv_std_filters' in data_type:
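        # Apply a set of denoising filters (median, Wiener), compute the SV
        # spectrum of each filtered image, and keep the 200 singular values of
        # the unfiltered block whose normalized variability across filters is
        # lowest or highest, depending on the data_type suffix.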

        # convert to the LAB L channel by default before applying filters
        lab_img = transform.get_LAB_L(block)
        arr = np.array(lab_img)
        images = []

        # Apply the list of filters to arr
        images.append(medfilt2d(arr, [3, 3]))
        images.append(medfilt2d(arr, [5, 5]))
        images.append(wiener(arr, [3, 3]))
        images.append(wiener(arr, [5, 5]))

        # SVD of the unfiltered block image, used as the reference
        s_arr = compression.get_SVD_s(arr)
        sv_vector = [s_arr]

        # for each filtered image, apply SVD and collect its singular values
        for img in images:
            s = compression.get_SVD_s(img)
            sv_vector.append(s)

        sv_array = np.array(sv_vector)

        _, length = sv_array.shape

        sv_std = []

        # normalize each singular-value index across images and compute its standard deviation
        for i in range(length):
            sv_array[:, i] = utils.normalize_arr(sv_array[:, i])
            sv_std.append(np.std(sv_array[:, i]))

        indices = []

        if 'lowest' in data_type:
            indices = utils.get_indices_of_lowest_values(sv_std, 200)

        if 'highest' in data_type:
            indices = utils.get_indices_of_highest_values(sv_std, 200)

        # keep the singular values selected according to the computed std trend
        data = s_arr[indices]

    # 'wave' variant (only the median filter is applied below)
    if 'wave_sv_std_filters' in data_type:

        # convert to the LAB L channel by default before applying filters
        lab_img = transform.get_LAB_L(block)
        arr = np.array(lab_img)
        images = []

        # Apply the list of filters to arr
        images.append(medfilt2d(arr, [3, 3]))

        # SVD of the unfiltered block image, used as the reference
        s_arr = compression.get_SVD_s(arr)
        sv_vector = [s_arr]

        # for each filtered image, apply SVD and collect its singular values
        for img in images:
            s = compression.get_SVD_s(img)
            sv_vector.append(s)

        sv_array = np.array(sv_vector)

        _, length = sv_array.shape

        sv_std = []

        # normalize each singular-value index across images and compute its standard deviation
        for i in range(length):
            sv_array[:, i] = utils.normalize_arr(sv_array[:, i])
            sv_std.append(np.std(sv_array[:, i]))

        indices = []

        if 'lowest' in data_type:
            indices = utils.get_indices_of_lowest_values(sv_std, 200)

        if 'highest' in data_type:
            indices = utils.get_indices_of_highest_values(sv_std, 200)

        # keep the singular values selected according to the computed std trend
        data = s_arr[indices]

    # full filter list, including a wavelet decomposition
    if 'sv_std_filters_full' in data_type:

        # convert to the LAB L channel by default before applying filters
        lab_img = transform.get_LAB_L(block)
        arr = np.array(lab_img)
        images = []

        # Apply the list of filters to arr
        kernel = np.ones((3, 3), np.float32) / 9
        images.append(cv2.filter2D(arr, -1, kernel))

        kernel = np.ones((5, 5), np.float32) / 25
        images.append(cv2.filter2D(arr, -1, kernel))

        images.append(cv2.GaussianBlur(arr, (3, 3), 0.5))

        images.append(cv2.GaussianBlur(arr, (3, 3), 1))

        images.append(cv2.GaussianBlur(arr, (3, 3), 1.5))

        images.append(cv2.GaussianBlur(arr, (5, 5), 0.5))

        images.append(cv2.GaussianBlur(arr, (5, 5), 1))

        images.append(cv2.GaussianBlur(arr, (5, 5), 1.5))

        images.append(medfilt2d(arr, [3, 3]))

        images.append(medfilt2d(arr, [5, 5]))

        images.append(wiener(arr, [3, 3]))

        images.append(wiener(arr, [5, 5]))

        wave = w2d(arr, 'db1', 2)
        images.append(np.array(wave, 'float64'))

        # SVD of the unfiltered block image, used as the reference
        s_arr = compression.get_SVD_s(arr)
        sv_vector = [s_arr]

        # for each filtered image, apply SVD and collect its singular values
        for img in images:
            s = compression.get_SVD_s(img)
            sv_vector.append(s)

        sv_array = np.array(sv_vector)

        _, length = sv_array.shape

        sv_std = []

        # normalize each singular-value index across images and compute its standard deviation
        for i in range(length):
            sv_array[:, i] = utils.normalize_arr(sv_array[:, i])
            sv_std.append(np.std(sv_array[:, i]))

        indices = []

        if 'lowest' in data_type:
            indices = utils.get_indices_of_lowest_values(sv_std, 200)

        if 'highest' in data_type:
            indices = utils.get_indices_of_highest_values(sv_std, 200)

        # keep the singular values selected according to the computed std trend
        data = s_arr[indices]

    if 'sv_entropy_std_filters' in data_type:

        lab_img = transform.get_LAB_L(block)
        arr = np.array(lab_img)

        images = []

        kernel = np.ones((3, 3), np.float32) / 9
        images.append(cv2.filter2D(arr, -1, kernel))

        kernel = np.ones((5, 5), np.float32) / 25
        images.append(cv2.filter2D(arr, -1, kernel))

        images.append(cv2.GaussianBlur(arr, (3, 3), 0.5))

        images.append(cv2.GaussianBlur(arr, (3, 3), 1))

        images.append(cv2.GaussianBlur(arr, (3, 3), 1.5))

        images.append(cv2.GaussianBlur(arr, (5, 5), 0.5))

        images.append(cv2.GaussianBlur(arr, (5, 5), 1))

        images.append(cv2.GaussianBlur(arr, (5, 5), 1.5))

        images.append(medfilt2d(arr, [3, 3]))

        images.append(medfilt2d(arr, [5, 5]))

        images.append(wiener(arr, [3, 3]))

        images.append(wiener(arr, [5, 5]))

        wave = w2d(arr, 'db1', 2)
        images.append(np.array(wave, 'float64'))

        sv_vector = []
        sv_entropy_list = []

        # for each filtered image, apply SVD and collect its singular values and per-SV entropy contributions
        for img in images:
            s = compression.get_SVD_s(img)
            sv_vector.append(s)

            sv_entropy = [
                utils.get_entropy_contribution_of_i(s, id_sv)
                for id_sv, sv in enumerate(s)
            ]
            sv_entropy_list.append(sv_entropy)

        sv_std = []

        sv_array = np.array(sv_vector)
        _, length = sv_array.shape

        # normalize each singular-value index across images and compute its standard deviation
        for i in range(length):
            sv_array[:, i] = utils.normalize_arr(sv_array[:, i])
            sv_std.append(np.std(sv_array[:, i]))

        indices = []

        if 'lowest' in data_type:
            indices = utils.get_indices_of_lowest_values(sv_std, 200)

        if 'highest' in data_type:
            indices = utils.get_indices_of_highest_values(sv_std, 200)

        # keep the singular values selected according to the computed std trend
        s_arr = compression.get_SVD_s(arr)
        data = s_arr[indices]

    if 'convolutional_kernels' in data_type:
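        # Convolve each 20x20 sub-zone of the block with hand-crafted kernels
        # (bilateral difference, plane mean, plane max error) at 3x3 and 5x5
        # window sizes, then aggregate the per-zone std/mean values according
        # to the data_type suffix.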

        sub_zones = segmentation.divide_in_blocks(block, (20, 20))

        data = []

        diff_std_list_3 = []
        diff_std_list_5 = []
        diff_mean_list_3 = []
        diff_mean_list_5 = []

        plane_std_list_3 = []
        plane_std_list_5 = []
        plane_mean_list_3 = []
        plane_mean_list_5 = []

        plane_max_std_list_3 = []
        plane_max_std_list_5 = []
        plane_max_mean_list_3 = []
        plane_max_mean_list_5 = []

        for sub_zone in sub_zones:
            l_img = transform.get_LAB_L(sub_zone)
            normed_l_img = utils.normalize_2D_arr(l_img)

            # bilateral with window of size (3, 3)
            normed_diff = convolution.convolution2D(normed_l_img,
                                                    kernels.min_bilateral_diff,
                                                    (3, 3))
            std_diff = np.std(normed_diff)
            mean_diff = np.mean(normed_diff)

            diff_std_list_3.append(std_diff)
            diff_mean_list_3.append(mean_diff)

            # bilateral with window of size (5, 5)
            normed_diff = convolution.convolution2D(normed_l_img,
                                                    kernels.min_bilateral_diff,
                                                    (5, 5))
            std_diff = np.std(normed_diff)
            mean_diff = np.mean(normed_diff)

            diff_std_list_5.append(std_diff)
            diff_mean_list_5.append(mean_diff)

            # plane mean with window of size (3, 3)
            normed_plane_mean = convolution.convolution2D(
                normed_l_img, kernels.plane_mean, (3, 3))
            std_plane_mean = np.std(normed_plane_mean)
            mean_plane_mean = np.mean(normed_plane_mean)

            plane_std_list_3.append(std_plane_mean)
            plane_mean_list_3.append(mean_plane_mean)

            # plane mean with window of size (5, 5)
            normed_plane_mean = convolution.convolution2D(
                normed_l_img, kernels.plane_mean, (5, 5))
            std_plane_mean = np.std(normed_plane_mean)
            mean_plane_mean = np.mean(normed_plane_mean)

            plane_std_list_5.append(std_plane_mean)
            plane_mean_list_5.append(mean_plane_mean)

            # plane max error with window of size (3, 3)
            normed_plane_max = convolution.convolution2D(
                normed_l_img, kernels.plane_max_error, (3, 3))
            std_plane_max = np.std(normed_plane_max)
            mean_plane_max = np.mean(normed_plane_max)

            plane_max_std_list_3.append(std_plane_max)
            plane_max_mean_list_3.append(mean_plane_max)

            # plane max error with window of size (5, 5)
            normed_plane_max = convolution.convolution2D(
                normed_l_img, kernels.plane_max_error, (5, 5))
            std_plane_max = np.std(normed_plane_max)
            mean_plane_max = np.mean(normed_plane_max)

            plane_max_std_list_5.append(std_plane_max)
            plane_max_mean_list_5.append(mean_plane_max)

        diff_std_list_3 = np.array(diff_std_list_3)
        diff_std_list_5 = np.array(diff_std_list_5)

        diff_mean_list_3 = np.array(diff_mean_list_3)
        diff_mean_list_5 = np.array(diff_mean_list_5)

        plane_std_list_3 = np.array(plane_std_list_3)
        plane_std_list_5 = np.array(plane_std_list_5)

        plane_mean_list_3 = np.array(plane_mean_list_3)
        plane_mean_list_5 = np.array(plane_mean_list_5)

        plane_max_std_list_3 = np.array(plane_max_std_list_3)
        plane_max_std_list_5 = np.array(plane_max_std_list_5)

        plane_max_mean_list_3 = np.array(plane_max_mean_list_3)
        plane_max_mean_list_5 = np.array(plane_max_mean_list_5)

        if 'std_max_blocks' in data_type:

            data.append(np.std(diff_std_list_3[0:int(len(sub_zones) / 5)]))
            data.append(np.std(diff_mean_list_3[0:int(len(sub_zones) / 5)]))
            data.append(np.std(diff_std_list_5[0:int(len(sub_zones) / 5)]))
            data.append(np.std(diff_mean_list_5[0:int(len(sub_zones) / 5)]))

            data.append(np.std(plane_std_list_3[0:int(len(sub_zones) / 5)]))
            data.append(np.std(plane_mean_list_3[0:int(len(sub_zones) / 5)]))
            data.append(np.std(plane_std_list_5[0:int(len(sub_zones) / 5)]))
            data.append(np.std(plane_mean_list_5[0:int(len(sub_zones) / 5)]))

            data.append(np.std(plane_max_std_list_3[0:int(len(sub_zones) /
                                                          5)]))
            data.append(
                np.std(plane_max_mean_list_3[0:int(len(sub_zones) / 5)]))
            data.append(np.std(plane_max_std_list_5[0:int(len(sub_zones) /
                                                          5)]))
            data.append(
                np.std(plane_max_mean_list_5[0:int(len(sub_zones) / 5)]))

        if 'mean_max_blocks' in data_type:

            data.append(np.mean(diff_std_list_3[0:int(len(sub_zones) / 5)]))
            data.append(np.mean(diff_mean_list_3[0:int(len(sub_zones) / 5)]))
            data.append(np.mean(diff_std_list_5[0:int(len(sub_zones) / 5)]))
            data.append(np.mean(diff_mean_list_5[0:int(len(sub_zones) / 5)]))

            data.append(np.mean(plane_std_list_3[0:int(len(sub_zones) / 5)]))
            data.append(np.mean(plane_mean_list_3[0:int(len(sub_zones) / 5)]))
            data.append(np.mean(plane_std_list_5[0:int(len(sub_zones) / 5)]))
            data.append(np.mean(plane_mean_list_5[0:int(len(sub_zones) / 5)]))

            data.append(
                np.mean(plane_max_std_list_3[0:int(len(sub_zones) / 5)]))
            data.append(
                np.mean(plane_max_mean_list_3[0:int(len(sub_zones) / 5)]))
            data.append(
                np.mean(plane_max_std_list_5[0:int(len(sub_zones) / 5)]))
            data.append(
                np.mean(plane_max_mean_list_5[0:int(len(sub_zones) / 5)]))

        if 'std_normed' in data_type:

            data.append(np.std(diff_std_list_3))
            data.append(np.std(diff_mean_list_3))
            data.append(np.std(diff_std_list_5))
            data.append(np.std(diff_mean_list_5))

            data.append(np.std(plane_std_list_3))
            data.append(np.std(plane_mean_list_3))
            data.append(np.std(plane_std_list_5))
            data.append(np.std(plane_mean_list_5))

            data.append(np.std(plane_max_std_list_3))
            data.append(np.std(plane_max_mean_list_3))
            data.append(np.std(plane_max_std_list_5))
            data.append(np.std(plane_max_mean_list_5))

        if 'mean_normed' in data_type:

            data.append(np.mean(diff_std_list_3))
            data.append(np.mean(diff_mean_list_3))
            data.append(np.mean(diff_std_list_5))
            data.append(np.mean(diff_mean_list_5))

            data.append(np.mean(plane_std_list_3))
            data.append(np.mean(plane_mean_list_3))
            data.append(np.mean(plane_std_list_5))
            data.append(np.mean(plane_mean_list_5))

            data.append(np.mean(plane_max_std_list_3))
            data.append(np.mean(plane_max_mean_list_3))
            data.append(np.mean(plane_max_std_list_5))
            data.append(np.mean(plane_max_mean_list_5))

        data = np.array(data)

    if data_type == 'convolutional_kernel_stats_svd':

        l_img = transform.get_LAB_L(block)
        normed_l_img = utils.normalize_2D_arr(l_img)

        # bilateral with window of size (5, 5)
        normed_diff = convolution.convolution2D(normed_l_img,
                                                kernels.min_bilateral_diff,
                                                (5, 5))

        # get the singular value vector from the SVD
        s = compression.get_SVD_s(normed_diff)

        data = s

    if data_type == 'svd_entropy':
        l_img = transform.get_LAB_L(block)

        blocks = segmentation.divide_in_blocks(l_img, (20, 20))

        values = []
        for b in blocks:
            sv = compression.get_SVD_s(b)
            values.append(utils.get_entropy(sv))
        data = np.array(values)

    if data_type == 'svd_entropy_20':
        l_img = transform.get_LAB_L(block)

        blocks = segmentation.divide_in_blocks(l_img, (20, 20))

        values = []
        for b in blocks:
            sv = compression.get_SVD_s(b)
            values.append(utils.get_entropy(sv))
        data = np.array(values)

    if data_type == 'svd_entropy_noise_20':
        l_img = transform.get_LAB_L(block)

        blocks = segmentation.divide_in_blocks(l_img, (20, 20))

        values = []
        for b in blocks:
            sv = compression.get_SVD_s(b)
            sv_size = len(sv)
            values.append(utils.get_entropy(sv[int(sv_size / 4):]))
        data = np.array(values)

    return data
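

# Hedged usage sketch (added for illustration, not part of the original
# snippet): build a small random RGB block in memory; any data_type handled
# above could be requested the same way.
if __name__ == '__main__':
    _demo_block = Image.fromarray(
        np.random.randint(0, 256, (40, 40, 3), dtype=np.uint8), 'RGB')
    print(get_image_features('svd_entropy', _demo_block))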
Exemple #38
import json

import numpy as np

from gensim.models import Word2Vec
from sklearn.decomposition import IncrementalPCA
# from bhtsne import tsne

WORD2VEC_MODEL = 'GNews.model'
WORD2VEC_JSON = 'word2vec.json'

model = Word2Vec.load(WORD2VEC_MODEL)

words = []
vectors = np.empty((len(model.vocab.keys()), 300))
# vectors = np.empty((6, 300))

# for i, w in enumerate(['email', 'password', 'user', 'date', 'this', 'is']):
for i, w in enumerate(model.vocab.keys()):
    words.append(w)
    vectors[i] = model[w]

# vectors = tsne(vectors, dimensions=3, perplexity=50)
ipca = IncrementalPCA(n_components=2, batch_size=25000)
vectors = ipca.fit_transform(vectors)

json_vectors = {}
for i, w in enumerate(words):
    json_vectors[w] = vectors[i].tolist()

with open(WORD2VEC_JSON, 'w') as f:
    json.dump(json_vectors, f)
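
# Hedged addition: read the exported file back as a quick round-trip check.
with open(WORD2VEC_JSON) as f:
    check = json.load(f)
print(len(check), 'words projected to', len(next(iter(check.values()))), 'components')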
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.tight_layout()
#fig, ax = plt.subplots()
#ax.legend(LABEL_COLOR_MAP, legend_names)
#plt.legend(legend_names, loc='best')
plt.title('Principal Component Analysis', fontsize=12)
img_file = results_path.joinpath('Principal_Component_Scatter_Plot.png')
plt.savefig(img_file)
plt.show()

# Looks like approx. 50 components are enough to describe 90% of the variance in the dataset
# We'll choose 50 components for our modeling
#Using incremental PCA for efficiency - saves a lot of time on larger datasets
pca_final = IncrementalPCA(n_components=16)
df_train_pca = pca_final.fit_transform(X_train_rus)
print("df_train_pca.shape")
print(df_train_pca.shape)
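
#Optional check (hedged addition): cumulative variance explained by the 16 components
print(np.cumsum(pca_final.explained_variance_ratio_))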

#Creating correlation matrix for the principal components - I expect little to no correlation
df_corr = data_df.corr()
corrmat = np.corrcoef(df_train_pca.transpose())
plt.figure(figsize=(16, 16))
sns.set(font_scale=.8)
sns.heatmap(corrmat,
            vmin=df_corr.values.min(),
            vmax=1,
            fmt='.1f',
            square=True,
            cmap="Blues",
            linewidths=0.1,
#         sample = np.random.choice(len(data[0]), 2)
#         interpolation_coefficient = np.random.beta(2, 2)
#         interpolation_coefficient = 0.5
#         new_data_point = interpolation_coefficient * data[0][sample[0]] + (1 - interpolation_coefficient) * data[0][sample[1]]
#         new_target = np.any([data[1][sample[0]].astype(int), data[1][sample[1]].astype(int)], axis=0)
#         X_training_list.append(new_data_point)
#         y_training_list.append(new_target)
# X_training = np.array(X_training_list)
# y_training = np.array(y_training_list)
# print(X_training.shape)
# print(y_training.shape)

print("PCA ...")
print("original data: ", dataset_matrix.shape)
pca = IncrementalPCA(n_components=800, batch_size=1000)
X_training = pca.fit_transform(X_training)
X_testing = pca.transform(X_testing)
print("training data: ", X_training.shape)
print("testing data: ", X_testing.shape)

device = torch.device('cuda')
activation_functions = {nn.ReLU(), torch.tanh}
size_hidden1 = range(100, 500, 50)
size_hidden2 = range(100, 500, 50)
regularization_coefficients = [
    1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2
]

print(
    cross_validation(X_training,
                     y_training,
Exemple #41
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, IncrementalPCA

iris = load_iris()
X = iris.data
y = iris.target
print(X)
print(y)
n_components = 2
ipca = IncrementalPCA(n_components=n_components, batch_size=10)
X_ipca = ipca.fit_transform(X)

print(X_ipca)
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

print("pca:")
print(X_pca)
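
# Hedged addition: component signs can differ between the two solvers, so the
# projections are compared through their absolute values.
print("mean abs projection difference:",
      np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean())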
colors = ['navy', 'turquoise', 'darkorange']

for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
    plt.figure(figsize=(8, 8))
    for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
        plt.scatter(X_transformed[y == i, 0],
                    X_transformed[y == i, 1],
                    color=color,
Exemple #42
def main():
    """
    Get data from db and save it as csv
    """

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)
    print('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.pca:
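        # copy=False lets IncrementalPCA overwrite the input arrays during
        # fitting, which saves memory on large feature matrices.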
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    rmses, maes, r2s, vars, start_times, end_times, end_times_obj = [], [], [], [], [], [], []

    start = starttime
    end = endtime
    print('Processing time range {} - {}'.format(
        start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M')))

    try:
        print('Reading data...')
        data = bq.get_rows(start,
                           end,
                           loc_col='trainstation',
                           project=options.project,
                           dataset=options.feature_dataset,
                           table=options.feature_table,
                           parameters=all_param_names)
        data = io.filter_train_type(labels_df=data,
                                    train_types=options.train_types,
                                    sum_types=True,
                                    train_type_column='train_type',
                                    location_column='trainstation',
                                    time_column='time',
                                    sum_columns=['delay'],
                                    aggs=aggs)

        if options.y_avg_hours is not None:
            data = io.calc_running_delay_avg(data, options.y_avg_hours)

        data.sort_values(by=['time', 'trainstation'], inplace=True)

        if options.impute:
            print('Imputing missing values...')
            data.drop(columns=['train_type'], inplace=True)
            data = imputer.fit_transform(data)
            data.loc[:, 'train_type'] = None

        if options.model == 'ard' and len(data) > options.n_samples:
            print('Sampling {} values from data...'.format(options.n_samples))
            data = data.sample(options.n_samples)

        #l_data = data.loc[:,options.meta_params + options.label_params]
        #f_data = data.loc[:,options.meta_params + options.feature_params]

    except ValueError as e:
        f_data, l_data = [], []

    #f_data.rename(columns={'trainstation':'loc_name'}, inplace=True)

    #logging.debug('Labels shape: {}'.format(l_data.shape))
    print('Processing {} rows...'.format(len(data)))
    #assert l_data.shape[0] == f_data.shape[0]

    target = data.loc[:, options.label_params].astype(np.float32).values
    #print(f_data.columns)
    #features = f_data.drop(columns=['loc_name', 'time']).astype(np.float32).values
    features = data.loc[:, options.feature_params].astype(np.float32).values

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.33)

    logging.debug('Features shape: {}'.format(X_train.shape))

    n_samples, n_dims = X_train.shape

    if options.normalize:
        print('Normalizing data...')
        print(X_train)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)  # reuse the scaler fitted on the training set

    if options.pca:
        print('Doing PCA analysis for the data...')
        X_train = ipca.fit_transform(X_train)
        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
        X_test = ipca.transform(X_test)  # project test data with the PCA fitted on the training set

    logging.debug('Features shape after pre-processing: {}'.format(
        X_train.shape))

    print('Training...')
    print(X_train.shape)
    input_dim = X_train.shape[1]
    #k1 = gpflow.kernels.Matern52(input_dim, lengthscales=0.3)
    #k_seasonal = gpflow.kernels.Periodic(input_dim=input_dim, period=2190, name='k_seasonal')
    #k_small = gpflow.kernels.Periodic(input_dim=input_dim, period=120, name='k_small')
    k_weather = gpflow.kernels.RBF(input_dim=input_dim, ARD=True)
    #k_noise = gpflow.kernels.White(input_dim=input_dim)

    #k = k_seasonal + k_weather + k_noise
    k = k_weather
    Z = np.random.rand(150, input_dim)

    if options.cv:
        logging.info('Doing random search for hyperparameters...')

        param_grid = {"length_scale": [0.1, 1, 2], "whiten": [True, False]}

        model = GP(dim=input_dim, Z=Z)

        random_search = RandomizedSearchCV(model,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           n_jobs=-1)

        random_search.fit(X_train, y_train)
        logging.info("RandomizedSearchCV done.")
        sys.exit()
    else:
        model = GP(dim=input_dim, Z=Z)
        model.fit(X_train.astype(np.float64),
                  y_train.reshape((-1, 1)).astype(np.float64))

        model.save(options.save_file)

        print('Training finished')
        print(model.model)

        #    Z_list = options.z_list.split(',')

        #for size in Z_list:

        #    with tf.Session() as sess:
        #custom_config = gpflow.settings.get_settings()
        #custom_config.verbosity.tf_compile_verb = True

        #with gpflow.settings.temp_settings(custom_config), gpflow.session_manager.get_session().as_default():

        #Z = X_train[::5].copy()
        # Z = np.random.rand(int(size), 19)
        # print('Training with inducing points: {}'.format(Z.shape))
        #
        # # model = gpflow.models.SVGP(X_train.astype(np.float64),
        # #                            y_train.reshape((-1,1)).astype(np.float64),
        # #                            kern=k,
        # #                            likelihood=gpflow.likelihoods.Gaussian(),
        # #                            Z=Z,
        # #                            #Z=X_train.copy(),
        # #                            minibatch_size=100,
        # #                            whiten=options.normalize
        # #                            )
        # #                            #model.likelihood.variance = 0.01
        # #
        # # model.compile(session=sess)
        # # opt = gpflow.train.ScipyOptimizer()
        # # opt.minimize(model)
        #
        # model = GP(dim=19,
        #            Z=Z
        #            )
        # model.fit(X_train.astype(np.float64),
        #           y_train.reshape((-1,1)).astype(np.float64))
        #
        # model.save(options.save_file)
        #
        # print('Training finished')
        # print(model.model)

        #fname=options.output_path+'/svga_performance.png'
        #viz.plot_svga(model, fname)

        # k_long_term = 66.0**2 * RBF(length_scale=67.0)
        # k_seasonal = 2.4**2 * RBF(length_scale=90.0)* ExpSineSquared(length_scale=150, periodicity=1.0, periodicity_bounds=(0,10000))
        # k_medium_term = 0.66**2 * RationalQuadratic(length_scale=1.2, alpha=0.78)
        # k_noise = 0.18**2 * RBF(length_scale=0.134) + WhiteKernel(noise_level=0.19**2)
        # #kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise
        # kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise
        #
        # model = GaussianProcessRegressor(kernel=kernel_gpml, #alpha=0,
        #                                  optimizer=None, normalize_y=True)

        # Metrics
        y_pred, var = model.predict_f(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        rmses.append(rmse)
        maes.append(mae)
        r2s.append(r2)
        vars.append(var.mean())
        start_times.append(start.strftime('%Y-%m-%dT%H:%M:%S'))
        end_times.append(end.strftime('%Y-%m-%dT%H:%M:%S'))
        end_times_obj.append(end)

        print('RMSE: {:.2f}'.format(rmse))
        print('MAE: {:.2f}'.format(mae))
        print('Variance: {:.2f}-{:.2f}'.format(var.min(), var.max()))
        print('R2 score: {:.2f}'.format(r2))

    #io.save_scikit_model(model, filename=options.save_file, ext_filename=options.save_file)
    if options.model == 'rf':
        fname = options.output_path + '/rfc_feature_importance.png'
        viz.rfc_feature_importance(model.feature_importances_, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    try:
        fname = options.output_path + '/learning_over_time.png'
        viz.plot_learning_over_time(end_times_obj,
                                    rmses,
                                    maes,
                                    r2s,
                                    filename=fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
    except Exception as e:
        logging.error(e)

    error_data = {
        'start_times': start_times,
        'end_times': end_times,
        'rmse': rmses,
        'mae': maes,
        'var': vars,
        'r2': r2s
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)
# Authors: Kyle Kastner
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, IncrementalPCA

iris = load_iris()
X = iris.data
y = iris.target

n_components = 2
ipca = IncrementalPCA(n_components=n_components, batch_size=10)
X_ipca = ipca.fit_transform(X)

pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

colors = ['navy', 'turquoise', 'darkorange']

for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
    plt.figure(figsize=(8, 8))
    for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
        plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1],
                    color=color, lw=2, label=target_name)

    if "Incremental" in title:
        err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
        plt.title(title + " of iris dataset\nMean absolute unsigned error "
Exemple #44
nhalf = 40000
itrain = 0

print 'DFT Difference PC= 100'

matname = 'X2.mat'
clasname = 'cover_dft2.pkl'
n_components = 100
mat = scipy.io.loadmat('myFile.mat')
songData = mat['songData'][:,1:17000]
labels = mat['songData'][:,0]

songScaled = preprocessing.scale(songData)

ipca = IncrementalPCA(n_components=n_components, batch_size=2000)
X_ipca = ipca.fit_transform(songScaled,labels)
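# Note (added): IncrementalPCA.fit_transform ignores the labels argument; it is
# accepted only for scikit-learn API compatibility.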


def getTrainData(X_ipca,labels):
	i = 0
	X_pos = []
	y = []
	while(i<15500):
		label = labels[i]
		j =0
		temp = []
		while(labels[i] == label):
			temp.append(X_ipca[i])
			j=j+1
			i=i+1