def initialize_variables(self):
    self.N = self.allelic_counts.shape[0]
    self.T = self.allelic_counts.shape[1]
    self.num_cov = self.cov.shape[1]
    # Randomly initialize (only U matters)
    self.U = np.random.randn(self.N, self.K)
    self.V = np.random.randn(self.K, self.T)
    mom_conc_init = 1.0 / np.nanvar(
        self.allelic_counts / self.total_counts)
    self.conc = np.ones(self.T) * mom_conc_init
    self.C = np.random.randn(self.num_cov, self.T)
    ppca_init = False
    if ppca_init:
        rat = self.allelic_counts / self.total_counts
        nans = np.isnan(rat)
        scaled_rat = scale_allelic_ratios(rat)
        scaled_residual_rat = regress_out_cell_line(
            scaled_rat, self.cov[:, 1:])
        rescaled_residual_rat = scale_allelic_ratios(scaled_residual_rat)
        ppca = PPCA()
        ppca.fit(data=np.transpose(rescaled_residual_rat),
                 d=self.K,
                 verbose=True,
                 tol=1e-6)
        self.U = ppca.C / np.std(ppca.C)
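The concentration initialization above (mom_conc_init) is a method-of-moments style guess: under a Beta(p·c, (1-p)·c) model for the allelic ratios, Var[ratio] = p(1-p)/(c+1), so the reciprocal of the observed variance gives a rough starting scale for c (exact moment matching would also multiply by p(1-p) and subtract one). A standalone sketch with made-up data, not part of the class above:

import numpy as np

# Hypothetical allelic-ratio data drawn from a Beta(p*c, (1-p)*c) model.
rng = np.random.default_rng(0)
true_conc, p = 20.0, 0.5
ratios = rng.beta(p * true_conc, (1 - p) * true_conc, size=1000)
ratios[rng.random(1000) < 0.1] = np.nan        # some sites unobserved

# Var[ratio] = p * (1 - p) / (conc + 1), so:
conc_crude = 1.0 / np.nanvar(ratios)                # the rough guess used above
conc_mom = p * (1 - p) / np.nanvar(ratios) - 1.0    # exact moment match, close to true_conc
print(conc_crude, conc_mom)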
    def run_factorization(self, N, S, X, Z, I, K, k, n):
        # Smart initialization
        # rat = k / n
        rat = filter_lowly_expressed_sites(k, n, 3)
        nans = np.isnan(rat)

        scaled_rat = scale_allelic_ratios(rat)
        scaled_residual_rat = regress_out_cell_line(scaled_rat, Z)
        rescaled_residual_rat = scale_allelic_ratios(scaled_residual_rat)
        ppca = PPCA()
        ppca.fit(data=np.transpose(rescaled_residual_rat),
                 d=K,
                 verbose=True,
                 tol=1e-6)
        U = ppca.C
        V = ppca.transform()
        pickle.dump(ppca, open(self.output_root + '_model', 'wb'))
        np.savetxt(self.output_root + '_temper_U.txt',
                   U,
                   fmt="%s",
                   delimiter='\t')
        np.savetxt(self.output_root + '_temper_V.txt',
                   V.T,
                   fmt="%s",
                   delimiter='\t')
Example #3
class PCAImputer:
    def __init__(self, n_dimension):
        self._q = n_dimension

    def fit_transform(self,
                      data,
                      method='eig',
                      probabilistic=False,
                      n_iteration=100):
        """fitting a PCA to the original data by iterativly filling the missing entries
        with value generated from PCA. Each missing entries are initialized with the
        row mean."""
        self._data = data.copy()
        self._missing = np.isnan(data)
        self._observed = ~self._missing
        self._pca = PPCA(n_dimension=self._q)

        row_defau = np.zeros(self._data.shape[0])
        row_means = np.repeat(np.nanmean(self._data, axis=1, out=row_defau).reshape(-1, 1), \
                              self._data.shape[1], axis=1)
        self._data[self._missing] = row_means[self._missing]
        for i in range(n_iteration):
            self._pca.fit(self._data, method=method)
            self._data[self._missing] = self._pca.inverse_transform(self._pca.transform(self._data, \
                                        probabilistic), probabilistic)[self._missing]
        return self._data
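A minimal usage sketch for PCAImputer, assuming this class and the PPCA implementation it wraps (the n_dimension/method API shown in a later example) are importable; the data here is made up:

import numpy as np

rng = np.random.default_rng(1)
data = rng.normal(size=(50, 8))
data[rng.random(data.shape) < 0.05] = np.nan     # knock out ~5% of the entries

imputer = PCAImputer(n_dimension=3)
completed = imputer.fit_transform(data, method='eig', n_iteration=20)
print(np.isnan(completed).sum())                 # 0 once every missing cell is filled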
Example #4
    def probPCA(self):
        """
        Do probabilistic PCA on the feature matrix.
        :return: 2-D array of transformed data
        """
        # Standardize X (zero mean, unit variance)
        sc = StandardScaler()
        X_normalized = sc.fit_transform(self.X)

        ppca = PPCA()
        ppca.fit(data=X_normalized, d=2)
        result = ppca.transform()
        return result
Example #5
def compute_pca(data,
                predictors,
                how='pca',
                what='factors',
                n_components=1,
                use_corr=True):

    data[predictors] = data[predictors].astype('float64')
    X = data[predictors].values
    if use_corr:
        ## PCA computed on the correlation matrix --> standardize the data (zero mean, unit std.)
        scaler = preprocessing.StandardScaler()
        X_std = scaler.fit_transform(X)
    else:
        ## PCA computed on the covariance matrix --> center the data only (zero mean)
        X_mean = np.mean(X, axis=0)
        X_std = X - X_mean

    if how == 'pca':
        pca = PCA(n_components)
        pca.fit(X_std)
        factors = pca.transform(X_std)
        explained_variance = pca.explained_variance_ratio_
        Xhat_std = pca.inverse_transform(factors)
        if use_corr:
            Xhat = scaler.inverse_transform(Xhat_std)
        else:
            Xhat = Xhat_std + X_mean

    elif how == 'ppca':
        ppca = PPCA()
        ppca.fit(X_std, n_components)
        factors = ppca.transform()
        explained_variance = ppca.var_exp
        Xhat_std = ppca.reconstruct()
        if use_corr:
            Xhat = scaler.inverse_transform(Xhat_std)
        else:
            Xhat = Xhat_std + X_mean

    if what != 'recon':
        pca_columns = []
        for i in range(factors.shape[1]):
            pca_columns.append('pca_{}'.format(i))
            data['pca_{}'.format(i)] = factors[:, i]
        return [data, explained_variance]
    else:
        rec_data = pd.DataFrame(Xhat)
        return rec_data
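A quick call sketch for compute_pca; the DataFrame and column names are hypothetical, and the function itself relies on sklearn's preprocessing/PCA (and the PPCA class for how='ppca') being imported in its module:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x1': np.random.randn(100),
                   'x2': np.random.randn(100)})

# Factors from the correlation matrix (standardized inputs).
out, explained = compute_pca(df, predictors=['x1', 'x2'],
                             how='pca', what='factors',
                             n_components=2, use_corr=True)
print(explained)                        # explained variance ratio per component
print(out[['pca_0', 'pca_1']].head())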
Example #6
def run_ppca(data, features, ncomponents=None, min_obs=0.1, use_corr=False):

    X = data[features]
    if ncomponents is None:
        ncomponents = len(features)
    ppca = PPCA(X, d=ncomponents, min_obs=min_obs)
    ppca.standardize()
    ppca.fit()
    scores, loadings = ppca.transform()
    explained_variance = ppca.var_exp

    pca_columns = []
    for i in range(scores.shape[1]):
        pca_columns.append('pc_{}'.format(i))
        data['pc_{}'.format(i)] = scores[:, i]
    return [data, explained_variance]
Example #7
    def run_factorization(self, N, S, X, Z, I, K, k, n):
        # Smart initialization
        rat = k / n
        nans = np.isnan(rat)
        scaled_rat = scale_allelic_ratios(rat)
        ppca = PPCA()
        ppca.fit(data=np.transpose(scaled_rat), d=K, verbose=True)
        U = ppca.C
        V = ppca.transform()
        pickle.dump(ppca, open(self.output_root + '_model', 'wb'))
        np.savetxt(self.output_root + '_temper_U.txt',
                   U,
                   fmt="%s",
                   delimiter='\t')
        np.savetxt(self.output_root + '_temper_V.txt',
                   V.T,
                   fmt="%s",
                   delimiter='\t')
Example #8
    def remove_nan_test(self):

        N = 101
        k = 23
        p_nan = 0.02
        n_components = 3

        data = np.random.random((N, k))
        for n in range(N):
            for _k in range(k):
                if random.random() < p_nan:
                    data[n, _k] = np.nan

        pca = PPCA()
        pca.fit(data, n_components)

        self.assertEqual(pca.data[np.isnan(pca.data)].shape, (0, ))
        self.assertEqual(pca.C.shape, (k, n_components))
        self.assertEqual(pca.transform().shape, (N, n_components))
Example #9
	def run_ppca_initialization(self):
		print('Starting PPCA initialization')
		rat = self.allelic_counts / self.total_counts
		nans = np.isnan(rat)

		scaled_rat = scale_allelic_ratios(rat)
		scaled_residual_rat = regress_out_cell_line(scaled_rat, self.Z)
		rescaled_residual_rat = scale_allelic_ratios(scaled_residual_rat)
		ppca = PPCA()
		ppca.fit(data=np.transpose(rescaled_residual_rat), d=self.K, verbose=True, tol=1e-6)
		self.U_init = ppca.C/np.std(ppca.C)
		# Run beta-binomial matrix factorization (bb-mf)
		with pm.Model() as bb_glm_init:
			CONC = pm.HalfCauchy('CONC', beta=5, shape=(1,self.S), testval=self.conc_init)
			BETA = pm.Normal('BETA', mu=0, tau=(1/1000000.0), shape=(self.S, self.num_cov), testval=self.beta_init)
			#U = pm.Normal('U', mu=0, tau=(1.0/1.0), shape=(N, K), testval=self.U_init)
			V = pm.Normal('V', mu=0, tau=(1.0/1.0), shape=(self.S, self.K), testval=np.zeros(self.V_init.shape))

			MU_A = pm.Normal("MU_A", mu=0., sd=100**2, shape=(1,self.S), testval=self.mu_a_init)
			SIGMA_A = pm.HalfCauchy("SIGMA_A", beta=5.0, shape=(1,self.S), testval=self.sigma_a_init)
			mu_a_mat = pm.math.dot(np.ones((self.I,1)), MU_A)
			sigma_a_mat = pm.math.dot(np.ones((self.I,1)), SIGMA_A)
			A = pm.Normal('A', mu=mu_a_mat, sigma=sigma_a_mat, shape=(self.I,self.S), testval=self.A_init)

			p = pm.math.invlogit(pm.math.dot(self.cov, BETA.T) + pm.math.dot(self.U_init,V.T) + A[self.Z,:])
			conc_mat = pm.math.dot(np.ones((self.N,1)), CONC)
			R = pm.BetaBinomial('like',alpha=(p*conc_mat)[~nans], beta=((1.0-p)*conc_mat)[~nans], n=self.total_counts[~nans], observed=self.allelic_counts[~nans])
			approx_init = pm.fit(method='advi', n=2000)
		pickle.dump(approx_init, open(self.output_root + '_model_init', 'wb'))
		init_dict = approx_init.bij.rmap(approx_init.params[0].eval())
		self.beta_init = init_dict['BETA']
		self.A_init = init_dict['A']
		self.sigma_a_init = np.exp(init_dict['SIGMA_A_log__'])
		self.mu_a_init = init_dict['MU_A']
		self.conc_init = np.exp(init_dict['CONC_log__'])
		self.V_init = init_dict['V']
		print('Smart PPCA complete')
Example #10
def reducePCA(x, ndim):

    # if there are any nans in any of the lists, use ppca
    if np.isnan(np.vstack(x)).any():
        warnings.warn(
            'Missing data: Inexact solution computed with PPCA (see https://github.com/allentran/pca-magic for details)'
        )

        # ppca if missing data
        m = PPCA(np.vstack(x))
        m.fit(d=ndim)
        x_pca = m.transform()

        # if the whole row is missing, return nans
        all_missing = [
            idx for idx, a in enumerate(np.vstack(x))
            if np.isnan(a).all()
        ]
        if len(all_missing) > 0:
            for i in all_missing:
                x_pca[i, :] = np.nan

        # get the original lists back
        if len(x) > 1:
            x_split = np.cumsum([i.shape[0] for i in x][:-1])
            return list(np.split(x_pca, x_split, axis=0))
        else:
            return [x_pca]

    else:
        m = PCA(n_components=ndim, whiten=True)
        m.fit(np.vstack(x))
        if len(x) > 1:
            return [m.transform(i) for i in x]
        else:
            return [m.transform(x[0])]
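A small sketch of how reducePCA might be called on a list of arrays; one array contains a NaN so the PPCA branch is exercised. The arrays are made up, and numpy, sklearn's PCA, and the PPCA class are assumed to be imported as in the function's own module:

import numpy as np

a = np.random.randn(40, 10)
b = np.random.randn(30, 10)
b[3, 2] = np.nan                     # force the missing-data (PPCA) path

reduced = reducePCA([a, b], ndim=2)
print([r.shape for r in reduced])    # expect [(40, 2), (30, 2)], rows split back per input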
Example #11
def do_probabilistic_PCA(df, var, output):
    """ Perform probabilistic PCA (PPCA) on scaled values for the whole screen

        Args:
            df:             Existing combined dictionary
            var:            Minimum explained variance required
            output:         Output filenames

        Return:
            df:             Updated with added 'DataPCA' values
            exp_var:        List of explained variance with each added PC
            num_PCs:        Number of PCs needed to explain var
            PPCA_loadings:  Principal axes in feature space (n_components, n_features)
    """

    print('Feature selection using probabilistic PCA...')
    log_write(output['log'], 'Feature selection using probabilistic PCA...\n')

    # Initialize parameters
    exp_var = [0]
    exp_var_ratio = []
    num_PCs = 0
    ppca = PPCA()
    ppca.fit(df['DataScaled'], d=2)
    exp_var.append(ppca.var_exp[0])
    exp_var_ratio.append(ppca.var_exp[0])

    # Do PPCA with number of components iteratively (max is the number of features, min is 2)
    for i in range(2, df['DataScaled'].shape[1]):
        num_PCs = i
        ppca = PPCA()
        ppca.fit(df['DataScaled'], d=i)
        total_var = ppca.var_exp[i - 1]
        exp_var.append(total_var)
        exp_var_ratio.append(ppca.var_exp[i - 1] - ppca.var_exp[i - 2])
        # End PCA if the total variance passes the minimum variance required
        if total_var > var:
            num_PCs = i
            np.savetxt(output['PCAExplainedVariance'],
                       exp_var_ratio,
                       fmt='%0.4f')
            break

    # Do the final PCA with num_PCs
    ppca = PPCA()
    ppca.fit(df['Data'], d=num_PCs)
    df['DataPCA'] = ppca.transform()
    PPCA_loadings = np.transpose(ppca.C)

    return df, exp_var, num_PCs, PPCA_loadings
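The loop above keeps adding components until the cumulative explained variance passes var. The same stopping rule, sketched with sklearn's PCA on complete toy data just to make the threshold logic concrete (the function itself uses PPCA, so it also handles missing values):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.randn(200, 8) @ np.random.randn(8, 8)    # toy complete data
threshold = 0.9

cum_var = np.cumsum(PCA().fit(X).explained_variance_ratio_)
num_PCs = int(np.argmax(cum_var > threshold)) + 1       # smallest d whose cumulative variance passes the threshold
print(num_PCs, cum_var[num_PCs - 1])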
Example #12
import numpy as np
from numpy import concatenate

# X (a list/Series of space-separated pixel strings) and the PPCA class are
# assumed to be defined/imported earlier in the original script.
E = []
for i in range(len(X)):
	Y = np.fromstring(X[i], dtype=int, sep=' ')
	Y = np.reshape(Y, (48, 48))
	E.append(Y)

X_inp = np.array(E)
# X_train = X_inp.reshape(-1, X_inp.shape[1], X_inp.shape[2], 1)
X_train = X_inp.astype('float32')
print(X_inp)

inp_img = X_train[0, :, :]
ppca = PPCA(inp_img)
ppca.fit(d=20, verbose=False)
component_mat = ppca.transform()
E_y = component_mat.reshape(1, component_mat.shape[0], component_mat.shape[1])

for i in range(1, len(X_train)):
	print(i)
	inp_img = X_train[i, :, :]
	ppca = PPCA(inp_img)
	try:
		ppca.fit(d=20, verbose=False)
		component_mat = ppca.transform()
		shape = component_mat.shape
		component_mat = component_mat.reshape(1, component_mat.shape[0], component_mat.shape[1])
		if shape[1] == 20:
			E_y = concatenate((E_y, component_mat))
	except np.linalg.LinAlgError:
		pass  # skip images where the PPCA fit does not converge
Example #13
SelectedImage = showImagesRandomImages(3)  # select an image randomly from the MNIST dataset
missingPercentage = 0.2  # missing rate percentage
missingImage = generateMissingFig(SelectedImage, missingPercentage)  # insert missing values into the original image

imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputed_by_KNN = imputer.fit_transform(missingImage)
KNNImputed_RMSE = mean_squared_error(SelectedImage, imputed_by_KNN) ** 0.5  # RMSE
#plt.imshow(imputed_by_KNN, cmap='gray', vmin=0, vmax=1)
#plt.show()

imputer = MissForest()
MissForest_imputed = imputer.fit_transform(missingImage)
MissForest_RMSE = mean_squared_error(SelectedImage, MissForest_imputed) ** 0.5  # RMSE
#plt.imshow(MissForest_imputed, cmap='gray', vmin=0, vmax=1)
#plt.show()

imputer = IterativeImputer()
MICE_imputed = imputer.fit_transform(missingImage)
MICE_RMSE = mean_squared_error(SelectedImage, MICE_imputed) ** 0.5  # RMSE
#plt.imshow(MICE_imputed, cmap='gray', vmin=0, vmax=1)
#plt.show()

ppca = PPCA()
ppca.fit(data=SelectedImage, d=100, verbose=True)
PPCA_imputed = ppca.transform(missingImage)
PPCA_RMSE = mean_squared_error(SelectedImage, PPCA_imputed) ** 0.5  # RMSE
#plt.imshow(PPCA_imputed, cmap='gray', vmin=0, vmax=1)
#plt.show()
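A short follow-up that collects the four error scores computed above for a side-by-side comparison (variable names are the ones from this example; lower is better):

results = {
    'KNN': KNNImputed_RMSE,
    'MissForest': MissForest_RMSE,
    'MICE': MICE_RMSE,
    'PPCA': PPCA_RMSE,
}
for name, score in sorted(results.items(), key=lambda kv: kv[1]):
    print('{:<12s}{:.4f}'.format(name, score))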
Example #14
# @Date:   2020-07-07 11:22:28
# @Last Modified by:   ashayaan
# @Last Modified time: 2020-07-07 12:44:33

import torch
import pandas as pd 
import numpy as np
import json
from ppca import PPCA
import pickle

if __name__ == '__main__':
	roberta_features = pd.read_csv('../data/roberta.csv')
	roberta_features = roberta_features.set_index('idx')

	mca_features = pd.read_csv('../data/mca.csv')
	mca_features = mca_features.set_index('idx')

	pca_features = pd.read_csv('../data/pca.csv')
	pca_features = pca_features.set_index('idx')

	links = pd.read_csv('../data/ml-20m/links.csv')

	df = pd.concat([roberta_features, mca_features, pca_features], axis=1)
	ppca = PPCA()
	ppca.fit(data=df.values.astype(float), d=128, verbose=True)
	print(ppca.var_exp)

	transformed = ppca.transform()
	films_dict = {k: torch.tensor(transformed[i]).float() for i, k in enumerate(df.index)}
	pickle.dump(films_dict, open('../data//ml20_pca128.pkl', 'wb'))
Example #15
import numpy             as np
import matplotlib.pyplot as plt
from numpy.random       import multivariate_normal
from ppca               import PPCA

if __name__ == '__main__':
    
    cov  = np.diag([10, 9, 8, 7] + [1]*28 + [6, 5, 4, 3] + [1]*28)**2
    data = multivariate_normal(np.zeros(64), cov, 256)
   
    ppca1 = PPCA(n_dimension=4)
    ppca1.fit(data, method='EM')
    ppca2 = PPCA(n_dimension=4)
    ppca2.fit(data, method='eig')
    
    print('\n\n\n\n**TEST FITTING THE COVARIANCE MATRIX**')
    plt.matshow(cov)
    print('\n\noriginal covariance matrix')
    plt.show()
    plt.matshow(ppca1._C)
    print('\n\nfitted covariance matrix (fitted by EM)')
    plt.show()
    plt.matshow(ppca2._C)
    print('\n\nfitted covariance matrix (fitted by eigen)')
    plt.show()

    print('\n\n\n\n**TEST GENERATING DATA**')
    plt.scatter(data[:, 0], data[:, 1], alpha=0.2)
    print('\n\noriginal data (first 2 dimensions)')
    plt.show()
    gene = ppca1.generate(256)
Example #16
if __name__ == '__main__':
	omdb = json.load(open('../data/parsed/omdb.json'))
	tmdb = json.load(open('../data/parsed/tmdb.json'))
	
	numerical_features = {'omdb': ['Year', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes'],
						'tmdb': ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']}

	omdb_numerical = extractFeatures(omdb, 'omdb', numerical_features['omdb'])
	tmdb_numerical = extractFeatures(tmdb, 'tmdb', numerical_features['tmdb'])
	data = dict([(i, {**omdb_numerical[i], **tmdb_numerical[i]}) for i in omdb_numerical.keys()])
	data = extractRatings(data)
	# data = dict([(i,{**omdb_numerical[i],**tmdb_numerical[i]}) for i in omdb_numerical.keys()])
	df = pd.DataFrame.from_dict(data).T
	df.replace('N/A', np.nan, inplace=True)
	df.to_pickle('temp.pkl')
	df = fixData(df)
	
	df = df.head(100)

	ppca = PPCA()
	print(time.ctime())
	ppca.fit(df.values.astype(float), d=16, verbose=True)
	print(time.ctime())

	transformed = ppca.transform()
	transformed = pd.DataFrame(transformed)
	transformed['idx'] = pd.Series(list(omdb.keys()))
	transformed = transformed.set_index('idx')
	print(transformed.head())
	transformed.to_csv('../data/pca.csv', index=True, index_label='idx')