def my_kernelPCA(kernel):
    """Run kernel PCA on the superposed, mean-centred selected-atom coordinates.

    Reads the module-level ``pca_traj``, ``sele_grp`` and ``out_dir``. The
    trajectory is superposed on its first frame, the selected atoms'
    coordinates are flattened to one row per frame and mean-centred, and a
    ``KernelPCA`` model is fitted and used to project them. Projection plots,
    a figure and the kernel eigenvalues are written under ``out_dir``; the
    cosine content of the first four components is printed.

    Args:
        kernel: Value forwarded unchanged to ``KernelPCA(kernel=...)``.

    Returns:
        None.
    """
    # Superpose each conformation in the trajectory upon the first frame.
    pca_traj.superpose(pca_traj, 0, atom_indices=sele_grp)

    # Coordinates of the selected atoms, one flattened row per frame.
    sele_trj = pca_traj.xyz[:, sele_grp, :]
    coords = sele_trj.reshape(pca_traj.n_frames, len(sele_grp) * 3)
    # Cast to float to avoid a numpy conversion error during scaling.
    coords = coords.astype(float)
    # Centre every column on its mean; with_std=False skips variance scaling.
    centred = preprocessing.scale(coords, axis=0, with_std=False)

    kpca = KernelPCA(kernel=kernel, fit_inverse_transform=True, gamma=10)
    kpca.fit(centred)
    kpca_reduced = kpca.transform(centred)

    # Write plots.
    write_plots('kpca_projection', kpca_reduced, out_dir)
    write_fig('kpca_projection', kpca_reduced, out_dir, 'kPCA Projection')

    # Write variance. Newer scikit-learn exposes the eigenvalues as
    # ``eigenvalues_``; older releases only have ``lambdas_``.
    eigenvalues = getattr(kpca, 'eigenvalues_', None)
    if eigenvalues is None:
        eigenvalues = kpca.lambdas_
    kpca_variance_fname = out_dir + '/kpca_variance'
    np.savetxt(kpca_variance_fname, eigenvalues)

    # Single-argument print() emits the same text on Python 2 and Python 3.
    for idx, label in enumerate(('first', 'second', '3rd', '4th')):
        print('cosine content of %s PC= %s'
              % (label, get_cosine(kpca_reduced, idx)))
def mds(input, type):
    """Metric and non-metric multidimensional scaling.

    Builds an ``MDS`` model with ``dissimilarity="precomputed"``, embeds
    ``input`` with ``fit_transform``, writes the projection plots and figure
    under the module-level ``out_dir`` and prints the cosine content of the
    first four components.

    Args:
        input: Data handed unchanged to ``MDS.fit_transform``.
        type: ``'nm'`` selects non-metric MDS, ``'metric'`` selects metric
            MDS. Any other value prints an error message and does nothing.

    Returns:
        None.
    """
    seed = np.random.RandomState(seed=1)

    # Pick the model and the branch-specific strings first; the remaining
    # steps are the same for both flavours.
    if type == 'nm':
        model = MDS(n_components=100,
                    max_iter=3000,
                    metric=False,
                    random_state=seed,
                    dissimilarity="precomputed")
        banner = "Performing non-metric MDS.."
        stem = 'nmds_projection'
        title = 'nMDS Projection'
    elif type == 'metric':
        model = MDS(n_components=100,
                    max_iter=3000,
                    random_state=seed,
                    dissimilarity="precomputed")
        banner = "Performing metric MDS.."
        stem = 'mmds_projection'
        title = 'mMDS Projection'
    else:
        print('ERROR: Please check -mt flag options by running mds.py -h')
        return

    print(banner)
    embedding = model.fit_transform(input)

    # Write PC plots.
    write_plots(stem, embedding, out_dir)
    write_fig(stem, embedding, out_dir, title)

    # Cosine content of the first four components.
    cosine_labels = (
        'cosine content of first PC=',
        'cosine content of second PC=',
        'cosine content of 3rd PC=',
        'cosine content of 4th PC=',
    )
    for component, label in enumerate(cosine_labels):
        print(label, get_cosine(embedding, component))
def tsne(input):
    """t-distributed Stochastic Neighbor Embedding with fixed settings.

    Embeds ``input`` into three components with ``TSNE`` (3000 iterations,
    PCA initialisation, random state seeded with 1) and writes the projection
    plots and figure under the module-level ``out_dir``.

    Args:
        input: Data handed unchanged to ``TSNE.fit_transform``.

    Returns:
        None.
    """
    # NOTE(review): another ``def tsne`` appears later in this source; if both
    # live in the same module the later one replaces this definition.
    seed = np.random.RandomState(seed=1)
    # Apparently n_components above 3 throws an error in certain cases.
    my_tsne = TSNE(n_components=3, n_iter=3000, random_state=seed, init='pca')

    # Single-argument print() emits the same text on Python 2 and Python 3.
    print("Performing TSNE...")
    mpos = my_tsne.fit_transform(input)

    write_plots('tsne_projection', mpos, out_dir)
    write_fig('tsne_projection', mpos, out_dir, 't-SNE Projection')
def tsne(input):
    """t-distributed Stochastic Neighbor Embedding driven by ``args``.

    Reads ``perplexity``, ``n_iter`` and ``learning_rate`` from the
    module-level ``args``, embeds ``input`` into three components and writes
    the projection plots and figure under the module-level ``out_dir``.

    Args:
        input: Data handed unchanged to ``TSNE.fit_transform``.

    Returns:
        None.
    """
    rng = np.random.RandomState(seed=1)

    # Apparently n_components above 3 throws an error in certain cases.
    tsne_options = dict(
        n_components=3,
        perplexity=args.perplexity,
        n_iter=args.n_iter,
        learning_rate=args.learning_rate,
        random_state=rng,
        init='pca',
    )
    embedder = TSNE(**tsne_options)

    print("Performing TSNE...with perplexity", args.perplexity, "n_iter",
          args.n_iter, " and learning_rate", args.learning_rate)
    projection = embedder.fit_transform(input)

    stem = 'tsne_projection'
    write_plots(stem, projection, out_dir)
    write_fig(stem, projection, out_dir, 't-SNE Projection')
def distance_pca(int_cord1):
    """Internal-coordinate based PCA.

    Fits ``PCA(n_components=comp)`` (``comp`` is a module-level name) to
    ``int_cord1``, projects the same data, writes the projection plots, the
    principal components and a figure under the module-level ``out_dir``, and
    prints the cosine content of the first four components.

    Args:
        int_cord1: Data handed unchanged to ``PCA.fit`` and ``PCA.transform``.
            Presumably one row of internal coordinates per frame -- confirm
            against the caller.

    Returns:
        None.
    """
    pca = PCA(n_components=comp)
    dpca = pca.fit(int_cord1)
    dpca_reduced = dpca.transform(int_cord1)

    write_plots('dpca_projection', dpca_reduced, out_dir)
    write_pcs('dpca_pcs', dpca, out_dir)
    title = 'internal coordinate PCA Projection'
    write_fig('dpca_projection', dpca_reduced, out_dir, title)

    # Single-argument print() emits the same text on Python 2 and Python 3.
    for idx, label in enumerate(('first', 'second', '3rd', '4th')):
        print('cosine content of %s PC= %s'
              % (label, get_cosine(dpca_reduced, idx)))
def svd_pca(svd):
    """Singular value decomposition based PCA of the selected atoms.

    Reads the module-level ``pca_traj``, ``sele_grp``, ``comp`` and
    ``out_dir``. The trajectory is superposed on its first frame, the selected
    atoms' coordinates are flattened to one row per frame and mean-centred,
    and ``PCA(n_components=comp)`` is fitted and used to project them. The
    trace of the model's covariance matrix and the cosine content of the first
    four components are printed; projection plots, a figure and the component
    variance are written under ``out_dir``.

    Args:
        svd: Not used inside this function; kept so existing callers that
            pass it keep working.

    Returns:
        None.
    """
    # Superpose each conformation in the trajectory upon the first frame.
    pca_traj.superpose(pca_traj, 0, atom_indices=sele_grp)

    # Coordinates of the selected atoms, one flattened row per frame.
    sele_trj = pca_traj.xyz[:, sele_grp, :]
    coords = sele_trj.reshape(pca_traj.n_frames, len(sele_grp) * 3)
    # Cast to float to avoid a numpy conversion error during scaling.
    coords = coords.astype(float)
    # Centre every column on its mean; with_std=False skips variance scaling.
    centred = preprocessing.scale(coords, axis=0, with_std=False)

    pca_sele_traj = PCA(n_components=comp)
    pca_sele_traj.fit(centred)
    pca_sele_traj_reduced = pca_sele_traj.transform(centred)

    # Single-argument print() emits the same text on Python 2 and Python 3
    # (the doubled space matches the original statement's output).
    print('Trace of the covariance matrix is:  %s'
          % (np.trace(pca_sele_traj.get_covariance()),))

    # Write the plots.
    write_plots('pca_projection', pca_sele_traj_reduced, out_dir)
    write_fig('pca_projection', pca_sele_traj_reduced, out_dir,
              'PCA Projection')

    # Write the PCs' variance.
    write_pcs('pca_variance', pca_sele_traj, out_dir)

    for idx, label in enumerate(('first', 'second', '3rd', '4th')):
        print('cosine content of %s PC= %s'
              % (label, get_cosine(pca_sele_traj_reduced, idx)))
def incremental_pca():
    """Incremental PCA (IPCA) of the selected atoms.

    Normal PCA is very memory intensive, which can be problematic for large
    datasets because the whole dataset is held in memory; incremental PCA is
    typically used for such cases.

    Reads the module-level ``pca_traj``, ``sele_grp`` and ``out_dir``. The
    trajectory is superposed on its first frame, the selected atoms'
    coordinates are flattened to one row per frame and mean-centred, and an
    ``IncrementalPCA`` with default settings is fitted and used to project
    them. Projection plots and a figure are written under ``out_dir`` and the
    cosine content of the first four components is printed.

    Returns:
        None.
    """
    # Superpose each conformation in the trajectory upon the first frame.
    pca_traj.superpose(pca_traj, 0, atom_indices=sele_grp)

    # Coordinates of the selected atoms, one flattened row per frame.
    sele_trj = pca_traj.xyz[:, sele_grp, :]
    coords = sele_trj.reshape(pca_traj.n_frames, len(sele_grp) * 3)
    # Cast to float to avoid a numpy conversion error during scaling.
    coords = coords.astype(float)
    # Centre every column on its mean; with_std=False skips variance scaling.
    centred = preprocessing.scale(coords, axis=0, with_std=False)

    ipca = IncrementalPCA()
    ipca = ipca.fit(centred)
    ipca_reduced = ipca.transform(centred)

    # Write plots.
    write_plots('ipca_projection', ipca_reduced, out_dir)
    write_fig('ipca_projection', ipca_reduced, out_dir, 'iPCA Projection')

    # Single-argument print() emits the same text on Python 2 and Python 3.
    for idx, label in enumerate(('first', 'second', '3rd', '4th')):
        print('cosine content of %s PC= %s'
              % (label, get_cosine(ipca_reduced, idx)))
def my_pca():
    """Eigenvalue-decomposition PCA of the selected atoms.

    Reads the module-level ``pca_traj``, ``sele_grp`` and ``out_dir``. The
    trajectory is superposed on its first frame, the selected atoms'
    coordinates are flattened to one row per frame and mean-centred, and the
    matrix returned by ``np.corrcoef`` is diagonalised with ``np.linalg.eig``.
    Eigenpairs are verified, sorted by decreasing eigenvalue, and the data are
    projected onto the first 100 eigenvectors.

    Side effects: writes ``<out_dir>/pca_variance.agr`` (percentage variance
    of the kept components with a header), writes the projection plots and
    figure, calls ``get_rmsf`` with the sorted eigenvalues, and prints the
    matrix trace, the projection shape and the cosine content of the first
    four components.

    Returns:
        None.

    Raises:
        AssertionError: If an eigenpair fails the ``A v = lambda v`` check to
            3 decimals.
    """
    # Superpose each conformation in the trajectory upon the first frame.
    pca_traj.superpose(pca_traj, 0, atom_indices=sele_grp)

    # Coordinates of the selected atoms, one flattened row per frame.
    sele_trj = pca_traj.xyz[:, sele_grp, :]
    coords = sele_trj.reshape(pca_traj.n_frames, len(sele_grp) * 3)
    # Cast to float to avoid a numpy conversion error during scaling.
    coords = coords.astype(float)
    # Centre every column on its mean; with_std=False skips variance scaling.
    arr = preprocessing.scale(coords, axis=0, with_std=False)

    # NOTE(review): np.corrcoef returns the correlation matrix, although the
    # variable name and the printed label call it a covariance matrix. Left
    # unchanged here -- confirm whether np.cov was intended.
    cov_mat = np.corrcoef(arr, rowvar=False)
    trj_eval, trj_evec = np.linalg.eig(cov_mat)
    # Single-argument print() emits the same text on Python 2 and Python 3
    # (the doubled space matches the original statement's output).
    print('Trace of cov matrix is  %s' % (np.trace(cov_mat),))

    # Sanity check of the calculated eigenvectors and eigenvalues:
    # matrix * eigenvector must equal eigenvalue * eigenvector.
    n_features = trj_evec.shape[0]
    for i, eigenvalue in enumerate(trj_eval):
        eigv = trj_evec.real[:, i].reshape(n_features, 1)
        np.testing.assert_array_almost_equal(cov_mat.dot(eigv),
                                             eigenvalue * eigv,
                                             decimal=3,
                                             err_msg='',
                                             verbose=True)

    # Sort eigenvalues in decreasing order and reorder the eigenvectors to
    # match. np.linalg.eig stores eigenvector i in COLUMN i, so the columns
    # must be permuted; indexing rows would scramble every eigenvector.
    sort_idx = trj_eval.argsort()[::-1]
    trj_eval = trj_eval[sort_idx]
    trj_evec = trj_evec[:, sort_idx]

    tot_var = np.sum(trj_eval.real)
    n_comp = 100
    pca = trj_evec.real[:, 0:n_comp]  # keep the first 100 eigenvectors
    # Percentage of the total variance carried by each kept component.
    variation = [(ev / tot_var) * 100 for ev in trj_eval.real[0:n_comp]]

    # Write the variance file: let numpy format the numbers, then re-read the
    # text so a header can be put in front of it.
    pca_variance_fname = out_dir + '/pca_variance.agr'
    np.savetxt(pca_variance_fname, variation)
    with open(pca_variance_fname, 'r') as ef:
        ef_cont = ef.read()

    title = '\tcreated by pca.py\t'
    my_time = strftime("%Y-%m-%d %a %H:%M:%S", gmtime())
    # Continuation lines keep one leading blank before '@'.
    legends = ('@ title "explained_variance of PCs"\n'
               ' @ xaxis label "PCs"\n'
               ' @ yaxis label "% Variance"\n'
               ' @ TYPE xy\n'
               ' @ s0 symbol 1\n'
               ' @ s0 symbol size 0.250000\n'
               ' @ s0 symbol color 1\n'
               ' @ s0 symbol pattern 1\n'
               ' @ s0 symbol fill color 1\n'
               ' @ s0 symbol fill pattern 1\n'
               ' @ s0 symbol linewidth 1.0\n'
               ' @ s0 symbol linestyle 1\n'
               ' @ s0 symbol char 25\n'
               ' @ s0 symbol fill color 2\n'
               ' @ s0 symbol color 2\n'
               ' @ s0 symbol char font 0\n'
               ' @ s0 symbol skip 0\n')
    with open(pca_variance_fname, 'w') as ef:
        ef.write('#' + title + '\ton\t' + my_time + '\n' + legends + '\n' +
                 ef_cont + '\n')

    # Transform the input data onto the chosen principal components.
    arr_transformed = arr.dot(pca)
    print(arr_transformed.shape)

    write_plots('pca_projection', arr_transformed, out_dir)
    write_fig('pca_projection', arr_transformed, out_dir, 'PCA Projection')

    # RMSF
    get_rmsf(pca_traj, sele_grp, trj_eval, out_dir)

    for idx, label in enumerate(('first', 'second', '3rd', '4th')):
        print('cosine content of %s PC= %s'
              % (label, get_cosine(arr_transformed, idx)))