import numpy as np
import pyemma.coordinates as coor


def project_and_cluster(trajfiles, featurizer, sparsify=False, tica=True,
                        lag=100000, scale=True, var_cutoff=1.0, cluster=True,
                        ncluster=100):
    """Featurize trajectories, project them with TICA or PCA, and
    optionally cluster the projection with k-means.

    Returns
    -------
    trans_obj, Y, clustering (the clustering object is only returned
    when cluster=True)
    """
    X = coor.load(trajfiles, featurizer)
    if sparsify:
        X = remove_constant(X)  # drop constant features (helper sketched below)
    if tica:
        trans_obj = coor.tica(X, lag=lag, var_cutoff=var_cutoff)
    else:
        trans_obj = coor.pca(X, dim=-1, var_cutoff=var_cutoff)
    Y = trans_obj.get_output()
    if scale:
        # weight each projected coordinate by its eigenvalue
        for y in Y:
            y *= trans_obj.eigenvalues[:trans_obj.dimension()]
    if cluster:
        cl_obj = coor.cluster_kmeans(Y, k=ncluster, max_iter=3,
                                     fixed_seed=True)
        return trans_obj, Y, cl_obj
    return trans_obj, Y

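# remove_constant is referenced above but not defined in this excerpt. A
# minimal sketch of its assumed behavior (a reconstruction, not the original
# helper): drop feature columns whose variance is essentially zero, so the
# TICA/PCA covariance matrices stay well conditioned.
def remove_constant(X, threshold=1e-8):
    """Return X (a list of (n_frames, n_features) arrays) without
    (near-)constant feature columns."""
    Xcat = np.concatenate(X)             # stack all trajectories frame-wise
    keep = Xcat.std(axis=0) > threshold  # columns that actually vary
    return [x[:, keep] for x in X]
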
import glob
import os

import pyemma.coordinates as coor


def loadCoordinates(path, trajectories_basename='*traj*.pdb', topfile=''):
    """Load the coordinates from the simulation into an object that can be
    used with the rest of the PyEMMA tools.

    Returns a list with the featurized trajectories."""
    feat = coor.featurizer(topfile)
    path_to_file = os.path.join(path, trajectories_basename)
    files = glob.glob(path_to_file)
    return coor.load(files, feat)

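# A minimal usage sketch for loadCoordinates (the directory and topology
# paths below are placeholders, not from the original project):
if __name__ == '__main__':
    trajs = loadCoordinates('simulations/run_0', topfile='system.pdb')
    print(len(trajs), trajs[0].shape)
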
def run_sampling(args): topology = "Native.pdb" ticadim = 10 num_sample_frames = 10000 fn = args.file # file name wn = args.weights # weights name weights = np.loadtxt(wn) weights = weights / np.sum(weights) # first time time1 = time.clock() feat = coor.featurizer(topology) feat.add_distances_ca() X1 = coor.load(fn, feat, stride=1) # time for loading time2 = time.clock() print "Took %f minutes to load a file" % ((time2 - time1) / 60.0) sampled_frames = np.zeros((num_sample_frames, np.shape(X1)[1])) selected_frames = np.random.choice(np.shape(X1)[0], size=num_sample_frames, replace=True, p=weights) time3 = time.clock() print "Took %f minutes to select new frames" % ((time3 - time2) / 60.0) for i in range(num_sample_frames): ##debug # print np.shape(sampled_frames) # print np.shape(X1) ##debugg sampled_frames[i, :] = X1[selected_frames[i], :] time4 = time.clock() print "Took %f minutes to load the new frames" % ((time4 - time3) / 60.0) ##debug for j in sampled_frames: for i in j: if i == 0: print "ERROR, distance too short, something not written" f = open("log.txt", "w") f.write("ERROR, distance too short, something not written") f.close() ##debugg time5 = time.clock() print "Took %f minutes to go through the debug check" % ((time5 - time4) / 60.0) tica_obj = coor.tica(sampled_frames, stride=1, lag=1, dim=ticadim) time6 = time.clock() print "Took %f minutes to calculate the tica_object" % ((time6 - time5) / 60.0) outputs = tica_obj.get_output()[0] eigen = tica_obj.eigenvalues time7 = time.clock() print "Took %f minutes to get the output of the tica_object" % ((time7 - time6) / 60.0) print "saving files" np.savetxt("output.dat", outputs) np.savetxt("eigenvalues.dat", eigen) print "files saved" time8 = time.clock() print "Took %f minutes to write the output files" % ((time8 - time7) / 60.0)
import numpy as np
import pyemma.coordinates as coor

# tmeth is the local helper module used throughout these scripts (pair
# generation and plotting); it is assumed to be imported in the original.


def run_analysis(args):
    feat = coor.featurizer(args.topfile)
    feat.add_distances(tmeth.generate_pairs(args.range[0], args.range[1],
                                            args.step_size, args.cut_value))
    traj = coor.load(args.traj_file, feat, stride=args.stride)

    tica_obj = coor.tica(traj, stride=1, lag=args.lag, dim=args.ticadim)
    outputs = tica_obj.get_output()[0]
    eigen = tica_obj.eigenvalues

    np.savetxt("%s_output_raw.dat" % args.title, outputs)
    np.savetxt("%s_eigenvalues_raw.dat" % args.title, eigen)
    tmeth.plot_eigen_series(eigen, args.title,
                            time_scale=args.time_step * args.stride)
    tmeth.plot_output(outputs, args.title,
                      time_scale=args.time_step * args.stride)

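# tmeth.generate_pairs is not shown in this excerpt. A minimal sketch that is
# consistent with the explicit pair-building loop appearing later in these
# scripts (start/stop index range, a stride between indices, and a minimum
# index separation); the argument names are assumptions:
def generate_pairs(start, stop, step, min_separation):
    """Return an (n_pairs, 2) int array of index pairs [i, j] with
    j >= i + min_separation, both sampled on a grid of the given step."""
    pairs = []
    for i in range(start, stop, step):
        for j in range(i + min_separation, stop, step):
            pairs.append([i, j])
    return np.array(pairs)
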
import time

import numpy as np
import pyemma.coordinates as coor

# tmeth (generate_pairs) is assumed to be imported as in the other scripts.


def run_sampling(args):
    topology = args.topfile
    ticadim = 10
    num_sample_frames = 10000
    tica_lag_time = 5

    fn = args.filedir  # directory holding the trajectories
    wn = args.weights  # weights file name
    weights = np.loadtxt(wn)
    weights = weights / np.sum(weights)  # normalize so probabilities sum to 1

    time1 = time.perf_counter()
    feat = coor.featurizer(topology)
    feat.add_distances(tmeth.generate_pairs(5, 288, 4, 4))

    # draw trajectories (with replacement) according to the supplied weights
    selected_frames = np.random.choice(args.number_traj, size=num_sample_frames,
                                       replace=True, p=weights)
    selected_frames.sort()
    selected_files = ["%s/traj%d.xtc" % (fn, i) for i in selected_frames]
    time2 = time.perf_counter()
    print("Took %f minutes to select new frames" % ((time2 - time1) / 60.0))

    sampled_frames = coor.load(selected_files, feat, stride=10)
    time3 = time.perf_counter()
    print("Took %f minutes to load the new frames" % ((time3 - time2) / 60.0))

    tica_obj = coor.tica(sampled_frames, stride=1, lag=tica_lag_time,
                         dim=ticadim)
    time4 = time.perf_counter()
    print("Took %f minutes to calculate the tica_object" % ((time4 - time3) / 60.0))

    # get_output() returns one array per input trajectory; stack them all at
    # once instead of calling get_output() again for every trajectory as the
    # original loop did
    all_outputs = np.concatenate(tica_obj.get_output(), axis=0)
    eigen = tica_obj.eigenvalues

    print("saving files")
    np.savetxt("output.dat", all_outputs)
    np.savetxt("eigenvalues.dat", eigen)
    np.savetxt("selected_frames.dat", selected_frames)
    print("files saved")
    time5 = time.perf_counter()
    print("Took %f minutes to write the output files" % ((time5 - time4) / 60.0))

import numpy as np
import pyemma.coordinates as coor

import analysis_scripts.plot_package as pltpkg

if __name__ == "__main__":
    topology = "firstframe.pdb"
    feat = coor.featurizer(topology)
    pairs = np.array([[79, 492]])
    feat.add_distances(pairs)
    print(feat.describe())

    # trajectory files are zero-padded to three digits: 000 ... 049
    files_list = ["ww_2-protein-%03d.dcd" % i for i in range(50)]

    output = coor.load(files_list, features=feat)
    print(np.shape(output))
    yvalues = np.array(output).flatten()
    print(np.shape(yvalues))
    np.savetxt("trace_ww_2.dat", yvalues)
    print(np.max(yvalues))
    print(np.min(yvalues))

import mdtraj as md
import pickle
import matplotlib.pyplot as plt
import time
import numpy as np
import pyemma.coordinates as coor

## set path
start_time = time.time()
indir = '/scratch/jap12009/gamma/pg'
topfile = '/scratch/jap12009/gamma/gamma1.pdb'
save_file = 'pgTICs-ca-con-8.pkl'

## create list of trajectories and Colvar files
traj_list = [indir + "/p60win{:2.1f}.xtc".format(i)
             for i in np.arange(0.8, 15.5, 0.1)]

## define topology
f = coor.featurizer(topfile)
f.add_distances_ca()

## load trajectories and colvar files
inp = coor.load(traj_list, f)

## tica
# discard the first 20000 frames of each trajectory and keep every 10th frame
inp1 = [i[20000::10] for i in inp]
cumvar3 = []
timescales1 = []
timescales2 = []
timescales3 = []
print('length of inp1 is ', len(inp1))
print('shape of inp1[0] is ', inp1[0].shape)

#for i in range(5000,39999,5000):
for i in [2000, 4000, 6000, 8000, 10000, 12000]:
    k = int(i / 10)

#!/usr/bin/env python
# Plot tica data
import mdtraj as md
import pyemma.coordinates as coor
import numpy as np
import pickle
import pyemma
import os
import pandas as pd
import pyemma.plots as pyemma_plots
import matplotlib.pyplot as plt

# note: `sys` here is a path prefix for the plots, shadowing the stdlib name
sys = 'tica_plots_06/fdis/fdis'  ## What tica data do you want to look at
tica_data = coor.load('tica_data_05/fdis_tica_data.h5')  ## Where is that tica data?
tica_data_cat = np.concatenate(tica_data)


def kin_var_plot():
    '''Plot cumulative kinetic variance vs TICA index to identify how many
    tica components to use'''
    cumvar = np.load(open("tica_data_05/chi2_cumvar.npy", "rb"))
    fig, ax = plt.subplots()
    index = range(1, len(cumvar) + 1)
    ax.plot(index, cumvar)
    ax.axhline(y=0.95, c='y')
    ax.axhline(y=0.90, c='C1')
    ax.axhline(y=0.85, c='r')
    ax.set_xlabel('Tica index', fontsize=16)
    ax.set_ylabel('Cumulative kinetic variance', fontsize=16)

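# kin_var_plot is only defined above; a minimal driver (writing the figure
# under the `sys` prefix is an assumption, not in the original):
if __name__ == '__main__':
    kin_var_plot()
    plt.savefig(f'{sys}_cumulative_kinetic_variance.png', dpi=300)
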
import pyemma.coordinates as coor
import numpy as np
import pyemma.msm as msm
import pyemma.plots as pyemma_plots
import matplotlib.pyplot as plt

sys = 'fdis'
n_clusters = 100
dtrajs = coor.load(f'cluster_data_10/{sys}_{n_clusters}_cluster_dtrajs.h5')
max_lag = 15

# cast the discrete trajectories to integer arrays and flatten them to 1D
dt2 = [i.astype(np.int_) for i in dtrajs]
dt3 = [i.reshape((i.shape[0])) for i in dt2]

msm1 = msm.estimate_markov_model(dt3, 5)
print(f'Active state fraction is {msm1.active_state_fraction}')
print(f'Active count fraction is {msm1.active_count_fraction}')

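# max_lag and pyemma_plots are set up above but never used in this excerpt;
# the usual next step, sketched here as an assumption, is an implied-timescales
# test to justify the MSM lag time of 5:
its = msm.its(dt3, lags=max_lag)
pyemma_plots.plot_implied_timescales(its)
plt.savefig(f'{sys}_{n_clusters}_implied_timescales.png', dpi=300)
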
#start = 10
#stop = 50
##debugg
# build the pair list: all [i, j] with j at least 4 indices beyond i,
# stepping by `cutoff` (start, stop, cutoff, pair, feat, np and coor are
# defined/imported earlier in the original script)
for i in np.arange(start, stop, cutoff):
    for j in np.arange(i + 4, stop, cutoff):
        pair.append([i, j])
print(np.shape(pair))
pairs = np.array(pair)
feat.add_distances(pairs)
#feat.add_distances_ca()

X1 = coor.load("traj.xtc", feat, stride=1)
#traj = md.load("traj.xtc", top="Native.pdb")
#X1 = md.compute_distances(traj, [[115, 192]], periodic=False)
print(np.shape(X1))

# log-spaced candidate lag times; the original called np.logspace(1, 100, 5),
# whose 10**100 entries overflow the integer cast, so exponents spanning
# 10**1 to 10**2 are assumed here
possible_times = np.logspace(1, 2, 5)
possible_times = possible_times.astype(int)
# integer rounding can produce duplicate lags, so keep unique values only
lag_times = []
for i in possible_times:
    if i not in lag_times:
        lag_times.append(i)
print(lag_times)

collected_eigenvalues = []

import numpy as np
import pyemma.coordinates as coor

# NOTE: the original excerpt is missing the `def` line of this function; the
# signature below is reconstructed from the docstring and the call further
# down. lag_list and feature_list (a table of feature-file paths) are assumed
# to be defined earlier in the script; score_cv is sketched after this block.
def run_vamp_score(feat_option_list, dim, number_of_splits):
    '''run vamp score on a list of feature options at several different lag
    times; returns scores and errors'''
    scores = [0] * len(lag_list)
    errors = [0] * len(lag_list)
    for i, lag in enumerate(lag_list):
        scores[i] = []
        errors[i] = []
        vamp_score = [0] * len(feat_option_list)
        for j in range(len(feat_option_list)):
            vamp_score[j] = score_cv(feat_option_list[j], lag=lag, dim=dim,
                                     number_of_splits=number_of_splits)
            scores[i] += [vamp_score[j].mean()]
            errors[i] += [vamp_score[j].std()]
    return scores, errors


feat_optionsA = [coor.load(i) for i in feature_list['path_a']]
feat_optionsB = [coor.load(i) for i in feature_list['path_b']]
# each entry is a list of per-trajectory arrays, so `+` concatenates the two
# trajectory lists into one feature option
feat_options = [0] * len(feat_optionsA)
for i in range(len(feat_optionsA)):
    feat_options[i] = feat_optionsA[i] + feat_optionsB[i]

scores, errors = run_vamp_score(feat_options, dim=10, number_of_splits=20)

with open('vamp_scores_10dim_both_1.npy', 'wb') as handle:
    np.save(handle, scores)
with open('vamp_errors_10dim_both_1.npy', 'wb') as handle:
    np.save(handle, errors)
print(scores)
print(errors)

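# score_cv is not defined in this excerpt; in a real script it would have to
# appear before the run_vamp_score call above. The PyEMMA tutorials use a
# cross-validated VAMP-2 scoring helper along these lines (a sketch, with
# keyword names matched to the call inside run_vamp_score):
def score_cv(data, lag, dim, number_of_splits=10, validation_fraction=0.5):
    """Score the feature set `data` (a list of per-trajectory arrays) with
    VAMP-2, cross-validated over random trajectory splits."""
    nval = int(len(data) * validation_fraction)
    scores = np.zeros(number_of_splits)
    for n in range(number_of_splits):
        # hold out a random subset of trajectories for validation
        ival = np.random.choice(len(data), size=nval, replace=False)
        vamp = coor.vamp([d for i, d in enumerate(data) if i not in ival],
                         lag=lag, dim=dim)
        scores[n] = vamp.score([d for i, d in enumerate(data) if i in ival])
    return scores
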
#cluster tica data into clusters
import pyemma.coordinates as coor
import numpy as np

sys = 'fdis'
tica_data = coor.load('tica_data_05/fdis_tica_data.h5')
n_clusters = 100

cl = coor.cluster_kmeans(tica_data, k=n_clusters, max_iter=50)
#cl.save(f'cluster_data/{sys}_{n_clusters}_mini_cluster_object.h5', overwrite=True)
cl.write_to_hdf5(f'cluster_data_11/{sys}_{n_clusters}_cluster_dtrajs22.h5')

import numpy as np
import pyemma.coordinates as coor

from util.plot_structure_util import plot_vmd_cylinder_from_inds, plot_pymol_cylinder_from_inds

dis_cutoff = 1.0
std_cutoff = 0.035
outfile = 'filtered_distance_featurization_01/filtered_dis_ind_10_035_more'
save = True
plot = 'all'  # should be all, pymol, vmd, or none

traj_num = [f'{i:04d}' for i in range(100)]
traj_path = '../DESRES-Trajectory_sarscov2-10880334-no-water-no-ion-glueCA/sarscov2-10880334-no-water-no-ion-glueCA/sarscov2-10880334-no-water-no-ion-glueCA-'
traj_list = [traj_path + str(i) + '.dcd' for i in traj_num]
pdb = '../DESRES_protease_chainid.pdb'

# CA-CA distances within chain 0
feat = coor.featurizer(pdb)
feat.add_distances(feat.pairs(feat.select('name == CA and chainid == 0'),
                              excluded_neighbors=3))
traj = coor.load(traj_list, feat, stride=5)
traj_cat = np.concatenate(traj)

# CA-CA distances within chain 1 (the original reused feat and traj here,
# which would have duplicated the chain-0 data)
feat1 = coor.featurizer(pdb)
feat1.add_distances(feat1.pairs(feat1.select('name == CA and chainid == 1'),
                                excluded_neighbors=3))
traj1 = coor.load(traj_list, feat1, stride=5)
traj_cat1 = np.concatenate(traj1)

# treat the two chains as extra samples of the same distance features
traj_cat_pair = np.concatenate((traj_cat, traj_cat1), axis=0)
min_dist = traj_cat_pair.min(axis=0)
std_dist = traj_cat_pair.std(axis=0)

# keep distances that get short enough and actually fluctuate
new_dists = np.where((min_dist < dis_cutoff) & (std_dist > std_cutoff))[0]
print('new distances:', new_dists.shape)

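# The flags above (save, plot) and the cylinder-plot helpers imported at the
# top are unused in this excerpt; a sketch of the presumed follow-up (the
# helper signatures are guesses from their names, not the actual API):
if save:
    np.save(outfile, new_dists)
if plot in ('all', 'vmd'):
    plot_vmd_cylinder_from_inds(new_dists, pdb)    # signature assumed
if plot in ('all', 'pymol'):
    plot_pymol_cylinder_from_inds(new_dists, pdb)  # signature assumed
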
#cluster tica data into a small number of clusters to later pull out structures
import pyemma.coordinates as coor
import numpy as np

sys = 'back'
tica_data = coor.load('tica_data_05/back_tica_data.h5')
n_clusters = 50

cl = coor.cluster_kmeans(tica_data, k=n_clusters, max_iter=50)
cl.save(f'{sys}_{n_clusters}_mini_cluster_object.h5', overwrite=True)
cl.write_to_hdf5(f'{sys}_{n_clusters}_cluster_dtrajs.h5')

#!/usr/bin/env python
# Run TICA on the featurized trajectories produced in the previous step
import mdtraj as md
import pyemma.coordinates as coor
import numpy as np
import pickle
import pyemma
import os
import pandas as pd

var_cutoff = 0.9  # adjust to find the elbow of the cumulative kinetic variance

feature_list = pd.read_pickle('feature_list_1.pickl')
data_a = coor.load('feature_data_02/backbone_chi1_2_chain_0.h5')
data_b = coor.load('feature_data_02/backbone_chi1_2_chain_1.h5')
# both are lists of per-trajectory arrays, so `+` concatenates the lists
data = data_a + data_b

tica = coor.tica(data=data, lag=10, kinetic_map=False, commute_map=True)
with open('tica_data_05/chi2_cumvar.npy', 'wb') as handle:
    np.save(handle, tica.cumvar)

tica.var_cutoff = var_cutoff
print('Number of dimensions saved is: ', tica.dimension())
tica.write_to_hdf5('tica_data_05/chi2_tica_data.h5')

import sys

import numpy as np
import pyemma.coordinates as coor

# define features to load for spacetime diffusion map analysis: heavy atom coordinates only.
print('define basis functions: heavy atom coordinates')
print('\n')
sys.stdout.flush()
featurizer = coor.featurizer(topology)
featurizer.add_selection(featurizer.select_Heavy())
print(featurizer.dimension())
sys.stdout.flush()

# use featurizer to read in trajectory
X1 = coor.load(allfiles1, featurizer, stride=nskip)
# concatenate trajectory chunks into one single trajectory
X1 = np.vstack(X1)
print(X1.shape)
print('trajectory loaded!')
sys.stdout.flush()

# extract the indices of the subset of configurations (from the strided
# trajectory just loaded) that belong to each hidden state; integer division
# (// rather than /, so the indices stay ints under Python 3) maps
# full-trajectory frame numbers onto strided-frame indices.
# topology, allfiles1, nskip, n_traj, hidden_states, states and state_idx
# are defined earlier in the original script.
state = [[] for k in range(n_traj)]
for i in range(n_traj):
    for j in range(hidden_states):
        state[i].append(states[i][j][np.where(states[i][j] % int(nskip) == 0)]
                        // int(nskip))
my_idx1 = state[0][state_idx]