def run_sampling(args):
    """Resample frames by weight, run TICA on them and save the projection.

    C-alpha distance features are loaded for ``args.file`` using the
    topology ``Native.pdb``.  10000 frame indices are drawn with replacement
    according to the normalised weights read from ``args.weights``, a
    10-dimensional TICA (lag 1) is computed on the resampled frames, and the
    projection and eigenvalues are written to ``output.dat`` and
    ``eigenvalues.dat``.  The wall-clock time of every stage is printed.

    Parameters
    ----------
    args : object
        Must expose ``file`` (trajectory file name) and ``weights`` (path of
        a text file readable by ``np.loadtxt``).  ``np.random.choice``
        requires one weight per loaded frame.
    """
    topology = "Native.pdb"
    ticadim = 10
    num_sample_frames = 10000

    fn = args.file  # file name
    wn = args.weights  # weights name
    weights = np.loadtxt(wn)
    weights = weights / np.sum(weights)

    # time.time() is used instead of time.clock(), which was removed in
    # Python 3.8; print(...) with one argument works on Python 2 and 3.
    time1 = time.time()
    feat = coor.featurizer(topology)
    feat.add_distances_ca()
    X1 = coor.load(fn, feat, stride=1)
    time2 = time.time()
    print("Took %f minutes to load a file" % ((time2 - time1) / 60.0))

    sampled_frames = np.zeros((num_sample_frames, np.shape(X1)[1]))
    selected_frames = np.random.choice(np.shape(X1)[0],
                                       size=num_sample_frames,
                                       replace=True, p=weights)
    time3 = time.time()
    print("Took %f minutes to select new frames" % ((time3 - time2) / 60.0))

    # Single vectorised copy into the pre-allocated (float64) buffer instead
    # of one Python-level assignment per sampled frame.
    sampled_frames[:] = X1[selected_frames, :]
    time4 = time.time()
    print("Took %f minutes to load the new frames" % ((time4 - time3) / 60.0))

    # Sanity check: a distance of exactly zero means something was not
    # written.  Report it once and close the log file deterministically
    # (the old loop re-opened and truncated log.txt for every zero entry).
    if (sampled_frames == 0).any():
        print("ERROR, distance too short, something not written")
        with open("log.txt", "w") as f:
            f.write("ERROR, distance too short, something not written")
    time5 = time.time()
    print("Took %f minutes to go through the debug check" % ((time5 - time4) / 60.0))

    tica_obj = coor.tica(sampled_frames, stride=1, lag=1, dim=ticadim)
    time6 = time.time()
    print("Took %f minutes to calculate the tica_object" % ((time6 - time5) / 60.0))

    outputs = tica_obj.get_output()[0]
    eigen = tica_obj.eigenvalues
    time7 = time.time()
    print("Took %f minutes to get the output of the tica_object" % ((time7 - time6) / 60.0))

    print("saving files")
    np.savetxt("output.dat", outputs)
    np.savetxt("eigenvalues.dat", eigen)
    print("files saved")
    time8 = time.time()
    print("Took %f minutes to write the output files" % ((time8 - time7) / 60.0))
def run_analysis(args):
    """Run TICA on distance features of one trajectory, then save and plot.

    The atom pairs come from ``tmeth.generate_pairs`` called with
    ``args.range[0]``, ``args.range[1]``, ``args.step_size`` and
    ``args.cut_value``.  The raw TICA output and eigenvalues are written to
    ``<title>_output_raw.dat`` and ``<title>_eigenvalues_raw.dat`` and then
    handed to the ``tmeth`` plotting helpers.

    Parameters
    ----------
    args : object
        Must expose ``topfile``, ``range``, ``step_size``, ``cut_value``,
        ``traj_file``, ``stride``, ``lag``, ``ticadim``, ``title`` and
        ``time_step``.
    """
    featurizer = coor.featurizer(args.topfile)
    atom_pairs = tmeth.generate_pairs(args.range[0], args.range[1],
                                      args.step_size, args.cut_value)
    featurizer.add_distances(atom_pairs)

    features = coor.load(args.traj_file, featurizer, stride=args.stride)
    tica = coor.tica(features, stride=1, lag=args.lag, dim=args.ticadim)
    projection = tica.get_output()[0]
    eigenvalues = tica.eigenvalues

    np.savetxt("%s_output_raw.dat"%args.title, projection)
    np.savetxt("%s_eigenvalues_raw.dat"%args.title, eigenvalues)

    # Both plots share the same time scale: time step times load stride.
    frame_spacing = args.time_step*args.stride
    tmeth.plot_eigen_series(eigenvalues, args.title, time_scale=frame_spacing)
    tmeth.plot_output(projection, args.title, time_scale=frame_spacing)
def pyemma_feat(args):
    """Featurize one trajectory with a PyEMMA featurizer method chosen by name.

    Parameters
    ----------
    args : tuple
        ``(irow, featurizer_name, tops, indices)``.  ``irow`` is an
        ``(i, row)`` pair where ``row['traj_fn']`` is the trajectory and
        ``row['top_fn']`` is the key into ``tops``.  ``featurizer_name`` is
        the name of the featurizer method to call and ``indices`` is passed
        to it as ``indexes``.

    Returns
    -------
    tuple or None
        ``(i, feat_traj)`` on success.  ``None`` (after printing a message)
        when the featurizer has no attribute called ``featurizer_name``.
    """
    irow, featurizer_name, tops, indices = args
    i, row = irow
    traj, top = row['traj_fn'], tops[row['top_fn']]
    feat = featurizer(top)

    # Guard only the attribute lookup: an AttributeError raised while the
    # features are actually computed is a different failure and must not be
    # reported as "featurizer does not exist".
    try:
        adder = getattr(feat, featurizer_name)
    except AttributeError:
        print("pyEMMA doesn't have {} as a featurizer".format(featurizer_name))
        return None

    adder(indexes=indices, cossin=True)
    feat_traj = np.squeeze(source(traj, features=feat).get_output(), axis=0)
    return i, feat_traj
def msmbuilder_to_pyemma(msmbuilder_dih_featurizer,trajectory):
    '''
    Build a PyEMMA featurizer equivalent to an msmbuilder DihedralFeaturizer.

    For every dihedral type listed in ``msmbuilder_dih_featurizer.types`` the
    matching ``md.compute_<type>`` function is evaluated on ``trajectory``
    to obtain the atom indices; the stacked indices are registered on a
    PyEMMA featurizer created from ``trajectory.topology``, using the
    msmbuilder object's ``sincos`` flag as ``cossin``.
    '''
    index_blocks = []
    for kind in msmbuilder_dih_featurizer.types:
        compute = getattr(md, 'compute_%s' % kind)
        atom_indices, _values = compute(trajectory)
        index_blocks.append(atom_indices)
    stacked = np.vstack(index_blocks)

    use_sincos = msmbuilder_dih_featurizer.sincos
    converted = coor.featurizer(trajectory.topology)
    converted.add_dihedrals(stacked, cossin=use_sincos)
    return converted
def sasa_per_res(chain=0):
    '''
    Solvent accessible surface area per residue of one chain.

    Parameters
    ----------
    chain : int
        Chain id used in the ``chainid == <chain>`` atom selection.

    Returns
    -------
    A featurizer built from the module-level ``pdb`` with one custom
    function that computes per-residue Shrake-Rupley SASA for ``chain``.
    '''
    def calc_sasa(traj, chain=0, featurizer=None):
        # Restrict the trajectory to the atoms of the requested chain.
        small_traj = traj.atom_slice(
            atom_indices=featurizer.select(f'chainid == {chain}'))
        res = md.shrake_rupley(small_traj,
                               probe_radius=0.14,
                               n_sphere_points=960,
                               mode='residue')
        return res

    featurizer = coor.featurizer(pdb)
    # dim is half the residue count — assumes the topology holds two chains
    # of equal length; TODO confirm.
    # Forward the caller's chain: it used to be hard-coded to 0 here, so
    # sasa_per_res(chain=1) silently returned chain-0 features.
    featurizer.add_custom_func(calc_sasa,
                               dim=featurizer.topology.n_residues // 2,
                               chain=chain,
                               featurizer=featurizer)
    return featurizer
def filtered_ca_distances_larger(chain=0):
    ''' Pairwise filtered carbon alpha distances defined in filter_distances_01.py

    Parameters
    ----------
    chain : int
        0 or 1; selects which saved set of distance indices is used.

    Raises
    ------
    ValueError
        If ``chain`` is neither 0 nor 1.
    '''
    # NOTE(review): both branches used to load the chain-A file, so chain 1
    # received chain-A indices.  The chain-B file name is assumed to mirror
    # the chain-A naming — confirm it exists.
    if chain == 0:
        index_file = "filtered_distance_featurization_01/filtered_dis_ind_12_03chainA.npy"
    elif chain == 1:
        index_file = "filtered_distance_featurization_01/filtered_dis_ind_12_03chainB.npy"
    else:
        raise ValueError("chain must be 0 or 1")

    # np.load opens and closes the file itself; only the needed file is read.
    dist_inds = np.load(index_file)
    featurizer = coor.featurizer(pdb)
    featurizer.add_distances(dist_inds)
    return featurizer
def run_sampling(args):
    """Resample whole trajectories by weight, run TICA and save the results.

    10000 trajectory indices are drawn with replacement from
    ``range(args.number_traj)`` using the normalised weights read from
    ``args.weights``.  The matching ``<filedir>/traj<i>.xtc`` files are
    loaded (stride 10) with the distance features from
    ``tmeth.generate_pairs(5, 288, 4, 4)``, and a 10-dimensional TICA with
    lag 5 is computed.  The concatenated projection, the eigenvalues and the
    sorted selected indices are written to ``output.dat``,
    ``eigenvalues.dat`` and ``selected_frames.dat``.

    Parameters
    ----------
    args : object
        Must expose ``topfile``, ``filedir``, ``weights`` and
        ``number_traj``.
    """
    topology = args.topfile
    ticadim = 10
    num_sample_frames = 10000
    tica_lag_time = 5

    fn = args.filedir  # directory holding the traj<i>.xtc files
    wn = args.weights  # weights name
    weights = np.loadtxt(wn)
    weights = weights / np.sum(weights)

    # time.time() is used instead of time.clock(), which was removed in
    # Python 3.8; print(...) with one argument works on Python 2 and 3.
    time1 = time.time()
    feat = coor.featurizer(topology)
    feat.add_distances(tmeth.generate_pairs(5, 288, 4, 4))
    selected_frames = np.random.choice(args.number_traj,
                                       size=num_sample_frames,
                                       replace=True, p=weights)
    selected_frames.sort()
    selected_files = ["%s/traj%d.xtc" % (fn, i) for i in selected_frames]
    time2 = time.time()
    print("Took %f minutes to select new frames" % ((time2 - time1) / 60.0))

    sampled_frames = coor.load(selected_files, feat, stride=10)
    time3 = time.time()
    print("Took %f minutes to load the new frames" % ((time3 - time2) / 60.0))

    tica_obj = coor.tica(sampled_frames, stride=1, lag=tica_lag_time, dim=ticadim)
    time4 = time.time()
    print("Took %f minutes to calculate the tica_object" % ((time4 - time3) / 60.0))

    # Fetch the per-trajectory outputs once and join them in a single call;
    # the old loop called get_output() for every trajectory and grew the
    # result with np.append, which is quadratic.
    all_outputs = np.concatenate(tica_obj.get_output(), axis=0)
    eigen = tica_obj.eigenvalues

    print("saving files")
    np.savetxt("output.dat", all_outputs)
    np.savetxt("eigenvalues.dat", eigen)
    np.savetxt("selected_frames.dat", selected_frames)
    print("files saved")
    time5 = time.time()
    print("Took %f minutes to write the output files" % ((time5 - time4) / 60.0))
trajfile = './traj/2F4K-0-protein_all.dcd'
topology = './traj/2F4K-0-protein_fixed_noH.pdb'

# --------------------------------------------------------------------
# Feature definitions used as input to the TICA calculation.
import pyemma.coordinates as coor
import itertools

# Load the topology file as a trajectory and report what was read.
traj = md.load(topology)
print('trajectory objects = ' + str(traj))
print('topology object = ' + str(traj.topology))
sys.stdout.flush()

# Plain featurizer on the same topology.
feat = coor.featurizer(topology)

# Basis functions: residue minimum distances (closest heavy atoms), all
# atom coordinates, and backbone / chi1 / chi2 / chi3 torsions as cos/sin.
print('define basis functions: heavy-atom contact distances, heavy atom coordinates, all torsions, inverse distances')
print('\n')
sys.stdout.flush()

featurizer = coor.featurizer(topology)
featurizer.add_residue_mindist(residue_pairs='all', scheme='closest-heavy')
featurizer.add_all()
featurizer.add_backbone_torsions(cossin=True)
featurizer.add_chi1_torsions(cossin=True)
# chi2 and chi3 are registered through explicit atom indices from mdtraj.
for _compute_chi in (md.compute_chi2, md.compute_chi3):
    indx = _compute_chi(traj)[0]
    featurizer.add_dihedrals(indx, cossin=True)
def featurelize(self):
    """Attach a backbone-torsion featurizer to ``self.src``.

    The featurizer is created from ``self.topologyfile`` and stored on
    ``self.src.featurizer``.
    """
    torsion_feat = coordinates.featurizer(self.topologyfile)
    torsion_feat.add_backbone_torsions()
    self.src.featurizer = torsion_feat
def ca_distances_skip5(chain=0):
    ''' Distances between every 5th carbon alpha of the given chain.

    The featurizer is built from the module-level ``pdb``; ``chain`` is
    inserted into the ``chainid`` part of the atom selection.
    '''
    ca_feat = coor.featurizer(pdb)
    ca_atoms = ca_feat.select(f'name == CA and chainid == {chain}')
    # Keep only every fifth selected C-alpha.
    every_fifth = ca_atoms[::5]
    ca_feat.add_distances(every_fifth)
    return ca_feat
# Residue numbers of interest (compared against each residue's resSeq).
as_list = [
    84, 85, 86, 89, 90, 91, 92, 93, 114, 115, 116, 117, 118, 119, 121, 122,
    125, 126, 129, 130, 376, 378, 379, 390, 394, 409, 410, 432, 433, 434, 435,
    459, 460, 461, 462, 464, 465, 466, 468, 469, 522, 523, 524, 525, 526, 527,
    528, 530, 538, 541, 558, 559, 560, 561, 562, 567
]
# C-alpha atom indices of those residues.  The residue number is read from
# ``resSeq`` rather than sliced out of ``str(atom.residue)[3:]``, which only
# worked for residue names of exactly three characters.
atoms_list = [
    atom.index for atom in mdtraj_top.atoms
    if atom.name == 'CA' and atom.residue.resSeq in as_list
]

# PyEmma Part - TICA, clustering and MSM
top = 'stripped_adduct.prmtop'
trajs = 'test_size.nc'
feat = coor.featurizer(top)
feat.add_contacts(indices=atoms_list, threshold=1)
print(feat.dimension())

# Conducting TICA
inp = coor.source(trajs, feat)
t0 = perf_counter()
tica_obj = coor.tica(inp,
                     lag=500,
                     var_cutoff=0.9,
                     kinetic_map=True,
                     stride=1)
t1 = perf_counter()
print(f"Time elapsed: {round(t1-t0)} s")
print(dir(tica_obj.dim))
print(tica_obj.dimension())
print(len(tica_obj.cumvar))
################################################################################
# Load reference topology
################################################################################

print('loading reference topology...')
reference_pdb_filename = 'reference.pdb'
reference_trajectory = os.path.join(source_directory, 'run0-clone0.h5')
traj = md.load(reference_trajectory)
# The first frame is written out so it can serve as the featurizer topology.
traj[0].save_pdb(reference_pdb_filename)

################################################################################
# Initialize featurizer
################################################################################

# The message used to announce "backbone torsions" although the featurizer
# below adds chi1 torsions; it now matches the code.
print('Initializing chi1 torsions featurizer...')
featurizer = coor.featurizer(reference_pdb_filename)
featurizer.add_chi1_torsions()

################################################################################
# Define coordinates source
################################################################################

# Every file in source_directory whose name ends in "0.h5".
trajectory_files = glob(os.path.join(source_directory, '*0.h5'))
coordinates_source = coor.source(trajectory_files, featurizer)
print("There are %d frames total in %d trajectories." % (coordinates_source.n_frames_total(), coordinates_source.number_of_trajectories()))

################################################################################
# Do tICA
################################################################################
import numpy as np
try:
    import pyemma
    import pyemma.coordinates as coor
except ImportError:
    # Only a failed import is tolerated; a bare ``except`` would also hide
    # KeyboardInterrupt and unrelated errors.
    print("pyemma not imported!")
import mdtraj as md
import time
import analysis_scripts.plot_package as pltpkg

if __name__ == "__main__":
    # Load a single atom-pair distance from 50 consecutive trajectory files
    # and report the shape of the loaded data.
    topology = "firstframe.pdb"
    feat = coor.featurizer(topology)
    pairs = np.array([[79, 492]])
    feat.add_distances(pairs)
    print(feat.describe())

    # ww_2-protein-000.dcd ... ww_2-protein-049.dcd (zero-padded to 3 digits).
    files_list = ["ww_2-protein-%03d.dcd" % i for i in range(50)]

    output = coor.load(files_list, features=feat)
    print(np.shape(output))
def _peptide_residue_pairs(peptide_residues, system_residues):
    """Return every [peptide, other] residue pair, skipping self-pairs."""
    return [[peptide_residue, residue]
            for peptide_residue in peptide_residues
            for residue in system_residues
            if peptide_residue != residue]


def _peptide_selection(peptide_residues):
    """Return the ``resi A to B`` selection from first to last peptide residue."""
    return "resi " + str(peptide_residues[0]) + " to " + str(peptide_residues[-1])


def get_pMHC_featurizer(feat_type, top):
    """Build a PyEMMA featurizer for a peptide-MHC system.

    Peptide residues are the residue indices for which the selection
    ``chainid == 1 and resi == <index>`` matches at least one atom.

    Parameters
    ----------
    feat_type : str
        One of ``'pep_to_MHC'``, ``'pep_to_MHC_ca'``,
        ``'pep_bb_ca_torsions'``, ``'pep_bb_torsions'``, ``'pep_bb_ca'``,
        ``'pep_bb_torsions_and_ca'`` or ``'sasa'``.
    top : object
        Must expose ``n_residues``, ``n_atoms`` and ``top.select`` and be
        accepted by ``coor.featurizer``.

    Returns
    -------
    The configured featurizer.  An unrecognised ``feat_type`` prints a
    message and terminates the process with exit status 1.
    """
    featurizer = coor.featurizer(top)

    system_residues = np.arange(top.n_residues)
    peptide_residues = [
        resi for resi in system_residues
        if len(top.top.select("chainid == 1 and resi == " + str(resi))) > 0
    ]

    if feat_type == 'pep_to_MHC':
        residue_pairs = _peptide_residue_pairs(peptide_residues, system_residues)
        featurizer.add_residue_mindist(residue_pairs=np.array(residue_pairs), scheme='closest-heavy')
    elif feat_type == 'pep_to_MHC_ca':
        residue_pairs = _peptide_residue_pairs(peptide_residues, system_residues)
        featurizer.add_residue_mindist(residue_pairs=np.array(residue_pairs), scheme='ca')
    elif feat_type == 'pep_bb_ca_torsions':
        resi_str = _peptide_selection(peptide_residues)
        featurizer.add_backbone_torsions(selstr=resi_str, cossin=True)
        featurizer.add_sidechain_torsions(selstr=resi_str, cossin=True)
    elif feat_type == 'pep_bb_torsions':
        resi_str = _peptide_selection(peptide_residues)
        featurizer.add_backbone_torsions(selstr=resi_str, cossin=True)
    elif feat_type == 'pep_bb_ca':
        resi_str = _peptide_selection(peptide_residues)
        bb_ca_str = resi_str + " and backbone and name == 'CA'"
        bb_ca_indices = top.top.select(bb_ca_str)
        featurizer.add_distances(indices=featurizer.pairs(bb_ca_indices))
    elif feat_type == 'pep_bb_torsions_and_ca':
        resi_str = _peptide_selection(peptide_residues)
        featurizer.add_backbone_torsions(selstr=resi_str, cossin=True)
        bb_ca_str = resi_str + " and backbone and name == 'CA'"
        bb_ca_indices = top.top.select(bb_ca_str)
        featurizer.add_distances(indices=featurizer.pairs(bb_ca_indices))
    elif feat_type == 'sasa':
        featurizer.add_custom_func(get_sasa, len(system_residues))
    else:
        print("Featurizer type not recognized")
        # Non-zero status: exiting with 0 reported this failure as success.
        sys.exit(1)

    print("Number of atoms:", top.n_atoms)
    print("Number of residues:", top.n_residues)
    print("Number of features:", featurizer.dimension())

    return featurizer
# ------------------------------------------------------------------------------
# Reference topology: first frame of run0-clone0.h5, saved as a PDB file.
# ------------------------------------------------------------------------------
print('loading reference topology...')
reference_pdb_filename = 'reference.pdb'
reference_trajectory = os.path.join(source_directory, 'run0-clone0.h5')
traj = md.load(reference_trajectory)
traj[0].save_pdb(reference_pdb_filename)

# ------------------------------------------------------------------------------
# Featurizer: backbone torsions on the reference topology.
# ------------------------------------------------------------------------------
print('Initializing backbone torsions featurizer...')
featurizer = coor.featurizer(reference_pdb_filename)
featurizer.add_backbone_torsions()

# ------------------------------------------------------------------------------
# Coordinates source: every file in source_directory matching '*0.h5'.
# ------------------------------------------------------------------------------
trajectory_files = glob(os.path.join(source_directory, '*0.h5'))
coordinates_source = coor.source(trajectory_files, featurizer)
print(
    "There are %d frames total in %d trajectories."
    % (
        coordinates_source.n_frames_total(),
        coordinates_source.number_of_trajectories(),
    )
)

# ------------------------------------------------------------------------------
# tICA
# ------------------------------------------------------------------------------
print('tICA...')
import numpy as np
import pickle
from util.plot_structure_util import plot_vmd_cylinder_from_inds, plot_pymol_cylinder_from_inds

# Distances are kept when their minimum is below dis_cutoff and their
# standard deviation is above std_cutoff.
dis_cutoff = 1.0
std_cutoff = 0.035
outfile = 'filtered_distance_featurization_01/filtered_dis_ind_10_035_more'
save = True
plot = 'all'  # should be all, pymol, vmd, or none

traj_num = [f'{i:04d}' for i in range(100)]
traj_path = '../DESRES-Trajectory_sarscov2-10880334-no-water-no-ion-glueCA/sarscov2-10880334-no-water-no-ion-glueCA/sarscov2-10880334-no-water-no-ion-glueCA-'
traj_list = [traj_path + str(i) + '.dcd' for i in traj_num]
pdb = '../DESRES_protease_chainid.pdb'

# Chain 0: C-alpha pair distances, excluding pairs within 3 neighbours.
feat = coor.featurizer(pdb)
feat.add_distances(feat.pairs(feat.select('name == CA and chainid == 0'), excluded_neighbors=3))
traj = coor.load(traj_list, feat, stride=5)
traj_cat = np.concatenate(traj)

# Chain 1: same features.  These must be loaded with feat1 and concatenated
# from traj1 — the old code reused feat/traj, so chain 0 was counted twice
# and chain 1 never entered the statistics.
feat1 = coor.featurizer(pdb)
feat1.add_distances(feat1.pairs(feat1.select('name == CA and chainid == 1'), excluded_neighbors=3))
traj1 = coor.load(traj_list, feat1, stride=5)
traj_cat1 = np.concatenate(traj1)

# Stack the frames of both chains and compute per-distance statistics.
traj_cat_pair = np.concatenate((traj_cat, traj_cat1), axis=0)
min_dist = traj_cat_pair.min(axis=0)
std_dist = traj_cat_pair.std(axis=0)
new_dists = np.where((min_dist < dis_cutoff) & (std_dist > std_cutoff))[0]
import pyemma.coordinates as coor
import numpy as np
import pickle
from util.plot_structure_util import plot_vmd_cylinder_from_inds, plot_pymol_cylinder_from_inds

# Distances are kept when their minimum is below dis_cutoff and their
# standard deviation is above std_cutoff.
dis_cutoff = 1.2
std_cutoff = 0.03
outfile = 'filtered_distance_featurization_01/filtered_dis_ind_12_03'
save = True
plot = 'all'  # should be all, pymol, vmd, or none

traj_num = [f'{i:04d}' for i in range(100)]
traj_path = '../DESRES-Trajectory_sarscov2-10880334-no-water-no-ion-glueCA/sarscov2-10880334-no-water-no-ion-glueCA/sarscov2-10880334-no-water-no-ion-glueCA-'
traj_list = [traj_path + str(i) + '.dcd' for i in traj_num]

# Chain 0: C-alpha pair distances, excluding pairs within 3 neighbours.
feat = coor.featurizer('../DESRES_protease_chainid.pdb')
feat.add_distances(feat.pairs(feat.select('name == CA and chainid == 0'), excluded_neighbors=3))
traj = coor.load(traj_list, feat, stride=5)
traj_cat = np.concatenate(traj)

# Chain 1: same features.  These must be loaded with feat1 and concatenated
# from traj1 — the old code reused feat/traj, so chain 0 was counted twice
# and chain 1 never entered the statistics.
feat1 = coor.featurizer('../DESRES_protease_chainid.pdb')
feat1.add_distances(feat1.pairs(feat1.select('name == CA and chainid == 1'), excluded_neighbors=3))
traj1 = coor.load(traj_list, feat1, stride=5)
traj_cat1 = np.concatenate(traj1)

# Stack the frames of both chains and compute per-distance statistics.
traj_cat_pair = np.concatenate((traj_cat, traj_cat1), axis=0)
min_dist = traj_cat_pair.min(axis=0)
std_dist = traj_cat_pair.std(axis=0)
new_dists = np.where((min_dist < dis_cutoff) & (std_dist > std_cutoff))[0]
# NOTE(review): the statements down to ``return feat`` are the tail of a
# function whose ``def`` line lies outside this view (presumably
# add_kinase_coords_featurizer, which is called below — confirm).
    # Gather the residue ids that convert_atom_list_to_resid returns for
    # each selection string in Rspine.
    Rspine_res = np.array([],dtype=np.int64)
    for res in Rspine:
        atom_select = top.select(res)
        res_select = convert_atom_list_to_resid(atom_select,top)
        Rspine_res = np.append(Rspine_res, res_select )
    # Minimum distances between consecutive entries 0-1, 1-2 and 2-3, so at
    # least four residue ids must have been collected.
    feat.add_residue_mindist([[Rspine_res[0],Rspine_res[1]],[Rspine_res[1],Rspine_res[2]],[Rspine_res[2],Rspine_res[3]]])
    print('Final Features Dimensions: %s '%feat.dimension())
    return feat

## D671N ##
# Make our featurizers
feat_D671N = coor.featurizer(top_D671N)
feat_D671N = add_kinase_coords_featurizer(feat_D671N,top_D671N)
# Write out files for these features for our D671N trajs
src_D671N = coor.source(filenames_D671N, features=feat_D671N)
# get_output() yields one feature array per trajectory; each is saved to
# its own numbered .npy file below.
calculated_features_D671N = src_D671N.get_output()
print('len(calculated_features_D671N): %s' %len(calculated_features_D671N))
for i, traj in enumerate(calculated_features_D671N):
    np.save('D671N-pro/calculated_features_D671N_%s.npy'%i, traj)

## Y755A ##
# Make our featurizers
feat_Y755A = coor.featurizer(top_Y755A)
feat_Y755A = add_kinase_coords_featurizer(feat_Y755A,top_Y755A)
def backbone(chain=0):
    ''' Backbone torsion angles of one chain, as cos/sin pairs.

    The featurizer is built from the module-level ``pdb``; ``chain`` is
    inserted into the ``chainid`` selection string.
    '''
    torsion_feat = coor.featurizer(pdb)
    chain_selection = f'chainid == {chain}'
    torsion_feat.add_backbone_torsions(selstr=chain_selection, cossin=True)
    return torsion_feat
sorted_temps.sort()
# Flatten the per-temperature directory lists in ascending key order.  The
# loop variable is ``x`` rather than ``dir`` so the builtin is not shadowed
# (on Python 2 comprehension variables leak into the enclosing scope).
dirs = [ x for key in sorted_temps for x in organized_temps[key] ]

topfile = "{}/ref.pdb".format(dirs[0])
trajfiles = [ "{}/traj.xtc".format(x) for x in dirs ]
# The temperature label is the second "_"-separated field of the directory
# name.
T_labels = [ x.split("_")[1] for x in dirs ]
T = [ float(x) for x in T_labels ]
#tram_lag = 400 # Found from doing an MSM at one temp. For C-alpha SBM
# NOTE(review): tram_lag and pairsfile are used below but not defined in the
# visible part of the file — confirm they are set earlier.

if not os.path.exists("dtram/dtram.pkl"):
    # estimate dtram
    print("solving tram")
    feat = coor.featurizer(topfile)
    feat = util.default_ca_sbm_features(feat, topfile, pairsfile=pairsfile)
    dirs, dtrajs, dtram = util.multi_temperature_dtram(feat, trajfiles, T, tram_lag=tram_lag)
    util.save_multi_temperature_dtram(dirs, dtrajs, dtram)
else:
    print("loading tram")
    dirs, dtrajs, dtram = util.load_multi_temperature_dtram()

# define bin edges for clustering observable
bins = np.linspace(0, 133, 50)
mid_bin = 0.5*(bins[1:] + bins[:-1])

# calculate the distribution of an observable from the tram MSM's.
# get observables for each cluster at each thermodynamic state
thermo_obs = {}
xlabel('component 1')
ylabel('component 2')

# Topology and the four domain-restricted C-alpha trajectories to project.
top = '/scratch/f91/ma2374/vsite_CFTR/wt/310K/combined_pca_analysis/cov-domain-average.pdb'
trajs = [
    '/scratch/f91/ma2374/vsite_CFTR/wt/310K/1/wt_ca_domain.xtc',
    '/scratch/f91/ma2374/vsite_CFTR/wt/310K/2/wt_ca_domain.xtc',
    '/scratch/f91/ma2374/vsite_CFTR/wt/310K/3/wt_ca_domain.xtc',
    '/scratch/f91/ma2374/vsite_CFTR/wt/310K/combined_pca_analysis/aa_wt_ca_domain.xtc'
]

# Features: coordinates of the atoms matched by the 'name CA' selection.
feat_Ca = coor.featurizer(top)
_ca_atoms = feat_Ca.select('name CA')
feat_Ca.add_selection(_ca_atoms)
print(feat_Ca.dimension())

# With clustering enabled a third value is unpacked from
# project_and_cluster; otherwise only two values are expected.
cluster = False
if not cluster:
    tica_Ca, tica_Y_Ca = project_and_cluster(trajs, feat_Ca)
else:
    tica_Ca, tica_Y_Ca, tica_cl_Ca = project_and_cluster(trajs, feat_Ca)

print(np.shape(tica_Ca.eigenvectors))
x = tica_Ca.get_params()
np.save('tica_eigvec.npy', tica_Ca.eigenvectors)
np.save('tica_eigval.npy', tica_Ca.eigenvalues)
print('feat_means.npy', tica_Ca.get_params().keys())
def chi(x):
    """Return ``x`` unchanged.

    The statements that used to follow the ``return`` (building a featurizer
    from ``x``) could never execute and have been removed; the observable
    behaviour is the same.
    """
    return x
def gamma(y):
    """Return ``y`` unchanged.

    The statements that used to follow the ``return`` (building a featurizer
    from ``y``) could never execute and have been removed; the observable
    behaviour is the same.
    """
    return y
# NOTE(review): the first line below is the end of a parser.add_argument(...)
# call that begins outside this view.
                    type=float, default=2.0, help="minimum distance between cluster centers.")
parser.add_argument('-log', action="store", dest="log", type=str,\
                    default = "clust.log", help="log file's name: default clust.log")
parser.add_argument('-o', action="store", dest="o", type=str, default = "./",\
                    help="output path filename ")
arg = parser.parse_args()

# Topology file comes from the ``g`` argument.
#topfile = 'peptide_example.gro'
topfile = arg.g
# Trajectory file comes from the ``f`` argument.
#traj = 'md_example.xtc'
traj = arg.f
# cleanFolder is defined elsewhere; it is called on the output path before
# any results are produced.
cleanFolder(arg.o)

# Featurize with backbone torsions, requested in degrees and as cos/sin.
feat = coor.featurizer(topfile)
feat.add_backbone_torsions(selstr=None, deg=True, cossin=True) # in degrees
# List of all the angles
#print(feat.describe())
# Number of degrees of freedom
#print(feat.dimension())
inp = coor.source(traj, feat)
# Feature array of the first (index 0) trajectory of the source.
sincos = inp.get_output()[0]
#############
# Use a regular space clustering. Cluster centers are at least in distance of
# dmin to each other according to the given metric. Then Voronoi discretization
# with the computed centers is used to partition the data
import pyemma.coordinates as coor
import numpy as np
import pickle


def _load_pickle(path):
    """Return the object pickled in the local file ``path``, closing the file."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


feat = coor.featurizer('act_site.pdb')
feat.add_angles(np.array([[0, 1, 2], [1, 2, 3]]), deg=False, cossin=False, periodic=False)
inp = coor.source('prod0-as1-aligned.dcd', feat)

# Save the feature description for comparison later.  The file is closed
# (and therefore flushed) before it is read back below.
with open('nocos_desc.p', 'wb') as handle:
    pickle.dump(inp.describe(), handle)

# Comparisons.  The trailing notes record the reshape variant each
# description was produced with.
fixed = _load_pickle('fixed_desc.p')    # rad = rad.reshape(rad.shape[0], rad.shape[1]*rad.shape[2])
broken = _load_pickle('broken_desc.p')  # rad = rad.reshape(functools.reduce(lambda x, y: x * y, rad.shape),)
nocos = _load_pickle('nocos_desc.p')    # rad = rad.reshape(functools.reduce(lambda x, y: x * y, rad.shape),) & cossin=False

# Drop the first entry of every description before comparing.
fixed = fixed[1:]
broken = broken[1:]
nocos = nocos[1:]