def test_dihedral_feat():
    """Check that project featurization matches a direct DihedralFeaturizer run.

    For each test protein the whole project is featurized through
    ``featurize_project_wrapper``; three randomly chosen trajectories are then
    featurized directly and compared against the features stored on disk.

    Returns
    -------
    bool
        Always ``True`` when every comparison passes.
    """
    print(base_dir)
    pool = Pool(6)
    try:
        yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
        for prt in ["kinase_1", "kinase_2"]:
            print(prt)
            featurize_project_wrapper(yaml_file, prt, feat=None, stride=1, view=pool)
            feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
            flist = glob.glob(os.path.join(base_dir, prt, yaml_file["protein_dir"], "*.hdf5"))
            for i in np.random.choice(flist, 3):
                trj = mdt.load(i)
                my_feat = feat.partial_transform(trj)
                # Stored features share the trajectory's base name with a ".jl" suffix.
                expected_fname = os.path.join(
                    base_dir, prt, yaml_file["feature_dir"],
                    os.path.splitext(os.path.basename(i))[0] + ".jl")
                calc_feat = verboseload(expected_fname)
                assert np.allclose(my_feat, calc_feat)
    finally:
        # Release the worker processes even when an assertion fails.
        # NOTE(review): assumes Pool exposes close()/join() like multiprocessing.Pool.
        pool.close()
        pool.join()
    return True
def read_and_featurize_divided(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    """Featurize one trajectory file with dihedral angles and dump the result.

    Atoms whose residue name starts with "HI" are excluded when the trajectory
    is loaded. ``stride`` is only used to label the output file name; the
    trajectory itself is loaded without striding.

    Returns the object produced by ``DihedralFeaturizer.transform``.
    """
    topology = md.load_frame(filename, 0).topology
    kept_atoms = []
    for atom in topology.atoms:
        if atom.residue.name[0:2] != "HI":
            kept_atoms.append(atom.index)
    traj = md.load(filename, atom_indices=kept_atoms)

    features = DihedralFeaturizer(types=dihedrals).transform(traj_list=traj)

    # The parent directory names the condition; the last component is the file.
    path_parts = filename.split("/")
    condition = path_parts[len(path_parts) - 2]
    dcd_file = path_parts[-1]
    base_name = dcd_file.rsplit(".", 1)[0]
    new_file = "%s_features_stride%d.h5" % (base_name, stride)
    new_root_dir = "/scratch/users/enf/b2ar_analysis/subsampled_features"
    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)
    verbosedump(features, new_file_full)
    return features
def read_and_featurize(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    """Featurize a whole trajectory file and dump the features per condition.

    The condition label is the part of the file name before the first "_"
    (and before the first "."). ``stride`` is accepted but not used here.
    Nothing is returned; the features are written with ``verbosedump``.
    """
    print("reading and featurizing %s" % (filename))
    traj = md.load(filename)
    features = DihedralFeaturizer(types=dihedrals).transform(traj_list=traj)

    traj_file = filename.split("/")[-1]
    prefix = traj_file.split("_")[0]
    condition = prefix.split(".")[0]
    print("Condition %s has features of shape %s" % (condition, np.shape(features)))

    new_file = "/scratch/users/enf/b2ar_analysis/combined_features/%s_features.h5" % condition
    verbosedump(features, new_file)
def featurize_file(job_tuple):
    """Featurize a single trajectory file and dump the features next to the project.

    Parameters
    ----------
    job_tuple : tuple
        ``(yaml_file, protein, feat, traj_file, stride)``. ``yaml_file`` is
        passed to ``load_yaml_file``; ``feat`` may be ``None``, in which case a
        phi/psi/chi1 ``DihedralFeaturizer`` is used. ``stride`` is unpacked but
        not used in this function.

    Returns
    -------
    None

    Notes
    -----
    A trajectory that fails to load is deleted from disk after a warning.
    Only ``Exception`` subclasses trigger the deletion, so a Ctrl-C or
    interpreter shutdown during loading no longer removes data.
    """
    yaml_file, protein, feat, traj_file, stride = job_tuple
    yaml_file = load_yaml_file(yaml_file)

    if feat is None:
        feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])

    _check_output_folder_exists(yaml_file, protein)

    output_folder = os.path.join(yaml_file["base_dir"], protein, yaml_file["feature_dir"])
    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    output_fname = os.path.join(output_folder, traj_name + ".jl")
    feat_descriptor = os.path.join(output_folder, "feature_descriptor.h5")

    try:
        trj = mdt.load(traj_file)
    except Exception:
        warnings.warn("Removing %s because of misformed trajectory" % traj_file)
        os.remove(traj_file)
        return

    features = feat.partial_transform(trj)
    verbosedump(features, output_fname)

    # Write the feature description once per output folder, if the featurizer supports it.
    if not os.path.isfile(feat_descriptor) and hasattr(feat, "describe_features"):
        dih_df = pd.DataFrame(feat.describe_features(trj[0]))
        verbosedump(dih_df, feat_descriptor)
    return
def test_function_featurizer():
    """FunctionFeaturizer must reproduce the phi angle from DihedralFeaturizer."""
    first_traj = AlanineDipeptide().get_cached().trajectories[0]

    # Variant 1: pass compute_dihedrals plus explicit atom indices for phi of ala.
    phi_atoms = [[4, 6, 8, 14]]
    with_args = FunctionFeaturizer(compute_dihedrals, func_args={"indices": phi_atoms})
    res_args = with_args.transform([first_traj])

    # Variant 2: wrap compute_phi in a local function that needs no extra args.
    def phi_only(trj):
        return compute_phi(trj)[1]

    res_wrapped = FunctionFeaturizer(phi_only).transform([first_traj])

    # Reference values from the dedicated dihedral featurizer.
    res_reference = DihedralFeaturizer(['phi'], sincos=False).transform([first_traj])

    np.testing.assert_array_almost_equal(res_args, res_wrapped)
    np.testing.assert_array_almost_equal(res_args, res_reference)
def read_and_featurize(filename, dihedrals=['chi2'], stride=10):
    """Featurize residue 93 of a trajectory with dihedral angles and dump the result.

    Only atoms with ``resSeq == 93`` whose residue is not named "POPC" and whose
    residue string starts with "H" are loaded.

    ``stride`` is only used to label the output file name.
    NOTE(review): the trajectory is loaded with a hard-coded ``stride=1000``,
    which does not match the ``stride`` label in the file name; confirm intent.

    Returns the object produced by ``DihedralFeaturizer.transform``.
    """
    top = md.load_frame(filename, 0).topology
    # Compare the residue *name*: comparing the Residue object itself to a
    # string was always unequal, so the POPC exclusion never did anything.
    atom_indices = [
        a.index for a in top.atoms
        if a.residue.resSeq == 93
        and a.residue.name != "POPC"
        and str(a.residue)[0] == "H"
    ]
    print(len(atom_indices))

    traj = md.load(filename, stride=1000, atom_indices=atom_indices)
    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)

    # The parent directory names the condition; the last component is the file.
    directory = filename.split("/")
    condition = directory[len(directory) - 2]
    dcd_file = directory[len(directory) - 1]
    new_file = "%s_features_stride%d.h5" % (dcd_file.rsplit(".", 1)[0], stride)
    new_root_dir = "/scratch/users/enf/b2ar_analysis/subsampled_features"
    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)
    verbosedump(features, new_file_full)
    return features
def _test_tic_sampling(yaml_file, protein_name, tic_list, n_frames, scheme):
    """Verify that sampled tIC trajectories are ordered and within observed limits.

    After sampling, the first two tIC trajectories are re-featurized and
    projected. The projected value along the tIC must not decrease from the
    first to the last frame, and every frame must lie inside the range of the
    (rounded) tICA data stored on the protein.

    Returns ``True`` when all assertions pass.
    """
    sample_for_all_proteins(yaml_file, [protein_name], tic_list, n_frames, scheme=scheme)
    series = ProteinSeries(yaml_file)
    prt = Protein(series, protein_name)

    for tic_index in [0, 1]:
        model_dir = os.path.join(base_dir, yaml_file["mdl_dir"], protein_name)
        traj_path = os.path.join(model_dir, "tic%d.xtc" % tic_index)
        traj_top = os.path.join(model_dir, "prot.pdb")
        tica_traj = mdt.load(traj_path, top=traj_top)
        print(tica_traj.n_frames)

        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
        raw_features = featurizer.partial_transform(tica_traj)
        projected = np.round(prt.tica_mdl.transform([raw_features]))[0]

        # The sampled trajectory should run from the low end to the high end of the tIC.
        print("Look here", projected)
        assert projected[0][tic_index] <= projected[-1][tic_index]

        # Pool the chosen tIC column over all trajectories; sorted so the
        # extremes sit at both ends.
        pooled = [
            value
            for traj_tica_data in prt.tica_data.values()
            for value in traj_tica_data[:, tic_index]
        ]
        pooled = np.round(np.sort(pooled))

        column = projected[:, tic_index]
        above_min = column >= pooled[0]
        below_max = column <= pooled[-1]
        print(tic_index)
        print(above_min)
        print(below_max)

        # Every sampled frame must fall inside the observed range.
        assert above_min.all()
        assert below_max.all()
    return True
def test_get_common_features():
    """The common-feature index list must match the featurizer's column count."""
    project = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))

    # Use the chain-0 FASTA sequence of a random trajectory as the "alignment".
    aligned_dict = {
        protein: load_random_traj(project, protein).top.to_fasta(chain=0)
        for protein in project["protein_list"]
    }

    featurizer = DihedralFeaturizer()
    common_feature_dic, _ = _get_common_features(project, featurizer, aligned_dict, False)

    for protein in project["protein_list"]:
        traj = load_random_traj(project, protein)
        n_columns = featurizer.transform(traj)[0].shape[1]
        assert len(common_feature_dic[protein]) == n_columns
    return
def individual_traj_featurize(data_to_process):
    """Featurize one work item and return it paired with its identifier.

    Parameters
    ----------
    data_to_process : sequence
        ``[featurizer_type, identifier, data]``. Only ``'Dihedral'`` is
        supported as ``featurizer_type``; ``data`` is passed unchanged to the
        featurizer's ``fit_transform``.

    Returns
    -------
    list
        ``[identifier, featurized_data]``.

    Raises
    ------
    ValueError
        If ``featurizer_type`` is not supported. Previously this case failed
        later with an unbound-local error.
    """
    featurizer_type = data_to_process[0]
    if featurizer_type == 'Dihedral':
        featurizer_data = DihedralFeaturizer(types=['phi', 'psi'])
    else:
        raise ValueError("Unsupported featurizer type: %r" % (featurizer_type,))

    featurized_data = featurizer_data.fit_transform(data_to_process[2])
    return [data_to_process[1], featurized_data]
def test_code_works():
    """Smoke test: fit a 4-state von Mises HMM on the ALA2 dihedral data."""
    # creates a 4-state HMM on the ALA2 data. Nothing fancy, just makes
    # sure the code runs without erroring out
    trajectories = AlanineDipeptide().get_cached().trajectories
    # NOTE(review): the second positional argument is a single-frame
    # trajectory; confirm which DihedralFeaturizer parameter it binds to.
    featurizer = DihedralFeaturizer(['phi', 'psi'], trajectories[0][0])
    sequences = featurizer.transform(trajectories)

    hmm = VonMisesHMM(n_states=4, n_init=1)
    hmm.fit(sequences)

    # The parenthesis used to wrap the comparison, so the assert only checked
    # that the boolean array was non-empty.
    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
def featurize_project(proj_folder, top_folder, featurizer_object, stride, view):
    """Featurize every ``.dcd`` trajectory of a project, with an on-disk cache.

    Parameters
    ----------
    proj_folder : str
        Project folder; trajectories are read from ``<proj_folder>/trajectories``
        and the result is cached as ``<proj_folder>/featurized_traj.pkl``.
    top_folder : str
        Forwarded to each ``featurize_traj`` job.
    featurizer_object : str or None
        Path to a dumped featurizer loadable with ``verboseload``; ``None``
        selects a phi/psi/chi1 ``DihedralFeaturizer``.
    stride : int
        Forwarded to each ``featurize_traj`` job.
    view : object
        Must provide ``map_sync(func, jobs)``.

    Returns
    -------
    dict
        Maps ``result[0]`` to ``result[1]`` for every job result.

    Notes
    -----
    If the cache file already exists it is loaded and returned without
    re-featurizing. The process exits via ``sys.exit`` when the featurizer
    cannot be loaded.
    """
    cache_path = proj_folder + "/featurized_traj.pkl"
    # if already featurized dont bother (should add a warning about this)
    if os.path.exists(cache_path):
        return verboseload(cache_path)

    if featurizer_object is None:
        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    else:
        try:
            featurizer = verboseload(featurizer_object)
        except Exception:
            # Only real load errors are reported; Ctrl-C and SystemExit propagate untouched.
            sys.exit("Cant Load Featurizer using msmbuilder verboseload")

    traj_list = glob.glob(proj_folder + "/trajectories/*.dcd")
    jobs = [(proj_folder, top_folder, featurizer, traj, stride) for traj in traj_list]
    results = view.map_sync(featurize_traj, jobs)

    feature_dict = {result[0]: result[1] for result in results}
    verbosedump(feature_dict, cache_path)
    return feature_dict
def test_DihedralFeaturizer_describe_features_nosincos():
    """Described atom indices must reproduce the raw (non sin/cos) dihedral features."""
    featurizer = DihedralFeaturizer(sincos=False)
    traj = trajectories[np.random.randint(len(trajectories))]
    features = featurizer.transform([traj])
    description = pd.DataFrame(featurizer.describe_features(traj))

    # Spot-check 25 randomly chosen feature columns.
    for _ in range(25):
        column = np.random.choice(len(description))
        entry = description.iloc[column]
        expected = md.compute_dihedrals(traj, [entry.atominds])
        if featurizer.sincos:
            transform = getattr(np, '%s' % entry.otherinfo)
            expected = transform(expected)
        assert (features[0][:, column] == expected.flatten()).all()
def feat_traj(traj, save_path=None):
    """Compute dihedral, water-shell and ATP-shell features for one trajectory.

    Parameters
    ----------
    traj : str
        Path of the trajectory file; it is loaded with ``mdt.load``.
    save_path : str or None, optional
        When given, the dihedral, water, atp and combined feature arrays are
        dumped to ``<save_path>/{dihedral,water,atp,combined}/<basename(traj)>``.
        The default ``None`` skips dumping, matching the previous behaviour, in
        which the dump code sat after the ``return`` and never ran.

    Returns
    -------
    numpy.ndarray
        Column-wise stack of dihedral, water and ATP features, in that order.
    """
    # load again to get the waters
    trj = mdt.load(traj)

    # ATP oxygens/nitrogens plus every atom of "MG" residues.
    atp_solute = [
        i.index for i in trj.topology.atoms
        if (i.residue.name == "atp"
            and (i.element.name == "oxygen" or i.element.name == "nitrogen"))
        or (i.residue.name == "MG")
    ]
    # get the oxygen and nitrogen indices
    solute_indices = [
        i.index for i in trj.topology.atoms
        if i.residue.is_protein
        and (i.element.name == "oxygen" or i.element.name == "nitrogen")
    ]
    # get the oxygen solvent indices
    solvent_indices = [
        i.index for i in trj.topology.atoms
        if (i.residue.is_water and i.element.name != "hydrogen")
    ]

    # set up featurizers
    atp_feat = wmsm.SolventShellsFeaturizer(atp_solute, solvent_indices, 2, 0.3)
    water_feat = wmsm.SolventShellsFeaturizer(solute_indices, solvent_indices, 2, 0.3)
    dihedral_feat = DihedralFeaturizer(["phi", "psi", "chi1"])

    # calculate features
    water_features = water_feat.partial_transform(trj)
    dihedral_features = dihedral_feat.partial_transform(trj)
    atp_features = atp_feat.partial_transform(trj)
    combined_features = np.hstack((dihedral_features, water_features, atp_features))

    if save_path is not None:
        # os.path.join takes separate components, not a single tuple.
        fname = os.path.basename(traj)
        outputs = (
            ("dihedral", dihedral_features),
            ("water", water_features),
            ("atp", atp_features),
            ("combined", combined_features),
        )
        for subdir, data in outputs:
            verbosedump(data, os.path.join(save_path, subdir, fname))

    return combined_features
def read_and_featurize(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    """Featurize the protein atoms of the first chain and dump the features.

    ``stride`` is only used to label the output file name. The condition
    directory under the output root is created if it does not exist.

    Returns the object produced by ``DihedralFeaturizer.transform``.
    """
    print(("reading and featurizing %s" % (filename)))
    traj = md.load(filename)
    # A Trajectory has no ``select``; selections are made on its topology and
    # applied with ``atom_slice``.
    # NOTE(review): 'chain A' is not mdtraj selection syntax; 'chainid 0'
    # assumes chain A is the first chain in the file - confirm.
    protein_atoms = traj.topology.select('chainid 0 and protein')
    traj = traj.atom_slice(protein_atoms)

    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)
    print("finished featurizing")

    # The parent directory names the condition; the last component is the file.
    directory = filename.split("/")
    condition = directory[len(directory) - 2]
    dcd_file = directory[len(directory) - 1]
    new_file = "%s_features_stride%d.h5" % (dcd_file.rsplit(".", 1)[0], stride)
    new_root_dir = "/home/enf/b2ar_analysis/subsampled_features/"
    new_condition_dir = "%s/%s" % (new_root_dir, condition)
    if not os.path.exists(new_condition_dir):
        os.makedirs(new_condition_dir)

    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)
    print(("saving features as %s" % new_file_full))
    verbosedump(features, new_file_full)
    return features
def read_and_featurize(filename, dihedrals=['phi','psi','chi2'], stride=10):
    """Featurize the protein atoms of the first chain and dump the features.

    ``stride`` is only used to label the output file name. The condition
    directory under the output root is created if it does not exist.

    Returns the object produced by ``DihedralFeaturizer.transform``.
    """
    print("reading and featurizing %s" % (filename))
    full_traj = md.load(filename)
    # Selections are made on the topology and applied with ``atom_slice``;
    # a Trajectory itself has no ``select``.
    # NOTE(review): 'chain A' is not mdtraj selection syntax; 'chainid 0'
    # assumes chain A is the first chain in the file - confirm.
    traj = full_traj.atom_slice(full_traj.topology.select('chainid 0 and protein'))

    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)
    print("finished featurizing")

    # The parent directory names the condition; the last component is the file.
    directory = filename.split("/")
    condition = directory[len(directory) - 2]
    dcd_file = directory[len(directory) - 1]
    new_file = "%s_features_stride%d.h5" % (dcd_file.rsplit(".", 1)[0], stride)
    new_root_dir = "/home/enf/b2ar_analysis/subsampled_features/"
    new_condition_dir = "%s/%s" % (new_root_dir, condition)
    if not os.path.exists(new_condition_dir):
        os.makedirs(new_condition_dir)

    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)
    print("saving features as %s" % new_file_full)
    verbosedump(features, new_file_full)
    return features
def fit_and_transform(directory, stride=5):
    """Fit (or load) a 4-component tICA model and project the reference structure.

    If the projected-data file is missing, feature files found under
    ``directory`` are loaded in parallel, the tICA model is fitted or loaded
    from disk, and the projected data is dumped. Otherwise both the model and
    the projected data are loaded. Finally the reference PDB is featurized
    with phi/psi/chi2 dihedrals, projected, and the first rows are printed.

    Nothing is returned.
    """
    projected_data_filename = "/scratch/users/enf/b2ar_analysis/phi_psi_chi_stride%d_projected.h5" % stride
    fit_model_filename = "/scratch/users/enf/b2ar_analysis/phi_psi_chi2_stride%s_tica_coords.h5" % stride
    active_pdb_file = "/scratch/users/enf/b2ar_analysis/system_B.pdb"
    tica_model = tICA(n_components=4)

    if os.path.exists(projected_data_filename):
        fit_model = verboseload(fit_model_filename)
        transformed_data = verboseload(projected_data_filename)
    else:
        print("loading feature files")
        feature_files = get_trajectory_files(directory)
        pool = mp.Pool(mp.cpu_count())
        features = pool.map(load_features, feature_files)
        pool.terminate()

        if os.path.exists(fit_model_filename):
            print("loading tICA model")
            fit_model = verboseload(fit_model_filename)
        else:
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, fit_model_filename)

        # Both branches project the features and store the result.
        transformed_data = fit_model.transform(features)
        verbosedump(transformed_data, projected_data_filename)

    excluded_resseqs = (341, 79, 296, 269, 178, 93)
    excluded_names = ("NMA", "NME", "ACE")

    def keep_atom(atom):
        # Protein atoms only, minus the listed residues, capping groups and "HI*" residues.
        residue = atom.residue
        if not residue.is_protein:
            return False
        if residue.resSeq in excluded_resseqs:
            return False
        if residue.name[0:2] == "HI":
            return False
        return residue.name not in excluded_names

    reference_top = md.load(active_pdb_file).topology
    atom_indices = [atom.index for atom in reference_top.atoms if keep_atom(atom)]
    active_pdb = md.load(active_pdb_file, atom_indices=atom_indices)

    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi2'])
    active_pdb_features = featurizer.transform(active_pdb)
    active_pdb_projected = fit_model.transform(active_pdb_features)
    print(active_pdb_projected[0:4])
def test_FeatureSelector_describe_features():
    """FeatureSelector descriptions must be the concatenation of its members'."""
    traj = trajectories[np.random.randint(len(trajectories))]

    contact_feat = ContactFeaturizer(scheme='CA', ignore_nonprotein=True)
    contact_feat.transform([traj])
    contact_df = pd.DataFrame(contact_feat.describe_features(traj))

    dihedral_feat = DihedralFeaturizer()
    dihedral_feat.transform([traj])
    dihedral_df = pd.DataFrame(dihedral_feat.describe_features(traj))

    frames_by_name = {"ca": contact_df, "dih": dihedral_df}

    combined = FeatureSelector([('ca', contact_feat), ('dih', dihedral_feat)])
    combined.transform([traj])
    combined_df = pd.DataFrame(combined.describe_features(traj))

    assert len(combined_df) == len(contact_df) + len(dihedral_df)

    expected_df = pd.concat([frames_by_name[name] for name in combined.feat_list])

    # lets randomly compare 40 features
    for row in np.random.choice(range(len(combined_df)), 40):
        for column in combined_df.columns:
            assert eq(combined_df.iloc[row][column], expected_df.iloc[row][column])
def test_feature_slicer():
    """FeatureSlicer must keep exactly the requested dihedral feature columns."""
    trajectories = AlanineDipeptide().get_cached().trajectories
    base_featurizer = DihedralFeaturizer()

    # Columns 0-1 must not belong to a 'psi' group, columns 2-3 not to a 'phi' group.
    cases = (([0, 1], 'psi'), ([2, 3], 'phi'))
    for columns, absent_group in cases:
        slicer = FeatureSlicer(base_featurizer, indices=columns)
        sliced = slicer.transform(trajectories)
        assert sliced[0].shape[1] == 2

        description = pd.DataFrame(slicer.describe_features(trajectories[0]))
        assert len(description) == 2
        assert absent_group not in description.featuregroup[0]
        assert absent_group not in description.featuregroup[1]
def featurize_traj(job_tuple):
    """Compute and dump dihedral features for one trajectory of a mutant project.

    Parameters
    ----------
    job_tuple : tuple
        ``(mutant, mutant_dir, project, proj_folder, proj_top_folder,
        traj_file, stride, save_common, allowed_residue_ind)``. The topology is
        read from ``<proj_top_folder>/<prefix>.pdb`` where ``prefix`` is the
        part of the trajectory file name before the first "_". When
        ``save_common`` is true, the subset of features whose residues all lie
        in ``allowed_residue_ind`` is dumped as well, together with the
        bookkeeping needed to reproduce the subset.

    Returns
    -------
    None
        All results are written to disk under ``<mutant_dir>/features``.
    """
    # separate out the job tuple into required things
    (mutant, mutant_dir, project, proj_folder, proj_top_folder, traj_file,
     stride, save_common, allowed_residue_ind) = job_tuple

    # load top file to describe the features later on
    top_path = os.path.join(
        proj_top_folder, "%s.pdb" % os.path.basename(traj_file).split("_")[0])
    top_trj = mdtraj.load(top_path)

    # set up featurizer objects
    dihedral_feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])

    # load the trajectory
    try:
        trj = mdtraj.load(traj_file, stride=stride)
    except Exception:
        print("Cant featurize %s" % traj_file)
        return

    # setup file names
    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    print(traj_name)
    file_suffix = str(project) + "_" + traj_name + ".h5"
    dihedral_output_file = os.path.join(mutant_dir, "features/dihedral_features/") + file_suffix
    combined_output_file = os.path.join(mutant_dir, "features/combined_features/") + file_suffix

    # do_again forces recomputation even when a matching output already exists.
    do_again = True
    already_done = False
    if os.path.isfile(combined_output_file):
        f = verboseload(combined_output_file)
        # Done only when the stored features cover every frame of this
        # trajectory (the comparison used to be inverted).
        if f.shape[0] == trj.n_frames:
            already_done = True

    if not already_done or do_again:
        dihedral_features = dihedral_feat.partial_transform(trj)
        # now we can dump
        verbosedump(dihedral_features, dihedral_output_file)

        if save_common:
            common_dir = os.path.join(mutant_dir, "features/common_basis/dihedral_features/")
            dih_df = pandas.DataFrame(dihedral_feat.describe_features(top_trj))
            # Keep a feature only if all residues it involves are allowed.
            dih_f_ind = numpy.array(
                [set(i).issubset(allowed_residue_ind) for i in dih_df["resid"]])
            subset_dihedral_features = dihedral_features[:, dih_f_ind]
            # now we can dump
            verbosedump(subset_dihedral_features, common_dir + file_suffix)
            # save the featurizer information.
            verbosedump([dih_df, allowed_residue_ind, dih_f_ind],
                        common_dir + "saved_dihed_feat.h5")
        return
    else:
        print("skipping featurization for %s since its already done" % traj_name)
        return
from msmbuilder.example_datasets import FsPeptide
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel

import numpy as np

import msmexplorer as msme

rs = np.random.RandomState(42)

# Load Fs Peptide Data
trajs = FsPeptide().get().trajectories

# Extract chi1 Dihedrals
featurizer = DihedralFeaturizer(types=['chi1'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=2)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2)
assignments = msm.fit_transform(clustered_trajs)

# Plot Stacked Distributions
def _fit_transform(prt, trj):
    """Map a trajectory to cluster assignments using the protein's fitted models.

    The trajectory is featurized with phi/psi/chi1 dihedrals, projected with
    ``prt.tica_mdl`` and then passed through ``prt.kmeans_mdl``.
    """
    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    dihedrals = featurizer.partial_transform(trj)
    projected = prt.tica_mdl.transform([dihedrals])
    return prt.kmeans_mdl.transform(projected)
for traj in os.listdir(traj_dir): if traj.endswith(".dcd"): traj_files.append("%s/%s" %(traj_dir,traj)) traj_files.sort() traj = md.load(traj_files, top = "/home/harrigan/compute/wetmsm/gpcr/des/system_mae_to_pdb/des_trajs/DESRES-Trajectory_pnas2011b-H-05-all/system.pdb", stride=10) traj = traj[0].join(traj[1:]) traj.save("/home/enf/b2ar_analysis/H-05/%s" %("combined_traj_stride10.h5")) else: ''' #print("loading h5 traj") #traj = md.load("combined_traj_stride10.h5") ''' ''' if not (os.path.isfile("phi_psi_chi2_features_vd_stride10.h5")): print("featurizing") phi_psi_chi2 = DihedralFeaturizer(types=['phi','psi','chi2']) features = phi_psi_chi2.transform(traj_list = traj) print("finished featurizing") verbosedump(features, "phi_psi_chi2_features_vd_stride10.h5") else: print("loading existing features") features = verboseload("phi_psi_chi2_features_vd_stride10.h5") features = [np.concatenate(features)] if not (os.path.isfile("reduced_phi_psi_chi_stride10.h5")): print("Fitting tICA model") tica_model = tICA(n_components=4) fitted_model = tica_model.fit(features) reduced_data = fitted_model.transform(features) verbosedump(reduced_data, "reduced_phi_psi_chi_stride10.h5") print(tica_model.summarize())