def featurize_divided(directory):
    simulations = os.listdir(directory)
    for simulation in simulations:
        if simulation[0] not in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']:
            continue
        sim_dir = "%s/%s" % (directory, simulation)
        new_dir = "%s/%s" % ("/scratch/users/enf/b2ar_analysis/subsampled_features", simulation)
        if os.path.exists(new_dir):
            print("we have already featurized this simulation")
            continue
        else:
            os.makedirs(new_dir)
        print("currently analyzing %s " % sim_dir)
        trajs = get_trajectory_files(sim_dir)[0:3]
        print(trajs)
        #print("there are %d cpus" %(mp.cpu_count()))
        pool = mp.Pool(mp.cpu_count())
        features_i = pool.map(read_and_featurize, trajs)
        #print(features_i)
        features = [np.concatenate(np.concatenate(features_i))]
        print(np.shape(features[0]))
        combined_dir = "/scratch/users/enf/b2ar_analysis/combined_features"
        new_file_name = "%s_combined.h5" % (simulation)
        new_file = "%s/%s" % (combined_dir, new_file_name)
        #print("saving concatenated features for %s as %s" %(simulation, new_file))
        verbosedump(features, new_file)

def featurize_project(proj_folder, top_folder, featurizer_object, stride, view):
    # if already featurized, don't bother (should add a warning about this)
    if os.path.exists(proj_folder + "/featurized_traj.pkl"):
        return verboseload(proj_folder + "/featurized_traj.pkl")
    if featurizer_object is None:
        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    else:
        try:
            featurizer = verboseload(featurizer_object)
        except:
            sys.exit("Can't load featurizer using msmbuilder verboseload")
    feature_dict = {}
    traj_list = glob.glob(proj_folder + "/trajectories/*.dcd")
    jobs = [(proj_folder, top_folder, featurizer, traj, stride) for traj in traj_list]
    results = view.map_sync(featurize_traj, jobs)
    for result in results:
        feature_dict[result[0]] = result[1]
    verbosedump(feature_dict, proj_folder + "/featurized_traj.pkl")
    return feature_dict

def read_and_featurize_divided(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    #print("reading and featurizing %s" %(filename))
    traj_top = md.load_frame(filename, 0).topology
    atom_indices = [a.index for a in traj_top.atoms if a.residue.name[0:2] != "HI"]
    traj = md.load(filename, atom_indices=atom_indices)
    #print("got traj")
    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)
    #print(np.shape(features))
    #print("finished featurizing")
    directory = filename.split("/")
    condition = directory[len(directory) - 2]
    dcd_file = directory[len(directory) - 1]
    new_file = "%s_features_stride%d.h5" % (dcd_file.rsplit(".", 1)[0], stride)
    new_root_dir = "/scratch/users/enf/b2ar_analysis/subsampled_features"
    new_condition_dir = "%s/%s" % (new_root_dir, condition)
    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)
    #print("saving features as %s" %new_file_full)
    verbosedump(features, new_file_full)
    return features

def cluster_minikmeans(tica_dir, data_dir, traj_dir, n_clusters,
                       clusterer_dir=None, tICs=None):
    # guard against the default clusterer_dir of None before checking the path
    if clusterer_dir is not None and os.path.exists(clusterer_dir):
        reduced_data = load_file(data_dir)
        clusterer = verboseload(clusterer_dir)
        clusterer.labels_ = clusterer.transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except:
            reduced_data = load_dataset(data_dir)
        if tICs is not None:
            X = []
            for traj in reduced_data:
                X.append(traj[:, tICs])
        else:
            X = reduced_data
        clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
        clusterer.fit_transform(X)
        verbosedump(clusterer, clusterer_dir)

def _slice_file(job_tuple):
    inp_file, feature_ind, output_folder = job_tuple
    featurized_file = verboseload(inp_file)
    sliced_file = featurized_file[:, feature_ind]
    sliced_file_out = os.path.join(output_folder, os.path.basename(inp_file))
    verbosedump(sliced_file, sliced_file_out)
    return

def build_msm(clusterer_dir, lag_time):
    clusterer = verboseload(clusterer_dir)
    n_clusters = np.shape(clusterer.cluster_centers_)[0]
    labels = clusterer.labels_
    msm_modeler = MarkovStateModel(lag_time=lag_time)
    print("fitting msm to trajectories with %d clusters and lag_time %d" % (n_clusters, lag_time))
    msm_modeler.fit_transform(labels)
    verbosedump(msm_modeler, "/scratch/users/enf/b2ar_analysis/msm_model_%d_clusters_t%d" % (n_clusters, lag_time))
    print("fitted msm to trajectories with %d states" % (msm_modeler.n_states_))
    #np.savetxt("/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_transmat.csv" %(n_clusters, lag_time), msm_modeler.transmat_, delimiter=",")
    #G = nx.from_numpy_matrix(msm_modeler.transmat_)
    #nx.write_edgelist(G, "/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_edgelist" %(n_clusters, lag_time), msm_modeler.transmat_, delimiter=",")
    transmat = msm_modeler.transmat_
    mapping = msm_modeler.mapping_
    # write the transition matrix as a semicolon-delimited edge list; open in text
    # mode ("w", not "wb") since strings are written below
    edges = open("/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_edgelist.csv" % (n_clusters, lag_time), "w")
    for i in range(0, msm_modeler.n_states_):
        if i == 0:
            for j in range(0, msm_modeler.n_states_):
                edges.write(";")
                edges.write("%d" % mapping[j])
            edges.write("\n")
        edges.write("%d" % (mapping[i]))
        for j in range(0, msm_modeler.n_states_):
            prob = transmat[i][j]
            edges.write(";")
            if prob > 0.000001:
                edges.write("%f" % prob)
            else:
                edges.write("0")
        edges.write("\n")
    edges.close()

def read_and_featurize(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    print("reading and featurizing %s" % (filename))
    traj = md.load(filename)
    #test_traj_init = md.load_frame(filename,5)
    #test_traj_init.save_pdb("/scratch/users/enf/b2ar_analysis/test_init.pdb")
    #traj.topology = fix_topology(traj.topology)
    #traj[-1].save_pdb("/scratch/users/enf/b2ar_analysis/test_fixed.pdb")
    #traj.save_dcd("/scratch/users/enf/b2ar_analysis/test_fixed.dcd")
    #print("got traj")
    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)
    #print("finished featurizing")
    directory = filename.split("/")
    traj_file = directory[len(directory) - 1]
    condition = traj_file.split("_")[0].split(".")[0]
    print("Condition %s has features of shape %s" % (condition, np.shape(features)))
    new_file = "/scratch/users/enf/b2ar_analysis/combined_features/%s_features.h5" % condition
    verbosedump(features, new_file)

def featurize_file(job_tuple):
    yaml_file, protein, feat, traj_file, stride = job_tuple
    yaml_file = load_yaml_file(yaml_file)
    if feat is None:
        feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    _check_output_folder_exists(yaml_file, protein)
    output_folder = os.path.join(yaml_file["base_dir"], protein, yaml_file["feature_dir"])
    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    output_fname = os.path.join(output_folder, traj_name + ".jl")
    feat_descriptor = os.path.join(output_folder, "feature_descriptor.h5")
    try:
        trj = mdt.load(traj_file)
    except:
        warnings.warn("Removing %s because of malformed trajectory" % traj_file)
        os.remove(traj_file)
        return
    features = feat.partial_transform(trj)
    verbosedump(features, output_fname)
    if not os.path.isfile(feat_descriptor) and hasattr(feat, "describe_features"):
        dih_df = pd.DataFrame(feat.describe_features(trj[0]))
        verbosedump(dih_df, feat_descriptor)
    return

def read_and_featurize(filename, dihedrals=['chi2'], stride=10):
    #print("reading and featurizing %s" %(filename))
    top = md.load_frame(filename, 0).topology
    #print("got top")
    atom_indices = [a.index for a in top.atoms if a.residue.resSeq == 93
                    and a.residue != "POPC" and str(a.residue)[0] == "H"]
    print(len(atom_indices))
    #atom_indices = [a.index for a in top.atoms if a.residue.chain.index == 0 and a.residue.resSeq != 93 and a.residue != "POPC" and a.residue.resSeq != 130 and a.residue.resSeq != 172 and a.residue.resSeq != 79 and a.residue.resSeq != 341]
    #print("got indices")
    traj = md.load(filename, stride=1000, atom_indices=atom_indices)
    #print("got traj")
    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)
    #print(np.shape(features))
    #print("finished featurizing")
    directory = filename.split("/")
    condition = directory[len(directory) - 2]
    dcd_file = directory[len(directory) - 1]
    new_file = "%s_features_stride%d.h5" % (dcd_file.rsplit(".", 1)[0], stride)
    new_root_dir = "/scratch/users/enf/b2ar_analysis/subsampled_features"
    new_condition_dir = "%s/%s" % (new_root_dir, condition)
    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)
    #print("saving features as %s" %new_file_full)
    verbosedump(features, new_file_full)
    return features

def transform_protein_pca(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    pca_obj_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    protein_pca_mdl = verboseload(pca_obj_path)
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            print("Transforming protein %s" % protein)
            featurized_traj = sorted(glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat)
            pca_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    pca_data[os.path.basename(f)] = \
                        protein_pca_mdl.partial_transform(featurized_path)
                except:
                    print('Error')
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(pca_data, 'pca_data.pkl')
            print("Done transforming protein %s" % protein)
    # dumping the pca_mdl again since the eigenspectrum might have been calculated
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
    return

def fit_bayes_msms(yaml_file):
    mdl_params = yaml_file["mdl_params"]
    msm__lag_time = mdl_params["msm__lag_time"]
    if "bayesmsm__n_samples" in mdl_params.keys():
        bayesmsm__n_samples = mdl_params["bayesmsm__n_samples"]
    else:
        bayesmsm__n_samples = 800
    if "bayesmsm__n_steps" in mdl_params.keys():
        bayesmsm__n_steps = mdl_params["bayesmsm__n_steps"]
    else:
        bayesmsm__n_steps = 1000000
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            print(protein)
            assignments = verboseload("assignments.pkl")
            msm_mdl = BayesianMarkovStateModel(n_samples=bayesmsm__n_samples,
                                               n_steps=bayesmsm__n_steps,
                                               lag_time=msm__lag_time,
                                               ergodic_cutoff=1.0 / msm__lag_time,
                                               verbose=True).fit(
                [assignments[i] for i in assignments.keys()])
            _ = msm_mdl.all_eigenvalues_
            verbosedump(msm_mdl, "bayesmsm_mdl.pkl")
            fixed_assignments = {}
            for i in assignments.keys():
                fixed_assignments[i] = msm_mdl.transform(
                    assignments[i], mode='fill')[0]
            verbosedump(fixed_assignments, 'fixed_assignments.pkl')
    return

def fit_protein_kmeans(yaml_file, mini=True, pca=False):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]
    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("cluster__"):
            current_mdl_params[i.split("cluster__")[1]] = mdl_params[i]
    if mini:
        current_mdl_params["batch_size"] = 100 * current_mdl_params["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**current_mdl_params)
    else:
        kmeans_mdl = KMeans(**current_mdl_params)
    data = []
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            if pca:
                tica_data = verboseload("pca_data.pkl")
            else:
                tica_data = verboseload("tica_data.pkl")
            # get all traj
            sorted_list = sorted(tica_data.keys(), key=keynat)
            data.extend([tica_data[i] for i in sorted_list])
    kmeans_mdl.fit(data)
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    verbosedump(kmeans_mdl, kmeans_mdl_path)
    return

def transform_protein_tica(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    tica_obj_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    protein_tica_mdl = verboseload(tica_obj_path)
    for protein in yaml_file["protein_list"]:
        print("Transforming protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            if os.path.exists("./normalized_features"):
                featurized_traj = sorted(glob.glob("./normalized_features/*.jl"), key=keynat)
            else:
                print('Warning: features have not been scaled')
                featurized_traj = sorted(glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat)
            tica_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    tica_data[os.path.basename(f)] = \
                        protein_tica_mdl.partial_transform(featurized_path)
                except:
                    pass
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(tica_data, 'tica_data.pkl')
            print("Done transforming protein %s" % protein)
    # dumping the tica_mdl again since the eigenspectrum might have been calculated
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return

def generate_tpt_traj_index_series(msm_object, sources, sinks, clusters_map,
                                   num_paths, remove_path, save_file):
    net_flux = tpt.net_fluxes(sources, sinks, msm_object)
    tpt_paths = tpt.paths(sources, sinks, net_flux, remove_path=remove_path,
                          num_paths=num_paths, flux_cutoff=0.5)
    inv_map = {v: k for k, v in msm_object.mapping_.items()}
    print(tpt_paths)
    traj_index_pairs_list = []
    for path in tpt_paths[0]:
        print("path = %s" % (str(path)))
        traj_index_pairs = []
        for state in path:
            cluster = inv_map[state]
            traj_index_pair = random.choice(list(clusters_map[cluster]))
            traj_index_pairs.append(traj_index_pair)
        traj_index_pairs_list.append(traj_index_pairs)
    verbosedump(traj_index_pairs_list, save_file)
    inv_tpt_paths = []
    for tpt_path in tpt_paths[0]:
        inv_tpt_paths.append([inv_map[i] for i in tpt_path])
    return tpt_paths[0], inv_tpt_paths, traj_index_pairs_list

def fit_protein_pca(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]
    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("pca__"):
            current_mdl_params[i.split("pca__")[1]] = mdl_params[i]
    protein_pca_mdl = PCA(**current_mdl_params)
    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_pca_mdl.partial_fit(featurized_path)
                except:
                    pass
            print("Done partial fitting to protein %s" % protein)
    # dumping the pca_mdl
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
    return

def fit_protein_kmeans(yaml_file, mini=True):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]
    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("cluster__"):
            current_mdl_params[i.split("cluster__")[1]] = mdl_params[i]
    if mini:
        current_mdl_params["batch_size"] = 100 * current_mdl_params["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**current_mdl_params)
    else:
        kmeans_mdl = KMeans(**current_mdl_params)
    data = []
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            tica_data = verboseload("tica_data.pkl")
            # get all traj
            sorted_list = sorted(tica_data.keys(), key=keynat)
            data.extend([tica_data[i] for i in sorted_list])
    kmeans_mdl.fit(data)
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    verbosedump(kmeans_mdl, kmeans_mdl_path)
    return

def transform_protein_tica(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    tica_obj_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    protein_tica_mdl = verboseload(tica_obj_path)
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            print("Transforming protein %s" % protein)
            featurized_traj = sorted(glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat)
            tica_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    tica_data[os.path.basename(f)] = \
                        protein_tica_mdl.partial_transform(featurized_path)
                except:
                    pass
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(tica_data, 'tica_data.pkl')
            print("Done transforming protein %s" % protein)
    # dumping the tica_mdl again since the eigenspectrum might have been calculated.
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return

def fit_protein_tica(yaml_file, sparse=False):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]
    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("tica__"):
            current_mdl_params[i.split("tica__")[1]] = mdl_params[i]
    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)
    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except:
                    pass
            print("Done partial fitting to protein %s" % protein)
    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return

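# Usage sketch (not part of the original pipeline): how the per-protein helpers above
# are typically chained. "project.yaml" is a placeholder path, and load_yaml_file is
# assumed to accept a path and return the project dictionary used throughout.
def example_protein_series_pipeline(yaml_path="project.yaml"):
    yaml_file = load_yaml_file(yaml_path)
    fit_protein_tica(yaml_file)               # fit one tICA model across all proteins
    transform_protein_tica(yaml_file)         # project every trajectory, write tica_data.pkl
    fit_protein_kmeans(yaml_file, mini=True)  # cluster the pooled tICA data
    transform_protein_kmeans(yaml_file)       # write per-protein assignments.pkl
    fit_bayes_msms(yaml_file)                 # build Bayesian MSMs from the assignments
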
def sample_from_clusterer(clusterer_file, projected_features_dir, traj_files, n_samples,
                          save_dir, samples_indices_file, structure=None,
                          residue_cutoff=10000, parallel=False, worker_pool=None,
                          lig_name="UNK", reseed_dir=None):
    clusterer = compat_verboseload(clusterer_file)
    n_clusters = len(clusterer.cluster_centers_)
    traj_index_frame_pairs = find_closest_indices_to_cluster_center(
        projected_features_dir, clusterer_file, k=n_samples)
    print(traj_index_frame_pairs)
    print(len(traj_index_frame_pairs))
    sample_cluster_partial = partial(sample_cluster, traj_index_frame_pairs, traj_files,
                                     structure, residue_cutoff, save_dir, lig_name,
                                     reseed_dir)
    if worker_pool is not None:
        worker_pool.map_sync(sample_cluster_partial, range(0, n_clusters))
    elif parallel:
        pool = mp.Pool(mp.cpu_count())
        pool.map(sample_cluster_partial, range(0, n_clusters))
        pool.terminate()
    else:
        for cluster in range(0, n_clusters):
            sample_cluster_partial(cluster)
    verbosedump(traj_index_frame_pairs, samples_indices_file)

def landmark_ktica(features_dir, combined_features_file=None, feature_ext=".dataset",
                   use_clusters_as_landmarks=True, clusters_map_file="",
                   landmarks_dir="", nystroem_components=1000, n_components=10,
                   lag_time=5, nystroem_data_filename="", fit_model_filename="",
                   projected_data_filename="", landmark_subsample=10,
                   sparse=False, shrinkage=0.05, wolf=False, rho=0.01):
    '''
    features_dir: directory where your featurized trajectories are kept.
    combined_features_file: if you have a single file containing all featurized
        trajectories (i.e. a list of np arrays), this is it.
    feature_ext: if the features are instead in separate files, the extension of
        those feature files.
    use_clusters_as_landmarks: use this if you are doing a composition of
        tICA --> clustering --> Nystroem --> tICA (this is what I do). If True, pass a
        JSON file containing a dictionary that maps cluster name --> list of 2-tuples,
        where each tuple is a (trajectory_id, frame_number) pair. This way, instead of
        choosing landmark points at random in the Nystroem approximation, you use
        regular linear tICA-driven clustering to choose landmark points more efficiently.
    landmarks_dir: file where the landmarks will be saved, as a list of 1-d np arrays
        or a 2-d array.
    nystroem_components: the number of landmarks to use.
    n_components: the number of kernel tICA components to compute.
    lag_time: lag time of tICA.
    nystroem_data_filename: where the Nystroem object will be saved.
    fit_model_filename: filename of the kernel tICA object to save.
    projected_data_filename: where the kernel-tICA-projected features will be saved.
    landmark_subsample: how frequently to subsample the landmarks when
        use_clusters_as_landmarks is used.
    sparse: keep set to False.
    shrinkage: same as gamma in the old version of tICA; worth experimenting with.
    wolf: keep this as True unless you're using Robert's branch of msmbuilder.
    rho: ignore this.
    '''
    if not os.path.exists(nystroem_data_filename):
        if combined_features_file is not None:
            features = verboseload(combined_features_file)
        else:
            features = load_file_list(get_trajectory_files(features_dir, ext=feature_ext))
        if os.path.exists(landmarks_dir):
            landmarks = verboseload(landmarks_dir)
            print(np.shape(landmarks))
        else:
            if use_clusters_as_landmarks:
                with open(clusters_map_file) as f:
                    clusters_map = json.load(f)
                clusters_map = {int(k): v for k, v in clusters_map.items()}
                landmarks = []
                for cluster_id, sample_list in clusters_map.items():
                    for sample in sample_list:
                        traj = sample[0]
                        frame = sample[1]
                        landmark = features[traj][frame]
                        landmarks.append(landmark)
                landmarks = [landmarks[i] for i in range(0, np.shape(landmarks)[0])
                             if i % landmark_subsample == 0]
                verbosedump(landmarks, landmarks_dir)
            else:
                n = np.shape(features)[0]
                indices = np.random.choice(n, nystroem_components)
                features_concatenated = np.concatenate(features)
                landmarks = features_concatenated[indices, :]
                verbosedump(landmarks, landmarks_dir)
    else:
        # Nystroem data already exists on disk; ktica() loads it in that case, so the
        # raw features and landmarks are not needed here.
        features = None
        landmarks = None

    ktica(features, landmarks, projected_data_filename, nystroem_data_filename,
          fit_model_filename, sparse, shrinkage, wolf, rho)

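# Hypothetical call illustrating the landmark kernel-tICA routine documented above.
# Every path below is a placeholder; the clusters-map JSON is the
# {cluster: [(traj_id, frame), ...]} file described in the docstring.
def example_landmark_ktica(base="/path/to/analysis"):
    landmark_ktica(features_dir="%s/features" % base,
                   clusters_map_file="%s/clusters_map.json" % base,
                   landmarks_dir="%s/landmarks.h5" % base,
                   nystroem_data_filename="%s/nystroem_data.h5" % base,
                   fit_model_filename="%s/ktica_model.h5" % base,
                   projected_data_filename="%s/ktica_projected.h5" % base,
                   n_components=10, lag_time=5, landmark_subsample=10)
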
def featurize_known_traj(traj_dir, inactive, features_dir):
    print("currently featurizing %s" % traj_dir.split("/")[len(traj_dir.split("/")) - 1])
    traj = md.load(traj_dir)
    rmsds = rmsd_npxxy(traj, inactive)
    helix6_helix3_distances = helix6_helix3_dist(traj)
    features = np.transpose(np.concatenate([[rmsds], [np.concatenate(helix6_helix3_distances)]]))
    print(np.shape(features))
    filename = "%s/%s" % (features_dir, traj_dir.split("/")[len(traj_dir.split("/")) - 1])
    verbosedump(features, filename)

def cluster(data_dir, traj_dir, n_clusters, lag_time):
    clusterer_dir = "/scratch/users/enf/b2ar_analysis/clusterer_%d_t%d.h5" % (n_clusters, lag_time)
    if os.path.exists(clusterer_dir):
        print("Already clustered")
    else:
        reduced_data = verboseload(data_dir)
        trajs = np.concatenate(reduced_data)
        clusterer = MiniBatchKMedoids(n_clusters=n_clusters)
        clusterer.fit_transform(reduced_data)
        verbosedump(clusterer, "/scratch/users/enf/b2ar_analysis/clusterer_%d_t%d.h5" % (n_clusters, lag_time))

def create_fake_data(base_dir, protein_list, project_dict):
    np.random.seed(42)
    for protein in protein_list:
        os.mkdir(protein)
        os.mkdir(os.path.join(protein, "feature_dir"))
        for project in project_dict[protein]:
            os.mkdir(os.path.join(protein, project))
        for i in range(5):
            X = np.random.randn(20, 3)
            verbosedump(X, os.path.join(protein, "feature_dir", "%d.jl" % i))
    return

def cluster_kmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)
    if os.path.exists(clusterer_dir):
        print("Already clustered")
    else:
        print("Clustering by KMeans")
        reduced_data = verboseload(data_dir)
        trajs = np.concatenate(reduced_data)
        clusterer = KMeans(n_clusters=n_clusters, n_jobs=-1)
        clusterer.fit_transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)

def landmark_ktica_ticaTraj(tica_dir, clusterer_dir, ktica_dir, clusters_map_file="",
                            landmarks_dir="", nystroem_components=1000, n_components=10,
                            lag_time=5, nystroem_data_filename="", fit_model_filename="",
                            projected_data_filename="", landmark_subsample=1,
                            sparse=False, wolf=True, rho=0.01, shrinkage=None):
    if not os.path.exists(ktica_dir):
        os.makedirs(ktica_dir)

    if not sparse:
        if shrinkage is None:
            tica_model = tICA(n_components=n_components, lag_time=lag_time)
        else:
            tica_model = tICA(n_components=n_components, lag_time=lag_time,
                              shrinkage=shrinkage)
    else:
        if shrinkage is None:
            tica_model = SparseTICA(n_components=n_components, lag_time=lag_time, rho=rho)
        else:
            tica_model = SparseTICA(n_components=n_components, lag_time=lag_time,
                                    rho=rho, shrinkage=shrinkage)

    if not os.path.exists(nystroem_data_filename):
        clusterer = verboseload(clusterer_dir)
        tica = verboseload(tica_dir)
        features = tica
        clusters = clusterer.cluster_centers_
        landmarks = clusters
        print("here's what goes into the combined class:")
        #print(np.shape(features))
        print(np.shape(landmarks))
        print(type(landmarks))
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        del features
        del landmarks
        try:
            save_dataset(nyx, nystroem_data_filename)
        except:
            os.system("rm -rf %s" % nystroem_data_filename)
            save_dataset(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)

    print(np.shape(nyx))
    print(dir(nyx))

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        try:
            save_dataset(transformed_data, projected_data_filename)
        except:
            os.system("rm -rf %s" % projected_data_filename)
            save_dataset(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")

def tica_wrapper(proj_folder, feature_dict, lag_time=10):
    # 100ps*100 == 10ns, and 10 features
    if os.path.exists(proj_folder + "/tica_features.pkl"):
        return verboseload(proj_folder + "/tica_features.pkl")

    tica_mdl = tICA(lag_time=lag_time, n_components=10)
    tica_mdl.fit([feature_dict[i] for i in feature_dict.keys()])
    tica_features = {}
    for i in feature_dict.keys():
        tica_features[i] = tica_mdl.transform([feature_dict[i]])[0]
    verbosedump(tica_features, proj_folder + "/tica_features.pkl")
    return tica_features

def read_and_featurize_custom(traj_file, condition=None, location=None,
                              dihedral_residues=None, distance_residues=None):
    top = md.load_frame(traj_file, index=0).topology
    #atom_indices = [a.index for a in top.atoms if a.residue.resSeq != 130]
    atom_indices = [a.index for a in top.atoms]
    traj = md.load(traj_file, atom_indices=atom_indices)
    print(traj_file)
    #print(traj)
    #print("loaded trajectory")

    '''
    a = time.time()
    featurizer = DihedralFeaturizer(types = ['phi', 'psi', 'chi2'])
    features = featurizer.transform(traj)
    b = time.time()
    #print(b-a)
    print("original features has dim")
    print(np.shape(features))
    '''

    a = time.time()
    phi_tuples = phi_indices(traj.topology, dihedral_residues)
    psi_tuples = psi_indices(traj.topology, dihedral_residues)
    chi2_tuples = chi2_indices(traj.topology, dihedral_residues)
    #if distance_residues is not None:

    #print("new features has dim %d" %(2*len(phi_tuples) + 2*len(psi_tuples) + 2*len(chi2_tuples)))
    #print("featurizing manually:")
    phi_angles = np.transpose(ManualDihedral.compute_dihedrals(traj=traj, indices=phi_tuples))
    psi_angles = np.transpose(ManualDihedral.compute_dihedrals(traj=traj, indices=psi_tuples))
    chi2_angles = np.transpose(ManualDihedral.compute_dihedrals(traj=traj, indices=chi2_tuples))
    manual_features = np.concatenate([np.sin(phi_angles), np.cos(phi_angles),
                                      np.sin(psi_angles), np.cos(psi_angles),
                                      np.sin(chi2_angles), np.cos(chi2_angles)])
    b = time.time()
    #print(b-a)
    print("new features has shape: ")
    print(np.shape(manual_features))

    if condition is None:
        condition = get_condition(traj_file)
    if location is None:
        location = "/scratch/users/enf/b2ar_analysis/features_allprot"
    verbosedump(manual_features, "%s/%s.h5" % (location, condition))

def cluster_minikmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)
    if os.path.exists(clusterer_dir):
        print("Already clustered")
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except:
            reduced_data = load_dataset(data_dir)
        trajs = np.concatenate(reduced_data)
        clusterer = MiniBatchKMeans(n_clusters=n_clusters)
        clusterer.fit_transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)

def macrostate_pcca(msm_file, clusterer_file, n_macrostates, macrostate_dir):
    msm = verboseload(msm_file)
    clusterer = verboseload(clusterer_file)

    #pcca = lumping.PCCAPlus.from_msm(msm = msm,n_macrostates = n_macrostates)
    #macrostate_model = MarkovStateModel()
    #macrostate_model.fit(pcca.transform(labels))

    # use the n_macrostates argument rather than the previously hard-coded 10
    pcca_object = lumping.PCCA(n_macrostates=n_macrostates)
    pcca_object.fit(sequences=clusterer.labels_)
    #pcca_object.transform(sequences = clusterer.labels_)
    #macrostate_model = pcca_object.from_msm(msm = msm, n_macrostates = n_macrostates)
    print(pcca_object)
    print(pcca_object.microstate_mapping_)
    verbosedump(pcca_object, macrostate_dir)

def transform_protein_kmeans(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    kmeans_mdl = verboseload(kmeans_mdl_path)
    for protein in yaml_file["protein_list"]:
        print("Assigning protein %s" % protein)
        with enter_protein_mdl_dir(yaml_file, protein):
            tica_data = verboseload("tica_data.pkl")
            # do assignments
            assignments = {}
            for i in tica_data.keys():
                assignments[i] = kmeans_mdl.predict([tica_data[i]])[0]
            verbosedump(assignments, 'assignments.pkl')
        print("Done assigning %s" % protein)
    return

def normalize_project_series(yaml_file, output_folder="normalized_features",
                             stride=40, nrm=None):
    """
    Routine to take a set of protein features stored in the feature_dir and normalize
    them by removing the mean and scaling to unit variance with the standard scaler.
    The normalizer is dumped into the mdl dir.
    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump normalized features in.
    :param stride: The initial stride in files used to fit the normalizer. This is
        necessary to prevent memory errors; defaults to every 40th file.
    :param nrm: A previously fit normalizer; otherwise the standard scaler from
        scikit-learn is used.
    :return:
    """
    yaml_file = load_yaml_file(yaml_file)
    # set up the normalizer
    if nrm is None:
        nrm = preprocessing.StandardScaler()
    all_data = {}
    for prt in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, prt):
            print(prt)
            flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))[::stride]
            for f in flist:
                all_data[f] = verboseload(f)
    seq = []
    for i in all_data.keys():
        seq.extend(all_data[i])
    # fit it
    nrm.fit(seq)
    # dump it into the mdl dir
    verbosedump(nrm, "%s/nrm.h5" % yaml_file["mdl_dir"])

    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)
        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res, "%s/%s" % (output_folder_path, os.path.basename(f)))
    return

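# Usage sketch for the normalization step above; "project.yaml" is a placeholder.
# Fitting on every 40th feature file keeps memory bounded, after which all files are
# rewritten into the normalized_features folder that transform_protein_tica prefers.
def example_normalize_features(yaml_path="project.yaml"):
    normalize_project_series(yaml_path, output_folder="normalized_features", stride=40)
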
def normalize_project_series(yaml_file, output_folder="normalized_features",
                             stride=1, nrm=None):
    """
    Routine to take a set of protein features stored in the feature_dir and normalize
    them with a robust scaler (median removed, scaled by the interquartile range).
    The normalizer is dumped into the mdl dir.
    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump normalized features in.
    :param stride: The initial stride in files used to fit the normalizer. This can
        help prevent memory errors; defaults to 1 (every file).
    :param nrm: A previously fit normalizer; otherwise the robust scaler from
        scikit-learn is used.
    :return:
    """
    yaml_file = load_yaml_file(yaml_file)
    # set up the normalizer
    if nrm is None:
        nrm = preprocessing.RobustScaler()
    all_data = {}
    for prt in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, prt):
            print(prt)
            flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))[::stride]
            for f in flist:
                all_data[f] = verboseload(f)
    seq = []
    for i in all_data.keys():
        seq.extend(all_data[i])
    # fit it
    nrm.fit(seq)
    # dump it into the mdl dir
    verbosedump(nrm, "%s/nrm.h5" % yaml_file["mdl_dir"])

    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)
        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res, "%s/%s" % (output_folder_path, os.path.basename(f)))
    return

def cluster_project_wrapper(proj_folder, feature_dict, n_states):
    if os.path.exists(proj_folder + "/assignments.pkl"):
        return verboseload(proj_folder + "/cluster_mdl.pkl"), \
            verboseload(proj_folder + "/assignments.pkl")
    elif os.path.exists(proj_folder + "/cluster_mdl.pkl"):
        cluster_mdl = verboseload(proj_folder + "/cluster_mdl.pkl")
    else:
        cluster_mdl = KMeans(n_clusters=n_states)
        cluster_mdl.fit([feature_dict[i] for i in feature_dict.keys()])

    assignments = {}
    for i in feature_dict.keys():
        assignments[i] = cluster_mdl.transform([feature_dict[i]])
    verbosedump(cluster_mdl, proj_folder + "/cluster_mdl.pkl")
    verbosedump(assignments, proj_folder + "/assignments.pkl")
    return cluster_mdl, assignments

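# Usage sketch for the ipyparallel project workflow above (featurize, tICA, cluster).
# The folder layout and the load-balanced `view` are assumptions for illustration only.
def example_project_pipeline(proj_folder, top_folder, view, n_states=100):
    feature_dict = featurize_project(proj_folder, top_folder, None, stride=1, view=view)
    tica_features = tica_wrapper(proj_folder, feature_dict, lag_time=10)
    cluster_mdl, assignments = cluster_project_wrapper(proj_folder, tica_features, n_states)
    return cluster_mdl, assignments
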
def resample_by_msm(total_samples, msm_object, clusters_map, num_trajs, save_file,
                    equilibrium_populations=None):
    if equilibrium_populations is None:
        equilibrium_populations = msm_object.populations_

    num_to_sample_per_cluster = {}
    for cluster_id in msm_object.mapping_.keys():
        state_id = msm_object.mapping_[cluster_id]
        # np.rint returns a float; cast to int so it can be used as a sample size
        num_to_sample_per_cluster[cluster_id] = int(np.rint(
            equilibrium_populations[state_id] * total_samples))
    print("Found number to sample per cluster based on equilibrium proportions.")

    sample_pairs = []
    for cluster_id in msm_object.mapping_.keys():
        traj_index_pairs = list(clusters_map[cluster_id])
        if len(traj_index_pairs) == 0:
            continue
        num_to_sample = num_to_sample_per_cluster[cluster_id]
        random_indices = np.random.choice(range(0, len(traj_index_pairs)),
                                          size=num_to_sample, replace=True)
        clusters_sample_pairs = [traj_index_pairs[i] for i in random_indices]
        sample_pairs += clusters_sample_pairs
    print("Obtained random (trajectory, frame) pairs based on equilibrium populations")

    #if there exists some fancy numpy way to index a 3d array by 2d tuples, then great, else:
    traj_to_frames = {}
    for i in range(0, num_trajs):
        traj_to_frames[i] = []
    for sample_pair in sample_pairs:
        traj_to_frames[sample_pair[0]].append(sample_pair[1])
    print("Rearranged equilibrium sampled frames based on trajectories")

    if save_file is not None:
        verbosedump(traj_to_frames, save_file)

    return traj_to_frames

def ktica(features, landmarks, projected_data_filename, nystroem_data_filename,
          fit_model_filename, sparse=False, shrinkage=0.05, wolf=True, rho=0.01,
          n_components=10, lag_time=5):
    # n_components and lag_time were referenced but never defined in the original
    # version of this function; they are exposed here as keyword arguments
    # (defaults chosen to mirror landmark_ktica).
    if not sparse:
        if shrinkage is None:
            tica_model = tICA(n_components=n_components, lag_time=lag_time)
        else:
            if wolf:
                tica_model = tICA(n_components=n_components, lag_time=lag_time,
                                  shrinkage=shrinkage)
            else:
                tica_model = tICA(n_components=n_components, lag_time=lag_time,
                                  gamma=shrinkage)
    else:
        if shrinkage is None:
            tica_model = SparseTICA(n_components=n_components, lag_time=lag_time, rho=rho)
        else:
            tica_model = SparseTICA(n_components=n_components, lag_time=lag_time,
                                    rho=rho, shrinkage=shrinkage)

    if not os.path.exists(nystroem_data_filename):
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        print("Computed Nystroem.")
        del features
        del landmarks
        try:
            save_dataset(nyx, nystroem_data_filename)
        except:
            os.system("rm -rf %s" % nystroem_data_filename)
            save_dataset(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)
        print("Loaded Nystroem")

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        try:
            save_dataset(transformed_data, projected_data_filename)
        except:
            os.system("rm -rf %s" % projected_data_filename)
            save_dataset(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")

def resample_features_by_msm_equilibirum_pop(features, traj_to_frames, save_file=None):
    resampled_features = []
    for traj_index, frames in traj_to_frames.items():
        if isinstance(features[0], pd.DataFrame):
            resampled_features.append(features[traj_index].iloc[frames])
        else:
            resampled_features.append(features[traj_index][frames, :])

    if isinstance(features[0], pd.DataFrame):
        resampled_features = pd.concat(resampled_features, axis=0)
    else:
        resampled_features = np.concatenate(resampled_features)

    if save_file is not None:
        verbosedump(resampled_features, save_file)
    else:
        return resampled_features

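# Sketch of equilibrium-weighted resampling with the two helpers above. msm_object,
# clusters_map and features are assumed to come from the MSM and clustering steps
# earlier in this module; the sample count is arbitrary.
def example_equilibrium_resample(msm_object, clusters_map, features, total_samples=10000):
    traj_to_frames = resample_by_msm(total_samples, msm_object, clusters_map,
                                     num_trajs=len(features), save_file=None)
    return resample_features_by_msm_equilibirum_pop(features, traj_to_frames)
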
def sample_msm_traj(yaml_file, prt_name, n_steps, starting_state=None,
                    fname="msm_traj.xtc", msm_traj=None, scheme='random'):
    """
    :param yaml_file: The model's yaml file
    :param prt_name: The name of the protein mdl
    :param n_steps: The number of Markovian frames desired.
    :param starting_state: If None, we start from the most populated state.
    :param fname: The output filename
    :param msm_traj: Output of msm.sample_discrete, so that you can supply a
        pre-sampled (e.g. random) trajectory if you want.
    :return: Dumps the msm traj.
    """
    yaml_file = load_yaml_file(yaml_file)
    ser = ProteinSeries(yaml_file)
    prt = Protein(ser, prt_name)

    if msm_traj is None:
        # this returns in original assignment space
        msm_traj = prt.msm.sample_discrete(state=starting_state, n_steps=n_steps)

    # here we use the original assignment matrix too
    key_mapping, assignment_matrix = create_assignment_matrix(prt.assignments)
    jbs = [(state, assignment_matrix, key_mapping, ser.base_dir, prt.name,
            yaml_file["protein_dir"]) for state in msm_traj]
    p = Pool(int(cpu_count() / 4))
    trj_list = p.map(_random_sample_state, jbs)
    print("Done")
    trj = trj_list[0] + trj_list[1:]

    with enter_protein_mdl_dir(yaml_file, prt_name):
        verbosedump(msm_traj, "msm_traj.pkl")
        trj.save_xtc(fname)
        if not os.path.isfile("prot.pdb"):
            trj[0].save_pdb("prot.pdb")
    return

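# Usage sketch: draw a 1000-step discrete trajectory from one protein's MSM and write
# it out as an xtc. The yaml path and protein name are placeholders.
def example_sample_msm_traj(yaml_path="project.yaml", protein="protein_name"):
    sample_msm_traj(yaml_path, protein, n_steps=1000, fname="msm_traj.xtc")
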
def standardize_features(features_dir, features_ext, standardized_features_dir):
    if not os.path.exists(standardized_features_dir):
        os.makedirs(standardized_features_dir)
    feature_files = get_trajectory_files(features_dir, features_ext)
    features = load_file_list(feature_files)
    concatenated_features = np.concatenate(features)
    means = np.mean(concatenated_features, axis=0)
    stdevs = np.std(concatenated_features, axis=0)
    standardized_features = []
    for X in features:
        X -= means
        X /= stdevs
        standardized_features.append(X)
    print("Finished standardizing features")
    for i in range(0, len(feature_files)):
        filename = feature_files[i].split("/")[len(feature_files[i].split("/")) - 1]
        new_filename = "%s/%s" % (standardized_features_dir, filename)
        verbosedump(standardized_features[i], new_filename)
    print("Finished saving all standardized features")
    return

def transform_protein_kmeans(yaml_file, pca=False):
    mdl_dir = yaml_file["mdl_dir"]
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    kmeans_mdl = verboseload(kmeans_mdl_path)
    for protein in yaml_file["protein_list"]:
        print("Assigning protein %s" % protein)
        with enter_protein_mdl_dir(yaml_file, protein):
            if pca:
                tica_data = verboseload("pca_data.pkl")
            else:
                tica_data = verboseload("tica_data.pkl")
            # do assignments
            assignments = {}
            for i in tica_data.keys():
                assignments[i] = kmeans_mdl.predict([tica_data[i]])[0]
            verbosedump(assignments, 'assignments.pkl')
        print("Done assigning %s" % protein)
    return

def generate_msm_traj_index_series(msm_object, start_cluster, n_steps, clusters_map,
                                   save_file=None):
    inv_map = {v: k for k, v in msm_object.mapping_.items()}
    msm_trajectory = msm_object.sample_discrete(state=start_cluster, n_steps=n_steps)
    traj_index_pairs = []
    clusters = []
    for state in msm_trajectory:
        cluster = state  #inv_map[state]
        clusters.append(cluster)
        traj_index_pair = random.choice(list(clusters_map[cluster]))
        traj_index_pairs.append(traj_index_pair)
    if save_file is not None:
        verbosedump(traj_index_pairs, save_file)
    return traj_index_pairs, clusters

def build_msm(clusterer_dir, lag_time, msm_model_dir, prior_counts=0.0, ergodic_cutoff='on'):
    clusterer = verboseload(clusterer_dir)
    n_clusters = np.shape(clusterer.cluster_centers_)[0]
    labels = clusterer.labels_
    msm_modeler = MarkovStateModel(lag_time=lag_time, prior_counts=prior_counts,
                                   ergodic_cutoff=ergodic_cutoff)
    print("fitting msm to trajectories with %d clusters and lag_time %d" % (n_clusters, lag_time))
    msm_modeler.fit_transform(labels)
    print(msm_modeler)
    verbosedump(msm_modeler, msm_model_dir)
    print("fitted msm to trajectories with %d states" % (msm_modeler.n_states_))
    return msm_modeler

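# Usage sketch for the parameterized MSM builder above; both paths are placeholders.
# A small prior count regularizes transition counts for sparsely sampled clusters.
def example_build_msm(clusterer_path="/path/to/clusterer.h5",
                      msm_path="/path/to/msm_model.pkl"):
    return build_msm(clusterer_path, lag_time=10, msm_model_dir=msm_path,
                     prior_counts=0.5, ergodic_cutoff='on')
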
def fit_bootstrap(yaml_file, pool=None):
    mdl_params = yaml_file["mdl_params"]
    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("msm__"):
            current_mdl_params[i.split("msm__")[1]] = mdl_params[i]

    if "bootstrap__n_samples" in mdl_params.keys():
        bootstrap__n_samples = mdl_params["bootstrap__n_samples"]
    else:
        bootstrap__n_samples = 100

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            print(protein)
            assignments = verboseload("assignments.pkl")
            msm_mdl = BootStrapMarkovStateModel(n_samples=bootstrap__n_samples,
                                                n_procs=2,
                                                msm_args=current_mdl_params)
            msm_mdl.fit([assignments[i] for i in assignments.keys()], pool=pool)
            verbosedump(msm_mdl, "bootstrap_msm_mdl.pkl")
            verbosedump(msm_mdl.mle_, "msm_mdl.pkl")
            fixed_assignments = {}
            for i in assignments.keys():
                fixed_assignments[i] = msm_mdl.mle_.transform(
                    assignments[i], mode='fill')[0]
            verbosedump(fixed_assignments, 'fixed_assignments.pkl')
    return

def featurize_pnas_distance_pdbs(traj_dir, new_filename, features_dir, inactive_dir,
                                 active_dir, inactive_distances_dir, active_distances_dir,
                                 coords_dir):
    #if not os.path.exists(features_dir): os.makedirs(features_dir)
    inactive = md.load(inactive_dir)
    active = md.load(active_dir)
    agonist_bound = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    samples = get_trajectory_files(traj_dir, ext=".pdb")

    pool = mp.Pool(mp.cpu_count())
    trajs = pool.map(load_pdb_traj, samples)
    trajs_joined = trajs[0].join(trajs[1:])
    trajs_joined.save_hdf5(new_filename)

    features = compute_pnas_coords_and_distance(new_filename, inactive, active)
    coords = [f[0] for f in features]
    inactive_distances = [f[1][0] for f in features]
    active_distances = [f[1][1] for f in features]

    verbosedump(coords, coords_dir)
    verbosedump(inactive_distances, inactive_distances_dir)
    verbosedump(active_distances, active_distances_dir)
    print("Completed featurizing")

def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10, n_components=5):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir
    #active_pdb_file = "/scratch/users/enf/b2ar_analysis/renamed_topologies/A-00.pdb"
    tica_model = tICA(n_components=n_components, lag_time=lag_time)

    if not os.path.exists(projected_data_filename):
        print("loading feature files")
        feature_files = get_trajectory_files(features_directory, ext=".h5")
        pool = mp.Pool(mp.cpu_count())
        features = pool.map(load_features, feature_files)
        pool.terminate()
        if not os.path.exists(fit_model_filename):
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, fit_model_filename)
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
        else:
            print("loading tICA model")
            fit_model = verboseload(fit_model_filename)
            print("transforming")
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
    else:
        fit_model = verboseload(fit_model_filename)
        transformed_data = verboseload(projected_data_filename)

    print(fit_model.summarize())

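# Usage sketch for the plain tICA fit/transform above; the directories are placeholders
# and should point at the .h5 dihedral feature files written by read_and_featurize.
def example_fit_and_transform(features_directory="/path/to/features",
                              model_dir="/path/to/tica_model"):
    fit_and_transform(features_directory, model_dir, stride=5, lag_time=10, n_components=5)
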