Example #1
def featurize_divided(directory):
    simulations = os.listdir(directory)
    for simulation in simulations:
        if simulation[0] not in [
                'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'
        ]:
            continue

        sim_dir = "%s/%s" % (directory, simulation)

        new_dir = "%s/%s" % (
            "/scratch/users/enf/b2ar_analysis/subsampled_features", simulation)
        if os.path.exists(new_dir):
            print("we have already featurized this simulation")
            continue
        else:
            os.makedirs(new_dir)

        print(("currently analyzing %s " % sim_dir))
        trajs = get_trajectory_files(sim_dir)[0:3]
        print(trajs)

        #print("there are %d cpus" %(mp.cpu_count()))
        pool = mp.Pool(mp.cpu_count())
        features_i = pool.map(read_and_featurize, trajs)
        #print(features_i)
        features = [np.concatenate(np.concatenate(features_i))]
        print((np.shape(features[0])))
        combined_dir = "/scratch/users/enf/b2ar_analysis/combined_features"
        new_file_name = "%s_combined.h5" % (simulation)
        new_file = "%s/%s" % (combined_dir, new_file_name)
        #print("saving concatenated features for %s as %s" %(simulation, new_file))
        verbosedump(features, new_file)
def featurize_project(proj_folder,top_folder,featurizer_object,stride,view):

     # if already featurized, don't bother (should add a warning about this)
     if os.path.exists(proj_folder+"/featurized_traj.pkl"):
          return verboseload(proj_folder+"/featurized_traj.pkl")

     if featurizer_object is None:
          featurizer = DihedralFeaturizer(types=['phi', 'psi','chi1'])
     else:
          try:
               featurizer = verboseload(featurizer_object)
          except:
               sys.exit("Cant Load Featurizer using msmbuilder verboseload")

     feature_dict={}

     traj_list =  glob.glob(proj_folder+"/trajectories/*.dcd")


     jobs = [(proj_folder,top_folder,featurizer,traj,stride) for traj in traj_list]
     results = view.map_sync(featurize_traj,jobs)

     for result in results:
          feature_dict[result[0]] = result[1]

     verbosedump(feature_dict,proj_folder+"/featurized_traj.pkl")

     return feature_dict
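The comment at the top of featurize_project notes that silently returning a cached result deserves a warning. A minimal sketch of that guard, assuming verboseload comes from msmbuilder.utils as in these examples; load_cached_features is a hypothetical helper name:

import os
import warnings

from msmbuilder.utils import verboseload


def load_cached_features(proj_folder):
    # Hypothetical helper: warn instead of silently returning the cached result.
    cached = os.path.join(proj_folder, "featurized_traj.pkl")
    if os.path.exists(cached):
        warnings.warn("Reusing cached features in %s; delete the file to re-featurize." % cached)
        return verboseload(cached)
    return None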
Example #3
def read_and_featurize_divided(filename,
                               dihedrals=['phi', 'psi', 'chi2'],
                               stride=10):
    #print("reading and featurizing %s" %(filename))

    traj_top = md.load_frame(filename, 0).topology
    atom_indices = [
        a.index for a in traj_top.atoms if a.residue.name[0:2] != "HI"
    ]

    traj = md.load(filename, atom_indices=atom_indices)
    #print("got traj")
    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)
    #print(np.shape(features))
    #print("finished featurizing")

    directory = filename.split("/")
    condition = directory[len(directory) - 2]
    dcd_file = directory[len(directory) - 1]
    new_file = "%s_features_stride%d.h5" % (dcd_file.rsplit(".", 1)[0], stride)
    new_root_dir = "/scratch/users/enf/b2ar_analysis/subsampled_features"
    new_condition_dir = "%s/%s" % (new_root_dir, condition)

    new_file_full = "%s/%s/%s" % (new_root_dir, condition, new_file)
    #print("saving features as %s" %new_file_full)

    verbosedump(features, new_file_full)
    return features
def cluster_minikmeans(tica_dir,
                       data_dir,
                       traj_dir,
                       n_clusters,
                       clusterer_dir=None,
                       tICs=None):
    if clusterer_dir is not None and os.path.exists(clusterer_dir):
        reduced_data = load_file(data_dir)
        clusterer = verboseload(clusterer_dir)
        clusterer.labels_ = clusterer.transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except:
            reduced_data = load_dataset(data_dir)
        if tICs is not None:
            X = []
            for traj in reduced_data:
                X.append(traj[:, tICs])
        else:
            X = reduced_data

        clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
        clusterer.fit_transform(X)
        verbosedump(clusterer, clusterer_dir)
Example #5
def featurize_divided(directory):
	simulations = os.listdir(directory)
	for simulation in simulations:
		if simulation[0] not in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']:
			continue

		sim_dir = "%s/%s" %(directory, simulation)
		
		new_dir = "%s/%s" %("/scratch/users/enf/b2ar_analysis/subsampled_features", simulation)
		if os.path.exists(new_dir):
			print("we have already featurized this simulation")
			continue
		else:
			os.makedirs(new_dir)

		print("currently analyzing %s " %sim_dir)
		trajs = get_trajectory_files(sim_dir)[0:3]
		print(trajs)

		#print("there are %d cpus" %(mp.cpu_count()))
		pool = mp.Pool(mp.cpu_count())
		features_i = pool.map(read_and_featurize, trajs)
		#print(features_i)
		features = [np.concatenate(np.concatenate(features_i))]
		print(np.shape(features[0]))
		combined_dir = "/scratch/users/enf/b2ar_analysis/combined_features"
		new_file_name = "%s_combined.h5" %(simulation)
		new_file = "%s/%s" %(combined_dir, new_file_name)
		#print("saving concatenated features for %s as %s" %(simulation, new_file))
		verbosedump(features, new_file)
def _slice_file(job_tuple):
    inp_file, feature_ind, output_folder = job_tuple
    featurized_file = verboseload(inp_file)
    sliced_file = featurized_file[:, feature_ind]
    sliced_file_out = os.path.join(output_folder, os.path.basename(inp_file))
    verbosedump(sliced_file, sliced_file_out)
    return
def build_msm(clusterer_dir, lag_time):
	clusterer = verboseload(clusterer_dir)
	n_clusters = np.shape(clusterer.cluster_centers_)[0]
	labels = clusterer.labels_
	msm_modeler = MarkovStateModel(lag_time=lag_time)
	print("fitting msm to trajectories with %d clusters and lag_time %d" %(n_clusters, lag_time))
	msm_modeler.fit_transform(labels)
	verbosedump(msm_modeler, "/scratch/users/enf/b2ar_analysis/msm_model_%d_clusters_t%d" %(n_clusters, lag_time))
	print("fitted msm to trajectories with %d states" %(msm_modeler.n_states_))
	#np.savetxt("/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_transmat.csv" %(n_clusters, lag_time), msm_modeler.transmat_, delimiter=",")
	#G = nx.from_numpy_matrix(msm_modeler.transmat_)
	#nx.write_edgelist(G, "/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_edgelist" %(n_clusters, lag_time), msm_modeler.transmat_, delimiter=",")
	transmat = msm_modeler.transmat_

	mapping = msm_modeler.mapping_

	edges = open("/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_edgelist.csv" %(n_clusters, lag_time), "w")
	for i in range(0, msm_modeler.n_states_):
		if i == 0:
			for j in range(0, msm_modeler.n_states_):
				edges.write(";")
				edges.write("%d" %mapping[j])
			edges.write("\n")

		edges.write("%d" %(mapping[i]))
		for j in range(0, msm_modeler.n_states_):
			prob = transmat[i][j]
			edges.write(";")
			if prob > 0.000001:
				edges.write("%f" %prob)
			else:
				edges.write("0")
		edges.write("\n")
	edges.close()
Example #8
def read_and_featurize(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
    print(("reading and featurizing %s" % (filename)))

    traj = md.load(filename)
    #test_traj_init = md.load_frame(filename,5)
    #test_traj_init.save_pdb("/scratch/users/enf/b2ar_analysis/test_init.pdb")

    #traj.topology = fix_topology(traj.topology)

    #traj[-1].save_pdb("/scratch/users/enf/b2ar_analysis/test_fixed.pdb")
    #traj.save_dcd("/scratch/users/enf/b2ar_analysis/test_fixed.dcd")

    #print("got traj")
    featurizer = DihedralFeaturizer(types=dihedrals)
    features = featurizer.transform(traj_list=traj)
    #print("finished featurizing")

    directory = filename.split("/")
    traj_file = directory[len(directory) - 1]
    condition = traj_file.split("_")[0].split(".")[0]

    print(("Condition %s has features of shape %s" %
           (condition, np.shape(features))))

    new_file = "/scratch/users/enf/b2ar_analysis/combined_features/%s_features.h5" % condition
    verbosedump(features, new_file)
Example #9
def read_and_featurize_divided(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
	#print("reading and featurizing %s" %(filename))

	traj_top = md.load_frame(filename,0).topology
	atom_indices = [a.index for a in traj_top.atoms if a.residue.name[0:2] != "HI"]

	traj = md.load(filename,atom_indices=atom_indices)
	#print("got traj")
	featurizer = DihedralFeaturizer(types = dihedrals)
	features = featurizer.transform(traj_list = traj)
	#print(np.shape(features))
	#print("finished featurizing")

	directory = filename.split("/")
	condition = directory[len(directory)-2]
	dcd_file = directory[len(directory)-1]
	new_file = "%s_features_stride%d.h5" %(dcd_file.rsplit( ".", 1 )[ 0 ] , stride)
	new_root_dir = "/scratch/users/enf/b2ar_analysis/subsampled_features"
	new_condition_dir = "%s/%s" %(new_root_dir, condition)

	new_file_full = "%s/%s/%s" %(new_root_dir, condition, new_file)
	#print("saving features as %s" %new_file_full)

	verbosedump(features, new_file_full)
	return features
Example #10
def featurize_project(proj_folder, top_folder, featurizer_object, stride,
                      view):

    # if already featurized, don't bother (should add a warning about this)
    if os.path.exists(proj_folder + "/featurized_traj.pkl"):
        return verboseload(proj_folder + "/featurized_traj.pkl")

    if featurizer_object is None:
        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    else:
        try:
            featurizer = verboseload(featurizer_object)
        except:
            sys.exit("Cant Load Featurizer using msmbuilder verboseload")

    feature_dict = {}

    traj_list = glob.glob(proj_folder + "/trajectories/*.dcd")

    jobs = [(proj_folder, top_folder, featurizer, traj, stride)
            for traj in traj_list]
    results = view.map_sync(featurize_traj, jobs)

    for result in results:
        feature_dict[result[0]] = result[1]

    verbosedump(feature_dict, proj_folder + "/featurized_traj.pkl")

    return feature_dict
def featurize_file(job_tuple):

    yaml_file, protein, feat, traj_file,stride = job_tuple
    yaml_file = load_yaml_file(yaml_file)

    if feat is None:
        feat = DihedralFeaturizer(types=['phi', 'psi','chi1'])

    _check_output_folder_exists(yaml_file, protein)

    output_folder = os.path.join(yaml_file["base_dir"],
                                 protein,
                                 yaml_file["feature_dir"])

    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    output_fname = os.path.join(output_folder, traj_name+".jl")

    feat_descriptor = os.path.join(output_folder, "feature_descriptor.h5")
    try:
        trj = mdt.load(traj_file)
    except :
        warnings.warn("Removing %s because of misformed trajectory"%traj_file)
        os.remove(traj_file)
        return

    features = feat.partial_transform(trj)
    verbosedump(features, output_fname)

    if not os.path.isfile(feat_descriptor) and hasattr(feat, "describe_features"):
        dih_df = pd.DataFrame(feat.describe_features(trj[0]))
        verbosedump(dih_df, feat_descriptor)

    return
Example #12
def read_and_featurize(filename, dihedrals=['chi2'], stride=10):
	#print("reading and featurizing %s" %(filename))
	top = md.load_frame(filename, 0).topology
	#print("got top")
	atom_indices = [a.index for a in top.atoms if a.residue.resSeq == 93 and a.residue != "POPC" and str(a.residue)[0] == "H"]
	print((len(atom_indices)))
	#atom_indices = [a.index for a in top.atoms if a.residue.chain.index == 0 and a.residue.resSeq != 93 and a.residue != "POPC" and a.residue.resSeq != 130 and a.residue.resSeq != 172 and a.residue.resSeq != 79 and a.residue.resSeq != 341]
	#print("got indices")
	traj = md.load(filename, stride=1000, atom_indices=atom_indices)
	#print("got traj")
	featurizer = DihedralFeaturizer(types = dihedrals)
	features = featurizer.transform(traj_list = traj)
	#print(np.shape(features))
	#print("finished featurizing")

	directory = filename.split("/")
	condition = directory[len(directory)-2]
	dcd_file = directory[len(directory)-1]
	new_file = "%s_features_stride%d.h5" %(dcd_file.rsplit( ".", 1 )[ 0 ] , stride)
	new_root_dir = "/scratch/users/enf/b2ar_analysis/subsampled_features"
	new_condition_dir = "%s/%s" %(new_root_dir, condition)

	new_file_full = "%s/%s/%s" %(new_root_dir, condition, new_file)
	#print("saving features as %s" %new_file_full)

	verbosedump(features, new_file_full)
	return features
Example #13
def _slice_file(job_tuple):
    inp_file, feature_ind, output_folder = job_tuple
    featurized_file = verboseload(inp_file)
    sliced_file = featurized_file[:, feature_ind]
    sliced_file_out = os.path.join(output_folder, os.path.basename(inp_file))
    verbosedump(sliced_file, sliced_file_out)
    return
def transform_protein_pca(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    pca_obj_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    protein_pca_mdl = verboseload(pca_obj_path)
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            print("Transforming protein %s" % protein)
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            pca_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    pca_data[os.path.basename(f)] = \
                        protein_pca_mdl.partial_transform(featurized_path)
                except:
                    print('Error')
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(pca_data, 'pca_data.pkl')
                print("Done transforming protein %s" % protein)

    # dumping the pca_mdl again since the eigenspectrum might have been calculated
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
    return
def fit_bayes_msms(yaml_file):
    mdl_params = yaml_file["mdl_params"]
    msm__lag_time = mdl_params["msm__lag_time"]
    if "bayesmsm__n_samples" in mdl_params.keys():
        bayesmsm__n_samples = mdl_params["bayesmsm__n_samples"]
    else:
        bayesmsm__n_samples = 800
    if "bayesmsm__n_steps" in mdl_params.keys():
        bayesmsm__n_steps = mdl_params["bayesmsm__n_steps"]
    else:
        bayesmsm__n_steps = 1000000

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            print(protein)
            assignments = verboseload("assignments.pkl")
            msm_mdl = BayesianMarkovStateModel(n_samples=bayesmsm__n_samples,
                                               n_steps=bayesmsm__n_steps,
                                               lag_time=msm__lag_time,
                                               ergodic_cutoff=1.0/msm__lag_time,
                                               verbose=True).fit(
                [assignments[i] for i in assignments.keys()])
            _ = msm_mdl.all_eigenvalues_
            verbosedump(msm_mdl, "bayesmsm_mdl.pkl")
            fixed_assignments = {}
            for i in assignments.keys():
                fixed_assignments[i] = msm_mdl.transform(
                    assignments[i], mode='fill')[0]
            verbosedump(fixed_assignments, 'fixed_assignments.pkl')
    return
def fit_protein_kmeans(yaml_file,mini=True,pca=False):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params={}
    for i in mdl_params.keys():
        if i.startswith("cluster__"):
            current_mdl_params[i.split("cluster__")[1]] = mdl_params[i]

    if mini:
        current_mdl_params["batch_size"] = 100*current_mdl_params["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**current_mdl_params)
    else:
        kmeans_mdl = KMeans(**current_mdl_params)
    data = []

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            if pca:
                tica_data = verboseload("pca_data.pkl")
            else:
                tica_data = verboseload("tica_data.pkl")
            # get all traj
            sorted_list = sorted(tica_data.keys(), key=keynat)
            data.extend([tica_data[i] for i in sorted_list])

    kmeans_mdl.fit(data)
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    verbosedump(kmeans_mdl, kmeans_mdl_path)
    return
def transform_protein_tica(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    tica_obj_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    protein_tica_mdl = verboseload(tica_obj_path)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            if os.path.exists("./normalized_features"):
                featurized_traj = sorted(glob.glob("./normalized_features/*.jl"), key=keynat)
            else:
                print('Warning: features have not been scaled')
                featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)

            tica_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    tica_data[os.path.basename(f)] = \
                        protein_tica_mdl.partial_transform(featurized_path)
                except:
                    pass
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(tica_data, 'tica_data.pkl')
                print("Done transforming protein %s" % protein)

    # dumping the tica_mdl again since the eigenspectrum might have been calculated
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
Example #18
def generate_tpt_traj_index_series(msm_object, sources, sinks, clusters_map,
                                   num_paths, remove_path, save_file):
    net_flux = tpt.net_fluxes(sources, sinks, msm_object)
    tpt_paths = tpt.paths(sources,
                          sinks,
                          net_flux,
                          remove_path=remove_path,
                          num_paths=num_paths,
                          flux_cutoff=0.5)

    inv_map = {v: k for k, v in msm_object.mapping_.items()}

    print(tpt_paths)
    traj_index_pairs_list = []
    for path in tpt_paths[0]:
        print("path = %s" % (str(path)))
        traj_index_pairs = []
        for state in path:
            cluster = inv_map[state]
            traj_index_pair = random.choice(list(clusters_map[cluster]))
            traj_index_pairs.append(traj_index_pair)
        traj_index_pairs_list.append(traj_index_pairs)

    verbosedump(traj_index_pairs_list, save_file)

    inv_tpt_paths = []
    for tpt_path in tpt_paths[0]:
        inv_tpt_paths.append([inv_map[i] for i in tpt_path])
    return tpt_paths[0], inv_tpt_paths, traj_index_pairs_list
def fit_protein_pca(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params={}
    for i in mdl_params.keys():
        if i.startswith("pca__"):
            current_mdl_params[i.split("pca__")[1]] = mdl_params[i]

    protein_pca_mdl = PCA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_pca_mdl.partial_fit(featurized_path)
                except:
                    pass
            print("Done partial fitting to protein %s" % protein)
    # dumping the pca_mdl
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
    return
def fit_protein_kmeans(yaml_file,mini=True):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params={}
    for i in mdl_params.keys():
        if i.startswith("cluster__"):
            current_mdl_params[i.split("cluster__")[1]] = mdl_params[i]

    if mini:
        current_mdl_params["batch_size"] = 100*current_mdl_params["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**current_mdl_params)
    else:
        kmeans_mdl = KMeans(**current_mdl_params)
    data = []

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            tica_data = verboseload("tica_data.pkl")
            # get all traj
            sorted_list = sorted(tica_data.keys(), key=keynat)
            data.extend([tica_data[i] for i in sorted_list])

    kmeans_mdl.fit(data)
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    verbosedump(kmeans_mdl, kmeans_mdl_path)
    return
Example #21
def read_and_featurize(filename, dihedrals=['phi', 'psi', 'chi2'], stride=10):
	print("reading and featurizing %s" %(filename))

	traj = md.load(filename)
	#test_traj_init = md.load_frame(filename,5)
	#test_traj_init.save_pdb("/scratch/users/enf/b2ar_analysis/test_init.pdb")

	#traj.topology = fix_topology(traj.topology)

	#traj[-1].save_pdb("/scratch/users/enf/b2ar_analysis/test_fixed.pdb")
	#traj.save_dcd("/scratch/users/enf/b2ar_analysis/test_fixed.dcd")

	#print("got traj")
	featurizer = DihedralFeaturizer(types = dihedrals)
	features = featurizer.transform(traj_list = traj)
	#print("finished featurizing")

	directory = filename.split("/")
	traj_file = directory[len(directory)-1]
	condition = traj_file.split("_")[0].split(".")[0]

	print("Condition %s has features of shape %s" %(condition, np.shape(features)))

	new_file = "/scratch/users/enf/b2ar_analysis/combined_features/%s_features.h5" %condition
	verbosedump(features, new_file)
def transform_protein_tica(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    tica_obj_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    protein_tica_mdl = verboseload(tica_obj_path)
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            print("Transforming protein %s" % protein)
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            tica_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    tica_data[os.path.basename(f)] = \
                        protein_tica_mdl.partial_transform(featurized_path)
                except:
                    pass
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(tica_data, 'tica_data.pkl')
                print("Done transforming protein %s" % protein)

    # dumping the tica_mdl again since the eigenspectrum might have been calculated. 
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
Example #23
def read_and_featurize(filename, dihedrals=['chi2'], stride=10):
	#print("reading and featurizing %s" %(filename))
	top = md.load_frame(filename, 0).topology
	#print("got top")
	atom_indices = [a.index for a in top.atoms if a.residue.resSeq == 93 and a.residue != "POPC" and str(a.residue)[0] == "H"]
	print(len(atom_indices))
	#atom_indices = [a.index for a in top.atoms if a.residue.chain.index == 0 and a.residue.resSeq != 93 and a.residue != "POPC" and a.residue.resSeq != 130 and a.residue.resSeq != 172 and a.residue.resSeq != 79 and a.residue.resSeq != 341]
	#print("got indices")
	traj = md.load(filename, stride=1000, atom_indices=atom_indices)
	#print("got traj")
	featurizer = DihedralFeaturizer(types = dihedrals)
	features = featurizer.transform(traj_list = traj)
	#print(np.shape(features))
	#print("finished featurizing")

	directory = filename.split("/")
	condition = directory[len(directory)-2]
	dcd_file = directory[len(directory)-1]
	new_file = "%s_features_stride%d.h5" %(dcd_file.rsplit( ".", 1 )[ 0 ] , stride)
	new_root_dir = "/scratch/users/enf/b2ar_analysis/subsampled_features"
	new_condition_dir = "%s/%s" %(new_root_dir, condition)

	new_file_full = "%s/%s/%s" %(new_root_dir, condition, new_file)
	#print("saving features as %s" %new_file_full)

	verbosedump(features, new_file_full)
	return features
def fit_protein_tica(yaml_file,sparse=False):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params={}
    for i in mdl_params.keys():
        if i.startswith("tica__"):
            current_mdl_params[i.split("tica__")[1]] = mdl_params[i]

    if sparse==True:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except:
                    pass
            print("Done partial fitting to protein %s" % protein)
    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
def sample_from_clusterer(clusterer_file,
                          projected_features_dir,
                          traj_files,
                          n_samples,
                          save_dir,
                          samples_indices_file,
                          structure=None,
                          residue_cutoff=10000,
                          parallel=False,
                          worker_pool=None,
                          lig_name="UNK",
                          reseed_dir=None):
    clusterer = compat_verboseload(clusterer_file)
    n_clusters = len(clusterer.cluster_centers_)

    traj_index_frame_pairs = find_closest_indices_to_cluster_center(
        projected_features_dir, clusterer_file, k=n_samples)
    print(traj_index_frame_pairs)
    print(len(traj_index_frame_pairs))
    sample_cluster_partial = partial(sample_cluster, traj_index_frame_pairs,
                                     traj_files, structure, residue_cutoff,
                                     save_dir, lig_name, reseed_dir)
    if worker_pool is not None:
        worker_pool.map_sync(sample_cluster_partial, range(0, n_clusters))
    elif parallel:
        pool = mp.Pool(mp.cpu_count())
        pool.map(sample_cluster_partial, range(0, n_clusters))
        pool.terminate()
    else:
        for cluster in range(0, n_clusters):
            sample_cluster_partial(cluster)

    verbosedump(traj_index_frame_pairs, samples_indices_file)
def landmark_ktica(features_dir, combined_features_file=None, feature_ext = ".dataset", use_clusters_as_landmarks=True, clusters_map_file = "", 
	landmarks_dir = "", nystroem_components=1000, n_components=10, lag_time=5, nystroem_data_filename = "", 
	fit_model_filename = "", projected_data_filename = "", landmark_subsample=10, 
	sparse = False, shrinkage = 0.05, wolf = False, rho = 0.01):
	'''
features_dir: string, directory where your featurized trajectories are kept. 
combined_features_file: if you have a single file containing all featurized trajectories, i.e. as a list of np arrays, this is it.
feature_ext: if instead of a combined file of features they are in separate files, what is the extension of your feature files? 
use_clusters_as_landmarks: this is if you are doing a composition of tICA --> clustering --> Nystroem --> tICA. this is what I do. 
	if true, you need to feed it a json file containing a dictionary that maps cluster name --> list of 2-tuples, where each tuple has 
	(trajectory_id, frame_number pairs). So this way, instead of choosing landmark points at random in the Nystroem approximation, you
	are using regular linear tICA-driven clustering to choose your landmark points more efficiently. 
landmarks_dir: directory where you will save the landmarks. this should be a file containing a list of 1d np arrays or a 2d array
nystroem_components: the number of landmarks to use. 
n_components: the number of ktICA components to compute.
lag_time: lag time of tICA 
nystroem_data_filename: where you will save Nystroem object
fit_model_filename: the filename of the ktICA object to save.
projected_data_filename: where you will save the features projected with kernel tICA 
landmark_subsample= how frequently to subsample the landmarks if you are doing use_clusters_as_landmarks.
sparse: set to False. 
shrinkage: same as gamma in old version of tICA. you might want to mess with this. 
wolf = False: keep this as true unless you're using Robert's branch of msmbuilder
rho = Ignore this. 

'''

	if not os.path.exists(nystroem_data_filename):
		if combined_features_file is not None:
			features = verboseload(combined_features_file)
		else:
			features = load_file_list(get_trajectory_files(features_dir, ext = feature_ext))

		if os.path.exists(landmarks_dir):
			landmarks = verboseload(landmarks_dir)
			print(np.shape(landmarks))
		else:
			if use_clusters_as_landmarks:
				with open(clusters_map_file) as f:
					clusters_map = json.load(f)
					clusters_map = {int(k):v for k,v in clusters_map.items()}
					landmarks = []
					for cluster_id,sample_list in clusters_map.items():
						for sample in sample_list:
							traj = sample[0]
							frame = sample[1]
							landmark = features[traj][frame]
							landmarks.append(landmark)
							landmarks = [landmarks[i] for i in range(0,np.shape(landmarks)[0]) if i%landmark_subsample==0] #%landmark_subsample == 0]

					verbosedump(landmarks, landmarks_dir)
			else: 
				n = np.shape(features)[0]
				indices = np.random.choice(n, nystroem_components)
				features_concatenated = np.concatenate(features)
				landmarks = features_concatenated[indices,:]
				verbosedump(landmarks, landmarks_dir)

		ktica(features, landmarks, projected_data_filename, nystroem_data_filename, fit_model_filename, sparse, shrinkage, wolf, rho)
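The docstring above describes clusters_map_file as a JSON dictionary mapping each cluster to (trajectory_id, frame_number) pairs, which drive landmark selection when use_clusters_as_landmarks is True. A minimal sketch of that file and of a call using it; the cluster contents and all file paths are hypothetical:

import json

# Hypothetical clusters_map: cluster id -> list of (trajectory_id, frame_number) pairs,
# matching the structure the docstring describes.
clusters_map = {
    "0": [[0, 12], [3, 450]],
    "1": [[1, 88], [2, 7], [5, 1023]],
}
with open("clusters_map.json", "w") as f:
    json.dump(clusters_map, f)

# A call using it (paths are placeholders):
# landmark_ktica("features/", feature_ext=".dataset",
#                use_clusters_as_landmarks=True,
#                clusters_map_file="clusters_map.json",
#                landmarks_dir="landmarks.h5",
#                nystroem_data_filename="nystroem.dataset",
#                fit_model_filename="ktica_model.h5",
#                projected_data_filename="ktica_projected.dataset")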
def featurize_known_traj(traj_dir, inactive, features_dir):
	print("currently featurizing %s" %traj_dir.split("/")[len(traj_dir.split("/"))-1])
	traj = md.load(traj_dir)
	rmsds = rmsd_npxxy(traj, inactive)
	helix6_helix3_distances = helix6_helix3_dist(traj)
	features = np.transpose(np.concatenate([[rmsds], [np.concatenate(helix6_helix3_distances)]]))
	print(np.shape(features))

	filename = "%s/%s" %(features_dir, traj_dir.split("/")[len(traj_dir.split("/"))-1])
	verbosedump(features, filename)
def cluster(data_dir, traj_dir, n_clusters, lag_time):
	clusterer_dir = "/scratch/users/enf/b2ar_analysis/clusterer_%d_t%d.h5" %(n_clusters, lag_time)
	if (os.path.exists(clusterer_dir)):
		print "Already clustered"
	else:
		reduced_data = verboseload(data_dir)
		trajs = np.concatenate(reduced_data)
		clusterer = MiniBatchKMedoids(n_clusters = n_clusters)
		clusterer.fit_transform(reduced_data)
		verbosedump(clusterer, "/scratch/users/enf/b2ar_analysis/clusterer_%d_t%d.h5" %(n_clusters, lag_time))	
Example #29
def build_msm(clusterer_dir, lag_time):
	clusterer = verboseload(clusterer_dir)
	n_clusters = np.shape(clusterer.cluster_centers_)[0]
	labels = clusterer.labels_
	msm_modeler = MarkovStateModel(lag_time=lag_time)
	print("fitting msm to trajectories with %d clusters and lag_time %d" %(n_clusters, lag_time))
	msm_modeler.fit_transform(labels)
	verbosedump(msm_modeler, "/scratch/users/enf/b2ar_analysis/msm_model_%d_clusters_t%d" %(n_clusters, lag_time))
	print("fitted msm to trajectories with %d states" %(msm_modeler.n_states_))
Example #30
def create_fake_data(base_dir, protein_list, project_dict):
    np.random.seed(42)
    for protein in protein_list:
        os.mkdir(protein)
        os.mkdir(os.path.join(protein, "feature_dir"))
        for project in project_dict[protein]:
            os.mkdir(os.path.join(protein,project))
        for i in range(5):
            X = np.random.randn(20, 3)
            verbosedump(X, os.path.join(protein, "feature_dir" ,"%d.jl"%i))
    return
Example #31
def create_fake_data(base_dir, protein_list, project_dict):
    np.random.seed(42)
    for protein in protein_list:
        os.mkdir(protein)
        os.mkdir(os.path.join(protein, "feature_dir"))
        for project in project_dict[protein]:
            os.mkdir(os.path.join(protein, project))
        for i in range(5):
            X = np.random.randn(20, 3)
            verbosedump(X, os.path.join(protein, "feature_dir", "%d.jl" % i))
    return
Example #32
def cluster_kmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)
    if (os.path.exists(clusterer_dir)):
        print("Already clustered")
    else:
        print("Clustering by KMeans")
        reduced_data = verboseload(data_dir)
        trajs = np.concatenate(reduced_data)
        clusterer = KMeans(n_clusters=n_clusters, n_jobs=-1)
        clusterer.fit_transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
def cluster_kmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
	clusterer_dir = "%s/clusterer_%dclusters.h5" %(tica_dir, n_clusters)
	if (os.path.exists(clusterer_dir)):
		print "Already clustered"
	else:
		print "Clustering by KMeans"
		reduced_data = verboseload(data_dir)
		trajs = np.concatenate(reduced_data)
		clusterer = KMeans(n_clusters = n_clusters, n_jobs=-1)
		clusterer.fit_transform(reduced_data)
		verbosedump(clusterer, clusterer_dir)	
def landmark_ktica_ticaTraj(tica_dir, clusterer_dir, ktica_dir, clusters_map_file = "", landmarks_dir = "", nystroem_components=1000, n_components=10, lag_time=5, nystroem_data_filename = "", fit_model_filename = "", projected_data_filename = "", landmark_subsample=1, sparse = False, wolf = True, rho = 0.01, shrinkage = None):
	if not os.path.exists(ktica_dir): os.makedirs(ktica_dir)
	
	if not sparse:
		if shrinkage is None:
			tica_model = tICA(n_components = n_components, lag_time = lag_time)
		else:
			tica_model = tICA(n_components = n_components, lag_time = lag_time, shrinkage = shrinkage)
		
	else:
		if shrinkage is None:
			tica_model = SparseTICA(n_components = n_components, lag_time = lag_time, rho = rho)
		else:
			tica_model = SparseTICA(n_components = n_components, lag_time = lag_time, rho = rho, shrinkage = shrinkage)

	if not os.path.exists(nystroem_data_filename):
		clusterer = verboseload(clusterer_dir)
		tica = verboseload(tica_dir)
		features = tica
		clusters = clusterer.cluster_centers_
		landmarks = clusters

		print("here's what goes into the combined class:")
		#print(np.shape(features))
		print(np.shape(landmarks))
		print(type(landmarks))
		nys = Nystroem(n_components = np.shape(landmarks)[0], basis = landmarks)#np.shape(landmarks)[0])# basis=landmarks)
		nyx = nys.fit_transform(features)
		del features
		del landmarks
		try:
			save_dataset(nyx, nystroem_data_filename)
		except:
			os.system("rm -rf %s" %nystroem_data_filename)
			save_dataset(nyx, nystroem_data_filename)
	else:
		nyx = load_dataset(nystroem_data_filename)

	print(np.shape(nyx))
	print(dir(nyx))

	if not os.path.exists(projected_data_filename):
		fit_model = tica_model.fit(nyx)
		verbosedump(fit_model, fit_model_filename)
		transformed_data = fit_model.transform(nyx)
		del(nyx)
		try:
			save_dataset(transformed_data, projected_data_filename)
		except:
			os.system("rm -rf %s" %projected_data_filename)
			save_dataset(transformed_data, projected_data_filename)
	else:
		print("Already performed landmark kernel tICA.")
def tica_wrapper(proj_folder,feature_dict,lag_time=10):
     #100ps*100==10ns and 10 features
     if os.path.exists(proj_folder+"/tica_features.pkl"):
          return verboseload(proj_folder+"/tica_features.pkl")

     tica_mdl = tICA(lag_time=lag_time,n_components=10)
     tica_mdl.fit([feature_dict[i] for i in feature_dict.keys()])

     tica_features={}
     for i in feature_dict.keys():
          tica_features[i] = tica_mdl.transform([feature_dict[i]])[0]
     verbosedump(tica_features,proj_folder+"/tica_features.pkl")
     return tica_features
def featurize_known_traj(traj_dir, inactive, features_dir):
    print(("currently featurizing %s" %
           traj_dir.split("/")[len(traj_dir.split("/")) - 1]))
    traj = md.load(traj_dir)
    rmsds = rmsd_npxxy(traj, inactive)
    helix6_helix3_distances = helix6_helix3_dist(traj)
    features = np.transpose(
        np.concatenate([[rmsds], [np.concatenate(helix6_helix3_distances)]]))
    print(np.shape(features))

    filename = "%s/%s" % (features_dir,
                          traj_dir.split("/")[len(traj_dir.split("/")) - 1])
    verbosedump(features, filename)
Example #37
def cluster(data_dir, traj_dir, n_clusters, lag_time):
    clusterer_dir = "/scratch/users/enf/b2ar_analysis/clusterer_%d_t%d.h5" % (
        n_clusters, lag_time)
    if (os.path.exists(clusterer_dir)):
        print("Already clustered")
    else:
        reduced_data = verboseload(data_dir)
        trajs = np.concatenate(reduced_data)
        clusterer = MiniBatchKMedoids(n_clusters=n_clusters)
        clusterer.fit_transform(reduced_data)
        verbosedump(
            clusterer, "/scratch/users/enf/b2ar_analysis/clusterer_%d_t%d.h5" %
            (n_clusters, lag_time))
Example #38
def tica_wrapper(proj_folder, feature_dict, lag_time=10):
    #100ps*100==10ns and 10 features
    if os.path.exists(proj_folder + "/tica_features.pkl"):
        return verboseload(proj_folder + "/tica_features.pkl")

    tica_mdl = tICA(lag_time=lag_time, n_components=10)
    tica_mdl.fit([feature_dict[i] for i in feature_dict.keys()])

    tica_features = {}
    for i in feature_dict.keys():
        tica_features[i] = tica_mdl.transform([feature_dict[i]])[0]
    verbosedump(tica_features, proj_folder + "/tica_features.pkl")
    return tica_features
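A note on the lag-time comment above: it presumably means that if frames are saved every 100 ps, a tICA lag_time of 100 frames corresponds to 100 ps * 100 = 10 ns, and n_components=10 supplies the "10 features"; with the default lag_time=10 used here the physical lag would be 1 ns.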
def read_and_featurize_custom(traj_file, condition=None, location=None, dihedral_residues = None, distance_residues = None):
	top = md.load_frame(traj_file,index = 0).topology
	#atom_indices = [a.index for a in top.atoms if a.residue.resSeq != 130]
	atom_indices = [a.index for a in top.atoms]
	traj = md.load(traj_file, atom_indices=atom_indices)
	print(traj_file)
	#print traj
	#print("loaded trajectory")

	'''
	a = time.time()
	featurizer = DihedralFeaturizer(types = ['phi', 'psi', 'chi2'])
	features = featurizer.transform(traj)
	b = time.time()
	#print(b-a)
	print("original features has dim")
	print(np.shape(features))
	'''
	a = time.time()

	
	phi_tuples = phi_indices(traj.topology, dihedral_residues)
	psi_tuples = psi_indices(traj.topology, dihedral_residues)
	chi2_tuples = chi2_indices(traj.topology, dihedral_residues)

	#if distance_residues is not None:

	

	#print("new features has dim %d" %(2*len(phi_tuples) + 2*len(psi_tuples) + 2*len(chi2_tuples)))

	#print("feauturizing manually:")

	phi_angles = np.transpose(ManualDihedral.compute_dihedrals(traj=traj,indices=phi_tuples))
	psi_angles = np.transpose(ManualDihedral.compute_dihedrals(traj=traj,indices=psi_tuples))
	chi2_angles = np.transpose(ManualDihedral.compute_dihedrals(traj=traj,indices=chi2_tuples))
	
	manual_features = np.concatenate([np.sin(phi_angles), np.cos(phi_angles), np.sin(psi_angles), np.cos(psi_angles), np.sin(chi2_angles), np.cos(chi2_angles)])
	b = time.time()
	#print(b-a)

	print("new features has shape: ")
	print(np.shape(manual_features))

	if condition is None:
		condition = get_condition(traj_file)

	if location is None:
		location = "/scratch/users/enf/b2ar_analysis/features_allprot"

	verbosedump(manual_features, "%s/%s.h5" %(location, condition))
def cluster_minikmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
	clusterer_dir = "%s/clusterer_%dclusters.h5" %(tica_dir, n_clusters)
	if (os.path.exists(clusterer_dir)):
		print "Already clustered"
	else:
		print "Clustering by KMeans"
		try:
			reduced_data = verboseload(data_dir)
		except:
			reduced_data = load_dataset(data_dir)
		trajs = np.concatenate(reduced_data)
		clusterer = MiniBatchKMeans(n_clusters = n_clusters)
		clusterer.fit_transform(reduced_data)
		verbosedump(clusterer, clusterer_dir)
Example #41
def macrostate_pcca(msm_file, clusterer_file, n_macrostates, macrostate_dir):

    msm = verboseload(msm_file)
    clusterer = verboseload(clusterer_file)

    #pcca = lumping.PCCAPlus.from_msm(msm = msm,n_macrostates = n_macrostates)
    #macrostate_model = MarkovStateModel()
    #macrostate_model.fit(pcca.transform(labels))

    pcca_object = lumping.PCCA(n_macrostates=n_macrostates)
    pcca_object.fit(sequences=clusterer.labels_)
    #pcca_object.transform(sequences = clusterer.labels_)
    #macrostate_model = pcca_object.from_msm(msm = msm, n_macrostates = n_macrostates)
    print(pcca_object)
    print((pcca_object.microstate_mapping_))
    verbosedump(pcca_object, macrostate_dir)
def transform_protein_kmeans(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    kmeans_mdl = verboseload(kmeans_mdl_path)
    for protein in yaml_file["protein_list"]:
        print("Assigning protein %s" % protein)
        with enter_protein_mdl_dir(yaml_file, protein):
            tica_data = verboseload("tica_data.pkl")
            # do assignments
            assignments = {}
            for i in tica_data.keys():
                assignments[i] = kmeans_mdl.predict([tica_data[i]])[0]
            verbosedump(assignments, 'assignments.pkl')

            print("Done assigning %s" % protein)
    return
Example #43
def macrostate_pcca(msm_file, clusterer_file, n_macrostates, macrostate_dir):

	msm = verboseload(msm_file)
	clusterer = verboseload(clusterer_file)

	#pcca = lumping.PCCAPlus.from_msm(msm = msm,n_macrostates = n_macrostates)
	#macrostate_model = MarkovStateModel()
	#macrostate_model.fit(pcca.transform(labels))

	pcca_object = lumping.PCCA(n_macrostates = n_macrostates)
	pcca_object.fit(sequences = clusterer.labels_)
	#pcca_object.transform(sequences = clusterer.labels_)
	#macrostate_model = pcca_object.from_msm(msm = msm, n_macrostates = n_macrostates)
	print(pcca_object)
	print(pcca_object.microstate_mapping_)
	verbosedump(pcca_object, macrostate_dir)
def normalize_project_series(yaml_file, output_folder="normalized_features",
                             stride=40,nrm=None):
    """
    routine to take a set of protein features stored in the feature_dir and
    normalize them by removing the mean and setting the variance to 1 using the
    standard scaler. The normalizer is dumped into the mdl dir.
    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump normalized features in
    :param stride: The initial stride in files to fit the normalizer with.
    This is necessary to prevent memory errors; defaults to every 40th file.
    :param nrm: previously fit normalizer; otherwise it uses the standard scaler from
    scikit-learn
    :return:
    """
    yaml_file = load_yaml_file(yaml_file)
    #setup normalizer
    if nrm is None:
        nrm = preprocessing.StandardScaler()
        all_data = {}
        for prt in yaml_file["protein_list"]:
            with enter_protein_data_dir(yaml_file, prt):
                print(prt)
                flist = glob.glob("./%s/*.jl"%(yaml_file["feature_dir"]))[::stride]
                for f in flist:
                     all_data[f]=verboseload(f)

        seq=[]
        for i in all_data.keys():
           seq.extend(all_data[i])

        #fit it
        nrm.fit(seq)
        #dump it into the mdl dir.
        verbosedump(nrm,"%s/nrm.h5"%yaml_file["mdl_dir"])

    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)

        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl"%(yaml_file["feature_dir"]))
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res,"%s/%s"%(output_folder_path, os.path.basename(f)))

    return
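The docstring describes the normalization step: fit a scaler on a strided subset of feature files, then transform every file with it. A minimal standalone sketch of that pattern with scikit-learn; the array shapes are illustrative only:

import numpy as np
from sklearn import preprocessing

# Fit the scaler on a (strided) reference set of frames ...
reference = np.random.randn(1000, 50)          # frames x features, illustrative shapes only
nrm = preprocessing.StandardScaler().fit(reference)

# ... then apply the same scaler to each trajectory's features, so every
# feature ends up with zero mean and unit variance over the reference set.
traj_features = np.random.randn(200, 50)
normalized = nrm.transform(traj_features)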
Example #45
def normalize_project_series(yaml_file, output_folder="normalized_features",
                             stride=1,nrm=None):
    """
    routine to take a set of protein features stored in the feature_dir and
    normalize them by removing the median and scaling to the interquartile range
    using the robust scaler. The normalizer is dumped into the mdl dir.
    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump normalized features in
    :param stride: The initial stride in files to fit the normalizer with.
    This can help prevent memory errors; defaults to every file (stride=1).
    :param nrm: previously fit normalizer; otherwise it uses the robust scaler from
    scikit-learn
    :return:
    """
    yaml_file = load_yaml_file(yaml_file)
    #setup normalizer
    if nrm is None:
        nrm = preprocessing.RobustScaler()
        all_data = {}
        for prt in yaml_file["protein_list"]:
            with enter_protein_data_dir(yaml_file, prt):
                print(prt)
                flist = glob.glob("./%s/*.jl"%(yaml_file["feature_dir"]))[::stride]
                for f in flist:
                     all_data[f]=verboseload(f)

        seq=[]
        for i in all_data.keys():
           seq.extend(all_data[i])

        #fit it
        nrm.fit(seq)
        #dump it into the mdl dir.
        verbosedump(nrm,"%s/nrm.h5"%yaml_file["mdl_dir"])

    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)

        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl"%(yaml_file["feature_dir"]))
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res,"%s/%s"%(output_folder_path, os.path.basename(f)))

    return
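This variant swaps in a RobustScaler: scikit-learn's RobustScaler centers each feature on its median and scales by the interquartile range, which makes the fit less sensitive to outlier frames than the mean/variance StandardScaler. A two-line sketch; the call below is a hypothetical usage of the nrm argument:

from sklearn import preprocessing

nrm = preprocessing.RobustScaler()   # centers on the median, scales by the interquartile range
# normalize_project_series("mdl.yaml", nrm=nrm)   # pass a pre-built scaler via the nrm argument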
def cluster_project_wrapper(proj_folder,feature_dict,n_states):

     if os.path.exists(proj_folder+"/assignments.pkl"):
          return verboseload(proj_folder+"/cluster_mdl.pkl"),verboseload(proj_folder+"/assignments.pkl")
     elif os.path.exists(proj_folder+"/cluster_mdl.pkl"):
          cluster_mdl = verboseload(proj_folder+"/cluster_mdl.pkl")
     else:
          cluster_mdl = KMeans(n_clusters = n_states)
          cluster_mdl.fit([feature_dict[i] for i in feature_dict.keys()])

     assignments={}
     for i in feature_dict.keys():
          assignments[i] = cluster_mdl.transform([feature_dict[i]])

     verbosedump(cluster_mdl,proj_folder+"/cluster_mdl.pkl")
     verbosedump(assignments,proj_folder+"/assignments.pkl")
     return cluster_mdl,assignments
Example #47
def resample_by_msm(total_samples,
                    msm_object,
                    clusters_map,
                    num_trajs,
                    save_file,
                    equilibrium_populations=None):
    if equilibrium_populations is None:
        equilibrium_populations = msm_object.populations_

    num_to_sample_per_cluster = {}
    for cluster_id in msm_object.mapping_.keys():
        state_id = msm_object.mapping_[cluster_id]
        num_to_sample_per_cluster[cluster_id] = int(np.rint(
            equilibrium_populations[state_id] * total_samples))

    print(
        "Found number to sample per cluster based on equilibrium proportions."
    )
    sample_pairs = []
    for cluster_id in msm_object.mapping_.keys():
        traj_index_pairs = list(clusters_map[cluster_id])
        if len(traj_index_pairs) == 0:
            continue
        num_to_sample = num_to_sample_per_cluster[cluster_id]
        random_indices = np.random.choice(range(0, len(traj_index_pairs)),
                                          size=num_to_sample,
                                          replace=True)
        clusters_sample_pairs = [traj_index_pairs[i] for i in random_indices]
        sample_pairs += clusters_sample_pairs

    print(
        "Obtained random (trajectory, frame) pairs based on equilibrium populations"
    )
    #if there exists some fancy numpy way to index a 3d array by 2d tuples, then great, else:
    traj_to_frames = {}
    for i in range(0, num_trajs):
        traj_to_frames[i] = []

    for sample_pair in sample_pairs:
        traj_to_frames[sample_pair[0]].append(sample_pair[1])

    print("Rearranged equilibrium sampled frames based on trajectories")

    if save_file is not None:
        verbosedump(traj_to_frames, save_file)
    return traj_to_frames
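The prints above describe the resampling scheme: each cluster contributes frames in proportion to its MSM equilibrium population. A worked example of that arithmetic with made-up numbers:

import numpy as np

total_samples = 1000
population = 0.253                     # hypothetical equilibrium population of one state
n_frames = int(np.rint(population * total_samples))
print(n_frames)                        # 253 frames drawn, with replacement, from that cluster's pairs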
def ktica(features, landmarks, projected_data_filename, nystroem_data_filename, fit_model_filename, sparse = False, shrinkage = 0.05, wolf = True, rho = 0.01, n_components = 10, lag_time = 5):
	if not sparse:
		if shrinkage is None:
			tica_model = tICA(n_components = n_components, lag_time = lag_time)
		else:
			if wolf:
				tica_model = tICA(n_components = n_components, lag_time = lag_time, shrinkage = shrinkage)
			else:
				tica_model = tICA(n_components = n_components, lag_time = lag_time, gamma = shrinkage)

		
	else:
		if shrinkage is None:
			tica_model = SparseTICA(n_components = n_components, lag_time = lag_time, rho = rho)
		else:
			tica_model = SparseTICA(n_components = n_components, lag_time = lag_time, rho = rho, shrinkage = shrinkage)


	if not os.path.exists(nystroem_data_filename):
		nys = Nystroem(n_components = np.shape(landmarks)[0], basis = landmarks)#np.shape(landmarks)[0])# basis=landmarks)
		nyx = nys.fit_transform(features)
		print("Computed Nystroem.")
		del features
		del landmarks
		try:
			save_dataset(nyx, nystroem_data_filename)
		except:
			os.system("rm -rf %s" %nystroem_data_filename)
			save_dataset(nyx, nystroem_data_filename)
	else:
		nyx = load_dataset(nystroem_data_filename)
		print("Loaded Nystroem")

	if not os.path.exists(projected_data_filename):
		fit_model = tica_model.fit(nyx)
		verbosedump(fit_model, fit_model_filename)
		transformed_data = fit_model.transform(nyx)
		del(nyx)
		try:
			save_dataset(transformed_data, projected_data_filename)
		except:
			os.system("rm -rf %s" %projected_data_filename)
			save_dataset(transformed_data, projected_data_filename)
	else:
		print("Already performed landmark kernel tICA.")
Example #49
def resample_features_by_msm_equilibirum_pop(features,
                                             traj_to_frames,
                                             save_file=None):
    resampled_features = []
    for traj_index, frames in traj_to_frames.items():
        if isinstance(features[0], pd.DataFrame):
            resampled_features.append(features[traj_index].iloc[frames])
        else:
            resampled_features.append(features[traj_index][frames, :])

    if isinstance(features[0], pd.DataFrame):
        resampled_features = pd.concat(resampled_features, axis=0)
    else:
        resampled_features = np.concatenate(resampled_features)

    if save_file is not None:
        verbosedump(resampled_features, save_file)
    else:
        return resampled_features
Example #50
def cluster_project_wrapper(proj_folder, feature_dict, n_states):

    if os.path.exists(proj_folder + "/assignments.pkl"):
        return verboseload(proj_folder +
                           "/cluster_mdl.pkl"), verboseload(proj_folder +
                                                            "/assignments.pkl")
    elif os.path.exists(proj_folder + "/cluster_mdl.pkl"):
        cluster_mdl = verboseload(proj_folder + "/cluster_mdl.pkl")
    else:
        cluster_mdl = KMeans(n_clusters=n_states)
        cluster_mdl.fit([feature_dict[i] for i in feature_dict.keys()])

    assignments = {}
    for i in feature_dict.keys():
        assignments[i] = cluster_mdl.transform([feature_dict[i]])

    verbosedump(cluster_mdl, proj_folder + "/cluster_mdl.pkl")
    verbosedump(assignments, proj_folder + "/assignments.pkl")
    return cluster_mdl, assignments
Example #51
def sample_msm_traj(yaml_file,
                    prt_name,
                    n_steps,
                    starting_state=None,
                    fname="msm_traj.xtc",
                    msm_traj=None,
                    scheme='random'):
    """
    :param yaml_file: The model's yaml file
    :param prt_name: The name of the protein mdl
    :param n_steps: The number of Markovian frames desired.
    :param starting_state: If None, we start from the most populated state.
    :param fname: The output filename
    :param msm_traj: output of msm.sample_discrete. Supplying this lets you
    sample along an arbitrary (e.g. random) discrete trajectory.
    :return: Dumps the msm traj.
    """

    yaml_file = load_yaml_file(yaml_file)
    ser = ProteinSeries(yaml_file)
    prt = Protein(ser, prt_name)

    if msm_traj is None:
        # this returns in original assignment space
        msm_traj = prt.msm.sample_discrete(state=starting_state,
                                           n_steps=n_steps)
    # there we use the original assignment matrix too
    key_mapping, assignment_matrix = create_assignment_matrix(prt.assignments)

    jbs = [(state, assignment_matrix, key_mapping, ser.base_dir, prt.name,
            yaml_file["protein_dir"]) for state in msm_traj]
    p = Pool(int(cpu_count() / 4))
    trj_list = p.map(_random_sample_state, jbs)
    print("Done")
    trj = trj_list[0] + trj_list[1:]

    with enter_protein_mdl_dir(yaml_file, prt_name):
        verbosedump(msm_traj, "msm_traj.pkl")
        trj.save_xtc(fname)
        if not os.path.isfile("prot.pdb"):
            trj[0].save_pdb("prot.pdb")
    return
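A hedged call sketch for the routine above; the yaml path, protein name, state index, and step count are placeholders:

# sample_msm_traj("mdl.yaml", "protein_A", n_steps=5000)            # start from the most populated state
# sample_msm_traj("mdl.yaml", "protein_A", n_steps=5000,
#                 starting_state=42, fname="state42_traj.xtc")      # or choose the starting state explicitly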
def standardize_features(features_dir, features_ext, standardized_features_dir):
	if not os.path.exists(standardized_features_dir): os.makedirs(standardized_features_dir)
	feature_files = get_trajectory_files(features_dir, features_ext)
	features = load_file_list(feature_files)
	concatenated_features = np.concatenate(features)
	means = np.mean(concatenated_features, axis = 0)
	stdevs = np.std(concatenated_features, axis = 0)
	standardized_features = []
	for X in features: 
		X -= means
		X /= stdevs 
		standardized_features.append(X)

	print("Finished standardizing features")
	for i in range(0, len(feature_files)):
		filename = feature_files[i].split("/")[len(feature_files[i].split("/"))-1]
		new_filename = "%s/%s" %(standardized_features_dir, filename)
		verbosedump(standardized_features[i], new_filename)
	print("Finished saving all standardized features")
	return 
def transform_protein_kmeans(yaml_file,pca=False):
    mdl_dir = yaml_file["mdl_dir"]
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    kmeans_mdl = verboseload(kmeans_mdl_path)
    for protein in yaml_file["protein_list"]:
        print("Assigning protein %s" % protein)
        with enter_protein_mdl_dir(yaml_file, protein):
            if pca:
                tica_data = verboseload("pca_data.pkl")
            else:
                tica_data = verboseload("tica_data.pkl")
 
            # do assignments
            assignments = {}
            for i in tica_data.keys():
                assignments[i] = kmeans_mdl.predict([tica_data[i]])[0]
            verbosedump(assignments, 'assignments.pkl')

            print("Done assigning %s" % protein)
    return
Example #54
def generate_msm_traj_index_series(msm_object,
                                   start_cluster,
                                   n_steps,
                                   clusters_map,
                                   save_file=None):
    inv_map = {v: k for k, v in msm_object.mapping_.items()}
    msm_trajectory = msm_object.sample_discrete(state=start_cluster,
                                                n_steps=n_steps)

    traj_index_pairs = []
    clusters = []
    for state in msm_trajectory:
        cluster = state  #inv_map[state]
        clusters.append(cluster)
        traj_index_pair = random.choice(list(clusters_map[cluster]))
        traj_index_pairs.append(traj_index_pair)

    if save_file is not None:
        verbosedump(traj_index_pairs, save_file)
    return traj_index_pairs, clusters
Example #55
def build_msm(clusterer_dir,
              lag_time,
              msm_model_dir,
              prior_counts=0.0,
              ergodic_cutoff='on'):
    clusterer = verboseload(clusterer_dir)
    n_clusters = np.shape(clusterer.cluster_centers_)[0]
    labels = clusterer.labels_
    msm_modeler = MarkovStateModel(lag_time=lag_time,
                                   prior_counts=prior_counts,
                                   ergodic_cutoff=ergodic_cutoff)
    print(("fitting msm to trajectories with %d clusters and lag_time %d" %
           (n_clusters, lag_time)))
    msm_modeler.fit_transform(labels)
    print(msm_modeler)
    verbosedump(msm_modeler, msm_model_dir)
    print(("fitted msm to trajectories with %d states" %
           (msm_modeler.n_states_)))
    return msm_modeler
Example #56
def build_msm(clusterer_dir, lag_time):
    clusterer = verboseload(clusterer_dir)
    n_clusters = np.shape(clusterer.cluster_centers_)[0]
    labels = clusterer.labels_
    msm_modeler = MarkovStateModel(lag_time=lag_time)
    print(("fitting msm to trajectories with %d clusters and lag_time %d" %
           (n_clusters, lag_time)))
    msm_modeler.fit_transform(labels)
    verbosedump(
        msm_modeler,
        "/scratch/users/enf/b2ar_analysis/msm_model_%d_clusters_t%d" %
        (n_clusters, lag_time))
    print(("fitted msm to trajectories with %d states" %
           (msm_modeler.n_states_)))
    #np.savetxt("/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_transmat.csv" %(n_clusters, lag_time), msm_modeler.transmat_, delimiter=",")
    #G = nx.from_numpy_matrix(msm_modeler.transmat_)
    #nx.write_edgelist(G, "/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_edgelist" %(n_clusters, lag_time), msm_modeler.transmat_, delimiter=",")
    transmat = msm_modeler.transmat_

    mapping = msm_modeler.mapping_

    edges = open(
        "/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_edgelist.csv" %
        (n_clusters, lag_time), "wb")
    for i in range(0, msm_modeler.n_states_):
        if i == 0:
            for j in range(0, msm_modeler.n_states_):
                edges.write(";")
                edges.write("%d" % mapping[j])
            edges.write("\n")

        edges.write("%d" % (mapping[i]))
        for j in range(0, msm_modeler.n_states_):
            prob = transmat[i][j]
            edges.write(";")
            if prob > 0.000001:
                edges.write("%f" % prob)
            else:
                edges.write("0")
        edges.write("\n")
    edges.close()
def fit_bootstrap(yaml_file,pool=None):
    mdl_params = yaml_file["mdl_params"]
    current_mdl_params={}
    for i in mdl_params.keys():
        if i.startswith("msm__"):
            current_mdl_params[i.split("msm__")[1]] = mdl_params[i]

    if "bootstrap__n_samples" in mdl_params.keys():
        bootstrap__n_samples = mdl_params["bootstrap__n_samples"]
    else:
        bootstrap__n_samples = 100
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            print(protein)
            assignments = verboseload("assignments.pkl")
            msm_mdl =BootStrapMarkovStateModel(n_samples= bootstrap__n_samples, n_procs=2,
                                               msm_args = current_mdl_params
                                               )
            msm_mdl.fit([assignments[i] for i in assignments.keys()], pool=pool)
            verbosedump(msm_mdl, "bootstrap_msm_mdl.pkl")
            verbosedump(msm_mdl.mle_, "msm_mdl.pkl")
            fixed_assignments = {}
            for i in assignments.keys():
                fixed_assignments[i] = msm_mdl.mle_.transform(
                    assignments[i], mode='fill')[0]
            verbosedump(fixed_assignments, 'fixed_assignments.pkl')
    return            
def featurize_pnas_distance_pdbs(traj_dir, new_filename, features_dir, inactive_dir, active_dir, inactive_distances_dir, active_distances_dir, coords_dir):
	#if not os.path.exists(features_dir): os.makedirs(features_dir)

	inactive = md.load(inactive_dir)
	active = md.load(active_dir)

	agonist_bound = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
	samples = get_trajectory_files(traj_dir, ext = ".pdb")
	pool = mp.Pool(mp.cpu_count())
	trajs = pool.map(load_pdb_traj, samples)
	trajs_joined = trajs[0].join(trajs[1:])

	trajs_joined.save_hdf5(new_filename)

	features = compute_pnas_coords_and_distance(new_filename, inactive, active)

	coords = [f[0] for f in features]
	inactive_distances = [f[1][0] for f in features]
	active_distances = [f[1][1] for f in features]

	verbosedump(coords, coords_dir)
	verbosedump(inactive_distances, inactive_distances_dir)
	verbosedump(active_distances, active_distances_dir)

	print("Completed featurizing")
def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10, n_components = 5):
	if not os.path.exists(model_dir):
		os.makedirs(model_dir)

	projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" %model_dir
	fit_model_filename  = "%s/phi_psi_chi2_allprot_tica_coords.h5" %model_dir
	#active_pdb_file = "/scratch/users/enf/b2ar_analysis/renamed_topologies/A-00.pdb"

	tica_model = tICA(n_components = n_components, lag_time = lag_time)

	if not os.path.exists(projected_data_filename):
		print("loading feature files")
		feature_files = get_trajectory_files(features_directory, ext = ".h5")
		pool = mp.Pool(mp.cpu_count())
		features = pool.map(load_features, feature_files)
		pool.terminate()
		if not os.path.exists(fit_model_filename):
			print("fitting data to tICA model")
			fit_model = tica_model.fit(features)
			verbosedump(fit_model, fit_model_filename)
			transformed_data = fit_model.transform(features)
			verbosedump(transformed_data, projected_data_filename)
		else:
			print("loading tICA model")
			fit_model = verboseload(fit_model_filename)
			print("transforming")
			transformed_data = fit_model.transform(features)
			verbosedump(transformed_data, projected_data_filename)
	else:
		fit_model = verboseload(fit_model_filename)
		transformed_data = verboseload(projected_data_filename)

	print(fit_model.summarize())