import numpy as np
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import verboseload, verbosedump


def build_msm(clusterer_dir, lag_time):
    clusterer = verboseload(clusterer_dir)
    n_clusters = np.shape(clusterer.cluster_centers_)[0]
    labels = clusterer.labels_
    msm_modeler = MarkovStateModel(lag_time=lag_time)
    print("fitting msm to trajectories with %d clusters and lag_time %d"
          % (n_clusters, lag_time))
    msm_modeler.fit_transform(labels)
    verbosedump(msm_modeler,
                "/scratch/users/enf/b2ar_analysis/msm_model_%d_clusters_t%d"
                % (n_clusters, lag_time))
    print("fitted msm to trajectories with %d states" % (msm_modeler.n_states_))
    #np.savetxt("/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_transmat.csv" %(n_clusters, lag_time), msm_modeler.transmat_, delimiter=",")
    #G = nx.from_numpy_matrix(msm_modeler.transmat_)
    #nx.write_edgelist(G, "/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_edgelist" %(n_clusters, lag_time), msm_modeler.transmat_, delimiter=",")

    # Write the transition matrix as a semicolon-delimited table, with the
    # original cluster ids as row/column headers (text mode: the writes below
    # are str, not bytes).
    transmat = msm_modeler.transmat_
    mapping = msm_modeler.mapping_
    edges = open("/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_edgelist.csv"
                 % (n_clusters, lag_time), "w")
    for i in range(0, msm_modeler.n_states_):
        if i == 0:
            # header row: one column per state
            for j in range(0, msm_modeler.n_states_):
                edges.write(";")
                edges.write("%d" % mapping[j])
            edges.write("\n")
        edges.write("%d" % (mapping[i]))
        for j in range(0, msm_modeler.n_states_):
            prob = transmat[i][j]
            edges.write(";")
            if prob > 0.000001:
                edges.write("%f" % prob)
            else:
                edges.write("0")
        edges.write("\n")
    edges.close()
# Variant of build_msm that takes the MSM output path and count-matrix options
# explicitly and returns the fitted model.
def build_msm(clusterer_dir, lag_time, msm_model_dir,
              prior_counts=0.0, ergodic_cutoff='on'):
    clusterer = verboseload(clusterer_dir)
    n_clusters = np.shape(clusterer.cluster_centers_)[0]
    labels = clusterer.labels_
    msm_modeler = MarkovStateModel(lag_time=lag_time, prior_counts=prior_counts,
                                   ergodic_cutoff=ergodic_cutoff)
    print("fitting msm to trajectories with %d clusters and lag_time %d"
          % (n_clusters, lag_time))
    msm_modeler.fit_transform(labels)
    print(msm_modeler)
    verbosedump(msm_modeler, msm_model_dir)
    print("fitted msm to trajectories with %d states" % (msm_modeler.n_states_))
    return msm_modeler
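# Usage sketch for build_msm (hypothetical file names; the clusterer pickle is
# assumed to have been written earlier with msmbuilder.utils.verbosedump):
msm = build_msm("clusterer_300clusters.h5", lag_time=10,
                msm_model_dir="msm_model_300clusters_t10.h5")
print(msm.timescales_)  # implied relaxation timescales, in frames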
import numpy as np
import msmexplorer as msme
from msmbuilder.example_datasets import FsPeptide
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel

rs = np.random.RandomState(42)

# Load Fs Peptide Data
trajs = FsPeptide().get().trajectories

# Extract Backbone Dihedrals
featurizer = DihedralFeaturizer(types=['chi1'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=2)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2)
assignments = msm.fit_transform(clustered_trajs)

# Plot Stacked Distributions
a = np.concatenate(assignments, axis=0)
d = np.concatenate(diheds, axis=0)

# Plot Stacked Distributions of the sine of each Chi1 angle
# within an arbitrary set of states {2, 5, 0}
path_data = [d[a == i][:, ::2] for i in [2, 5, 0]]
msme.plot_stackdist(path_data)
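# After fitting, the MSM exposes its stationary distribution and implied
# timescales; a quick sanity check on the model built above (sketch):
print(msm.n_states_)     # number of states kept after ergodic trimming
print(msm.populations_)  # equilibrium population of each state
print(msm.timescales_)   # implied timescales, in frames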
import glob
import pickle

import numpy as np
import msmbuilder.cluster
from msmbuilder.decomposition import tICA
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump  # used below in place of the authors' io.dump helper

lag_time = 10
n_clusters = 2000
sys = 'Src-'
n_timescales = 10
lagTime = 50  # 5 ns (0.1 ns per frame)
n_components = 8  # not defined in the original excerpt; assumed here

# loading the data
dataset = []
for file in glob.glob('highRMSF_phi_psi/*.npy'):
    a = np.array(np.load(file))
    dataset.append(a)

# building tica
tica = tICA(n_components=n_components, lag_time=lag_time)
tica.fit(dataset)
tica_traj = tica.transform(dataset)
pickle.dump(tica, open(sys + '_tICs_' + str(n_components) + '.pkl', 'wb'))

# clustering
states = msmbuilder.cluster.KMeans(n_clusters=n_clusters)
states.fit(tica_traj)
dump(states,
     sys + '_tICs_' + str(n_components) + 'nCluster_' + str(n_clusters) + '.pkl')

# making MSM (the original indexed an undefined `cl`; the fitted clusterer is `states`)
msm = MarkovStateModel(lag_time=lagTime, n_timescales=n_timescales)
msm.fit_transform(states.labels_)
dump(msm, 'MSM' + sys)
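# Sanity check on the fitted MSM (sketch): the slowest implied timescales should
# be much longer than the lag time, otherwise lagTime=50 frames is too short.
print(msm.timescales_[:3] * 0.1)  # frames -> ns, assuming 0.1 ns/frame as above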
# (The opening of the clusterer construction was truncated in the source; the
# keyword arguments below belong to msmbuilder.cluster.LandmarkAgglomerative.
# np, plt, MarkovStateModel, trajs, and to_ns are set up earlier in the script.)
    linkage='ward',
    metric='rmsd',
    landmark_strategy='stride',
    random_state=None,
    max_landmarks=None,
    ward_predictor='ward')
ctrajs = clusterer.fit_transform(trajs)

# Scan lag times (converted from ns to frames via to_ns) and record the implied
# timescales and transition-matrix eigenvalues at each lag.
lags = (np.arange(1, 50, 1) / to_ns).astype(int)
n_timescales = 50
timescales = np.zeros((lags.shape[0], n_timescales))
eigenvalues = np.zeros((lags.shape[0], n_timescales))
for idx, lag in enumerate(lags):
    msm = MarkovStateModel(lag_time=lag, n_timescales=n_timescales)
    msm.fit_transform(ctrajs)
    timescales[idx] = msm.timescales_
    eigenvalues[idx] = msm.eigenvalues_[1:]

for idx in range(n_timescales):
    plt.plot(lags * to_ns, timescales.T[idx])
plt.savefig('figures/rmsd_timescales.png')
plt.ylim((0, int(np.max(timescales.T[1]))))
plt.savefig('figures/rmsd_timescales-detail.png')
plt.clf()

for idx in range(n_timescales):
    plt.plot(lags * to_ns, eigenvalues.T[idx])
plt.savefig('figures/rmsd_eigenvalues.png')
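# The two arrays recorded above are related by t_i(tau) = -tau / ln(lambda_i(tau)),
# which is how msmbuilder defines its implied timescales; a consistency check
# (sketch, meaningful only where the eigenvalues are positive):
with np.errstate(divide='ignore', invalid='ignore'):
    recomputed = -lags[:, None] / np.log(eigenvalues)
mask = eigenvalues > 0
print(np.nanmax(np.abs(recomputed - timescales)[mask]))  # ~0 within round-off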
import os
import re

import networkx as nx

# convert_csv_to_map_nocombine is the authors' CSV-parsing helper, defined
# elsewhere in this module.  graph.node[...] below is the networkx 1.x API
# (networkx >= 2.0 renamed it to graph.nodes[...]).


def construct_graph(msm_modeler_dir, clusterer_dir, n_clusters,
                    tica_lag_time=5, msm_lag_time=10,
                    graph_file="~/graph_file.graphml",
                    msm_object=None, clusterer_object=None,
                    inactive=None, active=None,
                    pnas_clusters_averages=None, tica_clusters_averages=None,
                    docking=None, macrostate=None,
                    cluster_attributes=None, msm_attributes=None,
                    min_prob=1e-4):
    """
    Construct a .graphml graph based on an MSM and attributes of clusters
    and/or MSM states. Saves the .graphml graph to disk and returns it as well.

    *needs the networkx python package to use*

    Parameters
    ----------
    msm_modeler_dir: location on disk of verboseload-loadable msm object
    clusterer_dir: location on disk of verboseload-loadable clusterer object
    n_clusters: number of clusters
    tica_lag_time: tica lag time
    msm_lag_time: msm lag time
    graph_file: location on disk for saving the graphml file
    msm_object: pass msm object directly instead of loading from disk
    clusterer_object: pass clusterer object directly instead of loading from disk
    cluster_attributes: dictionary that maps names of attributes to lists of
        size n_clusters, where each entry in the list is the value of that
        attribute for that cluster. For example, if n_clusters=3, an example
        cluster_attributes dict might be:
            cluster_attributes = {'tyr75-his319_dist': [7.0, 6.0, 8.0],
                                  'phe289-chi2': [90.0, 93.0, 123.2]}
    msm_attributes: dictionary that maps names of attributes to lists of size
        n_msm_states, where each entry in the list is the value of that
        attribute for that msm state. For example, if n_msm_states=3, an
        example msm_attributes dict might be:
            msm_attributes = {'tyr75-his319_dist': [7.0, 6.0, 8.0],
                              'phe289-chi2': [90.0, 93.0, 123.2]}
    """
    if clusterer_object is None:
        clusterer = verboseload(clusterer_dir)
    else:
        clusterer = clusterer_object
    n_clusters = np.shape(clusterer.cluster_centers_)[0]
    labels = clusterer.labels_

    if not os.path.exists(msm_modeler_dir):
        if msm_object is not None:
            msm_modeler = msm_object
        else:
            msm_modeler = MarkovStateModel(lag_time=msm_lag_time, n_timescales=5,
                                           sliding_window=True, verbose=True)
        print("fitting msm to trajectories with %d clusters and lag_time %d"
              % (n_clusters, msm_lag_time))
        msm_modeler.fit_transform(labels)
        verbosedump(msm_modeler, msm_modeler_dir)
    else:
        msm_modeler = verboseload(msm_modeler_dir)

    # Build a directed graph whose edges carry the MSM transition probabilities,
    # keyed by the original (pre-trimming) cluster ids.
    graph = nx.DiGraph()
    mapping = msm_modeler.mapping_
    inv_mapping = {v: k for k, v in mapping.items()}
    transmat = msm_modeler.transmat_
    for i in range(0, msm_modeler.n_states_):
        for j in range(0, msm_modeler.n_states_):
            prob = transmat[i][j]
            if prob < min_prob:
                continue
            original_i = inv_mapping[i]
            original_j = inv_mapping[j]
            graph.add_edge(original_i, original_j, prob=float(prob),
                           inverse_prob=1.0 / float(prob))
    print("Number of nodes in graph:")
    print(graph.number_of_nodes())

    if inactive is not None:
        scores = convert_csv_to_map_nocombine(inactive)
        for cluster in scores.keys():
            cluster_id = int(cluster[7:len(cluster)])
            if cluster_id in graph.nodes():
                score = scores[cluster][0]
                graph.node[cluster_id]["inactive_pnas"] = score

    if active is not None:
        scores = convert_csv_to_map_nocombine(active)
        for cluster in scores.keys():
            cluster_id = int(re.search(r'\d+', cluster).group())
            if cluster_id in graph.nodes():
                score = scores[cluster][0]
                graph.node[cluster_id]["active_pnas"] = score

    if pnas_clusters_averages is not None:
        scores = convert_csv_to_map_nocombine(pnas_clusters_averages)
        for cluster in scores.keys():
            cluster_id = int(re.search(r'\d+', cluster).group())
            if cluster_id in graph.nodes():
                graph.node[cluster_id]["tm6_tm3_dist"] = scores[cluster][0]
                graph.node[cluster_id]["rmsd_npxxy_active"] = scores[cluster][2]
                graph.node[cluster_id]["rmsd_connector_active"] = scores[cluster][4]

    if tica_clusters_averages is not None:
        scores = convert_csv_to_map_nocombine(tica_clusters_averages)
        for cluster in scores.keys():
            cluster_id = int(re.search(r'\d+', cluster).group())
            if cluster_id in graph.nodes():
                for i in range(0, len(scores[cluster])):
                    graph.node[cluster_id]["tIC%d" % (i + 1)] = scores[cluster][i]

    if docking is not None:
        scores = convert_csv_to_map_nocombine(docking)
        for cluster in scores.keys():
            cluster_id = int(cluster[7:len(cluster)])
            if cluster_id in graph.nodes():
                score = scores[cluster][0]
                graph.node[cluster_id]["docking"] = score

    if macrostate is not None:
        macromodel = verboseload(macrostate)
        for cluster_id in range(0, n_clusters):
            if cluster_id in graph.nodes():
                microstate_cluster_id = mapping[cluster_id]
                macrostate_cluster_id = macromodel.microstate_mapping_[microstate_cluster_id]
                #print(macrostate_cluster_id)
                graph.node[cluster_id]["macrostate"] = int(macrostate_cluster_id)

    if cluster_attributes is not None:
        for attribute in cluster_attributes.keys():
            for cluster_id in mapping.keys():
                graph.node[cluster_id][attribute] = float(
                    cluster_attributes[attribute][cluster_id])

    if msm_attributes is not None:
        for attribute in msm_attributes.keys():
            for cluster_id in mapping.keys():
                graph.node[cluster_id][attribute] = float(
                    msm_attributes[attribute][mapping[cluster_id]])

    nx.write_graphml(graph, graph_file)
    return graph
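# Usage sketch for construct_graph (hypothetical paths; per_cluster_dist stands
# in for any list with one value per cluster: each key of cluster_attributes
# becomes a per-node field in the .graphml, viewable in Gephi or Cytoscape):
per_cluster_dist = [7.0] * 300
g = construct_graph("msm_model.h5", "clusterer.h5", n_clusters=300,
                    msm_lag_time=10, graph_file="msm_graph.graphml",
                    cluster_attributes={'tm6_tm3_dist': per_cluster_dist})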
import pickle

import numpy as np
import matplotlib
matplotlib.use('Agg')
import pylab as plt
import matplotlib as mpl
from msmbuilder.msm import MarkovStateModel
from msmbuilder.msm import implied_timescales

file = 'dataset_nark.best_nonredu.pkl'
name = file[:-4]
cl = pickle.load(open(name + "-GA-mbkm_mdl.pkl", 'rb'))

# Scan MSM lag times and record the n_timescales slowest implied timescales.
n_timescales = 5
lag_times = range(5, 225, 5)
ts = np.zeros([n_timescales, len(lag_times)])
ns_lt = np.ndarray.tolist(np.array(lag_times))
index = 0
for i in lag_times:
    msm = MarkovStateModel(lag_time=i, n_timescales=n_timescales)
    clL = cl.labels_
    #clL10 = [clL[i][::0] for i in range(len(clL))]
    msm.fit_transform(clL)
    print(msm.timescales_)
    ts[:, index] = msm.timescales_
    index = index + 1

np.save('nark_best_nonredu_ts10_cl400_ns_lt', ns_lt)
np.save('nark_best_nonredu_ts10_cl400_ts', ts)
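# Implied timescales that keep growing with lag time mean the model is not yet
# Markovian; a plateau marks a usable lag. A plotting sketch over the arrays
# computed above (the output file name is hypothetical):
for k in range(n_timescales):
    plt.semilogy(ns_lt, ts[k], marker='o')
plt.xlabel('lag time (frames)')
plt.ylabel('implied timescale (frames)')
plt.savefig('nark_best_nonredu_implied_timescales.png')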
import pickle

import numpy as np
import matplotlib.pyplot as plt
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump, load  # in place of the authors' io.dump/io.load

font = {'family': 'Times New Roman', 'size': 12}
plt.rc('font', **font)

cl = pickle.load(open('clustering.pkl', 'rb'))
n_timescales = 10
stepS = 1.2  # presumably the frame spacing, used to convert lags to time units
lag_times = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
l = len(lag_times)
ts = np.zeros([n_timescales, l])
ns_lt = np.ndarray.tolist(stepS * np.array(lag_times))
index = 0
for i in lag_times:
    msm = MarkovStateModel(lag_time=i, n_timescales=n_timescales)
    msm.fit_transform(cl.labels_)
    ts[:, index] = msm.timescales_
    index = index + 1
    dump(msm, 'MSM' + str(i) + '.pkl')
"""
for i in lag_times:
    msm = load('MSM' + str(i) + '.pkl')
    ts[:, index] = msm.timescales_
    index = index + 1
"""
fig, ax = plt.subplots(1, 1)
for i in range(n_timescales):
    j = i + 1
    # (the body of this plotting loop is truncated in the source)
# (The definition opening this helper, presumably findTop(trjN) used below, was
# truncated in the source; it maps a trajectory name to its raw topology file.)
    a = trjN.split('/')[-1]
    topN = a.split('_md')[0]
    rawtop = 'rawTrj/' + roundN + '-rwTop/' + topN + '.prmtop'
    return rawtop


def findRawtrj(trjN):
    roundN = trjN.split('/')[1][0:3]
    sysName = trjN.split('/')[-1]
    rawTrj = 'rawTrj/' + roundN + '-rwTrj/' + sysName
    return rawTrj


# cl, n_samples, method, Trjs, and the ad (adaptive sampling) module are all
# defined upstream of this excerpt.
cluster = pickle.load(open(cl, 'rb'))
clL = cluster.labels_
msm = MarkovStateModel(lag_time=10, n_timescales=10)
msm.fit_transform(clL)

# Pick starting structures for the next round of adaptive sampling.
trjs = clL
N = n_samples
inits = ad.findStarting([trjs], N, method=method)
T = []
for trj in sorted(glob.glob(Trjs)):
    T.append(trj)
count = 0
for init in inits:
    structure = msm.draw_samples(clL, 1)[init]
    print(structure)
    top = findTop(T[structure[0][0]])
    rawTrj = findRawtrj(T[structure[0][0]])
    # (loop body truncated in the source)
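# draw_samples(sequences, n_samples) yields, for every MSM state, n_samples
# (trajectory_index, frame_index) pairs, which is why structure[0][0] above
# indexes into the trajectory list. A minimal sketch:
pairs = msm.draw_samples(clL, 1)
traj_idx, frame_idx = pairs[0][0]  # the sampled (traj, frame) for the first state
print(traj_idx, frame_idx)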
# the address should be the address of trajectories corresponding to dataset
# findStarting(trjs, N, method='random')
import glob
import json
import pickle

import adaptivsamplingMSM as ad
from msmbuilder.msm import MarkovStateModel

cluster = pickle.load(open('clustering.pkl', 'rb'))
trjs = cluster.labels_
N = n_samples  # n_samples is set upstream of this excerpt
T = []
for trj in sorted(glob.glob('rawTrj/MD1-rwTrj/*.mdcrd')):
    T.append(trj)
inits = ad.findStarting([trjs], N, method='leastPop')

msm = MarkovStateModel(lag_time=1, n_timescales=10)
msm.fit_transform(cluster.labels_)

# Map each chosen microstate to a concrete (trajectory, frame) to restart from.
OPF = []
structure = msm.draw_samples(trjs, 1)
for i in range(n_samples):
    try:
        init = structure[msm.mapping_[inits[i]]]
    except KeyError:
        # note: if the state was trimmed (KeyError), init keeps its previous value
        print(KeyError)
    traj = T[init[0][0]]
    frame = init[0][1]
    OPF.append({'traj': traj, 'frame': frame})
json.dump(OPF, open("ClsInf.txt", 'w'))

### Step 5: making the CPPtraj inputs
import json