def setUpClass(cls):
    """Create a shared 2-D Gaussian-mixture trajectory and cluster it.

    Five Gaussian blobs of ``nsample`` frames each are concatenated into
    ``cls.X`` (``cls.T`` frames in total).  The data are then discretized
    with k-means, regular-space and uniform-time clustering; the resulting
    objects are stored as ``cls.km``, ``cls.rs``, ``cls.rt`` and, together,
    in ``cls.cl``.  A temporary directory is created in ``cls.dtraj_dir``.
    """
    super(TestCluster, cls).setUpClass()
    cls.dtraj_dir = tempfile.mkdtemp()

    # generate Gaussian mixture
    means = [
        np.array([-3, 0]),
        np.array([-1, 1]),
        np.array([0, 0]),
        np.array([1, -1]),
        np.array([4, 2]),
    ]
    widths = [
        np.array([0.3, 2]),
        np.array([0.3, 2]),
        np.array([0.3, 2]),
        np.array([0.3, 2]),
        np.array([0.3, 2]),
    ]

    # continuous trajectory
    nsample = 1000
    cls.T = len(means) * nsample
    cls.X = np.zeros((cls.T, 2))
    for i, (mean, width) in enumerate(zip(means, widths)):
        block = slice(i * nsample, (i + 1) * nsample)
        # Draw one independent normal deviate per frame.  A bare randn()
        # yields a single scalar, which would broadcast to the whole block
        # and collapse each Gaussian to one repeated point.
        cls.X[block, 0] = width[0] * np.random.randn(nsample) + mean[0]
        cls.X[block, 1] = width[1] * np.random.randn(nsample) + mean[1]

    # cluster in different ways
    cls.km = coor.cluster_kmeans(data=cls.X, k=100)
    cls.rs = coor.cluster_regspace(data=cls.X, dmin=0.5)
    cls.rt = coor.cluster_uniform_time(data=cls.X, k=100)
    cls.cl = [cls.km, cls.rs, cls.rt]
def setUpClass(cls):
    """Prepare the class-wide test data: a Gaussian mixture and its clusterings.

    Builds ``cls.X`` (shape ``(cls.T, 2)``) from five Gaussian components of
    ``nsample`` frames each, then runs k-means, regular-space and
    uniform-time clustering on it.  The clustering objects are kept in
    ``cls.km``, ``cls.rs``, ``cls.rt`` and collected in ``cls.cl``.
    ``cls.dtraj_dir`` is a fresh temporary directory.
    """
    super(TestCluster, cls).setUpClass()
    cls.dtraj_dir = tempfile.mkdtemp()

    # generate Gaussian mixture
    means = [np.array([-3, 0]),
             np.array([-1, 1]),
             np.array([0, 0]),
             np.array([1, -1]),
             np.array([4, 2])]
    widths = [np.array([0.3, 2]),
              np.array([0.3, 2]),
              np.array([0.3, 2]),
              np.array([0.3, 2]),
              np.array([0.3, 2])]

    # continuous trajectory
    nsample = 1000
    cls.T = len(means) * nsample
    cls.X = np.zeros((cls.T, 2))
    for i in range(len(means)):
        start, stop = i * nsample, (i + 1) * nsample
        # randn(nsample) gives one sample per frame; the scalar randn()
        # used previously filled every frame of a component with the same
        # value, leaving only five distinct points in the whole data set.
        cls.X[start:stop, 0] = widths[i][0] * np.random.randn(nsample) + means[i][0]
        cls.X[start:stop, 1] = widths[i][1] * np.random.randn(nsample) + means[i][1]

    # cluster in different ways
    cls.km = coor.cluster_kmeans(data=cls.X, k=100)
    cls.rs = coor.cluster_regspace(data=cls.X, dmin=0.5)
    cls.rt = coor.cluster_uniform_time(data=cls.X, k=100)
    cls.cl = [cls.km, cls.rs, cls.rt]
def test_with_data_in_mem(self):
    """Discretize in-memory arrays through PCA + k-means and check state indices.

    Every discrete trajectory produced by the pipeline must only contain
    state indices below the requested number of cluster centers.
    """
    import pyemma.coordinates as api

    # three random trajectories of different length, 50 features each
    data = [np.random.random((n_frames, 50)) for n_frames in (100, 103, 33)]
    reader = source(data)
    assert isinstance(reader, DataInMemory)

    n_centers = 10
    tpca = api.pca(dim=2)
    km = api.cluster_kmeans(k=n_centers)

    disc = api.discretizer(reader, tpca, km)
    disc.parametrize()

    for dtraj in disc.dtrajs:
        # largest state index that actually occurs in this trajectory
        n_states = np.max(np.unique(dtraj))
        self.assertGreaterEqual(
            n_centers - 1, n_states,
            "dtraj has more states than cluster centers")
def project_and_cluster(trajfiles, featurizer, sparsify=False, tica=True,
                        lag=100000, scale=True, var_cutoff=1.0,
                        ncluster=100, cluster=True):
    """Load features, project them with TICA or PCA and optionally cluster.

    Parameters
    ----------
    trajfiles : trajectory files passed to ``coor.load``.
    featurizer : featurizer passed to ``coor.load``.
    sparsify : bool
        If true, the loaded data are passed through ``remove_constant``.
    tica : bool
        Use ``coor.tica`` when true, otherwise ``coor.pca``.
    lag : int
        Lag time handed to ``coor.tica`` (unused for PCA).
    scale : bool
        If true, every projected trajectory is multiplied in place by the
        leading ``trans_obj.dimension()`` eigenvalues.
    var_cutoff : float
        Variance cutoff handed to the projection.
    ncluster : int
        Number of k-means centers.
    cluster : bool
        If true (default), run k-means on the projection and return the
        clustering object as a third element.  This flag used to be read
        from an undefined name inside the function; it is now an explicit
        keyword whose default keeps the three-element return.

    Returns
    -------
    trans_obj, Y, clustering
        ``clustering`` is only returned when ``cluster`` is true; otherwise
        the result is ``(trans_obj, Y)``.
    """
    X = coor.load(trajfiles, featurizer)
    if sparsify:
        X = remove_constant(X)

    if tica:
        trans_obj = coor.tica(X, lag=lag, var_cutoff=var_cutoff)
    else:
        trans_obj = coor.pca(X, dim=-1, var_cutoff=var_cutoff)
    Y = trans_obj.get_output()

    if scale:
        # NOTE(review): the scaling is applied to both projections here;
        # confirm it was not meant for the PCA branch only.
        factors = trans_obj.eigenvalues[:trans_obj.dimension()]
        for y in Y:
            y *= factors

    if cluster:
        cl_obj = coor.cluster_kmeans(Y, k=ncluster, max_iter=3, fixed_seed=True)
        return trans_obj, Y, cl_obj

    return trans_obj, Y
def clusterTrajectories(trajectories, numClusters, stride=1):
    """
        Run k-means on the given trajectories.

        The clustering uses numClusters centers, at most 20 iterations and
        the given stride; the object returned by coor.cluster_kmeans is
        handed back unchanged.
    """
    kmeans_options = dict(k=numClusters, max_iter=20, stride=stride)
    return coor.cluster_kmeans(data=trajectories, **kmeans_options)
def test_clustering_kmeans(self):
    """Check that a k-means estimator reports the parameters it was built with."""
    params = {
        'k': 10,
        'init_strategy': 'uniform',
        'max_iter': 42,
        'metric': 'minRMSD',
        'stride': 1,
    }
    cl = coor.cluster_kmeans([np.random.random((100, 3))], **params)

    # the comparison is made against 'n_clusters' rather than 'k'
    params['n_clusters'] = params.pop('k')
    # this is a model param, so it should be contained in the output
    params['clustercenters'] = cl.clustercenters

    self.compare(cl, params)
def cluster(self, trajectories):
    """
        Run k-means on the given trajectories.

        Uses self.numClusters centers, self.stride as stride and at most
        500 iterations; returns the object produced by coor.cluster_kmeans.
    """
    kmeans_options = dict(k=self.numClusters, max_iter=500, stride=self.stride)
    return coor.cluster_kmeans(data=trajectories, **kmeans_options)
def setUpClass(cls):
    """Build a source -> TICA -> k-means chain on the BPTI test data and save it.

    The clustering object is saved, together with its streaming chain, to a
    temporary path that is stored in ``cls.model_file``.
    """
    from pyemma.datasets import get_bpti_test_data

    bpti = get_bpti_test_data()
    trajs, top = bpti['trajs'], bpti['top']

    reader = source(trajs, top=top)
    projection = tica(reader, lag=1)
    clustering = cluster_kmeans(projection)

    cls.model_file = tempfile.mktemp()
    clustering.save(cls.model_file, save_streaming_chain=True)
def lengthVsNtrajs(data, nruns, lagtime, clusters, outputFilename, cache, m, stride):
    """Score MSMs over a grid of cluster counts and lag times.

    For every entry of ``clusters`` the data are clustered with k-means and,
    for every entry of ``lagtime``, an MSM is estimated and scored with
    ``MSM.score`` and ``MSM.score_cv`` (``score_k=m``, ``n=nruns`` folds).
    Pairs found in ``cache`` (keyed by ``(clusters, lag)``) are reused
    instead of being recomputed.  Each cached or fully processed pair is
    appended as a line to ``outputFilename``; pairs whose estimation or
    training score fails are set to 0.0 and not written.

    Returns the two ``(len(clusters), len(lagtime))`` arrays
    ``(results, results_cv)`` holding the mean scores.
    """
    grid_shape = (len(clusters), len(lagtime))
    results = np.zeros(grid_shape)
    results_cv = np.zeros(grid_shape)

    def append_row(n_states, tau, score, score_cv):
        # one line per (clusters, lag) pair, appended to the output file
        with open(outputFilename, 'a') as out:
            out.write("%d %d %f %f\n" % (n_states, tau, score, score_cv))

    def report_failure(n_states, tau):
        print("Estimation error in %d clusters, %d lagtime" % (n_states, tau))

    for row, n_states in enumerate(clusters):
        clustering = coor.cluster_kmeans(data=data, k=n_states, max_iter=500, stride=stride)
        for col, tau in enumerate(lagtime):
            key = (n_states, tau)
            if key in cache:
                print("Loading cached computation for %d clusters and %d lagtime" % key)
                results[row, col], results_cv[row, col] = cache[key]
                append_row(n_states, tau, results[row, col], results_cv[row, col])
                continue

            print("Computing for %d clusters and %d lagtime" % key)
            try:
                MSM = msm.estimate_markov_model(clustering.dtrajs, tau)
                print("MSM estimated on %d states" % MSM.nstates)
            except Exception:
                report_failure(n_states, tau)
                results[row, col] = 0.0
                results_cv[row, col] = 0.0
                continue

            try:
                results[row, col] = np.mean(MSM.score(MSM.dtrajs_full, score_k=m))
            except Exception:
                report_failure(n_states, tau)
                results[row, col] = 0.0
                results_cv[row, col] = 0.0
                continue

            try:
                results_cv[row, col] = np.mean(
                    MSM.score_cv(MSM.dtrajs_full, score_k=m, n=nruns))
            except Exception:
                # a failed cross-validation still gets written, with 0.0
                report_failure(n_states, tau)
                results_cv[row, col] = 0.0

            append_row(n_states, tau, results[row, col], results_cv[row, col])

    return results, results_cv
def estimateDG(data, nruns, cl, lag, ntraj, len_traj, skipFirstSnaphots, cluster_each_iteration):
    """Estimate a free-energy value from ``nruns`` resampled trajectory sets.

    In every run ``ntraj`` trajectories are picked with
    ``select_iteration_data`` and cut to ``[skipFirstSnaphots:len_traj]``.
    They are discretized either with a k-means clustering computed once on
    the full data, or with a fresh clustering per run when
    ``cluster_each_iteration`` is true.  An MSM is estimated at lag ``lag``;
    runs whose estimation fails are skipped.  The value parsed from the
    second field of the string returned by ``compute.calculate_pmf`` is
    collected per run.

    Returns ``(mean, std)`` of the collected values.
    """
    # parameter handed to create_box and calculate_microstate_volumes_new
    d = 0.75

    global_clustering = None
    if not cluster_each_iteration:
        global_clustering = coor.cluster_kmeans(data=data, k=cl, max_iter=500, stride=1)

    deltaG = []
    for _ in range(nruns):
        chosen = select_iteration_data(data, ntraj)
        data_it = [data[idx][skipFirstSnaphots:len_traj] for idx in chosen]

        if cluster_each_iteration:
            clustering = coor.cluster_kmeans(data=data_it, k=cl, max_iter=500, stride=1)
            dtrajs = clustering.dtrajs
        else:
            clustering = global_clustering
            dtrajs = clustering.assign(data_it)

        try:
            MSM = msm.estimate_markov_model(dtrajs, lag)
            print("MSM estimated on %d states" % MSM.nstates)
        except Exception:
            print(
                "Estimation error in %d clusters, %d lagtime, %d trajectories of %d steps"
                % (cl, lag, ntraj, len_traj))
            continue

        pi, cl_centers = compute.ensure_connectivity(MSM, clustering.clustercenters)
        bins = compute.create_box(cl_centers, data_it, d)
        microstateVolume = compute.calculate_microstate_volumes_new(
            cl_centers, data_it, bins, d)
        _, pmf_report = compute.calculate_pmf(microstateVolume, pi)
        deltaG.append(float(pmf_report.split()[1]))

    return np.mean(deltaG), np.std(deltaG)
def multi_temperature_tram(feat, trajfiles, temperatures, dtrajs=None, stride=1,
                           tica_lag=100, keep_tica_dims=20, n_clusters=100,
                           tram_lag=100, engfile="Etot.dat", usecols=(1,),
                           kb=0.0083145):
    """Run a multi-temperature TRAM estimation on a set of trajectories.

    Parameters
    ----------
    feat : obj, pyemma.coor.featurizer
        Featurizer object that already has the appropriate features added.
    trajfiles : list
        Names of trajectories to include in estimation.
    temperatures : list
        Temperatures of corresponding trajectories.
    dtrajs : list, optional
        Discrete trajectories. When None they are computed here with
        TICA followed by k-means.
    stride : int
        Stride handed to tica and clustering.
    tica_lag : int
        Lagtime to use for constructing tica.
    keep_tica_dims : int
        Number of dimensions to keep from tica.
    n_clusters : int
        Number of clusters for kmeans.
    tram_lag : int
        Lagtime handed to the TRAM estimator.
    engfile : str
        Name of the energy file read from each trajectory's directory;
        loaded with np.load if it ends in "npy", otherwise with np.loadtxt.
    usecols : tuple
        Columns passed to np.loadtxt.
    kb : float
        Constant used to form beta = 1/(kb*T) and the temperature series.

    Returns
    -------
    dirs, dtrajs, tram
    """
    dirs = [os.path.dirname(name) for name in trajfiles]
    beta = [1. / (kb * temp) for temp in temperatures]

    if dtrajs is None:
        inp = coor.source(trajfiles, feat)
        tica_obj = coor.tica(inp, lag=tica_lag, dim=keep_tica_dims, stride=stride)
        cl = coor.cluster_kmeans(data=tica_obj.get_output(), k=n_clusters, stride=stride)
        dtrajs = cl.dtrajs

    # pick the reader once, based on the energy file extension
    if engfile.endswith("npy"):
        def read_energy(path):
            return np.load(path)
    else:
        def read_energy(path):
            return np.loadtxt(path, usecols=usecols)

    # dimensionless energy
    energy_trajs = [
        beta[i] * read_energy("{}/{}".format(directory, engfile))
        for i, directory in enumerate(dirs)
    ]

    # constant kb*T series with one entry per energy frame
    temp_trajs = [
        kb * temperatures[i] * np.ones(energy.shape[0], float)
        for i, energy in enumerate(energy_trajs)
    ]

    tram = thermo.estimate_multi_temperature(
        energy_trajs, temp_trajs, dtrajs,
        energy_unit='kT', temp_unit='kT', estimator='tram',
        lag=tram_lag, maxiter=2000000, maxerr=1e-10)

    return dirs, dtrajs, tram
def model_file():
    """Yield the path of a saved source -> TICA -> k-means model.

    The chain is built on the BPTI test data and saved, including its
    streaming chain, to a temporary path.  After the consumer is done the
    saved model is removed again on a best-effort basis.
    """
    import os

    path = None
    try:
        from pyemma.datasets import get_bpti_test_data
        d = get_bpti_test_data()
        trajs, top = d['trajs'], d['top']
        s = source(trajs, top=top)
        t = tica(s, lag=1)
        c = cluster_kmeans(t)
        path = tempfile.mktemp()
        c.save(path, save_streaming_chain=True)
        yield path
    finally:
        if path is not None:
            # rmtree() cannot delete a regular file (it fails and, with
            # ignore_errors=True, does so silently), which used to leak the
            # saved model.  Remove a file with os.remove and keep rmtree
            # only for the directory case.
            if os.path.isdir(path):
                shutil.rmtree(path, ignore_errors=True)
            else:
                try:
                    os.remove(path)
                except OSError:
                    # cleanup stays best-effort, e.g. if save() never
                    # created the file
                    pass
dtraj_phi_3.append(dtraj_rama_3[i][:,1]) dtraj_phi_4.append(dtraj_rama_4[i][:,1]) dtraj_phi_5.append(dtraj_rama_5[i][:,1]) dtraj_phi_6.append(dtraj_rama_6[i][:,1]) # **simple clustering along psi only for discretization** # In[7]: n_clusters = 2 # number of k-means clusters # In[8]: clustering_rama_2 = coor.cluster_kmeans(dtraj_phi_2,k=n_clusters,max_iter=100, tolerance=1e-12, fixed_seed=True) clustering_rama_3 = coor.cluster_kmeans(dtraj_phi_3,k=n_clusters,max_iter=100, tolerance=1e-12, fixed_seed=True) clustering_rama_4 = coor.cluster_kmeans(dtraj_phi_4,k=n_clusters,max_iter=100, tolerance=1e-12, fixed_seed=True) clustering_rama_5 = coor.cluster_kmeans(dtraj_phi_5,k=n_clusters,max_iter=100, tolerance=1e-12, fixed_seed=True) clustering_rama_6 = coor.cluster_kmeans(dtraj_phi_6,k=n_clusters,max_iter=100, tolerance=1e-12, fixed_seed=True) # In[9]: cc_rama_2 = clustering_rama_2.clustercenters[:,0] cc_rama_3 = clustering_rama_3.clustercenters[:,0] cc_rama_4 = clustering_rama_4.clustercenters[:,0] cc_rama_5 = clustering_rama_5.clustercenters[:,0] cc_rama_6 = clustering_rama_6.clustercenters[:,0]
coordinates_source = coor.source(trajectory_files,featurizer) print("There are %d frames total in %d trajectories." % (coordinates_source.n_frames_total(), coordinates_source.number_of_trajectories())) ################################################################################ # Do tICA ################################################################################ print('tICA...') running_tica = coor.tica(lag=100, dim=100) ################################################################################ # Cluster ################################################################################ print('Clustering...') clustering = coor.cluster_kmeans(k=100, stride=50) coor.pipeline([coordinates_source,running_tica,clustering]) dtrajs = clustering.dtrajs # Save discrete trajectories. clustering.save_dtrajs(output_format='npy', extension='.npy') ################################################################################ # Make tics plot ################################################################################ tics = running_tica.get_output()[0] z,x,y = np.histogram2d(tics[:,0],tics[:,1], bins=50) F = -np.log(z+1) extent = [x[0], x[-1], y[0], y[-1]]
save_object('pca_obj.pkl', pca_obj) #plt.plot(tica_obj.eigenvalues,marker='x') #plt.xlim([-1,20]) #plt.ylim([0.5,1]) # here we do a little trick to ensure that eigenvectors always have the same sign structure. # That's irrelevant to the analysis and just nicer plots - you can ignore it. #for i in range(2): # if tica_obj.eigenvectors[0, i] > 0: # tica_obj.eigenvectors[:, i] *= -1 Y = pca_obj.get_output() # get tica coordinates np.save('Y.npy', Y) # Now, do the clustering Y_clust = [] for i in range(len(Y)): Y_clust.append(Y[i][:, 0:clust_dim]) clustering = coor.cluster_kmeans(data=Y_clust, k=n_clusters, max_iter=50, tolerance=1e-05, stride=1) save_object( 'clustering_kmeans_nclust-' + str(clustering.n_clusters) + '_clustdim-' + str(clust_dim) + '.pkl', clustering) #clustering = coor.cluster_regspace(Y_clust,max_centers=n_clusters,dmin=dmin) #save_object('clustering_regspace_nclust-'+str(clustering.n_clusters)+'_clustdim-'+str(clust_dim)+'.pkl', clustering) #print 'n_clusters = '+str(clustering.n_clusters)
trajfiles = [ x + "/" + trajname for x in tempdirs ] # add features feat = coor.featurizer(topfile) feat, feature_info = util.sbm_contact_features(feat, pairwise_file, n_native_pairs) if not os.path.exists("msm"): os.mkdir("msm") if (not os.path.exists("msm/dtrajs.pkl")) or recluster: # cluster if necessary inp = coor.source(trajfiles, feat) tica_obj = coor.tica(inp, dim=tica_dims, lag=tica_lag, stride=stride) Y = tica_obj.get_output() cl = coor.cluster_kmeans(data=Y, k=n_clusters) dtrajs = cl.dtrajs os.chdir("msm") dirs = [ os.path.basename(os.path.dirname(x)) for x in trajfiles ] if not dontsavemsm: dtraj_info = { dirs[x]:dtrajs[x] for x in range(len(dirs)) } dtraj_info["dirs"] = dirs with open("dtrajs.pkl", 'wb') as fhandle: pickle.dump(dtraj_info, fhandle) else: os.chdir("msm") with open("dtrajs.pkl", 'rb') as fhandle: dtraj_pkl = pickle.load(fhandle) dirs = dtraj_pkl["dirs"]
plt.plot(x, Y[ij][:, 1]) plt.ylabel('IC 2') plt.xticks([]) ax1 = plt.subplot(313) plt.plot(x, Y[ij][:, 2]) plt.ylabel('IC 3') plt.xlabel('time (frames)') plt.xticks([]) plt.savefig("traj_%d_ICs.png" % (ij + 1)) # if we have many trajectories having them all open might consume a lot of # memory plt.close() else: Y = trajs clustering = coor.cluster_kmeans(Y, k=numClusters, max_iter=100) dtrajs = clustering.dtrajs cc_x = clustering.clustercenters[:, 0] cc_y = clustering.clustercenters[:, 1] cc_z = clustering.clustercenters[:, 2] xall = np.vstack(Y)[:, 0] yall = np.vstack(Y)[:, 1] plt.figure(figsize=(8, 5)) mplt.plot_free_energy(xall, yall, cmap="Spectral") plt.plot(cc_x, cc_y, linewidth=0, marker='o', markersize=5, color='black') plt.xlabel("IC 1") plt.ylabel("IC 2") plt.title("FES IC1-2") plt.savefig("fes_IC1-2.png")
zfile = open( 'Intermediate_pickle_files/wt-h70a-d66a_cattraj_contourmap.pickle', 'w') pickle.dump(F, zfile) zfile.close() test = [] for t in combined: test.append(t) np.shape(test) # ### 100 K-means clusters nclusters = 100 kmean_cluster100 = coor.cluster_kmeans(data=test, k=nclusters, max_iter=1000, tolerance=1e-6) print "Done!" print "Saving cluster centers..." ccenters100 = kmean_cluster100.clustercenters f = open( 'Intermediate_pickle_files/wt-h70a-d66a_cattraj_dirrmsd_ccenter-100.pickle', 'w') pickle.dump(ccenters100, f) f.close() wt_dtrajs = coor.assign_to_centers(data=wt_dir_rmsd, centers=ccenters100) f = open( 'Intermediate_pickle_files/cypa_wt-d66a_cattraj_dirrmsd_dtrajs.pickle', 'w')
# 1- Clustering
#cl = coor.cluster_uniform_time(data=data, k=100, stride=10)
#cl = coor.cluster_kmeans(data=data, k=250, stride=10)
# for later use we save the discrete trajectories and cluster center coordinates:
#dtrajs = cl.dtrajs
#cc_x = cl.clustercenters[:,0]
#cc_y = cl.clustercenters[:,1]

# Reuse a previous clustering only if both files written below exist.
# (The check used to look for the misspelled 'clusterenters_kmeans.npy',
# which is never written, so the clustering was recomputed on every run.)
if os.path.isfile('dtrajs_kmeans.npy') and os.path.isfile('clustercenters_kmeans.npy'):
    dtrajs = np.load('dtrajs_kmeans.npy')
    dtrajs = np.ravel(dtrajs)
    dummy = np.load('clustercenters_kmeans.npy')
    cc_x = dummy[:, 0]
    cc_y = dummy[:, 1]
else:
    cl = coor.cluster_kmeans(data=data, k=100, stride=25)
    # for later use we save the discrete trajectories and cluster center coordinates:
    dtrajs = cl.dtrajs
    cc_x = cl.clustercenters[:, 0]
    cc_y = cl.clustercenters[:, 1]
    np.save('dtrajs_kmeans.npy', dtrajs)
    np.save('clustercenters_kmeans.npy', np.column_stack([cc_x, cc_y]))

# 2- Lag time: Note that the xtc files are saved every 0.2 ps.
# Making the Markov model
M = msm.estimate_markov_model(dtrajs, 2500)
print('fraction of states used = ', M.active_state_fraction)
print('fraction of counts used = ', M.active_count_fraction)
print('transition matrix', M.transition_matrix)  # doctest: +SKIP
plt.savefig('tic1_feature_corr.png') plt.clf() plt.title('Feature correlation to tIC 2') plt.bar(range(len(tica.feature_TIC_correlation[:, 1])), abs(tica.feature_TIC_correlation[:, 1]), align='center') plt.xlabel('Index within feature vector') plt.ylabel('Correlation') plt.tight_layout() plt.savefig('tic2_feature_corr.png') print('running kmeans') clkmeans = coor.cluster_kmeans(Y, 300, max_iter=300) plt.clf() plt.figure(figsize=(8, 5)) plt.plot(clkmeans.clustercenters[:, 0], clkmeans.clustercenters[:, 1], ' ok') mplt.plot_free_energy(np.hstack(Y1), np.hstack(Y2)) plt.xlabel('tic 1') plt.ylabel('tic 2') plt.savefig('kmeans_cluster-on_tic1tic2.png') np.save('clkmeans_dtrajs.npy', clkmeans.dtrajs) np.save('clkmeans_clustercenters.npy', clkmeans.clustercenters) print('running MSM')
n_sets = 3 print 'feat dimension' print feat.dimension() dataset = [] nlist = [] if 1: n_clusters = 200 tica_obj = coor.tica( dim=2, lag=tica_lagtime, kinetic_map=True) input_data = coor.cluster_kmeans( k=n_clusters, max_iter=50) disc = coor.discretizer(inp, tica_obj, input_data, stride=1, chunksize=10) disc.parametrize() print tica_obj.cumvar #TICA output is Y Y = tica_obj.get_output() print np.shape(Y) #print 'Y[0]' #print Y[0] print 'number of trajetories = ', np.shape(Y)[0] # #mapped_data is the TICA clustered data mapped to the microstates (so integer valued) mapped_data =input_data.dtrajs
tica_lagtime = 400 #number of PCCA clusters n_sets = 3 print 'feat dimension' print feat.dimension() dataset = [] nlist = [] if 1: n_clusters = 200 tica_obj = coor.tica(dim=2, lag=tica_lagtime, kinetic_map=True) input_data = coor.cluster_kmeans(k=n_clusters, max_iter=50) disc = coor.discretizer(inp, tica_obj, input_data, stride=1, chunksize=10) disc.parametrize() print tica_obj.cumvar #TICA output is Y Y = tica_obj.get_output() print np.shape(Y) #print 'Y[0]' #print Y[0] print 'number of trajetories = ', np.shape(Y)[0] # #mapped_data is the TICA clustered data mapped to the microstates (so integer valued) mapped_data = input_data.dtrajs
(coordinates_source.n_frames_total(), coordinates_source.number_of_trajectories())) ################################################################################ # Do tICA ################################################################################ print('tICA...') running_tica = coor.tica(lag=100, dim=100) ################################################################################ # Cluster ################################################################################ print('Clustering...') clustering = coor.cluster_kmeans(k=100, stride=50) coor.pipeline([coordinates_source, running_tica, clustering]) dtrajs = clustering.dtrajs # Save discrete trajectories. clustering.save_dtrajs(output_format='npy', extension='.npy') ################################################################################ # Make tics plot ################################################################################ tics = running_tica.get_output()[0] z, x, y = np.histogram2d(tics[:, 0], tics[:, 1], bins=50) F = -np.log(z + 1) extent = [x[0], x[-1], y[0], y[-1]]
#cluster tica data into clusters import pyemma.coordinates as coor import numpy as np sys = 'fdis' tica_data = coor.load('tica_data_05/fdis_tica_data.h5') n_clusters = 100 cl = coor.cluster_kmeans(tica_data, k=n_clusters, max_iter=50) #cl.save(f'cluster_data/{sys}_{n_clusters}_mini_cluster_object.h5', overwrite=True) cl.write_to_hdf5(f'cluster_data_11/{sys}_{n_clusters}_cluster_dtrajs22.h5')
pickle.dump(clust_col_skip_dtraj, open('clust_col_skip_dtraj_cl_full_try2.pickle', 'wb'), protocol=pickle.HIGHEST_PROTOCOL) print('length of clust_col_skip_dtraj is ', len(clust_col_skip_dtraj)) print('length of clust_col_skip_dtraj[0] is ', len(clust_col_skip_dtraj[0])) #Y = tica.get_output() tica = pickle.load(open('mix_tica_full.pickle', 'rb')) Y = tica.get_output() print('shape of tica is ', len(Y[0])) #cl_f = pickle.load(open('pg_cl_full_ax1.pickle', 'rb')) Y2 = [i[:, 1:3] for i in Y] cluster_tic = coor.cluster_kmeans(Y2, k=10, max_iter=100) clust_out = cluster_tic.dtrajs #clust_out = pickle.load(open('tic_cl_full_dtraj.pickle', 'rb')) pickle.dump(clust_out, open('tic_cl_full_dtraj3_try2.pickle', 'wb'), protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(cluster_tic.clustercenters, open('tic_cl_full_centers3_try2.pickle', 'wb'), protocol=pickle.HIGHEST_PROTOCOL) D = [0] * len(clust_col_skip_dtraj) for i in range(len(clust_col_skip_dtraj)): D[i] = [0] * len(clust_col_skip_dtraj[0])
# Choose parameters to be used in the task config.show_progress_bars = False lag = args.tica_lag feat = coor.featurizer(topfile) feat.add_backbone_torsions() inp = coor.source(trajfiles, feat) dim = args.tica_dim tica_obj = coor.tica(inp, lag=lag, dim=dim, kinetic_map=False) Y = tica_obj.get_output() cl = coor.cluster_kmeans(data=Y, k=args.msm_states, stride=args.stride) M = msm.estimate_markov_model(cl.dtrajs, args.msm_lag) # with open("model.dtraj", "w") as f: # f.write("\n".join(" ".join(map(str, x)) for x in cl.dtrajs)) # # # np.savetxt("model.dtraj", cl.dtrajs, delimiter=" ", fmt='%d') # np.savetxt("model.msm", M.P, delimiter=",") data = { 'input': { 'frames': inp.n_frames_total(), 'dimension': inp.dimension(), 'trajectories': inp.number_of_trajectories(), 'lengths': inp.trajectory_lengths().tolist(), },
plt.plot(x, Y[0][:, 1]) plt.ylabel('IC 2') plt.xticks([]) plt.yticks(np.arange(-2, 4)) ax1 = plt.subplot(313) plt.plot(x, Y[0][:, 2]) plt.xlabel('time / ns') plt.ylabel('IC 3') plt.yticks(np.arange(-4, 6, 2)) # for shorter trajectory, ideal number of clusters is 100 # optimal lag_time = 750? # optimal lag_time = 1000 timesteps clustering = coor.cluster_kmeans(Y, k=100) dtrajs = clustering.dtrajs msm = pyemma.msm.estimate_markov_model(dtrajs, 380) pyemma.plots.plot_cktest(msm.cktest(3, err_est=True), marker='.') # TRIALS - reg_space clustering and kmeans comparison - kmeans by far better clustering_reg = coor.cluster_regspace(Y, dmin=2, max_centers=100) cr_x = clustering_reg.clustercenters[:, 0] cr_y = clustering_reg.clustercenters[:, 0] cc_x = clustering.clustercenters[:, 0] cc_y = clustering.clustercenters[:, 1] c_reg = [cr_x, cr_y] c = [cc_x, cc_y] print(len(clustering_reg.clustercenters)) fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharex=True, sharey=True) for ax, cls in zip(axes.flat, [c, c_reg]):
plt.xlim([xmin, xmax]) #Trace la carte d'énergie libre, abscisse : rayon de gyration, ordonnee : RMSD plt.xlim([xmin, xmax]) #Borne plt.ylim([ymin, ymax]) mplt.plot_free_energy(gyrateArray, rms) #plt.plot([Refgyrate],[ymin], '+') plt.ylabel('RMSD (A)') plt.xlabel('Radius of gyration (A)') save_figure('free' + peptide_name + '.pdf', PathOut + "/") #Par défaut, image au format pdf #Mise en place du k-means n_clusters = args.kmeans Y = np.vstack((gyrateArray, rms)) X = np.transpose(Y) clustering = coor.cluster_kmeans(X, k=n_clusters, max_iter=100) dtrajs = clustering.dtrajs cc_x = clustering.clustercenters[:, 0] cc_y = clustering.clustercenters[:, 1] ind_clust = clustering.index_clusters plt.plot(cc_x, cc_y, linewidth=0, marker='o', markersize=5, color='black') for i in range(len(cc_x)): plt.text(cc_x[i], cc_y[i], str(i + 1), color='grey', fontsize=12) save_figure('free' + peptide_name + '_clusters.pdf', PathOut) compute_effectif_cluster(ind_clust, int(traj.time[0]), int(traj.timestep), PathOut, PathOut + "ex_md.xtc", struct)
if i == 3: for j in range(4): axes[i][j].set_xlabel("TIC " + str(j + 2), fontsize=20) axes[0][0].annotate("TICA " + f_str, fontsize=24, xy=(0, 0), xytext=(1.8, 1.1), xycoords="axes fraction", textcoords="axes fraction") fig.savefig(msm_savedir + "/tic_hist_grid.pdf") n_clusters = 300 msm_lags = [1, 10, 20, 50, 100, 200] cluster = coor.cluster_kmeans(k=n_clusters) coor.pipeline([reader, tica, cluster]) its = msm.its(cluster.dtrajs, lags=msm_lags) plt.figure() mplt.plot_implied_timescales(its) plt.title(msm_savedir) plt.savefig(msm_savedir + "/its_vs_lag_ylog.pdf") #plt.figure() #plt.plot(np.arange(1,21), M.timescales()[:20], 'o') #ymin, ymax = plt.ylim() #plt.ylim(0, ymax) #plt.savefig("msm_ti.pdf")
def main(lagtimes, clusters, m, tica_lag, tica, output_path):
    """Score MSMs for every lag time / cluster count and plot the results.

    Trajectories are loaded from the "allTrajs" folder (files matching
    "traj*") and, if ``tica`` is true, projected with TICA at lag
    ``tica_lag``.  For each lag time in ``lagtimes`` and each cluster count
    in ``clusters`` the data are clustered with k-means, an MSM is
    estimated, and its training score and 5-fold cross-validated score
    (``score_k=m``) are collected; failures are recorded as zeros.

    Per lag time, the scores are saved as ``.npy`` files under
    ``<output_path>/scores`` and a training/testing plot is written to
    ``output_path``.
    """
    trajectoryFolder = "allTrajs"
    trajectoryBasename = "traj*"
    stride = 1
    # number of cross-validation folds; also the length of the zero
    # placeholder stored when scoring fails
    n_folds = 5

    if output_path and not os.path.exists(output_path):
        os.makedirs(output_path)
    scores_path = os.path.join(output_path, "scores")
    if not os.path.exists(scores_path):
        os.makedirs(scores_path)

    data, _ = cluster.loadTrajFiles(trajectoryFolder, trajectoryBasename)
    if tica:
        tica_obj = coor.tica(data, lag=tica_lag, var_cutoff=0.9, kinetic_map=True)
        print('TICA dimension ', tica_obj.dimension())
        data = tica_obj.get_output()

    for tau in lagtimes:
        scores = []
        scores_cv = []
        print("Estimating MSM with %d lagtime" % tau)
        for k in clusters:
            print("Calculating scores with %d clusters" % k)
            # cluster data
            cl = coor.cluster_kmeans(data=data, k=k, max_iter=500, stride=stride)
            try:
                MSM = msm.estimate_markov_model(cl.dtrajs, tau)
                print("MSM estimated on %d states" % MSM.nstates)
            except Exception:
                print("Estimation error in %d clusters, %d lagtime" % (k, tau))
                scores.append(0)
                scores_cv.append(np.zeros(n_folds, dtype=int))
                continue
            try:
                scores.append(MSM.score(MSM.dtrajs_full, score_k=m))
            except Exception:
                print("Estimation error in %d clusters, %d lagtime" % (k, tau))
                scores.append(0)
                scores_cv.append(np.zeros(n_folds, dtype=int))
                continue
            try:
                scores_cv.append(MSM.score_cv(MSM.dtrajs_full, score_k=m, n=n_folds))
            except Exception:
                print("Estimation error in %d clusters, %d lagtime" % (k, tau))
                scores_cv.append(np.zeros(n_folds, dtype=int))

        np.save(os.path.join(scores_path, "scores_lag_%d.npy" % tau), scores)
        np.save(os.path.join(scores_path, "scores_cv_lag_%d.npy" % tau), scores_cv)

        mean_scores = [sc.mean() for sc in scores_cv]
        std_scores = [sc.std() for sc in scores_cv]

        fig = plt.figure()
        plt.plot(clusters, scores, label="Training")
        plt.errorbar(clusters, mean_scores, yerr=std_scores, fmt='k', label="Testing")
        plt.xlabel("Number of states")
        plt.ylabel("Score")
        plt.legend()
        plt.savefig(os.path.join(output_path, "scores_cv_lag_%d.png" % tau))
        # One figure is created per lag time; close it once saved so the
        # figures do not pile up in memory over a long lag-time scan.
        plt.close(fig)
MSMlags = np.array([1]) for i in range(1, 4, 1): nmin = 10**(i) nmax = 10**(i + 1) dn = 10**(i) MSMlags1 = np.arange(nmin, nmax, dn) MSMlags = np.concatenate([MSMlags, MSMlags1]) nlagsMSM = np.shape(MSMlags)[0] print('number of different lag times chosen = ' + str(nlagsMSM)) print('lag time values used = ' + str(MSMlags)) sys.stdout.flush() n_clusters = 1000 clustering = coor.cluster_kmeans(list(Ys), k=n_clusters, max_iter=10000) dtrajs = clustering.dtrajs my_dict = {} my_dict['n_clusters'] = n_clusters my_dict['micro_membership'] = dtrajs my_dict['centers'] = clustering.clustercenters np.savez_compressed('2F4K_MSM_10TICA_clusters_1000.npz', **my_dict) #-------------------------------------------------------------------- # Build Markov State Model out of clustered data and save Markov transition matrices to file print('Building Markov Model at different lag times...') sys.stdout.flush()
show_titles=True, title_kwargs={"fontsize": 12}) plt.savefig('%s/corner.png' % mutant) plt.clf() plt.figure(figsize=(8, 5)) mplt.plot_free_energy(np.hstack(Y1_otherpro), np.hstack(Y2_otherpro)) plt.xlabel('tic 1') plt.ylabel('tic 2') plt.savefig('%s/tic1-tic2.png' % mutant) print('running %s kmeans' % mutant) clkmeans = coor.cluster_kmeans(Y_otherpro, 300, max_iter=300) plt.clf() plt.figure(figsize=(8, 5)) plt.plot(clkmeans.clustercenters[:, 0], clkmeans.clustercenters[:, 1], ' ok') mplt.plot_free_energy(np.hstack(Y1), np.hstack(Y2)) plt.xlabel('tic 1') plt.ylabel('tic 2') plt.savefig('%s/kmeans_cluster-on_tic1tic2.png' % mutant) np.save('%s/clkmeans_dtrajs.npy' % mutant, clkmeans.dtrajs) np.save('%s/clkmeans_clustercenters.npy' % mutant, clkmeans.clustercenters) print('running %s MSM' % mutant)