def plot_timescales(clusterer_dir, n_clusters, tica_dir, main="", lag_times=None):
    """Plot MSM implied timescales for a saved clusterer and write them to a PDF.

    Parameters
    ----------
    clusterer_dir : str
        Path passed to ``verboseload`` to load the fitted clusterer; its
        ``labels_`` attribute is used as the state sequences.
    n_clusters : int
        Only used to build the output file name.
    tica_dir : str
        Directory in which the PDF is written.
    main : str, optional
        Plot title; also used as the prefix of the output file name.
    lag_times : list of int, optional
        Lag times to sweep. Defaults to ``list(range(1, 50))``.
    """
    # Build the default per call instead of sharing one list object
    # between all invocations.
    if lag_times is None:
        lag_times = list(range(1, 50))

    clusterer = verboseload(clusterer_dir)
    print(clusterer)
    sequences = clusterer.labels_

    n_timescales = 5
    msm_timescales = implied_timescales(
        sequences, lag_times, n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=True, prior_counts=1e-5,
                             ergodic_cutoff='off'))
    print(msm_timescales)

    # One curve per implied timescale (one column of the result each).
    for i in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, i])
    plt.xlabel("Lag time (ns)")
    plt.ylabel("Implied Timescales (ns)")
    plt.title(main)
    plt.semilogy()

    pdf_path = "%s/%s_n_clusters%d_implied_timescales.pdf" % (
        tica_dir, main, n_clusters)
    try:
        # The context manager closes the PDF even if savefig raises.
        with PdfPages(pdf_path) as pp:
            pp.savefig()
    finally:
        # Always reset the current figure so a later call starts clean.
        plt.clf()
def test_both():
    """Check param_sweep and implied_timescales against per-lag-time fits.

    Reference models are fit by hand, one per lag time; the transition
    matrices from ``param_sweep`` and the timescales from
    ``implied_timescales`` must match them.
    """
    sequences = np.random.randint(20, size=(10, 1000))
    lag_times = [1, 5, 10]

    # Reference: one explicitly constructed and fitted model per lag time.
    models_ref = []
    for tau in lag_times:
        msm = MarkovStateModel(
            reversible_type='mle', lag_time=tau, n_timescales=10)
        msm.fit(sequences)
        models_ref.append(msm)
    timescales_ref = [m.timescales_ for m in models_ref]

    # Sweep from a dedicated, unfitted template rather than from whichever
    # reference model the loop above happened to leave behind.
    model = MarkovStateModel(
        reversible_type='mle', lag_time=1, n_timescales=10)
    models = param_sweep(model, sequences, {'lag_time': lag_times}, n_jobs=2)
    timescales = implied_timescales(sequences, lag_times, msm=model,
                                    n_timescales=10, n_jobs=2)
    print(timescales)
    print(timescales_ref)

    # Guard against a vacuous test: different lag times must actually give
    # different transition matrices.
    if np.abs(models[0].transmat_ - models[1].transmat_).sum() < 1E-6:
        raise Exception("you wrote a bad test.")

    for i in range(len(lag_times)):
        npt.assert_array_almost_equal(models[i].transmat_,
                                      models_ref[i].transmat_)
        npt.assert_array_almost_equal(timescales_ref[i], timescales[i])
def test_both():
    """param_sweep and implied_timescales must reproduce hand-fit models.

    Fits one reference MSM per lag time, then checks that the swept models
    have the same transition matrices and that the implied timescales agree.
    """
    sequences = [np.random.randint(20, size=1000) for _ in range(10)]
    lag_times = [1, 5, 10]

    def fit_reference(tau):
        # Build and fit a single reference model at the given lag time.
        reference = MarkovStateModel(reversible_type='mle', lag_time=tau,
                                     n_timescales=10)
        reference.fit(sequences)
        return reference

    models_ref = [fit_reference(tau) for tau in lag_times]
    timescales_ref = [reference.timescales_ for reference in models_ref]

    template = MarkovStateModel(reversible_type='mle', lag_time=1,
                                n_timescales=10)
    sweep_grid = {'lag_time': lag_times}
    models = param_sweep(template, sequences, sweep_grid, n_jobs=2)
    timescales = implied_timescales(sequences, lag_times, msm=template,
                                    n_timescales=10, n_jobs=2)

    print(timescales)
    print(timescales_ref)

    # The comparison below is only meaningful if the lag time changes the
    # fitted transition matrix.
    first_two_gap = np.abs(models[0].transmat_ - models[1].transmat_).sum()
    if first_two_gap < 1E-6:
        raise Exception("you wrote a bad test.")

    for idx, _tau in enumerate(lag_times):
        npt.assert_array_almost_equal(models[idx].transmat_,
                                      models_ref[idx].transmat_)
        npt.assert_array_almost_equal(timescales_ref[idx], timescales[idx])
def test_both():
    """Check param_sweep and implied_timescales against per-lag-time fits.

    Reference models are fit by hand, one per lag time; the transition
    matrices from ``param_sweep`` and the timescales from
    ``implied_timescales`` must match them.
    """
    sequences = np.random.randint(20, size=(10, 1000))
    lag_times = [1, 5, 10]

    # Reference: one explicitly constructed and fitted model per lag time.
    models_ref = []
    for tau in lag_times:
        msm = MarkovStateModel(reversible_type="mle", lag_time=tau, n_timescales=10)
        msm.fit(sequences)
        models_ref.append(msm)
    timescales_ref = [m.timescales_ for m in models_ref]

    # Use a dedicated, unfitted template for the sweep instead of relying on
    # the loop variable left over from building the references.
    model = MarkovStateModel(reversible_type="mle", lag_time=1, n_timescales=10)
    models = param_sweep(model, sequences, {"lag_time": lag_times}, n_jobs=2)
    timescales = implied_timescales(sequences, lag_times, msm=model, n_timescales=10, n_jobs=2)
    print(timescales)
    print(timescales_ref)

    # Guard against a vacuous test: different lag times must actually give
    # different transition matrices.
    if np.abs(models[0].transmat_ - models[1].transmat_).sum() < 1e-6:
        raise Exception("you wrote a bad test.")

    for i in range(len(lag_times)):
        npt.assert_array_almost_equal(models[i].transmat_, models_ref[i].transmat_)
        npt.assert_array_almost_equal(timescales_ref[i], timescales[i])
def plot_timescales(clusterer_dir, n_clusters, tica_dir):
    """Plot MSM implied timescales for a saved clusterer and write them to a PDF.

    Parameters
    ----------
    clusterer_dir : str
        Path passed to ``verboseload`` to load the fitted clusterer; its
        ``labels_`` attribute is used as the state sequences.
    n_clusters : int
        Only used to build the output file name.
    tica_dir : str
        Directory in which the PDF is written.
    """
    clusterer = verboseload(clusterer_dir)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(clusterer)
    sequences = clusterer.labels_

    lag_times = [1, 4, 8, 12, 16]
    n_timescales = 10
    msm_timescales = implied_timescales(
        sequences, lag_times, n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=True))
    print(msm_timescales)

    # One curve per implied timescale (one column of the result each).
    for i in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, i])
    plt.semilogy()

    pdf_path = "%s/n_clusters%d_implied_timescales.pdf" % (tica_dir, n_clusters)
    # The context manager closes the PDF even if savefig raises.
    with PdfPages(pdf_path) as pp:
        pp.savefig()
    # Clear the current figure so repeated calls do not overlay the curves
    # of earlier calls.
    plt.clf()
def plot_timescales(clusterer_dir, n_clusters, lag_time,
                    output_dir="/scratch/users/enf/b2ar_analysis"):
    """Plot MSM implied timescales for a saved clusterer and write them to a PDF.

    Parameters
    ----------
    clusterer_dir : str
        Path passed to ``verboseload`` to load the fitted clusterer; its
        ``labels_`` attribute is used as the state sequences.
    n_clusters : int
        Only used to build the output file name.
    lag_time : int
        Only used to build the output file name.
    output_dir : str, optional
        Directory in which the PDF is written.
    """
    clusterer = verboseload(clusterer_dir)
    sequences = clusterer.labels_

    lag_times = list(np.arange(1, 150, 5))
    n_timescales = 5
    msm_timescales = implied_timescales(
        sequences, lag_times, n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=False))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(msm_timescales)

    # One curve per implied timescale (one column of the result each).
    for i in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, i])
    plt.semilogy()

    pdf_path = "%s/kmeans_%d_%d_implied_timescales.pdf" % (
        output_dir, n_clusters, lag_time)
    # The context manager closes the PDF even if savefig raises.
    with PdfPages(pdf_path) as pp:
        pp.savefig()
    # Clear the current figure so repeated calls do not overlay the curves
    # of earlier calls.
    plt.clf()
def plot_timescales(clusterer_dir, n_clusters, lag_time,
                    output_dir="/scratch/users/enf/b2ar_analysis"):
    """Plot MSM implied timescales for a saved clusterer and write them to a PDF.

    Parameters
    ----------
    clusterer_dir : str
        Path passed to ``verboseload`` to load the fitted clusterer; its
        ``labels_`` attribute is used as the state sequences.
    n_clusters : int
        Only used to build the output file name.
    lag_time : int
        Only used to build the output file name.
    output_dir : str, optional
        Directory in which the PDF is written.
    """
    clusterer = verboseload(clusterer_dir)
    sequences = clusterer.labels_

    lag_times = list(np.arange(1, 150, 5))
    n_timescales = 5
    msm_timescales = implied_timescales(
        sequences, lag_times, n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=False))
    print(msm_timescales)

    # One curve per implied timescale (one column of the result each).
    for i in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, i])
    plt.semilogy()

    pdf_path = "%s/kmeans_%d_%d_implied_timescales.pdf" % (
        output_dir, n_clusters, lag_time)
    # The context manager closes the PDF even if savefig raises.
    with PdfPages(pdf_path) as pp:
        pp.savefig()
    # Clear the current figure so repeated calls do not overlay the curves
    # of earlier calls.
    plt.clf()
def calculate_its(kcenters_sequences, lag_times, n_timescales, outfile_name,
                  ergodic_cutoff_option):
    """Compute implied timescales, write them to a table, and plot them.

    Parameters
    ----------
    kcenters_sequences
        State sequences passed straight to ``implied_timescales``.
    lag_times : sequence of int
        Lag times to sweep; one table row and one x-value per entry.
    n_timescales : int
        Number of timescales to compute, write, and plot.
    outfile_name : str
        Output path without extension; ``.dat`` (table) and ``.png`` (plot)
        are appended.
    ergodic_cutoff_option
        Forwarded as ``ergodic_cutoff`` to ``MarkovStateModel``.
    """
    msm_timescales = implied_timescales(
        kcenters_sequences, lag_times, n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=True, reversible_type='transpose',
                             ergodic_cutoff=ergodic_cutoff_option))

    # One curve per implied timescale (one column of the result each).
    for k in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, k], 'o-')

    # Each row is "<lag> <t_0> <t_1> ... " with a trailing space before the
    # newline. The context manager closes the file even if a write fails.
    with open(outfile_name + '.dat', 'w') as table:
        for row, lag in enumerate(lag_times):
            cells = ["%d " % lag]
            cells.extend("%f " % msm_timescales[row, col]
                         for col in range(n_timescales))
            table.write("".join(cells) + '\n')

    plt.title('Discrete-time MSM Relaxation Timescales')
    plt.semilogy()
    plt.savefig(outfile_name + '.png')
    plt.close()
test_score = msm.score(test_data_sequence) f1 = open( sub_resultdir + '/Fold_%d_tica_lagtime_%d_ntics_%d_nMicro_%d_gmrq.summary' % (fold, tica_correlation_time, n_tics, n_Micro), 'w') f1.write('train_score:%f' % (train_score)) f1.write('\n') f1.write('test_score:%f' % (test_score)) f1.write('\n') f1.close() print( 'computing implied timescale for training data' ) #the x-range to plot implied timescale should also change train_msm_timescales = implied_timescales( train_data_sequence, range(1, 20, 1), n_timescales=10, msm=MarkovStateModel(reversible_type='transpose', ergodic_cutoff='on')) np.savetxt( sub_resultdir + '/Fold_%d_tica_lagtime_%d_ntics_%d_nMicro_%d_traindata_its.dat' % (fold, tica_correlation_time, n_tics, n_Micro), train_msm_timescales) print('computing implied timescale for testing data') test_msm_timescales = implied_timescales( test_data_sequence, range(1, 20, 1), n_timescales=10, msm=MarkovStateModel(reversible_type='transpose', ergodic_cutoff='on')) np.savetxt(
print msm.mapping_ print("for microstate lag time = ", microstate_lagtime, ",", msm.n_states_, " states are left") np.savetxt("kcenters_microstate_%s_transmat_.txt" % (reversible), msm.transmat_) np.savetxt("kcenters_%s_stationary_population" % (reversible), msm.populations_) #plot implied timescale n_timescales = 10 print "lagtime list is:", lag_times msm_timescales = implied_timescales(kcenters_sequences, lag_times, n_timescales=n_timescales, msm=MarkovStateModel( verbose=True, reversible_type=reversible, ergodic_cutoff='on')) for k in range(n_timescales): plt.plot(lag_times, msm_timescales[:, k], 'o-') plt.title('Discrete-time MSM Relaxation Timescales') plt.semilogy() x1, x2, y1, y2 = plt.axis() #plt.axis((x1,x2,y1,1000000)) #need to change this number outfile_name = "%s_ITS.png" % (reversible) plt.savefig(outfile_name) plt.close() outfile_name = "%s_ITS.dat" % (reversible) print msm_timescales
def _plot_tica_projection(figure_id, x_values, y_values, centers, x_col, y_col,
                          labels, out_png):
    """Hexbin two tICA coordinates, overlay annotated cluster centers, save a PNG."""
    plt.figure(figure_id)
    plt.hexbin(x_values, y_values, bins='log')
    x_centers = [center[x_col] for center in centers]
    y_centers = [center[y_col] for center in centers]
    plt.plot(x_centers, y_centers, 'wo')
    # Annotate every cluster center with its population label.
    for label, x, y in zip(labels, x_centers, y_centers):
        plt.annotate(
            label,
            xy=(x, y), xytext=(-20, 20),
            textcoords='offset points', ha='right', va='bottom',
            bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
    plt.savefig(out_png)


def calculate_tica_components():
    """Run the tICA -> clustering -> implied-timescales pipeline.

    Takes no arguments. It reads names that are not defined in this function
    (``tica_lagtime``, ``tica_components``, ``cluster_method``,
    ``n_clusters``, ``lagtimes``, ``n_timescales``, ``time_step``) and
    appends to the lists ``tica_1`` .. ``tica_4``.

    Inputs are ``out*npy`` files in the current directory, plus ``traj*xtc``
    and ``structure.gro`` for extracting cluster-center frames. Outputs are
    written to the current directory: tICA projections, cluster sequences and
    centers, ``populations.dat``, three tICA hexbin plots, ``state_<k>.pdb``
    for frames that are cluster centers, and implied-timescale data and plot.

    Exits via ``sys.exit`` if ``cluster_method`` is neither ``'kcenters'``
    nor ``'kmeans'``.
    """
    print("Calculating tICA components...")
    # Sorted so the trajectory index is deterministic.
    # NOTE(review): the center-extraction loop below indexes the sorted
    # traj*xtc list with the same trajectory index -- confirm that out*npy
    # and traj*xtc sort into the same order.
    in_files = sorted(glob.glob("out*npy"))
    loaded_files = [np.load(filename) for filename in in_files]
    tica = tICA(lag_time=tica_lagtime,
                n_components=int(tica_components)).fit_transform(loaded_files)
    np.save('lag_%d_comp_%d.npy' % (tica_lagtime, tica_components), tica)
    joblib.dump(tica, 'data_lag_%d_comp_%d' % (tica_lagtime, tica_components))

    # Collect the first four tICA coordinates of every frame. The in-memory
    # projections are used directly; they hold the same values that were
    # just written to disk.
    for projected in tica:
        projected = np.asarray(projected)
        tica_1.extend(projected[:, 0])
        tica_2.extend(projected[:, 1])
        tica_3.extend(projected[:, 2])
        tica_4.extend(projected[:, 3])

    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")

    sequences = clusters.fit_transform(tica)
    np.save('lag_%d_clusters_%d_sequences.npy' % (tica_lagtime, n_clusters),
            sequences)
    np.save('lag_%d_clusters_%d_center.npy' % (tica_lagtime, n_clusters),
            clusters.cluster_centers_)
    joblib.dump(sequences,
                'lag_%d_clusters_%d.pkl' % (tica_lagtime, n_clusters))

    # Number of frames assigned to each cluster, and the same as fractions
    # and percentages.
    print("Determining cluster populations...")
    all_labels = np.concatenate(sequences)
    counts = np.array([np.count_nonzero(all_labels == state)
                       for state in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [fraction * 100 for fraction in normalized_counts]

    print("Plotting tICA components with cluster centers...")
    population_labels = ["%.4f" % pct for pct in percentages]
    centers = clusters.cluster_centers_
    _plot_tica_projection(0, tica_1, tica_2, centers, 0, 1,
                          population_labels, 'tica_1_2.png')
    _plot_tica_projection(1, tica_1, tica_3, centers, 0, 2,
                          population_labels, 'tica_1_3.png')
    _plot_tica_projection(2, tica_2, tica_3, centers, 1, 2,
                          population_labels, 'tica_2_3.png')

    np.savetxt('populations.dat', normalized_counts)

    print("Performing cluster analytics and saving center PDBs...\n")
    # NOTE(review): relies on the clusterer exposing per-trajectory
    # ``distances_`` -- confirm this holds for the KMeans branch.
    xtc_files = sorted(glob.glob("traj*xtc"))
    for i, xtcfile in enumerate(xtc_files):
        # Frames whose distance to their center is ~0 are the centers.
        center_frames = np.flatnonzero(clusters.distances_[i] < 1e-6)
        center_labels = sequences[i][center_frames]
        for frame, label in zip(center_frames, center_labels):
            print('Cluster center', label, 'was found in trajectory',
                  str(i) + '.')
            print('It is found on frame', frame,
                  'and has a relative population of',
                  "%.4f" % percentages[label], '%.')
            cluster_traj = md.load_frame(xtcfile, frame, top='structure.gro')
            # Parenthesised so the +1 applies to the label (1-based file
            # names) rather than to the already formatted string.
            cluster_traj.save_pdb('state_%d.pdb' % (label + 1))

    print("\nCalculating Implied Timescales...")
    timescales = implied_timescales(sequences, lagtimes,
                                    n_timescales=n_timescales,
                                    msm=MarkovStateModel(verbose=False))
    joblib.dump(timescales,
                'ipt_lag_%d_clusters_%d.pkl' % (tica_lagtime, n_clusters))
    np.savetxt('lagtimes.txt', lagtimes)
    np.save('lag_%d_clusters_%d_timescales.npy' % (tica_lagtime, n_clusters),
            timescales)

    print("Plotting Implied Timescales...")
    plt.figure(42)
    # asarray makes the scaling elementwise even when lagtimes is a list.
    scaled_lagtimes = np.asarray(lagtimes) * time_step
    for i in range(n_timescales):
        plt.plot(scaled_lagtimes, timescales[:, i] * time_step, 'o-')
    plt.yscale('log')
    plt.xlabel('lagtime (ns)')
    plt.ylabel('Implied timescales (ns)')
    plt.savefig('lag_%d_clusters_%d_.png' % (tica_lagtime, n_clusters))
def test_ntimescales():
    """Regression test for issue #603.

    ``implied_timescales`` must return exactly as many timescale columns as
    requested.
    """
    requested = 11
    lag_times = [1, 2, 3]
    trajs = [np.random.randint(0, 30, 500) for _ in range(5)]

    its = implied_timescales(trajs, lag_times, n_timescales=requested)

    n_columns = its.shape[1]
    assert n_columns == requested
#print("now output TPM") #print(msm.transmat_) #np.savetxt('TPM.txt', msm.transmat_) #print("now output populations") #print(msm.populations_) #np.savetxt("population.txt", msm.populations_) from msmbuilder.msm import MarkovStateModel, implied_timescales data = dataset('./cluster', mode='r', fmt='dir-npy', verbose=True) lag_times = range(10, 400, 10) msm_timescales = implied_timescales( data, lag_times, n_timescales=10, msm=MarkovStateModel(reversible_type='transpose')) np.savetxt('msm_timescales_XXX.txt', msm_timescales) exit() #below: perform pcca plus to get the macrostates from msmbuilder.lumping import PCCAPlus pcca = PCCAPlus.from_msm(msm, n_macrostates=3) macro_trajs = pcca.transform(clustered_trajs) plt.hexbin(txx[:, 0], txx[:, 1], bins='log', mincnt=0.1, cmap="bone_r") plt.scatter( clusterer.cluster_centers_[msm.state_labels_, 0], clusterer.cluster_centers_[msm.state_labels_, 1],
kcenters = KCenters(n_clusters=nMicro) #kcenters = KCenters(n_clusters=num_tics_for_clustering) # Fr :) kcenters_sequences = kcenters.fit_predict( tica_sequences) #here it is ground state tica sequences print "begin to plot the microstate implied timescale into the objective dir" #plot implied timescale lag_times = range(10, 100, 10) #adjust variables n_timescales = 5 #adjust variables msm_timescales = implied_timescales(kcenters_sequences, lag_times, n_timescales=n_timescales, msm=MarkovStateModel( verbose=True, reversible_type='transpose')) outfile_name = "%s/GS_ITS_tic%d_lagtime%d_clustersize%d.dat" % ( outputdir, num_tics_for_clustering, tic_lag_time, nMicro) print msm_timescales print msm_timescales.shape for k in range(n_timescales): plt.plot(lag_times, msm_timescales[:, k], 'o-') f2 = open(outfile_name, 'w') for i in range(len(lag_times)): f2.write("%d " % (lag_times[i])) for j in range(n_timescales): f2.write("%f " % (msm_timescales[i, j]))
plt.title(title) plt.semilogy() plt.yticks(fontsize=18) plt.xlabel('Lag times ', fontsize=22) plt.ylabel('Implied times ', fontsize=22) plt.savefig(outname) plt.close() implied_times() msm_timescales_d = implied_timescales(sequences, lag_times, n_timescales=n_timescales, n_jobs=1, msm=MarkovStateModel( verbose=True, reversible_type='transpose', ergodic_cutoff=0), verbose=1) plot(msm_timescales_d, 'Discrete-time MSM Relaxation Timescales', 'imp_times_t_erg_off.png') msm_timescales_d_mle = implied_timescales(sequences, lag_times, n_timescales=n_timescales, n_jobs=1, msm=MarkovStateModel(verbose=True), verbose=1) plot(msm_timescales_d_mle, 'Discrete-time MSM Relaxation Timescales MLE', 'imp_times_mle.png')
data = pd.read_csv("clust2.5_si.dat", header=None, delim_whitespace=True) npdata = np.array(data) npdata_filtered = npdata[:, 4:] sequences = [] sequences.append(npdata_filtered) kmeanslabel = list(npdata[:, 4].astype(int)) #print (type(kmeanslabel)) lag_times = list(range(1, 400, 10)) print("lag", lag_times) n_timescales = 10 msm_timescales = implied_timescales(kmeanslabel, lag_times, n_timescales=n_timescales, msm=MarkovStateModel(verbose=False)) #print(msm_timescales[:,0]) #for i in range(n_timescales): # print (i) pp.plot(lag_times, msm_timescales[:, 0], 'o-') pp.plot(lag_times, msm_timescales[:, 1], 'o-') pp.plot(lag_times, msm_timescales[:, 2], 'o-') pp.title('Discrete-time MSM Relaxation Timescales') pp.semilogy() pp.show() #ctmsm_timescales = implied_timescales(kmeanslabel, lag_times, n_timescales=n_timescales, msm=ContinuousTimeMSM(verbose=False)) #X_scaled = preprocessing.normalize(npdata_filtered)
kcenters = KCenters(n_clusters=nMicro, metric='euclidean', random_state=0) microstate_sequences = kcenters.fit(tica_trajs) print("output of msm:", microstate_sequences.labels_) plt.figure() plot_states_on_tic_space(resultdir, 'micorstate.png', tica_trajs, microstate_sequences.labels_, 1, 2) # In[159]: #plot the microstate implied timescale, which will show how many macrostates we need plt.figure() lag_times = range(2, 50, 2) msm_timescales = implied_timescales(microstate_sequences.labels_, lag_times, n_timescales=10, msm=MarkovStateModel( reversible_type='transpose', ergodic_cutoff='off')) plot_impliedtimescale(resultdir, 'microstate_its.png', lag_times, msm_timescales, 'ps') # In[160]: ####Evaluate the thermodynamics and kinetics from the microstate MSM #the first dynamic eigenvector is associated to the slowest transitions in the dataset #we can understand the physical meaning of the first eigenmode through sampling the conformations micro_msm_lagtime = 4 msm = MarkovStateModel( lag_time=micro_msm_lagtime, reversible_type='transpose', n_timescales=3,