Exemple #1
0
def plot_timescales(clusterer_dir,
                    n_clusters,
                    tica_dir,
                    main="",
                    lag_times=None):
    """Plot the implied timescales of an MSM built from saved cluster labels.

    The clusterer stored at ``clusterer_dir`` is loaded with ``verboseload``
    and its ``labels_`` are handed to ``implied_timescales`` together with a
    ``MarkovStateModel(verbose=True, prior_counts=1e-5,
    ergodic_cutoff='off')``.  Every returned timescale column is drawn
    against ``lag_times`` on a log-scaled y axis, and the current figure is
    written to
    ``<tica_dir>/<main>_n_clusters<n_clusters>_implied_timescales.pdf``.

    Parameters
    ----------
    clusterer_dir
        Location passed to ``verboseload``.
    n_clusters : int
        Only used to build the output file name.
    tica_dir : str
        Directory that receives the PDF.
    main : str, optional
        Plot title; also the prefix of the output file name.
    lag_times : sequence of int, optional
        Lag times to evaluate.  Defaults to ``list(range(1, 50))``.
    """
    # Build the default per call instead of sharing one list object between
    # all invocations (mutable default argument).
    if lag_times is None:
        lag_times = list(range(1, 50))

    clusterer = verboseload(clusterer_dir)
    print(clusterer)
    sequences = clusterer.labels_
    n_timescales = 5

    msm_timescales = implied_timescales(sequences,
                                        lag_times,
                                        n_timescales=n_timescales,
                                        msm=MarkovStateModel(
                                            verbose=True,
                                            prior_counts=1e-5,
                                            ergodic_cutoff='off'))
    print(msm_timescales)

    # Plot whatever columns came back rather than indexing a fixed
    # ``n_timescales`` of them, so a narrower result does not raise.
    for timescale_column in msm_timescales.T:
        plt.plot(lag_times, timescale_column)
    plt.xlabel("Lag time (ns)")
    plt.ylabel("Implied Timescales (ns)")
    plt.title(main)
    plt.semilogy()

    pdf_path = "%s/%s_n_clusters%d_implied_timescales.pdf" % (
        tica_dir, main, n_clusters)
    try:
        # The context manager closes the PDF even if savefig() raises.
        with PdfPages(pdf_path) as pp:
            pp.savefig()
    finally:
        # Always reset the shared pyplot figure so a failed save cannot leak
        # these curves into the next plot.
        plt.clf()
def test_both():
    """Check ``param_sweep`` and ``implied_timescales`` against direct fits.

    A reference ``MarkovStateModel`` is fitted by hand for every lag time.
    The same lag times are then swept through ``param_sweep`` and
    ``implied_timescales`` starting from one unfitted template model, and the
    transition matrices and timescales are required to match the references.
    """
    sequences = np.random.randint(20, size=(10, 1000))
    lag_times = [1, 5, 10]

    models_ref = []
    for tau in lag_times:
        msm = MarkovStateModel(
            reversible_type='mle', lag_time=tau, n_timescales=10)
        msm.fit(sequences)
        models_ref.append(msm)

    timescales_ref = [m.timescales_ for m in models_ref]

    # Sweep from a dedicated, unfitted template.  The original built such a
    # template but then passed the loop variable ``msm`` (the last fitted
    # reference model) instead, leaving the template unused.
    model = MarkovStateModel(
        reversible_type='mle', lag_time=1, n_timescales=10)
    models = param_sweep(model, sequences, {'lag_time': lag_times}, n_jobs=2)
    timescales = implied_timescales(sequences, lag_times, msm=model,
                                    n_timescales=10, n_jobs=2)

    print(timescales)
    print(timescales_ref)

    # Sanity check on the test itself: different lag times must give
    # different transition matrices, otherwise the comparison is vacuous.
    if np.abs(models[0].transmat_ - models[1].transmat_).sum() < 1E-6:
        raise Exception("you wrote a bad test.")

    for i in range(len(lag_times)):
        npt.assert_array_almost_equal(models[i].transmat_,
                                      models_ref[i].transmat_)
        npt.assert_array_almost_equal(timescales_ref[i], timescales[i])
def test_both():
    """Compare swept models and implied timescales with hand-fitted ones."""
    sequences = [np.random.randint(20, size=1000) for _ in range(10)]
    lag_times = [1, 5, 10]

    # Reference: one explicitly fitted model per lag time.
    models_ref = []
    for lag in lag_times:
        reference = MarkovStateModel(reversible_type='mle',
                                     lag_time=lag,
                                     n_timescales=10)
        reference.fit(sequences)
        models_ref.append(reference)
    timescales_ref = [fitted.timescales_ for fitted in models_ref]

    # Candidate: the same lag times swept from a single template model.
    model = MarkovStateModel(reversible_type='mle', lag_time=1,
                             n_timescales=10)
    sweep_grid = {'lag_time': lag_times}
    models = param_sweep(model, sequences, sweep_grid, n_jobs=2)
    timescales = implied_timescales(sequences,
                                    lag_times,
                                    msm=model,
                                    n_timescales=10,
                                    n_jobs=2)

    print(timescales)
    print(timescales_ref)

    # Guard against a vacuous test: the first two swept models must differ.
    first_two_gap = np.abs(models[0].transmat_ - models[1].transmat_).sum()
    if first_two_gap < 1E-6:
        raise Exception("you wrote a bad test.")

    for idx, _ in enumerate(lag_times):
        swept_transmat = models[idx].transmat_
        reference_transmat = models_ref[idx].transmat_
        npt.assert_array_almost_equal(swept_transmat, reference_transmat)
        npt.assert_array_almost_equal(timescales_ref[idx], timescales[idx])
def test_both():
    """Check ``param_sweep`` and ``implied_timescales`` against direct fits.

    One reference ``MarkovStateModel`` is fitted by hand per lag time; the
    same lag times are then swept from a single unfitted template model and
    the resulting transition matrices and timescales must match.
    """
    sequences = np.random.randint(20, size=(10, 1000))
    lag_times = [1, 5, 10]

    models_ref = []
    for tau in lag_times:
        msm = MarkovStateModel(reversible_type="mle", lag_time=tau, n_timescales=10)
        msm.fit(sequences)
        models_ref.append(msm)

    timescales_ref = [m.timescales_ for m in models_ref]

    # Use an explicit, unfitted template for the sweep.  The original created
    # a template it never used and relied on the loop variable ``msm`` (the
    # last fitted reference) leaking out of the loop above.
    model = MarkovStateModel(reversible_type="mle", lag_time=1, n_timescales=10)
    models = param_sweep(model, sequences, {"lag_time": lag_times}, n_jobs=2)
    timescales = implied_timescales(sequences, lag_times, msm=model, n_timescales=10, n_jobs=2)

    print(timescales)
    print(timescales_ref)

    # If two different lag times give the same transition matrix the
    # comparison below proves nothing, so fail loudly.
    if np.abs(models[0].transmat_ - models[1].transmat_).sum() < 1e-6:
        raise Exception("you wrote a bad test.")

    for i in range(len(lag_times)):
        npt.assert_array_almost_equal(models[i].transmat_, models_ref[i].transmat_)
        npt.assert_array_almost_equal(timescales_ref[i], timescales[i])
Exemple #5
0
def plot_timescales(clusterer_dir, n_clusters, tica_dir):
    """Plot implied timescales for the cluster labels saved at ``clusterer_dir``.

    Loads the clusterer with ``verboseload``, evaluates ``implied_timescales``
    on its ``labels_`` for the lag times ``[1, 4, 8, 12, 16]`` (requesting 10
    timescales), draws each returned column on a log-scaled y axis and saves
    the current figure to
    ``<tica_dir>/n_clusters<n_clusters>_implied_timescales.pdf``.

    Parameters
    ----------
    clusterer_dir
        Location passed to ``verboseload``.
    n_clusters : int
        Only used to build the output file name.
    tica_dir : str
        Directory that receives the PDF.
    """
    clusterer = verboseload(clusterer_dir)
    # Single-argument print(...) behaves the same as a print statement and
    # as the print function.
    print(clusterer)
    sequences = clusterer.labels_
    lag_times = [1, 4, 8, 12, 16]
    n_timescales = 10

    # The original ran implied_timescales a second time with verbose=False
    # and threw the result away; that redundant sweep is dropped.
    msm_timescales = implied_timescales(sequences,
                                        lag_times,
                                        n_timescales=n_timescales,
                                        msm=MarkovStateModel(verbose=True))
    print(msm_timescales)

    # Plot the columns actually returned rather than assuming there are
    # exactly ``n_timescales`` of them.
    for timescale_column in msm_timescales.T:
        plt.plot(lag_times, timescale_column)
    plt.semilogy()

    pdf_path = "%s/n_clusters%d_implied_timescales.pdf" % (tica_dir, n_clusters)
    # The context manager closes the PDF even if savefig() raises.
    with PdfPages(pdf_path) as pp:
        pp.savefig()
def plot_timescales(clusterer_dir, n_clusters, lag_time,
                    out_dir="/scratch/users/enf/b2ar_analysis"):
    """Plot implied timescales for the cluster labels saved at ``clusterer_dir``.

    Loads the clusterer with ``verboseload``, evaluates ``implied_timescales``
    on its ``labels_`` for lag times ``1, 6, 11, ..., 146`` (requesting 5
    timescales), draws each returned column on a log-scaled y axis and saves
    the current figure to
    ``<out_dir>/kmeans_<n_clusters>_<lag_time>_implied_timescales.pdf``.

    Parameters
    ----------
    clusterer_dir
        Location passed to ``verboseload``.
    n_clusters : int
        Only used to build the output file name.
    lag_time : int
        Only used to build the output file name.
    out_dir : str, optional
        Directory that receives the PDF.  The default is the location that
        used to be hard-coded, so existing callers are unaffected.
    """
    clusterer = verboseload(clusterer_dir)
    sequences = clusterer.labels_
    lag_times = list(np.arange(1, 150, 5))
    n_timescales = 5

    msm_timescales = implied_timescales(sequences,
                                        lag_times,
                                        n_timescales=n_timescales,
                                        msm=MarkovStateModel(verbose=False))
    # Single-argument print(...) behaves the same as a print statement and
    # as the print function.
    print(msm_timescales)

    # Plot the columns actually returned rather than assuming there are
    # exactly ``n_timescales`` of them.
    for timescale_column in msm_timescales.T:
        plt.plot(lag_times, timescale_column)
    plt.semilogy()

    pdf_path = "%s/kmeans_%d_%d_implied_timescales.pdf" % (
        out_dir, n_clusters, lag_time)
    # The context manager closes the PDF even if savefig() raises.
    with PdfPages(pdf_path) as pp:
        pp.savefig()
Exemple #7
0
def plot_timescales(clusterer_dir, n_clusters, lag_time,
                    out_dir="/scratch/users/enf/b2ar_analysis"):
    """Compute, plot and save implied timescales for saved cluster labels.

    The clusterer at ``clusterer_dir`` is loaded with ``verboseload``; its
    ``labels_`` are swept over ``np.arange(1, 150, 5)`` lag times with
    ``implied_timescales`` (5 timescales requested).  The returned columns
    are plotted on a log-scaled y axis and the current figure is written to
    ``<out_dir>/kmeans_<n_clusters>_<lag_time>_implied_timescales.pdf``.

    Parameters
    ----------
    clusterer_dir
        Location passed to ``verboseload``.
    n_clusters : int
        Only used in the output file name.
    lag_time : int
        Only used in the output file name.
    out_dir : str, optional
        Output directory; defaults to the previously hard-coded path.
    """
    clusterer = verboseload(clusterer_dir)
    sequences = clusterer.labels_
    lag_times = list(np.arange(1, 150, 5))
    n_timescales = 5

    msm_timescales = implied_timescales(sequences,
                                        lag_times,
                                        n_timescales=n_timescales,
                                        msm=MarkovStateModel(verbose=False))
    print(msm_timescales)

    # Iterate over the returned columns so a result narrower than
    # ``n_timescales`` does not raise an IndexError.
    for timescale_column in msm_timescales.T:
        plt.plot(lag_times, timescale_column)
    plt.semilogy()

    pdf_path = "%s/kmeans_%d_%d_implied_timescales.pdf" % (
        out_dir, n_clusters, lag_time)
    # Close the PDF deterministically, including when savefig() fails.
    with PdfPages(pdf_path) as pp:
        pp.savefig()
def calculate_its(kcenters_sequences, lag_times, n_timescales, outfile_name,
                  ergodic_cutoff_option):
    """Compute implied timescales, then write them as a table and a plot.

    ``implied_timescales`` is evaluated on ``kcenters_sequences`` with a
    ``MarkovStateModel(verbose=True, reversible_type='transpose',
    ergodic_cutoff=ergodic_cutoff_option)``.  Two files are produced:

    * ``<outfile_name>.dat`` -- one row per lag time: the lag time followed
      by the first ``n_timescales`` timescales, each field followed by four
      spaces.
    * ``<outfile_name>.png`` -- the timescales versus lag time on a
      log-scaled y axis.

    Parameters
    ----------
    kcenters_sequences
        Sequences passed straight to ``implied_timescales``.
    lag_times : sequence of int
        Lag times to evaluate; must support ``len()`` and iteration.
    n_timescales : int
        Number of timescale columns to plot and tabulate.
    outfile_name : str
        Output path without extension.
    ergodic_cutoff_option
        Forwarded as ``ergodic_cutoff`` to ``MarkovStateModel``.
    """
    msm_timescales = implied_timescales(
        kcenters_sequences,
        lag_times,
        n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=True,
                             reversible_type='transpose',
                             ergodic_cutoff=ergodic_cutoff_option))
    for k in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, k], 'o-')

    # ``with`` closes the table even if formatting a value raises.
    with open(outfile_name + '.dat', 'w') as table:
        for i, lag in enumerate(lag_times):
            fields = ["%d    " % lag]
            fields.extend("%f    " % msm_timescales[i, j]
                          for j in range(n_timescales))
            table.write("".join(fields) + '\n')

    plt.title('Discrete-time MSM Relaxation Timescales')
    plt.semilogy()
    plt.savefig(outfile_name + '.png')
    plt.close()
Exemple #9
0
 # Score the fitted MSM on the held-out sequences of this fold.
 test_score = msm.score(test_data_sequence)
 # Write the train/test scores of this fold to a per-fold summary file whose
 # name encodes the fold index and the tICA / clustering settings.
 f1 = open(
     sub_resultdir +
     '/Fold_%d_tica_lagtime_%d_ntics_%d_nMicro_%d_gmrq.summary'
     % (fold, tica_correlation_time, n_tics, n_Micro), 'w')
 f1.write('train_score:%f' % (train_score))
 f1.write('\n')
 f1.write('test_score:%f' % (test_score))
 f1.write('\n')
 f1.close()
 print(
     'computing implied timescale for training data'
 )  # the lag-time range used for the implied-timescale sweep should also be adjusted
 # Implied timescales of the training sequences for lag times 1..19.
 train_msm_timescales = implied_timescales(
     train_data_sequence,
     range(1, 20, 1),
     n_timescales=10,
     msm=MarkovStateModel(reversible_type='transpose',
                          ergodic_cutoff='on'))
 # Save the training timescales next to the summary file.
 np.savetxt(
     sub_resultdir +
     '/Fold_%d_tica_lagtime_%d_ntics_%d_nMicro_%d_traindata_its.dat'
     % (fold, tica_correlation_time, n_tics, n_Micro),
     train_msm_timescales)
 print('computing implied timescale for testing data')
 # Same sweep for the held-out sequences.
 test_msm_timescales = implied_timescales(
     test_data_sequence,
     range(1, 20, 1),
     n_timescales=10,
     msm=MarkovStateModel(reversible_type='transpose',
                          ergodic_cutoff='on'))
 np.savetxt(
# Report the fitted microstate MSM, save its transition matrix and stationary
# populations, then compute and plot implied timescales.
# NOTE(review): this fragment uses Python 2 print statements.  Without
# `from __future__ import print_function` the parenthesised multi-argument
# print below would print a tuple -- confirm the intended interpreter.
print msm.mapping_
print("for microstate lag time = ", microstate_lagtime, ",", msm.n_states_,
      " states are left")

# File names are keyed on the `reversible` setting.
np.savetxt("kcenters_microstate_%s_transmat_.txt" % (reversible),
           msm.transmat_)
np.savetxt("kcenters_%s_stationary_population" % (reversible),
           msm.populations_)

# Plot implied timescales.
n_timescales = 10
print "lagtime list is:", lag_times
msm_timescales = implied_timescales(kcenters_sequences,
                                    lag_times,
                                    n_timescales=n_timescales,
                                    msm=MarkovStateModel(
                                        verbose=True,
                                        reversible_type=reversible,
                                        ergodic_cutoff='on'))

# One curve per timescale column, on a log-scaled y axis.
for k in range(n_timescales):
    plt.plot(lag_times, msm_timescales[:, k], 'o-')
plt.title('Discrete-time MSM Relaxation Timescales')
plt.semilogy()
# The axis limits are only consumed by the commented-out call below.
x1, x2, y1, y2 = plt.axis()
#plt.axis((x1,x2,y1,1000000))  # the upper y limit here needs tuning
outfile_name = "%s_ITS.png" % (reversible)
plt.savefig(outfile_name)
plt.close()
# NOTE(review): the .dat name is built but nothing is written to it within
# this fragment -- presumably the write follows; verify.
outfile_name = "%s_ITS.dat" % (reversible)
print msm_timescales
Exemple #11
0
def _plot_tica_projection(figure_id, x_values, y_values, centers, x_dim,
                          y_dim, percentages, outfile):
    """Draw one 2-D tICA hexbin with annotated cluster centers and save it."""
    plt.figure(figure_id)
    plt.hexbin(x_values, y_values, bins='log')
    x_centers = [center[x_dim] for center in centers]
    y_centers = [center[y_dim] for center in centers]
    plt.plot(x_centers, y_centers, 'wo')
    # Label every center with the percentage of frames assigned to it.
    for percentage, x, y in zip(percentages, x_centers, y_centers):
        plt.annotate(
            "%.4f" % percentage,
            xy=(x, y), xytext=(-20, 20),
            textcoords='offset points', ha='right', va='bottom',
            bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
    plt.savefig(outfile)


def calculate_tica_components():
    """Run the tICA -> clustering -> implied-timescale pipeline.

    All configuration is read from module-level names: ``tica_lagtime``,
    ``tica_components``, ``cluster_method``, ``n_clusters``, ``lagtimes``,
    ``n_timescales`` and ``time_step``.  The first four tICA coordinates of
    every frame are appended to the module-level lists ``tica_1`` ..
    ``tica_4``.

    Steps, in order:

    1. Load every ``out*npy`` file, fit/transform with ``tICA`` and save the
       result (``.npy`` and joblib dump).
    2. Cluster the tICA output with ``KCenters`` or ``KMeans`` (chosen by
       ``cluster_method``; any other value exits via ``sys.exit``) and save
       the label sequences and cluster centers.
    3. Compute the fraction of frames per cluster and write
       ``populations.dat``.
    4. Save three hexbin projections (``tica_1_2.png``, ``tica_1_3.png``,
       ``tica_2_3.png``) annotated with the cluster percentages.
    5. For every ``traj*xtc`` file, report frames whose distance to their
       cluster center is below ``1e-6`` and save each as ``state_<k>.pdb``
       with ``k = label + 1``.
    6. Compute implied timescales, save them, and plot them against the lag
       times (both multiplied by ``time_step``).
    """
    print("Calculating tICA components...")
    in_files = glob.glob("out*npy")
    loaded_files = [np.load(filename) for filename in in_files]
    tica = tICA(lag_time=tica_lagtime,
                n_components=int(tica_components)).fit_transform(loaded_files)
    np.save('lag_%d_comp_%d.npy' % (tica_lagtime, tica_components), tica)
    tica_data = 'data_lag_%d_comp_%d' % (tica_lagtime, tica_components)
    joblib.dump(tica, tica_data)

    # Collect the first four tICA coordinates of every frame.  The in-memory
    # result is used directly instead of re-reading the file just written.
    for trajectory in tica:
        for frame in trajectory:
            tica_1.append(frame[0])
            tica_2.append(frame[1])
            tica_3.append(frame[2])
            tica_4.append(frame[3])

    # Clustering
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")
    sequences = clusters.fit_transform(tica)
    np.save('lag_%d_clusters_%d_sequences.npy' % (tica_lagtime, n_clusters),
            sequences)
    np.save('lag_%d_clusters_%d_center.npy' % (tica_lagtime, n_clusters),
            clusters.cluster_centers_)
    cluster_data = 'lag_%d_clusters_%d.pkl' % (tica_lagtime, n_clusters)
    joblib.dump(sequences, cluster_data)

    # Cluster populations: how many frames fall in each cluster.  The
    # concatenation is hoisted so it is done once, not once per cluster.
    print("Determining cluster populations...")
    all_labels = np.concatenate(sequences)
    counts = np.array([np.count_nonzero(all_labels == i)
                       for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [fraction * 100 for fraction in normalized_counts]

    # Plotting the tICA components
    print("Plotting tICA components with cluster centers...")
    centers = clusters.cluster_centers_
    _plot_tica_projection(0, tica_1, tica_2, centers, 0, 1, percentages,
                          'tica_1_2.png')
    _plot_tica_projection(1, tica_1, tica_3, centers, 0, 2, percentages,
                          'tica_1_3.png')
    _plot_tica_projection(2, tica_2, tica_3, centers, 1, 2, percentages,
                          'tica_2_3.png')

    # Writing out populations and PDBs for cluster centers.  The counts
    # computed above are reused; the original recomputed identical values.
    print("Determining cluster populations...")
    np.savetxt('populations.dat', normalized_counts)
    print("Performing cluster analytics and saving center PDBs...\n")
    # NOTE(review): trajectories are matched to ``sequences`` by position;
    # the xtc list is sorted while the ``out*npy`` list above is not --
    # confirm both orderings agree on the target file system.
    xtc_files = sorted(glob.glob("traj*xtc"))
    for i, xtcfile in enumerate(xtc_files):
        n_snapshots = len(clusters.distances_[i])
        # Frames that coincide with a cluster center.
        cluster_indices = np.arange(n_snapshots)[
            (clusters.distances_[i] < 1e-6)]
        cluster_labels = sequences[i][cluster_indices]
        for frame_index, label in zip(cluster_indices, cluster_labels):
            print('Cluster center', label, 'was found in trajectory',
                  str(i) + '.')
            print('It is found on frame', frame_index,
                  'and has a relative population of',
                  "%.4f" % percentages[label], '%.')
            cluster_traj = md.load_frame(xtcfile, frame_index,
                                         top='structure.gro')
            # Parenthesise the sum: ``%`` binds tighter than ``+``, so the
            # original ``'...' % label + 1`` tried to add 1 to a string and
            # raised TypeError.
            cluster_traj.save_pdb('state_%d.pdb' % (label + 1))

    # Calculating implied timescales
    print("\nCalculating Implied Timescales...")
    timescales = implied_timescales(sequences, lagtimes,
                                    n_timescales=n_timescales,
                                    msm=MarkovStateModel(verbose=False))

    implied_timescale_data = 'ipt_lag_%d_clusters_%d.pkl' % (tica_lagtime,
                                                             n_clusters)
    joblib.dump(timescales, implied_timescale_data)
    numpy_timescale_data = 'lag_%d_clusters_%d_timescales.npy' % (
        tica_lagtime, n_clusters)
    np.savetxt('lagtimes.txt', lagtimes)
    np.save(numpy_timescale_data, timescales)

    # Plotting implied timescales: set up the figure once, add one curve per
    # timescale, and save a single time instead of once per curve.
    print("Plotting Implied Timescales...")
    plt.figure(42)
    scaled_lagtimes = lagtimes * time_step
    for i in range(n_timescales):
        plt.plot(scaled_lagtimes, timescales[:, i] * time_step, 'o-')
    plt.yscale('log')
    plt.xlabel('lagtime (ns)')
    plt.ylabel('Implied timescales (ns)')
    plt.savefig('lag_%d_clusters_%d_.png' % (tica_lagtime, n_clusters))
Exemple #12
0
def test_ntimescales():
    """Regression test for issue #603: 11 requested timescales come back."""
    requested = 11
    random_trajs = []
    for _ in range(5):
        random_trajs.append(np.random.randint(0, 30, 500))
    result = implied_timescales(random_trajs, [1, 2, 3],
                                n_timescales=requested)
    n_columns = result.shape[1]
    assert n_columns == requested
def test_ntimescales():
    """The result must have one column per requested timescale (issue #603)."""
    n_trajs, n_states, n_frames = 5, 30, 500
    lag_times = [1, 2, 3]
    trajs = [np.random.randint(0, n_states, n_frames)
             for _ in range(n_trajs)]
    timescale_table = implied_timescales(trajs, lag_times, n_timescales=11)
    _, width = timescale_table.shape[0], timescale_table.shape[1]
    assert width == 11
#print("now output TPM")
#print(msm.transmat_)
#np.savetxt('TPM.txt', msm.transmat_)

#print("now output populations")
#print(msm.populations_)
#np.savetxt("population.txt", msm.populations_)

from msmbuilder.msm import MarkovStateModel, implied_timescales

# Open the clustered trajectories stored under ./cluster (read-only).
data = dataset('./cluster', mode='r', fmt='dir-npy', verbose=True)

# Sweep lag times 10, 20, ..., 390 and save the implied timescales as text.
lag_times = range(10, 400, 10)
msm_timescales = implied_timescales(
    data,
    lag_times,
    n_timescales=10,
    msm=MarkovStateModel(reversible_type='transpose'))
np.savetxt('msm_timescales_XXX.txt', msm_timescales)

# The script stops here; everything below is unreachable as written.
# NOTE(review): `exit` is the interactive helper installed by `site` --
# consider `sys.exit()` if this is meant to run as a script.
exit()

# Below: perform PCCA+ to lump the microstates into macrostates.
# NOTE(review): `msm` and `clustered_trajs` are not defined in this
# fragment -- presumably set earlier in the original script; verify.
from msmbuilder.lumping import PCCAPlus
pcca = PCCAPlus.from_msm(msm, n_macrostates=3)
macro_trajs = pcca.transform(clustered_trajs)

plt.hexbin(txx[:, 0], txx[:, 1], bins='log', mincnt=0.1, cmap="bone_r")
plt.scatter(
    clusterer.cluster_centers_[msm.state_labels_, 0],
    clusterer.cluster_centers_[msm.state_labels_, 1],
# Cluster the tICA-projected trajectories into `nMicro` microstates.
kcenters = KCenters(n_clusters=nMicro)
#kcenters = KCenters(n_clusters=num_tics_for_clustering)        # Fr :)

kcenters_sequences = kcenters.fit_predict(
    tica_sequences)  # these are the ground-state tICA sequences

# NOTE(review): Python 2 print statements are used in this fragment.
print "begin to plot the microstate implied timescale into the objective dir"
# Plot implied timescales.

lag_times = range(10, 100, 10)
# Tunable settings.
n_timescales = 5  # number of timescales to compute; adjust as needed

msm_timescales = implied_timescales(kcenters_sequences,
                                    lag_times,
                                    n_timescales=n_timescales,
                                    msm=MarkovStateModel(
                                        verbose=True,
                                        reversible_type='transpose'))

# Output table name encodes the tICA and clustering settings.
outfile_name = "%s/GS_ITS_tic%d_lagtime%d_clustersize%d.dat" % (
    outputdir, num_tics_for_clustering, tic_lag_time, nMicro)
print msm_timescales
print msm_timescales.shape

# One curve per timescale column.
for k in range(n_timescales):
    plt.plot(lag_times, msm_timescales[:, k], 'o-')
# Write one row per lag time: the lag time, then each timescale.
# NOTE(review): no newline write or f2.close() is visible in this
# fragment -- presumably they follow; verify.
f2 = open(outfile_name, 'w')
for i in range(len(lag_times)):
    f2.write("%d    " % (lag_times[i]))
    for j in range(n_timescales):
        f2.write("%f    " % (msm_timescales[i, j]))
Exemple #16
0
    plt.title(title)
    plt.semilogy()
    plt.yticks(fontsize=18)
    plt.xlabel('Lag times ', fontsize=22)
    plt.ylabel('Implied times ', fontsize=22)
    plt.savefig(outname)
    plt.close()


# Run the helper defined above this fragment.
implied_times()

# Implied timescales with the 'transpose' estimator and ergodic_cutoff=0,
# run in a single job with verbose output.
msm_timescales_d = implied_timescales(sequences,
                                      lag_times,
                                      n_timescales=n_timescales,
                                      n_jobs=1,
                                      msm=MarkovStateModel(
                                          verbose=True,
                                          reversible_type='transpose',
                                          ergodic_cutoff=0),
                                      verbose=1)
plot(msm_timescales_d, 'Discrete-time MSM Relaxation Timescales',
     'imp_times_t_erg_off.png')

# Same sweep with MarkovStateModel's default estimator settings (only
# `verbose` is overridden); the plot title labels this run "MLE".
msm_timescales_d_mle = implied_timescales(sequences,
                                          lag_times,
                                          n_timescales=n_timescales,
                                          n_jobs=1,
                                          msm=MarkovStateModel(verbose=True),
                                          verbose=1)
plot(msm_timescales_d_mle, 'Discrete-time MSM Relaxation Timescales MLE',
     'imp_times_mle.png')
Exemple #17
0
# Read a whitespace-delimited, header-less table.
# NOTE(review): `delim_whitespace` is deprecated in recent pandas
# releases -- confirm and consider `sep=r"\s+"`.
data = pd.read_csv("clust2.5_si.dat", header=None, delim_whitespace=True)
npdata = np.array(data)
# Columns 4 onward; wrapped in a one-element list below.
# NOTE(review): `sequences` is not used again within this fragment.
npdata_filtered = npdata[:, 4:]
sequences = []
sequences.append(npdata_filtered)

# Column 4 cast to int is used as the state-label sequence.
kmeanslabel = list(npdata[:, 4].astype(int))

#print (type(kmeanslabel))

lag_times = list(range(1, 400, 10))
print("lag", lag_times)
n_timescales = 10
# NOTE(review): `kmeanslabel` is a single flat list of labels, not a list of
# sequences -- confirm this is the input shape implied_timescales expects.
msm_timescales = implied_timescales(kmeanslabel,
                                    lag_times,
                                    n_timescales=n_timescales,
                                    msm=MarkovStateModel(verbose=False))
#print(msm_timescales[:,0])
#for i in range(n_timescales):
#   		print (i)
# Plot only the first three timescale columns on a log-scaled y axis.
# `pp` is whatever plotting alias the surrounding script imported.
pp.plot(lag_times, msm_timescales[:, 0], 'o-')
pp.plot(lag_times, msm_timescales[:, 1], 'o-')
pp.plot(lag_times, msm_timescales[:, 2], 'o-')
pp.title('Discrete-time MSM Relaxation Timescales')
pp.semilogy()
pp.show()

#ctmsm_timescales = implied_timescales(kmeanslabel, lag_times, n_timescales=n_timescales, msm=ContinuousTimeMSM(verbose=False))

#X_scaled =  preprocessing.normalize(npdata_filtered)
# Cluster the tICA trajectories into `nMicro` microstates with a fixed seed.
kcenters = KCenters(n_clusters=nMicro, metric='euclidean', random_state=0)
# `fit` returns the object whose `labels_` hold the per-frame assignments.
microstate_sequences = kcenters.fit(tica_trajs)
print("output of msm:", microstate_sequences.labels_)

# Plot the microstate assignments via plot_states_on_tic_space.
# NOTE(review): the trailing `1, 2` presumably select which tICs to plot,
# and the file name 'micorstate.png' looks like a typo for 'microstate.png'
# -- confirm both before changing anything.
plt.figure()
plot_states_on_tic_space(resultdir, 'micorstate.png', tica_trajs,
                         microstate_sequences.labels_, 1, 2)

# In[159]:

# Plot the microstate implied timescales, which indicate how many
# macrostates are needed.
plt.figure()
lag_times = range(2, 50, 2)
msm_timescales = implied_timescales(microstate_sequences.labels_,
                                    lag_times,
                                    n_timescales=10,
                                    msm=MarkovStateModel(
                                        reversible_type='transpose',
                                        ergodic_cutoff='off'))
# NOTE(review): the last argument 'ps' is presumably the time unit used for
# axis labels -- verify against plot_impliedtimescale.
plot_impliedtimescale(resultdir, 'microstate_its.png', lag_times,
                      msm_timescales, 'ps')

# In[160]:

####Evaluate the thermodynamics and kinetics from the microstate MSM
#the first dynamic eigenvector is associated to the slowest transitions in the dataset
#we can understand the physical meaning of the first eigenmode through sampling the conformations
micro_msm_lagtime = 4
msm = MarkovStateModel(
    lag_time=micro_msm_lagtime,
    reversible_type='transpose',
    n_timescales=3,