Example #1
def test_kcenters_7():
    # are fit_predict and fit().predict() consistent?
    trj = np.random.RandomState(0).randn(30, 2)
    k = KCenters(n_clusters=10, random_state=0).fit([trj])
    l1 = KCenters(n_clusters=10, random_state=0).fit([trj]).predict([trj])[0]
    l2 = KCenters(n_clusters=10, random_state=0).fit_predict([trj])[0]

    eq(l1, l2)
Example #2
def test_kcenters_3():
    # test for predict using euclidean distance

    m = KCenters(n_clusters=10)
    data = np.random.randn(100, 2)
    labels1 = m.fit_predict([data])
    labels2 = m.predict([data])

    eq(labels1[0], labels2[0])
    all_pairs = scipy.spatial.distance.cdist(data, m.cluster_centers_)
    eq(labels2[0], np.argmin(all_pairs, axis=1))
Example #3
def test_kcenters_4():
    # test for predict() using non-euclidean distance. because of the
    # way the code is structured, this takes a different path
    model = KCenters(n_clusters=10, metric='cityblock')
    data = np.random.randn(100, 2)
    labels1 = model.fit_predict([data])
    labels2 = model.predict([data])

    eq(labels1[0], labels2[0])
    all_pairs = scipy.spatial.distance.cdist(data, model.cluster_centers_, metric='cityblock')
    eq(labels2[0], np.argmin(all_pairs, axis=1))
Example #4
def test_kcenters_2():
    # some data at (0,0), some data at (1,1) and some data at (0.5, 0.5)
    data = [np.zeros((10,2)), np.ones((10,2)), 0.5*np.ones((10,2))]

    m = KCenters(n_clusters=2, random_state=0)
    m.fit(data)

    # the centers should be [0,0], [1,1] (in either order). This
    # assumes that the random state seeded the initial center at
    # either (0,0) or (1,1). A different random state could have
    # seeded the first cluster at [0.5, 0.5]
    assert np.all(m.cluster_centers_ == np.array([[0,0], [1,1]])) or \
        np.all(m.cluster_centers_ == np.array([[1,1], [0,0]]))

    # the distances should be 0 or sqrt(2)/2
    eq(np.unique(np.concatenate(m.distances_)), np.array([0, np.sqrt(2)/2]))
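
# For reference, a minimal NumPy sketch (not the library implementation) of
# the greedy farthest-point rule that KCenters uses; it shows why the seeding
# comment above matters: the first center is the randomly seeded point.
def kcenters_sketch(X, n_clusters, seed=0):
    rng = np.random.RandomState(seed)
    centers = [X[rng.randint(len(X))]]             # random initial center
    dist = np.linalg.norm(X - centers[0], axis=1)
    for _ in range(n_clusters - 1):
        centers.append(X[np.argmax(dist)])         # farthest point becomes the next center
        dist = np.minimum(dist, np.linalg.norm(X - centers[-1], axis=1))
    centers = np.asarray(centers)
    labels = np.argmin(np.linalg.norm(X[:, None] - centers[None], axis=2), axis=1)
    return centers, labels, dist                   # dist: distance to nearest center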
Example #5
def test_kcenters_1():
    # make sure all the shapes are correct of the fit parameters
    m = KCenters(n_clusters=3)
    m.fit([np.random.randn(23,2), np.random.randn(10,2)])

    assert isinstance(m.labels_, list)
    assert isinstance(m.distances_, list)
    assert len(m.labels_) == 2
    eq(m.cluster_centers_.shape, (3,2))
    eq(m.labels_[0].shape, (23,))
    eq(m.labels_[1].shape, (10,))
    eq(m.distances_[0].shape, (23,))
    eq(m.distances_[1].shape, (10,))

    eq(m.fit_predict([np.random.randn(10, 2)])[0].shape, (10,))
    assert np.all(np.logical_not(np.isnan(m.distances_[0])))
Example #6
def cluster_features(features, clusterer, n_clusters=8):
    '''
    Input
    features : list of arrays, length n_trajs, each of shape (n_samples, n_features)

    Output
    clst : msmbuilder.cluster object, with attributes
        cluster_centers_ : (n_clusters, n_features)
        labels_          : list of arrays, each of shape (n_samples,)
    '''
    if clusterer == 'KMeans':
        from msmbuilder.cluster import KMeans
        clst = KMeans(n_clusters=n_clusters)
    elif clusterer == 'KCenters':
        from msmbuilder.cluster import KCenters
        clst = KCenters(n_clusters=n_clusters)
    elif clusterer == 'KMedoids':
        from msmbuilder.cluster import KMedoids
        clst = KMedoids(n_clusters=n_clusters)
    elif clusterer == 'MiniBatchKMeans':
        from msmbuilder.cluster import MiniBatchKMeans
        clst = MiniBatchKMeans(n_clusters=n_clusters)
    elif clusterer == 'MiniBatchKMedoids':
        from msmbuilder.cluster import MiniBatchKMedoids
        clst = MiniBatchKMedoids(n_clusters=n_clusters)
    else:
        raise ValueError('unknown clusterer: %s' % clusterer)
    clusters = clst.fit_transform(features)
    return clst
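
# Hypothetical usage of cluster_features (the data here is illustrative):
if __name__ == '__main__':
    import numpy as np
    features = [np.random.randn(100, 4), np.random.randn(80, 4)]
    clst = cluster_features(features, 'KCenters', n_clusters=8)
    print(clst.cluster_centers_.shape)       # (8, 4)
    print([l.shape for l in clst.labels_])   # [(100,), (80,)]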
Example #7
def test_kcenters_8():
    X = np.random.RandomState(1).randn(100, 2)
    X32 = X.astype(np.float32)
    X64 = X.astype(np.float64)
    m1 = KCenters(n_clusters=10, random_state=0).fit([X32])
    m2 = KCenters(n_clusters=10, random_state=0).fit([X64])

    eq(m1.cluster_centers_, m2.cluster_centers_)
    eq(m1.distances_[0], m2.distances_[0])
    eq(m1.labels_[0], m2.labels_[0])
    assert np.all(np.logical_not(np.isnan(m1.distances_[0])))
    eq(m1.predict([X32])[0], m2.predict([X64])[0])
    eq(m1.predict([X32])[0], m1.labels_[0])
    eq(float(m1.inertia_), libdistance.assign_nearest(X32, m1.cluster_centers_, "euclidean")[1])
Example #8
draw_tica_projection(resultdir, tica_trajs, 'tica_12.png', 1, 2)

#sample conformations along tIC1
print('now we are sampling representative conformations along tIC1')
plt.figure()
sampling_along_tIC(resultdir, 'samples_tic1.png', tica_trajs, trajectory_dir,
                   traj_list_array, pdb_name, 1)
print("You can use vmd to visualize the tica-dimension-tIC1.xtc file")

# In[158]:

#step 1.1: split the conformations into hundreds of microstates
#perform kCenters on the tIC subspace
#input:tICA projections, output:assignments indicating which microstate each conformation is assigned to
nMicro = 100  #specified a priori
kcenters = KCenters(n_clusters=nMicro, metric='euclidean', random_state=0)
microstate_sequences = kcenters.fit(tica_trajs)
print("output of msm:", microstate_sequences.labels_)

plt.figure()
plot_states_on_tic_space(resultdir, 'microstate.png', tica_trajs,
                         microstate_sequences.labels_, 1, 2)

# In[159]:

#plot the microstate implied timescale, which will show how many macrostates we need
plt.figure()
lag_times = range(2, 50, 2)
msm_timescales = implied_timescales(microstate_sequences.labels_,
                                    lag_times,
                                    n_timescales=10,
                                    msm=MarkovStateModel(verbose=False))  # assumed completion; the original snippet is truncated here
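
# Hedged sketch (not in the original snippet): the implied-timescale plot the
# comment above describes, assuming msm_timescales has shape
# (len(lag_times), n_timescales).
for i in range(10):
    plt.plot(list(lag_times), msm_timescales[:, i], 'o-')
plt.yscale('log')
plt.xlabel('lag time (frames)')
plt.ylabel('implied timescale (frames)')
plt.savefig(resultdir + '/microstate_implied_timescales.png')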
Example #9
import sys

import numpy as np
import mdtraj as md  # needed for md.load below
from itertools import combinations
from msmbuilder.featurizer import AtomPairsFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import KCenters  # needed for KCenters below
from sklearn.pipeline import Pipeline
from msmbuilder.example_datasets import fetch_met_enkephalin
from matplotlib import pyplot as plt
from sklearn.externals import joblib

#Featurization
t = md.load('conf.gro')
trajs = md.load('traj0.xtc', top='conf.gro')
#Ind = t.topology.select("(backbone and protein)or name 'CB'")
#trajs1=trajs.atom_slice(Ind)
print "Preparation done, now begin clustering..."
#Cluster
kcenters = KCenters(n_clusters=25, metric='rmsd').fit(trajs)
traj2 = kcenters.cluster_centers_
traj2.save_pdb('Gens_total.pdb')
sys.exit()
# note: sys.exit() above makes these per-state saves unreachable as written
for i in range(12):
    traj2[i].save_pdb('Gens%d.pdb' % i)
Example #10
          and atom.residue == ligand]
    inds_N.append(iis)
print(inds_N)
#sequences of coordinates of ligands
sequences_all = []
for this_sim in simulations:
    if use_COM:
        this_seq = util.featurize_RawPos(inds_all,this_sim,average = True)
    else:
        this_seq = util.featurize_RawPos(inds_N,this_sim)
    sequences_all.extend(this_seq)
    
seqfile = '/home/shenglan/TryMSMbuilder/output/sequences'+'_s'+str(LOAD_STRIDE)+'.out'
pickle.dump(sequences_all, open(seqfile,'wb'))
    
KC_clustering = KCenters(n_clusters = N_CLUSTER)
KC_assignments = KC_clustering.fit_predict(sequences_all)
KC_centers = KC_clustering.cluster_centers_

KM_clustering = KCenters(n_clusters = N_CLUSTER)  # note: despite the KM_ prefix, this also uses KCenters, not KMeans
KM_assignments = KM_clustering.fit_predict(sequences_all)
KM_centers = KM_clustering.cluster_centers_

KC_output_file = '/home/shenglan/TryMSMbuilder/output/KC_centers_c'+str(N_CLUSTER)+'_s'+str(LOAD_STRIDE)+'.out'
KM_output_file = '/home/shenglan/TryMSMbuilder/output/KM_centers_c'+str(N_CLUSTER)+'_s'+str(LOAD_STRIDE)+'.out'
np.savetxt(KC_output_file,KC_centers,fmt = '%10.4g')
np.savetxt(KM_output_file,KM_centers,fmt = '%10.4g')

KC_assign_file = '/home/shenglan/TryMSMbuilder/output/KC_assign_'+str(N_CLUSTER)+'_s'+str(LOAD_STRIDE)+'.out'
KM_assign_file = '/home/shenglan/TryMSMbuilder/output/KM_assign_'+str(N_CLUSTER)+'_s'+str(LOAD_STRIDE)+'.out'
pickle.dump(KC_assignments,open(KC_assign_file,'wb'))
Example #11
            this_lig.extend(md_dist.compute_distances(this_traj,[this_atom_pair]))
        distances.append(this_lig)

dist_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/dist_to_binding'\
+'_s'+str(LOAD_STRIDE)+'.out'
pickle.dump(distances,open(dist_path,'wb'))

# get N positions
sequences_all = []
for this_sim in simulations:
    this_seq = util.featurize_RawPos(inds_N,[this_sim])
    sequences_all.extend(this_seq)
seq_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/sequences'+'_s'+str(LOAD_STRIDE)+'.out'
pickle.dump(sequences_all,open(seq_path,'wb'))

clustering = KCenters(n_clusters = N_CLUSTER)
geo_assign = clustering.fit_predict(sequences_all)
centers = clustering.cluster_centers_

geo_assign_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_geoassign_c' \
+str(N_CLUSTER)+'_s'+str(LOAD_STRIDE)+'.out'
pickle.dump(geo_assign,open(geo_assign_path,'wb'))

micro_msm = MarkovStateModel(lag_time=1, reversible_type='transpose',
                             ergodic_cutoff='off',
                             verbose=True).fit(geo_assign)

msm_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_msm_c'+str(N_CLUSTER)+ \
'_s'+str(LOAD_STRIDE)+'.out'
pickle.dump(micro_msm,open(msm_path,'wb'))
Example #12
def calculate_tica_components():
    print("Calculating tICA components...")
    in_files = glob.glob("out*npy")
    loaded_files = [ np.load(filename) for filename in in_files ]
    tica = tICA(lag_time=tica_lagtime,
        n_components=int(tica_components)).fit_transform(loaded_files)
    np.save('lag_%d_comp_%d.npy' %(tica_lagtime, tica_components), tica)
    tica_data = 'data_lag_%d_comp_%d' %(tica_lagtime, tica_components)
    joblib.dump(tica, tica_data)
    data = np.load('lag_%d_comp_%d.npy' %(tica_lagtime, tica_components))

    for i in range(len(glob.glob('out*npy'))): # extract the four tICA components (tica_1..tica_4 are module-level lists defined elsewhere)
        for j in range(len(data[i])):
            tica_1.append(data[i][j][0])
            tica_2.append(data[i][j][1])
            tica_3.append(data[i][j][2])
            tica_4.append(data[i][j][3])

# Clustering via KCenters
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")
    sequences = clusters.fit_transform(tica)
    np.save('lag_%d_clusters_%d_sequences.npy' %(tica_lagtime, n_clusters), sequences)
    np.save('lag_%d_clusters_%d_center.npy' %(tica_lagtime, n_clusters),
        clusters.cluster_centers_)
    cluster_data = 'lag_%d_clusters_%d.pkl' %(tica_lagtime, n_clusters)
    joblib.dump(sequences, cluster_data)

 # Determining cluster populations
    print("Determining cluster populations...")
    counts = np.array([len(np.where(np.concatenate(sequences)==i)[0]) for i in range(n_clusters)]) # how many frames are in each cluster
    normalized_counts =  counts/float(counts.sum())
    percentages = [ i*100 for i in normalized_counts ]

# Plotting the tICA components
    print("Plotting tICA components with cluster centers...")
    plt.figure(0) # plotting tica_1, tica_2
    plt.hexbin(tica_1, tica_2, bins='log') #, cmap=cmaps.viridis
    x_centers = [clusters.cluster_centers_[i][0] for i in range(len(clusters.cluster_centers_))]
    y_centers = [clusters.cluster_centers_[i][1] for i in range(len(clusters.cluster_centers_))]
    plt.plot(x_centers, y_centers, 'wo')
    for label, x, y in zip(["%.4f"%i for i in percentages], x_centers, y_centers): # adds percentage contribution for each cluster
        plt.annotate(
          label,
          xy = (x, y), xytext = (-20, 20),
          textcoords = 'offset points', ha = 'right', va = 'bottom',
          bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
          arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
    plt.savefig('tica_1_2.png')
    plt.figure(1) # plotting tica_1, tica_3
    plt.hexbin(tica_1, tica_3, bins='log')
    x_centers = [clusters.cluster_centers_[i][0] for i in range(len(clusters.cluster_centers_))]
    y_centers = [clusters.cluster_centers_[i][2] for i in range(len(clusters.cluster_centers_))]
    plt.plot(x_centers, y_centers, 'wo')
    for label, x, y in zip([ "%.4f"%i for i in percentages], x_centers, y_centers):
        plt.annotate(
          label,
          xy = (x, y), xytext = (-20, 20),
          textcoords = 'offset points', ha = 'right', va = 'bottom',
          bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
          arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
    plt.savefig('tica_1_3.png')
    plt.figure(2) # plotting tica_2, tica_3
    plt.hexbin(tica_2, tica_3, bins='log')
    x_centers = [clusters.cluster_centers_[j][1] for j in range(len(clusters.cluster_centers_))]
    y_centers = [clusters.cluster_centers_[j][2] for j in range(len(clusters.cluster_centers_))]
    plt.plot(x_centers, y_centers, 'wo')
    for label, x, y in zip(["%.4f"%i for i in percentages], x_centers, y_centers):
        plt.annotate(
          label,
          xy = (x, y), xytext = (-20, 20),
          textcoords = 'offset points', ha = 'right', va = 'bottom',
          bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
          arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
    plt.savefig('tica_2_3.png')


   # Determining cluster entropy ( this yields errors for me )
    # print("Determining cluster entropy")
    # cluster_entropy = (-1.0*normalized_counts*np.log(normalized_counts)).sum()
    # np.savetxt('cluster_entropy.dat', cluster_entropy)

  
 # Determining the cluster populations and writing out PDBs for cluster centers
    print("Determining cluster populations...")
    counts = np.array([len(np.where(np.concatenate(sequences)==i)[0]) for i in range(n_clusters)]) # how many frames are in each cluster
    normalized_counts =  counts/float(counts.sum())
    np.savetxt('populations.dat', normalized_counts)
    print("Performing cluster analytics and saving center PDBs...\n")
    for i in range(len(glob.glob("traj*xtc"))):
        n_snapshots = len(clusters.distances_[i])
        cluster_indices = np.arange(n_snapshots)[ (clusters.distances_[i] < 1e-6) ] # frames that have centers
        cluster_labels = sequences[i][cluster_indices] # number of cluster
        if cluster_indices.size != 0: # print only the trajectories that have cluster centers
            for j in range(len(cluster_labels)): # for each cluster center found in this trajectory
                print('Cluster center', cluster_labels[j], 'was found in trajectory', str(i) + '.')
                print('It is found on frame', cluster_indices[j], 'and has a relative population of',
                  "%.4f"%percentages[cluster_labels[j]], '%.')

        xtcfile = sorted(glob.glob("traj*xtc"))[i]
        for j in range(len(cluster_indices)): # actually saving the snapshots
            cluster_traj = md.load_frame(xtcfile, cluster_indices[j], top='structure.gro')
            cluster_traj.save_pdb('state_%d.pdb' % (cluster_labels[j] + 1))


   # Calculating IPTs
    print("\nCalculating Implied Timescales...")
    timescales = implied_timescales(sequences, lagtimes, n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=False))
    
    implied_timescale_data = 'ipt_lag_%d_clusters_%d.pkl' %(tica_lagtime, n_clusters)
    joblib.dump(timescales, implied_timescale_data)
    numpy_timescale_data = 'lag_%d_clusters_%d_timescales.npy' %(tica_lagtime, n_clusters)
    np.savetxt('lagtimes.txt', lagtimes)
    np.save(numpy_timescale_data, timescales)
   
# Plotting IPTs (lagtimes and timescales)
    print("Plotting Implied Timescales...")
    for i in range(n_timescales):
        plt.figure(42)
        plt.plot(lagtimes * time_step, timescales[:, i] * time_step, 'o-')
        plt.yscale('log')
        plt.xlabel('lagtime (ns)')
        plt.ylabel('Implied timescales (ns)')
        plt.savefig('lag_%d_clusters_%d_.png' %(tica_lagtime, n_clusters))
Example #13
lines = f.read()
f.close()
round_num = int(lines)

## Construct and save the dataframe
parser = NumberedRunsParser(
    traj_fmt="trj-{run}.xtc",
    top_fn="/scratch/jap12009/msm/fast/try1/frame0nw_startingAPO.pdb",
    step_ps=240,
)
meta = gather_metadata("/scratch/jap12009/msm/fast/try1/trj/trj-*.xtc", parser)
save_meta(meta)

## Set up parameters for clustering
kcen = KCenters(
    n_clusters=num_clusters,
    metric='rmsd',
)

## Try to limit RAM usage
def guestimate_stride():
    total_data = meta['nframes'].sum()
    want = kcen.n_clusters * 20
    stride = max(1, total_data // want)
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          kcen.n_clusters, "clusters")
    return stride


## Fit
kcen.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
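
## Hedged follow-up (not in the original script): once fit on strided frames,
## the model can assign every frame to the fitted centers via predict().
labels = kcen.predict([traj for _, traj in itertrajs(meta, stride=1)])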
Example #14
#sequences of coordinates of ligand aromatic ring and Aps113
sequences_all = []
for this_sim in simulations:
    this_seq = util.featurize_RawPos(inds,this_sim)
    sequences_all.extend(this_seq)

#print len(sequences_all)
#print sequences_all[-1].shape

#average position of Asp113
#res_pos_ave = np.mean(res_pos_A_1[0],axis = 0)
# 
time_step = util.calc_time_step(times_path,stride = LOAD_STRIDE)
# 
clustering = KCenters(n_clusters = 10)
assignments = clustering.fit_predict(sequences_all)
centers = clustering.cluster_centers_

#print len(assignments)
#print assignments[1].shape

msm = MarkovStateModel(lag_time=180, verbose=True).fit(assignments)
countsmat = msm.countsmat_
transmat = msm.transmat_
#print np.sum(countsmat)

#np.savetxt('/home/shenglan/TryMSMbuilder/output/assignments.out',assignments, fmt = '%3.0f')
np.savetxt('/home/shenglan/TryMSMbuilder/output/countsmat.out',countsmat,fmt = '%8.4g')
np.savetxt('/home/shenglan/TryMSMbuilder/output/transmat.out',transmat,fmt = '%10.4g')
Example #15
show()

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~         MARKOV STATE MODEL     ~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

msmts0, msmts1 = {}, {}
lag_times = [1, 10, 20, 30, 40]
n_states = [4, 8, 16, 32, 64]

for n in n_states:
    msmts0[n] = []
    msmts1[n] = []
    for lag_time in lag_times:
        assignments = KCenters(n_clusters=n).fit_predict(sequences)
        msm = MarkovStateModel(lag_time=lag_time,
                               verbose=False).fit(assignments)
        timescales = msm.timescales_
        msmts0[n].append(timescales[0])
        msmts1[n].append(timescales[1])
        print('n_states=%d\tlag_time=%d\ttimescales=%s' %
              (n, lag_time, timescales[0:2]))
    print()

figure(figsize=(14, 3))

for i, n in enumerate(n_states):
    subplot(1, len(n_states), 1 + i)
    plot(lag_times, msmts0[n])
    plot(lag_times, msmts1[n])
Example #16
                         (outputdir, line))
    temp = temp[:, 0:num_tics_for_clustering]
    tica_sequences.append(temp)

TS2_ticproj_list_array = []
tica_TS2_sequences = []
for line in open("ticproj_TS2"):
    TS2_ticproj_list_array.append(line.strip())
    temp1 = numpy.loadtxt("%s/TS2_project_onto_GS_tics/%s_ticproj.txt" %
                          (outputdir, line.strip()))
    temp1 = temp1[:, 0:num_tics_for_clustering]
    tica_TS2_sequences.append(temp1)

tmp_counter = 0

kcenters = KCenters(n_clusters=nMicro)
#kcenters = KCenters(n_clusters=num_tics_for_clustering)        # Fr :)

kcenters_sequences = kcenters.fit_predict(
    tica_sequences)  #here it is ground state tica sequences

print "begin to plot the microstate implied timescale into the objective dir"
#plot implied timescale

lag_times = range(10, 100, 10)
#adjust variables
n_timescales = 5  #adjust variables

msm_timescales = implied_timescales(kcenters_sequences,
                                    lag_times,
                                    n_timescales=n_timescales,
                                    msm=MarkovStateModel(verbose=False))  # assumed completion; the original snippet is truncated here
Example #17
                (fold, tica_correlation_time), train_data_projection,
                test_data_projection, 1, 2)
            plt.figure()
            draw_tica_projection_cross_validation(
                sub_resultdir,
                'Fold_%d_tica_lagtime_%d_train_data_proj_tIC13.png' %
                (fold, tica_correlation_time), train_data_projection,
                test_data_projection, 1, 3)

            for n_tics in n_tics_range:
                for n_Micro in n_Micro_range:
                    print("parameters: fold-", fold, ',tica_lagtime-',
                          tica_correlation_time, ',n_tics-', n_tics,
                          ',n_Micro-', n_Micro)
                    kcenters = KCenters(n_clusters=n_Micro,
                                        metric='euclidean',
                                        random_state=0)
                    kcenters.fit(train_data_projection)
                    train_data_sequence = kcenters.predict(
                        train_data_projection)
                    test_data_sequence = kcenters.predict(test_data_projection)
                    msm = MarkovStateModel(
                        n_timescales=3,
                        lag_time=100,
                        reversible_type='transpose',
                        verbose=False,
                        sliding_window=True,
                        ergodic_cutoff='on')  #the parameters may change
                    msm.fit(train_data_sequence)
                    train_score = msm.score(train_data_sequence)
                    test_score = msm.score(test_data_sequence)
Example #18
def test_kcenters_5():
    # euclidean and sqeuclidean are related by a monotonic transform, so
    # the greedy center choices and the assignments should be identical
    model1 = KCenters(n_clusters=10, random_state=0, metric='euclidean')
    model2 = KCenters(n_clusters=10, random_state=0, metric='sqeuclidean')

    data = np.random.RandomState(0).randn(100, 2)
    eq(model1.fit_predict([data])[0], model2.fit_predict([data])[0])
Example #19
        distances.append(this_lig)

dist_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/dist_to_binding'\
+'_s'+str(LOAD_STRIDE)+'.out'
pickle.dump(distances, open(dist_path, 'wb'))

# get N positions
sequences_all = []
for this_sim in simulations:
    this_seq = util.featurize_RawPos(inds_N, [this_sim])
    sequences_all.extend(this_seq)
seq_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/sequences' + '_s' + str(
    LOAD_STRIDE) + '.out'
pickle.dump(sequences_all, open(seq_path, 'wb'))

clustering = KCenters(n_clusters=N_CLUSTER)
geo_assign = clustering.fit_predict(sequences_all)
centers = clustering.cluster_centers_

geo_assign_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_geoassign_c' \
+str(N_CLUSTER)+'_s'+str(LOAD_STRIDE)+'.out'
pickle.dump(geo_assign, open(geo_assign_path, 'wb'))

micro_msm = MarkovStateModel(lag_time=1,
                             reversible_type='transpose',
                             ergodic_cutoff='off',
                             verbose=True).fit(geo_assign)

msm_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_msm_c'+str(N_CLUSTER)+ \
'_s'+str(LOAD_STRIDE)+'.out'
pickle.dump(micro_msm, open(msm_path, 'wb'))
Example #20
traj_list_array = []
for line in open("trajlist"):
    traj_list_array.append(line.strip())
print(traj_list_array)

dataset = []
for trajfile in traj_list_array:
    t = md.load(xtc_file_dir + trajfile,
                top='test.pdb',
                atom_indices=select_atoms)
    dataset.append(t)
print(dataset)
#ww: check whether the trajectories have been aligned w.r.t. the reference

kcenters = KCenters(n_clusters=nMicro, metric='rmsd', random_state=0)

kcenters_sequences = kcenters.fit(dataset)

out_assignment_dir = 'Microassignment/'
out_kcenters_distances_dir = 'distances/'
os.system("mkdir %s" % (out_assignment_dir))
os.system("mkdir %s" % (out_kcenters_distances_dir))

tmp_counter = 0
for ifile in traj_list_array:
    numpy.savetxt("%s/%s_assignment_.txt" % (out_assignment_dir, ifile[:-4]),
                  kcenters.labels_[tmp_counter],
                  fmt='%d')
    numpy.savetxt("%s/%s_distances_.txt" %
                  (out_kcenters_distances_dir, ifile[:-4]),
Example #21
pp.plot(lag_times, msm_timescales[:, 0], 'o-')
pp.plot(lag_times, msm_timescales[:, 1], 'o-')
pp.plot(lag_times, msm_timescales[:, 2], 'o-')
pp.title('Discrete-time MSM Relaxation Timescales')
pp.semilogy()
pp.show()

#ctmsm_timescales = implied_timescales(kmeanslabel, lag_times, n_timescales=n_timescales, msm=ContinuousTimeMSM(verbose=False))

#X_scaled =  preprocessing.normalize(npdata_filtered)

#sequences2=list(np.transpose(np.reshape(npdata_filtered2[:,3].astype(int),(-1,1))))

#####K-Centers-Clustering####
#############################
cluster = KCenters(metric='euclidean', n_clusters=4)

#sequences = cluster.fit_transform(seq)
#for item in sequences:
#	print (item)
'''
kmeans = KMeans(n_clusters=4,random_state=0).fit_transform(npdata_filtered)   #states from kmeans
kmeanslabel=kmeans.labels_.tolist()


########Time scale calculations
lag_times=list(range(1, 100,2))
n_timescales=10

msm_timescales = implied_timescales(sequences, lag_times, n_timescales=n_timescales, msm=MarkovStateModel(verbose=False))
Example #22
def compute_tica_components():
          
    '''Load in the features, calculate a given number of tICA components (tica_components) given a
       lagtime (lag_time), and save tICA coordinates and eigenvector data. It then creates and populates
       a list for each desired component, clusters the data, saving normalized populations as populations.dat
       and saving each cluster center as a .pdb. tICA plots are created and saved, and implied timescales are
       calculated, saved, and plotted.
    '''
        
    verbose = False
    save_pdb = True
    color_by = 'cluster'
    
    if verbose:
        print("\nCalculating tICA components...")
    if not os.path.exists(project_title + '/tica_%d'%n_clusters):
        os.mkdir(project_title + '/tica_%d'%n_clusters)
    
    # load in feature files and determine indices of unbiased ensembles
    feature_files = []
    for i in range(runs):
        run_files = sorted(glob.glob(project_title + '/features/' + "P*R%d_*npy"%i))  # project_title prefix assumed, matching the paths used below
        feature_files += run_files
        if i in unbiased_runs:
            unbiased_indices = [len(feature_files) - len(run_files),len(feature_files)]
    features = [np.load(x) for x in feature_files]
    
    # perform tICA calculation and extract score / eigenvectors
    tica_coordinates = tICA(lag_time=tica_lagtime,
        n_components=int(n_components)).fit_transform(features)
    tica_components = tICA(lag_time=tica_lagtime,
        n_components=int(n_components)).fit(features)
    eigenvectors = np.transpose(tica_components.eigenvectors_)
    tica_score = tica_components.score(features)
          
    np.save('%s/tica_%d/tica_coords-lag_%d-comp_%d.npy' %(
        project_title, n_clusters, tica_lagtime, n_components), tica_coordinates)
    np.save('%s/tica_%d/tica_comps-lag_%d-comp_%d.npy' %(
        project_title, n_clusters, tica_lagtime, n_components), tica_components)
    
    # Perform clustering based on the cluster_method parameter.
    if verbose:
        print('Clustering via %s'%cluster_method)
    if cluster_method == 'kcenters':
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        clusters = KMeans(n_clusters)
    elif cluster_method == 'kmedoids':
        clusters = KMedoids(n_clusters)
    else:
        sys.exit('Invalid cluster_method. Use kcenters/kmeans/kmedoids.')
        
    # Cluster unbiased data and fit biased data to these centers
    new_assignments = []
    sequences = clusters.fit_transform(tica_coordinates[unbiased_indices[0]:unbiased_indices[1]])
    for i in tqdm.tqdm_notebook(range(unbiased_indices[0])):
        tica_traj = tica_coordinates[i]
        if isinstance(tica_traj, np.ndarray):
            if not (tica_traj.dtype == 'float32' or tica_traj.dtype == 'float64'):
                tica_traj = tica_traj.astype('float64')
        labels, inertia = msmbuilder.libdistance.assign_nearest(
            tica_traj, clusters.cluster_centers_, metric='euclidean')
        new_assignments.append(labels)

    new_assignments += sequences # tack the unbiased assignments back on to the end.


    np.save('%s/tica_%d/lag_%d_clusters_%d_assignments.npy' %(
        project_title, n_clusters, tica_lagtime, n_clusters), new_assignments)
    np.save('%s/tica_%d/lag_%d_clusters_%d_center.npy' %(
        project_title, n_clusters, tica_lagtime, n_clusters), clusters.cluster_centers_)

    # Determine cluster populations, normalize the counts, and save as percentages for
    # labeling if a cluster contains more than cluster_percentage_cutoff percent of the data.
    # Finally, save normalized counts.
    
    if verbose:
        print("\nDetermining cluster populations...")
    if not os.path.exists('%s/tica_%d/%s_clusters'%(project_title,n_clusters,cluster_method)):
        os.mkdir('%s/tica_%d/%s_clusters'%(project_title,n_clusters,cluster_method))
    if not os.path.exists('%s/tica_%d/plots'%(project_title,n_clusters)):
        os.mkdir('%s/tica_%d/plots'%(project_title,n_clusters))
        
    counts = np.array([len(np.where(np.concatenate(sequences)==i)[0]) for i in range(n_clusters)])
    normalized_counts =  counts/float(counts.sum())
    percentages = [ i*100 for i in normalized_counts ]
    population_labels = [ [i,"%.2f"%percentages[i]] for i in range(len(percentages)) if percentages[i] > cluster_percentage_cutoff ]
    np.savetxt('%s/tica_%d/%s_clusters/populations.dat'
               %(project_title,n_clusters,cluster_method), normalized_counts)

    # Plot all unique combinations of tICA components
    if verbose:
        print("\nPlotting tICA components...")
    tica_coordinates = np.concatenate(tica_coordinates)
    new_assignments = np.concatenate(new_assignments)
    cluster_colors = matplotlib.cm.rainbow(np.linspace(0,1,n_clusters))
    for j in tqdm.tqdm_notebook(range(len(all_ticas)),leave=False): # For each pair
        if all_ticas[j][0] < all_ticas[j][1]:
            plt.figure(j, figsize=(20,16))
            tICx, tICy = all_ticas[j][0]-1, all_ticas[j][1]-1
            plt.hexbin(tica_coordinates[:,tICx],tica_coordinates[:,tICy], bins='log')
            for l in tqdm.tqdm(range(len(tica_coordinates))[::stride*2]):
                if color_by == 'cluster':
                    plt.plot(tica_coordinates[l][tICx], tica_coordinates[l][tICy],
                        color=cluster_colors[new_assignments[l]], linestyle="", marker="o")
            x_centers = [clusters.cluster_centers_[i][tICx] for i in range(len(clusters.cluster_centers_))]
            y_centers = [clusters.cluster_centers_[i][tICy] for i in range(len(clusters.cluster_centers_))]
            high_pop_x_centers = [ x_centers[i] for i in range(len(x_centers)) if percentages[i] > cluster_percentage_cutoff ]
            high_pop_y_centers = [ y_centers[i] for i in range(len(y_centers)) if percentages[i] > cluster_percentage_cutoff ]
            plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
            plt.plot(tica_coordinates[:,tICx][0],tica_coordinates[:,tICy][0], color='k', marker='*',markersize=24)
            plt.xlabel('tIC'+str(all_ticas[j][0]))
            plt.ylabel('tIC'+str(all_ticas[j][1]))
            plt.title(project_title)
            # Add labels for high-population cluster centers
            for label, x, y in zip(population_labels, high_pop_x_centers, high_pop_y_centers):
                plt.annotate(
                  label,
                  xy = (x, y), xytext = (-15, 15),
                  textcoords = 'offset points', ha = 'right', va = 'bottom',
                  bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
                  arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
            plt.savefig('%s/tica_%d/plots/tica_%d_%d.png'%(project_title,n_clusters,
                all_ticas[j][0], all_ticas[j][1]))
            plt.close()

    # Write out PDBs for each cluster center
    if verbose:
        print("Performing cluster analytics and saving center PDBs...\n")
    if save_pdb:
        trajectory_files, feature_files, cluster_features = [],[],[]
        for run in range(runs): # get only xtc files that correlate to cluster-center features
            trajectory_files += [re.sub('features',
                                    'traj_data/RUN%d'%run,re.sub('npy','xtc',x)
                                     ) for x in sorted(glob.glob('%s/features/*R%d_*npy'%(
                                        project_title,run)))]
            feature_files += sorted(glob.glob('%s/features/*R%d_*npy'%(project_title,run)))

        for i in tqdm.tqdm_notebook(range(len(trajectory_files)),leave=False):

                n_snapshots = len(clusters.distances_[i])

                # Determine frames that are cluster centers
                cluster_indices = np.arange(n_snapshots)[ (clusters.distances_[i] < 1e-6) ]

                # Determine number of each cluster, correlates to populations.dat
                cluster_labels = sequences[i][cluster_indices]

                # Save each cluster center as a pdb
                if list(cluster_indices): # load center-containing xtcs to check length
                    xtc_len = len(md.load(trajectory_files[i],top=structure_file))
                    
                # map strided frame number back to xtc frame number
                for j in range(len(cluster_indices)):
                        frames = range(xtc_len) 
                        strided_frames = frames[equil_steps:][::stride]
                        xtc_frame = frames.index(strided_frames[cluster_indices[j]])
                        cluster_traj = md.load_frame(trajectory_files[i], xtc_frame,
                                            top=structure_file)
                        cluster_features.append(np.load(feature_files[i])[cluster_indices[j]])
                        cluster_traj.save_pdb('%s/tica_%d/%s_clusters/state_%d.pdb'
                                            %(project_title,n_clusters,cluster_method,
                                            cluster_labels[j]))
                        
                        # save cluster information
                        with open('%s/tica_%d/cluster.dat'%(project_title,n_clusters),'w') as f:
                            f.write('\nSuccessfully saved PDB for cluster: %d, (rel.pop: %.3f)'%(
                                cluster_labels[j],percentages[cluster_labels[j]]))
                            f.write('traj_file: %s (%d/%d)'%(trajectory_files[i],i,len(features)))
                            f.write('frame: %d (%d/%d centers from this trajectory)'%(
                                cluster_indices[j],j,len(cluster_indices)))
                            f.write('strided: npy_frame/npy_len = %d/%d = %f'%(
                                cluster_indices[j],n_snapshots,cluster_indices[j]/n_snapshots))
                            f.write('re-mapped: orig_frame/xtc_len = %d/%d = %f\n'%(
                                xtc_frame,xtc_len,xtc_frame/xtc_len))
                            f.close()
                        
        # save features corresponding to each cluster center
        np.save('%s/tica_%d/cluster_features.npy'%(project_title,n_clusters),cluster_features)
                    
    return tica_score
Example #23
    def calculate_tica_components(self, cluster_method, calculate_strides=False, feats=None):
          
        '''Load in the features, calculate a given number of tICA components (tica_components) given a
        lagtime (lag_time), and save tICA coordinates and eigenvector data. It then creates and populates
        a list for each desired component, clusters the data, saving normalized populations as populations.dat
        and saving each cluster center as a .pdb. tICA plots are created and saved, and implied timescales are
        calculated, saved, and plotted.
        '''
	
        # tICA parameters
        tica_lagtime = 10 # determine from implied timescales
        tica_components = 8 # how many tICs to compute
        n_clusters = 100 # denotes number of microstates
        n_timescales = tica_components # plot all eigenvalues --> timescales
        md_time_step = 0.02 # ns
        subsampled_time_step = 1. # ns multiplier of timescales and lagtimes in implied timescale plot
        stride = int(subsampled_time_step / md_time_step)  #time step stride for sub-sampling
        equil_time = 1. # ns
        equil_steps = 1 # int(equil_time / md_time_step): time steps to be removed from the start
        lagtimes = np.array([1,2,4,8,16,32,64,128,256,512,1024])
        cluster_method = 'kcenters' # 'kcenters/kmeans'
        all_ticas = list(itertools.permutations(range(1,tica_components+1), 2)) # all combinations
        all_ticas = [[1,2]] # override: just show analysis for first two components
        cluster_percentage_cutoff = 5 # clusters with a relative population less than this
                                  # number will not be labeled on plot i.e. 0 : all clusters labeled
        verbose = False

        print("\nCalculating tICA components...")
        
        # Load in feature files THIS WILL NEED TO BE CHANGED
        if feats is None:
            if calculate_strides:
                self.calculate_stride_distances(stride, equil_steps)
                data = np.load('/home/server/git/fah-scripts/DataAnalysisScripts/stride_dist/stride_dist_%d.npy' % self.proj_num)
            else:
                data = self.data
        else:
            data = np.load(feats)

        features = []
        for run in data:
            for clone in run:
                gen_seq = []
                for gen in clone:
                    if gen is not None and gen[0] is not None:
                        if calculate_strides or feats is not None:
                            gen_seq.append(gen)
                        else:
                            gen_seq.append(gen[::stride])
                if len(gen_seq) > 0:
                    gen_cat = np.concatenate(gen_seq)
                    if calculate_strides:
                        features.append(gen_cat)
                    else:
                        features.append(gen_cat[equil_steps:])
        features = np.asarray(features)
        print(features.shape)
        print(features[0].shape)
        tica_coordinates = tICA(lag_time=tica_lagtime,
            n_components=int(tica_components)).fit_transform(features)
      
        np.save('%s/lag_%d_coord_%d.npy' %(self.tICA_dir, tica_lagtime, tica_components), tica_coordinates)
          
        # Initiate and populate an array for each component    
        for i in range(tica_components):
            exec('tica_' + str(i+1) + ' = []')
          
        for i in tqdm.tqdm(range(len(features))):
            for j in range(len(tica_coordinates[i])):
                for k in range(tica_components):
                    exec('tica_' + str(k+1) + '.append(tica_coordinates[i][j][k])')
            
        # Perform clustering based on the cluster_method parameter.
        if cluster_method == 'kcenters':
            print("Clustering via KCenters...")
            clusters = KCenters(n_clusters)
        elif cluster_method == 'kmeans':
            print("Clustering via KMeans...")
            clusters = KMeans(n_clusters)
        else:
            sys.exit("Invalid cluster_method. Use kmeans or kcenters.")
        
        # Determine cluster assignment for each frame.      
        sequences = clusters.fit_transform(tica_coordinates)
	
        np.save('%s/lag_%d_clusters_%d_sequences.npy' %(self.tICA_dir, tica_lagtime, n_clusters), sequences)
        np.save('%s/lag_%d_clusters_%d_center.npy' %(self.tICA_dir, tica_lagtime, n_clusters),
        clusters.cluster_centers_)

        # Determine cluster populations, normalize the counts, and save as percentages for
        # labeling if a cluster contains more than cluster_percentage_cutoff percent of the data.
        # Finally, save normalized counts.
        print("\nDetermining cluster populations...")
    
        if not os.path.exists('%s/cluster_centers' % self.tICA_dir):
            os.makedirs('%s/cluster_centers' % self.tICA_dir)
        counts = np.array([len(np.where(np.concatenate(sequences)==i)[0]) for i in range(n_clusters)])
        normalized_counts =  counts/float(counts.sum())
        percentages = [ i*100 for i in normalized_counts ]
        population_labels = [ [i,"%.2f"%percentages[i]] for i in range(len(percentages)) if percentages[i] > cluster_percentage_cutoff ]
        np.savetxt('%s/cluster_centers/populations.dat' % self.tICA_dir, normalized_counts)
	

        # Plot all unique combinations of tICA components
        print("\nPlotting tICA components with cluster centers...")
        all_ticas = list(itertools.permutations(range(1,tica_components+1), 2))
        for j in tqdm.tqdm(range(len(all_ticas))): # For each pair
            if all_ticas[j][0] < all_ticas[j][1]:
                plt.figure(j, figsize=(20,16))
                plt.hexbin(eval("tica_"+str(all_ticas[j][0])), eval("tica_"+str(all_ticas[j][1])), bins='log')
                x_centers = [clusters.cluster_centers_[i][all_ticas[j][0]-1] for i in range(len(clusters.cluster_centers_))]
                y_centers = [clusters.cluster_centers_[i][all_ticas[j][1]-1] for i in range(len(clusters.cluster_centers_))]
                high_pop_x_centers = [ x_centers[i] for i in range(len(x_centers)) if percentages[i] > cluster_percentage_cutoff ]
                high_pop_y_centers = [ y_centers[i] for i in range(len(y_centers)) if percentages[i] > cluster_percentage_cutoff ]
                plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
                plt.plot(eval("tica_"+str(all_ticas[j][0])+'[0]'), eval("tica_"+str(all_ticas[j][1])+'[0]'), color='k', marker='*',markersize=24)
                plt.xlabel('tic'+str(all_ticas[j][0]))
                plt.ylabel('tic'+str(all_ticas[j][1]))
                plt.title(self.proj_num)
                # Add labels for high-population cluster centers
                for label, x, y in zip(population_labels, high_pop_x_centers, high_pop_y_centers):
                    plt.annotate(
                      label,
                      xy = (x, y), xytext = (-15, 15),
                      textcoords = 'offset points', ha = 'right', va = 'bottom',
                      bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
                      arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
                plt.savefig('%s/tica_' % (self.tICA_dir) +str(all_ticas[j][0])+'_'+str(all_ticas[j][1])+'.png')
                plt.close()
        
###########################################################################
        for filename in os.listdir(self.tICA_dir + '/cluster_centers'):
            if filename.endswith('.pdb'):
                os.remove(self.tICA_dir + '/cluster_centers/' + filename)
        # Write out PDBs for each cluster center
        print("Performing cluster analytics and saving center PDBs...\n")
        runs, clones, gens = data.shape[0], data.shape[1], data.shape[2]
        x, y, z = 0, 0, 0
        for i in range(len(features)):
            if i % clones == 0 and i != 0:
                x += 1
            if i % gens == 0:
                y = 0
            n_snapshots = len(clusters.distances_[i])

            # Determine frames that are cluster centers
            cluster_indices = np.arange(n_snapshots)[ (clusters.distances_[i] < 1e-6) ]
            # Determine number of each cluster, correlates to populations.dat
            cluster_labels = sequences[i][cluster_indices]
            # Save each cluster center as a pdb
            if list(cluster_indices): # load center-containing xtcs to check length
                traj_cat = []
                print('x: %d, y: %d, z: %d' % (x, y, z))

                while True:
                    try:
                        traj = base_dir + 'PROJ%s/RUN%s/CLONE%s/results%s/traj_comp.xtc' % (self.proj_num, x, y, z)
                        traj_cat.append(md.load(traj, top=self.gro_file))
                        z += 1
                    except:
                        break
                if len(traj_cat) > 0:
                    trajectory_file = md.join(traj_cat)
                xtc_len = len(trajectory_file)
            y += 1
            z = 0
            for j in range(len(cluster_indices)):
                frames = range(xtc_len) # map the strided frame number back to xtc frame number
                strided_frames = frames[equil_steps:][::stride]  
                xtc_frame = frames.index(strided_frames[cluster_indices[j]])
                cluster_traj = trajectory_file[xtc_frame]
                cluster_traj.save_pdb('%s/cluster_centers/state_%d_%.3f.pdb'%(self.tICA_dir, cluster_labels[j],percentages[cluster_labels[j]]))
                if verbose:
                    print('Successfully saved PDB for cluster: %d, (rel.pop: %.3f)'%(cluster_labels[j],percentages[cluster_labels[j]]))
                    print('traj_file: %s (%d/%d)'%(trajectory_file,i,len(features)))
                    print('frame: %d (%d/%d centers from this trajectory)'%(cluster_indices[j],j,len(cluster_indices)))
                    print('strided: npy_frame/npy_len = %d/%d = %f'%(cluster_indices[j],n_snapshots,cluster_indices[j]/n_snapshots))
                    print('re-mapped: orig_frame/xtc_len = %d/%d = %f\n'%(xtc_frame,xtc_len,xtc_frame/xtc_len))
Example #24
import sys
"""
xyz = dataset('../xtc/*.xtc', topology = '~/Desktop/tica-projection/Structures/Reference-PRE.pdb')
list1=np.loadtxt('atompairs-5pairs-5helix-P')
featurizer = AtomPairsFeaturizer(pair_indices=list1)

ticadist = xyz.fit_transform_with(featurizer, 'atompairsfeaturizer/', fmt='dir-npy')

#ticadist =dataset('../atompairsfeaturizer/',mode='r',fmt='dir-npy',verbose=True)
tica_model=tICA(lag_time=400,n_components=2)
tica_model=ticadist.fit_with(tica_model)
tica_trajs = ticadist.transform_with(tica_model, 'tica/',fmt='dir-npy')
"""
tica_trajs=dataset('./tica',mode='r',fmt='dir-npy',verbose=True)
txx = np.concatenate(tica_trajs)
clusterer = KCenters(n_clusters=1000,random_state=8)
#clusterer = dataset('./cluster',mode='r',fmt='dir-npy',verbose=True)
clustered_trajs = tica_trajs.fit_transform_with(clusterer, 'cluster-test/', fmt='dir-npy')

"""
from msmbuilder.msm import MarkovStateModel, implied_timescales
data=dataset('cluster',mode='r',fmt='dir-npy',verbose=True)
lag_times=range(100,1300,100)
msm_timescales = implied_timescales(data, lag_times, n_timescales=10,msm=MarkovStateModel(lag_time=250,reversible_type='transpose',ergodic_cutoff='off'))
np.savetxt('msm_timescales_2.txt',msm_timescales)


#data=np.loadtxt('frame')
#data1=np.loadtxt('frame-2211-2216')
txx = np.concatenate(tica_trajs)
plt.hexbin(txx[:,0], txx[:,1],bins='log', mincnt=0.1, cmap='viridis')
Example #25
def test_2():
    # Test that PCA works in an msmbuilder pipeline

    p = Pipeline([('pca', PCA()), ('cluster', KCenters())])
    p.fit(trajs)
Example #26
    inds_N.append(iis)
print(inds_N)
#sequences of coordinates of ligands
sequences_all = []
for this_sim in simulations:
    if use_COM:
        this_seq = util.featurize_RawPos(inds_all, this_sim, average=True)
    else:
        this_seq = util.featurize_RawPos(inds_N, this_sim)
    sequences_all.extend(this_seq)

seqfile = '/home/shenglan/TryMSMbuilder/output/sequences' + '_s' + str(
    LOAD_STRIDE) + '.out'
pickle.dump(sequences_all, open(seqfile, 'wb'))

KC_clustering = KCenters(n_clusters=N_CLUSTER)
KC_assignments = KC_clustering.fit_predict(sequences_all)
KC_centers = KC_clustering.cluster_centers_

KM_clustering = KCenters(n_clusters=N_CLUSTER)  # note: despite the KM_ prefix, this also uses KCenters, not KMeans
KM_assignments = KM_clustering.fit_predict(sequences_all)
KM_centers = KM_clustering.cluster_centers_

KC_output_file = '/home/shenglan/TryMSMbuilder/output/KC_centers_c' + str(
    N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out'
KM_output_file = '/home/shenglan/TryMSMbuilder/output/KM_centers_c' + str(
    N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out'
np.savetxt(KC_output_file, KC_centers, fmt='%10.4g')
np.savetxt(KM_output_file, KM_centers, fmt='%10.4g')

KC_assign_file = '/home/shenglan/TryMSMbuilder/output/KC_assign_' + str(