Example #1
def test_kcenters_2():
    # some data at (0,0), some data at (1,1) and some data at (0.5, 0.5)
    data = [np.zeros((10,2)), np.ones((10,2)), 0.5*np.ones((10,2))]

    m = KCenters(n_clusters=2, random_state=0)
    m.fit(data)

    # the centers should be [0,0], [1,1] (in either order). This
    # assumes that the random state seeded the initial center at
    # either (0,0) or (1,1). A different random state could have
    # seeded the first cluster at [0.5, 0.5]
    assert np.all(m.cluster_centers_ == np.array([[0,0], [1,1]])) or \
        np.all(m.cluster_centers_ == np.array([[1,1], [0,0]]))

    # the distances should be 0 or sqrt(2)/2
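    # (each point at (0.5, 0.5) is sqrt(0.5**2 + 0.5**2) = sqrt(2)/2 away from
    # whichever center it is assigned to)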
    eq(np.unique(np.concatenate(m.distances_)), np.array([0, np.sqrt(2)/2]))
Example #2
def test_kcenters_1():
    # make sure all the shapes are correct of the fit parameters
    m = KCenters(n_clusters=3)
    m.fit([np.random.randn(23,2), np.random.randn(10,2)])

    assert isinstance(m.labels_, list)
    assert isinstance(m.distances_, list)
    assert len(m.labels_) == 2
    eq(m.cluster_centers_.shape, (3,2))
    eq(m.labels_[0].shape, (23,))
    eq(m.labels_[1].shape, (10,))
    eq(m.distances_[0].shape, (23,))
    eq(m.distances_[1].shape, (10,))

    eq(m.fit_predict([np.random.randn(10, 2)])[0].shape, (10,))
    assert np.all(np.logical_not(np.isnan(m.distances_[0])))
Example #3
#sample representative conformations along tIC1
print('now we are sampling representative conformations along tIC1')
plt.figure()
sampling_along_tIC(resultdir, 'samples_tic1.png', tica_trajs, trajectory_dir,
                   traj_list_array, pdb_name, 1)
print("You can use vmd to visualize the tica-dimension-tIC1.xtc file")

# In[158]:

#step 1.1: split the conformations into hundreds of microstates
#perform KCenters clustering on the tIC subspace
#input: tICA projections; output: assignments indicating which microstate each conformation belongs to
nMicro = 100  #specified a priori
kcenters = KCenters(n_clusters=nMicro, metric='euclidean', random_state=0)
microstate_sequences = kcenters.fit(tica_trajs)
print("output of msm:", microstate_sequences.labels_)

plt.figure()
plot_states_on_tic_space(resultdir, 'microstate.png', tica_trajs,
                         microstate_sequences.labels_, 1, 2)

# In[159]:

#plot the microstate implied timescales, which indicate how many macrostates we need
plt.figure()
lag_times = range(2, 50, 2)
msm_timescales = implied_timescales(microstate_sequences.labels_,
                                    lag_times,
                                    n_timescales=10,
                                    msm=MarkovStateModel(
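# The implied_timescales call above is cut off in this listing; a self-contained
# sketch of the same step follows (the MarkovStateModel options and the output
# filename are assumptions, not the original values).
import os
import matplotlib.pyplot as plt
from msmbuilder.msm import MarkovStateModel, implied_timescales

lag_times = range(2, 50, 2)
msm_timescales = implied_timescales(microstate_sequences.labels_,
                                    lag_times,
                                    n_timescales=10,
                                    msm=MarkovStateModel(reversible_type='transpose',
                                                         verbose=False))
for i in range(10):
    plt.plot(lag_times, msm_timescales[:, i])  # one curve per implied timescale
plt.yscale('log')
plt.xlabel('lag time')
plt.ylabel('implied timescale')
plt.savefig(os.path.join(resultdir, 'implied_timescales.png'))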
Example #4
#tica_trajs is the input data for your DBSCAN (or other) clustering

for j in range(len(traj_list_array)):
    np.savetxt('tica_projections/%s_tica.txt' % (traj_list_array[j][:-4]),
               tica_trajs[j][:, 0:4])  #we save the top tica projections

###after you have the tICA projections (tica_trajs), you can run clustering such as DBSCAN or KCenters
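###a minimal DBSCAN sketch on the same projections (the eps/min_samples values
###below are placeholders, not tuned for this system):
import numpy as np
from sklearn.cluster import DBSCAN

X = np.concatenate([traj[:, 0:4] for traj in tica_trajs])   #pool frames from all trajectories
db_labels = DBSCAN(eps=0.5, min_samples=10).fit_predict(X)  #-1 marks noise frames
#split the pooled labels back into one label array per trajectory
lengths = [len(traj) for traj in tica_trajs]
db_labels_per_traj = np.split(db_labels, np.cumsum(lengths)[:-1])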

exit()

#########################below: kcenters to get the microstates
nMicro = 100
kcenters = KCenters(n_clusters=nMicro, metric='euclidean', random_state=0)

kcenters_sequences = kcenters.fit(tica_trajs)

out_assignment_dir = 'Microassignment/'
os.system("mkdir %s" % (out_assignment_dir))

tmp_counter = 0
for ifile in traj_list_array:
    np.savetxt("%s/%s_assignment_.txt" % (out_assignment_dir, ifile[:-4]),
               kcenters.labels_[tmp_counter],
               fmt='%d')
    tmp_counter += 1

exit()
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump
Example #5
for line in open("trajlist"):
    traj_list_array.append(line.strip())
print(traj_list_array)

dataset = []
for trajfile in traj_list_array:
    t = md.load(xtc_file_dir + trajfile,
                top='test.pdb',
                atom_indices=select_atoms)
    dataset.append(t)
print(dataset)
#ww: check whether the trajectories have been aligned w.r.t. the reference
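#a minimal alignment sketch (an assumption, not part of the original script):
#superpose every trajectory onto the first frame of the first one before clustering.
#KCenters with metric='rmsd' already performs an optimal superposition when it
#computes distances, so this is mainly useful for visual inspection.
reference = dataset[0][0]
for t in dataset:
    t.superpose(reference)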

kcenters = KCenters(n_clusters=nMicro, metric='rmsd', random_state=0)

kcenters_sequences = kcenters.fit(dataset)

out_assignment_dir = 'Microassignment/'
out_kcenters_distances_dir = 'distances/'
os.system("mkdir %s" % (out_assignment_dir))
os.system("mkdir %s" % (out_kcenters_distances_dir))

tmp_counter = 0
for ifile in traj_list_array:
    numpy.savetxt("%s/%s_assignment_.txt" % (out_assignment_dir, ifile[:-4]),
                  kcenters.labels_[tmp_counter],
                  fmt='%d')
    numpy.savetxt("%s/%s_distances_.txt" %
                  (out_kcenters_distances_dir, ifile[:-4]),
                  kcenters.distances_[tmp_counter],
                  fmt='%18.5f')
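#optionally, the cluster-center conformations themselves can be written out; this
#is a sketch (not part of the original script) using cluster_ids_, which stores a
#(trajectory index, frame index) pair for each center. The output filename is a
#placeholder.
import mdtraj as md
center_frames = md.join([dataset[ti][fi] for ti, fi in kcenters.cluster_ids_],
                        check_topology=False)
center_frames.save('kcenters_centers.xtc')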
Example #6
            plt.figure()
            draw_tica_projection_cross_validation(
                sub_resultdir,
                'Fold_%d_tica_lagtime_%d_train_data_proj_tIC13.png' %
                (fold, tica_correlation_time), train_data_projection,
                test_data_projection, 1, 3)

            for n_tics in n_tics_range:
                for n_Micro in n_Micro_range:
                    print("parameters: fold-", fold, ',tica_lagtime-',
                          tica_correlation_time, ',n_tics-', n_tics,
                          ',n_Micro-', n_Micro)
                    kcenters = KCenters(n_clusters=n_Micro,
                                        metric='euclidean',
                                        random_state=0)
                    kcenters.fit(train_data_projection)
                    train_data_sequence = kcenters.predict(
                        train_data_projection)
                    test_data_sequence = kcenters.predict(test_data_projection)
                    msm = MarkovStateModel(
                        n_timescales=3,
                        lag_time=100,
                        reversible_type='transpose',
                        verbose=False,
                        sliding_window=True,
                        ergodic_cutoff='on')  #the parameters may change
                    msm.fit(train_data_sequence)
                    train_score = msm.score(train_data_sequence)
                    test_score = msm.score(test_data_sequence)
                    f1 = open(
                        sub_resultdir +
Example #7
    metric='rmsd',
)

## Try to limit RAM usage
def guestimate_stride():
    total_data = meta['nframes'].sum()
    want = kcen.n_clusters * 20
    stride = max(1, total_data // want)
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          kcen.n_clusters, "clusters")
    return stride


## Fit
kcen.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
print(kcen.summarize())

## Save
save_generic(kcen, 'clusterer' + str(round_num) + '.pickl')


## Save centroids
def frame(traj_i, frame_i):
    # Note: the clusterer's trajectory indices are 0-based, contiguous integers so we use .iloc
    row = meta.iloc[traj_i]
    return md.load_frame(row['traj_fn'], frame_i, top=row['top_fn'])


centroids = md.join((frame(ti, fi) for ti, fi in kcen.cluster_ids_),
                    check_topology=False)
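## A natural follow-up (not shown in the listing; the filename is a placeholder):
centroids.save('centroids-round' + str(round_num) + '.xtc')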