def test_3():
    """A hand-written Euclidean callable must reproduce metric='euclidean'."""

    def euclidean_callable(target, ref, i):
        # Euclidean distance between `target` and `ref[i]`, reduced over axis 1.
        deltas = target - ref[i]
        return np.sqrt(np.sum(deltas**2, axis=1))

    shared = dict(n_clusters=10, n_landmarks=20)
    by_name = LandmarkAgglomerative(metric='euclidean', **shared)
    by_callable = LandmarkAgglomerative(metric=euclidean_callable, **shared)

    points = np.random.RandomState(0).randn(100, 2)
    expected = by_name.fit_predict([points])[0]
    observed = by_callable.fit_predict([points])[0]
    eq(expected, observed)
Example #2
0
def test_agglom_with_metric_msm():
    """Complete-linkage clustering with `sym_kl_divergence_msm` labels every row."""
    n_samples = 100
    distributions = [_get_random_prob_dist(4) for _ in range(n_samples)]
    flattened = np.array([dist.flatten() for dist in distributions])

    clusterer = LandmarkAgglomerative(linkage='complete',
                                      metric=sym_kl_divergence_msm,
                                      n_clusters=2)
    labels = clusterer.fit_predict([flattened])[0]
    # One label per input row.
    assert labels.shape == (100, )
Example #3
0
def test_cluster_centers():
    """`cluster_centers_` must have one 2-column row per requested cluster."""
    sequences = [random.randn(20, 2) + 10, random.randn(20, 2)]
    # The cluster count is drawn at random from [2, 7) on every run.
    n_clusters = np.random.randint(2, 7)

    clusterer = LandmarkAgglomerative(linkage='ward', n_clusters=n_clusters)
    # Only the fitted state is inspected below; the returned labels are unused.
    clusterer.fit_predict(sequences)

    print(clusterer.cluster_centers_)
    expected_shape = (n_clusters, 2)
    assert clusterer.cluster_centers_.shape == expected_shape
Example #4
0
def test_alanine_dipeptide():
    """Ward linkage with the 'rmsd' metric yields at most `n_clusters` labels.

    n_landmarks is deliberately small: per the original author's note, larger
    values make this test really slow.
    """
    n_clusters = 4
    trajectories = AlanineDipeptide().get_cached().trajectories
    clusterer = LandmarkAgglomerative(linkage='ward', metric='rmsd',
                                      n_clusters=n_clusters, n_landmarks=20)

    # First 100 entries of the first cached trajectory.
    subset = trajectories[0][0:100]
    assignments = clusterer.fit_predict(subset)

    distinct_labels = np.unique(np.concatenate(assignments))
    assert len(distinct_labels) <= n_clusters
def test_2():
    """Default settings and the 'random' landmark strategy agree on easy data.

    The data are two 20-point sets, one of them shifted by +10, so this should
    be a really easy clustering problem.
    """
    blobs = [random.randn(20, 2) + 10, random.randn(20, 2)]
    n_clusters = 2

    default_model = LandmarkAgglomerative(n_clusters=n_clusters)
    random_model = LandmarkAgglomerative(n_clusters=n_clusters,
                                         landmark_strategy='random',
                                         random_state=random,
                                         n_landmarks=20)

    default_labels = default_model.fit_predict(blobs)
    random_labels = random_model.fit_predict(blobs)

    score = adjusted_rand_score(np.concatenate(default_labels),
                                np.concatenate(random_labels))
    assert score == 1.0
Example #6
0
    def _do_lumping(self):
        """Do the MVCA lumping.

        Builds a Ward-linkage ``LandmarkAgglomerative`` model from this
        object's settings, runs ``fit_transform`` on ``[self.transmat_]`` and
        stores the first element of the result in ``self.microstate_mapping_``.
        """
        clusterer_options = dict(
            linkage='ward',
            n_clusters=self.n_macrostates,
            metric=self.metric,
            n_landmarks=self.n_landmarks,
            landmark_strategy=self.landmark_strategy,
            random_state=self.random_state,
        )
        clusterer = LandmarkAgglomerative(**clusterer_options)
        self.microstate_mapping_ = clusterer.fit_transform([self.transmat_])[0]
Example #7
0
    def _do_lumping(self):
        """Do the MVCA lumping.

        Fits a Ward-linkage ``LandmarkAgglomerative`` model on
        ``[self.transmat_]``.  When ``self.fit_only`` is truthy the mapping is
        taken from the model's ``landmark_labels_``; otherwise it is the first
        element of ``transform([self.transmat_])``.  The result is stored in
        ``self.microstate_mapping_``.
        """
        clusterer_options = dict(
            linkage='ward',
            n_clusters=self.n_macrostates,
            metric=self.metric,
            n_landmarks=self.n_landmarks,
            landmark_strategy=self.landmark_strategy,
            random_state=self.random_state,
        )
        clusterer = LandmarkAgglomerative(**clusterer_options)
        clusterer.fit([self.transmat_])

        if not self.fit_only:
            self.microstate_mapping_ = clusterer.transform([self.transmat_])[0]
            return

        self.microstate_mapping_ = clusterer.landmark_labels_
def test_1():
    """fit().predict(), fit_predict() and an all-points-as-landmarks model agree."""
    sequences = [random.randn(10, 2), random.randn(10, 2)]
    n_clusters = 2
    total_points = sum(len(seq) for seq in sequences)

    base = LandmarkAgglomerative(n_clusters=n_clusters)
    all_landmarks = LandmarkAgglomerative(n_clusters=n_clusters,
                                          n_landmarks=total_points)

    # A clone fitted separately serves as the reference labelling.
    reference = clone(base).fit(sequences).predict(sequences)
    candidates = [base.fit_predict(sequences),
                  all_landmarks.fit_predict(sequences)]

    # One label array per input sequence.
    assert len(reference) == 2
    for labels in candidates:
        assert len(labels) == 2

    for labels in candidates:
        for expected, got in zip(reference, labels):
            eq(expected, got)

    assert len(np.unique(np.concatenate(reference))) == n_clusters
Example #9
0
def test_callable_metric():
    """A user-supplied Euclidean function gives the same labels as 'euclidean'."""

    def my_euc(target, ref, i):
        offsets = target - ref[i]
        return np.sqrt(np.sum(offsets ** 2, axis=1))

    by_name = LandmarkAgglomerative(metric='euclidean',
                                    n_clusters=10, n_landmarks=20)
    by_callable = LandmarkAgglomerative(metric=my_euc,
                                        n_clusters=10, n_landmarks=20)

    points = np.random.RandomState(0).randn(100, 2)
    labelings = [model.fit_predict([points])[0]
                 for model in (by_name, by_callable)]
    eq(labelings[0], labelings[1])
Example #10
0
def test_3():
    """Clustering with a callable Euclidean metric matches metric='euclidean'."""

    def callable_metric(target, ref, i):
        squared = (target - ref[i])**2
        return np.sqrt(np.sum(squared, axis=1))

    common = {'n_clusters': 10, 'n_landmarks': 20}
    builtin_model = LandmarkAgglomerative(metric='euclidean', **common)
    callable_model = LandmarkAgglomerative(metric=callable_metric, **common)

    data = np.random.RandomState(0).randn(100, 2)
    builtin_labels = builtin_model.fit_predict([data])[0]
    callable_labels = callable_model.fit_predict([data])[0]
    # Both metrics should get the same results.
    eq(builtin_labels, callable_labels)
Example #11
0
def test_2():
    """Two landmark configurations must label an easy two-set problem alike."""
    # One 20-point set shifted by +10 and one unshifted set.
    samples = [random.randn(20, 2) + 10, random.randn(20, 2)]
    n_clusters = 2

    extra_options = [
        {},
        dict(landmark_strategy='random', random_state=random, n_landmarks=20),
    ]
    models = [LandmarkAgglomerative(n_clusters=n_clusters, **extra)
              for extra in extra_options]

    per_model_labels = [model.fit_predict(samples) for model in models]
    flat_first, flat_second = (np.concatenate(labels)
                               for labels in per_model_labels)
    assert adjusted_rand_score(flat_first, flat_second) == 1.0
Example #12
0
def test_1():
    """Labels from fit().predict() and from fit_predict() must coincide.

    A second model that uses as many landmarks as there are data points is
    held to the same reference labelling.
    """
    x = [random.randn(10, 2), random.randn(10, 2)]
    n_clusters = 2
    n_points = sum(len(s) for s in x)

    plain_model = LandmarkAgglomerative(n_clusters=n_clusters)
    full_model = LandmarkAgglomerative(n_clusters=n_clusters,
                                       n_landmarks=n_points)

    reference = clone(plain_model).fit(x).predict(x)
    from_fit_predict = plain_model.fit_predict(x)
    from_full_model = full_model.fit_predict(x)

    for labelling in (reference, from_fit_predict, from_full_model):
        assert len(labelling) == 2

    for position in range(2):
        eq(reference[position], from_fit_predict[position])
    for position in range(2):
        eq(reference[position], from_full_model[position])

    distinct = np.unique(np.concatenate(reference))
    assert len(distinct) == n_clusters
Example #13
0
# `tops` is indexed by each row's 'top_fn' value inside traj_load.
tops = preload_tops(meta)
# Sum of the 'nframes' column of `meta`; used to size the landmark count.
totframes = meta['nframes'].sum()


def traj_load(irow):
    """Load the trajectory for one ``(index, row)`` pair.

    The row's ``'traj_fn'`` entry is passed to ``md.load`` together with the
    topology stored in ``tops`` under the row's ``'top_fn'`` entry.

    Returns the pair ``(index, trajectory)``.
    """
    index, record = irow
    return index, md.load(record['traj_fn'], top=tops[record['top_fn']])


# Load one trajectory per row of `meta`, keyed by the row index.
traj_dict = dict(map(traj_load, meta.iterrows()))
# Keep only trajectories with more than 1000 frames.
trajs = [traj for traj in traj_dict.values() if traj.n_frames > 1000]
print(len(trajs))
num_clust = 20
# Ward-linkage clustering under the 'rmsd' metric.  The landmark count is
# totframes / 100 truncated to an int.
# NOTE(review): totframes sums 'nframes' over all rows of `meta`, including
# trajectories dropped by the > 1000 frame filter above -- confirm intended.
cluster = LandmarkAgglomerative(n_clusters=num_clust,
                                n_landmarks=int(totframes / 100),
                                linkage='ward',
                                metric='rmsd')
ctrajs = cluster.fit_transform(trajs)

# Disabled alternative: label each trajectory via partial_predict instead of
# using the fit_transform output above.
# print('Fitting cluster labels for MSM')
# ctraj = {}
# count = 0
# for k, v in traj_dict.items():
#     print(k, count)
#     count +=1
#     ctraj[k] = cluster.partial_predict(v)
#
# ctrajs = [traj for traj in ctraj.values() if traj.shape[0] > 1000]

print('Fitting MSM')
lag = 4000
Example #14
0

def traj_load(irow):
    """Load the trajectory for one ``(index, row)`` pair.

    The row's ``'traj_fn'`` entry is passed to ``md.load`` together with the
    topology stored in ``tops`` under the row's ``'top_fn'`` entry.

    Returns the pair ``(index, trajectory)``.
    """
    index, record = irow
    return index, md.load(record['traj_fn'], top=tops[record['top_fn']])


# Load one trajectory per row of `meta`, keyed by the row index.
traj_dict = dict(map(traj_load, meta.iterrows()))
trajs = [traj for traj in traj_dict.values()]

# Make Pipeline
# Five shuffled splits, each holding out half of the input for testing.
cv_iter = ShuffleSplit(n_splits=5, test_size=0.5)
# NOTE(review): n_clusters=2 is set here, but the only grid value below is
# cluster__n_clusters=200, so the 2 is presumably overridden during the
# search -- confirm.
estimators = [('cluster',
               LandmarkAgglomerative(n_clusters=2,
                                     n_landmarks=int(totframes / 200),
                                     linkage='ward',
                                     metric='rmsd')),
              ('msm', MarkovStateModel())]

# Parameter grid for the search: a single value for the 'cluster' step.
params = {'cluster__n_clusters': [200]}

pipe = Pipeline(estimators)
# Fixed settings for the 'msm' step.
pipe.set_params(msm__lag_time=999)
pipe.set_params(msm__n_timescales=20)

if __name__ == "__main__":

    # Single-job grid search over `params` using the shuffle-split CV above.
    cvSearch = GridSearchCV(pipe, params, n_jobs=1, verbose=1, cv=cv_iter)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipe.steps])
Example #15
0
matplotlib.use('Agg')
from matplotlib.pylab import plt
import numpy as np
import seaborn as sns
from utilities import to_dataframe


# load trajectories
feature = 'dihedrals'
meta, traj_dict = load_trajs('pruned_trajectories/{}-ftraj'.format(feature))
trajs = [traj for traj in traj_dict.values()]
# Length that every label array is padded to below.  It must be the maximum of
# the 'nframes' column: the previous expression took only the first unique
# value, so any longer trajectory produced a negative `diff` and np.zeros
# raised.  When all trajectories have the same length the value is unchanged.
nframes = int(np.max(meta['nframes']))

# cluster
num_clusters = 10
cluster = LandmarkAgglomerative(n_clusters=num_clusters, n_landmarks=200,
                                linkage='ward', metric='euclidean')
cluster.fit(trajs)

# Label each trajectory, then pad the label array with -1 up to `nframes`
# entries so that all arrays have the same length.
ctraj = {}
for k, v in traj_dict.items():
    v = v.copy(order='C')
    v = cluster.partial_predict(v)
    diff = nframes - v.shape[0]
    v = np.append(v, np.zeros(diff) - 1)
    ctraj[k] = v

# Convert to DF for plotting and sampling.
df = to_dataframe(ctraj, nframes, dt=1)

# Plot trajectories
# Work with a 10% sample of the rows.
sample = df.sample(frac=0.1, axis=0)
Example #16
0

if __name__ == '__main__':

    # Load the trajectories with a worker pool.  traj_load returns
    # (index, trajectory) pairs, which dict() keys by index.
    with Pool() as pool:
        trajs_dct = dict(pool.imap_unordered(traj_load, meta.iterrows()))
    trajs = [traj for traj in trajs_dct.values()]

    to_ns, t_max, frames_tot = get_timings(meta)
    # Cluster count is the integer part of sqrt(frames_tot).
    n_clusters = int(np.sqrt(frames_tot))
    print(n_clusters)
    # n_clusters = int(frames_tot/1000)
    # NOTE(review): n_landmarks is a tenth of n_clusters, i.e. fewer landmarks
    # than requested clusters -- confirm this is intended.
    clusterer = LandmarkAgglomerative(n_clusters=n_clusters,
                                      n_landmarks=n_clusters // 10,
                                      linkage='ward',
                                      metric='rmsd',
                                      landmark_strategy='stride',
                                      random_state=None,
                                      max_landmarks=None,
                                      ward_predictor='ward')
    ctrajs = clusterer.fit_transform(trajs)

    # Lag times: the values 1..49 divided by `to_ns`, truncated to int.
    # presumably a conversion from ns to frames -- verify against get_timings
    lags = (np.arange(1, 50, 1) / to_ns).astype(int)
    n_timescales = 50
    # One row per lag time, one column per timescale / eigenvalue.
    timescales = np.zeros((lags.shape[0], n_timescales))
    eigenvalues = np.zeros((lags.shape[0], n_timescales))

    # Fit one MSM per lag time and record its spectrum.
    for idx, lag in enumerate(lags):
        msm = MarkovStateModel(lag_time=lag, n_timescales=n_timescales)
        msm.fit_transform(ctrajs)
        timescales[idx] = msm.timescales_
        # The first eigenvalue is skipped.
        # assumes eigenvalues_ then has n_timescales entries left -- TODO confirm
        eigenvalues[idx] = msm.eigenvalues_[1:]

def traj_load(irow):
    """Load the trajectory for one ``(index, row)`` pair.

    The row's ``'traj_fn'`` entry is passed to ``md.load`` together with the
    topology stored in ``tops`` under the row's ``'top_fn'`` entry.

    Returns the pair ``(index, trajectory)``.
    """
    index, record = irow
    return index, md.load(record['traj_fn'], top=tops[record['top_fn']])


# Load one trajectory per row of `meta`, keyed by the row index.
traj_dict = dict(map(traj_load, meta.iterrows()))
trajs = [traj for traj in traj_dict.values()]

# cluster
print('Attempting to cluster')
num_clusters = 20
# Ward-linkage clustering under the 'rmsd' metric.  The landmark count is
# totframes / 100 truncated to an int.
cluster = LandmarkAgglomerative(n_clusters=num_clusters,
                                n_landmarks=int(totframes / 100),
                                linkage='ward',
                                metric='rmsd')
cluster.fit(trajs)

#
# print('Fitting cluster labels')
# ctraj = {}
# for k, v in traj_dict.items():
#     v = cluster.partial_predict(v)
#     diff = nframes-v.shape[0]
#     v = np.append(v, np.zeros(diff)-1)
#     ctraj[k] = v

# Convert to DF for plotting and sampling.
# df = to_dataframe(ctraj, nframes, dt=1)
    # Reuse previously saved clustered trajectories when the directory exists;
    # otherwise cluster from scratch and save the results.
    ctraj_path = 'ctraj-200'
    if isdir(ctraj_path):
        meta, all_ctrajs_dict = load_trajs(ctraj_path)
    else:

        # Returns (index, trajectory) for one (index, row) pair from `meta`.
        def traj_load(irow):
            i, row = irow
            traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
            return i, traj


        traj_dict = dict(map(traj_load, meta.iterrows()))
        all_trajs = [traj for traj in traj_dict.values()]

        # 200 clusters, Ward linkage, 'rmsd' metric; the landmark count is
        # totframes / 200 truncated to an int.
        cluster = LandmarkAgglomerative(n_clusters=200, n_landmarks=int(totframes /200), linkage='ward', metric='rmsd')
        cluster.fit(all_trajs)
        # One (key, trajectory, fitted model) tuple per trajectory for the pool.
        # TODO will this work?
        args = [(k,v,cluster) for k, v in traj_dict.items()]

        # `clust` is defined outside this fragment; its results are collected
        # into a dict, so it presumably returns (key, labels) pairs -- verify.
        with Pool() as pool:
            all_ctrajs_dict = dict(pool.imap_unordered(clust, args))

        save_generic(cluster, 'cluster-200')
        # NOTE(review): the literal 'ctraj-200' duplicates `ctraj_path` above.
        save_trajs(all_ctrajs_dict, 'ctraj-200', meta)

    # Clustered trajectories with more than 1000 rows, and all of them.
    long_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values() if traj.shape[0] > 1000]
    all_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values()]

    # Lag times: 200..800 in steps of 200, then 1000..4500 in steps of 500.
    lags = np.concatenate((np.arange(200, 1000, 200),np.arange(1000, 5000, 500)))
    all_msms = []