def test_3():
    # A user-supplied callable metric must reproduce the labels produced by
    # the built-in 'euclidean' metric on identical data.
    def euclidean(target, ref, i):
        return np.sqrt(np.sum((target - ref[i]) ** 2, axis=1))

    reference = LandmarkAgglomerative(n_clusters=10, n_landmarks=20,
                                      metric='euclidean')
    custom = LandmarkAgglomerative(n_clusters=10, n_landmarks=20,
                                   metric=euclidean)
    data = np.random.RandomState(0).randn(100, 2)
    eq(reference.fit_predict([data])[0], custom.fit_predict([data])[0])
def test_agglom_with_metric_msm():
    # Cluster 100 flattened random 4x4 probability matrices using the
    # symmetric KL-divergence metric with complete linkage; one label
    # per input row is expected.
    dists = [_get_random_prob_dist(4) for _ in range(100)]
    flat = np.array([d.flatten() for d in dists])
    model = LandmarkAgglomerative(n_clusters=2,
                                  metric=sym_kl_divergence_msm,
                                  linkage='complete')
    labels = model.fit_predict([flat])[0]
    assert labels.shape == (100, )
def test_cluster_centers():
    # Two well-separated gaussian blobs; after fitting, cluster_centers_
    # must have one row per cluster and one column per feature.
    data = [random.randn(20, 2) + 10, random.randn(20, 2)]
    k = np.random.randint(2, 7)
    model = LandmarkAgglomerative(n_clusters=k, linkage='ward')
    model.fit_predict(data)
    print(model.cluster_centers_)
    assert model.cluster_centers_.shape == (k, 2)
def test_alanine_dipeptide():
    # RMSD metric must be compatible with Ward linkage.
    # keep n_landmarks small or this will get really slow
    first_frames = AlanineDipeptide().get_cached().trajectories[0][0:100]
    k = 4
    model = LandmarkAgglomerative(n_clusters=k, n_landmarks=20,
                                  linkage='ward', metric='rmsd')
    assignments = np.concatenate(model.fit_predict(first_frames))
    assert len(np.unique(assignments)) <= k
def test_2():
    # Two well-separated blobs: random landmark subsampling must recover
    # exactly the same partition as clustering every frame.
    blob_a = random.randn(20, 2) + 10
    blob_b = random.randn(20, 2)
    full = LandmarkAgglomerative(n_clusters=2)
    subsampled = LandmarkAgglomerative(n_clusters=2,
                                       landmark_strategy='random',
                                       random_state=random, n_landmarks=20)
    labels_full = full.fit_predict([blob_a, blob_b])
    labels_sub = subsampled.fit_predict([blob_a, blob_b])
    assert adjusted_rand_score(np.concatenate(labels_full),
                               np.concatenate(labels_sub)) == 1.0
def _do_lumping(self):
    """Do the MVCA lumping."""
    clusterer = LandmarkAgglomerative(
        linkage='ward',
        n_clusters=self.n_macrostates,
        metric=self.metric,
        n_landmarks=self.n_landmarks,
        landmark_strategy=self.landmark_strategy,
        random_state=self.random_state)
    # fit_transform returns one mapping per input sequence; the transition
    # matrix is passed as a single-item list, so take element 0.
    self.microstate_mapping_ = clusterer.fit_transform([self.transmat_])[0]
def _do_lumping(self):
    """Do the MVCA lumping."""
    clusterer = LandmarkAgglomerative(
        linkage='ward',
        n_clusters=self.n_macrostates,
        metric=self.metric,
        n_landmarks=self.n_landmarks,
        landmark_strategy=self.landmark_strategy,
        random_state=self.random_state)
    clusterer.fit([self.transmat_])
    if self.fit_only:
        # Only the landmarks' own labels are wanted.
        mapping = clusterer.landmark_labels_
    else:
        # Propagate landmark labels to every state via transform.
        mapping = clusterer.transform([self.transmat_])[0]
    self.microstate_mapping_ = mapping
def test_1():
    data = [random.randn(10, 2), random.randn(10, 2)]
    n_clusters = 2
    default_model = LandmarkAgglomerative(n_clusters=n_clusters)
    every_frame = LandmarkAgglomerative(
        n_clusters=n_clusters, n_landmarks=sum(len(s) for s in data))
    labels0 = clone(default_model).fit(data).predict(data)
    labels1 = default_model.fit_predict(data)
    labels2 = every_frame.fit_predict(data)
    # One label array per input trajectory.
    for result in (labels0, labels1, labels2):
        assert len(result) == 2
    # fit().predict(), fit_predict(), and the all-landmarks model must agree.
    for other in (labels1, labels2):
        eq(labels0[0], other[0])
        eq(labels0[1], other[1])
    assert len(np.unique(np.concatenate(labels0))) == n_clusters
def test_callable_metric():
    # A callable distance function must give identical labels to the
    # equivalent built-in metric name.
    def _euclidean(target, ref, i):
        return np.sqrt(np.sum((target - ref[i]) ** 2, axis=1))

    by_name = LandmarkAgglomerative(n_clusters=10, n_landmarks=20,
                                    metric='euclidean')
    by_callable = LandmarkAgglomerative(n_clusters=10, n_landmarks=20,
                                        metric=_euclidean)
    data = np.random.RandomState(0).randn(100, 2)
    eq(by_name.fit_predict([data])[0], by_callable.fit_predict([data])[0])
def test_3():
    # test using a callable metric. should get same results
    shared = dict(n_clusters=10, n_landmarks=20)
    builtin = LandmarkAgglomerative(metric='euclidean', **shared)
    custom = LandmarkAgglomerative(
        metric=lambda target, ref, i: np.sqrt(
            np.sum((target - ref[i]) ** 2, axis=1)),
        **shared)
    data = np.random.RandomState(0).randn(100, 2)
    eq(builtin.fit_predict([data])[0], custom.fit_predict([data])[0])
def test_2():
    # this should be a really easy clustering problem
    datasets = [random.randn(20, 2) + 10, random.randn(20, 2)]
    strided = LandmarkAgglomerative(n_clusters=2)
    randomized = LandmarkAgglomerative(n_clusters=2,
                                       landmark_strategy='random',
                                       random_state=random,
                                       n_landmarks=20)
    result1 = strided.fit_predict(datasets)
    result2 = randomized.fit_predict(datasets)
    score = adjusted_rand_score(np.concatenate(result1),
                                np.concatenate(result2))
    assert score == 1.0
def test_1():
    trajs = [random.randn(10, 2), random.randn(10, 2)]
    n_clusters = 2
    total_frames = sum(len(s) for s in trajs)
    model1 = LandmarkAgglomerative(n_clusters=n_clusters)
    model2 = LandmarkAgglomerative(n_clusters=n_clusters,
                                   n_landmarks=total_frames)
    labels0 = clone(model1).fit(trajs).predict(trajs)
    labels1 = model1.fit_predict(trajs)
    labels2 = model2.fit_predict(trajs)
    # One label array comes back per input trajectory.
    assert len(labels0) == 2
    assert len(labels1) == 2
    assert len(labels2) == 2
    # All three fitting paths must agree exactly.
    eq(labels0[0], labels1[0])
    eq(labels0[1], labels1[1])
    eq(labels0[0], labels2[0])
    eq(labels0[1], labels2[1])
    assert len(np.unique(np.concatenate(labels0))) == n_clusters
# Preload topology files referenced by the metadata table; totframes is the
# total frame count across all trajectories.
tops = preload_tops(meta)
totframes = meta['nframes'].sum()


def traj_load(irow):
    # Load one trajectory for an (index, row) pair from meta.iterrows();
    # returns (index, trajectory) so results can be rebuilt into a dict.
    i, row = irow
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    return i, traj


traj_dict = dict(map(traj_load, meta.iterrows()))
# Keep only trajectories longer than 1000 frames — TODO confirm the cutoff.
trajs = [traj for traj in traj_dict.values() if traj.n_frames > 1000]
print(len(trajs))
num_clust = 20
# Landmarks are ~1% of total frames — presumably a speed/accuracy tradeoff.
cluster = LandmarkAgglomerative(n_clusters=num_clust,
                                n_landmarks=int(totframes / 100),
                                linkage='ward', metric='rmsd')
ctrajs = cluster.fit_transform(trajs)
# print('Fitting cluster labels for MSM')
# ctraj = {}
# count = 0
# for k, v in traj_dict.items():
#     print(k, count)
#     count += 1
#     ctraj[k] = cluster.partial_predict(v)
#
# ctrajs = [traj for traj in ctraj.values() if traj.shape[0] > 1000]
print('Fitting MSM')
lag = 4000
def traj_load(irow):
    # Load one trajectory for an (index, row) pair from meta.iterrows().
    i, row = irow
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    return i, traj


traj_dict = dict(map(traj_load, meta.iterrows()))
trajs = [traj for traj in traj_dict.values()]
# Make Pipeline: landmark clustering followed by an MSM. The n_clusters=2
# here is a placeholder that the grid search overrides via
# 'cluster__n_clusters' below.
cv_iter = ShuffleSplit(n_splits=5, test_size=0.5)
estimators = [('cluster',
               LandmarkAgglomerative(n_clusters=2,
                                     n_landmarks=int(totframes / 200),
                                     linkage='ward', metric='rmsd')),
              ('msm', MarkovStateModel())]
params = {'cluster__n_clusters': [200]}
pipe = Pipeline(estimators)
pipe.set_params(msm__lag_time=999)
pipe.set_params(msm__n_timescales=20)
if __name__ == "__main__":
    # Guarded so multiprocessing workers spawned by GridSearchCV don't
    # re-run the module body.
    cvSearch = GridSearchCV(pipe, params, n_jobs=1, verbose=1, cv=cv_iter)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipe.steps])
# Non-interactive backend so figures can be saved without a display.
matplotlib.use('Agg')
from matplotlib.pylab import plt
import numpy as np
import seaborn as sns

from utilities import to_dataframe

# load trajectories
feature = 'dihedrals'
meta, traj_dict = load_trajs('pruned_trajectories/{}-ftraj'.format(feature))
trajs = [traj for traj in traj_dict.values()]
# NOTE(review): np.max of a single element (unique()[0]) — presumably meant
# the maximum over all trajectories; confirm intent.
nframes = int(np.max(meta['nframes'].unique()[0]))

# cluster
num_clusters = 10
cluster = LandmarkAgglomerative(n_clusters=num_clusters, n_landmarks=200,
                                linkage='ward', metric='euclidean')
cluster.fit(trajs)
ctraj = {}
for k, v in traj_dict.items():
    # C-contiguous copy — presumably required by partial_predict; confirm.
    v = v.copy(order='C')
    v = cluster.partial_predict(v)
    # Pad shorter label sequences with -1 so every entry has nframes values.
    diff = nframes - v.shape[0]
    v = np.append(v, np.zeros(diff) - 1)
    ctraj[k] = v
# Convert to DF for plotting and sampling.
df = to_dataframe(ctraj, nframes, dt=1)
# Plot trajectories
sample = df.sample(frac=0.1, axis=0)
if __name__ == '__main__':
    # Load all trajectories in parallel worker processes.
    with Pool() as pool:
        trajs_dct = dict(pool.imap_unordered(traj_load, meta.iterrows()))
    trajs = [traj for traj in trajs_dct.values()]
    to_ns, t_max, frames_tot = get_timings(meta)
    # Heuristic: number of microstates ~ sqrt(total frames).
    n_clusters = int(np.sqrt(frames_tot))
    print(n_clusters)
    # n_clusters = int(frames_tot/1000)
    clusterer = LandmarkAgglomerative(n_clusters=n_clusters,
                                      n_landmarks=n_clusters // 10,
                                      linkage='ward', metric='rmsd',
                                      landmark_strategy='stride',
                                      random_state=None,
                                      max_landmarks=None,
                                      ward_predictor='ward')
    ctrajs = clusterer.fit_transform(trajs)
    # Lag-time scan: values 1..49 scaled by to_ns — presumably a ns-to-frame
    # conversion factor from get_timings; confirm.
    lags = (np.arange(1, 50, 1) / to_ns).astype(int)
    n_timescales = 50
    timescales = np.zeros((lags.shape[0], n_timescales))
    eigenvalues = np.zeros((lags.shape[0], n_timescales))
    for idx, lag in enumerate(lags):
        msm = MarkovStateModel(lag_time=lag, n_timescales=n_timescales)
        msm.fit_transform(ctrajs)
        timescales[idx] = msm.timescales_
        # Skip the stationary eigenvalue (always 1) at index 0.
        eigenvalues[idx] = msm.eigenvalues_[1:]
def traj_load(irow):
    # Load one trajectory for an (index, row) pair from meta.iterrows().
    i, row = irow
    traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
    return i, traj


traj_dict = dict(map(traj_load, meta.iterrows()))
trajs = [traj for traj in traj_dict.values()]

# cluster
print('Attempting to cluster')
num_clusters = 20
# Landmarks are ~1% of total frames — presumably a speed/accuracy tradeoff.
cluster = LandmarkAgglomerative(n_clusters=num_clusters,
                                n_landmarks=int(totframes / 100),
                                linkage='ward', metric='rmsd')
cluster.fit(trajs)
#
# print('Fitting cluster labels')
# ctraj = {}
# for k, v in traj_dict.items():
#     v = cluster.partial_predict(v)
#     diff = nframes - v.shape[0]
#     v = np.append(v, np.zeros(diff) - 1)
#     ctraj[k] = v
# Convert to DF for plotting and sampling.
# df = to_dataframe(ctraj, nframes, dt=1)
ctraj_path = 'ctraj-200'
if isdir(ctraj_path):
    # Reuse previously computed cluster trajectories if they were saved.
    meta, all_ctrajs_dict = load_trajs(ctraj_path)
else:
    def traj_load(irow):
        # Load one trajectory for an (index, row) pair from meta.iterrows().
        i, row = irow
        traj = md.load(row['traj_fn'], top=tops[row['top_fn']])
        return i, traj

    traj_dict = dict(map(traj_load, meta.iterrows()))
    all_trajs = [traj for traj in traj_dict.values()]
    cluster = LandmarkAgglomerative(n_clusters=200,
                                    n_landmarks=int(totframes / 200),
                                    linkage='ward', metric='rmsd')
    cluster.fit(all_trajs)
    # TODO will this work?
    # Predict labels per trajectory in parallel; each worker gets the
    # fitted cluster object alongside its (key, trajectory) pair.
    args = [(k, v, cluster) for k, v in traj_dict.items()]
    with Pool() as pool:
        all_ctrajs_dict = dict(pool.imap_unordered(clust, args))
    # Cache both the model and the cluster trajectories for future runs.
    save_generic(cluster, 'cluster-200')
    save_trajs(all_ctrajs_dict, 'ctraj-200', meta)
# long_ctrajs keeps only trajectories longer than 1000 frames.
long_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values()
               if traj.shape[0] > 1000]
all_ctrajs = [np.squeeze(traj) for traj in all_ctrajs_dict.values()]
lags = np.concatenate((np.arange(200, 1000, 200),
                       np.arange(1000, 5000, 500)))
all_msms = []