def cluster_minikmeans(tica_dir, data_dir, traj_dir, n_clusters,
                       clusterer_dir=None, tICs=None):
    if clusterer_dir is not None and os.path.exists(clusterer_dir):
        # Reuse the saved clusterer: reload it, re-assign labels for the
        # current data, and write it back out.
        reduced_data = load_file(data_dir)
        clusterer = verboseload(clusterer_dir)
        clusterer.labels_ = clusterer.transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except Exception:
            reduced_data = load_dataset(data_dir)
        # Optionally restrict clustering to a subset of tICA components.
        if tICs is not None:
            X = [traj[:, tICs] for traj in reduced_data]
        else:
            X = reduced_data
        clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
        clusterer.fit_transform(X)
        verbosedump(clusterer, clusterer_dir)
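
# A hypothetical call of the function above, assuming tICA-projected data is
# already on disk and clustering should use only the first two tICA
# components; every path below is a placeholder, not part of the original.
cluster_minikmeans(tica_dir="analysis/tica",
                   data_dir="analysis/tica/projected_features.h5",
                   traj_dir="trajectories",
                   n_clusters=300,
                   clusterer_dir="analysis/tica/clusterer_300clusters.h5",
                   tICs=[0, 1])
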
def cluster_minikmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)
    if os.path.exists(clusterer_dir):
        print("Already clustered")
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except Exception:
            reduced_data = load_dataset(data_dir)
        # MSMBuilder's MiniBatchKMeans accepts a list of per-trajectory
        # arrays directly, so no concatenation is needed before fitting.
        clusterer = MiniBatchKMeans(n_clusters=n_clusters)
        clusterer.fit_transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
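
# A minimal follow-up sketch, assuming the function above has already run:
# reload the saved clusterer and inspect the per-trajectory label arrays.
# The path is a hypothetical placeholder.
clusterer = verboseload("analysis/tica/clusterer_300clusters.h5")
print(len(clusterer.labels_))            # one label array per trajectory
print(clusterer.cluster_centers_.shape)  # (n_clusters, n_tICA_components)
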
def generate_clusters(self, ticad):
    """Update the cluster data.

    Needs to be re-done each iteration, since clusters fit to previous
    trajectories may change as more data comes in.

    Returns:
        clustered dataset
    """
    clustr = MiniBatchKMeans(
        n_clusters=self.config.getint("model", "num_clusters"))
    clustered = clustr.fit_transform(ticad)
    # Optionally persist the fitted clusterer alongside the model outputs.
    if self.save_extras:
        utils.dump(clustr, "microstater.pkl")
    return clustered
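
# A minimal sketch exercising generate_clusters outside its class, assuming a
# configparser-style self.config; the SimpleNamespace and the synthetic ticad
# arrays are stand-ins for the project's real wiring.
import configparser
import types
import numpy as np

_config = configparser.ConfigParser()
_config.read_dict({"model": {"num_clusters": "50"}})
_fake_self = types.SimpleNamespace(config=_config, save_extras=False)

_ticad = [np.random.rand(1000, 5) for _ in range(4)]  # stand-in tICA output
_clustered = generate_clusters(_fake_self, _ticad)
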
import numpy as np
import msmexplorer as msme
from msmbuilder.example_datasets import FsPeptide
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel

rs = np.random.RandomState(42)

# Load Fs Peptide Data
trajs = FsPeptide().get().trajectories

# Extract Side-Chain chi1 Dihedrals
featurizer = DihedralFeaturizer(types=['chi1'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=2)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2)
assignments = msm.fit_transform(clustered_trajs)

# Plot Stacked Distributions of the sine of each chi1 angle
# within an arbitrary set of states {2, 5, 0}
a = np.concatenate(assignments, axis=0)
d = np.concatenate(diheds, axis=0)
path_data = [d[a == i][:, ::2] for i in [2, 5, 0]]
msme.plot_stackdist(path_data)
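
# plot_stackdist draws on the active matplotlib figure, so the result can be
# saved directly; the filename and DPI below are hypothetical choices.
import matplotlib.pyplot as plt
plt.savefig('fs_peptide_chi1_stackdist.png', dpi=300, bbox_inches='tight')
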
def calculate_fitness(population_dihedral, diheds, score_global, i, lock):
    """Score one population member by building a tICA -> clustering -> MSM
    pipeline on the selected dihedral features and recording the
    cross-validated score in the shared dict."""
    import pandas as pd
    import numpy as np
    from msmbuilder.preprocessing import RobustScaler
    from msmbuilder.decomposition import tICA
    from msmbuilder.cluster import MiniBatchKMeans
    from msmbuilder.msm import MarkovStateModel
    from sklearn.model_selection import KFold

    pop_index = i

    # Keep only the dihedral features selected for this population member.
    new_diheds = [X[:, population_dihedral] for X in diheds]

    # Scale the selected features before tICA.
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(new_diheds)

    # Dimensionality reduction with tICA.
    tica_model = tICA(lag_time=2, n_components=5)
    tica_model.fit(scaled_diheds)
    tica_trajs = tica_model.transform(scaled_diheds)

    # Cluster the tICA projections into microstates.
    clusterer = MiniBatchKMeans(n_clusters=200, random_state=42)
    clustered_trajs = clusterer.fit_transform(tica_trajs)

    msm = MarkovStateModel(lag_time=50, n_timescales=5)

    # 5-fold cross-validation over trajectories. Note that the MSM's state
    # count is determined by the clustering, not set on the model.
    n_states = [4]
    cv = KFold(n_splits=5)
    results = []
    for n in n_states:
        for fold, (train_index, test_index) in enumerate(
                cv.split(clustered_trajs)):
            train_data = [clustered_trajs[k] for k in train_index]
            test_data = [clustered_trajs[k] for k in test_index]
            msm.fit(train_data)
            train_score = msm.score(train_data)
            test_score = msm.score(test_data)
            # Blend the slowest implied timescale with the GMRQ test score.
            time_score = msm.timescales_[0]
            av_score = (time_score + test_score) / 2
            print(time_score)
            print(test_score)
            results.append({
                'train_score': train_score,
                'test_score': test_score,
                'time_score': time_score,
                'av_score': av_score,
                'n_states': n,
                'fold': fold,
            })
    print(msm.timescales_)

    # Take the median over folds and pick the best-scoring n_states.
    results = pd.DataFrame(results)
    avgs = (results.groupby('n_states')
                   .aggregate(np.median)
                   .drop('fold', axis=1))
    best_nt = avgs['test_score'].idxmax()
    best_n = avgs['av_score'].idxmax()
    best_score = avgs.loc[best_n, 'av_score']
    best_scorent = avgs.loc[best_nt, 'test_score']
    print(best_scorent)

    # Record this member's fitness in the shared score dict.
    lock.acquire()
    score_global.update({pop_index: best_scorent})
    lock.release()
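
# A minimal driver sketch for calculate_fitness, assuming the genetic
# algorithm elsewhere supplies the population of feature-index subsets.
# The dihedral arrays below are synthetic stand-ins; in the real pipeline
# they would come from a featurizer.
if __name__ == "__main__":
    from multiprocessing import Manager, Process
    import numpy as np

    manager = Manager()
    score_global = manager.dict()  # shared fitness table
    lock = manager.Lock()

    diheds = [np.random.rand(2000, 20) for _ in range(5)]  # stand-in features
    population = [[0, 2, 5, 7], [1, 3, 4, 6]]              # hypothetical subsets

    procs = []
    for idx, member in enumerate(population):
        p = Process(target=calculate_fitness,
                    args=(member, diheds, score_global, idx, lock))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
    print(dict(score_global))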