def fit_protein_kmeans(yaml_file, mini=True, pca=False):
    """Fit a single k-means model on the pooled projections of all proteins.

    Every entry of ``yaml_file["mdl_params"]`` whose key starts with
    ``cluster__`` is passed, with that prefix removed, to the clusterer
    constructor.  For each protein the projected trajectories are loaded from
    its model directory, appended to one list in ``keynat`` key order, and
    the clusterer is fitted once on that list.  The fitted model is dumped to
    ``<mdl_dir>/kmeans_mdl.pkl``.

    :param yaml_file: mapping with the keys ``mdl_dir``, ``mdl_params`` and
        ``protein_list``.
    :param mini: when true use ``MiniBatchKMeans`` with a batch size of
        ``100 * n_clusters``; otherwise use ``KMeans``.
    :param pca: when true read ``pca_data.pkl`` instead of ``tica_data.pkl``.
    :return: None
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    prefix = "cluster__"
    cluster_kwargs = {
        name.split(prefix)[1]: value
        for name, value in mdl_params.items()
        if name.startswith(prefix)
    }

    if mini:
        # The batch size is always derived from the requested cluster count.
        cluster_kwargs["batch_size"] = 100 * cluster_kwargs["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**cluster_kwargs)
    else:
        kmeans_mdl = KMeans(**cluster_kwargs)

    data_file = "pca_data.pkl" if pca else "tica_data.pkl"
    pooled = []
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            projected = verboseload(data_file)
            # Collect every trajectory of this protein in a stable key order.
            ordered_keys = sorted(projected.keys(), key=keynat)
            pooled.extend([projected[traj_key] for traj_key in ordered_keys])

    kmeans_mdl.fit(pooled)
    verbosedump(kmeans_mdl, os.path.join(mdl_dir, "kmeans_mdl.pkl"))
    return
class TestPlotUtils:
    """Smoke tests checking that each plotting helper hands back an Axes."""

    def setUp(self):
        """Create two seeded random (20, 3) trajectories and fit a clusterer."""
        numpy.random.seed(12)
        # Keys 0 and 1 are generated in order so the seeded draws are stable.
        self.ttrajs = {key: numpy.random.rand(20, 3) for key in range(2)}
        self.clusterer = MiniBatchKMeans(n_clusters=2)
        self.clusterer.fit(list(self.ttrajs.values()))

    def test_plot_spawns(self):
        """plot_spawns returns an Axes when no axis is supplied."""
        result = plot_spawns(inds=spawns, tica_trajs=self.ttrajs, ax=None)
        assert isinstance(result, Axes)

    def test_plot_tica_landscape(self):
        """plot_tica_landscape returns a pair whose second item is an Axes."""
        _, axes = plot_tica_landscape(self.ttrajs)
        assert isinstance(axes, Axes)

    def test_plot_clusters(self):
        """plot_clusters returns an Axes for the fitted clusterer."""
        result = plot_clusters(self.clusterer)
        assert isinstance(result, Axes)
def cluster_minikmeans(tica_dir, data_dir, traj_dir, n_clusters,
                       clusterer_dir=None, tICs=None):
    """Fit (or refresh) a MiniBatchKMeans model and dump it to disk.

    If a model already exists at ``clusterer_dir`` it is loaded, its
    ``labels_`` are recomputed from the data at ``data_dir`` and it is dumped
    back.  Otherwise a new ``MiniBatchKMeans`` model is fitted and dumped.

    :param tica_dir: directory used to build the default model path.
    :param data_dir: path of the reduced data to cluster.
    :param traj_dir: not used by this function.
    :param n_clusters: number of clusters for a newly fitted model.
    :param clusterer_dir: path of the pickled clusterer.  When ``None`` it
        defaults to ``<tica_dir>/clusterer_<n_clusters>clusters.h5``.
    :param tICs: optional column selector; when given, only these columns of
        each trajectory are used to fit a new model.
    :return: None
    """
    if clusterer_dir is None:
        # os.path.exists(None) raises TypeError and there would be no target
        # to dump to, so fall back to a path derived from the arguments.
        clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)

    if os.path.exists(clusterer_dir):
        reduced_data = load_file(data_dir)
        clusterer = verboseload(clusterer_dir)
        # NOTE(review): `tICs` is not applied on this path; if the stored
        # model was fitted on a column subset this transform presumably sees
        # a different number of features -- confirm against callers.
        clusterer.labels_ = clusterer.transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except Exception:
            # Best-effort fallback loader; Exception (rather than a bare
            # except) lets KeyboardInterrupt/SystemExit propagate.
            reduced_data = load_dataset(data_dir)

        if tICs is not None:
            X = [traj[:, tICs] for traj in reduced_data]
        else:
            X = reduced_data

        clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
        clusterer.fit_transform(X)
        verbosedump(clusterer, clusterer_dir)
def fit_protein_kmeans(yaml_file, mini=True):
    """Fit one k-means model on the tICA data of every listed protein.

    Keys of ``yaml_file["mdl_params"]`` starting with ``cluster__`` are
    forwarded (prefix stripped) to the clusterer constructor.  Each protein's
    ``tica_data.pkl`` is loaded from its model directory, its trajectories
    are appended in ``keynat`` key order, and the clusterer is fitted once on
    the combined list.  The result is dumped to ``<mdl_dir>/kmeans_mdl.pkl``.

    :param yaml_file: mapping with the keys ``mdl_dir``, ``mdl_params`` and
        ``protein_list``.
    :param mini: when true use ``MiniBatchKMeans`` with a batch size of
        ``100 * n_clusters``; otherwise use ``KMeans``.
    :return: None
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    prefix = "cluster__"
    cluster_kwargs = {}
    for name, value in mdl_params.items():
        if not name.startswith(prefix):
            continue
        cluster_kwargs[name.split(prefix)[1]] = value

    if mini:
        # The batch size is always derived from the requested cluster count.
        cluster_kwargs["batch_size"] = 100 * cluster_kwargs["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**cluster_kwargs)
    else:
        kmeans_mdl = KMeans(**cluster_kwargs)

    all_trajs = []
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            tica_by_traj = verboseload("tica_data.pkl")
            # Gather every trajectory of this protein in a stable key order.
            ordered_keys = sorted(tica_by_traj.keys(), key=keynat)
            all_trajs.extend([tica_by_traj[key] for key in ordered_keys])

    kmeans_mdl.fit(all_trajs)
    target_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    verbosedump(kmeans_mdl, target_path)
    return
def setUp(self):
    """Create two seeded random (20, 3) trajectories and fit a clusterer."""
    numpy.random.seed(12)
    # Keys 0 and 1 are generated in order so the seeded draws are stable.
    self.ttrajs = {key: numpy.random.rand(20, 3) for key in range(2)}
    trajectories = list(self.ttrajs.values())
    self.clusterer = MiniBatchKMeans(n_clusters=2)
    self.clusterer.fit(trajectories)
def generate_clusters(self, ticad):
    """
    Cluster the given data with a freshly built MiniBatchKMeans model.

    A new model is fitted on every call, since clusters found for earlier
    trajectories may change as more data arrives. The cluster count is read
    from the "num_clusters" option in the "model" section of self.config.
    When self.save_extras is set, the fitted model is also dumped to
    "microstater.pkl".

    Returns: clustered dataset
    """
    n_states = self.config.getint("model", "num_clusters")
    clusterer = MiniBatchKMeans(n_clusters=n_states)
    assignments = clusterer.fit_transform(ticad)

    if self.save_extras:
        utils.dump(clusterer, "microstater.pkl")

    return assignments
def cluster_minikmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    """Fit a MiniBatchKMeans model on the reduced data unless one exists.

    The model is stored at ``<tica_dir>/clusterer_<n_clusters>clusters.h5``.
    If that file already exists nothing is recomputed.

    :param tica_dir: directory in which the clusterer file is stored.
    :param data_dir: path of the reduced data to cluster.
    :param traj_dir: not used by this function.
    :param n_clusters: number of clusters to fit.
    :param lag_time: not used by this function.
    :return: None
    """
    clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)

    # Single-argument print(...) behaves the same under Python 2 and 3.
    if os.path.exists(clusterer_dir):
        print("Already clustered")
        return

    print("Clustering by KMeans")
    try:
        reduced_data = verboseload(data_dir)
    except Exception:
        # Best-effort fallback loader; Exception (rather than a bare except)
        # lets KeyboardInterrupt/SystemExit propagate.
        reduced_data = load_dataset(data_dir)

    # The clusterer is fitted on the per-trajectory data directly; the
    # previously built concatenated copy was never used and has been removed.
    clusterer = MiniBatchKMeans(n_clusters=n_clusters)
    clusterer.fit_transform(reduced_data)
    verbosedump(clusterer, clusterer_dir)
def cluster_features(features, clusterer, n_clusters=8):
    '''
    Fit the requested msmbuilder clusterer on a set of feature trajectories.

    Input
    features : list of arrays, length n_trajs, each of shape (n_samples, n_features)
    clusterer : str, one of 'KMeans', 'KCenters', 'KMedoids',
        'MiniBatchKMeans', 'MiniBatchKMedoids'
    n_clusters : int, number of clusters passed to the clusterer (default 8)

    Output
    clst : msmbuilder.cluster object, with attributes
        cluster_centers_ : (n_clusters, n_features)
        labels_ : list of arrays, each of shape (n_samples, )

    Raises
    ValueError : if `clusterer` is not one of the supported names
    '''
    supported = ('KMeans', 'KCenters', 'KMedoids',
                 'MiniBatchKMeans', 'MiniBatchKMedoids')
    # Reject unknown names up front; previously this fell through every
    # branch and failed later on an unbound local variable.
    if clusterer not in supported:
        raise ValueError(
            "Unknown clusterer %r; expected one of: %s"
            % (clusterer, ', '.join(supported)))

    # Imported lazily so msmbuilder is only required when actually clustering.
    from msmbuilder import cluster as msm_cluster

    clst = getattr(msm_cluster, clusterer)(n_clusters=n_clusters)
    # Fitting populates the attributes on clst; the returned labels are not
    # needed here.
    clst.fit_transform(features)
    return clst
def get_pipeline(parameters):
    """
    Build a fresh pipeline so that every fold works on its own instance.

    :param parameters: keyword mapping forwarded to ``Pipeline.set_params``
    :return: sklearn.pipeline.Pipeline object
    """
    steps = [
        ('variance_cut', VarianceThreshold()),
        ('tica', tICA(kinetic_mapping=True)),
        ('cluster', MiniBatchKMeans()),
        ('msm', MarkovStateModel(use_gap='timescales', lag_time=50,
                                 verbose=True)),
    ]
    pipeline = Pipeline(steps)
    pipeline.set_params(**parameters)
    return pipeline
def build_model(self, user_defined_model):
    """
    Select the model (Pipeline from scikit-learn) used for all the
    transforming and fitting.

    A user supplied Pipeline is returned unchanged. Without one, the pickle
    at ``self.model_pkl_fname`` is loaded if that file exists; otherwise a
    default dihedral-based Pipeline is built.

    :param user_defined_model: None, or a Pipeline object to use as model
    :return model: The selected model
    :raises ValueError: if user_defined_model is neither None nor a Pipeline
    """
    if user_defined_model is not None:
        if isinstance(user_defined_model, Pipeline):
            logger.info('Using user defined model')
            return user_defined_model
        raise ValueError('model is not an sklearn.pipeline.Pipeline object')

    if os.path.exists(self.model_pkl_fname):
        logger.info('Loading model pkl file {}'.format(self.model_pkl_fname))
        return load_generic(self.model_pkl_fname)

    logger.info('Building default model based on dihedrals')
    # Aim for a lag time of 1 ns for both tICA and the MSM. If the stride is
    # too big for that, use a single frame and report how much that is in ns.
    if self.app.meta is None:
        self.timestep = None
        lag_time = 1
        logger.warning('Cannot determine timestep. Defaulting to 1 frame.')
    else:
        lag_time = max(1, int(1 / self.timestep))
        lag_in_ns = lag_time * self.timestep
        logger.info(
            'Using a lag time of {} ns for the tICA and MSM'.format(lag_in_ns))

    steps = [
        ('feat', DihedralFeaturizer()),
        ('scaler', RobustScaler()),
        ('tICA', tICA(lag_time=lag_time, commute_mapping=True,
                      n_components=10)),
        ('clusterer', MiniBatchKMeans(n_clusters=200)),
        ('msm', MarkovStateModel(lag_time=lag_time, ergodic_cutoff='off',
                                 reversible_type=None)),
    ]
    return Pipeline(steps)
# tICA is similar to principal component analysis tica_model = tICA(lag_time=int(args.lag), n_components=int(args.components)) # fit and transform can be done in seperate steps: tica_model = scaled_diheds.fit_with(tica_model) tica_trajs = scaled_diheds.transform_with(tica_model, 'ticas/', fmt='dir-npy') # Conformations need to be clustered into states (sometimes written as microstates). # We cluster based on the tICA projections to group conformations that interconvert # rapidly. Note that we transform our trajectories from the n_components-dimensional # tICA space into a 1-dimensional cluster index txx = np.concatenate(tica_trajs) #_ = msme.plot_histogram(txx) clusterer = MiniBatchKMeans(n_clusters=int(args.clusters), random_state=42) clustered_trajs = tica_trajs.fit_transform_with(clusterer, 'kmeans/', fmt='dir-npy') #plt.figure() #plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap='viridis') #plt.scatter(clusterer.cluster_centers_[:,0], clusterer.cluster_centers_[:,1], s=100, c='w') #plt.savefig('microstate_clusters.png') # We can construct an MSM from the labeled trajectories msm = MarkovStateModel(lag_time=int(args.lag), n_timescales=20) msm.fit(clustered_trajs) assignments = clusterer.partial_transform(txx) assignments = msm.partial_transform(assignments) #msme.plot_free_energy(txx, obs=(0, 1), n_samples=10000, # pi=msm.populations_[assignments],
def calculate_fitness(population_dihedral, diheds, score_global, i, lock):
    """Score one feature selection by cross-validated MSM fitting.

    The selected feature columns are projected with tICA, clustered with
    MiniBatchKMeans and used to fit a MarkovStateModel in a 5-fold
    cross-validation.  The best (over ``n_states``) median test score is
    stored in ``score_global`` under key ``i``.

    :param population_dihedral: column selector applied to every array in
        ``diheds`` (used as ``X[:, population_dihedral]``).
    :param diheds: sequence of 2-D feature arrays, one per trajectory.
    :param score_global: shared mapping updated with ``{i: score}``.
    :param i: index of the individual being scored.
    :param lock: object with ``acquire()``/``release()`` guarding
        ``score_global``.
    :return: None
    """
    import pandas as pd
    import numpy as np
    from msmbuilder.preprocessing import RobustScaler
    from msmbuilder.decomposition import tICA
    from msmbuilder.cluster import MiniBatchKMeans
    from msmbuilder.msm import MarkovStateModel
    # NOTE(review): sklearn.cross_validation only exists in older
    # scikit-learn releases -- confirm the pinned version.
    from sklearn.cross_validation import KFold

    pop_index = i

    # Keep only the feature columns selected for this individual.
    new_diheds = [X[:, population_dihedral] for X in diheds]

    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(new_diheds)
    # NOTE(review): the scaled result is replaced by the unscaled features
    # right away, so the steps below appear to run on unscaled data. Kept
    # as in the original -- confirm whether scaling was disabled on purpose.
    scaled_diheds = new_diheds

    tica_model = tICA(lag_time=2, n_components=5)
    tica_model.fit(scaled_diheds)
    tica_trajs = tica_model.transform(scaled_diheds)

    clusterer = MiniBatchKMeans(n_clusters=200, random_state=42)
    clustered_trajs = clusterer.fit_transform(tica_trajs)

    msm = MarkovStateModel(lag_time=50, n_timescales=5)
    #msm.fit_transform(clustered_trajs)

    n_states = [4]
    cv = KFold(len(clustered_trajs), n_folds=5)
    results = []
    for n in n_states:
        msm.n_states_ = n
        for fold, (train_index, test_index) in enumerate(cv):
            train_data = [clustered_trajs[idx] for idx in train_index]
            test_data = [clustered_trajs[idx] for idx in test_index]
            msm.fit(train_data)
            train_score = msm.score(train_data)
            test_score = msm.score(test_data)
            time_score = msm.timescales_[0]
            time_test_score = time_score + test_score
            print(time_score)
            print(test_score)
            av_score = time_test_score / 2
            results.append({
                'train_score': train_score,
                'test_score': test_score,
                'time_score': time_score,
                'av_score': av_score,
                'n_states': n,
                'fold': fold
            })
    # NOTE(review): the nesting level of this debug print could not be
    # recovered from the flattened source; here it runs once, after all folds.
    print(msm.timescales_)

    results = pd.DataFrame(results)
    # Median over folds for each n_states; the fold column is not a score.
    avgs = (results.groupby('n_states').aggregate(np.median).drop('fold', axis=1))
    best_nt = avgs['test_score'].idxmax()
    best_scorent = avgs.loc[best_nt, 'test_score']
    print(best_scorent)

    # Release the lock even if the update fails, so other workers that
    # share it are not blocked forever.
    lock.acquire()
    try:
        score_global.update({pop_index: best_scorent})
    finally:
        lock.release()
from msmbuilder.cluster import MiniBatchKMeans from msmbuilder.msm import MarkovStateModel from sklearn.pipeline import Pipeline import os from ..adaptive import create_folder logging.disable(logging.CRITICAL) parser = NumberedRunsParser(traj_fmt='run-{run}.nc', top_fn='data_app/runs/structure.prmtop', step_ps=200) meta = gather_metadata('/'.join(['data_app/runs/', '*nc']), parser) model = Pipeline([('feat', DihedralFeaturizer()), ('scaler', MinMaxScaler()), ('tICA', tICA(lag_time=1, n_components=4)), ('clusterer', MiniBatchKMeans(n_clusters=5)), ('msm', MarkovStateModel(lag_time=1, n_timescales=4))]) spawns = [ (0, 1), ] epoch = 1 class TestAppBase: def __init__(self): self.app = App(generator_folder='data_app/generators', data_folder='data_app/runs', input_folder='data_app/inputs', filtered_folder='data_app/filtered_trajs', model_folder='data_app/model',
# # TIMESCALES # # The data will be loaded with a stride of 10 frames. Each fame is 50ps, so the time per frame will be # 500ps/frame or 0.5ns/frame. # Each trajectory is 1000 frames long # Lag time will be 40 frames (20 ns) based on a visual inspection of /Misc/MSM_lag_time.ipynb to_ns = 0.5 msm_lag = int(40 / to_ns) # # FEATURE INDICES # all_idx = np.load('indices_all.npy') # # OTHER PARAMETERS # ref_traj = md.load('../Data/data/trajectory-1.xtc', top='../Data/data/fs-peptide.pdb') featurizer = FeatureSelector(features=feats) pipe = Pipeline([('features', featurizer), ('variance_cut', VarianceThreshold()), ('scaling', RobustScaler()), ('cluster', MiniBatchKMeans()), ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False))]) save_generic(pipe, 'model.pickl')
import pandas as pd
import pickle
import glob
from msmbuilder.featurizer import ContactFeaturizer
from msmbuilder.dataset import dataset
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import verbosedump,verboseload

# Pickled list of per-trajectory feature arrays.
file='dataset_nark.best_nonredu.pkl'
# Open in binary mode (pickle data is bytes) and close the handle when done.
# NOTE: pickle executes code on load; only use files from a trusted source.
with open(file, 'rb') as pkl_handle:
    alpha=pickle.load(pkl_handle)
print ('#_trajs:'+ str(np.shape(alpha)[0])+'\n'
       '#_CA_contacts:'+str(np.shape(alpha[0])[1]))

# Project the features onto 10 tICA components.
tica_model=tICA(n_components=10,lag_time=1)
tica_trajs=tica_model.fit_transform(alpha)

# Cluster the tICA projections into 400 states.
clusterer =MiniBatchKMeans(n_clusters=400)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Fit the MSM on the clustered trajectories.
msm =MarkovStateModel(lag_time=150, n_timescales=5)
assignments = msm.fit_transform(clustered_trajs)

# All tICA frames in one array, and the MSM population of each frame's state.
data = np.concatenate(tica_trajs, axis=0)
pi_0 = msm.populations_[np.concatenate(assignments, axis=0)]

# Output files share the input file name without its ".pkl" suffix.
name=file[:-4]
verbosedump(tica_model, name+"-GA-tica_model.pkl")
verbosedump(tica_trajs, name+"-GA-tica_trajs.pkl")
verbosedump(clusterer, name+"-GA-mbkm_mdl.pkl")
verbosedump(clustered_trajs, name+"-GA-clustered_trajs.pkl")
verbosedump(msm,name+"-GA-msm.pkl")
verbosedump(assignments,name+"-GA-assignments.pkl")
verbosedump(data,name+"-GA-weighted-msme-tica-data.pkl")
# Concatenate features # ftraj = num trajectories x np.array(n_frames, n_features) ftraj = [] for traj in all_trajs: tmp = [] for feat in traj: if feat.ndim == 1: feat = feat.reshape(-1, 1) tmp.append(feat) ftraj.append(np.concatenate(tmp, axis=1)) # Make Pipeline cv_iter = ShuffleSplit(n_splits=5, test_size=0.5) estimators = [('scale', StandardScaler()), ('tica', tICA()), ('cluster', MiniBatchKMeans(random_state=0)), ('msm', MarkovStateModel())] param_grid = { 'cluster__n_clusters': list(np.linspace(200, 500, num=2).astype(int)), 'tica__n_components': list(np.linspace(10, 30, num=2).astype(int)), 'tica__lag_time': list(np.linspace(200, 500, num=2).astype(int)) } params = { 'cluster__n_clusters': scipy.stats.randint(low=200, high=200), 'tica__n_components': scipy.stats.randint(low=2, high=40), 'tica__lag_time': scipy.stats.randint(low=100, high=999) } pipe = Pipeline(estimators)
rs = np.random.RandomState(42) # Load Fs Peptide Data trajs = FsPeptide().get().trajectories # Extract Backbone Dihedrals featurizer = DihedralFeaturizer(types=['chi1']) diheds = featurizer.fit_transform(trajs) # Perform Dimensionality Reduction tica_model = tICA(lag_time=2, n_components=2) tica_trajs = tica_model.fit_transform(diheds) # Perform Clustering clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs) clustered_trajs = clusterer.fit_transform(tica_trajs) # Construct MSM msm = MarkovStateModel(lag_time=2) assignments = msm.fit_transform(clustered_trajs) # Plot Stacked Distributions a = np.concatenate(assignments, axis=0) d = np.concatenate(diheds, axis=0) # Plot Stacked Distributions of the sine of each Chi1 angle # within an arbitrary set of states {2, 5, 0} path_data = [d[a == i][:, ::2] for i in [2, 5, 0]] msme.plot_stackdist(path_data)
import matplotlib
import matplotlib.patches as mpatches
from seaborn.distributions import (_scipy_univariate_kde, _scipy_bivariate_kde)

###READ TRAJECTORY FILES AND PREPROCESSING the data
ds=dataset("*.nc", topology="s.pdb")
# Featurize phi/psi dihedrals; the result is written to "dihed/" and then
# re-opened from there.
feat = DihedralFeaturizer(types=['phi', 'psi'])
ds_alpha=ds.fit_transform_with(feat, "dihed/",fmt='dir-npy')
ds_alpha = dataset("./dihed/")
print(len(ds_alpha),len(ds))
print(ds[0].xyz.shape)
ds_alpha = dataset("./dihed/")
# Project onto two tICA components; written to 'tica' and re-opened.
tica_mdl = tICA(lag_time=10,n_components=2)
tica_features = ds_alpha.fit_transform_with(tica_mdl, out_ds = 'tica')
tica_features = dataset("./tica/")
# Cluster the tICA data; written to 'assignments/' and re-opened.
kmeans_mdl = MiniBatchKMeans(10)
assignments = tica_features.fit_transform_with(kmeans_mdl, out_ds='assignments/')
assignments = dataset("assignments/")
import msmexplorer as msme
from matplotlib import pyplot as plt
tica_trajs=dataset("./tica/")
###PREPROCESSING THE TICA DATA
j=0
tica=[]
# On each pass, collect frames j..j+800 of every tICA trajectory into one array.
# NOTE(review): neither `j` nor `tica` is updated in the lines visible here;
# the loop body may continue beyond this excerpt -- confirm.
for k in range(6):
    f1=[]
    for i in range(len(tica_trajs)):
        f=list(tica_trajs[i][j:j+800])
        f1.append(f)
    f1=np.array(f1)
# Featurize phi/psi dihedrals; results are stored under 'diheds/'.
featurizer = DihedralFeaturizer(types=['phi', 'psi'])
diheds = xyz.fit_transform_with(featurizer, 'diheds/', fmt='dir-npy')

#tICA
from msmbuilder.decomposition import tICA
tica_model = tICA(lag_time=2, n_components=4)
# fit and transform can be done in separate steps:
tica_model = diheds.fit_with(tica_model)
tica_trajs = diheds.transform_with(tica_model, 'ticas/', fmt='dir-npy')
# txx holds all tICA trajectories joined into one array.
txx = np.concatenate(tica_trajs)

# clustering: can change hyperparameters
from msmbuilder.cluster import MiniBatchKMeans
#clusterer = MiniBatchKMeans(n_clusters=num_clusters)
# Batch size is tied to the number of clusters (10 per cluster).
clusterer = MiniBatchKMeans(n_clusters=num_clusters, max_no_improvement=1000, batch_size=num_clusters*10)
clustered_trajs = tica_trajs.fit_transform_with(
    clusterer, 'kmeans/', fmt='dir-npy'
)

# msm builder
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump
msm = MarkovStateModel(lag_time=2, n_timescales=20, ergodic_cutoff='off')
msm.fit(clustered_trajs)

# save tIC plot
import matplotlib
matplotlib.use('Agg')  # Must be placed before matplotlib.pyplot
import matplotlib.pyplot as plt
# Density of all frames over the first two tICA components (log-scaled bins).
plt.hexbin(txx[:, 0], txx[:, 1], bins='log', mincnt=1, cmap="bone_r")
import os
from glob import glob
import numpy as np
from multiprocessing import Pool
import pandas as pd
from msmbuilder.featurizer import DihedralFeaturizer, KappaAngleFeaturizer
from sklearn.model_selection import cross_val_score, cross_val_predict

# Globals
num_procs = 5
traj_dir = '/mnt/storage/home/ra15808/scratch/train'
# traj_dir = '/Users/robert_arbon/Datasets/DHFR/train'

# Two pipelines that differ only in the MSM step: pipe_fixed sets
# n_timescales=2, pipe_csp sets use_gap='timescales' instead.
pipe_fixed = Pipeline([('variance_cut', VarianceThreshold()),
                       ('tica', tICA(kinetic_mapping=True)),
                       ('cluster', MiniBatchKMeans()),
                       ('msm', MarkovStateModel(n_timescales=2, lag_time=50, verbose=True))])
pipe_csp = Pipeline([('variance_cut', VarianceThreshold()),
                     ('tica', tICA(kinetic_mapping=True)),
                     ('cluster', MiniBatchKMeans()),
                     ('msm', MarkovStateModel(use_gap='timescales', lag_time=50, verbose=True))])

# Previously stored trials, loaded as a pandas object.
best = pd.read_pickle('best_trials.pickl')
"""Cluster tICA results

{{header}}

Meta
----
depends:
 - ttrajs
 - meta.pandas.pickl
"""

from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.cluster import MiniBatchKMeans

## Load
meta, ttrajs = load_trajs('ttrajs')

## Fit
# Only the first `dim` columns of each tICA trajectory are used.
dim = 5
kmeans = MiniBatchKMeans(n_clusters=500)
kmeans.fit([traj[:, :dim] for traj in ttrajs.values()])

## Transform
# Label each trajectory separately, keeping the keys of `ttrajs`; the same
# column slice as in the fit is applied.
ktrajs = {}
for k, v in ttrajs.items():
    ktrajs[k] = kmeans.partial_transform(v[:, :dim])

## Save
print(kmeans.summarize())
save_trajs(ktrajs, 'ktrajs', meta)
save_generic(kmeans, 'kmeans.pickl')
tica_model = tICA(lag_time=40, n_components=20)
# fit and transform can be done in separate steps:
tica_model = diheds.fit_with(tica_model)
tica_trajs = diheds.transform_with(tica_model, 'ticas/', fmt='dir-npy')
# txx holds all tICA trajectories joined into one array.
txx = np.concatenate(tica_trajs)

# save tICA
np.savetxt(folder + 'tICA_coord_+' + which_dataset + '.csv', txx, delimiter=',')

# clustering
from msmbuilder.cluster import MiniBatchKMeans
clusterer = MiniBatchKMeans(n_clusters=num_clusters)  #100 for camodulin
clustered_trajs = tica_trajs.fit_transform_with(clusterer, 'kmeans/', fmt='dir-npy')

# msm builder
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump
# The MSM lag time depends on the dataset. Any other dataset name is
# rejected explicitly instead of failing below on an undefined `msm`.
if which_dataset == 'fspeptide':
    msm = MarkovStateModel(lag_time=2, n_timescales=20, ergodic_cutoff='on')
elif which_dataset == 'apo_calmodulin':
    msm = MarkovStateModel(lag_time=20, n_timescales=20, ergodic_cutoff='on')
else:
    raise ValueError(
        "No MSM settings defined for dataset %r; expected 'fspeptide' or "
        "'apo_calmodulin'" % (which_dataset,))
msm.fit(clustered_trajs)
from msmbuilder.featurizer import DihedralFeaturizer
# Featurize phi/psi dihedrals; results are stored under 'diheds/'.
featurizer = DihedralFeaturizer(types=['phi', 'psi'])
diheds = xyz.fit_transform_with(featurizer, 'diheds/', fmt='dir-npy')

#tICA
from msmbuilder.decomposition import tICA
tica_model = tICA(lag_time=2, n_components=4)
# fit and transform can be done in separate steps:
tica_model = diheds.fit_with(tica_model)
tica_trajs = diheds.transform_with(tica_model, 'ticas/', fmt='dir-npy')
# txx holds all tICA trajectories joined into one array.
txx = np.concatenate(tica_trajs)

# clustering
from msmbuilder.cluster import MiniBatchKMeans
clusterer = MiniBatchKMeans(n_clusters=num_clusters)
clustered_trajs = tica_trajs.fit_transform_with(
    clusterer, 'kmeans/', fmt='dir-npy'
)

# msm builder
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump
msm = MarkovStateModel(lag_time=20, n_timescales=20, ergodic_cutoff='on')
msm.fit(clustered_trajs)

# Get MFPT
# Computed from the fitted MSM.
from msmbuilder.tpt import mfpts
mfpt_matrix = mfpts(msm)

# Get flux matrix