def test_robustscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.RobustScaler
    # with sklearn.preprocessing.RobustScaler
    robustscalerr = RobustScalerR()
    robustscalerr.fit(np.concatenate(trajs))

    robustscaler = RobustScaler()
    robustscaler.fit(trajs)

    y_ref1 = robustscalerr.transform(trajs[0])
    y1 = robustscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
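
# A minimal self-contained sketch of the same comparison, using synthetic data
# in place of the module-level `trajs` fixture (an assumption for illustration).
# The key API difference: the sklearn scaler fits on one concatenated 2D array,
# while the msmbuilder scaler accepts a list of per-trajectory arrays.
def example_robustscaler_equivalence():
    import numpy as np
    from sklearn.preprocessing import RobustScaler as RobustScalerR
    from msmbuilder.preprocessing import RobustScaler

    trajs = [np.random.rand(100, 4), np.random.rand(50, 4)]  # hypothetical data
    sk = RobustScalerR().fit(np.concatenate(trajs))
    msmb = RobustScaler().fit(trajs)
    np.testing.assert_array_almost_equal(sk.transform(trajs[0]),
                                         msmb.transform(trajs)[0])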
def scale_data(self, scaler='Robust'):
    print('Scale featurized data has been called\n')
    print('-------------------------------\n')
    from msmbuilder.preprocessing import RobustScaler
    if scaler == 'Robust':
        scaler = RobustScaler()
    self.scaled_data = scaler.fit_transform(self.sim_seqs)
    print('scaled ', self.scaled_data[0].shape)
    # print("Scaling featurized data successfully")
    print('-----------------------------------\n')
def build_model(self, user_defined_model):
    """
    Load or build a model (a scikit-learn Pipeline) to do all the
    transforming and fitting

    :param user_defined_model: Either None (load a saved model from disk if
        one exists, otherwise build the default pipeline) or a Pipeline
        object to use as the model
    :return model: The model
    """
    if user_defined_model is None:
        if os.path.exists(self.model_pkl_fname):
            logger.info('Loading model pkl file {}'.format(self.model_pkl_fname))
            model = load_generic(self.model_pkl_fname)
        else:
            logger.info('Building default model based on dihedrals')

            # Aim for a lag time of 1 ns for the tICA and MSM. If the stride
            # is too big to allow that, fall back to a lag time of 1 frame
            # and report how long that is in ns.
            if self.app.meta is not None:
                lag_time = max(1, int(1 / self.timestep))
                logger.info('Using a lag time of {} ns for the tICA and MSM'.format(
                    lag_time * self.timestep))
            else:
                self.timestep = None
                lag_time = 1
                logger.warning('Cannot determine timestep. Defaulting to a lag time of 1 frame.')
            model = Pipeline([
                ('feat', DihedralFeaturizer()),
                ('scaler', RobustScaler()),
                ('tICA', tICA(lag_time=lag_time, commute_mapping=True,
                              n_components=10)),
                ('clusterer', MiniBatchKMeans(n_clusters=200)),
                ('msm', MarkovStateModel(lag_time=lag_time,
                                         ergodic_cutoff='off',
                                         reversible_type=None))
            ])
    else:
        if not isinstance(user_defined_model, Pipeline):
            raise ValueError('model is not an sklearn.pipeline.Pipeline object')
        else:
            logger.info('Using user defined model')
            model = user_defined_model
    return model
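
# A hedged usage sketch, shown as comments because the surrounding class is
# not part of this snippet (`analyzer` is a hypothetical instance of it):
#
#   model = analyzer.build_model(None)       # load the pkl if present, else build the default pipeline
#   custom = Pipeline([('feat', DihedralFeaturizer()),
#                      ('msm', MarkovStateModel())])
#   model = analyzer.build_model(custom)     # a user-supplied Pipeline is validated and used as-is
#   analyzer.build_model('not a pipeline')   # raises ValueError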
#
# TIMESCALES
#
# The data will be loaded with a stride of 10 frames. Each frame is 50 ps, so
# the effective time per loaded frame is 500 ps/frame, i.e. 0.5 ns/frame.
# Each trajectory is 1000 frames long.
# The MSM lag time will be 40 ns (80 strided frames), based on a visual
# inspection of /Misc/MSM_lag_time.ipynb
to_ns = 0.5
msm_lag = int(40 / to_ns)

#
# FEATURE INDICES
#
all_idx = np.load('indices_all.npy')

#
# OTHER PARAMETERS
#
ref_traj = md.load('../Data/data/trajectory-1.xtc',
                   top='../Data/data/fs-peptide.pdb')

featurizer = FeatureSelector(features=feats)

pipe = Pipeline([('features', featurizer),
                 ('variance_cut', VarianceThreshold()),
                 ('scaling', RobustScaler()),
                 ('cluster', MiniBatchKMeans()),
                 ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False))])

save_generic(pipe, 'model.pickl')
# memory at once. The dataset object lazily loads trajectories as they are needed.
# Below, we create a dataset out of the many *.xtc files we downloaded. We only load
# every 10th frame.
xyz = dataset("./*.xtc", topology='./%s' % args.pdb)

# The raw (x, y, z) coordinates from the simulation do not respect the translational
# and rotational symmetry of our problem. A Featurizer transforms cartesian
# coordinates into other representations. Here we use the DihedralFeaturizer to turn
# our data into phi and psi dihedral angles.
featurizer = DihedralFeaturizer(types=['phi', 'psi'])
diheds = xyz.fit_transform_with(featurizer, 'diheds/', fmt='dir-npy')

# Since the range of values in our raw data can vary widely from feature to feature,
# we can scale values to reduce bias. Here we use the RobustScaler to center and
# scale our dihedral angles by their respective interquartile ranges.
scaler = RobustScaler()
scaled_diheds = diheds.fit_transform_with(scaler, 'scaled_diheds/', fmt='dir-npy')

# Intermediate kinetic model: tICA
# tICA is similar to principal component analysis.
tica_model = tICA(lag_time=int(args.lag), n_components=int(args.components))
# fit and transform can be done in separate steps:
tica_model = scaled_diheds.fit_with(tica_model)
tica_trajs = scaled_diheds.transform_with(tica_model, 'ticas/', fmt='dir-npy')

# Conformations need to be clustered into states (sometimes written as microstates).
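# A hedged sketch of the clustering step the comment above introduces, using
# the same lazily-cached dataset API (the choice of n_clusters=100 is an
# assumption for illustration, not taken from this script):
from msmbuilder.cluster import MiniBatchKMeans

clusterer = MiniBatchKMeans(n_clusters=100)
clustered_trajs = tica_trajs.fit_transform_with(clusterer, 'kmeans/', fmt='dir-npy')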
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel
from msmbuilder.io import save_generic
from sklearn.base import clone, BaseEstimator
from six import iteritems

# The data will be loaded with a stride of 10 frames. Each frame is 50 ps, so
# the effective time per loaded frame is 500 ps/frame, i.e. 0.5 ns/frame.
# Each trajectory is 1000 frames long.
# The MSM lag time will be 40 ns (80 strided frames), based on a visual
# inspection of /Misc/MSM_lag_time.ipynb
to_ns = 0.5
msm_lag = int(40 / to_ns)

# Available dihedral types: ['phi', 'psi', 'omega', 'chi1', 'chi2', 'chi3', 'chi4']
feats = [('backbone_dihed', DihedralFeaturizer(types=['phi', 'psi'])),
         ('residues_dihed', DihedralFeaturizer(types=['chi1', 'chi2',
                                                      'chi3', 'chi4'])),
         ('contacts', ContactFeaturizer())]

featurizer = FeatureSelector(features=feats)

pipe = Pipeline([('features', featurizer),
                 ('variance_cut', VarianceThreshold()),
                 ('scaling', RobustScaler()),
                 ('tica', tICA(kinetic_mapping=True)),
                 ('cluster', MiniBatchKMeans()),
                 ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False))])

save_generic(pipe, 'model.pickl')
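
# A hedged sketch of how the saved pipeline might later be reloaded and fit.
# load_generic is save_generic's counterpart in msmbuilder.io; `trajs`, a list
# of mdtraj trajectories, is assumed here for illustration:
#
#   from msmbuilder.io import load_generic
#   pipe = load_generic('model.pickl')
#   pipe.fit(trajs)
#   print(pipe.named_steps['msm'].timescales_)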
def calculate_fitness(population_dihedral, diheds, score_global, i, lock):
    import pandas as pd
    import numpy as np

    pop_index = i

    # Keep only the dihedral columns selected by this population member
    new_diheds = []
    for i in range(0, len(diheds)):
        X = diheds[i]
        selected_features = X[:, population_dihedral]
        new_diheds.append(selected_features)

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(new_diheds)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=5)
    tica_model.fit(scaled_diheds)
    tica_trajs = tica_model.transform(scaled_diheds)

    from msmbuilder.cluster import MiniBatchKMeans
    clusterer = MiniBatchKMeans(n_clusters=200, random_state=42)
    clustered_trajs = clusterer.fit_transform(tica_trajs)

    from msmbuilder.msm import MarkovStateModel
    msm = MarkovStateModel(lag_time=50, n_timescales=5)

    # sklearn.cross_validation was removed in modern scikit-learn;
    # sklearn.model_selection provides the equivalent KFold API.
    from sklearn.model_selection import KFold
    n_states = [4]
    cv = KFold(n_splits=5)
    results = []
    for n in n_states:
        # n labels the candidate state count for bookkeeping;
        # msm.fit determines the actual number of states from the data
        for fold, (train_index, test_index) in enumerate(cv.split(clustered_trajs)):
            train_data = [clustered_trajs[i] for i in train_index]
            test_data = [clustered_trajs[i] for i in test_index]
            msm.fit(train_data)
            train_score = msm.score(train_data)
            test_score = msm.score(test_data)
            time_score = msm.timescales_[0]
            time_test_score = time_score + test_score
            print(time_score)
            print(test_score)
            av_score = time_test_score / 2
            results.append({
                'train_score': train_score,
                'test_score': test_score,
                'time_score': time_score,
                'av_score': av_score,
                'n_states': n,
                'fold': fold
            })
        print(msm.timescales_)

    results = pd.DataFrame(results)
    avgs = (results.groupby('n_states').aggregate(np.median).drop('fold', axis=1))
    best_nt = avgs['test_score'].idxmax()
    best_n = avgs['av_score'].idxmax()
    best_score = avgs.loc[best_n, 'av_score']
    best_scorent = avgs.loc[best_nt, 'test_score']
    print(best_scorent)

    lock.acquire()
    score_global.update({pop_index: best_scorent})
    lock.release()
def main():
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m', '--min_samples', help='min_samples', default=5, type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprobe', default=10, type=int)

    # Download example dataset
    from msmbuilder.example_datasets import AlanineDipeptide
    ala2 = AlanineDipeptide(verbose=False)
    xyz = ala2.get().trajectories
    print(ala2.description())
    # xyz = [t[::10] for t in xyz]
    print("{} trajectories".format(len(xyz)))

    # msmbuilder does not keep track of units! You must keep track of your
    # data's timestep
    to_ns = 0.5
    print("with length {} ns".format(set(len(x) * to_ns for x in xyz)))

    from msmbuilder.featurizer import DihedralFeaturizer
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = featurizer.fit_transform(xyz)
    print(xyz[0].xyz.shape)
    print(diheds[0].shape)

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(diheds)
    print(diheds[0].shape)
    print(scaled_diheds[0].shape)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=2)
    # fit and transform can be done in separate steps:
    tica_model.fit(diheds)
    tica_trajs = tica_model.transform(diheds)

    # Re-featurize without the sin/cos transform to get raw angles
    featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=False)
    diheds = featurizer.fit_transform(xyz)
    print(diheds[0].shape)
    print(tica_trajs[0].shape)

    # ===========================================================================
    # if os.path.isfile("./phi_angles.txt") and os.path.isfile("./psi_angles.txt"):
    #     phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
    #     psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    #     X = np.column_stack((phi_angles, psi_angles))
    phi_angles = np.degrees(diheds[0][:, 0])
    psi_angles = np.degrees(diheds[0][:, 1])
    print(phi_angles)
    X = tica_trajs[0].astype(np.float32)
    n_size = X.shape[0]
    dimension = X.shape[1]
    # ===========================================================================

    args = cli.parse_args()
    eps = args.eps
    min_samples = args.min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' %
          (n_size, dimension, eps, min_samples))

    # Estimate candidate eps values from the pairwise-distance distribution
    # of a random subsample of frames
    n_samples = 1000
    import random
    whole_samples = random.sample(list(X), n_samples)

    from metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples,
                                            metric='l2')
    print(sample_dist_metric.shape)
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i + 1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    # Each eps is the distance at a given percentile of the sampled
    # pairwise-distance distribution
    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.30, 0.20, 0.10]:
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        dc = sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)

    # ===========================================================================
    # Do the clustering using the MR-DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    remove_outliers = False
    potential = False
    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)

    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist,
                      nprobe=nprobe, metric="l2", GPU=False, IVFFlat=IVFFlat)
    db.fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    n_microstates = len(set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculate the fraction of frames in each state (outliers removed)
    frame_bincount = np.bincount(old_assignments[old_assignments >= 0])
    frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    old_frame_freq_percent_sorted = frame_freq_percent_sorted
    old_frame_freq_index_sorted = frame_freq_index_sorted

    iter_name = (clustering_name + '0' + '_eps_' + str(eps) +
                 '_min_samples_' + str(min_samples) +
                 '_n_states_' + str(n_microstates))
    plot_cluster(labels=old_assignments, phi_angles=phi_angles,
                 psi_angles=psi_angles, name=iter_name, potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [50, 30, 10]
    n_min_samples = len(min_samples_list)
    results = np.zeros((n_min_samples, n_iterations, len_frames), dtype=np.int32)

    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist,
                          nprobe=nprobe, metric="l2", GPU=False,
                          IVFFlat=IVFFlat).fit(X)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        new_assignments = db.labels_
        if i == n_iterations - 1:
            remove_outliers = True
        assignments = merge_assignments(new_assignments, old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)
        # results[j, i, :] = np.array(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps,
              'min_samples:', min_samples,
              'Estimated number of clusters:', n_microstates)
        iter_name = (clustering_name + str(i) + '_eps_' + str(eps) +
                     '_min_samples_' + str(min_samples) +
                     '_n_states_' + str(n_microstates))
        plot_cluster(labels=assignments, phi_angles=phi_angles,
                     psi_angles=psi_angles, name=iter_name, potential=potential)
        # old_assignments = assignments

    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt", min_samples_list, fmt="%d", delimiter=",")
if __name__ == "__main__":
    trajectory_dir = '/Volumes/REA_Data/AADH/traj_5_rxts'
    topology_file = '/Users/robert_arbon/Code/AADH/Analysis/MSM_Reactants_Only/2agy_rxt.psf'
    reference_file = '/Users/robert_arbon/Code/AADH/Analysis/MSM_Reactants_Only/2agy_rxt.pdb'
    reference_traj = md.load(reference_file)

    # Load the metadata
    meta = load_metadata(traj_dir=trajectory_dir, top=topology_file)

    # Featurize
    feature = RawPositionsFeaturizer(ref_traj=reference_traj)
    ftrajs = featurize(featurizer=feature, meta_data=meta)

    # Summarize
    variance = np.var(combine(ftrajs), axis=0)
    plot_features(variance, name='Variance.png', feature_name='Variance',
                  ordered=False)

    # Normalize
    scaler = RobustScaler()
    strajs = scaler.fit_transform(ftrajs)

    # Perform tICA
    tica_obj = tICA(n_components=10, lag_time=10, kinetic_mapping=True)
    tica_traj = tica_obj.fit_transform(strajs)
class TestUtils:
    def setUp(self):
        numpy.random.seed(12)
        self.top = 'data_app/runs/structure.prmtop'
        self.traj_1 = 'data_app/runs/run-000.nc'
        self.traj_2 = 'data_app/runs/run-001.nc'
        self.feat = DihedralFeaturizer()
        self.traj_dict = {
            0: load(self.traj_1, top=self.top),
            1: load(self.traj_2, top=self.top)
        }
        self.scaler = RobustScaler()
        self.tica = tICA(n_components=2)
        self.ftrajs = {
            0: numpy.random.rand(100, 50),
            1: numpy.random.rand(100, 50),
        }

    def test_get_ftrajs(self):
        output = get_ftrajs(self.traj_dict, self.feat)
        assert len(output) == 2
        assert type(output) == dict

    def test_get_sctrajs(self):
        self.scaler.fit(list(self.ftrajs.values()))
        output = get_sctrajs(self.ftrajs, self.scaler)
        assert len(output) == 2
        assert type(output) == dict

    def test_get_ttrajs(self):
        self.tica.fit(list(self.ftrajs.values()))
        output = get_ttrajs(self.ftrajs, self.tica)
        assert len(output) == 2
        assert type(output) == dict

    def test_traj_from_stateinds(self):
        traj = traj_from_stateinds(spawns, meta)
        assert traj.n_frames == 1

    def test_write_production_file(self):
        write_production_file()
        assert os.path.exists('Production.in')
        os.remove('Production.in')

    def test_write_cpptraj_script(self):
        write_cpptraj_script(self.traj_1, self.top)
        assert os.path.exists('script.cpptraj')
        os.remove('script.cpptraj')

    def test_write_tleap_script(self):
        write_tleap_script(write=True)
        assert os.path.exists('script.tleap')
        os.remove('script.tleap')

    def test_create_folder(self):
        fname = 'foo'
        create_folder(fname)
        assert os.path.isdir(fname)
        os.removedirs(fname)

    def test_create_symlinks(self):
        create_folder('src_symlinks')
        create_folder('dst_symlinks')
        with open('src_symlinks/1.txt', 'w') as f:
            f.writelines('foo')
        create_symlinks(files='src_symlinks/*.txt', dst_folder='dst_symlinks')
        assert os.path.exists('dst_symlinks/1.txt')
        rmtree('src_symlinks')
        rmtree('dst_symlinks')

    def test_hmr_prmtop(self):
        new_top = hmr_prmtop(self.top, save=False)
        assert isinstance(new_top, AmberParm)
dump(f, "raw_featurizer.pkl")

# featurizer = DihedralFeaturizer(types=['chi1', 'chi2'],
#                                 resids=[73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83])
diheds = featurizer.fit_transform(ds)
dump(diheds, "features.pkl")
# print(ds[0].shape)
print(diheds[0].shape)

# This basically maps every feature to atom indices.
df1 = pd.DataFrame(featurizer.describe_features(ds))
dump(df1, "feature_descriptor.pkl")

# Robust scaling
from msmbuilder.preprocessing import RobustScaler
scaler = RobustScaler()
scaled_diheds = scaler.fit_transform(diheds)
print(diheds[0].shape)
print(scaled_diheds[0].shape)

# Reducing dimension
tica_model = tICA(lag_time=1, n_components=10)
# fit and transform can be done in separate steps:
tica_model.fit(diheds)
tica_trajs = tica_model.transform(diheds)
print(diheds[0].shape)
print(tica_trajs[0].shape)

# Let's dump the tICA model for future use
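# A hedged completion of the step introduced by the comment above, reusing the
# same dump helper already used for the featurizer and the features (the
# output filename is an assumption):
dump(tica_model, "tica_model.pkl")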
def main_modified(generations):
    import numpy as np
    from msmbuilder.preprocessing import RobustScaler
    import time
    import pickle
    import os
    import multiprocessing
    os.environ["OMP_NUM_THREADS"] = "1"
    import operator
    from multiprocessing import Pool
    from operator import itemgetter

    diheds = Get_dihedral_features_villin()
    scaler = RobustScaler()
    scaled_feature = scaler.fit_transform(diheds)

    # Laplacian score of each dihedral: returns the per-column means and the
    # indices of the important features
    Val = Laplacian_score(scaled_feature)
    col_mean = Val[0]
    imp_features = Val[1]

    current_gen = 0
    for_each_gen_score = []
    population_each_gen = []
    population_dihedral = initial_population(imp_features)
    cross_probability = 0.8
    num_parents = int(cross_probability * len(population_dihedral))
    population_dihedral_duplicate = []
    numberOfThreads = multiprocessing.cpu_count()

    f = open("benzamidine_diheds_ga_score" + str(generations) + ".txt", "a")
    while current_gen < generations:
        manager = multiprocessing.Manager()
        score = manager.dict()
        processes = []
        lock = multiprocessing.Lock()
        for i in range(len(population_dihedral)):
            p = multiprocessing.Process(target=calculate_fitness,
                                        args=(population_dihedral[i],
                                              scaled_feature, score, i, lock))
            processes.append(p)

        # Run the fitness processes in batches of numberOfThreads at a time
        # (chunks is a helper defined elsewhere)
        for i in chunks(processes, numberOfThreads):
            p_count = 0
            for process in i:
                process.start()
                p_count = p_count + 1
            print("started processes: " + str(p_count))
            for process in i:
                process.join()
                p_count = p_count - 1
            print("joined processes: " + str(p_count))
            for process in i:
                process.terminate()
                p_count = p_count + 1
            print("terminated processes: " + str(p_count))

        scored_population = dict(sorted(score.items(), key=operator.itemgetter(1)))
        for_each_gen_score.append(scored_population)
        population_each_gen.append(population_dihedral)
        scored_population_list = list(scored_population.keys())

        # Selection, crossover and mutation
        parents = select_parents_rank_based(scored_population,
                                            population_dihedral,
                                            cross_probability)
        offsprings_1 = crossover(parents, population_dihedral)
        parents_binary = parents_binarize(parents, imp_features)
        count_mutation = len(population_dihedral) - len(offsprings_1)
        offsprings_2_binary = mutation_binary_offspring(parents_binary, 4,
                                                        count_mutation)
        offsprings_2 = binary_to_pop_dih(offsprings_2_binary)
        for i in range(len(offsprings_2)):
            offsprings_2[i] = np.asarray(offsprings_2[i])
        for i in range(len(offsprings_1)):
            offsprings_1[i] = np.asarray(offsprings_1[i])

        offsprings = offsprings_1 + offsprings_2
        # offsprings.append(population_dihedral[scored_population_list[len(scored_population_list)-1]])
        # offsprings.append(population_dihedral[scored_population_list[len(scored_population_list)-2]])
        population_dihedral = offsprings
        current_gen = current_gen + 1
        print(for_each_gen_score, file=f)

    f.close()
    return for_each_gen_score, population_each_gen, scaled_feature, imp_features
from msmbuilder.preprocessing import RobustScaler
import numpy as np
from msmbuilder.io import load_trajs, save_trajs, save_generic
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from utilities import plot_box

if __name__ == '__main__':
    # Load
    feature_name = 'Positions'
    meta, feature_trajs = load_trajs('Unscaled-{}-ftraj'.format(feature_name))

    # Select scaler
    scaler = RobustScaler()

    # Fit on all trajectories, then transform each one
    scaler.fit(list(feature_trajs.values()))
    scaled_trajs = {}
    for k, v in feature_trajs.items():
        scaled_trajs[k] = scaler.partial_transform(v)

    # Plot scaled features
    ftrajs = np.concatenate([fx[::100] for fx in scaled_trajs.values()])
    fig, ax = plt.subplots(figsize=(15, 5))
    plot_box(ax, fxx=ftrajs, feature_name='Scaled {}'.format(feature_name))
    fig.tight_layout()
    fig.savefig("Scaled-{}-box.pdf".format(feature_name))

    # Save
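    # A hedged completion of the dangling "Save" step, mirroring the
    # Unscaled-/Scaled- naming convention used above (the exact output names
    # are assumptions); save_trajs and save_generic are already imported:
    save_trajs(scaled_trajs, 'Scaled-{}-ftraj'.format(feature_name), meta)
    save_generic(scaler, 'Scaled-{}-scaler.pickl'.format(feature_name))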