def _algorithm(self):
    """Rebuild the Markov model from the available data and spawn new runs.

    The simulations found under ``self.datapath`` / ``self.inputpath`` are
    gathered (and filtered when ``self.filter`` is set), projected with
    ``self.projection``, optionally reduced with TICA, clustered and turned
    into the Markov model stored in ``self._model``.  The frames returned by
    ``self._getSpawnFrames`` are then handed to ``self._writeInputs``.
    """
    logger.info('Postprocessing new data')

    traj_dirs = glob(path.join(self.datapath, '*', ''))
    structures = glob(path.join(self.inputpath, '*', 'structure.pdb'))
    input_dirs = glob(path.join(self.inputpath, '*', ''))
    sims = simlist(traj_dirs, structures, input_dirs)
    if self.filter:
        sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

    metric = Metric(sims, skip=self.skip)
    metric.set(self.projection)

    # Contact symmetry handling is currently disabled:
    # if self.contactsym is not None:
    #     contactSymmetry(data, self.contactsym)

    if self.ticadim > 0:
        # TICA is run on the already projected data; feeding the Metric
        # object directly was reported to be far too slow.
        projected = metric.project()
        tica_lag = int(max(2, np.ceil(self.ticalag)))
        reduced = TICA(projected, tica_lag).project(self.ticadim)
    else:
        reduced = metric.project()

    # Preferably this should happen before any projection: corrupted
    # simulations can affect TICA.
    reduced.dropTraj()

    n_clusters = self._numClusters(reduced.numFrames)
    reduced.cluster(self.clustmethod(n_clusters=n_clusters))

    self._model = Model(reduced)
    n_macro = self._numMacrostates(reduced)
    self._model.markovModel(self.lag, n_macro)
    if self.save:
        model_file = 'adapt_model_e' + str(self._getEpoch()) + '.dat'
        self._model.save(model_file)

    spawn_frames = self._getSpawnFrames(self._model, reduced)
    self._writeInputs(reduced.rel2sim(np.concatenate(spawn_frames)))
def handle_model(self):
    """Make sure ``self.model`` holds an ``htmd.model.Model`` instance.

    Three situations are handled, in this order:

    * ``self.model`` is falsy: the current parameters are written with
      ``self.write_parameters()`` and a new model is created by
      ``analyze_folder`` from the analysis attributes of this object.
    * ``self.model`` is a string: it is used as the file name of a saved
      model, which is loaded and assigned to ``self.model``.  If loading
      fails the error is printed and the method returns early, leaving
      ``self.model`` untouched.
    * ``self.model`` is a ``Model``: ``self.mol`` is set to a ``Molecule``
      built from the ``molfile`` of the first simulation of the model data.
    """
    from htmd.model import Model
    from htmd.molecule.molecule import Molecule

    if not self.model:
        from IDP_htmd.IDP_analysis import analyze_folder
        print("Creating new analysis")
        self.write_parameters()
        self.model = analyze_folder(
            self.input_folder, self.out_folder, self.skip, self.metrics,
            self.cluster, self.tica, self.ticadim, self.ticalag,
            self.modellag, self.modelunits, self.macronum, self.bulk_split,
            self.fes, self.rg_analysis, self.save_model, self.data_fstep)

    if isinstance(self.model, str):
        print("Loading model")
        try:
            model = Model()
            model.load(self.model)
        except Exception as err:
            # Only ordinary errors are treated as "could not load";
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            print("Could not load the model")
            print(err)
            return
        self.model = model

    if isinstance(self.model, Model):
        print("Model loaded")
        self.mol = Molecule(self.model.data.simlist[0].molfile)
def _createMSM(self, data):
    """Cluster ``data`` and build the Markov model kept in ``self._model``.

    Parameters
    ----------
    data : object
        Projected data exposing ``numFrames`` and ``cluster``; it is
        clustered in place with ``self.clustmethod``.

    When ``self.save`` is set, the model is written to
    ``saveddata/e<epoch>_adapt_model.dat``.
    """
    data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
    self._model = Model(data)
    self._model.markovModel(self.lag, self._numMacrostates(data))
    if self.save:
        # exist_ok avoids the check-then-create race of testing
        # path.exists() before calling makedirs().
        makedirs('saveddata', exist_ok=True)
        self._model.save(path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch())))
def bootstrap(model, rounds, fraction=0.8, clusters=500):
    """Yield one freshly clustered bootstrap model per round.

    Parameters
    ----------
    model : htmd.model.Model
        Model whose ``data`` is resampled with ``data.bootstrap``.
    rounds : int
        Number of bootstrap models to generate.
    fraction : float, optional
        Value forwarded to ``data.bootstrap``.
    clusters : int, optional
        Number of clusters requested from ``MiniBatchKMeans``.

    Yields
    ------
    htmd.model.Model
        Model wrapping the clustered bootstrap data.  No Markov model is
        computed here; that is left to the caller.
    """
    from htmd.model import Model
    from sklearn.cluster import MiniBatchKMeans

    for boot_round in range(rounds):
        resampled = model.data.bootstrap(fraction)
        print(f"Starting a new round of bootstrap - {boot_round}")
        kmeans = MiniBatchKMeans(n_clusters=clusters)
        resampled.cluster(kmeans, mergesmall=5)
        yield Model(resampled)
def viewModel(model_name):
    """Load a saved model from ``model_name`` and display its states.

    If reading ``model.macronum`` fails, a Markov model is first built with
    a lag of 20 ns and 5 macrostates.  The states are then shown with
    ``Model.viewStates``.

    Parameters
    ----------
    model_name : str
        File name passed to ``Model(file=...)``.
    """
    model = Model(file=model_name)
    try:
        # Probe only: presumably this raises when the loaded model has no
        # Markov model yet -- the exact exception type is not visible here,
        # hence the broad (but not bare) handler.
        model.macronum
    except Exception:
        model.markovModel(20, 5, units="ns")
    model.viewStates(alignsel="noh and resname MOL", protein=True, ligand="protein and backbone")
def _algorithm(self):
    """Rebuild the Markov model and spawn new runs from a per-state reward.

    After gathering, (optionally) filtering, projecting and clustering the
    simulations, a Markov model is built and stored in ``self._model``.
    Each state then receives a reward made of a directed component (from
    ``self._calculateDirectedComponent``) plus ``self.ucscale`` times an
    undirected component (negated state counts); both components are passed
    through ``self._featScale`` first.  The reward drives
    ``self._getSpawnFrames``, whose frames are written by
    ``self._writeInputs``.
    """
    logger.info('Postprocessing new data')

    traj_dirs = glob(path.join(self.datapath, '*', ''))
    structures = glob(path.join(self.inputpath, '*', 'structure.pdb'))
    input_dirs = glob(path.join(self.inputpath, '*', ''))
    sims = simlist(traj_dirs, structures, input_dirs)
    if self.filter:
        sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

    metric = Metric(sims, skip=self.skip)
    metric.set(self.projection)

    # Contact symmetry handling is currently disabled:
    # if self.contactsym is not None:
    #     contactSymmetry(data, self.contactsym)

    if self.ticadim > 0:
        # TICA is run on the already projected data; feeding the Metric
        # object directly was reported to be far too slow.
        projected = metric.project()
        tica_lag = int(max(2, np.ceil(self.ticalag)))
        reduced = TICA(projected, tica_lag).project(self.ticadim)
    else:
        reduced = metric.project()

    # Preferably this should happen before any projection: corrupted
    # simulations can affect TICA.
    reduced.dropTraj()

    n_clusters = self._numClusters(reduced.numFrames)
    reduced.cluster(self.clustmethod(n_clusters=n_clusters))

    msm = Model(reduced)
    self._model = msm
    self._model.markovModel(self.lag, self._numMacrostates(reduced))
    if self.save:
        model_file = 'adapt_model_e' + str(self._getEpoch()) + '.dat'
        self._model.save(model_file)

    # Undirected component: counts are negated so that lower counts give a
    # higher score.
    undirected = -msm.data.N
    if self.statetype == 'micro':
        undirected = undirected[msm.cluster_ofmicro]
    elif self.statetype == 'macro':
        undirected = macroAccumulate(msm, undirected[msm.cluster_ofmicro])

    # Directed component, mapped onto the same kind of states.
    directed = self._calculateDirectedComponent(sims, msm.data.St, msm.data.N)
    if self.statetype == 'micro':
        directed = directed[msm.cluster_ofmicro]
    elif self.statetype == 'macro':
        directed = macroAccumulate(msm, directed[msm.cluster_ofmicro])

    undirected = self._featScale(undirected)
    directed = self._featScale(directed)
    reward = directed + self.ucscale * undirected

    spawn_frames = self._getSpawnFrames(reward, self._model, reduced)
    self._writeInputs(reduced.rel2sim(np.concatenate(spawn_frames)))
def _createMSM(self, data):
    """Cluster ``data`` (retrying on ``IndexError``) and build ``self._model``.

    Parameters
    ----------
    data : object
        Projected data exposing ``numFrames`` and ``cluster``; it is
        clustered in place with ``self.clustmethod``.

    When ``self.save`` is set, the model is written to
    ``saveddata/e<epoch>_adapt_model.dat``.
    """
    from htmd.model import Model

    # Clustering is repeated for as long as it raises IndexError.
    # NOTE(review): the retry is unbounded -- confirm that the error is
    # really transient, otherwise this loops forever.
    while True:
        try:
            data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
        except IndexError:
            continue
        else:
            break

    self._model = Model(data)
    n_macro = self._numMacrostates(data)
    self._model.markovModel(self.lag, n_macro)

    if self.save:
        makedirs('saveddata', exist_ok=True)
        model_file = path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch()))
        self._model.save(model_file)
def scan_clusters(model, nclusters, out_dir):
    """Plot implied timescales for several cluster counts.

    Used to assess how the number of clusters affects the timescales of a
    model.  Note that ``model.data`` is re-clustered in place on every
    iteration.

    Parameters
    ----------
    model : htmd.model.Model
        Model whose data is re-clustered and analysed.
    nclusters : int[]
        Cluster counts to be tested.
    out_dir : str
        Directory where the generated plots are saved
        (``<out_dir>/1_its-<n>_clu``).
    """
    from sklearn.cluster import MiniBatchKMeans

    for i in nclusters:
        kmeans = MiniBatchKMeans(n_clusters=i)
        model.data.cluster(kmeans, mergesmall=5)
        its_model = Model(model.data)
        its_model.plotTimescales(plot=False, save=f"{out_dir}/1_its-{i}_clu")
def _algorithm(self):
    """Build a Markov model from the filtered data and spawn new runs.

    Steps: gather and filter the simulations, project them with a distance
    metric (between two selections when ``self.metricsel2`` is set, within
    ``self.metricsel1`` otherwise), optionally reduce with TICA, cluster,
    choose the number of macrostates from the implied timescales, build the
    Markov model and sample frames from the microstates selected through
    ``self._criteria`` / ``self._spawn``.  The sampled frames are written
    with ``self._writeInputs``.
    """
    logger.info('Postprocessing new data')

    all_sims = simlist(
        glob(path.join(self.datapath, '*', '')),
        glob(path.join(self.inputpath, '*', 'structure.pdb')),
        glob(path.join(self.inputpath, '*', '')))
    filtered_sims = simfilter(all_sims, self.filteredpath, filtersel=self.filtersel)

    if getattr(self, 'metricsel2', None) is not None:
        projection = MetricDistance(self.metricsel1, self.metricsel2, metric=self.metrictype)
    else:
        projection = MetricSelfDistance(self.metricsel1, metric=self.metrictype)

    metric = Metric(filtered_sims, skip=self.skip)
    metric.projection(projection)
    projected = metric.project()

    # Contact symmetry handling is currently disabled:
    # if self.contactsym is not None:
    #     contactSymmetry(data, self.contactsym)

    projected.dropTraj()
    if self.ticadim > 0:
        tica_lag = int(max(2, np.ceil(20 / self.skip)))
        reduced = TICA(projected, tica_lag).project(self.ticadim)
    else:
        reduced = projected

    # Heuristic number of clusters, never below 100 ...
    n_clusters = int(max(np.round(0.6 * np.log10(reduced.numFrames / 1000) * 1000 + 50), 100))
    # ... but capped at a third of the available frames.
    if n_clusters > reduced.numFrames / 3:
        n_clusters = int(reduced.numFrames / 3)

    reduced.cluster(self.clustmethod(n_clusters=n_clusters), mergesmall=5)
    # With fewer than 10 clusters left, cluster again without merging small
    # clusters and sample the states with replacement later on.
    replacement = bool(reduced.K < 10)
    if replacement:
        reduced.cluster(self.clustmethod(n_clusters=n_clusters))

    msm = Model(reduced)

    n_macro = self.macronum
    if reduced.K < n_macro:
        # NOTE(review): np.ceil returns a float, which is then passed as
        # ``nits`` below -- confirm that this is accepted.
        n_macro = np.ceil(reduced.K / 2)
        logger.warning(
            'Using less macrostates than requested due to lack of microstates. macronum = ' + str(n_macro))

    from pyemma.msm import timescales_msm
    its = timescales_msm(reduced.St.tolist(), lags=self.lag, nits=n_macro)
    timescales = its.get_timescales()
    # Keep only timescales longer than the lag: at least 2 macrostates and
    # at most the requested number.
    slow_count = np.sum(timescales > self.lag)
    n_macro = min(self.macronum, max(slow_count, 2))
    msm.markovModel(self.lag, n_macro)

    state_scores = self._criteria(msm, self.method)
    (spawn_counts, prob) = self._spawn(state_scores, self.nmax - self.running)
    logger.debug('spawncounts {}'.format(spawn_counts))

    chosen_states = np.where(spawn_counts > 0)[0]
    _, rel_frames = msm.sampleStates(
        chosen_states, spawn_counts[chosen_states],
        statetype='micro', replacement=replacement)
    logger.debug('relFrames {}'.format(rel_frames))

    self._writeInputs(reduced.rel2sim(np.concatenate(rel_frames)))
def analyze_folder(folder=None, out_folder="/tmp", skip=1, metrics=None,
                   clu=500, tica=True, ticadim=5, tica_lag=20, model_lag=10,
                   model_units='ns', macro_N=10, bulk_split=False, fes=True,
                   rg_analysis=True, save=True, data_fstep=None):
    """Create a Markov State Model from a data folder.

    Creates and returns a Markov State Model given a data folder.  Intended
    to follow the evolution of an adaptive sampling run.  Optionally saves
    the model and several informative plots in ``out_folder``.

    Parameters
    ----------
    folder : str
        Data folder where adaptive is running.  ``<folder>/simlist.npy`` is
        loaded when available; otherwise the simulation list is built from
        ``<folder>/filtered/*/`` and ``<folder>/filtered/filtered.pdb``.
    out_folder : str
        Output folder to store derived data.  Created if missing.  A saved
        ``<out_folder>/model.dat`` is reused as the source of projected
        data when it can be loaded.
    skip : int
        Number of frames to skip while projecting the MD data
    metrics : [:class: `Metric` object]
        Metric array used to project the data
    clu : int
        Number of clusters to create using the MiniBatchKMeans method.
    tica : bool
        Whether to use TICA (True) or GWPCA (False) for dimensionality
        reduction.  Only relevant when ``ticadim`` is set.
    ticadim : int
        Number of dimensions to project the data on.  If falsy, the model
        is created from the raw projected data.
    tica_lag : int, optional
        Lag passed to TICA (or to GWPCA when ``tica`` is False).
    model_lag : int
        Lag time passed to ``Model.markovModel``, in ``model_units``.
    model_units : str, optional
        Units of ``model_lag``.
    macro_N : int
        Number of macrostates of the final Markov State Model.  If falsy,
        no Markov model is computed.
    bulk_split : optional
        If truthy, forwarded to ``create_bulk(model, bulk_split)``; a
        failure there is printed and otherwise ignored.
    fes : bool, optional
        Currently unused: the free-energy-surface plot is disabled.
    rg_analysis : bool, optional
        If true, a plot with information relative to the radius of
        gyration of the molecule is created.
    save : bool, optional
        If true, the model is saved as ``<out_folder>/model.dat``.
    data_fstep : optional
        If truthy, assigned to the ``fstep`` attribute of the data before
        clustering.

    Returns
    -------
    :class:`Model`
        Final model

    Raises
    ------
    Exception
        If clustering fails; the message is ``"Error "`` followed by the
        original error, which is kept as the cause.
    """
    from glob import glob
    import os

    from htmd.model import Model
    from htmd.molecule.molecule import Molecule
    from htmd.simlist import simlist
    from htmd.projections.metric import Metric
    from sklearn.cluster import MiniBatchKMeans
    from IDP_htmd.IDP_model import plot_RG
    from IDP_htmd.model_utils import create_bulk

    # Only "already exists" is expected here; any other failure (e.g.
    # permissions) is a real error and must not be reported as such.
    try:
        os.makedirs(out_folder)
    except FileExistsError:
        print("Folder already exists")

    try:
        # NOTE(security): allow_pickle=True executes pickled content; only
        # load simlist.npy files from trusted locations.
        fsims = np.load(f"{folder}/simlist.npy", allow_pickle=True)
        print(f"Loaded {folder}/simlist.npy")
    except Exception:
        print("Creating simlist")
        # os.path.join works whether or not ``folder`` ends with a slash.
        sims = glob(os.path.join(folder, 'filtered', '*', ''))
        fsims = simlist(sims, os.path.join(folder, 'filtered', 'filtered.pdb'))

    metr = Metric(fsims, skip=skip)
    metr.set(metrics)

    # Reuse the projected data of a previously saved model when possible.
    # TODO: check if this gives problems to ITS.
    try:
        model = Model(file=f"{out_folder}/model.dat")
        out_data = model.data
        print(f"Loading model: {out_folder}/model.dat")
    except Exception:
        if tica and ticadim:
            from htmd.projections.tica import TICA
            print("Projecting TICA")
            tica_projector = TICA(metr, tica_lag)
            out_data = tica_projector.project(ticadim)
        elif not tica and ticadim:
            from htmd.projections.gwpca import GWPCA
            data = metr.project()
            data.dropTraj()
            print("using GWPCA")
            gwpca = GWPCA(data, tica_lag)
            out_data = gwpca.project(ticadim)
        else:
            print("Not using TICA")
            data = metr.project()
            data.dropTraj()
            out_data = data

    if data_fstep:
        out_data.fstep = data_fstep

    try:
        out_data.cluster(MiniBatchKMeans(n_clusters=clu), mergesmall=5)
    except Exception as e:
        # Same exception type and message as before, now with the cause.
        raise Exception("Error " + str(e)) from e

    model = Model(out_data)
    model.plotTimescales(plot=False, save=f"{out_folder}/1_its.png")

    if macro_N:
        model.markovModel(model_lag, macro_N, units=model_units)

    if bulk_split:
        try:
            print("Starting bulk splitting")
            create_bulk(model, bulk_split)
        except Exception as e:
            print("Could not perform the bulk splitting")
            print(e)

    # NOTE(review): presumably this needs the Markov model built above;
    # confirm the behaviour when ``macro_N`` is falsy.
    model.eqDistribution(plot=False,
                         save=f"{out_folder}/1.2_eqDistribution.png")

    if rg_analysis:
        # Imported under another name so that the ``rg_analysis`` flag is
        # not rebound to the function.
        from IDP_htmd.IDP_analysis import rg_analysis as compute_rg
        mol = Molecule(model.data.simlist[0].molfile)
        rg_data = compute_rg(model, skip=skip)
        plot_RG(rg_data, mol, save=f"{out_folder}/1.4_rg.png")

    # FES plot, currently disabled:
    # if fes and ticadim:
    #     model.plotFES(0, 1, temperature=310, states=True,
    #                   plot=False, save=f"{out_folder}/1.3_fes.png")

    if save:
        model.save(f"{out_folder}/model.dat")

    return model
from htmd.model import Model

# Ad-hoc driver script: runs a ModelAnalysis on a previously saved model.
# NOTE(review): all paths below are hard-coded, machine-specific locations.

# presumably (input folder, output folder) -- confirm against ModelAnalysis
mt = ModelAnalysis("/workspace8/excitome/adaptiveRun/O75376_MOR_58/",
                   "/home/pablo/testModel/")

# Projection used by the analysis: "contacts" metric between "noh and protein"
# and itself, with threshold 5 and both selections grouped by residue.
mt.metrics = [
    MetricDistance(
        sel1="noh and protein",
        sel2="noh and protein",
        metric="contacts",
        threshold=5,
        groupsel1="residue",
        groupsel2="residue")
]

# Load the saved model explicitly and hand the instance to the analysis
# (the commented-out alternative below assigns the file name instead).
model = Model()
model.load("/home/pablo/testModel/model.dat")
mt.model = model
mt.handle_model()
mt.sasa_variation()

# Alternative / additional settings, currently disabled:
# mt.model = "/home/pablo/testModel/model.dat"
# mt.plot_dihedral = "2_dihedral"
# mt.macronum = 4
# mt.plot_contacts = [
#     ('all_contacts', 'noh and protein', 5),
#     ('backbone', 'noh and backbone', 5),
#     ('sidechain', 'noh and sidechain', 4),
# ]
# mt.write_parameters()
# mt.generate_html_summary()
def _createMSM(self, epoch, output_folder, basedata=None, skip=1, clusters=0,
               ticadim=0, ticalag=20, macronum=2, modellag=5,
               modelunits="frames", fstep=None, data2combine=None):
    """Load or build the Markov model for ``epoch`` and refresh associated data.

    The model file is ``<output_folder>/<first letter of
    self.analysis_type><epoch>_model.dat``.  If it can be loaded it is
    reused (recomputing the Markov model when ``macronum`` or ``modellag``
    differ); otherwise the model is built from scratch and saved to that
    file.  Afterwards the data of every entry of ``self.associated_metrics``
    is computed if missing, and a copy restricted to the simulations of the
    model is stored in ``self.tmp_associated_data``.

    Parameters
    ----------
    epoch : int
        Last epoch (``analysis_type == "epoch"``) or simulation bound
        (``analysis_type == "sims"``) to include.
    output_folder : str
        Folder where the model file is read from / written to.
    basedata : optional
        If truthy, the data is replaced by the result of
        ``self._fitBaseline(data, basedata)``.  Not supported together with
        ``self.low_memory_usage``.
    skip : int, optional
        Frame skip used when projecting.
    clusters : int, optional
        Number of clusters; if falsy, ``self._numClusters`` decides.
    ticadim : int, optional
        Number of TICA dimensions; if falsy no TICA is applied.  Required
        when ``self.low_memory_usage`` is set.
    ticalag : int, optional
        Lag passed to TICA.
    macronum : int, optional
        Number of macrostates.
    modellag : int, optional
        Lag passed to ``Model.markovModel``, in ``modelunits``.
    modelunits : str, optional
        Units of ``modellag``.
    fstep : optional
        Forwarded to ``self._precalculateData``.
    data2combine : optional
        Extra data combined with the projected data before clustering;
        failures while combining are printed and ignored.

    Returns
    -------
    htmd.model.Model
        The loaded or newly built model.

    Raises
    ------
    ValueError
        If a model has to be built with an unsupported configuration
        (unknown ``analysis_type`` without low-memory mode, or low-memory
        mode without a plain TICA projection).
    """
    from htmd.projections.tica import TICA
    from sklearn.cluster import MiniBatchKMeans
    from htmd.model import Model

    model_file = f"{output_folder}/{self.analysis_type[0]}{epoch}_model.dat"

    def sims_up_to_epoch():
        # Indexes of the simulations of epochs 1..epoch.  A plain list is
        # given to np.concatenate: wrapping it in np.array() fails on
        # ragged (different-length) index lists with recent NumPy.
        return np.concatenate([
            self.epoch_sim_indexes[i]
            for i in range(1, epoch + 1)
            if i in self.epoch_sim_indexes
        ])

    try:
        model = Model(file=model_file)
        if model.macronum != macronum or model.lag != modellag:
            model.markovModel(modellag, macronum, units=modelunits)
        print("Model loaded")
    except Exception:
        # Any ordinary failure while loading / updating means "rebuild".
        if self.low_memory_usage and (basedata or not ticadim):
            # Low-memory mode only creates data through the TICA branch
            # below; fail clearly instead of with an UnboundLocalError.
            raise ValueError(
                "low_memory_usage requires ticadim > 0 and no basedata")

        if not self.low_memory_usage:
            if not self.precalculated_data:
                print("Calculating PRECALC DATA")
                self.precalc_data = self._precalculateData(
                    self.precalc_metric, self.input_folder,
                    fstep=fstep, skip=skip)
                self.precalculated_data = True

            if self.analysis_type == "epoch":
                keep_idx = sims_up_to_epoch()
            elif self.analysis_type == "sims":
                # NOTE(review): starts at 1, whereas the low-memory branch
                # below uses range(0, epoch) -- confirm which is intended.
                keep_idx = np.arange(1, epoch)
            else:
                raise ValueError(
                    f"Unsupported analysis_type: {self.analysis_type!r}")

            # Drop every precalculated trajectory that is not kept.
            drop_mask = np.ones(self.precalc_data.numTrajectories, dtype=bool)
            drop_mask[keep_idx] = False
            drop_idx = np.where(drop_mask)[0]

            data = self.precalc_data.copy()
            data.dropTraj(idx=drop_idx)
            data.dropTraj()

        if basedata:
            from htmd.projections.metric import MetricData
            r_fit = self._fitBaseline(data, basedata)
            data = MetricData(dat=r_fit, simlist=data.simlist)
        elif ticadim:
            if self.low_memory_usage:
                # Project straight from the simulations instead of keeping
                # the precalculated data around.
                from htmd.projections.metric import Metric
                if self.analysis_type == "epoch":
                    epoch_sim = sims_up_to_epoch()
                else:
                    epoch_sim = range(0, epoch)
                metr = Metric(self._sims[epoch_sim], skip=skip)
                metr.set(self.precalc_metric)
                data = TICA(metr, ticalag).project(ticadim)
            else:
                data = TICA(data, ticalag).project(ticadim)

        if not clusters:
            clusters = self._numClusters(data.numFrames)

        if data2combine:
            try:
                print("Adding extra dimension")
                data2combine_copy = data2combine.copy()
                data2combine_copy.dropTraj(keepsims=data.simlist)
                data.combine(data2combine_copy)
            except Exception as e:
                print("Could not combined data", str(e))

        data.cluster(MiniBatchKMeans(clusters), mergesmall=5)
        model = Model(data)
        model.markovModel(modellag, macronum, units=modelunits)
        model.save(model_file)

    # Compute the data of associated metrics that are not cached yet.
    for name, met in self.associated_metrics.items():
        if name not in self.associated_data:
            print(f"Calcualtion associted data - {name.upper()}")
            self.associated_data[name] = self._precalculateData(
                met, self.input_folder, fstep=fstep, skip=skip)

    # Keep, for each associated metric, only the simulations in the model.
    for name, assoc in self.associated_data.items():
        tmp_data = assoc.copy()
        tmp_data.dropTraj(keepsims=model.data.simlist)
        self.tmp_associated_data[name] = tmp_data

    return model