def handle_model(self):
    """Make sure ``self.model`` ends up being an ``htmd.model.Model``.

    Three situations are handled, in this order:

    * ``self.model`` is falsy: the parameters are written with
      ``self.write_parameters()`` and a new model is created by calling
      ``analyze_folder`` with the attributes configured on this object.
    * ``self.model`` is a string: it is used as the path of a saved model
      and loaded. If loading fails a message is printed and the method
      returns early, leaving ``self.model`` as the original string.
    * ``self.model`` is a ``Model``: ``self.mol`` is set to a ``Molecule``
      built from the molfile of the first simulation in the model data.
    """
    from htmd.model import Model
    from htmd.molecule.molecule import Molecule

    if not self.model:
        from IDP_htmd.IDP_analysis import analyze_folder
        print("Creating new analysis")
        self.write_parameters()
        self.model = analyze_folder(
            self.input_folder, self.out_folder, self.skip, self.metrics,
            self.cluster, self.tica, self.ticadim, self.ticalag,
            self.modellag, self.modelunits, self.macronum, self.bulk_split,
            self.fes, self.rg_analysis, self.save_model, self.data_fstep)

    if isinstance(self.model, str):
        try:
            print("Loading model")
            model = Model()
            model.load(self.model)
            self.model = model
        except Exception:
            # Only ordinary errors are treated as "could not load"; a bare
            # except would also swallow KeyboardInterrupt / SystemExit.
            print("Could not load the model")
            return

    if isinstance(self.model, Model):
        print("Model loaded")
        self.mol = Molecule(self.model.data.simlist[0].molfile)
def _algorithm(self):
    """Build a Markov model from the retrieved data and write new inputs.

    The simulations are listed (and optionally filtered), projected with
    ``self.projection``, optionally reduced with TICA, clustered and turned
    into a Markov model stored in ``self._model``. Frames selected by
    ``self._getSpawnFrames`` are then written as inputs.
    """
    logger.info('Postprocessing new data')
    data_dirs = glob(path.join(self.datapath, '*', ''))
    structures = glob(path.join(self.inputpath, '*', 'structure.pdb'))
    input_dirs = glob(path.join(self.inputpath, '*', ''))
    trajs = simlist(data_dirs, structures, input_dirs)
    if self.filter:
        trajs = simfilter(trajs, self.filteredpath, filtersel=self.filtersel)

    metric = Metric(trajs, skip=self.skip)
    metric.set(self.projection)

    # if self.contactsym is not None:
    #     contactSymmetry(data, self.contactsym)

    if self.ticadim > 0:
        # TICA is fed the already projected data: giving it the Metric
        # object directly was reported to be too slow.
        projected = metric.project()
        tica_lag = int(max(2, np.ceil(self.ticalag)))
        reduced = TICA(projected, tica_lag).project(self.ticadim)
    else:
        reduced = metric.project()

    # Preferably this should happen before any projection, since corrupted
    # simulations can affect TICA.
    reduced.dropTraj()

    n_clusters = self._numClusters(reduced.numFrames)
    reduced.cluster(self.clustmethod(n_clusters=n_clusters))

    self._model = Model(reduced)
    self._model.markovModel(self.lag, self._numMacrostates(reduced))
    if self.save:
        self._model.save('adapt_model_e' + str(self._getEpoch()) + '.dat')

    spawn_frames = self._getSpawnFrames(self._model, reduced)
    self._writeInputs(reduced.rel2sim(np.concatenate(spawn_frames)))
def _createMSM(self, data):
    """Cluster ``data``, build the Markov model and optionally save it.

    Parameters
    ----------
    data : object
        Projected data; it is clustered in place and must expose
        ``numFrames`` and ``cluster``.

    The resulting model is stored in ``self._model``. When ``self.save`` is
    true it is written to ``saveddata/e<epoch>_adapt_model.dat``.
    """
    n_clusters = self._numClusters(data.numFrames)
    data.cluster(self.clustmethod(n_clusters=n_clusters))

    self._model = Model(data)
    self._model.markovModel(self.lag, self._numMacrostates(data))

    if self.save:
        # exist_ok avoids the check-then-create race of
        # `if not path.exists(...): makedirs(...)`.
        makedirs('saveddata', exist_ok=True)
        self._model.save(
            path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch())))
def _algorithm(self):
    """Run one adaptive step: project, cluster, build the MSM, respawn.

    Simulations are listed and filtered, projected with a distance metric
    (between two selections when ``metricsel2`` is set, otherwise a
    self-distance), optionally reduced with TICA, clustered and used to
    build a Markov model. States are then sampled according to
    ``self._criteria`` / ``self._spawn`` and the chosen frames are written
    as new inputs.
    """
    logger.info('Postprocessing new data')
    data_dirs = glob(path.join(self.datapath, '*', ''))
    structures = glob(path.join(self.inputpath, '*', 'structure.pdb'))
    input_dirs = glob(path.join(self.inputpath, '*', ''))
    all_sims = simlist(data_dirs, structures, input_dirs)
    filtered_sims = simfilter(all_sims, self.filteredpath, filtersel=self.filtersel)

    if getattr(self, 'metricsel2', None) is None:
        projection = MetricSelfDistance(self.metricsel1, metric=self.metrictype)
    else:
        projection = MetricDistance(self.metricsel1, self.metricsel2, metric=self.metrictype)

    metric = Metric(filtered_sims, skip=self.skip)
    metric.projection(projection)
    projected = metric.project()

    # if self.contactsym is not None:
    #     contactSymmetry(data, self.contactsym)

    projected.dropTraj()
    if self.ticadim > 0:
        tica_lag = int(max(2, np.ceil(20/self.skip)))
        reduced = TICA(projected, tica_lag).project(self.ticadim)
    else:
        reduced = projected

    # Heuristic number of clusters, capped at a third of the frames.
    n_clusters = int(max(np.round(0.6 * np.log10(reduced.numFrames/1000)*1000+50), 100))
    if n_clusters > reduced.numFrames / 3:
        n_clusters = int(reduced.numFrames / 3)

    reduced.cluster(self.clustmethod(n_clusters=n_clusters), mergesmall=5)
    # With fewer than 10 clusters left, recluster without merging small
    # clusters and sample frames with replacement.
    replacement = reduced.K < 10
    if replacement:
        reduced.cluster(self.clustmethod(n_clusters=n_clusters))

    msm = Model(reduced)
    n_macro = self.macronum
    if reduced.K < n_macro:
        n_macro = np.ceil(reduced.K / 2)
        logger.warning('Using less macrostates than requested due to lack of microstates. macronum = ' + str(n_macro))

    from pyemma.msm import timescales_msm
    its = timescales_msm(reduced.St.tolist(), lags=self.lag, nits=n_macro)
    timescales = its.get_timescales()
    # At least 2 macrostates, and never more than requested.
    n_macro = min(self.macronum, max(np.sum(timescales > self.lag), 2))

    msm.markovModel(self.lag, n_macro)
    p_i = self._criteria(msm, self.method)
    (spawncounts, prob) = self._spawn(p_i, self.nmax-self.running)
    logger.debug('spawncounts {}'.format(spawncounts))
    chosen = np.where(spawncounts > 0)[0]
    _, spawn_frames = msm.sampleStates(chosen, spawncounts[chosen], statetype='micro', replacement=replacement)
    logger.debug('relFrames {}'.format(spawn_frames))

    self._writeInputs(reduced.rel2sim(np.concatenate(spawn_frames)))
def _createMSM(self, data):
    """Cluster ``data``, build the Markov model and optionally save it.

    Clustering is retried for as long as it raises ``IndexError``. The
    model is stored in ``self._model`` and, when ``self.save`` is true,
    written to ``saveddata/e<epoch>_adapt_model.dat``.
    """
    from htmd.model import Model

    while True:
        try:
            n_clusters = self._numClusters(data.numFrames)
            data.cluster(self.clustmethod(n_clusters=n_clusters))
        except IndexError:
            # Try again until the clustering goes through.
            continue
        break

    msm = Model(data)
    self._model = msm
    msm.markovModel(self.lag, self._numMacrostates(data))

    if not self.save:
        return
    makedirs('saveddata', exist_ok=True)
    target = path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch()))
    self._model.save(target)
def scan_clusters(model, nclusters, out_dir):
    """Plot implied timescales for several cluster counts.

    Used to assess how the number of clusters affects the timescales of a
    model. The data of ``model`` is reclustered in place for every value.

    Parameters
    ----------
    model : htmd.model.Model
        Model whose data is reclustered and analysed
    nclusters : int[]
        Cluster counts to be tested
    out_dir : str
        Directory where the generated plots are saved
    """
    from sklearn.cluster import MiniBatchKMeans

    for n_clu in nclusters:
        clusterer = MiniBatchKMeans(n_clusters=n_clu)
        model.data.cluster(clusterer, mergesmall=5)
        trial = Model(model.data)
        trial.plotTimescales(plot=False, save=f"{out_dir}/1_its-{n_clu}_clu")
def bootstrap(model, rounds, fraction=0.8, clusters=500):
    """Yield one model per bootstrap round.

    Parameters
    ----------
    model : htmd.model.Model
        Model whose data is bootstrapped
    rounds : int
        Number of bootstrap rounds to run
    fraction : float, optional
        Value passed to ``model.data.bootstrap``
    clusters : int, optional
        Number of clusters requested from MiniBatchKMeans in every round

    Yields
    ------
    htmd.model.Model
        Model built from the clustered bootstrap data of the round
    """
    from htmd.model import Model
    from sklearn.cluster import MiniBatchKMeans

    for round_idx in range(rounds):
        sample = model.data.bootstrap(fraction)
        print(f"Starting a new round of bootstrap - {round_idx}")
        clusterer = MiniBatchKMeans(n_clusters=clusters)
        sample.cluster(clusterer, mergesmall=5)
        yield Model(sample)
def _algorithm(self):
    """Build the Markov model and respawn using a combined reward.

    The reward of every state is the feature-scaled directed component
    plus ``self.ucscale`` times the feature-scaled undirected component
    (the negated counts). Frames chosen by ``self._getSpawnFrames`` are
    written as new inputs.
    """
    logger.info('Postprocessing new data')
    data_dirs = glob(path.join(self.datapath, '*', ''))
    structures = glob(path.join(self.inputpath, '*', 'structure.pdb'))
    input_dirs = glob(path.join(self.inputpath, '*', ''))
    sims = simlist(data_dirs, structures, input_dirs)
    if self.filter:
        sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

    metric = Metric(sims, skip=self.skip)
    metric.set(self.projection)

    # if self.contactsym is not None:
    #     contactSymmetry(data, self.contactsym)

    if self.ticadim > 0:
        # TICA is fed the already projected data: giving it the Metric
        # object directly was reported to be too slow.
        projected = metric.project()
        tica_lag = int(max(2, np.ceil(self.ticalag)))
        reduced = TICA(projected, tica_lag).project(self.ticadim)
    else:
        reduced = metric.project()

    # Preferably this should happen before any projection, since corrupted
    # simulations can affect TICA.
    reduced.dropTraj()
    n_clusters = self._numClusters(reduced.numFrames)
    reduced.cluster(self.clustmethod(n_clusters=n_clusters))

    model = Model(reduced)
    self._model = model
    self._model.markovModel(self.lag, self._numMacrostates(reduced))
    if self.save:
        self._model.save('adapt_model_e' + str(self._getEpoch()) + '.dat')

    def per_state(values):
        # Re-index per-cluster values to the configured state type.
        if self.statetype == 'micro':
            values = values[model.cluster_ofmicro]
        if self.statetype == 'macro':
            values = macroAccumulate(model, values[model.cluster_ofmicro])
        return values

    # Undirected component: lower counts should score higher, hence the sign.
    undirected = per_state(-model.data.N)

    # Directed component
    directed = per_state(
        self._calculateDirectedComponent(sims, model.data.St, model.data.N))

    undirected = self._featScale(undirected)
    directed = self._featScale(directed)

    reward = directed + self.ucscale * undirected

    spawn_frames = self._getSpawnFrames(reward, self._model, reduced)
    self._writeInputs(reduced.rel2sim(np.concatenate(spawn_frames)))
def viewModel(model_name):
    """Load a saved model and visualize its states.

    If reading ``model.macronum`` fails, a Markov model is first built
    with a lag of 20, 5 macrostates and ``units="ns"``.

    Parameters
    ----------
    model_name : str
        File passed to ``Model(file=...)``
    """
    model = Model(file=model_name)
    try:
        model.macronum
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        model.markovModel(20, 5, units="ns")
    model.viewStates(alignsel="noh and resname MOL",
                     protein=True,
                     ligand="protein and backbone")
class AdaptiveMD(AdaptiveBase):
    """ Adaptive class which uses a Markov state model for respawning

    AdaptiveMD uses Markov state models to choose respawning poses for the next epochs. In more detail, it projects all
    currently retrieved simulations according to the specified projection, clusters those and then builds a Markov model using
    the discretized trajectories. From the Markov model it then chooses conformations from the various states based on
    the chosen criteria which will be used for starting new simulations.

    Parameters
    ----------
    app : :class:`SimQueue <htmd.queues.simqueue.SimQueue>` object, default=None
        A SimQueue class object used to retrieve and submit simulations
    project : str, default='adaptive'
        The name of the project
    nmin : int, default=1
        Minimum number of running simulations
    nmax : int, default=1
        Maximum number of running simulations
    nepochs : int, default=1000
        Stop adaptive once we have reached this number of epochs
    nframes : int, default=0
        Stop adaptive once we have simulated this number of aggregate simulation frames.
    inputpath : str, default='input'
        The directory used to store input folders
    generatorspath : str, default='generators'
        The directory containing the generators
    dryrun : boolean, default=False
        A dry run means that the adaptive will retrieve and generate a new epoch but not submit the simulations
    updateperiod : float, default=0
        When set to a value other than 0, the adaptive will run synchronously every `updateperiod` seconds
    coorname : str, default='input.coor'
        Name of the file containing the starting coordinates for the new simulations
    lock : bool, default=False
        Lock the folder while adaptive is ongoing
    datapath : str, default='data'
        The directory in which the completed simulations are stored
    filter : bool, default=True
        Enable or disable filtering of trajectories.
    filtersel : str, default='not water'
        Atom selection string for filtering.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    filteredpath : str, default='filtered'
        The directory in which the filtered simulations will be stored
    projection : :class:`Projection <moleculekit.projections.projection.Projection>` object, default=None
        A Projection class object or a list of objects which will be used to project the simulation data before
        constructing a Markov model
    truncation : str, default=None
        Method for truncating the prob distribution (None, 'cumsum', 'statecut')
    statetype : ('micro', 'cluster', 'macro'), str, default='micro'
        What states (cluster, micro, macro) to use for calculations.
    macronum : int, default=8
        The number of macrostates to produce
    skip : int, default=1
        Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame
    lag : int, default=1
        The lagtime used to create the Markov model
    clustmethod : :class:`ClusterMixin <sklearn.base.ClusterMixin>` class, default=<class 'htmd.clustering.kcenters.KCenter'>
        Clustering algorithm used to cluster the contacts or distances
    method : str, default='1/Mc'
        Criteria used for choosing from which state to respawn from
    ticalag : int, default=20
        Lagtime to use for TICA in frames. When using `skip` remember to change this accordingly.
    ticadim : int, default=3
        Number of TICA dimensions to use. When set to 0 it disables TICA
    contactsym : str, default=None
        Contact symmetry
    save : bool, default=False
        Save the model generated

    Example
    -------
    >>> adapt = AdaptiveMD()
    >>> adapt.nmin = 2
    >>> adapt.nmax = 3
    >>> adapt.nepochs = 2
    >>> adapt.ticadim = 3
    >>> adapt.projection = [MetricDistance('name CA', 'name N'), MetricDihedral()]
    >>> adapt.generatorspath = htmd.home()+'/data/dhfr'
    >>> adapt.app = AcemdLocal()
    >>> adapt.run()
    """

    def __init__(self):
        from sklearn.base import ClusterMixin
        from htmd.clustering.kcenters import KCenter
        from moleculekit.projections.projection import Projection
        super().__init__()
        self._arg(
            'datapath', 'str',
            'The directory in which the completed simulations are stored',
            'data', val.String())
        self._arg('filter', 'bool',
                  'Enable or disable filtering of trajectories.', True,
                  val.Boolean())
        self._arg('filtersel', 'str', 'Filtering atom selection', 'not water',
                  val.String())
        self._arg(
            'filteredpath', 'str',
            'The directory in which the filtered simulations will be stored',
            'filtered', val.String())
        self._arg(
            'projection',
            ':class:`Projection <moleculekit.projections.projection.Projection>` object',
            'A Projection class object or a list of objects which will be used to project the simulation '
            'data before constructing a Markov model',
            None,
            val.Object(Projection),
            nargs='+')
        self._arg(
            'truncation', 'str',
            'Method for truncating the prob distribution (None, \'cumsum\', \'statecut\'',
            None, val.String())
        self._arg(
            'statetype',
            'str',
            'What states (cluster, micro, macro) to use for calculations.',
            'micro',
            val.String(),
            valid_values=('micro', 'cluster', 'macro'))
        self._arg('macronum', 'int', 'The number of macrostates to produce', 8,
                  val.Number(int, 'POS'))
        self._arg(
            'skip', 'int',
            'Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame',
            1, val.Number(int, 'POS'))
        self._arg('lag', 'int', 'The lagtime used to create the Markov model',
                  1, val.Number(int, 'POS'))
        self._arg(
            'clustmethod',
            ':class:`ClusterMixin <sklearn.base.ClusterMixin>` class',
            'Clustering algorithm used to cluster the contacts or distances',
            KCenter, val.Class(ClusterMixin))
        self._arg(
            'method', 'str',
            'Criteria used for choosing from which state to respawn from',
            '1/Mc', val.String())
        self._arg(
            'ticalag', 'int',
            'Lagtime to use for TICA in frames. When using `skip` remember to change this accordinly.',
            20, val.Number(int, '0POS'))
        self._arg(
            'ticadim', 'int',
            'Number of TICA dimensions to use. When set to 0 it disables TICA',
            3, val.Number(int, '0POS'))
        self._arg('contactsym', 'str', 'Contact symmetry', None, val.String())
        self._arg('save', 'bool', 'Save the model generated', False,
                  val.Boolean())

    def _algorithm(self):
        """ Run one adaptive step; returns False once the frame limit is reached, True otherwise """
        data = self._getData(self._getSimlist())
        if not self._checkNFrames(data):
            return False
        self._createMSM(data)

        N = self.nmax - self._running
        reward = self._criteria(self._model, self.method)
        reward = self._truncate(reward, N)
        relFrames, _, _ = self._getSpawnFrames(reward, self._model,
                                               self._model.data, N)
        self._writeInputs(
            self._model.data.rel2sim(np.concatenate(relFrames)))
        return True

    def _checkNFrames(self, data):
        """ Returns False when `nframes` is set (non-zero) and `data` already holds at least that many frames """
        if self.nframes != 0 and data.numFrames >= self.nframes:
            logger.info('Reached maximum number of frames. Stopping adaptive.')
            return False
        return True

    def _getSimlist(self):
        """ Lists the simulations found under `datapath` / `inputpath`, filtering them when `filter` is set """
        logger.info('Postprocessing new data')
        sims = simlist(glob(path.join(self.datapath, '*', '')),
                       glob(path.join(self.inputpath, '*', '')),
                       glob(path.join(self.inputpath, '*', '')))
        if self.filter:
            sims = simfilter(sims,
                             self.filteredpath,
                             filtersel=self.filtersel)
        return sims

    def _getData(self, sims):
        """ Projects `sims` with `projection` and, when `ticadim` > 0, reduces the result with TICA """
        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        if self.ticadim > 0:
            # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
            data = metr.project()
            data.dropTraj()  # Drop before TICA to avoid broken trajectories
            # Clamp the lag so that 1 < ticalag <= (shortest trajectory length / 2)
            ticalag = int(
                np.ceil(
                    max(2, min(np.min(data.trajLengths) / 2, self.ticalag))))
            tica = TICA(data, ticalag)
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()
        # NOTE(review): placed after the if/else when un-flattening the source; confirm it is not part of the
        # else branch only. Preferably we should do this before any projections. Corrupted sims can affect TICA
        datadr.dropTraj()
        return datadr

    def _createMSM(self, data):
        """ Clusters `data`, builds the Markov model in `self._model` and saves it when `save` is set """
        data.cluster(
            self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
        self._model = Model(data)
        self._model.markovModel(self.lag, self._numMacrostates(data))
        if self.save:
            # exist_ok avoids the check-then-create race of testing path.exists first
            makedirs('saveddata', exist_ok=True)
            self._model.save(
                path.join('saveddata',
                          'e{}_adapt_model.dat'.format(self._getEpoch())))

    def _getSpawnFrames(self, reward, model, data, N):
        """ Draws `N` spawns from the normalized `reward` and samples frames (with replacement) from those states

        Returns the sampled relative frames, the spawn counts per state and the sampling probabilities.
        """
        prob = reward / np.sum(reward)
        logger.debug('Sampling probabilities {}'.format(prob))
        spawncounts = np.random.multinomial(N, prob)
        logger.debug('spawncounts {}'.format(spawncounts))

        stateIdx = np.where(spawncounts > 0)[0]
        _, relFrames = model.sampleStates(stateIdx,
                                          spawncounts[stateIdx],
                                          statetype='micro',
                                          replacement=True)
        logger.debug('relFrames {}'.format(relFrames))
        return relFrames, spawncounts, prob

    def _criteria(self, model, criteria):
        """ Calculates the per-microstate reward for the '1/Mc' or 'pi/Mc' criteria

        Raises
        ------
        ValueError
            If `criteria` is not one of the supported values. Previously this surfaced as an UnboundLocalError.
        """
        if criteria not in ('1/Mc', 'pi/Mc'):
            raise ValueError(
                'Unknown respawning criteria: {!r}'.format(criteria))

        # Both criteria share the inverse-count term, divided by the number of microstates per macrostate
        nMicroPerMacro = macroAccumulate(model, np.ones(model.micronum))
        P_I = 1 / macroAccumulate(model, model.data.N[model.cluster_ofmicro])
        P_I = P_I / nMicroPerMacro
        ret = P_I[model.macro_ofmicro]
        if criteria == 'pi/Mc':
            ret = ret * model.msm.stationary_distribution
        return ret

    def _truncate(self, ranking, N):
        """ Zeroes entries of `ranking` according to `truncation`; `ranking` is modified in place and returned """
        if self.truncation is not None and self.truncation.lower() != 'none':
            if self.truncation == 'cumsum':
                idx = np.argsort(ranking)
                idx = idx[::-1]  # decreasing sort
                errs = ranking[idx]
                H = (N * errs / np.cumsum(errs)) < 1
                ranking[idx[H]] = 0
            if self.truncation == 'statecut':
                idx = np.argsort(ranking)
                idx = idx[::-1]  # decreasing sort
                ranking[idx[N:]] = 0  # Set all states ranked > N to zero.
        return ranking

    def _numClusters(self, numFrames):
        """ Heuristic that calculates number of clusters from number of frames """
        K = int(
            max(np.round(0.6 * np.log10(numFrames / 1000) * 1000 + 50),
                100))  # heuristic
        if K > numFrames / 3:  # Ugly patch for low-data regimes ...
            K = int(numFrames / 3)
        return K

    def _numMacrostates(self, data):
        """ Heuristic for calculating the number of macrostates for the Markov model """
        macronum = self.macronum
        if data.K < macronum:
            # NOTE(review): np.ceil returns a float which is then passed as `nits` below; confirm this is accepted
            macronum = np.ceil(data.K / 2)
            logger.warning(
                'Using less macrostates than requested due to lack of microstates. macronum = '
                + str(macronum))

        # Calculating how many timescales are above the lag time to limit number of macrostates
        from pyemma.msm import timescales_msm
        timesc = timescales_msm(data.St.tolist(),
                                lags=self.lag,
                                nits=macronum).get_timescales()
        macronum = min(self.macronum, max(np.sum(timesc > self.lag), 2))
        return macronum
def _algorithm(self):
    """Run one adaptive step: project, cluster, build the MSM, respawn.

    Simulations are listed and filtered, projected with a distance metric
    (between two selections when ``metricsel2`` is set, otherwise a
    self-distance), optionally reduced with TICA, clustered and used to
    build a Markov model. States are then sampled according to
    ``self._criteria`` / ``self._spawn`` and the chosen frames are written
    as new inputs.
    """
    logger.info('Postprocessing new data')
    completed = glob(path.join(self.datapath, '*', ''))
    pdb_files = glob(path.join(self.inputpath, '*', 'structure.pdb'))
    inputs = glob(path.join(self.inputpath, '*', ''))
    filtlist = simfilter(simlist(completed, pdb_files, inputs),
                         self.filteredpath,
                         filtersel=self.filtersel)

    two_selections = hasattr(self, 'metricsel2') and self.metricsel2 is not None
    if two_selections:
        proj = MetricDistance(self.metricsel1,
                              self.metricsel2,
                              metric=self.metrictype)
    else:
        proj = MetricSelfDistance(self.metricsel1, metric=self.metrictype)

    metr = Metric(filtlist, skip=self.skip)
    metr.projection(proj)
    raw = metr.project()

    # if self.contactsym is not None:
    #     contactSymmetry(data, self.contactsym)

    raw.dropTraj()
    datadr = raw
    if self.ticadim > 0:
        lag_frames = int(max(2, np.ceil(20 / self.skip)))
        datadr = TICA(raw, lag_frames).project(self.ticadim)

    # Heuristic number of clusters, capped at a third of the frames.
    heuristic = np.round(0.6 * np.log10(datadr.numFrames / 1000) * 1000 + 50)
    K = int(max(heuristic, 100))
    if K > datadr.numFrames / 3:
        K = int(datadr.numFrames / 3)

    datadr.cluster(self.clustmethod(n_clusters=K), mergesmall=5)
    # With fewer than 10 clusters left, recluster without merging small
    # clusters and sample frames with replacement.
    replacement = datadr.K < 10
    if replacement:
        datadr.cluster(self.clustmethod(n_clusters=K))

    model = Model(datadr)
    macronum = self.macronum
    if datadr.K < macronum:
        macronum = np.ceil(datadr.K / 2)
        logger.warning(
            'Using less macrostates than requested due to lack of microstates. macronum = '
            + str(macronum))

    from pyemma.msm import timescales_msm
    implied = timescales_msm(datadr.St.tolist(), lags=self.lag, nits=macronum)
    timesc = implied.get_timescales()
    # At least 2 macrostates, and never more than requested.
    slow_processes = max(np.sum(timesc > self.lag), 2)
    macronum = min(self.macronum, slow_processes)

    model.markovModel(self.lag, macronum)
    p_i = self._criteria(model, self.method)
    (spawncounts, prob) = self._spawn(p_i, self.nmax - self.running)
    logger.debug('spawncounts {}'.format(spawncounts))
    stateIdx = np.where(spawncounts > 0)[0]
    _, relFrames = model.sampleStates(stateIdx,
                                      spawncounts[stateIdx],
                                      statetype='micro',
                                      replacement=replacement)
    logger.debug('relFrames {}'.format(relFrames))

    self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))
def analyze_folder(folder=None, out_folder="/tmp", skip=1, metrics=None, clu=500, tica=True, ticadim=5, tica_lag=20, model_lag=10, model_units='ns', macro_N=10, bulk_split=False, fes=True, rg_analysis=True, save=True, data_fstep=None):
    """Analysis script to create a Markov State Model

    Creates and returns a Markov State Model given a data folder.
    Intended to follow up the evolution of an adaptive sampling run.
    Allows saving the model and several informative plots.

    NOTE(review): this function was recovered from a source where the
    original indentation was lost; the nesting below (in particular what
    belongs to the model-loading ``except`` and to ``if macro_N``) is a
    reconstruction and should be confirmed against the original file.

    Parameters
    ----------
    folder : str
        Data folder where adaptive is running. It is concatenated directly
        with 'filtered/...', so it is expected to end with a path separator.
    out_folder : str
        Output folder to store derived data
    skip : int
        Number of frames to skip while projecting the MD data
    metrics : [:class: `Metric` object]
        Metric array used to project the data
    clu : int
        Number of clusters to create using the MiniBatchKMeans method.
    tica : bool
        Whether to use TICA (True) or GWPCA (False) for dimensionality reduction
    ticadim : int
        Number of dimensions to project the data on.
        If falsy, the model will be created using the raw projected data
    tica_lag : int, optional
        Lag passed to TICA (or GWPCA)
    model_lag : int
        Lag passed to ``Model.markovModel``, expressed in `model_units`
    model_units : str, optional
        Units passed to ``Model.markovModel``
    macro_N : int
        Number of macrostates to split the final Markov State Model.
        If falsy, no Markov model is built.
    bulk_split : optional
        If truthy, it is passed to ``create_bulk(model, bulk_split)``
    fes : bool, optional
        Currently unused: the FES plotting code is commented out.
    rg_analysis : bool, optional
        If true, a plot with information relative to the radius of gyration of the molecule
        will be created.
    save : bool, optional
        If true, the model will be saved in the outputs folder
    data_fstep : optional
        If truthy, it is assigned to the ``fstep`` attribute of the projected data before clustering

    Returns
    -------
    :class:`Model`
        Final model
    """
    from htmd.model import Model
    from htmd.molecule.molecule import Molecule
    from htmd.simlist import simlist
    from htmd.projections.metric import Metric
    from sklearn.cluster import MiniBatchKMeans
    from IDP_htmd.IDP_model import plot_RG
    from IDP_htmd.model_utils import create_bulk
    from glob import glob
    import os

    # NOTE(review): the bare except reports "already exists" for any mkdir failure
    try:
        os.mkdir(out_folder)
    except:
        print("Folder already exists")

    # Reuse a cached simlist when one can be loaded, otherwise build it from the filtered folder.
    # `np` is not imported in this function; presumably it is a module-level import - confirm.
    try:
        fsims = np.load(f"{folder}/simlist.npy", allow_pickle=True)
        print(f"Loaded {folder}/simlist.npy")
    except:
        print("Creating simlist")
        sims = glob(folder + 'filtered/*/')
        fsims = simlist(sims, folder + 'filtered/filtered.pdb')

    metr = Metric(fsims, skip=skip)
    metr.set(metrics)

    # Check if this gives problems to ITS
    # A previously saved model is reused when it can be loaded; otherwise the data is projected and clustered
    try:
        model = Model(file=f"{out_folder}/model.dat")
        out_data = model.data
        print(f"Loading model: {out_folder}/model.dat")
    except:
        if tica and ticadim:
            from htmd.projections.tica import TICA
            print("Projecting TICA")
            # The `tica` flag is rebound to the TICA object from here on
            tica = TICA(metr, tica_lag)
            out_data = tica.project(ticadim)
        elif not tica and ticadim:
            from htmd.projections.gwpca import GWPCA
            data = metr.project()
            data.dropTraj()
            print("using GWPCA")
            gwpca = GWPCA(data, tica_lag)
            out_data = gwpca.project(ticadim)
        else:
            print("Not using TICA")
            data = metr.project()
            data.dropTraj()
            out_data = data

        # Avoid some possible errors while clustering
        if data_fstep:
            out_data.fstep = data_fstep
        # The loop body runs once: it either succeeds (x = False) or re-raises the clustering error
        x = True
        while x:
            try:
                out_data.cluster(MiniBatchKMeans(n_clusters=clu), mergesmall=5)
                x = False
            except Exception as e:
                raise Exception("Error " + str(e))

        model = Model(out_data)

    model.plotTimescales(plot=False, save=f"{out_folder}/1_its.png")

    if macro_N:
        model.markovModel(model_lag, macro_N, units=model_units)
        if bulk_split:
            # Best effort: a failure in the bulk splitting is reported but does not stop the analysis
            try:
                print("Starting bulk splitting")
                create_bulk(model, bulk_split)
            except Exception as e:
                print("Could not perform the bulk splitting")
                print(e)

        model.eqDistribution(plot=False, save=f"{out_folder}/1.2_eqDistribution.png")

    if rg_analysis:
        # This import rebinds the `rg_analysis` parameter to the imported function
        from IDP_htmd.IDP_analysis import rg_analysis
        mol = Molecule(model.data.simlist[0].molfile)
        rg_data = rg_analysis(model, skip=skip)
        plot_RG(rg_data, mol, save=f"{out_folder}/1.4_rg.png")

    # if fes and ticadim:
    #     model.plotFES(0, 1, temperature=310, states=True,
    #                   plot=False, save=f"{out_folder}/1.3_fes.png")

    if save:
        model.save(f"{out_folder}/model.dat")

    return model
from htmd.model import Model

# Manual test script: runs a ModelAnalysis using an already saved model.
# The paths below are hard-coded to a specific machine.
mt = ModelAnalysis("/workspace8/excitome/adaptiveRun/O75376_MOR_58/", "/home/pablo/testModel/")

# Contacts metric between heavy protein atoms, grouped by residue, with a threshold of 5
mt.metrics = [
    MetricDistance(
        sel1="noh and protein",
        sel2="noh and protein",
        metric="contacts",
        threshold=5,
        groupsel1="residue",
        groupsel2="residue")
]

# Load the saved model explicitly and hand the Model object to the analysis
model = Model()
model.load("/home/pablo/testModel/model.dat")
mt.model = model
mt.handle_model()
mt.sasa_variation()

# Alternative configuration, kept for reference:
# mt.model = "/home/pablo/testModel/model.dat"
# mt.plot_dihedral = "2_dihedral"
# mt.macronum = 4
# mt.plot_contacts = [
#     ('all_contacts', 'noh and protein', 5),
#     ('backbone', 'noh and backbone', 5),
#     ('sidechain', 'noh and sidechain', 4),
# ]
# mt.write_parameters()
# mt.generate_html_summary()
def _createMSM(self, epoch, output_folder, basedata=None, skip=1, clusters=0, ticadim=0, ticalag=20, macronum=2, modellag=5, modelunits="frames", fstep=None, data2combine=None):
    """Load or build the Markov model for the data available up to `epoch`.

    A model saved as ``<output_folder>/<first letter of analysis_type><epoch>_model.dat``
    is reused when it can be loaded (its Markov model is rebuilt if `macronum`
    or `modellag` differ). Otherwise the data is prepared, clustered with
    MiniBatchKMeans, turned into a model and saved under that name. In both
    cases the associated metrics are computed (if missing) and restricted to
    the simulations of the returned model.

    NOTE(review): this method was recovered from a source where the original
    indentation was lost; the nesting below is a reconstruction and should be
    confirmed against the original file.

    Parameters
    ----------
    epoch : int
        Epoch (``analysis_type == "epoch"``) or number of simulations used to select the trajectories
    output_folder : str
        Folder where the model is loaded from / saved to
    basedata : optional
        If truthy, the data is replaced by the result of ``self._fitBaseline(data, basedata)``
    skip : int, optional
        Frames to skip when projecting
    clusters : int, optional
        Number of clusters; if falsy, ``self._numClusters`` is used
    ticadim : int, optional
        Number of TICA dimensions; if falsy (and no `basedata`), no TICA is applied
    ticalag : int, optional
        Lag passed to TICA
    macronum : int, optional
        Number of macrostates of the Markov model
    modellag : int, optional
        Lag passed to ``Model.markovModel``
    modelunits : str, optional
        Units passed to ``Model.markovModel``
    fstep : optional
        Passed to ``self._precalculateData``
    data2combine : optional
        If truthy, a copy restricted to the simulations of the data is combined into it before clustering

    Returns
    -------
    Model
        The loaded or newly created model
    """
    from htmd.projections.tica import TICA
    from sklearn.cluster import MiniBatchKMeans
    from htmd.model import Model

    try:
        model = Model(
            file=f"{output_folder}/{self.analysis_type[0]}{epoch}_model.dat"
        )
        if (model.macronum != macronum or model.lag != modellag):
            model.markovModel(modellag, macronum, units=modelunits)
        print("Model loaded")
    except:
        # NOTE(review): any failure above (not only a missing file) triggers a full rebuild
        if not self.precalculated_data and not self.low_memory_usage:
            # Project all the data once and cache it on the instance
            print("Calculating PRECALC DATA")
            precalc_data = self._precalculateData(self.precalc_metric, self.input_folder, fstep=fstep, skip=skip)
            self.precalc_data = precalc_data
            self.precalculated_data = True

        # Work out which trajectories of the precalculated data have to be dropped
        if self.analysis_type == "epoch" and not self.low_memory_usage:
            # Keep the simulations of every epoch up to (and including) `epoch`
            epoch_sim = np.concatenate(
                np.array([
                    self.epoch_sim_indexes[i] for i in range(1, epoch + 1)
                    if i in list(self.epoch_sim_indexes.keys())
                ]))
            drop_traj_idx = np.ones(self.precalc_data.numTrajectories)
            drop_traj_idx[epoch_sim] = 0
            drop_idx = np.where(drop_traj_idx == 1)[0]
        elif self.analysis_type == "sims" and not self.low_memory_usage:
            drop_traj_idx = np.ones(self.precalc_data.numTrajectories)
            # NOTE(review): keeps indexes 1..epoch-1 here, while the low-memory branch below uses
            # range(0, epoch) - confirm which one is intended
            no_drop_idx = np.arange(1, epoch)
            drop_traj_idx[no_drop_idx] = 0
            drop_idx = np.where(drop_traj_idx == 1)[0]

        if not self.low_memory_usage:
            # Work on a copy so the cached precalculated data stays complete
            data = self.precalc_data.copy()
            data.dropTraj(idx=drop_idx)
            data.dropTraj()

        if basedata:
            from htmd.projections.metric import MetricData
            r_fit = self._fitBaseline(data, basedata)
            data = MetricData(dat=r_fit, simlist=data.simlist)
        elif ticadim and not self.low_memory_usage:
            tica = TICA(data, ticalag)
            data = tica.project(ticadim)
        elif ticadim and self.low_memory_usage:
            # Low memory mode: TICA is given the Metric of the selected simulations instead of cached data
            from htmd.projections.metric import Metric
            if self.analysis_type == "epoch":
                epoch_sim = np.concatenate(
                    np.array([
                        self.epoch_sim_indexes[i]
                        for i in range(1, epoch + 1)
                        if i in list(self.epoch_sim_indexes.keys())
                    ]))
            else:
                epoch_sim = range(0, epoch)
            metr = Metric(self._sims[epoch_sim], skip=skip)
            metr.set(self.precalc_metric)
            tica = TICA(metr, ticalag)
            data = tica.project(ticadim)

        if not clusters:
            clusters = self._numClusters(data.numFrames)

        if data2combine:
            # Best effort: a failure to combine is reported and the model is built without the extra data
            try:
                print("Adding extra dimension")
                data2combine_copy = data2combine.copy()
                data2combine_copy.dropTraj(keepsims=data.simlist)
                data.combine(data2combine_copy)
            except Exception as e:
                print("Could not combined data", str(e))

        data.cluster(MiniBatchKMeans(clusters), mergesmall=5)
        model = Model(data)
        model.markovModel(modellag, macronum, units=modelunits)
        model.save(
            f"{output_folder}/{self.analysis_type[0]}{epoch}_model.dat")

    # Compute the associated metrics that are not cached yet
    for name, met in self.associated_metrics.items():
        try:
            self.associated_data[name]
        except:
            print(f"Calcualtion associted data - {name.upper()}")
            assoc_data = self._precalculateData(met, self.input_folder, fstep=fstep, skip=skip)
            self.associated_data[name] = assoc_data

    # Restrict a copy of every associated dataset to the simulations used by the model
    for name, data in self.associated_data.items():
        tmp_data = data.copy()
        tmp_data.dropTraj(keepsims=model.data.simlist)
        self.tmp_associated_data[name] = tmp_data

    return model
class AdaptiveBandit(AdaptiveBase):
    """ Adaptive class which chooses respawning states by their UCB (or PUCB) score

    Parameters
    ----------
    app : :class:`SimQueue <jobqueues.simqueue.SimQueue>` object, default=None
        A SimQueue class object used to retrieve and submit simulations
    project : str, default='adaptive'
        The name of the project
    nmin : int, default=1
        Minimum number of running simulations
    nmax : int, default=1
        Maximum number of running simulations
    nepochs : int, default=1000
        Stop adaptive once we have reached this number of epochs
    nframes : int, default=0
        Stop adaptive once we have simulated this number of aggregate simulation frames.
    inputpath : str, default='input'
        The directory used to store input folders
    generatorspath : str, default='generators'
        The directory containing the generators
    dryrun : boolean, default=False
        A dry run means that the adaptive will retrieve and generate a new epoch but not submit the simulations
    updateperiod : float, default=0
        When set to a value other than 0, the adaptive will run synchronously every `updateperiod` seconds
    coorname : str, default='input.coor'
        Name of the file containing the starting coordinates for the new simulations
    lock : bool, default=False
        Lock the folder while adaptive is ongoing
    datapath : str, default='data'
        The directory in which the completed simulations are stored
    filter : bool, default=True
        Enable or disable filtering of trajectories.
    filtersel : str, default='not water'
        Filtering atom selection
    filteredpath : str, default='filtered'
        The directory in which the filtered simulations will be stored
    projection : :class:`Projection <moleculekit.projections.projection.Projection>` object, default=None
        A Projection class object or a list of objects which will be used to project the simulation data before
        constructing a Markov model
    goalfunction : function, default=None
        This function will be used to convert the goal-projected simulation data to a ranking which can be used for
        the directed component of FAST.
    reward_method : str, default='max'
        The reward method ('max' or 'mean')
    skip : int, default=1
        Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame
    lag : int, default=1
        The lagtime used to create the Markov model. Units are in frames.
    exploration : float, default=0.5
        Exploration is the coefficient used in UCB algorithm to weight the exploration value
    temperature : int, default=300
        Temperature used to compute the free energy
    ticalag : int, default=20
        Lagtime to use for TICA in frames. When using `skip` remember to change this accordingly.
    ticadim : int, default=3
        Number of TICA dimensions to use. When set to 0 it disables TICA
    clustmethod : :class:`ClusterMixin <sklearn.base.ClusterMixin>` class, default=<class 'sklearn.cluster.k_means_.MiniBatchKMeans'>
        Clustering algorithm used to cluster the contacts or distances
    macronum : int, default=8
        The number of macrostates to produce
    save : bool, default=False
        Save the model generated
    save_qval : bool, default=False
        Save the Q(a) and N values for every epoch
    actionspace : str, default='metric'
        The action space ('metric', 'goal', 'tica' or 'ticapcca')
    recluster : bool, default=False
        If to recluster the action space.
    reclusterMethod : , default=<class 'sklearn.cluster.k_means_.MiniBatchKMeans'>
        Clustering method for reclustering.
    random : bool, default=False
        Random decision mode for baseline.
    reward_mode : str, default='parent'
        'parent' or 'frames' ('frame' is accepted as an alias of 'frames')
    reward_window : int, default=None
        The reward window
    pucb : bool, default=False
        If True, it uses PUCB algorithm using the provided goal function as a prior
    goal_init : float, default=0.3
        The proportional ratio of goal initialization compared to max frames set by nframes
    goal_preprocess : function, default=None
        This function will be used to preprocess goal data after it has been computed for all frames.
    actionpool : int, default=0
        The number of top scoring actions used to randomly select respawning simulations
    """

    def __init__(self):
        from sklearn.base import ClusterMixin
        from moleculekit.projections.projection import Projection
        super().__init__()
        self._arg('datapath', 'str', 'The directory in which the completed simulations are stored', 'data', val.String())
        self._arg('filter', 'bool', 'Enable or disable filtering of trajectories.', True, val.Boolean())
        self._arg('filtersel', 'str', 'Filtering atom selection', 'not water', val.String())
        self._arg('filteredpath', 'str', 'The directory in which the filtered simulations will be stored', 'filtered', val.String())
        self._arg('projection', ':class:`Projection <moleculekit.projections.projection.Projection>` object',
                  'A Projection class object or a list of objects which will be used to project the simulation '
                  'data before constructing a Markov model', None, val.Object(Projection), nargs='+')
        self._arg('goalfunction', 'function',
                  'This function will be used to convert the goal-projected simulation data to a ranking which'
                  'can be used for the directed component of FAST.', None, val.Function(), nargs='any')
        self._arg('reward_method', 'str', 'The reward method', 'max', val.String())
        self._arg('skip', 'int', 'Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame', 1, val.Number(int, 'POS'))
        self._arg('lag', 'int', 'The lagtime used to create the Markov model. Units are in frames.', 1, val.Number(int, 'POS'))
        # NOTE(review): 'OPOS' (letter O) differs from the '0POS' (zero) used for ticalag/ticadim -- confirm
        # against the validator whether this range name is recognised.
        self._arg('exploration', 'float', 'Exploration is the coefficient used in UCB algorithm to weight the exploration value', 0.5, val.Number(float, 'OPOS'))
        self._arg('temperature', 'int', 'Temperature used to compute the free energy', 300, val.Number(int, 'POS'))
        self._arg('ticalag', 'int', 'Lagtime to use for TICA in frames. When using `skip` remember to change this accordinly.', 20, val.Number(int, '0POS'))
        self._arg('ticadim', 'int', 'Number of TICA dimensions to use. When set to 0 it disables TICA', 3, val.Number(int, '0POS'))
        self._arg('clustmethod', ':class:`ClusterMixin <sklearn.base.ClusterMixin>` class', 'Clustering algorithm used to cluster the contacts or distances', MiniBatchKMeans, val.Class(ClusterMixin))
        self._arg('macronum', 'int', 'The number of macrostates to produce', 8, val.Number(int, 'POS'))
        self._arg('save', 'bool', 'Save the model generated', False, val.Boolean())
        self._arg('save_qval', 'bool', 'Save the Q(a) and N values for every epoch', False, val.Boolean())
        self._arg('actionspace', 'str', 'The action space', 'metric', val.String())
        self._arg('recluster', 'bool', 'If to recluster the action space.', False, val.Boolean())
        self._arg('reclusterMethod', '', 'Clustering method for reclustering.', MiniBatchKMeans)
        self._arg('random', 'bool', 'Random decision mode for baseline.', False, val.Boolean())
        self._arg('reward_mode', 'str', '(parent, frame)', 'parent', val.String())
        self._arg('reward_window', 'int', 'The reward window', None, val.Number(int, 'POS'))
        self._arg('pucb', 'bool', 'If True, it uses PUCB algorithm using the provided goal function as a prior', False, val.Boolean())
        self._arg('goal_init', 'float', 'The proportional ratio of goal initialization compared to max frames set by nframes', 0.3, val.Number(float, 'POS'))
        self._arg('goal_preprocess', 'function', 'This function will be used to preprocess goal data after it has been computed for all frames.', None, val.Function(), nargs='any')
        # NOTE(review): same 'OPOS' spelling as for `exploration` above.
        self._arg('actionpool', 'int', 'The number of top scoring actions used to randomly select respawning simulations', 0, val.Number(int, 'OPOS'))

    def _algorithm(self):
        """ Runs one epoch: projects the data, builds the MSM, scores the action states and writes new inputs

        Returns
        -------
        bool
            False if the frame limit was reached (see `_checkNFrames`), True once the new inputs were written.

        Raises
        ------
        RuntimeError
            If the goal function projects a different number of trajectories than the MSM projection, or if the
            configuration is inconsistent (unknown `actionspace`, an action space or `pucb` which needs data that
            is not computed, goal initialization without a positive `nframes`).
        """
        from htmd.kinetics import Kinetics

        sims = self._getSimlist()
        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        data = metr.project()
        data.dropTraj()  # Drop before TICA to avoid broken trajectories

        goaldata = None
        goaldataconcat = None
        if self.goalfunction is not None:
            goaldata = self._getGoalData(data.simlist)
            if len(data.simlist) != len(goaldata.simlist):
                raise RuntimeError('The goal function was not able to project all trajectories that the MSM projection could. Check for possible errors in the goal function.')
            goaldataconcat = np.concatenate(goaldata.dat)
            if self.save:
                makedirs('saveddata', exist_ok=True)
                goaldata.save(path.join('saveddata', 'e{}_goaldata.dat'.format(self._getEpoch())))

        # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
        datatica = None
        if self.ticadim > 0:
            ticalag = int(np.ceil(max(2, min(np.min(data.trajLengths) / 2, self.ticalag))))  # 1 < ticalag < (trajLen / 2)
            tica = TICA(data, ticalag)
            datatica = tica.project(self.ticadim)
            if not self._checkNFrames(datatica):
                return False
            self._createMSM(datatica)
        else:
            if not self._checkNFrames(data):
                return False
            self._createMSM(data)

        confstatdist = self.conformationStationaryDistribution(self._model)

        # Choose the discretized data whose states are the actions of the bandit
        if self.actionspace == 'metric':
            if not data.K:
                data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
            data_q = data.copy()
        elif self.actionspace == 'goal':
            if goaldata is None:
                raise RuntimeError('actionspace "goal" requires a goalfunction')
            data_q = goaldata.copy()
        elif self.actionspace in ('tica', 'ticapcca'):
            if datatica is None:
                raise RuntimeError('actionspace "{}" requires ticadim > 0'.format(self.actionspace))
            data_q = datatica.copy()
            if self.actionspace == 'ticapcca':
                # Replace the cluster of every frame by the macrostate it belongs to
                for traj in data_q.trajectories:
                    traj.cluster = self._model.macro_ofcluster[traj.cluster]
                data_q.K = self._model.macronum
        else:
            raise RuntimeError('Invalid action space {}'.format(self.actionspace))

        if self.recluster:
            print('Reclustering with {}'.format(self.reclusterMethod))
            data_q.cluster(self.reclusterMethod)

        numstates = data_q.K
        print('Numstates: {}'.format(numstates))
        currepoch = self._getEpoch()
        q_values = np.zeros(numstates, dtype=np.float32)
        n_values = np.zeros(numstates, dtype=np.int32)

        if self.random:  # If random mode respawn from random action states
            action_sel = np.zeros(numstates, dtype=int)
            N = self.nmax - self._running
            randomactions = np.bincount(np.random.randint(numstates, size=N))
            action_sel[:len(randomactions)] = randomactions
            if self.save_qval:
                makedirs('saveddata', exist_ok=True)
                np.save(path.join('saveddata', 'e{}_actions.npy'.format(currepoch)), action_sel)
            relFrames = self._getSpawnFrames_UCB(action_sel, data_q)
            self._writeInputs(data.rel2sim(np.concatenate(relFrames)))
            return True

        statemaxes = None
        if self.goalfunction is not None:
            # For every cluster in data_q, get the max score and initialize
            if self.goal_preprocess is not None:
                goaldataconcat = self.goal_preprocess(goaldataconcat)
            qstconcat = np.concatenate(data_q.St)
            statemaxes = np.zeros(numstates)
            np.maximum.at(statemaxes, qstconcat, np.squeeze(goaldataconcat))
            if not self.pucb:
                if self.nframes <= 0:
                    # The pseudo-count below divides by the number of clusters expected for `nframes` frames
                    raise RuntimeError('nframes needs to be set to a positive value to initialize the action '
                                       'values from the goal function')
                goalenergies = -Kinetics._kB * self.temperature * np.log(1-statemaxes)
                q_values = goalenergies
                n_values += int((self.nframes / self._numClusters(self.nframes)) * self.goal_init)

        rewardtraj = np.arange(data_q.numTrajectories)  # Recalculate reward for all states
        rewards = self.getRewards(rewardtraj, data_q, confstatdist, numstates, self.reward_method, self.reward_mode, self.reward_window)
        for i in range(numstates):
            if len(rewards[i]) == 0:
                continue
            q_values[i] = updatingMean(q_values[i], n_values[i], rewards[i])
        n_values += np.array([len(x) for x in rewards])

        if self.save_qval:
            makedirs('saveddata', exist_ok=True)
            np.save(path.join('saveddata', 'e{}_qval.npy'.format(currepoch)), q_values)
            np.save(path.join('saveddata', 'e{}_nval.npy'.format(currepoch)), n_values)

        if self.pucb:
            if statemaxes is None:
                raise RuntimeError('pucb requires a goalfunction which is used as the prior')
            ucb_values = np.array([self.count_pucb(q_values[clust], self.exploration, statemaxes[clust], currepoch + 1, n_values[clust]) for clust in range(numstates)])
        else:
            ucb_values = np.array([self.count_ucb(q_values[clust], self.exploration, currepoch + 1, n_values[clust]) for clust in range(numstates)])

        if self.save_qval:
            makedirs('saveddata', exist_ok=True)
            np.save(path.join('saveddata', 'e{}_ucbvals.npy'.format(currepoch)), ucb_values)

        N = self.nmax - self._running
        # Resolve the pool size locally: writing it back to self.actionpool would overwrite the user setting and
        # pin all following epochs to the N of this epoch.
        actionpool = self.actionpool if self.actionpool > 0 else N
        topactions = np.argsort(-ucb_values)[:actionpool]
        # Sampling without replacement can not return more actions than there are candidates
        action = np.random.choice(topactions, min(N, len(topactions)), replace=False)

        action_sel = np.zeros(numstates, dtype=int)
        action_sel[action] += 1
        while np.sum(action_sel) < N:  # When K is lower than N repeat some actions
            for a in action:
                action_sel[a] += 1
                if np.sum(action_sel) == N:
                    break

        if self.save_qval:
            np.save(path.join('saveddata', 'e{}_actions.npy'.format(currepoch)), action_sel)
        relFrames = self._getSpawnFrames_UCB(action_sel, data_q)
        self._writeInputs(data.rel2sim(np.concatenate(relFrames)))
        return True

    def _getSimlist(self):
        """ Creates the simulation list from `datapath` and `inputpath`, filtered if `filter` is set """
        from glob import glob
        from htmd.simlist import simlist, simfilter
        logger.info('Postprocessing new data')
        sims = simlist(glob(path.join(self.datapath, '*', '')), glob(path.join(self.inputpath, '*', '')),
                       glob(path.join(self.inputpath, '*', '')))
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)
        return sims

    def count_ucb(self, q_value, exploration, step, n_value):
        """ Returns q_value + exploration * sqrt(log(step) / (n_value + 1)) """
        return (q_value + (exploration * np.sqrt((np.log(step) / (n_value + 1)))))

    def count_pucb(self, q_value, exploration, predictor, step, n_value):
        """ Same as `count_ucb` with the exploration term additionally multiplied by `predictor` """
        return (q_value + (exploration * predictor * np.sqrt((np.log(step) / (n_value + 1)))))

    def getRewards(self, trajidx, data_q, confstatdist, numstates, rewardmethod, rewardmode, rewardwindow):
        """ Collects the rewards of the given trajectories for each action state

        Parameters
        ----------
        trajidx : iterable of int
            Indexes of the trajectories to evaluate
        data_q : object
            Discretized data; `data_q.St[i]` is read as the state of every frame of trajectory i
        confstatdist : list
            Per-trajectory stationary probability of every frame (0 for disconnected frames)
        numstates : int
            Number of action states
        rewardmethod : str
            'mean' or 'max': how the frame energies are aggregated over the rolling window
        rewardmode : str
            'parent' credits the reward at the first connected frame to the state the simulation was spawned
            from. 'frames' (alias 'frame') credits every frame's windowed reward to its own state.
        rewardwindow : int or None
            Size of the rolling window. None uses all the connected frames of the trajectory.

        Returns
        -------
        rewards : list of lists
            For each state the list of rewards assigned to it

        Raises
        ------
        RuntimeError
            If `rewardmethod` or `rewardmode` is not one of the supported values
        """
        from htmd.kinetics import Kinetics
        import pandas as pd

        rewards = [[] for _ in range(numstates)]
        for simidx in trajidx:
            # Get the eq distribution of each of the states the sim passed through
            states = data_q.St[simidx]
            statprob = confstatdist[simidx]
            connected = (states != -1) & (statprob != 0)
            if not np.any(connected):
                continue
            states = states[connected]
            statprob = statprob[connected]

            energies = -Kinetics._kB * self.temperature * np.log(1-statprob)
            ww = rewardwindow
            if rewardwindow is None:
                ww = len(energies)

            # The series is reversed before and after rolling so that each window looks forward in time
            if rewardmethod == 'mean':
                windowedreward = pd.Series(energies[::-1]).rolling(ww, min_periods=1).mean().values[::-1]
            elif rewardmethod == 'max':
                windowedreward = pd.Series(energies[::-1]).rolling(ww, min_periods=1).max().values[::-1]
            else:
                raise RuntimeError('Reward method {} not available'.format(rewardmethod))

            if rewardmode == 'parent':
                # Get the state of the conformation from which the sim was spawned
                parentidx, parentframe = getParentSimIdxFrame(data_q, simidx)
                if parentidx == -1:  # Parent frame doesn't belong to any state
                    print('Parent frame doesn\'t belong to any state')
                    continue
                prev_action = data_q.St[parentidx][parentframe]
                rewards[prev_action].append(windowedreward[0])
            elif rewardmode in ('frames', 'frame'):  # 'frame' is the spelling given in the argument description
                for st, re in zip(states, windowedreward):
                    rewards[st].append(re)
            else:
                raise RuntimeError('Invalid reward mode {}'.format(rewardmode))

        return rewards

    def conformationStationaryDistribution(self, model):
        """ Returns, per trajectory, the stationary probability of the microstate of every frame

        Frames which do not map to any of the `model.micronum` microstates keep a probability of zero.
        """
        statdist = np.zeros(model.data.numFrames)  # zero for disconnected set
        dataconcatSt = np.concatenate(model.data.St)
        frame_micro = model.micro_ofcluster[dataconcatSt]  # loop invariant, computed once
        for i in range(model.micronum):
            microframes = np.where(frame_micro == i)[0]
            statdist[microframes] = model.msm.stationary_distribution[i]
        return model.data.deconcatenate(statdist)

    def _checkNFrames(self, data):
        """ Returns False (and logs it) if `nframes` is set and `data` has reached that number of frames """
        if self.nframes != 0 and data.numFrames >= self.nframes:
            logger.info('Reached maximum number of frames. Stopping adaptive.')
            return False
        return True

    def _getGoalData(self, sims):
        """ Projects the simulations `sims` with the goal function and returns the projected data """
        from htmd.projections.metric import Metric
        logger.debug('Starting projection of directed component')
        metr = Metric(sims, skip=self.skip)
        metr.set(self.goalfunction)
        data = metr.project()
        logger.debug('Finished calculating directed component')
        return data

    def _createMSM(self, data):
        """ Clusters `data`, builds the Markov model in `self._model` and saves it if `save` is set """
        from htmd.model import Model
        # NOTE(review): clustering is retried for as long as it raises IndexError, without an upper bound on the
        # number of attempts -- confirm the failure is transient.
        while True:
            try:
                data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
            except IndexError:
                continue
            break

        self._model = Model(data)
        self._model.markovModel(self.lag, self._numMacrostates(data))
        if self.save:
            makedirs('saveddata', exist_ok=True)
            self._model.save(path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch())))

    def _getSpawnFrames_UCB(self, reward, data):
        """ Samples `reward[s]` frames (with replacement) from every state s with a positive count """
        stateIdx = np.where(reward > 0)[0]
        _, relFrames = data.sampleClusters(stateIdx, reward[stateIdx], replacement=True, allframes=False)
        logger.debug('relFrames {}'.format(relFrames))
        return relFrames

    def _numClusters(self, numFrames):
        """ Heuristic that calculates number of clusters from number of frames """
        K = int(max(np.round(0.6 * np.log10(numFrames / 1000) * 1000 + 50), 100))  # heuristic
        if K > numFrames / 3:  # Ugly patch for low-data regimes ...
            K = int(numFrames / 3)
        return K

    def _numMacrostates(self, data):
        """ Heuristic for calculating the number of macrostates for the Markov model """
        macronum = self.macronum
        if data.K < macronum:
            macronum = int(np.ceil(data.K / 2))  # int: it is used below as a number of timescales
            logger.warning('Using less macrostates than requested due to lack of microstates. macronum = ' + str(macronum))

        # Calculating how many timescales are above the lag time to limit number of macrostates
        from pyemma.msm import timescales_msm
        timesc = timescales_msm(data.St.tolist(), lags=self.lag, nits=macronum).get_timescales()
        macronum = int(min(self.macronum, max(np.sum(timesc > self.lag), 2)))
        return macronum
return ani if __name__ == "__main__": from htmd.model import getStateStatistic from htmd.projections.metric import MetricData from htmd.projections.metricdistance import MetricDistance from htmd.model import Model from htmd.molecule.molecule import Molecule import numpy as np data = MetricData() data.load( "/workspace8/p27_sj403/10-11-2018_p27_short_sj403/analysis/17_11_2018/testing.dat" ) model = Model() model.load( "/workspace8/p27_sj403/10-11-2018_p27_short_sj403/analysis/17_11_2018/model.dat" ) mol = Molecule(model.data.simlist[0].molfile) mean_dat = getStateStatistic(model, data, range(model.macronum)) met = MetricDistance(sel1="noh and protein or resname MOL", sel2="noh and protein or resname MOL", groupsel1="residue", groupsel2="residue", metric="distances", pbc=False) mapping = met.getMapping(mol) contact_plot(mean_dat, mol, rows=2,
class AdaptiveMD(AdaptiveBase):
    """ Adaptive class which uses a Markov state model for respawning

    AdaptiveMD uses Markov state models to choose respawning poses for the next epochs. In more detail, it projects all
    currently retrieved simulations according to the specified projection, clusters those and then builds a Markov model using
    the discretized trajectories. From the Markov model it then chooses conformations from the various states based on
    the chosen criteria which will be used for starting new simulations.

    Parameters
    ----------
    app : :class:`SimQueue <htmd.queues.simqueue.SimQueue>` object, default=None
        A SimQueue class object used to retrieve and submit simulations
    project : str, default='adaptive'
        The name of the project
    nmin : int, default=1
        Minimum number of running simulations
    nmax : int, default=1
        Maximum number of running simulations
    nepochs : int, default=1000
        Stop adaptive once we have reached this number of epochs
    nframes : int, default=0
        Stop adaptive once we have simulated this number of aggregate simulation frames.
    inputpath : str, default='input'
        The directory used to store input folders
    generatorspath : str, default='generators'
        The directory containing the generators
    dryrun : boolean, default=False
        A dry run means that the adaptive will retrieve and generate a new epoch but not submit the simulations
    updateperiod : float, default=0
        When set to a value other than 0, the adaptive will run synchronously every `updateperiod` seconds
    coorname : str, default='input.coor'
        Name of the file containing the starting coordinates for the new simulations
    lock : bool, default=False
        Lock the folder while adaptive is ongoing
    datapath : str, default='data'
        The directory in which the completed simulations are stored
    filter : bool, default=True
        Enable or disable filtering of trajectories.
    filtersel : str, default='not water'
        Atom selection string for filtering.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    filteredpath : str, default='filtered'
        The directory in which the filtered simulations will be stored
    projection : :class:`Projection <htmd.projections.projection.Projection>` object, default=None
        A Projection class object or a list of objects which will be used to project the simulation data before
        constructing a Markov model
    truncation : str, default=None
        Method for truncating the prob distribution (None, 'cumsum', 'statecut')
    statetype : ('micro', 'cluster', 'macro'), str, default='micro'
        What states (cluster, micro, macro) to use for calculations.
    macronum : int, default=8
        The number of macrostates to produce
    skip : int, default=1
        Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame
    lag : int, default=1
        The lagtime used to create the Markov model
    clustmethod : :class:`ClusterMixin <sklearn.base.ClusterMixin>` class, default=<class 'htmd.clustering.kcenters.KCenter'>
        Clustering algorithm used to cluster the contacts or distances
    method : str, default='1/Mc'
        Criteria used for choosing from which state to respawn from ('1/Mc' or 'pi/Mc')
    ticalag : int, default=20
        Lagtime to use for TICA in frames. When using `skip` remember to change this accordingly.
    ticadim : int, default=3
        Number of TICA dimensions to use. When set to 0 it disables TICA
    contactsym : str, default=None
        Contact symmetry
    save : bool, default=False
        Save the model generated

    Example
    -------
    >>> adapt = AdaptiveMD()
    >>> adapt.nmin = 2
    >>> adapt.nmax = 3
    >>> adapt.nepochs = 2
    >>> adapt.ticadim = 3
    >>> adapt.projection = [MetricDistance('name CA', 'name N'), MetricDihedral()]
    >>> adapt.generatorspath = htmd.home()+'/data/dhfr'
    >>> adapt.app = AcemdLocal()
    >>> adapt.run()
    """

    def __init__(self):
        from sklearn.base import ClusterMixin
        from htmd.clustering.kcenters import KCenter
        from htmd.projections.projection import Projection
        super().__init__()
        self._arg('datapath', 'str', 'The directory in which the completed simulations are stored', 'data', val.String())
        self._arg('filter', 'bool', 'Enable or disable filtering of trajectories.', True, val.Boolean())
        self._arg('filtersel', 'str', 'Filtering atom selection', 'not water', val.String())
        self._arg('filteredpath', 'str', 'The directory in which the filtered simulations will be stored', 'filtered', val.String())
        self._arg('projection', ':class:`Projection <htmd.projections.projection.Projection>` object',
                  'A Projection class object or a list of objects which will be used to project the simulation '
                  'data before constructing a Markov model', None, val.Object(Projection), nargs='+')
        self._arg('truncation', 'str', 'Method for truncating the prob distribution (None, \'cumsum\', \'statecut\'', None, val.String())
        self._arg('statetype', 'str', 'What states (cluster, micro, macro) to use for calculations.', 'micro', val.String(), valid_values=('micro', 'cluster', 'macro'))
        self._arg('macronum', 'int', 'The number of macrostates to produce', 8, val.Number(int, 'POS'))
        self._arg('skip', 'int', 'Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame', 1, val.Number(int, 'POS'))
        self._arg('lag', 'int', 'The lagtime used to create the Markov model', 1, val.Number(int, 'POS'))
        self._arg('clustmethod', ':class:`ClusterMixin <sklearn.base.ClusterMixin>` class', 'Clustering algorithm used to cluster the contacts or distances', KCenter, val.Class(ClusterMixin))
        self._arg('method', 'str', 'Criteria used for choosing from which state to respawn from', '1/Mc', val.String())
        self._arg('ticalag', 'int', 'Lagtime to use for TICA in frames. When using `skip` remember to change this accordinly.', 20, val.Number(int, '0POS'))
        self._arg('ticadim', 'int', 'Number of TICA dimensions to use. When set to 0 it disables TICA', 3, val.Number(int, '0POS'))
        self._arg('contactsym', 'str', 'Contact symmetry', None, val.String())
        self._arg('save', 'bool', 'Save the model generated', False, val.Boolean())

    def _algorithm(self):
        """ Runs one epoch: builds the MSM, ranks its states and writes the inputs of the new simulations

        Returns
        -------
        bool
            False if the frame limit was reached (see `_checkNFrames`), True once the new inputs were written.
        """
        data = self._getData(self._getSimlist())
        if not self._checkNFrames(data):
            return False
        self._createMSM(data)

        N = self.nmax - self._running
        reward = self._criteria(self._model, self.method)
        reward = self._truncate(reward, N)
        relFrames, _, _ = self._getSpawnFrames(reward, self._model, self._model.data, N)
        self._writeInputs(self._model.data.rel2sim(np.concatenate(relFrames)))
        return True

    def _checkNFrames(self, data):
        """ Returns False (and logs it) if `nframes` is set and `data` has reached that number of frames """
        if self.nframes != 0 and data.numFrames >= self.nframes:
            logger.info('Reached maximum number of frames. Stopping adaptive.')
            return False
        return True

    def _getSimlist(self):
        """ Creates the simulation list from `datapath` and `inputpath`, filtered if `filter` is set """
        logger.info('Postprocessing new data')
        sims = simlist(glob(path.join(self.datapath, '*', '')), glob(path.join(self.inputpath, '*', '')),
                       glob(path.join(self.inputpath, '*', '')))
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)
        return sims

    def _getData(self, sims):
        """ Projects the simulations (and reduces them with TICA if `ticadim` > 0), dropping broken trajectories """
        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        if self.ticadim > 0:
            # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
            data = metr.project()
            data.dropTraj()  # Drop before TICA to avoid broken trajectories
            ticalag = int(
                np.ceil(max(2, min(np.min(data.trajLengths) / 2, self.ticalag))))  # 1 < ticalag < (trajLen / 2)
            tica = TICA(data, ticalag)
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()
        datadr.dropTraj()  # Preferably we should do this before any projections. Corrupted sims can affect TICA
        return datadr

    def _createMSM(self, data):
        """ Clusters `data`, builds the Markov model in `self._model` and saves it if `save` is set """
        data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
        self._model = Model(data)
        self._model.markovModel(self.lag, self._numMacrostates(data))
        if self.save:
            if not path.exists('saveddata'):
                makedirs('saveddata')
            self._model.save(path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch())))

    def _getSpawnFrames(self, reward, model, data, N):
        """ Distributes N respawns over the microstates proportionally to `reward` and samples their frames

        Returns
        -------
        relFrames, spawncounts, prob
            The sampled frames, the number of respawns drawn for each state and the sampling probabilities
        """
        prob = reward / np.sum(reward)
        logger.debug('Sampling probabilities {}'.format(prob))
        spawncounts = np.random.multinomial(N, prob)
        logger.debug('spawncounts {}'.format(spawncounts))

        stateIdx = np.where(spawncounts > 0)[0]
        _, relFrames = model.sampleStates(stateIdx, spawncounts[stateIdx], statetype='micro', replacement=True)
        logger.debug('relFrames {}'.format(relFrames))
        return relFrames, spawncounts, prob

    def _criteria(self, model, criteria):
        """ Calculates the respawning score of every microstate

        Parameters
        ----------
        model : object
            The Markov model
        criteria : str
            '1/Mc' or 'pi/Mc'. 'pi/Mc' is the '1/Mc' score multiplied by the stationary distribution of the MSM.

        Raises
        ------
        RuntimeError
            If `criteria` is not one of the supported values
        """
        if criteria not in ('1/Mc', 'pi/Mc'):
            # Without this check an unknown criteria failed on an unassigned local variable
            raise RuntimeError('Invalid criteria {}. Valid criteria are "1/Mc" and "pi/Mc"'.format(criteria))

        nMicroPerMacro = macroAccumulate(model, np.ones(model.micronum))
        P_I = 1 / macroAccumulate(model, model.data.N[model.cluster_ofmicro])
        P_I = P_I / nMicroPerMacro
        ret = P_I[model.macro_ofmicro]
        if criteria == 'pi/Mc':
            ret = ret * model.msm.stationary_distribution
        return ret

    def _truncate(self, ranking, N):
        """ Zeroes the low-ranked entries of `ranking` according to `truncation`

        `ranking` is modified in place and also returned. The truncation method is matched case-insensitively.
        An unrecognised method leaves the ranking unchanged and logs a warning.
        """
        if self.truncation is None:
            return ranking
        method = self.truncation.lower()
        if method == 'none':
            return ranking

        idx = np.argsort(ranking)[::-1]  # decreasing sort
        if method == 'cumsum':
            errs = ranking[idx]
            H = (N * errs / np.cumsum(errs)) < 1
            ranking[idx[H]] = 0
        elif method == 'statecut':
            ranking[idx[N:]] = 0  # Set all states ranked > N to zero.
        else:
            logger.warning('Unknown truncation method {}. The ranking was not truncated.'.format(self.truncation))
        return ranking

    def _numClusters(self, numFrames):
        """ Heuristic that calculates number of clusters from number of frames """
        K = int(max(np.round(0.6 * np.log10(numFrames / 1000) * 1000 + 50), 100))  # heuristic
        if K > numFrames / 3:  # Ugly patch for low-data regimes ...
            K = int(numFrames / 3)
        return K

    def _numMacrostates(self, data):
        """ Heuristic for calculating the number of macrostates for the Markov model """
        macronum = self.macronum
        if data.K < macronum:
            macronum = int(np.ceil(data.K / 2))  # int: it is used below as a number of timescales
            logger.warning('Using less macrostates than requested due to lack of microstates. macronum = ' + str(macronum))

        # Calculating how many timescales are above the lag time to limit number of macrostates
        from pyemma.msm import timescales_msm
        timesc = timescales_msm(data.St.tolist(), lags=self.lag, nits=macronum).get_timescales()
        macronum = int(min(self.macronum, max(np.sum(timesc > self.lag), 2)))
        return macronum
class AdaptiveMD(AdaptiveBase):
    """ Adaptive class which uses a Markov state model for respawning

    AdaptiveMD uses Markov state models to choose respawning poses for the next epochs. In more detail, it projects all
    currently retrieved simulations according to the specified projection, clusters those and then builds a Markov model
    using the discretized trajectories. From the Markov model it then chooses conformations from the various states
    based on the chosen criteria which will be used for starting new simulations.

    Parameters
    ----------
    app : :class:`App <htmd.apps.app.App>` object, default=None
        An App class object used to retrieve and submit simulations
    project : str, default='adaptive'
        The name of the project
    nmin : int, default=1
        Minimum number of running simulations
    nmax : int, default=1
        Maximum number of running simulations
    nepochs : int, default=100
        Maximum number of epochs
    inputpath : str, default='input'
        The directory used to store input folders
    generatorspath : str, default='generators'
        The directory containing the generators
    dryrun : boolean, default=False
        A dry run means that the adaptive will retrieve and generate a new epoch but not submit the simulations
    updateperiod : float, default=0
        When set to a value other than 0, the adaptive will run synchronously every `updateperiod` seconds
    datapath : str, default='data'
        The directory in which the completed simulations are stored
    filter : bool, default=True
        Enable or disable filtering of trajectories.
    filtersel : str, default='not water'
        Filtering atom selection
    filteredpath : str, default='filtered'
        The directory in which the filtered simulations will be stored
    projection : :class:`Projection <htmd.projections.projection.Projection>` object, default=None
        A Projection class object or a list of objects which will be used to project the simulation data before
        constructing a Markov model
    macronum : int, default=8
        The number of macrostates to produce
    skip : int, default=1
        Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame
    lag : int, default=1
        The lagtime used to create the Markov model
    clustmethod : :class:`ClusterMixin <sklearn.base.ClusterMixin>` object, default=<class 'sklearn.cluster.k_means_.MiniBatchKMeans'>
        Clustering algorithm used to cluster the contacts or distances
    method : str, default='1/Mc'
        Criteria used for choosing from which state to respawn from. Supported values are '1/Mc' and 'pi/Mc'.
    ticalag : int, default=20
        Lagtime to use for TICA in frames. When using `skip` remember to change this accordingly.
    ticadim : int, default=3
        Number of TICA dimensions to use. When set to 0 it disables TICA
    contactsym : str, default=None
        Contact symmetry
    save : bool, default=False
        Save the model generated

    Example
    -------
    >>> adapt = AdaptiveMD()
    >>> adapt.nmin = 2
    >>> adapt.nmax = 3
    >>> adapt.nepochs = 2
    >>> adapt.ticadim = 3
    >>> adapt.projection = [MetricDistance('name CA', 'name N'), MetricDihedral()]
    >>> adapt.generatorspath = htmd.home()+'/data/dhfr'
    >>> adapt.app = AcemdLocal()
    >>> adapt.run()
    """

    def __init__(self):
        from sklearn.base import ClusterMixin
        from htmd.projections.projection import Projection
        super().__init__()
        self._cmdString('datapath', 'str', 'The directory in which the completed simulations are stored', 'data')
        self._cmdBoolean('filter', 'bool', 'Enable or disable filtering of trajectories.', True)
        self._cmdString('filtersel', 'str', 'Filtering atom selection', 'not water')
        self._cmdString('filteredpath', 'str', 'The directory in which the filtered simulations will be stored',
                        'filtered')
        self._cmdObject('projection', ':class:`Projection <htmd.projections.projection.Projection>` object',
                        'A Projection class object or a list of objects which will be used to project the simulation '
                        'data before constructing a Markov model', None, Projection)
        self._cmdValue('macronum', 'int', 'The number of macrostates to produce', 8, TYPE_INT, RANGE_POS)
        self._cmdValue('skip', 'int',
                       'Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame',
                       1, TYPE_INT, RANGE_POS)
        self._cmdValue('lag', 'int', 'The lagtime used to create the Markov model', 1, TYPE_INT, RANGE_POS)
        self._cmdObject('clustmethod', ':class:`ClusterMixin <sklearn.base.ClusterMixin>` object',
                        'Clustering algorithm used to cluster the contacts or distances', MiniBatchKMeans, ClusterMixin)
        self._cmdString('method', 'str', 'Criteria used for choosing from which state to respawn from', '1/Mc')
        self._cmdValue('ticalag', 'int',
                       'Lagtime to use for TICA in frames. When using `skip` remember to change this accordinly.',
                       20, TYPE_INT, RANGE_0POS)
        self._cmdValue('ticadim', 'int', 'Number of TICA dimensions to use. When set to 0 it disables TICA', 3,
                       TYPE_INT, RANGE_0POS)
        self._cmdString('contactsym', 'str', 'Contact symmetry', None)
        self._cmdBoolean('save', 'bool', 'Save the model generated', False)

    def _algorithm(self):
        """ Run one adaptive epoch: project the data, build the Markov model and write the new inputs.

        The model is stored in `self._model` and, when `self.save` is set, written to
        ``adapt_model_e<epoch>.dat``.
        """
        logger.info('Postprocessing new data')
        sims = simlist(glob(path.join(self.datapath, '*', '')),
                       glob(path.join(self.inputpath, '*', 'structure.pdb')),
                       glob(path.join(self.inputpath, '*', '')))
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        if self.ticadim > 0:
            # Project first: TICA(metr, ...) without projecting was reported to be too slow
            tica = TICA(metr.project(), int(max(2, np.ceil(self.ticalag))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()

        datadr.dropTraj()  # Preferably we should do this before any projections. Corrupted sims can affect TICA
        datadr.cluster(self.clustmethod(n_clusters=self._numClusters(datadr.numFrames)))
        self._model = Model(datadr)
        self._model.markovModel(self.lag, self._numMacrostates(datadr))
        if self.save:
            self._model.save('adapt_model_e' + str(self._getEpoch()) + '.dat')

        relFrames = self._getSpawnFrames(self._model, datadr)
        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))

    def _getSpawnFrames(self, model, data):
        """ Sample the frames from which the next ``self.nmax - self._running`` simulations are spawned.

        Sampling is done with replacement only when `data.K` is smaller than 10.
        """
        p_i = self._criteria(model, self.method)
        (spawncounts, prob) = self._spawn(p_i, self.nmax - self._running)
        logger.debug('spawncounts {}'.format(spawncounts))
        stateIdx = np.where(spawncounts > 0)[0]
        _, relFrames = model.sampleStates(stateIdx, spawncounts[stateIdx], statetype='micro',
                                          replacement=(data.K < 10))
        logger.debug('relFrames {}'.format(relFrames))
        return relFrames

    def _criteria(self, model, criteria):
        """ Calculate the per-microstate respawning score.

        Parameters
        ----------
        model : Model
            The Markov model
        criteria : str
            Either '1/Mc' or 'pi/Mc'. 'pi/Mc' additionally multiplies the '1/Mc' score by
            `model.msm.stationary_distribution`.

        Returns
        -------
        ret : np.ndarray
            The score of each microstate

        Raises
        ------
        ValueError
            If `criteria` is not one of the supported values
        """
        if criteria not in ('1/Mc', 'pi/Mc'):
            # Previously an unknown value fell through and failed with an UnboundLocalError
            raise ValueError('Unsupported criteria "{}". Use "1/Mc" or "pi/Mc".'.format(criteria))

        nMicroPerMacro = macroAccumulate(model, np.ones(model.micronum))
        P_I = 1 / macroAccumulate(model, model.data.N[model.cluster_ofmicro])
        P_I = P_I / nMicroPerMacro
        ret = P_I[model.macro_ofmicro]
        if criteria == 'pi/Mc':
            ret = ret * model.msm.stationary_distribution
        return ret

    def _spawn(self, ranking, N, truncated=False):
        """ Draw how many simulations to spawn from each state.

        Parameters
        ----------
        ranking : np.ndarray
            The score of each state. It is normalised by its sum to obtain probabilities.
        N : int
            Total number of simulations to spawn
        truncated : bool
            If True, states for which ``N * value / cumulative_sum < 1`` (in decreasing order of score) are
            given zero probability

        Returns
        -------
        spawnmicro : np.ndarray
            Number of spawns drawn for each state from a multinomial distribution
        prob : np.ndarray
            The probabilities used for the draw
        """
        if truncated:
            ranking = ranking.copy()  # do not modify the caller's array
            idx = np.argsort(ranking)[::-1]  # decreasing sort
            errs = ranking[idx]
            H = (N * errs / np.cumsum(errs)) < 1
            ranking[idx[H]] = 0
        prob = ranking / np.sum(ranking)
        spawnmicro = np.random.multinomial(N, prob)
        return spawnmicro, prob

    def _numClusters(self, numFrames):
        """ Heuristic that calculates number of clusters from number of frames """
        K = int(max(np.round(0.6 * np.log10(numFrames / 1000) * 1000 + 50), 100))  # heuristic
        if K > numFrames / 3:  # Ugly patch for low-data regimes ...
            K = int(numFrames / 3)
        return K

    def _numMacrostates(self, data):
        """ Heuristic for calculating the number of macrostates for the Markov model

        Starts from `self.macronum`, halves the number of microstates when there are fewer microstates than
        requested macrostates, and finally limits the result to the number of implied timescales above the
        lag time (but never below 2).
        """
        macronum = self.macronum
        if data.K < macronum:
            # np.ceil returns a float; cast so that `nits` below receives an integer
            macronum = int(np.ceil(data.K / 2))
            logger.warning('Using less macrostates than requested due to lack of microstates. macronum = ' + str(macronum))

        # Calculating how many timescales are above the lag time to limit number of macrostates
        from pyemma.msm import timescales_msm
        timesc = timescales_msm(data.St.tolist(), lags=self.lag, nits=macronum).get_timescales()
        macronum = int(min(self.macronum, max(np.sum(timesc > self.lag), 2)))
        return macronum
def _createMSM(self, data):
    """ Cluster `data`, build the Markov model into `self._model` and optionally save it.

    The number of clusters comes from `self._numClusters` and the number of macrostates from
    `self._numMacrostates`. When `self.save` is set the model is written to ``adapt_model_e<epoch>.dat``
    in the current directory.
    """
    n_clusters = self._numClusters(data.numFrames)
    clusterer = self.clustmethod(n_clusters=n_clusters)
    data.cluster(clusterer)

    self._model = Model(data)
    self._model.markovModel(self.lag, self._numMacrostates(data))

    if not self.save:
        return
    filename = 'adapt_model_e{}.dat'.format(self._getEpoch())
    self._model.save(filename)