def load(self, filename):
    """ Load a :class:`MetricData <htmd.metricdata.MetricData>` object from disk

    The file must contain a pickled mapping of attribute names to values. Every entry is copied into
    ``self.__dict__``. The ``'data'`` entry is not copied as-is: a fresh ``MetricData`` is created and
    populated from the ``__dict__`` of the pickled object.

    Parameters
    ----------
    filename : str
        Path to the saved MetricData object

    Examples
    --------
    >>> model = Model()
    >>> model.load('./model.dat')
    """
    import sys
    import pickle
    from htmd.metricdata import MetricData
    try:
        import pandas.indexes
    except ImportError:
        import pandas.core.indexes
        # Hacky fix for new pandas version: register the new module under the old name before unpickling
        # (presumably so that pickles referencing `pandas.indexes` still resolve - confirm).
        sys.modules['pandas.indexes'] = pandas.core.indexes

    # NOTE(security): pickle.load can execute arbitrary code. Only load files from trusted sources.
    # The context manager guarantees the file is closed even if unpickling fails.
    with open(filename, 'rb') as f:
        z = pickle.load(f)

    for k in z:
        if k == 'data':
            # Rebuild the MetricData through its own load() instead of trusting the pickled instance
            m = MetricData()
            m.load(z[k].__dict__)
            self.__dict__[k] = m
        else:
            self.__dict__[k] = z[k]
def load(self, filename):
    """ Load a :class:`MetricData <htmd.metricdata.MetricData>` object from disk

    The file must contain a pickled mapping of attribute names to values. Every entry is copied into
    ``self.__dict__``. The ``'data'`` entry is not copied as-is: a fresh ``MetricData`` is created and
    populated from the ``__dict__`` of the pickled object.

    Parameters
    ----------
    filename : str
        Path to the saved MetricData object

    Examples
    --------
    >>> model = Model()
    >>> model.load('./model.dat')
    """
    import sys
    import pickle
    from htmd.metricdata import MetricData
    try:
        import pandas.indexes
    except ImportError:
        import pandas.core.indexes
        # Hacky fix for new pandas version: register the new module under the old name before unpickling
        # (presumably so that pickles referencing `pandas.indexes` still resolve - confirm).
        sys.modules['pandas.indexes'] = pandas.core.indexes

    # NOTE(security): pickle.load can execute arbitrary code. Only load files from trusted sources.
    # The context manager guarantees the file is closed even if unpickling fails.
    with open(filename, 'rb') as f:
        z = pickle.load(f)

    for k in z:
        if k == 'data':
            # Rebuild the MetricData through its own load() instead of trusting the pickled instance
            m = MetricData()
            m.load(z[k].__dict__)
            self.__dict__[k] = m
        else:
            self.__dict__[k] = z[k]
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto `ndim` dimensions

    All trajectories are concatenated and clustered with MiniBatchKMeans into `ndim` clusters. Each frame is
    then described by `mean distance to all centers - distance to each center`, with negative values set to 0.

    Parameters
    ----------
    ndim : int
        The number of dimensions (clusters) we want to project the data on.

    Returns
    -------
    dataproj : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the projected data

    Example
    -------
    >>> tri = KMeansTri(data)
    >>> datatri = tri.project(5)
    """
    import scipy.spatial.distance as scidist
    from sklearn.cluster import MiniBatchKMeans
    from htmd.metricdata import MetricData

    stacked = np.concatenate(self.data.dat)
    kmeans = MiniBatchKMeans(n_clusters=ndim)
    kmeans.fit(stacked)

    # TODO: Could make it into a loop to waste less memory
    centerdist = scidist.cdist(stacked, kmeans.cluster_centers_)
    framemean = np.mean(centerdist, axis=1)[:, np.newaxis]
    activation = framemean - centerdist
    activation[activation < 0] = 0  # centers further away than the frame's mean distance contribute nothing

    return MetricData(
        dat=self.data.deconcatenate(activation),
        ref=self.data.ref,
        simlist=self.data.simlist,
        fstep=self.data.fstep,
        parent=self.data,
    )
def newMetricData(self, datasource, trajectories=None, olddata=None):
    """Converts trajectory indexes to a new MetricData object

    Each entry of `trajectories` is passed to `self._collectTrajectory` together with `datasource`; the
    returned (data, reference, simulation) triplets are gathered into a new MetricData that inherits the
    description and fstep of `datasource`. If `olddata` is given, the new data is appended to it and
    `olddata` is returned instead.
    """
    dat, ref, sim = [], [], []
    collected = (self._collectTrajectory(datasource, traj) for traj in trajectories)
    for trajdat, trajref, trajsim in collected:
        dat.append(trajdat)
        ref.append(trajref)
        sim.append(trajsim)

    newdata = MetricData(
        dat=dat,
        ref=ref,
        simlist=sim,
        description=datasource.description,
        fstep=datasource.fstep,
    )

    if olddata is None:
        return newdata

    olddata.append(newdata)  # Merge with old data
    return olddata
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto `ndim` dimensions

    All trajectories are concatenated and clustered with MiniBatchKMeans into `ndim` clusters. Each frame is
    then described by `mean distance to all centers - distance to each center`, with negative values set to 0.

    Parameters
    ----------
    ndim : int
        The number of dimensions (clusters) we want to project the data on.

    Returns
    -------
    dataproj : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the projected data

    Example
    -------
    >>> tri = KMeansTri(data)
    >>> datatri = tri.project(5)
    """
    import scipy.spatial.distance as scidist
    from sklearn.cluster import MiniBatchKMeans
    from htmd.metricdata import MetricData

    stacked = np.concatenate(self.data.dat)
    kmeans = MiniBatchKMeans(n_clusters=ndim)
    kmeans.fit(stacked)

    # TODO: Could make it into a loop to waste less memory
    centerdist = scidist.cdist(stacked, kmeans.cluster_centers_)
    framemean = np.mean(centerdist, axis=1)[:, np.newaxis]
    activation = framemean - centerdist
    activation[activation < 0] = 0  # centers further away than the frame's mean distance contribute nothing

    # Start from an empty MetricData and fill in its fields one by one
    result = MetricData()
    result.simlist = self.data.simlist
    result.dat = self.data.deconcatenate(activation)
    result.ref = self.data.ref
    result.parent = self.data
    result.fstep = self.data.fstep
    return result
def _metrify(self, sims, skip, update):
    """ Projects a Molecule or a list of simulations onto the space defined by the Metric* class.

    Parameters
    ----------
    sims : Molecule or numpy list of structs
        Either a single Molecule, which is handed directly to `self._processTraj`, or a list of simulations
        as produced by the simList function.
    skip : int
        Skips every x frames.
    update : MetricData object
        Provide a previous MetricData object to extend it with the new projections instead of creating a
        new object.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data and the ref data.
    """
    if isinstance(sims, Molecule):
        return self._processTraj(sims)

    # [updList, oldList] = checkUpdate(simList, update, verbose);
    updList = sims
    nsims = len(updList)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uniqueMol = 0
    uqMol = []
    mapping = []
    single, molfile = _singleMolfile(updList)
    if single:
        uniqueMol = 1
        uqMol = Molecule(molfile)
        # Calculating the mapping of metric columns to atom pair indeces
        mapping = self._getMapping(uqMol)

    logger.info('Metric: Starting projection of trajectories.')
    metrics = np.empty(nsims, dtype=object)
    ref = np.empty(nsims, dtype=object)
    deletesims = np.zeros(nsims, dtype=bool)
    fstep = np.zeros(nsims)

    from htmd.config import _config
    pool = Parallel(n_jobs=_config['ncpus'], verbose=11)
    results = pool(
        delayed(_processSimOld)(self, i, updList, uniqueMol, uqMol, skip, deletesims, metrics, ref, fstep)
        for i in range(nsims))

    # Every result holds (metric, ref, fstep, delete flag) for one simulation
    for i, res in enumerate(results):
        metrics[i] = res[0]
        ref[i] = res[1]
        fstep[i] = res[2]
        deletesims[i] = res[3]

    logger.info('Finished projecting the trajectories.')

    # Removing empty trajectories
    emptyM = np.array([x is None for x in metrics], dtype=bool)
    emptyR = np.array([x is None for x in ref], dtype=bool)
    assert np.all(deletesims == emptyM) and np.all(emptyR == emptyM)

    dropidx = np.where(emptyM)[0]
    metrics = np.delete(metrics, dropidx)
    ref = np.delete(ref, dropidx)
    updList = np.delete(updList, dropidx)

    if len(metrics) == 0:
        raise NameError('No trajectories were read')

    # Constructing a MetricData object
    if update:
        data = update
        data.dat.extend(metrics)
        data.ref.extend(ref)
        data.simList.extend(updList)  # This is wrong but we don't use update anyways
    else:
        data = MetricData(dat=metrics, ref=ref, map=mapping, simlist=updList)

    uqfsteps = np.unique(fstep)
    data.fstep = float(stats.mode(fstep).mode)
    if len(uqfsteps) == 1:
        logger.info(
            'Frame step {}ns was read from the trajectories. If it looks wrong, redefine it by manually setting the MetricData.fstep property.'
            .format(data.fstep))
    else:
        logger.warning(
            'Multiple framesteps were read from the simulations. Taking the statistical mode: '
            + str(data.fstep)
            + 'ns. If it looks wrong, you can modify it by manually setting the MetricData.fstep property.')

    return data
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

    Parameters
    ----------
    ndim : int
        The number of TICA dimensions we want to project the data on. If None is given it will choose a number
        of dimensions to cover 95% of the kinetic variance.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data. Its
        `map` attribute is set to a DataFrame with one row per TICA dimension.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)
    >>> dataTica = tica.project(5)
    """
    from pandas import DataFrame
    from htmd.metricdata import MetricData

    if ndim is not None:
        # Replaces the old `self.tic._dim = ndim`, which is deprecated since pyEMMA 2.1
        self.tic.set_params(dim=ndim)

    if isinstance(self.data, Metric):  # Doesn't project on correct number of dimensions
        proj = []
        refs = []
        fstep = None

        metr = self.data
        p = ProgressBar(len(metr.simulations))
        k = -1  # running index over all simulations, including the dropped ones
        droppedsims = []
        for projecteddata in _projectionGenerator(metr, _getNcpus()):
            for pro in projecteddata:
                k += 1
                if pro is None:
                    droppedsims.append(k)
                    continue
                proj.append(self.tic.transform(pro[0]))
                refs.append(pro[1])
                if fstep is None:  # the first available frame step is used for the whole dataset
                    fstep = pro[2]
            p.progress(len(projecteddata))
        p.stop()

        simlist = np.delete(self.data.simulations, droppedsims)
        ref = np.array(refs, dtype=object)
        parent = None
    else:
        proj = self.tic.get_output()
        simlist = self.data.simlist
        ref = self.data.ref
        fstep = self.data.fstep
        parent = self.data

    if ndim is None:
        # Without this the description loop below fails with a TypeError on range(None)
        ndim = self.tic.dimension()
        logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

    datatica = MetricData(dat=np.array(proj, dtype=object), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

    # One description row per TICA dimension. TICA dimensions have no atom indexes, hence the -1
    datatica.map = DataFrame({
        'type': ['tica' for _ in range(ndim)],
        'indexes': [-1 for _ in range(ndim)],
        'description': ['TICA dimension {}'.format(i + 1) for i in range(ndim)]
    })
    return datatica
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

    Parameters
    ----------
    ndim : int
        The number of TICA dimensions we want to project the data on. If None is given it will choose a number
        of dimensions to cover 95% of the kinetic variance.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)
    >>> dataTica = tica.project(5)
    """
    if ndim is not None:
        # Replaces the old `self.tic._dim = ndim`, which is deprecated since pyEMMA 2.1
        self.tic.set_params(dim=ndim)

    if not isinstance(self.data, Metric):
        projected = self.tic.get_output()
        simlist = self.data.simlist
        ref = self.data.ref
        fstep = self.data.fstep
        parent = self.data
    else:  # Doesn't project on correct number of dimensions
        projected = []
        refs = []
        fstep = None
        dropped = []

        nsims = len(self.data.simulations)
        bar = ProgressBar(nsims)
        for idx in range(nsims):
            simdata, simref, simfstep = self.data._projectSingle(idx)
            if simdata is None:
                # Dropped simulations are only recorded; they do not advance the progress bar
                dropped.append(idx)
                continue
            if fstep is None:
                fstep = simfstep
            refs.append(simref)
            projected.append(self.tic.transform(simdata))
            bar.progress()
        bar.stop()

        simlist = np.delete(self.data.simulations, dropped)
        ref = np.array(refs, dtype=object)
        parent = None

    if ndim is None:
        logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(self.tic.dimension()))

    from htmd.metricdata import MetricData
    return MetricData(dat=np.array(projected, dtype=object), simlist=simlist, ref=ref, fstep=fstep, parent=parent)
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

    If `self.dimensions` is set, only those columns of the data are passed through TICA. The remaining columns
    are kept unchanged and placed in front of the TICA columns in the output.

    Parameters
    ----------
    ndim : int
        The number of TICA dimensions we want to project the data on. If None is given it will choose a number
        of dimensions to cover 95% of the kinetic variance.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

    Raises
    ------
    RuntimeError
        If the data is a MetricData object with fewer dimensions than the requested `ndim`.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)
    >>> dataTica = tica.project(5)
    """
    from tqdm import tqdm
    from pandas import DataFrame, concat
    from htmd.metricdata import MetricData

    if ndim is not None:
        self.tic.set_params(dim=ndim)

    keepdata = []  # per-trajectory columns which are not passed through TICA
    keepdim = None  # indexes of those columns
    keepdimdesc = None  # description rows of those columns, if available
    if isinstance(self.data, Metric):  # Memory efficient TICA projecting trajectories on the fly
        proj = []
        refs = []
        fstep = None

        metr = self.data
        simidx = -1  # running index over all simulations, including the dropped ones
        droppedsims = []
        pbar = tqdm(total=len(metr.simulations))
        for projecteddata in _projectionGenerator(metr, _getNcpus()):
            for pro in projecteddata:
                simidx += 1
                if pro is None:
                    droppedsims.append(simidx)
                    continue
                if self.dimensions is not None:
                    numDimensions = pro[0].shape[1]
                    keepdim = np.setdiff1d(range(numDimensions), self.dimensions)
                    keepdata.append(pro[0][:, keepdim])
                    # Sub-select dimensions for projecting
                    proj.append(self.tic.transform(pro[0][:, self.dimensions]).astype(np.float32))
                else:
                    proj.append(self.tic.transform(pro[0]).astype(np.float32))
                refs.append(pro[1])
                if fstep is None:  # the first available frame step is used for the whole dataset
                    fstep = pro[2]
            pbar.update(len(projecteddata))
        pbar.close()

        simlist = np.delete(self.data.simulations, droppedsims)
        ref = np.array(refs, dtype=object)
        parent = None
        if self.dimensions is not None:
            from htmd.projections.metric import _singleMolfile
            from htmd.molecule.molecule import Molecule
            (single, molfile) = _singleMolfile(metr.simulations)
            # keepdim stays None when every simulation was dropped; there is nothing to describe then
            if single and keepdim is not None:
                keepdimdesc = metr.getMapping(Molecule(molfile))
                keepdimdesc = keepdimdesc.iloc[keepdim]
    else:
        if ndim is not None and self.data.numDimensions < ndim:
            raise RuntimeError(
                'TICA cannot increase the dimensionality of your data. Your data has {} dimensions and you requested {} TICA dimensions'
                .format(self.data.numDimensions, ndim))

        if self.dimensions is not None:
            keepdim = np.setdiff1d(range(self.data.numDimensions), self.dimensions)
            keepdata = [x[:, keepdim] for x in self.data.dat]
            if self.data.description is not None:
                keepdimdesc = self.data.description.iloc[keepdim]
        proj = self.tic.get_output()
        simlist = self.data.simlist
        ref = self.data.ref
        fstep = self.data.fstep
        parent = self.data

    # If TICA is done on a subset of dimensions, combine non-projected data with projected data
    if self.dimensions is not None:
        proj = [np.hstack((kept, tica)) for kept, tica in zip(keepdata, proj)]

    if ndim is None:
        ndim = self.tic.dimension()
        logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

    datatica = MetricData(dat=np.array(proj), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

    # One description row per TICA dimension. TICA dimensions have no atom indexes, hence the -1
    datatica.description = DataFrame({
        'type': ['tica' for _ in range(ndim)],
        'atomIndexes': [-1 for _ in range(ndim)],
        'description': ['TICA dimension {}'.format(i + 1) for i in range(ndim)]
    })

    if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
        # DataFrame.append was removed in pandas 2.0; concat gives the same result on all pandas versions
        datatica.description = concat([keepdimdesc, datatica.description], ignore_index=True)

    return datatica
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

    If `self.dimensions` is set, only those columns of the data are passed through TICA. The remaining columns
    are kept unchanged and placed in front of the TICA columns in the output.

    Parameters
    ----------
    ndim : int
        The number of TICA dimensions we want to project the data on. If None is given it will choose a number
        of dimensions to cover 95% of the kinetic variance.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

    Raises
    ------
    RuntimeError
        If the data is a MetricData object with fewer dimensions than the requested `ndim`.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)
    >>> dataTica = tica.project(5)
    """
    from pandas import DataFrame, concat
    from htmd.metricdata import MetricData

    if ndim is not None:
        self.tic.set_params(dim=ndim)

    keepdata = []  # per-trajectory columns which are not passed through TICA
    keepdim = None  # indexes of those columns
    keepdimdesc = None  # description rows of those columns, if available
    if isinstance(self.data, Metric):  # Memory efficient TICA projecting trajectories on the fly
        proj = []
        refs = []
        fstep = None

        metr = self.data
        p = ProgressBar(len(metr.simulations))
        simidx = -1  # running index over all simulations, including the dropped ones
        droppedsims = []
        for projecteddata in _projectionGenerator(metr, _getNcpus()):
            for pro in projecteddata:
                simidx += 1
                if pro is None:
                    droppedsims.append(simidx)
                    continue
                if self.dimensions is not None:
                    numDimensions = pro[0].shape[1]
                    keepdim = np.setdiff1d(range(numDimensions), self.dimensions)
                    keepdata.append(pro[0][:, keepdim])
                    # Sub-select dimensions for projecting
                    proj.append(self.tic.transform(pro[0][:, self.dimensions]).astype(np.float32))
                else:
                    proj.append(self.tic.transform(pro[0]).astype(np.float32))
                refs.append(pro[1])
                if fstep is None:  # the first available frame step is used for the whole dataset
                    fstep = pro[2]
            p.progress(len(projecteddata))
        p.stop()

        simlist = np.delete(self.data.simulations, droppedsims)
        ref = np.array(refs, dtype=object)
        parent = None
        if self.dimensions is not None:
            from htmd.projections.metric import _singleMolfile
            from htmd.molecule.molecule import Molecule
            (single, molfile) = _singleMolfile(metr.simulations)
            # keepdim stays None when every simulation was dropped; there is nothing to describe then
            if single and keepdim is not None:
                keepdimdesc = metr.getMapping(Molecule(molfile))
                keepdimdesc = keepdimdesc.iloc[keepdim]
    else:
        if ndim is not None and self.data.numDimensions < ndim:
            raise RuntimeError('TICA cannot increase the dimensionality of your data. Your data has {} dimensions and you requested {} TICA dimensions'.format(self.data.numDimensions, ndim))

        if self.dimensions is not None:
            keepdim = np.setdiff1d(range(self.data.numDimensions), self.dimensions)
            keepdata = [x[:, keepdim] for x in self.data.dat]
            if self.data.description is not None:
                keepdimdesc = self.data.description.iloc[keepdim]
        proj = self.tic.get_output()
        simlist = self.data.simlist
        ref = self.data.ref
        fstep = self.data.fstep
        parent = self.data

    # If TICA is done on a subset of dimensions, combine non-projected data with projected data
    if self.dimensions is not None:
        proj = [np.hstack((kept, tica)) for kept, tica in zip(keepdata, proj)]

    if ndim is None:
        ndim = self.tic.dimension()
        logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

    datatica = MetricData(dat=np.array(proj), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

    # One description row per TICA dimension. TICA dimensions have no atom indexes, hence the -1
    datatica.description = DataFrame({
        'type': ['tica' for _ in range(ndim)],
        'atomIndexes': [-1 for _ in range(ndim)],
        'description': ['TICA dimension {}'.format(i + 1) for i in range(ndim)]
    })

    if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
        # DataFrame.append was removed in pandas 2.0; concat gives the same result on all pandas versions
        datatica.description = concat([keepdimdesc, datatica.description], ignore_index=True)

    return datatica
def _metrify(self, sims, skip, verbose, update):
    """ Projects a Molecule or a list of simulations onto the space defined by the Metric* class.

    Parameters
    ----------
    sims : Molecule or numpy list of structs
        Either a single Molecule, which is handed directly to `self.processTraj`, or a list of simulations
        as produced by the simList function.
    skip : int
        Skips every x frames.
    verbose : int
        Verbosity toggle. Not used inside this function.
    update : MetricData object
        Provide a previous MetricData object to extend it with the new projections instead of creating a
        new object.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data and the ref data.

    Raises
    ------
    NameError
        If no trajectory is left after removing the empty ones.
    """
    if isinstance(sims, Molecule):
        return self.processTraj(sims)

    # [updList, oldList] = checkUpdate(simList, update, verbose);
    updList = sims
    numSim = len(updList)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uniqueMol = 0
    uqMol = []
    map = []  # NOTE(review): shadows the builtin `map` for the rest of this function
    (single, molfile) = _singleMolfile(updList)
    if single:
        uniqueMol = 1
        uqMol = Molecule(molfile)
        # Calculating the mapping of metric columns to atom pair indeces
        map = self._getMapping(uqMol)

    logger.info('Metric: Starting projection of trajectories.')
    # Output buffers, one slot per simulation. They are handed to every worker below.
    metrics = np.empty(numSim, dtype=object)
    ref = np.empty(numSim, dtype=object)
    deleteSims = np.zeros(numSim, dtype=bool)
    fstep = np.empty(numSim)

    #global parpool
    # The return value of the parallel call is discarded, so the workers presumably write their results into
    # the shared buffers in place (threading backend, fixed 6 jobs) - confirm against _processSimPyemma.
    Parallel(n_jobs=6, backend="threading")(delayed(_processSimPyemma)(self, i, updList, uniqueMol, uqMol, skip, deleteSims, metrics, ref, fstep) for i in range(numSim))

    logger.info('Finished projecting the trajectories.')

    # Removing empty trajectories
    emptyM = [True if np.size(x) == 0 else False for x in metrics]
    emptyR = [True if np.size(x) == 0 else False for x in ref]  # computed but not used below
    #assert np.all(deleteSims == emptyM)# and np.all(emptyR == emptyM)

    metrics = np.delete(metrics, np.where(emptyM))
    ref = np.delete(ref, np.where(emptyM))
    # NOTE(review): updList is not filtered here, so simlist can be longer than dat when trajectories are empty
    #updList = np.delete(updList, emptyM)

    if len(metrics) == 0:
        raise NameError('No trajectories were read')

    # Constructing a MetricData object
    if not update:
        data = MetricData(dat=metrics, ref=ref, map=map, simlist=updList)
    else:
        data = update
        data.dat.extend(metrics)
        data.ref.extend(ref)
        data.simList.extend(updList)

    # The most common frame step is stored; a warning is logged if the simulations disagree
    uqfsteps = np.unique(fstep)
    data.fstep = stats.mode(fstep).mode
    if len(uqfsteps) != 1:
        logger.warning('Multiple framesteps were read from the simulations. Taking the statistical mode: ' + str(data.fstep) + 'ns.')
        logger.warning('If it looks wrong, you can modify it by manually setting the MetricData.fstep property.')
    return data
def project(self):
    """ Applies all projections stored in Metric on all simulations.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data. If `self.simulations` is a single
        Molecule, a list with the result of each projection on it is returned instead.

    Raises
    ------
    RuntimeError
        If no projections have been set.
    """
    if len(self.projectionlist) == 0:
        raise RuntimeError('You need to provide projections using the Metric.set method.')

    # Projecting single Molecules
    if isinstance(self.simulations, Molecule):
        mol = self.simulations
        return [_project(proj, mol) for proj in self.projectionlist]

    nsims = len(self.simulations)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uqMol = None
    single, molfile = _singleMolfile(self.simulations)
    if not single:
        logger.warning(
            'Cannot calculate description of dimensions due to different topology files for each trajectory.')
    else:
        uqMol = Molecule(molfile)
        for proj in self.projectionlist:
            if isinstance(proj, Projection):
                proj._precalculate(uqMol)

    mapping = self.getMapping(uqMol)

    logger.debug('Metric: Starting projection of trajectories.')
    from htmd.config import _config
    aprun = ParallelExecutor(n_jobs=_config['ncpus'])
    runner = aprun(total=nsims, description='Projecting trajectories')
    results = runner(
        delayed(_processSim)(self.simulations[i], self.projectionlist, uqMol, self.skip)
        for i in range(nsims))

    # Every result holds (metric, ref, fstep, delete flag) for one simulation
    metrics = np.empty(nsims, dtype=object)
    ref = np.empty(nsims, dtype=object)
    deletesims = np.zeros(nsims, dtype=bool)
    fstep = np.zeros(nsims)
    for i, res in enumerate(results):
        metrics[i] = res[0]
        ref[i] = res[1]
        fstep[i] = res[2]
        deletesims[i] = res[3]

    logger.debug('Finished projecting the trajectories.')

    # Removing empty trajectories
    metrics, ref, updlist, fstep = self._removeEmpty(metrics, ref, deletesims, fstep)

    # Constructing a MetricData object
    data = MetricData(dat=metrics, ref=ref, description=mapping, simlist=updlist)

    uqfsteps = np.unique(fstep)
    data.fstep = float(stats.mode(fstep).mode)
    if len(uqfsteps) == 1:
        logger.info(
            'Frame step {}ns was read from the trajectories. If it looks wrong, redefine it by manually '
            'setting the MetricData.fstep property.'.format(data.fstep))
    else:
        logger.warning(
            'Multiple framesteps [{}] ns were read from the simulations. '
            'Taking the statistical mode: {}ns. '
            'If it looks wrong, you can modify it by manually '
            'setting the MetricData.fstep property.'.format(', '.join(map(str, uqfsteps)), data.fstep))

    return data
def project(self, njobs=None):
    """ Applies all projections stored in Metric on all simulations.

    Parameters
    ----------
    njobs : int
        Number of parallel jobs to spawn for projection of trajectories. Take care that this can use large
        amounts of memory as multiple trajectories are loaded at once. If None it will use the default from
        htmd.config.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data. If `self.simulations` is a single
        Molecule, a list with the result of each projection on it is returned instead.

    Raises
    ------
    RuntimeError
        If no projections have been set.
    """
    if len(self.projectionlist) == 0:
        raise RuntimeError("You need to provide projections using the Metric.set method.")

    # Projecting single Molecules
    if isinstance(self.simulations, Molecule):
        mol = self.simulations
        return [_project(proj, mol) for proj in self.projectionlist]

    nsims = len(self.simulations)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uqMol = None
    single, molfile = _singleMolfile(self.simulations)
    if not single:
        logger.warning(
            "Cannot calculate description of dimensions due to different topology files for each trajectory.")
    else:
        uqMol = Molecule(molfile)
        for proj in self.projectionlist:
            if isinstance(proj, Projection):
                proj._setCache(uqMol)

    mapping = self.getMapping(uqMol)

    logger.debug("Metric: Starting projection of trajectories.")
    from htmd.config import _config
    if njobs is None:
        njobs = _config["njobs"]
    aprun = ParallelExecutor(n_jobs=njobs)
    runner = aprun(total=nsims, desc="Projecting trajectories")
    results = runner(
        delayed(_processSim)(self.simulations[i], self.projectionlist, uqMol, self.skip)
        for i in range(nsims))

    # Every result holds (metric, ref, fstep, delete flag) for one simulation
    metrics = np.empty(nsims, dtype=object)
    ref = np.empty(nsims, dtype=object)
    deletesims = np.zeros(nsims, dtype=bool)
    fstep = np.zeros(nsims)
    for i, res in enumerate(results):
        metrics[i] = res[0]
        ref[i] = res[1]
        fstep[i] = res[2]
        deletesims[i] = res[3]

    logger.debug("Finished projecting the trajectories.")

    # Removing empty trajectories
    metrics, ref, updlist, fstep = self._removeEmpty(metrics, ref, deletesims, fstep)

    # Constructing a MetricData object
    data = MetricData(dat=metrics, ref=ref, description=mapping, simlist=updlist)

    uqfsteps = np.unique(fstep)
    if np.all(np.isnan(uqfsteps)):
        # Nothing usable was read: data.fstep is left untouched
        logger.warning(
            "No framestep could be read from the trajectories. Please manually set the MetricData.fstep"
            " property, otherwise calculations in Model and Kinetics classes can fail.")
        return data

    data.fstep = float(stats.mode(fstep).mode)
    if len(uqfsteps) != 1:
        logger.warning(
            "Multiple framesteps [{}] ns were read from the simulations. "
            "Taking the statistical mode: {}ns. "
            "If it looks wrong, you can modify it by manually "
            "setting the MetricData.fstep property.".format(", ".join(map(str, uqfsteps)), data.fstep))
    elif data.fstep == 0:
        logger.warning(
            "A framestep of 0 was read from the trajectories. Please manually set the MetricData.fstep"
            " property, otherwise calculations in Model and Kinetics classes can fail.")
    else:
        logger.info(
            "Frame step {}ns was read from the trajectories. If it looks wrong, redefine it by manually "
            "setting the MetricData.fstep property.".format(data.fstep))

    return data
def project(self):
    """ Applies all projections stored in Metric on all simulations.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data. If `self.simulations` is a single
        Molecule, a list with the result of each projection on it is returned instead.

    Raises
    ------
    NameError
        If no projections have been set or if no trajectory could be read.
    """
    if len(self.projectionlist) == 0:
        raise NameError('You need to provide projections using the Metric.projection method.')

    if isinstance(self.simulations, Molecule):
        return [proj.project(self.simulations) for proj in self.projectionlist]

    nsims = len(self.simulations)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uqMol = None
    mapping = []  # stays empty: the per-projection mapping is not collected in this version
    single, molfile = _singleMolfile(self.simulations)
    if single:
        uqMol = Molecule(molfile)
        for proj in self.projectionlist:
            proj._precalculate(uqMol)

    logger.info('Metric: Starting projection of trajectories.')
    metrics = np.empty(nsims, dtype=object)
    ref = np.empty(nsims, dtype=object)
    deletesims = np.zeros(nsims, dtype=bool)
    fstep = np.zeros(nsims)

    from htmd.config import _config
    pool = Parallel(n_jobs=_config['ncpus'], verbose=11)
    results = pool(
        delayed(_processSim)(self.simulations[i], self.projectionlist, uqMol, self.skip)
        for i in range(nsims))

    # Every result holds (metric, ref, fstep, delete flag) for one simulation
    for i, res in enumerate(results):
        metrics[i] = res[0]
        ref[i] = res[1]
        fstep[i] = res[2]
        deletesims[i] = res[3]

    logger.info('Finished projecting the trajectories.')

    # Removing empty trajectories
    emptyM = np.array([x is None for x in metrics], dtype=bool)
    emptyR = np.array([x is None for x in ref], dtype=bool)
    assert np.all(deletesims == emptyM) and np.all(emptyR == emptyM)

    dropidx = np.where(emptyM)[0]
    metrics = np.delete(metrics, dropidx)
    ref = np.delete(ref, dropidx)
    updlist = np.delete(self.simulations, dropidx)

    if len(metrics) == 0:
        raise NameError('No trajectories were read')

    # Constructing a MetricData object
    data = MetricData(dat=metrics, ref=ref, map=mapping, simlist=updlist)

    uqfsteps = np.unique(fstep)
    data.fstep = float(stats.mode(fstep).mode)
    if len(uqfsteps) == 1:
        logger.info('Frame step {}ns was read from the trajectories. If it looks wrong, redefine it by manually '
                    'setting the MetricData.fstep property.'.format(data.fstep))
    else:
        logger.warning('Multiple framesteps were read from the simulations. '
                       'Taking the statistical mode: ' + str(data.fstep) + 'ns. '
                       'If it looks wrong, you can modify it by manually setting the MetricData.fstep property.')

    return data
def project(self):
    """ Applies all projections stored in Metric on all simulations.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data. If `self.simulations` is a single
        Molecule, a list with the result of each projection on it is returned instead.

    Raises
    ------
    NameError
        If no projections have been set or if no trajectory could be read.
    """
    if len(self.projectionlist) == 0:
        raise NameError(
            'You need to provide projections using the Metric.projection method.'
        )

    if isinstance(self.simulations, Molecule):
        return [proj.project(self.simulations) for proj in self.projectionlist]

    nsims = len(self.simulations)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uqMol = None
    mapping = []  # stays empty: the per-projection mapping is not collected in this version
    single, molfile = _singleMolfile(self.simulations)
    if single:
        uqMol = Molecule(molfile)
        for proj in self.projectionlist:
            proj._precalculate(uqMol)

    logger.info('Metric: Starting projection of trajectories.')
    metrics = np.empty(nsims, dtype=object)
    ref = np.empty(nsims, dtype=object)
    deletesims = np.zeros(nsims, dtype=bool)
    fstep = np.zeros(nsims)

    from htmd.config import _config
    pool = Parallel(n_jobs=_config['ncpus'], verbose=11)
    results = pool(
        delayed(_processSim)(self.simulations[i], self.projectionlist, uqMol, self.skip)
        for i in range(nsims))

    # Every result holds (metric, ref, fstep, delete flag) for one simulation
    for i, res in enumerate(results):
        metrics[i] = res[0]
        ref[i] = res[1]
        fstep[i] = res[2]
        deletesims[i] = res[3]

    logger.info('Finished projecting the trajectories.')

    # Removing empty trajectories
    emptyM = np.array([x is None for x in metrics], dtype=bool)
    emptyR = np.array([x is None for x in ref], dtype=bool)
    assert np.all(deletesims == emptyM) and np.all(emptyR == emptyM)

    dropidx = np.where(emptyM)[0]
    metrics = np.delete(metrics, dropidx)
    ref = np.delete(ref, dropidx)
    updlist = np.delete(self.simulations, dropidx)

    if len(metrics) == 0:
        raise NameError('No trajectories were read')

    # Constructing a MetricData object
    data = MetricData(dat=metrics, ref=ref, map=mapping, simlist=updlist)

    uqfsteps = np.unique(fstep)
    data.fstep = float(stats.mode(fstep).mode)
    if len(uqfsteps) == 1:
        logger.info(
            'Frame step {}ns was read from the trajectories. If it looks wrong, redefine it by manually '
            'setting the MetricData.fstep property.'.format(data.fstep))
    else:
        logger.warning(
            'Multiple framesteps were read from the simulations. '
            'Taking the statistical mode: ' + str(data.fstep) + 'ns. '
            'If it looks wrong, you can modify it by manually setting the MetricData.fstep property.'
        )

    return data
def project(self):
    """ Applies all projections stored in Metric on all simulations.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data. If `self.simulations` is a single
        Molecule, a list with the result of each projection on it is returned instead.

    Raises
    ------
    RuntimeError
        If no projections have been set.
    """
    if len(self.projectionlist) == 0:
        raise RuntimeError('You need to provide projections using the Metric.set method.')

    # Projecting single Molecules
    if isinstance(self.simulations, Molecule):
        mol = self.simulations
        return [_project(proj, mol) for proj in self.projectionlist]

    nsims = len(self.simulations)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uqMol = None
    single, molfile = _singleMolfile(self.simulations)
    if not single:
        logger.warning('Cannot calculate description of dimensions due to different topology files for each trajectory.')
    else:
        uqMol = Molecule(molfile)
        for proj in self.projectionlist:
            if isinstance(proj, Projection):
                proj._precalculate(uqMol)

    mapping = self.getMapping(uqMol)

    logger.debug('Metric: Starting projection of trajectories.')
    from htmd.config import _config
    aprun = ParallelExecutor(n_jobs=_config['ncpus'])
    runner = aprun(total=nsims, description='Projecting trajectories')
    results = runner(delayed(_processSim)(self.simulations[i], self.projectionlist, uqMol, self.skip)
                     for i in range(nsims))

    # Every result holds (metric, ref, fstep, delete flag) for one simulation
    metrics = np.empty(nsims, dtype=object)
    ref = np.empty(nsims, dtype=object)
    deletesims = np.zeros(nsims, dtype=bool)
    fstep = np.zeros(nsims)
    for i, res in enumerate(results):
        metrics[i] = res[0]
        ref[i] = res[1]
        fstep[i] = res[2]
        deletesims[i] = res[3]

    logger.debug('Finished projecting the trajectories.')

    # Removing empty trajectories
    metrics, ref, updlist, fstep = self._removeEmpty(metrics, ref, deletesims, fstep)

    # Constructing a MetricData object
    data = MetricData(dat=metrics, ref=ref, description=mapping, simlist=updlist)

    uqfsteps = np.unique(fstep)
    data.fstep = float(stats.mode(fstep).mode)
    if len(uqfsteps) == 1:
        logger.info('Frame step {}ns was read from the trajectories. If it looks wrong, redefine it by manually '
                    'setting the MetricData.fstep property.'.format(data.fstep))
    else:
        logger.warning('Multiple framesteps [{}] ns were read from the simulations. '
                       'Taking the statistical mode: {}ns. '
                       'If it looks wrong, you can modify it by manually '
                       'setting the MetricData.fstep property.'.format(', '.join(map(str, uqfsteps)), data.fstep))

    return data
def _metrify(self, sims, skip, verbose, update):
    """ Takes a set of trajectory folders and projects all trajectories within them onto the given space defined by the Metric* class.

    Parameters
    ----------
    sims : numpy list of structs
        A list of structs produced by the simList function. If a single Molecule is passed instead,
        the result of `self.processTraj` on it is returned directly.
    skip : int
        Skips every x frames.
    verbose : int
        Verbosity toggle. Not used inside this method; kept for interface compatibility.
    update : MetricData object
        Provide a previous MetricData object and only metrify new trajectories.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data and the ref data.

    Raises
    ------
    NameError
        If no trajectory produced any data.
    """
    if isinstance(sims, Molecule):
        return self.processTraj(sims)

    updList = sims
    numSim = len(updList)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uniqueMol = 0
    uqMol = []
    mapping = []  # local renamed from `map` so the builtin is not shadowed
    (single, molfile) = _singleMolfile(updList)
    if single:
        uniqueMol = 1
        uqMol = Molecule(molfile)
        # Calculating the mapping of metric columns to atom pair indices
        mapping = self._getMapping(uqMol)

    logger.info('Metric: Starting projection of trajectories.')
    metrics = np.empty(numSim, dtype=object)
    ref = np.empty(numSim, dtype=object)
    deleteSims = np.zeros(numSim, dtype=bool)
    # Zero-initialised so that slots of skipped simulations never hold uninitialised memory
    fstep = np.zeros(numSim)

    # The return value is discarded and the output arrays are passed in, so the workers
    # presumably fill them in place (threading backend shares memory) - TODO confirm in _processSimPyemma
    Parallel(n_jobs=6, backend="threading")(
        delayed(_processSimPyemma)(self, i, updList, uniqueMol, uqMol, skip, deleteSims, metrics, ref, fstep)
        for i in range(numSim))

    logger.info('Finished projecting the trajectories.')

    # Removing empty trajectories. Slots that were never filled are still None, and
    # np.size(None) == 1, so None has to be tested for explicitly.
    emptyM = np.array([x is None or np.size(x) == 0 for x in metrics], dtype=bool)
    emptyIdx = np.where(emptyM)[0]
    metrics = np.delete(metrics, emptyIdx)
    ref = np.delete(ref, emptyIdx)
    # Keep the simulation list and frame steps aligned with the surviving trajectories
    updList = np.delete(updList, emptyIdx)
    fstep = np.delete(fstep, emptyIdx)

    if len(metrics) == 0:
        raise NameError('No trajectories were read')

    # Constructing a MetricData object
    if not update:
        data = MetricData(dat=metrics, ref=ref, map=mapping, simlist=updList)
    else:
        # NOTE(review): assumes dat/ref/simList on the passed object support extend() - confirm
        data = update
        data.dat.extend(metrics)
        data.ref.extend(ref)
        data.simList.extend(updList)

    uqfsteps = np.unique(fstep)
    # stats.mode returns a scalar or a 1-element array depending on the SciPy version; store a plain float
    data.fstep = float(np.ravel(stats.mode(fstep).mode)[0])
    if len(uqfsteps) != 1:
        logger.warning(
            'Multiple framesteps were read from the simulations. Taking the statistical mode: ' + str(data.fstep) + 'ns.')
        logger.warning(
            'If it looks wrong, you can modify it by manually setting the MetricData.fstep property.'
        )

    return data
def _metrify(self, sims, skip, update):
    """ Takes a set of trajectory folders and projects all trajectories within them onto the given space defined by the Metric* class.

    Parameters
    ----------
    sims : numpy list of structs
        A list of structs produced by the simList function. If a single Molecule is passed instead,
        the result of `self._processTraj` on it is returned directly.
    skip : int
        Skips every x frames.
    update : MetricData object
        Provide a previous MetricData object and only metrify new trajectories.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data and the ref data.

    Raises
    ------
    NameError
        If no trajectory produced any data.
    """
    if isinstance(sims, Molecule):
        return self._processTraj(sims)

    updList = sims
    numSim = len(updList)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uniqueMol = 0
    uqMol = []
    mapping = []  # local renamed from `map` so the builtin is not shadowed
    (single, molfile) = _singleMolfile(updList)
    if single:
        uniqueMol = 1
        uqMol = Molecule(molfile)
        # Calculating the mapping of metric columns to atom pair indices
        mapping = self._getMapping(uqMol)

    logger.info('Metric: Starting projection of trajectories.')
    metrics = np.empty(numSim, dtype=object)
    ref = np.empty(numSim, dtype=object)
    deletesims = np.zeros(numSim, dtype=bool)
    fstep = np.zeros(numSim)

    from htmd.config import _config
    results = Parallel(n_jobs=_config['ncpus'], verbose=11)(
        delayed(_processSimOld)(self, i, updList, uniqueMol, uqMol, skip, deletesims, metrics, ref, fstep)
        for i in range(numSim))

    # Each result is indexed as (metrics, ref, fstep, delete-flag)
    for i, res in enumerate(results):
        metrics[i] = res[0]
        ref[i] = res[1]
        fstep[i] = res[2]
        deletesims[i] = res[3]

    logger.info('Finished projecting the trajectories.')

    # Removing empty trajectories
    emptyM = np.array([x is None for x in metrics], dtype=bool)
    emptyR = np.array([x is None for x in ref], dtype=bool)
    # Internal consistency check: the delete flags must agree with the missing metrics and refs
    assert np.all(deletesims == emptyM) and np.all(emptyR == emptyM)

    emptyIdx = np.where(emptyM)[0]
    metrics = np.delete(metrics, emptyIdx)
    ref = np.delete(ref, emptyIdx)
    updList = np.delete(updList, emptyIdx)
    # Drop the frame steps of removed simulations too, so they cannot skew the mode
    # or trigger a spurious "multiple framesteps" warning below.
    fstep = np.delete(fstep, emptyIdx)

    if len(metrics) == 0:
        raise NameError('No trajectories were read')

    # Constructing a MetricData object
    if not update:
        data = MetricData(dat=metrics, ref=ref, map=mapping, simlist=updList)
    else:
        data = update
        data.dat.extend(metrics)
        data.ref.extend(ref)
        data.simList.extend(updList)  # This is wrong but we don't use update anyways

    uqfsteps = np.unique(fstep)
    # stats.mode returns a scalar or a 1-element array depending on the SciPy version; store a plain float
    data.fstep = float(np.ravel(stats.mode(fstep).mode)[0])
    if len(uqfsteps) != 1:
        logger.warning('Multiple framesteps were read from the simulations. Taking the statistical mode: ' + str(data.fstep) + 'ns. If it looks wrong, you can modify it by manually setting the MetricData.fstep property.')
    else:
        logger.info('Frame step {}ns was read from the trajectories. If it looks wrong, redefine it by manually setting the MetricData.fstep property.'.format(data.fstep))

    return data