def __init__(self, data, lag, units='frames'):
    """ Fits a TICA model on `data` with the given lag time.

    Parameters
    ----------
    data : Metric or data object exposing `dat` and `fstep`
        If a `Metric` is given, its simulations are projected on the fly and passed
        one by one to the estimator through `partial_fit`. Otherwise `data.dat` is
        fitted in a single call.
    lag : int
        The lag time, expressed in `units`.
    units : str
        The units of `lag`. Only 'frames' is accepted when `data` is a `Metric`.

    Raises
    ------
    RuntimeError
        If `data` is a `Metric` and `units` is not 'frames', or if converting `lag`
        to frames gives 0.
    """
    self.data = data
    if isinstance(data, Metric):
        if units != 'frames':
            raise RuntimeError(
                'Cannot use delayed projection TICA with units other than frames for now. '
                'Report this to HTMD issues.')
        from pyemma.coordinates.transform.tica import TICA
        self.tic = TICA(lag)
        metr = data
        p = ProgressBar(len(metr.simulations))
        for proj in _projectionGenerator(metr, _getNcpus()):
            for pro in proj:
                # The generator can yield None entries (`project` records them as
                # dropped simulations); indexing one would raise a TypeError.
                if pro is None:
                    continue
                self.tic.partial_fit(pro[0])
            p.progress(len(proj))
        p.stop()
    else:
        from pyemma.coordinates import tica
        lag = unitconvert(units, 'frames', lag, data.fstep)
        if lag == 0:
            raise RuntimeError(
                'Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')
        self.tic = tica(data.dat.tolist(), lag=lag)
def __init__(self, data, lag, units='frames', dimensions=None):
    """ Fits a TICA model on `data`, optionally restricted to a subset of its columns.

    Parameters
    ----------
    data : Metric or data object exposing `dat` and `fstep`
        A `Metric` is projected on the fly and fitted incrementally with `partial_fit`.
        Any other object has its `dat` trajectories fitted in one `fit` call.
    lag : int
        The lag time, expressed in `units`.
    units : str
        The units of `lag`. Only 'frames' is accepted when `data` is a `Metric`.
    dimensions : index or None
        Column selector applied as `traj[:, dimensions]` before fitting. If None, all
        columns are used.

    Raises
    ------
    RuntimeError
        If `data` is a `Metric` and `units` is not 'frames', or if converting `lag`
        to frames gives 0.
    """
    from pyemma.coordinates.transform.tica import TICA as TICApyemma

    self.data = data
    self.dimensions = dimensions
    subselect = self.dimensions is not None

    if not isinstance(data, Metric):
        # In-memory TICA: everything is fitted in a single call
        nframes = unitconvert(units, 'frames', lag, data.fstep)
        if nframes == 0:
            raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')
        self.tic = TICApyemma(nframes)
        if subselect:
            # Fit on copies of the selected columns only
            trajectories = [traj[:, self.dimensions].copy() for traj in data.dat]
        else:
            trajectories = data.dat.tolist()
        self.tic.fit(trajectories)
        return

    # Memory efficient TICA: trajectories are projected on the fly
    if units != 'frames':
        raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
    self.tic = TICApyemma(lag)
    progress = ProgressBar(len(data.simulations))
    for batch in _projectionGenerator(data, _getNcpus()):
        for item in batch:
            if item is None:
                continue
            coords = item[0][:, self.dimensions] if subselect else item[0]
            self.tic.partial_fit(coords)
        progress.progress(len(batch))
    progress.stop()
def _calculateDirectedComponent(self, sims, St, N):
    """ Accumulates the goal function per state and divides the totals by `N`.

    A `Metric` is built over `sims` (with `skip=self.skip`) using `self.goalprojection`.
    For every projected simulation the flattened output of `self.goalfunction` is summed
    per state with `np.bincount`, using `St[k]` as the state index of each value.

    Parameters
    ----------
    sims : simulations passed to `Metric`
    St : sequence of integer arrays
        `St[k]` holds the state indices used to bin the goal values of the k-th
        projected simulation.
    N : array-like
        Its length sets the number of states; the per-state sums are divided by it.
        Presumably the per-state frame counts -- confirm against the caller.

    Returns
    -------
    np.ndarray
        The per-state sums of the goal function divided element-wise by `N`.
    """
    goalmetric = Metric(sims, skip=self.skip)
    goalmetric.set(self.goalprojection)

    totals = np.zeros(len(N))
    simidx = 0
    for batch in _projectionGenerator(goalmetric, _getNcpus()):
        for projected in batch:
            states = St[simidx]
            # bincount only returns max(states) + 1 bins, so add into the matching prefix
            upper = np.max(states) + 1
            totals[:upper] += np.bincount(states, self.goalfunction(projected[0]).flatten())
            simidx += 1
    return totals / N
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

    Parameters
    ----------
    ndim : int
        The number of TICA dimensions we want to project the data on. If None is given it will choose a
        number of dimensions to cover 95% of the kinetic variance.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)
    >>> dataTica = tica.project(5)
    """
    if ndim is not None:
        # self.tic._dim = ndim  # Old way of doing it. Deprecated since pyEMMA 2.1
        self.tic.set_params(dim=ndim)  # Change to this in 2.1 pyEMMA version

    if isinstance(self.data, Metric):  # Doesn't project on correct number of dimensions
        proj = []
        refs = []
        fstep = None
        metr = self.data
        p = ProgressBar(len(metr.simulations))
        k = -1
        droppedsims = []
        for projecteddata in _projectionGenerator(metr, _getNcpus()):
            for pro in projecteddata:
                k += 1
                if pro is None:
                    # Remember the position so the simulation can be removed from simlist
                    droppedsims.append(k)
                    continue
                proj.append(self.tic.transform(pro[0]))
                refs.append(pro[1])
                if fstep is None:
                    fstep = pro[2]
            p.progress(len(projecteddata))
        p.stop()
        simlist = np.delete(self.data.simulations, droppedsims)
        ref = np.array(refs, dtype=object)
        parent = None
    else:
        proj = self.tic.get_output()
        simlist = self.data.simlist
        ref = self.data.ref
        fstep = self.data.fstep
        parent = self.data

    if ndim is None:
        # Resolve the dimension count chosen by the estimator; range(ndim) below
        # would otherwise fail with a TypeError on None.
        ndim = self.tic.dimension()
        logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

    from htmd.metricdata import MetricData
    datatica = MetricData(dat=np.array(proj, dtype=object), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

    from pandas import DataFrame
    # One mapping row per TICA dimension
    datatica.map = DataFrame({
        'type': ['tica'] * ndim,
        'indexes': [-1] * ndim,
        'description': ['TICA dimension {}'.format(i + 1) for i in range(ndim)]
    })
    return datatica
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

    If the model was fitted on a subset of dimensions (`self.dimensions`), the remaining
    columns are kept unprojected and placed in front of the TICA columns.

    Parameters
    ----------
    ndim : int
        The number of TICA dimensions we want to project the data on. If None is given it will choose a
        number of dimensions to cover 95% of the kinetic variance.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

    Raises
    ------
    RuntimeError
        If the data is held in memory and `ndim` is larger than its number of dimensions.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)
    >>> dataTica = tica.project(5)
    """
    from tqdm import tqdm

    if ndim is not None:
        self.tic.set_params(dim=ndim)

    keepdata = []
    keepdim = None
    keepdimdesc = None
    if isinstance(self.data, Metric):  # Memory efficient TICA projecting trajectories on the fly
        proj = []
        refs = []
        fstep = None
        metr = self.data
        k = -1
        droppedsims = []
        pbar = tqdm(total=len(metr.simulations))
        for projecteddata in _projectionGenerator(metr, _getNcpus()):
            for pro in projecteddata:
                k += 1
                if pro is None:
                    # Remember the position so the simulation can be removed from simlist
                    droppedsims.append(k)
                    continue
                if self.dimensions is not None:
                    numDimensions = pro[0].shape[1]
                    keepdim = np.setdiff1d(range(numDimensions), self.dimensions)
                    keepdata.append(pro[0][:, keepdim])
                    # Sub-select dimensions for projecting
                    proj.append(self.tic.transform(pro[0][:, self.dimensions]).astype(np.float32))
                else:
                    proj.append(self.tic.transform(pro[0]).astype(np.float32))
                refs.append(pro[1])
                if fstep is None:
                    fstep = pro[2]
            pbar.update(len(projecteddata))
        pbar.close()
        simlist = np.delete(self.data.simulations, droppedsims)
        ref = np.array(refs, dtype=object)
        parent = None
        if self.dimensions is not None:
            from htmd.projections.metric import _singleMolfile
            from htmd.molecule.molecule import Molecule
            (single, molfile) = _singleMolfile(metr.simulations)
            if single:
                keepdimdesc = metr.getMapping(Molecule(molfile))
                keepdimdesc = keepdimdesc.iloc[keepdim]
    else:
        if ndim is not None and self.data.numDimensions < ndim:
            raise RuntimeError(
                'TICA cannot increase the dimensionality of your data. Your data has {} dimensions and you requested {} TICA dimensions'
                .format(self.data.numDimensions, ndim))

        if self.dimensions is not None:
            keepdim = np.setdiff1d(range(self.data.numDimensions), self.dimensions)
            keepdata = [x[:, keepdim] for x in self.data.dat]
            if self.data.description is not None:
                keepdimdesc = self.data.description.iloc[keepdim]
        proj = self.tic.get_output()
        simlist = self.data.simlist
        ref = self.data.ref
        fstep = self.data.fstep
        parent = self.data

    # If TICA is done on a subset of dimensions, combine non-projected data with projected data
    if self.dimensions is not None:
        proj = [np.hstack((kept, projected)) for kept, projected in zip(keepdata, proj)]

    if ndim is None:
        ndim = self.tic.dimension()
        logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

    from htmd.metricdata import MetricData
    datatica = MetricData(dat=np.array(proj), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

    from pandas import DataFrame, concat
    # One description row per TICA dimension
    datatica.description = DataFrame({
        'type': ['tica'] * ndim,
        'atomIndexes': [-1] * ndim,
        'description': ['TICA dimension {}'.format(i + 1) for i in range(ndim)]
    })

    if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
        # Kept rows go first to match the column order produced by hstack above.
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent call.
        datatica.description = concat([keepdimdesc, datatica.description], ignore_index=True)

    return datatica
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

    If the model was fitted on a subset of dimensions (`self.dimensions`), the remaining
    columns are kept unprojected and placed in front of the TICA columns.

    Parameters
    ----------
    ndim : int
        The number of TICA dimensions we want to project the data on. If None is given it will choose a
        number of dimensions to cover 95% of the kinetic variance.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

    Raises
    ------
    RuntimeError
        If the data is held in memory and `ndim` is larger than its number of dimensions.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)
    >>> dataTica = tica.project(5)
    """
    if ndim is not None:
        self.tic.set_params(dim=ndim)

    keepdata = []
    keepdim = None
    keepdimdesc = None
    if isinstance(self.data, Metric):  # Memory efficient TICA projecting trajectories on the fly
        proj = []
        refs = []
        fstep = None
        metr = self.data
        p = ProgressBar(len(metr.simulations))
        k = -1
        droppedsims = []
        for projecteddata in _projectionGenerator(metr, _getNcpus()):
            for pro in projecteddata:
                k += 1
                if pro is None:
                    # Remember the position so the simulation can be removed from simlist
                    droppedsims.append(k)
                    continue
                if self.dimensions is not None:
                    numDimensions = pro[0].shape[1]
                    keepdim = np.setdiff1d(range(numDimensions), self.dimensions)
                    keepdata.append(pro[0][:, keepdim])
                    # Sub-select dimensions for projecting
                    proj.append(self.tic.transform(pro[0][:, self.dimensions]).astype(np.float32))
                else:
                    proj.append(self.tic.transform(pro[0]).astype(np.float32))
                refs.append(pro[1])
                if fstep is None:
                    fstep = pro[2]
            p.progress(len(projecteddata))
        p.stop()
        simlist = np.delete(self.data.simulations, droppedsims)
        ref = np.array(refs, dtype=object)
        parent = None
        if self.dimensions is not None:
            from htmd.projections.metric import _singleMolfile
            from htmd.molecule.molecule import Molecule
            (single, molfile) = _singleMolfile(metr.simulations)
            if single:
                keepdimdesc = metr.getMapping(Molecule(molfile))
                keepdimdesc = keepdimdesc.iloc[keepdim]
    else:
        if ndim is not None and self.data.numDimensions < ndim:
            raise RuntimeError('TICA cannot increase the dimensionality of your data. Your data has {} dimensions and you requested {} TICA dimensions'.format(self.data.numDimensions, ndim))

        if self.dimensions is not None:
            keepdim = np.setdiff1d(range(self.data.numDimensions), self.dimensions)
            keepdata = [x[:, keepdim] for x in self.data.dat]
            if self.data.description is not None:
                keepdimdesc = self.data.description.iloc[keepdim]
        proj = self.tic.get_output()
        simlist = self.data.simlist
        ref = self.data.ref
        fstep = self.data.fstep
        parent = self.data

    # If TICA is done on a subset of dimensions, combine non-projected data with projected data
    if self.dimensions is not None:
        proj = [np.hstack((kept, projected)) for kept, projected in zip(keepdata, proj)]

    if ndim is None:
        ndim = self.tic.dimension()
        logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

    from htmd.metricdata import MetricData
    datatica = MetricData(dat=np.array(proj), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

    from pandas import DataFrame, concat
    # One description row per TICA dimension
    datatica.description = DataFrame({
        'type': ['tica'] * ndim,
        'atomIndexes': [-1] * ndim,
        'description': ['TICA dimension {}'.format(i + 1) for i in range(ndim)]
    })

    if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
        # Kept rows go first to match the column order produced by hstack above.
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent call.
        datatica.description = concat([keepdimdesc, datatica.description], ignore_index=True)

    return datatica