def _circularFingerprintsClustering(rdkit_mols, radius=2):
    """
    Build the pairwise distance matrix from Morgan (circular) fingerprints.

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects
    radius: int
        The radius handed to ``AllChem.GetMorganFingerprint``
        Default: 2

    Returns
    -------
    dicematrix: np.array
        Array built from the per-molecule results of ``DiceDistances``, where each
        fingerprint is compared against the full list of fingerprints
    """
    from rdkit.Chem import AllChem

    # Compute one circular fingerprint per molecule
    fingerprints = [AllChem.GetMorganFingerprint(mol, radius) for mol in rdkit_mols]

    # One task per fingerprint; every task receives the complete fingerprint list
    executor = ParallelExecutor(n_jobs=-1)
    run = executor(total=len(fingerprints), desc='CircularFingerprints Distance')
    rows = run(delayed(DiceDistances)(fp, fingerprints) for fp in fingerprints)

    return np.array(rows)
def _pathFingerprintsClustering(rdkit_mols):
    """
    Build the pairwise distance matrix from RDKit path fingerprints.

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    tanimotomatrix: np.array
        Array built from the per-molecule results of ``TanimotoDistances``, where each
        fingerprint is compared against the full list of fingerprints
    """
    from rdkit.Chem.Fingerprints import FingerprintMols

    # Compute one path fingerprint per molecule (tqdm shows the progress)
    fingerprints = [FingerprintMols.FingerprintMol(mol) for mol in tqdm(rdkit_mols)]

    # One task per fingerprint; every task receives the complete fingerprint list
    executor = ParallelExecutor(n_jobs=-1)
    run = executor(total=len(fingerprints), desc='PathFingerprints Distance')
    rows = run(delayed(TanimotoDistances)(fp, fingerprints) for fp in fingerprints)

    return np.array(rows)
def _torsionsFingerprintsClustering(rdkit_mols):
    """
    Build the pairwise distance matrix from hashed topological-torsion fingerprints.

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    dicematrix: np.array
        Array built from the per-molecule results of ``DiceDistances``, where each
        fingerprint is compared against the full list of fingerprints
    """
    from rdkit.Chem.AtomPairs import Torsions

    # Compute one topological-torsion fingerprint per molecule (tqdm shows the progress)
    fingerprints = [
        Torsions.GetHashedTopologicalTorsionFingerprint(mol) for mol in tqdm(rdkit_mols)
    ]

    # One task per fingerprint; every task receives the complete fingerprint list
    executor = ParallelExecutor(n_jobs=-1)
    run = executor(total=len(fingerprints), desc='TorsionsFingerprints Distance')
    rows = run(delayed(DiceDistances)(fp, fingerprints) for fp in fingerprints)

    return np.array(rows)
def calculate(self):
    """
    Compute the residue-by-residue mutual information matrix.

    ``self._parallelAll`` is run in parallel once per dihedral dimension. For every
    returned (counts, pair) entry whose two dihedrals differ, the mutual information
    computed by ``self._calcMutualInfo`` is written into the cell addressed by the
    residues of the two dihedrals. The matrix is finally passed through
    ``self._cleanautocorrelations`` and stored in ``self.mi_matrix``.
    """
    from htmd.config import _config
    from htmd.parallelprogress import ParallelExecutor

    n_dihedrals = self.chi.numDimensions
    stationary = self.model.msm.stationary_distribution
    all_states = np.concatenate(self.model.data.St)
    micro_per_frame = self.model.micro_ofcluster[all_states]

    executor = ParallelExecutor(n_jobs=_config['ncpus'])
    run = executor(total=n_dihedrals, desc='Calculating MI')
    results = run(
        delayed(self._parallelAll)(n_dihedrals, idx, 4, self.model.micronum,
                                   self.bindihcat, micro_per_frame, stationary)
        for idx in range(n_dihedrals)
    )

    def residue_of(dihedral):
        # Map a dihedral to a matrix index through the resid of its first atom
        first_atom = self.chi.description.atomIndexes[dihedral][0]
        return self.residmap[self.mol.resid[first_atom]]

    n_residues = len(self.resids)
    mi_all = np.zeros((n_residues, n_residues))
    for result in results:
        counts_list, pair_list = result[0], result[1]
        for counts, (first, second) in zip(counts_list, pair_list):
            # Pairs of a dihedral with itself are not written to the matrix
            if first == second:
                continue
            row = residue_of(first)
            col = residue_of(second)
            mi_all[row][col] = self._calcMutualInfo(counts)

    self.mi_matrix = self._cleanautocorrelations(mi_all)
def _maccsClustering(rdkit_mols):
    """
    Build the pairwise distance matrix from MACCS keys.

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    tanimotomatrix: np.array
        Array built from the per-molecule results of ``TanimotoDistances``, where each
        set of MACCS keys is compared against the full list
    """
    from rdkit.Chem import MACCSkeys

    # Compute the MACCS keys of every molecule (tqdm shows the progress)
    fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in tqdm(rdkit_mols)]

    # One task per fingerprint; every task receives the complete fingerprint list
    executor = ParallelExecutor(n_jobs=-1)
    run = executor(total=len(fingerprints), desc='MACCS Distance')
    rows = run(delayed(TanimotoDistances)(fp, fingerprints) for fp in fingerprints)

    return np.array(rows)
def _writeInputs(self, simsframes, epoch=None):
    """
    Write the input directories of an epoch in parallel.

    Parameters
    ----------
    simsframes : sequence
        Items to write; each one is passed, together with its position in the
        sequence, to ``_writeInputsFunction``.
    epoch : int, optional
        Epoch to write. If None, ``self._getEpoch() + 1`` is used.

    Raises
    ------
    NameError
        If anything matching ``e<epoch>*`` already exists inside ``self.inputpath``.
    """
    if epoch is None:
        epoch = self._getEpoch() + 1

    # Refuse to write into an epoch that already has input directories
    existing = glob(path.join(self.inputpath, 'e{}*'.format(epoch)))
    if existing:
        raise NameError('Input dirs of epoch {} already exists.'.format(epoch))

    from htmd.parallelprogress import ParallelExecutor
    from htmd.config import _config
    from joblib import delayed

    executor = ParallelExecutor(n_jobs=_config['njobs'])
    run = executor(total=len(simsframes), desc='Writing inputs')
    run(
        delayed(_writeInputsFunction)(index, frame, epoch, self.inputpath, self.coorname)
        for index, frame in enumerate(simsframes)
    )
def simfilter(sims, outfolder, filtersel, njobs=None):
    """Filters a list of simulations generated by :func:`simlist`

    Takes a list of simulations produced by `simList` and writes, into a new directory,
    trajectories which contain only the selected atoms.

    Parameters
    ----------
    sims : list
        A simulation list produced by the `simList` function
    outfolder : str
        The folder in which to write the modified trajectories. Created if missing.
    filtersel : str
        Atom selection string describing the atoms we want to keep.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    njobs : int
        Number of parallel jobs to spawn for filtering of trajectories. If None it will use the
        default from htmd.config.

    Returns
    -------
    fsims : np.ndarray of :class:`Sim <htmd.simlist.Sim>` objects
        A list of filtered simulations

    Example
    -------
    >>> sims  = simlist(glob('data/*/'), glob('input/*/structure.pdb'))
    >>> fsims = simfilter(sims, 'filtered', filtersel='not water')
    """
    if not path.exists(outfolder):
        makedirs(outfolder)

    num_sims = len(sims)
    # The topology is filtered once, using the first simulation of the list
    if num_sims > 0:
        _filterTopology(sims[0], outfolder, filtersel)

    logger.debug("Starting filtering of simulations.")

    from htmd.config import _config
    from htmd.parallelprogress import ParallelExecutor, delayed

    if njobs is None:
        njobs = _config["njobs"]
    executor = ParallelExecutor(n_jobs=njobs)
    run = executor(total=num_sims, desc="Filtering trajectories")
    filtered = run(
        delayed(_filtSim)(idx, sims, outfolder, filtersel) for idx in range(num_sims)
    )

    logger.debug("Finished filtering of simulations")
    return np.array(filtered)
def simfilter(sims, outfolder, filtersel):
    """ Filters a list of simulations generated by :func:`simlist`

    Takes a list of simulations produced by `simList` and writes, into a new directory,
    trajectories which contain only the selected atoms.

    Parameters
    ----------
    sims : list
        A simulation list produced by the `simList` function
    outfolder : str
        The folder in which to write the modified trajectories. Created if missing.
    filtersel : str
        An atomselection string describing the atoms we want to keep

    Returns
    -------
    fsims : np.ndarray of :class:`Sim <htmd.simlist.Sim>` objects
        A list of filtered simulations

    Example
    -------
    >>> sims  = simlist(glob('data/*/'), glob('input/*/structure.pdb'))
    >>> fsims = simfilter(sims, 'filtered', filtersel='not water')
    """
    if not path.exists(outfolder):
        makedirs(outfolder)

    num_sims = len(sims)
    # The topology is filtered once, using the first simulation of the list
    if num_sims > 0:
        _filterTopology(sims[0], outfolder, filtersel)

    logger.debug('Starting filtering of simulations.')

    from htmd.config import _config
    from htmd.parallelprogress import ParallelExecutor, delayed

    executor = ParallelExecutor(n_jobs=_config['ncpus'])
    run = executor(total=num_sims, description='Filtering trajectories')
    filtered = run(
        delayed(_filtSim)(idx, sims, outfolder, filtersel) for idx in range(num_sims)
    )

    logger.debug('Finished filtering of simulations')
    return np.array(filtered)
def parallelTest(self, simlen, ntraj, startFrames=None):
    """
    Run ``_pickFromMicro`` in parallel, once per starting frame.

    Parameters
    ----------
    simlen
        Forwarded to ``self._startingFrames`` (when ``startFrames`` is None) and to
        every ``_pickFromMicro`` call.
    ntraj
        Forwarded to ``self._startingFrames``; only used when ``startFrames`` is None.
    startFrames : optional
        Starting frames. If None they are obtained from ``self._startingFrames``,
        otherwise the given value is passed through ``self._convertRelFrames``.

    Returns
    -------
    ret
        The result of the parallel execution. It is also added to ``self.reference``
        with ``+=``.
    """
    if startFrames is None:
        startFrames = self._startingFrames(ntraj, startFrames, simlen)
    else:
        startFrames = self._convertRelFrames(startFrames)

    from joblib import delayed
    from htmd.parallelprogress import ParallelExecutor

    # The cumulative trajectory lengths are identical for every task, so compute them
    # once instead of once per starting frame inside the generator.
    cumulative_lengths = np.cumsum(self.fulldata.trajLengths)

    aprun = ParallelExecutor(n_jobs=-2)
    ret = aprun(total=len(startFrames))(
        delayed(_pickFromMicro)(
            relFrame,
            simlen,
            cumulative_lengths,
            self.fulldata.trajectories,
            self.cluster2micro,
            self.micro2cluster,
            self.micronum,
            self.P,
            self.stconcat,
        )
        for relFrame in startFrames
    )
    self.reference += ret
    return ret
def cluster(smallmol_list, method, distThresholds=0.2, returnDetails=True, removeHs=True):
    """
    Return the SmallMol objects grouped by cluster, using DBSCAN on a precomputed distance matrix.
    It can also return the details of the clusters computed.

    Parameters
    ----------
    smallmol_list: list
        The list of htmd.smallmol.smallmol.SmallMol objects. The objects are copied; the
        originals are not modified.
    method: str
        The cluster methods. Can be  ['maccs', 'pathFingerprints', 'atomsFingerprints', 'torsionsFingerprints',
        'circularFingerprints', 'shape', 'mcs']
    distThresholds: float
        The distance cutoff for the clusters (passed to DBSCAN as ``eps``)
        Default: 0.2
    returnDetails: bool
        If True, the cluster details are also returned
        Default: True
    removeHs: bool
        If True, the hydrogens are removed from the copies before clustering
        Default: True

    Returns
    -------
    clusters: np.ndarray
        Object array with one entry per cluster; each entry holds the SmallMol objects
        belonging to that cluster
    details: dict
        Only returned if ``returnDetails`` is True. Keys: 'numClusters' (number of clusters),
        'populations' (number of molecules per cluster) and 'clusters' (per cluster, the
        indexes of its molecules in the input list)

    Raises
    ------
    ValueError
        If ``method`` is not one of the available methods.

    Notes
    -----
    Molecules which cannot be converted to RDKit molecules are reported on stdout and
    left out of the clustering.
    """
    from sklearn.cluster import DBSCAN
    import sys
    this_module = sys.modules[__name__]

    _methods = [
        'maccs', 'pathFingerprints', 'atomsFingerprints',
        'torsionsFingerprints', 'circularFingerprints', 'shape', 'mcs'
    ]

    if method not in _methods:
        raise ValueError(
            'The method provided {} does not exists. The ones available are the following: {}'
            .format(method, _methods))

    smallmol_list = np.array([sm.copy() for sm in smallmol_list])

    if removeHs:
        stripped = []
        for sm in smallmol_list:
            builder = Builder(sm)
            builder.removeHydrogens()
            stripped.append(builder.getSmallMol())
        smallmol_list = np.array(stripped)

    # Convert to RDKit molecules, remembering which input positions succeeded so that
    # the DBSCAN labels (one per converted molecule) can be mapped back to the input.
    rdkitMols_list = []
    valid = []
    wrong = []
    for n, sm in enumerate(smallmol_list):
        try:
            rdkit_mol = sm.toRdkitMol(includeConformer=True)
        except Exception:
            wrong.append(n)
        else:
            rdkitMols_list.append(rdkit_mol)
            valid.append(n)
    print('{} problematic molecules. Indexes: {}'.format(len(wrong), wrong))
    valid = np.array(valid, dtype=int)

    clustmethod = getattr(this_module, '_{}Clustering'.format(method))

    if method not in ['shape', 'mcs']:
        # These helpers build the whole matrix themselves
        matrix = clustmethod(rdkitMols_list)
    else:
        # These helpers compute one row at a time, so the rows are computed in parallel
        aprun = ParallelExecutor(n_jobs=-1)
        matrix = aprun(total=len(rdkitMols_list), desc='{} Distance'.format(method)) \
            (delayed(clustmethod)(mol1, rdkitMols_list) for mol1 in rdkitMols_list)
        matrix = np.array(matrix)

    # NOTE(review): recent scikit-learn releases may reject min_samples=0 - confirm
    # against the supported version before changing it.
    db = DBSCAN(eps=distThresholds, min_samples=0, metric='precomputed').fit(matrix)
    labels = db.labels_

    # DBSCAN labels are 0-based and noise is labelled -1: the number of clusters is
    # max(label) + 1, and noise must not be handed to bincount (it rejects negatives).
    populations = np.bincount(labels[labels >= 0])
    n_clusters = np.max(labels) + 1

    clusters_idx = np.empty((n_clusters, ), dtype=object)
    clusters_smallmols = np.empty((n_clusters, ), dtype=object)

    for n_cl in range(n_clusters):
        # Translate positions in the distance matrix back to positions in the input list
        idxs = valid[np.where(labels == n_cl)[0]]
        clusters_idx[n_cl] = idxs
        clusters_smallmols[n_cl] = smallmol_list[idxs]

    if returnDetails:
        details = {
            'numClusters': n_clusters,
            'populations': populations,
            'clusters': clusters_idx
        }
        return clusters_smallmols, details

    return clusters_smallmols
def getStates(self, states=None, statetype='macro', wrapsel='protein', alignsel='name CA',
              alignmol=None, samplemode='weighted', numsamples=50, simlist=None):
    """ Get samples of MSM states in Molecule classes

    Parameters
    ----------
    states : ndarray, optional
        A list of states to visualize. If None, all states of the given `statetype` are used.
    statetype : ['macro','micro','cluster'], optional
        The type of state to visualize
    wrapsel : str, optional, default='protein'
        A selection to use for wrapping
    alignsel : str, optional, default='name CA'
        A selection used for aligning all frames. Set to None to disable aligning
    alignmol : :class:`Molecule <htmd.molecule.molecule.Molecule>` object
        A reference molecule onto which to align all others. If None, a Molecule is
        created from the structure file of the simlist.
    samplemode : ['weighted','random'], optional, default='weighted'
        How to obtain the samples from the states
    numsamples : int
        Number of samples (conformations) for each state.
    simlist : numpy.ndarray of :class:`Sim <htmd.simlist.Sim>` objects
        Optionally pass a different (but matching, i.e. filtered) simlist for creating the Molecules.

    Returns
    -------
    mols : ndarray of :class:`Molecule <htmd.molecule.molecule.Molecule>` objects
        A list of :class:`Molecule <htmd.molecule.molecule.Molecule>` objects containing the samples of each state

    Raises
    ------
    AttributeError
        If the given simlist has a different length than the simlist of the model.
    NameError
        If the simlist uses more than one structure file, if `statetype` is invalid or
        if there are no states to sample.

    Examples
    --------
    >>> model = Model(data)
    >>> model.markovModel(100, 5)
    >>> mols = model.getStates()
    >>> for m in mols:
    >>>     m.view()
    """
    self._integrityCheck(postmsm=(statetype != 'cluster'))

    if simlist is None:
        simlist = self.data.simlist
    elif len(simlist) != len(self.data.simlist):
        raise AttributeError(
            'Provided simlist has different number of trajectories than the one used by the model.')

    single, molfile = _singleMolfile(simlist)
    if not single:
        raise NameError(
            'Visualizer does not support yet visualization of systems with different structure files. '
            'The simlist should be created with a single molfile (for example a filtered one)')

    if alignmol is None:
        alignmol = Molecule(molfile)

    if statetype not in ('macro', 'micro', 'cluster'):
        raise NameError("'statetype' must be either 'macro', 'micro' or ''cluster'")

    if states is None:
        # Default to every state of the requested type
        if statetype == 'macro':
            num_states = self.macronum
        elif statetype == 'micro':
            num_states = self.micronum
        else:
            num_states = self.data.K
        states = range(num_states)

    if len(states) == 0:
        raise NameError('No ' + statetype + ' states exist in the model')

    _, relframes = self.sampleStates(states, numsamples, statetype=statetype, samplemode=samplemode)

    from htmd.config import _config
    from htmd.parallelprogress import ParallelExecutor, delayed

    # sampleStates returns one array of frames per state, so this runs one task per state.
    # A single job is used because using ncpus was giving errors on some systems.
    executor = ParallelExecutor(n_jobs=1)
    run = executor(total=len(relframes), description='Getting state Molecules')
    mols = run(
        delayed(_loadMols)(self, rel, molfile, wrapsel, alignsel, alignmol, simlist)
        for rel in relframes
    )
    return np.array(mols, dtype=object)
def project(self):
    """
    Applies all projections stored in Metric on all simulations.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data. If the Metric was
        given a single Molecule instead of simulations, a list with the result of each
        projection on that Molecule is returned instead.

    Raises
    ------
    RuntimeError
        If no projections have been set.
    """
    if len(self.projectionlist) == 0:
        raise RuntimeError('You need to provide projections using the Metric.set method.')

    # Projecting single Molecules
    if isinstance(self.simulations, Molecule):
        single_mol = self.simulations
        return [_project(proj, single_mol) for proj in self.projectionlist]

    numSim = len(self.simulations)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uqMol = None
    single, molfile = _singleMolfile(self.simulations)
    if not single:
        logger.warning(
            'Cannot calculate description of dimensions due to different topology files for each trajectory.')
    else:
        uqMol = Molecule(molfile)
        for proj in self.projectionlist:
            if isinstance(proj, Projection):
                proj._precalculate(uqMol)

    mapping = self.getMapping(uqMol)

    logger.debug('Metric: Starting projection of trajectories.')
    from htmd.config import _config
    executor = ParallelExecutor(n_jobs=_config['ncpus'])
    run = executor(total=numSim, description='Projecting trajectories')
    results = run(
        delayed(_processSim)(self.simulations[i], self.projectionlist, uqMol, self.skip)
        for i in range(numSim)
    )

    # Split the per-simulation result tuples into one array per field
    metrics = np.empty(numSim, dtype=object)
    ref = np.empty(numSim, dtype=object)
    deletesims = np.zeros(numSim, dtype=bool)
    fstep = np.zeros(numSim)
    for i, res in enumerate(results):
        metrics[i] = res[0]
        ref[i] = res[1]
        fstep[i] = res[2]
        deletesims[i] = res[3]

    logger.debug('Finished projecting the trajectories.')

    # Removing empty trajectories
    metrics, ref, updlist, fstep = self._removeEmpty(metrics, ref, deletesims, fstep)

    # Constructing a MetricData object
    data = MetricData(dat=metrics, ref=ref, description=mapping, simlist=updlist)

    uqfsteps = np.unique(fstep)
    data.fstep = float(stats.mode(fstep).mode)
    if len(uqfsteps) == 1:
        logger.info(
            'Frame step {}ns was read from the trajectories. If it looks wrong, redefine it by manually '
            'setting the MetricData.fstep property.'.format(data.fstep))
    else:
        logger.warning(
            'Multiple framesteps [{}] ns were read from the simulations. '
            'Taking the statistical mode: {}ns. '
            'If it looks wrong, you can modify it by manually '
            'setting the MetricData.fstep property.'.format(
                ', '.join(map(str, uqfsteps)), data.fstep))
    return data
def ffevaluate(mol, prm, betweensets=None, cutoff=0, rfa=False, solventDielectric=78.5,
               threads=1, fromstruct=False):
    """  Evaluates energies and forces of the forcefield for a given Molecule

    Parameters
    ----------
    mol : :class:`Molecule <htmd.molecule.molecule.Molecule>` object
        A Molecule object. Can contain multiple frames. Its box must have shape (3, numFrames).
    prm : :class:`ParameterSet <parmed.ParameterSet>` object
        Forcefield parameters.
    betweensets : tuple of strings
        Only calculate energies between two sets of atoms given as atomselect strings.
        Only computes LJ and electrostatics.
    cutoff : float
        If set to a value != 0 it will only calculate LJ, electrostatics and bond energies for atoms which are closer
        than the threshold
    rfa : bool
        Use with `cutoff` to enable the reaction field approximation for scaling of the electrostatics up to the cutoff.
        Uses the value of `solventDielectric` to model everything beyond the cutoff distance as solvent with uniform
        dielectric.
    solventDielectric : float
        Used together with `cutoff` and `rfa`
    threads : int
        If 1, all frames are evaluated in a single call. Otherwise each frame is evaluated as a
        separate task of a ParallelExecutor created with ``n_jobs=threads``.
    fromstruct : bool
        Passed on unchanged to ``init`` together with `mol` and `prm`.

    Returns
    -------
    energies : np.ndarray
        A (6, nframes) shaped matrix containing the individual energy components of each simulation frame.
        Rows correspond to the following energies 0: bond 1: LJ 2: Electrostatic 3: angle 4: dihedral 5: improper
    forces : np.ndarray
        A (natoms, 3, nframes) shaped matrix containing the total force on each atom for each simulation frame.
    atmnrg : np.ndarray
        A (natoms, 6, nframes) shaped matrix containing the approximate potential energy components of each atom at each
        simulation frame. The 6 indexes are the same as in the `energies` return argument.

    Raises
    ------
    ValueError
        If the box of `mol` does not have shape (3, numFrames).

    Examples
    --------
    >>> from htmd.ffevaluation.ffevaluate import *
    >>> from htmd.ffevaluation.test_ffevaluate import fixParameters, drawForce
    >>> from htmd.ui import *
    >>> import parmed
    >>> mol = Molecule('./htmd/data/test-ffevaluate/waterbox/structure.psf')
    >>> mol.read('./htmd/data/test-ffevaluate/waterbox/output.xtc')
    >>> prm = parmed.charmm.CharmmParameterSet(fixParameters('./htmd/data/test-ffevaluate/waterbox/parameters.prm'))
    >>> energies, forces, atmnrg = ffevaluate(mol, prm, betweensets=('resname SOD', 'water'))

    >>> mol.view()
    >>> for cc, ff in zip(mol.coords[:, :, 0], forces[:, :, 0]):
    >>>     drawForce(cc, ff)

    Amber style

    >>> prmtop = parmed.amber.AmberParm('structure.prmtop')
    >>> prm = parmed.amber.AmberParameterSet.from_structure(prmtop)
    >>> energies, forces, atmnrg = ffevaluate(mol, prm, betweensets=('resname SOD', 'water'))
    """
    box_shape = mol.box.shape
    if box_shape[0] != 3 or box_shape[1] != mol.coords.shape[2]:
        raise ValueError(
            'Box dimensions have to be (3, numFrames), your Molecule has box of shape {}'
            .format(mol.box.shape))

    mol = mol.copy()
    coords = mol.coords.astype(np.float32)
    box = mol.box.astype(np.float32)

    setA, setB = calculateSets(mol, betweensets)

    # Positional arguments shared by every _ffevaluate call, after coordinates and box
    args = list(init(mol, prm, fromstruct))
    args.extend([setA, setB, cutoff, rfa, solventDielectric])

    if threads == 1:
        energies, forces, atmnrg = _ffevaluate(coords, box, *args)
    else:
        from htmd.parallelprogress import ParallelExecutor, delayed
        executor = ParallelExecutor(n_jobs=threads)
        run = executor(total=mol.numFrames, desc='Evaluating energies')
        # One task per frame; each task receives a single-frame coordinate array and box
        per_frame = run(
            delayed(_ffevaluate)(np.atleast_3d(coords[:, :, frame]),
                                 box[:, frame].reshape(3, 1),
                                 *args)
            for frame in range(mol.numFrames)
        )
        # Stitch the per-frame results back together along the frame axis
        energies = np.hstack([out[0] for out in per_frame])
        forces = np.concatenate([out[1] for out in per_frame], axis=2)
        atmnrg = np.concatenate([out[2] for out in per_frame], axis=2)

    return energies, forces, atmnrg
def project(self, njobs=None):
    """
    Applies all projections stored in Metric on all simulations.

    Parameters
    ----------
    njobs : int
        Number of parallel jobs to spawn for projection of trajectories. Take care that this can use large amounts
        of memory as multiple trajectories are loaded at once.  If None it will use the default from htmd.config.

    Returns
    -------
    data : MetricData object
        Returns a MetricData object containing the projected data. If the Metric was
        given a single Molecule instead of simulations, a list with the result of each
        projection on that Molecule is returned instead.

    Raises
    ------
    RuntimeError
        If no projections have been set.
    """
    if len(self.projectionlist) == 0:
        raise RuntimeError("You need to provide projections using the Metric.set method.")

    # Projecting single Molecules
    if isinstance(self.simulations, Molecule):
        single_mol = self.simulations
        return [_project(proj, single_mol) for proj in self.projectionlist]

    numSim = len(self.simulations)

    # Find out if there is a unique molfile. If there is, initialize a single Molecule to speed up calculations
    uqMol = None
    single, molfile = _singleMolfile(self.simulations)
    if not single:
        logger.warning(
            "Cannot calculate description of dimensions due to different topology files for each trajectory.")
    else:
        uqMol = Molecule(molfile)
        for proj in self.projectionlist:
            if isinstance(proj, Projection):
                proj._setCache(uqMol)

    mapping = self.getMapping(uqMol)

    logger.debug("Metric: Starting projection of trajectories.")
    from htmd.config import _config

    if njobs is None:
        njobs = _config["njobs"]
    executor = ParallelExecutor(n_jobs=njobs)
    run = executor(total=numSim, desc="Projecting trajectories")
    results = run(
        delayed(_processSim)(self.simulations[i], self.projectionlist, uqMol, self.skip)
        for i in range(numSim)
    )

    # Split the per-simulation result tuples into one array per field
    metrics = np.empty(numSim, dtype=object)
    ref = np.empty(numSim, dtype=object)
    deletesims = np.zeros(numSim, dtype=bool)
    fstep = np.zeros(numSim)
    for i, res in enumerate(results):
        metrics[i] = res[0]
        ref[i] = res[1]
        fstep[i] = res[2]
        deletesims[i] = res[3]

    logger.debug("Finished projecting the trajectories.")

    # Removing empty trajectories
    metrics, ref, updlist, fstep = self._removeEmpty(metrics, ref, deletesims, fstep)

    # Constructing a MetricData object
    data = MetricData(dat=metrics, ref=ref, description=mapping, simlist=updlist)

    uqfsteps = np.unique(fstep)
    if np.all(np.isnan(uqfsteps)):
        # Without any readable framestep, data.fstep is left untouched
        logger.warning(
            "No framestep could be read from the trajectories. Please manually set the MetricData.fstep"
            " property, otherwise calculations in Model and Kinetics classes can fail.")
        return data

    data.fstep = float(stats.mode(fstep).mode)
    if len(uqfsteps) != 1:
        logger.warning(
            "Multiple framesteps [{}] ns were read from the simulations. "
            "Taking the statistical mode: {}ns. "
            "If it looks wrong, you can modify it by manually "
            "setting the MetricData.fstep property.".format(
                ", ".join(map(str, uqfsteps)), data.fstep))
    elif data.fstep == 0:
        logger.warning(
            "A framestep of 0 was read from the trajectories. Please manually set the MetricData.fstep"
            " property, otherwise calculations in Model and Kinetics classes can fail.")
    else:
        logger.info(
            "Frame step {}ns was read from the trajectories. If it looks wrong, redefine it by manually "
            "setting the MetricData.fstep property.".format(data.fstep))
    return data