def _calculateMolProp(self, mol, props="all"):
    """Build the atom-index tables for the atoms selected by ``self.sel``.

    All tables are produced in one pass (``props`` is accepted but never
    consulted) because recomputing them one by one would be too slow.

    Parameters
    ----------
    mol : Molecule
        Input molecule. The work is done on a filtered copy, so all returned
        indices refer to the molecule *after* filtering by ``self.sel``.
    props : str or sequence
        Not used by this implementation.

    Returns
    -------
    res : dict
        ``ca_indices``      int32 indices of the atoms named ``CA``.
        ``nco_indices``     int32 array with one row per residue holding the
                            indices of its backbone N, C and O atoms
                            (-1 where no such atom was found).
        ``proline_indices`` int32 flags, 1 where the CA atom belongs to ``PRO``.
        ``chain_ids``       int32 label per CA atom, one label per distinct chain.
    """
    from moleculekit.util import sequenceID

    filtered = mol.copy()
    filtered.filter(self.sel, _logger=False)

    residue_of_atom = sequenceID(
        (filtered.resid, filtered.chain, filtered.insertion)
    )
    is_backbone = filtered.atomselect("backbone")

    ca_indices = np.where(filtered.name == "CA")[0].astype(np.int32)
    proline_indices = np.array(
        filtered.resname[ca_indices] == "PRO", dtype=np.int32
    )

    # Turn the chain names of the CA atoms into consecutive integer labels
    _, chain_labels = np.unique(filtered.chain[ca_indices], return_inverse=True)
    chain_ids = chain_labels.astype(np.int32)

    # One row per residue, columns are (N, C, O); -1 marks a missing atom
    nco_indices = np.full((residue_of_atom.max() + 1, 3), -1, dtype=np.int32)
    for column, atomname in enumerate(("N", "C", "O")):
        atom_idx = np.where((filtered.name == atomname) & is_backbone)[0]
        nco_indices[residue_of_atom[atom_idx], column] = atom_idx

    return {
        "ca_indices": ca_indices,
        "nco_indices": nco_indices,
        "proline_indices": proline_indices,
        "chain_ids": chain_ids,
    }
def _checkChainAndSegid(mol, _loggerLevel):
    """Validate the chain / segment assignment of ``mol`` and report its chains.

    Parameters
    ----------
    mol : Molecule
        The molecule to check.
    _loggerLevel : str or None
        The chain report is printed (and the single-character chain check is
        performed) only when this is ``None`` or ``'INFO'``.

    Returns
    -------
    mol : Molecule
        The input molecule, or a copy with ``chain`` filled in from the segment
        IDs when no chains were defined.

    Raises
    ------
    RuntimeError
        If neither chains nor segments are defined, if there are more segments
        than available single-character chain IDs, or if a chain name is longer
        than one character.
    """
    import string

    from moleculekit.util import sequenceID

    emptychains = mol.chain == ''
    emptysegids = mol.segid == ''

    if np.all(emptychains) and np.all(emptysegids):
        raise RuntimeError(
            'No chains or segments defined in Molecule.chain / Molecule.segid. Please assign either to continue with preparation.'
        )

    if np.all(emptychains) and np.any(~emptysegids):
        logger.info(
            'No chains defined in Molecule. Using segment IDs as chains for protein preparation.'
        )
        mol = mol.copy()
        # sequenceID yields integers. Chains have to be single characters (this
        # is enforced further down), so map each segment index to a character
        # instead of storing the raw integers in the chain field.
        chain_alphabet = np.array(
            list(string.ascii_uppercase + string.ascii_lowercase + string.digits),
            dtype=object,
        )
        segindex = np.asarray(sequenceID(mol.segid))
        if segindex.max() >= len(chain_alphabet):
            raise RuntimeError(
                'Too many segments ({}) to assign single-character chains. Please assign Molecule.chain manually.'
                .format(int(segindex.max()) + 1))
        mol.chain = chain_alphabet[segindex]

    if np.any(~emptysegids) and np.any(~emptychains):
        chainseq = sequenceID(mol.chain)
        segidseq = sequenceID(mol.segid)
        if not np.array_equal(chainseq, segidseq):
            logger.warning(
                'Both chains and segments are defined in Molecule.chain / Molecule.segid, however they are inconsistent. '
                'Protein preparation will use the chain information.')

    if _loggerLevel is None or _loggerLevel == 'INFO':
        chainids = np.unique(mol.chain)
        if any(len(cc) > 1 for cc in chainids):
            raise RuntimeError(
                'The chain field should only contain a single character.')

        print('\n---- Molecule chain report ----')
        for c in chainids:
            chainatoms = np.where(mol.chain == c)[0]
            firstatom = chainatoms[0]
            lastatom = chainatoms[-1]
            print(f'Chain {c}:')
            print(
                f' First residue: {mol.resname[firstatom]}:{mol.resid[firstatom]}:{mol.insertion[firstatom]}'
            )
            print(
                f' Final residue: {mol.resname[lastatom]}:{mol.resid[lastatom]}:{mol.insertion[lastatom]}'
            )
        print('---- End of chain report ----\n')

    return mol
def project(self, mol):
    """Project a molecule onto squared coordinate deviations.

    The deviation is measured against the projected coordinates of the
    reference molecule when one is set, otherwise against the mean of the
    projected coordinates of ``mol`` itself.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>`
        A :class:`Molecule <moleculekit.molecule.Molecule>` object to project.

    Returns
    -------
    data : np.ndarray
        One row per row of the parent projection. Columns are atoms in
        ``atom`` mode, or per-residue means of the atom values in
        ``residue`` mode.

    Raises
    ------
    RuntimeError
        If the mode is neither ``atom`` nor ``residue``.
    """
    coords = super().project(mol)

    if self._refmol is not None:
        wrap_reference = True
        if self._pbc and (self._refmol.box is None
                          or len(self._refmol.box) == 0
                          or np.all(self._refmol.box == 0)):
            logger.warning(
                "refmol doesn't contain periodic box information and will not be wrapped."
            )
            wrap_reference = False
        reference_metric = _MetricCoordinate(atomsel=self._atomsel,
                                             refmol=self._refmol,
                                             pbc=wrap_reference)
        refcoords = reference_metric.project(self._refmol)
    else:
        refcoords = np.mean(coords, axis=0)

    mapping = super().getMapping(mol)
    columns_per_atom = mapping.groupby('atomIndexes').groups
    natoms = len(columns_per_atom)
    resids = sequenceID(mol.resid)

    squarediff = (coords - refcoords)**2
    atomfluct = np.zeros((coords.shape[0], natoms))
    atomresids = np.zeros(natoms, dtype=int)

    # Visit the column groups ordered by their first column
    ordered_groups = sorted(columns_per_atom.values(), key=lambda g: g[0])
    for col, group in enumerate(ordered_groups):
        assert len(np.unique(mapping.atomIndexes[group])) == 1
        # Sum the squared deviations over all columns of this atom
        atomfluct[:, col] = squarediff[:, group].sum(axis=1)
        atomresids[col] = resids[int(mapping.atomIndexes[group[0]])]

    if self._mode == 'atom':
        return atomfluct

    if self._mode != 'residue':
        raise RuntimeError(
            'Invalid mode {} given. Choose between `atom` and `residue`'.
            format(self._mode))

    uqresids = np.unique(atomresids)
    meanresfluct = np.zeros((coords.shape[0], len(uqresids)))
    for col, resid in enumerate(uqresids):
        meanresfluct[:, col] = atomfluct[:, atomresids == resid].mean(axis=1)
    return meanresfluct
def _calculateMolProp(self, mol, props='all'):
    """Compute the per-molecule properties needed for the SASA calculation.

    Parameters
    ----------
    mol : Molecule
        Molecule on which ``self._sel`` and ``self._filtersel`` are evaluated.
    props : str or sequence of str
        ``'all'`` or any subset of ``'radii'``, ``'atom_mapping'``, ``'sel'``,
        ``'filtersel'`` and ``'tokeep'``.

    Returns
    -------
    res : dict
        ``sel`` / ``filtersel``  the two atom selections.
        ``tokeep``       position of every ``sel`` atom inside ``filtersel``.
        ``radii``        float32 radius (plus probe radius) per ``filtersel`` atom.
        ``atom_mapping`` int32 group index per ``filtersel`` atom (one group per
                         atom or per residue, depending on ``self._mode``).

    Raises
    ------
    RuntimeError
        If ``sel`` selects atoms which ``filtersel`` does not.
    KeyError
        If the first character of an atom name has no tabulated radius.
    ValueError
        If ``self._mode`` is neither ``'atom'`` nor ``'residue'``.
    """
    if props == 'all':
        props = ('radii', 'atom_mapping', 'sel', 'filtersel', 'tokeep')
    res = {}

    sel = mol.atomselect(self._sel)
    selidx = np.where(sel)[0]
    if 'sel' in props:
        res['sel'] = sel

    filtersel = mol.atomselect(self._filtersel)
    filterselidx = np.where(filtersel)[0]
    if 'filtersel' in props:
        res['filtersel'] = filtersel

    if len(np.setdiff1d(selidx, filterselidx)) != 0:
        raise RuntimeError(
            'Some atoms selected by `sel` are not selected by `filtersel` and thus would not be calculated. Make sure `sel` is a subset of `filtersel`.'
        )

    if 'tokeep' in props:
        # -1 for atoms outside filtersel, otherwise the atom's running index
        # within filtersel; then pick the entries belonging to sel.
        filterselmod = np.full(len(filtersel), -1, dtype=int)
        filterselmod[filtersel] = np.arange(np.count_nonzero(filtersel))
        res['tokeep'] = filterselmod[sel]

    if 'radii' in props:
        _ATOMIC_RADII = {
            'C': 1.5,
            'F': 1.2,
            'H': 0.4,
            'N': 1.10,
            'O': 1.05,
            'S': 1.6,
            'P': 1.6
        }
        # The element is taken to be the first character of the atom name.
        # A plain list comprehension is used instead of np.vectorize, which
        # refuses to run on an empty selection.
        atom_radii = [_ATOMIC_RADII[name[0]] for name in mol.name[filtersel]]
        res['radii'] = np.array(atom_radii, np.float32) + self._probeRadius

    if 'atom_mapping' in props:
        if self._mode == 'atom':
            res['atom_mapping'] = np.arange(np.sum(filtersel), dtype=np.int32)
        elif self._mode == 'residue':
            from moleculekit.util import sequenceID
            res['atom_mapping'] = sequenceID(
                (mol.resid[filtersel], mol.chain[filtersel],
                 mol.segid[filtersel])).astype(np.int32)
        else:
            raise ValueError(
                'mode must be one of "residue", "atom". "{}" supplied'.
                format(self._mode))

    return res
def calculateVariables(currmol):
    """Gather the residue ids, sequence and CA coordinates of ``currmol``.

    Returns
    -------
    reslen : int
        Number of unique residue ids.
    res : np.ndarray
        The unique residue ids as int32.
    seq : ctypes.c_char_p
        UTF-8 encoded sequence built by translating the residue name of every
        CA atom through ``_residueNameTable``.
    coords : np.ndarray
        A copy of the coordinates of the CA atoms.
    """
    ca_mask = currmol.name == 'CA'

    residue_ids = sequenceID((currmol.resid, currmol.insertion,
                              currmol.segid, currmol.chain))
    unique_ids = np.unique(residue_ids)

    # Calculate the protein sequence
    letters = [_residueNameTable[resname] for resname in currmol.resname[ca_mask]]
    encoded_seq = ct.c_char_p(''.join(letters).encode('utf-8'))

    # Keep only CA coordinates
    ca_coords = currmol.coords[ca_mask, :, :].copy()

    return len(unique_ids), unique_ids.astype(np.int32), encoded_seq, ca_coords
def removeAtomsInHull(mol1, mol2, hullsel, removesel):
    """Remove the atoms of mol2 lying inside the convex hull of a selection of mol1.

    mol2 is split into fragments (runs of identical resid and segid). A fragment
    counts as being inside the hull when adding its atoms to the hull points
    leaves the hull vertices unchanged.

    Parameters
    ----------
    mol1 : :class:`Molecule <moleculekit.molecule.Molecule>` object
        Molecule for which to calculate the convex hull
    mol2 : :class:`Molecule <moleculekit.molecule.Molecule>` object
        Molecule which contains the atoms which we check if they are within the hull
    hullsel : str
        Atom selection string for atoms in mol1 from which to calculate the convex hull.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    removesel : str
        Atom selection string for atoms in mol2 from which to remove the ones which are within the hull.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__

    Returns
    -------
    newmol2 : Molecule
        A copy of mol2 without the `removesel` atoms of fragments found inside the hull
    numrem : int
        Number of fragments found inside the hull
    """
    # TODO: Look into Morphological Snakes
    from scipy.spatial import ConvexHull

    result = mol2.copy()

    # Convex hull of the protein
    hullcoords = mol1.get("coords", hullsel)
    reference_vertices = list(ConvexHull(hullcoords).vertices)

    fragment_of_atom = sequenceID((result.resid, result.segid))
    inside = np.zeros(len(fragment_of_atom), dtype=bool)
    ninside = 0

    for fragment in np.unique(fragment_of_atom):
        members = np.where(fragment_of_atom == fragment)[0]
        fragcoords = result.get("coords", sel=members)
        grown = ConvexHull(np.vstack((hullcoords, fragcoords)))
        # The hull only stays the same if the whole fragment lies within it
        if list(grown.vertices) != reference_vertices:
            continue
        inside[members] = True
        ninside += 1

    removable = result.atomselect(removesel)
    result.remove(inside & removable)
    return result, ninside
def embed(mol1, mol2, gap=1.3):
    """Embed one molecule into another, removing the overlaps.

    Residues of mol2 which have atoms closer than `gap` to atoms of mol1 are
    removed. Both inputs are copied; mol1 is appended after mol2 in the result.

    Parameters
    ----------
    mol1 : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The first Molecule object
    mol2 : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The second Molecule object
    gap : float
        Minimum space in A between atoms of the two molecules

    Returns
    -------
    newmol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The resulting Molecule object

    Example
    -------
    >>> all = embed(memb, prot)
    """
    first = mol1.copy()
    merged = mol2.copy()

    # Remember the real occupancies, then use the field to tag where each atom came from
    occupancy_first = first.get("occupancy")
    occupancy_second = merged.get("occupancy")
    first.set("occupancy", 1)
    merged.set("occupancy", 2)
    merged.append(first)

    from_first = merged.atomselect("occupancy 1")
    from_second = merged.atomselect("occupancy 2")

    # Temporarily store a running residue number in beta so that
    # "same beta as" selects whole residues
    beta_backup = merged.get("beta")
    merged.set("beta", sequenceID(merged.resid))

    # Calculate overlapping atoms
    overlap_selection = (
        f"(occupancy 2) and same beta as exwithin {gap!s} of (occupancy 1)"
    )
    overlaps = merged.atomselect(overlap_selection)

    # Restore original beta and occupancy
    merged.set("beta", beta_backup)
    merged.set("occupancy", occupancy_first, from_first)
    merged.set("occupancy", occupancy_second, from_second)

    # Remove the overlaps
    merged.remove(overlaps, _logger=False)
    return merged
def _calculateMolProp(self, mol, props='all'):
    """Compute the per-molecule properties needed for the SASA calculation.

    Parameters
    ----------
    mol : Molecule
        Molecule on which ``self._sel`` is evaluated.
    props : str or sequence of str
        ``'all'`` or any subset of ``'radii'``, ``'atom_mapping'`` and ``'sel'``.

    Returns
    -------
    res : dict
        ``sel``          the atom selection.
        ``radii``        float32 radius (plus probe radius) per selected atom.
        ``atom_mapping`` int32 group index per selected atom (one group per
                         atom or per residue, depending on ``self._mode``).

    Raises
    ------
    KeyError
        If the first character of an atom name has no tabulated radius.
    ValueError
        If ``self._mode`` is neither ``'atom'`` nor ``'residue'``.
    """
    if props == 'all':
        props = ('radii', 'atom_mapping', 'sel')
    res = {}

    sel = mol.atomselect(self._sel)
    if 'sel' in props:
        res['sel'] = sel

    if 'radii' in props:
        _ATOMIC_RADII = {
            'C': 1.5,
            'F': 1.2,
            'H': 0.4,
            'N': 1.10,
            'O': 1.05,
            'S': 1.6,
            'P': 1.6
        }
        # The element is taken to be the first character of the atom name.
        # A plain list comprehension is used instead of np.vectorize, which
        # refuses to run on an empty selection.
        atom_radii = [_ATOMIC_RADII[name[0]] for name in mol.name[sel]]
        res['radii'] = np.array(atom_radii, np.float32) + self._probeRadius

    if 'atom_mapping' in props:
        if self._mode == 'atom':
            res['atom_mapping'] = np.arange(np.sum(sel), dtype=np.int32)
        elif self._mode == 'residue':
            from moleculekit.util import sequenceID
            res['atom_mapping'] = sequenceID(
                (mol.resid[sel], mol.chain[sel],
                 mol.segid[sel])).astype(np.int32)
        else:
            raise ValueError(
                'mode must be one of "residue", "atom". "{}" supplied'.
                format(self._mode))

    return res
def tileMembrane(memb, xmin, ymin, xmax, ymax, buffer=1.5):
    """Tile a membrane in the X and Y dimensions to reach a specific size.

    The tile size is taken from the extent of the water atoms of `memb`.
    Residues of a tile reaching beyond `xmax` / `ymax` are dropped, and waters
    clashing with non-water atoms are removed from the assembled membrane.

    Parameters
    ----------
    memb : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The membrane to be tiled
    xmin : float
        Minimum x coordinate
    ymin : float
        Minimum y coordinate
    xmax : float
        Maximum x coordinate
    ymax : float
        Maximum y coordinate
    buffer : float
        Buffer distance between tiles

    Returns
    -------
    megamemb :
        A big membrane Molecule

    Raises
    ------
    ValueError
        If `memb` contains no water atoms from which to measure the tile size.
    """
    from tqdm import tqdm
    from moleculekit.molecule import Molecule

    memb = memb.copy()
    # Unique resids are needed for the "same resid as" selections below
    memb.resid = sequenceID(
        (memb.resid, memb.insertion, memb.chain, memb.segid))

    # Fetch the water coordinates once instead of once per reduction
    watercoords = memb.get('coords', 'water')
    if np.size(watercoords) == 0:
        raise ValueError(
            'The membrane contains no water atoms; cannot determine the tile size.'
        )
    minmemb = np.min(watercoords, axis=0).flatten()
    size = (np.max(watercoords, axis=0) - np.min(watercoords, axis=0)).flatten()

    xreps = int(np.ceil((xmax - xmin) / size[0]))
    yreps = int(np.ceil((ymax - ymin) / size[1]))

    logger.info('Replicating Membrane {}x{}'.format(xreps, yreps))

    megamemb = Molecule()
    k = 0  # Counts only the tiles which were actually added
    # The context manager closes the bar even if a tile fails
    with tqdm(total=xreps * yreps, desc='Replicating Membrane') as bar:
        for x in range(xreps):
            for y in range(yreps):
                tmpmemb = memb.copy()
                xpos = xmin + x * (size[0] + buffer)
                ypos = ymin + y * (size[1] + buffer)

                tmpmemb.moveBy(
                    [-float(minmemb[0]) + xpos, -float(minmemb[1]) + ypos, 0])
                tmpmemb.remove('same resid as (x > {} or y > {})'.format(
                    xmax, ymax), _logger=False)

                if tmpmemb.numAtoms != 0:
                    tmpmemb.set('segid', 'M{}'.format(k), sel='not water')
                    tmpmemb.set('segid', 'MW{}'.format(k), sel='water')
                    megamemb.append(tmpmemb)
                    k += 1
                # Advance for skipped (empty) tiles as well so the bar reaches its total
                bar.update(1)

    # Membranes don't tile perfectly. Need to remove waters that clash with lipids of other tiles
    # Some clashes will still occur between periodic images however
    megamemb.remove('same resid as water and within 1.5 of not water',
                    _logger=False)
    return megamemb
def prepareProteinForAtomtyping(mol, guessBonds=True, protonate=True, pH=7.4, segment=True, verbose=True):
    """Prepares a Molecule object for atom typing.

    The molecule is split into protein, metal and water parts. The protein part
    is optionally protonated and re-segmented, metals and waters receive their
    own chain / segment, and the three parts are joined again (protein, metals,
    waters).

    Parameters
    ----------
    mol : Molecule object
        The protein to prepare
    guessBonds : bool
        Drops the bonds in the molecule and guesses them from scratch
    protonate : bool
        Protonates the protein for the given pH and optimizes hydrogen networks
    pH : float
        The pH for protonation
    segment : bool
        Automatically guesses the segments of a protein by using the guessed bonds
    verbose : bool
        Set to False to turn off the printing

    Returns
    -------
    mol : Molecule object
        The prepared Molecule

    Raises
    ------
    RuntimeError
        If the molecule has no protein atoms, or contains atoms which are
        neither protein, metal nor water.
    AssertionError
        If the protein already uses the chain / segment names reserved for the
        metals ("Z" / "ME") or the waters ("W" / "WX").
    """
    from moleculekit.tools.autosegment import autoSegment2
    from moleculekit.util import sequenceID

    mol = mol.copy()

    # Need to guess bonds at the start for atom selection and for autoSegment
    if guessBonds:
        mol.bondtype = np.array([], dtype=object)
        mol.bonds = mol._guessBonds()

    protsel = mol.atomselect("protein")
    metalsel = mol.atomselect(f"element {' '.join(metal_atypes)}")
    watersel = mol.atomselect("water")
    notallowed = ~(protsel | metalsel | watersel)

    if not np.any(protsel):
        raise RuntimeError("No protein atoms found in Molecule")

    if np.any(notallowed):
        resnames = np.unique(mol.resname[notallowed])
        raise RuntimeError(
            "Found atoms with resnames {} in the Molecule which can cause issues with the voxelization. Please make sure to only pass protein atoms and metals."
            .format(resnames))

    def _extract(mask):
        # Copy of the molecule holding only the atoms of the given mask
        part = mol.copy()
        part.filter(mask, _logger=False)
        return part

    def _assertUnused(chain, segid):
        # The reserved chain / segment names must not already be taken by the protein
        if np.any(protmol.chain == chain) or np.any(protmol.segid == segid):
            raise AssertionError(
                "Report this issue on the moleculekit github issue tracker. Too many chains in the protein."
            )

    protmol = _extract(protsel)
    metalmol = _extract(metalsel)
    watermol = _extract(watersel)

    if protonate:
        from moleculekit.tools.preparation import systemPrepare

        # We need segments to prepare the protein
        unsegmented = np.all(protmol.segid == "") and np.all(protmol.chain == "")
        if unsegmented:
            protmol = autoSegment2(
                protmol, fields=("segid", "chain"), basename="K", _logger=verbose)
        protmol = systemPrepare(
            protmol,
            pH=pH,
            verbose=verbose,
            _logger_level="INFO" if verbose else "ERROR",
        )

    if guessBonds:
        protmol.bonds = protmol._guessBonds()
        # TODO: Should we remove bonds between metals and protein?

    if segment:
        # Reassign segments after preparation
        protmol = autoSegment2(
            protmol, fields=("segid", "chain"), _logger=verbose)

    # Assign separate segment to the metals just in case pybel takes that into account
    _assertUnused("Z", "ME")
    metalmol.segid[:] = "ME"
    metalmol.chain[:] = "Z"
    # Just in case, let's put a residue gap between the metals so that they are
    # considered separate chains no matter what happens
    first_metal_resid = protmol.resid.max() + 1
    metalmol.resid[:] = np.arange(0, 2 * metalmol.numAtoms, 2) + first_metal_resid

    if watermol.numAtoms != 0:
        _assertUnused("W", "WX")
        # The resids must be renumbered before segid / chain are overwritten,
        # as the renumbering reads those fields
        watermol.resid[:] = sequenceID(
            (watermol.resid, watermol.segid, watermol.chain), step=2)
        watermol.segid[:] = "WX"
        watermol.chain[:] = "W"

    prepared = protmol.copy()
    prepared.append(metalmol)
    prepared.append(watermol)
    return prepared
def ionizePlace(mol, anion_resname, cation_resname, anion_name, cation_name, nanion, ncation, dfrom=5, dbetween=5,
                segname=None):
    """Place a given number of negative and positive ions in the solvent.

    Replaces water molecules as long as they respect the given distance criteria.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule object
    anion_resname : str
        Resname of the added anions
    cation_resname : str
        Resname of the added cations
    anion_name : str
        Name of the added anions
    cation_name : str
        Name of the added cations
    nanion : int
        Number of anions to add
    ncation : int
        Number of cations to add
    dfrom : float
        Min distance of ions from molecule
    dbetween : float
        Min distance between ions
    segname : str
        Segment name given to the added ions. If None, the segment is named 'I'.

    Returns
    -------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The molecule with the ions added

    Raises
    ------
    NameError
        If no water is far enough from the other molecules, or if the ions
        could not be placed within the allowed number of attempts.
    """
    newmol = mol.copy()

    logger.debug('Min distance of ions from molecule: ' + str(dfrom) + 'A')
    logger.debug('Min distance between ions: ' + str(dbetween) + 'A')
    logger.debug('Placing {:d} anions and {:d} cations.'.format(
        nanion, ncation))

    nions = nanion + ncation
    if nions == 0:
        return newmol

    # Beta is temporarily used to hold a unique residue number so that the
    # whole water of a chosen oxygen can be found again below
    betabackup = newmol.beta.copy()
    newmol.set('beta',
               sequenceID((newmol.resid, newmol.insertion, newmol.segid)))

    # Find water oxygens to replace with ions
    maxtries = 10
    for _attempt in range(maxtries):
        ionlist = []
        watindex = newmol.atomselect('noh and water and not (within ' +
                                     str(dfrom) + ' of not water)',
                                     indexes=True)
        if len(watindex) == 0:
            raise NameError(
                'No waters could be found further than ' + str(dfrom) +
                ' from other molecules to be replaced by ions. You might need to solvate with a bigger box or disable the ionize property when building.'
            )

        # Draw candidate waters at random until enough are accepted or none are left
        while len(ionlist) < nions and len(watindex) != 0:
            randwat = np.random.randint(len(watindex))
            thision = watindex[randwat]
            addit = True
            if len(ionlist) != 0:
                # Check for distance from previous ions
                ionspos = newmol.get('coords', sel=ionlist)
                thispos = newmol.get('coords', sel=thision)
                dists = distance.cdist(np.atleast_2d(ionspos),
                                       np.atleast_2d(thispos),
                                       metric='euclidean')
                if np.any(dists < dbetween):
                    addit = False
            if addit:
                ionlist.append(thision)
            watindex = np.delete(watindex, randwat)

        if len(ionlist) == nions:
            break
    else:
        # Every attempt ran out of candidate waters
        raise NameError(
            'Failed to add ions after ' + str(maxtries) +
            " attempts. Try decreasing the 'dfrom' and 'dbetween' parameters, decreasing ion concentration or making a larger water box."
        )

    # Delete waters but keep their coordinates
    waterpos = np.atleast_2d(newmol.get('coords', ionlist))
    betasel = np.zeros(newmol.numAtoms, dtype=bool)
    for b in newmol.beta[ionlist]:
        betasel |= newmol.beta == b
    sel = np.where(betasel)[0]
    newmol.remove(sel, _logger=False)
    betabackup = np.delete(betabackup, sel)

    # Add the ions
    randidx = np.random.permutation(np.size(waterpos, 0))
    atom = Molecule()
    atom.empty(1)
    atom.set('chain', 'I')
    atom.set('segid', 'I' if segname is None else segname)

    # Anions first, then cations, each put on one of the shuffled water positions
    ions = ([(anion_name, anion_resname)] * nanion +
            [(cation_name, cation_resname)] * ncation)
    for i, (ionname, ionresname) in enumerate(ions):
        atom.set('name', ionname)
        atom.set('resname', ionresname)
        atom.set('resid', newmol.resid[-1] + 1)
        atom.coords = waterpos[randidx[i], :]
        newmol.insert(atom, len(newmol.name))

    # Restoring the original betas
    newmol.beta[:len(betabackup)] = betabackup
    return newmol
def _charmmLipid2Amber(mol):
    """Convert a CHARMM lipid membrane to AMBER format

    Atoms and residues are renamed following the rules of the
    ``charmmlipid2amber.csv`` table, the atoms of every converted residue are
    reordered as the rules dictate, and every lipid (from a first to a last
    terminated atom) gets new resids and an alternating ``L0`` / ``L1`` segid.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule object containing the membrane

    Returns
    -------
    newmol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        A new Molecule object with the membrane converted to AMBER
    """
    rules_by_resname = _readcsvdict(
        os.path.join(home(shareDir=True), 'builder', 'charmmlipid2amber.csv'))

    natoms = mol.numAtoms
    # After renaming the atoms and residues I have to reorder them
    neworder = np.array(list(range(natoms)))
    begs = np.zeros(natoms, dtype=bool)
    begters = np.zeros(natoms, dtype=bool)
    finters = np.zeros(natoms, dtype=bool)

    # Iterate over the translation dictionary
    mol = mol.copy()
    incrresids = sequenceID((mol.resid, mol.insertion, mol.segid))

    for resname, atommap in rules_by_resname.items():
        in_residue = mol.resname == resname
        if not np.any(in_residue):
            continue

        # Need to make a copy or I accidentally double-modify atoms
        names = mol.name.copy()

        for atomname, rule in atommap.items():
            matched = np.zeros(len(names), dtype=bool)
            matched[in_residue] = names[in_residue] == atomname

            mol.set('resname', rule.replaceresname, sel=matched)
            mol.set('name', rule.replaceatom, sel=matched)
            neworder[matched] = rule.order

            is_first = rule.order == 0
            is_last = rule.order == rule.natoms - 1
            if is_first:  # First atom (with or without ters)
                begs[matched] = True
            if is_first and rule.ter:  # First atom with ter
                begters[matched] = True
            if is_last and rule.ter:  # Last atom with ter
                finters[matched] = True

    # The rule order is relative to the residue, so shift it by the index of
    # the first atom of each converted residue
    for uqresid in np.unique(incrresids[begs]):
        members = np.where(incrresids == uqresid)[0]
        beg = members[0]
        fin = members[-1] + 1
        neworder[beg:fin] = neworder[beg:fin] + beg

    idx = np.argsort(neworder)
    _reorderMol(mol, idx)

    # Sort the begs and ters
    begters = np.where(begters[idx])[0]
    finters = np.where(finters[idx])[0]

    # if len(begters) > 999:
    #    raise NameError('More than 999 lipids. Cannot define separate segments for all of them.')

    for i, first in enumerate(begters):
        lipid = np.zeros(len(mol.resid), dtype=bool)
        lipid[first:finters[i] + 1] = True
        mol.set('resid', sequenceID(mol.get('resname', sel=lipid)), sel=lipid)
        mol.set('segid', 'L{}'.format(i % 2), sel=lipid)

    return mol
def build(mol, ff=None, topo=None, param=None, prefix='structure', outdir='./build', caps=None, ionize=True, saltconc=0,
          saltanion=None, saltcation=None, disulfide=None, teleap=None, teleapimports=None, execute=True,
          atomtypes=None, offlibraries=None, gbsa=False, igb=2):
    """ Builds a system for AMBER

    Uses tleap to build a system for AMBER. Additionally it allows the user to ionize and add disulfide bridges.

    When `ionize` is True the system is built twice: once to obtain the total charge, and a second time (with
    `ionize=False`) after the ions have been placed. Disulfide bonds are only patched in the build without
    ionization, i.e. in the final one.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule object containing the system
    ff : list of str
        A list of leaprc forcefield files.
        Use :func:`amber.listFiles <htmd.builder.amber.listFiles>` to get a list of available forcefield files.
        Default: :func:`amber.defaultFf <htmd.builder.amber.defaultFf>`
    topo : list of str
        A list of topology `prepi/prep/in` files.
        Use :func:`amber.listFiles <htmd.builder.amber.listFiles>` to get a list of available topology files.
        Default: :func:`amber.defaultTopo <htmd.builder.amber.defaultTopo>`
    param : list of str
        A list of parameter `frcmod` files.
        Use :func:`amber.listFiles <htmd.builder.amber.listFiles>` to get a list of available parameter files.
        Default: :func:`amber.defaultParam <htmd.builder.amber.defaultParam>`
    prefix : str
        The prefix for the generated prmtop, crd and pdb files
    outdir : str
        The path to the output directory
        Default: './build'
    caps : dict
        A dictionary with keys segids and values lists of strings describing the caps for a particular protein segment.
        e.g. caps['P'] = ['ACE', 'NME'] or caps['P'] = ['none', 'none']. Default: will apply ACE and NME caps to every
        protein segment.
    ionize : bool
        Enable or disable ionization
    saltconc : float
        Salt concentration to add to the system after neutralization.
    saltanion : {'Cl-'}
        The anion type. Please use only AMBER ion atom names.
    saltcation : {'Na+', 'K+', 'Cs+'}
        The cation type. Please use only AMBER ion atom names.
    disulfide : list of pairs of atomselection strings
        If None it will guess disulfide bonds. Otherwise provide a list pairs of atomselection strings for each pair of
        residues forming the disulfide bridge.
    teleap : str
        Path to teLeap executable used to build the system for AMBER
    teleapimports : list
        A list of paths to pass to teLeap '-I' flag, i.e. directories to be searched
        Default: determined from :func:`amber.defaultAmberHome <htmd.builder.amber.defaultAmberHome>` and
        :func:`amber.htmdAmberHome <htmd.builder.amber.htmdAmberHome>`
    execute : bool
        Disable building. Will only write out the input script needed by tleap. Does not include ionization.
    atomtypes : list of triplets
        Custom atom types defined by the user as ('type', 'element', 'hybrid') triplets
        e.g. (('C1', 'C', 'sp2'), ('CI', 'C', 'sp3')). Check `addAtomTypes` in AmberTools docs.
    offlibraries : str or list
        A path or a list of paths to OFF library files. Check `loadOFF` in AmberTools docs.
    gbsa : bool
        Modify radii for GBSA implicit water model
    igb : int
        GB model. Select: 1 for mbondi, 2 and 5 for mbondi2, 7 for bondi and 8 for mbondi3.
        Check section 4. The Generalized Born/Surface Area Model of the AMBER manual.

    Returns
    -------
    molbuilt : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The built system in a Molecule object. None if `execute` is False.

    Raises
    ------
    NameError
        If teLeap cannot be found or fails to execute, or if the input files for teLeap could not be written.
    RuntimeError
        On malformed atom types, missing OFF libraries, missing teLeap import paths or unreadable build results.
    BuildError
        If teLeap reported errors or did not produce the prmtop / crd files.

    Example
    -------
    >>> from htmd.ui import *  # doctest: +SKIP
    >>> mol = Molecule("3PTB")
    >>> molbuilt = amber.build(mol, outdir='/tmp/build')  # doctest: +SKIP
    >>> # More complex example
    >>> disu = [['segid P and resid 157', 'segid P and resid 13'], ['segid K and resid 1', 'segid K and resid 25']]
    >>> molbuilt = amber.build(mol, outdir='/tmp/build', saltconc=0.15, disulfide=disu)  # doctest: +SKIP
    """
    # Remove pdb protein bonds as they can be regenerated by tleap. Keep non-protein bonds i.e. for ligands
    mol = mol.copy()
    _removeProteinBonds(mol)

    if teleap is None:
        teleap = _findTeLeap()
    elif shutil.which(teleap) is None:
        raise NameError(
            'Could not find executable: `{}` in the PATH. Cannot build for AMBER.'
            .format(teleap))

    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    _cleanOutDir(outdir)

    if ff is None:
        ff = defaultFf()
    if topo is None:
        topo = defaultTopo()
    if param is None:
        param = defaultParam()
    if caps is None:
        caps = _defaultProteinCaps(mol)

    # A single file name may be given instead of a list for any of these
    ff = ensurelist(ff)
    topo = ensurelist(topo)
    param = ensurelist(param)

    _missingSegID(mol)
    _checkMixedSegment(mol)

    mol = _charmmLipid2Amber(mol)

    _applyProteinCaps(mol, caps)

    # The context manager makes sure the script is closed even if writing it fails
    with open(os.path.join(outdir, 'tleap.in'), 'w') as f:
        f.write('# tleap file generated by amber.build\n')

        # Printing out the forcefields
        for i, force in enumerate(ff):
            if not os.path.isfile(force):
                force = _locateFile(force, 'ff', teleap)
                if force is None:
                    continue
            newname = 'ff{}_{}'.format(i, os.path.basename(force))
            shutil.copy(force, os.path.join(outdir, newname))
            f.write('source {}\n'.format(newname))
        f.write('\n')

        if gbsa:
            gbmodels = {
                1: 'mbondi',
                2: 'mbondi2',
                5: 'mbondi2',
                7: 'bondi',
                8: 'mbondi3'
            }
            f.write('set default PBradii {}\n\n'.format(gbmodels[igb]))

        # Adding custom atom types
        if atomtypes is not None:
            atomtypes = ensurelist(tocheck=atomtypes[0], tomod=atomtypes)
            f.write('addAtomTypes {\n')
            for at in atomtypes:
                if len(at) != 3:
                    raise RuntimeError(
                        'Atom type definitions have to be triplets. Check the AMBER documentation.'
                    )
                f.write(' {{ "{}" "{}" "{}" }}\n'.format(at[0], at[1], at[2]))
            f.write('}\n\n')

        # Loading OFF libraries
        if offlibraries is not None:
            offlibraries = ensurelist(offlibraries)
            for i, off in enumerate(offlibraries):
                if not os.path.isfile(off):
                    raise RuntimeError(
                        'Could not find off-library in location {}'.format(off))
                newname = 'offlib{}_{}'.format(i, os.path.basename(off))
                shutil.copy(off, os.path.join(outdir, newname))
                f.write('loadoff {}\n'.format(newname))

        # Loading frcmod parameters
        f.write('# Loading parameter files\n')
        for i, p in enumerate(param):
            if not os.path.isfile(p):
                p = _locateFile(p, 'param', teleap)
                if p is None:
                    continue
            newname = 'param{}_{}'.format(i, os.path.basename(p))
            shutil.copy(p, os.path.join(outdir, newname))
            f.write('loadamberparams {}\n'.format(newname))
        f.write('\n')

        # Loading prepi topologies
        f.write('# Loading prepi topologies\n')
        for i, t in enumerate(topo):
            if not os.path.isfile(t):
                t = _locateFile(t, 'topo', teleap)
                if t is None:
                    continue
            newname = 'topo{}_{}'.format(i, os.path.basename(t))
            shutil.copy(t, os.path.join(outdir, newname))
            f.write('loadamberprep {}\n'.format(newname))
        f.write('\n')

        f.write('# Loading the system\n')
        f.write('mol = loadpdb input.pdb\n\n')

        # Atoms which carry an atom type are passed to tleap as mol2 files, one per segment
        if np.sum(mol.atomtype != '') != 0:
            logger.debug('Writing mol2 files for input to tleap.')
            segs = np.unique(mol.segid[mol.atomtype != ''])
            combstr = 'mol = combine {mol'
            for s in segs:
                name = 'segment{}'.format(s)
                mol2name = os.path.join(outdir, '{}.mol2'.format(name))
                mol.write(mol2name, (mol.atomtype != '') & (mol.segid == s))
                if not os.path.isfile(mol2name):
                    raise NameError(
                        'Could not write a mol2 file out of the given Molecule.')
                f.write('# Loading the rest of the system\n')
                f.write('{} = loadmol2 {}.mol2\n\n'.format(name, name))
                combstr += ' {}'.format(name)
            combstr += '}\n\n'
            f.write(combstr)

        # Write patches for disulfide bonds (only after ionizing)
        if not ionize:
            # TODO: Remove this once we deprecate the class
            from htmd.builder.builder import DisulfideBridge
            from moleculekit.molecule import UniqueResidueID
            if disulfide is not None and len(disulfide) != 0 and isinstance(
                    disulfide[0], DisulfideBridge):
                newdisu = []
                for d in disulfide:
                    r1 = UniqueResidueID.fromMolecule(
                        mol, 'resid {} and segname {}'.format(d.resid1, d.segid1))
                    r2 = UniqueResidueID.fromMolecule(
                        mol, 'resid {} and segname {}'.format(d.resid2, d.segid2))
                    newdisu.append([r1, r2])
                disulfide = newdisu
            # TODO: Remove up to here ----------------------

            if disulfide is not None and len(disulfide) != 0 and isinstance(
                    disulfide[0][0], str):
                disulfide = convertDisulfide(mol, disulfide)

            if disulfide is None:
                logger.info('Detecting disulfide bonds.')
                disulfide = detectDisulfideBonds(mol)

            # Fix structure to match the disulfide patching
            if len(disulfide) != 0:
                torem = np.zeros(mol.numAtoms, dtype=bool)
                f.write('# Adding disulfide bonds\n')
                # Convert to stupid amber residue numbering. Only resnames are
                # modified in the loop, so this can be computed once up front.
                uqseqid = sequenceID(
                    (mol.resid, mol.insertion, mol.segid)) + mol.resid[0]
                for d in disulfide:
                    # Rename the residues to CYX if there is a disulfide bond
                    atoms1 = d[0].selectAtoms(mol, indexes=False)
                    atoms2 = d[1].selectAtoms(mol, indexes=False)
                    mol.resname[atoms1] = 'CYX'
                    mol.resname[atoms2] = 'CYX'
                    # Remove (eventual) HG hydrogens on these CYS (from proteinPrepare)
                    torem |= (atoms1 & (mol.name == 'HG')) | (atoms2 & (mol.name == 'HG'))
                    uqres1 = int(np.unique(uqseqid[atoms1]))
                    uqres2 = int(np.unique(uqseqid[atoms2]))
                    f.write('bond mol.{}.SG mol.{}.SG\n'.format(uqres1, uqres2))
                f.write('\n')
                mol.remove(torem, _logger=False)

        # Calculate the bounding box and store it in the CRD file
        f.write('setBox mol "vdw"\n\n')

        f.write('# Writing out the results\n')
        f.write('saveamberparm mol ' + prefix + '.prmtop ' + prefix + '.crd\n')
        f.write('quit')

    # Printing and loading the PDB file. AMBER can work with a single PDB file if the segments are separate by TER
    logger.debug('Writing PDB file for input to tleap.')
    pdbname = os.path.join(outdir, 'input.pdb')

    # mol2 files have atomtype, here we only write parts not coming from mol2
    # We need to write the input.pdb at the end since we modify the resname for disulfide bridges in mol
    mol.write(pdbname, mol.atomtype == '')
    if not os.path.isfile(pdbname):
        raise NameError(
            'Could not write a PDB file out of the given Molecule.')

    if not execute:
        # Only the tleap input was requested, nothing was built
        return None

    # The caller's teleapimports is left untouched so that it can be forwarded to the second build
    importpaths = teleapimports
    if not importpaths:
        importpaths = []
        # Source default Amber (i.e. the same paths tleap imports)
        amberhome = defaultAmberHome(teleap=teleap)
        importpaths += [
            os.path.join(amberhome, s)
            for s in _defaultAmberSearchPaths.values()
        ]
        if len(importpaths) == 0:
            raise RuntimeWarning(
                'No default Amber force-field found. Check teLeap location: {}'
                .format(teleap))
        # Source HTMD Amber paths that contain ffs
        htmdamberdir = htmdAmberHome()
        importpaths += [
            os.path.join(htmdamberdir, os.path.dirname(fffile))
            for fffile in ff
            if os.path.isfile(os.path.join(htmdamberdir, fffile))
        ]
        if len(importpaths) == 0:
            raise RuntimeError(
                'No default Amber force-field imports found. Check '
                '`htmd.builder.amber.defaultAmberHome()` and `htmd.builder.amber.htmdAmberHome()`'
            )

    # Set import flags for teLeap
    teleapimportflags = []
    for p in importpaths:
        teleapimportflags.append('-I')
        teleapimportflags.append(str(p))

    logpath = os.path.abspath(os.path.join(outdir, 'log.txt'))
    logger.info('Starting the build.')
    currdir = os.getcwd()
    os.chdir(outdir)
    try:
        with open(logpath, 'w') as logfile:
            cmd = [teleap] + teleapimportflags + ['-f', './tleap.in']
            logger.debug(cmd)
            try:
                call(cmd, stdout=logfile)
            except Exception as err:
                raise NameError('teLeap failed at execution') from err
    finally:
        # Always go back to the original directory, also when teLeap failed
        os.chdir(currdir)

    errors = _logParser(logpath)
    if errors:
        raise BuildError(errors + [
            'Check {} for further information on errors in building.'.
            format(logpath)
        ])
    logger.info('Finished building.')

    # tleap was told to write <prefix>.prmtop / <prefix>.crd, so look for exactly those
    crdfile = os.path.join(outdir, prefix + '.crd')
    prmtopfile = os.path.join(outdir, prefix + '.prmtop')

    built_ok = (os.path.exists(crdfile) and os.path.exists(prmtopfile)
                and os.path.getsize(crdfile) != 0
                and os.path.getsize(prmtopfile) != 0)
    if not built_ok:
        raise BuildError(
            'No structure pdb/prmtop file was generated. Check {} for errors in building.'
            .format(logpath))

    try:
        molbuilt = Molecule(prmtopfile)
        molbuilt.read(crdfile)
    except Exception as e:
        raise RuntimeError(
            'Failed at reading {0}.prmtop/{0}.crd due to error: {1}'
            .format(prefix, e)) from e

    if ionize:
        shutil.move(crdfile, os.path.join(outdir, prefix + '.noions.crd'))
        shutil.move(prmtopfile, os.path.join(outdir, prefix + '.noions.prmtop'))
        totalcharge = np.sum(molbuilt.charge)
        nwater = np.sum(molbuilt.atomselect('water and noh'))
        anion, cation, anionatom, cationatom, nanion, ncation = ionizef(
            totalcharge,
            nwater,
            saltconc=saltconc,
            anion=saltanion,
            cation=saltcation)
        newmol = ionizePlace(mol, anion, cation, anionatom, cationatom,
                             nanion, ncation)
        # Redo the whole build but now with ions included. All the build options
        # have to be forwarded or the final system is built with different settings.
        return build(newmol,
                     ff=ff,
                     topo=topo,
                     param=param,
                     prefix=prefix,
                     outdir=outdir,
                     caps={},
                     ionize=False,
                     execute=execute,
                     saltconc=saltconc,
                     disulfide=disulfide,
                     teleap=teleap,
                     teleapimports=teleapimports,
                     atomtypes=atomtypes,
                     offlibraries=offlibraries,
                     gbsa=gbsa,
                     igb=igb)

    tmpbonds = molbuilt.bonds
    molbuilt.bonds = []  # Removing the bonds to speed up writing
    molbuilt.write(os.path.join(outdir, prefix + '.pdb'))
    molbuilt.bonds = tmpbonds  # Restoring the bonds
    detectCisPeptideBonds(molbuilt)  # Warn in case of cis bonds
    return molbuilt
def autoSegment(
    mol,
    sel="all",
    basename="P",
    spatial=True,
    spatialgap=4.0,
    fields=("segid", ),
    field=None,
    _logger=True,
):
    """Split a selection at resid discontinuities and label every fragment

    Works on a copy of `mol`. Each fragment found in `sel` receives its own
    segid and/or chain, depending on `fields`.

    !!!WARNING!!! If you want to use atom selections like 'protein' or 'fragment',
    use this function on a Molecule containing only protein atoms, otherwise the protein selection can fail.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule object
    sel : str
        Atom selection string on which to check for gaps.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    basename : str
        Preferred prefix for the segment ids. The first entry of `[basename] + segid_alphabet`
        that is not the first character of any non-empty segid outside the selection is used.
        Segments are numbered from 0, i.e. for 'P' they are named 'P0', 'P1', ...
    spatial : bool
        If True, a resid discontinuity is ignored (the two fragments are joined) when the CA atoms
        on both sides of it are closer than `spatialgap`. A discontinuity where either side has no
        CA atom is always kept.
    spatialgap : float
        The size of a spatial gap which validates a discontinuity (A)
    fields : tuple of str
        Fields in which to set the segments. Must be a combination of "chain", "segid" or only one of them.
    field : str
        If a string is given it replaces `fields`: "both" stands for ("chain", "segid"),
        any other string is used as the single field.
    _logger : bool
        Set to False to silence the per-segment log messages.

    Returns
    -------
    newmol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        A copy of `mol` with the requested fields rewritten. Labels are written over the whole atom
        index range from the first to the last atom of a fragment.

    Example
    -------
    >>> newmol = autoSegment(mol, "chain B", "P", fields=("chain", "segid"))
    """
    from moleculekit.util import sequenceID

    # A string `field` takes precedence over `fields`
    if isinstance(field, str):
        fields = ("chain", "segid") if field == "both" else (field, )

    mol = mol.copy()

    sel_idx = mol.atomselect(sel, indexes=True)
    resid_steps = np.diff(mol.resid[sel_idx].copy())
    # Positions inside sel_idx of the last atom before each discontinuity
    gap_at = np.flatnonzero((resid_steps != 1) & (resid_steps != 0))

    seg_first = [sel_idx[0]] + sel_idx[gap_at + 1].tolist()
    seg_last = sel_idx[gap_at].tolist() + [sel_idx[-1]]

    # Only labels used by atoms outside the selection are considered taken
    in_sel = mol.atomselect(sel)
    taken_chains = set(mol.chain[~in_sel])
    free_chains = [c for c in chain_alphabet if c not in taken_chains]
    taken_prefixes = {sid[0] for sid in mol.segid[~in_sel] if sid != ""}
    free_prefixes = [
        p for p in [basename] + segid_alphabet if p not in taken_prefixes
    ]
    basename = free_prefixes[0]

    # Without any discontinuity the whole selection is a single segment
    if len(gap_at) == 0:
        if "chain" in fields:
            mol.set("chain", free_chains[0], sel)
        if "segid" in fields:
            mol.set("segid", basename + "0", sel)
        return mol

    if spatial:
        orig_resid = mol.resid.copy()
        # Temporarily renumber the residues so that a resid comparison below
        # picks out the residue next to the gap only
        mol.set("resid", sequenceID(mol.resid))

        is_ca = mol.name == "CA"
        fake_gaps = []
        gap_sides = zip(seg_first[1:], seg_last[:-1])
        for gap_no, (after_gap, before_gap) in enumerate(gap_sides):
            ca_before = mol.coords[(mol.resid == mol.resid[before_gap]) & is_ca]
            ca_after = mol.coords[(mol.resid == mol.resid[after_gap]) & is_ca]
            if len(ca_before) == 0 or len(ca_after) == 0:
                continue
            delta = ca_before.squeeze() - ca_after.squeeze()
            if np.sqrt(np.sum(delta**2)) < spatialgap:
                fake_gaps.append(gap_no)

        # Dropping the start after and the end before a fake gap merges the two fragments
        fake_gaps = np.array(fake_gaps, dtype=int)
        seg_first = np.delete(seg_first, fake_gaps + 1)
        seg_last = np.delete(seg_last, fake_gaps)

        mol.set("resid", orig_resid)  # Back to the original numbering

    for i, (s, e) in enumerate(zip(seg_first, seg_last)):
        stop = e + 1
        if "chain" in fields:
            # Chain symbols are reused cyclically once the free ones run out
            newchainid = free_chains[i % len(free_chains)]
            if _logger:
                logger.info(
                    f"Set chain {newchainid} between resid {mol.resid[s]} and {mol.resid[e]}."
                )
            mol.chain[s:stop] = newchainid
        if "segid" in fields:
            newsegid = basename + str(i)
            if _logger:
                logger.info(
                    f"Created segment {newsegid} between resid {mol.resid[s]} and {mol.resid[e]}."
                )
            mol.segid[s:stop] = newsegid

    return mol
def autoSegment(
    mol,
    sel="all",
    basename="P",
    spatial=True,
    spatialgap=4.0,
    field="segid",
    mode="alphanumeric",
    _logger=True,
):
    """Detects resid gaps in a selection and assigns an incrementing segid and/or a new chain to each fragment

    !!!WARNING!!! If you want to use atom selections like 'protein' or 'fragment',
    use this function on a Molecule containing only protein atoms, otherwise the protein selection can fail.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule object
    sel : str
        Atom selection string on which to check for gaps.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    basename : str
        The basename for segment ids. Segments are numbered from 0, so if given 'P' the segments
        are named 'P0', 'P1', ...
    spatial : bool
        Only considers a discontinuity in resid as a gap if the CA atoms have distance more than `spatialgap` Angstrom.
        A discontinuity where either side has no CA atom is always kept.
    spatialgap : float
        The size of a spatial gap which validates a discontinuity (A)
    field : str
        Field to fix. Can be "segid" (default), "chain", or "both". Only the requested
        field(s) are modified.
    mode : str
        Passed to `_getChainAlphabet` to choose the symbols used for new chain ids (and for the
        segid suffix when the selection has no gap). One of 'numeric', 'alphabetic' or 'alphanumeric'.
    _logger : bool
        Set to False to silence the per-segment log messages.

    Returns
    -------
    newmol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        A copy of `mol` with modified segids and/or chains. Labels are written over the whole atom
        index range from the first to the last atom of a fragment.

    Raises
    ------
    RuntimeError
        If a segid about to be created already exists in the molecule.
    IndexError
        If the selection is empty, or if chains are requested and there are more fragments
        than unused chain symbols.

    Example
    -------
    >>> newmol = autoSegment(mol,'chain B','P')
    """
    from moleculekit.util import sequenceID

    mol = mol.copy()

    idx = mol.atomselect(sel, indexes=True)
    rid = mol.resid[idx].copy()
    residiff = np.diff(rid)
    # Points to the index before the gap!
    gappos = np.where((residiff != 1) & (residiff != 0))[0]

    # Chain symbols of the alphabet selected by `mode`, minus the ones already used in the molecule
    used_chains = set(mol.chain)
    chain_alphabet = _getChainAlphabet(mode)
    available_chains = [x for x in chain_alphabet if x not in used_chains]

    idxstartseg = [idx[0]] + idx[gappos + 1].tolist()
    idxendseg = idx[gappos].tolist() + [idx[-1]]

    fix_segid = field in ("segid", "both")
    fix_chain = field in ("chain", "both")

    if fix_segid:
        # After this the selection carries the bare basename, which can never
        # collide with the numbered segids checked for duplicates further down.
        # Segids are left untouched when only chains were requested.
        mol.set("segid", basename, sel)

    if len(gappos) == 0:
        # No discontinuity: the whole selection is one segment. Honour `field`
        # here as well instead of always (and only) rewriting the segid.
        if fix_segid:
            mol.set("segid", basename + chain_alphabet[0], sel)
        if fix_chain:
            mol.set("chain", available_chains[0], sel)
        return mol

    if spatial:
        residbackup = mol.resid.copy()
        # Assigning unique resids to be able to do the distance selection
        mol.set("resid", sequenceID(mol.resid))

        todelete = []
        for i, (s, e) in enumerate(zip(idxstartseg[1:], idxendseg[:-1])):
            # Get the carbon alphas of the residues on both sides of the gap
            ca1coor = mol.coords[(mol.resid == mol.resid[e]) & (mol.name == "CA")]
            ca2coor = mol.coords[(mol.resid == mol.resid[s]) & (mol.name == "CA")]
            if len(ca1coor) and len(ca2coor):
                dist = np.sqrt(np.sum((ca1coor.squeeze() - ca2coor.squeeze()) ** 2))
                if dist < spatialgap:
                    todelete.append(i)
        todelete = np.array(todelete, dtype=int)

        # Join the non-real gaps into segments
        idxstartseg = np.delete(idxstartseg, todelete + 1)
        idxendseg = np.delete(idxendseg, todelete)

        mol.set("resid", residbackup)  # Restoring the original resids

    for i, (s, e) in enumerate(zip(idxstartseg, idxendseg)):
        # Fixup segid
        if fix_segid:
            newsegid = basename + str(i)
            if np.any(mol.segid == newsegid):
                raise RuntimeError(
                    f"Segid {newsegid} already exists in the molecule. Please choose different prefix."
                )
            if _logger:
                logger.info(
                    f"Created segment {newsegid} between resid {mol.resid[s]} and {mol.resid[e]}."
                )
            mol.segid[s : e + 1] = newsegid
        # Fixup chain
        if fix_chain:
            newchainid = available_chains[i]
            if _logger:
                logger.info(
                    f"Set chain {newchainid} between resid {mol.resid[s]} and {mol.resid[e]}."
                )
            mol.chain[s : e + 1] = newchainid

    return mol
def sequenceStructureAlignment(mol,
                               ref,
                               molseg=None,
                               refseg=None,
                               maxalignments=10,
                               nalignfragment=1):
    """ Aligns two structures by their longest sequence alignment

    The sequences of the chosen segments are globally aligned, the longest
    contiguous stretches without alignment gaps are located, and a copy of
    `mol` is structurally aligned onto `ref` using the CA atoms of those stretches.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule we want to align
    ref : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The reference Molecule to which we want to align
    molseg : str
        The segment of `mol` we want to align. If None and `mol` has a single segment that segment is used;
        if None and `mol` has several segments, the sequence returned by `mol.sequence(noseg=True)` is used.
    refseg : str
        The segment of `ref` we want to align to. Defaults are resolved as for `molseg`.
    maxalignments : int
        The maximum number of alignments we want to produce
    nalignfragment : int
        The number of fragments used for the alignment. The longest gap-free fragments are taken;
        fewer are used if the alignment contains fewer fragments.

    Returns
    -------
    mols : list
        A list of Molecules each containing a different alignment.

    Raises
    ------
    ImportError
        If the biopython package cannot be imported.
    RuntimeError
        If `mol` or `ref` contains more than one distinct non-empty alternative location.
    """
    from moleculekit.util import ensurelist, sequenceID
    try:
        from Bio import pairwise2
    except ImportError as e:
        # Keep the original failure attached as the cause of the friendlier message
        raise ImportError(
            'You need to install the biopython package to use this function. Try using `conda install biopython`.'
        ) from e
    from Bio.SubsMat import MatrixInfo as matlist

    if len([x for x in np.unique(mol.altloc) if len(x)]) > 1:
        raise RuntimeError(
            'Alternative atom locations detected in `mol`. Please remove these before calling this function.'
        )
    if len([x for x in np.unique(ref.altloc) if len(x)]) > 1:
        raise RuntimeError(
            'Alternative atom locations detected in `ref`. Please remove these before calling this function.'
        )

    seqmol = mol.sequence()
    seqref = ref.sequence()

    if molseg is None and len(seqmol) > 1:
        logger.info(
            'Multiple segments ({}) detected in `mol`. Alignment will be done on all. Otherwise please specify which segment to align.'
            .format(list(seqmol.keys())))
        # NOTE(review): presumably noseg=True returns a single whole-protein
        # entry (see the 'protein' case in getSegIdx) - confirm against Molecule.sequence
        seqmol = mol.sequence(noseg=True)
        molseg = list(seqmol.keys())[0]
    if refseg is None and len(seqref) > 1:
        logger.info(
            'Multiple segments ({}) detected in `ref`. Alignment will be done on all. Otherwise please specify which segment to align.'
            .format(list(seqref.keys())))
        seqref = ref.sequence(noseg=True)
        refseg = list(seqref.keys())[0]

    # Nothing was specified and there is only one segment: use it. Without this
    # the key stays None and the sequence lookup below fails with a KeyError.
    if molseg is None:
        molseg = list(seqmol.keys())[0]
    if refseg is None:
        refseg = list(seqref.keys())[0]

    def getSegIdx(m, mseg):
        # Indices of the CA atoms which belong to the selected segments
        if isinstance(mseg, str) and mseg == 'protein':
            msegidx = m.atomselect('protein and name CA')
        else:
            msegidx = np.zeros(m.numAtoms, dtype=bool)
            for seg in ensurelist(mseg):
                msegidx |= (m.segid == seg) & (m.name == 'CA')
        return np.where(msegidx)[0]

    molsegidx = getSegIdx(mol, molseg)
    refsegidx = getSegIdx(ref, refseg)

    # Create fake residue numbers for the selected segment
    molfakeresid = sequenceID(
        (mol.resid[molsegidx], mol.insertion[molsegidx], mol.chain[molsegidx]))
    reffakeresid = sequenceID(
        (ref.resid[refsegidx], ref.insertion[refsegidx], ref.chain[refsegidx]))

    # Global alignment scored with the BLOSUM62 dictionary and without gap penalties
    alignments = pairwise2.align.globaldx(seqref[refseg], seqmol[molseg],
                                          matlist.blosum62)
    numaln = len(alignments)

    if numaln > maxalignments:
        logger.warning(
            '{} alignments found. Limiting to {} as specified in the `maxalignments` argument.'
            .format(numaln, maxalignments))

    alignedstructs = []
    for i in range(min(maxalignments, numaln)):
        refaln = np.array(list(alignments[i][0]))
        molaln = np.array(list(alignments[i][1]))

        # By doing cumsum we calculate how many letters were before the current letter (i.e. residues before current)
        residref = np.cumsum(refaln != '-') - 1  # Start them from 0
        residmol = np.cumsum(molaln != '-') - 1  # Start them from 0

        # Find the regions where both sequences are aligned without a gap.
        # Padding with False on both ends makes every region produce one
        # rising and one falling edge in the difference signal.
        dsig = np.hstack(
            ([False], (refaln != '-') & (molaln != '-'), [False])).astype(int)
        dsigdiff = np.diff(dsig)
        startIndex = np.where(dsigdiff > 0)[0]
        endIndex = np.where(dsigdiff < 0)[0]
        duration = endIndex - startIndex

        # Rank the fragments from longest to shortest. The stable sort keeps
        # the earliest fragment first among equal lengths, and selecting by
        # rank (instead of looking fragments up by their length) prevents
        # equally long fragments from being picked more than once.
        longest = np.argsort(-duration,
                             kind='stable')[:max(nalignfragment, 0)]
        _list_starts = [startIndex[k] for k in longest]
        _list_finish = [endIndex[k] for k in longest]

        # Get the "resids" of the aligned residues only
        refalnresid = np.concatenate([
            residref[start:finish]
            for start, finish in zip(_list_starts, _list_finish)
        ])
        molalnresid = np.concatenate([
            residmol[start:finish]
            for start, finish in zip(_list_starts, _list_finish)
        ])

        # Map the aligned sequence positions back to atom indices
        refidx = []
        for r in refalnresid:
            refidx += list(refsegidx[reffakeresid == r])
        molidx = []
        for r in molalnresid:
            molidx += list(molsegidx[molfakeresid == r])

        molboolidx = np.zeros(mol.numAtoms, dtype=bool)
        molboolidx[molidx] = True
        refboolidx = np.zeros(ref.numAtoms, dtype=bool)
        refboolidx[refidx] = True

        # First and last resid of every fragment, only used for the log message
        # (the end index is exclusive, hence r - 1)
        start_residues = np.concatenate([
            mol.resid[molsegidx[molfakeresid == residmol[r]]]
            for r in _list_starts
        ])
        finish_residues = np.concatenate([
            mol.resid[molsegidx[molfakeresid == residmol[r - 1]]]
            for r in _list_finish
        ])
        logger.info(
            'Alignment #{} was done on {} residues: mol segid {} resid {}'.
            format(
                i, len(refalnresid),
                np.unique(mol.segid[molidx])[0], ', '.join([
                    '{}-{}'.format(s, f)
                    for s, f in zip(start_residues, finish_residues)
                ])))

        alignedmol = mol.copy()
        alignedmol.align(molboolidx, ref, refboolidx)
        alignedstructs.append(alignedmol)

    return alignedstructs