def get_topologies(cls, forcefield, water_model):
    """
    Gets the default AMBER leaprc files for a force field and water model.

    Args:
        forcefield (str): Force field name. The AMBERHOME sanity check is
            only performed when this is "amber".
        water_model (str): One of "tip3", "tip4e" or "spce"

    Returns:
        (list of str): Absolute paths to the leaprc files, located in
            $AMBERHOME/dat/leap/cmd

    Raises:
        DabbleError: if AMBERHOME is unset for the amber forcefield, if the
            water model is unsupported, or if a leaprc file is missing
    """
    if forcefield == "amber" and not os.environ.get("AMBERHOME"):
        raise DabbleError("AMBERHOME must be set to use AMBER "
                          "forcefield!")

    # All default leaprc files live in the leap command directory
    leap_cmd_dir = os.path.join(os.environ["AMBERHOME"], "dat", "leap", "cmd")

    water_leaprc = {
        "tip3": "leaprc.water.tip3p",
        "tip4e": "leaprc.water.tip4pew",
        "spce": "leaprc.water.spce",
    }
    if water_model not in water_leaprc:
        raise DabbleError("Water model '%s' not supported with AMBER"
                          % water_model)

    leaprc_names = [
        "leaprc.protein.ff14SB",
        "leaprc.lipid14",
        "leaprc.gaff2",
        water_leaprc[water_model],
    ]

    # Resolve each file in order, stopping at the first one that is missing
    resolved = []
    for name in leaprc_names:
        path = os.path.abspath(os.path.join(leap_cmd_dir, name))
        if not os.path.isfile(path):
            raise DabbleError("AMBER forcefield files '%s' not found\n"
                              "Dabble requires >= AmberTools16" % name)
        resolved.append(path)

    return resolved
def _check_psf_output(self):
    """
    Verifies that every atom in the written pdb received coordinates.

    psfgen can finish without errors or warnings while leaving unmatched
    atoms at (0,0,0); those atoms are selected here with "occupancy=-1"
    and reported.

    Raises:
        DabbleError: if the output pdb is missing or contains atoms
            selected by "occupancy=-1"
    """
    pdbfile = '%s.pdb' % self.outprefix

    # Nothing to inspect if the file was never produced
    if not os.path.isfile(pdbfile):
        raise DabbleError("\nERROR: psf file failed to write.\n"
                          " Please see log above.\n")

    # Load the output into VMD and look for atoms flagged as unset
    molid = molecule.load('pdb', pdbfile)
    unmatched = atomsel("occupancy=-1", molid=molid)

    if unmatched:
        report = ["\nERROR: Couldn't find the following atoms.\n"]
        report.extend("\t%s%s:%s\n" % atom for atom in
                      zip(unmatched.resname, unmatched.resid, unmatched.name))
        report.append("Check if they are present in the original structure.\n")
        raise DabbleError("".join(report))

    print("\nChecked output pdb/psf has all atoms present "
          "and correct.\n")
def _set_water_names(self):
    """
    Renames all water residues and their atoms to match the water model.

    This is done directly rather than through the GraphMatcher because
    waters can carry a fake bond.

    Raises:
        DabbleError: if the water model has no entry in WATER_NAMES, or if
            its residue name is not defined by the loaded topologies
    """
    model = self.water_model
    if model not in self.WATER_NAMES:
        raise DabbleError("Unsupported water model '%s' with forcefield "
                          "'%s'" % (model, self.forcefield))

    target_resname = self.WATER_NAMES[model]
    if target_resname not in self.matcher.known_res:
        raise DabbleError("Water resname '%s' for model '%s' not defined "
                          "in topology files" % (target_resname, model))

    # Crystal waters may be called HOH etc, so collect them by residue
    water_residues = set(atomsel("water").residue)
    if not water_residues:
        # No water in the system, nothing to rename
        return

    watsel = "residue %s" % ' '.join(map(str, water_residues))
    hydrogen_names = self.WATER_H_NAMES * len(water_residues)

    atomsel(watsel).resname = target_resname
    atomsel("%s and noh" % watsel).name = self.WATER_O_NAME
    atomsel("%s and not noh" % watsel).name = hydrogen_names
def get_disulfide(self, selection, molid):
    """
    Tests whether a residue is a disulfide-bonded cysteine and, if so,
    produces its AMBER (CYX) naming.

    Args:
        selection (VMD atomsel): Residue to check
        molid (int): VMD molecule ID used to look up the bonded partner

    Returns:
        resnames (dict int -> str): Residue name translation dictionary
        atomnames (dict int -> str): Atom name translation dictionary
        conect (int): Residue this one is connected to
        All three are None if the residue is not a disulfide cysteine.

    Raises:
        DabbleError: if CYX is not defined by the topologies, or if none
            of the three extra-residue atoms is a sulfur
    """
    not_disulfide = (None, None, None)
    rgraph, _unused = self.parse_vmd_graph(selection)

    # The CYX template has to be available to match against
    cyx_graph = self.known_res.get("CYX")
    if not cyx_graph:
        raise DabbleError("CYX undefined. Check forcefields!")

    # A disulfide cysteine has exactly 3 atoms bonded outside the residue
    externs = self.get_extraresidue_atoms(selection)
    if len(externs) != 3:
        return not_disulfide

    # In AMBER format CYX should be a subgraph of this residue, since the
    # only difference is the _join bond
    matcher = isomorphism.GraphMatcher(rgraph, cyx_graph,
                                       node_match=self._check_atom_match)
    if not matcher.subgraph_is_isomorphic():
        return not_disulfide
    # TODO: Check there's only one match
    match = next(matcher.match())

    resmatch, nammatch = self._get_names_from_match(cyx_graph, match)

    # The partner cysteine is whichever external atom is a sulfur
    sulfurs = []
    for index in externs:
        if atomsel("index %d" % index, molid=molid).element[0] == "S":
            sulfurs.append(index)
    if not sulfurs:
        raise DabbleError("3 bonded Cys %d isn't a valid disulfide!"
                          % selection.resid[0])

    partner = atomsel("index %d" % sulfurs[0], molid=molid)
    return (resmatch, nammatch, partner.residue[0])
def get_lipid_tails(self, selection, head):
    """
    Obtains a name mapping for both lipid tails in a system given
    a selection describing the lipid and the indices of the head
    group atoms.

    Args:
        selection (VMD atomsel): Selection to pull tails from
        head (list of int): Atom indices in the head group of this lipid.
            Obtain with get_lipid_head function.

    Returns:
        (list of tuples that are dict int->str): Atom index to
            resname matched, atom index to atom name translation
            dictionaries for both tails

    Raises:
        DabbleError: If removing the head does not leave exactly two
            connected components, or if a tail could not be matched.
    """
    resname = selection.resname[0]
    rgraph = self.parse_vmd_graph(selection)[0]
    rgraph.remove_nodes_from(head)

    if nx.number_connected_components(rgraph) != 2:
        raise DabbleError("Incorrect number of tails attached to %s:%s!" %
                          (resname, selection.resid[0]))

    # Build the truncated tail templates once; they do not depend on the
    # tail being matched. Attributes are read via nodes(data=True) because
    # the Graph.node accessor was removed in NetworkX 2.4.
    candidates = []
    for matchname in (_ for _ in self.LIPID_TAILS if self.known_res.get(_)):
        graph = self.known_res.get(matchname)
        truncated = nx.Graph(graph)
        truncated.remove_nodes_from(
            [n for n, attrs in graph.nodes(data=True)
             if attrs["residue"] != "self"])
        candidates.append((graph, truncated))

    taildicts = []
    for component in nx.connected_components(rgraph):
        tgraph = rgraph.subgraph(component)

        # The first template that is isomorphic to this tail wins
        for graph, truncated in candidates:
            matcher = isomorphism.GraphMatcher(
                tgraph, truncated, node_match=self._check_atom_match)
            if matcher.is_isomorphic():
                match = next(matcher.match())
                resmatch, nammatch = self._get_names_from_match(graph, match)
                taildicts.append((resmatch, nammatch))
                break
        else:
            raise DabbleError("Couldn't find a match for tail %s:%s" %
                              (resname, selection.resid[0]))

    return taildicts
def check_out_type(value, outformat, forcefield, hmr=False):
    """
    Determines the output format, either as explicitly requested or from
    the extension of the requested output file name.

    Args:
        value (str): Filename requested
        outformat (str): Format requested, or None to infer from filename
        forcefield (str): Force field requested
        hmr (bool): If hydrogen mass repartitioning is requested

    Returns:
        (str): The output format. An explicit outformat is returned as is,
            without any further checks.

    Raises:
        DabbleError: if the extension is unsupported with this forcefield,
            if HMR is requested for a non-amber inferred format, or if the
            amber forcefield is requested without AMBERHOME set
    """
    if outformat is not None:
        print("Will output files in %s format" % outformat)
        return outformat

    print("Inferring output format from file extension")
    extension = value.rsplit('.')[-1]

    # Formats that do not depend on the forcefield
    direct_formats = {'mae': 'mae', 'pdb': 'pdb', 'dms': 'dms',
                      'dat': 'lammps'}
    # Formats that need a parameterizing forcefield
    parameterized_formats = {'psf': 'charmm', 'prmtop': 'amber'}

    if extension in direct_formats:
        out_fmt = direct_formats[extension]
    elif extension in parameterized_formats \
            and forcefield in ["amber", "charmm", "opls"]:
        out_fmt = parameterized_formats[extension]
    else:
        raise DabbleError("%s is an unsupported format with %s forcefield"
                          % (value, forcefield))

    if hmr and out_fmt != 'amber':
        raise DabbleError("HMR only supported with AMBER outputs!")

    # The amber forcefield needs an AmberTools installation
    if forcefield == "amber" and not os.environ.get("AMBERHOME"):
        raise DabbleError("AMBERHOME must be set to use AMBER forcefields!")

    return out_fmt
def get_parameters(cls, forcefield, water_model):
    """
    Gets the default parameter files for a force field and water model.

    Args:
        forcefield (str): "charmm", "amber" or "opls"
        water_model (str): Water model name, e.g. "tip3", "tip4e", "spce"

    Returns:
        (list of str): Parameter files, each resolved with
            cls._get_forcefield_path. For "amber" the result of
            AmberWriter.get_parameters is returned instead.

    Raises:
        DabbleError: if a non-TIP3 water model is requested with OPLS
        ValueError: if the forcefield is not recognized
    """
    if forcefield == "amber":
        from dabble.param import AmberWriter  # avoid circular dependency
        return AmberWriter.get_parameters(forcefield, water_model)

    if forcefield == "charmm":
        filenames = [
            "par_all36m_prot.prm",
            "par_all36_cgenff.prm",
            "par_all36_lipid.prm",
            "par_all36_carb.prm",
            "par_all36_na.prm",
            "toppar_all36_prot_na_combined.str",
        ]
        # Water and ion parameters; other water models add nothing
        water_files = {
            "tip3": "toppar_water_ions.str",
            "tip4e": "toppar_water_ions_tip4p_ew.str",
            "spce": "toppar_water_ions_spc_e.str",
        }
        water_file = water_files.get(water_model)
        if water_file is not None:
            filenames.append(water_file)

    elif forcefield == "opls":
        if water_model != "tip3":
            raise DabbleError("Only TIP3 water model supported for OPLS")
        filenames = ["opls_aam.prm"]

    else:
        raise ValueError("Invalid forcefield: '%s'" % forcefield)

    return [cls._get_forcefield_path(name) for name in filenames]
def get_topologies(cls, forcefield, water_model):
    """
    Gets the default topology files for a force field and water model.

    Args:
        forcefield (str): "charmm", "opls" or "amber"
        water_model (str): Water model name, e.g. "tip3", "tip4e", "spce"

    Returns:
        (list of str): Topology files, each resolved with
            cls._get_forcefield_path. For "amber" the result of
            AmberWriter.get_topologies is returned instead.

    Raises:
        DabbleError: if a non-TIP3 water model is requested with OPLS
        ValueError: if the forcefield is not recognized
    """
    if forcefield == "amber":
        from dabble.param import AmberWriter  # avoid circular dependency
        return AmberWriter.get_topologies(forcefield, water_model)

    if forcefield == "charmm":
        filenames = [
            "top_all36_caps.rtf",
            "top_all36_cgenff.rtf",
            "top_all36_prot.rtf",
            "top_all36_lipid.rtf",
            "top_all36_carb.rtf",
            "top_all36_na.rtf",
            "toppar_all36_prot_na_combined.str",
            "toppar_all36_prot_fluoro_alkanes.str",
        ]
        # Water and ion topologies; other water models add nothing
        water_files = {
            "tip3": "toppar_water_ions.str",
            "tip4e": "toppar_water_ions_tip4p_ew.str",
            "spce": "toppar_water_ions_spc_e.str",
        }
        water_file = water_files.get(water_model)
        if water_file is not None:
            filenames.append(water_file)

    elif forcefield == "opls":
        if water_model != "tip3":
            raise DabbleError("Only TIP3 water model supported for OPLS")
        filenames = ["opls_aam.rtf", "opls_aam_caps.rtf"]

    else:
        raise ValueError("Invalid forcefield: '%s'" % forcefield)

    return [cls._get_forcefield_path(name) for name in filenames]
def get_topologies(cls, forcefield, water_model):
    """
    Gets the topologies used for a GROMACS output with a given force field.

    Amber, Charmm and OPLS systems are parameterized in their own format
    and converted afterwards, so the topologies come from the matching
    writer class.

    Args:
        forcefield (str): "charmm", "amber" or "opls"
        water_model (str): Water model name, passed through to the writer

    Returns:
        (list of str): Topologies from CharmmWriter (charmm, opls) or
            AmberWriter (amber)

    Raises:
        DabbleError: if the forcefield is anything else
    """
    # OPLS shares the CHARMM-format machinery
    if forcefield in ("charmm", "opls"):
        return CharmmWriter.get_topologies(forcefield, water_model)

    if forcefield == "amber":
        return AmberWriter.get_topologies(forcefield, water_model)

    # No forcefields really ship with gromacs right now because
    # I found an error in the OPLS AA/M gromacs implementation and
    # they won't respond to my emails. Native GROMACS forcefield
    # directories (oplsaam.ff for opls, gromos54a7.ff for gromos)
    # are therefore not used.
    raise DabbleError("Unsupported forcefield %s" % forcefield)
def get_names(self, selection, print_warning=False):
    """
    Finds the residue name and atom names for a selection.

    Runs the generic matching from the parent class, then collapses the
    per-atom residue names to a single one, since CHARMM has no concept
    of a unit and defines one named residue per topology entry.

    Args:
        selection (VMD atomsel): Selection to rename
        print_warning (bool): Debug output

    Returns:
        (str) resname matched
        (dict int->str) translation dictionary from index to atom name
        Both are None if the parent matcher found nothing.

    Raises:
        DabbleError: if more than one residue name is matched
    """
    parent = super(CharmmMatcher, self)
    resnames, atomnames = parent.get_names(selection, print_warning)
    if not resnames:
        return (None, None)

    # A single CHARMM residue has to map onto exactly one residue name
    distinct = set(resnames.values())
    if len(distinct) <= 1:
        return (distinct.pop(), atomnames)

    raise DabbleError("More than one residue name was returned as "
                      "belonging to a single residue in CHARMM matching."
                      " Not sure how this happened; something is really "
                      "really wrong. Residue was: %s:%d" %
                      (selection.resname[0], selection.resid[0]))
def _parse_atp(self, filename): """ Parses an atom types definition file, populating the elements table. Args: filename (str): .atp file to parse Returns: True on success """ with open(filename, 'r') as fileh: lines = fileh.readlines() for line in lines: line = line.strip() tokens = [i.strip(" \t\n") for i in line.split()] if not tokens or not tokens[0]: continue # Comment lines start with ';' if tokens[0][0] == ";": continue try: element = self.get_element(float(tokens[1])) except: raise DabbleError("Problem parsing line:\n%s" % line) if self.nodenames.get(tokens[0]): logging.info("Already have element %s defined", element) else: self.nodenames[tokens[0]] = element return True
def _assign_elements(self, graph): """ Assigns elements to parsed in residues. Called after all topology files are read in. Element "Any" is assigned to atoms from other residues (+- atoms), since these are only defined by name. Args: graph (networkx graph): The graph to assign elements to Raises: ValueError if an atom type can't be assigned an element """ # Now that all atom and mass lines are read, get the element for each atom for node, data in graph.nodes(data=True): if data.get('residue') != "self": element = "Any" else: element = self.nodenames.get(data.get('type')) if not element: self.write_dot(graph, "invalid_type.dot") raise DabbleError("Unknown atom type %s, name '%s'.\nDumping " "graph as invalid_type.dot" % (data.get("type"), node)) data['element'] = element
def _rename_atoms_amber(self):
    """
    Renames all non-water, non-lipid residues and their atoms to the names
    used in the provided topologies.

    Water is renamed first. Each remaining residue is matched on its own;
    if that fails it is retried as a residue linked to another one. When
    everything is renamed, the user field of all atoms is set to 1.0.

    Returns:
        (set): The value returned by the matcher's get_linkage as the
            bonded partner for each noncanonically linked residue

    Raises:
        DabbleError: if a residue definition could not be found. The
            residue graph is dumped to rgraph.dot first.
    """
    self._set_water_names()

    nonlipid_sel = "not (water or %s)" % self.lipid_sel
    pending = set(atomsel(nonlipid_sel, molid=self.molid).residue)
    total = len(pending)
    conect = set()  # Atom indices bound to noncanonical residues

    while pending:
        # Progress indicator every 500 residues
        remaining = len(pending)
        if remaining % 500 == 0:
            print("Renaming residues.... %.0f%% \r"
                  % (100. - 100 * remaining / float(total)), flush=True)

        residue = pending.pop()
        sel = atomsel("residue %s" % residue)
        resnames, atomnames = self.matcher.get_names(sel, print_warning=False)

        if not resnames:
            # Not a standalone residue, see if it is linked to another one
            resnames, atomnames, other = self.matcher.get_linkage(
                sel, self.molid)

            if not resnames:
                rgraph = self.matcher.parse_vmd_graph(sel)[0]
                self.matcher.write_dot(rgraph, "rgraph.dot")
                raise DabbleError(
                    "ERROR: Could not find a residue definition "
                    "for %s:%s" % (sel.resname[0], sel.resid[0]))

            print(
                "\tBonded residue: %s:%d -> %s"
                % (sel.resname[0], sel.resid[0], list(resnames.values())[0]))
            conect.add(other)

        self._apply_naming_dictionary(resnames=resnames, atomnames=atomnames)

    # Mark every atom as handled
    atomsel('all').user = 1.0
    print("\n", flush=True)
    return conect
def __init__(self, molid, **kwargs):
    """
    Creates a CHARMM writer

    Args:
        molid (int): VMD molecule ID of system to write
        tmp_dir (str): Directory for temporary files. Defaults to "."
        lipid_sel (str): Lipid selection string. Defaults to "lipid"
        hmr (bool): If hydrogen masses should be repartitioned. Defaults
            to False
        forcefield (str): Forcefield to use. Defaults to "charmm"
        water_model (str): Water model to use. Defaults to "tip3"
        extra_topos (list of str): Additional topology (.str, .off, .lib)
            to include.
        extra_params (list of str): Additional parameter sets (.str,
            .frcmod)
        override_defaults (bool): If set, omits default forcefield
            parameters.
        debug_verbose (bool): Prints additional output, like from psfgen.

    Raises:
        DabbleError: if HMR is requested with a charmm forcefield
    """
    # Parent stores the common options on self
    super(CharmmWriter, self).__init__(molid, **kwargs)

    # psf generator object
    self.psfgen = PsfGen()

    # Forcefield defaults
    self.forcefield = kwargs.get("forcefield", "charmm")
    self.water_model = kwargs.get("water_model", "tip3")
    self.topologies = self.get_topologies(self.forcefield, self.water_model)
    self.parameters = self.get_parameters(self.forcefield, self.water_model)

    if "charmm" in self.forcefield and self.hmr:
        raise DabbleError("HMR not supported with CHARMM ff yet")

    # Drop the defaults if asked, then add the user supplied files
    # (stored on self by the parent __init__)
    if self.override:
        self.topologies = []
        self.parameters = []
    self.topologies.extend(self.extra_topos)
    self.parameters.extend(self.extra_params)

    # The matcher is only needed with CHARMM format topologies, not when
    # this writer is converting from another format
    uses_charmm_topologies = any(name in self.forcefield
                                 for name in ("charmm", "opls"))
    if uses_charmm_topologies:
        self.matcher = CharmmMatcher(self.topologies)

    # Keep track of segment numbers for protein and other
    self.segint = 0
def _parse_topology(self, filename): """ Parses a gromacs forcefield directory. Reads all atom types from the atomtypes.atp file, parses all .itp topology files, and reads specbonds.dat for special bonds Args: filename (str): The folder to parse (should end in .ff) Returns: True if successful Raises: DabbleError if topology file is malformed in various ways DabbleError if gromacs installation cannot be found """ # If .itp file only, just parse it. Otherwise, expect a directory if not os.path.isdir(filename): if os.path.splitext(filename)[1] == ".itp": return self._parse_itp(filename) raise DabbleError("GROMACS forcefields are specified by a " "directory, got '%s'" % filename) # Ensure atomtypes.atp is present if not os.path.isfile(os.path.join(filename, "atomtypes.atp")): raise DabbleError("atomtypes.atp not present in GROMACS " "forcefield directory '%s'" % filename) # Parse atom types first self._parse_atp(os.path.join(filename, "atomtypes.atp")) for file in os.listdir(filename): ext = os.path.splitext(file)[1] if ext == ".itp": self._parse_itp(os.path.join(filename, file)) elif ext == ".rtp": self._parse_rtp(os.path.join(filename, file)) return True
def write(self, filename):
    """
    Writes the parameter and topology files.

    The system is first written in an intermediate format by the writer
    matching the forcefield (psf for charmm/opls, prmtop for amber) and
    then converted to GROMACS format.

    Args:
        filename (str): File name to write. Gromacs suffix will be added.

    Raises:
        DabbleError: if the forcefield is not charmm, opls or amber
    """
    self.outprefix = filename
    forcefield = self.forcefield

    # Pick the intermediate writer and the conversion that follows it
    if "charmm" in forcefield or "opls" in forcefield:
        writer_class = CharmmWriter
        banner = "Writing intermediate psf"
        convert = self._psf_to_gromacs
    elif "amber" in forcefield:
        writer_class = AmberWriter
        banner = "Writing intermediate prmtop"
        convert = self._amber_to_gromacs
    else:
        # Native GROMACS style for gromos or opls is currently unsupported
        raise DabbleError("Forcefield '%s' not supported for gromacs"
                          % self.forcefield)

        # NOTE(review): everything below is unreachable, it is the disabled
        # native GROMACS path that follows the raise above.
        print("Using the following topology files and/or directories:")
        for top in self.topologies:
            print(" - %s" % os.path.split(top)[1])

        self._set_atom_names()
        self._run_pdb2gmx()

    intermediate = writer_class(molid=self.molid,
                                tmp_dir=self.tmp_dir,
                                lipid_sel=self.lipid_sel,
                                forcefield=self.forcefield,
                                water_model=self.water_model,
                                hmr=self.hmr,
                                extra_topos=self.extra_topos,
                                extra_params=self.extra_params,
                                override_defaults=self.override)
    print(banner)
    intermediate.write(self.outprefix)
    convert()
def __init__(self, topologies):
    """
    Creates a matcher whose known molecules come from the given topology
    files, plus the packaged parameters/hoh.lib water definition.

    Args:
        topologies (list of str): Topology files, handed to the parent
            class initializer

    Raises:
        DabbleError: if the AMBERHOME environment variable is unset or empty
    """
    amberhome = os.environ.get("AMBERHOME")
    if not amberhome:
        raise DabbleError("AMBERHOME must be set to use AmberMatcher")

    # Parent calls parse topologies
    super(AmberMatcher, self).__init__(topologies=topologies)

    # Add the water without TIP3 bond
    water_lib = resource_filename(__name__, "parameters/hoh.lib")
    self._load_off(water_lib)
def load_solute(filename, tmp_dir):
    """
    Loads a molecule input file, guessing the format from the extension.

    mae, dms and mol2 files are loaded directly. pdb files are loaded,
    written out as a temporary mae file in tmp_dir, and reloaded from
    that file so that concatenation will work later.

    Args:
        filename (str): Filename to load
        tmp_dir (str): Directory to put temporary files in

    Returns:
        (int) VMD molecule ID that was loaded

    Raises:
        DabbleError: if the filename is shorter than 3 characters or the
            extension is currently unsupported
    """
    import os  # local import, only needed to close the temporary file handle

    if len(filename) < 3:
        raise DabbleError("Cannot determine filetype of input file '%s'"
                          % filename)

    ext = filename.split(".")[-1]

    # Formats VMD can load as is
    if ext in ('mae', 'dms', 'mol2'):
        return molecule.load(ext, filename)

    if ext == 'pdb':
        # Need to convert to MAE so concatenation will work later.
        # mkstemp hands back an open OS-level descriptor along with the
        # path; only the path is needed, so close it to avoid leaking it.
        handle, temp_mae = tempfile.mkstemp(suffix='.mae',
                                            prefix='dabble_input',
                                            dir=tmp_dir)
        os.close(handle)

        pdbid = molecule.load('pdb', filename)
        # Name the molecule explicitly instead of relying on it being top
        atomsel('all', molid=pdbid).write('mae', temp_mae)
        molecule.delete(pdbid)
        return molecule.load('mae', temp_mae)

    raise DabbleError("Filetype '%s' currently unsupported "
                      "for input protein" % ext)
def write(self, filename):
    """
    Writes the parameter and topology files

    The molecule being written is made the top VMD molecule for the
    duration of the call. The previous top molecule is restored when the
    call ends, including when writing fails.

    Args:
        filename (str): File name to write. File type suffix will be added.

    Raises:
        DabbleError: if the forcefield is not amber, charmm or opls, or
            if the output check fails
    """
    self.outprefix = filename

    # Put our molecule on top
    old_top = molecule.get_top()
    molecule.set_top(self.molid)

    try:
        # Amber forcefield done with AmberWriter then conversion
        if "amber" in self.forcefield:
            # Avoid circular import by doing it here
            from dabble.param import AmberWriter

            prmtopgen = AmberWriter(molid=self.molid,
                                    tmp_dir=self.tmp_dir,
                                    forcefield=self.forcefield,
                                    water_model=self.water_model,
                                    hmr=self.hmr,
                                    lipid_sel=self.lipid_sel,
                                    extra_topos=self.extra_topos,
                                    extra_params=self.extra_params,
                                    override_defaults=self.override,
                                    debug_verbose=self.debug)
            prmtopgen.write(self.outprefix)
            self._prmtop_to_charmm()

        # Charmm and OPLS forcefields both go through psfgen
        elif "charmm" in self.forcefield or "opls" in self.forcefield:
            self._run_psfgen()

        else:
            raise DabbleError("Unsupported forcefield '%s' for CharmmWriter"
                              % self.forcefield)

        # Check output and finish up
        self._check_psf_output()

    finally:
        # Reset top molecule even if something above failed, so the
        # caller's VMD state is not left pointing at our molecule
        molecule.set_top(old_top)
def check_write_ok(filename, out_fmt, overwrite=False):
    """
    Checks if the output files for the requested format exist, and
    raises an error if the current options don't allow overwriting them.

    The file prefix is the filename with its last dot-separated component
    removed. A .mae file is always checked, along with the files belonging
    to the requested output format.

    Args:
        filename (str): Output filename requested
        out_fmt (str): Output format requested. All intermediate
            files involved in writing to this format will be checked for
            existence.
        overwrite (bool): True if overwriting is allowed

    Returns:
        True if overwrite is True. False if overwriting is not allowed
        and none of the checked files exist.

    Raises:
        DabbleError: if overwriting is not allowed and at least one of
            the checked files exists
    """
    if overwrite is True:
        return True

    prefix = '.'.join(filename.split('.')[:-1])

    # Suffixes are listed without a leading dot, since the dot is added
    # when the file name is put together below. 'dms' is accepted next to
    # 'desmond' because that is the name check_out_type infers for .dms.
    format_suffixes = {
        'desmond': ['dms'],
        'dms': ['dms'],
        'pdb': ['pdb'],
        'charmm': ['psf', 'pdb'],
        'amber': ['prmtop', 'inpcrd'],
        'gromacs': ['gro', 'top'],
        'lammps': ['dat'],
    }
    suffixes = ['mae'] + format_suffixes.get(out_fmt, [])

    candidates = ['%s.%s' % (prefix, sfx) for sfx in suffixes]
    exists = [path for path in candidates if os.path.isfile(path)]

    if exists:
        raise DabbleError("\nERROR: The following files exist and would be "
                          "overwritten:\n%s\n\tWon't overwrite unless -O "
                          "specified" % ' '.join(exists))

    return False
def set_cations(molid, element, filter_sel='none'):
    """
    Sets all of the K and Na atoms in a molecule to the given cation

    Args:
        molid (int): VMD molecule ID to consider
        element (str in Na, K): Cation to convert to
        filter_sel (str): VMD atom selection string for atoms to leave
            alone. Atoms matching it are excluded from the conversion.
            The default 'none' excludes nothing.

    Raises:
        DabbleError: if invalid cation specified
    """
    if element not in ['Na', 'K']:
        raise DabbleError("Invalid cation '%s'. "
                          "Supported cations are Na, K" % element)

    # Select within the requested molecule rather than whichever one
    # happens to be top. The selection is copied into a tuple before any
    # atom is modified.
    cations = atomsel('element K Na and not (%s)' % filter_sel, molid=molid)
    for gid in tuple(cations):
        set_ion(molid, gid, element)
def get_net_charge(sel, molid):
    """
    Sums the charge field over an atom selection and rounds it to the
    nearest integer.

    Args:
        sel (str): VMD atom selection to compute the charge of
        molid (int): VMD molecule id to select within

    Returns:
        (int): The rounded net charge of the selection, 0 if the
            selection is empty

    Raises:
        DabbleError: If the total charge is more than 0.05 away from the
            nearest integer
    """
    charges = np.array(atomsel(sel, molid=molid).charge)
    n_atoms = charges.size
    if n_atoms == 0:
        return 0

    print("Calculating charge on %d atoms" % n_atoms)

    # An all-zero charge field usually means charges were never defined
    if (charges == 0).all():
        print("\nWARNING: All charges in selection are zero. "
              "Check the input file has formal charges defined!\n"
              "Selection was:\n%s\n"%sel)
        print(set(charges))

    # The total has to be integral within tolerance
    total = sum(charges)
    nearest = round(total)
    if abs(nearest - total) > 0.05:
        raise DabbleError("Total charge of %f is not integral within a "
                          "tolerance of 0.05. Check your input file."
                          % total)

    return int(nearest)
def _parse_topology(self, filename): #pylint: disable=too-many-branches """ Parses a topology file and pulls out the defined residues into graph representation. First pulls out atom types that are defined and updates nodenames, then pulls out defined residues and updates known_res. Also pulls out known patches as it goes Args: filename (str): The file to parse Returns: True if successful Raises: ValueError if topology file is malformed in various ways """ resname = "" data = "" patch = False with open(filename, 'r') as fileh: for line in fileh: # Remove comments except "special" graphmatcher directives # This directive is only really used to parse the bond on NMA # that attaches to the previous residue, in order for its extra # connection to be properly registered since chamber fails # if a connection is listed twice if "!GraphMatcher:" in line: line = line.replace("!GraphMatcher:", "") if "!" in line: line = line[:line.index("!")] if not line: continue tokens = [i.strip() for i in line.split()] if not tokens: continue # Handle previous data if data and (tokens[0] == "RESI" or tokens[0] == "PRES"): if patch: self.patches[resname] = data else: self.known_res[resname] = self._rtf_to_graph(data, resname) data = "" # Handle new residue definition if tokens[0] == "RESI": resname = tokens[1] # Only warn for too long str files if len(resname) > 4 and filename.split('.')[-1] == "str": raise DabbleError("Residue name '%s' too long for psfgen" " to parse. Max is 4 characters!" % resname) patch = False if self.known_res.get(resname): logging.info("Skipping duplicate residue %s", resname) # TODO define as a different residue name??? # Currently reads in first file's definition, ignores others resname = "_skip" # PRES is a patch elif tokens[0] == "PRES": resname = tokens[1] # prefix with _ so we can tell it's a patch if len(resname) > 10: raise DabbleError("Patch name '%s' too long for psfgen" " to parse. Max is 10 characters." 
% resname) patch = True if self.patches.get(resname): logging.warning("Skipping duplicate patch %s", resname[1:]) # Check for atom definitions elif tokens[0] == "MASS": if self.nodenames.get(tokens[2]): logger.info("Skipping duplicate type %s", tokens[2]) else: self.nodenames[tokens[2]] = \ MoleculeMatcher.get_element(float(tokens[3])) elif resname and resname != "_skip": data += ' '.join(tokens) + '\n' # Write out final residue if data: if patch: self.patches[resname] = data else: self.known_res[resname] = self._rtf_to_graph(data, resname) return True
def get_disulfide(self, selstring, molid): #pylint: disable=too-many-locals
    """
    Checks if the selection corresponds to a cysteine in a disulfide bond.
    Sets the patch line appropriately and matches atom names using
    a subgraph match to the normal cysteine residue

    Args:
        selstring (str): Selection to check
        molid (int): VMD molecule of entire system (needed for disu partner)

    Returns:
        (str, Patch, dict) resname matched, patch object for psfgen,
            name translation dictionary. All three are None if the
            selection is not a disulfide bonded cysteine.

    Raises:
        DabbleError: if none of the three extra-residue atoms is a sulfur
    """
    selection = atomsel(selstring, molid=molid)

    # Check for the 3 join atoms corresponding to the disulfide bonds
    rgraph, _ = self.parse_vmd_graph(selection)
    externs = self.get_extraresidue_atoms(selection)
    if len(externs) != 3:
        return (None, None, None)

    # Check that it is a cysteine in some way shape or form
    # ie that this residue is a subgraph of a cysteine.
    # Node attributes are read through nodes(data=True) because the
    # Graph.node accessor was removed in NetworkX 2.4.
    truncated = nx.Graph(rgraph)
    truncated.remove_nodes_from(
        [n for n, attrs in rgraph.nodes(data=True)
         if attrs["residue"] != "self"])

    matches = {}
    for matchname in self.AMINO_ACIDS:
        graph = self.known_res.get(matchname)
        if not graph:
            continue

        matcher = isomorphism.GraphMatcher(
            graph, truncated,
            node_match=super(CharmmMatcher, self)._check_atom_match)
        if matcher.subgraph_is_isomorphic():
            matches[matchname] = matcher.match()

    if not matches:
        return (None, None, None)

    # Take the largest matching residue, which has to be cysteine
    matchname = max(matches.keys(), key=(lambda x: len(self.known_res[x])))
    if matchname != "CYS":
        return (None, None, None)

    # Invert mapping so it's idx->name. It's currently backwards
    # because of the need to find a subgraph.
    atomnames = dict((v, k) for (k, v) in next(matches[matchname]).items())

    # Now we know it's a cysteine in a disulfide bond
    # Identify which resid and fragment corresponds to the other cysteine
    partners = [n for n in externs
                if atomsel("index %d" % n, molid=molid).element[0] == "S"]
    if not partners:
        raise DabbleError("3 bonded Cys %d isn't a valid disulfide!"
                          % selection.resid[0])
    osel = atomsel("index %d" % partners[0], molid=molid)

    # Order so same DISU isn't listed twice: lower fragment first, and
    # within one fragment the lower resid first (this residue on a tie)
    partner_key = (osel.fragment[0], osel.resid[0])
    own_key = (selection.fragment[0], selection.resid[0])
    if partner_key < own_key:
        first, second = osel, selection
    else:
        first, second = selection, osel

    patch = Patch(name="DISU",
                  segids=[
                      self.get_protein_segname(molid, first.fragment[0]),
                      self.get_protein_segname(molid, second.fragment[0])
                  ],
                  resids=[first.resid[0],
                          second.resid[0]])

    return (matchname, patch, atomnames)
def _parse_topology(self, filename): """ Parses an amber topology file. More specifically, parses a leaprc file. The atom type definitions are in there as "addAtomTypes" command, and the topologies in the files specified with "loadOff" command. Args: filename (str): The file to parse Returns: True if successful Raises: DabbleError if topology file is malformed in various ways """ if ".off" in filename or ".lib" in filename: self._load_off(filename) elif "frcmod" in filename: return self._load_params(filename) elif "leaprc" not in filename: raise DabbleError("AmberMatcher only parses .leaprc, .off, or " ".frcmod topologies! Can't read topology '%s'" % filename) # Set AMBER search path for lib files leapdir = os.path.join(os.environ["AMBERHOME"], "dat", "leap") incmd = "" with open(filename, 'r') as fileh: for line in fileh: if "#" in line: line = line[:line.index("#")] if not line: continue tokens = [i.strip(" \t'\"\n") for i in line.split()] if not tokens: continue # addAtomTypes adds more atoms if not incmd and tokens[0].lower() == "addatomtypes": incmd = "addatomtypes" elif incmd == "addatomtypes": # Line should look like: { "OG" "O" "sp3" } # we need the first 2 things for atom name and element if tokens[0] == "}": # done with atom type definition incmd = "" continue if tokens[0] != "{" or tokens[-1] != "}": raise DabbleError("Malformed line in %s: %s" % (filename, line)) if not tokens[2]: self.pseudoatoms.append(tokens[1]) continue if tokens[2] not in self.MASS_LOOKUP.values() and \ tokens[2] not in self.LEAP_ELEMENTS.values(): raise DabbleError("Unknown element in %s\n: %s" % (filename, tokens[2])) self.nodenames[tokens[1]] = tokens[2] # loadOff loads a topology library # search in current directory first, then libdir elif not incmd and tokens[0].lower() == "loadoff": if len(tokens) < 2: raise DabbleError("Malformed line in %s: %s" % (filename, line)) if os.path.isfile(tokens[1]): self._load_off(tokens[1]) else: self._load_off(os.path.join(leapdir, "lib", 
tokens[1])) # loadamberparamsloads a frcmod file, which # may define ions elif not incmd and tokens[0].lower() == "loadamberparams": if len(tokens) < 2: raise DabbleError("Malformed line in %s: %s" % (filename, line)) if os.path.isfile(tokens[1]): self._load_params(tokens[1]) else: self._load_params( os.path.join(leapdir, "parm", tokens[1])) # can source other leaprc files within this one # search current directory first, then amber one elif not incmd and tokens[0].lower() == "source": if os.path.isfile(tokens[1]): self._parse_topology(tokens[1]) else: self._parse_topology( os.path.join(leapdir, "cmd", tokens[1])) elif incmd: raise DabbleError("Unclosed command in %s" % filename) return True
def get_lipid_head(self, selection):
    """
    Obtains a name mapping for a lipid head group given a selection
    describing a possible lipid.

    Args:
        selection (VMD atomsel): Selection to set names for

    Returns:
        (dict int->str) Atom index to resname matched
        (dict int->str) Atom index to atom name matched up
        (int) Atom index corresponding to - direction tail
        All three are None if no head group matches.

    Raises:
        DabbleError: if the atom joining the - direction tail cannot be
            identified unambiguously
    """
    resname = selection.resname[0]
    rgraph = self.parse_vmd_graph(selection)[0]

    # Check if a lipid head group is part of this selection.
    # Remove _join residues from the head so that subgraph match can
    # be successfully completed
    matches = {}
    for matchname in (_ for _ in self.LIPID_HEADS if self.known_res.get(_)):
        graph = self.known_res.get(matchname)
        truncated = nx.Graph(graph)
        # Node attributes are read through nodes(data=True) because the
        # Graph.node accessor was removed in NetworkX 2.4
        truncated.remove_nodes_from(
            [n for n, attrs in graph.nodes(data=True)
             if attrs["residue"] != "self"])

        matcher = isomorphism.GraphMatcher(
            rgraph, truncated, node_match=self._check_atom_match)
        if matcher.subgraph_is_isomorphic():
            matches[matchname] = next(matcher.match())

    if not matches:
        return (None, None, None)

    # Prefer the largest head group that matched
    matchname = max(matches, key=(lambda x: len(self.known_res[x])))
    match = matches[matchname]
    graph = self.known_res.get(matchname)

    # Get naming dictionaries to return
    resmatch, nammatch = self._get_names_from_match(graph, match)

    # Find atom index on non-truncated graph that corresponds to the
    # - direction join atom. Necessary to figure out the order in which
    # to list the tails.
    minus_neighbors = [edge[1] for edge in graph.edges(nbunch=["-"])]
    minusbnded = [index for index, name in match.items()
                  if name in minus_neighbors]
    if len(minusbnded) != 1:
        raise DabbleError(
            "Could not identify tail attached to lipid %s:%s!" %
            (resname, selection.resid[0]))

    # The tail atom is the one bonded to that atom but outside the head
    minusidx = [_ for _ in atomsel("index %s" % minusbnded[0]).bonds[0]
                if _ not in match]
    if len(minusidx) != 1:
        raise DabbleError(
            "Could not identify tail attached to lipid %s:%s!" %
            (resname, selection.resid[0]))

    return (resmatch, nammatch, minusidx[0])
def _rtf_to_graph(self, data, resname, patch=None): #pylint: disable=too-many-branches
    """
    Parses rtf text to a graph representation. If a graph to patch is
    provided, then patches a copy of that graph with this rtf data.

    Args:
        data (str): The rtf data for this residue or patch
        resname (str): Residue name, from earlier parsing
        patch (networkx graph): The graph to apply patches to,
            or None if just parsing a residue. Will not be modified.

    Returns:
        (networkx graph): Graph representation of molecule, or None
            if it could not be converted (invalid patch)

    Raises:
        DabbleError: if a BOND/DOUBLE line lists an odd number of atoms,
            or a bond cannot be defined while parsing a residue
        ValueError: if a DELETE line is found when no patch was given
    """
    # Build the working graph from the one being patched (empty if None).
    # The keyword for this argument was renamed from "data" to
    # "incoming_graph_data" in networkx 2.1, so it is passed positionally:
    # that works on every release and avoids comparing version strings,
    # which sorts lexicographically (e.g. "10.0" < "2.1").
    graph = nx.Graph(patch)

    for line in data.splitlines():
        tokens = [i.strip().upper() for i in line.split()]

        # Blank lines carry no record; skip them instead of indexing
        # into an empty token list
        if not tokens:
            continue

        # Atoms mean add node to current residue
        if tokens[0] == "ATOM":
            # Patches can change atom type
            # Technically re-adding the node will just change the type and
            # not add a duplicate, but this is more correct and clear.
            if tokens[1] in graph.nodes():
                graph.node[tokens[1]]["type"] = tokens[2]
            else:
                graph.add_node(tokens[1], type=tokens[2],
                               atomname=tokens[1],
                               residue="self",
                               patched=bool(patch))

        # Bond or double means add edge to residue graph
        elif tokens[0] == "BOND" or tokens[0] == "DOUBLE":
            # Keyword plus atom pairs gives an odd token count when valid
            if len(tokens) % 2 == 0:
                raise DabbleError("Unequal number of atoms in bond terms\n"
                                  "Line was:\n%s" % line)
            for txn in range(1, len(tokens), 2):
                node1 = tokens[txn]
                node2 = tokens[txn+1]
                if not _define_bond(graph, node1, node2, bool(patch)):
                    # A patch that cannot be applied is simply invalid for
                    # this residue; for a plain residue it is an error
                    if patch:
                        return None
                    raise DabbleError("Could not bond atoms '%s' - '%s' "
                                      "when parsing rtf file.\n"
                                      "Line was:\n%s"
                                      % (node1, node2, line))

        # Check for atom definitions
        elif tokens[0] == "MASS":
            if self.nodenames.get(tokens[2]):
                logger.info("Skipping duplicate type %s", tokens[2])
            else:
                self.nodenames[tokens[2]] = \
                    MoleculeMatcher.get_element(float(tokens[3]))

        # Patches can delete atoms
        elif tokens[0] == "DELETE" or tokens[0] == "DELE":
            if not patch:
                raise ValueError("DELETE only supported in patches!\n"
                                 "Line was:\n%s" % line)

            # Sometimes delete has a number in front of the atom name
            try:
                if tokens[1] == "ATOM":
                    if tokens[2][0].isdigit():
                        tokens[2] = tokens[2][1:]
                    graph.remove_node(tokens[2])
                elif tokens[1] == "BOND":
                    if tokens[2][0].isdigit():
                        tokens[2] = tokens[2][1:]
                    if tokens[3][0].isdigit():
                        tokens[3] = tokens[3][1:]
                    graph.remove_edge(tokens[2], tokens[3])

            # Atom or bond did not exist, ie this patch is invalid
            except nx.NetworkXError:
                return None

    # Assign resname to all atoms
    nx.set_node_attributes(graph, name="resname", values=resname)

    # If we didn't patch, set the whole residue to unpatched atom attribute
    # If we are patching, new atoms will have that attribute set when
    # they are added.
    if not patch:
        nx.set_node_attributes(graph, name="patched", values=False)

    return graph
def _load_off(self, filename):
    """
    Reads an amber off format library file and stores a graph for each
    unit it defines in the known_res dictionary.

    Args:
        filename (str): The file to parse

    Returns:
        True if successful

    Raises:
        DabbleError: if a bond or external connection refers to an atom
            that has not been defined, or a residue name contains "*"
    """
    # Final component of a section header -> parser state for that section
    section_cmds = (
        ("atoms", "addatoms"),
        ("connectivity", "addbonds"),
        ("connect", "addextrabonds"),
        ("residues", "name"),
    )

    unit = ""
    incmd = ""
    cmdidx = 1  # 1-based position of the current row within its section

    with open(filename, 'r') as fileh:
        for line in fileh:
            if not line:
                continue
            tokens = [tok.strip(" \t\"\n") for tok in line.split()]
            if not (tokens and tokens[0]):
                continue
            first = tokens[0]

            # A header starting with a single "!" opens a new section.
            # The unit name is its second dot-separated field, and the
            # full header text selects what the following rows mean.
            if first[0] == "!" and first[1] != "!":
                unit = first.split('.')[1]
                headers = {"!entry.%s.unit.%s" % (unit, sect): cmd
                           for sect, cmd in section_cmds}
                incmd = headers.get(first, "skip")

                if not self.known_res.get(unit):
                    self.known_res[unit] = nx.Graph()
                graph = self.known_res[unit]
                cmdidx = 1
                continue

            # Atom rows: nodes are keyed by their row number as a string
            if incmd == "addatoms":
                atomname, atomtype = tokens[0], tokens[1]
                # Use the element already known for this type, otherwise
                # fall back to the element index column
                element = self.nodenames.get(atomtype)
                if not element:
                    element = self.LEAP_ELEMENTS.get(int(tokens[6]),
                                                     "Other")
                    # NOTE(review): the lookup above is by atom type but
                    # this stores under the atom name -- looks like it was
                    # meant to be the type; confirm before changing.
                    self.nodenames[atomname] = element
                graph.add_node(str(cmdidx),
                               type=atomtype,
                               element=element,
                               resname=tokens[3],
                               residue=tokens[3],  # index, replaced later
                               atomname=atomname)

            # Bond rows: both ends must already exist as nodes
            elif incmd == "addbonds":
                node1 = graph.node.get(tokens[0])
                node2 = graph.node.get(tokens[1])
                if not (node1 and node2):
                    print(node1, node2)
                    print(graph.node.keys())
                    raise DabbleError(
                        "Can't parse bond for unit %s, file %s\n"
                        "Line was: %s" % (unit, filename, line))
                graph.add_edge(tokens[0], tokens[1])

            # External connection rows, where 0 means no connection.
            # The - row comes before the + row, so the row counter says
            # which of the two this is.
            elif incmd == "addextrabonds" and tokens[0] != "0":
                join = "-" if cmdidx == 1 else "+"
                graph.add_node(join, atomname=join, type="",
                               residue=join, element="_join")
                if not graph.node.get(tokens[0]):
                    raise DabbleError("Can't parse extra residue bond for "
                                      "unit %s, file %s\nLine was: %s"
                                      % (unit, filename, line))
                graph.add_edge(join, tokens[0])

            # Residue rows: swap the placeholder residue index on each
            # matching atom for the real residue name
            elif incmd == "name":
                for nod in graph.nodes():
                    if graph.node[nod].get("residue") != tokens[1]:
                        continue
                    # Sanity check residue name here
                    if "*" in tokens[0]:
                        raise DabbleError(
                            "You have a common error in your "
                            ".off file '%s'.\n The residue name "
                            "is invalid. Please check the first "
                            "field in the unit.residue section."
                            % filename)
                    graph.node[nod]["resname"] = tokens[0]
                    graph.node[nod]["residue"] = "self"

            cmdidx += 1

    return True
def get_linkage(self, selection, molid):
    """
    Checks whether the selection is a residue that is covalently bonded to
    a residue other than its + or - neighbors, and if so matches its atoms
    to a known residue definition by graph isomorphism.

    Args:
        selection (VMD atomsel): Selection to check
        molid (int): VMD molecule ID to look for other bonded residue in

    Returns:
        (tuple): (None, None, None) if no known residue matches or the
            number of unusual bonding partners is not exactly one.
            Otherwise:
            resnames (dict int -> str) Residue name translation dictionary
            atomnames (dict int -> str) Atom name translation dictionary
            partner (int) Index of the externally bonded atom that is
                the unusual partner

    Raises:
        ValueError: if the selection covers more than one resid
        DabbleError: if several residues match equally well and exactly
            one canonical amino acid cannot be picked from them
    """
    # The selection has to be a single resid
    resids = set(selection.resid)
    if len(resids) > 1:
        raise ValueError("Multiple resids in selection: %s" % resids)

    # Atoms outside the selection that it is bonded to
    externs = self.get_extraresidue_atoms(selection)

    # Drop everything that is not part of the residue itself from the
    # selection's graph, otherwise the extra bonded atoms stop any match
    noext, _unused = self.parse_vmd_graph(selection)
    outside = [node for node in noext.nodes()
               if noext.node[node].get("residue") != "self"]
    noext.remove_nodes_from(outside)

    # Compare against every known residue with its join atoms removed
    matches = {}
    for names in self.known_res:
        stripped = self.known_res.get(names).copy()
        stripped.remove_nodes_from([
            node for node in stripped.nodes()
            if stripped.node[node].get("residue") != "self"
        ])
        matcher = isomorphism.GraphMatcher(
            noext, stripped,
            node_match=super(AmberMatcher, self)._check_atom_match)
        if matcher.is_isomorphic():
            matches[names] = matcher.match()

    if not matches:
        self.write_dot(noext, "noext.dot")
        return (None, None, None)

    # Score each candidate by how many more nodes its full definition has
    # than the stripped selection, and keep the lowest scoring ones
    scores = {name: len(self.known_res[name]) - len(noext)
              for name in matches}
    minscore = min(scores.values())
    possible_matches = [name for name in matches
                        if scores[name] == minscore]

    # On a tie, accept only a unique canonical amino acid
    if len(possible_matches) <= 1:
        matchname = possible_matches.pop()
    else:
        canonicals = [name for name in possible_matches
                      if name in self.AMINO_ACIDS]
        if len(canonicals) != 1:
            raise DabbleError("Ambiguous bonded residue %s"
                              % selection.resname[0])
        print("\tPreferring canonical acid %s" % canonicals[0])
        matchname = canonicals.pop()

    # First mapping from the matcher: selection atom index -> template node
    mapping = next(matches[matchname])
    template = self.known_res.get(matchname)

    # Build both naming dictionaries in one pass over the mapping
    nammatch = {}
    resmatch = {}
    for idx in mapping:
        attrs = template.node[mapping[idx]]
        if attrs.get("residue") == "self":
            nammatch[idx] = attrs.get("atomname")
            resmatch[idx] = attrs.get("resname")

    # An external atom is an unusual partner if it is on another chain or
    # is not in the residue directly before or after this one
    residue = selection.residue[0]
    chain = selection.chain[0]
    partners = []
    for num in externs:
        other = atomsel("index %d" % num, molid=molid)
        rid = other.residue[0]
        ch = other.chain[0]
        if ch != chain or rid not in (residue + 1, residue - 1):
            partners.append(num)

    if len(partners) != 1:
        return (None, None, None)
    return (resmatch, nammatch, partners[0])
def _find_residue_in_rtf(self, resname, molid):
    """
    Scans the input topology files to find a name match for the given
    residue name, then pulls out the atoms involved and checks that they
    are all present in the input coordinates, prompting the user to
    correct the names of atoms that could not be matched.

    Args:
        resname (str): Residue name to check
        molid (int): VMD molecule ID

    Returns:
        True if all matching was successful
        False if the residue name cannot be found

    Raises:
        DabbleError: if the input structure has more or fewer distinct
            atom names for this residue than the topology defines
    """
    print("Finding residue name '%s'" % resname)

    # Start out "not found" so an empty topology list returns False
    # instead of hitting an unbound variable
    topo_atoms = None
    for top in self.topologies:
        # Context manager closes the file even when we stop at this one
        with open(top, 'r') as topfile:
            topo_atoms = _get_atoms_from_rtf(text=topfile.readlines(),
                                             resname=resname)
        # Use first definition found of this residue
        if topo_atoms:
            break

    if not topo_atoms:
        return False
    print("Successfully found residue %s in input topologies" % resname)

    # Match up atoms with python sets
    pdb_atoms = set(
        atomsel("resname '%s' and user 1.0" % resname, molid=molid).name)
    pdb_only = pdb_atoms - topo_atoms
    topo_only = topo_atoms - pdb_atoms

    # If uneven number of atoms, there are missing or additional atoms
    if len(pdb_atoms) > len(topo_atoms):
        raise DabbleError(
            "\nERROR: Cannot process modified residue %s.\n"
            "There are %d extra atoms in the input structure "
            "that are undefined in the topology file. The "
            "following atoms could not be matched and may "
            "either be misnamed, or additional atoms:\n"
            "[ %s ]\n" % (resname, len(pdb_atoms) - len(topo_atoms),
                          " ".join(pdb_only)))

    if len(topo_atoms) > len(pdb_atoms):
        raise DabbleError(
            "\nERROR: Cannot process modified residue %s.\n"
            "There are %d missing atoms in the input structure "
            "that are defined in the topology file. The "
            "following atoms could not be matched and may "
            "either be misnamed or deleted atoms:\n"
            "[ %s ]\n" % (resname, len(topo_atoms) - len(pdb_atoms),
                          " ".join(topo_only)))

    # Offer to rename atoms that couldn't be matched to the topology
    if pdb_only:
        print("\nWARNING: Having some trouble with modified residue %s.\n"
              " The following atom names cannot be matched up "
              " to the input topologies. They are probably "
              " misnamed.\n" % resname)
        print(" To help you, here are the atom names that "
              " should be present according to the topology "
              " but were not found:\n")
        print(" [ %s ]\n" % ' '.join([str(t) for t in topo_only]))
        print(" Please enter a valid name for each atom as "
              "it appears or CTRL+D to quit..\n")

        for unmatched in pdb_only:
            print("Unmatched topology names: [ %s ]" % ' '.join(topo_only))

            # Keep asking until the name is one the topology still needs
            newname = input(" %s -> " % unmatched)
            while newname not in topo_only:
                print("'%s' is not an available name in the topology."
                      "Please try again.\n" % newname)
                newname = input(" %s -> " % unmatched)

            # Rename in the molecule being checked (not whichever one is
            # top), then refresh the names that remain unclaimed. The
            # residue name is left alone: only the atom name changes.
            atomsel("resname '%s' and user 1.0 and name '%s'"
                    % (resname, unmatched), molid=molid).name = newname
            pdb_atoms = set(
                atomsel("resname '%s' and user 1.0" % resname,
                        molid=molid).name)
            topo_only = topo_atoms - pdb_atoms

        # Recurse to check that everything is assigned correctly, and
        # report that outcome to the caller
        return self._find_residue_in_rtf(resname, molid)

    print("Matched up all atom names for resname '%s'\n" % resname)
    return True