def test_read_atom(self):
    # Parse one ATOM record (altLoc set, negative resSeq) and check every
    # field exposed on the resulting PdbAtom.
    # NOTE(review): runs of spaces inside this literal look collapsed; PDB
    # ATOM records are column-positional - confirm against the canonical
    # fixture before relying on this test.
    record = "ATOM 41 NH1AARG A -3 12.218 84.840 88.007 0.50 40.76 N "
    parsed = pdb_model.PdbAtom(record)

    # Checked in this order; the first mismatch fails the test
    expected_fields = (
        ('serial', 41),
        ('name', ' NH1'),
        ('altLoc', 'A'),
        ('resName', 'ARG'),
        ('chainID', 'A'),
        ('resSeq', -3),
        ('iCode', None),
        ('x', 12.218),
        ('y', 84.840),
        ('z', 88.007),
        ('occupancy', 0.5),
        ('tempFactor', 40.76),
        ('element', 'N'),
    )
    for attribute, wanted in expected_fields:
        self.assertEqual(getattr(parsed, attribute), wanted)
def test_read_atom4(self):
    # Parse an ATOM record whose trailing columns carry element 'O' and a
    # '1-' charge, and check the parsed fields including charge == -1.
    # NOTE(review): runs of spaces inside this literal look collapsed; PDB
    # ATOM records are column-positional - confirm against the canonical
    # fixture before relying on this test.
    record = "ATOM 183 OD2 ASP A 24 70.534 30.495 41.026 1.00 35.00 O1-"
    parsed = pdb_model.PdbAtom(record)

    # Checked in this order; the first mismatch fails the test
    expected_fields = (
        ('serial', 183),
        ('name', ' OD2'),
        ('altLoc', None),
        ('resName', 'ASP'),
        ('chainID', 'A'),
        ('resSeq', 24),
        ('iCode', None),
        ('x', 70.534),
        ('y', 30.495),
        ('z', 41.026),
        ('occupancy', 1.00),
        ('tempFactor', 35.00),
        ('element', 'O'),
        ('charge', -1),
    )
    for attribute, wanted in expected_fields:
        self.assertEqual(getattr(parsed, attribute), wanted)
def test_read_atom3(self):
    # Parse an ATOM record whose trailing columns carry element 'N' and a
    # '1+' charge, and check the parsed fields including charge == 1.
    # NOTE(review): runs of spaces inside this literal look collapsed; PDB
    # ATOM records are column-positional - confirm against the canonical
    # fixture before relying on this test.
    record = "ATOM 160 NH1 ARG A 21 57.124 31.377 40.357 1.00 35.50 N1+"
    parsed = pdb_model.PdbAtom(record)

    # Checked in this order; the first mismatch fails the test
    expected_fields = (
        ('serial', 160),
        ('name', ' NH1'),
        ('altLoc', None),
        ('resName', 'ARG'),
        ('chainID', 'A'),
        ('resSeq', 21),
        ('iCode', None),
        ('x', 57.124),
        ('y', 31.377),
        ('z', 40.357),
        ('occupancy', 1.00),
        ('tempFactor', 35.50),
        ('element', 'N'),
        ('charge', 1),
    )
    for attribute, wanted in expected_fields:
        self.assertEqual(getattr(parsed, attribute), wanted)
def test_read_atom2(self):
    # Parse an ATOM record that includes a segID ('AA--') ahead of the
    # element symbol and check the parsed fields.
    # NOTE(review): runs of spaces inside these literals (the record and the
    # expected atom name) look collapsed; PDB ATOM records are
    # column-positional - confirm against the canonical fixture.
    record = "ATOM 28 C ALA A 12 -27.804 -2.987 10.849 1.00 11.75 AA-- C "
    parsed = pdb_model.PdbAtom(record)

    # Checked in this order; the first mismatch fails the test
    expected_fields = (
        ('serial', 28),
        ('name', ' C '),
        ('altLoc', None),
        ('resName', 'ALA'),
        ('chainID', 'A'),
        ('resSeq', 12),
        ('iCode', None),
        ('x', -27.804),
        ('y', -2.987),
        ('z', 10.849),
        ('occupancy', 1.00),
        ('tempFactor', 11.75),
        ('segID', 'AA--'),
        ('element', 'C'),
    )
    for attribute, wanted in expected_fields:
        self.assertEqual(getattr(parsed, attribute), wanted)
def match_resseq(targetPdb=None, outPdb=None, resMap=None, sourcePdb=None):
    """Copy targetPdb to outPdb, renumbering ATOM residues through a residue map.

    Exactly one of ``resMap`` and ``sourcePdb`` must be supplied. When only
    ``sourcePdb`` is given the map is built with
    ``residue_map.residueSequenceMap(targetPdb, sourcePdb)``.

    Args:
        targetPdb: path of the PDB file to read.
        outPdb: path of the PDB file to write.
        resMap: object providing ``ref2target(resSeq)``.
        sourcePdb: path used to build the map when ``resMap`` is not given.

    Raises:
        AssertionError: if neither or both of ``resMap``/``sourcePdb`` are given.
        RuntimeError: if a MODEL or ANISOU record is encountered.
    """
    assert sourcePdb or resMap
    assert not (sourcePdb and resMap)

    if not resMap:
        resMap = residue_map.residueSequenceMap(targetPdb, sourcePdb)

    with open(targetPdb, 'r') as target, open(outPdb, 'w') as out:
        for record in target:
            if record.startswith("MODEL"):
                raise RuntimeError("Multi-model file!")
            if record.startswith("ANISOU"):
                raise RuntimeError(
                    "I cannot cope with ANISOU! {0}".format(record))

            if not record.startswith("ATOM"):
                # TER and every other record type are copied through verbatim
                out.write(record)
                continue

            atom = pdb_model.PdbAtom(record)
            mapped = resMap.ref2target(atom.resSeq)
            if mapped != atom.resSeq:
                # Only re-serialise the atom when its number actually changes,
                # so untouched lines keep their original text
                atom.resSeq = mapped
                record = atom.toLine() + "\n"
            out.write(record)
def test_write_atom1(self):
    # Round trip: serialising a parsed ATOM record must reproduce the input
    # NOTE(review): runs of spaces inside this literal look collapsed; PDB
    # ATOM records are column-positional - confirm against the canonical
    # fixture before relying on this test.
    original = "ATOM 41 NH1AARG A -3 12.218 84.840 88.007 0.50 40.76 N "
    rendered = pdb_model.PdbAtom(original).toLine()
    self.assertEqual(rendered, original)
def _iter_residues(atomList):
    """Yield (resSeq, resName, atom_names) per run of consecutive atoms sharing a resSeq.

    ``atom_names`` is the set of whitespace-stripped atom names seen in that
    run. ``atomList`` must be non-empty.
    """
    resSeq = atomList[0].resSeq
    resName = atomList[0].resName
    names = set()
    for atom in atomList:
        if atom.resSeq != resSeq:
            # Close the finished residue *before* recording the new atom so
            # the atom is credited to the residue it belongs to
            yield resSeq, resName, names
            resSeq = atom.resSeq
            resName = atom.resName
            names = set()
        names.add(atom.name.strip())
    yield resSeq, resName, names


def get_info(inpath):
    """Read a PDB and extract as much information as possible into a PdbInfo object.

    Populates, where the corresponding records are found: ``pdbCode``
    (HEADER), ``title`` (first TITLE line), ``resolution`` (REMARK 2),
    ``solventContent`` / ``matthewsCoefficient`` (REMARK 280), ``crystalInfo``
    (CRYST1) and ``models``. For every chain of every model the atoms,
    residue numbers, one-letter sequence and the CA / backbone masks are
    stored on the model; a mask entry is True when the residue is *missing*
    CA (caMask) or any of N, CA, C, O, CB (bbMask).

    Args:
        inpath: path of the PDB file to read.

    Returns:
        The populated ``pdb_model.PdbInfo``. ``models`` is left without a new
        entry when the file has neither MODEL nor ATOM records.
    """
    info = pdb_model.PdbInfo()
    info.pdb = inpath

    currentModel = None
    currentChain = -1

    # One entry per model; each entry is a list of chains, each chain a list of atoms
    modelAtoms = []

    # readline() is used rather than iterating the file because the REMARK
    # handling below consumes extra lines from inside the loop
    with open(inpath, 'r') as f:
        line = f.readline()
        while line:
            if line.startswith('HEADER'):
                info.pdbCode = line[62:66].strip()

            # Only the first TITLE line is kept
            if line.startswith('TITLE') and not info.title:
                info.title = line[10:-1].strip()

            if line.startswith("REMARK"):
                try:
                    numRemark = int(line[7:10])
                except ValueError:
                    line = f.readline()
                    continue

                # Resolution: inspected on the line after the 'REMARK 2' line
                if numRemark == 2:
                    line = f.readline()
                    if line.find("RESOLUTION") != -1:
                        try:
                            info.resolution = float(line[25:30])
                        except ValueError:
                            # RESOLUTION. NOT APPLICABLE.
                            info.resolution = -1

                # Solvent content
                if numRemark == 280:
                    maxread = 5
                    # Clunky - read up to maxread lines to see if we can get the
                    # information we're after. We assume the floats are at the
                    # end of the lines
                    for _ in range(maxread):
                        line = f.readline()
                        if line.find("SOLVENT CONTENT") != -1:
                            try:
                                info.solventContent = float(line.split()[-1])
                            except ValueError:
                                # Leave as None
                                pass
                        if line.find("MATTHEWS COEFFICIENT") != -1:
                            try:
                                info.matthewsCoefficient = float(
                                    line.split()[-1])
                            except ValueError:
                                # Leave as None
                                pass
            # End REMARK

            if line.startswith("CRYST1"):
                try:
                    info.crystalInfo = pdb_model.CrystalInfo(line)
                except ValueError as e:
                    logger.critical(
                        "ERROR READING CRYST1 LINE in file %s\":%s\"\n%s",
                        inpath, line.rstrip(), e)
                    info.crystalInfo = None

            if line.startswith("MODEL"):
                if currentModel:
                    # Make sure we have an id if there is only 1 chain and
                    # none was given. A model without any chain is left as is
                    # instead of failing on chains[0].
                    # NOTE(review): the atoms of such a chain keep chainID
                    # None, so the paranoid check further down compares 'A'
                    # with None - confirm the intended behaviour.
                    if (len(currentModel.chains) == 1
                            and currentModel.chains[0] is None):
                        currentModel.chains[0] = 'A'
                    info.models.append(currentModel)

                # New/first model
                currentModel = pdb_model.PdbModel()
                currentModel.serial = int(line.split()[1])

                currentChain = None
                modelAtoms.append([])

            # Count chains (could also check against the COMPND line if present?)
            if line.startswith('ATOM'):
                atom = pdb_model.PdbAtom(line)

                if not currentModel:
                    # No MODEL record seen: this must be the first model and
                    # there should only be one
                    currentModel = pdb_model.PdbModel()
                    modelAtoms.append([])

                if atom.chainID != currentChain:
                    currentChain = atom.chainID
                    currentModel.chains.append(currentChain)
                    modelAtoms[-1].append([])

                modelAtoms[-1][-1].append(atom)

            # TER and ENDMDL can be ignored: chain changes are picked up from
            # the atoms and new models from the MODEL line
            line = f.readline()

    # End of reading loop so add the last model to the list. Nothing is
    # appended for a file with no MODEL/ATOM records, which would otherwise
    # leave a None entry with no matching modelAtoms slot.
    if currentModel is not None:
        info.models.append(currentModel)

    bbatoms = ['N', 'CA', 'C', 'O', 'CB']

    # Now process the atoms
    for modelIdx, model in enumerate(info.models):
        chainList = modelAtoms[modelIdx]
        for chainIdx, atomList in enumerate(chainList):
            # Paranoid check
            assert model.chains[chainIdx] == atomList[0].chainID

            # Add list of atoms to model
            model.atoms.append(atomList)

            # Initialise new chain
            model.resSeqs.append([])
            model.sequences.append("")
            model.caMask.append([])
            model.bbMask.append([])

            for resSeq, resName, atomTypes in _iter_residues(atomList):
                model.resSeqs[chainIdx].append(resSeq)
                model.sequences[chainIdx] += ample_util.three2one[resName]
                model.caMask[chainIdx].append('CA' not in atomTypes)
                model.bbMask[chainIdx].append(
                    any(bb not in atomTypes for bb in bbatoms))

    return info
def _keep_matching(refpdb=None, targetpdb=None, outpdb=None, resSeqMap=None):
    """Create a new pdb file that only contains the atoms in targetpdb that are
    also in refpdb.

    Only ATOM lines are considered; HETATM lines in the target are discarded
    and every other record is copied through. The reference is read up to its
    first TER record. Buffered target atoms are written when the residue
    number changes or a TER record is reached; each TER is replaced by a bare
    ``TER`` line.

    Args:
        refpdb: path to pdb that contains the minimal set of atoms we want to keep
        targetpdb: path to the pdb that will be stripped of non-matching atoms
        outpdb: output path for the stripped pdb
        resSeqMap: object providing ``ref2target`` and ``target2ref`` residue
            number lookups

    Raises:
        AssertionError: if any argument is missing/falsy.
        RuntimeError: on MODEL or ANISOU records, on a second chain, when the
            reference residue has more atoms than the target residue, or when
            no reference residue matches a target residue.
    """
    assert refpdb and targetpdb and outpdb and resSeqMap

    def _output_residue(refResidues, targetAtomList, resSeqMap, outfh):
        """Output a single residue only outputting matching atoms, shuffling the
        atom order and changing the resSeq num"""
        targetResSeq = targetAtomList[0].resSeq
        refResSeq = resSeqMap.ref2target(targetResSeq)

        # Get the atomlist for the reference (first entry with that number)
        for (rid, alist) in refResidues:
            if rid == refResSeq:
                refAtomList = alist
                break
        else:
            # Previously this fell through to an UnboundLocalError
            raise RuntimeError(
                "Cannot find reference residue {} matching target residue {}"
                .format(refResSeq, targetResSeq))

        # Ordered list of the ref atom names for this residue
        rnames = [x.name for x in refAtomList]

        if len(refAtomList) > len(targetAtomList):
            raise RuntimeError(
                "Cannot keep matching as refAtomList is > targetAtomList for residue {}\nRef: {}\nTrg: {}"
                .format(targetResSeq, rnames, [x.name for x in targetAtomList]))

        # Work on a filtered copy so the caller's list is never mutated
        remaining = [atom for atom in targetAtomList if atom.name in rnames]

        # Write the survivors in reference order, consuming each one as it is
        # written so a repeated name is matched to successive atoms
        for refname in rnames:
            for i, atom in enumerate(remaining):
                if atom.name == refname:
                    atom.resSeq = refResSeq
                    outfh.write(atom.toLine() + "\n")
                    remaining.pop(i)
                    break
        return

    # Go through refpdb and find which refResidues are present
    # refResidues: ordered list of tuples - (resSeq, [list_of_atoms_for_that_residue])
    refResidues = []
    # Residue numbers to keep in the target; only ever used for membership tests
    targetResSeq = set()

    last = None
    chain = -1
    with open(refpdb, 'r') as ref:
        for line in ref:
            if line.startswith("MODEL"):
                raise RuntimeError("Multi-model file!")

            if line.startswith("TER"):
                break

            if line.startswith("ATOM"):
                a = pdb_model.PdbAtom(line)

                # First atom/chain
                if chain == -1:
                    chain = a.chainID

                if a.chainID != chain:
                    raise RuntimeError(
                        "ENCOUNTERED ANOTHER CHAIN! {0}".format(line))

                if a.resSeq != last:
                    last = a.resSeq
                    # Add the corresponding resSeq in the target
                    targetResSeq.add(resSeqMap.target2ref(a.resSeq))
                    refResidues.append((a.resSeq, [a]))
                else:
                    refResidues[-1][1].append(a)

    # Now read in target pdb and output everything bar the atoms in this file
    # that don't match those in the refpdb
    with open(targetpdb, 'r') as t, open(outpdb, 'w') as out:
        chain = None  # The chain we're reading
        residue = None  # the residue we're reading
        targetAtomList = []

        for line in t:
            if line.startswith("MODEL"):
                raise RuntimeError("Multi-model file!")

            if line.startswith("ANISOU"):
                raise RuntimeError(
                    "I cannot cope with ANISOU! {0}".format(line))

            if line.startswith("TER"):
                # Flush whatever is buffered; nothing to flush if no atom
                # matched (or a TER was just handled)
                if targetAtomList:
                    _output_residue(refResidues, targetAtomList, resSeqMap, out)
                    targetAtomList = []
                    residue = None
                # we write out our own TER
                out.write("TER\n")
                continue

            if line.startswith("ATOM"):
                atom = pdb_model.PdbAtom(line)

                # First atom/chain
                if chain is None:
                    chain = atom.chainID

                if atom.chainID != chain:
                    raise RuntimeError(
                        "ENCOUNTERED ANOTHER CHAIN! {0}".format(line))

                if atom.resSeq in targetResSeq:
                    if atom.resSeq != residue:
                        # Starting a new residue: write out the previous one,
                        # unless this is the very first atom
                        if residue is not None:
                            _output_residue(refResidues, targetAtomList,
                                            resSeqMap, out)
                        targetAtomList = []
                        residue = atom.resSeq
                    targetAtomList.append(atom)
                # Matching atoms are written by _output_residue; anything else
                # is discarded
                continue

            # For time being exclude all HETATM lines
            elif line.startswith("HETATM"):
                continue

            # Output everything else
            out.write(line)

    return
def read_pdb(self, pdb):
    """Get the sequence as a string of one-letter codes, the list of matching
    resSeq and a C-alpha mask.

    Reads ATOM records up to the first TER record. Residues are delimited by
    a change in resSeq.

    Args:
        pdb: path of the single-model, single-chain PDB file to read.

    Returns:
        Tuple ``(sequence, resSeq, cAlphaMask)``: ``sequence`` is built from
        ``ample_util.three2one``, ``resSeq`` lists the residue numbers in file
        order and ``cAlphaMask[i]`` is True when residue ``i`` has no 'CA'
        atom. ``("", [], [])`` is returned when no ATOM record was read.

    Raises:
        RuntimeError: if a MODEL record or a second chain is found.
    """
    resSeq = []
    resName = []
    atomTypesList = []

    chain = None
    readingResSeq = None
    readingResName = None
    _atomTypes = set()

    with open(pdb) as pdbfh:
        for line in pdbfh:
            if line.startswith("MODEL"):
                raise RuntimeError("FOUND MULTI_MODEL FILE!")

            if line.startswith("TER"):
                break

            if not line.startswith("ATOM"):
                continue

            atom = pdb_model.PdbAtom(line)

            if not chain:
                chain = atom.chainID

            if atom.chainID != chain:
                raise RuntimeError("FOUND ADDITIONAL CHAIN")

            if readingResSeq is not None and readingResSeq != atom.resSeq:
                # Finished a residue: store it before starting the next one
                resName.append(readingResName)
                resSeq.append(readingResSeq)
                atomTypesList.append(_atomTypes)
                _atomTypes = set()

            readingResSeq = atom.resSeq
            readingResName = atom.resName
            # Names are stored stripped so that the 'CA' test below matches
            _atomTypes.add(atom.name.strip())

    if readingResSeq is None:
        # No ATOM records at all: nothing to look up in three2one
        return "", [], []

    # Add the residue we were still reading when the loop ended
    resName.append(readingResName)
    resSeq.append(readingResSeq)
    atomTypesList.append(_atomTypes)

    sequence = "".join(ample_util.three2one[n] for n in resName)
    cAlphaMask = ['CA' not in names for names in atomTypesList]

    return sequence, resSeq, cAlphaMask