def load_PDB_to_system(self, filename=None):
    """Fill ``self.residues`` with the contents of the PDB file *filename*.

    Every residue of every chain of every model is turned into a ``Residue``
    holding one ``Atom`` per PDB atom. Residue ids and atom ids both restart
    at 1 for each chain, and ``self.id`` is set to 1 whenever a chain is
    visited.
    """
    structure = PDBParser(QUIET=True).get_structure('X', filename)
    self.residues = []
    for model in structure:
        for chain in model:
            self.id = 1
            atom_id = 1
            for residue_id, pdb_residue in enumerate(chain, 1):
                residue = Residue(id=residue_id, name=pdb_residue.resname)
                for pdb_atom in pdb_residue:
                    residue.atoms.append(
                        Atom(id=atom_id, name=pdb_atom.name, pos=pdb_atom.coord))
                    atom_id += 1
                self.residues.append(residue)
def test_to_string(self):
    """Write structure as string, parse it back and compare entity counts."""

    def profile(structure):
        # (models, residues, atoms, disordered atoms, hetero residues)
        return (
            sum(1 for _ in structure.get_models()),
            sum(1 for _ in structure.get_residues()),
            sum(1 for _ in structure.get_atoms()),
            sum(1 for a in structure.get_atoms() if a.is_disordered()),
            sum(1 for r in structure.get_residues() if r.id[0] != ' '),
        )

    source = StringIO()
    source.write(dummy_1)
    source.seek(0)
    mol = MolProcesser(source)
    before = profile(mol.structure)

    # Serialise through the ``tostring`` attribute and parse the text again.
    dumped = StringIO()
    dumped.write(mol.tostring)
    dumped.seek(0)
    reparsed = PDBParser(QUIET=1).get_structure('xyz', dumped)
    after = profile(reparsed)

    for original_count, reparsed_count in zip(before, after):
        self.assertEqual(original_count, reparsed_count)
def test_get_sequence_from_pdb_structure(self):
    """Chain A of the first model of ./test.pdb must read VNIKTNPFK."""
    chain_a = PDBParser().get_structure('test', "./test.pdb")[0]['A']
    observed = construct_protein_graph.get_sequence_from_pdb_structure(chain_a)
    self.assertEqual("VNIKTNPFK", observed)
def selectChain(ifn, ofn, chainID='A'):
    """Copy PDB file *ifn* to *ofn*, keeping only the chain with id *chainID*."""

    class ChainSelector():
        """Selection object for PDBIO.save: rejects chains other than chainID."""

        def __init__(self, chainID=chainID):
            self.chainID = chainID

        def accept_model(self, model):
            return 1

        def accept_chain(self, chain):
            return 1 if chain.get_id() == self.chainID else 0

        def accept_residue(self, residue):
            return 1

        def accept_atom(self, atom):
            return 1

    structure = PDBParser().get_structure('x', ifn)
    writer = PDBIO()
    writer.set_structure(structure)
    writer.save(ofn, ChainSelector(chainID))
def RemoveLigandsOneBioUnit(biounit, ligandlist):
    """Strip the listed ligands (and waters) from a biounit file and save it.

    :param biounit: path of the structure file; its last "/"-separated
        component is used as the structure id and as the output file name.
    :param ligandlist: iterable of mappings with the keys ``"ChainID"``,
        ``"ResNum"`` and ``"LigName"`` describing each ligand residue.
    :returns: ``None``. When the file cannot be parsed nothing is written;
        otherwise the edited structure is saved under ``BIOSTRDIR``.
    """
    p = PDBParser(PERMISSIVE=1)
    pdbname = biounit.split("/")[-1]
    try:
        models = p.get_structure(pdbname, biounit)
    except Exception:
        # Best effort: an unparsable biounit is skipped. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit through.
        return None

    for rligand in ligandlist:
        for model in models:
            for chain in model:
                # Iterate over a snapshot because residues are detached
                # from the chain while walking it.
                for residue in list(chain):
                    if chain.id == rligand["ChainID"] and int(rligand["ResNum"]) == residue.id[1]:
                        chain.detach_child(residue.id)
                    elif residue.id[0] == "W":
                        # Hetero flag "W" marks water residues.
                        chain.detach_child(residue.id)
                    elif len(rligand["LigName"].split()) > 1 and int(rligand["ResNum"]) <= residue.id[1]:
                        # Multi-word ligand names are delegated to LongLigand.
                        LongLigand(chain, residue, rligand)

    io = PDBIO()
    io.set_structure(models)
    filepath = os.path.join(BIOSTRDIR, models.id)
    io.save(filepath)
def test_1_warnings(self):
    """Check warnings: Parse a flawed PDB file in permissive mode.

    Warnings are collected with ``warnings.catch_warnings(record=True)`` so
    that the 'always' filter installed here is removed again when the test
    ends instead of leaking into the rest of the test run.
    """
    # The Residue repr prints the hetero flag and insertion code verbatim;
    # both are a single blank for a standard residue.
    blank = " "
    expected = [
        'Atom N defined twice in residue <Residue ARG het=' + blank
        + ' resseq=2 icode=' + blank + '> at line 19.',
        'disordered atom found with blank altloc before line 31.',
        "Residue (' ', 4, ' ') redefined at line 41.",
        "Blank altlocs in duplicate residue SER (' ', 4, ' ') at line 41.",
        "Residue (' ', 10, ' ') redefined at line 73.",
        "Residue (' ', 14, ' ') redefined at line 104.",
        "Residue (' ', 16, ' ') redefined at line 133.",
        "Residue (' ', 80, ' ') redefined at line 631.",
        "Residue (' ', 81, ' ') redefined at line 644.",
        'Atom O defined twice in residue <Residue HOH het=W resseq=67 icode='
        + blank + '> at line 820.',
    ]
    with warnings.catch_warnings(record=True) as all_warns:
        warnings.simplefilter('always', PDBConstructionWarning)
        # Trigger warnings
        p = PDBParser(PERMISSIVE=True)
        p.get_structure("example", "PDB/a_structure.pdb")
    # NOTE(review): zip() stops at the shorter sequence, so a run that emits
    # fewer warnings than expected is not detected here.
    for wrn, msg in zip(all_warns, expected):
        self.assertIn(msg, str(wrn.message))
def chain2pos_scan_str(chain, pdb, mutation_set='a'):
    """Build the FoldX PositionScan string for one chain.

    :param chain: chain id to scan.
    :param pdb: object providing ``fullpath()`` (path of the PDB file) and
        ``uuid`` (used as structure id).
    :param mutation_set: suffix appended to every position.
    :returns: comma-separated entries of the form
        ``<one-letter aa><chain><residue number><mutation_set>``; residues
        whose name has no one-letter code (waters, non-native residues) are
        skipped. Empty string when nothing qualifies.
    """
    parser = PDBParser(PERMISSIVE=1)
    struct = parser.get_structure(pdb.uuid, pdb.fullpath())[0]
    positions = []
    for c in Selection.unfold_entities(struct, 'C'):
        if c.id != chain:
            continue
        for r in c:
            try:
                aa = three_to_one(r.get_resname())
            except KeyError:
                # non-native amino acid or water
                continue
            positions.append('%s%s%i%s' % (aa, chain, r.id[1], mutation_set))
    return ','.join(positions)
def parse_structure(path):
    """
    Parses a structure using Biopython's PDB/mmCIF Parser
    Verifies the integrity of the structure (gaps) and its
    suitability for the calculation (is it a complex?).

    :param path: path of a ``.pdb``, ``.ent`` or ``.cif`` file.
    :returns: tuple ``(validated structure, number of distinct chain ids,
        number of residues)``.
    :raises IOError: when the file extension is not supported.
    :raises Exception: whatever the parser raised, re-raised unchanged after
        being logged.
    """
    # setup logging
    logger = logging.getLogger('Prodigy')
    logger.info('[+] Reading structure file: {0}'.format(path))
    fname = os.path.basename(path)
    sname = '.'.join(fname.split('.')[:-1])
    s_ext = fname.split('.')[-1]

    _ext = {'pdb', 'ent', 'cif'}
    if s_ext not in _ext:
        raise IOError('[!] Structure format \'{0}\' is not supported. Use \'.pdb\' or \'.cif\'.'.format(s_ext))

    sparser = PDBParser(QUIET=1) if s_ext in {'pdb', 'ent'} else MMCIFParser()

    try:
        s = sparser.get_structure(sname, path)
    except Exception:
        # Logger methods take no ``file`` argument; passing one raised a
        # TypeError here and hid the real parsing error.
        logger.error('[!] Structure \'{0}\' could not be parsed'.format(sname))
        raise

    return (validate_structure(s),
            len({c.id for c in s.get_chains()}),
            len(list(s.get_residues())))
def test_NACCESS(self):
    """NACCESS on the first model of 1A8O must yield 66 entries."""
    pdbfile = "PDB/1A8O.pdb"
    first_model = PDBParser().get_structure("1A8O", pdbfile)[0]
    result = NACCESS(first_model, pdbfile)
    self.assertEqual(len(result), 66)
def test_dssp(self):
    """DSSP on the first model of 2BEG must yield 130 entries."""
    pdbfile = "PDB/2BEG.pdb"
    first_model = PDBParser().get_structure("2BEG", pdbfile)[0]
    result = DSSP(first_model, pdbfile)
    self.assertEqual(len(result), 130)
def run(self):
    """Write per-residue ligand distances and ligand atom types as JSON.

    Reads the ligand from ``self.getPath().sdf`` (format taken from the file
    extension) and the protein from ``self.getPath().pdb``, strips ligand
    hydrogens, and writes one record per protein residue to
    ``self.output()``. Each record has the keys ``"dists"``,
    ``"atom_types"`` and ``"residue"``.
    """
    mypath = self.getPath()
    lig_ifn = mypath.sdf
    prt_ifn = mypath.pdb

    lig_ext = os.path.basename(lig_ifn).split('.')[-1]
    # Builtin next() works on both Python 2 and 3; the ``.next()`` method
    # exists only on Python 2 iterators.
    lig = next(pybel.readfile(lig_ext, lig_ifn))
    lig.removeh()

    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('prt', prt_ifn)

    # Translate atom types from the 'INT' to the 'SYB' naming scheme.
    typetable = OBTypeTable()
    typetable.SetFromType('INT')
    typetable.SetToType('SYB')

    atom_types = [typetable.Translate(atom.type) for atom in lig.atoms]
    # NOTE(review): this relies on ``shuffle`` returning the shuffled
    # sequence. ``random.shuffle`` works in place and returns None - confirm
    # which ``shuffle`` is imported by this module.
    atom_types = shuffle(atom_types)

    dat = []
    for residue in structure.get_residues():
        dists = residueDistances2LigandAtoms(residue, lig)
        dat.append({"dists": dists,
                    "atom_types": atom_types,
                    "residue": residue.get_resname()})

    to_write = json.dumps(dat, indent=4, separators=(',', ':'))
    with self.output().open('w') as ofs:
        ofs.write(to_write)
def test_conversion(self):
    """Parse 1LCD.cif, write it as PDB, parse again and compare."""
    cif_parser = MMCIFParser(QUIET=1)
    cif_struct = cif_parser.get_structure("example", "PDB/1LCD.cif")

    pdb_writer = PDBIO()
    pdb_writer.set_structure(cif_struct)
    filenumber, filename = tempfile.mkstemp()
    # mkstemp returns an open descriptor; close it and always delete the
    # file so the test leaves nothing behind.
    os.close(filenumber)
    try:
        pdb_writer.save(filename)
        pdb_parser = PDBParser(QUIET=1)
        pdb_struct = pdb_parser.get_structure('example_pdb', filename)
    finally:
        os.remove(filename)

    # comparisons
    self.assertEqual(len(pdb_struct), len(cif_struct))

    pdb_atom_names = [a.name for a in pdb_struct.get_atoms()]
    cif_atom_names = [a.name for a in cif_struct.get_atoms()]
    self.assertEqual(len(pdb_atom_names), len(cif_atom_names))
    self.assertSequenceEqual(pdb_atom_names, cif_atom_names)

    pdb_atom_elems = [a.element for a in pdb_struct.get_atoms()]
    cif_atom_elems = [a.element for a in cif_struct.get_atoms()]
    self.assertSequenceEqual(pdb_atom_elems, cif_atom_elems)
def parse_freesasa_output(fpath):
    """
    Read accessibility values from a freesasa PDB-style output.

    The value stored in each atom's B-factor field is taken as that atom's
    accessible surface area. Returns two dictionaries:

    - per atom: ``(chain id, residue name, residue number, atom name)`` -> ASA
    - per residue: ``(chain id, residue name, residue number)`` -> summed ASA
      divided by ``rel_asa['total'][residue name]``

    ``fpath`` must expose the file path as ``fpath.name``.
    """
    structure = PDBParser(QUIET=1).get_structure('bogus', fpath.name)
    reference = rel_asa
    per_atom = {}
    per_residue = {}

    for res in structure.get_residues():
        chain_id, resname, resnum = res.parent.id, res.resname, res.id[1]
        residue_total = 0
        for atom in res:
            atom_asa = atom.bfactor
            per_atom[(chain_id, resname, resnum, atom.name)] = atom_asa
            residue_total += atom_asa
        per_residue[(chain_id, resname, resnum)] = residue_total / reference['total'][resname]

    return per_atom, per_residue
def __init__(self, table, pdb):
    """Set up the object from an annotation table and a PDB file.

    ``table`` is re-indexed from 0 and its missing values are replaced by
    empty strings. The structure is parsed from ``pdb`` and named after the
    first ``pdb_id`` entry of the table. A ``tcr_chain_name`` column is
    inserted in front of ``tcr_chain``, and rows are grouped by
    ``(tcr_chain_name, tcr_region)``. Finally ``getRegionsResidues()`` and
    ``definePeptideChain()`` are called; a message is printed when either
    returns a false value.
    """
    table = table.reset_index(drop=True)
    struct = PDBParser().get_structure(table['pdb_id'][0], pdb)
    table = table.fillna('')
    # Rows whose V allele contains 'TRA' are counted as alpha; all others as beta.
    alpha_num = sum([1 for x in table['tcr_v_allele'].tolist() if x.find('TRA') != -1])
    beta_num = table.shape[0] - alpha_num
    # Labels are assigned positionally: the first alpha_num rows become
    # 'alpha', the rest 'beta'.
    # NOTE(review): presumably the table lists alpha rows first - confirm.
    table.insert(table.columns.get_loc('tcr_chain'), 'tcr_chain_name', ['alpha'] * alpha_num + ['beta'] * beta_num)
    print table
    self.__table = table
    self.__name = str(struct.get_id())
    self.__struct = struct
    # Chain ids of the first model only.
    self.__chains = [chain.get_id() for chain in struct[0]]
    self.__regions = table.groupby(['tcr_chain_name', 'tcr_region'])
    # Dictionary of region residues, keyed by (chain name, region), e.g.
    # { ('alpha', 'CDR1') : [residue list],
    #   ('alpha', 'CDR2') : [residue list], ... }
    # It starts from the groupby ``groups`` mapping; every value is replaced
    # by an empty list here.
    self.__regions_res = self.__regions.groups
    for key in self.__regions_res.keys():
        self.__regions_res[key] = []
    # Peptide residue list
    self.__peptide = []
    # Dictionaries with pairwise region matrices, keyed by a pair of region
    # keys, e.g.
    # { (('alpha', 'CDR1'), ('alpha', 'CDR2')) : dataframe,
    #   (('alpha', 'CDR1'), ('alpha', 'CDR3')) : dataframe, ... }
    self.__d_matrices = {}
    self.__e_matrices = {}
    self.verbose = True
    if not self.getRegionsResidues():
        print 'SOME REGION WAS NOT FOUND IN PDB'
    if not self.definePeptideChain():
        print 'PEPTIDE WAS NOT FOUND IN PDB'
def test_3_bad_xyz(self):
    """Check error: Parse an entry with bad x,y,z value."""

    def atom_record(x_field):
        # PDB ATOM records are fixed-column. The line is assembled with
        # explicit padding so the layout cannot be damaged by whitespace
        # normalisation of this source file.
        return "".join([
            "ATOM".ljust(6),      # cols  1-6  record name
            "9".rjust(5),         # cols  7-11 atom serial number
            " ",                  # col  12
            " N".ljust(4),        # cols 13-16 atom name
            " ",                  # col  17    alternate location
            "ASP",                # cols 18-20 residue name
            " ",                  # col  21
            "A",                  # col  22    chain id
            "152".rjust(4),       # cols 23-26 residue number
            "".ljust(4),          # cols 27-30 insertion code + padding
            x_field.rjust(8),     # cols 31-38 x
            "34.953".rjust(8),    # cols 39-46 y
            "27.691".rjust(8),    # cols 47-54 z
            "1.00".rjust(6),      # cols 55-60 occupancy
            "19.26".rjust(6),     # cols 61-66 B-factor
            "".ljust(10),         # cols 67-76
            "N".rjust(2),         # cols 77-78 element
            "\n",
        ])

    parser = PDBParser(PERMISSIVE=False)
    # A well-formed record must parse in strict mode ...
    s = parser.get_structure("example", StringIO(atom_record("21.554")))
    self.assertEqual(len(list(s.get_atoms())), 1)
    # ... while a non-numeric x coordinate must be rejected.
    self.assertRaises(PDBConstructionException,
                      parser.get_structure, "example",
                      StringIO(atom_record("21.ish")))
def main():
    """Parse pdb10gs.ent, print model information and run TestDSSP on model 0."""
    filename = "pdb10gs.ent"
    structure = PDBParser().get_structure("10GS", filename)
    for model in structure:
        # The first model is printed on every pass, next to the id of the
        # model currently being visited.
        print(structure[0])
        print(model.get_full_id())
    TestDSSP(structure[0], filename)
def main():
    """Parse test/10gs.bio1, print model information and run TestNACCESS on model 0."""
    filename = "test/10gs.bio1"
    structure = PDBParser().get_structure("10gs", filename)
    for model in structure:
        # The first model is printed on every pass, next to the id of the
        # model currently being visited.
        print(structure[0])
        print(model.get_full_id())
    TestNACCESS(structure[0], filename)
def build_all_angles_model(pdb_filename):
    """Measure backbone angles of chain A and return one geometry per residue.

    The file is looked up under ``PDBdir``. Residues whose name is not a key
    of ``resdict`` are ignored. The first recognised residue keeps the
    default geometry; every later one gets its bond and dihedral angles (in
    degrees) measured against the previous recognised residue.
    """
    structure = PDBParser().get_structure('sample', path.join(PDBdir, pdb_filename))
    chain = structure[0]['A']
    to_degrees = 180.0/math.pi

    geometries = []
    previous = None  # (N, CA, C) atoms of the last recognised residue
    for res in chain:
        if res.get_resname() not in resdict:
            continue
        geo = Geometry.geometry(resdict[res.get_resname()])
        if previous is None:
            previous = (res['N'], res['CA'], res['C'])
        else:
            n_before, ca_before, c_before = [atom.get_vector() for atom in previous]

            c_atom = res['C']
            n_atom = res['N']
            ca_atom = res['CA']
            c_vec = c_atom.get_vector()
            n_vec = n_atom.get_vector()
            ca_vec = ca_atom.get_vector()

            geo.CA_C_N_angle = calc_angle(ca_before, c_before, n_vec)*to_degrees
            geo.C_N_CA_angle = calc_angle(c_before, n_vec, ca_vec)*to_degrees

            # All three dihedrals are stored on the current residue.
            geo.psi_im1 = calc_dihedral(n_before, ca_before, c_before, n_vec)*to_degrees
            geo.omega = calc_dihedral(ca_before, c_before, n_vec, ca_vec)*to_degrees
            geo.phi = calc_dihedral(c_before, n_vec, ca_vec, c_vec)*to_degrees

            geo.N_CA_C_angle = calc_angle(n_vec, ca_vec, c_vec)*to_degrees

            previous = (n_atom, ca_atom, c_atom)
        geometries.append(geo)
    return geometries
def pdb2dfromactivesite(pdb_fh,active_sites=None):
    """
    This calculates distances between each ligand atom or optionally provided amino acids (sources) and each residue in the protein.

    Only chain "A" of the first model is analysed. Distances are measured
    from every atom of a source residue to the CA atom of each amino-acid
    residue.

    :param pdb_fh: path to .pdb file.
    :param active_sites: optional list of residue numbers as sources.
    :returns dfromligands: pandas table with distances from ligand, indexed
        by residue number (index name ``aasi``).
    """
    if active_sites is None:
        active_sites = []
    junk_residues = ["HOH"," MG","CA"," NA","SO4","IOD","NA","CL","GOL","PO4"]
    pdb_parser=PDBParser()
    pdb_data=pdb_parser.get_structure("pdb_name",pdb_fh)
    chainA = pdb_data[0]["A"] #only a chain

    # Sources: every non-junk residue that is either not an amino acid
    # (a ligand) or whose number was requested through active_sites.
    ligands_residue_objs=[]
    for residue in chainA:
        resname=residue.get_resname()
        if resname in junk_residues:
            continue
        if resname not in aas_21_3letter or residue.id[1] in active_sites:
            ligands_residue_objs.append(residue)

    dfromligands=pd.DataFrame()
    for ligand_residue_obj in ligands_residue_objs:
        ligand_name=ligand_residue_obj.get_resname()
        is_ligand=ligand_name not in aas_21_3letter
        for ligand_atom_obj in ligand_residue_obj:
            atom_name=ligand_atom_obj.get_name()
            # One column per source atom; the label does not depend on the
            # target residue, so build it once.
            if is_ligand:
                col="Distance from Ligand: %s (ATOM: %s)" % \
                    (ligand_name,atom_name)
            else:
                col="Distance from active site residue: %s %d (ATOM: %s)" % \
                    (ligand_name,ligand_residue_obj.get_id()[1],atom_name)
            for residue in chainA:
                if residue.get_resname() in aas_21_3letter: #only aas
                    dfromligands.loc[residue.id[1],"ref_pdb"]=residue.get_resname()
                    dfromligands.loc[residue.id[1],col]\
                        =ligand_residue_obj[atom_name]-residue["CA"]

    dfromligands.index.name="aasi"
    if "ref_pdb" in dfromligands:
        del dfromligands["ref_pdb"]

    #average and minimum distances
    cols_all=dfromligands.columns.tolist()
    for moltype in ['Distance from Ligand:','Distance from active site residue:']:
        cols_moltype=[c for c in cols_all if moltype in c]
        # NOTE(review): the guard tests cols_all, not cols_moltype, so summary
        # columns are also created for a molecule type with no distance
        # columns - confirm whether that is intended.
        if len(cols_all)>0:
            dfromligands.loc[:,'%s average' % moltype]=dfromligands.loc[:,cols_moltype].T.mean()
            dfromligands.loc[:,'%s minimum' % moltype]=dfromligands.loc[:,cols_moltype].T.min()
            mols=np.unique([c[c.find(moltype):c.find(' (ATOM')] for c in cols_moltype])
            if len(mols)>1:
                for mol in mols:
                    cols_mol=[c for c in cols_moltype if mol in c]
                    dfromligands.loc[:,'%s: average' % mol]=dfromligands.loc[:,cols_mol].T.mean()
                    dfromligands.loc[:,'%s: minimum' % mol]=dfromligands.loc[:,cols_mol].T.min()
    return dfromligands
def getPdbAtomsBySerialNum(pdb_fn, serial_nums):
    """Return the atoms of *pdb_fn* with the given serial numbers, in that order.

    Raises KeyError when a requested serial number is absent from the file.
    """
    structure = PDBParser(QUIET=True).get_structure('x', pdb_fn)
    by_serial = {}
    for atom in structure.get_atoms():
        by_serial[atom.serial_number] = atom
    return [by_serial[num] for num in serial_nums]
def test_fragment_mapper(self):
    """Every residue mapped by FragmentMapper reports a length-5 fragment."""
    model = PDBParser().get_structure("X", "PDB/1A8O.pdb")[0]
    mapper = FragmentMapper(model, 10, 5, "PDB")
    for residue in Selection.unfold_entities(model, "R"):
        if residue not in mapper:
            continue
        description = str(mapper[residue])
        self.assertTrue(description.startswith("<Fragment length=5 id="))
def _get_ligand_name(self): p = PDBParser(QUIET=True) ligand = p.get_structure('ligand', self.out_filename) chain = ligand[0]['A'] for residue in chain.get_residues(): if residue.resname in self.ignore: pass else: self.ligands.append(residue.resname) print "Ligands found: ", self.ligands
def _get_resmapping(self):
    """Return ``(code, residue number)`` pairs for the selected chain-A residues.

    A residue is selected when its number, as a string, is in
    ``self.resnums``; the code comes from ``self.codes[residue name]``.
    """
    filepath = self._get_filepath('', pdb_file=True)
    chain = PDBParser(QUIET=True).get_structure('protein', filepath)[0]['A']
    return [
        (self.codes[residue.resname], residue.id[1])
        for residue in chain.get_residues()
        if str(residue.id[1]) in self.resnums
    ]
def check_msms(self, prot_file, first_100_residues):
    """Compare the first 100 ResidueDepth residue names with the expected string."""
    parser = PDBParser()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        structure = parser.get_structure("X", prot_file)
    depth = ResidueDepth(structure[0])
    observed = ''.join(entry[0].get_resname() for entry in depth.property_list[:100])
    self.assertEqual(observed, first_100_residues)
def test_empty(self):
    """Parsing an empty file gives a structure without models."""
    parser = PDBParser()
    handle, empty_path = tempfile.mkstemp()
    os.close(handle)
    try:
        parsed = parser.get_structure('MT', empty_path)
        # Structure has no children (models)
        self.assertFalse(len(parsed))
    finally:
        os.remove(empty_path)
def experimental_method(pdb_path):
    """
    Return the ``structure_method`` entry of the header of a PDB file.

    :param pdb_path: Path to PDB file
    :return: value stored under ``'structure_method'`` in the parsed header
    """
    header_parser = PDBParser(get_header=True)
    # The header is only available once a structure has been parsed.
    header_parser.get_structure('', pdb_path)
    header = header_parser.get_header()
    return header['structure_method']
def read_pdb(pdbfile):
    '''
    Parse a PDB file with Bio.PDB and return the structure (id 'pdb').

    :param pdbfile: path to pdb file
    :return: structure
    '''
    return PDBParser().get_structure('pdb', pdbfile)
def open_pdb(pdbfn):
    """Open pdb with Biopython.

    Args:
       pdbfn (str): a path to a pdb structure

    Returns:
       PDB Biopython object: the parsed structure (with an empty id)

    """
    structure = PDBParser().get_structure('', pdbfn)
    return structure
def test_c_n(self):
    """Extract polypeptides from 1A8O."""

    def check_peptide(pp, first, last, sequence):
        # Start/end residue numbers, then type, alphabet and text of the sequence.
        self.assertEqual(pp[0].get_id()[1], first)
        self.assertEqual(pp[-1].get_id()[1], last)
        s = pp.get_sequence()
        self.assertTrue(isinstance(s, Seq))
        self.assertEqual(s.alphabet, generic_protein)
        self.assertEqual(sequence, str(s))

    # Strict mode ignores MSE 151 at the start and breaks the chain at
    # MSE 185 and MSE 214,215, leaving three fragments.
    strict_fragments = [
        (152, 184, "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW"),
        (186, 213, "TETLLVQNANPDCKTILKALGPGATLEE"),
        (216, 220, "TACQG"),
    ]

    warnings.resetwarnings()
    parser = PDBParser(PERMISSIVE=False)
    structure = parser.get_structure("example", "PDB/1A8O.pdb")
    self.assertEqual(len(structure), 1)

    for ppbuild in [PPBuilder(), CaPPBuilder()]:
        # Non-standard amino acids allowed: one peptide, MSE shown as M.
        polypeptides = ppbuild.build_peptides(structure[0], False)
        self.assertEqual(len(polypeptides), 1)
        check_peptide(polypeptides[0], 151, 220,
                      "MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQ"
                      "NANPDCKTILKALGPGATLEEMMTACQG")

        # Standard amino acids only.
        polypeptides = ppbuild.build_peptides(structure[0], True)
        self.assertEqual(len(polypeptides), 3)
        for pp, (first, last, sequence) in zip(polypeptides, strict_fragments):
            check_peptide(pp, first, last, sequence)
def _select_residues(self, name, chain_id, selected):
    """Return ``(coords, names)`` for all atoms of the selected residues.

    The PDB id is the part of *name* before the first underscore. Atoms are
    taken from chain *chain_id* of the first model, residue by residue in
    the order given by *selected*, using each residue's unpacked atom list.
    """
    pdb_id = name.split('_')[0]
    pdb_path = DecompressedPdb(pdb_id).output().path
    chain = PDBParser(QUIET=True).get_structure(name, pdb_path)[0][chain_id]

    atoms = [
        atom
        for res_id in selected
        for atom in chain[res_id].get_unpacked_list()
    ]
    names = [atom.get_name() for atom in atoms]
    coords = [atom.get_coord() for atom in atoms]
    return coords, names
class WriteTest(unittest.TestCase):
    """Round-trip tests for PDBIO.

    The structure parsed in setUpClass is shared by all tests, so tests that
    modify a structure work on a copy to keep the tests independent of their
    execution order.
    """

    @classmethod
    def setUpClass(cls):
        cls.io = PDBIO()
        cls.parser = PDBParser(PERMISSIVE=1)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PDBConstructionWarning)
            cls.structure = cls.parser.get_structure("example", "PDB/1A8O.pdb")

    def test_pdbio_write_structure(self):
        """Write a full structure using PDBIO."""
        struct1 = self.structure
        # Ensure that set_structure doesn't alter parent
        parent = struct1.parent

        # Write full model to temp file
        self.io.set_structure(struct1)
        self.assertIs(parent, struct1.parent)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)

            struct2 = self.parser.get_structure("1a8o", filename)
            nresidues = len(list(struct2.get_residues()))

            self.assertEqual(len(struct2), 1)
            self.assertEqual(nresidues, 158)
        finally:
            os.remove(filename)

    def test_pdbio_write_preserve_numbering(self):
        """Write with the default preserve_atom_numbering=False: serials are renumbered from 1."""
        self.io.set_structure(self.structure)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)  # default preserve_atom_numbering=False

            struct = self.parser.get_structure("1a8o", filename)
            serials = [a.serial_number for a in struct.get_atoms()]
            og_serials = list(range(1, len(serials) + 1))
            self.assertEqual(og_serials, serials)
        finally:
            os.remove(filename)

    def test_pdbio_write_auto_numbering(self):
        """Write with preserve_atom_numbering=True: original serials are kept."""
        self.io.set_structure(self.structure)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename, preserve_atom_numbering=True)

            struct = self.parser.get_structure("1a8o", filename)
            serials = [a.serial_number for a in struct.get_atoms()]
            og_serials = [a.serial_number for a in self.structure.get_atoms()]

            self.assertEqual(og_serials, serials)
        finally:
            os.remove(filename)

    def test_pdbio_write_residue(self):
        """Write a single residue using PDBIO."""
        struct1 = self.structure
        residue1 = list(struct1.get_residues())[0]

        # Ensure that set_structure doesn't alter parent
        parent = residue1.parent

        # Write full model to temp file
        self.io.set_structure(residue1)
        self.assertIs(parent, residue1.parent)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)

            struct2 = self.parser.get_structure("1a8o", filename)
            nresidues = len(list(struct2.get_residues()))

            self.assertEqual(nresidues, 1)
        finally:
            os.remove(filename)

    def test_pdbio_write_residue_w_chain(self):
        """Write a single residue (chain id == X) using PDBIO."""
        struct1 = self.structure.copy()  # make copy so we can change it
        residue1 = list(struct1.get_residues())[0]

        # Modify parent id
        parent = residue1.parent
        parent.id = "X"

        # Write full model to temp file
        self.io.set_structure(residue1)
        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)

            struct2 = self.parser.get_structure("1a8o", filename)
            nresidues = len(list(struct2.get_residues()))

            self.assertEqual(nresidues, 1)

            # Assert chain remained the same
            chain_id = [c.id for c in struct2.get_chains()][0]
            self.assertEqual(chain_id, "X")
        finally:
            os.remove(filename)

    def test_pdbio_write_residue_wout_chain(self):
        """Write a single orphan residue using PDBIO."""
        struct1 = self.structure.copy()  # make copy so we can change it
        residue1 = list(struct1.get_residues())[0]

        residue1.parent = None  # detach residue

        # Write full model to temp file
        self.io.set_structure(residue1)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)

            struct2 = self.parser.get_structure("1a8o", filename)
            nresidues = len(list(struct2.get_residues()))

            self.assertEqual(nresidues, 1)

            # Assert chain is default: "A"
            chain_id = [c.id for c in struct2.get_chains()][0]
            self.assertEqual(chain_id, "A")
        finally:
            os.remove(filename)

    def test_pdbio_write_custom_residue(self):
        """Write a chainless residue using PDBIO."""
        res = Residue.Residue((" ", 1, " "), "DUM", "")
        atm = Atom.Atom("CA", [0.1, 0.1, 0.1], 1.0, 1.0, " ", "CA", 1, "C")
        res.add(atm)

        # Ensure that set_structure doesn't alter parent
        parent = res.parent

        # Write full model to temp file
        self.io.set_structure(res)

        self.assertIs(parent, res.parent)
        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)

            struct2 = self.parser.get_structure("res", filename)
            latoms = list(struct2.get_atoms())

            self.assertEqual(len(latoms), 1)
            self.assertEqual(latoms[0].name, "CA")
            self.assertEqual(latoms[0].parent.resname, "DUM")
            self.assertEqual(latoms[0].parent.parent.id, "A")
        finally:
            os.remove(filename)

    def test_pdbio_select(self):
        """Write a selection of the structure using a Select subclass."""

        # Selection class to filter all alpha carbons
        class CAonly(Select):
            """Accepts only CA residues."""

            def accept_atom(self, atom):
                if atom.name == "CA" and atom.element == "C":
                    return 1

        struct1 = self.structure
        # Ensure that set_structure doesn't alter parent
        parent = struct1.parent
        # Write to temp file
        self.io.set_structure(struct1)

        self.assertIs(parent, struct1.parent)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename, CAonly())

            struct2 = self.parser.get_structure("1a8o", filename)
            nresidues = len(list(struct2.get_residues()))

            self.assertEqual(nresidues, 70)
        finally:
            os.remove(filename)

    def test_pdbio_missing_occupancy(self):
        """Write PDB file with missing occupancy."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PDBConstructionWarning)
            structure = self.parser.get_structure("test", "PDB/occupancy.pdb")

        self.io.set_structure(structure)
        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always", BiopythonWarning)
                self.io.save(filename)
                self.assertEqual(len(w), 1, w)

            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PDBConstructionWarning)
                struct2 = self.parser.get_structure("test", filename)

            atoms = struct2[0]["A"][(" ", 152, " ")]
            self.assertIsNone(atoms["N"].get_occupancy())
        finally:
            os.remove(filename)

    def test_pdbio_write_truncated(self):
        """Test parsing of truncated lines."""
        struct = self.structure

        # Write to temp file
        self.io.set_structure(struct)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)

            # Check if there are lines besides 'ATOM', 'TER' and 'END'
            with open(filename) as handle:
                record_set = {l[0:6] for l in handle}

            # Record names occupy six columns, so short names are padded to
            # six characters; a bare "TER"/"END" line is just name + newline.
            record_set -= {
                "ATOM".ljust(6),
                "HETATM",
                "MODEL".ljust(6),
                "ENDMDL",
                "TER\n",
                "TER".ljust(6),
                "END\n",
                "END".ljust(6),
            }
            self.assertEqual(record_set, set())
        finally:
            os.remove(filename)

    def test_model_numbering(self):
        """Preserve model serial numbers during I/O."""

        def confirm_numbering(struct):
            self.assertEqual(len(struct), 3)
            for idx, model in enumerate(struct):
                self.assertEqual(model.serial_num, idx + 1)
                self.assertEqual(model.serial_num, model.id + 1)

        def confirm_single_end(fname):
            """Ensure there is only one END statement in multi-model files."""
            with open(fname) as handle:
                end_stment = []
                for iline, line in enumerate(handle):
                    if line.strip() == "END":
                        end_stment.append((line, iline))

            self.assertEqual(len(end_stment), 1)  # Only one?
            self.assertEqual(end_stment[0][1], iline)  # Last line of the file?

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PDBConstructionWarning)
            struct1 = self.parser.get_structure("1lcd", "PDB/1LCD.pdb")

        confirm_numbering(struct1)

        # Round trip: serialize and parse again
        self.io.set_structure(struct1)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)

            struct2 = self.parser.get_structure("1lcd", filename)
            confirm_numbering(struct2)
            confirm_single_end(filename)
        finally:
            os.remove(filename)

    def test_pdbio_write_x_element(self):
        """Write a structure with atomic element X with PDBIO."""
        struct1 = self.structure.copy()  # make copy so we can change it

        # Change element of one atom
        atom = next(struct1.get_atoms())
        atom.element = "X"  # X is assigned in Atom.py as last resort

        self.io.set_structure(struct1)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)
        finally:
            os.remove(filename)

    def test_pdbio_write_unk_element(self):
        """PDBIO raises ValueError when writing unrecognised atomic elements."""
        struct1 = self.structure.copy()  # make copy so we can change it

        atom = next(struct1.get_atoms())
        atom.element = "1"

        self.io.set_structure(struct1)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            with self.assertRaises(ValueError):
                self.io.save(filename)
        finally:
            # Remove the temp file even when the assertion fails.
            os.remove(filename)
def __init__(self, structId="subset"):
    """Create an empty ``Structure`` named *structId* and a quiet PDB parser."""
    self.structure = Structure(structId)
    self.pdbParser = PDBParser(QUIET=True)
    self.structId = structId
import json
import re
from abc import ABCMeta, abstractmethod
from Bio.PDB import PDBParser, NeighborSearch, PPBuilder, Residue
from enum import Enum
from Constants import *

# Structure ids for the two partners of a complex.
LIGAND_STRUCT_ID = 'ligand'
RECEPTOR_STRUCT_ID = 'receptor'
# NOTE(review): presumably the number of components in a PatchDock score - confirm.
N_PATCH_DOCK_SCORE_COMPONENTS = 4

# Module-level parser; QUIET=True suppresses parser warnings.
pdb_parser = PDBParser(QUIET=True)


class ComplexType(Enum):
    """Kinds of complex handled by this module."""
    zdock_benchmark_bound = 1
    zdock_benchmark_unbound = 2
    patch_dock = 3


class Complex(object):
    """Base class for a complex identified by ``complex_id``.

    The methods ``_is_processed``, ``_process_complex`` and
    ``_load_processed_data`` are called here but are not defined in this
    part of the file.
    """
    # Python 2 style metaclass declaration.
    __metaclass__ = ABCMeta

    def __init__(self, complex_id, reprocess=False):
        """Process the complex if needed, then load its processed data.

        Processing runs when the complex has not been processed yet or when
        ``reprocess`` is true.
        """
        self._complex_id = complex_id
        self._neighbours = None
        if (not self._is_processed()) or reprocess:
            self._process_complex()
        # NOTE(review): ``data`` is not used in the visible part of this method.
        data = self._load_processed_data()
def build_backbone_model(pdb_filename):
    """Measure the backbone geometry of chain A, one geometry per residue.

    The file is looked up under ``PDBdir``. Residues whose name is not a key
    of ``resdict`` are ignored. The first recognised residue keeps the
    default geometry; every later one gets bond lengths, bond angles and
    dihedrals measured against the previous recognised residue. Angles are
    stored in degrees.

    :param pdb_filename: file name relative to ``PDBdir``.
    :returns: list of geometry objects in chain order.
    :raises KeyError: when a recognised residue lacks an N, CA or C atom.
    """
    parser = PDBParser()
    structure = parser.get_structure("sample", path.join(PDBdir, pdb_filename))
    chain = structure[0]["A"]
    rad = 180.0 / math.pi

    model_structure_geo = []
    previous = None  # (N, CA, C) atoms of the last recognised residue
    for res in chain:
        resname = res.get_resname()
        if resname not in resdict:
            continue
        geo = Geometry.geometry(resdict[resname])
        N_curr = res["N"]
        CA_curr = res["CA"]
        C_curr = res["C"]

        if previous is not None:
            N_prev, CA_prev, C_prev = previous
            n1 = N_prev.get_vector()
            ca1 = CA_prev.get_vector()
            c1 = C_prev.get_vector()
            n = N_curr.get_vector()
            ca = CA_curr.get_vector()
            c = C_curr.get_vector()

            geo.CA_C_N_angle = calc_angle(ca1, c1, n) * rad
            geo.C_N_CA_angle = calc_angle(c1, n, ca) * rad

            # Subtracting two atoms yields the distance between them.
            geo.CA_N_length = CA_curr - N_curr
            geo.CA_C_length = CA_curr - C_curr
            geo.peptide_bond = N_curr - C_prev

            # All three dihedrals are stored on the current residue.
            geo.psi_im1 = calc_dihedral(n1, ca1, c1, n) * rad
            geo.omega = calc_dihedral(ca1, c1, n, ca) * rad
            geo.phi = calc_dihedral(c1, n, ca, c) * rad

            geo.N_CA_C_angle = calc_angle(n, ca, c) * rad

        previous = (N_curr, CA_curr, C_curr)
        model_structure_geo.append(geo)
    return model_structure_geo
class ResidueMutator(object):
    """Swap a non-standard or incomplete residue for a fitted standard one.

    Template residues are taken from tripeptide structures (the residue with
    id 2 of the chain named " " in every model) and superimposed onto the
    side-chain atoms the input residue shares with its standard parent.
    """

    def __init__(self, tripeptides=None, components=None, standard_residues=None):
        """Load the template residues.

        :param tripeptides: iterable of structure files readable by
            ``PDBParser``; defaults to ``data.tripeptides``.
        :param components: mapping residue name -> component record;
            defaults to ``data.chem_components``.
        :param standard_residues: container of standard residue names;
            defaults to ``data.standard_residues``.
        :raises ModuleNotFoundError: if BioPython is not installed.
        """
        try:
            from Bio.PDB import PDBParser
            from Bio.SVDSuperimposer import SVDSuperimposer
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "BioPython is required for this functionality")

        # get defaults if not provided
        if standard_residues is None:
            standard_residues = data.standard_residues
        if tripeptides is None:
            tripeptides = data.tripeptides
        if components is None:
            components = data.chem_components

        self.components = components
        self.candidates = {}
        self.standard_residues = standard_residues
        self.imposer = SVDSuperimposer()
        self.parser = PDBParser(PERMISSIVE=1, QUIET=True)

        # build up candidate structures: one template per model of each file
        for fn in tripeptides:
            structure = self.parser.get_structure("", fn)
            resn = structure[0][" "][2].get_resname()
            self.candidates[resn] = [model[" "][2] for model in structure]

    def mutate(self, residue, replace_backbone=True):
        """Return a standard replacement for ``residue``.

        :param residue: Bio.PDB residue to repair or mutate.
        :param replace_backbone: when true, main-chain atoms of the result
            take the coordinates of (or are copied from) ``residue``.
        :returns: a transformed copy of the best-fitting template, or
            ``False`` when no replacement can be made (unknown residue, no
            standard parent, no template, no shared side-chain atoms).
        """
        parent_field = '_chem_comp.mon_nstd_parent_comp_id'
        resn = residue.get_resname()
        if resn not in self.components:
            # unknown residue: report failure like every other dead end
            return False
        fixed_comp = self.components[resn]

        if self.standard(resn):
            # the residue is already a standard residue, here for repair
            parn = resn
        else:
            if parent_field not in fixed_comp:
                # no standard parent recorded, can't mutate
                return False
            parn = fixed_comp[parent_field]
            if not self.standard(parn):
                # the parent residue is a nonstandard residue, can't mutate
                return False

        if parn not in self.candidates:
            # parent not in candidate structures
            return False
        if parn not in self.components:
            return False

        # side chain atoms common to the fixed residue and its standard parent
        sc_fixed = set(fixed_comp['side_chain_atoms'])
        sc_movin = set(self.components[parn]['side_chain_atoms'])
        atom_names = sc_fixed.intersection(sc_movin)

        # keep only those actually present in the residue
        atom_list = [name for name in atom_names if name in residue]
        if len(atom_list) == 0:
            return False

        # get side chain atom coordinates
        fixed_coord = np.zeros((len(atom_list), 3))
        for i, name in enumerate(atom_list):
            fixed_coord[i] = residue[name].get_coord()

        # loop over candidates, finding best RMSD
        moved_coord = np.zeros((len(atom_list), 3))
        min_rms = 99999
        rotm = None
        tran = None
        min_candidate = None
        for candidate in self.candidates[parn]:
            for j, name in enumerate(atom_list):
                moved_coord[j] = candidate[name].get_coord()
            # perform SVD fitting
            self.imposer.set(fixed_coord, moved_coord)
            self.imposer.run()
            rms = self.imposer.get_rms()
            if rms < min_rms:
                min_rms = rms
                rotm, tran = self.imposer.get_rotran()
                min_candidate = candidate

        if min_candidate is None:
            # no template produced a usable fit
            return False

        # copy the candidate to a new object
        candidate = min_candidate.copy()
        candidate.transform(rotm, tran)
        stripHydrogens(candidate)

        if replace_backbone:
            # replace backbone atoms of candidate
            for atom in fixed_comp['main_chain_atoms']:
                if atom not in residue:
                    continue
                if atom not in candidate:
                    candidate.add(residue[atom].copy())
                candidate[atom].set_coord(residue[atom].get_coord())

        return candidate

    def standard(self, resname):
        """Return True when ``resname`` is one of the standard residues."""
        return resname in self.standard_residues

    def modified(self, resname):
        """Return True for a non-standard residue whose parent is standard."""
        parent_field = '_chem_comp.mon_nstd_parent_comp_id'
        if resname in self.standard_residues:
            # it's standard, not modified
            return False
        if resname not in self.components:
            return False
        if parent_field not in self.components[resname]:
            # has no standard parent field - can't be modified
            return False
        return self.components[resname][parent_field] in self.standard_residues
def PdbAtomIterator(source):
    """Yield one SeqRecord per chain, built from the ATOM records of a PDB file.

    ``source`` is handed unchanged to ``PDBParser.get_structure`` (a
    file-like object or a path).  The per-chain records themselves come from
    ``AtomIterator``; this function only determines the PDB ID and merges the
    parsed PDB header into the annotations of every record.

    The PDB ID is read from the ``idcode`` entry of the parsed header.  When
    that entry is empty a ``BiopythonParserWarning`` is issued and the ID
    "????" is used instead.

    This gets called internally via Bio.SeqIO for the atom based
    interpretation of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-atom"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    Equivalently,

    >>> with open("PDB/1A8O.pdb") as handle:
    ...     for record in PdbAtomIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    """
    # TODO - Add record.annotations to the doctest, esp the residues (not working?)

    # Deferred import: keeps SeqIO usable without pulling in Bio.PDB (and
    # therefore NumPy) until PDB parsing is actually requested.
    from Bio.PDB import PDBParser

    pdb_parser = PDBParser()
    structure = pdb_parser.get_structure(None, source)

    pdb_id = structure.header["idcode"]
    if not pdb_id:
        warnings.warn("'HEADER' line not found; can't determine PDB ID.",
                      BiopythonParserWarning)
        pdb_id = "????"

    chain_records = AtomIterator(pdb_id, structure)
    for record in chain_records:
        # Every chain shares the same header dictionary, so copy all of it
        # into the record's annotations.
        record.annotations.update(structure.header)
        # ENH - add letter annotations -- per-residue info, e.g. numbers
        yield record
self.shared_program.vert['radius'] = radius self.shared_program.frag['radius'] = radius self.shared_program.frag['color'] = color self._draw_mode = 'points' def _prepare_transforms(self, view): view.view_program.vert['transform'] = view.get_transform() from Bio.PDB import PDBParser, DSSP from molecular_data import crgbaDSSP, restype, colorrgba, vrad, resdict pdbdata = 'data/1yd9.pdb' parser = PDBParser(QUIET=True, PERMISSIVE=True) structure = parser.get_structure('model', pdbdata) def centroid(arr): length = arr.shape[0] sum_x = np.sum(arr[:, 0]) sum_y = np.sum(arr[:, 1]) sum_z = np.sum(arr[:, 2]) return sum_x / length, sum_y / length, sum_z / length atoms = [atom for atom in structure.get_atoms()] natoms = len(atoms) #atom coordinates coordinates = np.array([atom.coord for atom in atoms])
def pdb_neighbors(pdb_f, pdb_id):
    """List residues close to the phosphate oxygens of PTR/SEP/TPO residues.

    For every PTR, SEP or TPO residue, atoms within ``BOND_CUTOFF`` of its
    phosphate/hydroxyl oxygens are collected, plus selected side-chain
    nitrogens within ``POSITIVE_BOND_CUTOFF``.  Atoms of the residue itself,
    atoms whose name contains neither "N" nor "O", waters, main-chain "O",
    ASN OD1 and GLN OE1 are dropped.  If any remaining atom belongs to a
    non-standard residue, the whole neighbour set of that centre is
    discarded.

    :param pdb_f: PDB file passed to ``PDBParser.get_structure``.
    :param pdb_id: structure id, also the first element of every result.
    :returns: list of ``(pdb_id, centre, partners)`` where ``centre`` and
        each partner are "RESNAME_RESSEQ_CHAIN" strings; centres without
        partners are omitted.
    """
    centre_oxygens = {
        'PTR': ['O1P', 'O2P', 'O3P', 'OH'],
        'SEP': ['O1P', 'O2P', 'O3P', 'OG'],
        'TPO': ['O1P', 'O2P', 'O3P', 'OG1'],
    }
    positive_nitrogens = ['NE2', 'ND1', 'NZ', 'NE', 'NH2', 'NH1']
    standard_names = [
        'VAL', 'ILE', 'LEU', 'GLU', 'GLN', 'ASP', 'ASN', 'HIS', 'TRP', 'PHE',
        'TYR', 'ARG', 'LYS', 'SER', 'THR', 'MET', 'ALA', 'GLY', 'PRO', 'CYS'
    ]

    def label(entity):
        # "RESNAME_RESSEQ_CHAIN" for a residue
        return (entity.get_resname() + '_' + str(entity.get_id()[1]) + '_' +
                entity.get_parent().get_id())

    structure = PDBParser().get_structure(pdb_id, pdb_f)
    searcher = NeighborSearch(Selection.unfold_entities(structure, 'A'))

    neighbors = []
    for res in structure.get_residues():
        centre_name = res.get_resname()
        if centre_name not in centre_oxygens:
            continue
        wanted = centre_oxygens[centre_name]
        central_atoms = [a for a in res.child_list if a.get_name() in wanted]

        # everything within the general cutoff ...
        nearby = []
        for centre in central_atoms:
            nearby.extend(searcher.search(centre.get_coord(), BOND_CUTOFF))
        # ... plus positively charged nitrogens within the longer cutoff
        for centre in central_atoms:
            for hit in searcher.search(centre.get_coord(),
                                       POSITIVE_BOND_CUTOFF):
                if hit.get_name() in positive_nitrogens:
                    nearby.append(hit)

        kept = []
        for atom in set(nearby):
            atom_name = atom.get_name()
            owner = atom.get_parent()
            owner_name = owner.get_resname()
            if owner == res:
                continue  # filter self
            if 'N' not in atom_name and 'O' not in atom_name:
                continue  # only consider those containing N or O
            if owner_name == 'HOH':
                continue  # ignore water
            if atom_name == 'O':
                continue  # main-chain O is not a donor
            if atom_name == 'OD1' and owner_name == 'ASN':
                continue  # O in N is not a donor
            if atom_name == 'OE1' and owner_name == 'GLN':
                continue  # O in Q is not a donor
            kept.append(atom)

        # one non-standard partner invalidates the whole entry
        if any(a.get_parent().get_resname() not in standard_names
               for a in kept):
            kept = []

        partners = [
            r for r in set(Selection.unfold_entities(kept, 'R')) if r != res
        ]
        if len(partners) > 0:
            neighbors.append(
                (pdb_id, label(res), [label(r) for r in partners]))
    return neighbors
#!/usr/bin/env python # coding: utf-8 from Bio.PDB import Atom from math import sqrt from Bio.PDB import PDBParser import argparse import sys prot_id = "5AGY.pdb" prot_file = sys.argv[1] parser = PDBParser(PERMISSIVE = 1) structure = parser.get_structure(prot_id, prot_file) model = structure[0] if "-h" in sys.argv or "--help" in sys.argv: print("Ce programme identifie les interactions entre cycles aromatiques à partir d'un fichier Protein Data Bank (PDB). Les critères pris en compte proviennent du Protein Interaction Calculator que l'on peut retrouver en suivant le lien : http://pic.mbu.iisc.ernet.in/PIC_Criteria.pdf. Le parser de Biopython est strucutré de la manière suivante : Structure/model/chain/residu/atome.") print("Fonctions utilisées :") print('parser.get_structure --> ', 'Creation of a structure object from a PDB file') print('objet.get_name --> ', 'Renvoie le nom correspondant à l objet : Structure/model/chain/residu/atome') print('parser.get_structure --> ', 'Renvoie le numéro rattaché au résidue dans le fichier PDB') print('') residues = [] aroaro = ["PHE", "TRP", "TYR"] for chain in model: # protéine -> chaîne -> résidues impliqués dans interactions aromatiques / aromatiques
import numpy
import argparse
from Bio.PDB import PDBParser
from Bio.PDB.PDBExceptions import PDBConstructionWarning
import warnings

# Silence Bio.PDB construction warnings for the rest of the process.
warnings.filterwarnings("ignore", category=PDBConstructionWarning)

# Single positional argument: the structure file to measure.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("infile", help="Input file")
args = arg_parser.parse_args()
fname = args.infile

pdb_parser = PDBParser()
# Ignore PDB warnings, we are just interested in the size
structure = pdb_parser.get_structure(0, fname)
atoms = list(structure.get_atoms())
natoms = len(atoms)

# One row per atom, columns x/y/z.
coords = numpy.zeros((natoms, 3))
for index, this_atom in enumerate(atoms):
    coords[index, :] = this_atom.get_vector().get_array()
coords /= 10  # Convert from Ångström to nm.

# Extent of the structure along x (max minus min coordinate).
x_size = coords[:, 0].max() - coords[:, 0].min()
def pdb2dfromactivesite(pdb_fh, active_sites=None):
    """
    This calculates distances between each ligand atom or optionally provided
    amino acids (sources) and each residue in the protein.

    Only chain "A" of the first model is used.  Sources are (a) every
    residue of the chain that is neither in the junk list nor an amino acid
    and (b) every amino acid whose residue number is in ``active_sites``.
    Distances are measured from each source atom to the CA atom of every
    amino acid of the chain.

    :param pdb_fh: path to .pdb file.
    :param active_sites: optional list of residue numbers as sources.
    :returns dfromligands: pandas table with distances from ligand, indexed
        by residue number ("aasi"), with per-atom columns followed by
        average/minimum summary columns.
    :raises KeyError: if chain "A" is missing or an amino acid has no CA atom.
    """
    if active_sites is None:
        active_sites = []
    junk_residues = [
        "HOH", " MG", "CA", " NA", "SO4", "IOD", "NA", "CL", "GOL", "PO4"
    ]
    pdb_parser = PDBParser()
    pdb_data = pdb_parser.get_structure("pdb_name", pdb_fh)
    model = pdb_data[0]
    chainA = model["A"]  # only a chain

    ligands_residue_objs = []
    for residue in chainA:
        resname = residue.get_resname()
        if resname in junk_residues:
            continue
        # non amino acids are ligands; amino acids only count when listed
        if resname not in aas_21_3letter or residue.id[1] in active_sites:
            ligands_residue_objs.append(residue)

    dfromligands = pd.DataFrame()
    for ligand_residue_obj in ligands_residue_objs:
        ligand_resname = ligand_residue_obj.get_resname()
        is_ligand = ligand_resname not in aas_21_3letter
        for ligand_atom_obj in ligand_residue_obj:
            atom_name = ligand_atom_obj.get_name()
            # The column label depends only on the source atom.
            if is_ligand:
                column = "Distance from Ligand: %s (ATOM: %s)" % \
                    (ligand_resname, atom_name)
            else:
                column = "Distance from active site residue: %s %d (ATOM: %s)" % \
                    (ligand_resname, ligand_residue_obj.get_id()[1], atom_name)
            for residue in chainA:
                if residue.get_resname() in aas_21_3letter:  # only aas
                    # "ref_pdb" makes sure the row exists; it is dropped below
                    dfromligands.loc[residue.id[1], "ref_pdb"] = \
                        residue.get_resname()
                    dfromligands.loc[residue.id[1], column] = \
                        ligand_residue_obj[atom_name] - residue["CA"]

    dfromligands.index.name = "aasi"
    if "ref_pdb" in dfromligands:
        del dfromligands["ref_pdb"]

    # average and minimum distances
    cols_all = dfromligands.columns.tolist()
    for moltype in [
            'Distance from Ligand:', 'Distance from active site residue:'
    ]:
        cols_moltype = [c for c in cols_all if moltype in c]
        # NOTE(review): the guard tests cols_all, not cols_moltype, so summary
        # columns are also written for a source type that has no columns.
        # Kept as is because callers may expect these columns.
        if len(cols_all) > 0:
            dfromligands.loc[:, '%s average' % moltype] = \
                dfromligands.loc[:, cols_moltype].T.mean()
            dfromligands.loc[:, '%s minimum' % moltype] = \
                dfromligands.loc[:, cols_moltype].T.min()
            mols = np.unique(
                [c[c.find(moltype):c.find(' (ATOM')] for c in cols_moltype])
            if len(mols) > 1:
                # several sources of this type: summarise each one as well
                for mol in mols:
                    cols_mol = [c for c in cols_moltype if mol in c]
                    dfromligands.loc[:, '%s: average' % mol] = \
                        dfromligands.loc[:, cols_mol].T.mean()
                    dfromligands.loc[:, '%s: minimum' % mol] = \
                        dfromligands.loc[:, cols_mol].T.min()

    return dfromligands
def PdbAtomIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.  Only the first model is used.

    Residues whose upper-cased name is not in ``to_one_letter_code`` are
    left out; chains with no remaining residue are skipped.

    In addition to information from the PDB header (which is the same for all
    records), the following chain specific information is placed in the
    annotation:

    record.annotations["model"] = Model ID (typically zero)
    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["start"] = number of the first residue used
    record.annotations["end"] = number of the last residue used

    Where amino acids are missing from the structure, as indicated by residue
    numbering, the sequence is filled in with 'X' characters to match the size
    of the missing region.  If the numbering goes backwards after a gap, a
    warning is issued and the rest of the chain is dropped.

    The PDB ID is taken from columns 63-66 of the first line when it is a
    HEADER line; otherwise a warning is issued and "????" is used.

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.
    """
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB import PDBParser
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return to_one_letter_code.get(residue.resname, 'X')

    # Deduce the PDB ID from the PDB header
    # ENH: or filename?
    from Bio.File import UndoHandle
    undo_handle = UndoHandle(handle)
    firstline = undo_handle.peekline()
    if firstline.startswith("HEADER"):
        pdb_id = firstline[62:66]
    else:
        warnings.warn("First line is not a 'HEADER'; can't determine PDB ID")
        pdb_id = '????'

    struct = PDBParser().get_structure(pdb_id, undo_handle)
    model = struct[0]
    # items() instead of the Python-2-only iteritems()
    for chn_id, chain in sorted(model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = [res for res in chain.get_unpacked_list()
                    if res.get_resname().upper() in to_one_letter_code]
        if not residues:
            continue
        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions)
        gaps = []
        rnumbers = [r.id[1] for r in residues]
        for i, rnum in enumerate(rnumbers[:-1]):
            if rnumbers[i+1] != rnum + 1:
                # It's a gap!
                gaps.append((i+1, rnum, rnumbers[i+1]))
        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    gapsize = postgap - pregap - 1
                    res_out.extend(map(restype, residues[prev_idx:i]))
                    prev_idx = i
                    res_out.append('X'*gapsize)
                else:
                    warnings.warn("Ignoring out-of-order residues after a gap",
                                  UserWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(map(restype, residues[prev_idx:i]))
                    break
            else:
                # Last segment: added once, after all gaps were handled, so
                # chains with several gaps do not repeat their tail.
                res_out.extend(map(restype, residues[prev_idx:]))
        else:
            # No gaps
            res_out = map(restype, residues)
        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(Seq(''.join(res_out), generic_protein),
                           id=record_id,
                           description=record_id,
                           )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
def main():
    """Run conkit-validate: check a PDB model against a distance prediction.

    Reads the sequence, the distance prediction and the model, computes DSSP
    for the first model of the PDB file, writes the validation figure to
    ``args.output`` and logs a per-residue table of scores and suggested
    registers.

    Raises FileExistsError when the output exists and overwriting was not
    requested, and ValueError for a non-PDB model format or a sequence
    shorter than 5 residues.
    """
    args = create_argument_parser().parse_args()

    global logger
    logger = conkit.command_line.setup_logging(level="info")

    output_exists = os.path.isfile(args.output)
    if output_exists and not args.overwrite:
        raise FileExistsError('The output file {} already exists!'.format(args.output))
    if args.pdbformat != 'pdb':
        raise ValueError('Model file format can only be PDB')

    logger.info(os.linesep + "Working directory: %s", os.getcwd())

    # --- inputs -----------------------------------------------------------
    logger.info("Reading input sequence: %s", args.seqfile)
    sequence = conkit.io.read(args.seqfile, args.seqformat).top
    seq_length = len(sequence)
    if seq_length < 5:
        raise ValueError('Cannot validate model with less than 5 residues')
    logger.info("Length of the sequence: %d", seq_length)

    logger.info("Reading input distance prediction: %s", args.distfile)
    prediction = conkit.io.read(args.distfile, args.distformat).top

    logger.info("Reading input PDB model: %s", args.pdbfile)
    model = conkit.io.read(args.pdbfile, args.pdbformat).top
    first_model = PDBParser().get_structure('structure', args.pdbfile)[0]
    dssp = DSSP(first_model, args.pdbfile, dssp=args.dssp, acc_array='Wilke')

    # --- validation -------------------------------------------------------
    logger.info(os.linesep + "Validating model.")
    if seq_length > 500:
        logger.info("Input model has more than 500 residues, this might take a while...")
    figure = conkit.plot.ModelValidationFigure(model, prediction, sequence, dssp,
                                               map_align_exe=args.map_align_exe)
    figure.savefig(args.output, overwrite=args.overwrite)
    logger.info(os.linesep + "Validation plot written to %s", args.output)

    # --- report -----------------------------------------------------------
    table = PrettyTable()
    table.field_names = ["Residue", "Predicted score", "Suggested register"]
    rows = figure.data.loc[:, ['RESNUM', 'SCORE', 'MISALIGNED']].values
    for resnum, score, misalignment in rows:
        residue_cell = '{} ({})'.format(sequence.seq[resnum - 1], resnum)
        # scores above 0.5 are flagged with asterisks
        if score > 0.5:
            score_cell = '*** {0:.2f} ***'.format(score)
        else:
            score_cell = ' {0:.2f} '.format(score)
        # a register is only suggested for misaligned residues that have an
        # entry in the alignment
        if misalignment and resnum in figure.alignment.keys():
            target = figure.alignment[resnum]
            register_cell = '*** {} ({}) ***'.format(sequence.seq[target - 1], target)
        else:
            register_cell = ' '
        table.add_row([residue_cell, score_cell, register_cell])

    logger.info(os.linesep)
    logger.info(table)
def __init__(self):
    """Start with no cached results and a quiet, permissive PDB parser."""
    # Filled in later; None means "not computed yet".
    self.df_structure = None
    self.pairwise_dist = None
    self.parser = PDBParser(PERMISSIVE=True, QUIET=True)
# Benchmark the parsing of a PDB file given as an argument
#
# Usage: pass the path of the PDB file as the first command line argument;
# the wall-clock time spent in get_structure is printed in seconds.

import sys
import time

from Bio.PDB import PDBParser

pdb_filepath = sys.argv[1]
parser = PDBParser()

# Only the parsing call itself is timed, not the parser construction.
start = time.time()
parser.get_structure("", pdb_filepath)
elapsed = time.time() - start

# Function-call form works on both Python 2 and Python 3.
print(elapsed)
def parsePDBStructure(pdb_id):
    """Parse ``pdb_id`` with a default PDBParser and return the structure.

    ``pdb_id`` is passed to ``get_structure`` as the file argument; the
    structure id is always 'test_rsa'.
    """
    return PDBParser().get_structure('test_rsa', pdb_id)
search_dict = pypdb.Query(query)  # create a dictionary containing search information
found = search_dict.search(search_dict)[:500]  # create a list of these PDBs by searching RCSB (first 500 hits only)

# create a list with the information and the metadata
metadata = []
for proteins in found:  # for the items in the list,
    metadata.append(pypdb.describe_pdb(proteins))  # append the dictionary

# Save the metadata list as a CSV file
dfm = pd.DataFrame(metadata)  # convert to a Pandas DF
dfm.to_csv('metadata_'+now+'.csv')  # save as a CSV file

# %%
parser = PDBParser()  # create a parser
pdbs = list()
pdbl = PDBList()

# Download all PDB structures in the previous list if they aren't there
for id in found:
    # Retrieve in PDB format, put in directory 'PDB'
    pdbl.retrieve_pdb_file(pdb_code=id, file_format='pdb', pdir=PDB_dl_dir)

# Finished, print "Downloading ... finished!"
print('\n#############~DOWNLOADING COMPLETE~#############\n')

# %%
# convert pdb*.ent to *.pdb
# The directory listing is materialised first so that renaming does not
# happen while the scandir iterator is still open.
for file in list(os.scandir(PDB_dl_dir)):
    if (file.path.endswith(".ent") and file.is_file()):
        stem = file.name[:-len(".ent")]
        # Strip only the leading "pdb" prefix: an ID that itself contains
        # "pdb" (e.g. pdb1pdb.ent) must keep it.
        if stem.startswith("pdb"):
            stem = stem[len("pdb"):]
        newfn = stem + ".pdb"
        os.rename(file.path, os.path.join(PDB_dl_dir, newfn))
def test_Superimposer(self): """Test on module that superimpose two protein structures.""" pdb1 = "PDB/1A8O.pdb" p = PDBParser() s1 = p.get_structure("FIXED", pdb1) fixed = Selection.unfold_entities(s1, "A") s2 = p.get_structure("MOVING", pdb1) moving = Selection.unfold_entities(s2, "A") rot = numpy.identity(3).astype("f") tran = numpy.array((1.0, 2.0, 3.0), "f") for atom in moving: atom.transform(rot, tran) sup = Superimposer() sup.set_atoms(fixed, moving) self.assertTrue(numpy.allclose(sup.rotran[0], numpy.identity(3))) self.assertTrue( numpy.allclose(sup.rotran[1], numpy.array([-1.0, -2.0, -3.0]))) self.assertAlmostEqual(sup.rms, 0.0, places=3) # Turn black code style off # fmt: off atom_list = [ "N", "C", "C", "O", "C", "C", "SE", "C", "N", "C", "C", "O", "C", "C", "O", "O", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "N", "C", "N", "N", "N", "C", "C", "O", "C", "C", "C", "O", "N", "N", "C", "C", "O", "N", "C", "C", "O", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "N", "C", "C", "O", "C", "C", "C", "O", "O", "N", "C", "C", "O", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "N", "C", "N", "N", "N", "C", "C", "O", "C", "C", "O", "O", "N", "C", "C", "O", "C", "C", "C", "C", "C", "C", "C", "O", "N", "C", "C", "O", "C", "C", "C", "N", "C", "C", "O", "C", "C", "O", "O", "N", "C", "C", "O", "C", "C", "C", "N", "C", "N", "N", "N", "C", "C", "O", "C", "C", "C", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "C", "C", "C", "O", "N", "C", "C", "O", "C", "C", "C", "C", "N", "N", "C", "C", "O", "C", "O", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "N", "C", "N", "N", "N", "C", "C", "O", "C", "N", "C", "C", "O", "C", "C", "C", "O", "O", "N", "C", "C", "O", "C", "C", "C", "O", "N", "N", "C", "C", "O", "C", "N", "C", "C", "O", "C", "O", "N", "C", "C", "O", "C", "C", "C", "O", "N", "N", "C", "C", "O", "C", "C", "C", "O", "O", 
"N", "C", "C", "O", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "N", "C", "C", "O", "C", "C", "O", "N", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "SE", "C", "N", "C", "C", "O", "C", "O", "C", "N", "C", "C", "O", "C", "C", "C", "O", "O", "N", "C", "C", "O", "C", "O", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "O", "N", "N", "C", "C", "O", "C", "C", "O", "N", "N", "C", "C", "O", "C", "N", "C", "C", "O", "C", "C", "O", "N", "N", "C", "C", "O", "C", "C", "C", "N", "C", "C", "O", "C", "C", "O", "O", "N", "C", "C", "O", "C", "S", "N", "C", "C", "O", "C", "C", "C", "C", "N", "N", "C", "C", "O", "C", "O", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "N", "C", "C", "O", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "O", "N", "C", "C", "O", "C", "C", "C", "N", "C", "C", "O", "N", "C", "C", "O", "C", "N", "C", "C", "O", "C", "O", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "O", "O", "N", "C", "C", "O", "C", "C", "C", "O", "O", "N", "C", "C", "O", "C", "C", "SE", "C", "N", "C", "C", "O", "C", "C", "SE", "C", "N", "C", "C", "O", "C", "O", "C", "N", "C", "C", "O", "C", "N", "C", "C", "O", "C", "S", "N", "C", "C", "O", "C", "C", "C", "O", "N", "N", "C", "C", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O" ] # Turn black code style on # fmt: on sup.apply(moving) atom_moved 
= [] for aa in moving: atom_moved.append(aa.element) self.assertEqual(atom_moved, atom_list)
return 0 def accept_residue(self, residue): hetatm_flag, resseq, icode = residue.get_id() # print(residue.get_id()) if hetatm_flag != " ": # skip HETATMS return 0 if icode != " ": warnings.warn("WARNING: Icode %s at position %s" % (icode, resseq), BiopythonWarning) return 1 for pdb_id_a in pdb_list: pdb_id = pdb_id_a[3:7] if len(pdb_id_a.split('_')) != 3: continue pdb_chain = pdb_id_a.split('_')[2] data_path = f'data/validation_pdb/pdb{pdb_id.lower()}.ent' if not os.path.exists(data_path): continue p = PDBParser() structure = p.get_structure('X', data_path) sel = ChainSelect(pdb_chain) io = PDBIO() io.set_structure(structure) io.save(f'data/validation_pdb/chain/{pdb_id}{pdb_chain}.pdb', sel)
def getStructure(name, filename):
    """Parse ``filename`` permissively and return the structure named ``name``."""
    # faster in shell ?
    from Bio.PDB import PDBParser

    return PDBParser(PERMISSIVE=1).get_structure(name, filename)
type = str, \ help = "Chain from which residues should be removed") argparser.add_argument("--start", \ dest = "start", \ type = int, \ help = "First residue to be removed from the chain") argparser.add_argument("--end", \ dest = "end", \ type = int, \ help = "Last residue to be removed from the chain") # Get the arguments args = argparser.parse_args() in_pdb_file = args.in_pdb_file out_pdb_file = args.out_pdb_file chain = args.chain start = args.start end = args.end # Create a PDB parser parser = PDBParser() # Parse the structure name = in_pdb_file.replace(".pdb", "") structure = parser.get_structure(name, in_pdb_file) # Save the processed structure w = PDBIO() w.set_structure(structure) w.save(out_pdb_file, \ NotInRangeResSelect(chain = chain, \ start = start, \ end = end))
# The twenty standard amino acid names.
aa = [
    'PRO', 'TYR', 'THR', 'VAL', 'PHE', 'ARG', 'GLY', 'CYS', 'ALA', 'LEU',
    'MET', 'ASP', 'GLN', 'SER', 'TRP', 'LYS', 'GLU', 'ASN', 'ILE', 'HIS'
]

from os import chdir

# Data root; all paths below are built relative to it.
b = '/Users/nicholassofroniew/Documents/DATA-proteins/'
chdir(b)

from glob import glob
from os.path import exists

files = glob(b + 'pdb/*/*.ent')
p = PDBParser()
for f in files:
    name = f[len(b):]  # path relative to the data root
    print(name)
    # Skip files that already have a parsed or rejected CSV.
    # name[6:-4] drops the leading 'pdb/xx' and the '.ent' suffix.
    if not exists(b + 'pdb-parsed' + name[6:-4] +
                  '.csv') and not exists(b + 'pdb-rejected' + name[6:-4] +
                                         '.csv'):
        try:
            structure = p.get_structure('X', b + name)
            df = parse(structure)
            flag = check(df)
            if not flag:
                # failed the check: represented by an empty frame
                df = DataFrame([])
        except Exception:
            # Any parsing failure is treated like a rejected structure, but
            # KeyboardInterrupt/SystemExit still stop the batch run.
            df = DataFrame([])
def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
    """Takes a PDB string and constructs a Protein object.

    WARNING: All non-standard residue types will be converted into UNK. All
      non-standard atoms will be ignored.

    Args:
      pdb_str: The contents of the pdb file
      chain_id: If None, every chain of the single model is parsed and the
        chains are concatenated in file order. If chain_id is specified
        (e.g. A), then only that chain is parsed.

    Returns:
      A new `Protein` parsed from the pdb contents.

    Raises:
      ValueError: if the file does not contain exactly one model, or a
        residue carries an insertion code.
    """
    pdb_fh = io.StringIO(pdb_str)
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('none', pdb_fh)
    models = list(structure.get_models())
    if len(models) != 1:
        raise ValueError(
            f'Only single model PDBs are supported. Found {len(models)} models.'
        )
    model = models[0]

    if chain_id is not None:
        chain = model[chain_id]
        chains = [chain]
    else:
        chains = list(model.get_chains())
        # The single-chain restriction is disabled, so multi-chain files are
        # accepted here:
        # if len(chains) != 1:
        #   raise ValueError(
        #       'Only single chain PDBs are supported when chain_id not specified. '
        #       f'Found {len(chains)} chains.')
        # else:
        #   chain = chains[0]

    atom_positions = []
    aatype = []
    atom_mask = []
    residue_index = []
    b_factors = []

    # Extra offset added to residue numbers per chain index (k) so that
    # consecutive chains are separated in residue_index.
    PARAM_CHAIN_BREAK = 100
    # Last residue index emitted for the preceding chain (0 for the first).
    residue_index_prev = 0

    for k, chain in enumerate(chains):
        for res in chain:
            if res.id[2] != ' ':
                raise ValueError(
                    f'PDB contains an insertion code at chain {chain.id} and residue '
                    f'index {res.id[1]}. These are not supported.')
            # Unknown residue names map to 'X', which in turn maps to the
            # index residue_constants.restype_num.
            res_shortname = residue_constants.restype_3to1.get(
                res.resname, 'X')
            restype_idx = residue_constants.restype_order.get(
                res_shortname, residue_constants.restype_num)
            # Per-residue arrays with one slot per known atom type.
            pos = np.zeros((residue_constants.atom_type_num, 3))
            mask = np.zeros((residue_constants.atom_type_num, ))
            res_b_factors = np.zeros((residue_constants.atom_type_num, ))
            for atom in res:
                if atom.name not in residue_constants.atom_types:
                    continue
                pos[residue_constants.atom_order[atom.name]] = atom.coord
                mask[residue_constants.atom_order[atom.name]] = 1.
                res_b_factors[residue_constants.atom_order[
                    atom.name]] = atom.bfactor
            if np.sum(mask) < 0.5:
                # If no known atom positions are reported for the residue then skip it.
                continue
            aatype.append(restype_idx)
            atom_positions.append(pos)
            atom_mask.append(mask)
            # Residue number shifted by the end of the previous chain plus
            # the per-chain break offset.
            residue_index.append(res.id[1] + residue_index_prev +
                                 PARAM_CHAIN_BREAK * k)
            b_factors.append(res_b_factors)
        # NOTE(review): assumed to run once per chain (after its residues);
        # the nesting level is not recoverable from the collapsed source --
        # confirm against the original file.
        residue_index_prev = residue_index[-1]

    return Protein(atom_positions=np.array(atom_positions),
                   atom_mask=np.array(atom_mask),
                   aatype=np.array(aatype),
                   residue_index=np.array(residue_index),
                   b_factors=np.array(b_factors))
for i in range(0, L): residues[i].xtra["SS_PSEA"] = ss_seq[i] #os.system("rm "+fname) class PSEA: def __init__(self, model, filename): ss_seq = psea(filename) ss_seq = psea2HEC(ss_seq) annotate(model, ss_seq) self.ss_seq = ss_seq def get_seq(self): """ Return secondary structure string. """ return self.ss_seq if __name__ == "__main__": import sys from Bio.PDB import PDBParser # Parse PDB file p = PDBParser() s = p.get_structure('X', sys.argv[1]) # Annotate structure with PSEA sceondary structure info PSEA(s[0], sys.argv[1])
def parse_pdb_coordinates(pdb_path: str,
                          start_position: int,
                          end_position: int,
                          position_correction: int,
                          chain: str,
                          sasa: bool = False) -> DataFrame:
    """
    Parse the coordinates of the CA atoms of one chain.

    Positions ``start_position + position_correction`` up to (but not
    including) ``end_position + position_correction`` are looked up in the
    first model of the structure. Positions whose residue or CA atom is
    missing are reported on stdout and kept as NaN rows, so incomplete PDB
    files are tolerated.

    Parameters
    ----------
    pdb_path : str
        Path of the PDB file.
    start_position, end_position : int
        Half-open range of positions to read (before correction).
    position_correction : int
        Offset added to both range bounds before looking residues up.
    chain : str
        Chain identifier to read from.
    sasa : bool, default False
        If True, SASA values computed with freesasa and the log10 of the CA
        B-factors are merged into the result.

    Returns
    -------
    DataFrame
        Columns ``x``, ``y``, ``z``, ``Position``, the squared per-axis
        offsets from the point returned by ``centroid`` (``x_cent``,
        ``y_cent``, ``z_cent``) and their sum ``Distance``. With ``sasa=True``
        the columns ``SASA`` and ``log B-factor`` are added.
    """
    # Get structure from PDB
    structure = PDBParser().get_structure('pdb', pdb_path)
    coordinates = []
    commands = []
    bfactors = []
    positions_worked = []  # positions present in pdb

    # Iterate over each CA atom and get coordinates
    for i in np.arange(start_position + position_correction,
                       end_position + position_correction):
        # A missing model, chain, residue or CA atom surfaces as KeyError.
        try:
            ca_atom = structure[0][chain][int(i)]["CA"]
        except KeyError:
            print("residue {} not found".format(str(i)))
            coordinates.append([np.nan, np.nan, np.nan, i])
            continue

        coordinates.append(list(ca_atom.get_vector()) + [i])

        # Get SASA command for each residue and bfactor
        commands.append("s{}, chain {} and resi {}".format(
            str(i), chain, str(i)))
        bfactors.append(np.log10(ca_atom.get_bfactor()))
        positions_worked.append(i)

    # Convert to df
    df_coordinates = DataFrame(columns=['x', 'y', 'z', 'Position'],
                               data=coordinates)

    # Center data
    x, y, z = centroid(df_coordinates)
    df_coordinates['x_cent'] = (df_coordinates['x'] - x).abs()**2
    df_coordinates['y_cent'] = (df_coordinates['y'] - y).abs()**2
    df_coordinates['z_cent'] = (df_coordinates['z'] - z).abs()**2
    df_coordinates['Distance'] = df_coordinates['x_cent'] + df_coordinates[
        'y_cent'] + df_coordinates['z_cent']

    # Add sasa values
    if sasa:
        # Get structure for SASA
        structure_sasa = freesasa.Structure(pdb_path)
        result = freesasa.calc(structure_sasa)
        # Calculate sasa
        sasa_area = freesasa.selectArea(commands, structure_sasa, result)
        df_sasa: DataFrame = DataFrame(columns=['SASA'],
                                       data=sasa_area.values())
        df_sasa['log B-factor'] = bfactors
        df_sasa['Position'] = positions_worked

        # Merge; the outer join keeps the NaN rows of missing positions.
        df_coordinates = df_coordinates.merge(df_sasa,
                                              how='outer',
                                              on='Position')

    return df_coordinates
""" if self.rotran is None: raise PDBException("No transformation has been calculated yet") rot, tran = self.rotran rot = rot.astype('f') tran = tran.astype('f') for atom in atom_list: atom.transform(rot, tran) if __name__ == "__main__": import sys from Bio.PDB import PDBParser, Selection p = PDBParser() s1 = p.get_structure("FIXED", sys.argv[1]) fixed = Selection.unfold_entities(s1, "A") s2 = p.get_structure("MOVING", sys.argv[1]) moving = Selection.unfold_entities(s2, "A") rot = numpy.identity(3).astype('f') tran = numpy.array((1.0, 2.0, 3.0), 'f') for atom in moving: atom.transform(rot, tran) sup = Superimposer() sup.set_atoms(fixed, moving)
class PandasMolStructure:
    """Tabular (pandas) view of a PDB structure parsed with Bio.PDB.

    The parsed atom table is cached on the instance in ``df_structure`` after
    the first successful call to ``get_pandas_structure``.
    """

    def __init__(self):
        self.parser = PDBParser(QUIET=True, PERMISSIVE=True)
        self.df_structure = None
        self.pairwise_dist = None

    def get_pandas_structure(self,
                             pdb_file: str = None,
                             het_atom=False) -> pd.DataFrame:
        # TODO split get and init method
        """Constructs a pandas.DataFrame representation of PDB protein structure

        Only the first model of the file is read. Once a structure has been
        loaded it is cached and ``pdb_file`` is ignored on later calls.

        Args:
            pdb_file (str): Path to PDB file
            het_atom (bool): If True, return every atom. If False, keep only
                rows whose residue name is in the amino-acid list obtained
                from ``utils.get_AA_list``.

        Returns:
            pd.DataFrame: One row per atom with the columns
                {"model", "chain", "residue", "res_pos", "atom", "atom_pos",
                 "is_hetatom", "x", "y", "z"}
        """
        assert not (self.df_structure is None and pdb_file is None),\
            "Data has not been initialized yet and no pdb file was provided"

        if self.df_structure is None:
            df_dict = {
                "model": [],
                "chain": [],
                "residue": [],
                "res_pos": [],
                "atom": [],
                "atom_pos": [],
                "is_hetatom": [],
                "x": [],
                "y": [],
                "z": []
            }
            structure = self.parser.get_structure("protein_1", pdb_file)
            # TODO Option should be given to choose if we want to use all
            # models from NMR pdb_samples
            model = next(structure.get_models())
            for chain in model.get_chains():
                for residue in chain.get_residues():
                    # The first field of a residue id is the hetero flag; it
                    # is a single blank for standard (ATOM) residues.
                    is_hetero = residue.id[0] != " "
                    for atom in residue.get_atoms():
                        cords = atom.get_coord()
                        df_dict["model"].append(model.id)
                        df_dict["chain"].append(chain.id)
                        df_dict["residue"].append(residue.get_resname())
                        df_dict["res_pos"].append(residue.id[1])
                        df_dict["atom"].append(atom.get_name())
                        df_dict["atom_pos"].append(atom.serial_number)
                        df_dict["is_hetatom"].append(is_hetero)
                        df_dict["x"].append(cords[0])
                        df_dict["y"].append(cords[1])
                        df_dict["z"].append(cords[2])

            self.df_structure = pd.DataFrame(df_dict)

        if het_atom:
            return self.df_structure

        # het_atoms_to_ignore = ["HOH","NAG", "FUC", "MAN", "GAL", "SO4"]
        # FIXME Atoms to ignore should be based on HETATOM
        atoms_to_not_ignore = utils.get_AA_list(
            config.folder_structure_cfg.aminoacids_csv)
        return self.df_structure[self.df_structure["residue"].isin(
            atoms_to_not_ignore)]

    def get_atom_3Dcoord(self, pdb_file: str) -> np.ndarray:
        """Returns numpy array of 3D atom positions

        Args:
            pdb_file (str): Path to pdb file

        Returns:
            np.ndarray: Array of shape Nx3 where N is the number of atoms in
                pdb_file
        """
        # TODO filter out heteroatoms (water)
        structure = self.parser.get_structure("protein_1", pdb_file)
        return np.array([atom.get_coord() for atom in structure.get_atoms()])

    @staticmethod
    def _collapse_to_residues(structure_df: pd.DataFrame) -> pd.DataFrame:
        """Reduce a per-atom table to one (residue, res_pos) row per residue.

        A new residue starts whenever the chain, the residue position or the
        residue name differs from the previous row, so consecutive residues
        of the same type are kept as separate rows.
        """
        keys = structure_df[["chain", "res_pos", "residue"]]
        starts_residue = (keys != keys.shift()).any(axis=1)
        return structure_df.loc[starts_residue, ["residue", "res_pos"]]\
            .reset_index(drop=True)

    def get_protein_sequence(self):
        """Return the residue sequence of the cached structure.

        Returns:
            pd.DataFrame: Columns "residue" and "res_pos", one row per
                residue, in the order of the atom table.
        """
        structure_df = self.get_pandas_structure()
        return self._collapse_to_residues(structure_df)

    @staticmethod
    def get_pairwise_euclidean_atom(sturcture_df: pd.DataFrame
                                    ):  #pdb_file: str = None,het_atom=False):
        """Return the matrix of pairwise Euclidean distances between atoms.

        Args:
            sturcture_df (pd.DataFrame): Atom table with "x", "y", "z" columns.
        """
        # TODO upgrade distances to energy calculations based on distance and charge
        # TODO find data for atom charges
        # TODO what to do with missing hydrogen atoms? X-Ray doesn't determine H positions
        # if self.pairwise_dist is None:
        cords = sturcture_df[["x", "y", "z"]]
        return euclidean_distances(cords, cords)
import os

from Bio.PDB import DSSP, PDBParser

# Runs DSSP on every file of the CASP11 domains directory and appends, per
# structure, one line to each of two text files: field 1 of every DSSP row to
# the "seq" file and field 2 to the "ss" file.
# NOTE(review): os.listdir does not expand '*' - confirm this path is a
# placeholder that is substituted before the script is run.
domains_dir = '*/casp11.domains/'
domain_files = os.listdir('*/casp11.domains')
#print(domain_files)

pdb_parser = PDBParser()

# `with` guarantees both output files are closed even if parsing or DSSP
# fails part-way through the directory.
with open("/home/ystroot/Documents/seqcasp11.txt", "a") as seq_out, \
        open("/home/ystroot/Documents/sscasp11.txt", "a") as ss_out:
    for file_name in domain_files:
        print(file_name)
        pdb_path = domains_dir + file_name
        structure = pdb_parser.get_structure("Model", pdb_path)
        model = structure[0]
        dssp = DSSP(model, pdb_path)
        for row in dssp:
            # presumably row[1] is the amino acid and row[2] the secondary
            # structure code - verify against the Bio.PDB.DSSP tuple layout
            seq_out.write(str(row[1]))
            ss_out.write(str(row[2]))
        # One line per structure in each output file.
        seq_out.write('\n')
        ss_out.write('\n')
from Bio.PDB import PDBParser
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import copy
from numpy import random

# Script setup: reads two command-line arguments, parses the structure file
# and picks out the atoms that the rest of the script measures against.
# groupfilename is not used in the portion of the script visible here.
groupfilename = sys.argv[1]
structurename = sys.argv[2]

# create parser
parser = PDBParser()

# read structure from file
structure = parser.get_structure('Closed', structurename)

# store key locations in the DHFR structure
# (CA atoms of residues 20, 112 and 41 of chain A of the first model)
model = structure[0]
chain = model['A']
m20 = chain[20]['CA']
sheet = chain[112]['CA']
globular = chain[41]['CA']

# The ligand is read from chain 'X'. `hydride` and `adenosine` are only bound
# if some residue there has an "N" in the first field of its id; if several
# match, the last one wins.
# NOTE(review): presumably this selects the hetero flag of the cofactor
# residue - confirm against the input PDB.
ligand = model['X']
for residue in ligand:
    if "N" in residue.get_id()[0]:
        hydride = residue['H4']
        adenosine = residue['C18']

# Empty containers; nothing in the visible portion of the script fills them.
hydride_distances = {}
hydride_distance_list = []
adenosine_distances = {}
adenosine_distance_list = []
def get_residue_depth(pdb_fh, msms_fh):
    """
    Extracts Residue depth from PDB structure

    If the MSMS vertex file for the structure does not exist yet, it is
    generated by running ``pdb_to_xyzr`` (expected next to ``msms_fh``) and
    then ``msms_fh`` itself; the commands and their stderr are appended to
    ``<msms_fh>.log``. Depths are then computed for the amino-acid residues of
    chain "A" of the first model; iteration stops at the first amino-acid
    residue that belongs to another chain. Each processed residue also gets
    its ``xtra['EXP_RD']`` and ``xtra['EXP_RD_CA']`` entries set.

    :param pdb_fh: path to PDB structure file
    :param msms_fh: path to the MSMS executable; its directory is also where
        the intermediate .xyzr / .vert files are written
    :returns data_depth: pandas table indexed by residue number ("aasi") with
        the columns "Residue depth" and "Residue (C-alpha) depth". Only the
        first row of a run of rows sharing a residue number is kept.
    """
    from Bio.PDB import Selection, PDBParser
    from Bio.PDB.Polypeptide import is_aa
    from Bio.PDB.ResidueDepth import _read_vertex_array, residue_depth, ca_depth

    depth_columns = ["Residue depth", "Residue (C-alpha) depth"]

    msms_dir = dirname(msms_fh)
    surface_fh = "%s/%s.msms.vert" % (msms_dir, basename(pdb_fh))
    if not exists(surface_fh):
        pdb_to_xyzr_fh = "%s/pdb_to_xyzr" % msms_dir
        xyzr_fh = "%s/%s.xyzr" % (msms_dir, basename(pdb_fh))
        surface_base = splitext(surface_fh)[0]
        pdb_to_xyzr_com = "%s %s > %s" % (pdb_to_xyzr_fh, pdb_fh, xyzr_fh)
        msms_com = "%s -probe_radius 1.5 -if %s -of %s > %s.log" % (
            msms_fh, xyzr_fh, surface_base, surface_base)
        # NOTE(review): the commands are assembled as shell strings, so paths
        # containing spaces or shell metacharacters will break them; only
        # call this with trusted paths.
        with open("%s.log" % msms_fh, 'a') as log_f:
            log_f.write("%s;\n%s\n" % (pdb_to_xyzr_com, msms_com))
            # Flush so the command line precedes the child's output in the log.
            log_f.flush()
            subprocess.call("%s;%s" % (pdb_to_xyzr_com, msms_com),
                            shell=True,
                            stdout=log_f,
                            stderr=subprocess.STDOUT)

    surface = _read_vertex_array(surface_fh)

    pdb_parser = PDBParser()
    pdb_data = pdb_parser.get_structure("pdb_name", pdb_fh)
    model = pdb_data[0]
    residue_list = Selection.unfold_entities(model, 'R')

    depth_dict = {}
    for residue in residue_list:
        if not is_aa(residue):
            continue
        rd = residue_depth(residue, surface)
        ca_rd = ca_depth(residue, surface)
        # Get the key
        res_id = residue.get_id()
        chain_id = residue.get_parent().get_id()
        if chain_id != "A":
            break
        depth_dict[(chain_id, res_id)] = (rd, ca_rd)
        # Update xtra information
        residue.xtra['EXP_RD'] = rd
        residue.xtra['EXP_RD_CA'] = ca_rd

    if not depth_dict:
        # No chain-A amino acids found: return an empty table of the usual
        # shape instead of failing on the column bookkeeping below.
        return pd.DataFrame(columns=depth_columns,
                            index=pd.Index([], name="aasi"))

    # Keys are (chain_id, residue_id) pairs; after transposing and resetting
    # the index they appear as the columns "level_0" and "level_1".
    depth_df = pd.DataFrame(depth_dict).T.reset_index()
    depth_df = depth_df.drop("level_0", axis=1)

    # Label a row with its residue number only when that number differs from
    # the previous row's. None (rather than 0) is the start sentinel so that
    # a first residue numbered 0 is not discarded.
    aasi_prev = None
    for i in range(len(depth_df)):
        resseq = depth_df.loc[i, "level_1"][1]
        if resseq != aasi_prev:
            depth_df.loc[i, "aasi"] = resseq
            aasi_prev = resseq

    depth_df = depth_df.drop("level_1", axis=1)
    depth_df = depth_df.loc[~pd.isnull(depth_df.loc[:, "aasi"]), :]
    depth_df = depth_df.set_index("aasi", drop=True)
    depth_df.columns = depth_columns
    return depth_df