def _contact_atom(residue):
    """Return the atom used for contact distances: CB if present, else CA.

    Raises KeyError when the residue has neither a CB nor a CA atom.
    """
    if residue.has_id("CB"):
        return residue["CB"]
    return residue["CA"]


def pdb_contacts(pdb, chain, dist):
    """Write the sequence and the contact list of one PDB chain.

    The chain is fetched with ``pdb_getchain(pdb, chain)``, all peptides
    found by ``CaPPBuilder`` (with ``aa_only=False``) are concatenated, and
    two module-level file objects are written to:

    * ``file_seq`` receives ``">sequence\\n<seq>\\n"`` followed by the bare
      sequence again.
    * ``file_map`` receives the number of residues on the first line, then
      one ``"i j"`` line (0-based indices) for every residue pair whose
      representative atoms (CB, or CA when CB is missing) are closer than
      ``dist``.  Pairs are visited in both orders and include ``i == j``.

    If the last residue has neither CA nor CB it is dropped, together with
    the last sequence letter.

    Raises:
        KeyError: a residue has neither CB nor CA.  ``"ERROR"`` is written
            to stdout (outer residue) or to ``file_map`` (inner residue)
            before the exception propagates, so unsupervised runs can grep
            for it.
        IndexError: no peptide could be built for the chain.
    """
    # NOTE(review): the original comment says the chain code comes from the
    # 6th letter of the pdb name; that logic lives in pdb_getchain -- confirm.
    pdb_chain = pdb_getchain(pdb, chain)
    ppb = CaPPBuilder()

    # A mutated residue inside a chain is stored as a HETATM, but not every
    # HETATM belongs to the sequence.  CaPPBuilder links residues whose CA-CA
    # distance is < 4.3 A, and common HETATMs are mapped to their standard
    # letter (e.g. MSE -> M).
    # Build the peptides once and reuse the list for the first and the
    # remaining fragments.
    peptides = ppb.build_peptides(pdb_chain, aa_only=False)
    polypepTot = peptides[0]
    sequen = polypepTot.get_sequence()
    for polypep_raw in peptides[1:]:
        sequen += polypep_raw.get_sequence()
        polypepTot += polypep_raw

    # Sometimes the terminal residue in a protein isn't fully resolved.
    # NOTE(review): the sequence is written twice (FASTA-style record, then
    # the bare sequence without a newline); kept as-is -- confirm intent.
    last_res = polypepTot[-1]
    if last_res.has_id("CA") or last_res.has_id("CB"):
        polypep = polypepTot  # resolved: keep every residue
        file_seq.write(">sequence\n%s\n" % sequen)
        file_seq.write("%s" % sequen)
    else:
        polypep = polypepTot[:-1]  # unresolved: drop the last residue
        file_seq.write(">sequence\n%s\n" % sequen[:-1])
        file_seq.write("%s" % sequen[:-1])

    file_map.write(str(len(polypep)) + "\n")

    for i, residue1 in enumerate(polypep):
        try:
            atom1 = _contact_atom(residue1)
        except KeyError:
            sys.stdout.write("ERROR")
            raise
        coord1 = atom1.get_coord()  # invariant for the whole inner loop

        for j, residue2 in enumerate(polypep):
            try:
                atom2 = _contact_atom(residue2)
            except KeyError:
                file_map.write("ERROR")
                raise
            # NOTE(review): `norm` is defined elsewhere and is called with two
            # coordinate arrays; presumably it returns their distance.  If it
            # is numpy.linalg.norm the second argument would be read as
            # `ord` -- verify.
            if norm(coord1, atom2.get_coord()) < dist:
                file_map.write("%d %d\n" % (i, j))
def pdb_polypep(pdb, chain, trim):
    """Return the concatenated polypeptide of one PDB chain.

    The chain is fetched with ``pdb_getchain(pdb, chain)`` and every peptide
    found by ``CaPPBuilder`` (with ``aa_only=False``) is appended to the
    first one.

    Args:
        pdb: passed through to ``pdb_getchain``.
        chain: passed through to ``pdb_getchain``.
        trim: when truthy, the joined polypeptide is passed through
            ``pp_trim`` (described in the original as removing unstructured
            terminal ends) before the last-residue check.

    Returns:
        The polypeptide; if its last residue has neither a CA nor a CB atom,
        everything but that last residue.

    Raises:
        IndexError: no peptide could be built, or nothing is left after
            trimming.
    """
    pdb_chain = pdb_getchain(pdb, chain)
    ppb = CaPPBuilder()

    # A mutated residue inside a chain is stored as a HETATM, but not every
    # HETATM belongs to the sequence.  CaPPBuilder links residues whose CA-CA
    # distance is < 4.3 A, and common HETATMs are mapped to their standard
    # letter (e.g. MSE -> M).
    # Build once; the first peptide is extended in place with the others.
    peptides = ppb.build_peptides(pdb_chain, aa_only=False)
    polypepTot = peptides[0]
    for polypep_raw in peptides[1:]:
        polypepTot += polypep_raw

    # Remove unstructured terminal ends
    if trim:
        polypepTot = pp_trim(polypepTot)

    # Sometimes the terminal residue in a protein isn't fully resolved
    last_res = polypepTot[-1]
    if last_res.has_id("CA") or last_res.has_id("CB"):
        return polypepTot  # resolved: keep every residue
    return polypepTot[:-1]  # unresolved: drop the last residue
def test_ca_ca(self):
    """Build polypeptides with the CA-CA criterion and check their extent."""
    peptides = CaPPBuilder().build_peptides(self.structure[1])

    # Exactly one polypeptide is expected.
    self.assertEqual(len(peptides), 1)
    only_pp = peptides[0]

    # Index 1 of a residue id is compared against the expected first and
    # last residue numbers.
    start_number = only_pp[0].get_id()[1]
    self.assertEqual(start_number, 2)
    end_number = only_pp[-1].get_id()[1]
    self.assertEqual(end_number, 86)
def test_cappbuilder_tau(self):
    """Check the first tau and theta angles of CaPPBuilder peptides."""
    peptides = CaPPBuilder().build_peptides(self.structure)

    # First three tau values of the second peptide.
    expected_taus = (
        0.3597907225123525,
        0.43239284636769254,
        0.99820157492712114,
    )
    taus = peptides[1].get_tau_list()
    for idx, expected in enumerate(expected_taus):
        self.assertAlmostEqual(taus[idx], expected, places=3)

    # First three theta values of the third peptide.
    expected_thetas = (
        1.6610069445335354,
        1.7491703334817772,
        2.0702447422720143,
    )
    thetas = peptides[2].get_theta_list()
    for idx, expected in enumerate(expected_thetas):
        self.assertAlmostEqual(thetas[idx], expected, places=3)
def test_cappbuilder_real(self):
    """Check sequences and CA serial numbers CaPPBuilder finds in a real PDB file."""
    builder = CaPPBuilder()
    peptides = builder.build_peptides(self.structure)

    # Fetch all three sequences before asserting on any of them.
    observed = [peptides[k].get_sequence() for k in range(3)]
    expected = [
        "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW",
        "TETLLVQNANPDCKTILKALGPGATLEE",
        "TACQG",
    ]
    for got, want in zip(observed, expected):
        self.assertEqual(got, want)

    # Serial numbers of the CA atoms of the first peptide.
    ca_serials = [atom.serial_number for atom in peptides[0].get_ca_list()]
    self.assertEqual(
        ca_serials,
        [
            10, 18, 26, 37, 46, 50, 57, 66, 75, 82, 93,
            104, 112, 124, 131, 139, 150, 161, 173, 182, 189, 197,
            208, 213, 222, 231, 236, 242, 251, 260, 267, 276, 284,
        ],
    )
def test_ca_ca(self):
    """Build polypeptides with the CA-CA criterion; check extent and sequence."""
    peptides = CaPPBuilder().build_peptides(self.structure[1])
    self.assertEqual(len(peptides), 1)
    chain_pp = peptides[0]

    # Residue numbers of the first and last residue.
    self.assertEqual(chain_pp[0].get_id()[1], 2)
    self.assertEqual(chain_pp[-1].get_id()[1], 86)

    # The sequence must be a protein Seq with the expected letters.
    # NOTE(review): the `alphabet` attribute / `generic_protein` only exist
    # in older Biopython releases -- confirm the version this test targets.
    seq = chain_pp.get_sequence()
    self.assertTrue(isinstance(seq, Seq))
    self.assertEqual(seq.alphabet, generic_protein)
    expected_letters = (
        "RCGSQGGGSTCPGLRCCSIWGWCGDSEPYCGRTCENKCWSGER"
        "SDHRCGAAVGNPPCGQDRCCSVHGWCGGGNDYCSGGNCQYRC"
    )
    self.assertEqual(expected_letters, str(seq))
def test_cappbuilder_real_nonstd(self):
    """Check CaPPBuilder on a real PDB file with non-standard amino acids allowed."""
    builder = CaPPBuilder()
    # aa_only=False lets non-standard residues take part in the peptide.
    peptides = builder.build_peptides(self.structure, aa_only=False)
    self.assertEqual(len(peptides), 1)
    whole_pp = peptides[0]

    # Residue numbers of the first and last residue.
    self.assertEqual(whole_pp[0].get_id()[1], 151)
    self.assertEqual(whole_pp[-1].get_id()[1], 220)

    # Non-standard MSE residues show up as M in the sequence.
    seq = whole_pp.get_sequence()
    self.assertIsInstance(seq, Seq)
    expected_letters = (
        "MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG"
    )
    self.assertEqual(expected_letters, seq)
def pdb_sequence(pdb_file, id=None, method="order"):
    """Extract one protein sequence record per chain from a PDB file.

    Args:
        pdb_file: PDB input handed to ``PDBParser.get_structure``.
        id: structure identifier; when ``None`` it is derived with
            ``util.make_id_from_file_name(pdb_file)``.
        method: ``"distance"`` joins the peptides found by ``CaPPBuilder``
            (no per-residue spec yet); ``"order"`` walks the residues of the
            chain in order and records a ``pdb_seq_spec`` for each.

    Returns:
        ``pdb_seqs(id=..., chains=..., chains_map=...)`` where ``chains`` is
        a list of dicts with keys ``id_chain``, ``seq_rec`` and ``seq_spec``,
        and ``chains_map`` indexes the same dicts by chain id.

    Raises:
        ValueError: ``method`` is not recognised (raised while processing
            the first chain).
    """
    from Bio.PDB import PDBParser, CaPPBuilder
    from Bio.PDB.Polypeptide import three_to_one

    if id is None:
        id = util.make_id_from_file_name(pdb_file)
    structure = PDBParser().get_structure(id, pdb_file)

    chain_entries = []
    for chain in structure.get_chains():
        chain_id = chain.get_id()

        if method == "distance":
            peptides = CaPPBuilder().build_peptides(chain)
            seq = Seq("", IUPAC.protein)
            for pp in peptides:
                seq = seq + pp.get_sequence()
            seq_spec = None  # TODO: implement
        elif method == "order":
            letters = []
            seq_spec = []
            for res in chain.get_residues():
                resname = res.get_resname()
                # NOTE(review): three_to_one presumably fails for names that
                # are not amino acids (e.g. waters) -- confirm inputs are
                # pre-filtered.
                letters.append(three_to_one(resname))
                # From the Bio docs, res.get_full_id() returns:
                # ("1abc", 0, "A", (" ", 10, "A"))
                full_id = res.get_full_id()
                residue_id = full_id[-1]
                seq_spec.append(
                    pdb_seq_spec(
                        chain=full_id[-2].strip(),
                        resn=resname,
                        resi=residue_id[-2],
                        ins=residue_id[-1].strip(),
                    )
                )
            seq = Seq("".join(letters), IUPAC.protein)
        else:
            raise ValueError("Unknown method: {}".format(method))

        record = SeqRecord(seq, id="{}_{}".format(id, chain_id), description="")
        chain_entries.append(
            dict(id_chain=chain_id, seq_rec=record, seq_spec=seq_spec)
        )

    chains_map = {entry["id_chain"]: entry for entry in chain_entries}
    return pdb_seqs(id=id, chains=chain_entries, chains_map=chains_map)
#from TCRmodeller_functions import *
from subprocess import Popen, PIPE

# Command line: <script> <pdb file> <tag> <chain id>
script, filename, tag, chainid = argv
tmpdir = os.getcwd()
hmmscan_program = '/TCRmodeller/programs/hmmer/hmmscan'
profit_program = '/Users/ragul/profit/ProFitV3.1/src/profit'

# Write the sequence of the requested chain (model 0) to temp.fa: a header
# line with the input file name, then one line per peptide found.
# The context manager closes the file even if parsing or peptide building
# raises; the file is still opened (and truncated) before parsing starts.
with open('temp.fa', 'w+') as f2:
    pdbfile = parser.get_structure("PDB", filename)
    mychain = pdbfile[0][chainid]
    f2.write(">" + filename + "\n")
    for ppe in ppb.build_peptides(mychain):
        f2.write(str(ppe.get_sequence()) + "\n")


def find_CDRs(tcr_seq, hmmscan_program, tmpdir, tag):
    CDR1_start_pos = 0
    CDR1_end_pos = 0
    CDR2_start_pos = 0
    CDR2_end_pos = 0
    CDR3_start_pos = 0
    CDR3_end_pos = 0
    HV4_start_pos = 0
class nonHetSelect(Select): def accept_residue(self,residue): if residue.id[0] == ' ': return 1 else: return 0 gzpdbfile_path = "/TCRmodeller/PDB_RELEASE/pdb_structures" + '/%s/pdb%s.ent.gz' %(pdbcode[1:3], pdbcode) gzpdbfile = gzip.open(gzpdbfile_path, 'rb') pdbfile = parser.get_structure("PDB", gzpdbfile) mychaina = pdbfile[0][chainida] io.set_structure(mychaina) io.save('tmpa.pdb', nonHetSelect()) faseqa = "" for ppe in ppb.build_peptides(mychaina): faseqa += str(ppe.get_sequence()) print "faseqa : ", faseqa mychainb = pdbfile[0][chainidb] io.set_structure(mychainb) io.save('tmpb.pdb', nonHetSelect()) faseqb = "" for ppe in ppb.build_peptides(mychainb): faseqb += str(ppe.get_sequence()) print "faseqb : ", faseqb regexa = "[A-Z]{0,23}C[A-Z]([A-Z]{8,12}W)[YF][A-Z]{13}([A-Z]{6,11})[A-Z]{15,30}[DL][A-Z]{2,3}Y[A-Z][CW][A-Z]([A-Z]{7,16}[FW])G[A-Z]G[A-Z]{0,7}[PA]*" regexb = "[A-Z]{0,23}C[A-Z]([A-Z]{8,12}W)[Y][A-Z]{13}([A-Z]{6,11})[A-Z]{15,40}[YLF][A-Z][CW][A-Z]([A-Z]{7,17}[F])G[A-Z]G[A-Z]{0,7}[E]*"