def getAncestry(self, parent, immediate=None):
    """Find out if a record is parent of another record using sequence
    information and get operations.

    Both amino-acid sequences are converted to PIR strings with
    ``sequence_alignment.Protool2pir``. Equal-length sequences are compared
    directly; if that yields more than 10 operations, or the lengths differ,
    the sequences are aligned with ``sequence_alignment.NW`` first and
    compared again.

    Args:
        parent: candidate parent record. Must expose ``aaseq`` and
            ``has_key``; the attributes ``Structure_alnseq_PDBfile`` and
            ``Structure_alnseq_EATrecord`` are used when both keys exist.
        immediate: when true, an operation list is never reported as a
            positive match (see the review note near the end).

    Returns:
        ``(True, operations)`` when ``immediate`` is false and both records
        have a sequence; ``(False, operations)`` when ``immediate`` is true
        and at least one operation was found; ``(False, False)`` otherwise.

    Raises:
        Exception: if the comparison returns the ``[None]`` placeholder.
    """
    full_record_sequence = self.aaseq
    full_parent_sequence = parent.aaseq
    # Nothing can be compared unless both records carry a sequence, so bail
    # out before importing or converting anything.
    if not (full_record_sequence and full_parent_sequence):
        return False, False

    import sequence_alignment
    from Sequence import SequenceOperations as SQ

    record_sequence, ignored_res = sequence_alignment.Protool2pir(
        full_record_sequence)
    parent_sequence, ignored_parent = sequence_alignment.Protool2pir(
        full_parent_sequence)

    if (parent.has_key('Structure_alnseq_PDBfile')
            and parent.has_key('Structure_alnseq_EATrecord')):
        parent_aln_PDBfile = parent.Structure_alnseq_PDBfile + '*'
        parent_aln_record = parent.Structure_alnseq_EATrecord + '*'
    else:
        # The original chained assignment bound the whole
        # (sequence, ignored) tuple here; the other branch supplies a
        # string, so pass the PIR string only.
        # NOTE(review): this fallback is built from the record's sequence,
        # as in the original - confirm the parent's was not intended.
        parent_aln_PDBfile = record_sequence
        parent_aln_record = parent_aln_PDBfile

    same_length = len(record_sequence) == len(parent_sequence)

    # First try the simple option
    operations = [None]
    if same_length:
        operations = SQ.findSequenceDifferences(
            record_sequence,
            parent_sequence,
            full_parent_sequence,
            PDBaln=parent_aln_PDBfile,
            recordALN=parent_aln_record)

    # Too many differences, or sequences of unequal length: align first.
    if not same_length or len(operations) > 10:
        NW_align = sequence_alignment.NW(record_sequence, parent_sequence)
        al_seq, al_parent, map_seq, map_parent = NW_align.Align(verbose=True)
        operations = SQ.findSequenceDifferences(
            al_seq,
            al_parent,
            full_parent_sequence,
            PDBaln=parent_aln_PDBfile,
            recordALN=parent_aln_record)

    if operations == [None]:
        raise Exception(
            'Sequence comparison did not produce a list of operations')

    if not immediate:
        return True, operations
    if len(operations) > 0:
        # NOTE(review): the original had separate branches for exactly one
        # operation and for several, both returning False. Confirm whether a
        # single operation should count as an immediate parent.
        return False, operations
    return False, False
def align_withseq(self, sequence):
    """Extract the sequence of the current molecule and align it with the
    sequence given.

    Both sequences are passed through ``self.clean_seq(..., remove_gaps=True)``
    and aligned with ``sequence_alignment.NW``.

    Args:
        sequence: the sequence to align against ``self.PirSeq()``.

    Returns:
        ``(mapping, seqid)``. ``mapping`` has one entry per position of the
        cleaned ``sequence``: the key of ``self.residues`` (taken in sorted
        order) that the position aligns to, or ``None`` where the alignment
        map holds ``'-'``. ``seqid`` is the aligner's ``aligned_seqid``.
    """
    import sequence_alignment

    pdbpir = self.clean_seq(self.PirSeq(), remove_gaps=True)
    sequence = self.clean_seq(sequence, remove_gaps=True)

    aligner = sequence_alignment.NW(pdbpir, sequence)
    al1, al2, map1, map2 = aligner.Align(verbose=True)

    # map2 entries are used as indices into the sorted residue IDs.
    residues = sorted(self.residues.keys())
    mapping = [
        residues[map2[position]] if map2[position] != '-' else None
        for position in range(len(sequence))
    ]

    # Report the sequence identity of the aligned residues; callers are
    # expected to check it themselves.
    print('Aligned seqid %s' % (aligner.aligned_seqid,))
    return mapping, aligner.aligned_seqid
def main():
    """Superpose hard-coded pairs of PDB structures and print their RMSDs.

    For every entry of ``l_input`` the function
      1. copies the two PDB entries into the working directory,
      2. reads their headers with ``parse_header`` and combines the results
         (intersection of the two ``ss_range`` values, union of the two
         ``l_missing`` values),
      3. prints alignment diagnostics when the SEQRES records differ in
         length and neither is contained in the other,
      4. superposes the coordinates selected by the entry's ``range`` with
         ``geometry.geometry().superpose`` and prints the RMSD,
      5. calls ``apply_transformation_matrix`` for both structures and prints
         the RMSD over residues 1-9998 of the ``*_rotated`` structures.

    ``parse_header``, ``parse_coordinates`` and ``apply_transformation_matrix``
    are expected to be defined at module level.

    Raises:
        ValueError: if the two coordinate lists differ in length, or if no
            coordinates are available for the final RMSD.
    """
    import math
    import subprocess
    import sys
    sys.path.append('/home/people/tc/svn/Protool/')
    import geometry
    instance_geometry = geometry.geometry()

    ## Single-pair settings from earlier runs, kept for reference.
    ##
    ## lactoferrin
    ## pdb1 = '1lfg'
    ## pdb2 = '1lfh'
    ## domain_range = range(1,88+1)+range(253,333+1)
    ## chain = 'A'
    ##
    ## trp repressor
    ## pdb1 = '1wrp'
    ## pdb2 = '2oz9' ## 2wrp
    ## pdb2 = '1zt9'
    ## domain_range = range(1,999+1)
    ## chain1 = 'R'
    ## chain2 = 'R'
    ## chain2 = 'A'
    ## exclude_chain = ''
    ##
    ## luciferase
    ## pdb1 = '1ba3'
    ## pdb2 = '1lci'
    ## domain_range = range(1,999+1)
    ## chain1 = 'A'
    ## chain2 = 'A'
    ## exclude_chain = ''
    ##
    ## G3P DH
    ## pdb1 = '1gd1'
    ## pdb2 = '2gd1'
    ## domain_range = range(1,999+1)
    ## chain1 = 'O'
    ## chain2 = 'O'
    ## exclude_chain = ''
    ##
    ## hexokinase
    ## pdb1 = '1hkg'
    ## pdb2 = '2yhx'
    ## domain_range = range(1,999+1)
    ## chain1 = 'A'
    ## chain2 = 'A'
    ## exclude_chain = ''
    ##
    ## adk
    ## pdb1 = '1ake'
    ## pdb2 = '4ake'
    ## domain_range = range(1,999+1)
    ## chain1 = 'A'
    ## chain2 = 'A'
    ## exclude_chain = 'B'
    ##
    ## t4l
    ## pdb1 = '2lzm'
    ## pdb2 = '150l'
    #### domain_range = range(15,59+1)
    #### domain_range = range(60,80+1)
    ## domain_range = range(81,162+1)
    ## chain1 = 'A'
    ## chain2 = 'D'
    ## exclude_chain = 'B'

    # Each entry names two structures, the chain to use in each and the
    # residue numbers of the domain to superpose on. Disabled entries are
    # kept as comments so they can be switched back on.
    l_input = [
        ##
        ## {'pdb1':'1ipd','pdb2':'1osj','chain1':'A','chain2':'A','range':range(1,98+1)+range(253,345+1)},
        ## {'pdb1':'1ipd','pdb2':'1osj','chain1':'A','chain2':'A','range':range(99,108+1)+range(109,252+1)},
        ###### shears
        ##
        ## aspartate amino transferase
        ## {'pdb1':'9aat','pdb2':'1ama','chain1':'A','chain2':'A','range':range(15,36+1)+range(349,410+1)},
        ## {'pdb1':'9aat','pdb2':'1ama','chain1':'A','chain2':'A','range':range(50,312+1)},
        ## alcohol dehydrogenase
        {
            'pdb1': '6adh',
            'pdb2': '8adh1',
            'chain1': 'A',
            'chain2': 'A',
            'range': list(range(1, 174 + 1)) + list(range(322, 374 + 1)),
        },
        {
            'pdb1': '6adh',
            'pdb2': '8adh2',
            'chain1': 'A',
            'chain2': 'A',
            'range': list(range(193, 317 + 1)),
        },
        ##
        ## citrate synthase
        ## {'pdb1':'1cts','pdb2':'4cts','chain1':'A','chain2':'A','range':range(1,276+1)+range(386,999+1)},
        ###### hinges
        ##
        ## atpsulf
        ## {'pdb1':'1i2d','pdb2':'1m8p','chain1':'A','chain2':'A','range':range(1,389+1)},
        ##
        ## dnak (different spacegroups)
        ## {'pdb1':'1dkx','pdb2':'1dky','chain1':'A','chain2':'A','range':range(389,509+1)},
        ##
        ## dnak (different spacegroups)
        ## {'pdb1':'1ddt','pdb2':'1mdt','chain1':'A','chain2':'A','range':range(1,376+1)},
        ##
        ## ecpdpbp
        ## {'pdb1':'1dpp','pdb2':'1dpe','chain1':'A','chain2':'A','range':range(1,260+1)+range(479,999+1)},
        ##
        ## ef2
        ## {'pdb1':'1n0v','pdb2':'1n0u','chain1':'C','chain2':'A','range':range(1,478+1)}, ## large
        ## {'pdb1':'1n0v','pdb2':'1n0u','chain1':'C','chain2':'A','range':range(479,560+1)}, ## independent
        ## {'pdb1':'1n0v','pdb2':'1n0u','chain1':'C','chain2':'A','range':range(561,9999+1)}, ## small
        ##
        ## febp
        ## {'pdb1':'1d9v','pdb2':'1mrp','chain1':'A','chain2':'A','range':range(109,227+1)+range(292,309+1)},
        ## {'pdb1':'1d9v','pdb2':'1mrp','chain1':'A','chain2':'A','range':range(1,96+1)+range(228,262+1)},
        ##
        ## folylpolyglutamate synthetase
        ## {'pdb1':'1jbv','pdb2':'1jbw','chain1':'A','chain2':'A','range':range(1,295+1)},
        ## {'pdb1':'1jbv','pdb2':'1jbw','chain1':'A','chain2':'A','range':range(296,386+1)},
        ##
        ## glucose ABC transporter ATPase subunit (different spacegroups)
        ## {'pdb1':'1oxs','pdb2':'1oxu','chain1':'C','chain2':'C','range':range(1,209+1)},
        ## {'pdb1':'1oxs','pdb2':'1oxu','chain1':'C','chain2':'C','range':range(244,999+1)},
        ##
        ## groel domain
        ## {'pdb1':'1aon','pdb2':'1oel','chain1':'A','chain2':'A','range':range(1,137+1)+range(410,999+1)},
        ## {'pdb1':'1aon','pdb2':'1oel','chain1':'A','chain2':'A','range':range(192,374+1)},
        ## {'pdb1':'1aon','pdb2':'1oel','chain1':'A','chain2':'A','range':range(138,190+1)+range(375,409+1)},
        ##
        ## lao bp
        ## {'pdb1':'2lao','pdb2':'1laf','chain1':'A','chain2':'E','range':range(1,90+1)+range(192,238+1)},
        ## {'pdb1':'2lao','pdb2':'1laf','chain1':'A','chain2':'E','range':range(91,191+1)},
        ##
        ## t4l
        ## {'pdb1':'1l96','pdb2':'1l97','chain1':'A','chain2':'A','range':range(13,59+1)},
        ## {'pdb1':'1l96','pdb2':'1l97','chain1':'A','chain2':'A','range':range(81,164+1)},
        ##
        ## maltodextrin bp
        ## {'pdb1':'1omp','pdb2':'3mbp','chain1':'A','chain2':'A','range':range(1,104+1)+range(268,313+1)},
        ## {'pdb1':'1omp','pdb2':'3mbp','chain1':'A','chain2':'A','range':range(113,258+1)+range(314,370+1)},
        ##
        ## mRNA capping enzyme
        ## {'pdb1':'1ckm','pdb2':'1ckm','chain1':'A','chain2':'B','range':range(1,237+1)+range(319,327+1)},
        ## {'pdb1':'1ckm','pdb2':'1ckm','chain1':'A','chain2':'B','range':range(241,303+1)},
        ##
        ## mura
        ## {'pdb1':'1ejd','pdb2':'1a2n','chain1':'A','chain2':'A','range':range(1,20+1)+range(230,417+1)},
        ## {'pdb1':'1ejd','pdb2':'1a2n','chain1':'A','chain2':'A','range':range(20,230+1)},
        ##
        ## oligopeptide bp
        ## {'pdb1':'1rkm','pdb2':'2rkm','chain1':'A','chain2':'A','range':range(1,263+1)+range(491,517+1)},
        ## {'pdb1':'1rkm','pdb2':'2rkm','chain1':'A','chain2':'A','range':range(277,477+1)},
        ##
        ## protein kinase A
        ## {'pdb1':'1jlu','pdb2':'1cmk','chain1':'E','chain2':'E','range':range(1,33+1)+range(125,310+1),
        ## {'pdb1':'1jlu','pdb2':'1cmk','chain1':'E','chain2':'E','range':range(34,124+1)},
        ##
        ## dna polymerase beta
        ## {'pdb1':'1bpd','pdb2':'2bpg','chain1':'A','chain2':'A','range':range(1,82+1)},
        ## {'pdb1':'1bpd','pdb2':'2bpg','chain1':'A','chain2':'A','range':range(106,132+1)},
        ## {'pdb1':'1bpd','pdb2':'2bpg','chain1':'A','chain2':'A','range':range(148,262+1)},
        ## {'pdb1':'1bpd','pdb2':'2bpg','chain1':'A','chain2':'A','range':range(262,335+1)},
        ##
        ## ribose bp
        ## {'pdb1':'1urp','pdb2':'2dri','chain1':'A','chain2':'A','range':range(1,98+1)+range(235,259+1)},
        ## {'pdb1':'1urp','pdb2':'2dri','chain1':'A','chain2':'A','range':range(104,234+1)+range(265,271+1)},
        ##
        ## thioredoxin reductase
        ## {'pdb1':'1tde','pdb2':'1f6m','chain1':'A','chain2':'E','range':range(1,112+1)+range(248,320+1)},
        ## {'pdb1':'1tde','pdb2':'1f6m','chain1':'A','chain2':'E','range':range(118,242+1)},
        ##
        ## dna bp
        ## {'pdb1':'1fgu','pdb2':'1jmc','chain1':'A','chain2':'A','range':range(183,283+1)},
        ##
        ## transferrin
        ## {'pdb1':'1bp5','pdb2':'1a8e','chain1':'A','chain2':'A','range':range(1,75+1)+range(249,316+1)},
        ## {'pdb1':'1bp5','pdb2':'1a8e','chain1':'A','chain2':'A','range':range(103,242+1)},
        ##
        ## uracil dna glycosylase
        ## {'pdb1':'1ssp','pdb2':'1akz','chain1':'E','chain2':'A','range':range(82,144+1)+range(191,240+1)},
        ## {'pdb1':'1ssp','pdb2':'1akz','chain1':'E','chain2':'A','range':range(166,182+1)+range(270,304+1)},
    ]

    # Three-letter residue names mapped to one-letter codes.
    d_res = {
        'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
        'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
        'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
        'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
    }
    # Modified residues and the standard residue they are treated as.
    d_replace = {
        'TPO': 'THR',
        'PTR': 'TYR',
        ## 'SER':'CYS', ## 1tde v 1f6m
    }
    identity_matrix = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]

    for entry in l_input:
        pdb1 = entry['pdb1']
        pdb2 = entry['pdb2']
        chain1 = entry['chain1']
        chain2 = entry['chain2']
        domain_range = entry['range']
        # Only the first four characters of pdb2 are used to locate and
        # parse the file; the full string is used for the rotated output.
        pdb2_id = pdb2[:4]

        # Copy without a shell. A failed copy is not fatal here, matching
        # the previous os.system call.
        for pdb_id in (pdb1, pdb2_id):
            subprocess.call([
                'cp',
                '/oxygenase_local/data/pdb/%s/pdb%s.ent' % (pdb_id[1:3],
                                                            pdb_id),
                '%s.pdb' % (pdb_id,),
            ])

        # NOTE(review): l_modres from the first header is overwritten by the
        # second and then used for both structures - confirm this is meant.
        ss_range1, l_missing1, seqres1, l_modres = parse_header(pdb1, chain1)
        ss_range2, l_missing2, seqres2, l_modres = parse_header(
            pdb2_id, chain2)

        ss_range = list(set(ss_range1) & set(ss_range2))
        l_missing = list(set(l_missing1) | set(l_missing2))

        if len(seqres1) != len(seqres2):
            # Replace in place, as before, so the lists returned by
            # parse_header are the ones modified.
            seqres1[:] = [d_replace.get(res, res) for res in seqres1]
            seqres2[:] = [d_replace.get(res, res) for res in seqres2]

            joined1 = ''.join(seqres1)
            joined2 = ''.join(seqres2)
            if not (joined1 in joined2 or joined2 in joined1):
                # Neither sequence contains the other: align them and print
                # the terminal gap counts as diagnostics.
                sys.path.append('/home/people/tc/svn/EAT_DB/')
                import sequence_alignment

                seq1 = ''.join([d_res[res] for res in seqres1])
                seq2 = ''.join([d_res[res] for res in seqres2])

                instance = sequence_alignment.NW(seq1, seq2)
                s1, s2 = instance.Align(verbose=False)[:2]

                l1 = len(s1) - len(s1.lstrip('-'))
                l2 = len(s2) - len(s2.lstrip('-'))
                r1 = len(s1) - len(s1.rstrip('-'))
                r2 = len(s2) - len(s2.rstrip('-'))

                print(seqres1)
                print(seqres2)
                print(len(seqres1))
                print(len(seqres2))
                print('%s %s' % (pdb1, pdb2))
                print('%s %s %s %s' % (l1, l2, r1, r2))
                print(seqres2[l1:len(seqres2) - r1])
                print(seqres1[l2:len(seqres1) - r2])

        l_coordinates1 = parse_coordinates(
            pdb1, chain1, domain_range, ss_range, l_missing)
        l_coordinates2 = parse_coordinates(
            pdb2_id, chain2, domain_range, ss_range, l_missing)

        ## l_coordinates1 = l_coordinates1[l2:len(l_coordinates1)-r2]
        ## l_coordinates2 = l_coordinates2[l1:len(l_coordinates2)-r1]

        if len(l_coordinates1) != len(l_coordinates2):
            print(len(l_coordinates1))
            print(len(l_coordinates2))
            raise ValueError(
                'coordinate counts differ for %s and %s' % (pdb1, pdb2))

        rmsd = instance_geometry.superpose(l_coordinates1, l_coordinates2)
        print('%s %s %s %s' % (pdb1, pdb2, round(rmsd, 1),
                               len(l_coordinates1) / 3.))

        tv1 = instance_geometry.fitcenter
        rm = instance_geometry.rotation
        tv2 = instance_geometry.refcenter

        # The first structure gets an identity transformation; the second
        # gets the transformation found by the superposition.
        apply_transformation_matrix(
            pdb1, chain1, l_modres, [0, 0, 0], identity_matrix, [0, 0, 0])
        apply_transformation_matrix(pdb2, chain2, l_modres, tv1, rm, tv2)

        l_coordinates1 = parse_coordinates(
            pdb1 + '_rotated', chain1, range(1, 9999), ss_range, l_missing)
        l_coordinates2 = parse_coordinates(
            pdb2 + '_rotated', chain2, range(1, 9999), ss_range, l_missing)

        n = len(l_coordinates1)
        if n == 0:
            raise ValueError(
                'no coordinates parsed from the rotated structures of %s '
                'and %s' % (pdb1, pdb2))
        SUM = 0.
        for index in range(n):
            SUM += sum((l_coordinates1[index] - l_coordinates2[index])**2)
        RMSD = math.sqrt(SUM / n)
        print(RMSD)

    return
def main(
        pdb1,
        pdb2,
        chains1_align,
        chains2_align,
):
    """Superpose two PDB structures and return the RMSD of the rotated files.

    The two entries are copied into the working directory, their headers are
    read with ``parse_header``, and the coordinates returned by
    ``parse_coordinates`` are superposed with ``geometry.geometry()``. Both
    structures are then passed through ``apply_transformation_matrix`` and
    the returned lines are written to ``rotated_<pdb1><pdb2>.pdb``.

    ``parse_header``, ``parse_coordinates`` and ``apply_transformation_matrix``
    are expected to be defined at module level.

    Args:
        pdb1: identifier of the first structure; used to build file names.
        pdb2: identifier of the second structure; this is the one that
            receives the fitted transformation.
        chains1_align: chain selection for ``pdb1``, passed to the helpers.
        chains2_align: chain selection for ``pdb2``, passed to the helpers.

    Returns:
        ``(RMSD, l_coordinates1, l_coordinates2)`` computed from the
        coordinates parsed from ``'rotated_' + pdb1`` and
        ``'rotated_' + pdb2``.

    Raises:
        ValueError: if the SEQRES lengths or coordinate counts do not match,
            or if no coordinates were parsed.
    """
    chains1_apply = chains1_align
    chains2_apply = chains2_align

    import math
    import subprocess
    import sys
    sys.path.append('/home/people/tc/svn/Protool/')
    import geometry
    instance_geometry = geometry.geometry()

    domain_range = range(0, 9999)

    # Copy without a shell, so the identifiers are never interpreted as
    # shell syntax. A failed copy is not fatal here, matching the previous
    # os.system call.
    for pdb_id in (pdb1, pdb2):
        subprocess.call([
            'cp',
            '/data/pdb-v3.2/%s/pdb%s.ent' % (pdb_id[1:3], pdb_id),
            '%s.pdb' % (pdb_id,),
        ])

    # NOTE(review): l_modres from the first header is overwritten by the
    # second and then used for both structures - confirm this is meant.
    ss_range1, l_missing1, seqres1, l_modres = parse_header(
        pdb1, chains1_align)
    ss_range2, l_missing2, seqres2, l_modres = parse_header(
        pdb2, chains2_align)

    ss_range = list(set(ss_range1) & set(ss_range2))
    l_missing = list(set(l_missing1) | set(l_missing2))

    if len(seqres1) != len(seqres2):
        # Modified residues and the standard residue they are treated as.
        d_replace = {
            'TPO': 'THR',
            'PTR': 'TYR',
            ## 'SER':'CYS', ## 1tde v 1f6m
        }
        # Replace in place, as before, so the lists returned by
        # parse_header are the ones modified.
        seqres1[:] = [d_replace.get(res, res) for res in seqres1]
        seqres2[:] = [d_replace.get(res, res) for res in seqres2]

        joined1 = ''.join(seqres1)
        joined2 = ''.join(seqres2)
        if not (joined1 in joined2 or joined2 in joined1):
            # Neither sequence contains the other: align them and print the
            # terminal gap counts as diagnostics.
            sys.path.append('/home/people/tc/svn/EAT_DB/')
            import sequence_alignment

            d_res = {
                'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
                'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
                'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
                'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
            }
            seq1 = ''.join([d_res[res] for res in seqres1])
            seq2 = ''.join([d_res[res] for res in seqres2])

            instance = sequence_alignment.NW(seq1, seq2)
            s1, s2 = instance.Align(verbose=False)[:2]

            l1 = len(s1) - len(s1.lstrip('-'))
            l2 = len(s2) - len(s2.lstrip('-'))
            r1 = len(s1) - len(s1.rstrip('-'))
            r2 = len(s2) - len(s2.rstrip('-'))

            print(seqres1)
            print(seqres2)
            print(len(seqres1))
            print(len(seqres2))
            print('%s %s' % (pdb1, pdb2))
            print('%s %s %s %s' % (l1, l2, r1, r2))
            print(seqres2[l1:len(seqres2) - r1])
            print(seqres1[l2:len(seqres1) - r2])
        else:
            # One sequence contains the other: drop the leading residues of
            # the longer one. Residue names are three characters long, so
            # the character offset is floor-divided by 3 to get a list index.
            if joined1 in joined2:
                seqres2 = seqres2[joined2.index(joined1) // 3:]
            else:
                seqres1 = seqres1[joined1.index(joined2) // 3:]

    # NOTE(review): the source formatting did not show whether this check
    # belongs only to the containment branch above; it is applied to every
    # case here - confirm.
    if len(seqres1) != len(seqres2):
        print('%s %s' % (len(seqres1), len(seqres2)))
        raise ValueError(
            'SEQRES lengths differ for %s and %s' % (pdb1, pdb2))

    l_coordinates1 = parse_coordinates(
        pdb1, chains1_align, domain_range, ss_range, l_missing)
    l_coordinates2 = parse_coordinates(
        pdb2, chains2_align, domain_range, ss_range, l_missing)

    ## l_coordinates1 = l_coordinates1[l2:len(l_coordinates1)-r2]
    ## l_coordinates2 = l_coordinates2[l1:len(l_coordinates2)-r1]

    if len(l_coordinates1) == 0 or len(l_coordinates2) == 0:
        raise ValueError(
            'no coordinates parsed for %s and/or %s' % (pdb1, pdb2))
    if len(l_coordinates1) != len(l_coordinates2):
        print(len(l_coordinates1))
        print(len(l_coordinates2))
        raise ValueError(
            'coordinate counts differ for %s and %s' % (pdb1, pdb2))

    rmsd = instance_geometry.superpose(l_coordinates1, l_coordinates2)
    print('%s %s' % (pdb1, pdb2))
    print('rmsd %s' % (round(rmsd, 1),))
    print('residues %s %s' % (len(seqres1), len(seqres2)))
    print('coordinates %s' % (len(l_coordinates1),))

    tv1 = instance_geometry.fitcenter
    rm = instance_geometry.rotation
    tv2 = instance_geometry.refcenter

    # The first structure gets an identity transformation; the second gets
    # the transformation found by the superposition.
    lines1 = apply_transformation_matrix(
        pdb1,
        chains1_apply,
        l_modres,
        [0, 0, 0],
        [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
        [0, 0, 0],
    )
    lines2 = apply_transformation_matrix(
        pdb2, chains2_apply, l_modres, tv1, rm, tv2)

    with open('rotated_%s%s.pdb' % (pdb1, pdb2), 'w') as fd:
        fd.writelines(lines1 + lines2)

    # NOTE(review): the file written above is named rotated_<pdb1><pdb2>.pdb,
    # while the names parsed below are 'rotated_' + pdb1 and
    # 'rotated_' + pdb2 - confirm where those inputs come from.
    l_coordinates1 = parse_coordinates(
        'rotated_' + pdb1, chains1_apply, range(-9999, 9999), ss_range,
        l_missing)
    l_coordinates2 = parse_coordinates(
        'rotated_' + pdb2, chains2_apply, range(-9999, 9999), ss_range,
        l_missing)

    n = len(l_coordinates1)
    if n == 0:
        raise ValueError(
            'no coordinates parsed from the rotated structures of %s and %s'
            % (pdb1, pdb2))
    SUM = 0.
    for index in range(n):
        SUM += sum((l_coordinates1[index] - l_coordinates2[index])**2)
    RMSD = math.sqrt(SUM / n)
    print('RMSD all atoms %s' % (RMSD,))

    return RMSD, l_coordinates1, l_coordinates2
def checkPDBSequence(self, name):
    """Check the PDB sequence against a newly added structure, optional.
    Adds the amino acid seq of the PDB file, overwriting the old one.

    The sequence in ``self.X.sequence`` is converted to a PIR string, stored
    as the record's ``aaseq`` in ``DB.data[name]``, aligned, and shown in a
    window with "Alignment OK" / "Alignment not OK" buttons.

    ``DB``, ``storePDB`` and the Tk widget names (``Toplevel``, ``Button``,
    ``Scrollbar``, ``Listbox``, ``HORIZONTAL``) are expected to be available
    at module level.

    Args:
        name: key of the record in ``DB.data``.

    Returns:
        dict with the aligned record sequence under ``'OrigAa'`` and the
        aligned PDB sequence under ``'AlignedAa'``.
    """
    import copy
    import sequence_alignment

    # Extract the sequence
    pdb_1, ignored_res1 = sequence_alignment.Protool2pir(self.X.sequence)
    print('IGNORED %s' % (ignored_res1,))
    if ignored_res1 != {}:
        igroups = sorted(ignored_res1.keys())
        import tkMessageBox
        tkMessageBox.showwarning(
            'Unknown entities in PDB file',
            'I ignored the following residue types/molecules in the PDB file:\n%s'
            % (str(igroups)))

    # Get the entry sequence
    record_AA = DB.get_AA_sequence(name)
    if record_AA:
        record_AA1, ignored_res = sequence_alignment.Protool2pir(record_AA)

    # NOTE(review): there is no ``else`` here, so the PDB sequence replaces
    # the record sequence even when the record already has one, and the
    # alignment below compares the PDB sequence with a copy of itself. This
    # matches the docstring ("overwriting the old one") and is kept; the
    # original inline comment described this step as applying only when the
    # record has no sequence - confirm which is intended.
    record_AA1 = copy.deepcopy(pdb_1)
    # Also deposit the amino acid sequence in the protein record
    DB.data[name]['aaseq'] = copy.deepcopy(self.X.sequence)

    # Align the two sequences
    NW_align = sequence_alignment.NW(pdb_1, record_AA1)
    al_pdb, al_record, map_pdb, map_record = NW_align.Align()
    self.al_pdb = al_pdb
    self.al_record = al_record

    # Count identical positions; an empty alignment is reported as 0 %
    # instead of dividing by zero.
    aligned_length = len(al_pdb)
    ids = 0
    for position in range(aligned_length):
        if al_pdb[position] == al_record[position]:
            ids = ids + 1
    if aligned_length:
        identity = 100.0 * float(ids) / float(aligned_length)
    else:
        identity = 0.0
    print('Sequence identity %5.3f' % identity)

    AlignmentMap = {
        'OrigAa': al_record,
        'AlignedAa': al_pdb,
    }

    # Make alignment window
    AlignWindow = Toplevel()
    # The attribute name is misspelled (sic); it is kept because code
    # outside this method may read it.
    self.AlingWindow = AlignWindow
    AlignWindow.geometry('+100+200')
    AlignWindow.title('Please check alignment')

    AlignWindow.button = Button(AlignWindow, {
        "text": "Alignment OK",
        "fg": "black",
        "command": storePDB
    })
    AlignWindow.button.grid(row=3, column=0)
    # AlignWindow.button is rebound to the second button, as before.
    AlignWindow.button = Button(AlignWindow, {
        "text": "Alignment not OK",
        "fg": "black",
        "command": AlignWindow.destroy
    })
    AlignWindow.button.grid(row=3, column=1)

    AlignWindow.Slider = Scrollbar(AlignWindow, orient=HORIZONTAL)
    AlignWindow.Slider.grid(row=1, column=0, sticky='news', columnspan=2)

    listbox = Listbox(AlignWindow, {
        "height": 2,
        "width": 80,
        "font": "courier 14"
    })
    listbox.insert('end', "PEAT_DB record: " + al_record)
    listbox.insert('end', "PDB file      : " + al_pdb)
    listbox.grid(row=0, column=0, columnspan=2)
    # Tie the horizontal scrollbar and the listbox together.
    listbox.config(xscrollcommand=AlignWindow.Slider.set)
    AlignWindow.Slider.config(command=listbox.xview)
    return AlignmentMap