def main(self): '''Script for comparison of biological units from the PDB and from MSD-PISA''' fd = open('pdbbind_2004.txt', 'r') s = fd.read() fd.close() l_pdbs = s.split() d_pdbs = {} for pdb in l_pdbs: if not pdb[1:3] in d_pdbs.keys(): d_pdbs[pdb[1:3]] = [pdb] else: d_pdbs[pdb[1:3]] += [pdb] l_subs = d_pdbs.keys() l_subs.sort() l_pdbs = [] for sub in l_subs: for pdb in d_pdbs[sub]: l_pdbs += [pdb] for pdb in l_pdbs: if l_pdbs.index(pdb) + 1 < int(sys.argv[1]): continue ## if pdb != '1atl': ## continue print pdb, l_pdbs.index(pdb) + 1, len(l_pdbs) if pdb in [ '1fiv', '1hef', '1heg', '1e5a', ## ligand overlap upon transformation, two alternative ligand binding conformations?!' e.g. 1bm7!!! '1osv', '1oxn', '1oxq', '2pcp', '1u9l', '1igj', '1jq9', '1jq8', '1lpk', '1lpg', '1lpz', ## biounits should be identical, but different number of ligands in each PDB transformed biounit (water atoms often transformed incorrectly!!) '1c1r', ## grey region ## ## multiple biounits ## ## '1fpu', ## identical multimer in PISA and PDB '1a4k', '1a94', '1pkx', '1wc1', '1fj4', '1h1p', '1cgl', '1dqx', '1eix', '1fkn', '1fl3', '1h1s', '1hsh', '1i7z', '2cht', '1uw6', '1jqy', '1los', '1lrh', '1m4h', '1a08', '1b3l', '1is0', '1mh5', '1mjj', '1njj', '1p1q', '1q4k', '3tmk', '1umw', '1uv6', '1uz8', ## tommy error ## '2dqt', ## dimer in PISA and PDB ## ## other PISA multimers and/or interfaces are stable in solution ## '1qhc', '1fq5', '1it6', '1jn4', '2jxr', '1yei', '1tuf', '1e2k', '1e2n', '1oe7', '1e2l', '1nms', '1afk', '1fch', '1fkf', '1kc7', '6std', '7std', '1fzj', '1fzk', '1slg', '1sle', '1jyq', '1kyv', '1o9d', '1oe8', '1os0', '1p19', '1qca', '5std', '1tyr', '1ugp', '1v48', '1vfn', '1vpo', '1vwl', '1aqc', '1ghy', '2izl', ## ## same size multimers, but different interfaces ## '1loq', '1lyx', '1adl', '2ans', '1kll', '1trd', '1w72', '1b55', '1oko', '1lyb', '3pck', '3pcj', ## ## different multimers ## ## dimer in PISA, monomer in PDB '1oar', '1s39', '1udt', '1p6e', '1caq', '1rd4', '1p9p', '1q63', 
'1qi0', '1gpk', '1lf9', '1k1y', '1k4g', '1oim', '1b8y', '1gpn', '1ow4', '1h6h', '1lee', '1qy2', '1q4w', '1w3j', '1c5s', '1ciz', '1d7j', '1dy4', '1f3e', '1ghz', '1imx', '1j17', '1jt1', '1k4h', '1kpm', '1l2s', '1lnm', '1m13', '1m48', '1n2v', '1njs', '1nw7', '1nw5', '1oif', '1p28', '1q65', '1q66', '1q91', '1qft', '1qy1', '1r5y', '1s38', '1sqn', '1sw1', '1uho', '1uj6', '1uj5', '1uz1', '1wm1', '1xzx', '5yas', '1d7i', ## multimer in PISA, monomer in PDB '1b8o', '1b8n', '1j4r', '1g7v', '1lf2', '830c', '1jn2', '1fv0', '4tmk', '5tmp', '2usn', '1usn', ## correct multimer might be in "grey area" (deltaG_dissociation ~< 0) ## PQS interfaces might be different from PDB interaces '1b42', '1bky', '1bra', '3mag', '3mct', ## monomer in PISA, dimer in PDB '2csn', '1iup', '1gz9', '1igb', '1ii5', '1p1n', ## monomer in PISA, dimer in PDB, dimer in PQS '1kdk', '1lhw', ## monomer in PISA, dimer in PDB, hexamer in PQS '1f8d', ## monomer in PISA, tetramer in PDB, tetramer in PQS, tetramer in PISA upon removal of ligands '1bm7', '1n51', '2tmn', '4tmn', '5tmn', '1wht', ## dimer in PISA, tetramer in PDB, tetramer in PQS '1m5w', ## dimer in PISA, octamer in PDB, octamer in PQS '1ftm', ## trimer in PISA, monomer in PDB, dimer in PQS '1awi', ## monomer/dimer in PISA, trimer in PDB, trimer in PQS '1a99', ## tetramer in PISA, dimer in PDB ## ## ligand positions ## ## identical multimers in PISA and PDB, but different ligand/ion/sugar position(s) '2cgr', '3gss', '1elr', '1elb', '1hyo', '1gvx', '1gyx', '1jet', '2gss', '1gyy', '1gvu', '1qkb', '1ofz', '1b9j', '1jao', '1jeu', '1jev', '1af6', '10gs', '11gs', '1kui', '1kuk', '1kug', '1obx', '1ogx', '1px4', '1qka', '1ur9', '1e6s', '1e6q', '1e70', '1h22', '1h23', ## acetylcholine esterase ## acetylcholine esterase ## dimer ## 4 helix bundle interface ## 0.490nm between LYS530NZ and ASP365ODD2 ## 0.263nm between LYS530NZ and ASP369ODD2 ## hydrophobic core involving LEU366,LEU373,PHE527,LEU531 ## other AChE structures (1j06,1j07,1n5r,1n5m) have similar dimer 
interfaces with ligand in between interfaces ## PDB molecule of the month states it is a dimer, but no details about the dimer interface ]: continue ## ## ## d_transformations_PISA, d_chains_PISA = biounit.biounit( ).parse_pisa_multimers(pdb) ## ## ## d_transformations_REMARK350 = {} set_water = set() set_nonwater = set() fd = open( '/oxygenase_local/data/pdb/%s/pdb%s.ent' % (pdb[1:3], pdb), 'r') lines = fd.readlines() fd.close() for i in range(len(lines)): line = lines[i] record = line[:6].strip() if record == 'REMARK': remark = int(line[7:10]) if remark == 350: if line[11:23] == 'BIOMOLECULE:': d_transformations_REMARK350 = self.parse_REMARK350_biomolecules( d_transformations_REMARK350, lines, i) elif record in [ 'ATOM', 'HETATM', ]: res_name = line[17:20] chain = line[21] if chain == ' ' and record == 'HETATM': print pdb print line stop if res_name == 'HOH': set_water |= set([chain]) else: set_nonwater |= set([chain]) set_water -= set_nonwater ## remove water transformations if d_transformations_REMARK350 != {}: for chain in d_transformations_REMARK350[1]['chains'].keys(): if chain in set_water: del d_transformations_REMARK350[1]['chains'][chain] ## monomer in asu and biou if d_transformations_REMARK350 == {} and d_transformations_PISA == {}: continue ## asu == biou in PDB, biou == asu in PISA if d_transformations_REMARK350 == {}: for assembly in d_transformations_PISA.keys(): for chain in d_transformations_PISA[assembly][ 'chains'].keys(): for molecule in d_transformations_PISA[assembly][ 'chains'][chain].keys(): if d_transformations_PISA[assembly]['chains'][ chain][molecule]['r'] != Numeric.array([[ 1., 0., 0. 
], [0., 1., 0.], [0., 0., 1.]]): stop1 if d_transformations_PISA[assembly]['chains'][ chain][molecule]['t'] != Numeric.array( [0., 0., 0.]): stop2 continue ## biou=asu in PISA, biou!=asu in PDB if d_transformations_PISA == {}: for biou in d_transformations_REMARK350.keys(): chains = d_transformations_REMARK350[biou]['chains'].keys() for chain in chains: matrixnos = d_transformations_REMARK350[biou][ 'chains'][chain] if len(matrixnos) != 1: stop2 matrix = d_transformations_REMARK350[biou]['matrices'][ list(matrixnos)[0]] if matrix != [[ '1.000000', '0.000000', '0.000000', '0.00000' ], [ '0.000000', '1.000000', '0.000000', '0.00000' ], ['0.000000', '0.000000', '1.000000', '0.00000']]: stop3 continue biounits = d_transformations_REMARK350.keys() ## multimer in PISA and PDB if len( biounits ) != 1 and pdb not in []: ## loop over biounits and replace [1] with [biounit] if this doesnt hold true!!! print d_transformations_PISA print d_transformations_REMARK350 print pdb, biounits stop_multimer_in_PISA_and_PDB ## print d_transformations_PISA ## print d_transformations_REMARK350 ## print pdb ## for assembly in d_transformations_PISA.keys(): ## size1 = 0 ## for chain_PISA in d_transformations_PISA[assembly]['chains'].keys(): ## if len(chain_PISA) == 1: ## chain = chain_PISA ## if chain == '-': ## continue ## else: ## chain = chain_PISA[chain_PISA.index(']')+1] ## if chain == '-': ## continue ## molecules = d_transformations_PISA[assembly]['chains'][chain_PISA].keys() ## size1 += len(molecules) ## print size1 ## break ## ## size2 = 0 ## for chain in d_transformations_REMARK350[1]['chains'].keys(): ## matrixnos = d_transformations_REMARK350[1]['chains'][chain] ## print chain, matrixnos ## size2 += len(matrixnos) ## print size2 ## ## if size2 > size1: ## print d_transformations_REMARK350 ## stop_maybe_water_has_chain_id d_transformations = {'chains': {}, 'matrices': {}} for assembly in d_transformations_PISA.keys(): matrices_identical = True chains_PISA = 
d_transformations_PISA[assembly]['chains'].keys() for chain_PISA in chains_PISA: ## parse PISA matrix molecules = d_transformations_PISA[assembly]['chains'][ chain_PISA].keys() for molecule in molecules: r = d_transformations_PISA[assembly]['chains'][ chain_PISA][molecule]['r'] t = d_transformations_PISA[assembly]['chains'][ chain_PISA][molecule]['t'] ## convert PISA chain ID to default chain ID if len(chain_PISA) == 1: chain = chain_PISA if chain == '-': ## temporary!!! continue else: chain = chain_PISA[chain_PISA.index(']') + 1] if chain == '-': ## temporary!!! continue ## compare PISA and REMARK350 matrices set_matrixnos = d_transformations_REMARK350[1][ 'chains'][chain] for matrixno in set_matrixnos: matrix_identical = True matrix_REMARK350 = d_transformations_REMARK350[1][ 'matrices'][matrixno] d_transformations['matrices'][ matrixno] = matrix_REMARK350 for i in range(3): if (round(float(matrix_REMARK350[i][0]), 5) == round(r[i][0], 5) and round( float(matrix_REMARK350[i][1]), 5) == round(r[i][1], 5) and round( float(matrix_REMARK350[i][2]), 5) == round(r[i][2], 5) and round( float(matrix_REMARK350[i][3]), 5) == round(t[i], 5)): continue else: matrix_identical = False break ## continue loop over REMARK350 matrices if matrix_identical == False: continue else: if chain not in d_transformations[ 'chains'].keys(): d_transformations['chains'][chain] = set( [matrixno]) else: d_transformations['chains'][chain] |= set( [matrixno]) if matrix_identical == False: stop_temporary matrix_identical = True break ## break loop over molecules if matrix_identical == False: for matrixno in set_matrixnos: print d_transformations_REMARK350[1][ 'matrices'][matrixno] print 'assembly', assembly print 'molecule', molecule print 'chain', chain_PISA print d_transformations_PISA[assembly]['chains'][ chain_PISA] print float(matrix_REMARK350[i][0]) == round( r[i][0], 6) print float(matrix_REMARK350[i][1]) == round( r[i][1], 6) print float(matrix_REMARK350[i][2]) == round( r[i][2], 6) print 
float(matrix_REMARK350[i][3]), round(t[i], 6) if len(chain_PISA) == 1: stop_multimer_difference else: stop_different_ligand_locations matrices_identical = False break ## break loop over PISA chains if matrices_identical == False: break if chain != '-': if matrix_identical == False: print assembly, molecule, chain_PISA print matrices_identical stop1 ## continue loop over assemblies if matrices_identical == False: continue if matrix_identical == False: stop2 ## if matrix_identical == False: ## stop3 if d_transformations_PISA != {}: if matrices_identical == False: print d_transformations_REMARK350[1]['matrices'] print d_transformations_PISA[assembly]['chains'][ chain_PISA] print assembly, molecule, chain_PISA print d_transformations_PISA.keys() stop4 if d_transformations_REMARK350[1] != d_transformations: ## if ( ## d_transformations_REMARK350[1]['matrices'].keys() != 1 and ## d_transformations_REMARK350[1]['matrices'][1] != [['1.000000', '0.000000', '0.000000', '0.00000'], ['0.000000', '1.000000', '0.000000', '0.00000'], ['0.000000', '0.000000', '1.000000', '0.00000']] ## ): ## print d_transformations_PISA print d_transformations_REMARK350[1] print d_transformations print pdb stop_PDB_larger_than_PISA return
def main(pdb): os.system('cp %s/%s/pdb%s.ent %s.pdb' %(path_pdb,pdb[1:3],pdb,pdb,)) ## ## Tommy crystal contacts ## ## create biounit biounit.biounit().main(pdb, '/data/remediated_pdb/', exclude_ligands = True) ## ## parse header (just use the asu instead!!!) ## fd = open('%s/%s/pdb%s.ent' %(path_pdb,pdb[1:3],pdb,),'r') lines = fd.readlines() fd.close() for i in range(len(lines)): line = lines[i] record = line[:6].strip() if record in ['MODEL','ATOM',]: break lines_header = lines[:i] ## ## parse coordinates ## fd = open('%s_1.pdb' %(pdb),'r') lines_biounit = fd.readlines() fd.close() fd = open('%s.pdb' %(pdb),'r') lines_asu = fd.readlines() fd.close() ## fd = open('C:\Users\Tommy Carstensen\pdb\%s.pdb' %(pdb),'r') ## lines = fd.readlines() ## fd.close() d_header = parse_pdb.parse_header(lines_header) d_coordinates_biounit, d_ATOMseq = parse_pdb.parse_coordinates( lines_biounit,d_header, parse_atom_seq = False, parse_ligands = False, ) d_coordinates_asu, d_ATOMseq = parse_pdb.parse_coordinates( lines_asu,d_header, parse_atom_seq = False, parse_ligands = False, ) ## set new chain IDs l_old_chains = d_coordinates_asu['chains'].keys() l_new_chains = list( set(s_alphabet)-set(l_old_chains) ) l_old_chains.sort() l_new_chains.sort() d_chains = {} for i in range(len(l_old_chains)): d_chains[l_old_chains[i]] = l_new_chains[i] ## a = d_header['CRYST1']['edges'][0] ## b = d_header['CRYST1']['edges'][1] ## c = d_header['CRYST1']['edges'][2] ## alpha = math.pi*d_header['CRYST1']['angles'][0]/180. ## beta = math.pi*d_header['CRYST1']['angles'][1]/180. ## gamma = math.pi*d_header['CRYST1']['angles'][2]/180. 
## ## unit cell voumne ## volume = a*b*c*math.sqrt(1-math.cos(alpha)**2-math.cos(beta)**2-math.cos(gamma)**2+2*(math.cos(alpha)*math.cos(beta)*math.cos(gamma))) ## matrix_fractional2cartesian = numpy.array([ ## [a, b*math.cos(gamma), c*math.cos(beta),], ## [0, b*math.sin(gamma), c*(math.cos(alpha)-math.cos(beta)*math.cos(gamma))/math.sin(gamma),], ## [0,0,volume/(a*b*math.sin(gamma)),], ## ]) matrix_scale = d_header['SCALE'] matrix_scalei = numpy.linalg.inv(matrix_scale) ## lines = [] l_symop = d_header['REMARK290'].keys() l_symop.sort() ## l_symop = [1] for symop in l_symop: ## if symop != 1: ## continue matrix_symop = d_header['REMARK290'][symop]['4x4matrix'] for i in range(len(l_translations)): vector_translation = l_translations[i] ## if vector_translation != [-1,0,0]: ## continue vector_translation = numpy.array([vector_translation[0],vector_translation[1],vector_translation[2],0,]) ## if i != 0: ## continue ## if vector_translation[0] != 1: ## continue ## if vector_translation[1] != -1: ## continue ## if vector_translation[2] != -1: ## continue for chain2 in d_coordinates_asu['chains'].keys(): for res_no2 in d_coordinates_asu['chains'][chain2]['residues'].keys(): ## if res_no2 != 1: ## continue print '%4s %2i/%2i %2i/26 %1s %4i' %(pdb, symop, len(l_symop), i+1, chain2, res_no2) for iCode2 in d_coordinates_asu['chains'][chain2]['residues'][res_no2]['d_iCodes'].keys(): for chain1 in d_coordinates_biounit['chains'].keys(): for res_no1 in d_coordinates_biounit['chains'][chain1]['residues'].keys(): for iCode1 in d_coordinates_biounit['chains'][chain1]['residues'][res_no1]['d_iCodes'].keys(): for atom_name1 in d_coordinates_biounit['chains'][chain1]['residues'][res_no1]['d_iCodes'][iCode1]['atoms'].keys(): coordinate1 = d_coordinates_biounit['chains'][chain1]['residues'][res_no1]['d_iCodes'][iCode1]['atoms'][atom_name1]['coordinate'] for atom_name2 in d_coordinates_asu['chains'][chain2]['residues'][res_no2]['d_iCodes'][iCode2]['atoms'].keys(): coordinate2 = 
d_coordinates_asu['chains'][chain2]['residues'][res_no2]['d_iCodes'][iCode2]['atoms'][atom_name2]['coordinate'] ## coordinate2 = numpy.dot(matrix_symop,coordinate2)+vector_symop ## coordinate2 += numpy.dot(matrix_fractional2cartesian,vector_translation) ## coordinate2[0] = round(coordinate2[0],3) ## coordinate2[1] = round(coordinate2[1],3) ## coordinate2[2] = round(coordinate2[2],3) ## conversion from 3x1 vector to 4x1 vector coordinate2 = numpy.array([coordinate2[0],coordinate2[1],coordinate2[2],1.,]) ## conversion from cartesian to fractional coordinates coordinate2 = numpy.dot(matrix_scale,coordinate2) ## symmetry operator (before unit cell translation???) coordinate2 = numpy.dot(matrix_symop,coordinate2) ## unit cell translation coordinate2 += vector_translation ## conversion from fractional to cartesian coordinates coordinate2 = numpy.dot(matrix_scalei,coordinate2) ## conversion from 4x1 vector to 3x1 vector coordinate2 = numpy.array([coordinate2[0],coordinate2[1],coordinate2[2],]) vicinity = False distant = False dist = math.sqrt(sum((coordinate2-coordinate1)**2)) if dist < 5: print vector_translation print numpy.dot(matrix_fractional2cartesian,vector_translation) print matrix_fractional2cartesian print '%2i %2i %.2f %.2f %1s %4i %4s %s %1s %4i %4s %s %s' %( symop, i, round(dist,2), dist_treshold, chain2, res_no2, atom_name2, coordinate2, chain1, res_no1, atom_name1, coordinate1, d_coordinates_asu['chains'][chain2]['residues'][res_no2]['d_iCodes'][iCode2]['atoms'][atom_name2]['coordinate'], ) ## break atom_name2 loop if dist > 80.: distant = True break dist = 0 ## tmp!!! temp!!! 
get *all* translations dist_treshold = d_radii_vdw[atom_name1[0]]+d_radii_vdw[atom_name2[0]]+.25 ## break atom_name2 loop if dist < dist_treshold: vicinity = True break ## break atom_name1 loop (append line and check next iCode1) if vicinity == True: for atom_name3 in d_coordinates_asu['chains'][chain2]['residues'][res_no2]['d_iCodes'][iCode2]['atoms'].keys(): coordinate3 = d_coordinates_asu['chains'][chain2]['residues'][res_no2]['d_iCodes'][iCode2]['atoms'][atom_name3]['coordinate'] ## conversion from 3x1 vector to 4x1 vector coordinate3 = numpy.array([coordinate3[0],coordinate3[1],coordinate3[2],1.,]) ## conversion from cartesian to fractional coordinates coordinate3 = numpy.dot(matrix_scale,coordinate3) ## symmetry operator (before unit cell translation???) coordinate3 = numpy.dot(matrix_symop,coordinate3) ## unit cell translation coordinate3 += vector_translation ## conversion from fractional to cartesian coordinates coordinate3 = numpy.dot(matrix_scalei,coordinate3) ## conversion from 4x1 vector to 3x1 vector coordinate3 = numpy.array([coordinate3[0],coordinate3[1],coordinate3[2],]) line = build_line( atom_name3,d_coordinates_asu,coordinate3, chain2,res_no2,iCode2, d_chains, ) lines += [line] break ## break atom_name1 loop (check next iCode1) if distant == True: break ## break iCode1 loop (check next iCode2) if vicinity == True: break if distant == True: break ## break resno1 loop (check next iCode2) if vicinity == True: break if distant == True: break ## break chain1 loop (check next iCode2) if vicinity == True: break if distant == True: break fd = open('%s_biou_cc.pdb' %(pdb),'w') fd.writelines(lines) fd.close() return
def main(self): '''Script for comparison of biological units from the PDB and from MSD-PISA''' ## self.rsync() ## self.gunzip() fd = open('pdbbind_2007.txt', 'r') s = fd.read() fd.close() l_pdbs = s.split() d_pdbs = {} for pdb in l_pdbs: if not pdb[1:3] in d_pdbs.keys(): d_pdbs[pdb[1:3]] = [pdb] else: d_pdbs[pdb[1:3]] += [pdb] l_subs = d_pdbs.keys() l_subs.sort() l_pdbs = [] for sub in l_subs: for pdb in d_pdbs[sub]: l_pdbs += [pdb] for pdb in l_pdbs: if l_pdbs.index(pdb) + 1 < int(sys.argv[1]): continue print pdb, l_pdbs.index(pdb) + 1, len(l_pdbs) if pdb in [ ## '1fiv','1hef','1heg','1e5a', ## ligand overlap upon transformation, two alternative ligand binding conformations?!' e.g. 1bm7!!! ## biounits should be identical, but different number of ligands in each PDB transformed biounit (water atoms often transformed incorrectly!!) ## '1osv','1oxn','1oxq','2pcp','1u9l','1igj','1jq9','1jq8','1lpk','1lpg','1lpz','2a4m', ## '1c1r', ## grey region '2a4m', ## ligand not in PDB '2a5b', '2a5c', '2a8g', ## some of the ligands not transformed in PDB '1a69', ## altloc used for alternative temperature factors (but identical coordinates!) 
'1a8i', ## v2 atom names in biounit ## ## multiple biounits ## ## '1fpu', ## identical multimer in PISA and PDB ## '1a4k','1a94','1pkx','1wc1','1fj4','1h1p','1cgl','1dqx','1eix','1fkn','1fl3','1h1s','1hsh','1i7z','2cht','1uw6','1jqy','1los','1lrh','1m4h','1a08','1is0','1mh5','1mjj','1njj','1p1q','1q4k','3tmk','1umw','1uv6','1uz8', ## tommy error ## '2dqt', ## dimer in PISA and PDB ## ## other PISA multimers and/or interfaces are stable in solution ## ## '1qhc','1fq5','1it6','1jn4','2jxr','1yei','1tuf','1e2k','1e2n','1oe7','1e2l','1nms','1afk','1fch','1fkf','1kc7','6std','7std','1fzj','1fzk','1slg','1sle','1jyq','1kyv','1o9d','1oe8','1os0','1p19','1qca','5std','1tyr','1ugp','1v48','1vfn','1vpo','1vwl','1aqc','1ghy','2izl', ## ## same size multimers, but different interfaces ## '1adl', '1af2', '2ans', ##'1loq','1lyx','1kll','1trd','1w72','1b55','1oko','1lyb','3pck','3pcj', ## ## different multimers ## ## dimer in PISA, monomer in PDB '2ayr', '1b11', ##'1oar','1s39','1udt','1p6e','1caq','1rd4','1p9p','1q63','1qi0','1gpk','1lf9','1k1y','1k4g','1oim','1b8y','1gpn','1ow4','1h6h','1lee','1qy2','1q4w','1w3j','1c5s','1ciz','1d7j','1dy4','1f3e','1ghz','1imx','1j17','1jt1','1k4h','1kpm','1l2s','1lnm','1m13','1m48','1n2v','1njs','1nw7','1nw5','1oif','1p28','1q65','1q66','1q91','1qft','1qy1','1r5y','1s38','1sqn','1sw1','1uho','1uj6','1uj5','1uz1','1wm1','1xzx','5yas','1d7i', ## multimer in PISA, monomer in PDB ## '1b8o','1b8n','1j4r','1g7v','1lf2','830c','1jn2','1fv0','4tmk','5tmp','2usn', '1usn', ## correct multimer might be in "grey area" (deltaG_dissociation ~< 0) ## PQS interfaces might be different from PDB interaces '1atl', ##'1b42','1bky','1bra','3mag','3mct', ## monomer in PISA, dimer in PDB ## '2csn','1iup','1gz9','1igb','1ii5','1p1n', ## monomer in PISA, dimer in PDB, dimer in PQS ## '1kdk','1lhw', ## monomer in PISA, dimer in PDB, hexamer in PQS ## '1f8d', ## monomer in PISA, tetramer in PDB, tetramer in PQS, tetramer in PISA upon removal of ligands ## 
'1bm7','1n51','2tmn','4tmn','5tmn','1wht', ## dimer in PISA, tetramer in PDB, tetramer in PQS '1a1b', '1a1c', '1a1e', ## dimer in PISA, tetramer in PDB ## '1m5w', ## dimer in PISA, octamer in PDB, octamer in PQS ## '1ftm', ## trimer in PISA, monomer in PDB, dimer in PQS ## '1awi', ## monomer/dimer in PISA, trimer in PDB, trimer in PQS '1a99', ## tetramer in PISA, dimer in PDB ## ## ligand positions ## ## identical multimers in PISA and PDB, but different ligand/ion/sugar position(s) '2aj8', '2aoc', '2aod', '2aoe', '2aog', '1apv', '1apw', ##'2cgr','3gss','1elr','1elb','1hyo','1gvx','1gyx','1jet','2gss','1gyy','1gvu','1qkb','1ofz','1b9j','1jao','1jeu','1jev','10gs','11gs','1kui','1kuk','1kug','1obx','1ogx','1px4','1qka','1ur9','1e6s','1e6q','1e70', ## ACY not in PISA '2aac', '2avm', ## MG positions '1af6', ## SO4 positions '2avo', '2avs', ## GOL positions '2avq', ## CL positions '2avv', ## U1 positions '1b05', '1b0h', '1b2h', '1b4h', ## IUM positions '1b1h', '1b32', '1b3f', '1b3g', '1b3h', '1b3l', '1b40', '1b46', '1b4z', '1b51', '1b52', ## '1h22','1h23', ## acetylcholine esterase ## acetylcholine esterase ## dimer ## 4 helix bundle interface ## 0.490nm between LYS530NZ and ASP365ODD2 ## 0.263nm between LYS530NZ and ASP369ODD2 ## hydrophobic core involving LEU366,LEU373,PHE527,LEU531 ## other AChE structures (1j06,1j07,1n5r,1n5m) have similar dimer interfaces with ligand in between interfaces ## PDB molecule of the month states it is a dimer, but no details about the dimer interface ]: continue ## ## ## d_transformations_PISA = biounit.biounit().parse_pisa_multimers( pdb) if d_transformations_PISA == {}: continue ## temporary!!! 
d_coordinates_PDB = {} d_lines_PDB = {} for bm in range(1, 10000): if bm == 1 or os.path.isfile( '/oxygenase_local/data/biounit/%s/%s.pdb%i' % (pdb[1:3], pdb, bm)): print bm fd = open( '/oxygenase_local/data/biounit/%s/%s.pdb%i' % (pdb[1:3], pdb, bm), 'r') lines = fd.readlines() fd.close() d_coordinates_PDB[bm], d_lines_PDB[ bm] = self.parse_coordinates(lines) continue else: if bm > 3: print bm stop break l_biomolecules = d_coordinates_PDB.keys() l_assemblies = d_transformations_PISA.keys() for bm in l_biomolecules: biounits_identical = False chains_identical = False interfaces_identical = False l_chains_PDB = d_coordinates_PDB[bm]['ATOM'].keys() for assembly in l_assemblies: print bm, assembly l_chains_PISA = [] for chain in d_transformations_PISA[assembly][ 'chains'].keys(): if len(chain) == 1 and chain != '-': l_chains_PISA += [chain] ## print d_transformations_PISA[assembly]['chains'].keys() ## print d_chains_PDB['ATOM'][bm]+d_chains_PDB['HETATM'][bm] ## different chains (different IDs) if len(set(l_chains_PISA) ^ set(l_chains_PDB)) > 0: print 'different chains', l_chains_PDB, l_chains_PISA continue else: d_lines_PISA, d_coordinates_PISA = biounit.biounit( ).parse_pdb_coordinates(pdb, d_transformations_PISA, assembly) ## different chains (different number of transformations) if not (len(d_lines_PISA[assembly]['ATOM']) % len(d_lines_PDB[bm]['ATOM']) == 0 and len(d_lines_PDB[bm]['ATOM']) % len(d_lines_PISA[assembly]['ATOM']) == 0): if (len(d_lines_PISA[assembly]['ATOM']) % len(d_lines_PDB[bm]['ATOM']) != 0 and len(d_lines_PDB[bm]['ATOM']) % len(d_lines_PISA[assembly]['ATOM']) != 0): stop chains_identical = False print 'different chains', len( d_lines_PDB[bm]['ATOM']), len( d_lines_PISA[assembly]['ATOM']) continue ## identical chains (identical IDs, identical number of transformations) else: chains_identical = True ATOM_identical = self.identical_d_coordinates( 'ATOM', d_coordinates_PISA, d_coordinates_PDB, assembly, bm, ) HETATM_identical = 
self.identical_d_coordinates( 'HETATM', d_coordinates_PISA, d_coordinates_PDB, assembly, bm, ) print ATOM_identical, HETATM_identical if chains_identical == True and ATOM_identical == True and HETATM_identical == True: biounits_identical = True print bm, assembly, 'identical' elif chains_identical == True and ATOM_identical == False and HETATM_identical == True: print bm, assembly, 'different interfaces' continue elif chains_identical == True and ATOM_identical == True and HETATM_identical == False: interfaces_identical = True print bm, assembly, 'different ligands' ## if ( ## len(set(d_lines_PISA[assembly]['ATOM'])^set(d_lines_PDB[bm]['ATOM'])) == 0 and ## len(set(d_lines_PISA[assembly]['HETATM'])^set(d_lines_PDB['HETATM'][bm])) == 0 ## ): ## biounits_identical = True ## print assembly,bm, 'identical' ## break ## elif len(set(d_lines_PISA[assembly]['ATOM'])^set(d_lines_PDB[bm]['ATOM'])) != 0: ## set_PISA_ATOM = set(d_lines_PISA[assembly]['ATOM'])-set(d_lines_PDB[bm]['ATOM']) ## set_PDB_ATOM = set(d_lines_PDB[bm]['ATOM'])-set(d_lines_PISA[assembly]['ATOM']) ## print len(set_PISA_ATOM), len(set_PDB_ATOM) ## print bm, assembly, 'different interfaces' ## if assembly == 4: ## a = list(set_PISA_ATOM) ## b = list(set_PDB_ATOM) ## a.sort() ## b.sort() ## print a[:10] ## print b[:10] ## stop ## continue ## else: ## interfaces_identical = True ## set_PISA_ATOM = set(d_lines_PISA[assembly]['ATOM'])-set(d_lines_PDB[bm]['ATOM']) ## set_PDB_ATOM = set(d_lines_PDB[bm]['ATOM'])-set(d_lines_PISA[assembly]['ATOM']) ## set_PISA_HETATM = set(d_lines_PISA[assembly]['HETATM'])-set(d_lines_PDB['HETATM'][bm]) ## set_PDB_HETATM = set(d_lines_PDB['HETATM'][bm])-set(d_lines_PISA[assembly]['HETATM']) ## a = list(set_PISA_HETATM) ## a.sort() ## b = list(set_PDB_HETATM) ## b.sort() ## print a ## print b ## print len(set_PISA_ATOM), len(set_PDB_ATOM) ## print len(set_PISA_HETATM), len(set_PDB_HETATM) ## print assembly, bm, 'different ligands' ## continue if biounits_identical == False: print 
pdb, l_pdbs.index(pdb) + 1 if chains_identical == True and interfaces_identical == True: stop_ligand_differences elif chains_identical == True and interfaces_identical == False: stop_interfaces_different elif chains_identical == False and interfaces_identical == False: stop_different_multimers else: stop_not_expected elif biounits_identical == True: continue continue ## monomer in asu and biou if d_transformations_REMARK350 == {} and d_transformations_PISA == {}: continue ## asu == biou in PDB, biou == asu in PISA if d_transformations_REMARK350 == {}: for assembly in d_transformations_PISA.keys(): for chain in d_transformations_PISA[assembly][ 'chains'].keys(): for molecule in d_transformations_PISA[assembly][ 'chains'][chain].keys(): if d_transformations_PISA[assembly]['chains'][ chain][molecule]['r'] != Numeric.array([[ 1., 0., 0. ], [0., 1., 0.], [0., 0., 1.]]): stop1 if d_transformations_PISA[assembly]['chains'][ chain][molecule]['t'] != Numeric.array( [0., 0., 0.]): stop2 continue ## biou=asu in PISA, biou!=asu in PDB if d_transformations_PISA == {}: for biou in d_transformations_REMARK350.keys(): chains = d_transformations_REMARK350[biou]['chains'].keys() for chain in chains: matrixnos = d_transformations_REMARK350[biou][ 'chains'][chain] if len(matrixnos) != 1: stop2 matrix = d_transformations_REMARK350[biou]['matrices'][ list(matrixnos)[0]] if matrix != [[ '1.000000', '0.000000', '0.000000', '0.00000' ], [ '0.000000', '1.000000', '0.000000', '0.00000' ], ['0.000000', '0.000000', '1.000000', '0.00000']]: stop3 continue biounits = d_transformations_REMARK350.keys() ## multimer in PISA and PDB if len( biounits ) != 1 and pdb not in []: ## loop over biounits and replace [1] with [biounit] if this doesnt hold true!!! 
print d_transformations_PISA print d_transformations_REMARK350 print pdb, biounits stop_multimer_in_PISA_and_PDB ## print d_transformations_PISA ## print d_transformations_REMARK350 ## print pdb ## for assembly in d_transformations_PISA.keys(): ## size1 = 0 ## for chain_PISA in d_transformations_PISA[assembly]['chains'].keys(): ## if len(chain_PISA) == 1: ## chain = chain_PISA ## if chain == '-': ## continue ## else: ## chain = chain_PISA[chain_PISA.index(']')+1] ## if chain == '-': ## continue ## molecules = d_transformations_PISA[assembly]['chains'][chain_PISA].keys() ## size1 += len(molecules) ## print size1 ## break ## ## size2 = 0 ## for chain in d_transformations_REMARK350[1]['chains'].keys(): ## matrixnos = d_transformations_REMARK350[1]['chains'][chain] ## print chain, matrixnos ## size2 += len(matrixnos) ## print size2 ## ## if size2 > size1: ## print d_transformations_REMARK350 ## stop_maybe_water_has_chain_id d_transformations = {'chains': {}, 'matrices': {}} for assembly in d_transformations_PISA.keys(): matrices_identical = True chains_PISA = d_transformations_PISA[assembly]['chains'].keys() for chain_PISA in chains_PISA: ## parse PISA matrix molecules = d_transformations_PISA[assembly]['chains'][ chain_PISA].keys() for molecule in molecules: r = d_transformations_PISA[assembly]['chains'][ chain_PISA][molecule]['r'] t = d_transformations_PISA[assembly]['chains'][ chain_PISA][molecule]['t'] ## convert PISA chain ID to default chain ID if len(chain_PISA) == 1: chain = chain_PISA if chain == '-': ## temporary!!! continue else: chain = chain_PISA[chain_PISA.index(']') + 1] if chain == '-': ## temporary!!! 
continue ## compare PISA and REMARK350 matrices set_matrixnos = d_transformations_REMARK350[1][ 'chains'][chain] for matrixno in set_matrixnos: matrix_identical = True matrix_REMARK350 = d_transformations_REMARK350[1][ 'matrices'][matrixno] d_transformations['matrices'][ matrixno] = matrix_REMARK350 for i in range(3): if (round(float(matrix_REMARK350[i][0]), 5) == round(r[i][0], 5) and round( float(matrix_REMARK350[i][1]), 5) == round(r[i][1], 5) and round( float(matrix_REMARK350[i][2]), 5) == round(r[i][2], 5) and round( float(matrix_REMARK350[i][3]), 5) == round(t[i], 5)): continue else: matrix_identical = False break ## continue loop over REMARK350 matrices if matrix_identical == False: continue else: if chain not in d_transformations[ 'chains'].keys(): d_transformations['chains'][chain] = set( [matrixno]) else: d_transformations['chains'][chain] |= set( [matrixno]) if matrix_identical == False: stop_temporary matrix_identical = True break ## break loop over molecules if matrix_identical == False: for matrixno in set_matrixnos: print d_transformations_REMARK350[1][ 'matrices'][matrixno] print 'assembly', assembly print 'molecule', molecule print 'chain', chain_PISA print d_transformations_PISA[assembly]['chains'][ chain_PISA] print float(matrix_REMARK350[i][0]) == round( r[i][0], 6) print float(matrix_REMARK350[i][1]) == round( r[i][1], 6) print float(matrix_REMARK350[i][2]) == round( r[i][2], 6) print float(matrix_REMARK350[i][3]), round(t[i], 6) if len(chain_PISA) == 1: stop_multimer_difference else: stop_different_ligand_locations matrices_identical = False break ## break loop over PISA chains if matrices_identical == False: break if chain != '-': if matrix_identical == False: print assembly, molecule, chain_PISA print matrices_identical stop1 ## continue loop over assemblies if matrices_identical == False: continue if matrix_identical == False: stop2 ## if matrix_identical == False: ## stop3 if d_transformations_PISA != {}: if matrices_identical == False: 
print d_transformations_REMARK350[1]['matrices'] print d_transformations_PISA[assembly]['chains'][ chain_PISA] print assembly, molecule, chain_PISA print d_transformations_PISA.keys() stop4 if d_transformations_REMARK350[1] != d_transformations: ## if ( ## d_transformations_REMARK350[1]['matrices'].keys() != 1 and ## d_transformations_REMARK350[1]['matrices'][1] != [['1.000000', '0.000000', '0.000000', '0.00000'], ['0.000000', '1.000000', '0.000000', '0.00000'], ['0.000000', '0.000000', '1.000000', '0.00000']] ## ): ## print d_transformations_PISA print d_transformations_REMARK350[1] print d_transformations print pdb stop_PDB_larger_than_PISA return
def main(self): """Script for comparison of biological units from the PDB and from MSD-PISA""" fd = open("pdbbind_2004.txt", "r") s = fd.read() fd.close() l_pdbs = s.split() d_pdbs = {} for pdb in l_pdbs: if not pdb[1:3] in d_pdbs.keys(): d_pdbs[pdb[1:3]] = [pdb] else: d_pdbs[pdb[1:3]] += [pdb] l_subs = d_pdbs.keys() l_subs.sort() l_pdbs = [] for sub in l_subs: for pdb in d_pdbs[sub]: l_pdbs += [pdb] for pdb in l_pdbs: if l_pdbs.index(pdb) + 1 < int(sys.argv[1]): continue ## if pdb != '1atl': ## continue print pdb, l_pdbs.index(pdb) + 1, len(l_pdbs) if pdb in [ "1fiv", "1hef", "1heg", "1e5a", ## ligand overlap upon transformation, two alternative ligand binding conformations?!' e.g. 1bm7!!! "1osv", "1oxn", "1oxq", "2pcp", "1u9l", "1igj", "1jq9", "1jq8", "1lpk", "1lpg", "1lpz", ## biounits should be identical, but different number of ligands in each PDB transformed biounit (water atoms often transformed incorrectly!!) "1c1r", ## grey region ## ## multiple biounits ## ## '1fpu', ## identical multimer in PISA and PDB "1a4k", "1a94", "1pkx", "1wc1", "1fj4", "1h1p", "1cgl", "1dqx", "1eix", "1fkn", "1fl3", "1h1s", "1hsh", "1i7z", "2cht", "1uw6", "1jqy", "1los", "1lrh", "1m4h", "1a08", "1b3l", "1is0", "1mh5", "1mjj", "1njj", "1p1q", "1q4k", "3tmk", "1umw", "1uv6", "1uz8", ## tommy error ## '2dqt', ## dimer in PISA and PDB ## ## other PISA multimers and/or interfaces are stable in solution ## "1qhc", "1fq5", "1it6", "1jn4", "2jxr", "1yei", "1tuf", "1e2k", "1e2n", "1oe7", "1e2l", "1nms", "1afk", "1fch", "1fkf", "1kc7", "6std", "7std", "1fzj", "1fzk", "1slg", "1sle", "1jyq", "1kyv", "1o9d", "1oe8", "1os0", "1p19", "1qca", "5std", "1tyr", "1ugp", "1v48", "1vfn", "1vpo", "1vwl", "1aqc", "1ghy", "2izl", ## ## same size multimers, but different interfaces ## "1loq", "1lyx", "1adl", "2ans", "1kll", "1trd", "1w72", "1b55", "1oko", "1lyb", "3pck", "3pcj", ## ## different multimers ## ## dimer in PISA, monomer in PDB "1oar", "1s39", "1udt", "1p6e", "1caq", "1rd4", "1p9p", "1q63", 
"1qi0", "1gpk", "1lf9", "1k1y", "1k4g", "1oim", "1b8y", "1gpn", "1ow4", "1h6h", "1lee", "1qy2", "1q4w", "1w3j", "1c5s", "1ciz", "1d7j", "1dy4", "1f3e", "1ghz", "1imx", "1j17", "1jt1", "1k4h", "1kpm", "1l2s", "1lnm", "1m13", "1m48", "1n2v", "1njs", "1nw7", "1nw5", "1oif", "1p28", "1q65", "1q66", "1q91", "1qft", "1qy1", "1r5y", "1s38", "1sqn", "1sw1", "1uho", "1uj6", "1uj5", "1uz1", "1wm1", "1xzx", "5yas", "1d7i", ## multimer in PISA, monomer in PDB "1b8o", "1b8n", "1j4r", "1g7v", "1lf2", "830c", "1jn2", "1fv0", "4tmk", "5tmp", "2usn", "1usn", ## correct multimer might be in "grey area" (deltaG_dissociation ~< 0) ## PQS interfaces might be different from PDB interaces "1b42", "1bky", "1bra", "3mag", "3mct", ## monomer in PISA, dimer in PDB "2csn", "1iup", "1gz9", "1igb", "1ii5", "1p1n", ## monomer in PISA, dimer in PDB, dimer in PQS "1kdk", "1lhw", ## monomer in PISA, dimer in PDB, hexamer in PQS "1f8d", ## monomer in PISA, tetramer in PDB, tetramer in PQS, tetramer in PISA upon removal of ligands "1bm7", "1n51", "2tmn", "4tmn", "5tmn", "1wht", ## dimer in PISA, tetramer in PDB, tetramer in PQS "1m5w", ## dimer in PISA, octamer in PDB, octamer in PQS "1ftm", ## trimer in PISA, monomer in PDB, dimer in PQS "1awi", ## monomer/dimer in PISA, trimer in PDB, trimer in PQS "1a99", ## tetramer in PISA, dimer in PDB ## ## ligand positions ## ## identical multimers in PISA and PDB, but different ligand/ion/sugar position(s) "2cgr", "3gss", "1elr", "1elb", "1hyo", "1gvx", "1gyx", "1jet", "2gss", "1gyy", "1gvu", "1qkb", "1ofz", "1b9j", "1jao", "1jeu", "1jev", "1af6", "10gs", "11gs", "1kui", "1kuk", "1kug", "1obx", "1ogx", "1px4", "1qka", "1ur9", "1e6s", "1e6q", "1e70", "1h22", "1h23", ## acetylcholine esterase ## acetylcholine esterase ## dimer ## 4 helix bundle interface ## 0.490nm between LYS530NZ and ASP365ODD2 ## 0.263nm between LYS530NZ and ASP369ODD2 ## hydrophobic core involving LEU366,LEU373,PHE527,LEU531 ## other AChE structures (1j06,1j07,1n5r,1n5m) have similar dimer 
interfaces with ligand in between interfaces ## PDB molecule of the month states it is a dimer, but no details about the dimer interface ]: continue ## ## ## d_transformations_PISA, d_chains_PISA = biounit.biounit().parse_pisa_multimers(pdb) ## ## ## d_transformations_REMARK350 = {} set_water = set() set_nonwater = set() fd = open("/oxygenase_local/data/pdb/%s/pdb%s.ent" % (pdb[1:3], pdb), "r") lines = fd.readlines() fd.close() for i in range(len(lines)): line = lines[i] record = line[:6].strip() if record == "REMARK": remark = int(line[7:10]) if remark == 350: if line[11:23] == "BIOMOLECULE:": d_transformations_REMARK350 = self.parse_REMARK350_biomolecules( d_transformations_REMARK350, lines, i ) elif record in ["ATOM", "HETATM"]: res_name = line[17:20] chain = line[21] if chain == " " and record == "HETATM": print pdb print line stop if res_name == "HOH": set_water |= set([chain]) else: set_nonwater |= set([chain]) set_water -= set_nonwater ## remove water transformations if d_transformations_REMARK350 != {}: for chain in d_transformations_REMARK350[1]["chains"].keys(): if chain in set_water: del d_transformations_REMARK350[1]["chains"][chain] ## monomer in asu and biou if d_transformations_REMARK350 == {} and d_transformations_PISA == {}: continue ## asu == biou in PDB, biou == asu in PISA if d_transformations_REMARK350 == {}: for assembly in d_transformations_PISA.keys(): for chain in d_transformations_PISA[assembly]["chains"].keys(): for molecule in d_transformations_PISA[assembly]["chains"][chain].keys(): if d_transformations_PISA[assembly]["chains"][chain][molecule]["r"] != Numeric.array( [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] ): stop1 if d_transformations_PISA[assembly]["chains"][chain][molecule]["t"] != Numeric.array( [0.0, 0.0, 0.0] ): stop2 continue ## biou=asu in PISA, biou!=asu in PDB if d_transformations_PISA == {}: for biou in d_transformations_REMARK350.keys(): chains = d_transformations_REMARK350[biou]["chains"].keys() for chain in 
chains: matrixnos = d_transformations_REMARK350[biou]["chains"][chain] if len(matrixnos) != 1: stop2 matrix = d_transformations_REMARK350[biou]["matrices"][list(matrixnos)[0]] if matrix != [ ["1.000000", "0.000000", "0.000000", "0.00000"], ["0.000000", "1.000000", "0.000000", "0.00000"], ["0.000000", "0.000000", "1.000000", "0.00000"], ]: stop3 continue biounits = d_transformations_REMARK350.keys() ## multimer in PISA and PDB if ( len(biounits) != 1 and pdb not in [] ): ## loop over biounits and replace [1] with [biounit] if this doesnt hold true!!! print d_transformations_PISA print d_transformations_REMARK350 print pdb, biounits stop_multimer_in_PISA_and_PDB ## print d_transformations_PISA ## print d_transformations_REMARK350 ## print pdb ## for assembly in d_transformations_PISA.keys(): ## size1 = 0 ## for chain_PISA in d_transformations_PISA[assembly]['chains'].keys(): ## if len(chain_PISA) == 1: ## chain = chain_PISA ## if chain == '-': ## continue ## else: ## chain = chain_PISA[chain_PISA.index(']')+1] ## if chain == '-': ## continue ## molecules = d_transformations_PISA[assembly]['chains'][chain_PISA].keys() ## size1 += len(molecules) ## print size1 ## break ## ## size2 = 0 ## for chain in d_transformations_REMARK350[1]['chains'].keys(): ## matrixnos = d_transformations_REMARK350[1]['chains'][chain] ## print chain, matrixnos ## size2 += len(matrixnos) ## print size2 ## ## if size2 > size1: ## print d_transformations_REMARK350 ## stop_maybe_water_has_chain_id d_transformations = {"chains": {}, "matrices": {}} for assembly in d_transformations_PISA.keys(): matrices_identical = True chains_PISA = d_transformations_PISA[assembly]["chains"].keys() for chain_PISA in chains_PISA: ## parse PISA matrix molecules = d_transformations_PISA[assembly]["chains"][chain_PISA].keys() for molecule in molecules: r = d_transformations_PISA[assembly]["chains"][chain_PISA][molecule]["r"] t = d_transformations_PISA[assembly]["chains"][chain_PISA][molecule]["t"] ## convert PISA 
chain ID to default chain ID if len(chain_PISA) == 1: chain = chain_PISA if chain == "-": ## temporary!!! continue else: chain = chain_PISA[chain_PISA.index("]") + 1] if chain == "-": ## temporary!!! continue ## compare PISA and REMARK350 matrices set_matrixnos = d_transformations_REMARK350[1]["chains"][chain] for matrixno in set_matrixnos: matrix_identical = True matrix_REMARK350 = d_transformations_REMARK350[1]["matrices"][matrixno] d_transformations["matrices"][matrixno] = matrix_REMARK350 for i in range(3): if ( round(float(matrix_REMARK350[i][0]), 5) == round(r[i][0], 5) and round(float(matrix_REMARK350[i][1]), 5) == round(r[i][1], 5) and round(float(matrix_REMARK350[i][2]), 5) == round(r[i][2], 5) and round(float(matrix_REMARK350[i][3]), 5) == round(t[i], 5) ): continue else: matrix_identical = False break ## continue loop over REMARK350 matrices if matrix_identical == False: continue else: if chain not in d_transformations["chains"].keys(): d_transformations["chains"][chain] = set([matrixno]) else: d_transformations["chains"][chain] |= set([matrixno]) if matrix_identical == False: stop_temporary matrix_identical = True break ## break loop over molecules if matrix_identical == False: for matrixno in set_matrixnos: print d_transformations_REMARK350[1]["matrices"][matrixno] print "assembly", assembly print "molecule", molecule print "chain", chain_PISA print d_transformations_PISA[assembly]["chains"][chain_PISA] print float(matrix_REMARK350[i][0]) == round(r[i][0], 6) print float(matrix_REMARK350[i][1]) == round(r[i][1], 6) print float(matrix_REMARK350[i][2]) == round(r[i][2], 6) print float(matrix_REMARK350[i][3]), round(t[i], 6) if len(chain_PISA) == 1: stop_multimer_difference else: stop_different_ligand_locations matrices_identical = False break ## break loop over PISA chains if matrices_identical == False: break if chain != "-": if matrix_identical == False: print assembly, molecule, chain_PISA print matrices_identical stop1 ## continue loop over 
assemblies if matrices_identical == False: continue if matrix_identical == False: stop2 ## if matrix_identical == False: ## stop3 if d_transformations_PISA != {}: if matrices_identical == False: print d_transformations_REMARK350[1]["matrices"] print d_transformations_PISA[assembly]["chains"][chain_PISA] print assembly, molecule, chain_PISA print d_transformations_PISA.keys() stop4 if d_transformations_REMARK350[1] != d_transformations: ## if ( ## d_transformations_REMARK350[1]['matrices'].keys() != 1 and ## d_transformations_REMARK350[1]['matrices'][1] != [['1.000000', '0.000000', '0.000000', '0.00000'], ['0.000000', '1.000000', '0.000000', '0.00000'], ['0.000000', '0.000000', '1.000000', '0.00000']] ## ): ## print d_transformations_PISA print d_transformations_REMARK350[1] print d_transformations print pdb stop_PDB_larger_than_PISA return
def main(self, pdb, path_pdb_bio, path_pdb_asu, verbose=True): '''Script for comparison of biological units from the PDB and from MSD-PISA''' ## ## ## d_transformations_PISA, status = biounit.biounit( ).parse_pisa_multimers(pdb, verbose=verbose) if status == 'Broken composition in PA Graph': s = 'PISA unable to determine quarternary structure' return s if status != 'Ok': s = status return status if d_transformations_PISA == {}: s = 'PISA unable to identify any stable quarternary structures' return s l_assemblies = d_transformations_PISA.keys() d_coordinates_PDB = {} d_lines_PDB = {} l_chains_ATOM = [] for bm in range(1, 99): if bm == 1 or os.path.isfile('%s%s/%s.pdb%i' % (path_pdb_bio, pdb[1:3], pdb, bm)): fd = open('%s%s/%s.pdb%i' % (path_pdb_bio, pdb[1:3], pdb, bm), 'r') lines = fd.readlines() fd.close() d_coordinates_PDB[bm], d_lines_PDB[ bm] = self.parse_coordinates(lines) l_chains_ATOM += d_coordinates_PDB[bm]['ATOM'].keys() continue else: break l_chains_ATOM = list(set(l_chains_ATOM)) l_biomolecules = d_coordinates_PDB.keys() for bm in l_biomolecules: biounits_identical = False chains_identical = False interfaces_identical = False l_chains_PDB = d_coordinates_PDB[bm]['ATOM'].keys() for assembly in l_assemblies: if verbose == True: print bm, assembly l_chains_PISA = [] for chain in d_transformations_PISA[assembly]['chains'].keys(): if len(chain) == 1 and chain != '-': l_chains_PISA += [chain] ## exclude non-ATOM chains from PISA assembly chains l_chains_PISA = list(set(l_chains_ATOM) & set(l_chains_PISA)) ## print d_transformations_PISA[assembly]['chains'].keys() ## print d_coordinates_PDB[bm]['ATOM'].keys() ## print d_coordinates_PDB[bm]['HETATM'].keys() ## different chains (different IDs) if len(set(l_chains_PISA) ^ set(l_chains_PDB)) > 0: if verbose == True: print 'different chains', l_chains_PDB, l_chains_PISA continue else: d_lines_PISA, d_coordinates_PISA = biounit.biounit( ).parse_pdb_coordinates(pdb, d_transformations_PISA, assembly, path_pdb_asu) 
len_PISA = float(len(d_lines_PISA[assembly]['ATOM'])) len_PDB = float(len(d_lines_PDB[bm]['ATOM'])) ## different chains (different number of transformations) if not (len_PISA % len_PDB == 0 and len_PDB % len_PISA == 0): ## len of coordinates not a multiplum of each other if (len_PISA % len_PDB != 0 and len_PDB % len_PISA != 0): ## print duplicate lines, which are most likely caused by transformation of coordinates with v2 atom names for line in d_lines_PDB[bm]['ATOM']: count = d_lines_PDB[bm]['ATOM'].count(line) if count > 1: if verbose == True: print line ## check that the number of coordinates are a multiplum of each other if (round( min(len_PDB, len_PISA) % math.modf(len_PISA / len_PDB)[0] * len_PDB, 8) != 0. or round( min(len_PDB, len_PISA) % math.modf(len_PDB / len_PISA)[0] * len_PISA, 8) != 0.): a = list( set(d_lines_PISA[assembly]['ATOM']) - set(d_lines_PDB[bm]['ATOM'])) b = list( set(d_lines_PDB[bm]['ATOM']) - set(d_lines_PISA[assembly]['ATOM'])) a.sort() b.sort() b.reverse() if verbose == True: print a print b print len_PDB print len_PISA print min(len_PDB, len_PISA) % math.modf( len_PISA / len_PDB)[0] * len_PDB print min(len_PDB, len_PISA) % math.modf( len_PDB / len_PISA)[0] * len_PISA stop_not_expected_or_v2_atom_names if verbose == True: print 'different chains', len( d_lines_PDB[bm]['ATOM']), len( d_lines_PISA[assembly]['ATOM']) continue ## identical chains (identical IDs, identical number of transformations) else: chains_identical = True ATOM_identical = self.identical_d_coordinates( 'ATOM', d_coordinates_PISA, d_coordinates_PDB, assembly, bm, verbose=verbose, ) if 'HETATM' in d_coordinates_PISA[assembly].keys(): HETATM_identical = self.identical_d_coordinates( 'HETATM', d_coordinates_PISA, d_coordinates_PDB, assembly, bm, ) else: HETATM_identical = True if verbose == True: print ATOM_identical, HETATM_identical ATOM_identical2 = self.identical_d_coordinates( 'ATOM', d_coordinates_PDB, d_coordinates_PISA, bm, assembly, ) if ATOM_identical != 
ATOM_identical2: if verbose == True: print ATOM_identical, ATOM_identical2 stop if chains_identical == True and ATOM_identical == True and HETATM_identical == True: biounits_identical = True if verbose == True: print bm, assembly, 'identical' break elif ATOM_identical == False: if verbose == True: print bm, assembly, 'different interfaces' continue elif ATOM_identical == True and HETATM_identical == False: interfaces_identical = True ## differences between sets of lines are not representative of differences if coordinates differ by less than 0.0001nm a = list( set(d_lines_PISA[assembly]['HETATM']) - set(d_lines_PDB[bm]['HETATM'])) b = list( set(d_lines_PDB[bm]['HETATM']) - set(d_lines_PISA[assembly]['HETATM'])) a.sort() b.sort() if verbose == True: print a print b print bm, assembly, 'different ligands' else: if verbose == True: print chains_identical, ATOM_identical, HETATM_identical stop_notexpected if biounits_identical == True: s = 'biounits identical' elif biounits_identical == False: if interfaces_identical == True: s = 'different ligand transformations' elif chains_identical == True and interfaces_identical == False: s = 'different peptide interfaces' elif chains_identical == False and interfaces_identical == False: s = 'different multimers' else: if verbose == True: print chains_identical, interfaces_identical, stop_not_expected return s
def main(self,pdb,path_pdb_bio,path_pdb_asu,verbose=True): '''Script for comparison of biological units from the PDB and from MSD-PISA''' ## ## ## d_transformations_PISA,status = biounit.biounit().parse_pisa_multimers(pdb,verbose=verbose) if status == 'Broken composition in PA Graph': s = 'PISA unable to determine quarternary structure' return s if status != 'Ok': s = status return status if d_transformations_PISA == {}: s = 'PISA unable to identify any stable quarternary structures' return s l_assemblies = d_transformations_PISA.keys() d_coordinates_PDB = {} d_lines_PDB = {} l_chains_ATOM = [] for bm in range(1,99): if bm == 1 or os.path.isfile('%s%s/%s.pdb%i' %(path_pdb_bio,pdb[1:3],pdb,bm)): fd = open('%s%s/%s.pdb%i' %(path_pdb_bio,pdb[1:3],pdb,bm),'r') lines = fd.readlines() fd.close() d_coordinates_PDB[bm], d_lines_PDB[bm] = self.parse_coordinates(lines) l_chains_ATOM += d_coordinates_PDB[bm]['ATOM'].keys() continue else: break l_chains_ATOM = list(set(l_chains_ATOM)) l_biomolecules = d_coordinates_PDB.keys() for bm in l_biomolecules: biounits_identical = False chains_identical = False interfaces_identical = False l_chains_PDB = d_coordinates_PDB[bm]['ATOM'].keys() for assembly in l_assemblies: if verbose == True: print bm, assembly l_chains_PISA = [] for chain in d_transformations_PISA[assembly]['chains'].keys(): if len(chain) == 1 and chain != '-': l_chains_PISA += [chain] ## exclude non-ATOM chains from PISA assembly chains l_chains_PISA = list(set(l_chains_ATOM)&set(l_chains_PISA)) ## print d_transformations_PISA[assembly]['chains'].keys() ## print d_coordinates_PDB[bm]['ATOM'].keys() ## print d_coordinates_PDB[bm]['HETATM'].keys() ## different chains (different IDs) if len(set(l_chains_PISA) ^ set(l_chains_PDB)) > 0: if verbose == True: print 'different chains', l_chains_PDB, l_chains_PISA continue else: d_lines_PISA, d_coordinates_PISA = biounit.biounit().parse_pdb_coordinates(pdb, d_transformations_PISA, assembly,path_pdb_asu) len_PISA = 
float(len(d_lines_PISA[assembly]['ATOM'])) len_PDB = float(len(d_lines_PDB[bm]['ATOM'])) ## different chains (different number of transformations) if not ( len_PISA % len_PDB == 0 and len_PDB % len_PISA == 0 ): ## len of coordinates not a multiplum of each other if ( len_PISA % len_PDB != 0 and len_PDB % len_PISA != 0 ): ## print duplicate lines, which are most likely caused by transformation of coordinates with v2 atom names for line in d_lines_PDB[bm]['ATOM']: count = d_lines_PDB[bm]['ATOM'].count(line) if count > 1: if verbose == True: print line ## check that the number of coordinates are a multiplum of each other if ( round(min(len_PDB,len_PISA) % math.modf(len_PISA/len_PDB)[0]*len_PDB,8) != 0. or round(min(len_PDB,len_PISA) % math.modf(len_PDB/len_PISA)[0]*len_PISA,8) != 0. ): a = list(set(d_lines_PISA[assembly]['ATOM'])-set(d_lines_PDB[bm]['ATOM'])) b = list(set(d_lines_PDB[bm]['ATOM'])-set(d_lines_PISA[assembly]['ATOM'])) a.sort() b.sort() b.reverse() if verbose == True: print a print b print len_PDB print len_PISA print min(len_PDB,len_PISA) % math.modf(len_PISA/len_PDB)[0]*len_PDB print min(len_PDB,len_PISA) % math.modf(len_PDB/len_PISA)[0]*len_PISA stop_not_expected_or_v2_atom_names if verbose == True: print 'different chains', len(d_lines_PDB[bm]['ATOM']), len(d_lines_PISA[assembly]['ATOM']) continue ## identical chains (identical IDs, identical number of transformations) else: chains_identical = True ATOM_identical = self.identical_d_coordinates('ATOM',d_coordinates_PISA,d_coordinates_PDB,assembly,bm,verbose=verbose,) if 'HETATM' in d_coordinates_PISA[assembly].keys(): HETATM_identical = self.identical_d_coordinates('HETATM',d_coordinates_PISA,d_coordinates_PDB,assembly,bm,) else: HETATM_identical = True if verbose == True: print ATOM_identical, HETATM_identical ATOM_identical2 = self.identical_d_coordinates('ATOM',d_coordinates_PDB,d_coordinates_PISA,bm,assembly,) if ATOM_identical != ATOM_identical2: if verbose == True: print ATOM_identical, 
ATOM_identical2 stop if chains_identical == True and ATOM_identical == True and HETATM_identical == True: biounits_identical = True if verbose == True: print bm,assembly,'identical' break elif ATOM_identical == False: if verbose == True: print bm,assembly,'different interfaces' continue elif ATOM_identical == True and HETATM_identical == False: interfaces_identical = True ## differences between sets of lines are not representative of differences if coordinates differ by less than 0.0001nm a = list(set(d_lines_PISA[assembly]['HETATM'])-set(d_lines_PDB[bm]['HETATM'])) b = list(set(d_lines_PDB[bm]['HETATM'])-set(d_lines_PISA[assembly]['HETATM'])) a.sort() b.sort() if verbose == True: print a print b print bm,assembly,'different ligands' else: if verbose == True: print chains_identical, ATOM_identical, HETATM_identical stop_notexpected if biounits_identical == True: s = 'biounits identical' elif biounits_identical == False: if interfaces_identical == True: s = 'different ligand transformations' elif chains_identical == True and interfaces_identical == False: s = 'different peptide interfaces' elif chains_identical == False and interfaces_identical == False: s = 'different multimers' else: if verbose == True: print chains_identical, interfaces_identical, stop_not_expected return s
def main(self): '''Script for comparison of biological units from the PDB and from MSD-PISA''' ## self.rsync() ## self.gunzip() fd = open('pdbbind_2007.txt','r') s = fd.read() fd.close() l_pdbs = s.split() d_pdbs = {} for pdb in l_pdbs: if not pdb[1:3] in d_pdbs.keys(): d_pdbs[pdb[1:3]] = [pdb] else: d_pdbs[pdb[1:3]] += [pdb] l_subs = d_pdbs.keys() l_subs.sort() l_pdbs = [] for sub in l_subs: for pdb in d_pdbs[sub]: l_pdbs += [pdb] for pdb in l_pdbs: if l_pdbs.index(pdb)+1 < int(sys.argv[1]): continue print pdb, l_pdbs.index(pdb)+1, len(l_pdbs) if pdb in [ ## '1fiv','1hef','1heg','1e5a', ## ligand overlap upon transformation, two alternative ligand binding conformations?!' e.g. 1bm7!!! ## biounits should be identical, but different number of ligands in each PDB transformed biounit (water atoms often transformed incorrectly!!) ## '1osv','1oxn','1oxq','2pcp','1u9l','1igj','1jq9','1jq8','1lpk','1lpg','1lpz','2a4m', ## '1c1r', ## grey region '2a4m', ## ligand not in PDB '2a5b','2a5c','2a8g', ## some of the ligands not transformed in PDB '1a69', ## altloc used for alternative temperature factors (but identical coordinates!) 
'1a8i', ## v2 atom names in biounit ## ## multiple biounits ## ## '1fpu', ## identical multimer in PISA and PDB ## '1a4k','1a94','1pkx','1wc1','1fj4','1h1p','1cgl','1dqx','1eix','1fkn','1fl3','1h1s','1hsh','1i7z','2cht','1uw6','1jqy','1los','1lrh','1m4h','1a08','1is0','1mh5','1mjj','1njj','1p1q','1q4k','3tmk','1umw','1uv6','1uz8', ## tommy error ## '2dqt', ## dimer in PISA and PDB ## ## other PISA multimers and/or interfaces are stable in solution ## ## '1qhc','1fq5','1it6','1jn4','2jxr','1yei','1tuf','1e2k','1e2n','1oe7','1e2l','1nms','1afk','1fch','1fkf','1kc7','6std','7std','1fzj','1fzk','1slg','1sle','1jyq','1kyv','1o9d','1oe8','1os0','1p19','1qca','5std','1tyr','1ugp','1v48','1vfn','1vpo','1vwl','1aqc','1ghy','2izl', ## ## same size multimers, but different interfaces ## '1adl','1af2','2ans',##'1loq','1lyx','1kll','1trd','1w72','1b55','1oko','1lyb','3pck','3pcj', ## ## different multimers ## ## dimer in PISA, monomer in PDB '2ayr','1b11',##'1oar','1s39','1udt','1p6e','1caq','1rd4','1p9p','1q63','1qi0','1gpk','1lf9','1k1y','1k4g','1oim','1b8y','1gpn','1ow4','1h6h','1lee','1qy2','1q4w','1w3j','1c5s','1ciz','1d7j','1dy4','1f3e','1ghz','1imx','1j17','1jt1','1k4h','1kpm','1l2s','1lnm','1m13','1m48','1n2v','1njs','1nw7','1nw5','1oif','1p28','1q65','1q66','1q91','1qft','1qy1','1r5y','1s38','1sqn','1sw1','1uho','1uj6','1uj5','1uz1','1wm1','1xzx','5yas','1d7i', ## multimer in PISA, monomer in PDB ## '1b8o','1b8n','1j4r','1g7v','1lf2','830c','1jn2','1fv0','4tmk','5tmp','2usn', '1usn', ## correct multimer might be in "grey area" (deltaG_dissociation ~< 0) ## PQS interfaces might be different from PDB interaces '1atl',##'1b42','1bky','1bra','3mag','3mct', ## monomer in PISA, dimer in PDB ## '2csn','1iup','1gz9','1igb','1ii5','1p1n', ## monomer in PISA, dimer in PDB, dimer in PQS ## '1kdk','1lhw', ## monomer in PISA, dimer in PDB, hexamer in PQS ## '1f8d', ## monomer in PISA, tetramer in PDB, tetramer in PQS, tetramer in PISA upon removal of ligands ## 
'1bm7','1n51','2tmn','4tmn','5tmn','1wht', ## dimer in PISA, tetramer in PDB, tetramer in PQS '1a1b','1a1c','1a1e', ## dimer in PISA, tetramer in PDB ## '1m5w', ## dimer in PISA, octamer in PDB, octamer in PQS ## '1ftm', ## trimer in PISA, monomer in PDB, dimer in PQS ## '1awi', ## monomer/dimer in PISA, trimer in PDB, trimer in PQS '1a99', ## tetramer in PISA, dimer in PDB ## ## ligand positions ## ## identical multimers in PISA and PDB, but different ligand/ion/sugar position(s) '2aj8','2aoc','2aod','2aoe','2aog','1apv','1apw',##'2cgr','3gss','1elr','1elb','1hyo','1gvx','1gyx','1jet','2gss','1gyy','1gvu','1qkb','1ofz','1b9j','1jao','1jeu','1jev','10gs','11gs','1kui','1kuk','1kug','1obx','1ogx','1px4','1qka','1ur9','1e6s','1e6q','1e70', ## ACY not in PISA '2aac','2avm', ## MG positions '1af6', ## SO4 positions '2avo','2avs', ## GOL positions '2avq', ## CL positions '2avv', ## U1 positions '1b05','1b0h','1b2h','1b4h', ## IUM positions '1b1h','1b32','1b3f','1b3g','1b3h','1b3l','1b40','1b46','1b4z','1b51','1b52', ## '1h22','1h23', ## acetylcholine esterase ## acetylcholine esterase ## dimer ## 4 helix bundle interface ## 0.490nm between LYS530NZ and ASP365ODD2 ## 0.263nm between LYS530NZ and ASP369ODD2 ## hydrophobic core involving LEU366,LEU373,PHE527,LEU531 ## other AChE structures (1j06,1j07,1n5r,1n5m) have similar dimer interfaces with ligand in between interfaces ## PDB molecule of the month states it is a dimer, but no details about the dimer interface ]: continue ## ## ## d_transformations_PISA = biounit.biounit().parse_pisa_multimers(pdb) if d_transformations_PISA == {}: continue ## temporary!!! 
d_coordinates_PDB = {} d_lines_PDB = {} for bm in range(1,10000): if bm == 1 or os.path.isfile('/oxygenase_local/data/biounit/%s/%s.pdb%i' %(pdb[1:3],pdb,bm)): print bm fd = open('/oxygenase_local/data/biounit/%s/%s.pdb%i' %(pdb[1:3],pdb,bm),'r') lines = fd.readlines() fd.close() d_coordinates_PDB[bm], d_lines_PDB[bm] = self.parse_coordinates(lines) continue else: if bm > 3: print bm stop break l_biomolecules = d_coordinates_PDB.keys() l_assemblies = d_transformations_PISA.keys() for bm in l_biomolecules: biounits_identical = False chains_identical = False interfaces_identical = False l_chains_PDB = d_coordinates_PDB[bm]['ATOM'].keys() for assembly in l_assemblies: print bm, assembly l_chains_PISA = [] for chain in d_transformations_PISA[assembly]['chains'].keys(): if len(chain) == 1 and chain != '-': l_chains_PISA += [chain] ## print d_transformations_PISA[assembly]['chains'].keys() ## print d_chains_PDB['ATOM'][bm]+d_chains_PDB['HETATM'][bm] ## different chains (different IDs) if len(set(l_chains_PISA) ^ set(l_chains_PDB)) > 0: print 'different chains', l_chains_PDB, l_chains_PISA continue else: d_lines_PISA, d_coordinates_PISA = biounit.biounit().parse_pdb_coordinates(pdb, d_transformations_PISA, assembly) ## different chains (different number of transformations) if not ( len(d_lines_PISA[assembly]['ATOM']) % len(d_lines_PDB[bm]['ATOM']) == 0 and len(d_lines_PDB[bm]['ATOM']) % len(d_lines_PISA[assembly]['ATOM'])== 0 ): if ( len(d_lines_PISA[assembly]['ATOM']) % len(d_lines_PDB[bm]['ATOM']) != 0 and len(d_lines_PDB[bm]['ATOM']) % len(d_lines_PISA[assembly]['ATOM']) != 0 ): stop chains_identical = False print 'different chains', len(d_lines_PDB[bm]['ATOM']), len(d_lines_PISA[assembly]['ATOM']) continue ## identical chains (identical IDs, identical number of transformations) else: chains_identical = True ATOM_identical = self.identical_d_coordinates('ATOM',d_coordinates_PISA,d_coordinates_PDB,assembly,bm,) HETATM_identical = 
self.identical_d_coordinates('HETATM',d_coordinates_PISA,d_coordinates_PDB,assembly,bm,) print ATOM_identical, HETATM_identical if chains_identical == True and ATOM_identical == True and HETATM_identical == True: biounits_identical = True print bm,assembly,'identical' elif chains_identical == True and ATOM_identical == False and HETATM_identical == True: print bm,assembly,'different interfaces' continue elif chains_identical == True and ATOM_identical == True and HETATM_identical == False: interfaces_identical = True print bm,assembly,'different ligands' ## if ( ## len(set(d_lines_PISA[assembly]['ATOM'])^set(d_lines_PDB[bm]['ATOM'])) == 0 and ## len(set(d_lines_PISA[assembly]['HETATM'])^set(d_lines_PDB['HETATM'][bm])) == 0 ## ): ## biounits_identical = True ## print assembly,bm, 'identical' ## break ## elif len(set(d_lines_PISA[assembly]['ATOM'])^set(d_lines_PDB[bm]['ATOM'])) != 0: ## set_PISA_ATOM = set(d_lines_PISA[assembly]['ATOM'])-set(d_lines_PDB[bm]['ATOM']) ## set_PDB_ATOM = set(d_lines_PDB[bm]['ATOM'])-set(d_lines_PISA[assembly]['ATOM']) ## print len(set_PISA_ATOM), len(set_PDB_ATOM) ## print bm, assembly, 'different interfaces' ## if assembly == 4: ## a = list(set_PISA_ATOM) ## b = list(set_PDB_ATOM) ## a.sort() ## b.sort() ## print a[:10] ## print b[:10] ## stop ## continue ## else: ## interfaces_identical = True ## set_PISA_ATOM = set(d_lines_PISA[assembly]['ATOM'])-set(d_lines_PDB[bm]['ATOM']) ## set_PDB_ATOM = set(d_lines_PDB[bm]['ATOM'])-set(d_lines_PISA[assembly]['ATOM']) ## set_PISA_HETATM = set(d_lines_PISA[assembly]['HETATM'])-set(d_lines_PDB['HETATM'][bm]) ## set_PDB_HETATM = set(d_lines_PDB['HETATM'][bm])-set(d_lines_PISA[assembly]['HETATM']) ## a = list(set_PISA_HETATM) ## a.sort() ## b = list(set_PDB_HETATM) ## b.sort() ## print a ## print b ## print len(set_PISA_ATOM), len(set_PDB_ATOM) ## print len(set_PISA_HETATM), len(set_PDB_HETATM) ## print assembly, bm, 'different ligands' ## continue if biounits_identical == False: print pdb, 
l_pdbs.index(pdb)+1 if chains_identical == True and interfaces_identical == True: stop_ligand_differences elif chains_identical == True and interfaces_identical == False: stop_interfaces_different elif chains_identical == False and interfaces_identical == False: stop_different_multimers else: stop_not_expected elif biounits_identical == True: continue continue ## monomer in asu and biou if d_transformations_REMARK350 == {} and d_transformations_PISA == {}: continue ## asu == biou in PDB, biou == asu in PISA if d_transformations_REMARK350 == {}: for assembly in d_transformations_PISA.keys(): for chain in d_transformations_PISA[assembly]['chains'].keys(): for molecule in d_transformations_PISA[assembly]['chains'][chain].keys(): if d_transformations_PISA[assembly]['chains'][chain][molecule]['r'] != Numeric.array([[1.,0.,0.],[0.,1.,0.],[0.,0.,1.]]): stop1 if d_transformations_PISA[assembly]['chains'][chain][molecule]['t'] != Numeric.array([0.,0.,0.]): stop2 continue ## biou=asu in PISA, biou!=asu in PDB if d_transformations_PISA == {}: for biou in d_transformations_REMARK350.keys(): chains = d_transformations_REMARK350[biou]['chains'].keys() for chain in chains: matrixnos = d_transformations_REMARK350[biou]['chains'][chain] if len(matrixnos) != 1: stop2 matrix = d_transformations_REMARK350[biou]['matrices'][list(matrixnos)[0]] if matrix != [['1.000000', '0.000000', '0.000000', '0.00000'], ['0.000000', '1.000000', '0.000000', '0.00000'], ['0.000000', '0.000000', '1.000000', '0.00000']]: stop3 continue biounits = d_transformations_REMARK350.keys() ## multimer in PISA and PDB if len(biounits) != 1 and pdb not in [ ]: ## loop over biounits and replace [1] with [biounit] if this doesnt hold true!!! 
print d_transformations_PISA print d_transformations_REMARK350 print pdb, biounits stop_multimer_in_PISA_and_PDB ## print d_transformations_PISA ## print d_transformations_REMARK350 ## print pdb ## for assembly in d_transformations_PISA.keys(): ## size1 = 0 ## for chain_PISA in d_transformations_PISA[assembly]['chains'].keys(): ## if len(chain_PISA) == 1: ## chain = chain_PISA ## if chain == '-': ## continue ## else: ## chain = chain_PISA[chain_PISA.index(']')+1] ## if chain == '-': ## continue ## molecules = d_transformations_PISA[assembly]['chains'][chain_PISA].keys() ## size1 += len(molecules) ## print size1 ## break ## ## size2 = 0 ## for chain in d_transformations_REMARK350[1]['chains'].keys(): ## matrixnos = d_transformations_REMARK350[1]['chains'][chain] ## print chain, matrixnos ## size2 += len(matrixnos) ## print size2 ## ## if size2 > size1: ## print d_transformations_REMARK350 ## stop_maybe_water_has_chain_id d_transformations = {'chains':{},'matrices':{}} for assembly in d_transformations_PISA.keys(): matrices_identical = True chains_PISA = d_transformations_PISA[assembly]['chains'].keys() for chain_PISA in chains_PISA: ## parse PISA matrix molecules = d_transformations_PISA[assembly]['chains'][chain_PISA].keys() for molecule in molecules: r = d_transformations_PISA[assembly]['chains'][chain_PISA][molecule]['r'] t = d_transformations_PISA[assembly]['chains'][chain_PISA][molecule]['t'] ## convert PISA chain ID to default chain ID if len(chain_PISA) == 1: chain = chain_PISA if chain == '-': ## temporary!!! continue else: chain = chain_PISA[chain_PISA.index(']')+1] if chain == '-': ## temporary!!! 
continue ## compare PISA and REMARK350 matrices set_matrixnos = d_transformations_REMARK350[1]['chains'][chain] for matrixno in set_matrixnos: matrix_identical = True matrix_REMARK350 = d_transformations_REMARK350[1]['matrices'][matrixno] d_transformations['matrices'][matrixno] = matrix_REMARK350 for i in range(3): if ( round(float(matrix_REMARK350[i][0]),5) == round(r[i][0],5) and round(float(matrix_REMARK350[i][1]),5) == round(r[i][1],5) and round(float(matrix_REMARK350[i][2]),5) == round(r[i][2],5) and round(float(matrix_REMARK350[i][3]),5) == round(t[i],5) ): continue else: matrix_identical = False break ## continue loop over REMARK350 matrices if matrix_identical == False: continue else: if chain not in d_transformations['chains'].keys(): d_transformations['chains'][chain] = set([matrixno]) else: d_transformations['chains'][chain] |= set([matrixno]) if matrix_identical == False: stop_temporary matrix_identical = True break ## break loop over molecules if matrix_identical == False: for matrixno in set_matrixnos: print d_transformations_REMARK350[1]['matrices'][matrixno] print 'assembly', assembly print 'molecule', molecule print 'chain', chain_PISA print d_transformations_PISA[assembly]['chains'][chain_PISA] print float(matrix_REMARK350[i][0]) == round(r[i][0],6) print float(matrix_REMARK350[i][1]) == round(r[i][1],6) print float(matrix_REMARK350[i][2]) == round(r[i][2],6) print float(matrix_REMARK350[i][3]), round(t[i],6) if len(chain_PISA) == 1: stop_multimer_difference else: stop_different_ligand_locations matrices_identical = False break ## break loop over PISA chains if matrices_identical == False: break if chain != '-': if matrix_identical == False: print assembly, molecule, chain_PISA print matrices_identical stop1 ## continue loop over assemblies if matrices_identical == False: continue if matrix_identical == False: stop2 ## if matrix_identical == False: ## stop3 if d_transformations_PISA != {}: if matrices_identical == False: print 
d_transformations_REMARK350[1]['matrices'] print d_transformations_PISA[assembly]['chains'][chain_PISA] print assembly, molecule, chain_PISA print d_transformations_PISA.keys() stop4 if d_transformations_REMARK350[1] != d_transformations: ## if ( ## d_transformations_REMARK350[1]['matrices'].keys() != 1 and ## d_transformations_REMARK350[1]['matrices'][1] != [['1.000000', '0.000000', '0.000000', '0.00000'], ['0.000000', '1.000000', '0.000000', '0.00000'], ['0.000000', '0.000000', '1.000000', '0.00000']] ## ): ## print d_transformations_PISA print d_transformations_REMARK350[1] print d_transformations print pdb stop_PDB_larger_than_PISA return
## ---------------------------------------------------------------------------
## main(self)
##
## Purpose: for every PDB ID listed in 'pdbbind_2007.txt', compare the
## biological-unit transformations from the PDB file's REMARK 350 records with
## the transformations parsed from MSD-PISA.
##
## NOTE(review): an earlier "def main(self)" (reading 'pdbbind_2004.txt')
## appears above; if both are in the same scope, this later definition
## replaces the earlier one -- confirm which one is meant to run.
## NOTE(review): Python 2 syntax (print statements, in-place sort of the
## list returned by dict.keys()).
## NOTE(review): the statement nesting cannot be read off the current line
## layout; the remarks below follow statement order only.
##
## Inputs read:
##   sys.argv[1]          converted with int(); IDs whose 1-based position in
##                        the ordered ID list is smaller are skipped.
##   'pdbbind_2007.txt'   whitespace-separated PDB IDs.
##   '/oxygenase_local/data/pdb/<pdb[1:3]>/pdb<pdb>.ent'   one file per ID.
##   'm3.txt'             comma-separated IDs.
## Files appended to: 'm1.txt' or 'm3.txt' (the ID followed by a comma).
## Returns: nothing (bare "return" at the end).
##
## Statement order:
## 1. IDs are grouped by pdb[1:3], the group keys are sorted, and l_pdbs is
##    rebuilt in that order.
## 2. Per ID: print the ID, its 1-based position and the list length; the ID
##    '2bsu' is replaced by '2v2w'.
## 3. The long "if pdb in [...]: continue" exclusion list is commented out.
## 4. d_transformations_PISA is taken from
##    biounit.biounit().parse_pisa_multimers(pdb).
##    NOTE(review): that assignment directly follows "##" markers; verify it
##    is live code, because later statements read d_transformations_PISA.
## 5. The .ent file is scanned line by line:
##    - REMARK 350 lines with 'BIOMOLECULE:' in line[11:23] are passed to
##      self.parse_REMARK350_biomolecules;
##    - ATOM/HETATM lines put the chain ID (line[21]) into set_water when the
##      residue name (line[17:20]) is 'HOH', otherwise into set_nonwater.
##    After "set_water -= set_nonwater", set_water holds chains that contain
##    only water; those chains are deleted from
##    d_transformations_REMARK350[1]['chains'].
## 6. 'm3.txt' is read and split on ','; the entry count is printed before and
##    after de-duplication. The result is bound to l_pdbs, which is also the
##    name of the list being looped over. The bare name "stop" follows.
##    NOTE(review): "stop", "stop1", "stop2", "stop_temporary", ... are not
##    defined in the visible code; presumably they are deliberate NameError
##    tripwires that halt the script -- confirm.
## 7. If the REMARK 350 biomolecule keys are neither [] nor [1], the matrices
##    used by biomolecule 2 are classed as identity (m1) or non-identity (m2).
##    The ID is appended to 'm3.txt' when both kinds occur and to 'm1.txt'
##    when only identity matrices occur; the other two combinations reach
##    "stop". Both the if-branch and the else-branch end in "continue".
##    NOTE(review): with both branches continuing, the comparison code below
##    looks unreachable -- confirm against the intended nesting.
## 8. Remaining statements:
##    - both dictionaries empty: continue;
##    - only PISA has entries: every PISA rotation 'r' / translation 't' is
##      checked against the identity matrix / zero vector;
##    - only REMARK 350 has entries: every chain must reference exactly one
##      matrix and it must equal the identity matrix given as strings;
##    - otherwise, for each PISA assembly, chain and molecule, r and t are
##      compared row by row with the REMARK 350 matrices of biomolecule 1
##      after rounding to 5 decimals. PISA chain IDs longer than one
##      character are reduced to the character after ']', and chain '-' is
##      skipped. Matching matrix numbers are collected per chain in
##      d_transformations, which is finally compared with
##      d_transformations_REMARK350[1].
## ---------------------------------------------------------------------------
def main(self): '''Script for comparison of biological units from the PDB and from MSD-PISA''' fd = open('pdbbind_2007.txt','r') s = fd.read() fd.close() l_pdbs = s.split() d_pdbs = {} for pdb in l_pdbs: if not pdb[1:3] in d_pdbs.keys(): d_pdbs[pdb[1:3]] = [pdb] else: d_pdbs[pdb[1:3]] += [pdb] l_subs = d_pdbs.keys() l_subs.sort() l_pdbs = [] for sub in l_subs: for pdb in d_pdbs[sub]: l_pdbs += [pdb] for pdb in l_pdbs: if l_pdbs.index(pdb)+1 < int(sys.argv[1]): continue print pdb, l_pdbs.index(pdb)+1, len(l_pdbs) if pdb == '2bsu': pdb = '2v2w' ## if pdb in [ ## '1fiv','1hef','1heg','1e5a', ## ligand overlap upon transformation, two alternative ligand binding conformations?!' e.g. 1bm7!!! ## '1osv','1oxn','1oxq','2pcp','1u9l','1igj','1jq9','1jq8','1lpk','1lpg','1lpz', ## biounits should be identical, but different number of ligands in each PDB transformed biounit (water atoms often transformed incorrectly!!) ## '1c1r', ## grey region ## ## ## ## ## multiple biounits ## ## #### '1fpu', ## ## identical multimer in PISA and PDB ## '1a4k','1a94','1pkx','1wc1','1fj4','1h1p','1cgl','1dqx','1eix','1fkn','1fl3','1h1s','1hsh','1i7z','2cht','1uw6','1jqy','1los','1lrh','1m4h','1a08','1b3l','1is0','1mh5','1mjj','1njj','1p1q','1q4k','3tmk','1umw','1uv6','1uz8', ## ## tommy error #### '2dqt', ## dimer in PISA and PDB ## ## ## ## ## other PISA multimers and/or interfaces are stable in solution ## ## ## '1qhc','1fq5','1it6','1jn4','2jxr','1yei','1tuf','1e2k','1e2n','1oe7','1e2l','1nms','1afk','1fch','1fkf','1kc7','6std','7std','1fzj','1fzk','1slg','1sle','1jyq','1kyv','1o9d','1oe8','1os0','1p19','1qca','5std','1tyr','1ugp','1v48','1vfn','1vpo','1vwl','1aqc','1ghy','2izl', ## ## ## ## ## same size multimers, but different interfaces ## ## ## '1loq','1lyx','1adl','2ans','1kll','1trd','1w72','1b55','1oko','1lyb','3pck','3pcj', ## ## ## ## ## different multimers ## ## ## ## ## dimer in PISA, monomer in PDB ## 
'1oar','1s39','1udt','1p6e','1caq','1rd4','1p9p','1q63','1qi0','1gpk','1lf9','1k1y','1k4g','1oim','1b8y','1gpn','1ow4','1h6h','1lee','1qy2','1q4w','1w3j','1c5s','1ciz','1d7j','1dy4','1f3e','1ghz','1imx','1j17','1jt1','1k4h','1kpm','1l2s','1lnm','1m13','1m48','1n2v','1njs','1nw7','1nw5','1oif','1p28','1q65','1q66','1q91','1qft','1qy1','1r5y','1s38','1sqn','1sw1','1uho','1uj6','1uj5','1uz1','1wm1','1xzx','5yas','1d7i', ## ## multimer in PISA, monomer in PDB ## '1b8o','1b8n','1j4r','1g7v','1lf2','830c','1jn2','1fv0','4tmk','5tmp','2usn', '1usn', ## ## ## correct multimer might be in "grey area" (deltaG_dissociation ~< 0) ## ## PQS interfaces might be different from PDB interaces ## '1b42','1bky','1bra','3mag','3mct', ## monomer in PISA, dimer in PDB ## '2csn','1iup','1gz9','1igb','1ii5','1p1n', ## monomer in PISA, dimer in PDB, dimer in PQS ## '1kdk','1lhw', ## monomer in PISA, dimer in PDB, hexamer in PQS ## '1f8d', ## monomer in PISA, tetramer in PDB, tetramer in PQS, tetramer in PISA upon removal of ligands ## '1bm7','1n51','2tmn','4tmn','5tmn','1wht', ## dimer in PISA, tetramer in PDB, tetramer in PQS ## '1m5w', ## dimer in PISA, octamer in PDB, octamer in PQS ## '1ftm', ## trimer in PISA, monomer in PDB, dimer in PQS ## '1awi', ## monomer/dimer in PISA, trimer in PDB, trimer in PQS ## '1a99', ## tetramer in PISA, dimer in PDB ## ## ## ## ## ## ligand positions ## ## ## ## identical multimers in PISA and PDB, but different ligand/ion/sugar position(s) ## '2cgr','3gss','1elr','1elb','1hyo','1gvx','1gyx','1jet','2gss','1gyy','1gvu','1qkb','1ofz','1b9j','1jao','1jeu','1jev','1af6','10gs','11gs','1kui','1kuk','1kug','1obx','1ogx','1px4','1qka','1ur9','1e6s','1e6q','1e70', ## ## '1h22','1h23', ## acetylcholine esterase ## ## acetylcholine esterase ## ## dimer ## ## 4 helix bundle interface ## ## 0.490nm between LYS530NZ and ASP365ODD2 ## ## 0.263nm between LYS530NZ and ASP369ODD2 ## ## hydrophobic core involving LEU366,LEU373,PHE527,LEU531 ## ## other AChE structures 
(1j06,1j07,1n5r,1n5m) have similar dimer interfaces with ligand in between interfaces ## ## PDB molecule of the month states it is a dimer, but no details about the dimer interface ## ]: ## continue ## ## ## d_transformations_PISA, d_chains_PISA = biounit.biounit().parse_pisa_multimers(pdb) ## ## ## d_transformations_REMARK350 = {} set_water = set() set_nonwater = set() fd = open('/oxygenase_local/data/pdb/%s/pdb%s.ent' %(pdb[1:3],pdb),'r') lines = fd.readlines() fd.close() for i in range(len(lines)): line = lines[i] record = line[:6].strip() if record == 'REMARK': remark = int(line[7:10]) if remark == 350: if line[11:23] == 'BIOMOLECULE:': d_transformations_REMARK350 = self.parse_REMARK350_biomolecules(d_transformations_REMARK350, lines, i) elif record in ['ATOM','HETATM',]: res_name = line[17:20] chain = line[21] if res_name == 'HOH': set_water |= set([chain]) else: set_nonwater |= set([chain]) set_water -= set_nonwater ## remove water transformations if d_transformations_REMARK350 != {}: for chain in d_transformations_REMARK350[1]['chains'].keys(): if chain in set_water: del d_transformations_REMARK350[1]['chains'][chain] fd = open('m3.txt') s = fd.read() fd.close() l_pdbs = s.split(',') print len(l_pdbs) l_pdbs = list(set(l_pdbs)) print len(l_pdbs) stop if d_transformations_REMARK350.keys() not in [[],[1],]: print d_transformations_REMARK350.keys() set_matrices = set() for chain in d_transformations_REMARK350[2]['chains'].keys(): set_matrices |= d_transformations_REMARK350[2]['chains'][chain] m1 = False m2 = False for matrix_no in set_matrices: matrix = d_transformations_REMARK350[2]['matrices'][matrix_no] if matrix == [['1.000000', '0.000000', '0.000000', '0.00000'], ['0.000000', '1.000000', '0.000000', '0.00000'], ['0.000000', '0.000000', '1.000000', '0.00000']]: m1 = True else: m2 = True if m1 == True and m2 == True: s = 'm3.txt' elif m1 == True and m2 == False: s = 'm1.txt' elif m1 == False and m2 == True: stop else: stop fd = open(s,'a') fd.write('%s,' 
%(pdb)) fd.close() continue else: continue ## monomer in asu and biou if d_transformations_REMARK350 == {} and d_transformations_PISA == {}: continue ## asu == biou in PDB, biou == asu in PISA if d_transformations_REMARK350 == {}: for assembly in d_transformations_PISA.keys(): for chain in d_transformations_PISA[assembly]['chains'].keys(): for molecule in d_transformations_PISA[assembly]['chains'][chain].keys(): if d_transformations_PISA[assembly]['chains'][chain][molecule]['r'] != Numeric.array([[1.,0.,0.],[0.,1.,0.],[0.,0.,1.]]): stop1 if d_transformations_PISA[assembly]['chains'][chain][molecule]['t'] != Numeric.array([0.,0.,0.]): stop2 continue ## biou=asu in PISA, biou!=asu in PDB if d_transformations_PISA == {}: for biou in d_transformations_REMARK350.keys(): chains = d_transformations_REMARK350[biou]['chains'].keys() for chain in chains: matrixnos = d_transformations_REMARK350[biou]['chains'][chain] if len(matrixnos) != 1: stop2 matrix = d_transformations_REMARK350[biou]['matrices'][list(matrixnos)[0]] if matrix != [['1.000000', '0.000000', '0.000000', '0.00000'], ['0.000000', '1.000000', '0.000000', '0.00000'], ['0.000000', '0.000000', '1.000000', '0.00000']]: stop3 continue biounits = d_transformations_REMARK350.keys() ## multimer in PISA and PDB if len(biounits) != 1 and pdb not in [ ]: ## loop over biounits and replace [1] with [biounit] if this doesnt hold true!!! 
print d_transformations_PISA print d_transformations_REMARK350 print pdb, biounits stop_multimer_in_PISA_and_PDB ## print d_transformations_PISA ## print d_transformations_REMARK350 ## print pdb ## for assembly in d_transformations_PISA.keys(): ## size1 = 0 ## for chain_PISA in d_transformations_PISA[assembly]['chains'].keys(): ## if len(chain_PISA) == 1: ## chain = chain_PISA ## if chain == '-': ## continue ## else: ## chain = chain_PISA[chain_PISA.index(']')+1] ## if chain == '-': ## continue ## molecules = d_transformations_PISA[assembly]['chains'][chain_PISA].keys() ## size1 += len(molecules) ## print size1 ## break ## ## size2 = 0 ## for chain in d_transformations_REMARK350[1]['chains'].keys(): ## matrixnos = d_transformations_REMARK350[1]['chains'][chain] ## print chain, matrixnos ## size2 += len(matrixnos) ## print size2 ## ## if size2 > size1: ## print d_transformations_REMARK350 ## stop_maybe_water_has_chain_id d_transformations = {'chains':{},'matrices':{}} for assembly in d_transformations_PISA.keys(): matrices_identical = True chains_PISA = d_transformations_PISA[assembly]['chains'].keys() for chain_PISA in chains_PISA: ## parse PISA matrix molecules = d_transformations_PISA[assembly]['chains'][chain_PISA].keys() for molecule in molecules: r = d_transformations_PISA[assembly]['chains'][chain_PISA][molecule]['r'] t = d_transformations_PISA[assembly]['chains'][chain_PISA][molecule]['t'] ## convert PISA chain ID to default chain ID if len(chain_PISA) == 1: chain = chain_PISA if chain == '-': ## temporary!!! continue else: chain = chain_PISA[chain_PISA.index(']')+1] if chain == '-': ## temporary!!! 
continue ## compare PISA and REMARK350 matrices set_matrixnos = d_transformations_REMARK350[1]['chains'][chain] for matrixno in set_matrixnos: matrix_identical = True matrix_REMARK350 = d_transformations_REMARK350[1]['matrices'][matrixno] d_transformations['matrices'][matrixno] = matrix_REMARK350 for i in range(3): if ( round(float(matrix_REMARK350[i][0]),5) == round(r[i][0],5) and round(float(matrix_REMARK350[i][1]),5) == round(r[i][1],5) and round(float(matrix_REMARK350[i][2]),5) == round(r[i][2],5) and round(float(matrix_REMARK350[i][3]),5) == round(t[i],5) ): continue else: matrix_identical = False break ## continue loop over REMARK350 matrices if matrix_identical == False: continue else: if chain not in d_transformations['chains'].keys(): d_transformations['chains'][chain] = set([matrixno]) else: d_transformations['chains'][chain] |= set([matrixno]) if matrix_identical == False: stop_temporary matrix_identical = True break ## break loop over molecules if matrix_identical == False: for matrixno in set_matrixnos: print d_transformations_REMARK350[1]['matrices'][matrixno] print 'assembly', assembly print 'molecule', molecule print 'chain', chain_PISA print d_transformations_PISA[assembly]['chains'][chain_PISA] print float(matrix_REMARK350[i][0]) == round(r[i][0],6) print float(matrix_REMARK350[i][1]) == round(r[i][1],6) print float(matrix_REMARK350[i][2]) == round(r[i][2],6) print float(matrix_REMARK350[i][3]), round(t[i],6) if len(chain_PISA) == 1: stop_multimer_difference else: stop_different_ligand_locations matrices_identical = False break ## break loop over PISA chains if matrices_identical == False: break if chain != '-': if matrix_identical == False: print assembly, molecule, chain_PISA print matrices_identical stop1 ## continue loop over assemblies if matrices_identical == False: continue if matrix_identical == False: stop2 ## if matrix_identical == False: ## stop3 if d_transformations_PISA != {}: if matrices_identical == False: print 
d_transformations_REMARK350[1]['matrices'] print d_transformations_PISA[assembly]['chains'][chain_PISA] print assembly, molecule, chain_PISA print d_transformations_PISA.keys() stop4 if d_transformations_REMARK350[1] != d_transformations: ## if ( ## d_transformations_REMARK350[1]['matrices'].keys() != 1 and ## d_transformations_REMARK350[1]['matrices'][1] != [['1.000000', '0.000000', '0.000000', '0.00000'], ['0.000000', '1.000000', '0.000000', '0.00000'], ['0.000000', '0.000000', '1.000000', '0.00000']] ## ): ## print d_transformations_PISA print d_transformations_REMARK350[1] print d_transformations print pdb stop_PDB_larger_than_PISA return