def pdbidx1( pdbname ):
    """Write the output of ``pdb_util.pdb_idx1`` for *pdbname* to a new file.

    The output file name is obtained from
    ``pdb_util.get_uniq_outpdbname( pdbname, "_idx1" )``.

    Args:
        pdbname: path of the input pdb file.

    Returns:
        None. The result is written to disk.

    Raises:
        IOError/OSError: if *pdbname* cannot be opened for reading or the
            output file cannot be written.
    """
    # Fail fast when the input is not readable, before any output name is
    # reserved; the handle itself is not needed because pdb_util.pdb_idx1
    # is given the path (presumably it opens the file on its own -- confirm).
    with open( pdbname, "r" ):
        pass

    outlines = pdb_util.pdb_idx1( pdbname )
    outfn = pdb_util.get_uniq_outpdbname( pdbname, "_idx1" )

    # Context manager guarantees the output handle is closed even if
    # write() raises.
    with open( outfn, "w" ) as buf:
        buf.write( outlines )
def correct_alignment_using_pdb( alignment, sbj_pdbfile, remove_temppdb=True ):
    """Fix an alignment error caused by the gap-opening penalty in dynamic programming.

    Example::

        123456789
        NKTTTTTKG   <- ref_seq_aligned
        NK------G   <- mistaken sbj_seq_aligned (doesn't like a gap)
        N------KG   <- correct sbj_seq_aligned (from the pdb, KG are connected)
        1      23

        seq_map = { 1:1,
                    2:2,   <- should be 2:8
                    3:9 }

    For every pair of consecutive residues of the subject pdb, the function
    checks whether the alignment places a gap between them.  If it does, but
    the CA-CA distance says the residues are physically connected, the first
    residue of the pair is moved next to the second one in the reference
    numbering.

    Args:
        alignment: indexable whose item 0 is the aligned reference sequence
            and item 1 the aligned subject sequence.
        sbj_pdbfile: path of the subject pdb file (must hold a single chain).
        remove_temppdb: when true, the intermediate "temp.pdb" written to the
            current working directory is deleted after it has been parsed.

    Returns:
        Tuple ``(ref_seq_aligned, corrected_sbj_seq_aligned)``.

    Raises:
        AssertionError: if the pdb holds more than one chain, or if the
            residue that would be re-assigned does not have the same residue
            name at the target reference position.
        KeyError: if a residue number of the pdb is missing from the
            sequence map derived from *alignment*.
    """
    # Only the reference row is used below; the subject row is unpacked as
    # well so that a too-short `alignment` still fails right here.
    ref_seq_aligned, _sbj_seq_aligned = ( alignment[0], alignment[1] )

    # to have a standard, number residues from 1 to the end, continuously
    temp_pdb = "temp.pdb"
    pdb_util.pdb_idx1( sbj_pdbfile, temp_pdb )
    try:
        xyz_dict, pdbline_dict, _resname_dict = pdb_util.create_xyzDict_bychain( temp_pdb )
    finally:
        # Clean up the temporary file even when parsing fails.
        if remove_temppdb:
            os.remove( temp_pdb )

    # list(...) keeps this working on Python 3, where keys() is a view that
    # cannot be indexed.
    chains = list( pdbline_dict.keys() )
    assert len( chains ) == 1, "this script does not deal with pdbs containing multiple chains (%s)" % chains
    chain = chains[0]
    xyz_dict = xyz_dict[ chain ]
    sbj_pdb_idx1_res_nums = sorted( pdbline_dict[chain].keys() )

    seq_map = seq_mapping( alignment )
    # NOTE: same object as seq_map (not a copy).  This is safe because a
    # corrected entry is only written for `rsn` after it has been read, and
    # later iterations never read that key again.
    corrected_seq_map = seq_map

    # Walk over consecutive residue pairs.  The last residue has no successor
    # and therefore nothing to check, so it is never the first of a pair.
    for rsn, next_rsn in zip( sbj_pdb_idx1_res_nums, sbj_pdb_idx1_res_nums[1:] ):
        # newrsn means the rsn you would like to number from the reference (ref_seq_aligned)
        newrsn = seq_map[rsn]
        try:
            next_newrsn = seq_map[next_rsn]
        except KeyError:
            sys.stderr.write("ERROR: couldn't find the next rsn in the old\n")
            raise

        # detect chain break from alignment
        if ( next_newrsn - newrsn ) <= 1:
            continue  # no gap between the two residues in the alignment

        # there's a gap; measure the distance using the old (pdb) numbering
        dist = pdb_util.cal_dist( xyz_dict[rsn]["CA"], xyz_dict[next_rsn]["CA"] )
        sys.stderr.write("Chainbreak (from alignment) at %s(%s)-%s(%s) with dist %.3f\n" %(rsn, newrsn, next_rsn, next_newrsn, dist))

        # A CA-CA distance above 4.5 is taken as a genuine physical chain
        # break, so the alignment gap is legitimate.
        if dist > 4.5:
            continue

        # No physical chain break detected, so this is possibly an error of
        # the dynamic programming.  Try fixing it by looking for the same
        # residue name at next_newrsn-1; if found, renumber rsn to it.
        sys.stderr.write("WARNING: no gap physically detected from %s-%s; caught an error in dynamic programming...\n" %(newrsn, next_newrsn))

        # index in seq, thus -1
        # just to make sure this error is caused by the dynamic programming (must be same residue name)
        assert ( ref_seq_aligned[newrsn-1] == ref_seq_aligned[next_newrsn-2] ), "ERROR: failing to looking for same residue name as pos:%s in pos:%s of fasta file could not fix the error in dynamic programming\n%s\n%s\n%s\nIs this a partial thread?" %( newrsn, next_newrsn-1, alignment[0], alignment[1], sbj_pdbfile )

        sys.stderr.write("Correcting an alignment problem caused by DP: %s(%s) -> %s(%s)\n"%( ref_seq_aligned[newrsn-1], newrsn, ref_seq_aligned[next_newrsn-2], next_newrsn-1 ) )
        corrected_seq_map[rsn] = next_newrsn-1

    # make new sbj_seq_aligned
    #
    # make dict[ ref_numbering ] = old_numbering; was dict[ old_numbering ] = ref_numbering
    #   seq_map        = { 1:1, 2:8, 3:9 }
    # thus the keys will be residues that are aligned (in reference numbering)
    #   revert_seq_map = { 1:1, 8:2, 9:3 }
    new_dict = python_util.invert_dict( corrected_seq_map )
    # A set gives constant-time membership tests in the loop below.
    res_numbers_aligned = set( new_dict.keys() )

    # Reference positions are 1-based; aligned positions keep the reference
    # residue letter, all others become a gap.
    corrected_sbj_seq_aligned = "".join(
        rsd if rsn in res_numbers_aligned else "-"
        for rsn, rsd in enumerate( ref_seq_aligned, 1 )
    )

    return (ref_seq_aligned, corrected_sbj_seq_aligned)