def match_numbering(ref_pdb, renum_pdb):
    """Renumber renum_pdb so that its residue numbers match those of ref_pdb.

    The two pdbs must be perfectly aligned; the only difference allowed is an
    offset in residue numbering, i.e. the sequence of renum_pdb has to be
    found inside the sequence of ref_pdb and run to its end.

    This is somewhat redundant with
        renumber_pdb(full_length_fasta_fn, truncated_pdb_fn, alignments=None)
    The clean design would split the work into a sequence-mapping function
    (returning the index mapping dictionary) and a renumbering function.
    This quick and dirty version exists so that the working renumber_pdb()
    did not have to be touched.

    Args:
        ref_pdb: pdb whose numbering is used as the reference (handed to
            seq_util.pdb2fasta and idx_to_pdbnum).
        renum_pdb: pdb to renumber; must contain exactly one chain.

    Returns:
        Name of the written pdb file, as produced by
        get_uniq_outpdbname(renum_pdb, "_matched").

    Raises:
        ValueError: the sequence of renum_pdb is not a substring of the
            sequence of ref_pdb (raised by str.index).
        AssertionError: the sequences do not line up as described above, or
            renum_pdb contains more than one chain.
    """
    ref_fasta = seq_util.pdb2fasta(ref_pdb)
    renum_fasta = seq_util.pdb2fasta(renum_pdb)

    # Look the offset up only once.  Identical sequences short-circuit to 0,
    # exactly like the `!=` test in front of the lookup used to do.
    offset = 0 if ref_fasta == renum_fasta else ref_fasta.index(renum_fasta)

    if offset:
        idx2refpdbnum = idx_to_pdbnum(ref_pdb, offset)
        pdb_as_ref = ref_fasta
        pdb_to_renum = offset * "-" + renum_fasta
        assert len(pdb_as_ref) == len(pdb_to_renum)
        print("pdb_as_ref: %s" % pdb_as_ref)
        print("pdb_to_renum: %s" % pdb_to_renum)
    else:
        # offset 0 is only acceptable when both sequences are identical
        assert ref_fasta == renum_fasta
        idx2refpdbnum = idx_to_pdbnum(ref_pdb)

    # second item returned by create_xyzDict_bychain: pdb lines per chain
    pdbline_dict = create_xyzDict_bychain(renum_pdb)[1]
    chains = list(pdbline_dict.keys())
    assert len(chains) == 1, "this script does not deal with pdbs containing multiple chains"
    pdbline_dict = pdbline_dict[chains[0]]
    res_nums = sorted(pdbline_dict.keys())
    print(res_nums)

    renumbered = []
    for idx, rsn in enumerate(res_nums):
        newrsn = idx2refpdbnum[idx]
        # [:-1], because the last item in the list is ''
        for line in pdbline_dict[rsn].split("\n")[:-1]:
            # overwrite columns 23-26 of the record with the new number
            renumbered.append(line[0:22] + "%4s" % newrsn + line[26:] + "\n")

    output_fn = get_uniq_outpdbname(renum_pdb, "_matched")
    # the context manager closes the file even when a write fails
    with open(output_fn, "w") as output_buff:
        output_buff.write("REMARK numberings are matched to %s\n" % ref_pdb)
        output_buff.write("".join(renumbered))
        output_buff.write("TER\n")
    return output_fn
def align_and_renumber_pdb(fulllength_fasta, truncated_pdbfile, ignore_check=False):
    """Align a truncated pdb to its full-length sequence and renumber it.

    This function is going to make renumber_pdb() obsolete.

    Args:
        fulllength_fasta: fasta file with the full-length sequence (read with
            seq_util.fasta_file_reader).
        truncated_pdbfile: pdb file to renumber; must contain a single chain.
        ignore_check: when true, the raw alignment is used as is and the pdb
            is only re-indexed into "temp.pdb" via pdb_idx1(); otherwise the
            alignment is passed through
            alignment_util.correct_alignment_using_pdb().

    Returns:
        The renumbered pdb as one string: two REMARK lines holding the
        alignment, the renumbered records and a closing "TER" line.

    Raises:
        AssertionError: the pdb contains more than one chain.
    """
    # make alignment
    fl_seq = seq_util.fasta_file_reader(fulllength_fasta)
    tc_seq = seq_util.pdb2fasta(truncated_pdbfile)
    alignment = alignment_util.align_two_seqs(fl_seq, tc_seq)

    try:
        if ignore_check:
            # needed for the following step; according to the original
            # author's note the other branch runs the same re-indexing inside
            # alignment_util.correct_alignment_using_pdb -- TODO confirm that
            # it leaves "temp.pdb" behind
            pdb_idx1(truncated_pdbfile, "temp.pdb")
        else:
            alignment = alignment_util.correct_alignment_using_pdb(alignment, truncated_pdbfile, False)

        seq_map = alignment_util.seq_mapping(alignment)

        # only the per-chain pdb lines (second item) are needed here
        pdbline_dict = create_xyzDict_bychain("temp.pdb")[1]
        assert len(pdbline_dict.keys()) == 1, (
            "this script does not deal with pdbs containing multiple chains (%s)" % pdbline_dict.keys()
        )
        chain = list(pdbline_dict.keys())[0]
        pdbline_dict = pdbline_dict[chain]

        pieces = [
            "REMARK full_length_aln %s\n" % alignment[0],
            "REMARK truncated_aln %s\n" % alignment[1],
        ]
        for rsn in sorted(pdbline_dict.keys()):
            newrsn = seq_map[rsn]
            # [:-1], because the last item in the list is ''
            for line in pdbline_dict[rsn].split("\n")[:-1]:
                # overwrite columns 23-26 of the record with the new number
                pieces.append(line[0:22] + "%4s" % newrsn + line[26:] + "\n")
        pieces.append("TER\n")
    finally:
        # never leave the scratch file behind, not even when a step above
        # raised; the existence test keeps the cleanup from masking that error
        if os.path.exists("temp.pdb"):
            os.remove("temp.pdb")

    return "".join(pieces)
def renumber_pdb(full_length_fasta_fn, truncated_pdb_fn, alignments=None):
    """Renumber the pdb based on its alignment to the full-length sequence.

    future plan:
        separate the renumber function into two functions
        1. make a mapping function to return a mapping_Dict based on the
           alignments
        2. renumber pdb based on the mapping_Dict

    140914: fixed a bug caused by an error introduced by dynamic programming,
    where a residue name at the beginning of a gap is the same as the one at
    the end of the gap
        KTTTTTKG <- query sequence
        K------G <- mistaken alignment
        ------KG <- correct alignment

    Args:
        full_length_fasta_fn: fasta file of the full-length sequence; only
            read when no alignment is supplied.
        truncated_pdb_fn: pdb file to renumber; must contain a single chain.
        alignments: optional alignment given as two strings
            ["query_aln", "template_aln"], i.e. the full-length sequence
            (without gaps) and the truncated sequence aligned to it.  When
            omitted, Biopython's pairwise2.align.globalms computes one.

    Returns:
        Name of the written file: basename of truncated_pdb_fn without its
        last four characters plus "_renum.pdb".

    Raises:
        AssertionError: the alignment strings differ in length, the
            full-length alignment contains gaps, the renumberPDBs.pl script
            is missing, the pdb has several chains, or a suspected
            dynamic-programming error could not be repaired.
    """
    from sys import stderr
    import os

    full_length_seq = None
    if not alignments:
        # get sequences
        import seq_util

        truncated_seq = seq_util.pdb2fasta(truncated_pdb_fn)
        full_length_seq = seq_util.fasta_file_reader(full_length_fasta_fn)
        if not truncated_seq.strip() or not full_length_seq.strip():
            print("ERROR:")
            exit()

        # get alignment from biopython
        #   match: 5, mismatch: -5
        #   gap_open_penalty: -15, gap_extend_penalty: -0.5
        from Bio import pairwise2

        align_results = pairwise2.align.globalms(
            full_length_seq, truncated_seq, 5, -5, -15, -0.5, penalize_end_gaps=False
        )
        if not align_results:
            print(full_length_seq)
            print("> %s" % (truncated_seq,))
            exit()
        alignments = align_results[0]
    else:
        print("alignments are given by the argument")

    # create a map, based on the alignments
    assert alignments
    full_length_aln = alignments[0]
    truncated_aln = alignments[1]
    print("full_length_aln  %s" % (full_length_aln,))
    print("truncated_aln  %s" % (truncated_aln,))
    print("")  # blank separator line

    assert len(full_length_aln) == len(truncated_aln)
    assert "-" not in full_length_aln, "this is a really dumb script, doesn't renum pdbs with aa unmatched"

    if full_length_seq is None:
        # A caller-supplied alignment never went through the fasta reader.
        # The gap-free full-length alignment string is the sequence itself
        # and is needed further down to repair alignment errors.
        full_length_seq = full_length_aln

    # sequence mapping: 1-based index of a residue within the truncated
    # sequence -> 1-based position in the full-length sequence.  The
    # full-length alignment has no gaps, so the position is the column + 1.
    truncated_cols = truncated_aln.strip()
    map_Dict = {}
    index = 1
    for str_idx in range(len(full_length_aln)):
        if truncated_cols[str_idx] == "-":
            continue
        map_Dict[index] = str_idx + 1
        index += 1

    # renumber the pdb based on the alignment mapping
    # because the mapping assumes the truncated_pdb_fn starts with residue
    # number 1, we need to renumber the input pdb to start at one
    # should probably get rid of this dependency in the future
    temp_file = basename(truncated_pdb_fn.split(".pdb")[0]) + ".temp.pdb"
    renumberPDBs_script = "/net/em-stor4/Volumes/data/wangyr/scripts/pdb_utils/renumberPDBs.pl"
    assert os.path.exists(renumberPDBs_script), "Error: %s doesn't exist" % renumberPDBs_script

    try:
        os.system("%s -pdbfile %s -res1 1 > %s" % (renumberPDBs_script, truncated_pdb_fn, temp_file))

        xyz_dict, pdbline_dict, resname_dict = create_xyzDict_bychain(temp_file)
        assert len(pdbline_dict.keys()) == 1, (
            "this script does not deal with pdbs containing multiple chains (%s)" % pdbline_dict.keys()
        )
        chain = list(pdbline_dict.keys())[0]
        xyz_dict = xyz_dict[chain]
        pdbline_dict = pdbline_dict[chain]
        res_nums = sorted(pdbline_dict.keys())

        renumbered = []
        for idx, rsn in enumerate(res_nums):
            # the last residue has no successor and is compared with itself
            next_rsn = res_nums[idx + 1] if idx + 1 < len(res_nums) else rsn

            newrsn = map_Dict[rsn]
            next_newrsn = map_Dict[next_rsn]

            # detect chain break from alignment
            if (next_newrsn - newrsn) > 1:
                dist = cal_dist(xyz_dict[next_rsn]["CA"], xyz_dict[rsn]["CA"])  # dist from old numbering
                stderr.write("chainbreak (from alignment) at %s-%s with dist %.3f\n" % (newrsn, next_newrsn, dist))

                # if no physical chainbreak is detected, this could be an
                # error in dynamic programming; try fixing it by looking for
                # the same residue name at next_newrsn-1 and, if found, write
                # the current residue with that number instead
                if dist <= 4.5:
                    stderr.write(
                        "WARNING: no gap physically detected from %s-%s; caught an error in dynamic programming; correcting now...\n"
                        % (newrsn, next_newrsn)
                    )
                    # index in seq, thus -1
                    assert full_length_seq[newrsn - 1] == full_length_seq[next_newrsn - 2], (
                        "ERROR: failing to looking for same residue name as pos:%s in pos:%s of fasta file could not fix the error in dynamic programming\n"
                        % (newrsn, next_newrsn - 1)
                    )
                    newrsn = next_newrsn - 1

            # [:-1], because the last item in the list is ''
            for line in pdbline_dict[rsn].split("\n")[:-1]:
                # overwrite columns 23-26 of the record with the new number
                renumbered.append(line[0:22] + "%4s" % newrsn + line[26:] + "\n")

        output_fn = basename(truncated_pdb_fn[:-4]) + "_renum.pdb"
        with open(output_fn, "w") as output_buff:
            output_buff.write("REMARK full_length_aln %s\n" % full_length_aln)
            output_buff.write("REMARK truncated_aln %s\n" % truncated_aln)
            output_buff.write("".join(renumbered))
            output_buff.write("TER\n")
    finally:
        # drop the scratch pdb even when one of the steps above failed
        if os.path.exists(temp_file):
            os.remove(temp_file)

    return output_fn
trimmed_fasta = parser.add_mutually_exclusive_group() trimmed_fasta.add_argument("--truncated_pdb") trimmed_fasta.add_argument("--truncated_fasta") parser.add_argument("--outfile_tag", default="trimmed", help="") parser.add_argument("--debug", action="store_true", help="") opts = parser.parse_args() # read into fragment as frag[pos] = fragments for fragfile in opts.fragfiles: frag_dict = frag_util.read_fragfile(fragfile) frag_len = frag_util.get_fraglen(frag_dict) fl_seq = seq_util.fasta_file_reader(opts.fragfile_fasta) if opts.truncated_pdb: tc_seq = seq_util.pdb2fasta(opts.truncated_pdb) alignment = alignment_util.correct_alignment_using_pdb( alignment_util.align_two_seqs(fl_seq, tc_seq), opts.truncated_pdb ) elif opts.truncated_fasta: tc_seq = seq_util.fasta_file_reader(opts.truncated_fasta) alignment = alignment_util.align_two_seqs(fl_seq, tc_seq) else: sys.stderr.write("ERROR: you need to either give --truncated_pdb or --truncated_fasta\n") exit() chainbreak_resnums = frag_util.get_positions_to_skip_from_alignment(alignment, frag_len) seq_map = alignment_util.seq_mapping(alignment) residues = sorted(seq_map.keys())