def align_and_renumber_pdb(fulllength_fasta, truncated_pdbfile, ignore_check=False):
    """Renumber a truncated, single-chain PDB to follow its full-length sequence.

    This function is meant to make renumber_pdb() obsolete.

    The sequence extracted from ``truncated_pdbfile`` is aligned against the
    sequence read from ``fulllength_fasta``.  Each residue found in the scratch
    file ``temp.pdb`` is then given the number ``seq_map`` assigns to it.

    Args:
        fulllength_fasta: path of the FASTA file with the full-length sequence.
        truncated_pdbfile: path of the PDB file to renumber.
        ignore_check: when true, skip
            ``alignment_util.correct_alignment_using_pdb`` and create
            ``temp.pdb`` directly with ``pdb_idx1``.

    Returns:
        The renumbered PDB as one string: two ``REMARK`` lines carrying the
        alignment, the renumbered coordinate lines and a closing ``TER``.

    Raises:
        AssertionError: if the scratch PDB holds more than one chain.
    """
    # NOTE(review): per the original comment, correct_alignment_using_pdb()
    # leaves "temp.pdb" behind as a side effect -- confirm in alignment_util.
    temp_pdb = "temp.pdb"

    # make alignment
    fl_seq = seq_util.fasta_file_reader(fulllength_fasta)
    tc_seq = seq_util.pdb2fasta(truncated_pdbfile)
    alignment = alignment_util.align_two_seqs(fl_seq, tc_seq)

    try:
        if ignore_check:
            # The correction step is skipped, so the scratch file it would
            # have produced has to be created here.
            pdb_idx1(truncated_pdbfile, temp_pdb)
        else:
            alignment = alignment_util.correct_alignment_using_pdb(alignment, truncated_pdbfile, False)

        seq_map = alignment_util.seq_mapping(alignment)

        _xyz_by_chain, lines_by_chain, _resname_by_chain = create_xyzDict_bychain(temp_pdb)
        assert len(lines_by_chain) == 1, (
            "this script does not deal with pdbs containing multiple chains (%s)" % list(lines_by_chain)
        )
        lines_by_residue = lines_by_chain[next(iter(lines_by_chain))]

        pieces = [
            "REMARK full_length_aln %s\n" % alignment[0],
            "REMARK truncated_aln %s\n" % alignment[1],
        ]
        for rsn in sorted(lines_by_residue):
            newrsn = seq_map[rsn]
            # [:-1], because the last item of the split is ''
            for line in lines_by_residue[rsn].split("\n")[:-1]:
                # Characters 22-25 hold the residue number of a PDB record.
                pieces.append(line[0:22] + "%4s" % newrsn + line[26:] + "\n")
        pieces.append("TER\n")
        return "".join(pieces)
    finally:
        # Always clean up the scratch file, including on failure paths.
        if os.path.exists(temp_pdb):
            os.remove(temp_pdb)
def is_within_junction( args, scorefile ):
    """Tell whether the position of `scorefile` lies in the fastaA junction window.

    Original notes: get fastaA positions that could have closab_scores with
    fastaB.  Because the closab score is calculated by only looking forward
    when pairing, only positions in fastaA matter.

    args is expected to provide `fastaA`, `mer` and `closabscore_gapsize`.

    Returns the range of residue numbers from cutpoint+1 up to
    pos + mer - 1 + closabscore_gapsize when the position is inside the
    window; otherwise writes a message to stderr and returns None.
    """
    frag_pos = get_pos( scorefile )
    len_fastaA = len( seq_util.fasta_file_reader( args.fastaA ) )

    gapsize = args.closabscore_gapsize
    mer = args.mer

    # window of fragment start positions in fastaA that can reach the junction
    first_frag_pos = ( len_fastaA + 1 - gapsize ) - mer + 1
    last_frag_pos = len_fastaA - mer + 1
    # last residue reachable from this fragment, gap included
    last_reachable = frag_pos + mer - 1 + gapsize

    window_in_fastaA = range( first_frag_pos, last_frag_pos + 1 )
    window_in_fastaB = range( len_fastaA + 1, last_reachable + 1 )

    if frag_pos not in window_in_fastaA:
        listing = " ".join( str( p ) for p in window_in_fastaA )
        stderr.write("pos: %s is not within junction regions(%s)\n" %( frag_pos, listing ))
        return None

    return window_in_fastaB
from os.path import basename, exists, join, lexists


def renum_frag( fragfn, offset ):
    """Return the fragment file name with its position field shifted by `offset`.

    Only the base name of `fragfn` is used.  It must start with "after" and
    consist of at least six dot-separated fields; the third field is the
    position (parsed through float, so "12.0" style values are accepted).

    Returns the first six fields, with the shifted position, joined by dots
    and suffixed with ".pdb".

    Raises AssertionError when the base name does not start with "after".
    """
    fragfn = basename( fragfn )
    assert fragfn.startswith("after"), fragfn

    ls = fragfn.split(".")
    pos = int(float( ls[2] ))
    new_pos = pos + offset
    return "%s.%s.%s.%s.%s.%s.pdb" %( ls[0], ls[1], new_pos, ls[3], ls[4], ls[5] )


if __name__=="__main__":
    parser = ArgumentParser()
    parser.add_argument("-f", "--fragfiles", required=True, nargs="+" )
    parser.add_argument("-a", "--fastaA", required=True )
    parser.add_argument("--target_dir", default="./" )
    args = parser.parse_args()

    # every fragment position is shifted by the length of the fastaA sequence
    offset = len( seq_util.fasta_file_reader( args.fastaA ) )
    print("offset: %s" % offset)

    for fragfn in args.fragfiles:
        new_fragfn = renum_frag( fragfn, offset )

        # join() copes with a target_dir given without a trailing slash
        link = join( args.target_dir, new_fragfn )
        # lexists() also sees dangling symlinks, which exists() reports as
        # missing and which would make os.symlink() fail
        if lexists( link ):
            os.unlink( link )
        # NOTE(review): a relative fragfn produces a link that only resolves
        # when target_dir is the current directory -- confirm intended usage.
        os.symlink( fragfn, link )
def printer( rsd_list ): print "# %s rsds: %s" %( len(rsd_list), " ".join( map( str, rsd_list ) ) ) for rsd in rsd_list: print rsd if __name__=="__main__": parser = ArgumentParser() parser.add_argument("--pdb", required=True, help="") parser.add_argument("-f1", "--fastaA", required=True, help="") parser.add_argument("-f2", "--fastaB", required=True, help="") parser.add_argument("-p", "--print_rsds", choices=["A", "B"], required=True, help="") parser.add_argument("--assigned_rsds", action="store_true", default=False, help="") args = parser.parse_args() seqA = ( seq_util.fasta_file_reader( args.fastaA ) ) offset = len(seqA) seqB = ( seq_util.fasta_file_reader( args.fastaB ) ) seqAB = seqA+seqB seqAB_rsds = range( 1, len(seqAB)+1 ) #print len(seqAB_rsds) xyz_dict, junk, pdbline_dict = pdb_util.create_xyzDict( args.pdb ) assigned_rsds = xyz_dict.keys() #print len(assigned_rsds) unassigned_rsds = list( set(seqAB_rsds) - set(assigned_rsds) ) if args.assigned_rsds: print "# assigned_rsds" rsd_list = assigned_rsds else:
def renumber_pdb(full_length_fasta_fn, truncated_pdb_fn, alignments=None, renumber_script=None):
    """Renumber a truncated, single-chain PDB based on an alignment to its full-length sequence.

    The k-th residue of the truncated structure receives the number of the
    alignment column it sits in.  Because the full-length row must not contain
    gaps, that column number is the residue number in the full-length sequence.

    140914: fixed a bug caused by an error introduced by dynamic programming,
    where a residue name at the beginning of a gap is the same as at the end
    of the gap:

        KTTTTTKG   <- query sequence
        K------G   <- mistaken alignment
        ------KG   <- correct alignment

    When the alignment shows a gap after a residue but the CA-CA distance to
    the next residue is <= 4.5, the residue is moved to the position right
    before the next one, provided the full-length sequence has the same
    residue name there.

    Args:
        full_length_fasta_fn: FASTA file with the full-length sequence; only
            read when ``alignments`` is not given.
        truncated_pdb_fn: PDB file to renumber.
        alignments: optional pair of aligned strings
            ``["query_aln", "template_aln"]`` (full-length first, truncated
            second).  When omitted, a Biopython ``pairwise2.align.globalms``
            alignment (match 5, mismatch -5, gap open -15, gap extend -0.5,
            end gaps not penalized) is computed.
        renumber_script: optional path of the ``renumberPDBs.pl`` script used
            to make the input start at residue 1; defaults to the original
            hard-coded location.

    Returns:
        Name of the written file, ``<basename of input without last 4 chars>_renum.pdb``,
        created in the current directory.

    Raises:
        AssertionError: if the aligned strings differ in length, the
            full-length row contains gaps, the renumbering script is missing,
            the PDB has more than one chain, or the dynamic-programming repair
            cannot be applied.
        SystemExit: if a sequence is empty or no alignment is found.
    """
    import os
    import subprocess
    import sys

    if not alignments:
        # get sequences
        import seq_util

        truncated_seq = seq_util.pdb2fasta(truncated_pdb_fn)
        full_length_seq = seq_util.fasta_file_reader(full_length_fasta_fn)
        if not truncated_seq.strip() or not full_length_seq.strip():
            sys.stderr.write(
                "ERROR: empty sequence read from %s or %s\n" % (truncated_pdb_fn, full_length_fasta_fn)
            )
            sys.exit(1)

        # get alignment from biopython
        from Bio import pairwise2

        align_results = pairwise2.align.globalms(
            full_length_seq, truncated_seq, 5, -5, -15, -0.5, penalize_end_gaps=False
        )
        if not align_results:
            print(full_length_seq)
            print("> %s" % truncated_seq)
            sys.exit(1)
        alignments = align_results[0]
    else:
        print("alignments are given by the argument")

    # Strip once up front so that lengths and indices refer to the same strings.
    full_length_aln = alignments[0].strip()
    truncated_aln = alignments[1].strip()
    print("full_length_aln  %s" % full_length_aln)
    print("truncated_aln  %s" % truncated_aln)
    print("")

    assert len(full_length_aln) == len(truncated_aln)
    assert "-" not in full_length_aln, "this is a really dumb script, doesn't renum pdbs with aa unmatched"

    # Without gaps the aligned full-length row is the full-length sequence, so
    # it can serve the repair check below even when alignments were passed in.
    full_length_seq = full_length_aln

    # sequence mapping: k-th aligned (non-gap) truncated residue -> 1-based column
    aligned_columns = [col for col, res in enumerate(truncated_aln, 1) if res != "-"]
    map_Dict = dict(enumerate(aligned_columns, 1))

    # The mapping assumes the truncated pdb starts at residue number 1, so the
    # input is first renumbered from 1 into a temporary file.
    # should probably get rid of this dependency in the future
    if renumber_script is None:
        renumber_script = "/net/em-stor4/Volumes/data/wangyr/scripts/pdb_utils/renumberPDBs.pl"
    assert os.path.exists(renumber_script), "Error: %s doesn't exist" % renumber_script

    temp_file = os.path.basename(truncated_pdb_fn.split(".pdb")[0]) + ".temp.pdb"
    output_fn = os.path.basename(truncated_pdb_fn[:-4]) + "_renum.pdb"

    try:
        # An argument list instead of a shell string keeps paths with spaces
        # or shell metacharacters intact.
        with open(temp_file, "w") as temp_out:
            status = subprocess.call(
                [renumber_script, "-pdbfile", truncated_pdb_fn, "-res1", "1"], stdout=temp_out
            )
        if status != 0:
            sys.stderr.write("WARNING: %s exited with status %s\n" % (renumber_script, status))

        xyz_dict, pdbline_dict, resname_dict = create_xyzDict_bychain(temp_file)
        assert len(pdbline_dict) == 1, (
            "this script does not deal with pdbs containing multiple chains (%s)" % list(pdbline_dict)
        )
        chain = next(iter(pdbline_dict))
        xyz_dict = xyz_dict[chain]
        pdbline_dict = pdbline_dict[chain]

        res_nums = sorted(pdbline_dict)
        last_idx = len(res_nums) - 1
        renumbered_lines = []
        for idx, rsn in enumerate(res_nums):
            # The last residue has no successor; compare it with itself.
            next_rsn = res_nums[idx + 1] if idx < last_idx else rsn

            newrsn = map_Dict[rsn]
            next_newrsn = map_Dict[next_rsn]

            # detect chain break from alignment
            if (next_newrsn - newrsn) > 1:
                dist = cal_dist(xyz_dict[next_rsn]["CA"], xyz_dict[rsn]["CA"])  # dist from old numbering
                sys.stderr.write(
                    "chainbreak (from alignment) at %s-%s with dist %.3f\n" % (newrsn, next_newrsn, dist)
                )

                # No physical chain break: possibly an error in dynamic
                # programming.  Fix it by looking for the same residue name at
                # next_newrsn-1 and, if found, writing the residue there.
                if dist <= 4.5:
                    sys.stderr.write(
                        "WARNING: no gap physically detected from %s-%s; caught an error in dynamic programming; correcting now...\n"
                        % (newrsn, next_newrsn)
                    )
                    # index in seq, thus -1
                    assert full_length_seq[newrsn - 1] == full_length_seq[next_newrsn - 2], (
                        "ERROR: failing to looking for same residue name as pos:%s in pos:%s of fasta file could not fix the error in dynamic programming\n"
                        % (newrsn, next_newrsn - 1)
                    )
                    newrsn = next_newrsn - 1

            # [:-1], because the last item of the split is ''
            for line in pdbline_dict[rsn].split("\n")[:-1]:
                renumbered_lines.append(line[0:22] + "%4s" % newrsn + line[26:] + "\n")

        with open(output_fn, "w") as output_buff:
            output_buff.write("REMARK full_length_aln %s\n" % full_length_aln)
            output_buff.write("REMARK truncated_aln %s\n" % truncated_aln)
            output_buff.write("".join(renumbered_lines))
            output_buff.write("TER\n")
    finally:
        # Remove the temporary file on failure paths as well.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    return output_fn
#!/usr/local/bin/python2.7 from argparse import ArgumentParser from sys import exit, stderr, stdout from os.path import exists from os import system, stat, remove from alignment_util import alignment_parser from seq_util import fasta_file_reader if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("-a", "--alignment_fn", required=True, type=str, help="hhpred results id") parser.add_argument("-f", "--fasta_fn", required=True, type=str, help="hhpred results id") options = parser.parse_args() fasta_seq = fasta_file_reader(options.fasta_fn) alignment_Dict = alignment_parser(options.alignment_fn) for aln_id in alignment_Dict.keys(): orig_alignment_List = alignment_Dict[aln_id].split("\n")[:-1] orig_query_alignment = orig_alignment_List[3] orig_template_alignment = orig_alignment_List[4] idx = int(orig_query_alignment.split()[0]) orig_query_seq = orig_query_alignment.split()[1].replace("-", "") if fasta_seq[:20] != orig_query_seq[:20]: # stderr.write("%s\n%s\n%s\n" %( aln_id, fasta_seq[:20], orig_query_seq[:20] )) try: idx = fasta_seq.index(orig_query_seq) except:
#!/usr/bin/env python2.7
import argparse
from seq_util import fasta_file_reader
from sys import argv

# Print information about the sequence of a FASTA file: by default the header
# line, the sequence length and the sequence; with -n a single residue
# (1-based); with --slice a start and an optional end position.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('fasta')
    parser.add_argument('-n', "--num_to_print", type=int )
    parser.add_argument('-a', "--print_all_res", action="store_true", default=False )
    parser.add_argument("--slice", nargs="+", help="1:200 210:300")
    args = parser.parse_args()

    fasta_seq = fasta_file_reader( args.fasta )
    # the first line of the file is used as the tag; close the handle right away
    with open( args.fasta, "r" ) as fasta_file:
        tag = fasta_file.readline()

    if not args.num_to_print and not args.print_all_res and not args.slice:
        # no option passed: print tag, length and the whole sequence
        print("%s %s aa" % ( tag.strip(), len(fasta_seq) ))
        print(fasta_seq)

    elif args.num_to_print:
        # positions are 1-based on the command line
        print(fasta_seq[args.num_to_print - 1])

    elif args.slice:
        assert len( args.slice ) <= 2
        start = int(args.slice[0])
        try:
            end = int(args.slice[1])
        except (IndexError, ValueError):
            # no usable end given: run to the end of the sequence
            end = len( fasta_seq )
chunk_dict[ idx ].append(i) i+=1 #print idx, chunk_dict[idx] i+=1 idx=i return chunk_dict if __name__=="__main__": parser = ArgumentParser() parser.add_argument("-p", "--pdb", required=True, help="") parser.add_argument("-f", "--fasta", required=True, help="") parser.add_argument("-c", "--chewing", default=2, type=int, help="") args = parser.parse_args() seq = ( seq_util.fasta_file_reader( args.fasta ) ) seq_dict = {} for i in range(1,len(seq)+1): seq_dict[i]=0 xyz_dict, junk, pdbline_dict, k = pdb_util.create_xyzDict( args.pdb ) for rsd in xyz_dict.keys(): if seq_dict.has_key( rsd ): seq_dict[rsd] = 1 ''' # print states for debugging prev_state = 0 i=1 chunk_dict = {} while i <= len(seq_dict.keys()):
parser.add_argument("--fragfile_fasta", required=True, help="") trimmed_fasta = parser.add_mutually_exclusive_group() trimmed_fasta.add_argument("--truncated_pdb") trimmed_fasta.add_argument("--truncated_fasta") parser.add_argument("--outfile_tag", default="trimmed", help="") parser.add_argument("--debug", action="store_true", help="") opts = parser.parse_args() # read into fragment as frag[pos] = fragments for fragfile in opts.fragfiles: frag_dict = frag_util.read_fragfile(fragfile) frag_len = frag_util.get_fraglen(frag_dict) fl_seq = seq_util.fasta_file_reader(opts.fragfile_fasta) if opts.truncated_pdb: tc_seq = seq_util.pdb2fasta(opts.truncated_pdb) alignment = alignment_util.correct_alignment_using_pdb( alignment_util.align_two_seqs(fl_seq, tc_seq), opts.truncated_pdb ) elif opts.truncated_fasta: tc_seq = seq_util.fasta_file_reader(opts.truncated_fasta) alignment = alignment_util.align_two_seqs(fl_seq, tc_seq) else: sys.stderr.write("ERROR: you need to either give --truncated_pdb or --truncated_fasta\n") exit() chainbreak_resnums = frag_util.get_positions_to_skip_from_alignment(alignment, frag_len) seq_map = alignment_util.seq_mapping(alignment)