def slice_pdb( arguments ):
    """Write the selected residues of a pdb into a new "_sliced" pdb file.

    arguments is one packed 3-tuple:
        pdb                  -- path of the input pdb file
        resnum_to_slice_dict -- residue selection; it is resolved per chain by
                                pdb_util.get_resnums_from_resnum_dict
        save_hetatm          -- forwarded to pdb_util.create_xyzDict_bychain

    Chains are emitted in the order given by pdb_util.chnids, each followed
    by a TER record, and the file is closed with an END record.  The output
    name comes from pdb_util.get_uniq_outpdbname( pdb, "_sliced" ).

    Returns False (and writes nothing) when a requested residue is missing
    and lies beyond the highest residue number of its chain; a missing
    residue below that number only triggers a warning.  Returns None after
    the file has been written.

    Raises AssertionError when no residue at all was selected.
    """
    pdb, resnum_to_slice_dict, save_hetatm = arguments
    pdbline_dict = pdb_util.create_xyzDict_bychain( pdb, save_hetatm )[1]

    out_chunks = []
    for chain in pdb_util.chnids:
        if chain not in pdbline_dict:
            continue
        resnums = pdb_util.get_resnums_from_resnum_dict( resnum_to_slice_dict, pdbline_dict, chain )
        if not resnums:
            continue

        chain_lines = pdbline_dict[ chain ]
        chain_chunks = []
        for resnum in resnums:
            try:
                chain_chunks.append( chain_lines[ resnum ] )
            except KeyError:
                # a hole inside the chain is tolerated, a residue past the
                # end of the chain means the selection itself is wrong
                if chain_lines and resnum <= max( chain_lines ):
                    print( "WARNING: %s is not present in the pdb" % ( resnum, ) )
                    continue
                print( "NOT" )
                return False

        # only terminate chains that actually contributed residues, so the
        # output never contains a TER record with nothing in front of it
        if chain_chunks:
            out_chunks.extend( chain_chunks )
            out_chunks.append( "TER\n" ) # end of a chain

    # This check has to happen before END is appended: the original tested
    # the buffer after adding "END\n", so it could never fail.
    if not out_chunks:
        raise AssertionError( "nothing is going to be sliced, check your chain definition while selecting residues" )
    out_chunks.append( "END\n" )

    outfn = pdb_util.get_uniq_outpdbname( pdb, "_sliced" )
    with open( outfn, "w" ) as buf:
        buf.write( "".join( out_chunks ) )
def trim_pdb( arguments ):
    """Remove the selected residues from a pdb and write the rest to a "_trim" pdb file.

    arguments is one packed 3-tuple:
        pdb                 -- path of the input pdb file
        resnum_to_trim_dict -- residue selection; it is resolved per chain by
                               pdb_util.get_resnums_from_resnum_dict
        save_hetatm         -- forwarded to pdb_util.create_xyzDict_bychain

    The output starts with a "REMARK trim residues:" line listing every
    removed residue as <resnum><chain>, then the remaining residues chain by
    chain (in pdb_util.chnids order, residues ascending), each chain closed
    by TER, and a final END.  Only chains listed in pdb_util.chnids are
    written.  The output name comes from
    pdb_util.get_uniq_outpdbname( pdb, "_trim" ).

    Returns False (and writes nothing) when a requested residue is missing
    and lies beyond the last residue of its chain; a missing residue inside
    the chain only triggers a warning.  Returns None after writing.
    """
    pdb, resnum_to_trim_dict, save_hetatm = arguments
    pdbline_dict = pdb_util.create_xyzDict_bychain( pdb, save_hetatm )[1]

    trim_res_list = []
    for chain in sorted( pdbline_dict ):
        resnums = pdb_util.get_resnums_from_resnum_dict( resnum_to_trim_dict, pdbline_dict, chain )
        if not resnums:
            continue

        chain_lines = pdbline_dict[ chain ]
        # Remember the end of the chain BEFORE deleting anything.  Taking
        # max() of what is left made the warning/error decision depend on
        # the order of deletion and crashed once the chain became empty.
        last_resnum = max( chain_lines ) if chain_lines else None

        for resnum in resnums:
            if resnum in chain_lines:
                del chain_lines[ resnum ]
                trim_res_list.append( str(resnum)+chain )
            elif last_resnum is not None and resnum <= last_resnum:
                print( "WARNING: %s not in pdb" % ( resnum, ) )
            else:
                stderr.write("ERROR: residue %s not present in the file\n" %(str(resnum)+chain))
                return False

    out_chunks = [ "REMARK trim residues: %s\n" % (" ".join( trim_res_list )) ]
    for chain in pdb_util.chnids:
        # a chain that was trimmed away completely gets no TER record
        if chain not in pdbline_dict or not pdbline_dict[ chain ]:
            continue
        chain_lines = pdbline_dict[ chain ]
        for rsn in sorted( chain_lines ):
            out_chunks.append( chain_lines[ rsn ] )
        out_chunks.append( "TER\n" )
    out_chunks.append( "END\n" )

    outfn = pdb_util.get_uniq_outpdbname(pdb, "_trim")
    with open( outfn, "w" ) as buf:
        buf.write( "".join( out_chunks ) )
def whats_in_pdb( pdb ):
    """Print a per-chain residue summary of a pdb file to stdout.

    For every chain (sorted by chain id) one line reports the lowest and
    highest residue number and the number of residues.  After that the
    result of pdb_util.detect_chainbreaks_in_pdb( pdb ) is printed under a
    "found chainbreak(s):" header, followed by a one-line overall summary.

    pdb -- path of the pdb file; it is parsed with
           pdb_util.create_xyzDict_bychain and passed again to
           pdb_util.detect_chainbreaks_in_pdb.

    Returns None.
    """
    pdbline_dict = pdb_util.create_xyzDict_bychain( pdb )[1]
    chains = sorted( pdbline_dict )

    total_rsds = 0
    for chain in chains:
        res_list = pdbline_dict[ chain ]
        n_rsd = len( res_list )
        print( "%s: chain:%s, from %4s to %4s, %4s rsds" %( pdb, chain, min( res_list ), max( res_list ), n_rsd ) )
        total_rsds += n_rsd

    # NOTE(review): the original also derived a per-chain "numbering has a
    # gap" flag (first + count - 1 != last) but never read it; the chainbreak
    # report below has always been printed unconditionally, and still is.
    print( "" )
    print( "found chainbreak(s):" )
    print( pdb_util.detect_chainbreaks_in_pdb( pdb ) )
    print( "[Summary] %s: %s chains (%s) and %s rsds" %( pdb, len(chains), " ".join(chains), total_rsds ) )
def _chain_records( pdbline_dict, chain_id ):
    """Return the pdb lines of one chain, residues in ascending order (KeyError if the chain is absent)."""
    residues = pdbline_dict[ chain_id ]
    return "".join( residues[ rsn ] for rsn in sorted( residues ) )


def _write_chain_pdb( outfn, records ):
    """Write records followed by a closing TER record to outfn."""
    with open( outfn, "w" ) as outpdb:
        outpdb.write( records )
        outpdb.write( "TER\n" )


def chain_extractor( pdb ):
    """Split a pdb file into one output pdb per requested chain selection.

    Behaviour is driven by the module-level opts object:
        opts.chains      -- selections to extract; when empty, every chain
                            found in the pdb is written to its own file.
                            A selection longer than one character (say
                            "ABC") is a batch: those chains are written, in
                            that order, into a single file.
        opts.delimiter   -- placed between the pdb tag and the selection in
                            the output name <tag><delimiter><selection>.pdb
        opts.save_hetatm, opts.stripH
                         -- forwarded to pdb_util.create_xyzDict_bychain

    The tag is the basename of pdb up to the first ".pdb".  Every output
    file ends with one TER record.  Output files are created in the current
    working directory.

    Progress goes to stderr; when extracting all chains, the chain ids are
    printed to stdout on one space-separated line.

    Returns False as soon as a requested chain is not in the pdb (files for
    earlier selections have already been written), otherwise None.
    Raises AssertionError if pdb does not exist.
    """
    assert exists( pdb )
    pdbline_dict = pdb_util.create_xyzDict_bychain( pdb, opts.save_hetatm, opts.stripH )[1]
    pdbtag = basename(pdb).split(".pdb")[0]

    if not opts.chains: # extract all chains
        stderr.write("extracting all chains from %s: " %( pdb ))
        extracted = []
        for chain in pdbline_dict.keys():
            _write_chain_pdb( pdbtag + opts.delimiter + chain + ".pdb", _chain_records( pdbline_dict, chain ) )
            extracted.append( chain )
        print( " ".join( extracted ) )
        return

    for chain in opts.chains:
        stderr.write("extracting chain %s from %s\n" %( chain, pdb ))
        # a batch selection, say ABC, is walked letter by letter; anything
        # else is looked up as a single chain id
        members = chain if len(chain) > 1 else [ chain ]
        pieces = []
        for c in members:
            try:
                pieces.append( _chain_records( pdbline_dict, c ) )
            except KeyError:
                # only a missing chain is reported here; any other failure
                # is a real error and must not be disguised as this message
                stderr.write("ERROR: %s does not have chain %s\n" %(pdb, c))
                return False
        _write_chain_pdb( pdbtag + opts.delimiter + chain + ".pdb", "".join( pieces ) )
def correct_alignment_using_pdb( alignment, sbj_pdbfile, remove_temppdb=True ):
    """
    This function is trying to fix an alignment issue caused by the penalty
    of opening a gap in dynamic programming

    eg:
        123456789
        NKTTTTTKG   <- ref_seq_aligned
        NK------G   <- mistaken sbj_seq_aligned (doesn't like a gap)
        N------KG   <- correct sbj_seq_aligned (from the pdb, KG are connected)
        1      23

        seq_map = { 1:1,
                    2:2,   <- should be 2:8
                    3:9 }

    alignment      -- indexable; alignment[0] is the aligned reference
                      sequence, alignment[1] the aligned subject sequence.
                      It is also handed to seq_mapping() to obtain the map
                      { subject residue number : reference position }.
    sbj_pdbfile    -- subject pdb; must contain exactly one chain
    remove_temppdb -- delete the intermediate "temp.pdb" (written to the
                      current working directory) once it has been parsed

    Returns ( ref_seq_aligned, corrected_sbj_seq_aligned ).  The corrected
    subject string carries the reference letter at every aligned position
    and "-" everywhere else.

    Raises AssertionError when the pdb does not hold exactly one chain, or
    when a suspicious gap cannot be explained by a repeated residue name in
    the reference.  Re-raises KeyError when a pdb residue is missing from
    the sequence map.
    """
    ref_seq_aligned = alignment[0]

    # to have a standard, number residues from 1 to the end, continuously
    pdb_util.pdb_idx1( sbj_pdbfile, "temp.pdb" )
    try:
        xyz_dict, pdbline_dict, _resname_dict = pdb_util.create_xyzDict_bychain( "temp.pdb" )
    finally:
        # clean up even when parsing fails, so no stale temp.pdb is left
        if remove_temppdb:
            os.remove("temp.pdb")

    chain_ids = list( pdbline_dict.keys() )
    assert len( chain_ids ) == 1, "this script does not deal with pdbs containing multiple chains (%s)" % chain_ids
    chain = chain_ids[0]
    xyz_dict = xyz_dict[ chain ]
    sbj_pdb_idx1_res_nums = sorted( pdbline_dict[chain].keys() )

    seq_map = seq_mapping( alignment )
    # work on a copy: corrections must not leak into the mapping object
    # returned by seq_mapping()
    corrected_seq_map = dict( seq_map )

    # Walk over consecutive residue pairs.  The original looped over every
    # residue and relied on a bare "except: pass" to reuse a stale next_rsn
    # for the last one; that also silenced its own "next_rsn == rsn+1"
    # assertion and hit a NameError on a one-residue pdb.
    for rsn, next_rsn in zip( sbj_pdb_idx1_res_nums, sbj_pdb_idx1_res_nums[1:] ):
        # newrsn means the rsn you would like to number from the reference (ref_seq_aligned)
        newrsn = seq_map[rsn]
        try:
            next_newrsn = seq_map[next_rsn]
        except KeyError:
            sys.stderr.write("ERROR: couldn't find the next rsn in the old\n")
            raise

        # detect chain break from alignment
        if ( next_newrsn - newrsn ) > 1: # there's a gap
            dist = pdb_util.cal_dist( xyz_dict[rsn]["CA"], xyz_dict[next_rsn]["CA"] ) # dist from old numbering
            sys.stderr.write("Chainbreak (from alignment) at %s(%s)-%s(%s) with dist %.3f\n" %(rsn, newrsn, next_rsn, next_newrsn, dist))

            # if no physical chainbreak is detected, this is possibly an
            # error in dynamic programming; try fixing it by looking for the
            # same residue name in next_newrsn-1, and if found, renumber the
            # current residue to next_newrsn-1
            # NOTE(review): 4.5 is presumably a CA-CA distance cutoff in Angstrom -- confirm
            if ( dist <= 4.5 ):
                sys.stderr.write("WARNING: no gap physically detected from %s-%s; caught an error in dynamic programming...\n" %(newrsn, next_newrsn))

                # index in seq, thus -1
                # just to make sure this error is caused by the dynamic programming (must be same residue name)
                assert ( ref_seq_aligned[newrsn-1] == ref_seq_aligned[next_newrsn-2] ), "ERROR: failing to looking for same residue name as pos:%s in pos:%s of fasta file could not fix the error in dynamic programming\n%s\n%s\n%s\nIs this a partial thread?" %( newrsn, next_newrsn-1, alignment[0], alignment[1], sbj_pdbfile )

                sys.stderr.write("Correcting an alignment problem caused by DP: %s(%s) -> %s(%s)\n"%( ref_seq_aligned[newrsn-1], newrsn, ref_seq_aligned[next_newrsn-2], next_newrsn-1 ) )
                corrected_seq_map[rsn] = next_newrsn-1

    # make new sbj_seq_aligned
    #
    # make dict[ ref_numbering ] = old_numbering; was dict[ old_numbering ] = ref_numbering
    #     seq_map        = { 1:1, 2:8, 3:9 }
    # thus the keys will be residues that are aligned (in reference numbering)
    #     revert_seq_map = { 1:1, 8:2, 9:3 }
    new_dict = python_util.invert_dict( corrected_seq_map )
    res_numbers_aligned = set( new_dict.keys() ) # set: constant-time lookup per position

    corrected_sbj_seq_aligned = "".join(
        rsd if ( idx+1 ) in res_numbers_aligned else "-"
        for idx, rsd in enumerate( ref_seq_aligned ) )

    return (ref_seq_aligned, corrected_sbj_seq_aligned)
if __name__=="__main__": parser = ArgumentParser() parser.add_argument("pdbs", nargs="+", help="") parser.add_argument("-o", "--outfn", default="merged", help="") parser.add_argument("-p", "--prefix", default="", help="") parser.add_argument("--debug", action="store_true", help="") parser.add_argument("--save_hetatm", action="store_true", help="") args = parser.parse_args() # merge different pdbs in a chainA-Z order and resnum all_pdb_dict = {} input_res_ctr = 0 output_res_ctr = 0 for pdb in args.pdbs: pdbline_dict = pdb_util.create_xyzDict_bychain( pdb, args.save_hetatm )[1] for chain in pdbline_dict.keys(): if chain not in all_pdb_dict.keys(): all_pdb_dict[ chain ] = pdbline_dict[ chain] input_res_ctr += len( pdbline_dict[ chain ].keys() ) # serve as a sanity check to prevent two same residues present in pdbs to merge else: for resnum in pdbline_dict[ chain ].keys(): if args.debug: # this will be really slow to do a check everytime assert resnum not in all_pdb_dict[ chain ].keys(), "ERROR: residue %s has multiple copies" % resnum all_pdb_dict[ chain ][ resnum ] = pdbline_dict[ chain ][ resnum ] input_res_ctr += len( pdbline_dict[ chain ].keys() ) # serve as a sanity check to prevent two same residues present in pdbs to merge # update the line number everytime