Python fasta_file_readerの例、seq_util.fasta_file_reader Pythonの例

コード例 #1

0

ファイルを表示

ファイル: pdb_util.py プロジェクト: raywangyr/scripts

def align_and_renumber_pdb(fulllength_fasta, truncated_pdbfile, ignore_check=False):
    """Renumber a single-chain truncated PDB against its full-length sequence.

    This function is going to make the renumber_pdb() obsolete.

    The full-length sequence (read from ``fulllength_fasta``) is aligned to the
    sequence extracted from ``truncated_pdbfile``; the alignment is turned into
    a residue-number mapping and every PDB line has characters 22..25
    (0-based, the slice ``line[22:26]``) replaced by the mapped number.

    Args:
        fulllength_fasta: path of the full-length FASTA file.
        truncated_pdbfile: path of the truncated PDB file.
        ignore_check: when true, skip
            ``alignment_util.correct_alignment_using_pdb`` and only write
            ``temp.pdb`` through ``pdb_idx1``.

    Returns:
        The renumbered PDB as one string: two ``REMARK`` lines carrying the
        two alignment strings, the renumbered lines, and a closing ``TER``.

    Raises:
        AssertionError: if the parsed PDB does not contain exactly one chain.
    """
    # make alignment
    fl_seq = seq_util.fasta_file_reader(fulllength_fasta)
    tc_seq = seq_util.pdb2fasta(truncated_pdbfile)
    alignment = alignment_util.align_two_seqs(fl_seq, tc_seq)

    # NOTE(review): "temp.pdb" is a fixed name in the working directory, so
    # two concurrent runs in the same directory would clobber each other.
    try:
        if ignore_check:
            # "temp.pdb" is needed by the parsing step below; in the other
            # branch it is presumably produced as a side effect of
            # alignment_util.correct_alignment_using_pdb -- TODO confirm.
            pdb_idx1(truncated_pdbfile, "temp.pdb")
        else:
            alignment = alignment_util.correct_alignment_using_pdb(alignment, truncated_pdbfile, False)

        seq_map = alignment_util.seq_mapping(alignment)

        _xyz_dict, pdbline_dict, _resname_dict = create_xyzDict_bychain("temp.pdb")
        # list(...) keeps this working on Python 3, where keys() is a view
        # that cannot be indexed.
        chains = list(pdbline_dict.keys())
        assert len(chains) == 1, (
            "this script does not deal with pdbs containing multiple chains (%s)" % chains
        )
        pdbline_dict = pdbline_dict[chains[0]]

        out_pdblines = [
            "REMARK full_length_aln %s\n" % alignment[0],
            "REMARK truncated_aln   %s\n" % alignment[1],
        ]
        for rsn in sorted(pdbline_dict.keys()):
            newrsn = seq_map[rsn]
            # [:-1], because the last item in the list is ''
            for line in pdbline_dict[rsn].split("\n")[:-1]:
                out_pdblines.append(line[0:22] + "%4s" % newrsn + line[26:] + "\n")
        out_pdblines.append("TER\n")
    finally:
        # Clean up even when a step above fails, so a stale temp.pdb is not
        # left behind for the next run to pick up.
        if os.path.exists("temp.pdb"):
            os.remove("temp.pdb")

    return "".join(out_pdblines)

コード例 #2

0

ファイルを表示

ファイル: junction_closabscore_cleaner.py プロジェクト: raywangyr/scripts

def is_within_junction( args, scorefile ):
    """Return the fastaB positions to clean when the scorefile position lies in the junction.

    Get fastaA positions that could have closab_scores with fastaB.  Because
    of the setup of calculating the closab score (only look forward to do the
    pair calculation), we only care about fastaA.

    Reads ``args.fastaA``, ``args.closabscore_gapsize`` and ``args.mer``.
    Returns the range ``cutpoint+1 .. upper_junction`` when the position
    obtained from ``get_pos( scorefile )`` falls inside the fastaA window;
    otherwise writes a message to stderr and returns None.
    """
    pos = get_pos( scorefile )
    # the length of fastaA is the last fastaA position
    cutpoint = len( seq_util.fasta_file_reader( args.fastaA ) )

    gapsize = args.closabscore_gapsize
    mer = args.mer

    # first and last fragment start positions in fastaA that reach the junction
    first_frag_start = ( cutpoint + 1 - gapsize ) - mer + 1
    last_frag_start = cutpoint - mer + 1
    # last fastaB position reachable from a fragment starting at pos
    last_fastaB_pos = pos + mer - 1 + gapsize

    fastaA_window = range( first_frag_start, last_frag_start + 1 )
    fastaB_window = range( cutpoint + 1, last_fastaB_pos + 1 )

    if pos not in fastaA_window:
        stderr.write("pos: %s is not within junction regions(%s)\n" %( pos, " ".join( map( str, fastaA_window ))))
        return None

    return fastaB_window

コード例 #3

0

ファイルを表示

ファイル: renum_frags.py プロジェクト: raywangyr/scripts

from os.path import basename, exists

def renum_frag( fragfn, offset ):
    """Return the fragment file name with its position field shifted by offset.

    Only the base name of ``fragfn`` is used, and it must start with "after".
    The name is split on "."; the third field is parsed as a number,
    truncated to an int and increased by ``offset``.  The first six fields
    (with the shifted position) are joined back with "." and a ".pdb" suffix.
    """
    name = basename( fragfn )
    assert name.startswith("after"), name

    fields = name.split(".")
    shifted = int(float( fields[2] )) + offset

    pieces = [ fields[0], fields[1], "%s" % shifted, fields[3], fields[4], fields[5], "pdb" ]
    return ".".join( pieces )

if __name__=="__main__":
    # Create, in --target_dir, one symlink per fragment file whose name has
    # its position field shifted by the length of the fastaA sequence.
    parser = ArgumentParser()
    parser.add_argument("-f", "--fragfiles", required=True, nargs="+" )
    parser.add_argument("-a", "--fastaA", required=True )
    parser.add_argument("--target_dir", default="./" )
    args = parser.parse_args()

    offset = len( seq_util.fasta_file_reader( args.fastaA ) )
    # single-argument form prints the same text on Python 2 and Python 3
    print("offset: %s" % offset)

    for fragfn in args.fragfiles:
        new_fragfn = renum_frag( fragfn, offset )
        # os.path.join so that a --target_dir given without a trailing "/"
        # still yields a path inside that directory
        link = os.path.join( args.target_dir, new_fragfn )
        # lexists (not exists): a dangling symlink must be removed as well,
        # otherwise os.symlink fails because the name is already taken
        if os.path.lexists( link ):
            os.unlink( link )
        # NOTE(review): fragfn is stored in the link as given; a relative
        # fragfn combined with a --target_dir other than the current
        # directory produces a dangling link -- confirm intended usage.
        os.symlink( fragfn, link )

コード例 #4

0

ファイルを表示

ファイル: print_out_remaining_rsds_given_averagemodel.py プロジェクト: raywangyr/scripts

def printer( rsd_list ):
    """Print a "# <count> rsds: <space-separated items>" header, then one item per line.

    Each print call passes a single parenthesised argument, which produces
    the same output under the Python 2 print statement and the Python 3
    print function.
    """
    print("# %s rsds: %s" %( len(rsd_list), " ".join( map( str, rsd_list ) ) ))
    for rsd in rsd_list:
        print(rsd)

if __name__=="__main__":
    parser = ArgumentParser()
    parser.add_argument("--pdb", required=True, help="")
    parser.add_argument("-f1", "--fastaA", required=True, help="")
    parser.add_argument("-f2", "--fastaB", required=True, help="")
    parser.add_argument("-p", "--print_rsds", choices=["A", "B"], required=True, help="")
    parser.add_argument("--assigned_rsds", action="store_true", default=False, help="")
    args = parser.parse_args()

    seqA = ( seq_util.fasta_file_reader( args.fastaA ) )
    offset = len(seqA)
    seqB = ( seq_util.fasta_file_reader( args.fastaB ) )
    seqAB = seqA+seqB
    seqAB_rsds = range( 1, len(seqAB)+1 )
    #print len(seqAB_rsds)
    
    xyz_dict, junk, pdbline_dict = pdb_util.create_xyzDict( args.pdb )
    assigned_rsds = xyz_dict.keys()
    #print len(assigned_rsds)
    unassigned_rsds = list( set(seqAB_rsds) - set(assigned_rsds) )

    if args.assigned_rsds:
        print "# assigned_rsds"
        rsd_list = assigned_rsds
    else:

コード例 #5

0

ファイルを表示

ファイル: pdb_util.py プロジェクト: raywangyr/scripts

def renumber_pdb(
    full_length_fasta_fn,
    truncated_pdb_fn,
    alignments=None,
    renumberPDBs_script="/net/em-stor4/Volumes/data/wangyr/scripts/pdb_utils/renumberPDBs.pl"
):
    """Renumber a single-chain truncated PDB based on its alignment to the full-length sequence.

    Future plan: separate the renumber function into two functions
        1. make a mapping function to return a mapping_Dict based on the alignments
        2. renumber pdb based on the mapping_Dict

    140914: fixed a bug caused by an error introduced by dynamic programming,
    where a residue name at the beginning of a gap is the same as at the end
    of the gap:
        KTTTTTKG <- query sequence
        K------G <- mistaken alignment
        ------KG <- correct alignment

    Args:
        full_length_fasta_fn: path of the full-length FASTA file; only read
            when ``alignments`` is not given.
        truncated_pdb_fn: path of the truncated PDB file.
        alignments: optional pair of strings ["query_aln", "template_aln"].
            When omitted, the alignment is computed with Biopython
            ``pairwise2.align.globalms`` (match 5, mismatch -5, gap open -15,
            gap extend -0.5, penalize_end_gaps=False).
        renumberPDBs_script: path of the external script that renumbers the
            input PDB to start at residue 1.

    Returns:
        The name of the written file: the base name of ``truncated_pdb_fn``
        without its last four characters, plus "_renum.pdb", created in the
        current working directory.

    Raises:
        SystemExit: with status 1 when a sequence is empty or no alignment
            is found.
        AssertionError: when the two alignment strings differ in length, the
            full-length alignment contains a gap, the helper script is
            missing, the PDB has more than one chain, or an alignment error
            next to a gap cannot be corrected.
    """
    import os
    from sys import stderr

    if not alignments:
        # get sequences
        import seq_util

        truncated_seq = seq_util.pdb2fasta(truncated_pdb_fn)
        full_length_seq = seq_util.fasta_file_reader(full_length_fasta_fn)
        if not truncated_seq.strip() or not full_length_seq.strip():
            # report what went wrong and exit with a failing status instead
            # of a bare "ERROR:" followed by a status-0 exit
            stderr.write(
                "ERROR: empty sequence read from %s or %s\n" % (full_length_fasta_fn, truncated_pdb_fn)
            )
            raise SystemExit(1)

        # get alignment from biopython
        from Bio import pairwise2

        align_results = pairwise2.align.globalms(
            full_length_seq, truncated_seq, 5, -5, -15, -0.5, penalize_end_gaps=False
        )
        if not align_results:
            stderr.write("ERROR: no alignment found\n%s\n> %s\n" % (full_length_seq, truncated_seq))
            raise SystemExit(1)
        alignments = align_results[0]
    else:
        print("alignments are given by the argument")
        # The chain-break correction below indexes the full-length sequence;
        # without this it was undefined whenever alignments were passed in.
        full_length_seq = alignments[0].replace("-", "")

    # create a map, based on the alignments
    full_length_aln = alignments[0]
    truncated_aln = alignments[1]
    # single-argument prints produce the same text on Python 2 and Python 3
    print("full_length_aln  %s" % full_length_aln)
    print("truncated_aln    %s" % truncated_aln)
    print("")

    assert len(full_length_aln) == len(truncated_aln)
    assert "-" not in full_length_aln, "this is a really dumb script, doesn't renum pdbs with aa unmatched"

    # sequence mapping: index of the residue in the truncated pdb (1-based)
    # -> residue number in the full-length sequence.  The full-length
    # alignment has no gaps (asserted above), so the full-length residue
    # number is simply the alignment column + 1.
    stripped_truncated_aln = truncated_aln.strip()
    map_Dict = {}
    index = 1
    for str_idx in range(len(full_length_aln)):
        if stripped_truncated_aln[str_idx] == "-":
            continue
        map_Dict[index] = str_idx + 1
        index += 1

    # renumber the pdb based on the alignment mapping.
    # Because the mapping assumes the truncated_pdb_fn starts with residue
    # number 1, we need to renumber the input pdb to start at one.
    # should probably get rid of this dependency in the future
    temp_file = os.path.basename(truncated_pdb_fn.split(".pdb")[0]) + ".temp.pdb"
    assert os.path.exists(renumberPDBs_script), "Error: %s doesn't exist" % renumberPDBs_script

    # NOTE(review): the command goes through the shell unquoted, so file
    # names containing spaces or shell metacharacters will break it.
    os.system("%s -pdbfile %s -res1 1 > %s" % (renumberPDBs_script, truncated_pdb_fn, temp_file))

    try:
        xyz_dict, pdbline_dict, _resname_dict = create_xyzDict_bychain(temp_file)
        # list(...) keeps this working on Python 3, where keys() is a view
        # that cannot be indexed.
        chains = list(pdbline_dict.keys())
        assert len(chains) == 1, (
            "this script does not deal with pdbs containing multiple chains (%s)" % chains
        )
        chain = chains[0]

        xyz_dict = xyz_dict[chain]
        pdbline_dict = pdbline_dict[chain]
        res_nums = sorted(pdbline_dict.keys())

        output_lines = []
        last_idx = len(res_nums) - 1
        for idx, rsn in enumerate(res_nums):
            # the last residue has no successor; compare it with itself so
            # that no chain break is reported
            next_rsn = res_nums[idx + 1] if idx < last_idx else rsn

            newrsn = map_Dict[rsn]
            next_newrsn = map_Dict[next_rsn]

            # detect chain break from alignment
            if (next_newrsn - newrsn) > 1:
                dist = cal_dist(xyz_dict[next_rsn]["CA"], xyz_dict[rsn]["CA"])  # dist from old numbering
                stderr.write("chainbreak (from alignment) at %s-%s with dist %.3f\n" % (newrsn, next_newrsn, dist))

                # If no physical chain break is detected, this may be an
                # error in dynamic programming: look for the same residue
                # name at next_newrsn-1 and, if found, renumber the current
                # residue to next_newrsn-1.
                if dist <= 4.5:
                    stderr.write(
                        "WARNING: no gap physically detected from %s-%s; caught an error in dynamic programming; correcting now...\n"
                        % (newrsn, next_newrsn)
                    )
                    # index in seq, thus -1
                    assert full_length_seq[newrsn - 1] == full_length_seq[next_newrsn - 2], (
                        "ERROR: failing to looking for same residue name as pos:%s in pos:%s of fasta file could not fix the error in dynamic programming\n"
                        % (newrsn, next_newrsn - 1)
                    )
                    newrsn = next_newrsn - 1

            # [:-1], because the last item in the list is ''
            for line in pdbline_dict[rsn].split("\n")[:-1]:
                output_lines.append(line[0:22] + "%4s" % newrsn + line[26:] + "\n")

        output_fn = os.path.basename(truncated_pdb_fn[:-4]) + "_renum.pdb"
        with open(output_fn, "w") as output_buff:
            output_buff.write("REMARK full_length_aln %s\n" % full_length_aln)
            output_buff.write("REMARK truncated_aln   %s\n" % truncated_aln)
            output_buff.write("".join(output_lines))
            output_buff.write("TER\n")
    finally:
        # remove the intermediate file even when a step above fails
        if os.path.exists(temp_file):
            os.remove(temp_file)

    return output_fn

コード例 #6

0

ファイルを表示

ファイル: offset_alignments.py プロジェクト: raywangyr/scripts

#!/usr/local/bin/python2.7
from argparse import ArgumentParser
from sys import exit, stderr, stdout
from os.path import exists
from os import system, stat, remove
from alignment_util import alignment_parser
from seq_util import fasta_file_reader

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("-a", "--alignment_fn", required=True, type=str, help="hhpred results id")
    parser.add_argument("-f", "--fasta_fn", required=True, type=str, help="hhpred results id")
    options = parser.parse_args()

    fasta_seq = fasta_file_reader(options.fasta_fn)
    alignment_Dict = alignment_parser(options.alignment_fn)

    for aln_id in alignment_Dict.keys():
        orig_alignment_List = alignment_Dict[aln_id].split("\n")[:-1]

        orig_query_alignment = orig_alignment_List[3]
        orig_template_alignment = orig_alignment_List[4]

        idx = int(orig_query_alignment.split()[0])
        orig_query_seq = orig_query_alignment.split()[1].replace("-", "")

        if fasta_seq[:20] != orig_query_seq[:20]:
            # stderr.write("%s\n%s\n%s\n" %( aln_id, fasta_seq[:20], orig_query_seq[:20] ))
            try:
                idx = fasta_seq.index(orig_query_seq)
            except:

コード例 #7

0

ファイルを表示

ファイル: fasta_editor.py プロジェクト: raywangyr/scripts

#!/usr/bin/env python2.7
import argparse
from seq_util import fasta_file_reader
from sys import argv

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('fasta')
    parser.add_argument('-n', "--num_to_print", type=int )
    parser.add_argument('-a', "--print_all_res", action="store_true", default=False )
    parser.add_argument("--slice", nargs="+", help="1:200 210:300")
    args = parser.parse_args()

    fasta_seq = fasta_file_reader( args.fasta )
    tag = open( args.fasta, "r" ).readline()

    if not args.num_to_print and not args.print_all_res and not args.slice:
        # if no argument pass
        print tag.strip(), len(fasta_seq), "aa"
        print fasta_seq

    elif args.num_to_print:
        print fasta_seq[args.num_to_print - 1]

    elif args.slice:
        assert len( args.slice ) <= 2

        start = int(args.slice[0])

        try:    end = int(args.slice[1])
        except: end = len( fasta_seq )

コード例 #8

0

ファイルを表示

ファイル: chewing_back_gap_regions.py プロジェクト: raywangyr/scripts

                chunk_dict[ idx ].append(i)
            i+=1
            #print idx, chunk_dict[idx]
        i+=1
        idx=i
    return chunk_dict


if __name__=="__main__":
    parser = ArgumentParser()
    parser.add_argument("-p", "--pdb", required=True, help="")
    parser.add_argument("-f", "--fasta", required=True, help="")
    parser.add_argument("-c", "--chewing", default=2, type=int, help="")
    args = parser.parse_args()

    seq = ( seq_util.fasta_file_reader( args.fasta ) )
    seq_dict = {}
    for i in range(1,len(seq)+1):
        seq_dict[i]=0 
    
    xyz_dict, junk, pdbline_dict, k = pdb_util.create_xyzDict( args.pdb )
    for rsd in xyz_dict.keys():
        if seq_dict.has_key( rsd ):
            seq_dict[rsd] = 1

    '''
    # print states for debugging
    prev_state = 0
    i=1
    chunk_dict = {}
    while i <= len(seq_dict.keys()):

コード例 #9

0

ファイルを表示

ファイル: truncate_fragfiles.py プロジェクト: raywangyr/scripts

    parser.add_argument("--fragfile_fasta", required=True, help="")

    trimmed_fasta = parser.add_mutually_exclusive_group()
    trimmed_fasta.add_argument("--truncated_pdb")
    trimmed_fasta.add_argument("--truncated_fasta")

    parser.add_argument("--outfile_tag", default="trimmed", help="")
    parser.add_argument("--debug", action="store_true", help="")
    opts = parser.parse_args()

    # read into fragment as frag[pos] = fragments
    for fragfile in opts.fragfiles:
        frag_dict = frag_util.read_fragfile(fragfile)
        frag_len = frag_util.get_fraglen(frag_dict)

        fl_seq = seq_util.fasta_file_reader(opts.fragfile_fasta)
        if opts.truncated_pdb:
            tc_seq = seq_util.pdb2fasta(opts.truncated_pdb)
            alignment = alignment_util.correct_alignment_using_pdb(
                alignment_util.align_two_seqs(fl_seq, tc_seq), opts.truncated_pdb
            )
        elif opts.truncated_fasta:
            tc_seq = seq_util.fasta_file_reader(opts.truncated_fasta)
            alignment = alignment_util.align_two_seqs(fl_seq, tc_seq)
        else:
            sys.stderr.write("ERROR: you need to either give --truncated_pdb or --truncated_fasta\n")
            exit()

        chainbreak_resnums = frag_util.get_positions_to_skip_from_alignment(alignment, frag_len)

        seq_map = alignment_util.seq_mapping(alignment)