Esempio n. 1
0
def match_numbering(ref_pdb, renum_pdb):
    """Renumber the residues of renum_pdb so that they match ref_pdb.

    The sequence of renum_pdb must be identical to the sequence of ref_pdb
    or be one contiguous stretch of it (an offset in residue numbering is
    allowed, a gapped alignment is not).

    This is somewhat redundant with renumber_pdb(); ideally the sequence
    mapping (producing an index-mapping dictionary) and the renumbering
    would be two separate functions. It was written as a quick standalone
    routine so that the working renumber_pdb() is left untouched.

    Args:
        ref_pdb: path of the pdb file whose residue numbering is the reference.
        renum_pdb: path of the single-chain pdb file to renumber.

    Returns:
        The name of the written pdb file, as chosen by
        get_uniq_outpdbname(renum_pdb, "_matched").

    Raises:
        ValueError: if the sequence of renum_pdb does not occur in ref_pdb.
        AssertionError: if renum_pdb contains more than one chain.
    """
    ref_fasta = seq_util.pdb2fasta(ref_pdb)
    renum_fasta = seq_util.pdb2fasta(renum_pdb)

    if ref_fasta == renum_fasta:
        idx2refpdbnum = idx_to_pdbnum(ref_pdb)
    else:
        # str.find() is used instead of testing the truthiness of
        # str.index(): an offset of 0 (renum_pdb is a prefix of ref_pdb) is
        # a perfectly valid match and must not be treated as "no match".
        offset = ref_fasta.find(renum_fasta)
        if offset < 0:
            raise ValueError(
                "the sequence of %s is not a contiguous part of the sequence of %s"
                % (renum_pdb, ref_pdb)
            )

        # With a zero offset use the same call as for identical sequences.
        if offset:
            idx2refpdbnum = idx_to_pdbnum(ref_pdb, offset)
        else:
            idx2refpdbnum = idx_to_pdbnum(ref_pdb)

        # Pad both sides so the two printed rows line up column by column,
        # wherever the truncated sequence sits inside the reference.
        trailing = len(ref_fasta) - offset - len(renum_fasta)
        pdb_as_ref = ref_fasta
        pdb_to_renum = "-" * offset + renum_fasta + "-" * trailing

        print("pdb_as_ref:   %s" % pdb_as_ref)
        print("pdb_to_renum: %s" % pdb_to_renum)

    # Second item returned by create_xyzDict_bychain: chain -> {resnum: pdb lines}
    pdbline_dict = create_xyzDict_bychain(renum_pdb)[1]
    chains = list(pdbline_dict)
    assert len(chains) == 1, "this script does not deal with pdbs containing multiple chains"
    pdbline_dict = pdbline_dict[chains[0]]
    res_nums = sorted(pdbline_dict)
    print(res_nums)

    renumbered_lines = []
    for idx, rsn in enumerate(res_nums):
        newrsn = idx2refpdbnum[idx]
        # [:-1], because the last item of the split is ''
        for line in pdbline_dict[rsn].split("\n")[:-1]:
            # characters 22-25 hold the residue number that gets replaced
            renumbered_lines.append(line[0:22] + "%4s" % newrsn + line[26:] + "\n")

    output_fn = get_uniq_outpdbname(renum_pdb, "_matched")

    with open(output_fn, "w") as output_buff:
        output_buff.write("REMARK numberings are matched to %s\n" % ref_pdb)
        output_buff.write("".join(renumbered_lines))
        output_buff.write("TER\n")

    return output_fn
Esempio n. 2
0
def align_and_renumber_pdb(fulllength_fasta, truncated_pdbfile, ignore_check=False):
    """Align a truncated pdb to its full-length sequence and renumber it.

    This function is going to make renumber_pdb() obsolete.

    Args:
        fulllength_fasta: path of the fasta file with the full-length sequence.
        truncated_pdbfile: path of the single-chain pdb file to renumber.
        ignore_check: when true, skip
            alignment_util.correct_alignment_using_pdb() and only write the
            intermediate "temp.pdb" through pdb_idx1().

    Returns:
        The renumbered pdb as one string: two REMARK lines holding the
        alignment, the renumbered coordinate lines and a closing "TER" line.

    Raises:
        AssertionError: if the pdb contains more than one chain.
    """
    temp_pdb = "temp.pdb"

    # make alignment
    fl_seq = seq_util.fasta_file_reader(fulllength_fasta)
    tc_seq = seq_util.pdb2fasta(truncated_pdbfile)
    alignment = alignment_util.align_two_seqs(fl_seq, tc_seq)

    try:
        if ignore_check:
            # The intermediate pdb is needed by the steps below.
            pdb_idx1(truncated_pdbfile, temp_pdb)
        else:
            # NOTE(review): per the original author's comment,
            # correct_alignment_using_pdb itself runs pdb_idx1 and leaves
            # "temp.pdb" behind -- confirm against alignment_util.
            alignment = alignment_util.correct_alignment_using_pdb(alignment, truncated_pdbfile, False)

        seq_map = alignment_util.seq_mapping(alignment)

        # Only the per-residue pdb lines (second returned item) are needed.
        pdbline_dict = create_xyzDict_bychain(temp_pdb)[1]
        chains = list(pdbline_dict)
        assert len(chains) == 1, (
            "this script does not deal with pdbs containing multiple chains (%s)" % chains
        )
        pdbline_dict = pdbline_dict[chains[0]]

        out_pdblines = [
            "REMARK full_length_aln %s\n" % alignment[0],
            "REMARK truncated_aln   %s\n" % alignment[1],
        ]
        for rsn in sorted(pdbline_dict):
            newrsn = seq_map[rsn]
            # [:-1], because the last item of the split is ''
            for line in pdbline_dict[rsn].split("\n")[:-1]:
                # characters 22-25 hold the residue number that gets replaced
                out_pdblines.append(line[0:22] + "%4s" % newrsn + line[26:] + "\n")
        out_pdblines.append("TER\n")
    finally:
        # Clean up the intermediate file even when a step above fails.
        if os.path.exists(temp_pdb):
            os.remove(temp_pdb)

    return "".join(out_pdblines)
Esempio n. 3
0
def _alignment_index_map(truncated_aln):
    """Map each 1-based truncated-sequence position to its 1-based alignment column."""
    index_map = {}
    truncated_idx = 1
    for column, symbol in enumerate(truncated_aln.strip(), 1):
        if symbol == "-":
            continue
        index_map[truncated_idx] = column
        truncated_idx += 1
    return index_map


def renumber_pdb(full_length_fasta_fn, truncated_pdb_fn, alignments=None):
    """Renumber a truncated pdb according to its alignment to the full-length sequence.

    Future plan: split this into (1) a function returning a mapping
    dictionary built from the alignment and (2) a function renumbering a
    pdb from such a dictionary.

    140914: works around an error introduced by the dynamic programming
    when the residue at the beginning of a gap has the same name as the
    residue at the end of the gap:
        KTTTTTKG <- query sequence
        K------G <- mistaken alignment
        ------KG <- correct alignment

    Args:
        full_length_fasta_fn: path of the fasta file with the full-length
            sequence (only read when alignments is not given).
        truncated_pdb_fn: path of the single-chain pdb file to renumber.
        alignments: optional pre-computed alignment, indexable as
            [full_length_aln, truncated_aln]. The full-length row must not
            contain any "-". When omitted, the alignment is computed with
            Biopython's pairwise2.align.globalms (match 5, mismatch -5,
            gap open -15, gap extend -0.5, end gaps not penalized).

    Returns:
        The name of the written file,
        basename(truncated_pdb_fn[:-4]) + "_renum.pdb" in the working directory.

    Raises:
        SystemExit: if a sequence is empty or no alignment could be made.
        AssertionError: if the alignment rows differ in length, the
            full-length row contains gaps, the external renumbering script
            is missing, the pdb has several chains, or a mistaken
            alignment cannot be corrected.
    """
    import os
    import sys

    if not alignments:
        # get sequences
        import seq_util

        truncated_seq = seq_util.pdb2fasta(truncated_pdb_fn)
        full_length_seq = seq_util.fasta_file_reader(full_length_fasta_fn)
        if not truncated_seq.strip() or not full_length_seq.strip():
            sys.stderr.write(
                "ERROR: empty sequence read from %s or %s\n" % (truncated_pdb_fn, full_length_fasta_fn)
            )
            sys.exit(1)

        # get alignment from biopython
        from Bio import pairwise2

        align_results = pairwise2.align.globalms(
            full_length_seq, truncated_seq, 5, -5, -15, -0.5, penalize_end_gaps=False
        )
        if not align_results:
            print(full_length_seq)
            print("> %s" % truncated_seq)
            sys.stderr.write("ERROR: no alignment found between the two sequences\n")
            sys.exit(1)
        alignments = align_results[0]
    else:
        print("alignments are given by the argument")

    # create a map, based on the alignments
    assert alignments
    full_length_aln = alignments[0]
    truncated_aln = alignments[1]
    print("full_length_aln  %s" % full_length_aln)
    print("truncated_aln    %s" % truncated_aln)
    print("")

    assert len(full_length_aln) == len(truncated_aln)
    assert "-" not in full_length_aln, "this is a really dumb script, doesn't renum pdbs with aa unmatched"

    # The full-length row has no gaps, so it can serve as the full-length
    # sequence; this keeps the dynamic-programming correction below working
    # when the alignment was passed in and no fasta file was read.
    full_length_seq = full_length_aln

    # sequence mapping: position in the truncated sequence -> full-length residue number
    map_Dict = _alignment_index_map(truncated_aln)

    # renumber the pdb based on the alignment mapping
    # Because the mapping assumes truncated_pdb_fn starts with residue
    # number 1, the input pdb is first renumbered to start at one.
    # should probably get rid of this dependency in the future
    temp_file = os.path.basename(truncated_pdb_fn.split(".pdb")[0]) + ".temp.pdb"
    renumberPDBs_script = "/net/em-stor4/Volumes/data/wangyr/scripts/pdb_utils/renumberPDBs.pl"
    assert os.path.exists(renumberPDBs_script), "Error: %s doesn't exist" % renumberPDBs_script

    try:
        # NOTE: the command is interpreted by the shell, so truncated_pdb_fn
        # must not contain spaces or shell metacharacters.
        os.system("%s -pdbfile %s -res1 1 > %s" % (renumberPDBs_script, truncated_pdb_fn, temp_file))

        xyz_dict, pdbline_dict, _ = create_xyzDict_bychain(temp_file)
        chains = list(pdbline_dict)
        assert len(chains) == 1, (
            "this script does not deal with pdbs containing multiple chains (%s)" % chains
        )
        chain = chains[0]

        xyz_dict = xyz_dict[chain]
        pdbline_dict = pdbline_dict[chain]
        res_nums = sorted(pdbline_dict)

        renumbered_lines = []
        last_idx = len(res_nums) - 1
        for idx, rsn in enumerate(res_nums):
            # The last residue has no successor; comparing it with itself
            # can never report a chain break.
            next_rsn = res_nums[idx + 1] if idx < last_idx else rsn

            newrsn = map_Dict[rsn]
            next_newrsn = map_Dict[next_rsn]

            # detect chain break from alignment
            if (next_newrsn - newrsn) > 1:
                dist = cal_dist(xyz_dict[next_rsn]["CA"], xyz_dict[rsn]["CA"])  # dist from old numbering
                sys.stderr.write(
                    "chainbreak (from alignment) at %s-%s with dist %.3f\n" % (newrsn, next_newrsn, dist)
                )

                # If the CA atoms are close there is no physical chain
                # break, so the gap is probably an error of the dynamic
                # programming: when the residue right before next_newrsn
                # has the same name, move this residue there.
                if dist <= 4.5:
                    sys.stderr.write(
                        "WARNING: no gap physically detected from %s-%s; caught an error in dynamic programming; correcting now...\n"
                        % (newrsn, next_newrsn)
                    )
                    # index in seq, thus -1
                    assert full_length_seq[newrsn - 1] == full_length_seq[next_newrsn - 2], (
                        "ERROR: failing to looking for same residue name as pos:%s in pos:%s of fasta file could not fix the error in dynamic programming\n"
                        % (newrsn, next_newrsn - 1)
                    )
                    newrsn = next_newrsn - 1

            # [:-1], because the last item of the split is ''
            for line in pdbline_dict[rsn].split("\n")[:-1]:
                # characters 22-25 hold the residue number that gets replaced
                renumbered_lines.append(line[0:22] + "%4s" % newrsn + line[26:] + "\n")
    finally:
        # Clean up the intermediate file even when a step above fails.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    output_fn = os.path.basename(truncated_pdb_fn[:-4]) + "_renum.pdb"
    with open(output_fn, "w") as output_buff:
        output_buff.write("REMARK full_length_aln %s\n" % full_length_aln)
        output_buff.write("REMARK truncated_aln   %s\n" % truncated_aln)
        output_buff.write("".join(renumbered_lines))
        output_buff.write("TER\n")

    return output_fn
Esempio n. 4
0
    trimmed_fasta = parser.add_mutually_exclusive_group()
    trimmed_fasta.add_argument("--truncated_pdb")
    trimmed_fasta.add_argument("--truncated_fasta")

    parser.add_argument("--outfile_tag", default="trimmed", help="")
    parser.add_argument("--debug", action="store_true", help="")
    opts = parser.parse_args()

    # read into fragment as frag[pos] = fragments
    for fragfile in opts.fragfiles:
        frag_dict = frag_util.read_fragfile(fragfile)
        frag_len = frag_util.get_fraglen(frag_dict)

        fl_seq = seq_util.fasta_file_reader(opts.fragfile_fasta)
        if opts.truncated_pdb:
            tc_seq = seq_util.pdb2fasta(opts.truncated_pdb)
            alignment = alignment_util.correct_alignment_using_pdb(
                alignment_util.align_two_seqs(fl_seq, tc_seq), opts.truncated_pdb
            )
        elif opts.truncated_fasta:
            tc_seq = seq_util.fasta_file_reader(opts.truncated_fasta)
            alignment = alignment_util.align_two_seqs(fl_seq, tc_seq)
        else:
            sys.stderr.write("ERROR: you need to either give --truncated_pdb or --truncated_fasta\n")
            exit()

        chainbreak_resnums = frag_util.get_positions_to_skip_from_alignment(alignment, frag_len)

        seq_map = alignment_util.seq_mapping(alignment)
        residues = sorted(seq_map.keys())