Example #1
0
def build(pro_align, nucl_seqs, corr_dict=None, gap_char='-', unknown='X',
          codon_table=default_codon_table, alphabet=None,
          complete_protein=False, anchor_len=10, max_score=10):
    """Build a codon alignment from protein alignment and corresponding nucleotides.

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_align - an object returned by SeqIO.parse or SeqIO.index
       or a collection of SeqRecord.
     - alphabet   - alphabet for the returned codon alignment
     - corr_dict  - a dict that maps protein id to nucleotide id
     - complete_protein - whether the sequence begins with a start
       codon
     - frameshift - whether to apply frameshift detection

    Return a CodonAlignment object

    >>> from Bio.Alphabet import IUPAC
    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> seq1 = SeqRecord(Seq('TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro1')
    >>> seq2 = SeqRecord(Seq('TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro2')
    >>> pro1 = SeqRecord(Seq('SGTARTKLLLLLAALCAAGGALE', alphabet=IUPAC.protein),id='pro1')
    >>> pro2 = SeqRecord(Seq('SGTSRTKRLLLLAALGAAGGALE', alphabet=IUPAC.protein),id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlphabet(Standard) CodonAlignment with 2 rows and 69 columns (23 codons)
    TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGT...GAG pro1
    TCAGGGACTTCGAGAACCAAGCG-CTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGT...GAG pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from Bio.Alphabet import ProteinAlphabet
    from Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError("the first argument should be a MultipleSeqAlignment "
                        "object")
    # check the alphabet of pro_align
    for pro in pro_align:
        if not isinstance(_get_base_alphabet(pro.seq.alphabet), ProteinAlphabet):
            raise TypeError("Alphabet Error!\nThe input alignment should be "
                            "a *PROTEIN* alignemnt, found %r" % pro.seq.alphabet)
    if alphabet is None:
        alphabet = _get_codon_alphabet(codon_table, gap_char=gap_char)
    # check whether the number of seqs in pro_align and nucl_seqs is
    # the same
    pro_num = len(pro_align)
    if corr_dict is None:
        if nucl_seqs.__class__.__name__ == "generator":
            # nucl_seqs will be a tuple if read by SeqIO.parse()
            nucl_seqs = tuple(nucl_seqs)
        nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError("Higher Number of SeqRecords in Protein Alignment "
                             "({0}) than the Number of Nucleotide SeqRecords "
                             "({1}) are found!".format(pro_num, nucl_num))

        # Determine the protein sequences and nucl sequences
        # correspondence. If nucl_seqs is a list, tuple or read by
        # SeqIO.parse(), we assume the order of sequences in pro_align
        # and nucl_seqs are the same. If nucl_seqs is a dict or read by
        # SeqIO.index(), we match seqs in pro_align and those in
        # nucl_seq by their id.
        if nucl_seqs.__class__.__name__ in ("_IndexedSeqFileDict", "dict"):
            corr_method = 1
        elif nucl_seqs.__class__.__name__ in ("list", "tuple"):
            corr_method = 0
        else:
            raise TypeError("Nucl Sequences Error, Unknown type to assign "
                            "correspondence method")
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError("corr_dict should be a dict that corresponds "
                            "protein id to nucleotide id!")
        if len(corr_dict) >= pro_num:
            # read by SeqIO.parse()
            if nucl_seqs.__class__.__name__ == "generator":
                from Bio import SeqIO
                nucl_seqs = SeqIO.to_dict(nucl_seqs)
            elif nucl_seqs.__class__.__name__ in ("list", "tuple"):
                nucl_seqs = dict((i.id, i) for i in nucl_seqs)
                # nucl_seqs = {i.id: i for i in nucl_seqs}
            elif nucl_seqs.__class__.__name__ in \
                    ("_IndexedSeqFileDict", "dict"):
                pass
            else:
                raise TypeError("Nucl Sequences Error, Unknown type of "
                                "Nucleotide Records!")
            corr_method = 2
        else:
            raise RuntimeError("Number of items in corr_dict ({0}) is less "
                               "than number of protein records "
                               "({1})".format(len(corr_dict), pro_num))

    # set up pro-nucl correspondence based on corr_method
    # corr_method = 0, consecutive pairing
    if corr_method == 0:
        pro_nucl_pair = zip(pro_align, nucl_seqs)
    # corr_method = 1, keyword pairing
    elif corr_method == 1:
        nucl_id = set(nucl_seqs.keys())
        pro_id = set(i.id for i in pro_align)
        # check if there is pro_id that does not have a nucleotide match
        if pro_id - nucl_id:
            diff = pro_id - nucl_id
            raise ValueError("Protein Record {0} cannot find a nucleotide "
                             "sequence match, please check the "
                             "id".format(', '.join(diff)))
        else:
            pro_nucl_pair = []
            for pro_rec in pro_align:
                pro_nucl_pair.append((pro_rec, nucl_seqs[pro_rec.id]))
    # corr_method = 2, dict pairing
    elif corr_method == 2:
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                print("Protein record (%s) is not in corr_dict!" % pro_rec.id)
                exit(1)
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    codon_aln = []
    shift = False
    for pair in pro_nucl_pair:
        # Beware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(pair[0], pair[1], gap_char=gap_char,
                                codon_table=codon_table,
                                complete_protein=complete_protein,
                                anchor_len=anchor_len)
        if not corr_span:
            raise ValueError("Protein Record {0} and Nucleotide Record {1} do"
                             " not match!".format(pair[0].id, pair[1].id))
        else:
            codon_rec = _get_codon_rec(pair[0], pair[1], corr_span,
                                       alphabet=alphabet,
                                       complete_protein=False,
                                       codon_table=codon_table,
                                       max_score=max_score)
            codon_aln.append(codon_rec)
            if corr_span[1] == 2:
                shift = True
    if shift:
        return CodonAlignment(_align_shift_recs(codon_aln), alphabet=alphabet)
    else:
        return CodonAlignment(codon_aln, alphabet=alphabet)
Example #2
0
def build(pro_align,
          nucl_seqs,
          corr_dict=None,
          gap_char='-',
          unknown='X',
          codon_table=default_codon_table,
          alphabet=None,
          complete_protein=False,
          anchor_len=10,
          max_score=10):
    """Build a codon alignment from a protein alignment and
    corresponding nucleotide sequences

    Arguments:
        - pro_align  - a protein MultipleSeqAlignment object
        - nucl_align - an object returned by SeqIO.parse or SeqIO.index
          or a collection of SeqRecord.
        - alphabet   - alphabet for the returned codon alignment
        - corr_dict  - a dict that maps protein id to nucleotide id
        - complete_protein - whether the sequence begins with a start
          codon
        - frameshift - whether to apply frameshift detection

    Return a CodonAlignment object

    >>> from Bio.Alphabet import IUPAC
    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> seq1 = SeqRecord(Seq('TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro1')
    >>> seq2 = SeqRecord(Seq('TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
    ...     alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro2')
    >>> pro1 = SeqRecord(Seq('SGTARTKLLLLLAALCAAGGALE', alphabet=IUPAC.protein),id='pro1')
    >>> pro2 = SeqRecord(Seq('SGTSRTKRLLLLAALGAAGGALE', alphabet=IUPAC.protein),id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlphabet(Standard) CodonAlignment with 2 rows and 69 columns (23 codons)
    TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGT...GAG pro1
    TCAGGGACTTCGAGAACCAAGCG-CTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGT...GAG pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from Bio.Alphabet import ProteinAlphabet
    from Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError("the first argument should be a MultipleSeqAlignment "
                        "object")
    # check the alphabet of pro_align
    for pro in pro_align:
        if not isinstance(pro.seq.alphabet, ProteinAlphabet):
            raise TypeError("Alphabet Error!\nThe input alignment should be "
                            "a *PROTEIN* alignment")
    if alphabet is None:
        alphabet = _get_codon_alphabet(codon_table, gap_char=gap_char)
    # check whether the number of seqs in pro_align and nucl_seqs is
    # the same
    pro_num = len(pro_align)
    if corr_dict is None:
        if nucl_seqs.__class__.__name__ == "generator":
            # nucl_seqs will be a tuple if read by SeqIO.parse()
            nucl_seqs = tuple(nucl_seqs)
        nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError(
                "Higher Number of SeqRecords in Protein Alignment "
                "({0}) than the Number of Nucleotide SeqRecords "
                "({1}) are found!".format(pro_num, nucl_num))

        # Determine the protein sequences and nucl sequences
        # correspondence. If nucl_seqs is a list, tuple or read by
        # SeqIO.parse(), we assume the order of sequences in pro_align
        # and nucl_seqs are the same. If nucl_seqs is a dict or read by
        # SeqIO.index(), we match seqs in pro_align and those in
        # nucl_seq by their id.
        if nucl_seqs.__class__.__name__ in ("_IndexedSeqFileDict", "dict"):
            corr_method = 1
        elif nucl_seqs.__class__.__name__ in ("list", "tuple"):
            corr_method = 0
        else:
            raise TypeError("Nucl Sequences Error, Unknown type to assign "
                            "correspondence method")
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError("corr_dict should be a dict that corresponds "
                            "protein id to nucleotide id!")
        if len(corr_dict) >= pro_num:
            # read by SeqIO.parse()
            if nucl_seqs.__class__.__name__ == "generator":
                from Bio import SeqIO
                nucl_seqs = SeqIO.to_dict(nucl_seqs)
            elif nucl_seqs.__class__.__name__ in ("list", "tuple"):
                nucl_seqs = dict((i.id, i) for i in nucl_seqs)
                # nucl_seqs = {i.id: i for i in nucl_seqs}
            elif nucl_seqs.__class__.__name__ in \
                    ("_IndexedSeqFileDict", "dict"):
                pass
            else:
                raise TypeError("Nucl Sequences Error, Unknown type of "
                                "Nucleotide Records!")
            corr_method = 2
        else:
            raise RuntimeError("Number of items in corr_dict ({0}) is less "
                               "than number of protein records "
                               "({1})".format(len(corr_dict), pro_num))

    # set up pro-nucl correspondence based on corr_method
    # corr_method = 0, consecutive pairing
    if corr_method == 0:
        pro_nucl_pair = izip(pro_align, nucl_seqs)
    # corr_method = 1, keyword pairing
    elif corr_method == 1:
        nucl_id = set(nucl_seqs.keys())
        pro_id = set([i.id for i in pro_align])
        # check if there is pro_id that does not have a nucleotide match
        if pro_id - nucl_id:
            diff = pro_id - nucl_id
            raise ValueError("Protein Record {0} cannot find a nucleotide "
                             "sequence match, please check the "
                             "id".format(', '.join(diff)))
        else:
            pro_nucl_pair = []
            for pro_rec in pro_align:
                pro_nucl_pair.append((pro_rec, nucl_seqs[pro_rec.id]))
    # corr_method = 2, dict pairing
    elif corr_method == 2:
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                print("Protein record (%s) is not in corr_dict!" % pro_rec.id)
                exit(1)
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    codon_aln = []
    shift = False
    for pair in pro_nucl_pair:
        # Beware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(pair[0],
                                pair[1],
                                gap_char=gap_char,
                                codon_table=codon_table,
                                complete_protein=complete_protein,
                                anchor_len=anchor_len)
        if not corr_span:
            raise ValueError("Protein Record {0} and Nucleotide Record {1} do"
                             " not match!".format((pair[0].id, pair[1].id)))
        else:
            codon_rec = _get_codon_rec(pair[0],
                                       pair[1],
                                       corr_span,
                                       alphabet=alphabet,
                                       complete_protein=False,
                                       codon_table=codon_table,
                                       max_score=max_score)
            codon_aln.append(codon_rec)
            if corr_span[1] == 2:
                shift = True
    if shift:
        return CodonAlignment(_align_shift_recs(codon_aln), alphabet=alphabet)
    else:
        return CodonAlignment(codon_aln, alphabet=alphabet)
Example #3
0
def build(
    pro_align,
    nucl_seqs,
    corr_dict=None,
    gap_char="-",
    unknown="X",
    codon_table=default_codon_table,
    alphabet=None,
    complete_protein=False,
    anchor_len=10,
    max_score=10,
):
    """Build a codon alignment from protein alignment and corresponding nucleotides.

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_seqs - an object returned by SeqIO.parse or SeqIO.index
       or a collection of SeqRecord.
     - alphabet   - alphabet for the returned codon alignment
     - corr_dict  - a dict that maps protein id to nucleotide id
     - complete_protein - whether the sequence begins with a start
       codon

    Return a CodonAlignment object.

    The example below answers this Biostars question: https://www.biostars.org/p/89741/

    >>> from Bio.Alphabet import generic_dna, generic_protein
    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> from Bio.codonalign import build
    >>> seq1 = SeqRecord(Seq('ATGTCTCGT', alphabet=generic_dna), id='pro1')
    >>> seq2 = SeqRecord(Seq('ATGCGT', alphabet=generic_dna), id='pro2')
    >>> pro1 = SeqRecord(Seq('MSR', alphabet=generic_protein), id='pro1')
    >>> pro2 = SeqRecord(Seq('M-R', alphabet=generic_protein), id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlphabet(Standard) CodonAlignment with 2 rows and 9 columns (3 codons)
    ATGTCTCGT pro1
    ATG---CGT pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from Bio.Alphabet import ProteinAlphabet
    from Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError(
            "the first argument should be a MultipleSeqAlignment object")
    # check the alphabet of pro_align
    for pro in pro_align:
        if not isinstance(_get_base_alphabet(pro.seq.alphabet),
                          ProteinAlphabet):
            raise TypeError("Alphabet Error!\nThe input alignment should be "
                            "a *PROTEIN* alignemnt, found %r" %
                            pro.seq.alphabet)
    if alphabet is None:
        alphabet = _get_codon_alphabet(codon_table, gap_char=gap_char)
    # check whether the number of seqs in pro_align and nucl_seqs is
    # the same
    pro_num = len(pro_align)
    if corr_dict is None:
        if nucl_seqs.__class__.__name__ == "generator":
            # nucl_seqs will be a tuple if read by SeqIO.parse()
            nucl_seqs = tuple(nucl_seqs)
        nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError(
                "Higher Number of SeqRecords in Protein Alignment "
                "({0}) than the Number of Nucleotide SeqRecords "
                "({1}) are found!".format(pro_num, nucl_num))

        # Determine the protein sequences and nucl sequences
        # correspondence. If nucl_seqs is a list, tuple or read by
        # SeqIO.parse(), we assume the order of sequences in pro_align
        # and nucl_seqs are the same. If nucl_seqs is a dict or read by
        # SeqIO.index(), we match seqs in pro_align and those in
        # nucl_seq by their id.
        if nucl_seqs.__class__.__name__ in ("_IndexedSeqFileDict", "dict"):
            corr_method = 1
        elif nucl_seqs.__class__.__name__ in ("list", "tuple"):
            corr_method = 0
        else:
            raise TypeError(
                "Nucl Sequences Error, Unknown type to assign correspondence method"
            )
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError("corr_dict should be a dict that corresponds "
                            "protein id to nucleotide id!")
        if len(corr_dict) >= pro_num:
            # read by SeqIO.parse()
            if nucl_seqs.__class__.__name__ == "generator":
                from Bio import SeqIO

                nucl_seqs = SeqIO.to_dict(nucl_seqs)
            elif nucl_seqs.__class__.__name__ in ("list", "tuple"):
                nucl_seqs = {i.id: i for i in nucl_seqs}
            elif nucl_seqs.__class__.__name__ in ("_IndexedSeqFileDict",
                                                  "dict"):
                pass
            else:
                raise TypeError(
                    "Nucl Sequences Error, Unknown type of Nucleotide Records!"
                )
            corr_method = 2
        else:
            raise RuntimeError(
                "Number of items in corr_dict ({0}) is less than number of protein "
                "records ({1})".format(len(corr_dict), pro_num))

    # set up pro-nucl correspondence based on corr_method
    # corr_method = 0, consecutive pairing
    if corr_method == 0:
        pro_nucl_pair = zip(pro_align, nucl_seqs)
    # corr_method = 1, keyword pairing
    elif corr_method == 1:
        nucl_id = set(nucl_seqs.keys())
        pro_id = {i.id for i in pro_align}
        # check if there is pro_id that does not have a nucleotide match
        if pro_id - nucl_id:
            diff = pro_id - nucl_id
            raise ValueError("Protein Record {0} cannot find a nucleotide "
                             "sequence match, please check the "
                             "id".format(", ".join(diff)))
        else:
            pro_nucl_pair = []
            for pro_rec in pro_align:
                pro_nucl_pair.append((pro_rec, nucl_seqs[pro_rec.id]))
    # corr_method = 2, dict pairing
    elif corr_method == 2:
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                print("Protein record (%s) is not in corr_dict!" % pro_rec.id)
                exit(1)
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    codon_aln = []
    shift = False
    for pair in pro_nucl_pair:
        # Beware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(
            pair[0],
            pair[1],
            gap_char=gap_char,
            codon_table=codon_table,
            complete_protein=complete_protein,
            anchor_len=anchor_len,
        )
        if not corr_span:
            raise ValueError("Protein Record {0} and Nucleotide Record {1} do"
                             " not match!".format(pair[0].id, pair[1].id))
        else:
            codon_rec = _get_codon_rec(
                pair[0],
                pair[1],
                corr_span,
                alphabet=alphabet,
                complete_protein=False,
                codon_table=codon_table,
                max_score=max_score,
            )
            codon_aln.append(codon_rec)
            if corr_span[1] == 2:
                shift = True
    if shift:
        return CodonAlignment(_align_shift_recs(codon_aln), alphabet=alphabet)
    else:
        return CodonAlignment(codon_aln, alphabet=alphabet)