コード例 #1
0
 def test_exception_when_added_protein_has_more_than_one_stop_codon_type(
         self):
     """Adding proteins that declare different stop symbols must fail.

     One operand declares '@' as its stop symbol and the other declares
     '*'; concatenating them is expected to raise ValueError.
     """
     at_stop_alphabet = Alphabet.HasStopCodon(
         Alphabet.Gapped(IUPAC.extended_protein, "-"), "@")
     star_stop_alphabet = Alphabet.Gapped(
         Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")
     with_at_stop = Seq.Seq("MEDG-KRXR@", at_stop_alphabet)
     with_star_stop = Seq.Seq("MEDG-KRXR*", star_stop_alphabet)
     with self.assertRaises(ValueError):
         with_at_stop + with_star_stop
コード例 #2
0
 def setUp(self):
     """Create the Seq/MutableSeq fixtures used by this test case.

     Sets ``self.s`` plus the ``self.dna``, ``self.rna``, ``self.nuc`` and
     ``self.protein`` lists, and ``self.test_chars``.
     """

     def stop_inside_gap(stop_char, gap_char):
         # Extended protein alphabet: stop symbol declared first, then
         # wrapped by the gap declaration. A new instance on every call.
         with_stop = Alphabet.HasStopCodon(IUPAC.extended_protein, stop_char)
         return Alphabet.Gapped(with_stop, gap_char)

     def gap_inside_stop(base_alphabet, stop_char):
         # "-"-gapped alphabet wrapped by the stop symbol declaration.
         with_gap = Alphabet.Gapped(base_alphabet, "-")
         return Alphabet.HasStopCodon(with_gap, stop_char)

     self.s = Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)

     # Each spec is (constructor, letters, alphabet).
     dna_specs = (
         (Seq.Seq, "ATCG", IUPAC.ambiguous_dna),
         (Seq.Seq, "gtca", Alphabet.generic_dna),
         (Seq.MutableSeq, "GGTCA", Alphabet.generic_dna),
         (Seq.Seq, "CTG-CA", Alphabet.Gapped(IUPAC.unambiguous_dna, "-")),
     )
     self.dna = [build(text, alpha) for build, text, alpha in dna_specs]

     rna_specs = (
         (Seq.Seq, "AUUUCG", IUPAC.ambiguous_rna),
         (Seq.MutableSeq, "AUUCG", IUPAC.ambiguous_rna),
         (Seq.Seq, "uCAg", Alphabet.generic_rna),
         (Seq.MutableSeq, "UC-AG",
          Alphabet.Gapped(Alphabet.generic_rna, "-")),
         (Seq.Seq, "U.CAG", Alphabet.Gapped(Alphabet.generic_rna, ".")),
     )
     self.rna = [build(text, alpha) for build, text, alpha in rna_specs]

     self.nuc = [Seq.Seq("ATCG", Alphabet.generic_nucleotide)]

     protein_specs = (
         (Seq.Seq, "ATCGPK", IUPAC.protein),
         (Seq.Seq, "atcGPK", Alphabet.generic_protein),
         (Seq.Seq, "T.CGPK", Alphabet.Gapped(IUPAC.protein, ".")),
         (Seq.Seq, "T-CGPK", Alphabet.Gapped(IUPAC.protein, "-")),
         (Seq.Seq, "MEDG-KRXR*", stop_inside_gap("*", "-")),
         (Seq.MutableSeq, "ME-K-DRXR*XU", stop_inside_gap("*", "-")),
         (Seq.Seq, "MEDG-KRXR@",
          gap_inside_stop(IUPAC.extended_protein, "@")),
         (Seq.Seq, "ME-KR@", gap_inside_stop(IUPAC.protein, "@")),
         (Seq.Seq, "MEDG.KRXR@", stop_inside_gap("@", ".")),
     )
     self.protein = [
         build(text, alpha) for build, text, alpha in protein_specs
     ]

     # Mix of plain strings and alphabet-less Seq objects.
     gap_seq = Seq.Seq("-")
     stop_seq = Seq.Seq("*")
     self.test_chars = ["-", gap_seq, stop_seq, "-X@"]
コード例 #3
0
 def setUp(self):
     """Prepare operand lists mixing Seq objects with plain strings.

     Sets ``self.dna``, ``self.rna``, ``self.nuc`` and ``self.protein``;
     each list ends with a plain ``str`` operand after its Seq/MutableSeq
     entries.
     """

     def gapped_extended_with_stop():
         # New "-"-gapped extended protein alphabet with "*" as stop symbol.
         return Alphabet.Gapped(
             Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")

     dna_seqs = [
         Seq.Seq("ATCG", IUPAC.ambiguous_dna),
         Seq.Seq("gtca", Alphabet.generic_dna),
         Seq.MutableSeq("GGTCA", Alphabet.generic_dna),
         Seq.Seq("CTG-CA", Alphabet.Gapped(IUPAC.unambiguous_dna, "-")),
     ]
     self.dna = dna_seqs + ["TGGTCA"]

     rna_seqs = [
         Seq.Seq("AUUUCG", IUPAC.ambiguous_rna),
         Seq.MutableSeq("AUUCG", IUPAC.ambiguous_rna),
         Seq.Seq("uCAg", Alphabet.generic_rna),
         Seq.MutableSeq("UC-AG", Alphabet.Gapped(Alphabet.generic_rna, "-")),
         Seq.Seq("U.CAG", Alphabet.Gapped(Alphabet.generic_rna, ".")),
     ]
     self.rna = rna_seqs + ["UGCAU"]

     generic_nucleotide_seq = Seq.Seq("ATCG", Alphabet.generic_nucleotide)
     self.nuc = [generic_nucleotide_seq, "UUUTTTACG"]

     protein_seqs = [
         Seq.Seq("ATCGPK", IUPAC.protein),
         Seq.Seq("atcGPK", Alphabet.generic_protein),
         Seq.Seq("T.CGPK", Alphabet.Gapped(IUPAC.protein, ".")),
         Seq.Seq("T-CGPK", Alphabet.Gapped(IUPAC.protein, "-")),
         Seq.Seq("MEDG-KRXR*", gapped_extended_with_stop()),
         Seq.MutableSeq("ME-K-DRXR*XU", gapped_extended_with_stop()),
     ]
     self.protein = protein_seqs + ["TEDDF"]
コード例 #4
0
    def translate(self, seq, stop_symbol="*"):
        """Translate ``seq`` codon by codon using this object's table.

        Codons missing from the table's forward table are emitted as
        ``stop_symbol``; trailing letters that do not fill a whole codon
        are ignored. The result is a new Seq whose alphabet is the table's
        protein alphabet wrapped in HasStopCodon for ``stop_symbol``.
        """
        # Compare alphabet classes (not instances) so that any instance of
        # the expected class is accepted.
        assert seq.alphabet.__class__ == \
               self.table.nucleotide_alphabet.__class__, \
               "cannot translate from given alphabet (have %s, need %s)" %\
               (seq.alphabet, self.table.nucleotide_alphabet)

        nucleotides = seq.data
        codon_table = self.table
        lookup = codon_table.forward_table.get
        total = len(seq)
        last_full_codon_end = total - total % 3
        residues = [
            lookup(nucleotides[start:start + 3], stop_symbol)
            for start in range(0, last_full_codon_end, 3)
        ]

        # One stop-aware alphabet is kept per stop symbol and reused.
        try:
            protein_alphabet = self._encoded[stop_symbol]
        except KeyError:
            protein_alphabet = Alphabet.HasStopCodon(
                codon_table.protein_alphabet, stop_symbol)
            self._encoded[stop_symbol] = protein_alphabet

        return Seq.Seq("".join(residues), protein_alphabet)
コード例 #5
0
ファイル: test_seq.py プロジェクト: wl2wl2/biopython
# RNA operands: Seq and MutableSeq objects over IUPAC, generic and gapped
# ("-" and ".") RNA alphabets, followed by a plain Python string.
rna = [
    Seq.Seq("AUUUCG", IUPAC.ambiguous_rna),
    Seq.MutableSeq("AUUCG", IUPAC.ambiguous_rna),
    Seq.Seq("uCAg", Alphabet.generic_rna),
    Seq.MutableSeq("UC-AG", Alphabet.Gapped(Alphabet.generic_rna, "-")),
    Seq.Seq("U.CAG", Alphabet.Gapped(Alphabet.generic_rna, ".")), "UGCAU"
]
# Generic-nucleotide operands: one Seq plus a plain Python string.
nuc = [Seq.Seq("ATCG", Alphabet.generic_nucleotide), "UUUTTTACG"]
protein = [
    Seq.Seq("ATCGPK", IUPAC.protein),
    Seq.Seq("atcGPK", Alphabet.generic_protein),
    Seq.Seq("T.CGPK", Alphabet.Gapped(IUPAC.protein, ".")),
    Seq.Seq("T-CGPK", Alphabet.Gapped(IUPAC.protein, "-")),
    Seq.Seq(
        "MEDG-KRXR*",
        Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "*"),
                        "-")),
    Seq.MutableSeq(
        "ME-K-DRXR*XU",
        Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "*"),
                        "-")),
    Seq.Seq(
        "MEDG-KRXR@",
        Alphabet.HasStopCodon(Alphabet.Gapped(IUPAC.extended_protein, "-"),
                              "@")),
    Seq.Seq("ME-KR@",
            Alphabet.HasStopCodon(Alphabet.Gapped(IUPAC.protein, "-"), "@")),
    Seq.Seq(
        "MEDG.KRXR@",
        Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "@"),
                        ".")), "TEDDF"
コード例 #6
0
    consensus = summary.dumb_consensus(ambiguous="N")
    print consensus
    consensus = summary.gap_consensus(ambiguous="N")
    print consensus
    print
    print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                            axis_seq=consensus)
    print
    #Have a generic alphabet, without a declared gap char, so we must
    #provide the frequencies and chars to ignore explicitly.
    print summary.information_content(e_freq_table=expected,
                                      chars_to_ignore=['-'])
    print
    print "Trying a protein sequence with gaps and stops"

    alpha = Alphabet.HasStopCodon(
        Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
    a = Alignment(alpha)
    a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
    a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
    a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
    print a
    print "=" * a.get_alignment_length()

    s = SummaryInfo(a)
    c = s.dumb_consensus(ambiguous="X")
    print c
    c = s.gap_consensus(ambiguous="X")
    print c
    print
    print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
コード例 #7
0
def testseq(size=30,
            alphabet=IUPAC.unambiguous_dna,
            table=1,
            gc_target=None,
            persistent=True,
            from_start=True,
            to_stop=True,
            stop_symbol="*",
            truncate=True,
            messenger=False,
            rand_seed=0):
    """Generate and return a Seq object.

    This function will generate and return a custom Seq object
    using any IUPAC alphabet. These sequences are a faux representation
    of biological data and can be used for testing/demonstration purposes.

    Arguments:
        - size - The number of letters in the generated sequence.
        This preferably accepts an integer value and will attempt
        to convert any input to an integer.
        - alphabet - Any IUPAC alphabet can be used to generate
        the sequence. Defaults to unambiguous DNA.
        - table - An integer value that denotes the NCBI identifier
        of the codon table which will be used in generating the sequence.
        This argument prefers an integer and will attempt to convert
        any input into an integer value.
        - gc_target - The function will attempt to generate a sequence
        with a GC-content equal to the 'gc_target' argument. The argument will
        accept any integer between 0 and 100. Alternatively, if 'gc_target'
        is set to 'None', the 'gc_target' argument will be ignored when
        generating the sequence.
        - persistent - A boolean that, if set to True, will remove
        any stop codons that are generated by chance within the sequence.
        - from_start - A boolean that, if set to True, will ensure
        that the first codon in the generated sequence is a start codon.
        - to_stop - A boolean that, if set to True, will ensure
        that the last codon in the generated sequence is a stop codon.
        - stop_symbol - Single character string that denotes the presence
        of a translated stop codon.  This defaults to the asterisk, "*".
        - truncate - A boolean that, if set to True, will ensure that
        the size of any generated (non-protein) sequence is a multiple of three (3).
        - messenger - A boolean that, if set to True, will ensure that
        any RNA sequence generated will additionally have a 5'-UTR,
        3'-UTR, and a poly-A tail.
        - rand_seed - The seed used to generate the randomized sequence.
        This argument accepts any hashable data value as the seed. If the
        argument is set to None, the sequence will be re-seeded with every
        function call.

    Returns a Seq object. For protein sequences that contain 'stop_symbol',
    the alphabet of the returned Seq is wrapped in HasStopCodon.

    Hey there! We can use the 'testseq' function to quickly generate sequences.

    >>> from Scripts.testseq import testseq
    >>> my_seq = testseq()
    >>> type(my_seq)
    <class 'Bio.Seq.Seq'>

    The default size of the generated sequence is 30 letters,
    but you can change that at any time, like so:

    >>> my_seq = testseq(1500)
    >>> len(my_seq)
    1500

    You can generate your sequence using any IUPAC alphabet, like so:

    >>> from Bio.Alphabet import IUPAC
    >>> my_seq = testseq(alphabet=IUPAC.extended_protein)
    >>> my_seq.alphabet
    HasStopCodon(ExtendedIUPACProtein(), '*')

    Please notice below that this sequence starts with Methionine(M) and
    ends in an asterisk(*). That's because of the two arguments 'from_start'
    and 'to_stop' respectively. Curiously, there are no asterisks (or terminators)
    within the sequence either; this is due to the 'persistent' argument.

    >>> my_seq[0]
    'M'
    >>> my_seq[-1]
    '*'
    >>> "*" in my_seq[1:-1]
    False

    The 'from_start', 'to_stop', and 'persistent' arguments are all set to True
    by default. You can read more about what they do in the "Arguments" section
    above. It's useful to note that all three of those arguments involve the use
    of codon tables! When generating your sequence, you can set which codon table
    you'd like to use:

    >>> my_seq1 = testseq(table=5)

    Now we can translate our sequence with ease!

    >>> my_seq2 = my_seq1.translate(table=6)
    >>> my_seq2.alphabet
    IUPACProtein()

    Oops! We're missing a stop codon. We've generated a sequence using Table 5,
    but translated it using Table 6. Those tables don't share a common stop codon!
    Let's fix that...

    >>> my_seq2 = my_seq1.translate(table=5)
    >>> my_seq2.alphabet
    HasStopCodon(IUPACProtein(), '*')

    That's better!

    The 'testseq' function can also attempt to generate sequences with a
    custom GC-content. You can alter your desired GC-content by declaring
    it in the 'gc_target' argument, like so:

    >>> my_seq = testseq(gc_target=60)
    >>> from Bio.SeqUtils import GC
    >>> error = 60 - GC(my_seq)
    >>> -5 < error < 5
    False

    What happened? Note that in the above example, the sequence is at the
    default size of 30 letters. Since the sequence is generated letter by letter,
    larger sequences will have a tendency to be closer to the desired GC-content
    than smaller sequences. Let's try that again with a much larger sequence:

    >>> my_seq = testseq(10000, gc_target=60)
    >>> error = 60 - GC(my_seq)
    >>> -5 < error < 5
    True

    Much better! It's also worth noting that the 'gc_target' argument is ignored
    when generating protein sequences.

    Let's revisit the 'size' argument for a moment. You will notice that when
    generating the sequence above, I declared a 'size' of 10000. Let's confirm
    whether that was generated as requested:

    >>> len(my_seq)
    9999

    That may seem like a bug, but it isn't! All non-protein sequences are
    truncated to a multiple of three (3). This is to allow for smooth translation
    from nucleotide to amino-acid alphabets. This behavior is controlled by the
    'truncate' argument, which is set to True by default. Let's see what happens
    when we set it to False:

    >>> my_seq = testseq(10000, truncate=False)
    >>> len(my_seq)
    10000

    The result above seems cleaner, but would result in a 'BiopythonWarning' if
    you translated that sequence. So please be careful when altering the
    'truncate' argument.

    Let us briefly discuss the 'messenger' argument. You can use it to add
    messenger RNA components to a generated RNA sequence. Though, it has
    two caveats. First, the messenger argument is ignored unless an RNA alphabet
    is declared. More importantly though, all mRNA components are added to
    the sequence additionally. Let's look at an example:

    >>> my_seq = testseq(300, alphabet=IUPAC.unambiguous_rna, messenger=True)
    >>> len(my_seq) == 300
    False
    >>> len(my_seq) > 400
    True

    Notice that the sequence requested was 300 letters, however the final length of
    the sequence is much larger. Those extra letters are the mRNA components. The
    generated sequence is buried in there and it is exactly 300 letters in size!

    Lastly, let's discuss the sequence generator itself. The sequence is created
    using a pseudo-random number generator which relies on a seed to process
    and spit out random numbers.
    Let's look at an example:

    >>> a = testseq()
    >>> b = testseq()
    >>> a == b
    True

    Since sequence "a" and sequence "b" were both generated using the same seed,
    they ended up being the exact same sequence. We can change that behavior
    to shuffle the seed every time the function is called by setting the
    'rand_seed' argument to 'None' (without quotation marks), like so:

    >>> a = testseq(rand_seed=None)
    >>> b = testseq(rand_seed=None)
    >>> a == b
    False

    If you'd like to generate a specific sequence, you can set the
    'rand_seed' argument to any desired hashable data value:

    >>> a = testseq(rand_seed=0.7334)
    >>> b = testseq(rand_seed="Hello World!")
    >>> a == b
    False

    This concludes our discussion. Thanks again for using Biopython!
    Contribution by Adil Iqbal (2017).
    """
    # Set seed, gather data, clean-up logic, validate arguments.
    if rand_seed is None:
        # Draw a fresh seed from the separate anchor generator.
        rand_seed = anchor_instance.random()
    random_instance.seed(rand_seed)
    typeof = _SeqType(alphabet)
    if not typeof.rna and messenger:
        warnings.warn("Only RNA sequences can be messengers.",
                      BiopythonWarning)
        messenger = False
    if typeof.rna and messenger:
        # Messenger RNA output forces start/stop codons and no internal stops.
        from_start = True
        to_stop = True
        persistent = True
    if persistent or from_start or to_stop:
        # Only the first character of the stop symbol is used.
        stop_symbol = str(stop_symbol)[0]
        codon_set = _CodonSet(alphabet, table, stop_symbol)
    if gc_target is not None and typeof.protein:
        warnings.warn("Proteins do not have a GC-content.", BiopythonWarning)
    # The GC probability table is only built (and used) for non-protein
    # sequences with an explicit target.
    use_gc_table = gc_target is not None and not typeof.protein
    if use_gc_table:
        gc_target = int(gc_target)
        if gc_target < 0:
            warnings.warn(
                "Argument 'gc_target' must be an integer between 0 and 100. "
                "It has been set to 0.", BiopythonWarning)
            gc_target = 0
        if gc_target > 100:
            warnings.warn(
                "Argument 'gc_target' must be an integer between 0 and 100. "
                "It has been set to 100.", BiopythonWarning)
            gc_target = 100
        probability_table = _construct_probability_table(alphabet, gc_target)
    size = int(size)
    if not typeof.protein and truncate:
        size -= size % 3
    # Begin generating sequence.
    # Letters are collected in a list and joined once at the end, so that
    # appending and replacing the trailing codon do not copy the whole
    # sequence each time.
    replace_stops = persistent and not typeof.protein
    letters = []
    for _ in range(size):
        if use_gc_table:
            letters.extend(_pick_one(probability_table))
        else:
            roll = random_instance.randint(0, len(alphabet.letters) - 1)
            letters.extend(alphabet.letters[roll])
        if replace_stops and len(letters) >= 3 and len(letters) % 3 == 0:
            # Replace stop codons with non-stop codons.
            this_codon = "".join(letters[-3:])
            if this_codon in codon_set.stop:
                roll = random_instance.randint(0, len(codon_set.nonstop) - 1)
                letters[-3:] = codon_set.nonstop[roll]
    seq = "".join(letters)
    # Additional processing of generated sequence.
    # Width of one start/stop unit: a codon for nucleotides, a single
    # letter for proteins.
    unit = 1 if typeof.protein else 3
    if from_start:
        aug = None
        if typeof.dna:
            aug = "ATG"
        elif typeof.rna:
            aug = "AUG"
        elif typeof.protein:
            aug = "M"
        if aug is not None and aug in codon_set.start:
            start = aug
        else:
            # The canonical start is not in this table; pick a random one.
            roll = random_instance.randint(0, len(codon_set.start) - 1)
            start = codon_set.start[roll]
        seq = start + seq[unit:]
    if to_stop:
        roll = random_instance.randint(0, len(codon_set.stop) - 1)
        stop = codon_set.stop[roll]
        seq = seq[:-unit] + stop
    if messenger:
        seq = _add_messenger_parts(seq, size, alphabet, codon_set)
    if typeof.protein and stop_symbol in seq:
        alphabet = Alphabet.HasStopCodon(alphabet, stop_symbol)
    return Seq(seq, alphabet)
コード例 #8
0
ファイル: test_seq.py プロジェクト: joshainglis/biopython
       Seq.Seq("gtca", Alphabet.generic_dna),
       Seq.MutableSeq("GGTCA", Alphabet.generic_dna),
       Seq.Seq("CTG-CA", Alphabet.Gapped(IUPAC.unambiguous_dna, "-")),
       "TGGTCA"]
rna = [Seq.Seq("AUUUCG", IUPAC.ambiguous_rna),
       Seq.MutableSeq("AUUCG", IUPAC.ambiguous_rna),
       Seq.Seq("uCAg", Alphabet.generic_rna),
       Seq.MutableSeq("UC-AG", Alphabet.Gapped(Alphabet.generic_rna, "-")),
       Seq.Seq("U.CAG", Alphabet.Gapped(Alphabet.generic_rna, ".")),
       "UGCAU"]
nuc = [Seq.Seq("ATCG", Alphabet.generic_nucleotide),"UUUTTTACG"]
protein = [Seq.Seq("ATCGPK", IUPAC.protein),
           Seq.Seq("atcGPK", Alphabet.generic_protein),
           Seq.Seq("T.CGPK", Alphabet.Gapped(IUPAC.protein, ".")),
           Seq.Seq("T-CGPK", Alphabet.Gapped(IUPAC.protein, "-")),
           Seq.Seq("MEDG-KRXR*", Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")),
           Seq.MutableSeq("ME-K-DRXR*XU", Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "*"), "-")),
           Seq.Seq("MEDG-KRXR@", Alphabet.HasStopCodon(Alphabet.Gapped(IUPAC.extended_protein, "-"), "@")),
           Seq.Seq("ME-KR@", Alphabet.HasStopCodon(Alphabet.Gapped(IUPAC.protein, "-"), "@")),
           Seq.Seq("MEDG.KRXR@", Alphabet.Gapped(Alphabet.HasStopCodon(IUPAC.extended_protein, "@"), ".")),
           "TEDDF"]
for a in dna+rna:
    for b in nuc:
        c=a+b
        assert str(c) == str(a) + str(b)
for a in rna:
    for b in rna:
        try:
            c=a+b
            assert str(c) == str(a) + str(b)
        except ValueError as e: