Esempio n. 1
0
def _retrieve_seq(adaptor, primary_id):
    #The database schema ensures there will be only one matching
    #row in the table.

    #If an UnknownSeq was recorded, seq will be NULL,
    #but length will be populated.  This means length(seq)
    #will return None.
    seqs = adaptor.execute_and_fetchall(
        "SELECT alphabet, length, length(seq) FROM biosequence"
        " WHERE bioentry_id = %s", (primary_id, ))
    if not seqs:
        return
    assert len(seqs) == 1
    moltype, given_length, length = seqs[0]

    try:
        length = int(length)
        given_length = int(length)
        assert length == given_length
        have_seq = True
    except TypeError:
        assert length is None
        seqs = adaptor.execute_and_fetchall(
            "SELECT alphabet, length, seq FROM biosequence"
            " WHERE bioentry_id = %s", (primary_id, ))
        assert len(seqs) == 1
        moltype, given_length, seq = seqs[0]
        assert seq is None or seq == ""
        length = int(given_length)
        have_seq = False
        del seq
    del given_length

    moltype = moltype.lower()  # might be upper case in database
    #We have no way of knowing if these sequences will use IUPAC
    #alphabets, and we certainly can't assume they are unambiguous!
    if moltype == "dna":
        alphabet = Alphabet.generic_dna
    elif moltype == "rna":
        alphabet = Alphabet.generic_rna
    elif moltype == "protein":
        alphabet = Alphabet.generic_protein
    elif moltype == "unknown":
        #This is used in BioSQL/Loader.py and would happen
        #for any generic or nucleotide alphabets.
        alphabet = Alphabet.single_letter_alphabet
    else:
        raise AssertionError("Unknown moltype: %s" % moltype)

    if have_seq:
        return DBSeq(primary_id, adaptor, alphabet, 0, int(length))
    else:
        return UnknownSeq(length, alphabet)
Esempio n. 2
0
 def setUp(self):
     """Build a circular 21-letter record and two features sharing one location.

     The shared location joins four forward-strand, three-letter parts
     listed in the order 12-15, 18-21, 0-3, 6-9.
     """
     parts = [
         FeatureLocation(begin, begin + 3, strand=1)
         for begin in (12, 18, 0, 6)
     ]
     joined = CompoundLocation(parts, operator="join")

     record = SeqRecord(UnknownSeq(21))
     record.annotations["topology"] = "circular"

     self.seqrec = record
     # Both features reuse the very same location object.
     self.seqcds = SeqFeature(joined, type="CDS")
     self.seqgene = SeqFeature(joined, type="gene")
Esempio n. 3
0
 def __init__(self, biopython_object=None):
     """Wrap a Biopython sequence in a private SeqRecord and index its features.

     Arguments:
      - biopython_object - None (an empty nucleotide record is created),
        a Seq (deep-copied into a new record with empty id, name and
        description) or a SeqRecord (deep-copied as is).
     """
     # First we define our underlying SeqRecord object.
     # Identity test on purpose: "== None" would be routed through the
     # argument's own __eq__, which for sequence objects compares content.
     if biopython_object is None:
         self._record = SeqRecord(
             seq=UnknownSeq(0, alphabet=NucleotideAlphabet()),
             id='', name='', description='')
     elif isinstance(biopython_object, Seq):
         self._record = SeqRecord(
             seq=copy.deepcopy(biopython_object),
             id='', name='', description='')
     elif isinstance(biopython_object, SeqRecord):
         self._record = copy.deepcopy(biopython_object)
     # NOTE(review): any other argument type falls through without
     # assigning self._record here.

     # Map each feature type to the positions of its features in
     # self._record.features, for faster lookup.
     self._features = {}
     for (i, feature) in enumerate(self._record.features):
         self._features.setdefault(feature.type, []).append(i)
Esempio n. 4
0
    def test_join_UnknownSeq(self):
        """Check UnknownSeq.join puts the spacer between the joined items."""
        gap5 = UnknownSeq(5, character="-")
        gap0 = UnknownSeq(0, character="-")

        # Two gap-only sequences joined by a gap spacer.
        two_gaps = [
            UnknownSeq(5, character="-"),
            UnknownSeq(5, character="-"),
        ]
        self.assertEqual("-" * 15, gap5.join(two_gaps))

        # A Seq followed by a gap-only sequence.
        seq_then_gap = [
            Seq("NNNNN"),
            UnknownSeq(5, character="-"),
        ]
        self.assertEqual("N" * 5 + "-" * 10, gap5.join(seq_then_gap))

        plain = ["ATG", "ATG", "ATG", "ATG"]
        mixed = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Strings with the empty spacer.
        joined_plain = gap0.join(plain)
        self.assertEqual(str(joined_plain), "".join(plain))

        for gap in (gap5, gap0):
            separator = str(gap)
            joined_mixed = gap.join(mixed)
            self.assertEqual(str(joined_mixed), separator.join(plain))
            # A single sequence argument should have its letters joined.
            for item in plain + mixed:
                self.assertEqual(
                    separator.join(str(item)), str(gap.join(item)))
def _retrieve_seq(adaptor, primary_id):
    # The database schema ensures there will be only one matching
    # row in the table.

    # If an UnknownSeq was recorded, seq will be NULL,
    # but length will be populated.  This means length(seq)
    # will return None.
    seqs = adaptor.execute_and_fetchall(
        "SELECT alphabet, length, length(seq) FROM biosequence WHERE bioentry_id = %s",
        (primary_id, ),
    )
    if not seqs:
        return
    assert len(seqs) == 1
    moltype, given_length, length = seqs[0]

    try:
        length = int(length)
        given_length = int(length)
        assert length == given_length
        have_seq = True
    except TypeError:
        assert length is None
        seqs = adaptor.execute_and_fetchall(
            "SELECT alphabet, length, seq FROM biosequence WHERE bioentry_id = %s",
            (primary_id, ),
        )
        assert len(seqs) == 1
        moltype, given_length, seq = seqs[0]
        assert seq is None or seq == ""
        length = int(given_length)
        have_seq = False
        del seq
    del given_length

    if have_seq:
        return DBSeq(primary_id,
                     adaptor,
                     alphabet=None,
                     start=0,
                     length=int(length))
    else:
        if moltype in ("dna", "rna"):
            character = "N"
        elif moltype == "protein":
            character = "X"
        else:
            character = "?"
        return UnknownSeq(length, character=character)
Esempio n. 6
0
    def test_join_UnknownSeq(self):
        """Check UnknownSeq.join keeps the spacer text and the spacer alphabet."""
        # Only expect it to take Seq objects and/or strings in an iterable!

        empty_gap = UnknownSeq(0, character="-", alphabet=generic_dna)
        gaps = [empty_gap]
        for alpha in (generic_dna, generic_nucleotide):
            gaps.append(UnknownSeq(5, character="-", alphabet=alpha))

        plain = ["ATG", "ATG", "ATG", "ATG"]
        mixed = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # Strings with the empty spacer.
        joined_plain = empty_gap.join(plain)
        self.assertEqual(str(joined_plain), "".join(plain))
        self.assertEqual(joined_plain.alphabet, empty_gap.alphabet)

        # Strings mixed with a Seq, for every spacer.
        for gap in gaps:
            joined_mixed = gap.join(mixed)
            self.assertEqual(str(joined_mixed), str(gap).join(plain))
            self.assertEqual(joined_mixed.alphabet, gap.alphabet)
Esempio n. 7
0
def concatenate(infiles, outfile):
    """Concatenate FASTA alignments column-wise and write the result as FASTA.

    Arguments:
     - infiles - iterable of paths, each read as one FASTA alignment
     - outfile - destination passed to ``AlignIO.write``

    Sequences are matched between alignments by record id.  An id that is
    absent from an alignment is padded there with the string form of an
    UnknownSeq of that alignment's length.
    """
    alignments = []
    for path in infiles:
        # Close every input as soon as it has been parsed.
        with open(path, "r") as handle:
            alignments.append(AlignIO.read(handle, "fasta"))

    # Get the full set of labels (i.e. sequence ids) for all the alignments
    all_labels = {seq.id for aln in alignments for seq in aln}

    # Make a dictionary to store info as we go along
    # (defaultdict is convenient -- asking for a missing key gives back an empty list)
    tmp = defaultdict(list)

    for aln in alignments:
        length = aln.get_alignment_length()

        # check if any labels are missing in the current alignment
        missing = all_labels - {rec.id for rec in aln}

        # if any are missing, pad them with unknown data of the right length;
        # the padding text is the same for every missing label, so build it
        # once, and visit the labels sorted so the row order is reproducible
        if missing:
            padding = str(UnknownSeq(length))
            for label in sorted(missing):
                tmp[label].append(padding)

        # else stuff the string representation into the tmp dict
        for rec in aln:
            tmp[rec.id].append(str(rec.seq))

    # Stitch all the substrings together using join (most efficient way),
    # and build the Biopython data structures Seq, SeqRecord and MultipleSeqAlignment
    msa = MultipleSeqAlignment(
        SeqRecord(Seq(''.join(v)), id=k) for (k, v) in tmp.items())

    AlignIO.write(msa, outfile, "fasta")


#tmpdir = tempfile.TemporaryDirectory()
#print(tmpdir.name)
#timeit.timeit('concatenate(infiles,outfile)',
#    setup='infiles=simAlignments(10,10,tmpdir.name),outfile=tempfile.NamedTemporaryFile(dir=tmpdir).name')

# python -m timeit -s 'import tempfile; tmpdir=tempfile.TemporaryDirectory(); from concatenate import simAlignments; infiles=simAlignments(10,10,tmpdir.name); outf=tempfile.NamedTemporaryFile().name' "from concatenate import concatenate; concatenate(infiles,outf)"
#100 loops, best of 3: 2.94 msec per loop
Esempio n. 8
0
 def _get_rec(self, base, info_dict):
     """Retrieve a record to add features to."""
     max_loc = info_dict.get("location", (0, 1))[1]
     try:
         cur_rec = base[info_dict["rec_id"]]
         # update generated unknown sequences with the expected maximum length
         if isinstance(cur_rec.seq, UnknownSeq):
             cur_rec.seq._length = max([max_loc, cur_rec.seq._length])
         return cur_rec, base
     except KeyError:
         if self._create_missing:
             new_rec = SeqRecord(UnknownSeq(max_loc), info_dict["rec_id"])
             base[info_dict["rec_id"]] = new_rec
             return new_rec, base
         else:
             raise
Esempio n. 9
0
 def test_join_UnknownSeq_mixed_alpha(self):
     """Check UnknownSeq.join accepts items whose alphabets differ from the spacer's."""
     dna_gap = UnknownSeq(5, character="-", alphabet=generic_dna)

     # RNA gap-only sequences joined by a DNA spacer.
     rna_gaps = [
         UnknownSeq(5, character="-", alphabet=generic_rna),
         UnknownSeq(5, character="-", alphabet=generic_rna),
     ]
     self.assertEqual("-" * 15, dna_gap.join(rna_gaps))

     # Protein sequences joined by a DNA spacer.
     protein_items = [
         Seq("NNNNN", generic_protein),
         UnknownSeq(5, character="-", alphabet=generic_protein),
     ]
     self.assertEqual("N" * 5 + "-" * 10, dna_gap.join(protein_items))
Esempio n. 10
0
def concatenate(alignments):
    """Concatenate alignments column-wise, padding sequences that are absent.

    Arguments:
     - alignments - non-empty list of alignment objects; each is iterated
       for its records and the first one supplies the alphabet.

    Sequences are matched between alignments by record id.  An id that is
    absent from an alignment is padded there with the string form of an
    UnknownSeq of that alignment's length.

    Returns a single MultipleSeqAlignment.
    """
    # Get the full set of labels (i.e. sequence ids) for all the alignments
    all_labels = {seq.id for aln in alignments for seq in aln}
    logger.debug("extracted {} different labels in all alignments: {}".format(
        len(all_labels), all_labels))

    # Make a dictionary to store info as we go along
    # (defaultdict is convenient -- asking for a missing key gives back an empty list)
    concat_buf = defaultdict(list)

    # Assume all alignments have same alphabet
    alphabet = alignments[0]._alphabet
    logger.debug('detected alphabet: {}'.format(alphabet))

    for aln in alignments:
        length = aln.get_alignment_length()

        # check if any labels are missing in the current alignment
        these_labels = {rec.id for rec in aln}
        missing = all_labels - these_labels
        logger.debug(
            "alignment of length {} with {} sequences, {} missing ({})".format(
                length, len(these_labels), len(missing), missing))

        # if any are missing, pad them with unknown data of the right length;
        # the padding text is the same for every missing label, so build it
        # once, and visit the labels sorted so the row order is reproducible
        if missing:
            padding = str(UnknownSeq(length, alphabet=alphabet))
            for label in sorted(missing):
                concat_buf[label].append(padding)

        # else stuff the string representation into the concat_buf dict
        for rec in aln:
            concat_buf[rec.id].append(str(rec.seq))

    # Stitch all the substrings together using join (most efficient way),
    # and build the Biopython data structures Seq, SeqRecord and MultipleSeqAlignment
    msa = MultipleSeqAlignment(
        SeqRecord(Seq(''.join(seq_arr), alphabet=alphabet), id=label)
        for (label, seq_arr) in concat_buf.items())
    # Ask the alignment for its width instead of indexing row 0, which
    # fails when the result has no rows.
    logger.info(
        "concatenated MSA of {} taxa and total length {} created".format(
            len(msa), msa.get_alignment_length()))
    return msa
Esempio n. 11
0
    def test_join_UnknownSeq_with_file(self):
        """Checks if UnknownSeq join correctly concatenates sequence from a file with the spacer."""
        filename = 'Fasta/f003'
        seqlist = [record.seq for record in SeqIO.parse(filename, 'fasta')]
        seqlist_as_strings = [str(seq) for seq in seqlist]

        empty_spacer = UnknownSeq(0, character="-", alphabet=generic_dna)
        gap_spacer = UnknownSeq(5, character="-", alphabet=generic_dna)

        # Seq objects joined with the empty spacer, then with the
        # five-letter spacer; each must match the plain string join.
        for spacer in (empty_spacer, gap_spacer):
            self.assertEqual(str(spacer.join(seqlist)),
                             str(spacer).join(seqlist_as_strings))

        # Passing the parsed records themselves, rather than their
        # sequences, must be rejected.
        with self.assertRaises(TypeError):
            empty_spacer.join(SeqIO.parse(filename, 'fasta'))
Esempio n. 12
0
def concatenate(alignments):
    """
    Concatenates a list of Bio.Align.MultipleSeqAlignment objects.
    If any sequences are missing they are padded with unknown data
    (Bio.Seq.UnknownSeq).
    Returns a single Bio.Align.MultipleSeqAlignment.
    Limitations: any annotations in the sub-alignments are lost in
    the concatenated alignment.

    The list must not be empty: the first alignment supplies the alphabet
    used for the padding and for the concatenated sequences.
    """

    # Get the full set of labels (i.e. sequence ids) for all the alignments
    all_labels = {seq.id for aln in alignments for seq in aln}

    # Make a dictionary to store info as we go along
    # (defaultdict is convenient -- asking for a missing key gives back an empty list)
    tmp = defaultdict(list)

    # Assume all alignments have same alphabet
    alphabet = alignments[0]._alphabet

    for aln in alignments:
        length = aln.get_alignment_length()

        # check if any labels are missing in the current alignment
        missing = all_labels - {rec.id for rec in aln}

        # if any are missing, pad them with unknown data of the right length;
        # the padding text is the same for every missing label, so build it
        # once, and visit the labels sorted so the row order is reproducible
        if missing:
            padding = str(UnknownSeq(length, alphabet=alphabet))
            for label in sorted(missing):
                tmp[label].append(padding)

        # else stuff the string representation into the tmp dict
        for rec in aln:
            tmp[rec.id].append(str(rec.seq))

    # Stitch all the substrings together using join (most efficient way),
    # and build the Biopython data structures Seq, SeqRecord and MultipleSeqAlignment
    msa = MultipleSeqAlignment(
        SeqRecord(Seq(''.join(v), alphabet=alphabet),
                  id=k, name=k, description=k)
        for (k, v) in tmp.items())
    return msa
Esempio n. 13
0
def cfg_out_iterator(handle, alphabet=single_letter_alphabet):
    """Iterate over Centrifuge output, yielding one SeqRecord per parsed row.

    Arguments:
     - handle - input file
     - alphabet - optional alphabet

    Each record carries an empty UnknownSeq; the parsed columns are stored
    in ``dbxrefs`` and ``annotations``.  A score that cannot be converted to
    float is reported with ``print`` and the ValueError is re-raised.
    """

    def adapt(raw_score, tax_id):
        """Turn a Centrifuge score into the "single hit equivalent length"."""
        try:
            return float(raw_score)**0.5 + 15
        except ValueError:
            print(f'Error parsing score ({raw_score}) for taxid {tax_id}'
                  f' in {handle}...')
            raise

    for row in simple_out_parser(handle):
        (read_id, seq_id, tax_id, score, second_score, hit_length,
         query_length, num_matches) = row

        words = read_id.split(None, 1)
        if words:
            first_word = words[0]
        else:
            assert not read_id, repr(read_id)
            # Should we use SeqRecord default for no ID?
            first_word = ""

        adapted_score = adapt(score, tax_id)
        adapted_2nd_score = adapt(second_score, tax_id)

        annotations = {
            'taxID': tax_id,
            'score': adapted_score,
            '2ndBestScore': adapted_2nd_score,
            'hitLength': hit_length,
            'queryLength': query_length,
            'numMatches': int(num_matches),
        }
        yield SeqRecord(UnknownSeq(0, alphabet),
                        id=first_word,
                        name=first_word,
                        description=read_id,
                        dbxrefs=[seq_id],
                        annotations=annotations)
Esempio n. 14
0
def join_seqs(s1, s2, length=None):
    """Join read *s1* with the reverse complement of read *s2*.

    When *length* is given (and non-zero) the two reads are separated by a
    '-' pad with phred quality 0, sized so the result is *length* long.  If
    building the pad raises ValueError the program exits with a message.

    The id of the result is the id of *s1* without its last two characters
    and the description is emptied.
    """
    if not length:
        joined = s1 + s2.reverse_complement()
    else:
        gap = length - len(s1) - len(s2)
        try:
            filler = SeqRecord(
                UnknownSeq(gap, character='-'),
                letter_annotations={'phred_quality': [0] * gap},
            )
        except ValueError:
            sys.exit(
                'Total length of the two reads exceeds given length (%s)' %
                (length))
        joined = s1 + filler + s2.reverse_complement()

    # assumes the read ID ends in a 2-char suffix for direction (e.g. _1)
    joined.id = s1.id[:-2]
    joined.description = ''  # not required for fastq
    return joined
Esempio n. 15
0
def prepare_cluster_qual_files(work_dir, qual_file, cluster_seq_dir):
    """Write one ``.qual`` file per cluster FASTA file and return their directory.

    Arguments:
     - work_dir - directory in which ``cluster_qual`` is created
     - qual_file - QUAL file holding the quality scores of all sequences
     - cluster_seq_dir - directory whose regular files are read as FASTA

    For every sequence in a cluster file the matching quality record is
    looked up by sequence name and written, with an empty id, to
    ``<work_dir>/cluster_qual/<cluster file name up to its first dot>.qual``.

    Raises OSError if the output directory cannot be created and KeyError
    if a cluster sequence has no entry in *qual_file*.
    """
    cluster_qual_dir = work_dir + "/cluster_qual"
    os.mkdir(cluster_qual_dir)

    # get a list of all quality scores; the dict is fully built before the
    # handle is closed
    with open(qual_file, "r") as fd_qual:
        quals = SeqIO.to_dict(SeqIO.parse(fd_qual, "qual"))

    # get quality scores for the clusters
    for cluster_seq_file in os.listdir(cluster_seq_dir):
        cluster_seq_path = cluster_seq_dir + "/" + cluster_seq_file
        # check if file, can do some more checking here e.g. is fasta file
        if not os.path.isfile(cluster_seq_path):
            continue

        cluster_quals = []
        with open(cluster_seq_path, "r") as fd_cluster_seq:
            for seq in SeqIO.parse(fd_cluster_seq, "fasta"):
                qual = quals[seq.name]
                phred = qual.letter_annotations["phred_quality"]
                cluster_qual = SeqRecord(seq=UnknownSeq(len(phred)),
                                         id="",
                                         description=qual.description)
                cluster_qual.letter_annotations["phred_quality"] = phred
                cluster_quals.append(cluster_qual)

        cluster_qual_file = (cluster_qual_dir + "/" +
                             cluster_seq_file.split(".")[0] + ".qual")
        with open(cluster_qual_file, "w") as fd_cluster_qual:
            SeqIO.write(cluster_quals, fd_cluster_qual, "qual")

        # need to replace the space after the > in header; done in Python
        # so the path never goes through a shell command line
        with open(cluster_qual_file, "r") as fd_cluster_qual:
            text = fd_cluster_qual.read()
        with open(cluster_qual_file, "w") as fd_cluster_qual:
            fd_cluster_qual.write(text.replace("> ", ">"))

    return cluster_qual_dir
Esempio n. 16
0
# Demo of MutableSeq editing, UnknownSeq placeholders and the string-level
# helpers of Bio.Seq (Python 2 print statements).
# NOTE(review): this excerpt starts mid-script; mutSeq is presumably defined
# earlier.  The lower-case name mutseq used on the next lines is a different
# identifier -- confirm it is defined too, or whether mutSeq was meant.
print mutSeq, type(mutSeq)
mutseq[1]='T'    # impossible on a plain Seq
print mutseq
seq1 = mutseq.toseq()    # convert to Seq
mutSeq.remove('A')    # remove the first A
mutSeq[2:-5]='TTTT'
mutSeq.reverse()    # reverse() and reverse_complement() change the object itself
print mutSeq
# A MutableSeq can't be a dictionary key; Seq and str can

# UnknownSeq
# Subclass of Seq for when you know the length but not the characters; saves memory
from Bio.Seq import UnknownSeq
unk = UnknownSeq(25)
print unk, len(unk), type(unk)
unkDNA = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna)
print unkDNA    # N = any base
unkProt = UnknownSeq(10, alphabet=IUPAC.protein)
print unkProt    # X = any amino acid

print unkDNA.complement(), unkDNA.reverse_complement()
print unkDNA.transcribe(), unkDNA.translate()
# unkProt is rebound here to the translation of unkDNA
unkProt = unkDNA.translate()
print unkProt, len(unkProt)

# Directly on strings
from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate
noseq = 'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG'
print reverse_complement(noseq)    # these functions
print transcribe(noseq)            # accept strings as well as
print back_transcribe(noseq)       # Seq, MutableSeq and UnknownSeq objects
from Bio.Seq import UnknownSeq
from Bio.Alphabet import IUPAC

# Placeholder for 20 letters using the ambiguous DNA alphabet.
unk = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna)
# The results of these two calls are not stored or printed.
unk.complement()
unk.reverse_complement()
# Transcribe and translate the placeholder, printing each result.
unk_rna = unk.transcribe()
print(unk_rna)
unk_protein = unk.translate()
print(unk_protein)
Esempio n. 18
0
def merge(records, length=20, spacer='n'):
    """Merge multiple SeqRecords into one, using a defined spacer

    :param records: Iterable containing SeqRecords to be merged
    :param length: Length of the spacer in kbp
    :param spacer: Kind of spacer to use ('n' for UnknownSeq spacer, 'stop' for all-frame stop codon spacer)

    :return: A single SeqRecord that is the product of the merge. When only
             one record is given, that record itself is returned.
    :raises ValueError: if *spacer* is not 'n' or 'stop', or if no records
                        are given.

    The id, name, description and selected annotations of the first record
    are carried over to the merged record.
    """
    if spacer not in ('n', 'stop'):
        raise ValueError("Invalid spacer: %r, use either 'n' or 'stop'" %
                         spacer)

    # Materialise the input so any iterable (not only a list) is accepted.
    records = list(records)
    if not records:
        raise ValueError("No records given")

    new_rec = records[0]
    if len(records) == 1:
        # Nothing to join, so no spacer needs to be built.
        return new_rec

    spacer_length = length * 1000
    if spacer == 'stop':
        # NOTE(review): assumes 40 repeats of ALL_FRAME_STOP_MOTIF span
        # 1000 letters, matching the spacer feature below -- confirm.
        spacer_seq = Seq(ALL_FRAME_STOP_MOTIF * 40 * length,
                         Alphabet.generic_dna)
    else:
        spacer_seq = UnknownSeq(spacer_length,
                                alphabet=Alphabet.generic_dna,
                                character='N')

    # Remember what identifies the first record: these values are written
    # back onto the merged record after the joins below.
    rec_id = new_rec.id
    rec_name = new_rec.name
    rec_desc = new_rec.description
    annotation_defaults = (
        ('date', ''),
        ('source', ''),
        ('organism', ''),
        ('taxonomy', []),
        ('data_file_division', 'UNK'),
        ('topology', 'linear'),
    )
    kept_annotations = {
        key: new_rec.annotations.get(key, default)
        for key, default in annotation_defaults
    }

    for i, rec in enumerate(records[1:]):
        spacer_id = 'spacer_{}'.format(i + 1)

        spacer_feature = SeqFeature(FeatureLocation(0, spacer_length, 0),
                                    type='misc_feature',
                                    id=spacer_id,
                                    qualifiers={'note': [spacer_id]})

        spacer_rec = SeqRecord(spacer_seq,
                               id=spacer_id,
                               name=spacer_id,
                               description=spacer_id,
                               features=[spacer_feature])

        new_rec = new_rec + spacer_rec + rec

    new_rec.id = rec_id
    new_rec.name = rec_name
    new_rec.description = rec_desc
    new_rec.annotations.update(kept_annotations)

    return new_rec
Esempio n. 19
0
def concatenate(alignments):
    """
    Concatenates a list of multiple sequence alignment objects.

    The alignments are concatenated based on their label, i.e. the
    sequences from the different alignments which have the same id/labels
    will become a single sequence. The order is preserved.

    If any sequences are missing in one or several alignments, these parts
    are padded with unknown data (:py:class:`Bio.Seq.UnknownSeq`).

    An item that ``identify_input`` classifies as ``FILENAME`` is first
    read from disk as a FASTA alignment; every other item is used as given.

    :param alignments: the list of alignments objects, i.e. list(:py:class:`Bio.Align.MultipleSeqAlignment`)
    :returns: a single :py:class:`Bio.Align.MultipleSeqAlignment`

    Example::

        >>> sequences = {'aln1': {'seq1': 'acgtca',
        ...                       'seq2': 'acgtt-',
        ...                       'seq3': 'ac-ta-'},
        ...              'aln2': {'seq2': 'ttg-cta',
        ...                       'seq3': 'tcgacta',
        ...                       'seq4': 'ttgacta'}}
        >>> alignments = [MultipleSeqAlignment([SeqRecord(Seq(sequence,
        ...                    alphabet=IUPAC.extended_dna), id=key)
        ...      for (key, sequence) in sequences[aln].items()])
        ...               for aln in ('aln1', 'aln2')]
        >>> con_alignment = concatenate(alignments)
        >>> con_alignment.sort()
        >>> print(con_alignment)
        ExtendedIUPACDNA() alignment with 4 rows and 13 columns
        acgtcaNNNNNNN seq1
        acgtt-ttg-cta seq2
        ac-ta-tcgacta seq3
        NNNNNNttgacta seq4

    :note:

       Limitations: any annotations in the sub-alignments are lost in
       the concatenated alignment.

    """

    # First check to see whether we're inputting filenames of alignments or the Biopython alignments
    # Assume that it's a biopython alignment if it's not a filename
    loaded = []
    for item in alignments:
        if identify_input(item).name == 'FILENAME':
            loaded.append(AlignIO.read(item, "fasta"))
        else:
            loaded.append(item)
    alignments = loaded

    # Get the full set of labels (i.e. sequence ids) for all the alignments
    all_labels = {seq.id for aln in alignments for seq in aln}

    # Make a dictionary to store info as we go along
    # (defaultdict is convenient -- asking for a missing key gives back an empty list)
    tmp = defaultdict(list)

    # Assume all alignments have same alphabet
    alphabet = alignments[0]._alphabet

    for aln in alignments:
        length = aln.get_alignment_length()

        # check if any labels are missing in the current alignment
        missing = all_labels - {rec.id for rec in aln}

        # if any are missing, pad them with unknown data of the right length;
        # the padding text is the same for every missing label, so build it
        # once, and visit the labels sorted so the row order is reproducible
        if missing:
            padding = str(UnknownSeq(length, alphabet=alphabet))
            for label in sorted(missing):
                tmp[label].append(padding)

        # else stuff the string representation into the tmp dict
        for rec in aln:
            tmp[rec.id].append(str(rec.seq))

    # Stitch all the substrings together using join (most efficient way),
    # and build the Biopython data structures Seq, SeqRecord and MultipleSeqAlignment
    return MultipleSeqAlignment(
        SeqRecord(Seq(''.join(v), alphabet=alphabet), id=k)
        for (k, v) in tmp.items())
Esempio n. 20
0
# working with the codon tables used for translation
from Bio.Data import CodonTable
std_table = CodonTable.unambiguous_dna_by_name["Standard"]
bact_table = CodonTable.unambiguous_dna_by_name["Bacterial"]
bact_table.start_codons
bact_table.stop_codons

# to compare sequences (mind the alphabet)
str(bli) == str(blu) 

# mutable sequences are also possible, see the tutorial

# to make unknown sequences, with N for nucleotides and X for proteins
from Bio.Seq import UnknownSeq
unk_dna = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna)


# SeqRecord
from Bio.SeqRecord import SeqRecord
help(SeqRecord) # to see the available fields
SeqRecord(bli)
from Bio import SeqIO
machin = SeqIO.read("hao.fasta", "fasta") # for a file holding a single sequence
print machin
print machin.format("fasta")
# the same kind of thing exists for .gnk files (GenBank format)

# print the id and the sequence of every record in a multi-sequence file
for seq_record in SeqIO.parse("nosZ.fasta", "fasta"):
    print seq_record.id
    print seq_record.seq
Esempio n. 21
0
# MutableSeq objects: the bare names below echo the value after each
# edit when the lines are entered in an interactive session.
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

mutable_seq
mutable_seq[5] = "C"
mutable_seq
mutable_seq.remove("T")
mutable_seq
mutable_seq.reverse()
mutable_seq

# UnknownSeq objects
from Bio.Seq import UnknownSeq
unk = UnknownSeq(20)
unk
print(unk)
len(unk)


# Same again, this time with an explicit alphabet
from Bio.Seq import UnknownSeq
from Bio.Alphabet import IUPAC
unk_dna = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna) 
unk_dna
print(unk_dna)

# Translating the unknown DNA
unk_protein = unk_dna.translate()
unk_protein

# Connecting with biological databases
Esempio n. 22
0
def parse_gff(handle):
    """Quick hack to parse Bacterial GFF files from Prokka etc.

    Does NOT support multi-line features (i.e. splicing and
    multiple exons). Will load EVERYTHING into memory!

    Iterator yielding SeqRecord objects, intended to fit into the
    Biopython SeqIO structure.

    Arguments:
     - handle - text-mode file-like object at the start of a GFF3 file;
       it is read with ``readline()`` and by iteration.

    One record is created per ``##sequence-region`` line and the records
    are yielded in that order once the whole input has been read.  Each
    starts with an UnknownSeq of the declared length, replaced by real
    sequence if the ``##FASTA`` block has an entry of the same name.

    Raises AssertionError for a bad header, an undeclared reference,
    out-of-range coordinates or a FASTA length mismatch;
    NotImplementedError for lines it does not understand; ValueError for
    a bad strand.  Exits via ``sys.exit`` on an attribute lacking ``=``.
    """
    line = handle.readline()
    assert line.startswith("##gff-version 3"), line
    references = OrderedDict()
    # "." (unstranded) should use zero and "?" (stranded but missing)
    # should use None, but both use +1 to match EMBL/GB.
    strand_values = {"+": +1, "-": -1, ".": +1, "?": +1}

    def attach_sequence(name, chunks):
        """Replace the placeholder sequence of record *name* with FASTA data."""
        seq = "".join(chunks)
        assert len(seq) == len(references[name]), \
            "FASTA entry for %s was %i long, expected %i" % (
                name, len(seq), len(references[name]))
        references[name].seq = Seq(seq)

    for line in handle:
        if line.startswith("##sequence-region "):
            _, name, start, end = line.split()
            assert start == "1"
            references[name] = SeqRecord(
                UnknownSeq(int(end)), id=name, name=name)
        elif line.strip() == "##FASTA":
            break
        elif line.startswith("#"):
            raise NotImplementedError(line)
        elif line.count("\t") == 8:
            (seqid, source, ftype, start, end, score, strand, phase,
             attributes) = line.split("\t")
            assert seqid in references, "Reference %r not declared with ##sequence-region line:\n%r" % (
                seqid, line)
            # Turn the one-based inclusive GFF coordinates into a
            # zero-based start and an exclusive end.
            start = int(start) - 1
            end = int(end)
            # The exclusive end may equal the reference length: that is a
            # feature reaching the last base.
            assert 0 <= start < end <= len(references[seqid])
            if ftype in FEATURE_TYPE_TO_IGNORE:
                continue
            if FEATURE_TYPE_WANTED and ftype not in FEATURE_TYPE_WANTED:
                continue
            if strand not in strand_values:
                raise ValueError("Bad strand %r in line:\n%r" % (strand, line))
            f = SeqFeature(FeatureLocation(start, end, strand_values[strand]),
                           type=ftype)
            for part in attributes.strip().split(";"):
                if not part:
                    assert ";;" in line, line
                    sys.stderr.write(
                        "Warning - missing key=value or double semi-colon in line:\n%r\n" % line)
                    continue
                if "=" not in part:
                    sys.exit("Bad key=value entry %r in line:\n%r" %
                             (part, line))
                key, value = part.split("=", 1)
                if key in MISSING_QUALIFIERS_TO_IGNORE:
                    continue
                if key == "eC_number":
                    key = "EC_number"
                # Only the escaped comma is decoded.
                value = value.replace("%2C", ",")
                f.qualifiers.setdefault(key, []).append(value)
            references[seqid].features.append(f)
        else:
            raise NotImplementedError(line)

    # Deal with any FASTA block
    name = None
    seqs = []
    for line in handle:
        if line.startswith(">"):
            # A new header finishes the previous entry, if any.
            if name and seqs:
                attach_sequence(name, seqs)
            name = line[1:].split(None, 1)[0]
            seqs = []
        elif name:
            seqs.append(line.strip())
        elif line.strip():
            raise NotImplementedError(line)
    if name and seqs:
        attach_sequence(name, seqs)

    # Return results
    for record in references.values():
        yield record
Esempio n. 23
0
class StringMethodTests(unittest.TestCase):
    """Compare Seq-like objects against the equivalent plain string methods.

    Each test runs a method on every object in ``_examples`` and checks the
    result against the same operation applied to ``str(example)``.
    """

    _examples = [
        # These are length 9, a multiple of 3 for translation tests:
        Seq("ACGTGGGGT", generic_protein),
        Seq("ACGTGGGGT", generic_nucleotide),
        Seq("ACGTGGGGT", generic_dna),
        Seq("ACGUGGGGU", generic_rna),
        Seq("GG", generic_protein),
        Seq("GG", generic_nucleotide),
        Seq("GG", generic_dna),
        Seq("GG", generic_rna),
        Seq("A", generic_protein),
        Seq("A", generic_nucleotide),
        Seq("A", generic_dna),
        Seq("A", generic_rna),
        UnknownSeq(1),
        UnknownSeq(1, character="n"),
        UnknownSeq(1, generic_rna),
        UnknownSeq(1, generic_rna, "n"),
        UnknownSeq(1, generic_rna, "N"),
        UnknownSeq(12, generic_rna, "N"),
        UnknownSeq(12, generic_dna, "N"),
        UnknownSeq(12, generic_nucleotide, "N"),
        UnknownSeq(12, generic_protein, "X"),
        UnknownSeq(12, character="X"),
        UnknownSeq(12),
    ]
    # Also exercise the tomutable() counterpart of every Seq instance above.
    for seq in _examples[:]:
        if isinstance(seq, Seq):
            _examples.append(seq.tomutable())
    # Offsets used as start/end arguments and slice bounds; they include
    # negative values and values far beyond the length of any example.
    _start_end_values = [0, 1, 2, 1000, -1, -2, -999]

    def _assert_same_result(self, example, method_name, args, seq_result,
                            str_result, pre_comp_function=None):
        """Fail unless the Seq-like result matches the plain string result.

        Both results are first passed through pre_comp_function (if given).
        The failure message uses %r so that it can be built for any result
        type (int, bool, str, list, ...) rather than only for integers.
        """
        if pre_comp_function:
            seq_result = pre_comp_function(seq_result)
            str_result = pre_comp_function(str_result)
        self.assertEqual(
            seq_result, str_result,
            "%r.%s(%s) = %r, not %r" % (
                example, method_name,
                ", ".join([repr(arg) for arg in args]),
                seq_result, str_result))

    def _test_method(self,
                     method_name,
                     pre_comp_function=None,
                     start_end=False):
        """Check this method matches the plain string's method.

        method_name - name of the method to call on both the Seq-like
                      object and its string form.
        pre_comp_function - optional callable applied to both results
                      before they are compared (e.g. str).
        start_end - if true, also call the method with every combination
                      of start and (start, end) from _start_end_values.
        """
        self.assertTrue(isinstance(method_name, str))
        for example1 in self._examples:
            if not hasattr(example1, method_name):
                # e.g. MutableSeq does not support find
                continue
            str1 = str(example1)
            seq_method = getattr(example1, method_name)
            str_method = getattr(str1, method_name)

            for example2 in self._examples:
                if not hasattr(example2, method_name):
                    # e.g. MutableSeq does not support find
                    continue
                str2 = str(example2)

                # Plain string argument
                self._assert_same_result(
                    example1, method_name, (str2,),
                    seq_method(str2), str_method(str2), pre_comp_function)

                # Seq-like argument. Only the call itself is guarded, so a
                # TypeError raised anywhere else can never hide a mismatch.
                try:
                    seq_result = seq_method(example2)
                except TypeError:
                    # TODO - Check the alphabets do clash!
                    pass
                else:
                    self._assert_same_result(
                        example1, method_name, (example2,),
                        seq_result, str_method(str2), pre_comp_function)

                if not start_end:
                    continue
                for start in self._start_end_values:
                    self._assert_same_result(
                        example1, method_name, (str2, start),
                        seq_method(str2, start), str_method(str2, start),
                        pre_comp_function)
                    for end in self._start_end_values:
                        self._assert_same_result(
                            example1, method_name, (str2, start, end),
                            seq_method(str2, start, end),
                            str_method(str2, start, end),
                            pre_comp_function)

    def _check_tuple_affix(self, method_name):
        """Check startswith/endswith given a tuple of sub sequences.

        method_name must be "startswith" or "endswith"; the tuple is passed
        both as Seq-like slices and as plain strings.
        """
        for example1 in self._examples:
            if not hasattr(example1, method_name):
                # e.g. MutableSeq does not support this
                continue
            subs = tuple([example1[start:start + 2]
                          for start in range(0, len(example1) - 2, 3)])
            subs_str = tuple([str(s) for s in subs])
            seq_method = getattr(example1, method_name)
            str_method = getattr(str(example1), method_name)

            self.assertEqual(str_method(subs_str), seq_method(subs))
            self.assertEqual(str_method(subs_str),
                             seq_method(subs_str))  # strings!
            self.assertEqual(str_method(subs_str, 3), seq_method(subs, 3))
            self.assertEqual(str_method(subs_str, 2, 6),
                             seq_method(subs, 2, 6))

    def _unambiguous_complement_table(self, example, text):
        """Return a translation table for complementing text, or None.

        The choice is based on the letters present in text and on the
        alphabet of example; only the unambiguous cases are covered.
        """
        if "U" in text or "u" in text or example.alphabet == generic_rna:
            return maketrans("ACGUacgu", "UGCAugca")
        if ("T" in text or "t" in text
                or example.alphabet == generic_dna
                or example.alphabet == generic_nucleotide):
            return maketrans("ACGTacgt", "TGCAtgca")
        if "A" not in text and "a" not in text:
            return maketrans("CGcg", "GCgc")
        # TODO - look at alphabet?
        return None

    def test_str_count(self):
        """Check matches the python string count method."""
        self._test_method("count", start_end=True)

    def test_str_find(self):
        """Check matches the python string find method."""
        self._test_method("find", start_end=True)

    def test_str_rfind(self):
        """Check matches the python string rfind method."""
        self._test_method("rfind", start_end=True)

    def test_str_startswith(self):
        """Check matches the python string startswith method."""
        self._test_method("startswith", start_end=True)
        # Now check with a tuple of sub sequences
        self._check_tuple_affix("startswith")

    def test_str_endswith(self):
        """Check matches the python string endswith method."""
        self._test_method("endswith", start_end=True)
        # Now check with a tuple of sub sequences
        self._check_tuple_affix("endswith")

    def test_str_strip(self):
        """Check matches the python string strip method."""
        self._test_method("strip", pre_comp_function=str)

    def test_str_rstrip(self):
        """Check matches the python string rstrip method."""
        self._test_method("rstrip", pre_comp_function=str)

    def test_str_split(self):
        """Check matches the python string split method."""
        # Calling (r)split should return a list of Seq-like objects, we'll
        # just apply str() to each of them so it matches the string method.
        # A list is built explicitly so the comparison does not depend on
        # map() returning a list.
        self._test_method(
            "split", pre_comp_function=lambda x: [str(y) for y in x])

    def test_str_rsplit(self):
        """Check matches the python string rsplit method."""
        # Calling (r)split should return a list of Seq-like objects, we'll
        # just apply str() to each of them so it matches the string method.
        self._test_method(
            "rsplit", pre_comp_function=lambda x: [str(y) for y in x])

    def test_str_lsplit(self):
        """Check matches the python string lstrip method.

        Plain strings have no lsplit method; the historical test name is
        kept so existing references to this test keep working.
        """
        self._test_method("lstrip", pre_comp_function=str)

    def test_str_length(self):
        """Check matches the python string __len__ method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(len(example1), len(str1))

    def test_str_upper(self):
        """Check matches the python string upper method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            str1 = str(example1)
            self.assertEqual(str(example1.upper()), str1.upper())

    def test_str_lower(self):
        """Check matches the python string lower method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            str1 = str(example1)
            self.assertEqual(str(example1.lower()), str1.lower())

    def test_str_getitem(self):
        """Check slicing and indexing works like a string."""
        for example1 in self._examples:
            str1 = str(example1)
            for i in self._start_end_values:
                if abs(i) < len(example1):
                    self.assertEqual(str(example1[i]), str1[i])
                self.assertEqual(str(example1[:i]), str1[:i])
                self.assertEqual(str(example1[i:]), str1[i:])
                for j in self._start_end_values:
                    self.assertEqual(str(example1[i:j]), str1[i:j])
                    for step in range(-3, 4):
                        if step == 0:
                            # A zero step must be rejected, as for strings
                            self.assertRaises(ValueError,
                                              example1.__getitem__,
                                              slice(i, j, step))
                        else:
                            self.assertEqual(str(example1[i:j:step]),
                                             str1[i:j:step])

    def test_tostring(self):
        """Check str(obj) and obj.tostring() match."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(example1.tostring(), str1)

    def test_tomutable(self):
        """Check obj.tomutable() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            mut = example1.tomutable()
            self.assertTrue(isinstance(mut, MutableSeq))
            self.assertEqual(str(mut), str(example1))
            self.assertEqual(mut.alphabet, example1.alphabet)

    def test_toseq(self):
        """Check obj.toseq() method."""
        for example1 in self._examples:
            try:
                seq = example1.toseq()
            except AttributeError:
                # Only objects which are already Seq may lack toseq()
                self.assertTrue(isinstance(example1, Seq))
                continue
            self.assertTrue(isinstance(seq, Seq))
            self.assertEqual(str(seq), str(example1))
            self.assertEqual(seq.alphabet, example1.alphabet)

    def test_the_complement(self):
        """Check obj.complement() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.complement()
            except ValueError as e:
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            str1 = str(example1)
            # This only does the unambiguous cases
            mapping = self._unambiguous_complement_table(example1, str1)
            if mapping is None:
                raise ValueError(example1)
            self.assertEqual(str1.translate(mapping), str(comp))
            self.assertEqual(comp.alphabet, example1.alphabet)

    def test_the_reverse_complement(self):
        """Check obj.reverse_complement() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.reverse_complement()
            except ValueError as e:
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            str1 = str(example1)
            # This only does the unambiguous cases
            mapping = self._unambiguous_complement_table(example1, str1)
            if mapping is None:
                continue
            self.assertEqual(str1.translate(mapping)[::-1], str(comp))
            self.assertEqual(comp.alphabet, example1.alphabet)

    def test_the_transcription(self):
        """Check obj.transcribe() method."""
        expected_errors = ("Proteins cannot be transcribed!",
                           "RNA cannot be transcribed!")
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.transcribe()
            except ValueError as e:
                if str(e) in expected_errors:
                    continue
                raise
            str1 = str(example1)
            if len(str1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                # NOTE(review): this length filter looks copied from the
                # translation test - confirm whether transcription needs it.
                continue
            self.assertEqual(
                str1.replace("T", "U").replace("t", "u"), str(tran))
            self.assertEqual(tran.alphabet,
                             generic_rna)  # based on limited examples

    def test_the_back_transcription(self):
        """Check obj.back_transcribe() method."""
        expected_errors = ("Proteins cannot be back transcribed!",
                           "DNA cannot be back transcribed!")
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.back_transcribe()
            except ValueError as e:
                if str(e) in expected_errors:
                    continue
                raise
            str1 = str(example1)
            self.assertEqual(
                str1.replace("U", "T").replace("u", "t"), str(tran))
            self.assertEqual(tran.alphabet,
                             generic_dna)  # based on limited examples

    def test_the_translate(self):
        """Check obj.translate() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            if len(example1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            try:
                tran = example1.translate()
            except ValueError as e:
                if str(e) == "Proteins cannot be translated!":
                    continue
                raise
            # This is based on the limited example not having stop codons:
            self.assertTrue(
                tran.alphabet in [extended_protein, protein, generic_protein],
                "Unexpected alphabet %r" % (tran.alphabet,))
            # TODO - check the actual translation, and all the optional args

    def test_the_translation_of_stops(self):
        """Check obj.translate() method with stop codons."""
        misc_stops = "TAATAGTGAAGAAGG"
        for nuc in [
                Seq(misc_stops),
                Seq(misc_stops, generic_nucleotide),
                Seq(misc_stops, generic_dna),
                Seq(misc_stops, unambiguous_dna)
        ]:
            self.assertEqual("***RR", str(nuc.translate()))
            self.assertEqual("***RR", str(nuc.translate(1)))
            self.assertEqual("***RR", str(nuc.translate("SGC0")))
            self.assertEqual("**W**", str(nuc.translate(table=2)))
            self.assertEqual("**WRR",
                             str(nuc.translate(table='Yeast Mitochondrial')))
            self.assertEqual("**WSS", str(nuc.translate(table=5)))
            self.assertEqual("**WSS", str(nuc.translate(table=9)))
            self.assertEqual("**CRR",
                             str(nuc.translate(table='Euplotid Nuclear')))
            self.assertEqual("***RR", str(nuc.translate(table=11)))
            self.assertEqual("***RR", str(nuc.translate(table='11')))
            self.assertEqual("***RR", str(nuc.translate(table='Bacterial')))
            self.assertEqual("", str(nuc.translate(to_stop=True)))
            self.assertEqual("O*ORR", str(nuc.translate(table=special_table)))
            self.assertEqual(
                "*QWRR", str(nuc.translate(table=Chilodonella_uncinata_table)))
            # These test the Bio.Seq.translate() function - move these?:
            self.assertEqual(
                "*QWRR", translate(str(nuc),
                                   table=Chilodonella_uncinata_table))
            self.assertEqual("O*ORR", translate(str(nuc), table=special_table))
            self.assertEqual("", translate(str(nuc), to_stop=True))
            self.assertEqual("***RR", translate(str(nuc), table='Bacterial'))
            self.assertEqual("***RR", translate(str(nuc), table='11'))
            self.assertEqual("***RR", translate(str(nuc), table=11))
            self.assertEqual("**W**", translate(str(nuc), table=2))
        # Single codons, covering upper, mixed and lower case
        self.assertEqual(str(Seq("TAT").translate()), "Y")
        self.assertEqual(str(Seq("TAR").translate()), "*")
        self.assertEqual(str(Seq("TAN").translate()), "X")
        self.assertEqual(str(Seq("NNN").translate()), "X")
        self.assertEqual(str(Seq("TAt").translate()), "Y")
        self.assertEqual(str(Seq("TaR").translate()), "*")
        self.assertEqual(str(Seq("TaN").translate()), "X")
        self.assertEqual(str(Seq("nnN").translate()), "X")
        self.assertEqual(str(Seq("tat").translate()), "Y")
        self.assertEqual(str(Seq("tar").translate()), "*")
        self.assertEqual(str(Seq("tan").translate()), "X")
        self.assertEqual(str(Seq("nnn").translate()), "X")

    def test_the_translation_of_invalid_codons(self):
        """Check obj.translate() method with invalid codons."""
        for codon in ["TA?", "N-N", "AC_", "Ac_"]:
            for nuc in [
                    Seq(codon),
                    Seq(codon, generic_nucleotide),
                    Seq(codon, generic_dna),
                    Seq(codon, unambiguous_dna)
            ]:
                # Translating any of these codons should fail
                self.assertRaises(TranslationError, nuc.translate)

    def test_the_translation_of_ambig_codons(self):
        """Check obj.translate() method with ambiguous codons."""
        # Ambiguous amino acid codes and the set of residues each must
        # correspond to when it is the translation of an ambiguous codon.
        # TODO - Use the Bio.Data.IUPACData module for the
        # ambiguous protein mappings?
        ambig_protein = {"Z": set("EQ"), "B": set("DN"), "J": set("LI")}
        for letters, ambig_values in [
            (ambiguous_dna.letters, ambiguous_dna_values),
            (ambiguous_rna.letters, ambiguous_rna_values)
        ]:
            ambig = set(letters)
            for c1 in ambig:
                for c2 in ambig:
                    for c3 in ambig:
                        codon = c1 + c2 + c3
                        # Translate every unambiguous codon this one covers
                        values = set([
                            str(Seq(a + b + c).translate())
                            for a in ambig_values[c1]
                            for b in ambig_values[c2]
                            for c in ambig_values[c3]
                        ])
                        t = str(Seq(codon).translate())
                        if t == "X":
                            self.assertTrue(
                                len(values) > 1,
                                "translate('%s') = '%s' not '%s'" %
                                (codon, t, ",".join(values)))
                        else:
                            # Any other letter (including "*") must be the
                            # only possible translation, unless it is one
                            # of the ambiguous protein codes above.
                            self.assertEqual(values,
                                             ambig_protein.get(t, set(t)))

    def test_init_typeerror(self):
        """Check Seq __init__ gives TypeError exceptions."""
        # Only expect it to take strings and unicode - not Seq objects!
        self.assertRaises(TypeError, Seq, 1066)
        self.assertRaises(TypeError, Seq, Seq("ACGT", generic_dna))
 def test_generated(self):
     """Write unusual SeqRecord objects in several formats and parse them back.

     The same list of records is written with SeqIO.write() to an in-memory
     handle for each format, re-read with SeqIO.parse(), and the two lists
     are handed to self.compare_records().
     """
     long_descr = "Long " * 500
     trimmed_descr = "(could have been trimmed lots)"
     # TODO - Record with no identifier?
     records = [
         # Long sequence and description with PHRED qualities
         SeqRecord(
             Seq("ACGT" * 500),
             id="Test",
             description=long_descr,
             letter_annotations={"phred_quality": [40, 30, 20, 10] * 500},
         ),
         # Sequence held as a MutableSeq
         SeqRecord(
             MutableSeq("NGGC" * 1000),
             id="Mut",
             description="very " * 1000 + "long",
             letter_annotations={"phred_quality": [0, 5, 5, 10] * 1000},
         ),
         # Sequence held as an UnknownSeq
         SeqRecord(
             UnknownSeq(2000, character="N"),
             id="Unk",
             description="l" + ("o" * 1000) + "ng",
             letter_annotations={"phred_quality": [0, 1] * 1000},
         ),
         # Empty description and name
         SeqRecord(
             Seq("ACGT" * 500),
             id="no_descr",
             description="",
             name="",
             letter_annotations={"phred_quality": [40, 50, 60, 62] * 500},
         ),
         # Zero length sequence with (empty) PHRED qualities
         SeqRecord(
             Seq(""),
             id="empty_p",
             description=trimmed_descr,
             letter_annotations={"phred_quality": []},
         ),
         # Zero length sequence with (empty) Solexa qualities
         SeqRecord(
             Seq(""),
             id="empty_s",
             description=trimmed_descr,
             letter_annotations={"solexa_quality": []},
         ),
         # Solexa qualities including zero and a negative score
         SeqRecord(
             Seq("ACNN" * 500),
             id="Test_Sol",
             description=long_descr,
             letter_annotations={"solexa_quality": [40, 30, 0, -5] * 500},
         ),
         # Solexa qualities with very large values
         SeqRecord(
             Seq("ACGT"),
             id="HighQual",
             description=
             "With very large qualities that even Sanger FASTQ can't hold!",
             letter_annotations={"solexa_quality": [0, 10, 100, 1000]},
         ),
     ]
     formats = ("fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual")
     for out_format in formats:
         buffer = StringIO()
         with warnings.catch_warnings():
             # TODO - Have a Biopython defined "DataLossWarning?"
             warnings.simplefilter("ignore", BiopythonWarning)
             SeqIO.write(records, buffer, out_format)
         # Rewind so the parser starts from the first record written
         buffer.seek(0)
         parsed = list(SeqIO.parse(buffer, out_format))
         self.compare_records(records, parsed, out_format)
Esempio n. 25
0
    def test_count_overlap_start_end_GG(self):
        """Check count_overlap for GG over a range of start and end values."""
        both_types = (Seq, MutableSeq)

        # Seq() and MutableSeq() with variable start and end arguments;
        # each row is (start, end, expected count).
        windows = [
            (1, 7, 3),
            (3, None, 3),
            (3, 6, 2),
            (4, 6, 1),
            (4, -1, 2),
            (-5, None, 2),
            (-5, 7, 2),
            (7, -5, 0),
            (-100, None, 3),
            (None, 100, 3),
            (-100, 1000, 3),
        ]
        template = "GTAGGGGAG"
        for begin, stop, expected in windows:
            for seq_type in both_types:
                found = seq_type(template).count_overlap("GG", begin, stop)
                self.assertEqual(found, expected)

        # Seq() and MutableSeq() with a more heterogeneous sequence;
        # each row is (positional start/end arguments, expected count).
        mixed = "GGGTGGTAGGG"
        mixed_cases = [
            ((), 5),
            ((2, 8), 1),
            ((-11, 6), 3),
            ((7, 2), 0),
        ]
        for bounds, expected in mixed_cases:
            for seq_type in both_types:
                found = seq_type(mixed).count_overlap("GG", *bounds)
                self.assertEqual(found, expected)
        self.assertEqual(Seq(mixed).count_overlap("GG", -2, -10), 0)

        # UnknownSeq() with variable start and end arguments;
        # each row is (character, start, end, expected count).
        unknown_cases = [
            ("N", 1, 7, 0),
            ("N", 1, 7, 0),
            ("N", -4, None, 0),
            ("N", -4, None, 0),
            ("X", 1, 7, 0),
        ]
        for letter, begin, stop, expected in unknown_cases:
            unknown = UnknownSeq(12, character=letter)
            self.assertEqual(unknown.count_overlap("GG", begin, stop), expected)
        self.assertEqual(UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

        # UnknownSeq() with some more cases including unusual edge cases;
        # each row is (substring, start, end, expected count).
        edge_cases = [
            ("G", 100, 105, 0),
            ("G", -1, 4, 0),
            ("G", 4, -1, 0),
            ("G", -8, -2, 0),
            ("G", -2, -8, 0),
            ("G", 8, 2, 0),
            ("G", 2, 8, 0),
            ("GG", 8, 2, 0),
            ("GG", 2, 8, 0),
            ("GG", -5, -1, 0),
            ("GG", 1, 5, 0),
            ("GGG", None, None, 0),
            ("GGGGGGGGG", None, None, 0),
            ("GGG", 1, 2, 0),
        ]
        for pattern, begin, stop, expected in edge_cases:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(pattern, begin, stop), expected)
        self.assertEqual(UnknownSeq(7, character="N").count_overlap("GG", 1), 0)
Esempio n. 26
0
class StringMethodTests(unittest.TestCase):
    # Shared fixtures for the tests in this class: Seq and UnknownSeq objects
    # of several lengths, alphabets and placeholder characters. The order
    # matters, the count_overlap tests list expected values positionally.
    _examples = [
        # These are length 9, a multiple of 3 for translation tests:
        Seq("ACGTGGGGT", generic_protein),
        Seq("ACGTGGGGT", generic_nucleotide),
        Seq("ACGTGGGGT", generic_dna),
        Seq("ACGUGGGGU", generic_rna),
        Seq("GG", generic_protein),
        Seq("GG", generic_nucleotide),
        Seq("GG", generic_dna),
        Seq("GG", generic_rna),
        Seq("A", generic_protein),
        Seq("A", generic_nucleotide),
        Seq("A", generic_dna),
        Seq("A", generic_rna),
        UnknownSeq(1),
        UnknownSeq(1, character="n"),
        UnknownSeq(1, generic_rna),
        UnknownSeq(1, generic_rna, "n"),
        UnknownSeq(1, generic_rna, "N"),
        UnknownSeq(12, generic_rna, "N"),
        UnknownSeq(12, generic_dna, "N"),
        UnknownSeq(12, generic_nucleotide, "N"),
        UnknownSeq(12, generic_protein, "X"),
        UnknownSeq(12, character="X"),
        UnknownSeq(12),
    ]
    # Iterate over a copy (_examples[:]) because the loop appends to the list.
    # Each Seq instance gains a mutable twin at the end of the list; the
    # count_overlap tests double their expected lists to line up with this.
    for seq in _examples[:]:
        if isinstance(seq, Seq):
            _examples.append(seq.tomutable())
    # Start/end arguments tried by the tests: small offsets, large offsets
    # (1000, -999), negative offsets and None.
    _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None]

    def _test_method(self,
                     method_name,
                     pre_comp_function=None,
                     start_end=False):
        """Check this method matches the plain string's method.

        For every pair of entries in self._examples, the named method is
        called on the first entry and on its str() form, and the two results
        are compared.

        Arguments:
         - method_name - name of the method to call, as a string.
         - pre_comp_function - optional callable applied to both results
           before they are compared (e.g. str).
         - start_end - if true, also repeat the call with every start value,
           and every start/end pair, from self._start_end_values.

        Raises ValueError on the first mismatch.
        """
        self.assertTrue(isinstance(method_name, str))

        def verify(example, observed, expected, call_args):
            """Raise ValueError if the two results differ once normalised."""
            if pre_comp_function:
                observed = pre_comp_function(observed)
                expected = pre_comp_function(expected)
            if observed != expected:
                # Use %r throughout: the results may be str or list objects
                # and start/end may be None, all of which break %i.
                raise ValueError(
                    "%r.%s(%s) = %r, not %r" %
                    (example, method_name,
                     ", ".join(repr(arg) for arg in call_args),
                     observed, expected))

        for example1 in self._examples:
            if not hasattr(example1, method_name):
                # e.g. MutableSeq does not support find
                continue
            seq_method = getattr(example1, method_name)
            str_method = getattr(str(example1), method_name)

            for example2 in self._examples:
                if not hasattr(example2, method_name):
                    # e.g. MutableSeq does not support find
                    continue
                str2 = str(example2)

                verify(example1, seq_method(str2), str_method(str2), (str2, ))

                # Only the call taking a Seq-like argument is allowed to
                # raise TypeError; keeping the comparison outside the try
                # means a failure while reporting cannot be swallowed.
                try:
                    observed = seq_method(example2)
                except TypeError:
                    # TODO - Check the alphabets do clash!
                    pass
                else:
                    verify(example1, observed, str_method(str2),
                           (example2, ))

                if start_end:
                    for start in self._start_end_values:
                        verify(example1,
                               seq_method(str2, start),
                               str_method(str2, start),
                               (str2, start))

                        for end in self._start_end_values:
                            verify(example1,
                                   seq_method(str2, start, end),
                                   str_method(str2, start, end),
                                   (str2, start, end))

    def test_str_count(self):
        """Compare count() with str.count(), start and end included."""
        self._test_method(method_name="count", start_end=True)

    def test_str_count_overlap_GG(self):
        """Check our count_overlap method using GG.

        Every entry of self._examples is searched for "GG" and for a run of
        five G, given both as a plain string and as a Seq.
        """
        # Expected "GG" counts, listed in self._examples order
        expected = [3] * 4  # Seq() Tests, length 9 sequences
        expected += [1] * 4  # Seq() Tests, "GG"
        expected += [0] * 4  # Seq() Tests, "A"
        expected += [0] * 11  # UnknownSeq() Tests
        expected *= 2  # MutableSeq() Tests

        # zip() stops at the shorter input, so a length mismatch would skip
        # examples silently. Use a unittest assertion rather than a bare
        # assert so the guard survives running under python -O.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term GG as a string
            self.assertEqual(seq.count_overlap("GG"), exp)
            self.assertEqual(seq.count_overlap("G" * 5), 0)
            # Using search term GG as a Seq with generic alphabet
            self.assertEqual(seq.count_overlap(Seq("GG")), exp)
            self.assertEqual(seq.count_overlap(Seq("G" * 5)), 0)

    def test_count_overlap_start_end_GG(self):
        """Check count_overlap("GG") with assorted start and end arguments."""
        # Seq and MutableSeq: (start, end, expected count) on one sequence
        windows = [(1, 7, 3), (3, None, 3), (3, 6, 2), (4, 6, 1),
                   (4, -1, 2), (-5, None, 2), (-5, 7, 2), (7, -5, 0),
                   (-100, None, 3), (None, 100, 3), (-100, 1000, 3)]
        text = "GTAGGGGAG"
        for start, end, count in windows:
            for seq_class in (Seq, MutableSeq):
                self.assertEqual(
                    seq_class(text).count_overlap("GG", start, end), count)

        # Seq and MutableSeq again, on a more heterogeneous sequence:
        # (positional start/end arguments, expected count)
        text = "GGGTGGTAGGG"
        bounded = [((), 5), ((2, 8), 1), ((-11, 6), 3), ((7, 2), 0)]
        for bounds, count in bounded:
            for seq_class in (Seq, MutableSeq):
                self.assertEqual(
                    seq_class(text).count_overlap("GG", *bounds), count)
        self.assertEqual(Seq(text).count_overlap("GG", -2, -10), 0)

        # UnknownSeq: (alphabet, character, start, end, expected count)
        unknown_cases = [(generic_rna, "N", 1, 7, 0),
                         (generic_dna, "N", 1, 7, 0),
                         (generic_rna, "N", -4, None, 0),
                         (generic_dna, "N", -4, None, 0),
                         (generic_protein, "X", 1, 7, 0)]
        for alphabet, letter, start, end, count in unknown_cases:
            unknown = UnknownSeq(12, alphabet, letter)
            self.assertEqual(unknown.count_overlap("GG", start, end), count)
        unknown = UnknownSeq(12, character="X")
        self.assertEqual(unknown.count_overlap("GG", 1, 7), 0)

        # UnknownSeq, more cases including unusual edge cases:
        # (substring, start, end, expected count)
        edge_cases = [("G", 100, 105, 0), ("G", -1, 4, 0),
                      ("G", 4, -1, 0), ("G", -8, -2, 0),
                      ("G", -2, -8, 0), ("G", 8, 2, 0),
                      ("G", 2, 8, 0), ("GG", 8, 2, 0),
                      ("GG", 2, 8, 0), ("GG", -5, -1, 0),
                      ("GG", 1, 5, 0), ("GGG", None, None, 0),
                      ("GGGGGGGGG", None, None, 0), ("GGG", 1, 2, 0)]
        for sub, start, end, count in edge_cases:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(sub, start, end), count)
        unknown = UnknownSeq(7, character="N")
        self.assertEqual(unknown.count_overlap("GG", 1), 0)

    def test_str_count_overlap_NN(self):
        """Check our count_overlap method using NN.

        Every entry of self._examples is searched for "NN" and for a run of
        thirteen N, given both as a plain string and as a Seq.
        """
        # Expected "NN" counts, listed in self._examples order
        expected = [0] * 12  # Seq() Tests
        expected += [0] * 5  # UnknownSeq() Tests, length 1
        expected += [11] * 3  # UnknownSeq() Tests, length 12 using "N"
        expected += [0] * 3  # UnknownSeq() Tests, "X" or default character
        expected *= 2  # MutableSeq() Tests

        # zip() stops at the shorter input, so a length mismatch would skip
        # examples silently. Use a unittest assertion rather than a bare
        # assert so the guard survives running under python -O.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term NN as a string
            self.assertEqual(seq.count_overlap("NN"), exp)
            self.assertEqual(seq.count_overlap("N" * 13), 0)
            # Using search term NN as a Seq with generic alphabet
            self.assertEqual(seq.count_overlap(Seq("NN")), exp)
            self.assertEqual(seq.count_overlap(Seq("N" * 13)), 0)

    def test_count_overlap_start_end_NN(self):
        """Check count_overlap("NN") with assorted start and end arguments."""
        # Seq and MutableSeq: (start, end, expected count) on one sequence
        windows = [(1, 7, 0), (3, None, 0), (3, 6, 0), (4, 6, 0),
                   (4, -1, 0), (-5, None, 0), (-5, 7, 0), (7, -5, 0),
                   (-100, None, 0), (None, 100, 0), (-100, 1000, 0)]
        text = "GTAGGGGAG"
        for start, end, count in windows:
            for seq_class in (Seq, MutableSeq):
                self.assertEqual(
                    seq_class(text).count_overlap("NN", start, end), count)

        # Seq and MutableSeq again, on a more heterogeneous sequence:
        # (positional start/end arguments, expected count)
        text = "GGGTGGTAGGG"
        bounded = [((), 0), ((2, 8), 0), ((-11, 6), 0), ((7, 2), 0)]
        for bounds, count in bounded:
            for seq_class in (Seq, MutableSeq):
                self.assertEqual(
                    seq_class(text).count_overlap("NN", *bounds), count)
        self.assertEqual(Seq(text).count_overlap("NN", -10, -2), 0)

        # UnknownSeq: (alphabet, character, start, end, expected count)
        unknown_cases = [(generic_rna, "N", 1, 7, 5),
                         (generic_dna, "N", 1, 7, 5),
                         (generic_rna, "N", -4, None, 3),
                         (generic_dna, "N", -4, None, 3),
                         (generic_protein, "X", 1, 7, 0)]
        for alphabet, letter, start, end, count in unknown_cases:
            unknown = UnknownSeq(12, alphabet, letter)
            self.assertEqual(unknown.count_overlap("NN", start, end), count)
        unknown = UnknownSeq(12, character="X")
        self.assertEqual(unknown.count_overlap("NN", 1, 7), 0)

        # UnknownSeq, more cases including unusual edge cases:
        # (substring, start, end, expected count)
        edge_cases = [("N", 100, 105, 0), ("N", -1, 4, 0),
                      ("N", 4, -1, 2), ("N", -8, -2, 5),
                      ("N", -2, -8, 0), ("N", 8, 2, 0),
                      ("N", 2, 8, 5), ("NN", 8, 2, 0),
                      ("NN", 2, 8, 4), ("NN", -5, -1, 3),
                      ("NN", 1, 5, 3), ("NNN", None, None, 5),
                      ("NNNNNNNNN", None, None, 0), ("NNN", 1, 2, 0)]
        for sub, start, end, count in edge_cases:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(sub, start, end), count)
        unknown = UnknownSeq(7, character="N")
        self.assertEqual(unknown.count_overlap("NN", 1), 5)

    def test_str_find(self):
        """Compare find() with str.find(), start and end included."""
        self._test_method(method_name="find", start_end=True)

    def test_str_rfind(self):
        """Compare rfind() with str.rfind(), start and end included."""
        self._test_method(method_name="rfind", start_end=True)

    def test_str_startswith(self):
        """Compare startswith() with str.startswith(), tuples included."""
        self._test_method("startswith", start_end=True)
        self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC")))

        # Repeat with a tuple of two-letter slices cut from each example
        for example1 in self._examples:
            if not hasattr(example1, "startswith"):
                # e.g. MutableSeq does not support this
                continue
            offsets = range(0, len(example1) - 2, 3)
            subs = tuple(example1[pos:pos + 2] for pos in offsets)
            subs_str = tuple(str(piece) for piece in subs)

            # Tuple of sequence objects
            self.assertEqual(
                str(example1).startswith(subs_str), example1.startswith(subs))
            # Tuple of plain strings
            self.assertEqual(
                str(example1).startswith(subs_str),
                example1.startswith(subs_str))
            # With a start, then with a start and an end
            self.assertEqual(
                str(example1).startswith(subs_str, 3),
                example1.startswith(subs, 3))
            self.assertEqual(
                str(example1).startswith(subs_str, 2, 6),
                example1.startswith(subs, 2, 6))

    def test_str_endswith(self):
        """Check matches the python string endswith method.

        Covers single suffixes through _test_method, then tuples of suffixes
        given both as sequence objects and as plain strings.
        """
        self._test_method("endswith", start_end=True)
        self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE")))

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            if not hasattr(example1, "endswith"):
                # e.g. MutableSeq does not support this
                continue
            subs = tuple(
                example1[start:start + 2]
                for start in range(0, len(example1) - 2, 3))
            subs_str = tuple(str(s) for s in subs)

            self.assertEqual(
                str(example1).endswith(subs_str), example1.endswith(subs))
            # This assertion used to call startswith, which left endswith
            # with a tuple of plain strings untested.
            self.assertEqual(
                str(example1).endswith(subs_str),
                example1.endswith(subs_str))  # strings!
            self.assertEqual(
                str(example1).endswith(subs_str, 3),
                example1.endswith(subs, 3))
            self.assertEqual(
                str(example1).endswith(subs_str, 2, 6),
                example1.endswith(subs, 2, 6))

    def test_str_strip(self):
        """Compare strip() with str.strip(), results converted with str()."""
        self._test_method(method_name="strip", pre_comp_function=str)

    def test_str_rstrip(self):
        """Compare rstrip() with str.rstrip(), results converted with str()."""
        self._test_method(method_name="rstrip", pre_comp_function=str)

    def test_str_split(self):
        """Check matches the python string split method."""
        # Calling split should return a list of Seq-like objects, we'll
        # just apply str() to each of them so it matches the string method.
        # This previously passed "rstrip", so split itself was never run.
        self._test_method("split",
                          pre_comp_function=lambda x: [str(y) for y in x])

    def test_str_rsplit(self):
        """Check matches the python string rsplit method."""
        # Calling rsplit should return a list of Seq-like objects, we'll
        # just apply str() to each of them so it matches the string method.
        # This previously passed "rstrip", so rsplit itself was never run.
        self._test_method("rsplit",
                          pre_comp_function=lambda x: [str(y) for y in x])

    def test_str_lsplit(self):
        """Check matches the python string lstrip method."""
        # Python strings have no lsplit method, and this test previously
        # repeated the rstrip check. The test name is kept so existing test
        # selections still work, but it now covers lstrip, which no other
        # test in this class exercises.
        # NOTE(review): lstrip is the presumed intent - confirm.
        self._test_method("lstrip", pre_comp_function=str)

    def test_str_length(self):
        """Check len() of each example equals len() of its string form."""
        for example in self._examples:
            as_text = str(example)
            self.assertEqual(len(example), len(as_text))

    def test_str_upper(self):
        """Compare upper() with str.upper() (MutableSeq examples skipped)."""
        for example in self._examples:
            if not isinstance(example, MutableSeq):
                as_text = str(example)
                self.assertEqual(str(example.upper()), as_text.upper())

    def test_str_lower(self):
        """Compare lower() with str.lower() (MutableSeq examples skipped)."""
        for example in self._examples:
            if not isinstance(example, MutableSeq):
                as_text = str(example)
                self.assertEqual(str(example.lower()), as_text.lower())

    def test_str_hash(self):
        """Check hash() of each example equals hash() of its string form."""
        for example in self._examples:
            if isinstance(example, MutableSeq):
                # MutableSeq examples are not hashed here
                continue
            with warnings.catch_warnings():
                # Silence change in behaviour warning
                warnings.simplefilter('ignore', BiopythonWarning)
                text_hash = hash(str(example))
                object_hash = hash(example)
                details = ("Hash mismatch, %r for %r vs %r for %r" %
                           (text_hash, id(example), object_hash, example))
                self.assertEqual(text_hash, object_hash, details)

    def test_str_comparison(self):
        """Check the six comparison operators agree with string comparison."""
        # (symbol used in the failure message, comparison to apply)
        operators = [
            ("==", lambda left, right: left == right),
            ("!=", lambda left, right: left != right),
            ("<", lambda left, right: left < right),
            ("<=", lambda left, right: left <= right),
            (">", lambda left, right: left > right),
            (">=", lambda left, right: left >= right),
        ]
        for example1 in self._examples:
            for example2 in self._examples:
                with warnings.catch_warnings():
                    # Silence alphabet warning
                    warnings.simplefilter('ignore', BiopythonWarning)
                    for symbol, compare in operators:
                        self.assertEqual(
                            compare(str(example1), str(example2)),
                            compare(example1, example2),
                            "Checking %r %s %r" % (example1, symbol, example2))

    def test_str_getitem(self):
        """Check slicing and indexing works like a string.

        Every index, start/end pair and step from -3 to 3 is tried on each
        example. A step of zero must raise ValueError.
        """
        for example1 in self._examples:
            str1 = str(example1)
            for i in self._start_end_values:
                if i is not None and abs(i) < len(example1):
                    self.assertEqual(str(example1[i]), str1[i])
                self.assertEqual(str(example1[:i]), str1[:i])
                self.assertEqual(str(example1[i:]), str1[i:])
                for j in self._start_end_values:
                    self.assertEqual(str(example1[i:j]), str1[i:j])
                    for step in range(-3, 4):
                        if step != 0:
                            self.assertEqual(str(example1[i:j:step]),
                                             str1[i:j:step])
                            continue
                        # A zero step should fail! Report a missing
                        # ValueError with self.fail: the old self._assert
                        # call does not exist on TestCase and would have
                        # raised AttributeError instead of a test failure.
                        try:
                            example1[i:j:step]
                        except ValueError:
                            pass
                        else:
                            self.fail("%r[%r:%r:0] should raise ValueError" %
                                      (example1, i, j))

    def test_tomutable(self):
        """Check tomutable() gives a MutableSeq with equal text and alphabet."""
        for original in self._examples:
            if not isinstance(original, MutableSeq):
                mutable = original.tomutable()
                self.assertTrue(isinstance(mutable, MutableSeq))
                self.assertEqual(str(mutable), str(original))
                self.assertEqual(mutable.alphabet, original.alphabet)

    def test_toseq(self):
        """Check toseq() gives a Seq with equal text and alphabet."""
        for original in self._examples:
            try:
                converted = original.toseq()
            except AttributeError:
                # Objects lacking toseq() are expected to be Seq already
                self.assertTrue(isinstance(original, Seq))
            else:
                self.assertTrue(isinstance(converted, Seq))
                self.assertEqual(str(converted), str(original))
                self.assertEqual(converted.alphabet, original.alphabet)

    def test_the_complement(self):
        """Check complement() against a str.translate() letter mapping."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.complement()
            except ValueError as e:
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            str1 = str(example1)
            # This only does the unambiguous cases
            looks_rna = ("U" in str1 or "u" in str1 or
                         example1.alphabet == generic_rna)
            looks_dna = ("T" in str1 or "t" in str1 or
                         example1.alphabet == generic_dna or
                         example1.alphabet == generic_nucleotide)
            if looks_rna:
                mapping = maketrans("ACGUacgu", "UGCAugca")
            elif looks_dna:
                mapping = maketrans("ACGTacgt", "TGCAtgca")
            elif not ("A" in str1 or "a" in str1):
                # No A to pair, so only C and G need swapping
                mapping = maketrans("CGcg", "GCgc")
            else:
                # TODO - look at alphabet?
                raise ValueError(example1)
            self.assertEqual(str1.translate(mapping), str(comp))
            self.assertEqual(comp.alphabet, example1.alphabet)

    def test_the_reverse_complement(self):
        """Check reverse_complement() against a reversed letter mapping."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.reverse_complement()
            except ValueError as e:
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            str1 = str(example1)
            # This only does the unambiguous cases
            looks_rna = ("U" in str1 or "u" in str1 or
                         example1.alphabet == generic_rna)
            looks_dna = ("T" in str1 or "t" in str1 or
                         example1.alphabet == generic_dna or
                         example1.alphabet == generic_nucleotide)
            if looks_rna:
                mapping = maketrans("ACGUacgu", "UGCAugca")
            elif looks_dna:
                mapping = maketrans("ACGTacgt", "TGCAtgca")
            elif not ("A" in str1 or "a" in str1):
                # No A to pair, so only C and G need swapping
                mapping = maketrans("CGcg", "GCgc")
            else:
                # TODO - look at alphabet?
                continue
            self.assertEqual(str1.translate(mapping)[::-1], str(comp))
            self.assertEqual(comp.alphabet, example1.alphabet)

    def test_the_transcription(self):
        """Check transcribe() turns T/t into U/u and gives generic_rna."""
        tolerated = ("Proteins cannot be transcribed!",
                     "RNA cannot be transcribed!")
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.transcribe()
            except ValueError as e:
                if str(e) in tolerated:
                    continue
                raise
            str1 = str(example1)
            # NOTE(review): this multiple-of-three filter resembles the one
            # in the translation test - confirm it is wanted here.
            if len(str1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            expected = str1.replace("T", "U").replace("t", "u")
            self.assertEqual(expected, str(tran))
            # based on limited examples
            self.assertEqual(tran.alphabet, generic_rna)

    def test_the_back_transcription(self):
        """Check back_transcribe() turns U/u into T/t and gives generic_dna."""
        tolerated = ("Proteins cannot be back transcribed!",
                     "DNA cannot be back transcribed!")
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.back_transcribe()
            except ValueError as e:
                if str(e) in tolerated:
                    continue
                raise
            str1 = str(example1)
            expected = str1.replace("U", "T").replace("u", "t")
            self.assertEqual(expected, str(tran))
            # based on limited examples
            self.assertEqual(tran.alphabet, generic_dna)

    def test_the_translate(self):
        """Check obj.translate() method.

        Only examples whose length is a multiple of three are translated.
        Only the alphabet of the result is checked, not the translation.
        """
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            if len(example1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            try:
                tran = example1.translate()
            except ValueError as e:
                if str(e) == "Proteins cannot be translated!":
                    continue
                raise
            # This is based on the limited example not having stop codons.
            # assertIn reports the offending alphabet in the failure message
            # instead of printing it before a bare self.fail().
            self.assertIn(
                tran.alphabet, [extended_protein, protein, generic_protein],
                "Unexpected alphabet %r from translating %r" %
                (tran.alphabet, example1))
            # TODO - check the actual translation, and all the optional args

    def test_the_translation_of_stops(self):
        """Check translate() on stop codons with assorted codon tables."""
        misc_stops = "TAATAGTGAAGAAGG"

        # Cases for the translate() method:
        # (expected protein, positional arguments, keyword arguments)
        method_cases = [
            ("***RR", (), {}),
            ("***RR", (1, ), {}),
            ("***RR", ("SGC0", ), {}),
            ("**W**", (), {"table": 2}),
            ("**WRR", (), {"table": 'Yeast Mitochondrial'}),
            ("**WSS", (), {"table": 5}),
            ("**WSS", (), {"table": 9}),
            ("**CRR", (), {"table": 'Euplotid Nuclear'}),
            ("***RR", (), {"table": 11}),
            ("***RR", (), {"table": '11'}),
            ("***RR", (), {"table": 'Bacterial'}),
            ("**GRR", (), {"table": 25}),
            ("", (), {"to_stop": True}),
            ("O*ORR", (), {"table": special_table}),
            ("*QWRR", (), {"table": Chilodonella_uncinata_table}),
        ]
        # Cases for the Bio.Seq.translate() function, which is given the
        # sequence as a string: (expected protein, keyword arguments)
        function_cases = [
            ("*QWRR", {"table": Chilodonella_uncinata_table}),
            ("O*ORR", {"table": special_table}),
            ("", {"to_stop": True}),
            ("***RR", {"table": 'Bacterial'}),
            ("***RR", {"table": '11'}),
            ("***RR", {"table": 11}),
            ("**W**", {"table": 2}),
        ]
        nucleotides = [
            Seq(misc_stops),
            Seq(misc_stops, generic_nucleotide),
            Seq(misc_stops, generic_dna),
            Seq(misc_stops, unambiguous_dna),
        ]
        for nuc in nucleotides:
            for protein_text, args, kwargs in method_cases:
                self.assertEqual(protein_text,
                                 str(nuc.translate(*args, **kwargs)))
            # These test the Bio.Seq.translate() function - move these?:
            for protein_text, kwargs in function_cases:
                self.assertEqual(protein_text,
                                 translate(str(nuc), **kwargs))

        # Single codons in upper, mixed and lower case:
        # (codon, expected amino acid)
        codon_cases = [
            ("TAT", "Y"), ("TAR", "*"), ("TAN", "X"), ("NNN", "X"),
            ("TAt", "Y"), ("TaR", "*"), ("TaN", "X"), ("nnN", "X"),
            ("tat", "Y"), ("tar", "*"), ("tan", "X"), ("nnn", "X"),
        ]
        for codon, amino_acid in codon_cases:
            self.assertEqual(str(Seq(codon).translate()), amino_acid)

    def test_the_translation_of_invalid_codons(self):
        """Check obj.translate() method with invalid codons.

        Each codon is tried with several alphabets, and translating it must
        raise TranslationError.
        """
        for codon in ["TA?", "N-N", "AC_", "Ac_"]:
            for nuc in [
                    Seq(codon),
                    Seq(codon, generic_nucleotide),
                    Seq(codon, generic_dna),
                    Seq(codon, unambiguous_dna)
            ]:
                # Keep only the call under test inside the try, and do not
                # print the result: an unexpected success is reported by
                # self.fail in the else branch.
                try:
                    nuc.translate()
                except TranslationError:
                    pass
                else:
                    self.fail("Translating %s should fail" % codon)

    def test_the_translation_of_ambig_codons(self):
        """Check translate() on every codon built from ambiguous letters."""
        # Ambiguous protein letters and the pair of residues each must cover
        # TODO - Use the Bio.Data.IUPACData module for the
        # ambiguous protein mappings?
        residue_pairs = {"Z": set("EQ"), "B": set("DN"), "J": set("LI")}
        letter_sets = [
            (ambiguous_dna.letters, ambiguous_dna_values),
            (ambiguous_rna.letters, ambiguous_rna_values),
        ]
        for letters, ambig_values in letter_sets:
            ambig = set(letters)
            for c1 in ambig:
                for c2 in ambig:
                    for c3 in ambig:
                        codon = c1 + c2 + c3
                        # Translations of every unambiguous expansion
                        values = set(
                            str(Seq(a + b + c).translate())
                            for a in ambig_values[c1]
                            for b in ambig_values[c2]
                            for c in ambig_values[c3])
                        t = str(Seq(codon).translate())
                        if t == "X":
                            # X must stand for more than one outcome
                            self.assertTrue(
                                len(values) > 1,
                                "translate('%s') = '%s' not '%s'" %
                                (codon, t, ",".join(values)))
                        else:
                            # Any other letter (stop included) must equal
                            # the full set of expansions
                            self.assertEqual(
                                values, residue_pairs.get(t, set(t)))

    def test_init_typeerror(self):
        """Check Seq() raises TypeError for an int and for a Seq argument."""
        # Only expect it to take strings and unicode - not Seq objects!
        for unsupported in (1066, Seq("ACGT", generic_dna)):
            self.assertRaises(TypeError, Seq, unsupported)

    def test_MutableSeq_init_typeerror(self):
        """Check MutableSeq() raises TypeError for Seq and UnknownSeq input."""
        for unsupported in (Seq("A"), UnknownSeq(1)):
            self.assertRaises(TypeError, MutableSeq, unsupported)

    def test_join_Seq_ValueError(self):
        """Check Seq.join raises ValueError for unsupported arguments."""
        # A bare integer, a plain string, a single Seq or MutableSeq, and a
        # list holding a non-sequence item are each expected to be rejected.
        spacer = Seq('NNNNN')
        rejected = [
            5,
            "ATG",
            Seq("ATG"),
            MutableSeq("ATG"),
            ["ATG", "ATG", 5, "ATG"],
        ]
        for argument in rejected:
            with self.assertRaises(ValueError):
                spacer.join(argument)

    def test_join_UnknownSeq_ValueError(self):
        """Check UnknownSeq.join raises ValueError for unsupported arguments."""
        # A bare integer, a plain string, a single Seq or MutableSeq, and a
        # list holding a non-sequence item are each expected to be rejected.
        spacer = UnknownSeq(5, character="-")
        rejected = [
            5,
            "ATG",
            Seq("ATG"),
            MutableSeq("ATG"),
            ["ATG", "ATG", 5, "ATG"],
        ]
        for argument in rejected:
            with self.assertRaises(ValueError):
                spacer.join(argument)

    def test_join_MutableSeq_ValueError(self):
        """Check MutableSeq.join raises ValueError for unsupported arguments."""
        # A bare integer, a plain string, a single Seq or MutableSeq, and a
        # list holding a non-sequence item are each expected to be rejected.
        spacer = MutableSeq("MMMMM")
        rejected = [
            5,
            "ATG",
            Seq("ATG"),
            MutableSeq("ATG"),
            ["ATG", "ATG", 5, "ATG"],
        ]
        for argument in rejected:
            with self.assertRaises(ValueError):
                spacer.join(argument)

    def test_join_Seq_TypeError(self):
        """Check Seq.join raises TypeError for incompatible alphabets."""
        # A DNA spacer must refuse to join RNA or protein sequences.
        dna_spacer = Seq('NNNNN', generic_dna)
        for other_alphabet in (generic_rna, generic_protein):
            pieces = [
                Seq('NNNNN', other_alphabet),
                Seq('NNNNN', other_alphabet),
            ]
            with self.assertRaises(TypeError):
                dna_spacer.join(pieces)

    def test_join_UnknownSeq_TypeError(self):
        """Check UnknownSeq.join raises TypeError for incompatible alphabets."""
        # A DNA spacer must refuse to join RNA or protein sequences.
        dna_spacer = UnknownSeq(5, character="-", alphabet=generic_dna)

        rna_pieces = [
            UnknownSeq(5, character="-", alphabet=generic_rna),
            UnknownSeq(5, character="-", alphabet=generic_rna),
        ]
        with self.assertRaises(TypeError):
            dna_spacer.join(rna_pieces)

        protein_pieces = [
            Seq('NNNNN', generic_protein),
            UnknownSeq(5, character="-", alphabet=generic_protein),
        ]
        with self.assertRaises(TypeError):
            dna_spacer.join(protein_pieces)

    def test_join_MutableSeq_TypeError(self):
        """Check MutableSeq.join raises TypeError for incompatible alphabets."""
        # A DNA spacer must refuse to join RNA or protein sequences.
        dna_spacer = MutableSeq('NNNNN', generic_dna)

        rna_pieces = [
            MutableSeq('NNNNN', generic_rna),
            MutableSeq('NNNNN', generic_rna),
        ]
        with self.assertRaises(TypeError):
            dna_spacer.join(rna_pieces)

        protein_pieces = [
            Seq('NNNNN', generic_protein),
            MutableSeq('NNNNN', generic_protein),
        ]
        with self.assertRaises(TypeError):
            dna_spacer.join(protein_pieces)

    def test_join_Seq(self):
        """Check Seq.join puts the spacer between the joined items.

        The joined text must equal ``str(spacer).join(...)`` of the plain
        strings, and the result must carry the spacer's alphabet.
        """
        # Only expect it to take Seq objects and/or strings in an iterable!
        empty_spacer = Seq('', generic_dna)
        all_spacers = (
            empty_spacer,
            Seq('NNNNN', generic_dna),
            Seq('GGG', generic_nucleotide),
        )
        plain_items = ["ATG", "ATG", "ATG", "ATG"]
        mixed_items = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # Plain strings joined with the empty spacer.
        joined = empty_spacer.join(plain_items)
        self.assertEqual(str(joined), "".join(plain_items))
        self.assertEqual(joined.alphabet, empty_spacer.alphabet)

        # Strings mixed with a Seq, joined with each spacer in turn.
        for spacer in all_spacers:
            joined = spacer.join(mixed_items)
            self.assertEqual(str(joined), str(spacer).join(plain_items))
            self.assertEqual(joined.alphabet, spacer.alphabet)

    def test_join_Seq_with_file(self):
        """Check Seq.join on sequences parsed from a FASTA file.

        Joining the parsed ``.seq`` objects must match joining their string
        forms; passing the SeqIO.parse iterator itself must raise TypeError.
        """
        fasta_path = 'Fasta/f003'
        sequences = [rec.seq for rec in SeqIO.parse(fasta_path, 'fasta')]
        texts = list(map(str, sequences))

        filled_spacer = Seq('NNNNN')
        empty_spacer = Seq('')
        # Seq objects joined with the non-empty spacer ...
        joined_filled = filled_spacer.join(sequences)
        # ... and with the empty spacer.
        joined_empty = empty_spacer.join(sequences)

        self.assertEqual(str(joined_filled), str(filled_spacer).join(texts))
        self.assertEqual(str(joined_empty), str(empty_spacer).join(texts))
        with self.assertRaises(TypeError):
            filled_spacer.join(SeqIO.parse(fasta_path, 'fasta'))

    def test_join_UnknownSeq(self):
        """Check UnknownSeq.join puts the spacer between the joined items.

        The joined text must equal ``str(spacer).join(...)`` of the plain
        strings, and the result must carry the spacer's alphabet.
        """
        # Only expect it to take Seq objects and/or strings in an iterable!
        empty_spacer = UnknownSeq(0, character="-", alphabet=generic_dna)
        all_spacers = (
            empty_spacer,
            UnknownSeq(5, character="-", alphabet=generic_dna),
            UnknownSeq(5, character="-", alphabet=generic_nucleotide),
        )
        plain_items = ["ATG", "ATG", "ATG", "ATG"]
        mixed_items = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # Plain strings joined with the zero-length spacer.
        joined = empty_spacer.join(plain_items)
        self.assertEqual(str(joined), "".join(plain_items))
        self.assertEqual(joined.alphabet, empty_spacer.alphabet)

        # Strings mixed with a Seq, joined with each spacer in turn.
        for spacer in all_spacers:
            joined = spacer.join(mixed_items)
            self.assertEqual(str(joined), str(spacer).join(plain_items))
            self.assertEqual(joined.alphabet, spacer.alphabet)

    def test_join_UnknownSeq_with_file(self):
        """Check UnknownSeq.join on sequences parsed from a FASTA file.

        Joining the parsed ``.seq`` objects must match joining their string
        forms; passing the SeqIO.parse iterator itself must raise TypeError.
        """
        fasta_path = 'Fasta/f003'
        sequences = [rec.seq for rec in SeqIO.parse(fasta_path, 'fasta')]
        texts = list(map(str, sequences))

        empty_spacer = UnknownSeq(0, character="-", alphabet=generic_dna)
        dash_spacer = UnknownSeq(5, character="-", alphabet=generic_dna)
        # Seq objects joined with the zero-length spacer ...
        joined_empty = empty_spacer.join(sequences)
        # ... and with the five-character spacer.
        joined_dashes = dash_spacer.join(sequences)

        self.assertEqual(str(joined_empty), str(empty_spacer).join(texts))
        self.assertEqual(str(joined_dashes), str(dash_spacer).join(texts))
        with self.assertRaises(TypeError):
            empty_spacer.join(SeqIO.parse(fasta_path, 'fasta'))

    def test_join_MutableSeq(self):
        """Check MutableSeq.join puts the spacer between the joined items.

        The joined text must equal ``str(spacer).join(...)`` of the plain
        strings, and the result must carry the spacer's alphabet.
        """
        # Only expect it to take Seq objects and/or strings in an iterable!
        empty_spacer = MutableSeq('', generic_dna)
        all_spacers = (
            empty_spacer,
            MutableSeq('NNNNN', generic_dna),
            MutableSeq('GGG', generic_nucleotide),
        )
        plain_items = ["ATG", "ATG", "ATG", "ATG"]
        mixed_items = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

        # Plain strings joined with the empty spacer.
        joined = empty_spacer.join(plain_items)
        self.assertEqual(str(joined), "".join(plain_items))
        self.assertEqual(joined.alphabet, empty_spacer.alphabet)

        # Strings mixed with a Seq, joined with each spacer in turn.
        for spacer in all_spacers:
            joined = spacer.join(mixed_items)
            self.assertEqual(str(joined), str(spacer).join(plain_items))
            self.assertEqual(joined.alphabet, spacer.alphabet)

    def test_join_MutableSeq_with_file(self):
        """Check MutableSeq.join on sequences parsed from a FASTA file.

        Joining the parsed ``.seq`` objects must match joining their string
        forms; passing the SeqIO.parse iterator itself must raise TypeError.
        """
        fasta_path = 'Fasta/f003'
        sequences = [rec.seq for rec in SeqIO.parse(fasta_path, 'fasta')]
        texts = list(map(str, sequences))

        filled_spacer = MutableSeq('NNNNN')
        empty_spacer = MutableSeq('')
        # Seq objects joined with the non-empty spacer ...
        joined_filled = filled_spacer.join(sequences)
        # ... and with the empty spacer.
        joined_empty = empty_spacer.join(sequences)

        self.assertEqual(str(joined_filled), str(filled_spacer).join(texts))
        self.assertEqual(str(joined_empty), str(empty_spacer).join(texts))
        with self.assertRaises(TypeError):
            filled_spacer.join(SeqIO.parse(fasta_path, 'fasta'))
Esempio n. 27
0
class StringMethodTests(unittest.TestCase):
    # Sequence objects shared by every test in this class.  Several tests
    # pair this list with a positional list of expected values, so the
    # order of the entries matters.
    _examples = [
        # These are length 9, a multiple of 3 for translation tests:
        Seq("ACGTGGGGT"),
        Seq("ACGUGGGGU"),
        Seq("GG"),
        Seq("A"),
        UnknownSeq(1),
        UnknownSeq(1, character="n"),
        UnknownSeq(1, character="N"),
        UnknownSeq(12, character="N"),
        UnknownSeq(12, character="X"),
        UnknownSeq(12),
    ]
    # Append a MutableSeq copy of each example above.  The loop walks a
    # copy of the list because the list itself grows while it runs.
    for seq in _examples[:]:
        if not isinstance(seq, MutableSeq):
            _examples.append(MutableSeq(seq))
    # Start/end (and index) values tried by the tests: in-range, far
    # out-of-range, negative, and None.
    _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None]

    def _test_method(self, method_name, start_end=False):
        """Check this method matches the plain string's method.

        For every ordered pair of examples that both provide ``method_name``
        the method is called on the first example, with the second example
        as argument (once as a plain string, once as the object itself), and
        the outcome is compared with calling the same method on
        ``str(first)``.  A raised ValueError counts as an outcome, so both
        sides have to agree on it.

        If ``start_end`` is true the comparison is repeated with every start
        value, and with every start/end pair, from ``_start_end_values``.
        """
        self.assertIsInstance(method_name, str)

        def outcome(target, *args):
            """Return target.<method_name>(*args), or ValueError if raised."""
            try:
                return getattr(target, method_name)(*args)
            except ValueError:
                return ValueError

        # Skip examples without the method,
        # e.g. MutableSeq does not support translate
        usable = [ex for ex in self._examples if hasattr(ex, method_name)]

        for subject in usable:
            subject_text = str(subject)

            for argument in usable:
                argument_text = str(argument)

                # Argument given as a plain string.
                self.assertEqual(
                    outcome(subject, argument_text),
                    outcome(subject_text, argument_text),
                    "%r.%s(%r)" % (subject, method_name, argument_text),
                )
                # Argument given as the sequence object itself.
                self.assertEqual(
                    outcome(subject, argument),
                    outcome(subject_text, argument_text),
                    "%r.%s(%r)" % (subject, method_name, argument),
                )

                if not start_end:
                    continue

                for start in self._start_end_values:
                    self.assertEqual(
                        outcome(subject, argument_text, start),
                        outcome(subject_text, argument_text, start),
                        "%r.%s(%r, %s)"
                        % (subject, method_name, argument_text, start),
                    )

                    for end in self._start_end_values:
                        self.assertEqual(
                            outcome(subject, argument_text, start, end),
                            outcome(subject_text, argument_text, start, end),
                            "%r.%s(%r, %s, %s)"
                            % (subject, method_name, argument_text, start, end),
                        )

    def test_str_count(self):
        """Check matches the python string count method."""
        self._test_method("count", start_end=True)
        self.assertEqual(Seq("AC777GT").count("7"), 3)
        # Neither an integer nor None is accepted as the search term.
        for bad_term in (7, None):
            digits = Seq("AC777GT")
            with self.assertRaises(TypeError):
                digits.count(bad_term)

    def test_count_overlap(self):
        """Check count_overlap against count, and its TypeError cases."""
        # count() skips overlapping hits, count_overlap() includes them.
        self.assertEqual(Seq("AC777GT").count("77"), 1)
        for term, overlapping in (("77", 2), ("7", 3)):
            self.assertEqual(Seq("AC777GT").count_overlap(term), overlapping)
        # Neither an integer nor None is accepted as the search term.
        for bad_term in (7, None):
            digits = Seq("AC777GT")
            with self.assertRaises(TypeError):
                digits.count_overlap(bad_term)

    def test_str_count_overlap_GG(self):
        """Check our count_overlap method using GG.

        Each entry of ``self._examples`` is searched for "GG" (given as a
        string and as a Seq) and compared with the expected overlapping
        count at the same position.  A run of five G's must never be found.
        """
        # Expected counts, in the order of self._examples.
        seq_expected = [3, 3, 1, 0]  # Seq() examples
        unknown_expected = [0, 0, 0, 0, 0, 0]  # UnknownSeq() examples
        # The second half of the examples are MutableSeq() copies.
        expected = (seq_expected + unknown_expected) * 2

        # Use a real test assertion: a bare assert is stripped under -O and
        # zip() below would then silently ignore any unmatched entries.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term GG as a string
            self.assertEqual(seq.count_overlap("GG"), exp)
            self.assertEqual(seq.count_overlap("G" * 5), 0)
            # Using search term GG as a Seq
            self.assertEqual(seq.count_overlap(Seq("GG")), exp)
            self.assertEqual(seq.count_overlap(Seq("G" * 5)), 0)

    def test_count_overlap_start_end_GG(self):
        """Check our count_overlap method using GG with variable ends and starts."""
        # Seq() and MutableSeq() on one sequence:
        # (start, end, expected count)
        windows = [
            (1, 7, 3),
            (3, None, 3),
            (3, 6, 2),
            (4, 6, 1),
            (4, -1, 2),
            (-5, None, 2),
            (-5, 7, 2),
            (7, -5, 0),
            (-100, None, 3),
            (None, 100, 3),
            (-100, 1000, 3),
        ]
        text = "GTAGGGGAG"
        for begin, stop, want in windows:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(text).count_overlap("GG", begin, stop), want
                )

        # Seq() and MutableSeq() with a more heterogeneous sequence:
        # (extra positional arguments, expected count)
        mixed = "GGGTGGTAGGG"
        bounded = [
            ((), 5),
            ((2, 8), 1),
            ((-11, 6), 3),
            ((7, 2), 0),
        ]
        for bounds, want in bounded:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(mixed).count_overlap("GG", *bounds), want
                )
        self.assertEqual(Seq(mixed).count_overlap("GG", -2, -10), 0)

        # UnknownSeq() of length 12: (character, start, end, expected count)
        # Some rows are repeated; they are checked again as listed.
        by_character = [
            ("N", 1, 7, 0),
            ("N", 1, 7, 0),
            ("N", -4, None, 0),
            ("N", -4, None, 0),
            ("X", 1, 7, 0),
        ]
        for char, begin, stop, want in by_character:
            unknown = UnknownSeq(12, character=char)
            self.assertEqual(unknown.count_overlap("GG", begin, stop), want)
        self.assertEqual(UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

        # UnknownSeq() of length 7 with more cases, including unusual edge
        # cases: (search term, start, end, expected count)
        by_term = [
            ("G", 100, 105, 0),
            ("G", -1, 4, 0),
            ("G", 4, -1, 0),
            ("G", -8, -2, 0),
            ("G", -2, -8, 0),
            ("G", 8, 2, 0),
            ("G", 2, 8, 0),
            ("GG", 8, 2, 0),
            ("GG", 2, 8, 0),
            ("GG", -5, -1, 0),
            ("GG", 1, 5, 0),
            ("GGG", None, None, 0),
            ("GGGGGGGGG", None, None, 0),
            ("GGG", 1, 2, 0),
        ]
        for term, begin, stop, want in by_term:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(term, begin, stop), want)
        self.assertEqual(UnknownSeq(7, character="N").count_overlap("GG", 1), 0)

    def test_str_count_overlap_NN(self):
        """Check our count_overlap method using NN.

        Each entry of ``self._examples`` is searched for "NN" (given as a
        string and as a Seq) and compared with the expected overlapping
        count at the same position.  A run of thirteen N's must never be
        found.
        """
        # Expected counts, in the order of self._examples.
        seq_expected = [0, 0, 0, 0]  # Seq() examples
        unknown_expected = [0, 0, 0, 11, 0, 0]  # UnknownSeq() examples
        # The second half of the examples are MutableSeq() copies.
        expected = (seq_expected + unknown_expected) * 2

        # Use a real test assertion: a bare assert is stripped under -O and
        # zip() below would then silently ignore any unmatched entries.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term NN as a string
            self.assertEqual(seq.count_overlap("NN"), exp)
            self.assertEqual(seq.count_overlap("N" * 13), 0)
            # Using search term NN as a Seq
            self.assertEqual(seq.count_overlap(Seq("NN")), exp)
            self.assertEqual(seq.count_overlap(Seq("N" * 13)), 0)

    def test_count_overlap_start_end_NN(self):
        """Check our count_overlap method using NN with variable ends and starts."""
        # Seq() and MutableSeq() on one sequence:
        # (start, end, expected count)
        windows = [
            (1, 7, 0),
            (3, None, 0),
            (3, 6, 0),
            (4, 6, 0),
            (4, -1, 0),
            (-5, None, 0),
            (-5, 7, 0),
            (7, -5, 0),
            (-100, None, 0),
            (None, 100, 0),
            (-100, 1000, 0),
        ]
        text = "GTAGGGGAG"
        for begin, stop, want in windows:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(text).count_overlap("NN", begin, stop), want
                )

        # Seq() and MutableSeq() with a more heterogeneous sequence:
        # (extra positional arguments, expected count)
        mixed = "GGGTGGTAGGG"
        bounded = [
            ((), 0),
            ((2, 8), 0),
            ((-11, 6), 0),
            ((7, 2), 0),
        ]
        for bounds, want in bounded:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(mixed).count_overlap("NN", *bounds), want
                )
        self.assertEqual(Seq(mixed).count_overlap("NN", -10, -2), 0)

        # UnknownSeq() of length 12: (character, start, end, expected count)
        # Some rows are repeated; they are checked again as listed.
        by_character = [
            ("N", 1, 7, 5),
            ("N", 1, 7, 5),
            ("N", -4, None, 3),
            ("N", -4, None, 3),
            ("X", 1, 7, 0),
        ]
        for char, begin, stop, want in by_character:
            unknown = UnknownSeq(12, character=char)
            self.assertEqual(unknown.count_overlap("NN", begin, stop), want)
        self.assertEqual(UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

        # UnknownSeq() of length 7 with more cases, including unusual edge
        # cases: (search term, start, end, expected count)
        by_term = [
            ("N", 100, 105, 0),
            ("N", -1, 4, 0),
            ("N", 4, -1, 2),
            ("N", -8, -2, 5),
            ("N", -2, -8, 0),
            ("N", 8, 2, 0),
            ("N", 2, 8, 5),
            ("NN", 8, 2, 0),
            ("NN", 2, 8, 4),
            ("NN", -5, -1, 3),
            ("NN", 1, 5, 3),
            ("NNN", None, None, 5),
            ("NNNNNNNNN", None, None, 0),
            ("NNN", 1, 2, 0),
        ]
        for term, begin, stop, want in by_term:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(term, begin, stop), want)
        self.assertEqual(UnknownSeq(7, character="N").count_overlap("NN", 1), 5)

    def test_str_find(self):
        """Check matches the python string find method."""
        self._test_method("find", start_end=True)
        self.assertEqual(Seq("AC7GT").find("7"), 2)
        # Neither an integer nor None is accepted as the search term.
        for text, bad_term in (("AC7GT", 7), ("ACGT", None)):
            target = Seq(text)
            with self.assertRaises(TypeError):
                target.find(bad_term)

    def test_str_rfind(self):
        """Check matches the python string rfind method."""
        self._test_method("rfind", start_end=True)
        self.assertEqual(Seq("AC7GT").rfind("7"), 2)
        # Neither an integer nor None is accepted as the search term.
        for text, bad_term in (("AC7GT", 7), ("ACGT", None)):
            target = Seq(text)
            with self.assertRaises(TypeError):
                target.rfind(bad_term)

    def test_str_index(self):
        """Check matches the python string index method."""
        self._test_method("index", start_end=True)
        # The same spot checks are made for Seq and then for MutableSeq.
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type("AC7GT").index("7"), 2)
            # Neither an integer nor None is accepted as the search term.
            for text, bad_term in (("AC7GT", 7), ("ACGT", None)):
                target = seq_type(text)
                with self.assertRaises(TypeError):
                    target.index(bad_term)

    def test_str_rindex(self):
        """Check matches the python string rindex method."""
        self._test_method("rindex", start_end=True)
        # The same spot checks are made for Seq and then for MutableSeq.
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type("AC7GT").rindex("7"), 2)
            # Neither an integer nor None is accepted as the search term.
            for text, bad_term in (("AC7GT", 7), ("ACGT", None)):
                target = seq_type(text)
                with self.assertRaises(TypeError):
                    target.rindex(bad_term)

    def test_str_startswith(self):
        """Check matches the python string startswith method."""
        self._test_method("startswith", start_end=True)
        # Sanity check of the plain string behaviour with a tuple argument.
        self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC")))
        for seq_type in (Seq, MutableSeq):
            target = seq_type("ACGT")
            with self.assertRaises(TypeError):
                target.startswith(None)

        # Now check with a tuple of sub sequences
        for example in self._examples:
            # Two-letter slices of the example, taken at every third offset.
            offsets = range(0, len(example) - 2, 3)
            prefixes = tuple(example[pos : pos + 2] for pos in offsets)
            prefix_texts = tuple(str(piece) for piece in prefixes)

            text = str(example)
            self.assertEqual(
                text.startswith(prefix_texts), example.startswith(prefixes)
            )
            # Same again, handing the Seq-like object a tuple of strings.
            self.assertEqual(
                text.startswith(prefix_texts), example.startswith(prefix_texts)
            )
            self.assertEqual(
                text.startswith(prefix_texts, 3), example.startswith(prefixes, 3)
            )
            self.assertEqual(
                text.startswith(prefix_texts, 2, 6),
                example.startswith(prefixes, 2, 6),
            )

    def test_str_endswith(self):
        """Check matches the python string endswith method.

        Besides the generic comparison done by ``_test_method``, endswith is
        checked with a tuple of sub sequences (given as Seq-like objects and
        as plain strings), with and without start/end arguments.
        """
        self._test_method("endswith", start_end=True)
        # Sanity check of the plain string behaviour with a tuple argument.
        self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE")))
        self.assertRaises(TypeError, Seq("ACGT").endswith, None)

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            # Two-letter slices of the example, taken at every third offset.
            subs = tuple(
                example1[start : start + 2] for start in range(0, len(example1) - 2, 3)
            )
            subs_str = tuple(str(s) for s in subs)

            self.assertEqual(str(example1).endswith(subs_str), example1.endswith(subs))
            # Tuple of plain strings.  This used to call startswith (a
            # copy-paste slip), leaving endswith with strings untested.
            self.assertEqual(
                str(example1).endswith(subs_str), example1.endswith(subs_str)
            )  # strings!
            self.assertEqual(
                str(example1).endswith(subs_str, 3), example1.endswith(subs, 3)
            )
            self.assertEqual(
                str(example1).endswith(subs_str, 2, 6), example1.endswith(subs, 2, 6)
            )

    def test_str_strip(self):
        """Check matches the python string strip method."""
        self._test_method("strip")
        # With no argument, whitespace is removed from both ends.
        padded = Seq(" ACGT ")
        self.assertEqual(padded.strip(), "ACGT")
        # An integer is not accepted as the characters to strip.
        target = Seq("ACGT")
        with self.assertRaises(TypeError):
            target.strip(7)

    def test_str_rstrip(self):
        """Check matches the python string rstrip method."""
        self._test_method("rstrip")
        # With no argument, only trailing whitespace is removed.
        padded = Seq(" ACGT ")
        self.assertEqual(padded.rstrip(), " ACGT")
        # An integer is not accepted as the characters to strip.
        target = Seq("ACGT")
        with self.assertRaises(TypeError):
            target.rstrip(7)

    def test_str_lstrip(self):
        """Check matches the python string lstrip method."""
        # Compare lstrip (not rstrip, as a copy-paste slip used to do) with
        # the plain string method for all pairs of examples.
        self._test_method("lstrip")
        # With no argument, only leading whitespace is removed.
        self.assertEqual(Seq(" ACGT ").lstrip(), "ACGT ")
        # An integer is not accepted as the characters to strip.
        self.assertRaises(TypeError, Seq("ACGT").lstrip, 7)

    def test_str_split(self):
        """Check matches the python string split method."""
        self._test_method("split")
        # Call split on both sides; the Seq side used to call rsplit.
        self.assertEqual(Seq("AC7GT").split("7"), "AC7GT".split("7"))
        # An integer is not accepted as the separator.
        self.assertRaises(TypeError, Seq("AC7GT").split, 7)

    def test_str_rsplit(self):
        """Check matches the python string rsplit method."""
        self._test_method("rsplit")
        pieces = Seq("AC7GT").rsplit("7")
        self.assertEqual(pieces, "AC7GT".rsplit("7"))
        # An integer is not accepted as the separator.
        target = Seq("AC7GT")
        with self.assertRaises(TypeError):
            target.rsplit(7)

    def test_str_length(self):
        """Check matches the python string __len__ method."""
        for example in self._examples:
            text = str(example)
            # The object and its string form must report the same length.
            self.assertEqual(len(example), len(text))

    def test_str_upper(self):
        """Check matches the python string upper method."""
        # MutableSeq examples are left out of this comparison.
        immutable = [
            ex for ex in self._examples if not isinstance(ex, MutableSeq)
        ]
        for example in immutable:
            text = str(example)
            self.assertEqual(example.upper(), text.upper())

    def test_str_lower(self):
        """Check matches the python string lower method."""
        # MutableSeq examples are left out of this comparison.
        immutable = [
            ex for ex in self._examples if not isinstance(ex, MutableSeq)
        ]
        for example in immutable:
            text = str(example)
            self.assertEqual(example.lower(), text.lower())

    def test_str_encode(self):
        """Check matches the python string encode method."""
        for example in self._examples:
            # bytes(obj) must equal the ASCII encoding of str(obj).
            encoded_text = str(example).encode("ascii")
            self.assertEqual(bytes(example), encoded_text)

    def test_str_hash(self):
        """Check hash(obj) equals hash(str(obj)) for non-MutableSeq examples."""
        immutable = [
            ex for ex in self._examples if not isinstance(ex, MutableSeq)
        ]
        for example in immutable:
            with warnings.catch_warnings():
                # Silence change in behaviour warning
                warnings.simplefilter("ignore", BiopythonWarning)
                text_hash = hash(str(example))
                object_hash = hash(example)
                message = "Hash mismatch, %r for %r vs %r for %r" % (
                    text_hash,
                    id(example),
                    object_hash,
                    example,
                )
                self.assertEqual(text_hash, object_hash, message)

    def test_str_comparison(self):
        """Check ==, !=, <, <=, > and >= agree with comparing the strings."""
        # (failure message template, comparison) in the order they are tried.
        checks = [
            ("Checking %r == %r", lambda left, right: left == right),
            ("Checking %r != %r", lambda left, right: left != right),
            ("Checking %r < %r", lambda left, right: left < right),
            ("Checking %r <= %r", lambda left, right: left <= right),
            ("Checking %r > %r", lambda left, right: left > right),
            ("Checking %r >= %r", lambda left, right: left >= right),
        ]
        for first in self._examples:
            for second in self._examples:
                with warnings.catch_warnings():
                    for template, compare in checks:
                        self.assertEqual(
                            compare(str(first), str(second)),
                            compare(first, second),
                            template % (first, second),
                        )

    def test_str_getitem(self):
        """Check slicing and indexing works like a string.

        Every example is indexed and sliced with the values from
        ``_start_end_values`` and with steps from -3 to 3, and compared with
        doing the same to its string form.  A step of zero must raise
        ValueError.
        """
        for example1 in self._examples:
            str1 = str(example1)
            for i in self._start_end_values:
                # Single-item indexing only for positions inside the sequence.
                if i is not None and abs(i) < len(example1):
                    self.assertEqual(example1[i], str1[i])
                self.assertEqual(example1[:i], str1[:i])
                self.assertEqual(example1[i:], str1[i:])
                for j in self._start_end_values:
                    self.assertEqual(example1[i:j], str1[i:j])
                    for step in range(-3, 4):
                        if step == 0:
                            # Report a normal test failure if no ValueError
                            # is raised (previously this printed the slice
                            # and hit a non-existent self._assert).
                            with self.assertRaises(ValueError):
                                example1[i:j:step]
                        else:
                            self.assertEqual(example1[i:j:step], str1[i:j:step])

    def test_tomutable(self):
        """Check creating a MutableSeq object."""
        for source in self._examples:
            converted = MutableSeq(source)
            # The copy must have the new type and still compare equal.
            self.assertIsInstance(converted, MutableSeq)
            self.assertEqual(converted, source)

    def test_toseq(self):
        """Check creating a Seq object."""
        for source in self._examples:
            converted = Seq(source)
            # The copy must have the new type and still compare equal.
            self.assertIsInstance(converted, Seq)
            self.assertEqual(converted, source)

    def test_the_complement(self):
        """Check obj.complement() method."""
        rna_table = str.maketrans("ACGUacgu", "UGCAugca")
        dna_table = str.maketrans("ACGTacgt", "TGCAtgca")
        for example in self._examples:
            if isinstance(example, MutableSeq):
                continue
            try:
                complemented = example.complement()
            except ValueError as err:
                # The only ValueError tolerated here is the protein one.
                self.assertEqual(str(err), "Proteins do not have complements!")
                continue
            text = str(example)
            # Default to DNA, e.g. complement("A") -> "T" not "U"
            looks_like_rna = "U" in text or "u" in text
            table = rna_table if looks_like_rna else dna_table
            self.assertEqual(text.translate(table), complemented)

    def test_the_reverse_complement(self):
        """Check obj.reverse_complement() method."""
        rna_table = str.maketrans("ACGUacgu", "UGCAugca")
        dna_table = str.maketrans("ACGTacgt", "TGCAtgca")
        for example in self._examples:
            if isinstance(example, MutableSeq):
                continue
            try:
                reverse_complemented = example.reverse_complement()
            except ValueError as err:
                # The only ValueError tolerated here is the protein one.
                self.assertEqual(str(err), "Proteins do not have complements!")
                continue
            text = str(example)
            # Defaults to DNA, so reverse_complement("A") --> "T" not "U"
            looks_like_rna = "U" in text or "u" in text
            table = rna_table if looks_like_rna else dna_table
            # Complement letter by letter, then reverse the string.
            self.assertEqual(text.translate(table)[::-1], reverse_complemented)

    def test_the_transcription(self):
        """Check obj.transcribe() method.

        MutableSeq examples are skipped, as are examples whose transcribe()
        raises a ValueError with one of the two expected messages; any other
        ValueError is re-raised.  The result is compared with the example's
        text after replacing T/t with U/u.
        """
        expected_errors = (
            "Proteins cannot be transcribed!",
            "RNA cannot be transcribed!",
        )
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.transcribe()
            except ValueError as e:
                if str(e) in expected_errors:
                    continue
                raise
            str1 = str(example1)
            if len(str1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                # NOTE(review): this length filter matches the one in the
                # translate test; confirm it is really wanted here.
                continue
            self.assertEqual(str1.replace("T", "U").replace("t", "u"), tran)

    def test_the_back_transcription(self):
        """Check obj.back_transcribe() method.

        MutableSeq examples are skipped, as are examples whose
        back_transcribe() raises a ValueError with one of the two expected
        messages; any other ValueError is re-raised.  The result is compared
        with the example's text after replacing U/u with T/t.
        """
        expected_errors = (
            "Proteins cannot be back transcribed!",
            "DNA cannot be back transcribed!",
        )
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.back_transcribe()
            except ValueError as e:
                if str(e) in expected_errors:
                    continue
                raise
            str1 = str(example1)
            self.assertEqual(str1.replace("U", "T").replace("u", "t"), tran)

    def test_the_translate(self):
        """Check obj.translate() method.

        MutableSeq examples and examples whose length is not a multiple of
        three are skipped, as are examples for which translate() raises the
        protein-specific ValueError; any other ValueError is re-raised.
        """
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            if len(example1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            try:
                # Only called to find out whether this example can be
                # translated at all; the result itself is not inspected.
                example1.translate()
            except ValueError as e:
                if str(e) == "Proteins cannot be translated!":
                    continue
                raise
            # Try with positional vs named argument:
            self.assertEqual(example1.translate(11), example1.translate(table=11))

            # TODO - check the actual translation, and all the optional args

    def test_the_translation_of_stops(self):
        """Check obj.translate() method with stop codons."""
        nuc = Seq("TAATAGTGAAGAAGG")

        # Default table, then the same table given positionally.
        self.assertEqual("***RR", nuc.translate())
        self.assertEqual("***RR", nuc.translate(1))
        self.assertEqual("***RR", nuc.translate("SGC0"))

        # Tables selected by keyword, as (expected, table) pairs.
        method_cases = [
            ("**W**", 2),
            ("**WRR", "Yeast Mitochondrial"),
            ("**WSS", 5),
            ("**WSS", 9),
            ("**CRR", "Euplotid Nuclear"),
            ("***RR", 11),
            ("***RR", "11"),
            ("***RR", "Bacterial"),
            ("**GRR", 25),
        ]
        for expected, table in method_cases:
            self.assertEqual(expected, nuc.translate(table=table))
        self.assertEqual("", nuc.translate(to_stop=True))
        for expected, table in [
            ("O*ORR", special_table),
            ("*QWRR", Chilodonella_uncinata_table),
        ]:
            self.assertEqual(expected, nuc.translate(table=table))

        # These test the Bio.Seq.translate() function - move these?:
        nuc_text = str(nuc)
        for expected, table in [
            ("*QWRR", Chilodonella_uncinata_table),
            ("O*ORR", special_table),
        ]:
            self.assertEqual(expected, translate(nuc_text, table=table))
        self.assertEqual("", translate(nuc_text, to_stop=True))
        for expected, table in [
            ("***RR", "Bacterial"),
            ("***RR", "11"),
            ("***RR", 11),
            ("**W**", 2),
        ]:
            self.assertEqual(expected, translate(nuc_text, table=table))

        # Single codons in upper, mixed and lower case.
        codon_cases = [
            ("TAT", "Y"),
            ("TAR", "*"),
            ("TAN", "X"),
            ("NNN", "X"),
            ("TAt", "Y"),
            ("TaR", "*"),
            ("TaN", "X"),
            ("nnN", "X"),
            ("tat", "Y"),
            ("tar", "*"),
            ("tan", "X"),
            ("nnn", "X"),
        ]
        for codon, residue in codon_cases:
            self.assertEqual(Seq(codon).translate(), residue)

    def test_the_translation_of_invalid_codons(self):
        """Check obj.translate() raises TranslationError for invalid codons."""
        for codon in ["TA?", "N-N", "AC_", "Ac_"]:
            nuc = Seq(codon)
            try:
                nuc.translate()
            except TranslationError:
                # Expected outcome, move on to the next codon.
                continue
            self.fail("Translating %s should fail" % codon)

    def test_the_translation_of_ambig_codons(self):
        """Check obj.translate() method with ambiguous codons.

        Every codon built from the ambiguity letters (except "X") is
        translated, and the result is compared with the set of translations
        of all the concrete codons it stands for.
        """
        # Residues expected behind the ambiguous protein letters.
        # TODO - Use the Bio.Data.IUPACData module for the
        # ambiguous protein mappings?
        ambiguous_residues = {"Z": set("EQ"), "B": set("DN"), "J": set("LI")}
        for ambig_values in [ambiguous_dna_values, ambiguous_rna_values]:
            letters = set(ambig_values)
            letters.remove("X")
            for c1 in letters:
                for c2 in letters:
                    for c3 in letters:
                        codon = c1 + c2 + c3
                        values = {
                            str(Seq(a + b + c).translate())
                            for a in ambig_values[c1]
                            for b in ambig_values[c2]
                            for c in ambig_values[c3]
                        }
                        t = Seq(codon).translate()
                        if t == "X":
                            # "X" is only acceptable when the concrete
                            # codons really disagree.
                            self.assertGreater(
                                len(values),
                                1,
                                "translate('%s') = '%s' not '%s'"
                                % (codon, t, ",".join(values)),
                            )
                            continue
                        # Any other letter (including "*") must stand for
                        # exactly itself, unless it is Z, B or J.
                        expected = ambiguous_residues.get(str(t), set(t))
                        self.assertEqual(values, expected)

    def test_init_typeerror(self):
        """Check Seq __init__ gives TypeError exceptions."""
        # A tuple, a list, an int and a float must all be rejected.
        for bad_data in (("A", "C", "G", "T"), ["A", "C", "G", "T"], 1, 1.0):
            self.assertRaises(TypeError, Seq, bad_data)

    def test_MutableSeq_init_typeerror(self):
        """Check MutableSeq __init__ gives TypeError exceptions."""
        # A tuple, a list, an int and a float must all be rejected.
        for bad_data in (("A", "C", "G", "T"), ["A", "C", "G", "T"], 1, 1.0):
            self.assertRaises(TypeError, MutableSeq, bad_data)

    def test_join_Seq_TypeError(self):
        """Check Seq.join raises TypeError for unsuitable arguments."""
        spacer = Seq("NNNNN")
        # A non-iterable argument:
        with self.assertRaises(TypeError):
            spacer.join(5)
        # An iterable which contains a non-accepted type:
        with self.assertRaises(TypeError):
            spacer.join(["ATG", "ATG", 5, "ATG"])

    def test_join_UnknownSeq_TypeError_iter(self):
        """Check UnknownSeq.join raises TypeError for unsuitable arguments."""
        spacer = UnknownSeq(5, character="-")
        # A non-iterable argument:
        with self.assertRaises(TypeError):
            spacer.join(5)
        # An iterable which contains a non-accepted type:
        with self.assertRaises(TypeError):
            spacer.join(["ATG", "ATG", 5, "ATG"])

    def test_join_MutableSeq_TypeError_iter(self):
        """Check MutableSeq.join raises TypeError for unsuitable arguments."""
        spacer = MutableSeq("MMMMM")
        # A non-iterable argument:
        with self.assertRaises(TypeError):
            spacer.join(5)
        # An iterable which contains a non-accepted type:
        with self.assertRaises(TypeError):
            spacer.join(["ATG", "ATG", 5, "ATG"])

    def test_join_Seq(self):
        """Check Seq.join against str.join for several spacers."""
        n_spacer = Seq("NNNNN")
        self.assertEqual("N" * 15, n_spacer.join([Seq("NNNNN"), Seq("NNNNN")]))

        empty_spacer = Seq("")
        plain_parts = ["ATG", "ATG", "ATG", "ATG"]
        mixed_parts = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # strings with empty spacer
        self.assertEqual(empty_spacer.join(plain_parts), "".join(plain_parts))

        for spacer in [empty_spacer, Seq("NNNNN"), Seq("GGG")]:
            self.assertEqual(
                spacer.join(mixed_parts), str(spacer).join(plain_parts)
            )
            # Now try single sequence arguments, should join the letters
            for target in plain_parts + mixed_parts:
                expected = str(spacer).join(str(target))
                self.assertEqual(expected, str(spacer.join(target)))

    def test_join_UnknownSeq(self):
        """Check UnknownSeq.join against str.join for two spacers."""
        dash_spacer = UnknownSeq(5, character="-")
        empty_spacer = UnknownSeq(0, character="-")

        all_unknown = [UnknownSeq(5, character="-"), UnknownSeq(5, character="-")]
        self.assertEqual("-" * 15, dash_spacer.join(all_unknown))
        seq_then_unknown = [Seq("NNNNN"), UnknownSeq(5, character="-")]
        self.assertEqual("N" * 5 + "-" * 10, dash_spacer.join(seq_then_unknown))

        plain_parts = ["ATG", "ATG", "ATG", "ATG"]
        mixed_parts = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # strings with empty spacer
        self.assertEqual(empty_spacer.join(plain_parts), "".join(plain_parts))

        for spacer in [dash_spacer, empty_spacer]:
            self.assertEqual(
                spacer.join(mixed_parts), str(spacer).join(plain_parts)
            )
            # Now try single sequence arguments, should join the letters
            for target in plain_parts + mixed_parts:
                expected = str(spacer).join(str(target))
                self.assertEqual(expected, str(spacer.join(target)))

    def test_join_MutableSeq_mixed(self):
        """Check MutableSeq.join with MutableSeq-only and with mixed input.

        Joining a list of MutableSeq objects must give the concatenated
        letters; joining a list mixing Seq and MutableSeq is asserted to
        raise a TypeError.
        """
        spacer = MutableSeq("NNNNN")
        self.assertEqual(
            "N" * 15, spacer.join([MutableSeq("NNNNN"), MutableSeq("NNNNN")]),
        )
        # Pass the bound method and its argument separately so that join()
        # is executed by assertRaises itself.  Writing spacer.join([...])
        # here would run the join before assertRaises is entered and hand
        # it the result instead of something to call.
        self.assertRaises(
            TypeError, spacer.join, [Seq("NNNNN"), MutableSeq("NNNNN")],
        )

    def test_join_Seq_with_file(self):
        """Checks if Seq join correctly concatenates sequence from a file with the spacer.

        The sequences parsed from the FASTA file are joined with a non-empty
        and an empty Seq spacer and compared with str.join on their string
        forms.  Joining the parsed records directly must raise a TypeError.
        """
        filename = "Fasta/f003"
        seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
        seqlist_as_strings = [str(_) for _ in seqlist]

        spacer = Seq("NNNNN")
        spacer1 = Seq("")
        # seq objects with spacer
        seq_concatenated = spacer.join(seqlist)
        # seq objects with empty spacer
        seq_concatenated1 = spacer1.join(seqlist)

        # Reference results built with plain string joins.
        ref_data = str(spacer).join(seqlist_as_strings)
        ref_data1 = str(spacer1).join(seqlist_as_strings)

        self.assertEqual(seq_concatenated, ref_data)
        self.assertEqual(seq_concatenated1, ref_data1)
        # The parser yields records rather than their sequences.
        with self.assertRaises(TypeError):
            spacer.join(SeqIO.parse(filename, "fasta"))

    def test_join_UnknownSeq_with_file(self):
        """Checks if UnknownSeq join correctly concatenates sequence from a file with the spacer.

        The sequences parsed from the FASTA file are joined with an empty
        and a five-letter UnknownSeq spacer and compared with str.join on
        their string forms.  Joining the parsed records directly must raise
        a TypeError.
        """
        filename = "Fasta/f003"
        seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
        seqlist_as_strings = [str(_) for _ in seqlist]

        spacer = UnknownSeq(0, character="-")
        spacer1 = UnknownSeq(5, character="-")
        # seq objects with empty (zero length) spacer
        seq_concatenated = spacer.join(seqlist)
        # seq objects with five letter spacer
        seq_concatenated1 = spacer1.join(seqlist)

        # Reference results built with plain string joins.
        ref_data = str(spacer).join(seqlist_as_strings)
        ref_data1 = str(spacer1).join(seqlist_as_strings)

        self.assertEqual(seq_concatenated, ref_data)
        self.assertEqual(seq_concatenated1, ref_data1)
        # The parser yields records rather than their sequences.
        with self.assertRaises(TypeError):
            spacer.join(SeqIO.parse(filename, "fasta"))

    def test_join_MutableSeq(self):
        """Check MutableSeq.join against str.join for several spacers."""
        # Only expect it to take Seq objects and/or strings in an iterable!
        empty_spacer = MutableSeq("")
        plain_parts = ["ATG", "ATG", "ATG", "ATG"]
        mixed_parts = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # strings with empty spacer
        self.assertEqual(empty_spacer.join(plain_parts), "".join(plain_parts))

        for spacer in [empty_spacer, MutableSeq("NNNNN"), MutableSeq("GGG")]:
            joined = spacer.join(mixed_parts)
            self.assertEqual(joined, str(spacer).join(plain_parts))

    def test_join_MutableSeq_with_file(self):
        """Checks if MutableSeq join correctly concatenates sequence from a file with the spacer.

        The sequences parsed from the FASTA file are joined with a non-empty
        and an empty MutableSeq spacer and compared with str.join on their
        string forms.  Joining the parsed records directly must raise a
        TypeError.
        """
        filename = "Fasta/f003"
        seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
        seqlist_as_strings = [str(_) for _ in seqlist]

        spacer = MutableSeq("NNNNN")
        spacer1 = MutableSeq("")
        # seq objects with spacer
        seq_concatenated = spacer.join(seqlist)
        # seq objects with empty spacer
        seq_concatenated1 = spacer1.join(seqlist)

        # Reference results built with plain string joins.
        ref_data = str(spacer).join(seqlist_as_strings)
        ref_data1 = str(spacer1).join(seqlist_as_strings)

        self.assertEqual(seq_concatenated, ref_data)
        self.assertEqual(seq_concatenated1, ref_data1)
        # The parser yields records rather than their sequences.
        with self.assertRaises(TypeError):
            spacer.join(SeqIO.parse(filename, "fasta"))

    def test_equality(self):
        """Test equality when mixing types."""
        # Seq and MutableSeq get the same checks: equal to a matching
        # string, never equal to an int or to None.
        for seq_class in (Seq, MutableSeq):
            self.assertEqual(seq_class("6"), "6")
            self.assertNotEqual(seq_class("6"), 6)
            self.assertEqual(seq_class(""), "")
            self.assertNotEqual(seq_class(""), None)
            self.assertEqual(seq_class("None"), "None")
            self.assertNotEqual(seq_class("None"), None)

        # UnknownSeq is built from a length and a character instead.
        self.assertEqual(UnknownSeq(1, character="6"), "6")
        self.assertNotEqual(UnknownSeq(1, character="6"), 6)
        self.assertEqual(UnknownSeq(0), "")
        self.assertNotEqual(UnknownSeq(0), None)
Esempio n. 28
0
 def test_MutableSeq_init_typeerror(self):
     """Check MutableSeq __init__ gives TypeError for Seq and UnknownSeq input."""
     for rejected in (Seq("A"), UnknownSeq(1)):
         self.assertRaises(TypeError, MutableSeq, rejected)
Esempio n. 29
0
 def test_join_UnknownSeq_TypeError(self):
     """Checks that a TypeError is thrown for incompatible alphabets."""
     spacer = UnknownSeq(5, character="-", alphabet=generic_dna)

     # DNA spacer joining two RNA sequences.
     rna_parts = [
         UnknownSeq(5, character="-", alphabet=generic_rna),
         UnknownSeq(5, character="-", alphabet=generic_rna),
     ]
     with self.assertRaises(TypeError):
         spacer.join(rna_parts)

     # DNA spacer joining two protein sequences.
     protein_parts = [
         Seq("NNNNN", generic_protein),
         UnknownSeq(5, character="-", alphabet=generic_protein),
     ]
     with self.assertRaises(TypeError):
         spacer.join(protein_parts)
Esempio n. 30
0
    def test_count_overlap_start_end_NN(self):
        """Check our count_overlap method using NN with variable ends and starts."""
        # Seq() and MutableSeq() with variable start and end arguments,
        # given as (start, end, expected count).
        window_cases = [
            (1, 7, 0),
            (3, None, 0),
            (3, 6, 0),
            (4, 6, 0),
            (4, -1, 0),
            (-5, None, 0),
            (-5, 7, 0),
            (7, -5, 0),
            (-100, None, 0),
            (None, 100, 0),
            (-100, 1000, 0),
        ]
        letters = "GTAGGGGAG"
        for start, end, expected in window_cases:
            for seq_class in (Seq, MutableSeq):
                self.assertEqual(
                    seq_class(letters).count_overlap("NN", start, end), expected
                )

        # Seq() and MutableSeq() with a more heterogeneous sequence
        hetero = "GGGTGGTAGGG"
        for window in [(), (2, 8), (-11, 6), (7, 2)]:
            for seq_class in (Seq, MutableSeq):
                self.assertEqual(seq_class(hetero).count_overlap("NN", *window), 0)
        self.assertEqual(Seq(hetero).count_overlap("NN", -10, -2), 0)

        # UnknownSeq() with variable start and end arguments, given as
        # (alphabet, character, start, end, expected count).
        unknown_cases = [
            (generic_rna, "N", 1, 7, 5),
            (generic_dna, "N", 1, 7, 5),
            (generic_rna, "N", -4, None, 3),
            (generic_dna, "N", -4, None, 3),
            (generic_protein, "X", 1, 7, 0),
        ]
        for alpha, char, start, end, expected in unknown_cases:
            unknown = UnknownSeq(12, alpha, char)
            self.assertEqual(unknown.count_overlap("NN", start, end), expected)
        self.assertEqual(UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

        # UnknownSeq() with some more cases including unusual edge cases,
        # given as (substring, start, end, expected count).
        edge_cases = [
            ("N", 100, 105, 0),
            ("N", -1, 4, 0),
            ("N", 4, -1, 2),
            ("N", -8, -2, 5),
            ("N", -2, -8, 0),
            ("N", 8, 2, 0),
            ("N", 2, 8, 5),
            ("NN", 8, 2, 0),
            ("NN", 2, 8, 4),
            ("NN", -5, -1, 3),
            ("NN", 1, 5, 3),
            ("NNN", None, None, 5),
            ("NNNNNNNNN", None, None, 0),
            ("NNN", 1, 2, 0),
        ]
        for substr, start, end, expected in edge_cases:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(substr, start, end), expected)
        self.assertEqual(UnknownSeq(7, character="N").count_overlap("NN", 1), 5)
Esempio n. 31
0
def record_end(self, content):
    """Clean up when we've finished the record.

    Completes the record id, warns when the parsed sequence length differs
    from the expected size, chooses an alphabet from the sequence type,
    stores the topology annotation and finally sets ``self.data.seq``.

    Arguments:
     - content - not used by this method.

    Raises ValueError if a sequence type is set but none of the known
    categories match it.
    """
    from Bio import Alphabet
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq, UnknownSeq

    # Try and append the version number to the accession for the full id
    if not self.data.id:
        assert "accessions" not in self.data.annotations, self.data.annotations[
            "accessions"]
        self.data.id = self.data.name  # Good fall back?
    elif self.data.id.count(".") == 0:
        try:
            self.data.id += ".%i" % self.data.annotations["sequence_version"]
        except KeyError:
            # No version recorded, leave the id as it is.
            pass

    # add the sequence information
    # first, determine the alphabet
    # we default to an generic alphabet if we don't have a
    # seq type or have strange sequence information.
    seq_alphabet = Alphabet.generic_alphabet

    # now set the sequence
    sequence = "".join(self._seq_data)

    if (self._expected_size is not None and len(sequence) != 0
            and self._expected_size != len(sequence)):
        import warnings
        from Bio import BiopythonParserWarning

        warnings.warn(
            "Expected sequence length %i, found %i (%s)." %
            (self._expected_size, len(sequence), self.data.id),
            BiopythonParserWarning,
        )

    if self._seq_type:
        seq_type_upper = self._seq_type.upper()
        seq_type_lower = self._seq_type.lower()
        # mRNA is really also DNA, since it is actually cDNA
        if "DNA" in seq_type_upper or "MRNA" in seq_type_upper:
            seq_alphabet = IUPAC.ambiguous_dna
        # are there ever really RNA sequences in GenBank?
        elif "RNA" in seq_type_upper:
            # Even for data which was from RNA, the sequence string
            # is usually given as DNA (T not U).  Bug 2408
            if "T" in sequence and "U" not in sequence:
                seq_alphabet = IUPAC.ambiguous_dna
            else:
                seq_alphabet = IUPAC.ambiguous_rna
        elif ("PROTEIN" in seq_type_upper or self._seq_type
              == "PRT"):  # PRT is used in EMBL-bank for patents
            seq_alphabet = IUPAC.protein  # or extended protein?
        # work around ugly GenBank records which have circular or
        # linear but no indication of sequence type
        elif self._seq_type in ["circular", "linear", "unspecified"]:
            pass
        # we have a bug if we get here
        else:
            raise ValueError("Could not determine alphabet for seq_type %s" %
                             self._seq_type)

        # Also save the chromosome layout
        if "circular" in seq_type_lower:
            self.data.annotations["topology"] = "circular"
        elif "linear" in seq_type_lower:
            self.data.annotations["topology"] = "linear"

    # No sequence letters but a known size: record a placeholder of that
    # length.  This must read the same single-underscore attribute used
    # above; a double-underscore name is a different attribute (and is
    # name-mangled inside a class body).
    if not sequence and self._expected_size:
        self.data.seq = UnknownSeq(self._expected_size, seq_alphabet)
    else:
        self.data.seq = Seq(sequence, seq_alphabet)