def _retrieve_seq(adaptor, primary_id):
    """Build the sequence object for one bioentry stored in a BioSQL database.

    Arguments:
     - adaptor - database adaptor; only ``execute_and_fetchall(sql, args)``
       is called on it here.
     - primary_id - value matched against ``biosequence.bioentry_id``.

    Returns None if there is no biosequence row, a DBSeq if sequence data
    is stored, or an UnknownSeq if only the length was recorded.

    Raises AssertionError if the row is inconsistent: more than one row,
    a stored length that differs from the real sequence length, sequence
    data where none was expected, or an unrecognised alphabet name.
    """
    # The database schema ensures there will be only one matching
    # row in the table.
    # If an UnknownSeq was recorded, seq will be NULL, but length will be
    # populated. This means length(seq) will return None.
    seqs = adaptor.execute_and_fetchall(
        "SELECT alphabet, length, length(seq) FROM biosequence"
        " WHERE bioentry_id = %s",
        (primary_id,),
    )
    if not seqs:
        return None
    assert len(seqs) == 1
    moltype, given_length, length = seqs[0]

    if length is not None:
        length = int(length)
        # Cross-check the length column against the real sequence length.
        # Skipped when the column is NULL so such rows still load.
        if given_length is not None:
            assert length == int(given_length), (
                "Stored length %r does not match sequence length %i"
                % (given_length, length)
            )
        have_seq = True
    else:
        # No sequence data: fall back to the recorded length only.
        seqs = adaptor.execute_and_fetchall(
            "SELECT alphabet, length, seq FROM biosequence"
            " WHERE bioentry_id = %s",
            (primary_id,),
        )
        assert len(seqs) == 1
        moltype, given_length, seq = seqs[0]
        assert seq is None or seq == ""
        length = int(given_length)
        have_seq = False

    moltype = moltype.lower()  # might be upper case in database
    # We have no way of knowing if these sequences will use IUPAC
    # alphabets, and we certainly can't assume they are unambiguous!
    if moltype == "dna":
        alphabet = Alphabet.generic_dna
    elif moltype == "rna":
        alphabet = Alphabet.generic_rna
    elif moltype == "protein":
        alphabet = Alphabet.generic_protein
    elif moltype == "unknown":
        # This is used in BioSQL/Loader.py and would happen
        # for any generic or nucleotide alphabets.
        alphabet = Alphabet.single_letter_alphabet
    else:
        raise AssertionError("Unknown moltype: %s" % moltype)

    if have_seq:
        return DBSeq(primary_id, adaptor, alphabet, 0, length)
    return UnknownSeq(length, alphabet)
def setUp(self):
    """Create a 21-letter unknown record and CDS/gene features sharing one join location."""
    record = SeqRecord(UnknownSeq(21))
    self.seqrec = record

    # Parts are listed in join order, which is not sorted by coordinate.
    spans = [(12, 15), (18, 21), (0, 3), (6, 9)]
    parts = [FeatureLocation(begin, end, strand=1) for begin, end in spans]
    loc = CompoundLocation(parts, operator="join")

    # Both features deliberately share the same location object.
    self.seqcds = SeqFeature(loc, type="CDS")
    self.seqgene = SeqFeature(loc, type="gene")
    record.annotations["topology"] = "circular"
def __init__(self, biopython_object=None):
    """Wrap a Biopython object in a private SeqRecord and index its features.

    :param biopython_object: ``None`` (an empty nucleotide record is
        created), a ``Seq`` (deep-copied into a new SeqRecord with empty
        id, name and description) or a ``SeqRecord`` (deep-copied).
    :raises TypeError: if the argument is of any other type and no
        ``_record`` attribute is already present on the instance.
    """
    # Identity test: "== None" would call the argument's own __eq__.
    if biopython_object is None:
        self._record = SeqRecord(
            seq=UnknownSeq(0, alphabet=NucleotideAlphabet()),
            id="",
            name="",
            description="",
        )
    elif isinstance(biopython_object, Seq):
        self._record = SeqRecord(
            seq=copy.deepcopy(biopython_object), id="", name="", description=""
        )
    elif isinstance(biopython_object, SeqRecord):
        self._record = copy.deepcopy(biopython_object)
    elif not hasattr(self, "_record"):
        # Without this the loop below fails with an obscure AttributeError.
        raise TypeError(
            "biopython_object must be None, a Seq or a SeqRecord, not %s"
            % type(biopython_object).__name__
        )

    # Map each feature type to the positions of its features in
    # self._record.features, for faster lookup.
    self._features = {}
    for index, feature in enumerate(self._record.features):
        self._features.setdefault(feature.type, []).append(index)
def test_join_UnknownSeq(self):
    """Check that UnknownSeq.join places the spacer between the joined items."""
    gap5 = UnknownSeq(5, character="-")
    gap0 = UnknownSeq(0, character="-")

    # Spacer joined with UnknownSeq items using the same character
    dash_items = [UnknownSeq(5, character="-"), UnknownSeq(5, character="-")]
    self.assertEqual("-" * 15, gap5.join(dash_items))

    # Spacer joined with a mix of Seq and UnknownSeq items
    mixed_items = [Seq("NNNNN"), UnknownSeq(5, character="-")]
    self.assertEqual("N" * 5 + "-" * 10, gap5.join(mixed_items))

    plain = ["ATG", "ATG", "ATG", "ATG"]
    with_seq = ["ATG", "ATG", Seq("ATG"), "ATG"]

    # strings with empty spacer
    joined_plain = gap0.join(plain)
    self.assertEqual(str(joined_plain), "".join(plain))

    for gap in (gap5, gap0):
        joined = gap.join(with_seq)
        self.assertEqual(str(joined), str(gap).join(plain))
        # Now try single sequence arguments, should join the letters
        for target in plain + with_seq:
            expected = str(gap).join(str(target))
            self.assertEqual(expected, str(gap.join(target)))
def _retrieve_seq(adaptor, primary_id):
    """Build the sequence object for one bioentry stored in a BioSQL database.

    Arguments:
     - adaptor - database adaptor; only ``execute_and_fetchall(sql, args)``
       is called on it here.
     - primary_id - value matched against ``biosequence.bioentry_id``.

    Returns None if there is no biosequence row, a DBSeq if sequence data
    is stored, or an UnknownSeq if only the length was recorded. The
    UnknownSeq character is "N" for dna/rna, "X" for protein, else "?".

    Raises AssertionError if the row is inconsistent: more than one row,
    a stored length that differs from the real sequence length, or
    sequence data where none was expected.
    """
    # The database schema ensures there will be only one matching
    # row in the table.
    # If an UnknownSeq was recorded, seq will be NULL, but length will be
    # populated. This means length(seq) will return None.
    seqs = adaptor.execute_and_fetchall(
        "SELECT alphabet, length, length(seq) FROM biosequence WHERE bioentry_id = %s",
        (primary_id,),
    )
    if not seqs:
        return None
    assert len(seqs) == 1
    moltype, given_length, length = seqs[0]

    if length is not None:
        length = int(length)
        # Cross-check the length column against the real sequence length.
        # Skipped when the column is NULL so such rows still load.
        if given_length is not None:
            assert length == int(given_length), (
                "Stored length %r does not match sequence length %i"
                % (given_length, length)
            )
        return DBSeq(primary_id, adaptor, alphabet=None, start=0, length=length)

    # No sequence data: fall back to the recorded length only.
    seqs = adaptor.execute_and_fetchall(
        "SELECT alphabet, length, seq FROM biosequence WHERE bioentry_id = %s",
        (primary_id,),
    )
    assert len(seqs) == 1
    moltype, given_length, seq = seqs[0]
    assert seq is None or seq == ""
    length = int(given_length)

    # The alphabet name might be upper case in the database; a NULL value
    # is left alone and falls through to "?".
    if hasattr(moltype, "lower"):
        moltype = moltype.lower()
    if moltype in ("dna", "rna"):
        character = "N"
    elif moltype == "protein":
        character = "X"
    else:
        character = "?"
    return UnknownSeq(length, character=character)
def test_join_UnknownSeq(self):
    """Check UnknownSeq.join output text and alphabet for several spacers."""
    # Only expect it to take Seq objects and/or strings in an iterable!
    empty_gap = UnknownSeq(0, character="-", alphabet=generic_dna)
    all_gaps = [empty_gap]
    for gap_alphabet in (generic_dna, generic_nucleotide):
        all_gaps.append(UnknownSeq(5, character="-", alphabet=gap_alphabet))

    plain = ["ATG", "ATG", "ATG", "ATG"]
    with_seq = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"]

    # strings with empty spacer
    joined_plain = empty_gap.join(plain)
    self.assertEqual(str(joined_plain), "".join(plain))
    self.assertEqual(joined_plain.alphabet, empty_gap.alphabet)

    # The result takes its alphabet from the spacer.
    for gap in all_gaps:
        joined = gap.join(with_seq)
        self.assertEqual(str(joined), str(gap).join(plain))
        self.assertEqual(joined.alphabet, gap.alphabet)
def concatenate(infiles, outfile):
    """Concatenate FASTA alignments by sequence id and write the result as FASTA.

    Each input file is read as one alignment. Rows are matched on record
    id; an id missing from an alignment is padded with the string form of
    an UnknownSeq of that alignment's length.

    :param infiles: iterable of paths to FASTA alignment files
    :param outfile: destination handed to ``AlignIO.write``
    """
    alignments = []
    for path in infiles:
        # Close every input handle as soon as its alignment is parsed.
        with open(path, "r") as handle:
            alignments.append(AlignIO.read(handle, "fasta"))

    # Get the full set of labels (i.e. sequence ids) for all the alignments
    all_labels = {rec.id for aln in alignments for rec in aln}

    # label -> list of per-alignment substrings, in alignment order
    tmp = defaultdict(list)
    for aln in alignments:
        length = aln.get_alignment_length()
        # Labels absent from this alignment get unknown data of the right length.
        missing = all_labels - {rec.id for rec in aln}
        for label in missing:
            tmp[label].append(str(UnknownSeq(length)))
        for rec in aln:
            tmp[rec.id].append(str(rec.seq))

    # Stitch all the substrings together using join, then build the
    # Biopython Seq, SeqRecord and MultipleSeqAlignment objects.
    msa = MultipleSeqAlignment(
        SeqRecord(Seq("".join(parts)), id=label) for label, parts in tmp.items()
    )
    AlignIO.write(msa, outfile, "fasta")


# Benchmark note kept from the original source:
# python -m timeit -s 'import tempfile; tmpdir=tempfile.TemporaryDirectory(); from concatenate import simAlignments; infiles=simAlignments(10,10,tmpdir.name); outf=tempfile.NamedTemporaryFile().name' "from concatenate import concatenate; concatenate(infiles,outf)"
# 100 loops, best of 3: 2.94 msec per loop
def _get_rec(self, base, info_dict):
    """Return ``(record, base)`` for the record named by ``info_dict["rec_id"]``.

    An existing record backed by an UnknownSeq has its length grown to the
    end of ``info_dict["location"]`` if that is larger. A missing record is
    created and stored in ``base`` when ``self._create_missing`` is true;
    otherwise the KeyError propagates.
    """
    location = info_dict.get("location", (0, 1))
    max_loc = location[1]
    try:
        cur_rec = base[info_dict["rec_id"]]
        # update generated unknown sequences with the expected maximum length
        if isinstance(cur_rec.seq, UnknownSeq):
            cur_rec.seq._length = max(max_loc, cur_rec.seq._length)
        return cur_rec, base
    except KeyError:
        if not self._create_missing:
            raise
        new_rec = SeqRecord(UnknownSeq(max_loc), info_dict["rec_id"])
        base[info_dict["rec_id"]] = new_rec
        return new_rec, base
def test_join_UnknownSeq_mixed_alpha(self):
    """Check a DNA UnknownSeq spacer joins items that use other alphabets."""
    dna_gap = UnknownSeq(5, character="-", alphabet=generic_dna)

    # RNA items joined by a DNA spacer
    rna_items = [
        UnknownSeq(5, character="-", alphabet=generic_rna),
        UnknownSeq(5, character="-", alphabet=generic_rna),
    ]
    self.assertEqual("-" * 15, dna_gap.join(rna_items))

    # Protein items joined by a DNA spacer
    protein_items = [
        Seq("NNNNN", generic_protein),
        UnknownSeq(5, character="-", alphabet=generic_protein),
    ]
    self.assertEqual("N" * 5 + "-" * 10, dna_gap.join(protein_items))
def concatenate(alignments):
    """Join alignments side by side into one, matching rows by sequence id.

    An id absent from an alignment is padded with the string form of an
    UnknownSeq of that alignment's length. The alphabet of the first
    alignment is applied to every output row. Progress is reported through
    the module-level ``logger``.

    :param alignments: indexable collection of alignment objects
    :returns: a MultipleSeqAlignment with one row per distinct id
    """
    # Collect every sequence id seen in any alignment.
    all_labels = set()
    for aln in alignments:
        for seq in aln:
            all_labels.add(seq.id)
    logger.debug("extracted {} different labels in all alignments: {}".format(
        len(all_labels), all_labels))

    # Assume all alignments have same alphabet
    alphabet = alignments[0]._alphabet
    logger.debug('detected alphabet: {}'.format(alphabet))

    # label -> list of per-alignment substrings, in alignment order
    concat_buf = {}
    for aln in alignments:
        length = aln.get_alignment_length()
        these_labels = set()
        for rec in aln:
            these_labels.add(rec.id)
        missing = all_labels - these_labels
        logger.debug(
            "alignment of length {} with {} sequences, {} missing ({})".format(
                length, len(these_labels), len(missing), missing))

        # Pad the labels that this alignment lacks ...
        for label in missing:
            padding = UnknownSeq(length, alphabet=alphabet)
            concat_buf.setdefault(label, []).append(str(padding))
        # ... and copy the ones it has.
        for rec in aln:
            concat_buf.setdefault(rec.id, []).append(str(rec.seq))

    # Stitch the substrings together and build the Biopython objects.
    rows = (
        SeqRecord(Seq(''.join(seq_arr), alphabet=alphabet), id=label)
        for label, seq_arr in concat_buf.items()
    )
    msa = MultipleSeqAlignment(rows)
    logger.info(
        "concatenated MSA of {} taxa and total length {} created".format(
            len(msa), len(msa[0])))
    return msa
def test_join_UnknownSeq_with_file(self):
    """Check UnknownSeq.join against str.join for sequences parsed from a FASTA file."""
    filename = 'Fasta/f003'
    seqlist = [record.seq for record in SeqIO.parse(filename, 'fasta')]
    seqlist_as_strings = [str(_) for _ in seqlist]

    empty_gap = UnknownSeq(0, character="-", alphabet=generic_dna)
    dash_gap = UnknownSeq(5, character="-", alphabet=generic_dna)

    # seq objects with empty spacer
    joined_empty = empty_gap.join(seqlist)
    # seq objects with five-dash spacer
    joined_dash = dash_gap.join(seqlist)

    expected_empty = str(empty_gap).join(seqlist_as_strings)
    expected_dash = str(dash_gap).join(seqlist_as_strings)

    self.assertEqual(str(joined_empty), expected_empty)
    self.assertEqual(str(joined_dash), expected_dash)

    # An iterator of SeqRecord objects (rather than sequences) is rejected.
    with self.assertRaises(TypeError):
        empty_gap.join(SeqIO.parse(filename, 'fasta'))
def concatenate(alignments):
    """Concatenate a list of Bio.Align.MultipleSeqAlignment objects.

    Rows are matched by sequence id. Where an id is missing from an
    alignment, that stretch is padded with unknown data
    (Bio.Seq.UnknownSeq) of the alignment's length.

    Returns a single Bio.Align.MultipleSeqAlignment whose records use the
    label as id, name and description.

    Limitations: any annotations in the sub-alignments are lost in the
    concatenated alignment.
    """
    # Full set of labels (i.e. sequence ids) across all alignments
    all_labels = {record.id for aln in alignments for record in aln}

    # Assume all alignments have same alphabet
    alphabet = alignments[0]._alphabet

    # label -> list of per-alignment substrings (missing keys start empty)
    pieces = defaultdict(list)
    for aln in alignments:
        width = aln.get_alignment_length()
        present = {record.id for record in aln}
        # Labels missing here get unknown data of the right length ...
        for label in all_labels - present:
            pieces[label].append(str(UnknownSeq(width, alphabet=alphabet)))
        # ... the others contribute their own sequence text.
        for record in aln:
            pieces[record.id].append(str(record.seq))

    # Stitch the substrings together with join and build the Biopython objects.
    return MultipleSeqAlignment(
        SeqRecord(
            Seq(''.join(chunks), alphabet=alphabet),
            id=label,
            name=label,
            description=label,
        )
        for label, chunks in pieces.items()
    )
def cfg_out_iterator(handle, alphabet=single_letter_alphabet):
    """Iterate over Centrifuge output, yielding one SeqRecord per parsed row.

    Arguments:
     - handle - input file
     - alphabet - optional alphabet

    Each record wraps a zero-length UnknownSeq; the parsed columns are
    stored in ``dbxrefs`` and ``annotations``. An unparsable score is
    reported with print() and the ValueError is re-raised.
    """

    def adapt(raw_score, tax_id):
        # From Centrifuge score get the "single hit equivalent length"
        try:
            return float(raw_score)**0.5 + 15
        except ValueError:
            print(f'Error parsing score ({raw_score}) for taxid {tax_id}'
                  f' in {handle}...')
            raise

    for row in simple_out_parser(handle):
        (read_id, seq_id, tax_id, score, second_score,
         hit_length, query_length, num_matches) = row

        words = read_id.split(None, 1)
        if words:
            first_word = words[0]
        else:
            assert not read_id, repr(read_id)
            # Should we use SeqRecord default for no ID?
            first_word = ""

        adapted_score = adapt(score, tax_id)
        adapted_2nd_score = adapt(second_score, tax_id)

        annotations = {
            'taxID': tax_id,
            'score': adapted_score,
            '2ndBestScore': adapted_2nd_score,
            'hitLength': hit_length,
            'queryLength': query_length,
            'numMatches': int(num_matches),
        }
        yield SeqRecord(UnknownSeq(0, alphabet),
                        id=first_word,
                        name=first_word,
                        description=read_id,
                        dbxrefs=[seq_id],
                        annotations=annotations)
def join_seqs(s1, s2, length=None):
    """Join read ``s1`` with the reverse complement of ``s2``.

    With a truthy ``length``, a '-' padding record with zero phred quality
    is inserted so the result has that total length. If building the
    padding raises ValueError, the process exits with an error message.

    The joined record gets ``s1.id`` minus its last two characters as id
    and an empty description.
    """
    if not length:
        merged = s1 + s2.reverse_complement()
    else:
        gap = length - len(s1) - len(s2)
        try:
            filler = SeqRecord(
                UnknownSeq(gap, character='-'),
                letter_annotations={'phred_quality': [0] * gap},
            )
        except ValueError:
            sys.exit(
                'Total length of the two reads exceeds given length (%s)' %
                (length))
        merged = s1 + filler + s2.reverse_complement()

    # assumes the read ID ends in a 2-char suffix for direction (e.g. _1)
    merged.id = s1.id[:-2]
    merged.description = ''  # not required for fastq
    return merged
def prepare_cluster_qual_files(work_dir, qual_file, cluster_seq_dir):
    """Write one .qual file per cluster FASTA file and return the output directory.

    :param work_dir: directory in which ``cluster_qual`` is created
        (``os.mkdir`` raises if it already exists)
    :param qual_file: path of the QUAL file holding scores for all reads
    :param cluster_seq_dir: directory of per-cluster FASTA files; entries
        that are not regular files are skipped
    :returns: path of the created ``cluster_qual`` directory
    :raises KeyError: if a cluster read name is absent from ``qual_file``
    """
    cluster_qual_dir = os.path.join(work_dir, "cluster_qual")
    os.mkdir(cluster_qual_dir)

    # Index all quality records by id. Plain "r" is used because the "U"
    # mode flag is rejected by Python 3.11+.
    with open(qual_file, "r") as fd_qual:
        quals = SeqIO.to_dict(SeqIO.parse(fd_qual, "qual"))

    for cluster_seq_file in os.listdir(cluster_seq_dir):
        cluster_seq_path = os.path.join(cluster_seq_dir, cluster_seq_file)
        # check if file, can do some more checking here e.g. is fasta file
        if not os.path.isfile(cluster_seq_path):
            continue

        cluster_quals = []
        with open(cluster_seq_path, "r") as fd_cluster_seq:
            for seq in SeqIO.parse(fd_cluster_seq, "fasta"):
                qual = quals[seq.name]
                scores = qual.letter_annotations["phred_quality"]
                # Only the scores matter; the sequence is a same-length placeholder.
                cluster_qual = SeqRecord(seq=UnknownSeq(len(scores)),
                                         id="",
                                         description=qual.description)
                cluster_qual.letter_annotations["phred_quality"] = scores
                cluster_quals.append(cluster_qual)

        cluster_qual_file = os.path.join(
            cluster_qual_dir, cluster_seq_file.split(".")[0] + ".qual")
        with open(cluster_qual_file, "w") as fd_cluster_qual:
            SeqIO.write(cluster_quals, fd_cluster_qual, "qual")

        # The empty id leaves a space after ">" in each header; strip it.
        # Same substitution as sed 's/> />/g', done without a shell so odd
        # file names cannot be misinterpreted.
        with open(cluster_qual_file, "r") as fd_cluster_qual:
            text = fd_cluster_qual.read()
        with open(cluster_qual_file, "w") as fd_cluster_qual:
            fd_cluster_qual.write(text.replace("> ", ">"))

    return cluster_qual_dir
# Python 2 demo: operations on a mutable sequence, then UnknownSeq, then
# the string-level helper functions from Bio.Seq.
# NOTE(review): both "mutSeq" and "mutseq" are used below and neither is
# defined in this part of the script - confirm whether they are meant to be
# the same object.
print mutSeq, type(mutSeq)
mutseq[1]='T' # impossible on a simple Seq
print mutseq
seq1 = mutseq.toseq() # convert to Seq
mutSeq.remove('A') # remove first A
mutSeq[2:-5]='TTTT' # slice assignment
mutSeq.reverse() # reverse() and reverse_complement() change the object itself
print mutSeq
# MutableSeq can't be a dictionary key, Seq and string can

# UnknownSeq
# Subclass of Seq for when you know the length but not the characters, to save memory
from Bio.Seq import UnknownSeq
unk = UnknownSeq(25)
print unk, len(unk), type(unk)
# NOTE(review): IUPAC is not imported in this part of the script - presumably
# "from Bio.Alphabet import IUPAC" appears earlier; confirm.
unkDNA = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna)
print unkDNA # N = any base
unkProt = UnknownSeq(10, alphabet=IUPAC.protein)
print unkProt # X = any amino acid
print unkDNA.complement(), unkDNA.reverse_complement()
print unkDNA.transcribe(), unkDNA.translate()
unkProt = unkDNA.translate() # rebinds unkProt, replacing the 10-letter protein above
print unkProt, len(unkProt)

# Directly on strings
# (translate is imported but not called in this part of the script)
from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate
noseq = 'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG'
print reverse_complement(noseq) # these functions
print transcribe(noseq) # receive either strings
print back_transcribe(noseq) # Seq, MutableSeq, UnknownSeq
# Demo: operations available on an UnknownSeq with an ambiguous DNA alphabet.
from Bio.Seq import UnknownSeq
from Bio.Alphabet import IUPAC

unk = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna)
# The next two results are not stored or printed, so they are only visible
# when the lines are run in an interactive session.
unk.complement()
unk.reverse_complement()
unk_rna = unk.transcribe()
print(unk_rna)
unk_protein = unk.translate()
print(unk_protein)
def merge(records, length=20, spacer="n"):
    """Merge multiple SeqRecords into one, using a defined spacer.

    :param records: Iterable containing SeqRecords to be merged
    :param length: Length of the spacer in kbp (default 20)
    :param spacer: Kind of spacer to use ('n' for UnknownSeq spacer, 'stop'
                   for all-frame stop codon spacer); default 'n'
    :return: A single SeqRecord that is the product of the merge. If only
             one record is given, that record itself is returned.
    :raises ValueError: if ``spacer`` is invalid or no records are given
    """
    if spacer not in ("n", "stop"):
        raise ValueError("Invalid spacer: %r, use either 'n' or 'stop'" % spacer)

    # Materialise once so generators and other one-shot iterables work too.
    records = list(records)
    if not records:
        raise ValueError("No records given")

    first = records[0]
    if len(records) == 1:
        # Nothing to merge, so no spacer needs to be built.
        return first

    spacer_length = length * 1000
    if spacer == "stop":
        spacer_seq = Seq(ALL_FRAME_STOP_MOTIF * 40 * length, Alphabet.generic_dna)
    else:
        spacer_seq = UnknownSeq(spacer_length,
                                alphabet=Alphabet.generic_dna,
                                character="N")

    # Identity and annotations of the first record are re-applied to the
    # merged record after the loop.
    kept_annotations = {
        "date": first.annotations.get("date", ""),
        "source": first.annotations.get("source", ""),
        "organism": first.annotations.get("organism", ""),
        "taxonomy": first.annotations.get("taxonomy", []),
        "data_file_division": first.annotations.get("data_file_division", "UNK"),
        "topology": first.annotations.get("topology", "linear"),
    }

    new_rec = first
    for number, rec in enumerate(records[1:], 1):
        spacer_id = "spacer_{}".format(number)
        # Each spacer carries its own misc_feature so it can be found later.
        spacer_feature = SeqFeature(FeatureLocation(0, spacer_length, 0),
                                    type="misc_feature",
                                    id=spacer_id,
                                    qualifiers={"note": [spacer_id]})
        spacer_rec = SeqRecord(spacer_seq,
                               id=spacer_id,
                               name=spacer_id,
                               description=spacer_id,
                               features=[spacer_feature])
        new_rec = new_rec + spacer_rec + rec

    new_rec.id = first.id
    new_rec.name = first.name
    new_rec.description = first.description
    for key, value in kept_annotations.items():
        new_rec.annotations[key] = value
    return new_rec
def concatenate(alignments):
    """
    Concatenate a list of multiple sequence alignments into one alignment.

    Sequences are matched on their label: rows from different alignments
    that share an id are joined into a single sequence. The order is
    preserved. Where a label is missing from one or several alignments,
    that part is padded with unknown data (:py:class:`Bio.Seq.UnknownSeq`).

    Items identified as file names by ``identify_input`` are first read as
    FASTA alignments; all other items are used as they are.

    :param alignments: the list of alignments objects, i.e.
        list(:py:class:`Bio.Align.MultipleSeqAlignment`)
    :returns: a single :py:class:`Bio.Align.MultipleSeqAlignment`

    Example::

        >>> sequences = {'aln1': {'seq1': 'acgtca',
        ...                       'seq2': 'acgtt-',
        ...                       'seq3': 'ac-ta-'},
        ...              'aln2': {'seq2': 'ttg-cta',
        ...                       'seq3': 'tcgacta',
        ...                       'seq4': 'ttgacta'}}
        >>> alignments = [MultipleSeqAlignment([SeqRecord(Seq(sequence,
        ...                    alphabet=IUPAC.extended_dna), id=key)
        ...      for (key, sequence) in sequences[aln].items()])
        ...      for aln in ('aln1', 'aln2')]
        >>> con_alignment = concatenate(alignments)
        >>> con_alignment.sort()
        >>> print(con_alignment)
        ExtendedIUPACDNA() alignment with 4 rows and 13 columns
        acgtcaNNNNNNN seq1
        acgtt-ttg-cta seq2
        ac-ta-tcgacta seq3
        NNNNNNttgacta seq4

    :note:

         Limitations: any annotations in the sub-alignments are lost in
         the concatenated alignment.

    """
    # Inputs may be file names or Biopython alignments: anything that is
    # not identified as a file name is assumed to be an alignment already.
    alignments = [
        AlignIO.read(item, "fasta")
        if identify_input(item).name == 'FILENAME' else item
        for item in alignments
    ]

    # Full set of labels (i.e. sequence ids) across all alignments
    all_labels = {record.id for aln in alignments for record in aln}

    # Assume all alignments have same alphabet
    alphabet = alignments[0]._alphabet

    # label -> list of per-alignment substrings (missing keys start empty)
    pieces = defaultdict(list)
    for aln in alignments:
        width = aln.get_alignment_length()
        present = {record.id for record in aln}
        # Labels missing here get unknown data of the right length ...
        for label in all_labels - present:
            filler = UnknownSeq(width, alphabet=alphabet)
            pieces[label].append(str(filler))
        # ... the others contribute their own sequence text.
        for record in aln:
            pieces[record.id].append(str(record.seq))

    # Stitch the substrings together with join and build the Biopython objects.
    merged_rows = (
        SeqRecord(Seq(''.join(chunks), alphabet=alphabet), id=label)
        for label, chunks in pieces.items()
    )
    return MultipleSeqAlignment(merged_rows)
#manipulation des tables pour la traduction from Bio.Data import CodonTable std_table = CodonTable.unambiguous_dna_by_name["Standard"] bact_table = CodonTable.unambiguous_dna_by_name["Bacterial"] bact_table.start_codons bact_table.stop_codons #pour comparer séquences (attention à l'alphabet) str(bli) == str(blu) #on peut faire des séquences mutables, cf tuto #pour faire des séquences inconnues, avec des N pour nucléotides et X pour les protéines from Bio.Seq import UnknownSeq unk_dna = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna) #SeqRecord from Bio.SeqRecord import SeqRecord help(SeqRecord) #pour voir les différents champs SeqRecord(bli) from Bio import SeqIO machin = SeqIO.read("hao.fasta", "fasta") #pour fichier avec une seule séquence print machin print machin.format("fasta") #mêmes types de choses existent pour les .gnk (format GeneBank) for seq_record in SeqIO.parse("nosZ.fasta", "fasta"): print seq_record.id print seq_record.seq
# Tutorial-style walkthrough. The bare expressions (e.g. "mutable_seq" on a
# line of its own) only show a value in an interactive session; run as a
# script they are evaluated and discarded.

# MutableSeq objects: edited in place
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
mutable_seq
mutable_seq[5] = "C"
mutable_seq
mutable_seq.remove("T")
mutable_seq
mutable_seq.reverse()
mutable_seq

# UnknownSeq objects
from Bio.Seq import UnknownSeq
unk = UnknownSeq(20)
unk
print(unk)
len(unk)

# Same again with an explicit alphabet (the two imports repeat the ones above)
from Bio.Seq import UnknownSeq
from Bio.Alphabet import IUPAC
unk_dna = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna)
unk_dna
print(unk_dna)
unk_protein = unk_dna.translate()
unk_protein

# Connecting with biological databases
def parse_gff(handle):
    """Quick hack to parse Bacterial GFF files from Prokka etc.

    Does NOT support multi-line features (i.e. splicing and multiple exons).
    Will load EVERYTHING into memory!

    Iterator yielding SeqRecord objects, intended to fit into the Biopython
    SeqIO structure. One record is yielded per ``##sequence-region`` line,
    in file order. Each starts with an UnknownSeq of the declared length,
    which is replaced by real sequence if a ``##FASTA`` block provides it.

    Raises AssertionError for a bad header, an undeclared reference, out of
    range coordinates or a FASTA length mismatch; NotImplementedError for
    unsupported lines; ValueError for a bad strand. A malformed attribute
    entry terminates the process via sys.exit.
    """
    line = handle.readline()
    assert line.startswith("##gff-version 3"), line
    references = OrderedDict()

    def attach_sequence(name, chunks):
        # Replace the placeholder UnknownSeq once a FASTA entry is complete.
        seq = "".join(chunks)
        assert len(seq) == len(references[name]), \
            "FASTA entry for %s was %i long, expected %i" % (
                name, len(seq), len(references[name]))
        references[name].seq = Seq(seq)

    for line in handle:
        if line.startswith("##sequence-region "):
            _, name, start, end = line.split()
            assert start == "1"
            references[name] = SeqRecord(
                UnknownSeq(int(end)), id=name, name=name)
        elif line.strip() == "##FASTA":
            break
        elif line.startswith("#"):
            raise NotImplementedError(line)
        elif line.count("\t") == 8:
            (seqid, source, ftype, start, end,
             score, strand, phase, attributes) = line.split("\t")
            assert seqid in references, \
                "Reference %r not declared with ##sequence-region line:\n%r" % (
                    seqid, line)
            # GFF is one-based inclusive; convert to Python style
            # zero-based start with exclusive end.
            start = int(start) - 1
            end = int(end)
            # end may equal the reference length: a feature is allowed to
            # finish on the last base.
            assert 0 <= start < end <= len(references[seqid])
            if ftype in FEATURE_TYPE_TO_IGNORE:
                continue
            if FEATURE_TYPE_WANTED and ftype not in FEATURE_TYPE_WANTED:
                continue
            if strand == "-":
                loc = FeatureLocation(start, end, -1)
            elif strand in ("+", ".", "?"):
                # "." is unstranded (should use zero) and "?" is stranded
                # but missing (should use None); both use +1 to match
                # EMBL/GB.
                loc = FeatureLocation(start, end, +1)
            else:
                raise ValueError("Bad strand %r in line:\n%r" % (strand, line))
            f = SeqFeature(loc, type=ftype)
            for part in attributes.strip().split(";"):
                if not part:
                    assert ";;" in line, line
                    sys.stderr.write(
                        "Warning - missing key=value or double semi-colon in line:\n%r\n"
                        % line)
                    continue
                if "=" not in part:
                    sys.exit("Bad key=value entry %r in line:\n%r" %
                             (part, line))
                key, value = part.split("=", 1)
                if key in MISSING_QUALIFIERS_TO_IGNORE:
                    continue
                if key == "eC_number":
                    key = "EC_number"
                value = value.replace("%2C", ",")
                try:
                    f.qualifiers[key].append(value)
                except KeyError:
                    f.qualifiers[key] = [value]
            references[seqid].features.append(f)
        else:
            raise NotImplementedError(line)

    # Deal with any FASTA block
    name = None
    seqs = []
    for line in handle:
        if line.startswith(">"):
            if name and seqs:
                attach_sequence(name, seqs)
            name = line[1:].split(None, 1)[0]
            seqs = []
        elif name:
            seqs.append(line.strip())
        elif line.strip():
            raise NotImplementedError(line)
    if name and seqs:
        attach_sequence(name, seqs)

    # Return results
    for record in references.values():
        yield record
class StringMethodTests(unittest.TestCase):
    """Check Seq-like objects behave like plain strings where they should.

    Every test walks the shared ``_examples`` list (Seq and UnknownSeq
    objects plus a mutable copy of each) and compares the result of a
    method on the object with the same method applied to ``str(obj)``.
    """

    _examples = [
        # These are length 9, a multiple of 3 for translation tests:
        Seq("ACGTGGGGT", generic_protein),
        Seq("ACGTGGGGT", generic_nucleotide),
        Seq("ACGTGGGGT", generic_dna),
        Seq("ACGUGGGGU", generic_rna),
        Seq("GG", generic_protein),
        Seq("GG", generic_nucleotide),
        Seq("GG", generic_dna),
        Seq("GG", generic_rna),
        Seq("A", generic_protein),
        Seq("A", generic_nucleotide),
        Seq("A", generic_dna),
        Seq("A", generic_rna),
        UnknownSeq(1),
        UnknownSeq(1, character="n"),
        UnknownSeq(1, generic_rna),
        UnknownSeq(1, generic_rna, "n"),
        UnknownSeq(1, generic_rna, "N"),
        UnknownSeq(12, generic_rna, "N"),
        UnknownSeq(12, generic_dna, "N"),
        UnknownSeq(12, generic_nucleotide, "N"),
        UnknownSeq(12, generic_protein, "X"),
        UnknownSeq(12, character="X"),
        UnknownSeq(12),
    ]
    # Add a mutable version of every example (iterating over a copy, as
    # the list is extended inside the loop).
    for seq in _examples[:]:
        if isinstance(seq, Seq):
            _examples.append(seq.tomutable())
    _start_end_values = [0, 1, 2, 1000, -1, -2, -999]

    def _assert_same_result(self, example, method_name, args, seq_result,
                            str_result, pre_comp_function=None):
        """Raise ValueError unless the Seq and str results agree.

        Both results are passed through ``pre_comp_function`` first (if
        given). The message uses %r so that non-integer results (strings,
        lists) can be reported too.
        """
        if pre_comp_function:
            seq_result = pre_comp_function(seq_result)
            str_result = pre_comp_function(str_result)
        if seq_result != str_result:
            raise ValueError("%r.%s(%s) = %r, not %r" % (
                example, method_name,
                ", ".join(repr(arg) for arg in args),
                seq_result, str_result))

    def _test_method(self, method_name, pre_comp_function=None,
                     start_end=False):
        """Check this method matches the plain string's method.

        Each example is called with every other example as the argument,
        first as a string and then as the object itself. With
        ``start_end`` set, the (arg, start) and (arg, start, end) forms
        are checked too. Examples lacking the method are skipped.
        """
        self.assertTrue(isinstance(method_name, str))
        for example1 in self._examples:
            if not hasattr(example1, method_name):
                # e.g. MutableSeq does not support find
                continue
            str1 = str(example1)
            seq_method = getattr(example1, method_name)
            str_method = getattr(str1, method_name)

            for example2 in self._examples:
                if not hasattr(example2, method_name):
                    # e.g. MutableSeq does not support find
                    continue
                str2 = str(example2)

                self._assert_same_result(
                    example1, method_name, (str2,),
                    seq_method(str2), str_method(str2), pre_comp_function)

                # Only the call itself may legitimately raise TypeError;
                # a mismatch afterwards must not be swallowed.
                try:
                    seq_result = seq_method(example2)
                except TypeError:
                    # TODO - Check the alphabets do clash!
                    pass
                else:
                    self._assert_same_result(
                        example1, method_name, (example2,),
                        seq_result, str_method(str2), pre_comp_function)

                if start_end:
                    for start in self._start_end_values:
                        self._assert_same_result(
                            example1, method_name, (str2, start),
                            seq_method(str2, start),
                            str_method(str2, start),
                            pre_comp_function)
                        for end in self._start_end_values:
                            self._assert_same_result(
                                example1, method_name, (str2, start, end),
                                seq_method(str2, start, end),
                                str_method(str2, start, end),
                                pre_comp_function)

    def test_str_count(self):
        """Check matches the python string count method."""
        self._test_method("count", start_end=True)

    def test_str_find(self):
        """Check matches the python string find method."""
        self._test_method("find", start_end=True)

    def test_str_rfind(self):
        """Check matches the python string rfind method."""
        self._test_method("rfind", start_end=True)

    def test_str_startswith(self):
        """Check matches the python string startswith method."""
        self._test_method("startswith", start_end=True)

        try:
            self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC")))
        except TypeError:
            # Base string only supports this on Python 2.5+, skip this
            return

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            if not hasattr(example1, "startswith"):
                # e.g. MutableSeq does not support this
                continue
            subs = tuple([example1[start:start + 2]
                          for start in range(0, len(example1) - 2, 3)])
            subs_str = tuple([str(s) for s in subs])

            self.assertEqual(str(example1).startswith(subs_str),
                             example1.startswith(subs))
            self.assertEqual(str(example1).startswith(subs_str),
                             example1.startswith(subs_str))  # strings!
            self.assertEqual(str(example1).startswith(subs_str, 3),
                             example1.startswith(subs, 3))
            self.assertEqual(str(example1).startswith(subs_str, 2, 6),
                             example1.startswith(subs, 2, 6))

    def test_str_endswith(self):
        """Check matches the python string endswith method."""
        self._test_method("endswith", start_end=True)

        try:
            self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE")))
        except TypeError:
            # Base string only supports this on Python 2.5+, skip this
            return

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            if not hasattr(example1, "endswith"):
                # e.g. MutableSeq does not support this
                continue
            subs = tuple([example1[start:start + 2]
                          for start in range(0, len(example1) - 2, 3)])
            subs_str = tuple([str(s) for s in subs])

            self.assertEqual(str(example1).endswith(subs_str),
                             example1.endswith(subs))
            self.assertEqual(str(example1).endswith(subs_str),
                             example1.endswith(subs_str))  # strings!
            self.assertEqual(str(example1).endswith(subs_str, 3),
                             example1.endswith(subs, 3))
            self.assertEqual(str(example1).endswith(subs_str, 2, 6),
                             example1.endswith(subs, 2, 6))

    def test_str_strip(self):
        """Check matches the python string strip method."""
        self._test_method("strip", pre_comp_function=str)

    def test_str_rstrip(self):
        """Check matches the python string rstrip method."""
        self._test_method("rstrip", pre_comp_function=str)

    def test_str_split(self):
        """Check matches the python string split method."""
        # Calling (r)split should return a list of Seq-like objects, we'll
        # just apply str() to each of them so it matches the string method.
        # A real list is built (not map) so the comparison works on Python 3.
        self._test_method("split",
                          pre_comp_function=lambda x: [str(y) for y in x])

    def test_str_rsplit(self):
        """Check matches the python string rsplit method."""
        # Calling (r)split should return a list of Seq-like objects, we'll
        # just apply str() to each of them so it matches the string method.
        self._test_method("rsplit",
                          pre_comp_function=lambda x: [str(y) for y in x])

    def test_str_lsplit(self):
        """Check matches the python string lstrip method.

        NOTE(review): strings have no ``lsplit`` method; this is presumed
        to be the missing counterpart of the strip and rstrip tests. The
        test name is kept so existing test selections still work.
        """
        self._test_method("lstrip", pre_comp_function=str)

    def test_str_length(self):
        """Check matches the python string __len__ method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(len(example1), len(str1))

    def test_str_upper(self):
        """Check matches the python string upper method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            str1 = str(example1)
            self.assertEqual(str(example1.upper()), str1.upper())

    def test_str_lower(self):
        """Check matches the python string lower method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            str1 = str(example1)
            self.assertEqual(str(example1.lower()), str1.lower())

    def test_str_getitem(self):
        """Check slicing and indexing works like a string."""
        for example1 in self._examples:
            str1 = str(example1)
            for i in self._start_end_values:
                if abs(i) < len(example1):
                    self.assertEqual(str(example1[i]), str1[i])
                self.assertEqual(str(example1[:i]), str1[:i])
                self.assertEqual(str(example1[i:]), str1[i:])
                for j in self._start_end_values:
                    self.assertEqual(str(example1[i:j]), str1[i:j])
                    for step in range(-3, 4):
                        if step == 0:
                            # A zero step must be rejected, as for a string.
                            self.assertRaises(ValueError,
                                              example1.__getitem__,
                                              slice(i, j, step))
                        else:
                            self.assertEqual(str(example1[i:j:step]),
                                             str1[i:j:step])

    def test_tostring(self):
        """Check str(obj) and obj.tostring() match."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(example1.tostring(), str1)

    def test_tomutable(self):
        """Check obj.tomutable() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            mut = example1.tomutable()
            self.assertTrue(isinstance(mut, MutableSeq))
            self.assertEqual(str(mut), str(example1))
            self.assertEqual(mut.alphabet, example1.alphabet)

    def test_toseq(self):
        """Check obj.toseq() method."""
        for example1 in self._examples:
            try:
                seq = example1.toseq()
            except AttributeError:
                # Only the objects which are already Seq lack toseq()
                self.assertTrue(isinstance(example1, Seq))
                continue
            self.assertTrue(isinstance(seq, Seq))
            self.assertEqual(str(seq), str(example1))
            self.assertEqual(seq.alphabet, example1.alphabet)

    def test_the_complement(self):
        """Check obj.complement() method."""
        mapping = ""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.complement()
            except ValueError as e:
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            str1 = str(example1)
            # This only does the unambiguous cases
            if "U" in str1 or "u" in str1 \
                    or example1.alphabet == generic_rna:
                mapping = maketrans("ACGUacgu", "UGCAugca")
            elif "T" in str1 or "t" in str1 \
                    or example1.alphabet == generic_dna \
                    or example1.alphabet == generic_nucleotide:
                mapping = maketrans("ACGTacgt", "TGCAtgca")
            elif "A" not in str1 and "a" not in str1:
                mapping = maketrans("CGcg", "GCgc")
            else:
                # TODO - look at alphabet?
                raise ValueError(example1)
            self.assertEqual(str1.translate(mapping), str(comp))
            self.assertEqual(comp.alphabet, example1.alphabet)

    def test_the_reverse_complement(self):
        """Check obj.reverse_complement() method."""
        mapping = ""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.reverse_complement()
            except ValueError as e:
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            str1 = str(example1)
            # This only does the unambiguous cases
            if "U" in str1 or "u" in str1 \
                    or example1.alphabet == generic_rna:
                mapping = maketrans("ACGUacgu", "UGCAugca")
            elif "T" in str1 or "t" in str1 \
                    or example1.alphabet == generic_dna \
                    or example1.alphabet == generic_nucleotide:
                mapping = maketrans("ACGTacgt", "TGCAtgca")
            elif "A" not in str1 and "a" not in str1:
                mapping = maketrans("CGcg", "GCgc")
            else:
                # TODO - look at alphabet?
                continue
            self.assertEqual(str1.translate(mapping)[::-1], str(comp))
            self.assertEqual(comp.alphabet, example1.alphabet)

    def test_the_transcription(self):
        """Check obj.transcribe() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.transcribe()
            except ValueError as e:
                if str(e) == "Proteins cannot be transcribed!":
                    continue
                if str(e) == "RNA cannot be transcribed!":
                    continue
                raise
            str1 = str(example1)
            if len(str1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            self.assertEqual(str1.replace("T", "U").replace("t", "u"),
                             str(tran))
            # based on limited examples
            self.assertEqual(tran.alphabet, generic_rna)

    def test_the_back_transcription(self):
        """Check obj.back_transcribe() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.back_transcribe()
            except ValueError as e:
                if str(e) == "Proteins cannot be back transcribed!":
                    continue
                if str(e) == "DNA cannot be back transcribed!":
                    continue
                raise
            str1 = str(example1)
            self.assertEqual(str1.replace("U", "T").replace("u", "t"),
                             str(tran))
            # based on limited examples
            self.assertEqual(tran.alphabet, generic_dna)

    def test_the_translate(self):
        """Check obj.translate() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            if len(example1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            try:
                tran = example1.translate()
            except ValueError as e:
                if str(e) == "Proteins cannot be translated!":
                    continue
                raise
            # This is based on the limited example not having stop codons:
            self.assertTrue(
                tran.alphabet in [extended_protein, protein, generic_protein],
                "Unexpected alphabet %r translating %r"
                % (tran.alphabet, example1))
            # TODO - check the actual translation, and all the optional args

    def test_the_translation_of_stops(self):
        """Check obj.translate() method with stop codons."""
        misc_stops = "TAATAGTGAAGAAGG"
        for nuc in [Seq(misc_stops),
                    Seq(misc_stops, generic_nucleotide),
                    Seq(misc_stops, generic_dna),
                    Seq(misc_stops, unambiguous_dna)]:
            self.assertEqual("***RR", str(nuc.translate()))
            self.assertEqual("***RR", str(nuc.translate(1)))
            self.assertEqual("***RR", str(nuc.translate("SGC0")))
            self.assertEqual("**W**", str(nuc.translate(table=2)))
            self.assertEqual("**WRR",
                             str(nuc.translate(table='Yeast Mitochondrial')))
            self.assertEqual("**WSS", str(nuc.translate(table=5)))
            self.assertEqual("**WSS", str(nuc.translate(table=9)))
            self.assertEqual("**CRR",
                             str(nuc.translate(table='Euplotid Nuclear')))
            self.assertEqual("***RR", str(nuc.translate(table=11)))
            self.assertEqual("***RR", str(nuc.translate(table='11')))
            self.assertEqual("***RR", str(nuc.translate(table='Bacterial')))
            self.assertEqual("", str(nuc.translate(to_stop=True)))
            self.assertEqual("O*ORR", str(nuc.translate(table=special_table)))
            self.assertEqual(
                "*QWRR",
                str(nuc.translate(table=Chilodonella_uncinata_table)))
            # These test the Bio.Seq.translate() function - move these?:
            self.assertEqual(
                "*QWRR",
                translate(str(nuc), table=Chilodonella_uncinata_table))
            self.assertEqual("O*ORR", translate(str(nuc), table=special_table))
            self.assertEqual("", translate(str(nuc), to_stop=True))
            self.assertEqual("***RR", translate(str(nuc), table='Bacterial'))
            self.assertEqual("***RR", translate(str(nuc), table='11'))
            self.assertEqual("***RR", translate(str(nuc), table=11))
            self.assertEqual("**W**", translate(str(nuc), table=2))
        self.assertEqual(str(Seq("TAT").translate()), "Y")
        self.assertEqual(str(Seq("TAR").translate()), "*")
        self.assertEqual(str(Seq("TAN").translate()), "X")
        self.assertEqual(str(Seq("NNN").translate()), "X")
        self.assertEqual(str(Seq("TAt").translate()), "Y")
        self.assertEqual(str(Seq("TaR").translate()), "*")
        self.assertEqual(str(Seq("TaN").translate()), "X")
        self.assertEqual(str(Seq("nnN").translate()), "X")
        self.assertEqual(str(Seq("tat").translate()), "Y")
        self.assertEqual(str(Seq("tar").translate()), "*")
        self.assertEqual(str(Seq("tan").translate()), "X")
        self.assertEqual(str(Seq("nnn").translate()), "X")

    def test_the_translation_of_invalid_codons(self):
        """Check obj.translate() method with invalid codons."""
        for codon in ["TA?", "N-N", "AC_", "Ac_"]:
            for nuc in [Seq(codon),
                        Seq(codon, generic_nucleotide),
                        Seq(codon, generic_dna),
                        Seq(codon, unambiguous_dna)]:
                # Translating any of these should fail
                self.assertRaises(TranslationError, nuc.translate)

    def test_the_translation_of_ambig_codons(self):
        """Check obj.translate() method with ambiguous codons."""
        for letters, ambig_values in [
                (ambiguous_dna.letters, ambiguous_dna_values),
                (ambiguous_rna.letters, ambiguous_rna_values)]:
            ambig = set(letters)
            for c1 in ambig:
                for c2 in ambig:
                    for c3 in ambig:
                        # Translations of every unambiguous codon that
                        # this ambiguous codon could stand for.
                        values = set([str(Seq(a + b + c).translate())
                                      for a in ambig_values[c1]
                                      for b in ambig_values[c2]
                                      for c in ambig_values[c3]])
                        t = str(Seq(c1 + c2 + c3).translate())
                        if t == "*":
                            self.assertEqual(values, set("*"))
                        elif t == "X":
                            self.assertTrue(
                                len(values) > 1,
                                "translate('%s') = '%s' not '%s'"
                                % (c1 + c2 + c3, t, ",".join(values)))
                        elif t == "Z":
                            self.assertEqual(values, set("EQ"))
                        elif t == "B":
                            self.assertEqual(values, set("DN"))
                        elif t == "J":
                            self.assertEqual(values, set("LI"))
                        else:
                            self.assertEqual(values, set(t))
                        # TODO - Use the Bio.Data.IUPACData module for the
                        # ambiguous protein mappings?

    def test_init_typeerror(self):
        """Check Seq __init__ gives TypeError exceptions."""
        # Only expect it to take strings and unicode - not Seq objects!
        self.assertRaises(TypeError, Seq, (1066))
        self.assertRaises(TypeError, Seq, (Seq("ACGT", generic_dna)))
def test_generated(self):
    """Round-trip a set of unusual SeqRecord objects through each format."""
    records = [
        # Plain long record with PHRED scores
        SeqRecord(
            Seq("ACGT" * 500),
            id="Test",
            description="Long " * 500,
            letter_annotations={"phred_quality": [40, 30, 20, 10] * 500},
        ),
        # Mutable sequence with a very long description
        SeqRecord(
            MutableSeq("NGGC" * 1000),
            id="Mut",
            description="very " * 1000 + "long",
            letter_annotations={"phred_quality": [0, 5, 5, 10] * 1000},
        ),
        # Unknown sequence, description without any spaces
        SeqRecord(
            UnknownSeq(2000, character="N"),
            id="Unk",
            description="l" + ("o" * 1000) + "ng",
            letter_annotations={"phred_quality": [0, 1] * 1000},
        ),
        # Empty description and name
        SeqRecord(
            Seq("ACGT" * 500),
            id="no_descr",
            description="",
            name="",
            letter_annotations={"phred_quality": [40, 50, 60, 62] * 500},
        ),
        # Zero length records, one per quality scheme
        SeqRecord(
            Seq(""),
            id="empty_p",
            description="(could have been trimmed lots)",
            letter_annotations={"phred_quality": []},
        ),
        SeqRecord(
            Seq(""),
            id="empty_s",
            description="(could have been trimmed lots)",
            letter_annotations={"solexa_quality": []},
        ),
        # Solexa scores, including a negative one
        SeqRecord(
            Seq("ACNN" * 500),
            id="Test_Sol",
            description="Long " * 500,
            letter_annotations={"solexa_quality": [40, 30, 0, -5] * 500},
        ),
        SeqRecord(
            Seq("ACGT"),
            id="HighQual",
            description="With very large qualities that even Sanger FASTQ can't hold!",
            letter_annotations={"solexa_quality": [0, 10, 100, 1000]},
        ),
        # TODO - Record with no identifier?
    ]
    formats = ("fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual")
    for fmt in formats:
        buffer = StringIO()
        with warnings.catch_warnings():
            # TODO - Have a Biopython defined "DataLossWarning?"
            warnings.simplefilter("ignore", BiopythonWarning)
            SeqIO.write(records, buffer, fmt)
        buffer.seek(0)
        reloaded = list(SeqIO.parse(buffer, fmt))
        self.compare_records(records, reloaded, fmt)
def test_count_overlap_start_end_GG(self):
    """Count overlapping "GG" matches within assorted start/end windows."""
    # Seq and MutableSeq: (start, end, expected count) on one short sequence
    windows = (
        (1, 7, 3),
        (3, None, 3),
        (3, 6, 2),
        (4, 6, 1),
        (4, -1, 2),
        (-5, None, 2),
        (-5, 7, 2),
        (7, -5, 0),
        (-100, None, 3),
        (None, 100, 3),
        (-100, 1000, 3),
    )
    short_text = "GTAGGGGAG"
    for begin, stop, want in windows:
        for seq_type in (Seq, MutableSeq):
            found = seq_type(short_text).count_overlap("GG", begin, stop)
            self.assertEqual(found, want)

    # Seq and MutableSeq again, on a more heterogeneous sequence
    mixed_text = "GGGTGGTAGGG"
    for bounds, want in (((), 5), ((2, 8), 1), ((-11, 6), 3), ((7, 2), 0)):
        for seq_type in (Seq, MutableSeq):
            found = seq_type(mixed_text).count_overlap("GG", *bounds)
            self.assertEqual(found, want)
    self.assertEqual(Seq(mixed_text).count_overlap("GG", -2, -10), 0)

    # UnknownSeq of length 12: (character, start, end, expected count)
    unknown_cases = (
        ("N", 1, 7, 0),
        ("N", 1, 7, 0),
        ("N", -4, None, 0),
        ("N", -4, None, 0),
        ("X", 1, 7, 0),
    )
    for letter, begin, stop, want in unknown_cases:
        unknown = UnknownSeq(12, character=letter)
        self.assertEqual(unknown.count_overlap("GG", begin, stop), want)
    self.assertEqual(UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

    # UnknownSeq of length 7, including unusual edge cases:
    # (substring, start, end, expected count)
    edge_cases = (
        ("G", 100, 105, 0),
        ("G", -1, 4, 0),
        ("G", 4, -1, 0),
        ("G", -8, -2, 0),
        ("G", -2, -8, 0),
        ("G", 8, 2, 0),
        ("G", 2, 8, 0),
        ("GG", 8, 2, 0),
        ("GG", 2, 8, 0),
        ("GG", -5, -1, 0),
        ("GG", 1, 5, 0),
        ("GGG", None, None, 0),
        ("GGGGGGGGG", None, None, 0),
        ("GGG", 1, 2, 0),
    )
    for needle, begin, stop, want in edge_cases:
        unknown = UnknownSeq(7, character="N")
        self.assertEqual(unknown.count_overlap(needle, begin, stop), want)
    self.assertEqual(UnknownSeq(7, character="N").count_overlap("GG", 1), 0)
class StringMethodTests(unittest.TestCase): _examples = [ # These are length 9, a multiple of 3 for translation tests: Seq("ACGTGGGGT", generic_protein), Seq("ACGTGGGGT", generic_nucleotide), Seq("ACGTGGGGT", generic_dna), Seq("ACGUGGGGU", generic_rna), Seq("GG", generic_protein), Seq("GG", generic_nucleotide), Seq("GG", generic_dna), Seq("GG", generic_rna), Seq("A", generic_protein), Seq("A", generic_nucleotide), Seq("A", generic_dna), Seq("A", generic_rna), UnknownSeq(1), UnknownSeq(1, character="n"), UnknownSeq(1, generic_rna), UnknownSeq(1, generic_rna, "n"), UnknownSeq(1, generic_rna, "N"), UnknownSeq(12, generic_rna, "N"), UnknownSeq(12, generic_dna, "N"), UnknownSeq(12, generic_nucleotide, "N"), UnknownSeq(12, generic_protein, "X"), UnknownSeq(12, character="X"), UnknownSeq(12), ] for seq in _examples[:]: if isinstance(seq, Seq): _examples.append(seq.tomutable()) _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None] def _test_method(self, method_name, pre_comp_function=None, start_end=False): """Check this method matches the plain string's method.""" self.assertTrue(isinstance(method_name, str)) for example1 in self._examples: if not hasattr(example1, method_name): # e.g. MutableSeq does not support find continue str1 = str(example1) for example2 in self._examples: if not hasattr(example2, method_name): # e.g. MutableSeq does not support find continue str2 = str(example2) i = getattr(example1, method_name)(str2) j = getattr(str1, method_name)(str2) if pre_comp_function: i = pre_comp_function(i) j = pre_comp_function(j) if i != j: raise ValueError( "%s.%s(%s) = %i, not %i" % (repr(example1), method_name, repr(str2), i, j)) try: i = getattr(example1, method_name)(example2) j = getattr(str1, method_name)(str2) if pre_comp_function: i = pre_comp_function(i) j = pre_comp_function(j) if i != j: raise ValueError("%s.%s(%s) = %i, not %i" % (repr(example1), method_name, repr(example2), i, j)) except TypeError: # TODO - Check the alphabets do clash! 
pass if start_end: for start in self._start_end_values: i = getattr(example1, method_name)(str2, start) j = getattr(str1, method_name)(str2, start) if pre_comp_function: i = pre_comp_function(i) j = pre_comp_function(j) if i != j: raise ValueError("%s.%s(%s, %i) = %i, not %i" % (repr(example1), method_name, repr(str2), start, i, j)) for end in self._start_end_values: i = getattr(example1, method_name)(str2, start, end) j = getattr(str1, method_name)(str2, start, end) if pre_comp_function: i = pre_comp_function(i) j = pre_comp_function(j) if i != j: raise ValueError( "%s.%s(%s, %i, %i) = %i, not %i" % (repr(example1), method_name, repr(str2), start, end, i, j)) def test_str_count(self): """Check matches the python string count method.""" self._test_method("count", start_end=True) def test_str_count_overlap_GG(self): """Check our count_overlap method using GG.""" # Testing with self._examples expected = [ 3, 3, 3, 3, 1, 1, 1, 1, 0, 0, 0, 0, # Seq() Tests 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] # UnknownSeq() Tests expected *= 2 # MutableSeq() Tests assert len(self._examples) == len(expected) for seq, exp in zip(self._examples, expected): # Using search term GG as a string self.assertEqual(seq.count_overlap("GG"), exp) self.assertEqual(seq.count_overlap("G" * 5), 0) # Using search term GG as a Seq with generic alphabet self.assertEqual(seq.count_overlap(Seq("GG")), exp) self.assertEqual(seq.count_overlap(Seq("G" * 5)), 0) def test_count_overlap_start_end_GG(self): """Check our count_overlap method using GG with variable ends and starts.""" # Testing Seq() and MutableSeq() with variable start and end arguments start_end_exp = [(1, 7, 3), (3, None, 3), (3, 6, 2), (4, 6, 1), (4, -1, 2), (-5, None, 2), (-5, 7, 2), (7, -5, 0), (-100, None, 3), (None, 100, 3), (-100, 1000, 3)] testing_seq = "GTAGGGGAG" for start, end, exp in start_end_exp: self.assertEqual( Seq(testing_seq).count_overlap("GG", start, end), exp) self.assertEqual( MutableSeq(testing_seq).count_overlap("GG", start, 
end), exp) # Testing Seq() and MutableSeq() with a more heterogeneous sequenece self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG"), 5) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("GG"), 5) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", 2, 8), 1) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("GG", 2, 8), 1) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -11, 6), 3) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("GG", -11, 6), 3) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", 7, 2), 0) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("GG", 7, 2), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -2, -10), 0) # Testing UnknownSeq() with variable start and end arguments alphabet_char_start_end_exp = [(generic_rna, "N", 1, 7, 0), (generic_dna, "N", 1, 7, 0), (generic_rna, "N", -4, None, 0), (generic_dna, "N", -4, None, 0), (generic_protein, "X", 1, 7, 0)] for alpha, char, start, end, exp in alphabet_char_start_end_exp: self.assertEqual( UnknownSeq(12, alpha, char).count_overlap("GG", start, end), exp) self.assertEqual( UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0) # Testing UnknownSeq() with some more cases including unusual edge cases substr_start_end_exp = [("G", 100, 105, 0), ("G", -1, 4, 0), ("G", 4, -1, 0), ("G", -8, -2, 0), ("G", -2, -8, 0), ("G", 8, 2, 0), ("G", 2, 8, 0), ("GG", 8, 2, 0), ("GG", 2, 8, 0), ("GG", -5, -1, 0), ("GG", 1, 5, 0), ("GGG", None, None, 0), ("GGGGGGGGG", None, None, 0), ("GGG", 1, 2, 0)] for substr, start, end, exp in substr_start_end_exp: self.assertEqual( UnknownSeq(7, character="N").count_overlap(substr, start, end), exp) self.assertEqual( UnknownSeq(7, character="N").count_overlap("GG", 1), 0) def test_str_count_overlap_NN(self): """Check our count_overlap method using NN.""" # Testing with self._examples expected = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # Seq() Tests 0, 0, 0, 0, 0, 11, 11, 11, 0, 0, 0 ] # UnknownSeq() Tests expected *= 2 
# MutableSeq() Tests assert len(self._examples) == len(expected) for seq, exp in zip(self._examples, expected): # Using search term NN as a string self.assertEqual(seq.count_overlap("NN"), exp) self.assertEqual(seq.count_overlap("N" * 13), 0) # Using search term NN as a Seq with generic alphabet self.assertEqual(seq.count_overlap(Seq("NN")), exp) self.assertEqual(seq.count_overlap(Seq("N" * 13)), 0) def test_count_overlap_start_end_NN(self): """Check our count_overlap method using NN with variable ends and starts.""" # Testing Seq() and MutableSeq() with variable start and end arguments start_end_exp = [(1, 7, 0), (3, None, 0), (3, 6, 0), (4, 6, 0), (4, -1, 0), (-5, None, 0), (-5, 7, 0), (7, -5, 0), (-100, None, 0), (None, 100, 0), (-100, 1000, 0)] testing_seq = "GTAGGGGAG" for start, end, exp in start_end_exp: self.assertEqual( Seq(testing_seq).count_overlap("NN", start, end), exp) self.assertEqual( MutableSeq(testing_seq).count_overlap("NN", start, end), exp) # Testing Seq() and MutableSeq() with a more heterogeneous sequenece self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN"), 0) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("NN"), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", 2, 8), 0) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("NN", 2, 8), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -11, 6), 0) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("NN", -11, 6), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", 7, 2), 0) self.assertEqual( MutableSeq("GGGTGGTAGGG").count_overlap("NN", 7, 2), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -10, -2), 0) # Testing UnknownSeq() with variable start and end arguments alphabet_char_start_end_exp = [(generic_rna, "N", 1, 7, 5), (generic_dna, "N", 1, 7, 5), (generic_rna, "N", -4, None, 3), (generic_dna, "N", -4, None, 3), (generic_protein, "X", 1, 7, 0)] for alpha, char, start, end, exp in alphabet_char_start_end_exp: self.assertEqual( 
UnknownSeq(12, alpha, char).count_overlap("NN", start, end), exp) self.assertEqual( UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0) # Testing UnknownSeq() with some more cases including unusual edge cases substr_start_end_exp = [("N", 100, 105, 0), ("N", -1, 4, 0), ("N", 4, -1, 2), ("N", -8, -2, 5), ("N", -2, -8, 0), ("N", 8, 2, 0), ("N", 2, 8, 5), ("NN", 8, 2, 0), ("NN", 2, 8, 4), ("NN", -5, -1, 3), ("NN", 1, 5, 3), ("NNN", None, None, 5), ("NNNNNNNNN", None, None, 0), ("NNN", 1, 2, 0)] for substr, start, end, exp in substr_start_end_exp: self.assertEqual( UnknownSeq(7, character="N").count_overlap(substr, start, end), exp) self.assertEqual( UnknownSeq(7, character="N").count_overlap("NN", 1), 5) def test_str_find(self): """Check matches the python string find method.""" self._test_method("find", start_end=True) def test_str_rfind(self): """Check matches the python string rfind method.""" self._test_method("rfind", start_end=True) def test_str_startswith(self): """Check matches the python string startswith method.""" self._test_method("startswith", start_end=True) self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC"))) # Now check with a tuple of sub sequences for example1 in self._examples: if not hasattr(example1, "startswith"): # e.g. MutableSeq does not support this continue subs = tuple([ example1[start:start + 2] for start in range(0, len(example1) - 2, 3) ]) subs_str = tuple([str(s) for s in subs]) self.assertEqual( str(example1).startswith(subs_str), example1.startswith(subs)) self.assertEqual( str(example1).startswith(subs_str), example1.startswith(subs_str)) # strings! 
self.assertEqual( str(example1).startswith(subs_str, 3), example1.startswith(subs, 3)) self.assertEqual( str(example1).startswith(subs_str, 2, 6), example1.startswith(subs, 2, 6)) def test_str_endswith(self): """Check matches the python string endswith method.""" self._test_method("endswith", start_end=True) self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE"))) # Now check with a tuple of sub sequences for example1 in self._examples: if not hasattr(example1, "endswith"): # e.g. MutableSeq does not support this continue subs = tuple([ example1[start:start + 2] for start in range(0, len(example1) - 2, 3) ]) subs_str = tuple([str(s) for s in subs]) self.assertEqual( str(example1).endswith(subs_str), example1.endswith(subs)) self.assertEqual( str(example1).startswith(subs_str), example1.startswith(subs_str)) # strings! self.assertEqual( str(example1).endswith(subs_str, 3), example1.endswith(subs, 3)) self.assertEqual( str(example1).endswith(subs_str, 2, 6), example1.endswith(subs, 2, 6)) def test_str_strip(self): """Check matches the python string strip method.""" self._test_method("strip", pre_comp_function=str) def test_str_rstrip(self): """Check matches the python string rstrip method.""" self._test_method("rstrip", pre_comp_function=str) def test_str_split(self): """Check matches the python string rstrip method.""" # Calling (r)split should return a list of Seq-like objects, we'll # just apply str() to each of them so it matches the string method self._test_method("rstrip", pre_comp_function=lambda x: [str(y) for y in x]) def test_str_rsplit(self): """Check matches the python string rstrip method.""" # Calling (r)split should return a list of Seq-like objects, we'll # just apply str() to each of them so it matches the string method self._test_method("rstrip", pre_comp_function=lambda x: [str(y) for y in x]) def test_str_lsplit(self): """Check matches the python string rstrip method.""" # Calling (r)split should return a list of Seq-like objects, we'll # just 
apply str() to each of them so it matches the string method self._test_method("rstrip", pre_comp_function=lambda x: [str(y) for y in x]) def test_str_length(self): """Check matches the python string __len__ method.""" for example1 in self._examples: str1 = str(example1) self.assertEqual(len(example1), len(str1)) def test_str_upper(self): """Check matches the python string upper method.""" for example1 in self._examples: if isinstance(example1, MutableSeq): continue str1 = str(example1) self.assertEqual(str(example1.upper()), str1.upper()) def test_str_lower(self): """Check matches the python string lower method.""" for example1 in self._examples: if isinstance(example1, MutableSeq): continue str1 = str(example1) self.assertEqual(str(example1.lower()), str1.lower()) def test_str_hash(self): for example1 in self._examples: if isinstance(example1, MutableSeq): continue with warnings.catch_warnings(): # Silence change in behaviour warning warnings.simplefilter('ignore', BiopythonWarning) self.assertEqual( hash(str(example1)), hash(example1), "Hash mismatch, %r for %r vs %r for %r" % (hash(str(example1)), id(example1), hash(example1), example1)) def test_str_comparison(self): for example1 in self._examples: for example2 in self._examples: with warnings.catch_warnings(): # Silence alphabet warning warnings.simplefilter('ignore', BiopythonWarning) self.assertEqual( str(example1) == str(example2), example1 == example2, "Checking %r == %r" % (example1, example2)) self.assertEqual( str(example1) != str(example2), example1 != example2, "Checking %r != %r" % (example1, example2)) self.assertEqual( str(example1) < str(example2), example1 < example2, "Checking %r < %r" % (example1, example2)) self.assertEqual( str(example1) <= str(example2), example1 <= example2, "Checking %r <= %r" % (example1, example2)) self.assertEqual( str(example1) > str(example2), example1 > example2, "Checking %r > %r" % (example1, example2)) self.assertEqual( str(example1) >= str(example2), example1 >= 
example2, "Checking %r >= %r" % (example1, example2)) def test_str_getitem(self): """Check slicing and indexing works like a string.""" for example1 in self._examples: str1 = str(example1) for i in self._start_end_values: if i is not None and abs(i) < len(example1): self.assertEqual(str(example1[i]), str1[i]) self.assertEqual(str(example1[:i]), str1[:i]) self.assertEqual(str(example1[i:]), str1[i:]) for j in self._start_end_values: self.assertEqual(str(example1[i:j]), str1[i:j]) for step in range(-3, 4): if step == 0: try: print(example1[i:j:step]) self._assert(False) # Should fail! except ValueError: pass else: self.assertEqual(str(example1[i:j:step]), str1[i:j:step]) def test_tomutable(self): """Check obj.tomutable() method.""" for example1 in self._examples: if isinstance(example1, MutableSeq): continue mut = example1.tomutable() self.assertTrue(isinstance(mut, MutableSeq)) self.assertEqual(str(mut), str(example1)) self.assertEqual(mut.alphabet, example1.alphabet) def test_toseq(self): """Check obj.toseq() method.""" for example1 in self._examples: try: seq = example1.toseq() except AttributeError: self.assertTrue(isinstance(example1, Seq)) continue self.assertTrue(isinstance(seq, Seq)) self.assertEqual(str(seq), str(example1)) self.assertEqual(seq.alphabet, example1.alphabet) def test_the_complement(self): """Check obj.complement() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: comp = example1.complement() except ValueError as e: self.assertEqual(str(e), "Proteins do not have complements!") continue str1 = str(example1) # This only does the unambiguous cases if any(("U" in str1, "u" in str1, example1.alphabet == generic_rna)): mapping = maketrans("ACGUacgu", "UGCAugca") elif any( ("T" in str1, "t" in str1, example1.alphabet == generic_dna, example1.alphabet == generic_nucleotide)): mapping = maketrans("ACGTacgt", "TGCAtgca") elif "A" not in str1 and "a" not in str1: mapping = maketrans("CGcg", "GCgc") 
else: # TODO - look at alphabet? raise ValueError(example1) self.assertEqual(str1.translate(mapping), str(comp)) self.assertEqual(comp.alphabet, example1.alphabet) def test_the_reverse_complement(self): """Check obj.reverse_complement() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: comp = example1.reverse_complement() except ValueError as e: self.assertEqual(str(e), "Proteins do not have complements!") continue str1 = str(example1) # This only does the unambiguous cases if any(("U" in str1, "u" in str1, example1.alphabet == generic_rna)): mapping = maketrans("ACGUacgu", "UGCAugca") elif any( ("T" in str1, "t" in str1, example1.alphabet == generic_dna, example1.alphabet == generic_nucleotide)): mapping = maketrans("ACGTacgt", "TGCAtgca") elif "A" not in str1 and "a" not in str1: mapping = maketrans("CGcg", "GCgc") else: # TODO - look at alphabet? continue self.assertEqual(str1.translate(mapping)[::-1], str(comp)) self.assertEqual(comp.alphabet, example1.alphabet) def test_the_transcription(self): """Check obj.transcribe() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: tran = example1.transcribe() except ValueError as e: if str(e) == "Proteins cannot be transcribed!": continue if str(e) == "RNA cannot be transcribed!": continue raise e str1 = str(example1) if len(str1) % 3 != 0: # TODO - Check for or silence the expected warning? 
continue self.assertEqual( str1.replace("T", "U").replace("t", "u"), str(tran)) self.assertEqual(tran.alphabet, generic_rna) # based on limited examples def test_the_back_transcription(self): """Check obj.back_transcribe() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: tran = example1.back_transcribe() except ValueError as e: if str(e) == "Proteins cannot be back transcribed!": continue if str(e) == "DNA cannot be back transcribed!": continue raise e str1 = str(example1) self.assertEqual( str1.replace("U", "T").replace("u", "t"), str(tran)) self.assertEqual(tran.alphabet, generic_dna) # based on limited examples def test_the_translate(self): """Check obj.translate() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue if len(example1) % 3 != 0: # TODO - Check for or silence the expected warning? continue try: tran = example1.translate() except ValueError as e: if str(e) == "Proteins cannot be translated!": continue raise e # This is based on the limited example not having stop codons: if tran.alphabet not in [ extended_protein, protein, generic_protein ]: print(tran.alphabet) self.fail() # TODO - check the actual translation, and all the optional args def test_the_translation_of_stops(self): """Check obj.translate() method with stop codons.""" misc_stops = "TAATAGTGAAGAAGG" for nuc in [ Seq(misc_stops), Seq(misc_stops, generic_nucleotide), Seq(misc_stops, generic_dna), Seq(misc_stops, unambiguous_dna) ]: self.assertEqual("***RR", str(nuc.translate())) self.assertEqual("***RR", str(nuc.translate(1))) self.assertEqual("***RR", str(nuc.translate("SGC0"))) self.assertEqual("**W**", str(nuc.translate(table=2))) self.assertEqual("**WRR", str(nuc.translate(table='Yeast Mitochondrial'))) self.assertEqual("**WSS", str(nuc.translate(table=5))) self.assertEqual("**WSS", str(nuc.translate(table=9))) self.assertEqual("**CRR", str(nuc.translate(table='Euplotid Nuclear'))) 
self.assertEqual("***RR", str(nuc.translate(table=11))) self.assertEqual("***RR", str(nuc.translate(table='11'))) self.assertEqual("***RR", str(nuc.translate(table='Bacterial'))) self.assertEqual("**GRR", str(nuc.translate(table=25))) self.assertEqual("", str(nuc.translate(to_stop=True))) self.assertEqual("O*ORR", str(nuc.translate(table=special_table))) self.assertEqual( "*QWRR", str(nuc.translate(table=Chilodonella_uncinata_table))) # These test the Bio.Seq.translate() function - move these?: self.assertEqual( "*QWRR", translate(str(nuc), table=Chilodonella_uncinata_table)) self.assertEqual("O*ORR", translate(str(nuc), table=special_table)) self.assertEqual("", translate(str(nuc), to_stop=True)) self.assertEqual("***RR", translate(str(nuc), table='Bacterial')) self.assertEqual("***RR", translate(str(nuc), table='11')) self.assertEqual("***RR", translate(str(nuc), table=11)) self.assertEqual("**W**", translate(str(nuc), table=2)) self.assertEqual(str(Seq("TAT").translate()), "Y") self.assertEqual(str(Seq("TAR").translate()), "*") self.assertEqual(str(Seq("TAN").translate()), "X") self.assertEqual(str(Seq("NNN").translate()), "X") self.assertEqual(str(Seq("TAt").translate()), "Y") self.assertEqual(str(Seq("TaR").translate()), "*") self.assertEqual(str(Seq("TaN").translate()), "X") self.assertEqual(str(Seq("nnN").translate()), "X") self.assertEqual(str(Seq("tat").translate()), "Y") self.assertEqual(str(Seq("tar").translate()), "*") self.assertEqual(str(Seq("tan").translate()), "X") self.assertEqual(str(Seq("nnn").translate()), "X") def test_the_translation_of_invalid_codons(self): """Check obj.translate() method with invalid codons.""" for codon in ["TA?", "N-N", "AC_", "Ac_"]: for nuc in [ Seq(codon), Seq(codon, generic_nucleotide), Seq(codon, generic_dna), Seq(codon, unambiguous_dna) ]: try: print(nuc.translate()) self.fail("Translating %s should fail" % codon) except TranslationError: pass def test_the_translation_of_ambig_codons(self): """Check obj.translate() 
method with ambiguous codons.""" for letters, ambig_values in [ (ambiguous_dna.letters, ambiguous_dna_values), (ambiguous_rna.letters, ambiguous_rna_values) ]: ambig = set(letters) for c1 in ambig: for c2 in ambig: for c3 in ambig: values = set( str(Seq(a + b + c).translate()) for a in ambig_values[c1] for b in ambig_values[c2] for c in ambig_values[c3]) t = str(Seq(c1 + c2 + c3).translate()) if t == "*": self.assertEqual(values, set("*")) elif t == "X": self.assertTrue( len(values) > 1, "translate('%s') = '%s' not '%s'" % (c1 + c2 + c3, t, ",".join(values))) elif t == "Z": self.assertEqual(values, set("EQ")) elif t == "B": self.assertEqual(values, set("DN")) elif t == "J": self.assertEqual(values, set("LI")) else: self.assertEqual(values, set(t)) # TODO - Use the Bio.Data.IUPACData module for the # ambiguous protein mappings? def test_init_typeerror(self): """Check Seq __init__ gives TypeError exceptions.""" # Only expect it to take strings and unicode - not Seq objects! self.assertRaises(TypeError, Seq, (1066)) self.assertRaises(TypeError, Seq, (Seq("ACGT", generic_dna))) def test_MutableSeq_init_typeerror(self): """Check MutableSeq __init__ gives TypeError exceptions.""" self.assertRaises(TypeError, MutableSeq, (Seq("A"))) self.assertRaises(TypeError, MutableSeq, (UnknownSeq(1))) def test_join_Seq_ValueError(self): """Checks that a ValueError is thrown for all non-iterable types.""" # No iterable types which contain non-accepted types either. spacer = Seq('NNNNN') self.assertRaises(ValueError, spacer.join, 5) self.assertRaises(ValueError, spacer.join, "ATG") self.assertRaises(ValueError, spacer.join, Seq("ATG")) self.assertRaises(ValueError, spacer.join, MutableSeq("ATG")) self.assertRaises(ValueError, spacer.join, ["ATG", "ATG", 5, "ATG"]) def test_join_UnknownSeq_ValueError(self): """Checks that a ValueError is thrown for all non-iterable types.""" # No iterable types which contain non-accepted types either. 
spacer = UnknownSeq(5, character="-") self.assertRaises(ValueError, spacer.join, 5) self.assertRaises(ValueError, spacer.join, "ATG") self.assertRaises(ValueError, spacer.join, Seq("ATG")) self.assertRaises(ValueError, spacer.join, MutableSeq("ATG")) self.assertRaises(ValueError, spacer.join, ["ATG", "ATG", 5, "ATG"]) def test_join_MutableSeq_ValueError(self): """Checks that a ValueError is thrown for all non-iterable types.""" # No iterable types which contain non-accepted types either. spacer = MutableSeq("MMMMM") self.assertRaises(ValueError, spacer.join, 5) self.assertRaises(ValueError, spacer.join, "ATG") self.assertRaises(ValueError, spacer.join, Seq("ATG")) self.assertRaises(ValueError, spacer.join, MutableSeq("ATG")) self.assertRaises(ValueError, spacer.join, ["ATG", "ATG", 5, "ATG"]) def test_join_Seq_TypeError(self): """Checks that a TypeError is thrown for incompatible alphabets.""" spacer = Seq('NNNNN', generic_dna) self.assertRaises( TypeError, spacer.join, [Seq('NNNNN', generic_rna), Seq('NNNNN', generic_rna)]) self.assertRaises( TypeError, spacer.join, [Seq('NNNNN', generic_protein), Seq('NNNNN', generic_protein)]) def test_join_UnknownSeq_TypeError(self): """Checks that a TypeError is thrown for incompatible alphabets.""" spacer = UnknownSeq(5, character="-", alphabet=generic_dna) self.assertRaises(TypeError, spacer.join, [ UnknownSeq(5, character="-", alphabet=generic_rna), UnknownSeq(5, character="-", alphabet=generic_rna) ]) self.assertRaises(TypeError, spacer.join, [ Seq('NNNNN', generic_protein), UnknownSeq(5, character="-", alphabet=generic_protein) ]) def test_join_MutableSeq_TypeError(self): """Checks that a TypeError is thrown for incompatible alphabets.""" spacer = MutableSeq('NNNNN', generic_dna) self.assertRaises(TypeError, spacer.join, [ MutableSeq('NNNNN', generic_rna), MutableSeq('NNNNN', generic_rna) ]) self.assertRaises(TypeError, spacer.join, [ Seq('NNNNN', generic_protein), MutableSeq('NNNNN', generic_protein) ]) def 
test_join_Seq(self): """Checks if Seq join correctly concatenates sequence with the spacer.""" # Only expect it to take Seq objects and/or strings in an iterable! spacer1 = Seq('', generic_dna) spacers = [ spacer1, Seq('NNNNN', generic_dna), Seq('GGG', generic_nucleotide) ] example_strings = ["ATG", "ATG", "ATG", "ATG"] example_strings_seqs = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"] # strings with empty spacer str_concatenated = spacer1.join(example_strings) self.assertEqual(str(str_concatenated), "".join(example_strings)) self.assertEqual(str_concatenated.alphabet, spacer1.alphabet) for spacer in spacers: seq_concatenated = spacer.join(example_strings_seqs) self.assertEqual(str(seq_concatenated), str(spacer).join(example_strings)) self.assertEqual(seq_concatenated.alphabet, spacer.alphabet) def test_join_Seq_with_file(self): """Checks if Seq join correctly concatenates sequence from a file with the spacer.""" filename = 'Fasta/f003' seqlist = [record.seq for record in SeqIO.parse(filename, 'fasta')] seqlist_as_strings = [str(_) for _ in seqlist] spacer = Seq('NNNNN') spacer1 = Seq('') # seq objects with spacer seq_concatenated = spacer.join(seqlist) # seq objects with empty spacer seq_concatenated1 = spacer1.join(seqlist) ref_data = ref_data1 = "" ref_data = str(spacer).join(seqlist_as_strings) ref_data1 = str(spacer1).join(seqlist_as_strings) self.assertEqual(str(seq_concatenated), ref_data) self.assertEqual(str(seq_concatenated1), ref_data1) with self.assertRaises(TypeError): spacer.join(SeqIO.parse(filename, 'fasta')) def test_join_UnknownSeq(self): """Checks if UnknownSeq join correctly concatenates sequence with the spacer.""" # Only expect it to take Seq objects and/or strings in an iterable! 
spacer1 = UnknownSeq(0, character="-", alphabet=generic_dna) spacers = [ spacer1, UnknownSeq(5, character="-", alphabet=generic_dna), UnknownSeq(5, character="-", alphabet=generic_nucleotide) ] example_strings = ["ATG", "ATG", "ATG", "ATG"] example_strings_seqs = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"] # strings with empty spacer str_concatenated = spacer1.join(example_strings) self.assertEqual(str(str_concatenated), "".join(example_strings)) self.assertEqual(str_concatenated.alphabet, spacer1.alphabet) for spacer in spacers: seq_concatenated = spacer.join(example_strings_seqs) self.assertEqual(str(seq_concatenated), str(spacer).join(example_strings)) self.assertEqual(seq_concatenated.alphabet, spacer.alphabet) def test_join_UnknownSeq_with_file(self): """Checks if UnknownSeq join correctly concatenates sequence from a file with the spacer.""" filename = 'Fasta/f003' seqlist = [record.seq for record in SeqIO.parse(filename, 'fasta')] seqlist_as_strings = [str(_) for _ in seqlist] spacer = UnknownSeq(0, character="-", alphabet=generic_dna) spacer1 = UnknownSeq(5, character="-", alphabet=generic_dna) # seq objects with spacer seq_concatenated = spacer.join(seqlist) # seq objects with empty spacer seq_concatenated1 = spacer1.join(seqlist) ref_data = ref_data1 = "" ref_data = str(spacer).join(seqlist_as_strings) ref_data1 = str(spacer1).join(seqlist_as_strings) self.assertEqual(str(seq_concatenated), ref_data) self.assertEqual(str(seq_concatenated1), ref_data1) with self.assertRaises(TypeError): spacer.join(SeqIO.parse(filename, 'fasta')) def test_join_MutableSeq(self): """Checks if MutableSeq join correctly concatenates sequence with the spacer.""" # Only expect it to take Seq objects and/or strings in an iterable! 
spacer1 = MutableSeq('', generic_dna) spacers = [ spacer1, MutableSeq('NNNNN', generic_dna), MutableSeq('GGG', generic_nucleotide) ] example_strings = ["ATG", "ATG", "ATG", "ATG"] example_strings_seqs = ["ATG", "ATG", Seq("ATG", generic_dna), "ATG"] # strings with empty spacer str_concatenated = spacer1.join(example_strings) self.assertEqual(str(str_concatenated), "".join(example_strings)) self.assertEqual(str_concatenated.alphabet, spacer1.alphabet) for spacer in spacers: seq_concatenated = spacer.join(example_strings_seqs) self.assertEqual(str(seq_concatenated), str(spacer).join(example_strings)) self.assertEqual(seq_concatenated.alphabet, spacer.alphabet) def test_join_MutableSeq_with_file(self): """Checks if MutableSeq join correctly concatenates sequence from a file with the spacer.""" filename = 'Fasta/f003' seqlist = [record.seq for record in SeqIO.parse(filename, 'fasta')] seqlist_as_strings = [str(_) for _ in seqlist] spacer = MutableSeq('NNNNN') spacer1 = MutableSeq('') # seq objects with spacer seq_concatenated = spacer.join(seqlist) # seq objects with empty spacer seq_concatenated1 = spacer1.join(seqlist) ref_data = ref_data1 = "" ref_data = str(spacer).join(seqlist_as_strings) ref_data1 = str(spacer1).join(seqlist_as_strings) self.assertEqual(str(seq_concatenated), ref_data) self.assertEqual(str(seq_concatenated1), ref_data1) with self.assertRaises(TypeError): spacer.join(SeqIO.parse(filename, 'fasta'))
class StringMethodTests(unittest.TestCase): _examples = [ # These are length 9, a multiple of 3 for translation tests: Seq("ACGTGGGGT"), Seq("ACGUGGGGU"), Seq("GG"), Seq("A"), UnknownSeq(1), UnknownSeq(1, character="n"), UnknownSeq(1, character="N"), UnknownSeq(12, character="N"), UnknownSeq(12, character="X"), UnknownSeq(12), ] for seq in _examples[:]: if not isinstance(seq, MutableSeq): _examples.append(MutableSeq(seq)) _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None] def _test_method(self, method_name, start_end=False): """Check this method matches the plain string's method.""" self.assertIsInstance(method_name, str) for example1 in self._examples: if not hasattr(example1, method_name): # e.g. MutableSeq does not support translate continue str1 = str(example1) for example2 in self._examples: if not hasattr(example2, method_name): # e.g. MutableSeq does not support translate continue str2 = str(example2) try: i = getattr(example1, method_name)(str2) except ValueError: i = ValueError try: j = getattr(str1, method_name)(str2) except ValueError: j = ValueError self.assertEqual(i, j, "%r.%s(%r)" % (example1, method_name, str2)) try: i = getattr(example1, method_name)(example2) except ValueError: i = ValueError try: j = getattr(str1, method_name)(str2) except ValueError: j = ValueError self.assertEqual(i, j, "%r.%s(%r)" % (example1, method_name, example2)) if start_end: for start in self._start_end_values: try: i = getattr(example1, method_name)(str2, start) except ValueError: i = ValueError try: j = getattr(str1, method_name)(str2, start) except ValueError: j = ValueError self.assertEqual( i, j, "%r.%s(%r, %s)" % (example1, method_name, str2, start) ) for end in self._start_end_values: try: i = getattr(example1, method_name)(str2, start, end) except ValueError: i = ValueError try: j = getattr(str1, method_name)(str2, start, end) except ValueError: j = ValueError self.assertEqual( i, j, "%r.%s(%r, %s, %s)" % (example1, method_name, str2, start, end), ) def 
test_str_count(self): """Check matches the python string count method.""" self._test_method("count", start_end=True) self.assertEqual(Seq("AC777GT").count("7"), 3) self.assertRaises(TypeError, Seq("AC777GT").count, 7) self.assertRaises(TypeError, Seq("AC777GT").count, None) def test_count_overlap(self): """Check count_overlap exception matches python string count method.""" self.assertEqual(Seq("AC777GT").count("77"), 1) self.assertEqual(Seq("AC777GT").count_overlap("77"), 2) self.assertEqual(Seq("AC777GT").count_overlap("7"), 3) self.assertRaises(TypeError, Seq("AC777GT").count_overlap, 7) self.assertRaises(TypeError, Seq("AC777GT").count_overlap, None) def test_str_count_overlap_GG(self): """Check our count_overlap method using GG.""" # Testing with self._examples expected = [ 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, ] expected *= 2 # MutableSeq() Tests assert len(self._examples) == len(expected) for seq, exp in zip(self._examples, expected): # Using search term GG as a string self.assertEqual(seq.count_overlap("GG"), exp) self.assertEqual(seq.count_overlap("G" * 5), 0) # Using search term GG as a Seq self.assertEqual(seq.count_overlap(Seq("GG")), exp) self.assertEqual(seq.count_overlap(Seq("G" * 5)), 0) def test_count_overlap_start_end_GG(self): """Check our count_overlap method using GG with variable ends and starts.""" # Testing Seq() and MutableSeq() with variable start and end arguments start_end_exp = [ (1, 7, 3), (3, None, 3), (3, 6, 2), (4, 6, 1), (4, -1, 2), (-5, None, 2), (-5, 7, 2), (7, -5, 0), (-100, None, 3), (None, 100, 3), (-100, 1000, 3), ] testing_seq = "GTAGGGGAG" for start, end, exp in start_end_exp: self.assertEqual(Seq(testing_seq).count_overlap("GG", start, end), exp) self.assertEqual( MutableSeq(testing_seq).count_overlap("GG", start, end), exp ) # Testing Seq() and MutableSeq() with a more heterogeneous sequenece self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG"), 5) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("GG"), 5) 
self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", 2, 8), 1) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("GG", 2, 8), 1) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -11, 6), 3) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("GG", -11, 6), 3) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", 7, 2), 0) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("GG", 7, 2), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -2, -10), 0) # Testing UnknownSeq() with variable start and end arguments char_start_end_exp = [ ("N", 1, 7, 0), ("N", 1, 7, 0), ("N", -4, None, 0), ("N", -4, None, 0), ("X", 1, 7, 0), ] for char, start, end, exp in char_start_end_exp: self.assertEqual( UnknownSeq(12, character=char).count_overlap("GG", start, end), exp ) self.assertEqual(UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0) # Testing UnknownSeq() with some more cases including unusual edge cases substr_start_end_exp = [ ("G", 100, 105, 0), ("G", -1, 4, 0), ("G", 4, -1, 0), ("G", -8, -2, 0), ("G", -2, -8, 0), ("G", 8, 2, 0), ("G", 2, 8, 0), ("GG", 8, 2, 0), ("GG", 2, 8, 0), ("GG", -5, -1, 0), ("GG", 1, 5, 0), ("GGG", None, None, 0), ("GGGGGGGGG", None, None, 0), ("GGG", 1, 2, 0), ] for substr, start, end, exp in substr_start_end_exp: self.assertEqual( UnknownSeq(7, character="N").count_overlap(substr, start, end), exp ) self.assertEqual(UnknownSeq(7, character="N").count_overlap("GG", 1), 0) def test_str_count_overlap_NN(self): """Check our count_overlap method using NN.""" # Testing with self._examples expected = [ 0, 0, 0, 0, # Seq() Tests 0, 0, 0, 11, 0, 0, ] # UnknownSeq() Tests expected *= 2 # MutableSeq() Tests assert len(self._examples) == len(expected) for seq, exp in zip(self._examples, expected): # Using search term NN as a string self.assertEqual(seq.count_overlap("NN"), exp) self.assertEqual(seq.count_overlap("N" * 13), 0) # Using search term NN as a Seq self.assertEqual(seq.count_overlap(Seq("NN")), exp) 
self.assertEqual(seq.count_overlap(Seq("N" * 13)), 0) def test_count_overlap_start_end_NN(self): """Check our count_overlap method using NN with variable ends and starts.""" # Testing Seq() and MutableSeq() with variable start and end arguments start_end_exp = [ (1, 7, 0), (3, None, 0), (3, 6, 0), (4, 6, 0), (4, -1, 0), (-5, None, 0), (-5, 7, 0), (7, -5, 0), (-100, None, 0), (None, 100, 0), (-100, 1000, 0), ] testing_seq = "GTAGGGGAG" for start, end, exp in start_end_exp: self.assertEqual(Seq(testing_seq).count_overlap("NN", start, end), exp) self.assertEqual( MutableSeq(testing_seq).count_overlap("NN", start, end), exp ) # Testing Seq() and MutableSeq() with a more heterogeneous sequenece self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN"), 0) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("NN"), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", 2, 8), 0) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("NN", 2, 8), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -11, 6), 0) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("NN", -11, 6), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", 7, 2), 0) self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("NN", 7, 2), 0) self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -10, -2), 0) # Testing UnknownSeq() with variable start and end arguments char_start_end_exp = [ ("N", 1, 7, 5), ("N", 1, 7, 5), ("N", -4, None, 3), ("N", -4, None, 3), ("X", 1, 7, 0), ] for char, start, end, exp in char_start_end_exp: self.assertEqual( UnknownSeq(12, character=char).count_overlap("NN", start, end), exp ) self.assertEqual(UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0) # Testing UnknownSeq() with some more cases including unusual edge cases substr_start_end_exp = [ ("N", 100, 105, 0), ("N", -1, 4, 0), ("N", 4, -1, 2), ("N", -8, -2, 5), ("N", -2, -8, 0), ("N", 8, 2, 0), ("N", 2, 8, 5), ("NN", 8, 2, 0), ("NN", 2, 8, 4), ("NN", -5, -1, 3), ("NN", 1, 5, 3), 
("NNN", None, None, 5), ("NNNNNNNNN", None, None, 0), ("NNN", 1, 2, 0), ] for substr, start, end, exp in substr_start_end_exp: self.assertEqual( UnknownSeq(7, character="N").count_overlap(substr, start, end), exp ) self.assertEqual(UnknownSeq(7, character="N").count_overlap("NN", 1), 5) def test_str_find(self): """Check matches the python string find method.""" self._test_method("find", start_end=True) self.assertEqual(Seq("AC7GT").find("7"), 2) self.assertRaises(TypeError, Seq("AC7GT").find, 7) self.assertRaises(TypeError, Seq("ACGT").find, None) def test_str_rfind(self): """Check matches the python string rfind method.""" self._test_method("rfind", start_end=True) self.assertEqual(Seq("AC7GT").rfind("7"), 2) self.assertRaises(TypeError, Seq("AC7GT").rfind, 7) self.assertRaises(TypeError, Seq("ACGT").rfind, None) def test_str_index(self): """Check matches the python string index method.""" self._test_method("index", start_end=True) self.assertEqual(Seq("AC7GT").index("7"), 2) self.assertRaises(TypeError, Seq("AC7GT").index, 7) self.assertRaises(TypeError, Seq("ACGT").index, None) self.assertEqual(MutableSeq("AC7GT").index("7"), 2) self.assertRaises(TypeError, MutableSeq("AC7GT").index, 7) self.assertRaises(TypeError, MutableSeq("ACGT").index, None) def test_str_rindex(self): """Check matches the python string rindex method.""" self._test_method("rindex", start_end=True) self.assertEqual(Seq("AC7GT").rindex("7"), 2) self.assertRaises(TypeError, Seq("AC7GT").rindex, 7) self.assertRaises(TypeError, Seq("ACGT").rindex, None) self.assertEqual(MutableSeq("AC7GT").rindex("7"), 2) self.assertRaises(TypeError, MutableSeq("AC7GT").rindex, 7) self.assertRaises(TypeError, MutableSeq("ACGT").rindex, None) def test_str_startswith(self): """Check matches the python string startswith method.""" self._test_method("startswith", start_end=True) self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC"))) self.assertRaises(TypeError, Seq("ACGT").startswith, None) 
self.assertRaises(TypeError, MutableSeq("ACGT").startswith, None) # Now check with a tuple of sub sequences for example1 in self._examples: subs = tuple( example1[start : start + 2] for start in range(0, len(example1) - 2, 3) ) subs_str = tuple(str(s) for s in subs) self.assertEqual( str(example1).startswith(subs_str), example1.startswith(subs) ) self.assertEqual( str(example1).startswith(subs_str), example1.startswith(subs_str) ) # strings! self.assertEqual( str(example1).startswith(subs_str, 3), example1.startswith(subs, 3) ) self.assertEqual( str(example1).startswith(subs_str, 2, 6), example1.startswith(subs, 2, 6), ) def test_str_endswith(self): """Check matches the python string endswith method.""" self._test_method("endswith", start_end=True) self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE"))) self.assertRaises(TypeError, Seq("ACGT").endswith, None) # Now check with a tuple of sub sequences for example1 in self._examples: subs = tuple( example1[start : start + 2] for start in range(0, len(example1) - 2, 3) ) subs_str = tuple(str(s) for s in subs) self.assertEqual(str(example1).endswith(subs_str), example1.endswith(subs)) self.assertEqual( str(example1).startswith(subs_str), example1.startswith(subs_str) ) # strings! 
self.assertEqual( str(example1).endswith(subs_str, 3), example1.endswith(subs, 3) ) self.assertEqual( str(example1).endswith(subs_str, 2, 6), example1.endswith(subs, 2, 6) ) def test_str_strip(self): """Check matches the python string strip method.""" self._test_method("strip") self.assertEqual(Seq(" ACGT ").strip(), "ACGT") self.assertRaises(TypeError, Seq("ACGT").strip, 7) def test_str_rstrip(self): """Check matches the python string rstrip method.""" self._test_method("rstrip") self.assertEqual(Seq(" ACGT ").rstrip(), " ACGT") self.assertRaises(TypeError, Seq("ACGT").rstrip, 7) def test_str_lstrip(self): """Check matches the python string lstrip method.""" self._test_method("rstrip") self.assertEqual(Seq(" ACGT ").lstrip(), "ACGT ") self.assertRaises(TypeError, Seq("ACGT").lstrip, 7) def test_str_split(self): """Check matches the python string rstrip method.""" self._test_method("split") self.assertEqual(Seq("AC7GT").rsplit("7"), "AC7GT".split("7")) self.assertRaises(TypeError, Seq("AC7GT").split, 7) def test_str_rsplit(self): """Check matches the python string rstrip method.""" self._test_method("rsplit") self.assertEqual(Seq("AC7GT").rsplit("7"), "AC7GT".rsplit("7")) self.assertRaises(TypeError, Seq("AC7GT").rsplit, 7) def test_str_length(self): """Check matches the python string __len__ method.""" for example1 in self._examples: str1 = str(example1) self.assertEqual(len(example1), len(str1)) def test_str_upper(self): """Check matches the python string upper method.""" for example1 in self._examples: if isinstance(example1, MutableSeq): continue str1 = str(example1) self.assertEqual(example1.upper(), str1.upper()) def test_str_lower(self): """Check matches the python string lower method.""" for example1 in self._examples: if isinstance(example1, MutableSeq): continue str1 = str(example1) self.assertEqual(example1.lower(), str1.lower()) def test_str_encode(self): """Check matches the python string encode method.""" for example1 in self._examples: str1 = 
str(example1) self.assertEqual(bytes(example1), str1.encode("ascii")) def test_str_hash(self): for example1 in self._examples: if isinstance(example1, MutableSeq): continue with warnings.catch_warnings(): # Silence change in behaviour warning warnings.simplefilter("ignore", BiopythonWarning) self.assertEqual( hash(str(example1)), hash(example1), "Hash mismatch, %r for %r vs %r for %r" % (hash(str(example1)), id(example1), hash(example1), example1), ) def test_str_comparison(self): for example1 in self._examples: for example2 in self._examples: with warnings.catch_warnings(): self.assertEqual( str(example1) == str(example2), example1 == example2, "Checking %r == %r" % (example1, example2), ) self.assertEqual( str(example1) != str(example2), example1 != example2, "Checking %r != %r" % (example1, example2), ) self.assertEqual( str(example1) < str(example2), example1 < example2, "Checking %r < %r" % (example1, example2), ) self.assertEqual( str(example1) <= str(example2), example1 <= example2, "Checking %r <= %r" % (example1, example2), ) self.assertEqual( str(example1) > str(example2), example1 > example2, "Checking %r > %r" % (example1, example2), ) self.assertEqual( str(example1) >= str(example2), example1 >= example2, "Checking %r >= %r" % (example1, example2), ) def test_str_getitem(self): """Check slicing and indexing works like a string.""" for example1 in self._examples: str1 = str(example1) for i in self._start_end_values: if i is not None and abs(i) < len(example1): self.assertEqual(example1[i], str1[i]) self.assertEqual(example1[:i], str1[:i]) self.assertEqual(example1[i:], str1[i:]) for j in self._start_end_values: self.assertEqual(example1[i:j], str1[i:j]) for step in range(-3, 4): if step == 0: try: print(example1[i:j:step]) self._assert(False) # Should fail! 
except ValueError: pass else: self.assertEqual(example1[i:j:step], str1[i:j:step]) def test_tomutable(self): """Check creating a MutableSeq object.""" for example1 in self._examples: mut = MutableSeq(example1) self.assertIsInstance(mut, MutableSeq) self.assertEqual(mut, example1) def test_toseq(self): """Check creating a Seq object.""" for example1 in self._examples: seq = Seq(example1) self.assertIsInstance(seq, Seq) self.assertEqual(seq, example1) def test_the_complement(self): """Check obj.complement() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: comp = example1.complement() except ValueError as e: self.assertEqual(str(e), "Proteins do not have complements!") continue str1 = str(example1) if "U" in str1 or "u" in str1: mapping = str.maketrans("ACGUacgu", "UGCAugca") else: # Default to DNA, e.g. complement("A") -> "T" not "U" mapping = str.maketrans("ACGTacgt", "TGCAtgca") self.assertEqual(str1.translate(mapping), comp) def test_the_reverse_complement(self): """Check obj.reverse_complement() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: comp = example1.reverse_complement() except ValueError as e: self.assertEqual(str(e), "Proteins do not have complements!") continue str1 = str(example1) if "U" in str1 or "u" in str1: mapping = str.maketrans("ACGUacgu", "UGCAugca") else: # Defaults to DNA, so reverse_complement("A") --> "T" not "U" mapping = str.maketrans("ACGTacgt", "TGCAtgca") self.assertEqual(str1.translate(mapping)[::-1], comp) def test_the_transcription(self): """Check obj.transcribe() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: tran = example1.transcribe() except ValueError as e: if str(e) == "Proteins cannot be transcribed!": continue if str(e) == "RNA cannot be transcribed!": continue raise str1 = str(example1) if len(str1) % 3 != 0: # TODO - Check for or silence the 
expected warning? continue self.assertEqual(str1.replace("T", "U").replace("t", "u"), tran) def test_the_back_transcription(self): """Check obj.back_transcribe() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: tran = example1.back_transcribe() except ValueError as e: if str(e) == "Proteins cannot be back transcribed!": continue if str(e) == "DNA cannot be back transcribed!": continue raise str1 = str(example1) self.assertEqual(str1.replace("U", "T").replace("u", "t"), tran) def test_the_translate(self): """Check obj.translate() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue if len(example1) % 3 != 0: # TODO - Check for or silence the expected warning? continue try: tran = example1.translate() except ValueError as e: if str(e) == "Proteins cannot be translated!": continue raise # Try with positional vs named argument: self.assertEqual(example1.translate(11), example1.translate(table=11)) # TODO - check the actual translation, and all the optional args def test_the_translation_of_stops(self): """Check obj.translate() method with stop codons.""" misc_stops = "TAATAGTGAAGAAGG" nuc = Seq(misc_stops) self.assertEqual("***RR", nuc.translate()) self.assertEqual("***RR", nuc.translate(1)) self.assertEqual("***RR", nuc.translate("SGC0")) self.assertEqual("**W**", nuc.translate(table=2)) self.assertEqual("**WRR", nuc.translate(table="Yeast Mitochondrial")) self.assertEqual("**WSS", nuc.translate(table=5)) self.assertEqual("**WSS", nuc.translate(table=9)) self.assertEqual("**CRR", nuc.translate(table="Euplotid Nuclear")) self.assertEqual("***RR", nuc.translate(table=11)) self.assertEqual("***RR", nuc.translate(table="11")) self.assertEqual("***RR", nuc.translate(table="Bacterial")) self.assertEqual("**GRR", nuc.translate(table=25)) self.assertEqual("", nuc.translate(to_stop=True)) self.assertEqual("O*ORR", nuc.translate(table=special_table)) 
self.assertEqual("*QWRR", nuc.translate(table=Chilodonella_uncinata_table)) # These test the Bio.Seq.translate() function - move these?: self.assertEqual( "*QWRR", translate(str(nuc), table=Chilodonella_uncinata_table) ) self.assertEqual("O*ORR", translate(str(nuc), table=special_table)) self.assertEqual("", translate(str(nuc), to_stop=True)) self.assertEqual("***RR", translate(str(nuc), table="Bacterial")) self.assertEqual("***RR", translate(str(nuc), table="11")) self.assertEqual("***RR", translate(str(nuc), table=11)) self.assertEqual("**W**", translate(str(nuc), table=2)) self.assertEqual(Seq("TAT").translate(), "Y") self.assertEqual(Seq("TAR").translate(), "*") self.assertEqual(Seq("TAN").translate(), "X") self.assertEqual(Seq("NNN").translate(), "X") self.assertEqual(Seq("TAt").translate(), "Y") self.assertEqual(Seq("TaR").translate(), "*") self.assertEqual(Seq("TaN").translate(), "X") self.assertEqual(Seq("nnN").translate(), "X") self.assertEqual(Seq("tat").translate(), "Y") self.assertEqual(Seq("tar").translate(), "*") self.assertEqual(Seq("tan").translate(), "X") self.assertEqual(Seq("nnn").translate(), "X") def test_the_translation_of_invalid_codons(self): """Check obj.translate() method with invalid codons.""" for codon in ["TA?", "N-N", "AC_", "Ac_"]: nuc = Seq(codon) try: nuc.translate() self.fail("Translating %s should fail" % codon) except TranslationError: pass def test_the_translation_of_ambig_codons(self): """Check obj.translate() method with ambiguous codons.""" for ambig_values in [ambiguous_dna_values, ambiguous_rna_values]: ambig = set(ambig_values.keys()) ambig.remove("X") for c1 in ambig: for c2 in ambig: for c3 in ambig: values = { str(Seq(a + b + c).translate()) for a in ambig_values[c1] for b in ambig_values[c2] for c in ambig_values[c3] } t = Seq(c1 + c2 + c3).translate() if t == "*": self.assertEqual(values, set("*")) elif t == "X": self.assertGreater( len(values), 1, "translate('%s') = '%s' not '%s'" % (c1 + c2 + c3, t, 
",".join(values)), ) elif t == "Z": self.assertEqual(values, set("EQ")) elif t == "B": self.assertEqual(values, set("DN")) elif t == "J": self.assertEqual(values, set("LI")) else: self.assertEqual(values, set(t)) # TODO - Use the Bio.Data.IUPACData module for the # ambiguous protein mappings? def test_init_typeerror(self): """Check Seq __init__ gives TypeError exceptions.""" self.assertRaises(TypeError, Seq, ("A", "C", "G", "T")) self.assertRaises(TypeError, Seq, ["A", "C", "G", "T"]) self.assertRaises(TypeError, Seq, 1) self.assertRaises(TypeError, Seq, 1.0) def test_MutableSeq_init_typeerror(self): """Check MutableSeq __init__ gives TypeError exceptions.""" self.assertRaises(TypeError, MutableSeq, ("A", "C", "G", "T")) self.assertRaises(TypeError, MutableSeq, ["A", "C", "G", "T"]) self.assertRaises(TypeError, MutableSeq, 1) self.assertRaises(TypeError, MutableSeq, 1.0) def test_join_Seq_TypeError(self): """Checks that a TypeError is thrown for all non-iterable types.""" # No iterable types which contain non-accepted types either. spacer = Seq("NNNNN") self.assertRaises(TypeError, spacer.join, 5) self.assertRaises(TypeError, spacer.join, ["ATG", "ATG", 5, "ATG"]) def test_join_UnknownSeq_TypeError_iter(self): """Checks that a TypeError is thrown for all non-iterable types.""" # No iterable types which contain non-accepted types either. spacer = UnknownSeq(5, character="-") self.assertRaises(TypeError, spacer.join, 5) self.assertRaises(TypeError, spacer.join, ["ATG", "ATG", 5, "ATG"]) def test_join_MutableSeq_TypeError_iter(self): """Checks that a TypeError is thrown for all non-iterable types.""" # No iterable types which contain non-accepted types either. 
spacer = MutableSeq("MMMMM") self.assertRaises(TypeError, spacer.join, 5) self.assertRaises(TypeError, spacer.join, ["ATG", "ATG", 5, "ATG"]) def test_join_Seq(self): """Checks if Seq join correctly concatenates sequence with the spacer.""" spacer = Seq("NNNNN") self.assertEqual( "N" * 15, spacer.join([Seq("NNNNN"), Seq("NNNNN")]), ) spacer1 = Seq("") spacers = [spacer1, Seq("NNNNN"), Seq("GGG")] example_strings = ["ATG", "ATG", "ATG", "ATG"] example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"] # strings with empty spacer str_concatenated = spacer1.join(example_strings) self.assertEqual(str_concatenated, "".join(example_strings)) for spacer in spacers: seq_concatenated = spacer.join(example_strings_seqs) self.assertEqual(seq_concatenated, str(spacer).join(example_strings)) # Now try single sequence arguments, should join the letters for target in example_strings + example_strings_seqs: self.assertEqual( str(spacer).join(str(target)), str(spacer.join(target)) ) def test_join_UnknownSeq(self): """Checks if UnknownSeq join correctly concatenates sequence with the spacer.""" spacer1 = UnknownSeq(5, character="-") spacer2 = UnknownSeq(0, character="-") spacers = [spacer1, spacer2] self.assertEqual( "-" * 15, spacer1.join([UnknownSeq(5, character="-"), UnknownSeq(5, character="-")]), ) self.assertEqual( "N" * 5 + "-" * 10, spacer1.join([Seq("NNNNN"), UnknownSeq(5, character="-")]), ) example_strings = ["ATG", "ATG", "ATG", "ATG"] example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"] # strings with empty spacer str_concatenated = spacer2.join(example_strings) self.assertEqual(str_concatenated, "".join(example_strings)) for spacer in spacers: seq_concatenated = spacer.join(example_strings_seqs) self.assertEqual(seq_concatenated, str(spacer).join(example_strings)) # Now try single sequence arguments, should join the letters for target in example_strings + example_strings_seqs: self.assertEqual( str(spacer).join(str(target)), str(spacer.join(target)) ) def 
test_join_MutableSeq_mixed(self): """Check MutableSeq objects can be joined.""" spacer = MutableSeq("NNNNN") self.assertEqual( "N" * 15, spacer.join([MutableSeq("NNNNN"), MutableSeq("NNNNN")]), ) self.assertRaises( TypeError, spacer.join([Seq("NNNNN"), MutableSeq("NNNNN")]), ) def test_join_Seq_with_file(self): """Checks if Seq join correctly concatenates sequence from a file with the spacer.""" filename = "Fasta/f003" seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")] seqlist_as_strings = [str(_) for _ in seqlist] spacer = Seq("NNNNN") spacer1 = Seq("") # seq objects with spacer seq_concatenated = spacer.join(seqlist) # seq objects with empty spacer seq_concatenated1 = spacer1.join(seqlist) ref_data = ref_data1 = "" ref_data = str(spacer).join(seqlist_as_strings) ref_data1 = str(spacer1).join(seqlist_as_strings) self.assertEqual(seq_concatenated, ref_data) self.assertEqual(seq_concatenated1, ref_data1) with self.assertRaises(TypeError): spacer.join(SeqIO.parse(filename, "fasta")) def test_join_UnknownSeq_with_file(self): """Checks if UnknownSeq join correctly concatenates sequence from a file with the spacer.""" filename = "Fasta/f003" seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")] seqlist_as_strings = [str(_) for _ in seqlist] spacer = UnknownSeq(0, character="-") spacer1 = UnknownSeq(5, character="-") # seq objects with spacer seq_concatenated = spacer.join(seqlist) # seq objects with empty spacer seq_concatenated1 = spacer1.join(seqlist) ref_data = ref_data1 = "" ref_data = str(spacer).join(seqlist_as_strings) ref_data1 = str(spacer1).join(seqlist_as_strings) self.assertEqual(seq_concatenated, ref_data) self.assertEqual(seq_concatenated1, ref_data1) with self.assertRaises(TypeError): spacer.join(SeqIO.parse(filename, "fasta")) def test_join_MutableSeq(self): """Checks if MutableSeq join correctly concatenates sequence with the spacer.""" # Only expect it to take Seq objects and/or strings in an iterable! 
spacer1 = MutableSeq("") spacers = [ spacer1, MutableSeq("NNNNN"), MutableSeq("GGG"), ] example_strings = ["ATG", "ATG", "ATG", "ATG"] example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"] # strings with empty spacer str_concatenated = spacer1.join(example_strings) self.assertEqual(str_concatenated, "".join(example_strings)) for spacer in spacers: seq_concatenated = spacer.join(example_strings_seqs) self.assertEqual(seq_concatenated, str(spacer).join(example_strings)) def test_join_MutableSeq_with_file(self): """Checks if MutableSeq join correctly concatenates sequence from a file with the spacer.""" filename = "Fasta/f003" seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")] seqlist_as_strings = [str(_) for _ in seqlist] spacer = MutableSeq("NNNNN") spacer1 = MutableSeq("") # seq objects with spacer seq_concatenated = spacer.join(seqlist) # seq objects with empty spacer seq_concatenated1 = spacer1.join(seqlist) ref_data = ref_data1 = "" ref_data = str(spacer).join(seqlist_as_strings) ref_data1 = str(spacer1).join(seqlist_as_strings) self.assertEqual(seq_concatenated, ref_data) self.assertEqual(seq_concatenated1, ref_data1) with self.assertRaises(TypeError): spacer.join(SeqIO.parse(filename, "fasta")) def test_equality(self): """Test equality when mixing types.""" self.assertEqual(Seq("6"), "6") self.assertNotEqual(Seq("6"), 6) self.assertEqual(Seq(""), "") self.assertNotEqual(Seq(""), None) self.assertEqual(Seq("None"), "None") self.assertNotEqual(Seq("None"), None) self.assertEqual(MutableSeq("6"), "6") self.assertNotEqual(MutableSeq("6"), 6) self.assertEqual(MutableSeq(""), "") self.assertNotEqual(MutableSeq(""), None) self.assertEqual(MutableSeq("None"), "None") self.assertNotEqual(MutableSeq("None"), None) self.assertEqual(UnknownSeq(1, character="6"), "6") self.assertNotEqual(UnknownSeq(1, character="6"), 6) self.assertEqual(UnknownSeq(0), "") self.assertNotEqual(UnknownSeq(0), None)
def test_MutableSeq_init_typeerror(self):
    """Check MutableSeq __init__ raises TypeError for Seq and UnknownSeq input."""
    # A Seq instance is not accepted as initial data.
    seq_argument = Seq("A")
    with self.assertRaises(TypeError):
        MutableSeq(seq_argument)

    # Neither is an UnknownSeq instance.
    unknown_argument = UnknownSeq(1)
    with self.assertRaises(TypeError):
        MutableSeq(unknown_argument)
def test_join_UnknownSeq_TypeError(self):
    """Check UnknownSeq.join raises TypeError when the alphabets are incompatible."""
    dna_spacer = UnknownSeq(5, character="-", alphabet=generic_dna)

    # DNA spacer joined with two RNA unknown sequences.
    rna_items = [
        UnknownSeq(5, character="-", alphabet=generic_rna),
        UnknownSeq(5, character="-", alphabet=generic_rna),
    ]
    with self.assertRaises(TypeError):
        dna_spacer.join(rna_items)

    # DNA spacer joined with a protein Seq and a protein UnknownSeq.
    protein_items = [
        Seq("NNNNN", generic_protein),
        UnknownSeq(5, character="-", alphabet=generic_protein),
    ]
    with self.assertRaises(TypeError):
        dna_spacer.join(protein_items)
def test_count_overlap_start_end_NN(self):
    """Check count_overlap("NN", ...) over a range of start and end values."""
    concrete_types = (Seq, MutableSeq)

    # Seq() and MutableSeq(): (start, end, expected count) on a fixed sequence.
    bounded_cases = [
        (1, 7, 0),
        (3, None, 0),
        (3, 6, 0),
        (4, 6, 0),
        (4, -1, 0),
        (-5, None, 0),
        (-5, 7, 0),
        (7, -5, 0),
        (-100, None, 0),
        (None, 100, 0),
        (-100, 1000, 0),
    ]
    fixed_text = "GTAGGGGAG"
    for begin, stop, expected in bounded_cases:
        for seq_type in concrete_types:
            self.assertEqual(
                seq_type(fixed_text).count_overlap("NN", begin, stop), expected
            )

    # Seq() and MutableSeq() with a more heterogeneous sequence; the empty
    # tuple exercises the call without explicit start/end arguments.
    mixed_text = "GGGTGGTAGGG"
    for bounds in [(), (2, 8), (-11, 6), (7, 2)]:
        for seq_type in concrete_types:
            self.assertEqual(seq_type(mixed_text).count_overlap("NN", *bounds), 0)
    self.assertEqual(Seq(mixed_text).count_overlap("NN", -10, -2), 0)

    # UnknownSeq() with variable start and end arguments:
    # (alphabet, character, start, end, expected count).
    unknown_cases = [
        (generic_rna, "N", 1, 7, 5),
        (generic_dna, "N", 1, 7, 5),
        (generic_rna, "N", -4, None, 3),
        (generic_dna, "N", -4, None, 3),
        (generic_protein, "X", 1, 7, 0),
    ]
    for alphabet, letter, begin, stop, expected in unknown_cases:
        unknown = UnknownSeq(12, alphabet, letter)
        self.assertEqual(unknown.count_overlap("NN", begin, stop), expected)
    self.assertEqual(UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

    # UnknownSeq() with some more cases including unusual edge cases:
    # (substring, start, end, expected count).
    edge_cases = [
        ("N", 100, 105, 0),
        ("N", -1, 4, 0),
        ("N", 4, -1, 2),
        ("N", -8, -2, 5),
        ("N", -2, -8, 0),
        ("N", 8, 2, 0),
        ("N", 2, 8, 5),
        ("NN", 8, 2, 0),
        ("NN", 2, 8, 4),
        ("NN", -5, -1, 3),
        ("NN", 1, 5, 3),
        ("NNN", None, None, 5),
        ("NNNNNNNNN", None, None, 0),
        ("NNN", 1, 2, 0),
    ]
    for pattern, begin, stop, expected in edge_cases:
        unknown = UnknownSeq(7, character="N")
        self.assertEqual(unknown.count_overlap(pattern, begin, stop), expected)
    self.assertEqual(UnknownSeq(7, character="N").count_overlap("NN", 1), 5)
def record_end(self, content):
    """Clean up when we've finished the record.

    Finalises ``self.data``: settles the record id, picks an alphabet from
    ``self._seq_type``, records the topology annotation, and stores the
    sequence built from ``self._seq_data``.

    Arguments:
     - content - not used by this method.

    Raises ValueError if ``self._seq_type`` is set but matches none of the
    recognised sequence types.
    """
    from Bio import Alphabet
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq, UnknownSeq

    # Try and append the version number to the accession for the full id
    if not self.data.id:
        assert "accessions" not in self.data.annotations, self.data.annotations[
            "accessions"
        ]
        self.data.id = self.data.name  # Good fall back?
    elif self.data.id.count(".") == 0:
        try:
            self.data.id += ".%i" % self.data.annotations["sequence_version"]
        except KeyError:
            # No version recorded; leave the id without a version suffix.
            pass

    # add the sequence information
    # first, determine the alphabet
    # we default to an generic alphabet if we don't have a
    # seq type or have strange sequence information.
    seq_alphabet = Alphabet.generic_alphabet

    # now set the sequence
    sequence = "".join(self._seq_data)

    # Warn (rather than fail) when a non-empty sequence disagrees with the
    # declared length.
    if (
        self._expected_size is not None
        and len(sequence) != 0
        and self._expected_size != len(sequence)
    ):
        import warnings
        from Bio import BiopythonParserWarning

        warnings.warn(
            "Expected sequence length %i, found %i (%s)."
            % (self._expected_size, len(sequence), self.data.id),
            BiopythonParserWarning,
        )

    if self._seq_type:
        seq_type_upper = self._seq_type.upper()
        # mRNA is really also DNA, since it is actually cDNA
        if "DNA" in seq_type_upper or "MRNA" in seq_type_upper:
            seq_alphabet = IUPAC.ambiguous_dna
        # are there ever really RNA sequences in GenBank?
        elif "RNA" in seq_type_upper:
            # Even for data which was from RNA, the sequence string
            # is usually given as DNA (T not U).  Bug 2408
            if "T" in sequence and "U" not in sequence:
                seq_alphabet = IUPAC.ambiguous_dna
            else:
                seq_alphabet = IUPAC.ambiguous_rna
        elif "PROTEIN" in seq_type_upper or self._seq_type == "PRT":
            # PRT is used in EMBL-bank for patents
            seq_alphabet = IUPAC.protein  # or extended protein?
        # work around ugly GenBank records which have circular or
        # linear but no indication of sequence type
        elif self._seq_type in ["circular", "linear", "unspecified"]:
            pass
        # we have a bug if we get here
        else:
            raise ValueError(
                "Could not determine alphabet for seq_type %s" % self._seq_type
            )

        # Also save the chromosome layout
        seq_type_lower = self._seq_type.lower()
        if "circular" in seq_type_lower:
            self.data.annotations["topology"] = "circular"
        elif "linear" in seq_type_lower:
            self.data.annotations["topology"] = "linear"

    # Use the single-underscore attribute here: inside a class body
    # ``self.__expected_size`` is name-mangled and would refer to a different
    # attribute from the ``self._expected_size`` used everywhere else in this
    # method (including the UnknownSeq length below).
    if not sequence and self._expected_size:
        self.data.seq = UnknownSeq(self._expected_size, seq_alphabet)
    else:
        self.data.seq = Seq(sequence, seq_alphabet)