def test_repr(self):
    """Smoke test: ``repr`` of a populated SeqList completes without raising."""
    members = [
        Seq(letters, nucleic_alphabet)
        for letters in (
            "ACGTURYBDHVNACGTURYSWKMBDHVN",
            "ACGTURYSWKMBDHVNACGTURYSWKMBDHVN",
            "ACGTURSWKMBDHVNACGTURKMBDHVN",
        )
    ]
    # The text of the representation is not inspected; only success matters.
    repr(SeqList(members))
def test_create(self):
    """A SeqList built from three sequences reports a length of three."""
    raw_sequences = (
        "ACGTURYBDHVNACGTURYSWKMBDHVN",
        "ACGTURYSWKMBDHVNACGTURYSWKMBDHVN",
        "ACGTURSWKMBDHVNACGTURKMBDHVN",
    )
    collection = SeqList([Seq(text, nucleic_alphabet) for text in raw_sequences])
    self.assertEqual(len(collection), 3)
def test_transform(self):
    """Apply a Transform to a sequence and check it refuses a protein sequence."""
    source = Seq("ACGTURYSWKMBDHVN", nucleic_alphabet)
    target = Seq("ACGTTNNNNNNNNNNN", dna_alphabet)
    to_dna = Transform(source, target)

    # The Transform instance is invoked directly, like a function.
    converted = to_dna(Seq("AAAAAR", nucleic_alphabet))
    self.assertEqual(converted.alphabet, dna_alphabet)
    self.assertEqual(converted, Seq("AAAAAN", dna_alphabet))

    # A sequence over the protein alphabet is expected to raise ValueError.
    protein = Seq(protein_alphabet, protein_alphabet)
    self.assertRaises(ValueError, to_dna, protein)
def test_create_empty(self):
    """Grow an initially empty SeqList with append/extend and check the result."""
    first, second, third = (
        Seq(text, nucleic_alphabet)
        for text in (
            "ACGTURYBDHVNACGTURYSWKMBDHVN",
            "ACGTURYSWKMBDHVNACGTURYSWKMBDHVN",
            "ACGTURSWKMBDHVNACGTURKMBDHVN",
        )
    )

    collection = SeqList()
    collection.append(first)
    collection.extend((second, third))

    self.assertEqual(len(collection), 3)
    # After mutation the container's type must still be exactly SeqList.
    self.assertEqual(type(collection), SeqList)
def test_create_seq(self):
    """Seq accepts text drawn from its alphabet and raises ValueError otherwise."""
    self.assertTrue(Seq("alphabet", "alphbet"))
    with self.assertRaises(ValueError):
        Seq("not alphabetic", "alphabet")

    printable = (
        "Any printable Ascii character `1234567890-=~!@#$%^&*()_+{}|[]\\:;'<>?,./QWERTYUIOPASD"
        "FGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm "
    )
    # Every single character must be a member of the generic alphabet ...
    for char in printable:
        self.assertIn(char, generic_alphabet)
    # ... and the string as a whole must build a truthy Seq.
    self.assertTrue(Seq(printable, generic_alphabet))

    # Text containing a NUL character is expected to be rejected.
    with self.assertRaises(ValueError):
        Seq("Not zero. \x00", generic_alphabet)
def test_isaligned(self):
    """Check ``SeqList.isaligned()`` for a matching and a mismatched list alphabet.

    The checks use ``self.assertTrue`` / ``self.assertFalse`` instead of bare
    ``assert`` so they still execute under ``python -O`` and report failures
    through unittest like the neighbouring tests.
    """
    a = Alphabet("ABCD")
    members = [Seq(text, a) for text in ("ABCDD", "AAAAD", "AAABD", "AAACD")]

    # Equal-length sequences whose alphabet is also the list's alphabet.
    self.assertTrue(SeqList(members, a).isaligned())

    # Same sequences, but the list is declared with a different alphabet.
    self.assertFalse(SeqList(members, Alphabet("ABCDE")).isaligned())
def test_create_annotated(self):
    """Metadata keywords given to SeqList are readable back as attributes."""
    members = [
        Seq("ACGTURYBDHVNACGTURYSWKMBDHVN", nucleic_alphabet),
        Seq("ACGTURYSWKMBDHVNACGTURYSWKMBDHVN", nucleic_alphabet),
        Seq("ACGTURSWKMBDHVNACGTURKMBDHVN", nucleic_alphabet),
    ]
    annotated = SeqList(
        members,
        alphabet=nucleic_alphabet,
        name="alsdf",
        description='a',
    )

    self.assertEqual(annotated.name, 'alsdf')
    self.assertEqual(annotated.description, 'a')
    self.assertEqual(annotated.alphabet, nucleic_alphabet)
def test_profile(self):
    """Check per-position counts from ``SeqList.profile()`` and its error cases."""
    a = Alphabet("ABCD")
    aligned = SeqList(
        [Seq(text, a) for text in ("ABCDD", "AAAAD", "AAABD", "AAACD")],
        a,
    )
    tally = aligned.profile()

    # Expected counts, one row per sequence position.
    expected_rows = (
        [4, 0, 0, 0],
        [3, 1, 0, 0],
        [3, 0, 1, 0],
        [1, 1, 1, 1],
        [0, 0, 0, 4],
    )
    for position, row in enumerate(expected_rows):
        self.assertEqual(list(tally[position]), row)
    # A single cell can also be addressed by (position, letter).
    self.assertEqual(tally[4, 'D'], 4)

    # Sequences of unequal length are expected to raise ValueError.
    ragged = SeqList([Seq("AAACD", a), Seq("AAACDA", a)], a)
    self.assertRaises(ValueError, ragged.profile)

    # A list created without an alphabet is expected to raise ValueError too.
    bare = SeqList([Seq("AAACD", a), Seq("AAACD", a)])
    self.assertRaises(ValueError, bare.profile)
def test_tally(self):
    """Check ``SeqList.tally()`` totals and its ValueError without an alphabet.

    The count comparison goes through ``self.assertEqual`` rather than a bare
    ``assert`` so it is not stripped under ``python -O`` and a failure shows
    the differing values.
    """
    s0 = Seq("ACTTT", nucleic_alphabet)
    s1 = Seq("ACCCC", nucleic_alphabet)
    s2 = Seq("GGGG", nucleic_alphabet)
    seqs = SeqList([s0, s1, s2], nucleic_alphabet)

    counts = seqs.tally()
    # The inputs hold 2 A, 5 C, 4 G and 3 T; every remaining slot must be zero.
    self.assertEqual(
        counts,
        [2, 5, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    )

    # A list created without an alphabet is expected to raise ValueError.
    seqs = SeqList(
        [Seq("AAACD", nucleic_alphabet), Seq("AAACD", nucleic_alphabet)])
    self.assertRaises(ValueError, seqs.tally)
def test_reduced_protein_alphabets(self):
    """Smoke test: every reduced-alphabet transform can be applied to a sequence."""
    residues = (
        "ENHGGKVALKTHCGKYLSIGDHKQVYLSHHLHGDHSLFHLEHHGGKVSIKGHHHHYISADHHGHVSTKEHHDHDT"
        "TFEEIII"
    )
    seq = Seq(residues, reduced_protein_alphabet)
    # Outputs are discarded; the test only requires that no transform raises.
    for transform in reduced_protein_alphabets.values():
        transform(seq)
def test_seg_invalid(self):
    """mask_low_complexity must raise ValueError for each invalid argument triple."""
    seq = Seq("KTHCGKYLSIGDHKQVYLSHH", protein_alphabet)
    # Positional arguments passed after the sequence; one rejected triple per row.
    rejected = (
        (12, -1, 0),
        (-1, 0, 0),
        (12, 1, 10),
        (6, 12, 13),
        (6, 2.0, 1.9),
    )
    for arguments in rejected:
        self.assertRaises(ValueError, mask_low_complexity, seq, *arguments)
def main():
    """Read sequences, apply the transformations chosen by the options, write them.

    Steps run in a fixed order: low-complexity masking, random subsampling,
    reversal, then complementing. Each step runs only when its option is set.
    """
    # ------ Parse Command line ------
    parser = _build_option_parser()
    opts, args = parser.parse_args(sys.argv[1:])
    if args:
        parser.error("Unparsable arguments: %s " % args)

    seqs = opts.reader.read(opts.fin)

    if opts.trans_seg:
        seqs = SeqList(list(map(mask_low_complexity, seqs)))

    if opts.subsample is not None:
        from random import random

        keep_fraction = opts.subsample
        # Each sequence is kept when its own random draw falls below the fraction.
        seqs = SeqList([s for s in seqs if random() < keep_fraction])

    if opts.reverse:
        seqs = SeqList([s.reverse() for s in seqs])

    if opts.complement:
        # Re-create every sequence over the nucleic alphabet before complementing it.
        seqs = SeqList(
            [Seq(s, alphabet=nucleic_alphabet).complement() for s in seqs]
        )

    opts.writer.write(opts.fout, seqs)
def complement(self):
    """Complement nucleic acid sequence.

    Works in place: ``self.array`` is replaced by the array of the reindexed
    object and ``self.alphabets`` ends up as ``(None, <original alphabet>)``.
    """
    from weblogo.seq import Alphabet, Seq

    original = self.alphabet
    # Build an alphabet from the complement of a sequence that spells out the
    # current alphabet.
    complemented = Alphabet(Seq(original, original).complement())

    # Relabel with the complemented alphabet, then reindex against the original
    # one. The relabelling must be in place before reindex() is called.
    self.alphabets = (None, complemented)
    reindexed = self.reindex(original)

    # Restore the original labels and adopt the reindexed data.
    self.alphabets = (None, original)
    self.array = reindexed.array
def test_add(self):
    """Concatenate Seq with Seq and with plain ``str``; check alphabet and equality.

    The ``==`` / ``!=`` operators are exercised explicitly, wrapped in
    ``self.assertTrue`` / ``self.assertFalse`` instead of bare ``assert`` so
    the checks are not stripped under ``python -O`` and fail through unittest.
    """
    s1 = Seq("AAAA", dna_alphabet)
    s2 = Seq("TTTT", dna_alphabet)

    s3 = s1 + s2
    self.assertEqual(s3.alphabet, dna_alphabet)
    self.assertEqual(s3, Seq("AAAATTTT", dna_alphabet))

    # Same letters and alphabet compare equal; a different alphabet or a
    # non-Seq object must compare unequal.
    self.assertTrue(s3 == Seq("AAAATTTT", dna_alphabet))
    self.assertTrue(s3 != Seq("AAAATTTT", protein_alphabet))
    self.assertTrue(s3 != "not a seq")

    # A plain string may appear on either side of ``+``.
    s4 = "AA"
    s5 = s4 + s1
    s6 = s1 + s4
    self.assertEqual(s5.alphabet, s6.alphabet)
    self.assertEqual(s5, s6)
    self.assertTrue(s5 == s6)
    self.assertFalse(s5 != s6)
def test_words(self):
    """Enumerate fixed-length words of a sequence over the unambiguous DNA alphabet."""
    s = Seq("AGTCAGCTACGACGcgcx", dna_alphabet)

    pairs = list(s.words(2, unambiguous_dna_alphabet))
    self.assertEqual(len(pairs), len(s) - 2)
    expected_pairs = [
        "AG", "GT", "TC", "CA", "AG", "GC", "CT", "TA",
        "AC", "CG", "GA", "AC", "CG", "GC", "CG", "GC",
    ]
    self.assertEqual(pairs, expected_pairs)

    # A word as long as the whole sequence yields nothing ...
    whole = list(s.words(len(s), unambiguous_dna_alphabet))
    self.assertEqual(whole, [])

    # ... while one letter shorter yields a single upper-cased word.
    nearly_whole = list(s.words(len(s) - 1, unambiguous_dna_alphabet))
    self.assertEqual(nearly_whole, ["AGTCAGCTACGACGCGC"])

    # Word length far beyond the sequence: only checks that iteration completes.
    list(s.words(200, unambiguous_dna_alphabet))
def test_segging(self):
    """Run mask_low_complexity with default, zero and 4.3 thresholds."""
    before = (
        "mgnrafkshhghflsaegeavkthhghhdhhthfhvenhggkvalkthcgkylsigdhkqvylshhlhgdhslfhlehhg"
        "gkvsikghhhhyisadhhghvstkehhdhdttfeeiii"
    ).upper()
    after = (
        "MGNRAFKSHHGHFLSAEGEAVxxxxxxxxxxxxxxxENHGGKVALKTHCGKYLSIGDHKQVYLSHHLHGDHSLFHLEHHGG"
        "KVSIKGHHHHYISADHHGHVSTKEHHDHDTTFEEIII"
    ).upper()

    original = Seq(before, protein_alphabet)
    expected_default = Seq(after, protein_alphabet)
    fully_masked = Seq('X' * len(original), protein_alphabet)

    # Default parameters mask one stretch of the sequence.
    self.assertEqual(expected_default, mask_low_complexity(original))

    # With both thresholds at zero, nothing should be masked.
    untouched = mask_low_complexity(original, 12, 0, 0)
    self.assertEqual(original, untouched)

    # With both thresholds at 4.3, everything should be masked.
    masked = mask_low_complexity(original, 12, 4.3, 4.3)
    self.assertEqual(masked, fully_masked)

    # Width far larger than the sequence: only checks that the call completes.
    mask_low_complexity(original, 100000, 4.3, 4.3)
def test_join(self):
    """Joining Seq and ``str`` items on an empty Seq concatenates them in order."""
    separator = Seq("", dna_alphabet)
    # The last piece is deliberately a plain string rather than a Seq.
    pieces = [Seq("AAAA", dna_alphabet), Seq("TTTT", dna_alphabet), "GGGG"]

    joined = separator.join(pieces)
    self.assertEqual(joined, Seq("AAAATTTTGGGG", dna_alphabet))
def test_ords(self):
    """Call ``SeqList.ords()`` with the list's alphabet, an explicit one, and none."""
    s0, s1, s2 = (
        Seq(text, nucleic_alphabet)
        for text in (
            "ACGTURYBDHVNACGTURYSWKMBDHVN",
            "ACGTURYSDHVNACGTURYSWKMBDHVN",
            "ACGTURSWKMBDHVNACGTURKMBDHVN",
        )
    )

    with_alphabet = SeqList([s0, s1, s2], nucleic_alphabet)
    # No arguments: the list has its own alphabet to fall back on.
    with_alphabet.ords()
    # self.assertEqual( a.shape, (3, 28) )

    # FIXME? Fails if seqs are of different lengths; this check is left disabled:
    # s3 = Seq("ACGTUR", nucleic_alphabet )
    # seqs2 = SeqList( [ s0,s1,s3,s2], nucleic_alphabet)
    # self.assertRaises(ValueError, seqs2.ords )

    # Alphabet passed explicitly as an argument.
    with_alphabet.ords(nucleic_alphabet)

    # A list without an alphabet works when one is supplied to the call ...
    without_alphabet = SeqList([s0, s1, s2])
    without_alphabet.ords(alphabet=Alphabet("ABC"))

    # ... and is expected to raise ValueError when none is available at all.
    self.assertRaises(ValueError, without_alphabet.ords)
def test_words(self):
    """Check the words produced for lengths 2, len(s), len(s) - 1 and 200."""
    s = Seq("AGTCAGCTACGACGcgcx", dna_alphabet)
    total = len(s)

    # Length-2 words, in sequence order.
    dinucleotides = list(s.words(2, unambiguous_dna_alphabet))
    self.assertEqual(len(dinucleotides), total - 2)
    self.assertEqual(
        dinucleotides,
        [
            'AG', 'GT', 'TC', 'CA',
            'AG', 'GC', 'CT', 'TA',
            'AC', 'CG', 'GA', 'AC',
            'CG', 'GC', 'CG', 'GC',
        ],
    )

    # Full-length request: no words at all.
    self.assertEqual(list(s.words(total, unambiguous_dna_alphabet)), [])

    # One short of full length: exactly one word, reported in upper case.
    self.assertEqual(
        list(s.words(total - 1, unambiguous_dna_alphabet)),
        ["AGTCAGCTACGACGCGC"],
    )

    # Oversized request; the result is not inspected, it only must not raise.
    list(s.words(200, unambiguous_dna_alphabet))
def test_which_alphabet(self):
    """``Alphabet.which()`` must pick the expected alphabet for literal and file input.

    Each data stream is closed in a ``finally`` clause, so a failing read or
    assertion no longer leaves the remaining streams open. Comparisons use
    ``self.assertEqual`` instead of a bare ``assert``, which would be stripped
    under ``python -O``.
    """
    guessed = Alphabet.which(Seq("ARNDCQEGHILKMFPSTWYVX"))
    self.assertEqual(guessed, unambiguous_protein_alphabet)

    # (data file name, alphabet Alphabet.which() is expected to report)
    cases = (
        ('cap.fa', unambiguous_dna_alphabet),
        ('cox2.msf', unambiguous_protein_alphabet),
        ('Rv3829c.fasta', unambiguous_protein_alphabet),
        ('chain_B.fasta', unambiguous_protein_alphabet),
    )
    for filename, expected in cases:
        stream = data_stream(filename)
        try:
            # Keep the stream open until the sequences have been inspected.
            self.assertEqual(Alphabet.which(seq_io.read(stream)), expected)
        finally:
            stream.close()
def test_reverse(self):
    """Reversing twice restores the sequence; reversing once flips the letters."""
    forward = Seq("ACGT", dna_alphabet)

    round_trip = forward.reverse().reverse()
    self.assertEqual(forward, round_trip)

    backward = forward.reverse()
    self.assertEqual(backward, Seq("TGCA", dna_alphabet))
def test_tally_nonalphabetic(self):
    """Tallying against a two-letter alphabet returns one count per letter."""
    sequence = Seq("AGTCAGCTACGACGCGC", dna_alphabet)
    counts = sequence.tally(Alphabet("AC"))

    self.assertEqual(2, len(counts))
    # The sequence contains four A and six C.
    self.assertEqual(list(counts), [4, 6])
def test_words2(self):
    """``word_count(2)`` yields the expected counts, in the order it returns them."""
    sequence = Seq("AGTCAGCTACGACGCGC", unambiguous_dna_alphabet)
    word_counts = sequence.word_count(2)

    # Transpose the returned items; the second column holds the counts.
    columns = list(zip(*word_counts))
    counts = columns[1]
    self.assertEqual(counts, (2, 2, 1, 3, 1, 1, 3, 1, 1, 1))
def test_getslice(self):
    """Slicing a Seq gives a result that carries the parent's alphabet."""
    parent = Seq("AGTCAGCTACGACGCGC", dna_alphabet)
    # Local is named ``piece`` so the builtin ``slice`` is not shadowed.
    piece = parent[2:4]
    self.assertEqual(parent.alphabet, piece.alphabet)
def test_alphabet_chrs(self):
    """``Alphabet.chrs()`` maps a tuple of indices back to a Seq of letters."""
    a = Alphabet("alph")
    # Indices 2, 2, 1, 0 into "alph" spell "ppla".
    decoded = a.chrs((2, 2, 1, 0))
    self.assertEqual(Seq("ppla", a), decoded)
def test_create_annotated(self):
    """A Seq keeps the name and description it was created with, and its text."""
    text = "ACGTURYSWKMBDHVNACGTURYSWKMBDHVNAAAAA"
    annotated = Seq(text, nucleic_alphabet, name="ID", description="DESCRIPTION")

    self.assertEqual(annotated.name, "ID")
    self.assertEqual(annotated.description, "DESCRIPTION")
    # Converting back to str must give the original text.
    self.assertEqual(text, str(annotated))
def test_ungap(self):
    """``ungap()`` strips '-', '~' and '.' characters, leaving only the letters."""
    for gapped in ("T-T", "T-~---T...~~~--"):
        stripped = Seq(gapped, dna_alphabet).ungap()
        self.assertEqual(str(stripped), 'TT')
def test_repr(self):
    """Smoke test: ``repr`` of a Seq completes without raising."""
    sequence = Seq("AAAA", dna_alphabet)
    # The representation itself is not inspected.
    repr(sequence)
def test_str(self):
    """``str()`` of a Seq returns the text it was built from, case included."""
    # The second input is mixed-case and must come back unchanged.
    for text in ("AGCTA", "AgcTA"):
        self.assertEqual(str(Seq(text, dna_alphabet)), text)
def test_tostring(self):
    """``tostring()`` returns the text the Seq was built from, case intact."""
    text = "AgcTAAAA"
    self.assertEqual(Seq(text, dna_alphabet).tostring(), text)