def test_k_word_frequencies(self):
    # Every expectation is a defaultdict(int) mapping each k-word to the
    # fraction of all k-words it accounts for.
    seq = self.b1

    # overlapping = True
    single_letters = defaultdict(int)
    single_letters.update(
        {'A': 3 / 7., 'C': 1 / 7., 'G': 1 / 7., 'T': 2 / 7.})
    self.assertEqual(seq.k_word_frequencies(1, overlapping=True),
                     single_letters)

    overlapping_triplets = defaultdict(int)
    overlapping_triplets.update(
        dict.fromkeys(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'], 1 / 5.))
    self.assertEqual(seq.k_word_frequencies(3, overlapping=True),
                     overlapping_triplets)

    # overlapping = False
    disjoint_triplets = defaultdict(int)
    disjoint_triplets.update(dict.fromkeys(['GAT', 'TAC'], 1 / 2.))
    self.assertEqual(seq.k_word_frequencies(3, overlapping=False),
                     disjoint_triplets)

    whole_sequence = defaultdict(int)
    whole_sequence['GATTACA'] = 1.0
    self.assertEqual(seq.k_word_frequencies(7, overlapping=False),
                     whole_sequence)

    # an empty sequence is expected to produce an empty mapping
    no_words = defaultdict(int)
    empty = BiologicalSequence('')
    self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                     no_words)
def test_k_words(self):
    # k_words is consumed through list() everywhere, so any iterable
    # return type satisfies these assertions.

    # overlapping = True
    self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                     ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
    self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                     ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
    self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                     ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
    self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                     ['GATTACA'])
    self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                     [])

    # overlapping = False
    # (this first case previously passed overlapping=True by mistake, so
    # the non-overlapping k=1 path was never exercised)
    self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                     ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
    self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                     ['GA', 'TT', 'AC'])
    self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                     ['GAT', 'TAC'])
    self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                     ['GATTACA'])
    self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                     [])

    # error on invalid k
    # The k_words(...) call is evaluated before assertRaises runs, so the
    # ValueError must surface while list() consumes the result.
    self.assertRaises(ValueError, list, self.b1.k_words(0))
    self.assertRaises(ValueError, list, self.b1.k_words(-42))

    # tests with different sequences
    self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                     ['HE.', '.--', '..L'])
    b = BiologicalSequence('')
    self.assertEqual(list(b.k_words(3)), [])
def test_k_word_frequencies(self):
    # Every expectation is a defaultdict(int) mapping each k-word to the
    # fraction of all k-words it accounts for.
    seq = self.b1

    # overlapping = True
    single_letters = defaultdict(int)
    single_letters.update(
        {'A': 3/7., 'C': 1/7., 'G': 1/7., 'T': 2/7.})
    self.assertEqual(seq.k_word_frequencies(1, overlapping=True),
                     single_letters)

    overlapping_triplets = defaultdict(int)
    for word in ('GAT', 'ATT', 'TTA', 'TAC', 'ACA'):
        overlapping_triplets[word] = 1/5.
    self.assertEqual(seq.k_word_frequencies(3, overlapping=True),
                     overlapping_triplets)

    # overlapping = False
    disjoint_triplets = defaultdict(int)
    for word in ('GAT', 'TAC'):
        disjoint_triplets[word] = 1/2.
    self.assertEqual(seq.k_word_frequencies(3, overlapping=False),
                     disjoint_triplets)

    whole_sequence = defaultdict(int)
    whole_sequence['GATTACA'] = 1.0
    self.assertEqual(seq.k_word_frequencies(7, overlapping=False),
                     whole_sequence)

    # an empty sequence is expected to produce an empty mapping
    no_words = defaultdict(int)
    empty = BiologicalSequence('')
    self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                     no_words)
def setUp(self):
    """Build the raw strings, FASTA text and sequence objects for the tests."""
    self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']

    # FASTA renderings: numeric labels, named labels, and named labels with
    # each sequence split over two lines.
    self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
    self.fasta_with_label = (
        '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU')
    self.fasta_with_label_lw2 = (
        '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU')

    self.alignment_dict = dict([
        ('1st', 'AAAA'),
        ('2nd', 'CCCC'),
        ('3rd', 'GGGG'),
        ('4th', 'UUUU'),
    ])

    # The same two records wrapped in two different sequence classes.
    records = [('ACTCGAGATC', 'seq1'), ('GGCCT', 'seq2')]
    self.sequence_objects_a = [
        DNASequence(residues, label) for residues, label in records]
    self.sequence_objects_b = [
        BiologicalSequence(residues, label) for residues, label in records]

    aligned = [
        DNASequence("ACC--G-GGTA..", id="seq1"),
        DNASequence("TCC--G-GGCA..", id="seqs2"),
    ]
    self.alignment = Alignment(aligned)
def test_init_varied_input(self):
    # init as string
    from_string = BiologicalSequence('ACCGGXZY')
    self.assertEqual(str(from_string), 'ACCGGXZY')
    self.assertEqual(from_string.id, "")
    self.assertEqual(from_string.description, "")

    # init as string with optional values (passed positionally)
    annotated = BiologicalSequence('ACCGGXZY', 'test-seq-1',
                                   'The first test sequence')
    self.assertEqual(str(annotated), 'ACCGGXZY')
    self.assertEqual(annotated.id, "test-seq-1")
    self.assertEqual(annotated.description, "The first test sequence")

    # test init as a different string
    other = BiologicalSequence('WRRTY')
    self.assertEqual(str(other), 'WRRTY')

    # init as list, then as tuple: both must behave like the plain string
    for container in (list, tuple):
        from_container = BiologicalSequence(container('ACCGGXZY'))
        self.assertEqual(str(from_container), 'ACCGGXZY')
        self.assertEqual(from_container.id, "")
        self.assertEqual(from_container.description, "")
def test_eq(self):
    # each fixture compares equal to itself
    for seq in (self.b1, self.b2, self.b3):
        self.assertTrue(seq == seq)

    # distinct fixtures compare unequal
    for left, right in ((self.b1, self.b3),
                        (self.b1, self.b2),
                        (self.b2, self.b3)):
        self.assertTrue(left != right)

    # identicial sequences of the same type are equal, even if they have
    # different ids and/or descriptions
    plain_a = BiologicalSequence('ACGT')
    plain_b = BiologicalSequence('ACGT')
    self.assertTrue(plain_a == plain_b)

    id_a = BiologicalSequence('ACGT', id='a')
    id_b = BiologicalSequence('ACGT', id='b')
    self.assertTrue(id_a == id_b)

    desc_c = BiologicalSequence('ACGT', description='c')
    desc_d = BiologicalSequence('ACGT', description='d')
    self.assertTrue(desc_c == desc_d)

    both_ac = BiologicalSequence('ACGT', id='a', description='c')
    both_bd = BiologicalSequence('ACGT', id='b', description='d')
    self.assertTrue(both_ac == both_bd)

    # different type causes sequences to not be equal
    generic = BiologicalSequence('ACGT')
    nucleotide = NucleotideSequence('ACGT')
    self.assertFalse(generic == nucleotide)
def get_representatives(mapping, seqs):
    """Yield an upper-cased sequence for every record whose label is mapped.

    mapping: The prefix mapping dict; each value must support len()

    seqs: iterable of (label, seq) pairs

    Records whose label is not a key of mapping are skipped. The id of each
    yielded sequence is "<label>: <n>", where n is len(mapping[label]) + 1.
    """
    for label, seq in seqs:
        if label not in mapping:
            continue
        cluster_size = len(mapping[label]) + 1
        representative = BiologicalSequence(
            seq, id="%s: %d" % (label, cluster_size))
        yield representative.upper()
def get_representatives(mapping, seqs):
    """Generate upper-cased sequences for the labels present in mapping.

    mapping: The prefix mapping dict; each value must support len()

    seqs: iterable of (label, seq) pairs

    Only records whose label is a key of mapping produce output. Each
    yielded sequence gets the id "<label>: <n>", with n being
    len(mapping[label]) + 1.
    """
    mapped_records = ((label, seq) for (label, seq) in seqs
                      if label in mapping)
    for label, seq in mapped_records:
        new_id = "%s: %d" % (label, len(mapping[label]) + 1)
        yield BiologicalSequence(seq, id=new_id).upper()
def test_init_with_validation(self):
    # "ACC" is expected to be rejected when validation is requested
    with self.assertRaises(BiologicalSequenceError):
        BiologicalSequence("ACC", validate=True)

    # no error raised when only allowed characters are passed
    BiologicalSequence("..--..", validate=True)
def test_distance(self):
    # note that test_hamming_distance covers default behavior more
    # extensively
    def dumb_distance(x, y):
        # stand-in metric: ignores its inputs entirely
        return 42

    reference = self.b1
    self.assertEqual(reference.distance(reference), 0.0)

    one_mismatch = BiologicalSequence('GATTACC')
    self.assertEqual(reference.distance(one_mismatch), 1./7)

    # a caller-supplied distance_fn replaces the default metric
    custom = reference.distance(reference, distance_fn=dumb_distance)
    self.assertEqual(custom, 42)
def write_Fasta_from_name_seq_pairs(name_seqs, fh):
    """Write every (name, seq) pair to fh as a FASTA record.

    name_seqs: iterable of (name, seq) pairs such as from parse_fasta

    fh: an open filehandle; a ValueError is raised if it is None

    Each record is the to_fasta() rendering of the pair followed by a
    newline.
    """
    if fh is None:
        raise ValueError("Need open file handle to write to.")

    for name, seq in name_seqs:
        record = BiologicalSequence(seq, identifier=name)
        fh.write("%s\n" % record.to_fasta())
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    fasta_fh: a fasta file with labels as produced by Qiime's
              split_libraries.py, used to replace flowgram id with the
              unique se_sample_id
    mapping_fh: The cluster mapping from the denoiser.py
    denoised_seqs_fh: the Fasta output files from denoiser.py
    otu_picker_otu_map_fh: cluster map from otu picker on denoised_seqs_fh
    out_dir: output directory

    Writes two files into out_dir:
      denoised_otu_map.txt  - one tab-separated line per OTU: the OTU name
                              followed by the sample ids of all its reads
      denoised_all.fasta    - the denoised sequences, relabelled as
                              "<sample_id> <original id>"

    If an id has no entry in the split_libraries mapping, a message is
    printed and the process is terminated via exit().
    """
    # read in mapping from split_library file
    labels = imap(lambda a_b: a_b[0], parse_fasta(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)

    denoiser_mapping = read_denoiser_mapping(mapping_fh)
    get_sample_id = sample_id_mapping.get

    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    # (the with-block guarantees the file is flushed and closed, including
    # on the exit() path below)
    with open(out_dir + "/denoised_otu_map.txt", "w") as otu_fh:
        for otu_line in otu_picker_otu_map_fh:
            otu_split = otu_line.split()
            if not otu_split:
                # blank line (e.g. trailing newline): nothing to map
                continue
            otu = otu_split[0]
            ids = otu_split[1:]

            # concat lists
            # make sure the biggest one is first for pick_repr
            all_ids = sort_ids(ids, denoiser_mapping)
            all_ids.extend([member
                            for seq_id in ids
                            for member in denoiser_mapping[seq_id]])
            try:
                otu_fh.write("%s\t" % otu +
                             "\t".join(map(get_sample_id, all_ids)) + "\n")
            except TypeError:
                # get returns None if denoiser_mapping id not present in
                # sample_id_mapping, which makes str.join raise TypeError
                print("Found id in denoiser output, which was not found in "
                      "split_libraries output FASTA file. Wrong file?")
                exit()

    with open(out_dir + "/denoised_all.fasta", "w") as fasta_out_fh:
        for label, seq in parse_fasta(denoised_seqs_fh):
            # the first whitespace-separated token of the label is the id
            seq_id = label.split()[0]
            newlabel = "%s %s" % (sample_id_mapping[seq_id], seq_id)
            fasta_out_fh.write(
                BiologicalSequence(seq, id=newlabel).to_fasta())
def setUp(self):
    # Fixtures shared by the tests; b7 consists solely of '.'/'-'
    # characters and b8 mixes them with letters.
    make = BiologicalSequence

    self.b1 = make('GATTACA')
    self.b2 = make('ACCGGTACC',
                   id="test-seq-2",
                   description="A test sequence")
    self.b3 = make('GREG',
                   id="test-seq-3",
                   description="A protein sequence")
    self.b4 = make('PRTEIN', id="test-seq-4")
    self.b5 = make('LLPRTEIN', description="some description")
    self.b6 = make('ACGTACGTACGT')
    self.b7 = make('..--..')
    self.b8 = make('HE..--..LLO')
def setUp(self):
    """ Create the sequence fixtures b1 through b8 used by the tests """
    make = BiologicalSequence

    self.b1 = make('GATTACA')
    self.b2 = make('ACCGGTACC',
                   identifier="test-seq-2",
                   description="A test sequence")
    self.b3 = make('GREG',
                   identifier="test-seq-3",
                   description="A protein sequence")
    self.b4 = make('PRTEIN', identifier="test-seq-4")
    self.b5 = make('LLPRTEIN', description="some description")
    self.b6 = make('ACGTACGTACGT')
    # b7 consists solely of '.'/'-' characters; b8 mixes them with letters
    self.b7 = make('..--..')
    self.b8 = make('HE..--..LLO')
def test_gap_maps(self):
    # in sequence with no gaps, the gap_maps are identical
    identity = [0, 1, 2, 3, 4, 5, 6]
    self.assertEqual(self.b1.gap_maps(), (identity, list(identity)))

    # in sequence with all gaps, the map of degapped to gapped is the empty
    # list (bc its length is 0), and the map of gapped to degapped is all
    # None
    all_gaps_expected = ([], [None] * 6)
    self.assertEqual(self.b7.gap_maps(), all_gaps_expected)

    mixed_expected = (
        [0, 1, 8, 9, 10],
        [0, 1, None, None, None, None, None, None, 2, 3, 4])
    self.assertEqual(self.b8.gap_maps(), mixed_expected)

    # example from the gap_maps doc string
    docstring_example = BiologicalSequence('-ACCGA-TA-')
    docstring_expected = (
        [1, 2, 3, 4, 5, 7, 8],
        [None, 0, 1, 2, 3, 4, None, 5, 6, None])
    self.assertEqual(docstring_example.gap_maps(), docstring_expected)
class BiologicalSequenceTests(TestCase):
    """ Tests of the BiologicalSequence class """

    def setUp(self):
        """ Initialize values to be used in tests """
        self.b1 = BiologicalSequence('GATTACA')
        self.b2 = BiologicalSequence(
            'ACCGGTACC', identifier="test-seq-2",
            description="A test sequence")
        self.b3 = BiologicalSequence(
            'GREG', identifier="test-seq-3",
            description="A protein sequence")
        self.b4 = BiologicalSequence(
            'PRTEIN', identifier="test-seq-4")
        self.b5 = BiologicalSequence(
            'LLPRTEIN', description="some description")
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        # b7 consists solely of '.'/'-' characters; b8 mixes them with
        # letters
        self.b7 = BiologicalSequence('..--..')
        self.b8 = BiologicalSequence('HE..--..LLO')

    def test_init(self):
        """ Initialization functions as expected with varied input types
        """
        # init as string
        b = BiologicalSequence('ACCGGXZY')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.identifier, "")
        self.assertEqual(b.description, "")

        # init as string with optional values
        b = BiologicalSequence(
            'ACCGGXZY', 'test-seq-1', 'The first test sequence')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.identifier, "test-seq-1")
        self.assertEqual(b.description, "The first test sequence")

        # test init as a different string
        b = BiologicalSequence('WRRTY')
        self.assertEqual(str(b), 'WRRTY')

        # init as list
        b = BiologicalSequence(list('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.identifier, "")
        self.assertEqual(b.description, "")

        # init as tuple
        b = BiologicalSequence(tuple('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.identifier, "")
        self.assertEqual(b.description, "")

    def test_init_validate(self):
        """ initialization with validation functions as expected
        """
        self.assertRaises(BiologicalSequenceError,
                          BiologicalSequence, "ACC", validate=True)
        # no error raised when only allowed characters are passed
        BiologicalSequence("..--..", validate=True)

    def test_contains(self):
        """ contains functions as expected
        """
        self.assertTrue('G' in self.b1)
        self.assertFalse('g' in self.b1)

    def test_eq(self):
        """ equality functions as expected
        """
        self.assertTrue(self.b1 == self.b1)
        self.assertTrue(self.b2 == self.b2)
        self.assertTrue(self.b3 == self.b3)

        self.assertTrue(self.b1 != self.b3)
        self.assertTrue(self.b1 != self.b2)
        self.assertTrue(self.b2 != self.b3)

        # identicial sequences of the same type are equal, even if they have
        # different identifiers and/or descriptions
        self.assertTrue(
            BiologicalSequence('ACGT') == BiologicalSequence('ACGT'))
        self.assertTrue(
            BiologicalSequence('ACGT', identifier='a') ==
            BiologicalSequence('ACGT', identifier='b'))
        self.assertTrue(
            BiologicalSequence('ACGT', description='c') ==
            BiologicalSequence('ACGT', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', identifier='a', description='c') ==
            BiologicalSequence('ACGT', identifier='b', description='d'))

        # different type causes sequences to not be equal
        self.assertFalse(
            BiologicalSequence('ACGT') == NucleotideSequence('ACGT'))

    def test_getitem(self):
        """ getitem functions as expected
        """
        self.assertEqual(self.b1[0], BiologicalSequence('G'))
        self.assertEqual(self.b1[:], BiologicalSequence('GATTACA'))
        self.assertEqual(self.b1[::-1], BiologicalSequence('ACATTAG'))

    def test_iter(self):
        """ iter functions as expected
        """
        b1_iter = iter(self.b1)
        for actual, expected in zip(b1_iter, "GATTACA"):
            self.assertEqual(actual, expected)

        # builtin next() works on Python 2.6+ and 3, unlike the
        # Python-2-only iterator.next attribute
        self.assertRaises(StopIteration, next, b1_iter)

    def test_len(self):
        """ len functions as expected
        """
        self.assertEqual(len(self.b1), 7)
        self.assertEqual(len(self.b2), 9)
        self.assertEqual(len(self.b3), 4)

    def test_repr(self):
        """ repr functions as expected
        """
        self.assertEqual(repr(self.b1),
                         "<BiologicalSequence: GATTACA (length: 7)>")
        self.assertEqual(repr(self.b6),
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        """ reversed functions as expected
        """
        b1_reversed = reversed(self.b1)
        for actual, expected in zip(b1_reversed, "ACATTAG"):
            self.assertEqual(actual, expected)

        # builtin next() works on Python 2.6+ and 3, unlike the
        # Python-2-only iterator.next attribute
        self.assertRaises(StopIteration, next, b1_reversed)

    def test_str(self):
        """ str functions as expected
        """
        self.assertEqual(str(self.b1), "GATTACA")
        self.assertEqual(str(self.b2), "ACCGGTACC")
        self.assertEqual(str(self.b3), "GREG")

    def test_alphabet(self):
        """ alphabet property functions as expected
        """
        self.assertEqual(self.b1.alphabet(), set())

    def test_description(self):
        """ description property functions as expected
        """
        self.assertEqual(self.b1.description, "")
        self.assertEqual(self.b2.description, "A test sequence")
        self.assertEqual(self.b3.description, "A protein sequence")

    def test_gap_alphabet(self):
        """ gap_alphabet property functions as expected
        """
        self.assertEqual(self.b1.gap_alphabet(), set('-.'))

    def test_identifier(self):
        """ identifier property functions as expected
        """
        self.assertEqual(self.b1.identifier, "")
        self.assertEqual(self.b2.identifier, "test-seq-2")
        self.assertEqual(self.b3.identifier, "test-seq-3")

    def test_count(self):
        """ count functions as expected
        """
        self.assertEqual(self.b1.count('A'), 3)
        self.assertEqual(self.b1.count('T'), 2)
        self.assertEqual(self.b1.count('TT'), 1)

    def test_degap(self):
        """ degap functions as expected
        """
        self.assertEqual(self.b1.degap(), self.b1)
        self.assertEqual(self.b7.degap(), BiologicalSequence(''))
        self.assertEqual(self.b8.degap(), BiologicalSequence('HELLO'))

    def test_distance(self):
        """ distance functions as expected
        """
        # note that test_hamming_distance covers default behavior more
        # extensively
        self.assertEqual(self.b1.distance(self.b1), 0.0)
        self.assertEqual(
            self.b1.distance(BiologicalSequence('GATTACC')), 1./7)

        def dumb_distance(x, y):
            # stand-in metric: ignores its inputs entirely
            return 42

        self.assertEqual(
            self.b1.distance(self.b1, distance_fn=dumb_distance), 42)

    def test_fraction_diff(self):
        """ fraction_diff functions as expected
        """
        # assertAlmostEqual with places=5: with assertEqual the trailing 5
        # would be taken as the failure message and the floats compared
        # exactly
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')),
            1. / 7., 5)

    def test_fraction_same(self):
        """ fraction_same functions as expected
        """
        self.assertAlmostEqual(self.b1.fraction_same(self.b1), 1., 5)
        self.assertAlmostEqual(
            self.b1.fraction_same(BiologicalSequence('GATTACC')),
            6. / 7., 5)

    def test_gap_maps(self):
        """ gap_maps functions as expected
        """
        # in sequence with no gaps, the gap_maps are identical
        self.assertEqual(self.b1.gap_maps(),
                         ([0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6]))
        # in sequence with all gaps, the map of degapped to gapped is the
        # empty list (bc its length is 0), and the map of gapped to
        # degapped is all None
        self.assertEqual(self.b7.gap_maps(),
                         ([], [None, None, None, None, None, None]))

        self.assertEqual(self.b8.gap_maps(),
                         ([0, 1, 8, 9, 10],
                          [0, 1, None, None, None, None, None, None,
                           2, 3, 4]))

        # example from the gap_maps doc string
        self.assertEqual(BiologicalSequence('-ACCGA-TA-').gap_maps(),
                         ([1, 2, 3, 4, 5, 7, 8],
                          [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        """ gap_vector functions as expected
        """
        self.assertEqual(self.b1.gap_vector(),
                         [False] * len(self.b1))
        self.assertEqual(self.b7.gap_vector(),
                         [True] * len(self.b7))
        self.assertEqual(self.b8.gap_vector(),
                         [False, False, True, True, True, True,
                          True, True, False, False, False])

    def test_unsupported_characters(self):
        """ unsupported_characters functions as expected
        """
        self.assertEqual(self.b1.unsupported_characters(), set('GATC'))
        self.assertEqual(self.b7.unsupported_characters(), set())

    def test_has_unsupported_characters(self):
        """ has_unsupported_characters functions as expected
        """
        self.assertTrue(self.b1.has_unsupported_characters())
        self.assertFalse(self.b7.has_unsupported_characters())

    def test_index(self):
        """ index functions as expected
        """
        self.assertEqual(self.b1.index('G'), 0)
        self.assertEqual(self.b1.index('A'), 1)
        self.assertEqual(self.b1.index('AC'), 4)
        self.assertRaises(ValueError, self.b1.index, 'x')

    def test_is_gap(self):
        """ is_gap functions as expected
        """
        self.assertTrue(self.b1.is_gap('.'))
        self.assertTrue(self.b1.is_gap('-'))
        self.assertFalse(self.b1.is_gap('A'))
        self.assertFalse(self.b1.is_gap('x'))
        self.assertFalse(self.b1.is_gap(' '))
        self.assertFalse(self.b1.is_gap(''))

    def test_is_gapped(self):
        """ is_gapped functions as expected
        """
        self.assertFalse(self.b1.is_gapped())
        self.assertFalse(self.b2.is_gapped())
        self.assertTrue(self.b7.is_gapped())
        self.assertTrue(self.b8.is_gapped())

    def test_is_valid(self):
        """ is_valid functions as expected
        """
        self.assertFalse(self.b1.is_valid())
        self.assertTrue(self.b7.is_valid())

    def test_to_fasta(self):
        """ to_fasta functions as expected
        """
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        self.assertEqual(self.b1.to_fasta(terminal_character=""),
                         ">\nGATTACA")
        self.assertEqual(self.b2.to_fasta(),
                         ">test-seq-2 A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b3.to_fasta(),
                         ">test-seq-3 A protein sequence\nGREG\n")
        self.assertEqual(self.b4.to_fasta(),
                         ">test-seq-4\nPRTEIN\n")
        self.assertEqual(self.b5.to_fasta(),
                         "> some description\nLLPRTEIN\n")

        # alt parameters
        self.assertEqual(self.b2.to_fasta(field_delimiter=":"),
                         ">test-seq-2:A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b2.to_fasta(terminal_character="!"),
                         ">test-seq-2 A test sequence\nACCGGTACC!")
        self.assertEqual(
            self.b2.to_fasta(field_delimiter=":", terminal_character="!"),
            ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        """ upper functions as expected
        """
        b = NucleotideSequence('GAt.ACa-', identifier='x',
                               description='42')
        expected = NucleotideSequence('GAT.ACA-', identifier='x',
                                      description='42')
        self.assertEqual(b.upper(), expected)

    def test_lower(self):
        """ lower functions as expected
        """
        b = NucleotideSequence('GAt.ACa-', identifier='x',
                               description='42')
        expected = NucleotideSequence('gat.aca-', identifier='x',
                                      description='42')
        self.assertEqual(b.lower(), expected)
def hamming_distance(s1, s2):
    """Return the default BiologicalSequence distance between s1 and s2.

    Both inputs are wrapped in BiologicalSequence and compared with its
    distance method, without a custom distance_fn.
    """
    return BiologicalSequence(s1).distance(BiologicalSequence(s2))
class BiologicalSequenceTests(TestCase):
    # Tests of the BiologicalSequence class. Fixtures b1..b8 are created in
    # setUp; b7 consists solely of '.'/'-' characters and b8 mixes them
    # with letters.

    def setUp(self):
        self.b1 = BiologicalSequence('GATTACA')
        self.b2 = BiologicalSequence(
            'ACCGGTACC', id="test-seq-2",
            description="A test sequence")
        self.b3 = BiologicalSequence(
            'GREG', id="test-seq-3",
            description="A protein sequence")
        self.b4 = BiologicalSequence(
            'PRTEIN', id="test-seq-4")
        self.b5 = BiologicalSequence(
            'LLPRTEIN', description="some description")
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        self.b7 = BiologicalSequence('..--..')
        self.b8 = BiologicalSequence('HE..--..LLO')

    def test_init_varied_input(self):
        # init as string
        b = BiologicalSequence('ACCGGXZY')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as string with optional values
        b = BiologicalSequence(
            'ACCGGXZY', 'test-seq-1', 'The first test sequence')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "test-seq-1")
        self.assertEqual(b.description, "The first test sequence")

        # test init as a different string
        b = BiologicalSequence('WRRTY')
        self.assertEqual(str(b), 'WRRTY')

        # init as list
        b = BiologicalSequence(list('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as tuple
        b = BiologicalSequence(tuple('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

    def test_init_with_validation(self):
        self.assertRaises(BiologicalSequenceError, BiologicalSequence, "ACC",
                          validate=True)
        try:
            # no error raised when only allowed characters are passed
            BiologicalSequence("..--..", validate=True)
        except BiologicalSequenceError:
            # self.fail reports a readable reason instead of the bare
            # "False is not true" of assertTrue(False)
            self.fail("BiologicalSequenceError raised for a sequence "
                      "containing only gap characters")

    def test_contains(self):
        self.assertTrue('G' in self.b1)
        self.assertFalse('g' in self.b1)

    def test_eq(self):
        self.assertTrue(self.b1 == self.b1)
        self.assertTrue(self.b2 == self.b2)
        self.assertTrue(self.b3 == self.b3)

        self.assertTrue(self.b1 != self.b3)
        self.assertTrue(self.b1 != self.b2)
        self.assertTrue(self.b2 != self.b3)

        # identicial sequences of the same type are equal, even if they have
        # different ids and/or descriptions
        self.assertTrue(
            BiologicalSequence('ACGT') == BiologicalSequence('ACGT'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a') ==
            BiologicalSequence('ACGT', id='b'))
        self.assertTrue(
            BiologicalSequence('ACGT', description='c') ==
            BiologicalSequence('ACGT', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a', description='c') ==
            BiologicalSequence('ACGT', id='b', description='d'))

        # different type causes sequences to not be equal
        self.assertFalse(
            BiologicalSequence('ACGT') == NucleotideSequence('ACGT'))

    def test_getitem(self):
        self.assertEqual(self.b1[0], BiologicalSequence('G'))
        self.assertEqual(self.b1[:], BiologicalSequence('GATTACA'))
        self.assertEqual(self.b1[::-1], BiologicalSequence('ACATTAG'))

    def test_getitem_out_of_range(self):
        with self.assertRaises(IndexError):
            self.b1[42]

    def test_hash(self):
        self.assertTrue(isinstance(hash(self.b1), int))

    def test_iter(self):
        b1_iter = iter(self.b1)
        for actual, expected in zip(b1_iter, "GATTACA"):
            self.assertEqual(actual, expected)

        # the iterator is exhausted once every position has been consumed
        self.assertRaises(StopIteration, lambda: next(b1_iter))

    def test_k_words(self):
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                         [])

        # overlapping = False
        # (this first case previously passed overlapping=True by mistake,
        # so the non-overlapping k=1 path was never exercised)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                         [])

        # error on invalid k
        # The k_words(...) call is evaluated before assertRaises runs, so
        # the ValueError must surface while list() consumes the result.
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])

    def test_k_word_counts(self):
        # overlapping = True
        expected = Counter('GATTACA')
        self.assertEqual(self.b1.k_word_counts(1, overlapping=True),
                         expected)
        expected = Counter(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = Counter(['GAT', 'TAC'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=False),
                         expected)
        expected = Counter(['GATTACA'])
        self.assertEqual(self.b1.k_word_counts(7, overlapping=False),
                         expected)

    def test_k_word_frequencies(self):
        # overlapping = True
        expected = defaultdict(int)
        expected['A'] = 3/7.
        expected['C'] = 1/7.
        expected['G'] = 1/7.
        expected['T'] = 2/7.
        self.assertEqual(self.b1.k_word_frequencies(1, overlapping=True),
                         expected)
        expected = defaultdict(int)
        expected['GAT'] = 1/5.
        expected['ATT'] = 1/5.
        expected['TTA'] = 1/5.
        expected['TAC'] = 1/5.
        expected['ACA'] = 1/5.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = defaultdict(int)
        expected['GAT'] = 1/2.
        expected['TAC'] = 1/2.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=False),
                         expected)
        expected = defaultdict(int)
        expected['GATTACA'] = 1.0
        self.assertEqual(self.b1.k_word_frequencies(7, overlapping=False),
                         expected)

        # an empty sequence is expected to produce an empty mapping
        expected = defaultdict(int)
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         expected)

    def test_len(self):
        self.assertEqual(len(self.b1), 7)
        self.assertEqual(len(self.b2), 9)
        self.assertEqual(len(self.b3), 4)

    def test_repr(self):
        self.assertEqual(repr(self.b1),
                         "<BiologicalSequence: GATTACA (length: 7)>")
        self.assertEqual(repr(self.b6),
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        b1_reversed = reversed(self.b1)
        for actual, expected in zip(b1_reversed, "ACATTAG"):
            self.assertEqual(actual, expected)

        # the iterator is exhausted once every position has been consumed
        self.assertRaises(StopIteration, lambda: next(b1_reversed))

    def test_str(self):
        self.assertEqual(str(self.b1), "GATTACA")
        self.assertEqual(str(self.b2), "ACCGGTACC")
        self.assertEqual(str(self.b3), "GREG")

    def test_alphabet(self):
        self.assertEqual(self.b1.alphabet(), set())

    def test_description(self):
        self.assertEqual(self.b1.description, "")
        self.assertEqual(self.b2.description, "A test sequence")
        self.assertEqual(self.b3.description, "A protein sequence")

    def test_gap_alphabet(self):
        self.assertEqual(self.b1.gap_alphabet(), set('-.'))

    def test_id(self):
        self.assertEqual(self.b1.id, "")
        self.assertEqual(self.b2.id, "test-seq-2")
        self.assertEqual(self.b3.id, "test-seq-3")

    def test_count(self):
        self.assertEqual(self.b1.count('A'), 3)
        self.assertEqual(self.b1.count('T'), 2)
        self.assertEqual(self.b1.count('TT'), 1)

    def test_degap(self):
        self.assertEqual(self.b1.degap(), self.b1)
        self.assertEqual(self.b7.degap(), BiologicalSequence(''))
        self.assertEqual(self.b8.degap(), BiologicalSequence('HELLO'))

    def test_distance(self):
        # note that test_hamming_distance covers default behavior more
        # extensively
        self.assertEqual(self.b1.distance(self.b1), 0.0)
        self.assertEqual(
            self.b1.distance(BiologicalSequence('GATTACC')), 1./7)

        def dumb_distance(x, y):
            # stand-in metric: ignores its inputs entirely
            return 42

        self.assertEqual(
            self.b1.distance(self.b1, distance_fn=dumb_distance), 42)

    def test_distance_unequal_length(self):
        # Hamming distance (default) requires that sequences are of equal
        # length
        with self.assertRaises(BiologicalSequenceError):
            self.b1.distance(self.b2)

        # alternate distance functions don't have that requirement (unless
        # it's implemented within the provided distance function)
        def dumb_distance(x, y):
            return 42
        self.assertEqual(
            self.b1.distance(self.b2, distance_fn=dumb_distance), 42)

    def test_fraction_diff(self):
        # assertAlmostEqual with places=5: with assertEqual the trailing 5
        # would be taken as the failure message and the floats compared
        # exactly
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')),
            1. / 7., 5)

    def test_fraction_same(self):
        self.assertAlmostEqual(self.b1.fraction_same(self.b1), 1., 5)
        self.assertAlmostEqual(
            self.b1.fraction_same(BiologicalSequence('GATTACC')),
            6. / 7., 5)

    def test_gap_maps(self):
        # in sequence with no gaps, the gap_maps are identical
        self.assertEqual(self.b1.gap_maps(),
                         ([0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6]))
        # in sequence with all gaps, the map of degapped to gapped is the
        # empty list (bc its length is 0), and the map of gapped to
        # degapped is all None
        self.assertEqual(self.b7.gap_maps(),
                         ([], [None, None, None, None, None, None]))

        self.assertEqual(self.b8.gap_maps(),
                         ([0, 1, 8, 9, 10],
                          [0, 1, None, None, None, None, None, None,
                           2, 3, 4]))

        # example from the gap_maps doc string
        self.assertEqual(BiologicalSequence('-ACCGA-TA-').gap_maps(),
                         ([1, 2, 3, 4, 5, 7, 8],
                          [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        self.assertEqual(self.b1.gap_vector(),
                         [False] * len(self.b1))
        self.assertEqual(self.b7.gap_vector(),
                         [True] * len(self.b7))
        self.assertEqual(self.b8.gap_vector(),
                         [False, False, True, True, True, True,
                          True, True, False, False, False])

    def test_unsupported_characters(self):
        self.assertEqual(self.b1.unsupported_characters(), set('GATC'))
        self.assertEqual(self.b7.unsupported_characters(), set())

    def test_has_unsupported_characters(self):
        self.assertTrue(self.b1.has_unsupported_characters())
        self.assertFalse(self.b7.has_unsupported_characters())

    def test_index(self):
        """ index functions as expected """
        self.assertEqual(self.b1.index('G'), 0)
        self.assertEqual(self.b1.index('A'), 1)
        self.assertEqual(self.b1.index('AC'), 4)
        self.assertRaises(ValueError, self.b1.index, 'x')

    def test_is_gap(self):
        self.assertTrue(self.b1.is_gap('.'))
        self.assertTrue(self.b1.is_gap('-'))
        self.assertFalse(self.b1.is_gap('A'))
        self.assertFalse(self.b1.is_gap('x'))
        self.assertFalse(self.b1.is_gap(' '))
        self.assertFalse(self.b1.is_gap(''))

    def test_is_gapped(self):
        self.assertFalse(self.b1.is_gapped())
        self.assertFalse(self.b2.is_gapped())
        self.assertTrue(self.b7.is_gapped())
        self.assertTrue(self.b8.is_gapped())

    def test_is_valid(self):
        self.assertFalse(self.b1.is_valid())
        self.assertTrue(self.b7.is_valid())

    def test_to_fasta(self):
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        self.assertEqual(self.b1.to_fasta(terminal_character=""),
                         ">\nGATTACA")
        self.assertEqual(self.b2.to_fasta(),
                         ">test-seq-2 A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b3.to_fasta(),
                         ">test-seq-3 A protein sequence\nGREG\n")
        self.assertEqual(self.b4.to_fasta(),
                         ">test-seq-4\nPRTEIN\n")
        self.assertEqual(self.b5.to_fasta(),
                         "> some description\nLLPRTEIN\n")

        # alt parameters
        self.assertEqual(self.b2.to_fasta(field_delimiter=":"),
                         ">test-seq-2:A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b2.to_fasta(terminal_character="!"),
                         ">test-seq-2 A test sequence\nACCGGTACC!")
        self.assertEqual(
            self.b2.to_fasta(field_delimiter=":", terminal_character="!"),
            ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('GAT.ACA-', id='x',
                                      description='42')
        self.assertEqual(b.upper(), expected)

    def test_lower(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('gat.aca-', id='x',
                                      description='42')
        self.assertEqual(b.lower(), expected)
def test_getitem(self):
    # single position, full slice and reversed slice: each result is
    # compared against a BiologicalSequence
    seq = self.b1
    self.assertEqual(seq[0], BiologicalSequence('G'))
    self.assertEqual(seq[:], BiologicalSequence('GATTACA'))
    self.assertEqual(seq[::-1], BiologicalSequence('ACATTAG'))
class BiologicalSequenceTests(TestCase):
    """Unit tests for the generic ``BiologicalSequence`` class.

    ``setUp`` builds the fixtures ``b1`` .. ``b8`` that the individual tests
    share. Per-test notes are written as ``#`` comments rather than
    docstrings so that the test names reported by the runner stay unchanged.
    """

    def setUp(self):
        # b1: plain sequence with no id/description
        self.b1 = BiologicalSequence('GATTACA')
        # b2, b3: id and description both set
        self.b2 = BiologicalSequence('ACCGGTACC', id="test-seq-2",
                                     description="A test sequence")
        self.b3 = BiologicalSequence('GREG', id="test-seq-3",
                                     description="A protein sequence")
        # b4: id only; b5: description only
        self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
        self.b5 = BiologicalSequence('LLPRTEIN',
                                     description="some description")
        # b6: longer than ten characters (used by test_repr)
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        # b7: gap characters only; b8: gaps mixed with other characters
        self.b7 = BiologicalSequence('..--..')
        self.b8 = BiologicalSequence('HE..--..LLO')

    def test_init_varied_input(self):
        # init as string
        b = BiologicalSequence('ACCGGXZY')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as string with optional values
        b = BiologicalSequence('ACCGGXZY', 'test-seq-1',
                               'The first test sequence')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "test-seq-1")
        self.assertEqual(b.description, "The first test sequence")

        # test init as a different string
        b = BiologicalSequence('WRRTY')
        self.assertEqual(str(b), 'WRRTY')

        # init as list
        b = BiologicalSequence(list('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as tuple
        b = BiologicalSequence(tuple('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

    def test_init_with_validation(self):
        # "ACC" is expected to be rejected when validation is requested
        self.assertRaises(BiologicalSequenceError, BiologicalSequence, "ACC",
                          validate=True)
        # no error raised when only allow characters are passed
        BiologicalSequence("..--..", validate=True)

    def test_contains(self):
        # membership is expected to be case sensitive
        self.assertTrue('G' in self.b1)
        self.assertFalse('g' in self.b1)

    def test_eq(self):
        # == and != are spelled out (rather than assertEqual/assertNotEqual)
        # so that both operators are exercised explicitly
        self.assertTrue(self.b1 == self.b1)
        self.assertTrue(self.b2 == self.b2)
        self.assertTrue(self.b3 == self.b3)

        self.assertTrue(self.b1 != self.b3)
        self.assertTrue(self.b1 != self.b2)
        self.assertTrue(self.b2 != self.b3)

        # identical sequences of the same type are equal, even if they have
        # different ids and/or descriptions
        self.assertTrue(
            BiologicalSequence('ACGT') == BiologicalSequence('ACGT'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a') ==
            BiologicalSequence('ACGT', id='b'))
        self.assertTrue(
            BiologicalSequence('ACGT', description='c') ==
            BiologicalSequence('ACGT', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a', description='c') ==
            BiologicalSequence('ACGT', id='b', description='d'))

        # different type causes sequences to not be equal
        self.assertFalse(
            BiologicalSequence('ACGT') == NucleotideSequence('ACGT'))

    def test_getitem(self):
        # indexing and slicing are expected to return BiologicalSequence
        # objects
        self.assertEqual(self.b1[0], BiologicalSequence('G'))
        self.assertEqual(self.b1[:], BiologicalSequence('GATTACA'))
        self.assertEqual(self.b1[::-1], BiologicalSequence('ACATTAG'))

    def test_iter(self):
        b1_iter = iter(self.b1)
        for actual, expected in zip(b1_iter, "GATTACA"):
            self.assertEqual(actual, expected)

        # the iterator must be exhausted once every character was compared
        self.assertRaises(StopIteration, lambda: next(b1_iter))

    def test_k_words(self):
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                         [])

        # overlapping = False
        # (this first case previously passed overlapping=True, so the k=1
        # non-overlapping path was never exercised)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                         [])

        # error on invalid k
        # (the ValueError is expected when the result is consumed by list(),
        # not when k_words() itself is called)
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])

    def test_k_word_counts(self):
        # overlapping = True
        expected = Counter('GATTACA')
        self.assertEqual(self.b1.k_word_counts(1, overlapping=True),
                         expected)
        expected = Counter(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = Counter(['GAT', 'TAC'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=False),
                         expected)
        expected = Counter(['GATTACA'])
        self.assertEqual(self.b1.k_word_counts(7, overlapping=False),
                         expected)

    def test_k_word_frequencies(self):
        # overlapping = True
        expected = defaultdict(int)
        expected['A'] = 3 / 7.
        expected['C'] = 1 / 7.
        expected['G'] = 1 / 7.
        expected['T'] = 2 / 7.
        self.assertEqual(self.b1.k_word_frequencies(1, overlapping=True),
                         expected)
        expected = defaultdict(int)
        expected['GAT'] = 1 / 5.
        expected['ATT'] = 1 / 5.
        expected['TTA'] = 1 / 5.
        expected['TAC'] = 1 / 5.
        expected['ACA'] = 1 / 5.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = defaultdict(int)
        expected['GAT'] = 1 / 2.
        expected['TAC'] = 1 / 2.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=False),
                         expected)
        expected = defaultdict(int)
        expected['GATTACA'] = 1.0
        self.assertEqual(self.b1.k_word_frequencies(7, overlapping=False),
                         expected)

        # an empty sequence is expected to give an empty mapping
        expected = defaultdict(int)
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         expected)

    def test_len(self):
        self.assertEqual(len(self.b1), 7)
        self.assertEqual(len(self.b2), 9)
        self.assertEqual(len(self.b3), 4)

    def test_repr(self):
        self.assertEqual(repr(self.b1),
                         "<BiologicalSequence: GATTACA (length: 7)>")
        # b6 has 12 characters; only the first ten are expected in the repr
        self.assertEqual(repr(self.b6),
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        b1_reversed = reversed(self.b1)
        for actual, expected in zip(b1_reversed, "ACATTAG"):
            self.assertEqual(actual, expected)

        # the iterator must be exhausted once every character was compared
        self.assertRaises(StopIteration, lambda: next(b1_reversed))

    def test_str(self):
        self.assertEqual(str(self.b1), "GATTACA")
        self.assertEqual(str(self.b2), "ACCGGTACC")
        self.assertEqual(str(self.b3), "GREG")

    def test_alphabet(self):
        # the generic class is expected to define an empty alphabet
        self.assertEqual(self.b1.alphabet(), set())

    def test_description(self):
        self.assertEqual(self.b1.description, "")
        self.assertEqual(self.b2.description, "A test sequence")
        self.assertEqual(self.b3.description, "A protein sequence")

    def test_gap_alphabet(self):
        self.assertEqual(self.b1.gap_alphabet(), set('-.'))

    def test_id(self):
        self.assertEqual(self.b1.id, "")
        self.assertEqual(self.b2.id, "test-seq-2")
        self.assertEqual(self.b3.id, "test-seq-3")

    def test_count(self):
        # both single characters and longer subsequences can be counted
        self.assertEqual(self.b1.count('A'), 3)
        self.assertEqual(self.b1.count('T'), 2)
        self.assertEqual(self.b1.count('TT'), 1)

    def test_degap(self):
        self.assertEqual(self.b1.degap(), self.b1)
        self.assertEqual(self.b7.degap(), BiologicalSequence(''))
        self.assertEqual(self.b8.degap(), BiologicalSequence('HELLO'))

    def test_distance(self):
        # note that test_hamming_distance covers default behavior more
        # extensively
        self.assertEqual(self.b1.distance(self.b1), 0.0)
        self.assertEqual(self.b1.distance(BiologicalSequence('GATTACC')),
                         1. / 7)

        # a caller-supplied distance function must be used when given
        def dumb_distance(x, y):
            return 42

        self.assertEqual(
            self.b1.distance(self.b1, distance_fn=dumb_distance), 42)

    def test_fraction_diff(self):
        # assertAlmostEqual (5 places), matching test_fraction_same; with
        # assertEqual the trailing 5 was silently taken as the failure
        # message and the floats were compared exactly
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')),
            1. / 7., 5)

    def test_fraction_same(self):
        self.assertAlmostEqual(self.b1.fraction_same(self.b1), 1., 5)
        self.assertAlmostEqual(
            self.b1.fraction_same(BiologicalSequence('GATTACC')),
            6. / 7., 5)

    def test_gap_maps(self):
        # in sequence with no gaps, the gap_maps are identical
        self.assertEqual(self.b1.gap_maps(),
                         ([0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6]))
        # in sequence with all gaps, the map of degapped to gapped is the
        # empty list (bc its length is 0), and the map of gapped to degapped
        # is all None
        self.assertEqual(self.b7.gap_maps(),
                         ([], [None, None, None, None, None, None]))

        self.assertEqual(self.b8.gap_maps(),
                         ([0, 1, 8, 9, 10],
                          [0, 1, None, None, None, None, None, None, 2, 3,
                           4]))

        # example from the gap_maps doc string
        self.assertEqual(
            BiologicalSequence('-ACCGA-TA-').gap_maps(),
            ([1, 2, 3, 4, 5, 7, 8],
             [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        self.assertEqual(self.b1.gap_vector(),
                         [False] * len(self.b1))
        self.assertEqual(self.b7.gap_vector(),
                         [True] * len(self.b7))
        self.assertEqual(self.b8.gap_vector(),
                         [False, False, True, True, True, True,
                          True, True, False, False, False])

    def test_unsupported_characters(self):
        # every character of b1 is expected to be reported, none for the
        # gap-only b7
        self.assertEqual(self.b1.unsupported_characters(), set('GATC'))
        self.assertEqual(self.b7.unsupported_characters(), set())

    def test_has_unsupported_characters(self):
        self.assertTrue(self.b1.has_unsupported_characters())
        self.assertFalse(self.b7.has_unsupported_characters())

    def test_index(self):
        """ index functions as expected """
        self.assertEqual(self.b1.index('G'), 0)
        self.assertEqual(self.b1.index('A'), 1)
        self.assertEqual(self.b1.index('AC'), 4)
        # a subsequence that is absent must raise ValueError
        self.assertRaises(ValueError, self.b1.index, 'x')

    def test_is_gap(self):
        self.assertTrue(self.b1.is_gap('.'))
        self.assertTrue(self.b1.is_gap('-'))
        self.assertFalse(self.b1.is_gap('A'))
        self.assertFalse(self.b1.is_gap('x'))
        self.assertFalse(self.b1.is_gap(' '))
        self.assertFalse(self.b1.is_gap(''))

    def test_is_gapped(self):
        self.assertFalse(self.b1.is_gapped())
        self.assertFalse(self.b2.is_gapped())
        self.assertTrue(self.b7.is_gapped())
        self.assertTrue(self.b8.is_gapped())

    def test_is_valid(self):
        self.assertFalse(self.b1.is_valid())
        self.assertTrue(self.b7.is_valid())

    def test_to_fasta(self):
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        self.assertEqual(self.b1.to_fasta(terminal_character=""),
                         ">\nGATTACA")
        self.assertEqual(self.b2.to_fasta(),
                         ">test-seq-2 A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b3.to_fasta(),
                         ">test-seq-3 A protein sequence\nGREG\n")
        self.assertEqual(self.b4.to_fasta(),
                         ">test-seq-4\nPRTEIN\n")
        self.assertEqual(self.b5.to_fasta(),
                         "> some description\nLLPRTEIN\n")

        # alt parameters
        self.assertEqual(self.b2.to_fasta(field_delimiter=":"),
                         ">test-seq-2:A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b2.to_fasta(terminal_character="!"),
                         ">test-seq-2 A test sequence\nACCGGTACC!")
        self.assertEqual(
            self.b2.to_fasta(field_delimiter=":", terminal_character="!"),
            ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('GAT.ACA-', id='x', description='42')
        self.assertEqual(b.upper(), expected)

    def test_lower(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('gat.aca-', id='x', description='42')
        self.assertEqual(b.lower(), expected)
def setUp(self):
    # Shared fixtures: (attribute name, sequence text, extra keyword
    # arguments), created in order b1 .. b8.
    fixtures = (
        ('b1', 'GATTACA', {}),
        ('b2', 'ACCGGTACC', {'id': "test-seq-2",
                             'description': "A test sequence"}),
        ('b3', 'GREG', {'id': "test-seq-3",
                        'description': "A protein sequence"}),
        ('b4', 'PRTEIN', {'id': "test-seq-4"}),
        ('b5', 'LLPRTEIN', {'description': "some description"}),
        ('b6', 'ACGTACGTACGT', {}),
        ('b7', '..--..', {}),
        ('b8', 'HE..--..LLO', {}),
    )
    for attr_name, text, extra in fixtures:
        setattr(self, attr_name, BiologicalSequence(text, **extra))
def test_degap(self):
    # A sequence without gaps is expected to come back equal to itself.
    ungapped = self.b1.degap()
    self.assertEqual(ungapped, self.b1)

    # Gap characters are expected to be stripped, leaving only the rest.
    for gapped, remaining in ((self.b7, ''), (self.b8, 'HELLO')):
        self.assertEqual(gapped.degap(), BiologicalSequence(remaining))
def fasta_from_alignment(aln, make_seqlabel=None, line_wrap=None, sort=True):
    """Format an alignment (or a dict of sequences) as a FASTA string

    Parameters
    ----------
    aln : Alignment, dict
        Alignment object, or dictionary mapping sequence ids to the
        sequences themselves. Dictionary values that are plain ``str`` are
        wrapped in a ``BiologicalSequence`` using the key as id.
    make_seqlabel : function, optional
        Callback that takes a sequence object and returns its label as a
        ``str``. Forwarded unchanged to ``fasta_from_sequences``; if ``None``
        is passed, the attributes ``id``, ``Label`` or ``Name`` are tried in
        that order and the first one that exists is used, otherwise an
        integer with the position of the sequence object is used.
    line_wrap : int, optional
        Maximum line width. Forwarded unchanged to ``fasta_from_sequences``;
        if ``None`` is passed the full sequence is written on one line.
    sort : bool, optional
        Whether or not the sequences should be sorted by their sequence id,
        default value is ``True``.

    Returns
    -------
    str
        FASTA formatted string composed of the sequences found in `aln`.

    See Also
    --------
    skbio.parse.sequences.parse_fasta
    skbio.core.alignment.Alignment

    Examples
    --------
    Formatting a sequence alignment object into a FASTA file.

    >>> from skbio.core.alignment import Alignment
    >>> from skbio.core.sequence import DNA
    >>> from skbio.format.sequences import fasta_from_alignment
    >>> seqs = [DNA("ACC--G-GGTA..", id="seq1"),
    ...         DNA("TCC--G-GGCA..", id="seqs2")]
    >>> a1 = Alignment(seqs)
    >>> print(fasta_from_alignment(a1))
    >seq1
    ACC--G-GGTA..
    >seqs2
    TCC--G-GGCA..

    """
    # An Alignment exposes its labels through ids(); anything else is
    # treated as a dictionary.
    labels = aln.ids() if isinstance(aln, Alignment) else aln.keys()
    if sort:
        labels = sorted(labels)

    def _as_sequence(label):
        # Promote raw strings to sequence objects; pass everything else on.
        entry = aln[label]
        if isinstance(entry, str):
            return BiologicalSequence(entry, label)
        return entry

    return fasta_from_sequences([_as_sequence(label) for label in labels],
                                make_seqlabel=make_seqlabel,
                                line_wrap=line_wrap)
def test_fraction_same(self):
    # Compared to 5 decimal places because the results are floats.
    identical = self.b1.fraction_same(self.b1)
    self.assertAlmostEqual(identical, 1., places=5)

    # 'GATTACC' differs from b1 ('GATTACA') in the last of 7 positions.
    one_mismatch = self.b1.fraction_same(BiologicalSequence('GATTACC'))
    self.assertAlmostEqual(one_mismatch, 6. / 7., places=5)
def test_fraction_diff(self):
    # assertAlmostEqual with 5 places, like test_fraction_same. With
    # assertEqual the trailing 5 was silently taken as the failure message
    # and the float results were compared exactly.
    self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
    # 'GATTACC' differs from b1 ('GATTACA') in the last of 7 positions.
    self.assertAlmostEqual(
        self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)