Ejemplo n.º 1
0
    def test_k_word_frequencies(self):
        # Compares k_word_frequencies() results for self.b1 (and for an empty
        # sequence) against hand-built defaultdict frequency tables.
        seq = self.b1

        # overlapping = True
        singles = defaultdict(int, {'A': 3 / 7., 'C': 1 / 7.,
                                    'G': 1 / 7., 'T': 2 / 7.})
        self.assertEqual(seq.k_word_frequencies(1, overlapping=True), singles)

        triples = defaultdict(int)
        for word in ('GAT', 'ATT', 'TTA', 'TAC', 'ACA'):
            triples[word] = 1 / 5.
        self.assertEqual(seq.k_word_frequencies(3, overlapping=True), triples)

        # overlapping = False
        tiled = defaultdict(int, GAT=1 / 2., TAC=1 / 2.)
        self.assertEqual(seq.k_word_frequencies(3, overlapping=False), tiled)

        whole = defaultdict(int, GATTACA=1.0)
        self.assertEqual(seq.k_word_frequencies(7, overlapping=False), whole)

        # an empty sequence is expected to give an empty table
        no_words = defaultdict(int)
        blank = BiologicalSequence('')
        self.assertEqual(blank.k_word_frequencies(1, overlapping=False),
                         no_words)
Ejemplo n.º 2
0
    def test_k_words(self):
        # Exercises k_words() on self.b1 for several word sizes in both
        # overlapping and non-overlapping modes, then checks invalid k values
        # and a couple of other sequences.

        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        # k longer than the expected words allow: no words at all
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                         [])

        # overlapping = False
        # k=1 must pass overlapping=False as well, otherwise the
        # non-overlapping single-character case is never exercised.
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                         [])

        # error on invalid k
        # NOTE(review): k_words() is called before assertRaises runs, so this
        # relies on the error surfacing only when list() consumes the result
        # (i.e. k_words being lazy) -- confirm against the implementation.
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])
Ejemplo n.º 3
0
    def test_k_words(self):
        # Exercises k_words() on self.b1 for several word sizes in both
        # overlapping and non-overlapping modes, then checks invalid k values
        # and a couple of other sequences.

        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        # k longer than the expected words allow: no words at all
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)), [])

        # overlapping = False
        # k=1 must pass overlapping=False as well, otherwise the
        # non-overlapping single-character case is never exercised.
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)), [])

        # error on invalid k
        # NOTE(review): k_words() is called before assertRaises runs, so this
        # relies on the error surfacing only when list() consumes the result
        # (i.e. k_words being lazy) -- confirm against the implementation.
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])
Ejemplo n.º 4
0
    def test_k_word_frequencies(self):
        # Compares k_word_frequencies() results for self.b1 (and for an empty
        # sequence) against hand-built defaultdict frequency tables.
        seq = self.b1

        # overlapping = True
        singles = defaultdict(int, {'A': 3/7., 'C': 1/7.,
                                    'G': 1/7., 'T': 2/7.})
        self.assertEqual(seq.k_word_frequencies(1, overlapping=True), singles)

        triples = defaultdict(int)
        for word in ('GAT', 'ATT', 'TTA', 'TAC', 'ACA'):
            triples[word] = 1/5.
        self.assertEqual(seq.k_word_frequencies(3, overlapping=True), triples)

        # overlapping = False
        tiled = defaultdict(int, GAT=1/2., TAC=1/2.)
        self.assertEqual(seq.k_word_frequencies(3, overlapping=False), tiled)

        whole = defaultdict(int, GATTACA=1.0)
        self.assertEqual(seq.k_word_frequencies(7, overlapping=False), whole)

        # an empty sequence is expected to give an empty table
        no_words = defaultdict(int)
        blank = BiologicalSequence('')
        self.assertEqual(blank.k_word_frequencies(1, overlapping=False),
                         no_words)
Ejemplo n.º 5
0
 def setUp(self):
     """Create the string, FASTA-text, sequence and alignment fixtures."""
     # raw sequence strings and the FASTA text expected for them
     self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']
     self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
     self.fasta_with_label = (
         '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU')
     self.fasta_with_label_lw2 = (
         '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU')
     self.alignment_dict = {'1st': 'AAAA', '2nd': 'CCCC',
                            '3rd': 'GGGG', '4th': 'UUUU'}

     # the same two (sequence, id) records wrapped in two sequence types
     records = [('ACTCGAGATC', 'seq1'), ('GGCCT', 'seq2')]
     self.sequence_objects_a = [
         DNASequence(residues, label) for residues, label in records]
     self.sequence_objects_b = [
         BiologicalSequence(residues, label) for residues, label in records]

     # a small two-sequence alignment containing both '-' and '.' characters
     self.alignment = Alignment([
         DNASequence("ACC--G-GGTA..", id="seq1"),
         DNASequence("TCC--G-GGCA..", id="seqs2"),
     ])
Ejemplo n.º 6
0
    def test_init_varied_input(self):
        # Builds BiologicalSequence objects from str, list and tuple inputs
        # and checks str(), id and description on the results.
        raw = 'ACCGGXZY'

        # init as string
        plain = BiologicalSequence(raw)
        self.assertEqual(str(plain), 'ACCGGXZY')
        self.assertEqual(plain.id, "")
        self.assertEqual(plain.description, "")

        # init as string with optional values (passed positionally)
        labelled = BiologicalSequence(raw, 'test-seq-1',
                                      'The first test sequence')
        self.assertEqual(str(labelled), 'ACCGGXZY')
        self.assertEqual(labelled.id, "test-seq-1")
        self.assertEqual(labelled.description, "The first test sequence")

        # test init as a different string
        other = BiologicalSequence('WRRTY')
        self.assertEqual(str(other), 'WRRTY')

        # init as list, then as tuple
        for container in (list, tuple):
            from_items = BiologicalSequence(container(raw))
            self.assertEqual(str(from_items), 'ACCGGXZY')
            self.assertEqual(from_items.id, "")
            self.assertEqual(from_items.description, "")
Ejemplo n.º 7
0
    def test_eq(self):
        # Checks the == and != operators on the fixture sequences and on
        # freshly built ones.

        # every fixture equals itself
        for seq in (self.b1, self.b2, self.b3):
            self.assertTrue(seq == seq)

        # distinct fixtures compare unequal
        unequal_pairs = ((self.b1, self.b3),
                         (self.b1, self.b2),
                         (self.b2, self.b3))
        for left, right in unequal_pairs:
            self.assertTrue(left != right)

        # identical sequences of the same type are equal, even if they have
        # different ids and/or descriptions
        metadata_pairs = (
            ({}, {}),
            ({'id': 'a'}, {'id': 'b'}),
            ({'description': 'c'}, {'description': 'd'}),
            ({'id': 'a', 'description': 'c'},
             {'id': 'b', 'description': 'd'}),
        )
        for left_kwargs, right_kwargs in metadata_pairs:
            self.assertTrue(
                BiologicalSequence('ACGT', **left_kwargs) ==
                BiologicalSequence('ACGT', **right_kwargs))

        # different type causes sequences to not be equal
        generic = BiologicalSequence('ACGT')
        self.assertFalse(generic == NucleotideSequence('ACGT'))
Ejemplo n.º 8
0
def get_representatives(mapping, seqs):
    """Yield an upper-cased sequence for every label found in mapping.

    mapping: The prefix mapping dict; each value must support len()

    seqs: An iterable of (label, seq) pairs

    Each yielded BiologicalSequence has the id "<label>: <n>", where n is
    len(mapping[label]) + 1. Pairs whose label is not in mapping are skipped.
    """
    for label, seq in seqs:
        if label not in mapping:
            continue
        member_count = len(mapping[label]) + 1
        record = BiologicalSequence(seq, id="%s: %d" % (label, member_count))
        yield record.upper()
Ejemplo n.º 9
0
def get_representatives(mapping, seqs):
    """Yield an upper-cased sequence for every label found in mapping.

    mapping: The prefix mapping dict; each value must support len()

    seqs: An iterable of (label, seq) pairs

    Each yielded BiologicalSequence has the id "<label>: <n>", where n is
    len(mapping[label]) + 1. Pairs whose label is not in mapping are skipped.
    """
    for label, seq in seqs:
        if label not in mapping:
            continue
        member_count = len(mapping[label]) + 1
        record = BiologicalSequence(seq, id="%s: %d" % (label, member_count))
        yield record.upper()
Ejemplo n.º 10
0
 def test_init_with_validation(self):
     # With validate=True, "ACC" is expected to be rejected while a
     # sequence made only of '.' and '-' is accepted.
     with self.assertRaises(BiologicalSequenceError):
         BiologicalSequence("ACC", validate=True)

     # no error raised when only allowed characters are passed
     BiologicalSequence("..--..", validate=True)
Ejemplo n.º 11
0
    def test_distance(self):
        # note that test_hamming_distance covers default behavior more
        # extensively
        seq = self.b1
        self.assertEqual(seq.distance(seq), 0.0)

        one_mismatch = BiologicalSequence('GATTACC')
        self.assertEqual(seq.distance(one_mismatch), 1./7)

        # a caller-supplied distance_fn: its return value is expected back
        # unchanged from distance()
        def constant_distance(x, y):
            return 42

        observed = seq.distance(seq, distance_fn=constant_distance)
        self.assertEqual(observed, 42)
Ejemplo n.º 12
0
def write_Fasta_from_name_seq_pairs(name_seqs, fh):
    """Write every (name, seq) pair to fh as a FASTA record.

    name_seqs: iterable of (name, seq) pairs such as from parse_fasta
    fh: an open, writable filehandle

    Each record is the to_fasta() text of a BiologicalSequence built from the
    pair, followed by a newline. Raises ValueError when fh is None.
    """
    if fh is None:
        raise ValueError("Need open file handle to write to.")

    for name, seq in name_seqs:
        record = BiologicalSequence(seq, identifier=name)
        fh.write("%s\n" % record.to_fasta())
Ejemplo n.º 13
0
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    Writes two files into out_dir:

    * denoised_otu_map.txt: one tab-separated line per OTU, holding the OTU
      id followed by the sample ids of all reads assigned to it
    * denoised_all.fasta: the denoised sequences, relabelled as
      "<sample_id> <flowgram_id>"

    fasta_fh: a fasta file with labels as produced by Qiime's
              split_libraries.py, used to replace flowgram id with the
              unique se_sample_id

    mapping_fh: The cluster mapping from the denoiser.py

    denoised_seqs_fh: the Fasta output files from denoiser.py

    otu_picker_otu_map_fh: cluster map from otu picker on denoised_seqs_fh

    out_dir: output directory

    Prints a message and exits if an id from the denoiser output has no
    entry in the split_libraries FASTA file. Raises KeyError if a label in
    denoised_seqs_fh has no sample id.
    """

    # read in mapping from split_library file (only the label of each record)
    labels = (record[0] for record in parse_fasta(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)
    get_sample_id = sample_id_mapping.get

    denoiser_mapping = read_denoiser_mapping(mapping_fh)

    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    with open(out_dir + "/denoised_otu_map.txt", "w") as otu_fh:
        for otu_line in otu_picker_otu_map_fh:
            otu_split = otu_line.split()
            if not otu_split:
                # blank line in the OTU map: nothing to write
                continue

            otu = otu_split[0]
            ids = otu_split[1:]

            # Gather the cluster members before touching all_ids, in case
            # sort_ids hands back the very list it was given.
            members = [member
                       for seq_id in ids
                       for member in denoiser_mapping[seq_id]]
            # concat lists
            # make sure the biggest one is first for pick_repr
            all_ids = sort_ids(ids, denoiser_mapping)
            all_ids.extend(members)
            try:
                otu_fh.write("%s\t" % otu +
                             "\t".join(map(get_sample_id, all_ids)) + "\n")
            except TypeError:
                # get returns None if denoiser_mapping id not present in
                # sample_id_mapping, and str.join rejects None
                print("Found id in denoiser output, which was not found in "
                      "split_libraries output FASTA file. Wrong file?")
                exit()

    with open(out_dir + "/denoised_all.fasta", "w") as fasta_out_fh:
        for label, seq in parse_fasta(denoised_seqs_fh):
            seq_id = label.split()[0]
            newlabel = "%s %s" % (sample_id_mapping[seq_id], seq_id)
            fasta_out_fh.write(
                BiologicalSequence(seq, id=newlabel).to_fasta())
Ejemplo n.º 14
0
 def setUp(self):
     # Shared fixtures: plain, annotated and gap-containing sequences.
     make = BiologicalSequence

     # no id / description
     self.b1 = make('GATTACA')
     self.b6 = make('ACGTACGTACGT')

     # with id and/or description
     self.b2 = make('ACCGGTACC',
                    id="test-seq-2", description="A test sequence")
     self.b3 = make('GREG',
                    id="test-seq-3", description="A protein sequence")
     self.b4 = make('PRTEIN', id="test-seq-4")
     self.b5 = make('LLPRTEIN', description="some description")

     # containing '.' and '-' characters
     self.b7 = make('..--..')
     self.b8 = make('HE..--..LLO')
Ejemplo n.º 15
0
 def setUp(self):
     """ Create the sequence fixtures shared by the tests
     """
     make = BiologicalSequence

     # no identifier / description
     self.b1 = make('GATTACA')
     self.b6 = make('ACGTACGTACGT')

     # with identifier and/or description
     self.b2 = make('ACCGGTACC',
                    identifier="test-seq-2", description="A test sequence")
     self.b3 = make('GREG',
                    identifier="test-seq-3",
                    description="A protein sequence")
     self.b4 = make('PRTEIN', identifier="test-seq-4")
     self.b5 = make('LLPRTEIN', description="some description")

     # containing '.' and '-' characters
     self.b7 = make('..--..')
     self.b8 = make('HE..--..LLO')
Ejemplo n.º 16
0
    def test_gap_maps(self):
        # gap_maps() results are compared against a pair of index lists:
        # (degapped -> gapped, gapped -> degapped).

        # in sequence with no gaps, the gap_maps are identical
        identity = list(range(7))
        self.assertEqual(self.b1.gap_maps(), (identity, identity))

        # in sequence with all gaps, the map of degapped to gapped is the empty
        # list (bc its length is 0), and the map of gapped to degapped is all
        # None
        self.assertEqual(self.b7.gap_maps(), ([], [None] * 6))

        # gaps in the middle: six None entries between the mapped positions
        expected_b8 = ([0, 1, 8, 9, 10],
                       [0, 1] + [None] * 6 + [2, 3, 4])
        self.assertEqual(self.b8.gap_maps(), expected_b8)

        # example from the gap_maps doc string
        docstring_seq = BiologicalSequence('-ACCGA-TA-')
        expected_doc = ([1, 2, 3, 4, 5, 7, 8],
                        [None, 0, 1, 2, 3, 4, None, 5, 6, None])
        self.assertEqual(docstring_seq.gap_maps(), expected_doc)
Ejemplo n.º 17
0
class BiologicalSequenceTests(TestCase):
    """ Tests of the BiologicalSequence class """

    def setUp(self):
        """ Initialize values to be used in tests
        """
        self.b1 = BiologicalSequence('GATTACA')
        self.b2 = BiologicalSequence(
            'ACCGGTACC', identifier="test-seq-2",
            description="A test sequence")
        self.b3 = BiologicalSequence(
            'GREG', identifier="test-seq-3", description="A protein sequence")
        self.b4 = BiologicalSequence(
            'PRTEIN', identifier="test-seq-4")
        self.b5 = BiologicalSequence(
            'LLPRTEIN', description="some description")
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        self.b7 = BiologicalSequence('..--..')
        self.b8 = BiologicalSequence('HE..--..LLO')

    def test_init(self):
        """ Initialization functions as expected with varied input types
        """
        # init as string
        b = BiologicalSequence('ACCGGXZY')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.identifier, "")
        self.assertEqual(b.description, "")

        # init as string with optional values
        b = BiologicalSequence(
            'ACCGGXZY', 'test-seq-1', 'The first test sequence')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.identifier, "test-seq-1")
        self.assertEqual(b.description, "The first test sequence")

        # test init as a different string
        b = BiologicalSequence('WRRTY')
        self.assertEqual(str(b), 'WRRTY')

        # init as list
        b = BiologicalSequence(list('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.identifier, "")
        self.assertEqual(b.description, "")

        # init as tuple
        b = BiologicalSequence(tuple('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.identifier, "")
        self.assertEqual(b.description, "")

    def test_init_validate(self):
        """ initialization with validation functions as expected
        """
        self.assertRaises(BiologicalSequenceError, BiologicalSequence, "ACC",
                          validate=True)
        # no error raised when only allowed characters are passed
        BiologicalSequence("..--..", validate=True)

    def test_contains(self):
        """ contains functions as expected
        """
        self.assertTrue('G' in self.b1)
        # membership is case-sensitive
        self.assertFalse('g' in self.b1)

    def test_eq(self):
        """ equality functions as expected
        """
        self.assertTrue(self.b1 == self.b1)
        self.assertTrue(self.b2 == self.b2)
        self.assertTrue(self.b3 == self.b3)

        self.assertTrue(self.b1 != self.b3)
        self.assertTrue(self.b1 != self.b2)
        self.assertTrue(self.b2 != self.b3)

        # identical sequences of the same type are equal, even if they have
        # different identifiers and/or descriptions
        self.assertTrue(
            BiologicalSequence('ACGT') == BiologicalSequence('ACGT'))
        self.assertTrue(
            BiologicalSequence('ACGT', identifier='a') ==
            BiologicalSequence('ACGT', identifier='b'))
        self.assertTrue(
            BiologicalSequence('ACGT', description='c') ==
            BiologicalSequence('ACGT', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', identifier='a', description='c') ==
            BiologicalSequence('ACGT', identifier='b', description='d'))

        # different type causes sequences to not be equal
        self.assertFalse(
            BiologicalSequence('ACGT') == NucleotideSequence('ACGT'))

    def test_getitem(self):
        """ getitem functions as expected
        """
        self.assertEqual(self.b1[0], BiologicalSequence('G'))
        self.assertEqual(self.b1[:], BiologicalSequence('GATTACA'))
        self.assertEqual(self.b1[::-1], BiologicalSequence('ACATTAG'))

    def test_iter(self):
        """ iter functions as expected
        """
        b1_iter = iter(self.b1)
        for actual, expected in zip(b1_iter, "GATTACA"):
            self.assertEqual(actual, expected)

        # the builtin next() works on Python 2.6+ and Python 3, unlike the
        # Python-2-only ``.next`` attribute
        self.assertRaises(StopIteration, next, b1_iter)

    def test_len(self):
        """ len functions as expected
        """
        self.assertEqual(len(self.b1), 7)
        self.assertEqual(len(self.b2), 9)
        self.assertEqual(len(self.b3), 4)

    def test_repr(self):
        """ repr functions as expected
        """
        self.assertEqual(repr(self.b1),
                         "<BiologicalSequence: GATTACA (length: 7)>")
        # the 12-character sequence is shown as 10 characters plus "..."
        self.assertEqual(repr(self.b6),
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        """ reversed functions as expected
        """
        b1_reversed = reversed(self.b1)
        for actual, expected in zip(b1_reversed, "ACATTAG"):
            self.assertEqual(actual, expected)

        # the builtin next() works on Python 2.6+ and Python 3, unlike the
        # Python-2-only ``.next`` attribute
        self.assertRaises(StopIteration, next, b1_reversed)

    def test_str(self):
        """ str functions as expected
        """
        self.assertEqual(str(self.b1), "GATTACA")
        self.assertEqual(str(self.b2), "ACCGGTACC")
        self.assertEqual(str(self.b3), "GREG")

    def test_alphabet(self):
        """ alphabet method functions as expected
        """
        self.assertEqual(self.b1.alphabet(), set())

    def test_description(self):
        """ description property functions as expected
        """
        self.assertEqual(self.b1.description, "")
        self.assertEqual(self.b2.description, "A test sequence")
        self.assertEqual(self.b3.description, "A protein sequence")

    def test_gap_alphabet(self):
        """ gap_alphabet method functions as expected
        """
        self.assertEqual(self.b1.gap_alphabet(), set('-.'))

    def test_identifier(self):
        """ identifier property functions as expected
        """
        self.assertEqual(self.b1.identifier, "")
        self.assertEqual(self.b2.identifier, "test-seq-2")
        self.assertEqual(self.b3.identifier, "test-seq-3")

    def test_count(self):
        """ count functions as expected
        """
        self.assertEqual(self.b1.count('A'), 3)
        self.assertEqual(self.b1.count('T'), 2)
        self.assertEqual(self.b1.count('TT'), 1)

    def test_degap(self):
        """ degap functions as expected
        """
        self.assertEqual(self.b1.degap(), self.b1)
        self.assertEqual(self.b7.degap(), BiologicalSequence(''))
        self.assertEqual(self.b8.degap(), BiologicalSequence('HELLO'))

    def test_distance(self):
        """ distance functions as expected
        """
        # note that test_hamming_distance covers default behavior more
        # extensively
        self.assertEqual(self.b1.distance(self.b1), 0.0)
        self.assertEqual(self.b1.distance(BiologicalSequence('GATTACC')), 1./7)

        def dumb_distance(x, y):
            return 42

        self.assertEqual(
            self.b1.distance(self.b1, distance_fn=dumb_distance), 42)

    def test_fraction_diff(self):
        """ fraction_diff functions as expected
        """
        # assertAlmostEqual, not assertEqual: the trailing 5 is the number of
        # decimal places to compare (assertEqual would treat it as ``msg``)
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)

    def test_fraction_same(self):
        """ fraction_same functions as expected
        """
        self.assertAlmostEqual(self.b1.fraction_same(self.b1), 1., 5)
        self.assertAlmostEqual(
            self.b1.fraction_same(BiologicalSequence('GATTACC')), 6. / 7., 5)

    def test_gap_maps(self):
        """ gap_maps functions as expected
        """
        # in sequence with no gaps, the gap_maps are identical
        self.assertEqual(self.b1.gap_maps(),
                         ([0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6]))
        # in sequence with all gaps, the map of degapped to gapped is the empty
        # list (bc its length is 0), and the map of gapped to degapped is all
        # None
        self.assertEqual(self.b7.gap_maps(),
                         ([], [None, None, None, None, None, None]))

        self.assertEqual(self.b8.gap_maps(),
                         ([0, 1, 8, 9, 10],
                          [0, 1, None, None, None, None, None, None, 2, 3, 4]))

        # example from the gap_maps doc string
        self.assertEqual(BiologicalSequence('-ACCGA-TA-').gap_maps(),
                         ([1, 2, 3, 4, 5, 7, 8],
                          [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        """ gap_vector functions as expected
        """
        self.assertEqual(self.b1.gap_vector(),
                         [False] * len(self.b1))
        self.assertEqual(self.b7.gap_vector(),
                         [True] * len(self.b7))
        self.assertEqual(self.b8.gap_vector(),
                         [False, False, True, True, True, True,
                          True, True, False, False, False])

    def test_unsupported_characters(self):
        """ unsupported_characters functions as expected
        """
        self.assertEqual(self.b1.unsupported_characters(), set('GATC'))
        self.assertEqual(self.b7.unsupported_characters(), set())

    def test_has_unsupported_characters(self):
        """ has_unsupported_characters functions as expected
        """
        self.assertTrue(self.b1.has_unsupported_characters())
        self.assertFalse(self.b7.has_unsupported_characters())

    def test_index(self):
        """ index functions as expected """
        self.assertEqual(self.b1.index('G'), 0)
        self.assertEqual(self.b1.index('A'), 1)
        self.assertEqual(self.b1.index('AC'), 4)
        self.assertRaises(ValueError, self.b1.index, 'x')

    def test_is_gap(self):
        """ is_gap functions as expected """
        self.assertTrue(self.b1.is_gap('.'))
        self.assertTrue(self.b1.is_gap('-'))
        self.assertFalse(self.b1.is_gap('A'))
        self.assertFalse(self.b1.is_gap('x'))
        self.assertFalse(self.b1.is_gap(' '))
        self.assertFalse(self.b1.is_gap(''))

    def test_is_gapped(self):
        """ is_gapped functions as expected """
        self.assertFalse(self.b1.is_gapped())
        self.assertFalse(self.b2.is_gapped())
        self.assertTrue(self.b7.is_gapped())
        self.assertTrue(self.b8.is_gapped())

    def test_is_valid(self):
        """ is_valid functions as expected
        """
        self.assertFalse(self.b1.is_valid())
        self.assertTrue(self.b7.is_valid())

    def test_to_fasta(self):
        """ to_fasta functions as expected
        """
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        self.assertEqual(self.b1.to_fasta(terminal_character=""), ">\nGATTACA")
        self.assertEqual(self.b2.to_fasta(),
                         ">test-seq-2 A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b3.to_fasta(),
                         ">test-seq-3 A protein sequence\nGREG\n")
        self.assertEqual(self.b4.to_fasta(),
                         ">test-seq-4\nPRTEIN\n")
        self.assertEqual(self.b5.to_fasta(),
                         "> some description\nLLPRTEIN\n")

        # alt parameters
        self.assertEqual(self.b2.to_fasta(field_delimiter=":"),
                         ">test-seq-2:A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b2.to_fasta(terminal_character="!"),
                         ">test-seq-2 A test sequence\nACCGGTACC!")
        self.assertEqual(
            self.b2.to_fasta(field_delimiter=":", terminal_character="!"),
            ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        """ upper functions as expected
        """
        b = NucleotideSequence('GAt.ACa-', identifier='x', description='42')
        expected = NucleotideSequence('GAT.ACA-', identifier='x',
                                      description='42')
        self.assertEqual(b.upper(), expected)

    def test_lower(self):
        """ lower functions as expected
        """
        b = NucleotideSequence('GAt.ACa-', identifier='x', description='42')
        expected = NucleotideSequence('gat.aca-', identifier='x',
                                      description='42')
        self.assertEqual(b.lower(), expected)
def hamming_distance(s1, s2):
    """Return the distance() between s1 and s2 as BiologicalSequences.

    Both arguments are wrapped in BiologicalSequence before the call, so
    anything that constructor accepts may be passed.
    """
    first = BiologicalSequence(s1)
    second = BiologicalSequence(s2)
    return first.distance(second)
Ejemplo n.º 19
0
class BiologicalSequenceTests(TestCase):
    """Unit tests for the generic ``BiologicalSequence`` class.

    ``setUp`` builds the shared fixtures: an ungapped sequence (``b1``),
    sequences carrying an id and/or a description (``b2``-``b5``), a
    twelve-character sequence (``b6``), an all-gap sequence (``b7``) and a
    partially gapped sequence (``b8``).
    """

    def setUp(self):
        self.b1 = BiologicalSequence('GATTACA')
        self.b2 = BiologicalSequence(
            'ACCGGTACC', id="test-seq-2",
            description="A test sequence")
        self.b3 = BiologicalSequence(
            'GREG', id="test-seq-3", description="A protein sequence")
        self.b4 = BiologicalSequence(
            'PRTEIN', id="test-seq-4")
        self.b5 = BiologicalSequence(
            'LLPRTEIN', description="some description")
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        self.b7 = BiologicalSequence('..--..')
        self.b8 = BiologicalSequence('HE..--..LLO')

    def test_init_varied_input(self):
        # init as string
        b = BiologicalSequence('ACCGGXZY')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as string with optional values
        b = BiologicalSequence(
            'ACCGGXZY', 'test-seq-1', 'The first test sequence')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "test-seq-1")
        self.assertEqual(b.description, "The first test sequence")

        # test init as a different string
        b = BiologicalSequence('WRRTY')
        self.assertEqual(str(b), 'WRRTY')

        # init as list
        b = BiologicalSequence(list('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as tuple
        b = BiologicalSequence(tuple('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

    def test_init_with_validation(self):
        self.assertRaises(BiologicalSequenceError, BiologicalSequence, "ACC",
                          validate=True)
        # no error raised when only allowed characters are passed; an
        # unexpected BiologicalSequenceError propagates and fails the test
        BiologicalSequence("..--..", validate=True)

    def test_contains(self):
        # membership is case sensitive
        self.assertIn('G', self.b1)
        self.assertNotIn('g', self.b1)

    def test_eq(self):
        # the == and != operators are spelled out on purpose so that both
        # comparison paths are exercised
        self.assertTrue(self.b1 == self.b1)
        self.assertTrue(self.b2 == self.b2)
        self.assertTrue(self.b3 == self.b3)

        self.assertTrue(self.b1 != self.b3)
        self.assertTrue(self.b1 != self.b2)
        self.assertTrue(self.b2 != self.b3)

        # identical sequences of the same type are equal, even if they have
        # different ids and/or descriptions
        self.assertTrue(
            BiologicalSequence('ACGT') == BiologicalSequence('ACGT'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a') ==
            BiologicalSequence('ACGT', id='b'))
        self.assertTrue(
            BiologicalSequence('ACGT', description='c') ==
            BiologicalSequence('ACGT', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a', description='c') ==
            BiologicalSequence('ACGT', id='b', description='d'))

        # different type causes sequences to not be equal
        self.assertFalse(
            BiologicalSequence('ACGT') == NucleotideSequence('ACGT'))

    def test_getitem(self):
        # single index, full slice and reversed slice
        self.assertEqual(self.b1[0], BiologicalSequence('G'))
        self.assertEqual(self.b1[:], BiologicalSequence('GATTACA'))
        self.assertEqual(self.b1[::-1], BiologicalSequence('ACATTAG'))

    def test_getitem_out_of_range(self):
        with self.assertRaises(IndexError):
            self.b1[42]

    def test_hash(self):
        self.assertIsInstance(hash(self.b1), int)

    def test_iter(self):
        b1_iter = iter(self.b1)
        for actual, expected in zip(b1_iter, "GATTACA"):
            self.assertEqual(actual, expected)

        # the iterator must be exhausted after yielding every character
        self.assertRaises(StopIteration, lambda: next(b1_iter))

    def test_k_words(self):
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)),
                         [])

        # overlapping = False
        # (for k=1 the words are the same as in the overlapping case)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)),
                         [])

        # error on invalid k; list() is what consumes the result and
        # triggers the ValueError
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])

    def test_k_word_counts(self):
        # overlapping = True
        expected = Counter('GATTACA')
        self.assertEqual(self.b1.k_word_counts(1, overlapping=True),
                         expected)
        expected = Counter(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = Counter(['GAT', 'TAC'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=False),
                         expected)
        expected = Counter(['GATTACA'])
        self.assertEqual(self.b1.k_word_counts(7, overlapping=False),
                         expected)

    def test_k_word_frequencies(self):
        # overlapping = True
        expected = defaultdict(int)
        expected['A'] = 3/7.
        expected['C'] = 1/7.
        expected['G'] = 1/7.
        expected['T'] = 2/7.
        self.assertEqual(self.b1.k_word_frequencies(1, overlapping=True),
                         expected)
        expected = defaultdict(int)
        expected['GAT'] = 1/5.
        expected['ATT'] = 1/5.
        expected['TTA'] = 1/5.
        expected['TAC'] = 1/5.
        expected['ACA'] = 1/5.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = defaultdict(int)
        expected['GAT'] = 1/2.
        expected['TAC'] = 1/2.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=False),
                         expected)
        expected = defaultdict(int)
        expected['GATTACA'] = 1.0
        self.assertEqual(self.b1.k_word_frequencies(7, overlapping=False),
                         expected)
        # an empty sequence has no words, hence no frequencies
        expected = defaultdict(int)
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         expected)

    def test_len(self):
        self.assertEqual(len(self.b1), 7)
        self.assertEqual(len(self.b2), 9)
        self.assertEqual(len(self.b3), 4)

    def test_repr(self):
        self.assertEqual(repr(self.b1),
                         "<BiologicalSequence: GATTACA (length: 7)>")
        # the twelve-character fixture is shown as ten characters plus "..."
        self.assertEqual(repr(self.b6),
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        b1_reversed = reversed(self.b1)
        for actual, expected in zip(b1_reversed, "ACATTAG"):
            self.assertEqual(actual, expected)

        # the iterator must be exhausted after yielding every character
        self.assertRaises(StopIteration, lambda: next(b1_reversed))

    def test_str(self):
        self.assertEqual(str(self.b1), "GATTACA")
        self.assertEqual(str(self.b2), "ACCGGTACC")
        self.assertEqual(str(self.b3), "GREG")

    def test_alphabet(self):
        self.assertEqual(self.b1.alphabet(), set())

    def test_description(self):
        self.assertEqual(self.b1.description, "")
        self.assertEqual(self.b2.description, "A test sequence")
        self.assertEqual(self.b3.description, "A protein sequence")

    def test_gap_alphabet(self):
        self.assertEqual(self.b1.gap_alphabet(), set('-.'))

    def test_id(self):
        self.assertEqual(self.b1.id, "")
        self.assertEqual(self.b2.id, "test-seq-2")
        self.assertEqual(self.b3.id, "test-seq-3")

    def test_count(self):
        self.assertEqual(self.b1.count('A'), 3)
        self.assertEqual(self.b1.count('T'), 2)
        self.assertEqual(self.b1.count('TT'), 1)

    def test_degap(self):
        self.assertEqual(self.b1.degap(), self.b1)
        self.assertEqual(self.b7.degap(), BiologicalSequence(''))
        self.assertEqual(self.b8.degap(), BiologicalSequence('HELLO'))

    def test_distance(self):
        # note that test_hamming_distance covers default behavior more
        # extensively
        self.assertEqual(self.b1.distance(self.b1), 0.0)
        self.assertEqual(self.b1.distance(BiologicalSequence('GATTACC')), 1./7)

        def dumb_distance(x, y):
            return 42

        self.assertEqual(
            self.b1.distance(self.b1, distance_fn=dumb_distance), 42)

    def test_distance_unequal_length(self):
        # Hamming distance (default) requires that sequences are of equal
        # length
        with self.assertRaises(BiologicalSequenceError):
            self.b1.distance(self.b2)

        # alternate distance functions don't have that requirement (unless
        # it's implemented within the provided distance function)
        def dumb_distance(x, y):
            return 42
        self.assertEqual(
            self.b1.distance(self.b2, distance_fn=dumb_distance), 42)

    def test_fraction_diff(self):
        # assertAlmostEqual is required here: the trailing 5 is the number
        # of decimal places, whereas assertEqual would silently treat it as
        # the failure message
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)

    def test_fraction_same(self):
        self.assertAlmostEqual(self.b1.fraction_same(self.b1), 1., 5)
        self.assertAlmostEqual(
            self.b1.fraction_same(BiologicalSequence('GATTACC')), 6. / 7., 5)

    def test_gap_maps(self):
        # in sequence with no gaps, the gap_maps are identical
        self.assertEqual(self.b1.gap_maps(),
                         ([0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6]))
        # in sequence with all gaps, the map of degapped to gapped is the empty
        # list (bc its length is 0), and the map of gapped to degapped is all
        # None
        self.assertEqual(self.b7.gap_maps(),
                         ([], [None, None, None, None, None, None]))

        self.assertEqual(self.b8.gap_maps(),
                         ([0, 1, 8, 9, 10],
                          [0, 1, None, None, None, None, None, None, 2, 3, 4]))

        # example from the gap_maps doc string
        self.assertEqual(BiologicalSequence('-ACCGA-TA-').gap_maps(),
                         ([1, 2, 3, 4, 5, 7, 8],
                          [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        self.assertEqual(self.b1.gap_vector(),
                         [False] * len(self.b1))
        self.assertEqual(self.b7.gap_vector(),
                         [True] * len(self.b7))
        self.assertEqual(self.b8.gap_vector(),
                         [False, False, True, True, True, True,
                          True, True, False, False, False])

    def test_unsupported_characters(self):
        self.assertEqual(self.b1.unsupported_characters(), set('GATC'))
        self.assertEqual(self.b7.unsupported_characters(), set())

    def test_has_unsupported_characters(self):
        self.assertTrue(self.b1.has_unsupported_characters())
        self.assertFalse(self.b7.has_unsupported_characters())

    def test_index(self):
        """ index functions as expected """
        self.assertEqual(self.b1.index('G'), 0)
        self.assertEqual(self.b1.index('A'), 1)
        self.assertEqual(self.b1.index('AC'), 4)
        self.assertRaises(ValueError, self.b1.index, 'x')

    def test_is_gap(self):
        self.assertTrue(self.b1.is_gap('.'))
        self.assertTrue(self.b1.is_gap('-'))
        self.assertFalse(self.b1.is_gap('A'))
        self.assertFalse(self.b1.is_gap('x'))
        self.assertFalse(self.b1.is_gap(' '))
        self.assertFalse(self.b1.is_gap(''))

    def test_is_gapped(self):
        self.assertFalse(self.b1.is_gapped())
        self.assertFalse(self.b2.is_gapped())
        self.assertTrue(self.b7.is_gapped())
        self.assertTrue(self.b8.is_gapped())

    def test_is_valid(self):
        self.assertFalse(self.b1.is_valid())
        self.assertTrue(self.b7.is_valid())

    def test_to_fasta(self):
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        self.assertEqual(self.b1.to_fasta(terminal_character=""), ">\nGATTACA")
        self.assertEqual(self.b2.to_fasta(),
                         ">test-seq-2 A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b3.to_fasta(),
                         ">test-seq-3 A protein sequence\nGREG\n")
        self.assertEqual(self.b4.to_fasta(),
                         ">test-seq-4\nPRTEIN\n")
        self.assertEqual(self.b5.to_fasta(),
                         "> some description\nLLPRTEIN\n")

        # alt parameters
        self.assertEqual(self.b2.to_fasta(field_delimiter=":"),
                         ">test-seq-2:A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b2.to_fasta(terminal_character="!"),
                         ">test-seq-2 A test sequence\nACCGGTACC!")
        self.assertEqual(
            self.b2.to_fasta(field_delimiter=":", terminal_character="!"),
            ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('GAT.ACA-', id='x',
                                      description='42')
        self.assertEqual(b.upper(), expected)

    def test_lower(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('gat.aca-', id='x',
                                      description='42')
        self.assertEqual(b.lower(), expected)
Ejemplo n.º 20
0
def hamming_distance(s1, s2):
    """Return the distance between two sequences.

    Both arguments are wrapped in ``BiologicalSequence`` and the result of
    the first wrapper's ``distance`` method (called with no explicit
    distance function) is returned.
    NOTE(review): presumably that default is the Hamming distance, as the
    function name suggests -- confirm against ``BiologicalSequence``.
    """
    return BiologicalSequence(s1).distance(BiologicalSequence(s2))
Ejemplo n.º 21
0
 def test_getitem(self):
     # Each lookup result is compared against a freshly built
     # BiologicalSequence holding the expected characters.
     first_item = self.b1[0]
     self.assertEqual(first_item, BiologicalSequence('G'))

     full_slice = self.b1[:]
     self.assertEqual(full_slice, BiologicalSequence('GATTACA'))

     reversed_slice = self.b1[::-1]
     self.assertEqual(reversed_slice, BiologicalSequence('ACATTAG'))
Ejemplo n.º 22
0
class BiologicalSequenceTests(TestCase):
    """Unit tests for the generic ``BiologicalSequence`` class.

    ``setUp`` builds the shared fixtures: an ungapped sequence (``b1``),
    sequences carrying an id and/or a description (``b2``-``b5``), a
    twelve-character sequence (``b6``), an all-gap sequence (``b7``) and a
    partially gapped sequence (``b8``).
    """

    def setUp(self):
        self.b1 = BiologicalSequence('GATTACA')
        self.b2 = BiologicalSequence('ACCGGTACC',
                                     id="test-seq-2",
                                     description="A test sequence")
        self.b3 = BiologicalSequence('GREG',
                                     id="test-seq-3",
                                     description="A protein sequence")
        self.b4 = BiologicalSequence('PRTEIN', id="test-seq-4")
        self.b5 = BiologicalSequence('LLPRTEIN',
                                     description="some description")
        self.b6 = BiologicalSequence('ACGTACGTACGT')
        self.b7 = BiologicalSequence('..--..')
        self.b8 = BiologicalSequence('HE..--..LLO')

    def test_init_varied_input(self):
        # init as string
        b = BiologicalSequence('ACCGGXZY')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as string with optional values
        b = BiologicalSequence('ACCGGXZY', 'test-seq-1',
                               'The first test sequence')
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "test-seq-1")
        self.assertEqual(b.description, "The first test sequence")

        # test init as a different string
        b = BiologicalSequence('WRRTY')
        self.assertEqual(str(b), 'WRRTY')

        # init as list
        b = BiologicalSequence(list('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

        # init as tuple
        b = BiologicalSequence(tuple('ACCGGXZY'))
        self.assertEqual(str(b), 'ACCGGXZY')
        self.assertEqual(b.id, "")
        self.assertEqual(b.description, "")

    def test_init_with_validation(self):
        self.assertRaises(BiologicalSequenceError,
                          BiologicalSequence,
                          "ACC",
                          validate=True)
        # no error raised when only allowed characters are passed
        BiologicalSequence("..--..", validate=True)

    def test_contains(self):
        # membership is case sensitive
        self.assertIn('G', self.b1)
        self.assertNotIn('g', self.b1)

    def test_eq(self):
        # the == and != operators are spelled out on purpose so that both
        # comparison paths are exercised
        self.assertTrue(self.b1 == self.b1)
        self.assertTrue(self.b2 == self.b2)
        self.assertTrue(self.b3 == self.b3)

        self.assertTrue(self.b1 != self.b3)
        self.assertTrue(self.b1 != self.b2)
        self.assertTrue(self.b2 != self.b3)

        # identical sequences of the same type are equal, even if they have
        # different ids and/or descriptions
        self.assertTrue(
            BiologicalSequence('ACGT') == BiologicalSequence('ACGT'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a') ==
            BiologicalSequence('ACGT', id='b'))
        self.assertTrue(
            BiologicalSequence('ACGT', description='c') ==
            BiologicalSequence('ACGT', description='d'))
        self.assertTrue(
            BiologicalSequence('ACGT', id='a', description='c') ==
            BiologicalSequence('ACGT', id='b', description='d'))

        # different type causes sequences to not be equal
        self.assertFalse(
            BiologicalSequence('ACGT') == NucleotideSequence('ACGT'))

    def test_getitem(self):
        # single index, full slice and reversed slice
        self.assertEqual(self.b1[0], BiologicalSequence('G'))
        self.assertEqual(self.b1[:], BiologicalSequence('GATTACA'))
        self.assertEqual(self.b1[::-1], BiologicalSequence('ACATTAG'))

    def test_iter(self):
        b1_iter = iter(self.b1)
        for actual, expected in zip(b1_iter, "GATTACA"):
            self.assertEqual(actual, expected)

        # the iterator must be exhausted after yielding every character
        self.assertRaises(StopIteration, lambda: next(b1_iter))

    def test_k_words(self):
        # overlapping = True
        self.assertEqual(list(self.b1.k_words(1, overlapping=True)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=True)),
                         ['GA', 'AT', 'TT', 'TA', 'AC', 'CA'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=True)),
                         ['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=True)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=True)), [])

        # overlapping = False
        # (for k=1 the words are the same as in the overlapping case)
        self.assertEqual(list(self.b1.k_words(1, overlapping=False)),
                         ['G', 'A', 'T', 'T', 'A', 'C', 'A'])
        self.assertEqual(list(self.b1.k_words(2, overlapping=False)),
                         ['GA', 'TT', 'AC'])
        self.assertEqual(list(self.b1.k_words(3, overlapping=False)),
                         ['GAT', 'TAC'])
        self.assertEqual(list(self.b1.k_words(7, overlapping=False)),
                         ['GATTACA'])
        self.assertEqual(list(self.b1.k_words(8, overlapping=False)), [])

        # error on invalid k; list() is what consumes the result and
        # triggers the ValueError
        self.assertRaises(ValueError, list, self.b1.k_words(0))
        self.assertRaises(ValueError, list, self.b1.k_words(-42))

        # tests with different sequences
        self.assertEqual(list(self.b8.k_words(3, overlapping=False)),
                         ['HE.', '.--', '..L'])
        b = BiologicalSequence('')
        self.assertEqual(list(b.k_words(3)), [])

    def test_k_word_counts(self):
        # overlapping = True
        expected = Counter('GATTACA')
        self.assertEqual(self.b1.k_word_counts(1, overlapping=True), expected)
        expected = Counter(['GAT', 'ATT', 'TTA', 'TAC', 'ACA'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=True), expected)

        # overlapping = False
        expected = Counter(['GAT', 'TAC'])
        self.assertEqual(self.b1.k_word_counts(3, overlapping=False), expected)
        expected = Counter(['GATTACA'])
        self.assertEqual(self.b1.k_word_counts(7, overlapping=False), expected)

    def test_k_word_frequencies(self):
        # overlapping = True
        expected = defaultdict(int)
        expected['A'] = 3 / 7.
        expected['C'] = 1 / 7.
        expected['G'] = 1 / 7.
        expected['T'] = 2 / 7.
        self.assertEqual(self.b1.k_word_frequencies(1, overlapping=True),
                         expected)
        expected = defaultdict(int)
        expected['GAT'] = 1 / 5.
        expected['ATT'] = 1 / 5.
        expected['TTA'] = 1 / 5.
        expected['TAC'] = 1 / 5.
        expected['ACA'] = 1 / 5.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=True),
                         expected)

        # overlapping = False
        expected = defaultdict(int)
        expected['GAT'] = 1 / 2.
        expected['TAC'] = 1 / 2.
        self.assertEqual(self.b1.k_word_frequencies(3, overlapping=False),
                         expected)
        expected = defaultdict(int)
        expected['GATTACA'] = 1.0
        self.assertEqual(self.b1.k_word_frequencies(7, overlapping=False),
                         expected)
        # an empty sequence has no words, hence no frequencies
        expected = defaultdict(int)
        empty = BiologicalSequence('')
        self.assertEqual(empty.k_word_frequencies(1, overlapping=False),
                         expected)

    def test_len(self):
        self.assertEqual(len(self.b1), 7)
        self.assertEqual(len(self.b2), 9)
        self.assertEqual(len(self.b3), 4)

    def test_repr(self):
        self.assertEqual(repr(self.b1),
                         "<BiologicalSequence: GATTACA (length: 7)>")
        # the twelve-character fixture is shown as ten characters plus "..."
        self.assertEqual(repr(self.b6),
                         "<BiologicalSequence: ACGTACGTAC... (length: 12)>")

    def test_reversed(self):
        b1_reversed = reversed(self.b1)
        for actual, expected in zip(b1_reversed, "ACATTAG"):
            self.assertEqual(actual, expected)

        # the iterator must be exhausted after yielding every character
        self.assertRaises(StopIteration, lambda: next(b1_reversed))

    def test_str(self):
        self.assertEqual(str(self.b1), "GATTACA")
        self.assertEqual(str(self.b2), "ACCGGTACC")
        self.assertEqual(str(self.b3), "GREG")

    def test_alphabet(self):
        self.assertEqual(self.b1.alphabet(), set())

    def test_description(self):
        self.assertEqual(self.b1.description, "")
        self.assertEqual(self.b2.description, "A test sequence")
        self.assertEqual(self.b3.description, "A protein sequence")

    def test_gap_alphabet(self):
        self.assertEqual(self.b1.gap_alphabet(), set('-.'))

    def test_id(self):
        self.assertEqual(self.b1.id, "")
        self.assertEqual(self.b2.id, "test-seq-2")
        self.assertEqual(self.b3.id, "test-seq-3")

    def test_count(self):
        self.assertEqual(self.b1.count('A'), 3)
        self.assertEqual(self.b1.count('T'), 2)
        self.assertEqual(self.b1.count('TT'), 1)

    def test_degap(self):
        self.assertEqual(self.b1.degap(), self.b1)
        self.assertEqual(self.b7.degap(), BiologicalSequence(''))
        self.assertEqual(self.b8.degap(), BiologicalSequence('HELLO'))

    def test_distance(self):
        # note that test_hamming_distance covers default behavior more
        # extensively
        self.assertEqual(self.b1.distance(self.b1), 0.0)
        self.assertEqual(self.b1.distance(BiologicalSequence('GATTACC')),
                         1. / 7)

        def dumb_distance(x, y):
            return 42

        self.assertEqual(self.b1.distance(self.b1, distance_fn=dumb_distance),
                         42)

    def test_fraction_diff(self):
        # assertAlmostEqual is required here: the trailing 5 is the number
        # of decimal places, whereas assertEqual would silently treat it as
        # the failure message
        self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., 5)
        self.assertAlmostEqual(
            self.b1.fraction_diff(BiologicalSequence('GATTACC')), 1. / 7., 5)

    def test_fraction_same(self):
        self.assertAlmostEqual(self.b1.fraction_same(self.b1), 1., 5)
        self.assertAlmostEqual(
            self.b1.fraction_same(BiologicalSequence('GATTACC')), 6. / 7., 5)

    def test_gap_maps(self):
        # in sequence with no gaps, the gap_maps are identical
        self.assertEqual(self.b1.gap_maps(),
                         ([0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6]))
        # in sequence with all gaps, the map of degapped to gapped is the empty
        # list (bc its length is 0), and the map of gapped to degapped is all
        # None
        self.assertEqual(self.b7.gap_maps(),
                         ([], [None, None, None, None, None, None]))

        self.assertEqual(self.b8.gap_maps(),
                         ([0, 1, 8, 9, 10],
                          [0, 1, None, None, None, None, None, None, 2, 3, 4]))

        # example from the gap_maps doc string
        self.assertEqual(
            BiologicalSequence('-ACCGA-TA-').gap_maps(),
            ([1, 2, 3, 4, 5, 7, 8], [None, 0, 1, 2, 3, 4, None, 5, 6, None]))

    def test_gap_vector(self):
        self.assertEqual(self.b1.gap_vector(), [False] * len(self.b1))
        self.assertEqual(self.b7.gap_vector(), [True] * len(self.b7))
        self.assertEqual(self.b8.gap_vector(), [
            False, False, True, True, True, True, True, True, False, False,
            False
        ])

    def test_unsupported_characters(self):
        self.assertEqual(self.b1.unsupported_characters(), set('GATC'))
        self.assertEqual(self.b7.unsupported_characters(), set())

    def test_has_unsupported_characters(self):
        self.assertTrue(self.b1.has_unsupported_characters())
        self.assertFalse(self.b7.has_unsupported_characters())

    def test_index(self):
        """ index functions as expected """
        self.assertEqual(self.b1.index('G'), 0)
        self.assertEqual(self.b1.index('A'), 1)
        self.assertEqual(self.b1.index('AC'), 4)
        self.assertRaises(ValueError, self.b1.index, 'x')

    def test_is_gap(self):
        self.assertTrue(self.b1.is_gap('.'))
        self.assertTrue(self.b1.is_gap('-'))
        self.assertFalse(self.b1.is_gap('A'))
        self.assertFalse(self.b1.is_gap('x'))
        self.assertFalse(self.b1.is_gap(' '))
        self.assertFalse(self.b1.is_gap(''))

    def test_is_gapped(self):
        self.assertFalse(self.b1.is_gapped())
        self.assertFalse(self.b2.is_gapped())
        self.assertTrue(self.b7.is_gapped())
        self.assertTrue(self.b8.is_gapped())

    def test_is_valid(self):
        self.assertFalse(self.b1.is_valid())
        self.assertTrue(self.b7.is_valid())

    def test_to_fasta(self):
        self.assertEqual(self.b1.to_fasta(), ">\nGATTACA\n")
        self.assertEqual(self.b1.to_fasta(terminal_character=""), ">\nGATTACA")
        self.assertEqual(self.b2.to_fasta(),
                         ">test-seq-2 A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b3.to_fasta(),
                         ">test-seq-3 A protein sequence\nGREG\n")
        self.assertEqual(self.b4.to_fasta(), ">test-seq-4\nPRTEIN\n")
        self.assertEqual(self.b5.to_fasta(), "> some description\nLLPRTEIN\n")

        # alt parameters
        self.assertEqual(self.b2.to_fasta(field_delimiter=":"),
                         ">test-seq-2:A test sequence\nACCGGTACC\n")
        self.assertEqual(self.b2.to_fasta(terminal_character="!"),
                         ">test-seq-2 A test sequence\nACCGGTACC!")
        self.assertEqual(
            self.b2.to_fasta(field_delimiter=":", terminal_character="!"),
            ">test-seq-2:A test sequence\nACCGGTACC!")

    def test_upper(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('GAT.ACA-', id='x', description='42')
        self.assertEqual(b.upper(), expected)

    def test_lower(self):
        b = NucleotideSequence('GAt.ACa-', id='x', description='42')
        expected = NucleotideSequence('gat.aca-', id='x', description='42')
        self.assertEqual(b.lower(), expected)
Ejemplo n.º 23
0
 def setUp(self):
     # Build the eight sequence fixtures (b1-b8) used by the tests:
     # plain, with id and/or description, all-gap and partially gapped.
     new_seq = BiologicalSequence

     self.b1 = new_seq('GATTACA')
     self.b2 = new_seq(
         'ACCGGTACC', id="test-seq-2", description="A test sequence")
     self.b3 = new_seq(
         'GREG', id="test-seq-3", description="A protein sequence")
     self.b4 = new_seq('PRTEIN', id="test-seq-4")
     self.b5 = new_seq('LLPRTEIN', description="some description")
     self.b6 = new_seq('ACGTACGTACGT')
     self.b7 = new_seq('..--..')
     self.b8 = new_seq('HE..--..LLO')
Ejemplo n.º 24
0
 def test_degap(self):
     # b1: degapping must give a sequence equal to b1 itself
     degapped_b1 = self.b1.degap()
     self.assertEqual(degapped_b1, self.b1)

     # b7: expected to degap to the empty sequence
     degapped_b7 = self.b7.degap()
     self.assertEqual(degapped_b7, BiologicalSequence(''))

     # b8: expected to degap to 'HELLO'
     degapped_b8 = self.b8.degap()
     self.assertEqual(degapped_b8, BiologicalSequence('HELLO'))
Ejemplo n.º 25
0
def fasta_from_alignment(aln, make_seqlabel=None, line_wrap=None, sort=True):
    """Return a FASTA string built from an alignment object or a dict

    The sequence ids are collected from `aln` (optionally sorted), each id
    is looked up in `aln`, and the resulting list of sequences is handed to
    ``fasta_from_sequences`` together with `make_seqlabel` and `line_wrap`.

    Parameters
    ----------
    aln : Alignment, dict
        alignment, or dictionary where the keys are the sequence ids and
        the values are the sequences themselves. Values that are plain
        ``str`` objects are wrapped in a ``BiologicalSequence`` that uses
        the key as its id.
    make_seqlabel : function, optional
        callback function that takes the seq object and returns a label
        ``str``. If ``None`` is passed, the following attributes will try to be
        retrieved in this order and the first to exist will be used:
        ``id``, ``Label`` or ``Name``. In any other case an integer
        with the position of the sequence object will be used.
    line_wrap : int, optional
        an integer for maximum line width, if ``None`` is passed the
        full sequence will be used.
    sort : bool, optional
        Whether or not the sequences should be sorted by their sequence
        id, default value is ``True``.

    Returns
    -------
    str
        FASTA formatted string composed of the sequences found in `aln`.

    See Also
    --------
    skbio.parse.sequences.parse_fasta
    skbio.core.alignment.Alignment

    Examples
    --------
    Formatting a sequence alignment object into a FASTA file.

    >>> from skbio.core.alignment import Alignment
    >>> from skbio.core.sequence import DNA
    >>> from skbio.format.sequences import fasta_from_alignment
    >>> seqs = [DNA("ACC--G-GGTA..", id="seq1"),
    ...         DNA("TCC--G-GGCA..", id="seqs2")]
    >>> a1 = Alignment(seqs)
    >>> print(fasta_from_alignment(a1))
    >seq1
    ACC--G-GGTA..
    >seqs2
    TCC--G-GGCA..

    """
    # An Alignment exposes its ids through ids(); anything else is treated
    # as a mapping of id -> sequence.
    seq_ids = aln.ids() if isinstance(aln, Alignment) else aln.keys()
    if sort:
        seq_ids = sorted(seq_ids)

    def _as_sequence(seq_id):
        # Promote raw strings to sequence objects labelled with their key;
        # everything else is passed through untouched.
        entry = aln[seq_id]
        if isinstance(entry, str):
            return BiologicalSequence(entry, seq_id)
        return entry

    return fasta_from_sequences([_as_sequence(seq_id) for seq_id in seq_ids],
                                make_seqlabel=make_seqlabel,
                                line_wrap=line_wrap)
Ejemplo n.º 26
0
 def test_fraction_same(self):
     """fraction_same() is 1 against itself and 6/7 against 'GATTACC'."""
     against_self = self.b1.fraction_same(self.b1)
     self.assertAlmostEqual(against_self, 1., places=5)

     against_other = self.b1.fraction_same(BiologicalSequence('GATTACC'))
     self.assertAlmostEqual(against_other, 6. / 7., places=5)
Ejemplo n.º 27
0
 def test_fraction_diff(self):
     """fraction_diff() is 0 against itself and 1/7 against 'GATTACC'.

     The results are floats, so they are compared to 5 decimal places with
     assertAlmostEqual. The third positional argument of assertEqual is the
     failure message, not a precision, so it cannot be used for this.
     """
     self.assertAlmostEqual(self.b1.fraction_diff(self.b1), 0., places=5)
     self.assertAlmostEqual(
         self.b1.fraction_diff(BiologicalSequence('GATTACC')),
         1. / 7., places=5)