Example #1
0
 def test_keys_setter_non_empty(self):
     """``keys`` can be set on a keyless MSA and then replaced."""
     alignment = TabularMSA([DNA('AC'), DNA('AG'), DNA('AT')])
     self.assertFalse(alignment.has_keys())

     # Each assignment must fully replace whatever keys were there before.
     assignments = (
         (range(3), [0, 1, 2]),
         (range(3, 6), [3, 4, 5]),
     )
     for new_keys, expected in assignments:
         alignment.keys = new_keys
         npt.assert_array_equal(alignment.keys, np.array(expected))
Example #2
0
    def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
        """Toggling ``penalize_terminal_gaps`` changes alignment and score.

        One sequence is about 3x the length of the other, and the same pair
        is aligned twice with only ``penalize_terminal_gaps`` differing.
        """
        short_seq = DNA("ACCGTGGACCGTTAGGATTGGACCCAAGGTTG")
        long_seq = DNA("T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25)
        scoring = dict(gap_open_penalty=5., gap_extend_penalty=0.5,
                       match_score=5, mismatch_score=-4)

        def expected_msa(aligned_short):
            # The longer sequence is expected to align identically in both
            # runs; only the shorter sequence's gap placement differs.
            return TabularMSA([
                DNA(aligned_short),
                DNA("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
                    "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")])

        # Terminal gaps not penalized.
        aligned, score, _ = global_pairwise_align_nucleotide(
            short_seq, long_seq, penalize_terminal_gaps=False, **scoring)
        self.assertEqual(
            aligned,
            expected_msa("-------------------------ACCGTGGACCGTTAGGA"
                         "TTGGACCCAAGGTTG-------------------------"))
        self.assertEqual(score, 131.0)

        # Terminal gaps penalized.
        aligned, score, _ = global_pairwise_align_nucleotide(
            short_seq, long_seq, penalize_terminal_gaps=True, **scoring)
        self.assertEqual(
            aligned,
            expected_msa("-------------------------ACCGTGGACCGTTAGGA"
                         "TTGGACCCAAGGTT-------------------------G"))
        self.assertEqual(score, 97.0)
Example #3
0
 def test_keys_setter_non_empty(self):
     """Keys assigned to a non-empty, keyless MSA are stored and replaceable."""
     seqs = [DNA('AC'), DNA('AG'), DNA('AT')]
     msa = TabularMSA(seqs)
     self.assertFalse(msa.has_keys())

     first_keys, second_keys = range(3), range(3, 6)

     msa.keys = first_keys
     npt.assert_array_equal(msa.keys, np.array([0, 1, 2]))

     # A second assignment overwrites the first set of keys.
     msa.keys = second_keys
     npt.assert_array_equal(msa.keys, np.array([3, 4, 5]))
Example #4
0
    def test_constructor_not_monomorphic(self):
        """Sequences of mixed types are rejected with a TypeError."""
        cases = (
            ('mixed types.*RNA.*DNA',
             [DNA(''), RNA('')]),
            ('mixed types.*float.*Protein',
             [Protein(''), Protein(''), 42.0, Protein('')]),
        )
        for pattern, mixed in cases:
            with six.assertRaisesRegex(self, TypeError, pattern):
                TabularMSA(mixed)
Example #5
0
 def test_to_dict_non_empty(self):
     """``to_dict`` maps each key to its sequence object."""
     first = Protein('PAW', metadata={'id': 42})
     second = Protein('WAP', metadata={'id': -999})
     msa = TabularMSA([first, second], key='id')

     observed = msa.to_dict()
     self.assertEqual(observed, {42: first, -999: second})
Example #6
0
 def test_sort_on_unorderable_msa_keys(self):
     """Sorting unorderable keys raises TypeError and leaves the MSA as-is."""
     bad_key = Unorderable()

     def build():
         return TabularMSA([DNA('AAA'), DNA('ACG')], keys=[42, bad_key])

     msa = build()
     self.assertRaises(TypeError, msa.sort)
     # The failed sort must not have altered the alignment.
     self.assertEqual(msa, build())
Example #7
0
    def test_constructor_invalid_dtype(self):
        """Elements of an unsupported type are rejected with a TypeError."""
        cases = (
            ('sequence.*alphabet.*Sequence', [Sequence('')]),
            ('sequence.*alphabet.*int', [42, DNA('')]),
        )
        for pattern, invalid in cases:
            with six.assertRaisesRegex(self, TypeError, pattern):
                TabularMSA(invalid)
Example #8
0
 def test_sort_on_invalid_key(self):
     """Sorting on a missing metadata key raises KeyError, MSA unchanged."""
     msa = TabularMSA([DNA('AAA'), DNA('ACG')], keys=[42, 43])
     self.assertRaises(KeyError, msa.sort, key='id')

     untouched = TabularMSA([DNA('AAA'), DNA('ACG')], keys=[42, 43])
     self.assertEqual(msa, untouched)
Example #9
0
    def test_sort_multiple_sequences_no_msa_keys_on_metadata_key(self):
        """Sort a keyless MSA by the ``'id'`` metadata key, both directions."""
        def build(rows):
            return TabularMSA(
                [RNA(residues, metadata={'id': ident})
                 for residues, ident in rows])

        unsorted = [('UCA', 41), ('AAA', 44), ('GAC', -1), ('GAC', 42)]
        ascending = [('GAC', -1), ('UCA', 41), ('GAC', 42), ('AAA', 44)]
        descending = ascending[::-1]

        msa = build(unsorted)
        msa.sort(key='id')
        self.assertEqual(msa, build(ascending))

        msa = build(unsorted)
        msa.sort(key='id', reverse=True)
        self.assertEqual(msa, build(descending))
Example #10
0
    def test_eq_handles_missing_metadata_efficiently(self):
        """Comparing two MSAs leaves ``_metadata`` unset on both of them."""
        left, right = TabularMSA([DNA('ACGT')]), TabularMSA([DNA('ACGT')])
        self.assertReallyEqual(left, right)

        for operand in (left, right):
            self.assertIsNone(operand._metadata)
Example #11
0
    def test_sort_multiple_sequences_with_msa_keys_on_metadata_key(self):
        """Sorting by the ``'#'`` metadata key reorders the MSA keys too."""
        # Each row is (residues, '#' metadata value, MSA key).
        tca = ('TCA', 41.2, None)
        aaa = ('AAA', 44.5, ('hello', 'world'))
        gac = ('GAC', 42.999, True)

        def build(rows):
            seqs = [DNA(residues, metadata={'#': number})
                    for residues, number, _ in rows]
            return TabularMSA(seqs, keys=[key for _, _, key in rows])

        msa = build([tca, aaa, gac])
        msa.sort(key='#')
        self.assertEqual(msa, build([tca, gac, aaa]))

        msa = build([tca, aaa, gac])
        msa.sort(key='#', reverse=True)
        self.assertEqual(msa, build([aaa, gac, tca]))
Example #12
0
    def test_local_pairwise_align_protein(self):
        """Check local protein alignments, scores, spans and type errors."""
        residues1, residues2 = "HEAGAWGHEE", "PAWHEAE"
        high_penalties = dict(gap_open_penalty=10., gap_extend_penalty=5.)
        low_penalties = dict(gap_open_penalty=5., gap_extend_penalty=0.5)

        # Higher gap penalties
        msa, score, start_end = local_pairwise_align_protein(
            Protein(residues1), Protein(residues2), **high_penalties)
        self.assertEqual(
            msa, TabularMSA([Protein("AWGHE"), Protein("AW-HE")]))
        self.assertEqual(score, 26.0)
        self.assertEqual(start_end, [(4, 8), (1, 4)])

        # Lower gap penalties
        msa, score, start_end = local_pairwise_align_protein(
            Protein(residues1), Protein(residues2), **low_penalties)
        self.assertEqual(
            msa, TabularMSA([Protein("AWGHE-E"), Protein("AW-HEAE")]))
        self.assertEqual(score, 32.0)
        self.assertEqual(start_end, [(4, 9), (1, 6)])

        # Protein sequences with metadata
        msa, score, start_end = local_pairwise_align_protein(
            Protein(residues1, metadata={'id': "s1"}),
            Protein(residues2, metadata={'id': "s2"}),
            **high_penalties)
        self.assertEqual(
            msa, TabularMSA([Protein("AWGHE"), Protein("AW-HE")]))
        self.assertEqual(score, 26.0)
        self.assertEqual(start_end, [(4, 8), (1, 4)])

        # Fails when either input is passed as a TabularMSA
        msa_first = TabularMSA([Protein(residues1, metadata={'id': "s1"})])
        seq_second = Protein(residues2, metadata={'id': "s2"})
        with self.assertRaises(TypeError):
            local_pairwise_align_protein(
                msa_first, seq_second, **high_penalties)

        seq_first = Protein(residues1, metadata={'id': "s1"})
        msa_second = TabularMSA([Protein(residues2, metadata={'id': "s2"})])
        with self.assertRaises(TypeError):
            local_pairwise_align_protein(
                seq_first, msa_second, **high_penalties)

        # TypeError on invalid input
        valid = Protein(residues1)
        with self.assertRaises(TypeError):
            local_pairwise_align_protein(42, valid)

        valid = Protein(residues1)
        with self.assertRaises(TypeError):
            local_pairwise_align_protein(valid, 42)
Example #13
0
    def test_sort_single_sequence_with_msa_keys_on_callable_key(self):
        """Sorting a one-sequence keyed MSA with ``key=str`` is a no-op."""
        # Run once with the default direction and once reversed.
        for extra in ({}, {'reverse': True}):
            msa = TabularMSA([RNA('UCA')], keys=['foo'])
            msa.sort(key=str, **extra)
            self.assertEqual(msa, TabularMSA([RNA('UCA')], keys=['foo']))
Example #14
0
    def test_sort_empty_on_msa_keys(self):
        """Sorting an empty keyed MSA on its keys leaves it empty."""
        # Run once with the default direction and once reversed.
        for extra in ({}, {'reverse': True}):
            msa = TabularMSA([], keys=[])
            msa.sort(**extra)
            self.assertEqual(msa, TabularMSA([], keys=[]))
Example #15
0
 def test_sort_on_unorderable_msa_keys(self):
     """A TypeError from sorting unorderable keys must not modify the MSA."""
     unsortable = Unorderable()
     seqs = [DNA('AAA'), DNA('ACG')]
     msa = TabularMSA(seqs, keys=[42, unsortable])

     self.assertRaises(TypeError, msa.sort)

     reference = TabularMSA([DNA('AAA'), DNA('ACG')], keys=[42, unsortable])
     self.assertEqual(msa, reference)
Example #16
0
    def test_sort_single_sequence_on_msa_keys(self):
        """Sorting a one-sequence MSA on its keys leaves it unchanged."""
        # Run once with the default direction and once reversed.
        for extra in ({}, {'reverse': True}):
            msa = TabularMSA([DNA('ACGT')], keys=[42])
            msa.sort(**extra)
            self.assertEqual(msa, TabularMSA([DNA('ACGT')], keys=[42]))
Example #17
0
    def test_sort_empty_with_msa_keys_on_metadata_key(self):
        """Sorting an empty keyed MSA by ``'id'`` leaves it empty."""
        # Run once with the default direction and once reversed.
        for extra in ({}, {'reverse': True}):
            msa = TabularMSA([], keys=[])
            msa.sort(key='id', **extra)
            self.assertEqual(msa, TabularMSA([], keys=[]))
Example #18
0
    def test_global_pairwise_align_dtype_mismatch(self):
        """Aligning DNA against RNA raises a TypeError naming both dtypes."""
        pattern = r"same dtype: 'DNA' != 'RNA'"

        # Sequence vs. alignment
        with self.assertRaisesRegex(TypeError, pattern):
            global_pairwise_align(
                DNA('ACGT'), TabularMSA([RNA('ACGU')]), 1.0, 1.0, {})

        # Alignment vs. alignment
        with self.assertRaisesRegex(TypeError, pattern):
            global_pairwise_align(
                TabularMSA([DNA('ACGT')]), TabularMSA([RNA('ACGU')]),
                1.0, 1.0, {})
Example #19
0
    def test_sort_single_sequence_no_msa_keys_on_metadata_key(self):
        """Sorting a one-sequence keyless MSA by ``'id'`` changes nothing."""
        def fresh():
            return TabularMSA([RNA('UCA', metadata={'id': 42})])

        # Run once with the default direction and once reversed.
        for extra in ({}, {'reverse': True}):
            msa = fresh()
            msa.sort(key='id', **extra)
            self.assertEqual(msa, fresh())
Example #20
0
    def test_sort_multiple_sequences_no_msa_keys_on_callable_key(self):
        """Sort a keyless MSA with ``key=str`` in both directions."""
        def build(*residues):
            return TabularMSA([RNA(r) for r in residues])

        msa = build('UCC', 'UCG', 'UCA')
        msa.sort(key=str)
        self.assertEqual(msa, build('UCA', 'UCC', 'UCG'))

        msa = build('UCC', 'UCG', 'UCA')
        msa.sort(key=str, reverse=True)
        self.assertEqual(msa, build('UCG', 'UCC', 'UCA'))
Example #21
0
    def test_sort_empty_with_msa_keys_on_callable_key(self):
        """Sorting an empty keyed MSA with ``key=str`` leaves it empty."""
        # Run once with the default direction and once reversed.
        for extra in ({}, {'reverse': True}):
            msa = TabularMSA([], keys=[])
            msa.sort(key=str, **extra)
            self.assertEqual(msa, TabularMSA([], keys=[]))
Example #22
0
 def test_compute_score_and_traceback_matrices_invalid(self):
     """A character missing from the substitution matrix is a ValueError.

     If the sequence contains a character that is not in the substitution
     matrix, an informative error should be raised.
     """
     sub_matrix = make_identity_substitution_matrix(2, -1)
     aln1 = TabularMSA([DNA('AWG', metadata={'id': 'id'})])
     aln2 = TabularMSA([DNA('ACGT', metadata={'id': 'id'})])

     with self.assertRaises(ValueError):
         _compute_score_and_traceback_matrices(aln1, aln2, 5, 2, sub_matrix)
Example #23
0
    def test_global_pairwise_align_nucleotide(self):
        """Check global nucleotide alignments, scores, spans and errors."""
        residues1, residues2 = "GACCTTGACCAGGTACC", "GAACTTTGACGTAAC"
        # Scoring shared by every call; only gap_open_penalty varies.
        shared = dict(gap_extend_penalty=0.5, match_score=5,
                      mismatch_score=-4)

        # gap_open_penalty=5
        msa, score, start_end = global_pairwise_align_nucleotide(
            DNA(residues1), DNA(residues2), gap_open_penalty=5., **shared)
        self.assertEqual(msa, TabularMSA([DNA("G-ACCTTGACCAGGTACC"),
                                          DNA("GAACTTTGAC---GTAAC")]))
        self.assertEqual(score, 41.0)
        self.assertEqual(start_end, [(0, 16), (0, 14)])

        # gap_open_penalty=10
        msa, score, start_end = global_pairwise_align_nucleotide(
            DNA(residues1), DNA(residues2), gap_open_penalty=10., **shared)
        self.assertEqual(msa, TabularMSA([DNA("-GACCTTGACCAGGTACC"),
                                          DNA("GAACTTTGAC---GTAAC")]))
        self.assertEqual(score, 32.0)
        self.assertEqual(start_end, [(0, 16), (0, 14)])

        # DNA sequences with metadata
        msa, score, start_end = global_pairwise_align_nucleotide(
            DNA(residues1, metadata={'id': "s1"}),
            DNA(residues2, metadata={'id': "s2"}),
            gap_open_penalty=10., **shared)
        expected = TabularMSA([
            DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
            DNA("GAACTTTGAC---GTAAC", metadata={'id': "s2"})])
        self.assertEqual(msa, expected)
        self.assertEqual(score, 32.0)
        self.assertEqual(start_end, [(0, 16), (0, 14)])

        # Align one DNA sequence and one TabularMSA, score computed manually
        profile = TabularMSA([
            DNA(residues1, metadata={'id': "s1"}),
            DNA("GACCATGACCAGGTACC", metadata={'id': "s2"})])
        msa, score, start_end = global_pairwise_align_nucleotide(
            profile, DNA(residues2, metadata={'id': "s3"}),
            gap_open_penalty=10., **shared)
        expected = TabularMSA([
            DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
            DNA("-GACCATGACCAGGTACC", metadata={'id': "s2"}),
            DNA("GAACTTTGAC---GTAAC", metadata={'id': "s3"})])
        self.assertEqual(msa, expected)
        self.assertEqual(score, 27.5)
        self.assertEqual(start_end, [(0, 16), (0, 14)])

        # TypeError on invalid input
        valid = DNA("ACGT")
        with self.assertRaises(TypeError):
            global_pairwise_align_nucleotide(42, valid)

        valid = DNA("ACGT")
        with self.assertRaises(TypeError):
            global_pairwise_align_nucleotide(valid, 42)
Example #24
0
    def test_metadata_setter_invalid_type(self):
        """Non-dict metadata is rejected and the old metadata is kept."""
        msa = TabularMSA([Protein('PAW')], metadata={123: 456})
        not_dicts = (None, 0, 'a', ('f', 'o', 'o'), np.array([]),
                     pd.DataFrame())

        for candidate in not_dicts:
            with six.assertRaisesRegex(self, TypeError,
                                       'metadata must be a dict'):
                msa.metadata = candidate
            # The failed assignment must not have replaced the metadata.
            self.assertEqual(msa.metadata, {123: 456})
Example #25
0
    def test_dtype(self):
        """``dtype`` is None when empty, else the sequence class; read-only."""
        self.assertIsNone(TabularMSA([]).dtype)

        protein_msa = TabularMSA([Protein('')])
        self.assertIs(protein_msa.dtype, Protein)

        # Neither assignment nor deletion of the attribute is allowed.
        with self.assertRaises(AttributeError):
            setattr(TabularMSA([]), 'dtype', DNA)

        with self.assertRaises(AttributeError):
            delattr(TabularMSA([]), 'dtype')
Example #26
0
    def test_iter(self):
        """Iterating an MSA yields its sequences in order."""
        # An empty MSA is exhausted immediately.
        with self.assertRaises(StopIteration):
            next(iter(TabularMSA([])))

        empty_seqs = [DNA(''), DNA('')]
        observed = list(iter(TabularMSA(empty_seqs)))
        self.assertEqual(observed, empty_seqs)

        filled_seqs = [DNA('AAA'), DNA('GCT')]
        observed = list(iter(TabularMSA(filled_seqs)))
        self.assertEqual(observed, filled_seqs)
Example #27
0
    def test_reversed(self):
        """``reversed`` on an MSA yields its sequences last-to-first."""
        # An empty MSA is exhausted immediately.
        with self.assertRaises(StopIteration):
            next(reversed(TabularMSA([])))

        empty_seqs = [DNA(''), DNA('', metadata={'id': 42})]
        observed = list(reversed(TabularMSA(empty_seqs)))
        self.assertEqual(observed, list(reversed(empty_seqs)))

        filled_seqs = [DNA('AAA'), DNA('GCT')]
        observed = list(reversed(TabularMSA(filled_seqs)))
        self.assertEqual(observed, list(reversed(filled_seqs)))
Example #28
0
 def test_constructor_no_metadata(self):
     """An MSA built without ``metadata`` reports ``has_metadata()`` False.

     This holds even when the contained sequences carry their own metadata.
     """
     no_seqs = TabularMSA([])
     self.assertFalse(no_seqs.has_metadata())

     one_seq = TabularMSA([DNA('', metadata={'id': 42})])
     self.assertFalse(one_seq.has_metadata())

     two_seqs = TabularMSA([
         DNA('AGC', metadata={'id': 42}),
         DNA('---', metadata={'id': 43})
     ])
     self.assertFalse(two_seqs.has_metadata())
Example #29
0
    def test_metadata_setter_invalid_type(self):
        """Assigning anything but a dict to ``metadata`` raises TypeError."""
        msa = TabularMSA([Protein('PAW')], metadata={123: 456})

        def assert_rejected(value):
            with six.assertRaisesRegex(self, TypeError,
                                       'metadata must be a dict'):
                msa.metadata = value
            # Existing metadata survives the rejected assignment.
            self.assertEqual(msa.metadata, {123: 456})

        for invalid in (None, 0, 'a', ('f', 'o', 'o'), np.array([]),
                        pd.DataFrame()):
            assert_rejected(invalid)
Example #30
0
 def test_init_matrices_sw(self):
     """``_init_matrices_sw`` yields the expected 5x4 starting matrices."""
     aln1 = TabularMSA([DNA('AAA', metadata={'id': 'id'})])
     aln2 = TabularMSA([DNA('AAAA', metadata={'id': 'id'})])

     score_exp = np.zeros((5, 4))
     # First row and first column hold 0; every other cell holds -1.
     tback_exp = [[0, 0, 0, 0]] + [[0, -1, -1, -1] for _ in range(4)]

     score_obs, tback_obs = _init_matrices_sw(aln1, aln2, 5, 2)
     np.testing.assert_array_equal(score_obs, score_exp)
     np.testing.assert_array_equal(tback_obs, tback_exp)
Example #31
0
    def test_reindex_key_and_keys_both_provided(self):
        """``reindex`` rejects ``key`` plus ``keys`` and changes nothing."""
        msa = TabularMSA([DNA('ACGT'), DNA('TGCA')], key=str)
        initial_keys = np.array(['ACGT', 'TGCA'])

        def check_keys():
            npt.assert_array_equal(msa.keys, initial_keys)

        check_keys()

        with six.assertRaisesRegex(self, ValueError, 'both.*key.*keys'):
            msa.reindex(key=str, keys=['a', 'b'])

        # original state is maintained
        check_keys()
Example #32
0
    def test_reindex_key_and_keys_both_provided(self):
        """Passing both ``key`` and ``keys`` to ``reindex`` is an error."""
        seqs = [DNA('ACGT'), DNA('TGCA')]
        msa = TabularMSA(seqs, key=str)
        keys_before = np.array(['ACGT', 'TGCA'])
        npt.assert_array_equal(msa.keys, keys_before)

        conflicting = dict(key=str, keys=['a', 'b'])
        with six.assertRaisesRegex(self, ValueError, 'both.*key.*keys'):
            msa.reindex(**conflicting)

        # The rejected call leaves the existing keys in place.
        npt.assert_array_equal(msa.keys, keys_before)
Example #33
0
    def test_reindex_makes_copy_of_keys(self):
        """Mutating the array given to ``reindex`` does not affect the MSA."""
        msa = TabularMSA([DNA('AC'), DNA('AG'), DNA('AT')])
        supplied = np.asarray([1, 2, 3])
        msa.reindex(keys=supplied)
        npt.assert_array_equal(msa.keys, np.array([1, 2, 3]))

        # The MSA's keys are read-only; the caller's array stays writable.
        self.assertFalse(msa.keys.flags.writeable)
        self.assertTrue(supplied.flags.writeable)

        supplied[1] = 42
        npt.assert_array_equal(msa.keys, np.array([1, 2, 3]))
Example #34
0
    def test_reindex_makes_copy_of_keys(self):
        """``reindex`` stores its own copy of the ``keys`` array."""
        def assert_msa_keys_are_1_2_3():
            npt.assert_array_equal(msa.keys, np.array([1, 2, 3]))

        caller_keys = np.asarray([1, 2, 3])
        msa = TabularMSA([DNA('AC'), DNA('AG'), DNA('AT')])
        msa.reindex(keys=caller_keys)
        assert_msa_keys_are_1_2_3()

        self.assertFalse(msa.keys.flags.writeable)
        self.assertTrue(caller_keys.flags.writeable)

        # Changing the caller's array must not leak into the MSA.
        caller_keys[1] = 42
        assert_msa_keys_are_1_2_3()
Example #35
0
    def test_has_metadata(self):
        """``has_metadata`` is True only for non-empty metadata."""
        bare = TabularMSA([])
        self.assertFalse(bare.has_metadata())
        # Handles metadata efficiently.
        self.assertIsNone(bare._metadata)

        empty_md = TabularMSA([], metadata={})
        self.assertFalse(empty_md.has_metadata())

        for populated in ({'': ''}, {'foo': 42}):
            self.assertTrue(TabularMSA([], metadata=populated).has_metadata())
Example #36
0
    def test_keys_setter_non_hashable_keys(self):
        """Unhashable keys raise TypeError and the old keys are kept."""
        msa = TabularMSA([DNA('ACGT'), DNA('TGCA')], key=str)
        initial_keys = np.array(['ACGT', 'TGCA'])

        def check_keys():
            npt.assert_array_equal(msa.keys, initial_keys)

        check_keys()

        with self.assertRaises(TypeError):
            msa.keys = [[42], [42]]

        # original state is maintained
        check_keys()
Example #37
0
 def test_init_matrices_nw(self):
     """``_init_matrices_nw`` yields the expected 5x4 starting matrices."""
     aln1 = TabularMSA([DNA('AAA', metadata={'id': 'id'})])
     aln2 = TabularMSA([DNA('AAAA', metadata={'id': 'id'})])

     # Only the first row and first column carry non-zero scores.
     score_exp = [[0, -5, -7, -9]] + [
         [edge, 0, 0, 0] for edge in (-5, -7, -9, -11)]
     # Traceback codes: 3 along the first row, 2 down the first column,
     # -1 everywhere else, and 0 in the origin cell.
     tback_exp = [[0, 3, 3, 3]] + [[2, -1, -1, -1] for _ in range(4)]

     score_obs, tback_obs = _init_matrices_nw(aln1, aln2, 5, 2)
     np.testing.assert_array_equal(score_obs, score_exp)
     np.testing.assert_array_equal(tback_obs, tback_exp)
Example #38
0
    def test_keys_setter_non_hashable_keys(self):
        """Setting list-valued (unhashable) keys fails without side effects."""
        seqs = [DNA('ACGT'), DNA('TGCA')]
        msa = TabularMSA(seqs, key=str)
        keys_before = np.array(['ACGT', 'TGCA'])
        npt.assert_array_equal(msa.keys, keys_before)

        unhashable = [[42], [42]]
        with self.assertRaises(TypeError):
            msa.keys = unhashable

        # The rejected assignment leaves the existing keys in place.
        npt.assert_array_equal(msa.keys, keys_before)
Example #39
0
    def test_keys_setter_non_unique_keys(self):
        """Duplicate keys raise UniqueError and the old keys are kept."""
        msa = TabularMSA([DNA('ACGT'), DNA('TGCA')], key=str)
        initial_keys = np.array(['ACGT', 'TGCA'])

        def check_keys():
            npt.assert_array_equal(msa.keys, initial_keys)

        check_keys()

        duplicated = [42, 42]
        with six.assertRaisesRegex(self, UniqueError, 'Duplicate keys:.*42'):
            msa.keys = duplicated

        # original state is maintained
        check_keys()
Example #40
0
    def test_from_dict_to_dict_roundtrip(self):
        """``from_dict`` followed by ``to_dict`` reproduces the input dict."""
        empty = {}
        self.assertEqual(TabularMSA.from_dict(empty).to_dict(), empty)

        # can roundtrip even with mixed key types
        original = {'a': DNA('CAT'), 42: DNA('TAG')}
        roundtripped = TabularMSA.from_dict(original).to_dict()
        self.assertEqual(roundtripped, original)
        # The very same sequence objects come back, not copies.
        for key in ('a', 42):
            self.assertIs(original[key], roundtripped[key])
Example #41
0
    def test_reindex_keys_length_mismatch(self):
        """Too few ``keys`` raise ValueError and the old keys are kept."""
        msa = TabularMSA([DNA('ACGT'), DNA('TGCA')], key=str)
        initial_keys = np.array(['ACGT', 'TGCA'])

        def check_keys():
            npt.assert_array_equal(msa.keys, initial_keys)

        check_keys()

        mismatch_pattern = 'Number.*keys.*number.*sequences: 0 != 2'
        with six.assertRaisesRegex(self, ValueError, mismatch_pattern):
            msa.reindex(keys=iter([]))

        # original state is maintained
        check_keys()
Example #42
0
 def test_sort_on_unorderable_key(self):
     """Unorderable ``'id'`` values raise TypeError; the MSA is unchanged."""
     bad_value = Unorderable()

     def build():
         return TabularMSA(
             [DNA('AAA', metadata={'id': 42}),
              DNA('ACG', metadata={'id': bad_value})],
             keys=[42, 43])

     msa = build()
     self.assertRaises(TypeError, msa.sort, key='id')
     # The failed sort must not have altered the alignment.
     self.assertEqual(msa, build())
Example #43
0
 def test_sort_on_key_with_all_repeats(self):
     """Sorting with ``key=str`` on identical sequences keeps their order."""
     def build():
         seqs = [DNA('TTT', metadata={'id': label}) for label in 'abc']
         return TabularMSA(seqs, keys=range(3))

     msa = build()
     msa.sort(key=str)
     self.assertEqual(msa, build())
Example #44
0
    def test_metadata_getter(self):
        """``metadata`` returns a dict that can be mutated in place."""
        bare = TabularMSA([])
        # ``_metadata`` is only populated once ``metadata`` has been read.
        self.assertIsNone(bare._metadata)
        self.assertEqual(bare.metadata, {})
        self.assertIsNotNone(bare._metadata)
        self.assertIsInstance(bare.metadata, dict)

        tuple_key = ('hello', 'world')
        populated = TabularMSA([], metadata={42: 'foo', tuple_key: 43})
        self.assertEqual(populated.metadata, {42: 'foo', tuple_key: 43})
        self.assertIsInstance(populated.metadata, dict)

        # Writing through the getter's return value updates the MSA.
        populated.metadata[42] = 'bar'
        self.assertEqual(populated.metadata, {42: 'bar', tuple_key: 43})
Example #45
0
def filter_positions(alignment_fh, maximum_gap_frequency,
                     maximum_position_entropy):
    """Read an alignment, then drop gappy and high-entropy positions.

    The alignment is first parsed as DNA. If that raises ``ValueError``
    the handle is rewound and parsed as RNA instead. ``alignment_fh`` is
    used as a context manager around the read.

    Parameters
    ----------
    alignment_fh
        Seekable file handle that ``TabularMSA.read`` can consume.
    maximum_gap_frequency
        Threshold forwarded to ``_filter_gap_positions``.
    maximum_position_entropy
        Threshold forwarded to ``_filter_high_entropy_positions``.

    Returns
    -------
    The alignment returned by ``_filter_high_entropy_positions``.
    """
    def _read_as(constructor):
        return TabularMSA.read(alignment_fh, constructor=constructor)

    with alignment_fh:
        try:
            msa = _read_as(DNA)
        except ValueError:
            # Reading as DNA failed; rewind and retry as RNA.
            alignment_fh.seek(0)
            msa = _read_as(RNA)

    gap_filtered = _filter_gap_positions(msa, maximum_gap_frequency)
    return _filter_high_entropy_positions(gap_filtered,
                                          maximum_position_entropy)
Example #46
0
    def test_metadata_setter_makes_shallow_copy(self):
        """The ``metadata`` setter copies the dict but shares its values."""
        shared_list = []
        source_md = {'foo': 'bar', 42: shared_list}
        msa = TabularMSA([RNA('-.-'), RNA('.-.')])
        msa.metadata = source_md

        self.assertEqual(msa.metadata, source_md)
        self.assertIsNot(msa.metadata, source_md)

        # Rebinding a key in the source dict is not seen by the MSA...
        source_md['foo'] = 'baz'
        self.assertEqual(msa.metadata, {'foo': 'bar', 42: []})

        # ...but mutating a shared value is, because the copy is shallow.
        shared_list.append(True)
        self.assertEqual(msa.metadata, {'foo': 'bar', 42: [True]})
Example #47
0
 def test_reformat_treepuzzle(self):
     """Check the tree file and alignment written by reformat_treepuzzle()."""
     tree_out_fp = join(self.working_dir, "joined_trees.nwk")
     msa_out_fp = join(self.working_dir, "gene_tree_3.phy")

     species_tree = TreeNode.read(self.species_tree_fp, format='newick')
     gene_tree = TreeNode.read(self.gene_tree_3_fp, format='newick')
     reformat_treepuzzle(gene_tree,
                         species_tree,
                         self.msa_fa_3_fp,
                         tree_out_fp,
                         msa_out_fp)

     # The tree output file is expected to hold exactly these two lines.
     first_line = (
         "(((((((SE001:2.1494877,SE010:1.08661):3.7761166,SE008:"
         "0.86305436):0.21024487,(SE006:0.56704221,SE009:0.5014676):"
         "0.90294223):0.20542323,SE005:3.0992506):0.37145632,SE004:"
         "1.8129133):0.72933621,SE003:1.737411):0.24447835,(SE002:"
         "1.6606127,SE007:0.70000178):1.6331374);\n")
     second_line = (
         "(((((((SE001:2.1494876,SE010:2.1494876):"
         "3.7761166,SE008:5.9256042):0.2102448,(SE006:"
         "5.2329068,SE009:5.2329068):0.9029422):0.2054233,"
         "SE005:6.3412723):0.3714563,SE004:6.7127286):"
         "0.7293362,SE003:7.4420648):0.2444784,SE002:"
         "7.6865432);\n")
     with open(tree_out_fp, 'r') as tree_file:
         observed_lines = list(tree_file)
     self.assertListEqual([first_line, second_line], observed_lines)

     # The written alignment is expected to carry these index labels.
     written_msa = TabularMSA.read(msa_out_fp, constructor=Protein)
     expected_labels = [u'SE001', u'SE002', u'SE003', u'SE004', u'SE005',
                        u'SE006', u'SE008', u'SE009', u'SE010']
     self.assertListEqual(expected_labels, list(written_msa.index))
Example #48
0
 def test_sort_on_key_with_some_repeats(self):
     """Sequences sharing an ``'id'`` keep their relative order when sorted."""
     # (residues, 'id' metadata value), listed in their original positions.
     rows = [('TCCG', 10), ('TAGG', 10), ('GGGG', 8), ('ACGT', 0),
             ('TAGG', 10)]

     def build(order):
         # ``order`` picks rows by original position and doubles as keys.
         seqs = [DNA(rows[i][0], metadata={'id': rows[i][1]})
                 for i in order]
         return TabularMSA(seqs, keys=order)

     msa = build(range(5))
     msa.sort(key='id')
     self.assertEqual(msa, build([3, 2, 0, 1, 4]))
Example #49
0
def aln_distmat(alignment, reps=3):
    '''Build a pairwise Hamming distance matrix from a DNA alignment.

    The alignment is read as DNA, re-indexed by each sequence's "id"
    metadata, and those index labels become the matrix keys.

    NOTE: ``reps`` is accepted but not used by this function.
    '''
    msa = TabularMSA.read(alignment, constructor=DNA)
    msa.reassign_index(minter="id")
    per_sequence_values = [record.values for record in msa]
    return DistanceMatrix.from_iterable(per_sequence_values,
                                        metric=hamming, keys=msa.index)
Example #50
0
def filter_positions(alignment_fh, maximum_gap_frequency,
                     maximum_position_entropy):
    """Read a DNA alignment, then drop gappy and high-entropy positions.

    ``maximum_gap_frequency`` is forwarded to ``_filter_gap_positions`` and
    ``maximum_position_entropy`` to ``_filter_high_entropy_positions``; the
    result of the latter is returned.
    """
    msa = TabularMSA.read(alignment_fh, constructor=DNA)
    gap_filtered = _filter_gap_positions(msa, maximum_gap_frequency)
    return _filter_high_entropy_positions(gap_filtered,
                                          maximum_position_entropy)
Example #51
0
    def test_keys_update_subset_of_keys(self):
        """Keys can be copied, modified, then re-set."""
        msa = TabularMSA([DNA('AC'), DNA('AG'), DNA('AT')], key=str)
        npt.assert_array_equal(msa.keys, np.array(['AC', 'AG', 'AT']))

        edited = msa.keys.copy()
        edited[1] = 42
        msa.keys = edited
        after_update = np.array(['AC', 42, 'AT'], dtype=object)
        npt.assert_array_equal(msa.keys, after_update)

        # The MSA's keys are read-only; the caller's copy stays writable.
        self.assertFalse(msa.keys.flags.writeable)
        self.assertTrue(edited.flags.writeable)

        # Later edits to the caller's array must not reach the MSA.
        edited[1] = 'GG'
        npt.assert_array_equal(msa.keys, after_update)
Example #52
0
 def test_from_dict_multiple_sequences(self):
     """``from_dict`` builds an MSA keyed by the dict's keys."""
     by_key = {1: DNA('ACG'), 2: DNA('GGG'), 3: DNA('TAG')}
     msa = TabularMSA.from_dict(by_key)
     # Sort because order is arbitrary.
     msa.sort()

     expected = TabularMSA([DNA('ACG'), DNA('GGG'), DNA('TAG')],
                           keys=[1, 2, 3])
     self.assertEqual(msa, expected)
Example #53
0
    def test_sort_single_sequence_with_msa_keys_on_callable_key(self):
        """A single keyed sequence is unaffected by ``sort(key=str)``."""
        def fresh():
            return TabularMSA([RNA('UCA')], keys=['foo'])

        forward = fresh()
        forward.sort(key=str)
        self.assertEqual(forward, fresh())

        backward = fresh()
        backward.sort(key=str, reverse=True)
        self.assertEqual(backward, fresh())
Example #54
0
    def test_sort_single_sequence_on_msa_keys(self):
        """A single keyed sequence is unaffected by sorting on MSA keys."""
        def fresh():
            return TabularMSA([DNA('ACGT')], keys=[42])

        forward = fresh()
        forward.sort()
        self.assertEqual(forward, fresh())

        backward = fresh()
        backward.sort(reverse=True)
        self.assertEqual(backward, fresh())
Example #55
0
    def test_sort_single_sequence_no_msa_keys_on_metadata_key(self):
        """A single keyless sequence is unaffected by ``sort(key='id')``."""
        forward = TabularMSA([RNA('UCA', metadata={'id': 42})])
        forward.sort(key='id')
        expected = TabularMSA([RNA('UCA', metadata={'id': 42})])
        self.assertEqual(forward, expected)

        backward = TabularMSA([RNA('UCA', metadata={'id': 42})])
        backward.sort(key='id', reverse=True)
        expected = TabularMSA([RNA('UCA', metadata={'id': 42})])
        self.assertEqual(backward, expected)
Example #56
0
    def test_sort_empty_with_msa_keys_on_metadata_key(self):
        """An empty keyed MSA is unaffected by ``sort(key='id')``."""
        def fresh():
            return TabularMSA([], keys=[])

        forward = fresh()
        forward.sort(key='id')
        self.assertEqual(forward, fresh())

        backward = fresh()
        backward.sort(key='id', reverse=True)
        self.assertEqual(backward, fresh())
Example #57
0
    def test_sort_empty_on_msa_keys(self):
        """An empty keyed MSA is unaffected by sorting on its keys."""
        def fresh():
            return TabularMSA([], keys=[])

        forward = fresh()
        forward.sort()
        self.assertEqual(forward, fresh())

        backward = fresh()
        backward.sort(reverse=True)
        self.assertEqual(backward, fresh())
Example #58
0
    def test_sort_empty_with_msa_keys_on_callable_key(self):
        """An empty keyed MSA is unaffected by ``sort(key=str)``."""
        def fresh():
            return TabularMSA([], keys=[])

        forward = fresh()
        forward.sort(key=str)
        self.assertEqual(forward, fresh())

        backward = fresh()
        backward.sort(key=str, reverse=True)
        self.assertEqual(backward, fresh())
Example #59
0
    def test_metadata_setter(self):
        """Assigning ``metadata`` toggles ``has_metadata`` accordingly."""
        msa = TabularMSA([DNA('A-A'), DNA('A-G')])
        self.assertFalse(msa.has_metadata())

        # A non-empty dict counts as having metadata.
        new_md = {'hello': 'world'}
        msa.metadata = new_md
        self.assertTrue(msa.has_metadata())
        self.assertEqual(msa.metadata, {'hello': 'world'})

        # An empty dict does not.
        cleared = {}
        msa.metadata = cleared
        self.assertFalse(msa.has_metadata())
Example #60
0
    def test_has_keys(self):
        """``has_keys`` reflects whether keys were given or reindexed in."""
        # Empty MSA
        without_keys = TabularMSA([])
        with_keys = TabularMSA([], key=str)
        self.assertFalse(without_keys.has_keys())
        self.assertTrue(with_keys.has_keys())

        # Single empty sequence
        without_keys = TabularMSA([DNA('')])
        with_keys = TabularMSA([DNA('')], key=str)
        self.assertFalse(without_keys.has_keys())
        self.assertTrue(with_keys.has_keys())

        # Multiple sequences, keyed by the 'id' metadata
        without_keys = TabularMSA([DNA('ACG'), DNA('GCA')])
        self.assertFalse(without_keys.has_keys())
        with_keys = TabularMSA([DNA('ACG', metadata={'id': 1}),
                                DNA('GCA', metadata={'id': 2})], key='id')
        self.assertTrue(with_keys.has_keys())

        # reindex() adds keys when given a key, and removes them otherwise.
        toggled = TabularMSA([])
        self.assertFalse(toggled.has_keys())
        toggled.reindex(key=str)
        self.assertTrue(toggled.has_keys())
        toggled.reindex()
        self.assertFalse(toggled.has_keys())