Example #1
0
    def test_ProfileDistance_dynamic_smooth(self):
        """Smooth two 2-mer profiles and compare the resulting counts."""
        # If we use function=min and threshold=0, we should get the following
        # transformation (each column is one 2-mer: the lower sequence row
        # holds the first base, the upper sequence row the second):
        #
        #           | before           | after
        # ----------+------------------+-----------------
        #           | 0111111111111011 | 3000111111113000
        # profile A | ACGTACGTACGTACGT | ACGTACGTACGTACGT
        #           | AAAACCCCGGGGTTTT | AAAACCCCGGGGTTTT
        # ----------+------------------+-----------------
        #           | 0101111111111111 | 2000111111114000
        # profile B | ACGTACGTACGTACGT | ACGTACGTACGTACGT
        #           | AAAACCCCGGGGTTTT | AAAACCCCGGGGTTTT
        shared = ['CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT']
        before_a = Counter(['AC', 'AG', 'AT'] + shared + ['TA', 'TG', 'TT'])
        before_b = Counter(['AC', 'AT'] + shared + ['TA', 'TC', 'TG', 'TT'])

        profile_a = klib.Profile(utils.as_array(before_a, 2))
        profile_b = klib.Profile(utils.as_array(before_b, 2))

        kdistlib.ProfileDistance().dynamic_smooth(profile_a, profile_b)

        after_a = Counter(['AA'] * 3 + shared + ['TA'] * 3)
        after_b = Counter(['AA'] * 2 + shared + ['TA'] * 4)

        np.testing.assert_array_equal(profile_a.counts,
                                      utils.as_array(after_a, 2))
        np.testing.assert_array_equal(profile_b.counts,
                                      utils.as_array(after_b, 2))
Example #2
0
    def test_ProfileDistance_distance(self):
        """Compare the distance between two 2-mer profiles with 0.0625."""
        shared = ['CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT']
        kmers_a = ['AC', 'AG', 'AT'] + shared + ['TA', 'TG', 'TT']
        kmers_b = ['AC', 'AT'] + shared + ['TA', 'TC', 'TG', 'TT']

        left = klib.Profile(utils.as_array(Counter(kmers_a), 2))
        right = klib.Profile(utils.as_array(Counter(kmers_b), 2))

        measured = kdistlib.ProfileDistance().distance(left, right)
        assert measured == 0.0625
Example #3
0
    def test_profile_merge(self):
        """Merge one profile into another and verify the combined counts."""
        length = 8
        left_counts = utils.counts(utils.SEQUENCES_LEFT, length)
        right_counts = utils.counts(utils.SEQUENCES_RIGHT, length)

        merged = klib.Profile(utils.as_array(left_counts, length))
        merged.merge(klib.Profile(utils.as_array(right_counts, length)))

        utils.test_profile(merged, left_counts + right_counts, length)
Example #4
0
    def test_profile_merge(self):
        """Check a merged profile against the sum of both count tables."""
        tables = [utils.counts(sequences, 8)
                  for sequences in (utils.SEQUENCES_LEFT,
                                    utils.SEQUENCES_RIGHT)]
        target, other = [klib.Profile(utils.as_array(table, 8))
                         for table in tables]

        # `merge` modifies `target`; `other` is only the source.
        target.merge(other)
        utils.test_profile(target, tables[0] + tables[1], 8)
Example #5
0
    def test_ProfileDistance_distance_k8(self):
        """Compare the distance between two 8-mer profiles to a fixed value."""
        length = 8
        left = klib.Profile(
            utils.as_array(utils.counts(utils.SEQUENCES_LEFT, length), length))
        right = klib.Profile(
            utils.as_array(utils.counts(utils.SEQUENCES_RIGHT, length), length))

        measured = kdistlib.ProfileDistance().distance(left, right)
        np.testing.assert_almost_equal(measured, 0.4626209322)
Example #6
0
    def test_ProfileDistance_distance_k8(self):
        """Check the 8-mer profile distance against its expected value."""
        left_counts = utils.counts(utils.SEQUENCES_LEFT, 8)
        right_counts = utils.counts(utils.SEQUENCES_RIGHT, 8)
        left = klib.Profile(utils.as_array(left_counts, 8))
        right = klib.Profile(utils.as_array(right_counts, 8))

        expected = 0.4626209322
        observed = kdistlib.ProfileDistance().distance(left, right)
        np.testing.assert_almost_equal(observed, expected)
Example #7
0
    def test_get_stats(self):
        """Run `kmer.get_stats` and check the name, mean and std it writes."""
        counts = utils.counts(utils.SEQUENCES, 8)
        report = StringIO()

        with utils.open_profile(self.profile(counts, 8)) as handle:
            kmer.get_stats(handle, report)

        # The output is expected to hold exactly three whitespace-separated
        # fields.
        name, mean, std = report.getvalue().strip().split()
        values = utils.as_array(counts, 8)

        assert name == '1'
        assert mean == '%.10f' % np.mean(values)
        assert std == '%.10f' % np.std(values)
Example #8
0
    def test_get_stats(self):
        """Check the statistics line produced by `kmer.get_stats`."""
        counts = utils.counts(utils.SEQUENCES, 8)
        buffer = StringIO()

        with utils.open_profile(self.profile(counts, 8)) as profile_handle:
            kmer.get_stats(profile_handle, buffer)

        name, mean, std = buffer.getvalue().strip().split()
        expected_mean = '%.10f' % np.mean(utils.as_array(counts, 8))
        expected_std = '%.10f' % np.std(utils.as_array(counts, 8))

        assert name == '1'
        assert mean == expected_mean
        assert std == expected_std
Example #9
0
    def test_ProfileDistance_distance_unmodified(self):
        """Check both profiles against their counts after a distance call."""
        length = 8
        counts_left = utils.counts(utils.SEQUENCES_LEFT, length)
        counts_right = utils.counts(utils.SEQUENCES_RIGHT, length)
        left = klib.Profile(utils.as_array(counts_left, length))
        right = klib.Profile(utils.as_array(counts_right, length))

        # Only the effect on the inputs matters here; the distance value
        # itself is discarded.
        kdistlib.ProfileDistance(do_balance=True).distance(left, right)

        for profile, expected in ((left, counts_left), (right, counts_right)):
            utils.test_profile(profile, expected, length)
Example #10
0
    def test_distance_matrix_two(self):
        """Write a distance matrix for two named profiles and check it."""
        left = klib.Profile(
            utils.as_array(utils.counts(utils.SEQUENCES_LEFT, 8), 8), 'a')
        right = klib.Profile(
            utils.as_array(utils.counts(utils.SEQUENCES_RIGHT, 8), 8), 'b')

        buffer = StringIO()
        kdistlib.distance_matrix(
            [left, right], buffer, 2, kdistlib.ProfileDistance())

        lines = buffer.getvalue().strip().split('\n')
        assert lines == ['2', 'a', 'b', '0.46']
Example #11
0
    def test_ProfileDistance_distance_unmodified(self):
        """Verify the input profiles still match their counts afterwards."""
        original_a = utils.counts(utils.SEQUENCES_LEFT, 8)
        original_b = utils.counts(utils.SEQUENCES_RIGHT, 8)

        first = klib.Profile(utils.as_array(original_a, 8))
        second = klib.Profile(utils.as_array(original_b, 8))

        calculator = kdistlib.ProfileDistance(do_balance=True)
        # The returned distance is irrelevant for this test.
        calculator.distance(first, second)

        utils.test_profile(first, original_a, 8)
        utils.test_profile(second, original_b, 8)
Example #12
0
    def test_distance_matrix_two(self):
        """Check the distance matrix output for profiles 'a' and 'b'."""
        counts_a = utils.counts(utils.SEQUENCES_LEFT, 8)
        counts_b = utils.counts(utils.SEQUENCES_RIGHT, 8)

        named_profiles = []
        for table, name in ((counts_a, 'a'), (counts_b, 'b')):
            named_profiles.append(
                klib.Profile(utils.as_array(table, 8), name))

        calculator = kdistlib.ProfileDistance()
        sink = StringIO()
        kdistlib.distance_matrix(named_profiles, sink, 2, calculator)

        expected = ['2', 'a', 'b', '0.46']
        assert sink.getvalue().strip().split('\n') == expected
Example #13
0
    def test_ProfileDistance_distance(self):
        """Check that two fixed 2-mer profiles are 0.0625 apart."""
        common = ['CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT']
        table_a = Counter(['AC', 'AG', 'AT'] + common + ['TA', 'TG', 'TT'])
        table_b = Counter(['AC', 'AT'] + common + ['TA', 'TC', 'TG', 'TT'])

        first = klib.Profile(utils.as_array(table_a, 2))
        second = klib.Profile(utils.as_array(table_b, 2))

        calculator = kdistlib.ProfileDistance()
        result = calculator.distance(first, second)
        assert result == 0.0625
Example #14
0
    def test_profile_reverse_complement_palindrome(self):
        """Compare index-based and string-based reverse complements."""
        table = utils.counts(['ACCTAGGT'], 8)
        profile = klib.Profile(utils.as_array(table, 8))

        for index in range(profile.length):
            observed = profile.binary_to_dna(
                profile.reverse_complement(index))
            expected = utils.reverse_complement(profile.binary_to_dna(index))
            assert observed == expected
Example #15
0
    def _test_profile_split(self, sequences, length):
        """
        Split a profile built from `sequences` and check both halves.

        The expected halves are derived from the k-mer counts: a k-mer that
        sorts before its reverse complement contributes twice its count to
        the left half, one that sorts after contributes twice its count to
        the right half (at the reverse complement's index), and a k-mer equal
        to its reverse complement contributes its count to both halves.
        """
        counts = utils.counts(sequences, length)
        left, right = klib.Profile(utils.as_array(counts, length)).split()

        assert len(left) == len(right)
        assert sum(left) + sum(right) == sum(counts.values()) * 2

        expected_left = {}
        expected_right = {}
        palindromes = {}

        for sequence, count in counts.items():
            complement = utils.reverse_complement(sequence)
            if sequence == complement:
                palindromes[utils.count_index(sequence)] = count
            elif sequence < complement:
                expected_left[utils.count_index(sequence)] = count * 2
            else:
                expected_right[utils.count_index(complement)] = count * 2

        def in_index_order(half):
            # Palindromes are expected in both halves.
            entries = list(half.items()) + list(palindromes.items())
            return [count for _, count in sorted(entries)]

        assert [c for c in left if c > 0] == in_index_order(expected_left)
        assert [c for c in right if c > 0] == in_index_order(expected_right)
Example #16
0
    def test_profile_reverse_complement(self):
        """Compare index-based and string-based reverse complements."""
        table = utils.counts(utils.SEQUENCES, 8)
        profile = klib.Profile(utils.as_array(table, 8))

        for index in range(profile.length):
            observed = profile.binary_to_dna(
                profile.reverse_complement(index))
            expected = utils.reverse_complement(profile.binary_to_dna(index))
            assert observed == expected
Example #17
0
    def _test_profile_split(self, sequences, length):
        """
        Check `Profile.split` for a profile built from `sequences`.

        Expected values per k-mer: doubled count in the left half when the
        k-mer sorts before its reverse complement, doubled count in the right
        half (indexed by the reverse complement) when it sorts after, and the
        plain count in both halves when the two are equal.
        """
        counts = utils.counts(sequences, length)
        profile = klib.Profile(utils.as_array(counts, length))
        left, right = profile.split()

        assert len(left) == len(right)
        assert sum(left) + sum(right) == sum(counts.values()) * 2

        by_index_left = {}
        by_index_right = {}
        by_index_palindrome = {}

        for forward, amount in counts.items():
            backward = utils.reverse_complement(forward)
            if forward < backward:
                by_index_left[utils.count_index(forward)] = amount * 2
            elif forward > backward:
                by_index_right[utils.count_index(backward)] = amount * 2
            else:
                by_index_palindrome[utils.count_index(forward)] = amount

        palindrome_items = list(by_index_palindrome.items())
        want_left = sorted(list(by_index_left.items()) + palindrome_items)
        want_right = sorted(list(by_index_right.items()) + palindrome_items)

        assert [c for c in left if c > 0] == [c for _, c in want_left]
        assert [c for c in right if c > 0] == [c for _, c in want_right]
Example #18
0
    def test_profile_balance(self):
        """Balance a profile and compare it with updated counts."""
        counts = utils.counts(utils.SEQUENCES, 8)
        profile = klib.Profile(utils.as_array(counts, 8))
        profile.balance()

        # Build the reverse-complement table completely before updating, so
        # `counts` is not modified while it is being iterated.
        mirrored = {}
        for sequence, count in counts.items():
            mirrored[utils.reverse_complement(sequence)] = count
        counts.update(mirrored)

        utils.test_profile(profile, counts, 8)
Example #19
0
    def test_profile_reverse_complement_palindrome(self):
        """Check `reverse_complement` for every index of an 8-mer profile."""
        profile = klib.Profile(
            utils.as_array(utils.counts(['ACCTAGGT'], 8), 8))

        for number in range(profile.length):
            complemented = profile.reverse_complement(number)
            as_dna = profile.binary_to_dna(number)
            assert (profile.binary_to_dna(complemented) ==
                    utils.reverse_complement(as_dna))
Example #20
0
    def test_profile_reverse_complement(self):
        """Check `reverse_complement` for every index of an 8-mer profile."""
        profile = klib.Profile(
            utils.as_array(utils.counts(utils.SEQUENCES, 8), 8))

        for number in range(profile.length):
            complemented = profile.reverse_complement(number)
            as_dna = profile.binary_to_dna(number)
            assert (profile.binary_to_dna(complemented) ==
                    utils.reverse_complement(as_dna))
Example #21
0
    def test_profile_balance_palindrome(self):
        """Balance a profile built from 'AATT' and check its counts."""
        counts = utils.counts(['AATT'], 4)
        profile = klib.Profile(utils.as_array(counts, 4))
        profile.balance()

        # Build the reverse-complement table completely before updating, so
        # `counts` is not modified while it is being iterated.
        mirrored = {}
        for sequence, count in counts.items():
            mirrored[utils.reverse_complement(sequence)] = count
        counts.update(mirrored)

        utils.test_profile(profile, counts, 4)
Example #22
0
    def test_profile_balance(self):
        """Check a balanced profile against reverse-complement-updated counts."""
        length = 8
        counts = utils.counts(utils.SEQUENCES, length)
        balanced = klib.Profile(utils.as_array(counts, length))
        balanced.balance()

        reverse_table = {utils.reverse_complement(seq): amount
                         for seq, amount in counts.items()}
        counts.update(reverse_table)
        utils.test_profile(balanced, counts, length)
Example #23
0
    def test_profile_balance_palindrome(self):
        """Check balancing of a profile that only contains 'AATT'."""
        length = 4
        counts = utils.counts(['AATT'], length)
        balanced = klib.Profile(utils.as_array(counts, length))
        balanced.balance()

        reverse_table = {utils.reverse_complement(seq): amount
                         for seq, amount in counts.items()}
        counts.update(reverse_table)
        utils.test_profile(balanced, counts, length)
Example #24
0
    def test_profile_print_counts(self, capsys):
        """Check the captured stdout of `print_counts` for a 4-mer profile."""
        counts = utils.counts(utils.SEQUENCES, 4)
        klib.Profile(utils.as_array(counts, 4)).print_counts()

        printed, _ = capsys.readouterr()

        # One "<k-mer> <count>" line for every 4-mer over ACGT, in product
        # order.
        kmers = (''.join(letters)
                 for letters in itertools.product('ACGT', repeat=4))
        expected = ''.join('%s %d\n' % (seq, counts[seq]) for seq in kmers)
        assert printed == expected
Example #25
0
    def test_profile_print_counts(self, capsys):
        """Compare `print_counts` output with one line per possible 4-mer."""
        counts = utils.counts(utils.SEQUENCES, 4)
        profile = klib.Profile(utils.as_array(counts, 4))
        profile.print_counts()

        stdout, _stderr = capsys.readouterr()

        lines = []
        for letters in itertools.product('ACGT', repeat=4):
            sequence = ''.join(letters)
            lines.append('%s %d\n' % (sequence, counts[sequence]))
        assert stdout == ''.join(lines)
Example #26
0
    def test_profile_shrink_max(self):
        """Shrink a 4-mer profile by 3 and compare with per-prefix sums."""
        full = utils.counts(utils.SEQUENCES, 4)
        profile = klib.Profile(utils.as_array(full, 4))
        profile.shrink(3)

        # Expected counts: for every prefix left after dropping the last
        # three characters, sum the counts of all k-mers starting with it.
        prefixes = set(sequence[:-3] for sequence in full)
        shrunk = Counter(dict(
            (prefix,
             sum(full[sequence] for sequence in full
                 if sequence.startswith(prefix)))
            for prefix in prefixes))
        utils.test_profile(profile, shrunk, 1)
Example #27
0
    def test_profile_shrink_max(self):
        """Check that shrinking 4-mers by 3 matches prefix-aggregated counts."""
        original = utils.counts(utils.SEQUENCES, 4)
        profile = klib.Profile(utils.as_array(original, 4))
        profile.shrink(3)

        def prefix_total(prefix):
            # Sum of the counts of every original k-mer with this prefix.
            return sum(original[kmer_seq] for kmer_seq in original
                       if kmer_seq.startswith(prefix))

        prefixes = set(kmer_seq[:-3] for kmer_seq in original)
        expected = Counter(dict((p, prefix_total(p)) for p in prefixes))
        utils.test_profile(profile, expected, 1)
Example #28
0
    def test_profile_save(self):
        """Save a 4-mer profile and check the written file."""
        length = 4
        counts = utils.counts(utils.SEQUENCES, length)
        profile = klib.Profile(utils.as_array(counts, length))

        target = self.empty()
        with utils.open_profile(target, 'w') as handle:
            profile.save(handle)

        utils.test_profile_file(target, counts, length)
Example #29
0
    def test_profile_save(self):
        """Write a profile via `save` and verify the resulting file."""
        table = utils.counts(utils.SEQUENCES, 4)
        to_save = klib.Profile(utils.as_array(table, 4))

        path = self.empty()
        with utils.open_profile(path, 'w') as output_handle:
            to_save.save(output_handle)

        utils.test_profile_file(path, table, 4)
Example #30
0
    def test_profile_shuffle(self):
        """Shuffle a 2-mer profile under a fixed seed and check the counts."""
        table = utils.counts(utils.SEQUENCES, 2)
        profile = klib.Profile(utils.as_array(table, 2))

        # Seed NumPy's global RNG so the expected values below stay fixed.
        np.random.seed(100)
        profile.shuffle()

        kmers = [''.join(pair) for pair in itertools.product('ACGT', repeat=2)]
        shuffled = [13, 7, 6, 18, 12, 1, 13, 17,
                    16, 12, 23, 27, 24, 17, 18, 12]
        expected = dict(zip(kmers, shuffled))
        utils.test_profile(profile, expected, 2)
Example #31
0
    def test_distance_matrix_one(self):
        """Check the distance matrix output for a single profile 'a'."""
        table = utils.counts(utils.SEQUENCES, 8)
        only = klib.Profile(utils.as_array(table, 8), 'a')

        buffer = StringIO()
        kdistlib.distance_matrix(
            [only], buffer, 2, kdistlib.ProfileDistance())

        lines = buffer.getvalue().strip().split('\n')
        assert lines == ['1', 'a']
Example #32
0
    def test_distribution(self):
        """Compare `kmer.distribution` output with a count histogram."""
        counts = utils.counts(utils.SEQUENCES, 8)
        report = StringIO()

        with utils.open_profile(self.profile(counts, 8)) as handle:
            kmer.distribution(handle, report)

        # One "1 <value> <occurrences>" line per distinct value in the count
        # array, sorted by value.
        histogram = Counter(utils.as_array(counts, 8))
        lines = ['1 %i %i' % entry for entry in sorted(histogram.items())]
        assert report.getvalue() == '\n'.join(lines) + '\n'
Example #33
0
    def test_distribution(self):
        """Check the lines written by `kmer.distribution` for a profile."""
        table = utils.counts(utils.SEQUENCES, 8)
        sink = StringIO()

        with utils.open_profile(self.profile(table, 8)) as profile_handle:
            kmer.distribution(profile_handle, sink)

        occurrences = Counter(utils.as_array(table, 8))
        expected = '\n'.join(
            '1 %i %i' % pair for pair in sorted(occurrences.items()))
        expected += '\n'
        assert sink.getvalue() == expected
Example #34
0
    def test_distance_matrix_one(self):
        """Write a distance matrix for one named profile and check it."""
        single = [
            klib.Profile(
                utils.as_array(utils.counts(utils.SEQUENCES, 8), 8), 'a'),
        ]

        calculator = kdistlib.ProfileDistance()
        sink = StringIO()
        kdistlib.distance_matrix(single, sink, 2, calculator)

        expected = ['1', 'a']
        assert sink.getvalue().strip().split('\n') == expected
Example #35
0
    def test_profile_shuffle(self):
        """Check the counts of a shuffled 2-mer profile with seed 100."""
        length = 2
        profile = klib.Profile(
            utils.as_array(utils.counts(utils.SEQUENCES, length), length))

        # Fixed seed: the expected counts below depend on it.
        np.random.seed(100)
        profile.shuffle()

        values = [13, 7, 6, 18, 12, 1, 13, 17, 16, 12, 23, 27, 24, 17, 18, 12]
        expected = {}
        pairs = itertools.product('ACGT', repeat=length)
        for letters, value in zip(pairs, values):
            expected[''.join(letters)] = value
        utils.test_profile(profile, expected, length)
Example #36
0
    def test_ProfileDistance_dynamic_smooth(self):
        """Check the counts of two 2-mer profiles after `dynamic_smooth`."""
        # If we use function=min and threshold=0, we should get the following
        # transformation (each column is one 2-mer: the lower sequence row
        # holds the first base, the upper sequence row the second):
        #
        #           | before           | after
        # ----------+------------------+-----------------
        #           | 0111111111111011 | 3000111111113000
        # profile A | ACGTACGTACGTACGT | ACGTACGTACGTACGT
        #           | AAAACCCCGGGGTTTT | AAAACCCCGGGGTTTT
        # ----------+------------------+-----------------
        #           | 0101111111111111 | 2000111111114000
        # profile B | ACGTACGTACGTACGT | ACGTACGTACGTACGT
        #           | AAAACCCCGGGGTTTT | AAAACCCCGGGGTTTT
        common = ['CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT']
        initial_a = Counter(['AC', 'AG', 'AT'] + common + ['TA', 'TG', 'TT'])
        initial_b = Counter(['AC', 'AT'] + common + ['TA', 'TC', 'TG', 'TT'])

        profile_a = klib.Profile(utils.as_array(initial_a, 2))
        profile_b = klib.Profile(utils.as_array(initial_b, 2))

        smoother = kdistlib.ProfileDistance()
        smoother.dynamic_smooth(profile_a, profile_b)

        smoothed_a = Counter(['AA'] * 3 + common + ['TA'] * 3)
        smoothed_b = Counter(['AA'] * 2 + common + ['TA'] * 4)

        for profile, expected in ((profile_a, smoothed_a),
                                  (profile_b, smoothed_b)):
            np.testing.assert_array_equal(profile.counts,
                                          utils.as_array(expected, 2))
Example #37
0
    def test_profile_ratios_matrix(self):
        """Check every entry of `_ratios_matrix` for a 4-mer profile."""
        counts = utils.counts(utils.SEQUENCES, 4)
        profile = klib.Profile(utils.as_array(counts, 4))

        ratios = profile._ratios_matrix()
        total = sum(counts.values())

        kmers = [''.join(letters)
                 for letters in itertools.product('ACGT', repeat=4)]

        for left in kmers:
            row = ratios[utils.count_index(left)]
            for right in kmers:
                ratio = row[utils.count_index(right)]
                try:
                    assert ratio == counts[left] / counts[right] / total
                except ZeroDivisionError:
                    # The expected ratio cannot be computed (zero divisor);
                    # the matrix is expected to hold -1.0 there.
                    assert ratio == -1.0
Example #38
0
    def test_profile_freq_diff_matrix(self):
        """Check every entry of `_freq_diff_matrix` for a 4-mer profile."""
        counts = utils.counts(utils.SEQUENCES, 4)
        profile = klib.Profile(utils.as_array(counts, 4))
        freq_diffs = profile._freq_diff_matrix()

        total = sum(counts.values())

        kmers = [''.join(letters)
                 for letters in itertools.product('ACGT', repeat=4)]

        for left in kmers:
            row = freq_diffs[utils.count_index(left)]
            for right in kmers:
                # Entries whose column k-mer has no counts are expected to
                # be zero.
                expected = 0
                if counts[right] > 0:
                    expected = abs(counts[left] - counts[right]) / total
                assert row[utils.count_index(right)] == expected
Example #39
0
    def test_profile_ratios_matrix(self):
        """Compare `_ratios_matrix` with ratios computed from the counts."""
        table = utils.counts(utils.SEQUENCES, 4)
        profile = klib.Profile(utils.as_array(table, 4))

        matrix = profile._ratios_matrix()
        total = sum(table.values())

        for first in itertools.product('ACGT', repeat=4):
            numerator = ''.join(first)
            for second in itertools.product('ACGT', repeat=4):
                denominator = ''.join(second)
                row = utils.count_index(numerator)
                column = utils.count_index(denominator)
                observed = matrix[row][column]
                try:
                    assert (observed ==
                            table[numerator] / table[denominator] / total)
                except ZeroDivisionError:
                    # Zero divisor: the matrix is expected to hold -1.0.
                    assert observed == -1.0
Example #40
0
    def test_profile_freq_diff_matrix(self):
        """Compare `_freq_diff_matrix` with differences from the counts."""
        table = utils.counts(utils.SEQUENCES, 4)
        profile = klib.Profile(utils.as_array(table, 4))
        matrix = profile._freq_diff_matrix()

        total = sum(table.values())

        for first in itertools.product('ACGT', repeat=4):
            row_kmer = ''.join(first)
            for second in itertools.product('ACGT', repeat=4):
                column_kmer = ''.join(second)
                row = utils.count_index(row_kmer)
                column = utils.count_index(column_kmer)
                observed = matrix[row][column]
                if not table[column_kmer] > 0:
                    # No counts for the column k-mer: expect zero.
                    assert observed == 0
                else:
                    difference = abs(table[row_kmer] - table[column_kmer])
                    assert observed == difference / total
Example #41
0
 def test_profile(self):
     """Build an 8-mer profile and check it against its source counts."""
     length = 8
     table = utils.counts(utils.SEQUENCES, length)
     utils.test_profile(
         klib.Profile(utils.as_array(table, length)), table, length)
Example #42
0
 def test_profile(self):
     """Check that a freshly built 8-mer profile matches its counts."""
     expected = utils.counts(utils.SEQUENCES, 8)
     array = utils.as_array(expected, 8)
     built = klib.Profile(array)
     utils.test_profile(built, expected, 8)
Example #43
0
    def test_profile_binary_to_dna(self):
        """Check `binary_to_dna` for every 4-mer index in product order."""
        table = utils.counts(utils.SEQUENCES, 4)
        profile = klib.Profile(utils.as_array(table, 4))

        all_kmers = itertools.product('ACGT', repeat=4)
        for index, letters in enumerate(all_kmers):
            expected = ''.join(letters)
            assert expected == profile.binary_to_dna(index)
Example #44
0
 def test_profile_shrink_invalid(self):
     """Shrinking a 4-mer profile by 4 is expected to raise ValueError."""
     length = 4
     table = utils.counts(utils.SEQUENCES, length)
     profile = klib.Profile(utils.as_array(table, length))

     with pytest.raises(ValueError):
         profile.shrink(4)
Example #45
0
 def test_profile_shrink_invalid(self):
     """Expect ValueError when a 4-mer profile is shrunk by 4."""
     array = utils.as_array(utils.counts(utils.SEQUENCES, 4), 4)
     four_mer_profile = klib.Profile(array)

     with pytest.raises(ValueError):
         four_mer_profile.shrink(4)
Example #46
0
    def test_profile_binary_to_dna(self):
        """Compare `binary_to_dna` with the enumerated 4-mers over ACGT."""
        profile = klib.Profile(
            utils.as_array(utils.counts(utils.SEQUENCES, 4), 4))

        expected_kmers = [''.join(letters)
                          for letters in itertools.product('ACGT', repeat=4)]
        for number, sequence in enumerate(expected_kmers):
            assert sequence == profile.binary_to_dna(number)
Example #47
0
 def test_profile_name_with_slash(self):
     """
     Check that a profile name containing a slash raises ValueError.

     The count array is built before entering ``pytest.raises`` so that a
     ValueError raised while preparing the input cannot satisfy the
     expectation; only the `Profile` constructor call is guarded.
     """
     counts = utils.counts(utils.SEQUENCES, 4)
     array = utils.as_array(counts, 4)
     with pytest.raises(ValueError):
         klib.Profile(array, name='abc/def')
Example #48
0
 def test_profile_name_with_slash(self):
     """
     Expect ValueError when constructing a profile named 'abc/def'.

     Only the `Profile` constructor is placed inside ``pytest.raises``; the
     input array is prepared beforehand so an unrelated ValueError from the
     helper would surface as an error instead of a passing test.
     """
     array = utils.as_array(utils.counts(utils.SEQUENCES, 4), 4)
     with pytest.raises(ValueError):
         klib.Profile(array, name='abc/def')