def test_ProfileDistance_dynamic_smooth(self):
    """Check the counts of two 2-mer profiles after `dynamic_smooth`."""
    # Counts per 2-mer before and after smoothing. Read each column top to
    # bottom: count, second base, first base.
    # NOTE(review): the original note states this transformation holds for
    # function=min and threshold=0 -- presumably the `ProfileDistance`
    # defaults; confirm.
    #
    #           | before           | after
    # ----------+------------------+-----------------
    #           | 0111111111111011 | 3000111111113000
    # profile A | ACGTACGTACGTACGT | ACGTACGTACGTACGT
    #           | AAAACCCCGGGGTTTT | AAAACCCCGGGGTTTT
    # ----------+------------------+-----------------
    #           | 0101111111111111 | 2000111111114000
    # profile B | ACGTACGTACGTACGT | ACGTACGTACGTACGT
    #           | AAAACCCCGGGGTTTT | AAAACCCCGGGGTTTT
    observed_a = ['AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT',
                  'GA', 'GC', 'GG', 'GT', 'TA', 'TG', 'TT']
    observed_b = ['AC', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA',
                  'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT']
    profile_a = klib.Profile(utils.as_array(Counter(observed_a), 2))
    profile_b = klib.Profile(utils.as_array(Counter(observed_b), 2))

    kdistlib.ProfileDistance().dynamic_smooth(profile_a, profile_b)

    # The C* and G* 2-mers are identical in both expected profiles.
    untouched = ['CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT']
    smoothed_a = Counter(['AA'] * 3 + untouched + ['TA'] * 3)
    smoothed_b = Counter(['AA'] * 2 + untouched + ['TA'] * 4)

    np.testing.assert_array_equal(
        profile_a.counts, utils.as_array(smoothed_a, 2))
    np.testing.assert_array_equal(
        profile_b.counts, utils.as_array(smoothed_b, 2))
def test_ProfileDistance_distance(self):
    """The distance between two small 2-mer profiles should be 0.0625."""
    observed_a = ['AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT',
                  'GA', 'GC', 'GG', 'GT', 'TA', 'TG', 'TT']
    observed_b = ['AC', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA',
                  'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT']
    profile_a = klib.Profile(utils.as_array(Counter(observed_a), 2))
    profile_b = klib.Profile(utils.as_array(Counter(observed_b), 2))

    measured = kdistlib.ProfileDistance().distance(profile_a, profile_b)

    assert measured == 0.0625
def test_profile_merge(self):
    """Merging the right profile into the left one should sum the counts."""
    left_counts = utils.counts(utils.SEQUENCES_LEFT, 8)
    right_counts = utils.counts(utils.SEQUENCES_RIGHT, 8)
    merged = klib.Profile(utils.as_array(left_counts, 8))
    other = klib.Profile(utils.as_array(right_counts, 8))

    # `merge` works in place on the profile it is called on.
    merged.merge(other)

    utils.test_profile(merged, left_counts + right_counts, 8)
def test_ProfileDistance_distance_k8(self):
    """Distance between the left and right 8-mer profiles."""
    left_counts = utils.counts(utils.SEQUENCES_LEFT, 8)
    right_counts = utils.counts(utils.SEQUENCES_RIGHT, 8)
    profile_a = klib.Profile(utils.as_array(left_counts, 8))
    profile_b = klib.Profile(utils.as_array(right_counts, 8))

    measured = kdistlib.ProfileDistance().distance(profile_a, profile_b)

    np.testing.assert_almost_equal(measured, 0.4626209322)
def test_get_stats(self):
    """`kmer.get_stats` should write '1', the mean and the std of the counts.

    The three whitespace-separated fields are compared as strings; mean and
    standard deviation are formatted with ten decimals.
    """
    counts = utils.counts(utils.SEQUENCES, 8)
    out = StringIO()

    with utils.open_profile(self.profile(counts, 8)) as input_handle:
        kmer.get_stats(input_handle, out)

    name, mean, std = out.getvalue().strip().split()
    expected_values = utils.as_array(counts, 8)

    assert name == '1'
    assert mean == '%.10f' % np.mean(expected_values)
    assert std == '%.10f' % np.std(expected_values)
def test_ProfileDistance_distance_unmodified(self):
    """Computing a balanced distance must leave both input profiles intact."""
    left_counts = utils.counts(utils.SEQUENCES_LEFT, 8)
    right_counts = utils.counts(utils.SEQUENCES_RIGHT, 8)
    profile_a = klib.Profile(utils.as_array(left_counts, 8))
    profile_b = klib.Profile(utils.as_array(right_counts, 8))

    # Only the side effects matter here; the distance value is discarded.
    kdistlib.ProfileDistance(do_balance=True).distance(profile_a, profile_b)

    utils.test_profile(profile_a, left_counts, 8)
    utils.test_profile(profile_b, right_counts, 8)
def test_distance_matrix_two(self):
    """The matrix for two profiles: size, both names and a single distance."""
    left_counts = utils.counts(utils.SEQUENCES_LEFT, 8)
    right_counts = utils.counts(utils.SEQUENCES_RIGHT, 8)
    profile_a = klib.Profile(utils.as_array(left_counts, 8), 'a')
    profile_b = klib.Profile(utils.as_array(right_counts, 8), 'b')
    out = StringIO()

    kdistlib.distance_matrix(
        [profile_a, profile_b], out, 2, kdistlib.ProfileDistance())

    written_lines = out.getvalue().strip().split('\n')
    assert written_lines == ['2', 'a', 'b', '0.46']
def test_distance_matrix_two(self):
    """The matrix for two profiles: size, both names and a single distance."""
    left_counts = utils.counts(utils.SEQUENCES_LEFT, 8)
    right_counts = utils.counts(utils.SEQUENCES_RIGHT, 8)
    profile_a = klib.Profile(utils.as_array(left_counts, 8), 'a')
    profile_b = klib.Profile(utils.as_array(right_counts, 8), 'b')
    out = StringIO()

    kdistlib.distance_matrix(
        [profile_a, profile_b], out, 2, kdistlib.ProfileDistance())

    written_lines = out.getvalue().strip().split('\n')
    assert written_lines == ['2', 'a', 'b', '0.46']
def test_ProfileDistance_distance(self):
    """The distance between two small 2-mer profiles should be 0.0625."""
    observed_a = ['AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT',
                  'GA', 'GC', 'GG', 'GT', 'TA', 'TG', 'TT']
    observed_b = ['AC', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA',
                  'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT']
    profile_a = klib.Profile(utils.as_array(Counter(observed_a), 2))
    profile_b = klib.Profile(utils.as_array(Counter(observed_b), 2))

    measured = kdistlib.ProfileDistance().distance(profile_a, profile_b)

    assert measured == 0.0625
def test_profile_reverse_complement_palindrome(self):
    """`reverse_complement` on indices must agree with the string version.

    The profile is built from the single 8-mer 'ACCTAGGT'; every index of
    the profile is checked.
    """
    profile = klib.Profile(utils.as_array(utils.counts(['ACCTAGGT'], 8), 8))

    for index in range(profile.length):
        via_binary = profile.binary_to_dna(profile.reverse_complement(index))
        via_string = utils.reverse_complement(profile.binary_to_dna(index))
        assert via_binary == via_string
def _test_profile_split(self, sequences, length):
    """Check `Profile.split` for the k-mer counts of `sequences`.

    Expected non-zero entries of the left half: twice the count of every
    k-mer that sorts before its reverse complement, plus the plain count of
    every palindrome, ordered by `utils.count_index`. The right half is built
    the same way from k-mers that sort after their reverse complement, keyed
    by the index of that reverse complement.

    :arg sequences: Sequences to count k-mers in.
    :arg length: Length of the k-mers.
    """
    counts = utils.counts(sequences, length)
    left, right = klib.Profile(utils.as_array(counts, length)).split()

    assert len(left) == len(right)
    assert sum(left) + sum(right) == 2 * sum(counts.values())

    forward = {}
    backward = {}
    palindromes = {}
    for seq, count in counts.items():
        mirrored = utils.reverse_complement(seq)
        if seq == mirrored:
            palindromes[utils.count_index(seq)] = count
        elif seq < mirrored:
            forward[utils.count_index(seq)] = count * 2
        else:
            backward[utils.count_index(mirrored)] = count * 2

    def expected_nonzero(strand):
        # Palindromes contribute to both halves; order follows the index.
        by_index = sorted(list(strand.items()) + list(palindromes.items()))
        return [count for _, count in by_index]

    assert [c for c in left if c > 0] == expected_nonzero(forward)
    assert [c for c in right if c > 0] == expected_nonzero(backward)
def test_profile_reverse_complement(self):
    """`reverse_complement` on indices must agree with the string version.

    Every index of an 8-mer profile built from `utils.SEQUENCES` is checked.
    """
    profile = klib.Profile(
        utils.as_array(utils.counts(utils.SEQUENCES, 8), 8))

    for index in range(profile.length):
        via_binary = profile.binary_to_dna(profile.reverse_complement(index))
        via_string = utils.reverse_complement(profile.binary_to_dna(index))
        assert via_binary == via_string
def _test_profile_split(self, sequences, length):
    """Check `Profile.split` for the k-mer counts of `sequences`.

    Expected non-zero entries of the left half: twice the count of every
    k-mer that sorts before its reverse complement, plus the plain count of
    every palindrome, ordered by `utils.count_index`. The right half is built
    the same way from k-mers that sort after their reverse complement, keyed
    by the index of that reverse complement.

    :arg sequences: Sequences to count k-mers in.
    :arg length: Length of the k-mers.
    """
    counts = utils.counts(sequences, length)
    left, right = klib.Profile(utils.as_array(counts, length)).split()

    assert len(left) == len(right)
    assert sum(left) + sum(right) == 2 * sum(counts.values())

    forward = {}
    backward = {}
    palindromes = {}
    for seq, count in counts.items():
        mirrored = utils.reverse_complement(seq)
        if seq == mirrored:
            palindromes[utils.count_index(seq)] = count
        elif seq < mirrored:
            forward[utils.count_index(seq)] = count * 2
        else:
            backward[utils.count_index(mirrored)] = count * 2

    def expected_nonzero(strand):
        # Palindromes contribute to both halves; order follows the index.
        by_index = sorted(list(strand.items()) + list(palindromes.items()))
        return [count for _, count in by_index]

    assert [c for c in left if c > 0] == expected_nonzero(forward)
    assert [c for c in right if c > 0] == expected_nonzero(backward)
def test_profile_balance(self):
    """After `balance`, the profile should match the expected counts.

    The expected counts are the original counts updated with the count of
    every k-mer filed under its reverse complement.
    """
    counts = utils.counts(utils.SEQUENCES, 8)
    profile = klib.Profile(utils.as_array(counts, 8))

    profile.balance()

    # Build the mapping completely before updating, so `counts` is not
    # modified while it is being iterated.
    mirrored = dict(
        (utils.reverse_complement(seq), count)
        for seq, count in counts.items())
    counts.update(mirrored)

    utils.test_profile(profile, counts, 8)
def test_profile_reverse_complement_palindrome(self):
    """`reverse_complement` on indices must agree with the string version.

    The profile is built from the single 8-mer 'ACCTAGGT'; every index of
    the profile is checked.
    """
    profile = klib.Profile(utils.as_array(utils.counts(['ACCTAGGT'], 8), 8))

    for index in range(profile.length):
        via_binary = profile.binary_to_dna(profile.reverse_complement(index))
        via_string = utils.reverse_complement(profile.binary_to_dna(index))
        assert via_binary == via_string
def test_profile_reverse_complement(self):
    """`reverse_complement` on indices must agree with the string version.

    Every index of an 8-mer profile built from `utils.SEQUENCES` is checked.
    """
    profile = klib.Profile(
        utils.as_array(utils.counts(utils.SEQUENCES, 8), 8))

    for index in range(profile.length):
        via_binary = profile.binary_to_dna(profile.reverse_complement(index))
        via_string = utils.reverse_complement(profile.binary_to_dna(index))
        assert via_binary == via_string
def test_profile_balance_palindrome(self):
    """`balance` on a profile built from the single 4-mer 'AATT'.

    The expected counts are the original counts updated with the count of
    every k-mer filed under its reverse complement.
    """
    counts = utils.counts(['AATT'], 4)
    profile = klib.Profile(utils.as_array(counts, 4))

    profile.balance()

    # Build the mapping completely before updating, so `counts` is not
    # modified while it is being iterated.
    mirrored = dict(
        (utils.reverse_complement(seq), count)
        for seq, count in counts.items())
    counts.update(mirrored)

    utils.test_profile(profile, counts, 4)
def test_profile_balance(self):
    """After `balance`, the profile should match the expected counts.

    The expected counts are the original counts updated with the count of
    every k-mer filed under its reverse complement.
    """
    counts = utils.counts(utils.SEQUENCES, 8)
    profile = klib.Profile(utils.as_array(counts, 8))

    profile.balance()

    # Build the mapping completely before updating, so `counts` is not
    # modified while it is being iterated.
    mirrored = dict(
        (utils.reverse_complement(seq), count)
        for seq, count in counts.items())
    counts.update(mirrored)

    utils.test_profile(profile, counts, 8)
def test_profile_balance_palindrome(self):
    """`balance` on a profile built from the single 4-mer 'AATT'.

    The expected counts are the original counts updated with the count of
    every k-mer filed under its reverse complement.
    """
    counts = utils.counts(['AATT'], 4)
    profile = klib.Profile(utils.as_array(counts, 4))

    profile.balance()

    # Build the mapping completely before updating, so `counts` is not
    # modified while it is being iterated.
    mirrored = dict(
        (utils.reverse_complement(seq), count)
        for seq, count in counts.items())
    counts.update(mirrored)

    utils.test_profile(profile, counts, 4)
def test_profile_print_counts(self, capsys):
    """`print_counts` should print one '<k-mer> <count>' line per 4-mer.

    The lines are expected in the order produced by
    `itertools.product('ACGT', repeat=4)`.
    """
    counts = utils.counts(utils.SEQUENCES, 4)
    profile = klib.Profile(utils.as_array(counts, 4))

    profile.print_counts()
    out, err = capsys.readouterr()

    expected_lines = []
    for letters in itertools.product('ACGT', repeat=4):
        word = ''.join(letters)
        expected_lines.append('%s %d\n' % (word, counts[word]))

    assert out == ''.join(expected_lines)
def test_profile_shrink_max(self):
    """Shrinking a 4-mer profile by 3 should leave per-prefix totals.

    Each expected count is the sum of the counts of all 4-mers that start
    with the remaining one-letter prefix.
    """
    counts = utils.counts(utils.SEQUENCES, 4)
    profile = klib.Profile(utils.as_array(counts, 4))

    profile.shrink(3)

    prefixes = set(seq[:-3] for seq in counts)
    shrunk = Counter(dict(
        (prefix,
         sum(count for seq, count in counts.items()
             if seq.startswith(prefix)))
        for prefix in prefixes))

    utils.test_profile(profile, shrunk, 1)
def test_profile_shrink_max(self):
    """Shrinking a 4-mer profile by 3 should leave per-prefix totals.

    Each expected count is the sum of the counts of all 4-mers that start
    with the remaining one-letter prefix.
    """
    counts = utils.counts(utils.SEQUENCES, 4)
    profile = klib.Profile(utils.as_array(counts, 4))

    profile.shrink(3)

    prefixes = set(seq[:-3] for seq in counts)
    shrunk = Counter(dict(
        (prefix,
         sum(count for seq, count in counts.items()
             if seq.startswith(prefix)))
        for prefix in prefixes))

    utils.test_profile(profile, shrunk, 1)
def test_profile_save(self):
    """A saved profile file should contain the original 4-mer counts."""
    counts = utils.counts(utils.SEQUENCES, 4)
    target = self.empty()

    with utils.open_profile(target, 'w') as profile_handle:
        klib.Profile(utils.as_array(counts, 4)).save(profile_handle)

    utils.test_profile_file(target, counts, 4)
def test_profile_shuffle(self):
    """Shuffling with a fixed NumPy seed should give a known permutation."""
    profile = klib.Profile(
        utils.as_array(utils.counts(utils.SEQUENCES, 2), 2))

    # Seed NumPy's global generator so the expected values are reproducible.
    np.random.seed(100)
    profile.shuffle()

    words = [''.join(letters)
             for letters in itertools.product('ACGT', repeat=2)]
    shuffled_values = [13, 7, 6, 18, 12, 1, 13, 17,
                       16, 12, 23, 27, 24, 17, 18, 12]
    expected = dict(zip(words, shuffled_values))

    utils.test_profile(profile, expected, 2)
def test_distance_matrix_one(self):
    """The matrix for a single profile holds only the size and its name."""
    counts = utils.counts(utils.SEQUENCES, 8)
    only_profile = klib.Profile(utils.as_array(counts, 8), 'a')
    out = StringIO()

    kdistlib.distance_matrix(
        [only_profile], out, 2, kdistlib.ProfileDistance())

    written_lines = out.getvalue().strip().split('\n')
    assert written_lines == ['1', 'a']
def test_distribution(self):
    """`kmer.distribution` should write one '1 <value> <frequency>' line
    per distinct count value, sorted, each terminated by a newline."""
    counts = utils.counts(utils.SEQUENCES, 8)
    out = StringIO()

    with utils.open_profile(self.profile(counts, 8)) as input_handle:
        kmer.distribution(input_handle, out)

    # How often each count value occurs in the full count array.
    occurrences = Counter(utils.as_array(counts, 8))
    expected_lines = [
        '1 %i %i' % pair for pair in sorted(occurrences.items())]

    assert out.getvalue() == '\n'.join(expected_lines) + '\n'
def test_profile_shuffle(self):
    """Shuffling with a fixed NumPy seed should give a known permutation."""
    profile = klib.Profile(
        utils.as_array(utils.counts(utils.SEQUENCES, 2), 2))

    # Seed NumPy's global generator so the expected values are reproducible.
    np.random.seed(100)
    profile.shuffle()

    words = [''.join(letters)
             for letters in itertools.product('ACGT', repeat=2)]
    shuffled_values = [13, 7, 6, 18, 12, 1, 13, 17,
                       16, 12, 23, 27, 24, 17, 18, 12]
    expected = dict(zip(words, shuffled_values))

    utils.test_profile(profile, expected, 2)
def test_ProfileDistance_dynamic_smooth(self):
    """Check the counts of two 2-mer profiles after `dynamic_smooth`."""
    # Counts per 2-mer before and after smoothing. Read each column top to
    # bottom: count, second base, first base.
    # NOTE(review): the original note states this transformation holds for
    # function=min and threshold=0 -- presumably the `ProfileDistance`
    # defaults; confirm.
    #
    #           | before           | after
    # ----------+------------------+-----------------
    #           | 0111111111111011 | 3000111111113000
    # profile A | ACGTACGTACGTACGT | ACGTACGTACGTACGT
    #           | AAAACCCCGGGGTTTT | AAAACCCCGGGGTTTT
    # ----------+------------------+-----------------
    #           | 0101111111111111 | 2000111111114000
    # profile B | ACGTACGTACGTACGT | ACGTACGTACGTACGT
    #           | AAAACCCCGGGGTTTT | AAAACCCCGGGGTTTT
    observed_a = ['AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT',
                  'GA', 'GC', 'GG', 'GT', 'TA', 'TG', 'TT']
    observed_b = ['AC', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA',
                  'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT']
    profile_a = klib.Profile(utils.as_array(Counter(observed_a), 2))
    profile_b = klib.Profile(utils.as_array(Counter(observed_b), 2))

    kdistlib.ProfileDistance().dynamic_smooth(profile_a, profile_b)

    # The C* and G* 2-mers are identical in both expected profiles.
    untouched = ['CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT']
    smoothed_a = Counter(['AA'] * 3 + untouched + ['TA'] * 3)
    smoothed_b = Counter(['AA'] * 2 + untouched + ['TA'] * 4)

    np.testing.assert_array_equal(
        profile_a.counts, utils.as_array(smoothed_a, 2))
    np.testing.assert_array_equal(
        profile_b.counts, utils.as_array(smoothed_b, 2))
def test_profile_ratios_matrix(self):
    """Check every entry of `Profile._ratios_matrix` for a 4-mer profile.

    For each ordered pair of 4-mers the entry is expected to equal
    ``counts[left] / counts[right] / total``, or -1.0 when that expression
    raises `ZeroDivisionError`.
    """
    counts = utils.counts(utils.SEQUENCES, 4)
    profile = klib.Profile(utils.as_array(counts, 4))
    ratios = profile._ratios_matrix()
    total = sum(counts.values())

    # Join the k-mers once, instead of rebinding the loop variables inside
    # the nested loops.
    words = [''.join(letters)
             for letters in itertools.product('ACGT', repeat=4)]

    for left in words:
        row = ratios[utils.count_index(left)]
        for right in words:
            # Keep the try body limited to the division that can fail.
            try:
                expected = counts[left] / counts[right] / total
            except ZeroDivisionError:
                expected = -1.0
            assert row[utils.count_index(right)] == expected
def test_profile_freq_diff_matrix(self):
    """Check every entry of `Profile._freq_diff_matrix` for a 4-mer profile.

    For each ordered pair of 4-mers the entry is expected to equal
    ``abs(counts[left] - counts[right]) / total`` when the right k-mer has a
    positive count, and 0 otherwise.
    """
    counts = utils.counts(utils.SEQUENCES, 4)
    profile = klib.Profile(utils.as_array(counts, 4))
    freq_diffs = profile._freq_diff_matrix()
    total = sum(counts.values())

    # Join the k-mers once, instead of rebinding the loop variables inside
    # the nested loops.
    words = [''.join(letters)
             for letters in itertools.product('ACGT', repeat=4)]

    for left in words:
        row = freq_diffs[utils.count_index(left)]
        for right in words:
            freq_diff = row[utils.count_index(right)]
            if counts[right] > 0:
                assert freq_diff == abs(counts[left] - counts[right]) / total
            else:
                assert freq_diff == 0
def test_profile_ratios_matrix(self):
    """Check every entry of `Profile._ratios_matrix` for a 4-mer profile.

    For each ordered pair of 4-mers the entry is expected to equal
    ``counts[left] / counts[right] / total``, or -1.0 when that expression
    raises `ZeroDivisionError`.
    """
    counts = utils.counts(utils.SEQUENCES, 4)
    profile = klib.Profile(utils.as_array(counts, 4))
    ratios = profile._ratios_matrix()
    total = sum(counts.values())

    # Join the k-mers once, instead of rebinding the loop variables inside
    # the nested loops.
    words = [''.join(letters)
             for letters in itertools.product('ACGT', repeat=4)]

    for left in words:
        row = ratios[utils.count_index(left)]
        for right in words:
            # Keep the try body limited to the division that can fail.
            try:
                expected = counts[left] / counts[right] / total
            except ZeroDivisionError:
                expected = -1.0
            assert row[utils.count_index(right)] == expected
def test_profile_freq_diff_matrix(self):
    """Check every entry of `Profile._freq_diff_matrix` for a 4-mer profile.

    For each ordered pair of 4-mers the entry is expected to equal
    ``abs(counts[left] - counts[right]) / total`` when the right k-mer has a
    positive count, and 0 otherwise.
    """
    counts = utils.counts(utils.SEQUENCES, 4)
    profile = klib.Profile(utils.as_array(counts, 4))
    freq_diffs = profile._freq_diff_matrix()
    total = sum(counts.values())

    # Join the k-mers once, instead of rebinding the loop variables inside
    # the nested loops.
    words = [''.join(letters)
             for letters in itertools.product('ACGT', repeat=4)]

    for left in words:
        row = freq_diffs[utils.count_index(left)]
        for right in words:
            freq_diff = row[utils.count_index(right)]
            if counts[right] > 0:
                assert freq_diff == abs(counts[left] - counts[right]) / total
            else:
                assert freq_diff == 0
def test_profile(self):
    """A profile built from 8-mer counts should pass `utils.test_profile`."""
    expected = utils.counts(utils.SEQUENCES, 8)
    utils.test_profile(
        klib.Profile(utils.as_array(expected, 8)), expected, 8)
def test_profile_binary_to_dna(self):
    """`binary_to_dna(i)` should be the i-th 4-mer in 'ACGT' product order."""
    profile = klib.Profile(
        utils.as_array(utils.counts(utils.SEQUENCES, 4), 4))

    all_words = itertools.product('ACGT', repeat=4)
    for index, letters in enumerate(all_words):
        expected = ''.join(letters)
        assert expected == profile.binary_to_dna(index)
def test_profile_shrink_invalid(self):
    """Shrinking a 4-mer profile by 4 should raise `ValueError`."""
    four_mer_counts = utils.counts(utils.SEQUENCES, 4)
    profile = klib.Profile(utils.as_array(four_mer_counts, 4))

    with pytest.raises(ValueError):
        profile.shrink(4)
def test_profile_name_with_slash(self):
    """Creating a profile named 'abc/def' should raise `ValueError`."""
    count_array = utils.as_array(utils.counts(utils.SEQUENCES, 4), 4)

    with pytest.raises(ValueError):
        klib.Profile(count_array, name='abc/def')