def test_normal_sequence(self):
    """Evaluate DNAStats on a random sequence and on a hard-coded one.

    The stats built from ``random_seq(1000)`` must report a total cost
    below 15. The hard-coded sequence is only evaluated and printed; no
    assertion is made on it.
    """
    fixed_sequence = (
        "CGAGACCACTCGGGACTTCCGGCCATAGCGTACCGTTTTGTGACAAA"
        "ACCCCCACTCGAACGTGAGAAAACCCTTCTCTTCATGTAATTCCCCCACA"
        "GTCCGCGGGTCGGTCAAACCTGGATAAGGTAAAGACTAATATCTAAACCT"
        "GCTGGAGAGTCGAACCGCGGTCTTAGGCCCACGCAGAGTGTATGTTA"
        "TTCGTCTGCCGCTATATCGGTCAACACTAGTTGACGGATAGGAATGTTGG"
        "ATTAACGCGTCTCCAACGCTGGGATACCCTCGCAAAATTTTCCCGAT"
        "ACTATCCGGAATCTCTAACGCCGTTGGTTTGGGCTCCCAACCACCCGTG"
        "AACTTCTAACACGAGAATCACCGCTGGAGCGCGCGCCTTCTCTCAATT"
        "TACCTGAGCTTTCGCTTCCTACTTAGCAGAATCGTGAACCTAAATTTTA"
        "GCAGCTTCAAGTCAGTTACGCTCGACACTTCCGATTCCAGGTAAAATA"
        "ACCACTTCTAAGGTTCGTGACTGGTTCTCTATTCAACGCACGCGGTGCCC"
        "TCGCGGGTCCTCTGCTGCCGGGAAGCACATGATTGCCAGCTTGTTAA"
        "ACAACACAAGGTGGCCAATCTCAAACTCGCATAAGCCCTGTTTTTTCTTG"
        "CAAGCTGCAACCGAGCATTCCTTCAGTCAGTGGTGGTTTTTCAAAAC"
        "TATTCCTATGGGTGCTGACACGTGTGTAATTGTTTTCTACTATCTCTCG"
        "GTTTATAGCGTAGTTGCCGAGGCTATTGAGTCTCCTTTGCTAATAGCT"
        "AAGGTGGAAATTTTTTTTTTTTTGAACCGGGTGAATATACTTGATACAT"
        "CAATAGCCCCTAGCGTATTGTACCCGTCACGGGCTCAAATACTCTGCC"
        "CAGGGCGATACCATGGAAGTTCTCGTAACATACAATGGATCTGGGCCGT"
        "CATCGCTTGATGCTCTAGAAGAAAAAGCAGAGACCGGCCATTACCGCG"
        "TCAACTAACACGCCTCAGGCCGGGGTTAACACTAGGTGTGT"
    )

    # Random input: print everything for debugging, then bound the cost.
    random_sequence = random_seq(1000)
    random_stats = DNAStats(random_sequence, 14, 20, 20)
    print(random_sequence)
    print(random_stats())
    print(random_stats.cost())
    assert random_stats.cost() < 15

    # Fixed input: just make sure the stats can be computed and shown.
    fixed_stats = DNAStats(fixed_sequence, 14, 20, 20)
    print(fixed_stats())
def test_count_misprimings_from_slice_case3(self):
    """Both copies of the repeat sit a single base away from the sequence ends.

    Layout: one N, the repeat, 100 Ns, the reverse complement of the
    repeat, one N. Querying the slice (1, 31) must count exactly one repeat.
    """
    unit = random_seq(30)
    pieces = ["N", unit, "N" * 100, revcomp(unit), "N"]
    stats = DNAStats("".join(pieces), 1, 1, hairpin_window=30)
    assert stats.count_repeats_from_slice(1, 31) == 1
def test_count_misprimings_from_slice_edge_cases(
    self, dist_from_left, dist_from_right, left_or_right, rc
):
    """Count repeats when the copies are close to either end of the sequence.

    Two copies of a repeat are separated by 100 Ns and padded with
    ``dist_from_left`` Ns on the left and ``dist_from_right`` Ns on the
    right. When ``rc`` is truthy, the copy on the ``left_or_right`` side is
    reverse-complemented. The slice covering the copy on that side is then
    queried and must report exactly one repeat.
    """
    window = 30
    spacer = 100
    unit = random_seq(window)

    # Only the copy on the selected side is ever reverse-complemented.
    left_copy = revcomp(unit) if left_or_right == "left" and rc else unit
    right_copy = revcomp(unit) if left_or_right == "right" and rc else unit

    seq = (
        "N" * dist_from_left
        + left_copy
        + "N" * spacer
        + right_copy
        + "N" * dist_from_right
    )
    stats = DNAStats(seq, 1, 1, hairpin_window=30)
    print(seq)
    print(seq[dist_from_left:dist_from_left + window])

    # Anything other than "left" is evaluated at the right-hand copy.
    if left_or_right == "left":
        start = dist_from_left
    else:
        start = dist_from_left + window + spacer
    n = stats.count_repeats_from_slice(start, start + window)
    assert n == 1
def test_copy():
    """Check that a full-range copy of a DNAStats is a distinct, equivalent object.

    The copy is taken with ``slice(None, None)``, so it spans the same range
    as the source and is expected to report the same cost.
    """
    seq = random_seq(1000)
    stats = DNAStats(seq, 14, 20, 20)
    stats2 = stats.copy(slice(None, None))

    # The copy must be a new object, not an alias of the source.
    assert stats is not stats2

    # Compare the source against the copy. Comparing ``stats`` with itself
    # would pass regardless of what ``copy`` returned.
    assert stats.cost(1, 1000) == stats2.cost(1, 1000)

    print(stats)
    print(stats2)
def test_hash2(key):
    """Hashes agree for identical window settings and differ when one changes.

    ``key`` names the window argument that is incremented by 1 for the third
    instance.
    """
    sequence = random_seq(1000)
    base_windows = {"repeat_window": 20, "stats_window": 20, "hairpin_window": 20}
    bumped_windows = {**base_windows, key: base_windows[key] + 1}

    reference = DNAStats(sequence, **base_windows)
    twin = DNAStats(sequence, **base_windows)
    changed = DNAStats(sequence, **bumped_windows)

    assert hash(reference) == hash(twin)
    assert hash(reference) != hash(changed)
def test_slice(index):
    """Slicing ``stats[:index]`` yields an object of the expected length.

    With ``index`` set to ``None`` the slice must be as long as the source;
    otherwise its length must equal ``index``.
    """
    seq = random_seq(1000)
    full = DNAStats(seq, 14, 20, 20)
    head = full[:index]

    expected_length = len(full) if index is None else index
    assert len(head) == expected_length

    print(full.cost())
    print(head.cost())
def test_count_misprimings_from_slice_case5(self):
    """A repeat and its reverse complement embedded in a run of Ns.

    Layout (assuming ``random_seq(30)`` yields 30 bases): 100 Ns, the repeat
    at [100:130], 100 Ns, the reverse complement at [230:260], then 107 Ns.
    Querying the slice (100, 130) must count exactly one repeat.
    """
    unit = random_seq(30)
    pieces = ["N" * 100, unit, "N" * 100, revcomp(unit), "N" * 107]
    stats = DNAStats("".join(pieces), 1, 1, hairpin_window=30)
    assert stats.count_repeats_from_slice(100, 130) == 1
def test_count_misprimings_from_slice_case4(self, ij):
    """Two direct copies plus one reverse complement, embedded in Ns.

    Layout (assuming ``random_seq(30)`` yields 30 bases): 100 Ns, the repeat
    at [100:130], 30 Ns, the repeat again at [160:190], 100 Ns, the reverse
    complement at [290:320], then 107 Ns.

    ``ij`` is the (start, end) slice to evaluate; every supplied slice is
    expected to count exactly 2 repeats.
    """
    unit = random_seq(30)
    start, end = ij
    pieces = [
        "N" * 100,
        unit,
        "N" * 30,
        unit,
        "N" * 100,
        revcomp(unit),
        "N" * 107,
    ]
    stats = DNAStats("".join(pieces), 1, 1, hairpin_window=30)
    assert stats.count_repeats_from_slice(start, end) == 2
def test_hairpins(self, length__kmer):
    """A planted hairpin is reported as a hairpin and not as a repeat.

    ``length__kmer`` is a ``(hairpin_length, kmer)`` pair: the length of the
    planted hairpin arm and the window passed as the last positional
    argument to ``DNAStats``.
    """
    hairpin_length, kmer = length__kmer
    arm = random_seq(hairpin_length)

    # Random flanks, with the arm and its reverse complement 500 bases apart.
    pieces = [
        random_seq(1000),
        arm,
        random_seq(500),
        revcomp(arm),
        random_seq(1000),
    ]
    stats = DNAStats("".join(pieces), 20, 20, kmer)

    assert stats()["n_repeats"] == 0
    assert stats()["n_hairpins"] > 0

    print(stats.cost(1000, 1500))
    print(stats.cost(None, None))
def find_by_partitions_for_sequence(
    stats: DNAStats,
    cyclic: bool,
    threshold: int,
    step_size: int = 100,
    delta: Optional[int] = None,
):
    """Approximate the best partition indices for a sequence.

    Partitions are searched with ``find_best_partitions``. When ``cyclic``
    is true, a second search is run on the sequence rotated by half its
    length, and the indices found there are passed through
    ``_shift_indices`` and merged with the first set.

    :param stats: DNAStats instance
    :param cyclic: whether the sequence is cyclic
    :param threshold: threshold cost to find a partition
    :param step_size: step size to find partition
    :param delta: forwarded unchanged to ``find_best_partitions``
    :return: sorted list of unique partition indices
    """

    def _search(target):
        # Same search settings for the original and the rotated sequence.
        return find_best_partitions(
            target, threshold=threshold, step_size=step_size, delta=delta
        )

    found = _search(stats)

    if cyclic:
        sequence = stats.seq
        origin = int(len(sequence) / 2.0)
        rotated = stats.copy_with_new_seq(sequence[origin:] + sequence[:origin])
        rotated_found = _search(rotated)
        found += _shift_indices(rotated_found, origin, len(sequence))

    return sorted(set(found))
def test_hash3(key):
    """Hashes agree for identical settings and differ when one is nudged.

    ``key`` names the argument that is increased by 0.1 for the third
    instance.
    """
    sequence = random_seq(1000)
    base_settings = {
        "repeat_window": 20,
        "stats_window": 20,
        "hairpin_window": 20,
        "gc_content_threshold": 0.8,
        "at_content_threshold": 0.8,
        "base_percentage_threshold": 0.8,
    }
    nudged_settings = {**base_settings, key: base_settings[key] + 0.1}

    reference = DNAStats(sequence, **base_settings)
    twin = DNAStats(sequence, **base_settings)
    changed = DNAStats(sequence, **nudged_settings)

    assert hash(reference) == hash(twin)
    assert hash(reference) != hash(changed)
def test_repeats(self, kmer):
    """A planted direct repeat is reported as a repeat and not as a hairpin.

    ``kmer[0]`` is the length of the planted repeat and ``kmer[1]`` is the
    window passed as the first numeric argument to ``DNAStats``.
    """
    unit = random_seq(kmer[0])

    # Random flanks, with two identical copies 500 bases apart.
    pieces = [random_seq(1000), unit, random_seq(500), unit, random_seq(1000)]
    stats = DNAStats("".join(pieces), kmer[1], 20, 20)

    print(stats())
    assert stats()["n_repeats"] > 0
    assert stats()["n_hairpins"] == 0
def find_best_partitions(
    stats: DNAStats,
    threshold: int,
    i=None,
    j=None,
    step_size: int = 100,
    delta: Optional[int] = None,
    partitions=None,
):
    """Recursively collect partition indices for the region ``[i, j]``.

    If ``stats.cost(i, j)`` is already below ``threshold`` nothing is added.
    Otherwise ``find_fast_opt_partition`` proposes a split index ``p``; it is
    recorded, and each side whose cost still exceeds ``threshold`` is
    searched again with the same settings.

    :param stats: DNAStats instance
    :param threshold: cost threshold that triggers a split
    :param i: start of the region (``None`` is forwarded as-is)
    :param j: end of the region (``None`` is forwarded as-is)
    :param step_size: forwarded to ``find_fast_opt_partition``
    :param delta: forwarded to ``find_fast_opt_partition``
    :param partitions: list to accumulate into; a new list is created when
        ``None``. A supplied list is mutated in place.
    :return: the accumulated list of partition indices, in discovery order
    """
    if partitions is None:
        partitions = []

    if stats.cost(i, j) < threshold:
        return partitions

    # Only the index is used; the second returned value is ignored.
    p, _ = find_fast_opt_partition(
        stats, i=i, j=j, step_size=step_size, delta=delta
    )

    # Stop when no split was found or the split was already recorded.
    if p is None or p in partitions:
        return partitions
    partitions.append(p)

    left_cost = stats.cost(i, p)
    right_cost = stats.cost(p, j)

    # Forward step_size and delta so sub-regions are searched with the
    # caller's settings instead of silently reverting to the defaults.
    if left_cost > threshold:
        find_best_partitions(
            stats,
            threshold=threshold,
            i=i,
            j=p,
            step_size=step_size,
            delta=delta,
            partitions=partitions,
        )
    if right_cost > threshold:
        find_best_partitions(
            stats,
            threshold=threshold,
            i=p,
            j=j,
            step_size=step_size,
            delta=delta,
            partitions=partitions,
        )
    return partitions
def test_case_insensitive(self):
    """Lower-case and upper-case versions of a sequence must cost the same."""
    seq = random_seq(1000)
    lower_stats = DNAStats(seq.lower(), 20, 20, 20)
    upper_stats = DNAStats(seq.upper(), 20, 20, 20)

    print(lower_stats.cost())
    assert lower_stats.cost() == upper_stats.cost()
def cached_stats_cost(stats: DNAStats, i: int, j: int):
    """Return ``stats.cost(i, j)``.

    NOTE(review): nothing in this function caches the result; presumably a
    caching decorator is (or was) applied where it is defined. Confirm.

    :param stats: object exposing ``cost(i, j)``
    :param i: first argument forwarded to ``cost``
    :param j: second argument forwarded to ``cost``
    :return: whatever ``stats.cost(i, j)`` returns
    """
    cost = stats.cost(i, j)
    return cost
def test_count_misprimings_from_slice_case2(self):
    """Two direct copies of a repeat embedded in a run of Ns.

    Layout (assuming ``random_seq(30)`` yields 30 bases): 100 Ns, the repeat
    at [100:130], 100 Ns, the repeat again at [230:260], then 107 Ns.
    Querying the slice (100, 130) must count exactly one repeat.
    """
    unit = random_seq(30)
    pieces = ["N" * 100, unit, "N" * 100, unit, "N" * 107]
    stats = DNAStats("".join(pieces), 1, 1, hairpin_window=30)
    assert stats.count_repeats_from_slice(100, 130) == 1
def test_rev_signatures(self):
    """Reverse signatures are zero over the leading Ns and non-zero after.

    The sequence is 100 Ns, 100 random bases, then 101 Ns, analysed with
    ``hairpin_window=30``. The first 70 reverse signatures must all be 0.0,
    while those at positions 71 and 72 must not be.
    """
    pieces = ["N" * 100, random_seq(100), "N" * 101]
    stats = DNAStats("".join(pieces), 1, 1, hairpin_window=30)

    assert set(stats.rev_signatures[:70]) == {0.0}
    for position in (71, 72):
        assert stats.rev_signatures[position] != 0.0