Esempio n. 1
0
    def test_normal_sequence(self):
        """Check that the cost of a ``random_seq(1000)`` sequence is below 15.

        Afterwards the statistics of a fixed reference sequence are printed
        for manual inspection; nothing is asserted about them.
        """
        random_dna = random_seq(1000)
        random_stats = DNAStats(random_dna, 14, 20, 20)
        print(random_dna)
        print(random_stats())
        print(random_stats.cost())
        assert random_stats.cost() < 15

        # Fixed sequence, built from adjacent string literals.
        reference_dna = (
            "CGAGACCACTCGGGACTTCCGGCCATAGCGTACCGTTTTGTGACAAA"
            "ACCCCCACTCGAACGTGAGAAAACCCTTCTCTTCATGTAATTCCCCCACA"
            "GTCCGCGGGTCGGTCAAACCTGGATAAGGTAAAGACTAATATCTAAACCT"
            "GCTGGAGAGTCGAACCGCGGTCTTAGGCCCACGCAGAGTGTATGTTA"
            "TTCGTCTGCCGCTATATCGGTCAACACTAGTTGACGGATAGGAATGTTGG"
            "ATTAACGCGTCTCCAACGCTGGGATACCCTCGCAAAATTTTCCCGAT"
            "ACTATCCGGAATCTCTAACGCCGTTGGTTTGGGCTCCCAACCACCCGTG"
            "AACTTCTAACACGAGAATCACCGCTGGAGCGCGCGCCTTCTCTCAATT"
            "TACCTGAGCTTTCGCTTCCTACTTAGCAGAATCGTGAACCTAAATTTTA"
            "GCAGCTTCAAGTCAGTTACGCTCGACACTTCCGATTCCAGGTAAAATA"
            "ACCACTTCTAAGGTTCGTGACTGGTTCTCTATTCAACGCACGCGGTGCCC"
            "TCGCGGGTCCTCTGCTGCCGGGAAGCACATGATTGCCAGCTTGTTAA"
            "ACAACACAAGGTGGCCAATCTCAAACTCGCATAAGCCCTGTTTTTTCTTG"
            "CAAGCTGCAACCGAGCATTCCTTCAGTCAGTGGTGGTTTTTCAAAAC"
            "TATTCCTATGGGTGCTGACACGTGTGTAATTGTTTTCTACTATCTCTCG"
            "GTTTATAGCGTAGTTGCCGAGGCTATTGAGTCTCCTTTGCTAATAGCT"
            "AAGGTGGAAATTTTTTTTTTTTTGAACCGGGTGAATATACTTGATACAT"
            "CAATAGCCCCTAGCGTATTGTACCCGTCACGGGCTCAAATACTCTGCC"
            "CAGGGCGATACCATGGAAGTTCTCGTAACATACAATGGATCTGGGCCGT"
            "CATCGCTTGATGCTCTAGAAGAAAAAGCAGAGACCGGCCATTACCGCG"
            "TCAACTAACACGCCTCAGGCCGGGGTTAACACTAGGTGTGT"
        )
        reference_stats = DNAStats(reference_dna, 14, 20, 20)
        print(reference_stats())
Esempio n. 2
0
 def test_count_misprimings_from_slice_case3(self):
     """Count repeats when both copies sit one base from the sequence ends.

     The sequence is ``N + motif + 100 N + revcomp(motif) + N``; the slice
     ``[1:31]``, which covers the forward copy, must report exactly one
     repeat.
     """
     motif = random_seq(30)
     left_part = "N" + motif
     right_part = revcomp(motif) + "N"
     padded = left_part + "N" * 100 + right_part
     dna_stats = DNAStats(padded, 1, 1, hairpin_window=30)
     assert dna_stats.count_repeats_from_slice(1, 31) == 1
Esempio n. 3
0
    def test_count_misprimings_from_slice_edge_cases(self, dist_from_left,
                                                     dist_from_right,
                                                     left_or_right, rc):
        """Count repeats for two 30-base copies placed near the sequence ends.

        The sequence is ``dist_from_left`` Ns, the first copy, 100 Ns, the
        second copy and ``dist_from_right`` Ns.  When ``rc`` is truthy, the
        copy named by ``left_or_right`` ("left" or "right") is reverse
        complemented.  The slice over the first copy is queried when
        ``left_or_right == "left"``, otherwise the slice over the second
        copy; either way exactly one repeat must be reported.
        """
        motif = random_seq(30)
        first_copy = second_copy = motif
        if left_or_right == "left" and rc:
            first_copy = revcomp(motif)
        elif left_or_right == "right" and rc:
            second_copy = revcomp(motif)

        sequence = ("N" * dist_from_left + first_copy + "N" * 100 +
                    second_copy + "N" * dist_from_right)
        dna_stats = DNAStats(sequence, 1, 1, hairpin_window=30)
        print(sequence)
        print(sequence[dist_from_left:dist_from_left + 30])

        # Start of the copy under test: the second copy lies one copy length
        # (30) plus the 100-base spacer after the first one.
        start = dist_from_left
        if left_or_right != "left":
            start += 30 + 100
        assert dna_stats.count_repeats_from_slice(start, start + 30) == 1
Esempio n. 4
0
def test_copy():
    """Check that ``DNAStats.copy`` yields a distinct but equivalent object.

    A full-range copy (``slice(None, None)``) must be a different object
    than the original while reporting the same cost over ``(1, 1000)``.
    Both objects are printed for manual inspection.
    """
    seq = random_seq(1000)
    stats = DNAStats(seq, 14, 20, 20)
    stats2 = stats.copy(slice(None, None))
    assert stats is not stats2
    # Compare the original against the copy; comparing ``stats`` with itself
    # would pass no matter what ``copy`` returned.
    assert stats.cost(1, 1000) == stats2.cost(1, 1000)
    print(stats)
    print(stats2)
Esempio n. 5
0
def test_hash2(key):
    """Check that the hash of a ``DNAStats`` depends on its window sizes.

    :param key: name of the window keyword argument that is incremented by
        one for the instance expected to hash differently
    """
    s1 = random_seq(1000)
    base_windows = {
        "repeat_window": 20,
        "stats_window": 20,
        "hairpin_window": 20,
    }
    changed_windows = {**base_windows, key: base_windows[key] + 1}

    reference = DNAStats(s1, **base_windows)
    twin = DNAStats(s1, **base_windows)
    altered = DNAStats(s1, **changed_windows)

    assert hash(reference) == hash(twin)
    assert hash(reference) != hash(altered)
Esempio n. 6
0
def test_slice(index):
    """Check the length of ``stats[:index]``.

    :param index: stop index of the slice; ``None`` means the slice must be
        as long as the original, otherwise its length must equal ``index``
    """
    full_stats = DNAStats(random_seq(1000), 14, 20, 20)
    head_stats = full_stats[:index]

    expected_length = len(full_stats) if index is None else index
    assert len(head_stats) == expected_length

    print(full_stats.cost())
    print(head_stats.cost())
Esempio n. 7
0
    def test_count_misprimings_from_slice_case5(self):
        """Count repeats for a motif and its reverse complement.

        The sequence is 100 Ns, a 30-base motif at ``[100:130]``, 100 Ns,
        the reverse complement of the motif at ``[230:260]`` and 107 Ns.
        The slice ``[100:130]`` must report exactly one repeat.
        """
        motif = random_seq(30)
        flank = "N" * 100
        sequence = flank + motif + flank + revcomp(motif) + "N" * 107
        dna_stats = DNAStats(sequence, 1, 1, hairpin_window=30)
        assert dna_stats.count_repeats_from_slice(100, 130) == 1
Esempio n. 8
0
    def test_count_misprimings_from_slice_case4(self, ij):
        """Count repeats for a motif that occurs three times.

        The sequence is 100 Ns, the 30-base motif at ``[100:130]``, 30 Ns,
        the motif again at ``[160:190]``, 100 Ns, the reverse complement of
        the motif at ``[290:320]`` and 107 Ns.

        :param ij: pair ``(i, j)`` passed to ``count_repeats_from_slice``;
            the reported count must be exactly 2
        """
        motif = random_seq(30)
        start, stop = ij
        pieces = (
            "N" * 100,
            motif,
            "N" * 30,
            motif,
            "N" * 100,
            revcomp(motif),
            "N" * 107,
        )
        sequence = pieces[0]
        for piece in pieces[1:]:
            sequence = sequence + piece
        dna_stats = DNAStats(sequence, 1, 1, hairpin_window=30)
        assert dna_stats.count_repeats_from_slice(start, stop) == 2
Esempio n. 9
0
    def test_hairpins(self, length__kmer):
        """Check that a planted hairpin is reported as hairpin, not repeat.

        :param length__kmer: pair ``(hairpin_length, kmer)``; a stem of
            ``hairpin_length`` bases and its reverse complement are embedded
            in random sequence, and ``kmer`` is passed as the fourth
            ``DNAStats`` argument
        """
        stem_length, hairpin_window = length__kmer
        stem = random_seq(stem_length)

        # Layout: 1000 random + stem + 500 random + revcomp(stem) + 1000
        # random.  The pieces are generated in left-to-right order.
        upstream = random_seq(1000)
        spacer = random_seq(500)
        stem_rc = revcomp(stem)
        downstream = random_seq(1000)
        sequence = upstream + stem + spacer + stem_rc + downstream

        dna_stats = DNAStats(sequence, 20, 20, hairpin_window)

        assert dna_stats()["n_repeats"] == 0
        assert dna_stats()["n_hairpins"] > 0

        for bounds in ((1000, 1500), (None, None)):
            print(dna_stats.cost(*bounds))
Esempio n. 10
0
def find_by_partitions_for_sequence(
    stats: DNAStats,
    cyclic: bool,
    threshold: int,
    step_size: int = 100,
    delta: Optional[int] = None,
):
    """Approximate the best partitions for a sequence.

    ``find_best_partitions`` is run on ``stats``.  If ``cyclic`` is true, it
    is run a second time on a copy whose sequence is rotated by half its
    length, and those results are mapped back with ``_shift_indices`` and
    merged in.

    :param stats: DNAStats instance
    :param cyclic: whether the sequence is cyclic
    :param threshold: threshold cost to find a partition
    :param step_size: step size to find partition
    :param delta: forwarded unchanged to ``find_best_partitions``
    :return: sorted list of the distinct partitions found
    """

    def search(target):
        # Same search settings for the original and the rotated sequence.
        return find_best_partitions(target,
                                    threshold=threshold,
                                    step_size=step_size,
                                    delta=delta)

    found = search(stats)
    if cyclic:
        sequence = stats.seq
        origin = len(sequence) // 2
        rotated = stats.copy_with_new_seq(sequence[origin:] +
                                          sequence[:origin])
        rotated_found = search(rotated)
        found += _shift_indices(rotated_found, origin, len(sequence))
    return sorted(set(found))
Esempio n. 11
0
def test_hash3(key):
    """Check that the hash of a ``DNAStats`` depends on its thresholds.

    :param key: name of the keyword argument that is increased by ``0.1``
        for the instance expected to hash differently
    """
    s1 = random_seq(1000)
    base_settings = dict(
        repeat_window=20,
        stats_window=20,
        hairpin_window=20,
        gc_content_threshold=0.8,
        at_content_threshold=0.8,
        base_percentage_threshold=0.8,
    )
    changed_settings = {**base_settings, key: base_settings[key] + 0.1}

    reference = DNAStats(s1, **base_settings)
    twin = DNAStats(s1, **base_settings)
    altered = DNAStats(s1, **changed_settings)

    assert hash(reference) == hash(twin)
    assert hash(reference) != hash(altered)
Esempio n. 12
0
    def test_repeats(self, kmer):
        """Check that a planted direct repeat is reported as repeat only.

        :param kmer: indexable; ``kmer[0]`` is the length of the planted
            repeat and ``kmer[1]`` is passed as the second ``DNAStats``
            argument
        """
        motif = random_seq(kmer[0])

        # Layout: 1000 random + motif + 500 random + motif + 1000 random,
        # generated in left-to-right order.
        upstream = random_seq(1000)
        spacer = random_seq(500)
        downstream = random_seq(1000)
        sequence = upstream + motif + spacer + motif + downstream

        dna_stats = DNAStats(sequence, kmer[1], 20, 20)
        print(dna_stats())
        assert dna_stats()["n_repeats"] > 0
        assert dna_stats()["n_hairpins"] == 0
Esempio n. 13
0
def find_best_partitions(
    stats: DNAStats,
    threshold: int,
    i=None,
    j=None,
    step_size: int = 100,
    delta: Optional[int] = None,
    partitions=None,
):
    """Recursively collect partition points for the region ``(i, j)``.

    If ``stats.cost(i, j)`` is below ``threshold`` nothing is added.
    Otherwise ``find_fast_opt_partition`` proposes a partition ``p``; it is
    recorded, and each side ``(i, p)`` / ``(p, j)`` whose cost exceeds
    ``threshold`` is partitioned in turn with the same search settings.

    :param stats: DNAStats instance
    :param threshold: cost threshold; regions below it are not partitioned
    :param i: start of the region, passed to ``stats.cost``
    :param j: end of the region, passed to ``stats.cost``
    :param step_size: forwarded to ``find_fast_opt_partition``
    :param delta: forwarded to ``find_fast_opt_partition``
    :param partitions: list the results are appended to; a new list is
        created when omitted
    :return: the ``partitions`` list
    """
    if partitions is None:
        partitions = []
    if stats.cost(i, j) < threshold:
        return partitions

    # Only the partition itself is needed; the second element is unused.
    p, _ = find_fast_opt_partition(stats,
                                   i=i,
                                   j=j,
                                   step_size=step_size,
                                   delta=delta)
    # Stop when no partition is proposed or the same one is proposed again.
    if p is None or p in partitions:
        return partitions
    partitions.append(p)

    c1 = stats.cost(i, p)
    c2 = stats.cost(p, j)
    # Forward step_size and delta so nested searches use the caller's
    # settings instead of falling back to the defaults.
    if c1 > threshold:
        find_best_partitions(stats,
                             threshold=threshold,
                             i=i,
                             j=p,
                             step_size=step_size,
                             delta=delta,
                             partitions=partitions)
    if c2 > threshold:
        find_best_partitions(stats,
                             threshold=threshold,
                             i=p,
                             j=j,
                             step_size=step_size,
                             delta=delta,
                             partitions=partitions)

    return partitions
Esempio n. 14
0
 def test_case_insensitive(self):
     """Check that lower- and upper-case input give the same cost."""
     dna = random_seq(1000)
     lower_stats = DNAStats(dna.lower(), 20, 20, 20)
     upper_stats = DNAStats(dna.upper(), 20, 20, 20)
     print(lower_stats.cost())
     assert lower_stats.cost() == upper_stats.cost()
Esempio n. 15
0
def cached_stats_cost(stats: DNAStats, i: int, j: int):
    """Return ``stats.cost(i, j)``.

    :param stats: object whose ``cost`` method is called
    :param i: first argument forwarded to ``stats.cost``
    :param j: second argument forwarded to ``stats.cost``
    :return: whatever ``stats.cost(i, j)`` returns

    NOTE(review): despite the name, no caching happens in this body;
    presumably a memoizing decorator is applied outside this view -- confirm.
    """
    return stats.cost(i, j)
Esempio n. 16
0
 def test_count_misprimings_from_slice_case2(self):
     """Count repeats for a motif that occurs twice in the same orientation.

     The sequence is 100 Ns, a 30-base motif at ``[100:130]``, 100 Ns, the
     same motif at ``[230:260]`` and 107 Ns.  The slice ``[100:130]`` must
     report exactly one repeat.
     """
     motif = random_seq(30)
     flank = "N" * 100
     sequence = flank + motif + flank + motif + "N" * 107
     dna_stats = DNAStats(sequence, 1, 1, hairpin_window=30)
     assert dna_stats.count_repeats_from_slice(100, 130) == 1
Esempio n. 17
0
 def test_rev_signatures(self):
     """Check ``rev_signatures`` around the start of a non-N region.

     The sequence is 100 Ns, ``random_seq(100)`` and 101 Ns.  The first 70
     entries of ``rev_signatures`` must all be ``0.0``, while the entries at
     indices 71 and 72 must be non-zero.
     """
     sequence = "N" * 100 + random_seq(100) + "N" * 101
     dna_stats = DNAStats(sequence, 1, 1, hairpin_window=30)
     signatures = dna_stats.rev_signatures
     assert set(signatures[:70]) == {0.0}
     for position in (71, 72):
         assert signatures[position] != 0.0