def test_time_to_read(self):
        """Check that reading the prebuilt chr22 index completes within 300 s.

        Only the wall-clock duration of ``GenomeIndex.read`` is checked; the
        loaded index itself is discarded.
        """

        began = time.perf_counter_ns()
        GenomeIndex.read(data_root / "genome.chr22.fa.gz.sa32_index")
        # perf_counter_ns() reports nanoseconds; convert to seconds.
        elapsed = (time.perf_counter_ns() - began) * 1e-9

        # About 30s is expected; the 300 s ceiling leaves generous headroom.
        # Dedicated comparison asserts report the measured value on failure,
        # unlike assertTrue on a chained comparison.
        self.assertGreaterEqual(elapsed, 0)
        self.assertLess(elapsed, 300)
    def test_wavelet(self):
        """Check that an index built with ``wavelet=True`` answers queries
        identically to one built with the default arguments.
        """

        ref = "TAGANNGAGATCNGNNNATTNTTTNTCTNNNTGANCTGNACTGACTCAAAAAGN"

        # Both indexes depend only on `ref`, so build them once instead of
        # rebuilding them for every query.
        fm_index = GenomeIndex(ref)
        wavelet = GenomeIndex(ref, wavelet=True)

        for query in ["ACT", "T", "TTT", "TTTT", "GNACT", "AAAA"]:
            # subTest labels a failure with the query that triggered it.
            with self.subTest(query=query):
                fm_hits = fm_index.query(query)
                wavelet_hits = wavelet.query(query)
                self.assertEqual(fm_hits, wavelet_hits)
    def test_perfect_match_mini(self):
        """Check exact-match queries on a short reference for precision and recall.

        Precision: every reported hit position really starts an occurrence of
        the query in ``ref``. Recall: the reported positions are, as a
        multiset, the ones returned by ``find_all``.
        """
        # Imported once up front rather than on every loop iteration.
        from humdum.utils import find_all

        ref = "TAGANNGAGATCNGNNNATTNTTTNTCTNNNTGANCTGNACTGACTCAAAAAGN"

        # The index depends only on `ref`; build it once for all queries.
        fm_index = GenomeIndex(ref)

        for query in ["ACT", "T", "TTT", "TTTT", "GNACT", "AAAA"]:
            with self.subTest(query=query):
                hits = fm_index.query(query)

                # Test for precision
                for i in hits:
                    self.assertEqual(ref[i:(i + len(query))], query)

                # Test for recall
                self.assertCountEqual(hits, find_all(template=ref, pattern=query))
    def test_match_with_compression(self):
        """Check exact-match queries for a sweep of compression settings.

        For each ``comp`` in 1..99 an index is built with
        ``compression_occ=comp`` and ``compression_sa=comp + 1`` and every
        query is verified for precision (each hit is a real occurrence) and
        recall (hits agree with ``find_all`` as a multiset).
        """
        # Imported once up front rather than inside the innermost loop.
        from humdum.utils import find_all

        ref = "TAGAATCGTTTTTTTTTTATCGACTACNACTACAAAAAAAAATGATCNTACNGTAANNNNNTTNTTTNTCTNNNTGANCTGNACTGACTCAAAAAGN"
        queries = ["ACT", "T", "TTT", "TTTT", "GNACT", "AAAA"]

        for comp in range(1, 100):

            fm_index = GenomeIndex(ref, compression_occ=comp, compression_sa=comp + 1)
            for query in queries:
                # subTest labels a failure with the setting and query involved.
                with self.subTest(comp=comp, query=query):
                    hits = fm_index.query(query)

                    # Test for precision
                    for i in hits:
                        self.assertEqual(ref[i:(i + len(query))], query)

                    # Test for recall
                    self.assertCountEqual(hits, find_all(template=ref, pattern=query))
    def test_read_write(self):
        """Round-trip a small index through ``write``/``read`` and compare it
        with the in-memory original.
        """

        ref = "NTAGAGNANGACGTACNGATCGANCTGACTNAGCTNNNAGCACACACACTGACTCNNGATCGACNNN"

        original = GenomeIndex(ref)

        # Make sure the output directory exists before writing into it.
        Path(data_root / "index_data").mkdir(parents=True, exist_ok=True)

        target = data_root / "index_data/index_small.data"
        original.write(target)
        restored = GenomeIndex.read(target)

        self.assertIsInstance(restored, GenomeIndex)

        # The restored index must agree with the original on the suffix
        # array, string form, `f` attribute and next-character data.
        self.assertEqual(original.bwt.sa, restored.bwt.sa)
        self.assertEqual(str(original), str(restored))
        self.assertEqual(original.bwt.f, restored.bwt.f)
        self.assertEqual(original.bwt.next_chars._data, restored.bwt.next_chars._data)
    def test_init_write(self):
        """Build an index from the sequence in ``genome_file`` and write it out.

        The first line of the file is skipped and the following lines are
        concatenated with trailing whitespace removed. Reading stops at the
        first line (after the first sequence line) that is empty once
        stripped, which includes end of file.
        """

        with open_maybe_gz(genome_file) as fd:

            # skip first line
            fd.readline()

            # The first sequence line is taken unconditionally; after that,
            # iter() with a "" sentinel keeps reading until a stripped line
            # comes back empty.
            pieces = [fd.readline().rstrip()]
            pieces.extend(iter(lambda: fd.readline().rstrip(), ""))

        # A single join avoids repeated string concatenation in a loop.
        genome = "".join(pieces)
        print("length", len(genome))

        print("init")
        index = GenomeIndex(genome)

        print("write")
        index.write(data_root / "genome.chr22.fa.gz.sa32_index")
Ejemplo n.º 7
0
    def from_files(cls, *, fa, fq1, fq2):
        """
        Set up a paired-read mapping run from files.

        `fa` is the reference genome file;
        `fq1` and `fq2` are the FASTQ files.

        Creates an instance of AllTheKingsHorses and returns
        a holder class with two attributes:
        `headers` (the result of its headers() call) and
        `alignments` (the result of its map_paired(fq1, fq2) call).
        """

        # from_fasta(fa) is unwrapped with unlist1 to get the reference.
        reference = unlist1(from_fasta(fa))
        genome_index = GenomeIndex.read_or_make(path_to_genome=fa)
        seq_aligner = SequenceAligner()

        mapper = AllTheKingsHorses(
            genome_index=genome_index,
            sequence_aligner=seq_aligner,
            ref_genome=reference,
        )

        # Anonymous holder: both attributes are evaluated right here, while
        # the class body executes.
        class _:
            headers = mapper.headers()
            alignments = mapper.map_paired(fq1, fq2)

        return _
    def test_read_query(self):
        """Query the prebuilt chr22 index and print the time of each lookup.

        Each query must return at least one hit. The full list of queries is
        run twice in the same order, so each pattern is timed on two
        separate passes (presumably to compare a first and a repeated lookup).
        """

        print("read")
        index = GenomeIndex.read(data_root / "genome.chr22.fa.gz.sa32_index")

        # The following strings are copied from the original genome
        queries = [
            "AAAAGAATGCA",
            "CGACACCACCAAGGCCACCCACCTGCCT",
            "GGCATTTACAACTAAAACATTGAATTCAGATTCATTTTCAGGTAATGATATAATCATGTG",
            ("AAAAGAATGCATTTCTGTATTTTTTGAAACCTTTTCTTTTGAAAACATAGTAATACATTT"
             "CTACTCTAAAATAGAACTTAGCCTAAATACTTTCAAAACCTTTAGAATTTGGAAAAGAAA"),
        ]

        # Factor converting perf_counter_ns() readings to seconds.
        ns = 10**(-9)

        for _ in range(2):
            for query in queries:
                print(len(query))
                start = time.perf_counter_ns()
                self.assertGreater(len(index.query(query)), 0)
                end = time.perf_counter_ns()
                print("time: ", ns * (end - start))