def test_time_to_read(self):
    """Check that reading the stored index stays within the time budget.

    Times a single ``GenomeIndex.read`` call on
    ``genome.chr22.fa.gz.sa32_index`` with ``time.perf_counter_ns`` and
    requires the elapsed time, in seconds, to lie in ``[0, 300)``.
    """
    index_file = data_root / "genome.chr22.fa.gz.sa32_index"

    started_ns = time.perf_counter_ns()
    GenomeIndex.read(index_file)
    elapsed_s = (time.perf_counter_ns() - started_ns) * 1e-9

    # About 30s
    within_budget = 0 <= elapsed_s < 300
    self.assertTrue(within_budget)
def test_wavelet(self):
    """Check that an index built with ``wavelet=True`` agrees with the default.

    Both indexes are built over the same reference string and queried with
    the same patterns; for every pattern the two ``query`` results must
    compare equal.
    """
    ref = "TAGANNGAGATCNGNNNATTNTTTNTCTNNNTGANCTGNACTGACTCAAAAAGN"
    queries = ["ACT", "T", "TTT", "TTTT", "GNACT", "AAAA"]

    # The indexes depend only on `ref`, so build each one once rather than
    # once per query.
    # NOTE(review): assumes `query` leaves the index unchanged - confirm.
    fm_index = GenomeIndex(ref)
    wavelet = GenomeIndex(ref, wavelet=True)

    for query in queries:
        # subTest names the failing pattern and lets the remaining
        # patterns run instead of stopping at the first mismatch.
        with self.subTest(query=query):
            fm_hits = fm_index.query(query)
            wavelet_hits = wavelet.query(query)
            self.assertEqual(fm_hits, wavelet_hits)
def test_perfect_match_mini(self):
    """Check exact-match queries on a small reference for precision and recall.

    Precision: the reference slice starting at every reported hit equals
    the query. Recall: the reported hits contain the same elements as
    ``find_all(template=ref, pattern=query)``, regardless of order.
    """
    # Imported once per test run instead of once per query.
    from humdum.utils import find_all

    ref = "TAGANNGAGATCNGNNNATTNTTTNTCTNNNTGANCTGNACTGACTCAAAAAGN"
    queries = ["ACT", "T", "TTT", "TTTT", "GNACT", "AAAA"]

    # The index depends only on `ref`, so build it once rather than once
    # per query.
    # NOTE(review): assumes `query` leaves the index unchanged - confirm.
    fm_index = GenomeIndex(ref)

    for query in queries:
        # subTest names the failing pattern and keeps checking the others.
        with self.subTest(query=query):
            hits = fm_index.query(query)

            # Test for precision
            for i in hits:
                self.assertEqual(ref[i:(i + len(query))], query)

            # Test for recall
            self.assertCountEqual(hits, find_all(template=ref, pattern=query))
def test_match_with_compression(self):
    """Check exact-match queries for a range of compression settings.

    For every ``comp`` in ``1..99`` an index is built with
    ``compression_occ=comp`` and ``compression_sa=comp + 1`` and each query
    is checked for precision (every hit really is an occurrence) and recall
    (the hits contain the same elements as ``find_all`` on the reference).
    """
    # Imported once per test run instead of inside the nested loops.
    from humdum.utils import find_all

    ref = "TAGAATCGTTTTTTTTTTATCGACTACNACTACAAAAAAAAATGATCNTACNGTAANNNNNTTNTTTNTCTNNNTGANCTGNACTGACTCAAAAAGN"
    queries = ["ACT", "T", "TTT", "TTTT", "GNACT", "AAAA"]

    for comp in range(1, 100):
        fm_index = GenomeIndex(ref, compression_occ=comp, compression_sa=comp + 1)

        for query in queries:
            # subTest reports which (comp, query) pair failed and lets the
            # remaining combinations run.
            with self.subTest(comp=comp, query=query):
                hits = fm_index.query(query)

                # Test for precision
                for i in hits:
                    self.assertEqual(ref[i:(i + len(query))], query)

                # Test for recall
                self.assertCountEqual(hits, find_all(template=ref, pattern=query))
def test_read_write(self):
    """Round-trip a small index through ``write`` and ``read``.

    The index is written to ``index_data/index_small.data`` under
    ``data_root`` (the directory is created if missing), read back, and
    compared with the original on its suffix array, string form, ``f``
    table and the raw data behind ``next_chars``.
    """
    ref = "NTAGAGNANGACGTACNGATCGANCTGACTNAGCTNNNAGCACACACACTGACTCNNGATCGACNNN"
    written = GenomeIndex(ref)

    Path(data_root / "index_data").mkdir(parents=True, exist_ok=True)
    target = data_root / "index_data/index_small.data"

    written.write(target)
    restored = GenomeIndex.read(target)

    self.assertIsInstance(restored, GenomeIndex)

    # Compare the two copies piece by piece, original on the left.
    before, after = written.bwt, restored.bwt
    self.assertEqual(before.sa, after.sa)
    self.assertEqual(str(written), str(restored))
    self.assertEqual(before.f, after.f)
    self.assertEqual(before.next_chars._data, after.next_chars._data)
def test_init_write(self):
    """Build an index from ``genome_file`` and write it under ``data_root``.

    The first line of the file is skipped; the following lines are
    right-stripped and concatenated until a line that is empty after
    stripping (end of file or a blank line) is reached. The index built
    from that string is written to ``genome.chr22.fa.gz.sa32_index``.
    This test makes no assertions; it only has to run without raising.
    """
    with open_maybe_gz(genome_file) as fd:
        # skip first line
        fd.readline()

        # The first sequence line is kept unconditionally; only the lines
        # after it can terminate the read.
        chunks = [fd.readline().rstrip()]
        while True:
            line = fd.readline().rstrip()
            if not line:
                break
            chunks.append(line)

    # Join once instead of growing the string with `+=` for every line.
    genome = "".join(chunks)

    print("length", len(genome))

    print("init")
    index = GenomeIndex(genome)

    print("write")
    index.write(data_root / "genome.chr22.fa.gz.sa32_index")
def from_files(cls, *, fa, fq1, fq2):
    """
    Set up paired-read mapping from files.

    Reference genome file `fa`.
    FASTQ files `fq1` and `fq2`.

    Loads the reference via `from_fasta(fa)`, obtains a `GenomeIndex` with
    `read_or_make`, and wires both into an `AllTheKingsHorses` instance
    together with a fresh `SequenceAligner`.

    Returns a small holder class with two attributes:
    `headers` -- the result of the instance's `headers()`;
    `alignments` -- the result of its `map_paired(fq1, fq2)`.
    """
    reference = unlist1(from_fasta(fa))
    genome_index = GenomeIndex.read_or_make(path_to_genome=fa)
    seq_aligner = SequenceAligner()

    mapper = AllTheKingsHorses(
        genome_index=genome_index,
        sequence_aligner=seq_aligner,
        ref_genome=reference,
    )

    # Both calls are made while the class body executes, headers first.
    class _:
        headers = mapper.headers()
        alignments = mapper.map_paired(fq1, fq2)

    return _
def test_read_query(self):
    """Query the stored index with known substrings and print timings.

    Reads ``genome.chr22.fa.gz.sa32_index`` and runs four queries of
    increasing length, twice over. For each query it prints the query
    length, asserts that at least one hit is returned, and prints the
    elapsed time of the ``query`` call in seconds.
    """
    print("read")
    index = GenomeIndex.read(data_root / "genome.chr22.fa.gz.sa32_index")

    # The following strings are copied from the original genome
    queries = (
        "AAAAGAATGCA",
        "CGACACCACCAAGGCCACCCACCTGCCT",
        "GGCATTTACAACTAAAACATTGAATTCAGATTCATTTTCAGGTAATGATATAATCATGTG",
        (
            "AAAAGAATGCATTTCTGTATTTTTTGAAACCTTTTCTTTTGAAAACATAGTAATACATTT"
            "CTACTCTAAAATAGAACTTAGCCTAAATACTTTCAAAACCTTTAGAATTTGGAAAAGAAA"
        ),
    )

    # Factor converting perf_counter_ns() differences to seconds.
    ns = 10**(-9)

    # The whole query set is run twice, in the same order each time.
    for _ in range(2):
        for query in queries:
            print(len(query))

            start = time.perf_counter_ns()
            hits = index.query(query)
            end = time.perf_counter_ns()

            # Asserted outside the timed region so only `query` is measured;
            # on failure the timing line is still not printed.
            self.assertGreater(len(hits), 0)
            print("time: ", ns * (end - start))