def test_build_contig_db_from_fasta(self):
    conn = sqlite3.connect(':memory:')
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    contig_db = ContigDB(conn, ramifier=ramifier, box_side_len=0.5)
    contig_db.fast_add_kmers_from_fasta(KMER_FASTA)
    contig_db.commit()
    stored = contig_db.get_all_contigs()
    self.assertGreaterEqual(len(stored), 3)
def test_build_contig_db(self):
    conn = sqlite3.connect(':memory:')
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    contig_db = ContigDB(conn, ramifier=ramifier, box_side_len=0.5)
    contig = random_kmer(2 * 10 * 1000)
    contig_db.py_add_contig('test_genome___test_contig', contig, gap=100)
    contig_db.commit()
    stored = contig_db.get_all_contigs()
    self.assertGreaterEqual(len(stored), 2)
def test_add_kmer_to_pre(self):
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    db = PreDB(sqlite3.connect(':memory:'), ramifier=ramifier)
    db.py_add_kmer(KMER_31)
    db.commit()
    members = list(db.conn.execute('SELECT * FROM kmers'))
    self.assertEqual(len(members), 1)
    self.assertIn(KMER_31, [reverse_convert_kmer(member[1]) for member in members])
def test_save(self):
    DB_SAVE_TEMP_FILE = join(dirname(__file__), 'temp.db_save_temp.sqlite')
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    db = GridCoverDB(sqlite3.connect(DB_SAVE_TEMP_FILE), ramifier=ramifier, box_side_len=0.5)
    db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_31)
    db.close()
    remove(DB_SAVE_TEMP_FILE)
def test_get_centroids(self):
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    db = GridCoverDB(sqlite3.connect(':memory:'), ramifier=ramifier, box_side_len=0.5)
    db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_30 + 'A')
    db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_30 + 'T')
    db.py_add_point_to_cluster(np.array([1., 0., 0., 0.]), KMER_30 + 'C')
    db.commit()
    centroids = db.centroids()
    self.assertEqual(centroids.shape, (2, 4))
def test_add_kmer(self):
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    db = GridCoverDB(sqlite3.connect(':memory:'), ramifier=ramifier, box_side_len=0.5)
    db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_31)
    db.commit()
    members = db.py_get_cluster_members(0)
    self.assertEqual(len(members), 1)
    self.assertIn(KMER_31, [reverse_convert_kmer(member) for member in members])
def test_search_contig_db(self):
    conn = sqlite3.connect(':memory:')
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    contig_db = ContigDB(conn, ramifier=ramifier, box_side_len=0.5)
    contig = random_kmer(2 * 10 * 1000)
    contig_db.py_add_contig('test_genome___test_contig', contig, gap=10)
    contig_db.commit()
    stored = contig_db.get_all_contigs()
    searcher = ContigSearcher(contig_db)
    hits = searcher.py_search(contig[500:600], 0.1, 0.5)
    self.assertGreaterEqual(len(hits), 1)
def add_rotation_dists(dimensions, kmer_cols, outfile, rotation, dist_table):
    """Add rotation distances to an existing distance table."""
    header = dist_table.readline().strip() + f',rotation_dist_{dimensions}\n'
    outfile.write(header)
    ramifier = RotatingRamifier.from_file(dimensions, rotation)
    for line in dist_table:
        line = line.strip()
        tkns = line.split(',')
        k1, k2 = tkns[kmer_cols[0]], tkns[kmer_cols[1]]
        rft1, rft2 = ramifier.ramify(k1), ramifier.ramify(k2)
        d = np.linalg.norm(rft1 - rft2)
        outfile.write(line + f',{d}\n')
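# Hedged usage sketch (not part of the original module): add_rotation_dists expects an
# already-open CSV distance table whose first line is a header, the zero-based indices of
# the two k-mer columns, and a rotation file of the kind used to build RotatingRamifier
# elsewhere in this repo. The file names below are illustrative placeholders only.
#
#     with open('kmer_dists.csv') as dist_table, open('kmer_dists.with_rft.csv', 'w') as outfile:
#         add_rotation_dists(4, (0, 1), outfile, 'my_rotation_file', dist_table)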
def test_build_grid_cover_from_fasta(self):
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    db = GridCoverDB(sqlite3.connect(':memory:'), ramifier=ramifier, box_side_len=0.5)
    grid = GridCoverBuilder(db)
    grid.fast_add_kmers_from_fasta(KMER_FASTA)
    grid.commit()
    n_centers = grid.db.centroids().shape[0]
    n_points = len(grid.db.get_kmers())
    self.assertGreater(n_centers, 0)
    self.assertLess(n_centers, 98)
    self.assertEqual(n_points, 98)
def build_grid_cover_fasta(dimension, threads, outfile, rotation, fasta_list):
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'  # numpy uses one of these two libraries
    environ['MKL_NUM_THREADS'] = f'{threads}'
    fasta_list = [line.strip() for line in fasta_list]
    ramifier = RotatingRamifier.from_file(dimension, rotation)
    predb = PreDB.load_from_filepath(outfile, ramifier=ramifier)
    start = time()
    with click.progressbar(fasta_list) as fastas:
        for fasta_filename in fastas:
            n_added = predb.fast_add_kmers_from_fasta(fasta_filename)
    predb.close()
    add_time = time() - start
    click.echo(f'Added {n_added:,} kmers to {outfile} in {add_time:.5}s.', err=True)
def test_save_and_reload(self):
    DB_SAVE_TEMP_FILE = join(dirname(__file__), 'temp.db_save_temp.sqlite')
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    db = GridCoverDB(sqlite3.connect(DB_SAVE_TEMP_FILE), ramifier=ramifier, box_side_len=0.5)
    db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_31)
    db.close()
    del db
    db = GridCoverDB.load_from_filepath(DB_SAVE_TEMP_FILE)
    members = db.py_get_cluster_members(0)
    self.assertEqual(len(members), 1)
    self.assertIn(KMER_31, [reverse_convert_kmer(member) for member in members])
    remove(DB_SAVE_TEMP_FILE)
def build_grid_cover(radius, dimension, threads, num_kmers, start_offset, outfile, preload, rotation, kmer_table):
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'  # numpy uses one of these two libraries
    environ['MKL_NUM_THREADS'] = f'{threads}'
    ramifier = RotatingRamifier.from_file(dimension, rotation)
    grid = GridCoverBuilder.from_filepath(outfile, ramifier, radius)
    start = time()
    n_added = grid.fast_add_kmers_from_file(kmer_table, num_to_add=num_kmers)
    grid.commit()
    n_centers = grid.db.centroids().shape[0]
    grid.close()
    add_time = time() - start
    click.echo(
        f'Added {n_added:,} kmers to {outfile} in {add_time:.5}s. {n_centers:,} clusters.',
        err=True)
def test_pre_build_blooms(self):
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    db = GridCoverDB(sqlite3.connect(':memory:'), ramifier=ramifier, box_side_len=0.5)
    db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_30 + 'A')
    db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_30 + 'T')
    db.py_add_point_to_cluster(np.array([1., 0., 0., 0.]), KMER_30 + 'C')
    db.commit()
    for centroid_id in [0, 1]:
        db.build_and_store_bloom_grid(centroid_id)
    bg_0 = db.retrieve_bloom_grid(0)
    bg_1 = db.retrieve_bloom_grid(1)
    self.assertEqual(max(bg_0.py_count_grid_contains(KMER_30 + 'A')), 32 - bg_0.col_k)
    self.assertEqual(max(bg_1.py_count_grid_contains(KMER_30 + 'C')), 32 - bg_1.col_k)
    self.assertRaises(IndexError, lambda: db.retrieve_bloom_grid(2))
def build_grid_cover_fasta(radius, dimension, threads, outfile, rotation, fasta_list):
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'  # numpy uses one of these two libraries
    environ['MKL_NUM_THREADS'] = f'{threads}'
    fasta_list = [line.strip() for line in fasta_list]
    ramifier = RotatingRamifier.from_file(dimension, rotation)
    grid = GridCoverBuilder.from_filepath(outfile, ramifier, radius)
    start = time()
    with click.progressbar(fasta_list) as fastas:
        for fasta_filename in fastas:
            n_added = grid.fast_add_kmers_from_fasta(fasta_filename)
    n_centers = grid.db.centroids().shape[0]
    grid.close()
    add_time = time() - start
    click.echo((f'Added {n_added:,} kmers to {outfile} in {add_time:.5}s. '
                f'{n_centers:,} clusters.'), err=True)
def test_search_bigger_contig_db_exact(self):
    contig_db = ContigDB(
        sqlite3.connect(':memory:'),
        ramifier=RotatingRamifier.from_file(4, KMER_ROTATION),
        box_side_len=0.0001)
    n_contigs, contig_len = 3, 2 * 10 * 1000
    contigs = [random_kmer(contig_len) for _ in range(n_contigs)]
    for i, contig in enumerate(contigs):
        contig_db.py_add_contig(f'test_genome_{i}___test_contig_{i}', contig, gap=1)
    contig_db.commit()
    self.assertEqual(contig_db.centroids().shape[0], n_contigs * (contig_len - 31 + 1))
    searcher = ContigSearcher(contig_db)
    hits = searcher.py_search(contigs[0][500:600], 0, 1)
    self.assertEqual(len(hits), 1)
def build_contig_cover_fasta(radius, dimension, threads, outfile, rotation, fasta_list):
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'  # numpy uses one of these two libraries
    environ['MKL_NUM_THREADS'] = f'{threads}'
    fasta_list = [line.strip() for line in fasta_list]
    ramifier = RotatingRamifier.from_file(dimension, rotation)
    grid = ContigDB(sqlite3.connect(outfile), ramifier=ramifier, box_side_len=radius)
    click.echo(f'Adding {len(fasta_list)} fastas.', err=True)
    start = time()
    with click.progressbar(fasta_list) as fastas:
        for fasta_filename in fastas:
            n_added = grid.fast_add_kmers_from_fasta(fasta_filename)
    grid.close()
    add_time = time() - start
    click.echo(f'Added {n_added:,} kmers to {outfile} in {add_time:.5}s.', err=True)
def test_fileio_contig_db(self):
    fname = 'temp.test_contig_db.sqlite'
    try:
        remove(fname)
    except FileNotFoundError:
        pass
    conn = sqlite3.connect(fname)
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    contig_db = ContigDB(conn, ramifier=ramifier, box_side_len=1)
    contig = random_kmer(2 * 10 * 1000)
    contig_db.py_add_contig('test_genome___test_contig', contig, gap=100)
    contig_db.commit()
    from_store = ContigDB.load_from_filepath(fname)
    self.assertEqual(contig_db.current_seq_coord, from_store.current_seq_coord)
    self.assertEqual(len(contig_db.centroid_cache), len(from_store.centroid_cache))
    for key, val in contig_db.centroid_cache.items():
        self.assertIn(key, from_store.centroid_cache)
        self.assertEqual(val, from_store.centroid_cache[key])
    remove(fname)
def test_merge_dbs(self):
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    db1 = GridCoverDB(sqlite3.connect(':memory:'), ramifier=ramifier, box_side_len=0.5)
    db1.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_30 + 'A')
    db1.py_add_point_to_cluster(np.array([1., 0., 0., 0.]), KMER_30 + 'T')
    db1.commit()
    db2 = GridCoverDB(sqlite3.connect(':memory:'), ramifier=ramifier, box_side_len=0.5)
    db2.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_30 + 'C')
    db2.py_add_point_to_cluster(np.array([1., 1., 0., 0.]), KMER_30 + 'G')
    db2.commit()
    db1.load_other(db2)
    centroids = db1.centroids()
    self.assertEqual(centroids.shape, (3, 4))
    kmers = [el[1] for el in db1.get_kmers()]
    self.assertEqual(len(kmers), 4)
    for char in 'ATCG':
        self.assertIn(KMER_30 + char, kmers)
def calibrate_db(dropout, gap, burst, kmer_len, outfile, rotation, fasta):
    seqs = [str(el.seq) for el in SeqIO.parse(fasta, 'fasta')]
    kmers = set()
    for seq in seqs:
        for i in range(0, len(seq) - kmer_len, gap):
            for j in range(burst):
                j = 0
                if random.random() < dropout:
                    kmer = seq[i + j:i + j + kmer_len]
                    # kmer = 'A' + kmer + 'C'
                    kmers.add(kmer)
                    # frac = 30
                    # mut_kmer = kmer[:(kmer_len // frac)]
                    # mut_kmer += mutate_seq(kmer[(kmer_len // frac):((frac - 1) * kmer_len // frac)])
                    # mut_kmer += kmer[((frac - 1) * kmer_len // frac):]
                    # kmers.add(mut_kmer)
    click.echo(f'{len(kmers)} kmers', err=True)
    dist_tbl = pd.DataFrame(py_needle(list(kmers)), columns=['k1', 'k2', 'f_lev'])
    if rotation is None:
        ramifier = Ramifier(kmer_len)
    else:
        ramifier = RotatingRamifier.from_file(rotation)

    def rc_lev(row):
        s1, s2 = row['k1'], reverseComplement(row['k2'])
        return py_needle([s1, s2])[0][2]

    dist_tbl['rc_lev'] = dist_tbl.apply(rc_lev, axis=1)
    dist_tbl['lev'] = dist_tbl.apply(
        lambda row: min(row['f_lev'], row['rc_lev']), axis=1)

    def ram_dist(row):
        r1, r2 = ramifier.ramify(row['k1']), ramifier.ramify(row['k2'])
        return np.abs(r1 - r2).sum()

    dist_tbl['ram'] = dist_tbl.apply(ram_dist, axis=1)
    dist_tbl.to_csv(outfile)
def test_build_merge_contig_db(self):
    conn_1 = sqlite3.connect(':memory:')
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    contig_db_1 = ContigDB(conn_1, ramifier=ramifier, box_side_len=0.5)
    contig = random_kmer(2 * 10 * 1000)
    contig_db_1.py_add_contig('test_genome_1___test_contig_1', contig, gap=100)
    contig_db_1.commit()
    n_stored = len(contig_db_1.get_all_contigs())
    conn_2 = sqlite3.connect(':memory:')
    contig_db_2 = ContigDB(conn_2, ramifier=ramifier, box_side_len=0.5)
    contig = random_kmer(2 * 10 * 1000)
    contig_db_2.py_add_contig('test_genome_2___test_contig_2', contig, gap=100)
    contig_db_2.commit()
    n_stored += len(contig_db_2.get_all_contigs())
    contig_db_1.load_other(contig_db_2)
    self.assertEqual(len(contig_db_1.get_all_contigs()), n_stored)