def merge_contig_dbs(contig_dbs):
    """Merge every ContigDB file in `contig_dbs` into the first one, in place.

    The first path is opened as the destination; each remaining DB is loaded
    and folded into it. Index rebuilding is deferred until all merges are
    done, then the destination is closed and a timing summary is echoed.
    """
    main_db = ContigDB.load_from_filepath(contig_dbs[0])
    start = time()
    with click.progressbar(contig_dbs[1:]) as dbs:
        for filename in dbs:
            other = ContigDB.load_from_filepath(filename)
            # Skip per-merge index rebuilds; one rebuild below covers all.
            main_db.load_other(other, rebuild_indices=False)
    main_db._build_indices()
    main_db.close()
    add_time = time() - start
    click.echo(
        f'Merged {len(contig_dbs)} dbs to {contig_dbs[0]} in {add_time:.5}s. ',
        err=True)
def probe_calibrate_db(num_seqs, contig_multiplier, probe_multiplier, outfile, probes, database):
    """Write a CSV comparing alignment distance to ramified distance.

    K-mers are drawn both from probe sequences (a FASTA file) and from
    `num_seqs` randomly sampled contigs in the database; each source sequence
    contributes `probe_multiplier` / `contig_multiplier` k-mers respectively.
    Every contig/probe k-mer pair is scored by `py_needle_2` (Levenshtein)
    and by L1 distance between ramified vectors, then saved to `outfile`.
    """
    db = ContigDB.load_from_filepath(database)
    click.echo(f'K: {db.ramifier.k}', err=True)
    probe_seqs = [str(el.seq) for el in SeqIO.parse(probes, 'fasta')]
    probes = []
    for seq in probe_seqs:
        for _ in range(probe_multiplier):
            probes.append(select_one_kmer(seq, db.ramifier.k))
    sampled = random.sample(db.get_all_contigs(), num_seqs)
    contig_seqs = []
    for contig_name, _, start_coord, end_coord in sampled:
        contig_seqs.append(db.py_get_seq(contig_name, start_coord, end_coord))
    contigs = []
    for seq in contig_seqs:
        for _ in range(contig_multiplier):
            contigs.append(select_one_kmer(seq, db.ramifier.k))
    click.echo(f'Comparisons: {len(contigs) * len(probes):,}', err=True)
    dist_tbl = pd.DataFrame(
        py_needle_2(contigs, probes),
        columns=['contig', 'probe', 'levenshtein'],
    )

    def ram_dist(row):
        # L1 distance between the two ramified representations.
        r1 = db.ramifier.ramify(row['contig'])
        r2 = db.ramifier.ramify(row['probe'])
        return np.abs(r1 - r2).sum()

    dist_tbl['ram'] = dist_tbl.apply(ram_dist, axis=1)
    dist_tbl.to_csv(outfile)
def calibrate_db(num_seqs, num_mutants, outfile, database):
    """Write a CSV relating Levenshtein distance to ramified (L1) distance.

    Samples `num_seqs` contigs from the database, takes one `prek`-length
    k-mer from each (prek is slightly larger than k so mutants can still
    yield a full k-mer), generates `num_mutants` mutated copies per k-mer,
    then scores every unordered pair with `py_needle` and with the L1
    distance between ramified vectors. Results go to `outfile`.
    """
    db = ContigDB.load_from_filepath(database)
    click.echo(f'K: {db.ramifier.k}', err=True)
    # 10% headroom over k so mutate_seq has room to delete/trim.
    prek = int(db.ramifier.k * 1.1)
    contigs = random.sample(db.get_all_contigs(), num_seqs)
    contigs = [
        db.py_get_seq(contig_name, start_coord, start_coord + prek + 100)
        for contig_name, _, start_coord, end_coord in contigs
    ]
    contigs = [
        select_one_kmer(seq, prek) for seq in contigs if len(seq) > prek
    ]
    click.echo(f'Total contigs: {len(contigs)}', err=True)
    mutated = [
        mutate_seq(seq, db.ramifier.k)
        for seq in contigs
        for _ in range(num_mutants)
    ]
    contigs = [select_one_kmer(kmer, db.ramifier.k) for kmer in contigs] + mutated
    # BUG FIX: the old count `(n ** 2) / 2 - n` undercounted unordered pairs
    # by n / 2 (and printed a float); n-choose-2 is n * (n - 1) / 2.
    n_comparisons = len(contigs) * (len(contigs) - 1) // 2
    click.echo(f'Comparisons: {n_comparisons}', err=True)
    dist_tbl = pd.DataFrame(py_needle(contigs), columns=['k1', 'k2', 'levenshtein'])

    def ram_dist(row):
        # L1 distance between the two ramified representations.
        r1, r2 = db.ramifier.ramify(row['k1']), db.ramifier.ramify(row['k2'])
        return np.abs(r1 - r2).sum()

    dist_tbl['ram'] = dist_tbl.apply(ram_dist, axis=1)
    dist_tbl.to_csv(outfile)
def cli_dump_contigs(seq, outfile, contig_db):
    """Print one line per stored contig record to `outfile`.

    Each line carries the record id, genome name, contig name, and coordinate;
    the k-mer sequence is appended only when `seq` is truthy (otherwise the
    trailing field is empty).
    """
    grid = ContigDB.load_from_filepath(contig_db)
    for cid, kmer, genome_name, contig_name, contig_coord in grid.get_all_contigs():
        # Blank out the sequence column unless it was requested.
        kmer = kmer if seq else ''
        print(f'{cid} {genome_name} {contig_name} {contig_coord} {kmer}', file=outfile)
def test_build_contig_db_from_fasta(self):
    """Building a ContigDB from a FASTA file stores at least three contigs."""
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    contig_db = ContigDB(
        sqlite3.connect(':memory:'),
        ramifier=ramifier,
        box_side_len=0.5,
    )
    contig_db.fast_add_kmers_from_fasta(KMER_FASTA)
    contig_db.commit()
    self.assertGreaterEqual(len(contig_db.get_all_contigs()), 3)
def build_contig_cover_fasta(radius, dimension, threads, outfile, rotation, fasta_list):
    """Build a ContigDB at `outfile` from every FASTA path listed in `fasta_list`.

    `fasta_list` is an iterable of lines, each a path to a FASTA file.
    Thread count is propagated to numpy's BLAS backends via environment
    variables before any numeric work starts.
    """
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'  # numpy uses one of these two libraries
    environ['MKL_NUM_THREADS'] = f'{threads}'
    fasta_list = [line.strip() for line in fasta_list]
    ramifier = RotatingRamifier.from_file(dimension, rotation)
    grid = ContigDB(
        sqlite3.connect(outfile), ramifier=ramifier, box_side_len=radius
    )
    click.echo(f'Adding {len(fasta_list)} fastas.', err=True)
    start = time()
    # BUG FIX: n_added was previously reassigned each iteration, so the
    # summary reported only the LAST file's count (and raised NameError on
    # an empty list). Accumulate across files instead.
    # NOTE(review): assumes fast_add_kmers_from_fasta returns the count
    # added for that one file — confirm against its implementation.
    n_added = 0
    with click.progressbar(fasta_list) as fastas:
        for fasta_filename in fastas:
            n_added += grid.fast_add_kmers_from_fasta(fasta_filename)
    grid.close()
    add_time = time() - start
    click.echo(f'Added {n_added:,} kmers to {outfile} in {add_time:.5}s. ', err=True)
def test_build_contig_db(self):
    """A 20kb random contig added with gap=100 yields at least two stored records."""
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    contig_db = ContigDB(
        sqlite3.connect(':memory:'),
        ramifier=ramifier,
        box_side_len=0.5,
    )
    contig = random_kmer(2 * 10 * 1000)
    contig_db.py_add_contig('test_genome___test_contig', contig, gap=100)
    contig_db.commit()
    stored = contig_db.get_all_contigs()
    self.assertGreaterEqual(len(stored), 2)
def test_search_contig_db(self):
    """Searching a 100bp slice of an indexed contig returns at least one hit."""
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    contig_db = ContigDB(
        sqlite3.connect(':memory:'),
        ramifier=ramifier,
        box_side_len=0.5,
    )
    contig = random_kmer(2 * 10 * 1000)
    contig_db.py_add_contig('test_genome___test_contig', contig, gap=10)
    contig_db.commit()
    stored = contig_db.get_all_contigs()  # NOTE(review): unused — looks like a leftover sanity read
    searcher = ContigSearcher(contig_db)
    hits = searcher.py_search(contig[500:600], 0.1, 0.5)
    self.assertGreaterEqual(len(hits), 1)
def test_search_bigger_contig_db_exact(self):
    """Exact search (radius 0) in a multi-contig DB hits exactly one contig."""
    n_contigs, contig_len = 3, 2 * 10 * 1000
    contig_db = ContigDB(
        sqlite3.connect(':memory:'),
        ramifier=RotatingRamifier.from_file(4, KMER_ROTATION),
        box_side_len=0.0001,  # tiny boxes: every k-mer gets its own centroid
    )
    contigs = [random_kmer(contig_len) for _ in range(n_contigs)]
    for i, contig in enumerate(contigs):
        contig_db.py_add_contig(f'test_genome_{i}___test_contig_{i}', contig, gap=1)
    contig_db.commit()
    # With gap=1 every k-mer position is indexed; 31 is presumably the
    # k-mer length used here — TODO confirm against the fixture.
    expected = n_contigs * (contig_len - 31 + 1)
    self.assertEqual(contig_db.centroids().shape[0], expected)
    searcher = ContigSearcher(contig_db)
    hits = searcher.py_search(contigs[0][500:600], 0, 1)
    self.assertEqual(len(hits), 1)
def build_contig_from_pre(radius, threads, outfile, pre_list):
    """Assemble a ContigDB at `outfile` from a list of pre-database paths.

    The first predb seeds the output DB via `from_predb`; every subsequent
    predb is merged into it. Indices are dropped during the bulk load and
    rebuilt once at the end.
    """
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'  # numpy uses one of these two libraries
    environ['MKL_NUM_THREADS'] = f'{threads}'
    pre_list = [line.strip() for line in pre_list]
    click.echo(f'Adding {len(pre_list)} predbs.', err=True)
    start = time()
    first_predb = PreContigDB.load_from_filepath(pre_list[0])
    grid = ContigDB.from_predb(outfile, first_predb, radius)
    grid._drop_indices()  # speed up the bulk inserts; rebuilt once below
    with click.progressbar(pre_list) as pres:
        for i, predb_filename in enumerate(pres):
            if i == 0:
                continue  # first predb was already folded in by from_predb
            grid.add_from_predb(PreContigDB.load_from_filepath(predb_filename))
    grid.commit()
    grid._build_indices()
    grid.close()
    add_time = time() - start
    click.echo(f'Added predbs to {outfile} in {add_time:.5}s. ', err=True)
def test_fileio_contig_db(self):
    """A ContigDB written to disk round-trips its coord and centroid-cache state."""
    fname = 'temp.test_contig_db.sqlite'
    try:
        remove(fname)  # clear any residue from a previous failed run
    except FileNotFoundError:
        pass
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
    contig_db = ContigDB(sqlite3.connect(fname), ramifier=ramifier, box_side_len=1)
    contig = random_kmer(2 * 10 * 1000)
    contig_db.py_add_contig('test_genome___test_contig', contig, gap=100)
    contig_db.commit()
    from_store = ContigDB.load_from_filepath(fname)
    self.assertEqual(contig_db.current_seq_coord, from_store.current_seq_coord)
    self.assertEqual(len(contig_db.centroid_cache), len(from_store.centroid_cache))
    # Every cached centroid must survive the round trip unchanged.
    for key, val in contig_db.centroid_cache.items():
        self.assertIn(key, from_store.centroid_cache)
        self.assertEqual(val, from_store.centroid_cache[key])
    remove(fname)
def test_build_merge_contig_db(self):
    """Merging two DBs via load_other preserves the combined contig count."""
    ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)

    contig_db_1 = ContigDB(sqlite3.connect(':memory:'), ramifier=ramifier, box_side_len=0.5)
    contig_db_1.py_add_contig(
        'test_genome_1___test_contig_1', random_kmer(2 * 10 * 1000), gap=100)
    contig_db_1.commit()
    total = len(contig_db_1.get_all_contigs())

    contig_db_2 = ContigDB(sqlite3.connect(':memory:'), ramifier=ramifier, box_side_len=0.5)
    contig_db_2.py_add_contig(
        'test_genome_2___test_contig_2', random_kmer(2 * 10 * 1000), gap=100)
    contig_db_2.commit()
    total += len(contig_db_2.get_all_contigs())

    contig_db_1.load_other(contig_db_2)
    self.assertEqual(len(contig_db_1.get_all_contigs()), total)