def merge_grid_cover(final_db, other_dbs):
    """Merge the grid-cover databases in `other_dbs` into the one at `final_db`.

    If no file exists at `final_db`, the first entry of `other_dbs` is copied
    to that path and only the remaining entries are merged in. Otherwise
    every entry of `other_dbs` is merged into the existing file.
    """
    pending = other_dbs
    if not isfile(final_db):
        # Seed the target from the first source; it must not be merged twice.
        seed, pending = other_dbs[0], other_dbs[1:]
        copyfile(seed, final_db)
    merged = GridCoverDB.load_from_filepath(final_db)
    for source_path in pending:
        merged.load_other(GridCoverDB.load_from_filepath(source_path))
def test_get_centroids(self):
    """Three k-mers at two distinct points should yield a (2, 4) centroid array."""
    rotation = RotatingRamifier.from_file(4, KMER_ROTATION)
    cover = GridCoverDB(sqlite3.connect(':memory:'), ramifier=rotation, box_side_len=0.5)
    # (coordinates, final base appended to KMER_30)
    additions = (
        ([0., 0., 0., 0.], 'A'),
        ([0., 0., 0., 0.], 'T'),
        ([1., 0., 0., 0.], 'C'),
    )
    for coords, last_base in additions:
        cover.py_add_point_to_cluster(np.array(coords), KMER_30 + last_base)
    cover.commit()
    self.assertEqual(cover.centroids().shape, (2, 4))
def cli_dump_kmers(outfile, cluster_ids, grid_cover):
    """Print every k-mer stored in the grid cover at `grid_cover` to `outfile`.

    One line is written per k-mer. When `cluster_ids` is truthy each line is
    `centroid_index,kmer`; otherwise the line holds only the k-mer.
    """
    cover = GridCoverDB.load_from_filepath(grid_cover)
    for cluster, sequence in cover.get_kmers():
        row = f'{cluster},{sequence}' if cluster_ids else sequence
        print(row, file=outfile)
def cli_dump_kmers(outfile, grid_cover):
    """Print `centroid_index,count` lines giving the number of k-mers per centroid.

    Counts are taken over the pairs returned by `get_kmers()` and printed to
    `outfile` in the order each centroid index was first seen.
    """
    cover = GridCoverDB.load_from_filepath(grid_cover)
    tally = {}
    for cluster, _kmer in cover.get_kmers():
        if cluster in tally:
            tally[cluster] += 1
        else:
            tally[cluster] = 1
    for cluster in tally:
        print(f'{cluster},{tally[cluster]}', file=outfile)
def build_grid_cover(grid_db):
    """Build and store a bloom grid for every centroid in the database at `grid_db`.

    Progress is shown with a click progress bar. After the database is
    closed, a summary with the elapsed wall-clock time is echoed to stderr.
    """
    cover = GridCoverDB.load_from_filepath(grid_db)
    began = time()
    n_centers = cover.centroids().shape[0]
    all_ids = list(range(n_centers))
    with click.progressbar(all_ids) as tracked_ids:
        for one_id in tracked_ids:
            cover.build_and_store_bloom_grid(one_id)
    cover.close()
    elapsed = time() - began
    click.echo(f'Built {n_centers} bloom filters in {elapsed:.5}s.', err=True)
def cli_dump_kmers(grid_cover):
    """Echo summary statistics for the grid cover stored at `grid_cover`.

    Prints the path, then tab-separated `centers`, `kmers`, `box_side` and
    `dims` rows, one per line.
    """
    click.echo(grid_cover)
    cover = GridCoverDB.load_from_filepath(grid_cover)
    # Each value is computed immediately before its row is echoed.
    click.echo(f'centers\t{cover.centroids().shape[0]}')
    click.echo(f'kmers\t{len(cover.get_kmers())}')
    click.echo(f'box_side\t{cover.box_side_len}')
    click.echo(f'dims\t{cover.ramifier.d}')
def test_build_grid_cover_from_fasta(self):
    """Loading KMER_FASTA should store 98 k-mers under between 1 and 97 centroids."""
    expected_kmers = 98
    rotation = RotatingRamifier.from_file(4, KMER_ROTATION)
    builder = GridCoverBuilder(
        GridCoverDB(sqlite3.connect(':memory:'), ramifier=rotation, box_side_len=0.5)
    )
    builder.fast_add_kmers_from_fasta(KMER_FASTA)
    builder.commit()
    centroid_count = builder.db.centroids().shape[0]
    kmer_count = len(builder.db.get_kmers())
    # The centroid count must be positive and strictly below the k-mer count.
    self.assertGreater(centroid_count, 0)
    self.assertLess(centroid_count, expected_kmers)
    self.assertEqual(kmer_count, expected_kmers)
def test_add_kmer(self):
    """A single k-mer added at the origin should be the only member of cluster 0."""
    rotation = RotatingRamifier.from_file(4, KMER_ROTATION)
    cover = GridCoverDB(sqlite3.connect(':memory:'), ramifier=rotation, box_side_len=0.5)
    origin = np.array([0., 0., 0., 0.])
    cover.py_add_point_to_cluster(origin, KMER_31)
    cover.commit()
    stored = cover.py_get_cluster_members(0)
    self.assertEqual(len(stored), 1)
    # Members are decoded with reverse_convert_kmer before comparison.
    decoded = list(map(reverse_convert_kmer, stored))
    self.assertIn(KMER_31, decoded)
def test_save_and_reload(self):
    """Write a k-mer to an on-disk database, reopen the file and read it back.

    No explicit commit is issued before `close()`, so the test expects the
    insert to be visible after reloading from the same path.

    The scratch SQLite file is removed in a `finally` clause. Previously it
    was deleted only when every assertion passed, so a failure left the file
    behind to be reopened by the next run.
    """
    db_path = join(dirname(__file__), 'temp.db_save_temp.sqlite')
    try:
        ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
        db = GridCoverDB(sqlite3.connect(db_path), ramifier=ramifier, box_side_len=0.5)
        db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_31)
        db.close()
        del db
        db = GridCoverDB.load_from_filepath(db_path)
        members = db.py_get_cluster_members(0)
        self.assertEqual(len(members), 1)
        self.assertIn(KMER_31, [reverse_convert_kmer(member) for member in members])
    finally:
        try:
            remove(db_path)
        except FileNotFoundError:
            # Setup failed before the file was created; nothing to clean up.
            pass
def test_save(self):
    """Check that a file-backed database accepts a k-mer and closes without error.

    The scratch SQLite file is removed in a `finally` clause. Previously the
    removal was skipped whenever the insert or `close()` raised, leaving the
    file on disk.
    """
    db_path = join(dirname(__file__), 'temp.db_save_temp.sqlite')
    try:
        ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
        db = GridCoverDB(sqlite3.connect(db_path), ramifier=ramifier, box_side_len=0.5)
        db.py_add_point_to_cluster(np.array([0., 0., 0., 0.]), KMER_31)
        db.close()
    finally:
        try:
            remove(db_path)
        except FileNotFoundError:
            # Setup failed before the file was created; nothing to clean up.
            pass
def cli_dump_kmers(outfile, grid_cover):
    """Write the centroids of the grid cover at `grid_cover` to `outfile` as CSV.

    The CSV is written without a header row and without an index column.
    """
    cover = GridCoverDB.load_from_filepath(grid_cover)
    centroid_table = pd.DataFrame(cover.centroids())
    centroid_table.to_csv(outfile, header=None, index=None)
def test_pre_build_blooms(self):
    """Bloom grids built for centroids 0 and 1 can be retrieved and queried.

    For each grid, the maximum of `py_count_grid_contains` for a k-mer added
    to that centroid must equal `32 - col_k`. Retrieving a grid for centroid
    2 must raise IndexError.
    """
    rotation = RotatingRamifier.from_file(4, KMER_ROTATION)
    cover = GridCoverDB(sqlite3.connect(':memory:'), ramifier=rotation, box_side_len=0.5)
    additions = (
        ([0., 0., 0., 0.], 'A'),
        ([0., 0., 0., 0.], 'T'),
        ([1., 0., 0., 0.], 'C'),
    )
    for coords, last_base in additions:
        cover.py_add_point_to_cluster(np.array(coords), KMER_30 + last_base)
    cover.commit()

    built_ids = (0, 1)
    for centroid in built_ids:
        cover.build_and_store_bloom_grid(centroid)
    # Both grids are fetched before either is checked.
    bloom_grids = [cover.retrieve_bloom_grid(centroid) for centroid in built_ids]
    for bloom_grid, last_base in zip(bloom_grids, 'AC'):
        hit_counts = bloom_grid.py_count_grid_contains(KMER_30 + last_base)
        self.assertEqual(max(hit_counts), 32 - bloom_grid.col_k)

    with self.assertRaises(IndexError):
        cover.retrieve_bloom_grid(2)
def test_merge_dbs(self):
    """Merging a second database into the first should combine their contents.

    Each database holds two k-mers and both contain a point at the origin.
    After `load_other` the first database must report a (3, 4) centroid
    array and all four k-mers.
    """
    rotation = RotatingRamifier.from_file(4, KMER_ROTATION)

    def populated_db(entries):
        # Build an in-memory database, add each (coords, base) entry, commit.
        fresh = GridCoverDB(sqlite3.connect(':memory:'), ramifier=rotation, box_side_len=0.5)
        for coords, last_base in entries:
            fresh.py_add_point_to_cluster(np.array(coords), KMER_30 + last_base)
        fresh.commit()
        return fresh

    target = populated_db([([0., 0., 0., 0.], 'A'), ([1., 0., 0., 0.], 'T')])
    donor = populated_db([([0., 0., 0., 0.], 'C'), ([1., 1., 0., 0.], 'G')])
    target.load_other(donor)

    self.assertEqual(target.centroids().shape, (3, 4))
    merged_kmers = []
    for record in target.get_kmers():
        merged_kmers.append(record[1])
    self.assertEqual(len(merged_kmers), 4)
    for last_base in 'ATCG':
        self.assertIn(KMER_30 + last_base, merged_kmers)