Example #1
def merge_contig_dbs(contig_dbs):
    main_db = ContigDB.load_from_filepath(contig_dbs[0])
    start = time()
    with click.progressbar(contig_dbs[1:]) as dbs:
        for filename in dbs:
            main_db.load_other(ContigDB.load_from_filepath(filename),
                               rebuild_indices=False)
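    # per-merge index rebuilds were skipped above; rebuild the indices once here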
    main_db._build_indices()
    main_db.close()
    add_time = time() - start
    click.echo(
        f'Merged {len(contig_dbs)} dbs to {contig_dbs[0]} in {add_time:.5}s. ',
        err=True)
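The merge helper above expects a list of database file paths. A minimal way to drive it, assuming it can be called as a plain function (any click decorators are omitted from the snippet); the glob pattern and directory name are purely illustrative:

from glob import glob

# Merge every per-sample ContigDB found in a directory into the first one.
# The path pattern is a placeholder, not part of the original code.
contig_dbs = sorted(glob('contig_dbs/*.sqlite'))
merge_contig_dbs(contig_dbs)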
Example #2
def probe_calibrate_db(num_seqs, contig_multiplier, probe_multiplier, outfile,
                       probes, database):
    db = ContigDB.load_from_filepath(database)
    click.echo(f'K: {db.ramifier.k}', err=True)
    probes = [str(el.seq) for el in SeqIO.parse(probes, 'fasta')]
    probes = [
        select_one_kmer(seq, db.ramifier.k) for seq in probes
        for _ in range(probe_multiplier)
    ]
    contigs = random.sample(db.get_all_contigs(), num_seqs)
    contigs = [
        db.py_get_seq(contig_name, start_coord, end_coord)
        for contig_name, _, start_coord, end_coord in contigs
    ]
    contigs = [
        select_one_kmer(seq, db.ramifier.k) for seq in contigs
        for _ in range(contig_multiplier)
    ]
    click.echo(f'Comparisons: {len(contigs) * len(probes):,}', err=True)
    dist_tbl = py_needle_2(contigs, probes)
    dist_tbl = pd.DataFrame(dist_tbl,
                            columns=['contig', 'probe', 'levenshtein'])

    def ram_dist(row):
        r1 = db.ramifier.ramify(row['contig'])
        r2 = db.ramifier.ramify(row['probe'])
        return np.abs(r1 - r2).sum()

    dist_tbl['ram'] = dist_tbl.apply(ram_dist, axis=1)
    dist_tbl.to_csv(outfile)
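select_one_kmer is not defined on this page. From the call sites (a sequence plus the ramifier's k) it presumably returns one k-mer drawn at random from the sequence; a minimal sketch under that assumption:

import random

def select_one_kmer(seq, k):
    # Assumed behaviour: one k-mer chosen uniformly at random from seq
    # (callers only pass sequences with len(seq) >= k).
    start = random.randint(0, len(seq) - k)
    return seq[start:start + k]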
Example #3
def calibrate_db(num_seqs, num_mutants, outfile, database):
    db = ContigDB.load_from_filepath(database)
    click.echo(f'K: {db.ramifier.k}', err=True)
    prek = int(db.ramifier.k * 1.1)
    contigs = random.sample(db.get_all_contigs(), num_seqs)
    contigs = [
        db.py_get_seq(contig_name, start_coord, start_coord + prek + 100)
        for contig_name, _, start_coord, end_coord in contigs
    ]
    contigs = [
        select_one_kmer(seq, prek) for seq in contigs if len(seq) > prek
    ]
    click.echo(f'Total contigs: {len(contigs)}', err=True)
    mutated = [
        mutate_seq(seq, db.ramifier.k) for seq in contigs
        for _ in range(num_mutants)
    ]
    contigs = [select_one_kmer(kmer, db.ramifier.k)
               for kmer in contigs] + mutated
    click.echo(f'Comparisons: {len(contigs) * (len(contigs) - 1) // 2}',
               err=True)
    dist_tbl = pd.DataFrame(py_needle(contigs),
                            columns=['k1', 'k2', 'levenshtein'])

    def ram_dist(row):
        r1, r2 = db.ramifier.ramify(row['k1']), db.ramifier.ramify(row['k2'])
        return np.abs(r1 - r2).sum()

    dist_tbl['ram'] = dist_tbl.apply(ram_dist, axis=1)
    dist_tbl.to_csv(outfile)
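mutate_seq is also not shown. Given the calibration setup (each sampled sequence is paired with num_mutants perturbed k-mers), it plausibly extracts a k-mer and applies random point changes. The sketch below is illustrative only, reuses the select_one_kmer sketch above, and the substitution rate is a guess:

import random

def mutate_seq(seq, k, sub_rate=0.05):
    # Illustrative stand-in: pick one k-mer from seq and randomly substitute
    # bases at the assumed rate; the real helper may also introduce indels.
    kmer = select_one_kmer(seq, k)
    return ''.join(
        random.choice('ACGT') if random.random() < sub_rate else base
        for base in kmer
    )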
Example #4
def cli_dump_contigs(seq, outfile, contig_db):
    grid = ContigDB.load_from_filepath(contig_db)
    for cid, kmer, genome_name, contig_name, contig_coord in grid.get_all_contigs():
        if not seq:
            kmer = ''
        print(f'{cid} {genome_name} {contig_name} {contig_coord} {kmer}',
              file=outfile)
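outfile is passed straight to print(..., file=outfile), so it must be a writable file object. A minimal call, assuming cli_dump_contigs is usable as a plain callable (any click decorators are not shown) and using a placeholder database path:

import sys

# Dump every contig record, including its sequence, to stdout.
cli_dump_contigs(seq=True, outfile=sys.stdout, contig_db='contigs.sqlite')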
Example #5
 def test_build_contig_db_from_fasta(self):
     conn = sqlite3.connect(':memory:')
     ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
     contig_db = ContigDB(conn, ramifier=ramifier, box_side_len=0.5)
     contig_db.fast_add_kmers_from_fasta(KMER_FASTA)
     contig_db.commit()
     stored = contig_db.get_all_contigs()
     self.assertGreaterEqual(len(stored), 3)
Example #6
def build_contig_cover_fasta(radius, dimension, threads, outfile, rotation,
                             fasta_list):
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'  # numpy uses one of these two libraries
    environ['MKL_NUM_THREADS'] = f'{threads}'
    fasta_list = [line.strip() for line in fasta_list]
    ramifier = RotatingRamifier.from_file(dimension, rotation)
    grid = ContigDB(sqlite3.connect(outfile),
                    ramifier=ramifier,
                    box_side_len=radius)
    click.echo(f'Adding {len(fasta_list)} fastas.', err=True)
    start = time()
    n_added = 0
    with click.progressbar(fasta_list) as fastas:
        for fasta_filename in fastas:
            n_added += grid.fast_add_kmers_from_fasta(fasta_filename)
    grid.close()
    add_time = time() - start
    click.echo(f'Added {n_added:,} kmers to {outfile} in {add_time:.5}s. ',
               err=True)
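fasta_list is read line by line, so it is presumably an open text file listing one FASTA path per line. A hedged driver under that assumption; the file names and parameter values are placeholders, and the function is assumed to be callable directly (any click decorators are omitted):

# 'fastas.txt' would contain one FASTA path per line.
with open('fastas.txt') as fasta_list:
    build_contig_cover_fasta(
        radius=0.5,
        dimension=4,
        threads=1,
        outfile='contig_cover.sqlite',
        rotation='kmer_rotation.json',
        fasta_list=fasta_list,
    )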
Example #7
 def test_build_contig_db(self):
     conn = sqlite3.connect(':memory:')
     ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
     contig_db = ContigDB(conn, ramifier=ramifier, box_side_len=0.5)
     contig = random_kmer(2 * 10 * 1000)
     contig_db.py_add_contig('test_genome___test_contig', contig, gap=100)
     contig_db.commit()
     stored = contig_db.get_all_contigs()
     self.assertGreaterEqual(len(stored), 2)
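The tests rely on a random_kmer helper that is not shown; it presumably just builds a random DNA string of the requested length. A minimal sketch:

import random

def random_kmer(length):
    # Assumed test helper: a uniformly random DNA sequence of the given length.
    return ''.join(random.choice('ACGT') for _ in range(length))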
Example #8
 def test_search_contig_db(self):
     conn = sqlite3.connect(':memory:')
     ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
     contig_db = ContigDB(conn, ramifier=ramifier, box_side_len=0.5)
     contig = random_kmer(2 * 10 * 1000)
     contig_db.py_add_contig('test_genome___test_contig', contig, gap=10)
     contig_db.commit()
     stored = contig_db.get_all_contigs()
     searcher = ContigSearcher(contig_db)
     hits = searcher.py_search(contig[500:600], 0.1, 0.5)
     self.assertGreaterEqual(len(hits), 1)
Example #9
    def test_search_bigger_contig_db_exact(self):
        contig_db = ContigDB(
            sqlite3.connect(':memory:'),
            ramifier=RotatingRamifier.from_file(4, KMER_ROTATION),
            box_side_len=0.0001,
        )
        n_contigs, contig_len = 3, 2 * 10 * 1000
        contigs = [random_kmer(contig_len) for _ in range(n_contigs)]
        for i, contig in enumerate(contigs):
            contig_db.py_add_contig(f'test_genome_{i}___test_contig_{i}',
                                    contig,
                                    gap=1)
        contig_db.commit()
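        # The assertion below assumes the ramifier's k is 31: with gap=1 every
        # position yields a k-mer, so each contig contributes contig_len - k + 1
        # centroids.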
        self.assertEqual(contig_db.centroids().shape[0],
                         n_contigs * (contig_len - 31 + 1))

        searcher = ContigSearcher(contig_db)
        hits = searcher.py_search(contigs[0][500:600], 0, 1)
        self.assertEqual(len(hits), 1)
Example #10
def build_contig_from_pre(radius, threads, outfile, pre_list):
    environ['OPENBLAS_NUM_THREADS'] = f'{threads}'  # numpy uses one of these two libraries
    environ['MKL_NUM_THREADS'] = f'{threads}'
    pre_list = [line.strip() for line in pre_list]
    click.echo(f'Adding {len(pre_list)} predbs.', err=True)
    start = time()
    predb = PreContigDB.load_from_filepath(pre_list[0])
    grid = ContigDB.from_predb(outfile, predb, radius)
    grid._drop_indices()
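    # indices stay dropped while the remaining predbs are bulk-loaded; they are rebuilt below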
    with click.progressbar(pre_list) as pres:
        for i, predb_filename in enumerate(pres):
            if i > 0:
                grid.add_from_predb(
                    PreContigDB.load_from_filepath(predb_filename))
    grid.commit()
    grid._build_indices()
    grid.close()
    add_time = time() - start
    click.echo(f'Added predbs to {outfile} in {add_time:.5}s. ', err=True)
Example #11
 def test_fileio_contig_db(self):
     fname = 'temp.test_contig_db.sqlite'
     try:
         remove(fname)
     except FileNotFoundError:
         pass
     conn = sqlite3.connect(fname)
     ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
     contig_db = ContigDB(conn, ramifier=ramifier, box_side_len=1)
     contig = random_kmer(2 * 10 * 1000)
     contig_db.py_add_contig('test_genome___test_contig', contig, gap=100)
     contig_db.commit()
     from_store = ContigDB.load_from_filepath(fname)
     self.assertEqual(contig_db.current_seq_coord,
                      from_store.current_seq_coord)
     self.assertEqual(len(contig_db.centroid_cache),
                      len(from_store.centroid_cache))
     for key, val in contig_db.centroid_cache.items():
         self.assertIn(key, from_store.centroid_cache)
         self.assertEqual(val, from_store.centroid_cache[key])
     remove(fname)
Example #12
    def test_build_merge_contig_db(self):
        conn_1 = sqlite3.connect(':memory:')
        ramifier = RotatingRamifier.from_file(4, KMER_ROTATION)
        contig_db_1 = ContigDB(conn_1, ramifier=ramifier, box_side_len=0.5)
        contig = random_kmer(2 * 10 * 1000)
        contig_db_1.py_add_contig('test_genome_1___test_contig_1',
                                  contig,
                                  gap=100)
        contig_db_1.commit()
        n_stored = len(contig_db_1.get_all_contigs())

        conn_2 = sqlite3.connect(':memory:')
        contig_db_2 = ContigDB(conn_2, ramifier=ramifier, box_side_len=0.5)
        contig = random_kmer(2 * 10 * 1000)
        contig_db_2.py_add_contig('test_genome_2___test_contig_2',
                                  contig,
                                  gap=100)
        contig_db_2.commit()
        n_stored += len(contig_db_2.get_all_contigs())

        contig_db_1.load_other(contig_db_2)

        self.assertEqual(len(contig_db_1.get_all_contigs()), n_stored)