Esempio n. 1
0
def calibrate_db(num_seqs, num_mutants, outfile, database):
    """Write a table of Levenshtein vs. ramified distances for sampled k-mers.

    Contigs are sampled from the database, one k-mer is cut out of each,
    mutated copies are added, and every pairwise comparison produced by
    ``py_needle`` is written to ``outfile`` as CSV together with the L1
    distance between the two ramified k-mers.

    Args:
        num_seqs: number of contigs to sample. Capped (with a warning on
            stderr) at the number of contigs the database holds.
        num_mutants: number of mutated copies generated per sampled k-mer.
        outfile: destination handed to ``DataFrame.to_csv``.
        database: path handed to ``ContigDB.load_from_filepath``.
    """
    db = ContigDB.load_from_filepath(database)
    k = db.ramifier.k
    click.echo(f'K: {k}', err=True)
    # Pre-kmers are 10% longer than k; they are later reduced to length k by
    # select_one_kmer / mutate_seq (presumably to leave room for indels --
    # TODO confirm against mutate_seq).
    prek = int(k * 1.1)

    # random.sample needs a real sequence and raises ValueError when asked
    # for more items than exist, so materialise the contigs and cap the size.
    all_contigs = list(db.get_all_contigs())
    if num_seqs > len(all_contigs):
        click.echo(
            f'Only {len(all_contigs)} contigs available; sampling all of them.',
            err=True,
        )
        num_seqs = len(all_contigs)
    sampled = random.sample(all_contigs, num_seqs)

    windows = [
        db.py_get_seq(contig_name, start_coord, start_coord + prek + 100)
        for contig_name, _, start_coord, _ in sampled
    ]
    # Windows that are not longer than prek are discarded.
    prekmers = [
        select_one_kmer(seq, prek) for seq in windows if len(seq) > prek
    ]
    click.echo(f'Total contigs: {len(prekmers)}', err=True)

    # Mutants are generated before the originals are cut down to k, matching
    # the order in which the helpers were originally called.
    mutated = [
        mutate_seq(seq, k) for seq in prekmers
        for _ in range(num_mutants)
    ]
    kmers = [select_one_kmer(seq, k) for seq in prekmers] + mutated

    # n choose 2; assumes py_needle scores each unordered pair exactly once
    # -- TODO confirm.
    n_kmers = len(kmers)
    click.echo(f'Comparisons: {n_kmers * (n_kmers - 1) // 2}', err=True)
    dist_tbl = pd.DataFrame(py_needle(kmers),
                            columns=['k1', 'k2', 'levenshtein'])

    # A plain comprehension instead of DataFrame.apply(axis=1): it yields an
    # empty column (rather than an empty frame) when there are no pairs.
    ramify = db.ramifier.ramify
    dist_tbl['ram'] = [
        np.abs(ramify(k1) - ramify(k2)).sum()
        for k1, k2 in zip(dist_tbl['k1'], dist_tbl['k2'])
    ]
    dist_tbl.to_csv(outfile)
Esempio n. 2
0
 def test_needle(self):
     """Unnormalized distances are 2 for pairs involving GAP, otherwise 1."""
     scored_pairs = py_needle([KMER_31, MIS, GAP], normalize=False)
     for left, right, observed in scored_pairs:
         involves_gap = left == GAP or right == GAP
         expected = 2 if involves_gap else 1
         self.assertEqual(observed, expected)
Esempio n. 3
0
def cli_lev_dist_matrix(gap, kmer_len, outfile, fasta):
    """Write the pairwise distance table for k-mers parsed from a fasta.

    The number of parsed k-mers and the time spent building the table are
    reported on stderr; the table itself is written to ``outfile`` as CSV.
    """
    kmers = parse_seqs(fasta, kmer_len, gap)
    click.echo(f'{len(kmers)} unique kmers.', err=True)

    # Only the distance computation and table construction are timed.
    began_at = time()
    records = py_needle(kmers)
    table = pd.DataFrame(records, columns=['k1', 'k2', 'lev'])
    elapsed = time() - began_at
    click.echo(f'{elapsed:.5}s to build distance matrix.', err=True)

    table.to_csv(outfile)
Esempio n. 4
0
def calibrate_db(dropout, gap, burst, kmer_len, outfile, rotation, fasta):
    """Write a table of Levenshtein vs. ramified distances for fasta k-mers.

    K-mers are sampled from every record of ``fasta``, compared pairwise with
    ``py_needle``, and written to ``outfile`` as CSV with these columns:
    ``f_lev`` (forward distance), ``rc_lev`` (distance to the reverse
    complement of ``k2``), ``lev`` (the smaller of the two) and ``ram`` (L1
    distance between the ramified k-mers).

    Args:
        dropout: probability that a candidate k-mer is KEPT; despite the
            name, a k-mer is added when ``random.random() < dropout``.
        gap: step between the start positions of successive bursts.
        burst: number of consecutive start offsets tried at each step.
        kmer_len: length of every sampled k-mer.
        outfile: destination handed to ``DataFrame.to_csv``.
        rotation: file handed to ``RotatingRamifier.from_file``; when None a
            plain ``Ramifier(kmer_len)`` is used instead.
        fasta: input handed to ``SeqIO.parse(..., 'fasta')``.
    """
    seqs = [str(el.seq) for el in SeqIO.parse(fasta, 'fasta')]
    kmers = set()
    for seq in seqs:
        # Last index at which a full-length k-mer still fits; the +1 below
        # makes that final window reachable (it was previously skipped).
        last_start = len(seq) - kmer_len
        for i in range(0, last_start + 1, gap):
            for j in range(burst):
                # The offset j used to be reset to 0 on every pass, which
                # made ``burst`` re-draw the same window instead of walking
                # consecutive ones.
                start = i + j
                if start > last_start:
                    break  # never add a truncated k-mer at the sequence end
                if random.random() < dropout:
                    kmers.add(seq[start:start + kmer_len])

    click.echo(f'{len(kmers)} kmers', err=True)
    dist_tbl = pd.DataFrame(py_needle(list(kmers)),
                            columns=['k1', 'k2', 'f_lev'])

    if rotation is None:
        ramifier = Ramifier(kmer_len)
    else:
        ramifier = RotatingRamifier.from_file(rotation)

    # The derived columns are built with plain comprehensions instead of
    # DataFrame.apply(axis=1) so that an empty table (fewer than two k-mers)
    # yields empty columns rather than an empty frame.
    dist_tbl['rc_lev'] = [
        py_needle([k1, reverseComplement(k2)])[0][2]
        for k1, k2 in zip(dist_tbl['k1'], dist_tbl['k2'])
    ]
    dist_tbl['lev'] = [
        min(fwd, rc) for fwd, rc in zip(dist_tbl['f_lev'], dist_tbl['rc_lev'])
    ]
    dist_tbl['ram'] = [
        np.abs(ramifier.ramify(k1) - ramifier.ramify(k2)).sum()
        for k1, k2 in zip(dist_tbl['k1'], dist_tbl['k2'])
    ]
    dist_tbl.to_csv(outfile)
Esempio n. 5
0
 def rc_lev(row):
     """Score row['k1'] against the reverse complement of row['k2'].

     Returns the third field of the first record produced by ``py_needle``
     for that two-sequence input.
     """
     forward = row['k1']
     revcomp = reverseComplement(row['k2'])
     first_record = py_needle([forward, revcomp])[0]
     return first_record[2]
Esempio n. 6
0
 def test_needle_equal(self):
     """Two identical k-mers have an unnormalized distance of zero."""
     identical_pair = [KMER_31] * 2
     for _, _, observed in py_needle(identical_pair, normalize=False):
         self.assertEqual(observed, 0)