def calibrate_db(num_seqs, num_mutants, outfile, database):
    """Compare Levenshtein and ramified distances on k-mers drawn from a database.

    Loads the contig database at ``database``, randomly samples ``num_seqs``
    contig records, fetches a sequence window for each, and derives k-mers
    plus ``num_mutants`` mutated variants per k-mer. The pairwise table from
    ``py_needle`` is extended with a ``ram`` column (sum of absolute
    differences between the ramified forms of ``k1`` and ``k2``) and written
    to ``outfile`` as CSV. Progress messages are echoed with ``err=True``.
    """
    db = ContigDB.load_from_filepath(database)
    click.echo(f'K: {db.ramifier.k}', err=True)
    k = db.ramifier.k
    # Intermediate length, 10% above k, used before the final cut to k.
    prek = int(k * 1.1)

    sampled = random.sample(db.get_all_contigs(), num_seqs)
    windows = []
    for contig_name, _, start_coord, _end_coord in sampled:
        stop_coord = start_coord + prek + 100
        windows.append(db.py_get_seq(contig_name, start_coord, stop_coord))

    # Windows that are not strictly longer than prek are discarded.
    contigs = []
    for window in windows:
        if len(window) > prek:
            contigs.append(select_one_kmer(window, prek))
    click.echo(f'Total contigs: {len(contigs)}', err=True)

    # Mutants are generated before the originals are cut to k; the helpers
    # may consume random state, so this call order is kept as is.
    mutated = []
    for pre_kmer in contigs:
        for _ in range(num_mutants):
            mutated.append(mutate_seq(pre_kmer, k))
    trimmed = [select_one_kmer(pre_kmer, k) for pre_kmer in contigs]
    contigs = trimmed + mutated
    # NOTE(review): n**2 / 2 - n is not the unordered-pair count n(n-1)/2;
    # confirm against what py_needle actually emits before relying on it.
    click.echo(f'Comparisons: {(len(contigs) ** 2) / 2 - len(contigs)}', err=True)

    pairs = py_needle(contigs)
    dist_tbl = pd.DataFrame(pairs, columns=['k1', 'k2', 'levenshtein'])

    ramify = db.ramifier.ramify

    def _ram_l1(row):
        """Sum of absolute differences between the two ramified k-mers."""
        delta = ramify(row['k1']) - ramify(row['k2'])
        return np.abs(delta).sum()

    dist_tbl['ram'] = dist_tbl.apply(_ram_l1, axis=1)
    dist_tbl.to_csv(outfile)
def test_needle(self):
    """Pairs involving GAP must score 2; every other pair must score 1."""
    results = py_needle([KMER_31, MIS, GAP], normalize=False)
    for left, right, dist in results:
        involves_gap = left == GAP or right == GAP
        expected = 2 if involves_gap else 1
        self.assertEqual(dist, expected)
def cli_lev_dist_matrix(gap, kmer_len, outfile, fasta):
    """Build the pairwise distance table for k-mers from ``fasta`` and save it.

    K-mers come from ``parse_seqs(fasta, kmer_len, gap)``. The table returned
    by ``py_needle`` is written to ``outfile`` as CSV with columns ``k1``,
    ``k2`` and ``lev``. The k-mer count and the build time are echoed with
    ``err=True``.
    """
    kmers = parse_seqs(fasta, kmer_len, gap)
    click.echo(f'{len(kmers)} unique kmers.', err=True)

    # The timed region covers both the alignment and the DataFrame build.
    began = time()
    pairs = py_needle(kmers)
    dist_tbl = pd.DataFrame(pairs, columns=['k1', 'k2', 'lev'])
    elapsed = time() - began

    click.echo(f'{elapsed:.5}s to build distance matrix.', err=True)
    dist_tbl.to_csv(outfile)
def calibrate_db(dropout, gap, burst, kmer_len, outfile, rotation, fasta):
    """Compare Levenshtein and ramified distances on k-mers sampled from a FASTA.

    Walks every sequence in ``fasta`` in steps of ``gap`` and randomly keeps
    windows of length ``kmer_len``. The pairwise table from ``py_needle``
    (column ``f_lev``) is extended with:

    * ``rc_lev`` -- distance between ``k1`` and the reverse complement of ``k2``
    * ``lev``    -- the smaller of ``f_lev`` and ``rc_lev``
    * ``ram``    -- sum of absolute differences between the ramified k-mers

    and written to ``outfile`` as CSV. When ``rotation`` is ``None`` a plain
    ``Ramifier(kmer_len)`` is used, otherwise a ``RotatingRamifier`` is loaded
    from the ``rotation`` file.
    """
    seqs = [str(record.seq) for record in SeqIO.parse(fasta, 'fasta')]

    kmers = set()
    for seq in seqs:
        for start in range(0, len(seq) - kmer_len, gap):
            window = seq[start:start + kmer_len]
            # NOTE(review): the burst offset is pinned to 0, so each of the
            # `burst` draws re-samples this same window; `burst` only raises
            # the number of attempts per position. Confirm this is intended.
            for _ in range(burst):
                # A window is kept when the draw falls below `dropout`, i.e.
                # `dropout` acts as the keep probability here.
                if random.random() < dropout:
                    kmers.add(window)
    click.echo(f'{len(kmers)} kmers', err=True)

    forward_pairs = py_needle(list(kmers))
    dist_tbl = pd.DataFrame(forward_pairs, columns=['k1', 'k2', 'f_lev'])

    ramifier = (
        Ramifier(kmer_len)
        if rotation is None
        else RotatingRamifier.from_file(rotation)
    )

    def _revcomp_lev(row):
        """Distance between k1 and the reverse complement of k2."""
        pair = [row['k1'], reverseComplement(row['k2'])]
        return py_needle(pair)[0][2]

    def _best_lev(row):
        """Smaller of the forward and reverse-complement distances."""
        return min(row['f_lev'], row['rc_lev'])

    def _ram_l1(row):
        """Sum of absolute differences between the two ramified k-mers."""
        delta = ramifier.ramify(row['k1']) - ramifier.ramify(row['k2'])
        return np.abs(delta).sum()

    dist_tbl['rc_lev'] = dist_tbl.apply(_revcomp_lev, axis=1)
    dist_tbl['lev'] = dist_tbl.apply(_best_lev, axis=1)
    dist_tbl['ram'] = dist_tbl.apply(_ram_l1, axis=1)
    dist_tbl.to_csv(outfile)
def rc_lev(row):
    """Return the distance between ``row['k1']`` and the reverse complement of ``row['k2']``.

    The pair is passed to ``py_needle`` and the third field of the first
    record it returns is used as the distance.
    """
    forward = row['k1']
    flipped = reverseComplement(row['k2'])
    first_record = py_needle([forward, flipped])[0]
    return first_record[2]
def test_needle_equal(self):
    """Two identical k-mers must be reported at distance 0."""
    results = py_needle([KMER_31, KMER_31], normalize=False)
    for _left, _right, dist in results:
        self.assertEqual(dist, 0)