def test_make_canon_kmers(self):
    """Check that canonical 4-mers of a short sequence match the expected set.

    The previous form only verified that every produced k-mer was a member
    of the expected set, so it passed vacuously when ``make_kmers`` returned
    nothing or silently dropped k-mers.  Comparing the sets for equality
    checks both directions at once.
    """
    seq = 'ATCGGTA'
    expected_kmers = {'ATCG', 'CCGA', 'ACCG', 'GGTA'}
    actual_kmers = make_kmers(seq, 4, canon=True)
    # set() accepts whatever iterable make_kmers returns and makes the
    # comparison independent of order and duplicates.
    self.assertEqual(set(actual_kmers), expected_kmers)
def test_kmer_stats(self):
    """Smoke-test ``stats`` and ``search`` on a populated radial cover.

    Builds canonical 31-mers from the ``[:1000]`` prefix of
    ``ECOLI.longest_contig()``, adds each one to a ``GreedyRadialCover``
    constructed with ``hamming_distance`` and ``2``, then calls ``stats()``
    and ``search(kmer, 1)`` for the first ten k-mers.  Nothing is asserted;
    the test fails only if one of these calls raises.
    """
    contig_prefix = ECOLI.longest_contig()[:1000]
    kmers = make_kmers(contig_prefix, 31, canon=True)

    cover = GreedyRadialCover(hamming_distance, 2)
    for kmer in kmers:
        cover.add(kmer)

    cover.stats()
    for probe in kmers[:10]:
        cover.search(probe, 1)
def kmer_entropy(seqs, k=31):
    """Return the base-2 entropy of the canonical k-mer counts over *seqs*.

    Every k-mer produced by ``make_kmers(seq, k, canon=True)`` for each
    sequence is tallied; the result is ``-sum(p * log2(p))`` where ``p`` is
    a k-mer's count divided by the total count.

    Args:
        seqs: Iterable of sequences passed one at a time to ``make_kmers``.
        k: Second argument forwarded to ``make_kmers`` (default 31).

    Returns:
        The negated sum described above; ``0`` when no k-mers were counted.
    """
    counts = {}
    for seq in seqs:
        for kmer in make_kmers(seq, k, canon=True):
            if kmer in counts:
                counts[kmer] += 1
            else:
                counts[kmer] = 1

    total = sum(counts.values())
    weighted_log_sum = 0
    for occurrences in counts.values():
        # NOTE(review): relies on true division (Python 3 semantics).
        freq = occurrences / total
        weighted_log_sum += freq * log2(freq)
    return -weighted_log_sum
def kmer_search(cls, seq, eps=0.5):
    """Find contigs that share enough k-mers with *seq*.

    For every k-mer from ``make_kmers(seq, MAX_K)`` the rows returned by
    ``cls.query.filter_by(kmer=kmer).all()`` are tallied.  Each tallied item
    whose count is not below ``eps * len(kmers)`` is passed to
    ``Contig.from_uuid`` and the results are collected.

    Args:
        seq: Sequence to search with.
        eps: Multiplier applied to the number of query k-mers to obtain the
            minimum hit count (default 0.5).

    Returns:
        List of the values returned by ``Contig.from_uuid``.

    NOTE(review): the items yielded by ``.all()`` are used directly as dict
    keys and as the ``from_uuid`` argument -- confirm they are uuids rather
    than row objects.
    """
    query_kmers = make_kmers(seq, MAX_K)

    hit_counts = {}
    for kmer in query_kmers:
        matches = cls.query.filter_by(kmer=kmer).all()
        for match in matches:
            hit_counts[match] = 1 + hit_counts.get(match, 0)

    return [
        Contig.from_uuid(match)
        for match, n_hits in hit_counts.items()
        if not n_hits < (eps * len(query_kmers))
    ]
def sub_k_dist(row):
    """Return the share of the query's k-mers that also occur in the target.

    ``row['query']`` and ``row['target']`` are each passed to ``make_kmers``
    with ``sub_k``, which is taken from the enclosing scope.  The result is
    the size of the ``&`` of the two collections divided by the size of the
    query collection.

    Raises:
        ZeroDivisionError: if ``make_kmers`` returns an empty collection for
            the query.

    NOTE(review): requires ``make_kmers`` to return something supporting
    ``&`` and ``len`` (presumably a set) -- confirm.
    """
    query_kmers = make_kmers(row['query'], sub_k)
    target_kmers = make_kmers(row['target'], sub_k)
    shared = query_kmers & target_kmers
    return len(shared) / len(query_kmers)
def num_unique_subwords(seq, n):
    """Count the distinct items ``make_kmers(seq, n, canon=False)`` produces.

    Args:
        seq: Sequence forwarded to ``make_kmers``.
        n: Second argument forwarded to ``make_kmers``.

    Returns:
        Number of unique k-mers as an ``int``.
    """
    return len(set(make_kmers(seq, n, canon=False)))
def add_contig(cls, contig):
    """Store one ``cls`` row per distinct k-mer of *contig* and commit.

    K-mers come from ``make_kmers(contig.seq, MAX_K)`` and are de-duplicated
    through a set before insertion.  Each row is built as
    ``cls(contig.uuid, kmer)`` and added to ``db.session``; a single commit
    follows after all rows are added.
    """
    unique_kmers = set(make_kmers(contig.seq, MAX_K))
    for kmer in unique_kmers:
        row = cls(contig.uuid, kmer)
        db.session.add(row)
    db.session.commit()
def test_add_to_cover(self):
    """Smoke-test adding k-mers to a ``GreedyRadialCover``.

    Canonical 31-mers from the ``[:1000]`` prefix of
    ``ECOLI.longest_contig()`` are added one by one to a cover constructed
    with ``hamming_distance`` and ``2``.  Nothing is asserted; the test
    fails only if a call raises.
    """
    contig_prefix = ECOLI.longest_contig()[:1000]
    kmers = make_kmers(contig_prefix, 31, canon=True)

    cover = GreedyRadialCover(hamming_distance, 2)
    for kmer in kmers:
        cover.add(kmer)
def test_make_kmers(self):
    """Sanity check: ``make_kmers`` runs on real data without raising.

    Calls ``make_kmers`` with ``31`` and ``canon=True`` on the ``[:1000]``
    prefix of ``ECOLI.longest_contig()``; the result is discarded.
    """
    contig_prefix = ECOLI.longest_contig()[:1000]
    make_kmers(contig_prefix, 31, canon=True)