def test_jaccard_index2(mc, kmers1, kmers2):
    """Check the estimated Jaccard index of two samples against the exact value.

    Both inputs are expanded with ``seq_to_kmers``, stored in ``mc`` under the
    sample names '1234' and '1235', and the index reported by ``mc`` must lie
    within 0.2 of the value computed from plain Python sets.
    """
    first = list(seq_to_kmers(kmers1))
    second = list(seq_to_kmers(kmers2))

    # Start from an empty store so earlier inserts cannot skew the estimate.
    mc.delete_all()
    mc.insert(first, '1234')
    mc.insert(second, '1235')

    # Exact reference value: |A & B| / |A | B| over the distinct k-mers.
    first_set, second_set = set(first), set(second)
    shared = len(first_set & second_set)
    combined = len(first_set | second_set)
    expected = float(shared) / float(combined)

    estimate = mc.jaccard_index('1234', '1235')
    assert float(abs(estimate - expected)) <= 0.2
def test_get_bloomfilter(sample, seq):
    """Check that a stored sample's bloom filter has the graph's filter size.

    The 31-mers of ``seq`` are bloomed and inserted under ``sample``; the
    filter read back with ``get_bloom_filter`` must report a ``length()``
    equal to ``cbg.graph.bloomfilter.size``.
    """
    kmer_iter = seq_to_kmers(seq, 31)
    cbg = CBG.create(m=100, force=True)

    sample_bloom = cbg.bloom(kmer_iter)
    cbg.insert(sample_bloom, sample)

    retrieved = cbg.get_bloom_filter(sample)
    expected_size = cbg.graph.bloomfilter.size
    assert retrieved.length() == expected_size

    # Leave the store empty for whichever test runs next.
    cbg.delete_all()
def _timed_search(graph, seq, threshold, score):
    """Run ``graph.search`` once and return ``(result, elapsed_seconds)``."""
    start = time.time()
    result = graph.search(seq, threshold=threshold, score=score)
    return result, time.time() - start


def _search(gene_name, seq, results, threshold, graph, output_format="json", pipe=False, score=False):
    """Search ``graph`` for ``seq`` and either store or print the outcome.

    Args:
        gene_name: Label for the query; used as the key in ``results`` and in
            every printed record.
        seq: Query sequence handed to ``graph.search``.
        results: Dict updated in place (``results[gene_name]``) when ``pipe``
            is false. It is not modified when ``pipe`` is true.
        threshold: Forwarded to ``graph.search`` as ``threshold``.
        graph: Object providing ``search(seq, threshold=..., score=...)``.
            The fasta output additionally reads ``sample_to_colour_lookup``,
            ``kmer_size`` and ``graph.lookup`` from it.
        output_format: Only consulted when ``pipe`` is true. ``"tsv"`` and
            ``"fasta"`` select those layouts; any other value prints JSON.
        pipe: When true, write the result to stdout instead of ``results``.
        score: Forwarded to ``graph.search`` as ``score``.

    Returns:
        The ``results`` dict that was passed in.
    """
    if not pipe:
        # Register the gene before searching so the key exists even if the
        # search itself raises.
        entry = results[gene_name] = {}
        entry['results'], entry['time'] = _timed_search(
            graph, seq, threshold, score)
        return results

    if output_format == "tsv":
        found, elapsed = _timed_search(graph, seq, threshold, score)
        if found:
            for sample_id, percent in found.items():
                print("\t".join(
                    [gene_name, sample_id, str(percent), str(elapsed)]))
        else:
            # No hit: still emit one row so every query appears in the output.
            print("\t".join([gene_name, "NA", str(0), str(elapsed)]))
    elif output_format == "fasta":
        print(" ".join(['>', gene_name]))
        print(seq)
        found = graph.search(seq, threshold=threshold, score=score)
        # Highest-valued samples first.
        ranked = sorted(found.items(), key=operator.itemgetter(1), reverse=True)
        for sample, percent in ranked:
            percent = round(percent * 100, 2)
            colour = int(graph.sample_to_colour_lookup.get(sample))
            print(" ".join([
                '>', gene_name, sample,
                "kmer-%i coverage %f" % (graph.kmer_size, percent)
            ]))
            # One 0/1 character per k-mer of the query: the looked-up row
            # indexed at this sample's colour.
            for kmer in seq_to_kmers(seq, graph.kmer_size):
                kmer_presence = graph.graph.lookup(
                    convert_query_kmer(kmer))[colour]
                sys.stdout.write(str(int(kmer_presence)))
            sys.stdout.write('\n')
    else:
        found, elapsed = _timed_search(graph, seq, threshold, score)
        print(json.dumps({gene_name: {'results': found, 'time': elapsed}}))
    return results
def kmer_reader(f):
    """Yield every k-mer of every record that ``Reader(f)`` produces.

    Each record is decoded as UTF-8 and expanded with ``seq_to_kmers``.
    Progress is reported on stderr: the record index is written on every
    100000th record (including index 0), and the total number of k-mers
    yielded is written once the input is exhausted.
    """
    total = 0
    for index, raw in enumerate(Reader(f)):
        if not index % 100000:
            sys.stderr.write("%d\n" % index)
            sys.stderr.flush()
        for kmer in seq_to_kmers(raw.decode('utf-8')):
            total += 1
            yield kmer
    # Only reached when the consumer drains the generator completely.
    sys.stderr.write(str(total))
def test_jaccard_index3(kmers1, kmers2):
    """Check HyperLogLog set estimates against exact set arithmetic.

    The Jaccard index must be within 0.2 of the exact value, and the
    symmetric-difference and difference estimates within 5 of the exact
    counts, for two samples stored as '1234' and '1235'.
    """
    first = list(seq_to_kmers(kmers1))
    second = list(seq_to_kmers(kmers2))

    mc = HyperLogLogJaccardIndex(host=REDIS_HOST, port=REDIS_PORT)
    mc.delete_all()
    mc.insert(first, '1234')
    mc.insert(second, '1235')

    # Exact reference values computed from the distinct k-mers.
    first_set, second_set = set(first), set(second)
    expected_index = (float(len(first_set & second_set))
                      / float(len(first_set | second_set)))
    expected_sym_diff = float(len(first_set ^ second_set))
    expected_diff = float(len(first_set - second_set))

    # Estimates reported by the store, queried in the same order as before.
    index_estimate = mc.jaccard_index('1234', '1235')
    sym_diff_estimate = mc.symmetric_difference('1234', '1235')
    diff_estimate = mc.difference('1234', '1235')

    assert float(abs(index_estimate - expected_index)) <= 0.2
    assert float(abs(sym_diff_estimate - expected_sym_diff)) <= 5
    assert float(abs(diff_estimate - expected_diff)) <= 5
def test_insert_lookup_kmers(Graph, sample, seq, k, m, h):
    """Check that every inserted k-mer can be looked up for its sample.

    After inserting the bloomed k-mers of ``seq`` under ``sample``, each
    k-mer must have the first bit of its raw row set and must list ``sample``
    in its single-k-mer lookup; the batch lookup of all k-mers must contain
    ``[sample]`` among its values.
    """
    message = "Testing graph with params (k=%i,m=%i,h=%i)" % (k, m, h)
    logger.info(message)

    all_kmers = list(seq_to_kmers(seq, k))
    cbg = Graph.create(m=m, k=k, h=h, force=True)
    cbg.insert(cbg.bloom(all_kmers), sample)

    for query in all_kmers:
        # Decode the raw row into bits; index 0 is checked because this
        # sample is the only one inserted.
        row_bits = bitarray()
        row_bits.frombytes(cbg.lookup_raw(query))
        assert row_bits[0] == 1
        assert sample in cbg.lookup(query)[query]

    batch_hits = cbg.lookup(all_kmers)
    assert [sample] in batch_hits.values()
    cbg.delete_all()
def test_insert_and_unique_sample_names(Graph, sample, seq, k, m, h):
    """Check that a sample name can be inserted once and only once.

    A second insert under the same name must raise ``ValueError``, and a
    search for ``seq`` must report the sample with
    ``percent_kmers_found == 100``.
    """
    message = "Testing graph with params (k=%i,m=%i,h=%i)" % (k, m, h)
    logger.info(message)

    kmer_iter = seq_to_kmers(seq, k)
    # The incoming ``m`` is only logged; the graph is always built with a
    # fixed bloom filter size of 100.
    bloom_size = 100
    cbg = Graph.create(m=bloom_size, k=k, h=h, force=True)
    assert cbg.kmer_size == k

    sample_bloom = cbg.bloom(kmer_iter)
    assert len(sample_bloom) == bloom_size

    cbg.insert(sample_bloom, sample)
    # Re-using the same sample name must be rejected.
    with pytest.raises(ValueError):
        cbg.insert(sample_bloom, sample)

    assert sample in cbg.search(seq)
    sample_hit = cbg.search(seq).get(sample)
    assert sample_hit.get('percent_kmers_found') == 100
    cbg.delete_all()
def test_update_contains(colour, elements, bloom_filter_size, num_hashes):
    """Check that every k-mer added to the bloom filter is reported present.

    The 31-mers of ``elements`` are added for ``colour`` via
    ``bloomfilter.update`` and each one must then satisfy
    ``bloomfilter.contains`` for that colour.
    """
    settings = {
        "filename": "db",
        "bloom_filter_size": bloom_filter_size,
        "num_hashes": num_hashes,
    }
    storage = ProbabilisticBerkeleyDBStorage(**settings)
    inserted = list(seq_to_kmers(elements, 31))

    # Set explicitly again after construction, as the original test did.
    storage.bloom_filter_size = bloom_filter_size
    storage.num_hashes = num_hashes

    storage.bloomfilter.update(inserted, colour)
    for kmer in inserted:
        assert storage.bloomfilter.contains(kmer, colour)
    storage.delete_all()
def extract_kmers_from_ctx(ctx, k):
    """Yield the k-mers of length ``k`` for every record in ``ctx``.

    ``ctx`` is passed to ``GraphReader``; for each record it yields, the
    record's ``kmer.canonical_value`` is expanded with ``seq_to_kmers``.
    """
    for record in GraphReader(ctx):
        yield from seq_to_kmers(record.kmer.canonical_value, k)
def test_jaccard_index1(mc, kmers):
    """Check that two samples with identical k-mers have a Jaccard index of 1."""
    shared_kmers = list(seq_to_kmers(kmers))
    mc.delete_all()
    # Store the same k-mer list under both sample names.
    for sample_name in ('1234', '1235'):
        mc.insert(shared_kmers, sample_name)
    similarity = mc.jaccard_index('1234', '1235')
    assert similarity == 1