def test_jaccard_index2(mc, kmers1, kmers2):
    """Compare the Jaccard index reported by ``mc`` with the exact value.

    Both inputs are split into k-mers with ``seq_to_kmers`` and inserted
    under the sample ids '1234' and '1235'. The value returned by
    ``mc.jaccard_index`` must lie within 0.2 of the Jaccard index computed
    from the exact k-mer sets.
    """
    first = list(seq_to_kmers(kmers1))
    second = list(seq_to_kmers(kmers2))

    # Clear whatever ``mc`` currently holds before inserting the two samples.
    mc.delete_all()
    mc.insert(first, '1234')
    mc.insert(second, '1235')

    # Exact reference value: |A & B| / |A | B| over the de-duplicated k-mers.
    first_set, second_set = set(first), set(second)
    shared = float(len(first_set & second_set))
    combined = float(len(first_set | second_set))
    expected = shared / combined

    estimate = mc.jaccard_index('1234', '1235')
    error = abs(estimate - expected)
    assert float(error) <= 0.2
def test_get_bloomfilter(sample, seq):
    """Check the length of the bloom filter read back for an inserted sample.

    A graph is created with ``m=100``, the sample's bloom filter (built from
    the 31-mers of ``seq``) is inserted, and the filter returned by
    ``get_bloom_filter`` must be as long as ``graph.bloomfilter.size``.
    """
    kmer_iter = seq_to_kmers(seq, 31)
    graph = CBG.create(m=100, force=True)
    sample_bloom = graph.bloom(kmer_iter)
    graph.insert(sample_bloom, sample)

    stored = graph.get_bloom_filter(sample)
    expected_length = graph.graph.bloomfilter.size
    assert stored.length() == expected_length

    # Remove everything this test stored in the graph.
    graph.delete_all()
Exemple #3
0
def _timed_search(graph, seq, threshold, score):
    """Run ``graph.search`` and return ``(hits, elapsed_seconds)``."""
    started = time.time()
    hits = graph.search(seq, threshold=threshold, score=score)
    return hits, time.time() - started


def _print_tsv_hits(gene_name, hits, elapsed):
    """Print one tab-separated row per hit, or a single ``NA`` row if none."""
    if not hits:
        print("\t".join([gene_name, "NA", str(0), str(elapsed)]))
        return
    for sample_id, percent in hits.items():
        print("\t".join([gene_name, sample_id, str(percent), str(elapsed)]))


def _print_fasta_hits(gene_name, seq, hits, graph):
    """Print a header and a per-k-mer presence line for every hit.

    Hits are printed in descending order of their search value.
    """
    ranked = sorted(hits.items(), key=operator.itemgetter(1), reverse=True)
    for sample, percent in ranked:
        percent = round(percent * 100, 2)
        # Resolve the colour before printing so a sample missing from the
        # lookup fails before any output is written for it.
        colour = int(graph.sample_to_colour_lookup.get(sample))
        print(" ".join([
            '>', gene_name, sample,
            "kmer-%i coverage %f" % (graph.kmer_size, percent)
        ]))
        for kmer in seq_to_kmers(seq, graph.kmer_size):
            kmer_presence = graph.graph.lookup(
                convert_query_kmer(kmer))[colour]
            sys.stdout.write(str(int(kmer_presence)))
        sys.stdout.write('\n')


def _search(gene_name,
            seq,
            results,
            threshold,
            graph,
            output_format="json",
            pipe=False,
            score=False):
    """Search ``graph`` for ``seq`` and either print or collect the hits.

    Args:
        gene_name: Label of the query; used as the output key / first column.
        seq: Query sequence handed to ``graph.search``.
        results: Dict that receives
            ``{gene_name: {'results': ..., 'time': ...}}`` when ``pipe`` is
            false. It is left untouched when ``pipe`` is true.
        threshold: Forwarded to ``graph.search`` as ``threshold``.
        graph: Object providing ``search``. The "fasta" format additionally
            reads ``sample_to_colour_lookup``, ``kmer_size`` and
            ``graph.lookup`` from it.
        output_format: Only consulted when ``pipe`` is true. "tsv" prints
            tab-separated rows, "fasta" prints the query followed by one
            header and presence line per hit, and any other value prints a
            JSON object.
        pipe: When true, write to stdout instead of storing in ``results``.
        score: Forwarded to ``graph.search`` as ``score``.

    Returns:
        The ``results`` dict that was passed in.
    """
    if not pipe:
        # Create the entry first so it exists even if the search raises.
        results[gene_name] = {}
        hits, elapsed = _timed_search(graph, seq, threshold, score)
        results[gene_name]['results'] = hits
        results[gene_name]['time'] = elapsed
        return results

    if output_format == "tsv":
        hits, elapsed = _timed_search(graph, seq, threshold, score)
        _print_tsv_hits(gene_name, hits, elapsed)
    elif output_format == "fasta":
        # The query record is printed before searching, so it appears even
        # when the search finds nothing. This format reports no timing.
        print(" ".join(['>', gene_name]))
        print(seq)
        hits = graph.search(seq, threshold=threshold, score=score)
        _print_fasta_hits(gene_name, seq, hits, graph)
    else:
        hits, elapsed = _timed_search(graph, seq, threshold, score)
        print(json.dumps({gene_name: {'results': hits, 'time': elapsed}}))
    return results
Exemple #4
0
def kmer_reader(f):
    """Yield every k-mer of every record read from ``f``.

    Each item produced by ``Reader(f)`` is decoded as UTF-8 and split into
    k-mers with ``seq_to_kmers``. The record index is written to stderr
    every 100000 records (including record 0), and the total number of
    k-mers yielded is written to stderr once the input is exhausted.
    """
    total = 0
    for index, raw in enumerate(Reader(f)):
        if not index % 100000:
            # Progress marker, flushed right away.
            sys.stderr.write("%d\n" % index)
            sys.stderr.flush()
        for kmer in seq_to_kmers(raw.decode('utf-8')):
            total += 1
            yield kmer
    # Final count is written without a trailing newline.
    sys.stderr.write(str(total))
def test_jaccard_index3(kmers1, kmers2):
    """Compare HyperLogLog set estimates with the exact k-mer set values.

    Two samples ('1234' and '1235') are inserted into a
    ``HyperLogLogJaccardIndex`` connected to ``REDIS_HOST``/``REDIS_PORT``.
    The reported Jaccard index must be within 0.2 of the exact value, and
    the reported symmetric difference and difference within 5 of the exact
    set sizes.
    """
    first = list(seq_to_kmers(kmers1))
    second = list(seq_to_kmers(kmers2))

    index = HyperLogLogJaccardIndex(host=REDIS_HOST, port=REDIS_PORT)
    index.delete_all()
    index.insert(first, '1234')
    index.insert(second, '1235')

    # Exact reference values from the de-duplicated k-mers.
    first_set, second_set = set(first), set(second)
    exact_sim = (float(len(first_set & second_set)) /
                 float(len(first_set | second_set)))
    exact_sdiff = float(len(first_set ^ second_set))
    exact_diff = float(len(first_set - second_set))

    est_sim = index.jaccard_index('1234', '1235')
    est_sdiff = index.symmetric_difference('1234', '1235')
    est_diff = index.difference('1234', '1235')

    # (estimate, exact value, allowed absolute error)
    checks = (
        (est_sim, exact_sim, 0.2),
        (est_sdiff, exact_sdiff, 5),
        (est_diff, exact_diff, 5),
    )
    for estimate, exact, tolerance in checks:
        assert float(abs(estimate - exact)) <= tolerance
Exemple #6
0
def test_insert_lookup_kmers(Graph, sample, seq, k, m, h):
    """Insert one sample and check that each of its k-mers can be looked up.

    For every k-mer of ``seq`` the first bit of the raw lookup result must
    be set and ``sample`` must be listed in the lookup result for that
    k-mer. A lookup of all k-mers at once must contain ``[sample]`` among
    its values.
    """
    logger.info("Testing graph with params (k=%i,m=%i,h=%i)" % (k, m, h))
    kmer_list = list(seq_to_kmers(seq, k))
    graph = Graph.create(m=m, k=k, h=h, force=True)
    graph.insert(graph.bloom(kmer_list), sample)

    for kmer in kmer_list:
        # Decode the raw bytes into bits to inspect the first position.
        bits = bitarray()
        bits.frombytes(graph.lookup_raw(kmer))
        assert bits[0]
        assert sample in graph.lookup(kmer)[kmer]

    bulk_values = graph.lookup(kmer_list).values()
    assert [sample] in bulk_values
    graph.delete_all()
Exemple #7
0
def test_insert_and_unique_sample_names(Graph, sample, seq, k, m, h):
    """Check that a sample name can only be inserted once.

    After the first insert, inserting the same bloom filter under the same
    sample name must raise ``ValueError``. Searching for ``seq`` must then
    return ``sample`` with ``percent_kmers_found`` equal to 100.
    """
    logger.info("Testing graph with params (k=%i,m=%i,h=%i)" % (k, m, h))
    kmer_iter = seq_to_kmers(seq, k)
    # The bloom filter size is pinned to 100 from here on; the ``m``
    # argument is only used in the log line above.
    m = 100
    graph = Graph.create(m=m, k=k, h=h, force=True)
    assert graph.kmer_size == k

    sample_bloom = graph.bloom(kmer_iter)
    assert len(sample_bloom) == m

    graph.insert(sample_bloom, sample)
    # A second insert under the same sample name is expected to be rejected.
    with pytest.raises(ValueError):
        graph.insert(sample_bloom, sample)

    assert sample in graph.search(seq)
    hit = graph.search(seq).get(sample)
    assert hit.get('percent_kmers_found') == 100
    graph.delete_all()
Exemple #8
0
def test_update_contains(colour, elements, bloom_filter_size, num_hashes):
    """Check that every k-mer added under a colour is reported as contained.

    The 31-mers of ``elements`` are added to the bloom filter of a
    ``ProbabilisticBerkeleyDBStorage`` (file name "db") under ``colour``;
    ``contains`` must then be true for each of them.
    """
    store = ProbabilisticBerkeleyDBStorage(
        filename="db",
        bloom_filter_size=bloom_filter_size,
        num_hashes=num_hashes,
    )
    kmer_list = list(seq_to_kmers(elements, 31))

    # The constructor arguments are assigned again on the instance.
    store.bloom_filter_size = bloom_filter_size
    store.num_hashes = num_hashes

    bloom = store.bloomfilter
    bloom.update(kmer_list, colour)
    for kmer in kmer_list:
        assert store.bloomfilter.contains(kmer, colour)

    store.delete_all()
Exemple #9
0
def extract_kmers_from_ctx(ctx, k):
    """Yield the k-mers of every record read from ``ctx``.

    ``ctx`` is wrapped in a ``GraphReader``; for each record its
    ``kmer.canonical_value`` is split into k-mers of size ``k`` with
    ``seq_to_kmers``.
    """
    for record in GraphReader(ctx):
        yield from seq_to_kmers(record.kmer.canonical_value, k)
def test_jaccard_index1(mc, kmers):
    """Two samples holding the same k-mers must have a Jaccard index of 1."""
    kmer_list = list(seq_to_kmers(kmers))
    mc.delete_all()
    # Insert the same k-mers under both sample ids.
    for sample_id in ('1234', '1235'):
        mc.insert(kmer_list, sample_id)
    index = mc.jaccard_index('1234', '1235')
    assert index == 1