Ejemplo n.º 1
0
def test_inexact_search():
    """Check threshold (inexact) search against every configured backend.

    Two samples that differ only in their last base are indexed; a query
    equal to sample "a" must report "a" with all 6 k-mers found and "b"
    with 5 of 6.
    """
    # Wipe every backend before anything is built.
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()

    # The bloom filters are created once, from the first config, and then
    # reused for every backend below.
    base = CONFIGS[0]
    bloom1 = BIGSI.bloom(base, seq_to_kmers("ATACACAAT", base["k"]))
    bloom2 = BIGSI.bloom(base, seq_to_kmers("ATACACAAC", base["k"]))

    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
        # Constructing an index on freshly wiped storage is expected to raise.
        with pytest.raises(BaseException):
            BIGSI(cfg)
        index = BIGSI.build(cfg, [bloom1, bloom2], ["a", "b"])

        assert index.search("ACAGTTAAC", 0.5) == []
        assert index.lookup("AAT") == {"AAT": bitarray("10")}

        hits = index.search("ATACACAAT", 0.5)
        assert hits[0] == {
            "percent_kmers_found": 100.0,
            "num_kmers": 6,
            "num_kmers_found": 6,
            "sample_name": "a",
        }
        # The serialised form pins the key order of a result entry as well.
        serialised = json.dumps(hits[0])
        assert (
            serialised
            == '{"percent_kmers_found": 100.0, "num_kmers": 6, "num_kmers_found": 6, "sample_name": "a"}'
        )
        assert hits[1] == {
            "percent_kmers_found": 83.33,
            "num_kmers": 6,
            "num_kmers_found": 5,
            "sample_name": "b",
        }
        index.delete()
Ejemplo n.º 2
0
def test_jaccard_index2(mc, kmers1, kmers2):
    """Estimated Jaccard index of two samples stays within 0.2 of the exact value.

    Args:
        mc: store under test; must offer ``delete_all``, ``insert`` and
            ``jaccard_index``.
        kmers1: sequence for sample '1234'.
        kmers2: sequence for sample '1235'.
    """
    first = list(seq_to_kmers(kmers1))
    second = list(seq_to_kmers(kmers2))

    mc.delete_all()
    mc.insert(first, '1234')
    mc.insert(second, '1235')

    # Exact Jaccard index computed from plain sets: |A & B| / |A | B|.
    first_set, second_set = set(first), set(second)
    shared = float(len(first_set & second_set))
    combined = float(len(first_set | second_set))
    expected = shared / combined

    estimate = mc.jaccard_index('1234', '1235')
    assert float(abs(estimate - expected)) <= 0.2
Ejemplo n.º 3
0
def test_get_bloomfilter(sample, seq):
    """A stored sample's bloom filter has the graph's bloom filter size.

    Args:
        sample: name under which the bloom filter is inserted.
        seq: sequence that is split into 31-mers.
    """
    kmer_source = seq_to_kmers(seq, 31)
    index = BIGSI.create(m=100, force=True)
    sample_bloom = index.bloom(kmer_source)
    index.insert(sample_bloom, sample)

    stored = index.get_bloom_filter(sample)
    assert stored.length() == index.graph.bloomfilter.size

    index.delete_all()
Ejemplo n.º 4
0
def test_get_bloomfilter(seq):
    """A built sample's bloom filter has the graph's bloom filter size.

    Args:
        seq: sequence that is split into 31-mers.
    """
    sample = "1234"
    kmer_source = seq_to_kmers(seq, 31)
    index = BIGSI.create(m=10, force=True)
    blooms = [index.bloom(kmer_source)]
    index.build(blooms, [sample])

    stored = index.get_bloom_filter(sample)
    assert stored.length() == index.graph.bloomfilter.size

    index.delete_all()
Ejemplo n.º 5
0
def kmer_reader(f):
    """Yield the k-mers of every record produced by ``Reader(f)``.

    Each record is decoded as UTF-8 and passed to ``seq_to_kmers`` with
    that function's default k-mer size.

    Args:
        f: input handed unchanged to ``Reader``.

    Yields:
        The k-mers of each record, in reading order.
    """
    for line in Reader(f):
        yield from seq_to_kmers(line.decode('utf-8'))
Ejemplo n.º 6
0
def test_merge():
    """Merging two single-sample indexes matches one index built from both.

    Index "a" and index "b" live in the first and second configured
    backends; the reference index holding both samples lives in the third.
    """
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()

    base = CONFIGS[0]
    query = "ATACACAAT"
    bloom_a = BIGSI.bloom(base, seq_to_kmers(query, base["k"]))
    bloom_b = BIGSI.bloom(base, seq_to_kmers("ATACACAAC", base["k"]))

    left = BIGSI.build(base, [bloom_a], ["a"])
    right = BIGSI.build(CONFIGS[1], [bloom_b], ["b"])
    reference = BIGSI.build(CONFIGS[2], [bloom_a, bloom_b], ["a", "b"])

    # After merging, ``left`` should answer queries like ``reference`` does.
    left.merge(right)

    assert left.search(query, 0.5) == reference.search(query, 0.5)

    for index in (left, right, reference):
        index.delete()
Ejemplo n.º 7
0
def test_jaccard_index3(kmers1, kmers2):
    """HyperLogLog estimates stay close to the exact set statistics.

    The Jaccard index must be within 0.2 of the exact value; the symmetric
    difference and the difference must each be within 5 of the exact sizes.

    Args:
        kmers1: sequence for sample '1234'.
        kmers2: sequence for sample '1235'.
    """
    first = list(seq_to_kmers(kmers1))
    second = list(seq_to_kmers(kmers2))

    store = HyperLogLogJaccardIndex(host=REDIS_HOST, port=REDIS_PORT)
    store.delete_all()
    store.insert(first, '1234')
    store.insert(second, '1235')

    # Exact reference values computed from plain Python sets.
    first_set, second_set = set(first), set(second)
    exact_jaccard = (float(len(first_set & second_set))
                     / float(len(first_set | second_set)))
    exact_symmetric = float(len(first_set ^ second_set))
    exact_only_first = float(len(first_set - second_set))

    jaccard = store.jaccard_index('1234', '1235')
    symmetric = store.symmetric_difference('1234', '1235')
    only_first = store.difference('1234', '1235')

    assert float(abs(jaccard - exact_jaccard)) <= 0.2
    assert float(abs(symmetric - exact_symmetric)) <= 5
    assert float(abs(only_first - exact_only_first)) <= 5
Ejemplo n.º 8
0
def kmer_reader(f):
    """Yield the k-mers of every record from ``Reader(f)``, reporting progress.

    The record index is written to stderr every 100000 records, starting
    with record 0. Once the input is exhausted, the total number of k-mers
    yielded is written to stderr with no trailing newline.

    Args:
        f: input handed unchanged to ``Reader``.

    Yields:
        The k-mers of each UTF-8 decoded record, in reading order.
    """
    total = 0
    for index, raw in enumerate(Reader(f)):
        if not index % 100000:
            sys.stderr.write(str(index) + '\n')
            sys.stderr.flush()
        for kmer in seq_to_kmers(raw.decode('utf-8')):
            total += 1
            yield kmer
    sys.stderr.write(str(total))
Ejemplo n.º 9
0
def test_update_contains(colour, elements, bloom_filter_size, num_hashes):
    """Every k-mer added to the bloom filter for a colour is then reported.

    Args:
        colour: colour the k-mers are stored under.
        elements: sequence that is split into 31-mers.
        bloom_filter_size: size passed to the storage.
        num_hashes: hash count passed to the storage.
    """
    settings = dict(bloom_filter_size=bloom_filter_size,
                    num_hashes=num_hashes)
    storage = ProbabilisticBerkeleyDBStorage(filename="db", **settings)

    kmers = list(seq_to_kmers(elements, 31))
    # Re-apply the sizes on the instance, in the same order as the
    # constructor arguments.
    for attribute, value in settings.items():
        setattr(storage, attribute, value)

    storage.bloomfilter.update(kmers, colour)
    for kmer in kmers:
        assert storage.bloomfilter.contains(kmer, colour)
    storage.delete_all()
Ejemplo n.º 10
0
def test_exact_search():
    """Exact search returns the matching sample in every configured backend.

    Two unrelated samples are indexed; each one's own sequence must report
    that sample with all 6 k-mers found, and an unrelated query must
    return nothing.
    """
    seq_a, seq_b = "ATACACAAT", "ACAGAGAAC"

    # Blooms are created once from the first config and reused below.
    base = CONFIGS[0]
    bloom_a = BIGSI.bloom(base, seq_to_kmers(seq_a, base["k"]))
    bloom_b = BIGSI.bloom(base, seq_to_kmers(seq_b, base["k"]))

    def full_hit(sample_name):
        # Result entry expected when every k-mer of the query is found.
        return {
            "percent_kmers_found": 100,
            "num_kmers": 6,
            "num_kmers_found": 6,
            "sample_name": sample_name,
        }

    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
        index = BIGSI.build(cfg, [bloom_a, bloom_b], ["a", "b"])
        assert index.search(seq_a)[0] == full_hit("a")
        assert index.search(seq_b)[0] == full_hit("b")
        assert index.search("ACAGTTAAC") == []
        index.delete()
Ejemplo n.º 11
0
def test_insert_lookup_kmers(Graph, sample, seq, k, m, h):
    """Every k-mer of an inserted sample can be looked up afterwards.

    Args:
        Graph: index class under test; must provide ``create``.
        sample: name under which the bloom filter is inserted.
        seq: sequence the k-mers are taken from.
        k: k-mer size, used for ``seq_to_kmers`` and ``Graph.create``.
        m: forwarded to ``Graph.create`` as ``m``.
        h: forwarded to ``Graph.create`` as ``h``.
    """
    # Lazy %-args: the logging module formats only if the record is emitted.
    logger.info("Testing graph with params (k=%i,m=%i,h=%i)", k, m, h)
    kmers = list(seq_to_kmers(seq, k))
    bigsi = Graph.create(m=m, k=k, h=h, force=True)
    bloom = bigsi.bloom(kmers)
    bigsi.insert(bloom, sample)
    for kmer in kmers:
        # The first bit of the raw row must be set.
        # NOTE(review): presumably bit 0 is the colour of the only inserted
        # sample -- confirm.
        ba = bitarray()
        ba.frombytes(bigsi.lookup_raw(kmer))
        assert ba[0]
        assert sample in bigsi.lookup(kmer)[kmer]
    # Looking up all k-mers at once must also map some k-mer to [sample].
    assert [sample] in bigsi.lookup(kmers).values()
    bigsi.delete_all()
Ejemplo n.º 12
0
def test_insert_and_unique_sample_names(Graph, sample, seq, k, m, h):
    """Inserting the same sample name twice must raise ``ValueError``.

    Also checks the index's k-mer size, that the bloom filter has ``m``
    entries, and that searching for the inserted sequence reports the
    sample with 100 percent of its k-mers found.

    Args:
        Graph: index class under test; must provide ``create``.
        sample: name under which the bloom filter is inserted.
        seq: sequence the k-mers are taken from.
        k: k-mer size, used for ``seq_to_kmers`` and ``Graph.create``.
        m: ignored; the test always uses a bloom filter size of 100.
        h: forwarded to ``Graph.create`` as ``h``.
    """
    # The supplied ``m`` is replaced by a fixed size. Do this before
    # logging so the logged parameters are the ones actually used.
    m = 100
    logger.info("Testing graph with params (k=%i,m=%i,h=%i)", k, m, h)
    kmers = seq_to_kmers(seq, k)
    bigsi = Graph.create(m=m, k=k, h=h, force=True)
    assert bigsi.kmer_size == k
    bloom = bigsi.bloom(kmers)
    assert len(bloom) == m
    bigsi.insert(bloom, sample)
    # A second insert under the same sample name has to be rejected.
    with pytest.raises(ValueError):
        bigsi.insert(bloom, sample)
    assert sample in bigsi.search(seq)
    assert bigsi.search(seq).get(sample).get('percent_kmers_found') == 100
    bigsi.delete_all()
Ejemplo n.º 13
0
def test_insert_lookup_kmers():
    """Every k-mer of a built sample can be looked up afterwards.

    Uses a fixed 31-base sequence with k=31, m=10, h=2 and a single
    sample named '0'.
    """
    Graph, sample, seq = BIGSI, '0', 'AAAAAAAAAAAATCAAAAAAAAAAAAAAAAA'
    m, h, k = 10, 2, 31

    # Lazy %-args: the logging module formats only if the record is emitted.
    logger.debug("Testing graph with params (k=%i,m=%i,h=%i)", k, m, h)
    kmers = list(seq_to_kmers(seq, k))
    bigsi = Graph.create(m=m, k=k, h=h, force=True)
    bloom = bigsi.bloom(kmers)
    bigsi.build([bloom], [sample])
    for kmer in kmers:
        # The first bit of the raw row must be set.
        # NOTE(review): presumably bit 0 is the colour of the only built
        # sample -- confirm.
        ba = bitarray()
        ba.frombytes(bigsi.lookup_raw(kmer))
        assert ba[0]
        assert sample in bigsi.lookup(kmer)[kmer]
    # Looking up all k-mers at once must also map some k-mer to [sample].
    assert [sample] in bigsi.lookup(kmers).values()
    bigsi.delete_all()
Ejemplo n.º 14
0
def test_insert_and_unique_sample_names():
    """``insert`` raises ``ValueError`` both before and after ``build``.

    Also checks the index's k-mer size, the bloom filter length, and that
    searching for the built sequence reports the sample with 100 percent
    of its k-mers found.
    """
    Graph = BIGSI
    sample = '0'
    seq = 'AATTTTTATTTTTTTTTTTTTAATTAATATT'
    k = 11
    h = 1
    m = 10
    logger.debug("Testing graph with params (k=%i,m=%i,h=%i)" % (k, m, h))

    kmer_source = seq_to_kmers(seq, k)
    index = Graph.create(m=m, k=k, h=h, force=True)
    assert index.kmer_size == k
    sample_bloom = index.bloom(kmer_source)
    assert len(sample_bloom) == m

    # insert() is rejected on the freshly created index ...
    with pytest.raises(ValueError):
        index.insert(sample_bloom, sample)
    index.build([sample_bloom], [sample])
    # ... and again after build(), presumably because the name is taken.
    with pytest.raises(ValueError):
        index.insert(sample_bloom, sample)

    assert sample in index.search(seq)
    hit = index.search(seq).get(sample)
    assert hit.get('percent_kmers_found') == 100
    index.delete_all()
Ejemplo n.º 15
0
def test_cant_write_to_read_only_index():
    """An index opened read-only rejects inserts but can still be searched.

    The graph file is made read-only on disk and the index is reopened
    with ``mode="r"``. An insert must raise ``bsddb3.db.DBAccessError``,
    while searching for the built sequence still reports the sample with
    100 percent of its k-mers found.
    """
    Graph, sample = BIGSI, "sfewe"
    seq, k, h = 'AATTTTTATTTTTTTTTTTTTAATTAATATT', 11, 1
    m = 10
    # Lazy %-args: the logging module formats only if the record is emitted.
    logger.debug("Testing graph with params (k=%i,m=%i,h=%i)", k, m, h)
    kmers = seq_to_kmers(seq, k)
    bigsi = Graph.create(m=m, k=k, h=h, force=True)
    assert bigsi.kmer_size == k
    bloom = bigsi.bloom(kmers)
    bigsi.build([bloom], [sample])
    os.chmod(bigsi.graph_filename, S_IREAD | S_IRGRP | S_IROTH)
    try:
        # Writing to an index opened read-only must fail.
        bigsi = Graph(mode="r")
        with pytest.raises(bsddb3.db.DBAccessError):
            bigsi.insert(bloom, "1234")
        assert sample in bigsi.search(seq)
        assert bigsi.search(seq).get(sample).get('percent_kmers_found') == 100
    finally:
        # Always make the file writable again, so a failed assertion does
        # not leave a read-only graph file behind for later tests.
        os.chmod(bigsi.graph_filename, S_IWUSR | S_IREAD)
    bigsi.delete_all()
Ejemplo n.º 16
0
def extract_kmers_from_ctx(ctx, k):
    """Yield k-mers of length ``k`` from every record of ``GraphReader(ctx)``.

    Args:
        ctx: input handed unchanged to ``GraphReader``.
        k: k-mer size passed to ``seq_to_kmers``.

    Yields:
        The k-mers of each record's ``kmer.canonical_value``, in reading order.
    """
    for record in GraphReader(ctx):
        yield from seq_to_kmers(record.kmer.canonical_value, k)
Ejemplo n.º 17
0
def test_jaccard_index1(mc, kmers):
    """Two samples holding the same k-mers have a Jaccard index of exactly 1.

    Args:
        mc: store under test; must offer ``delete_all``, ``insert`` and
            ``jaccard_index``.
        kmers: sequence whose k-mers are inserted under both sample names.
    """
    kmer_list = list(seq_to_kmers(kmers))
    mc.delete_all()
    for sample_id in ('1234', '1235'):
        mc.insert(kmer_list, sample_id)

    similarity = mc.jaccard_index('1234', '1235')
    assert similarity == 1
Ejemplo n.º 18
0
 def seq_to_kmers(self, seq):
     """Split ``seq`` into k-mers using this object's ``kmer_size``.

     Delegates to the module-level ``seq_to_kmers`` function.
     """
     k = self.kmer_size
     return seq_to_kmers(seq, k)
Ejemplo n.º 19
0
def _search(gene_name,
            seq,
            results,
            threshold,
            graph,
            output_format="json",
            pipe=False,
            score=False):
    if pipe:
        if output_format == "tsv":
            start = time.time()
            result = graph.search(seq, threshold=threshold, score=score)
            diff = time.time() - start
            if result:
                for sample_id, percent in result.items():
                    print("\t".join([
                        gene_name, sample_id,
                        str(percent["percent_kmers_found"]),
                        str(diff)
                    ]))
            else:
                print("\t".join([gene_name, "NA", str(0), str(diff)]))
        elif output_format == "fasta":
            samples = graph.sample_to_colour_lookup.keys()
            print(" ".join(['>', gene_name]))
            print(seq)
            result = graph.search(seq, threshold=threshold, score=score)
            result = sorted(result.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
            for sample, percent in result:
                percent = round(percent * 100, 2)
                colour = int(graph.sample_to_colour_lookup.get(sample))
                print(" ".join([
                    '>', gene_name, sample,
                    "kmer-%i coverage %f" % (graph.kmer_size, percent)
                ]))
                presence = []
                for kmer in seq_to_kmers(seq, graph.kmer_size):
                    kmer_presence = graph.graph.lookup(
                        convert_query_kmer(kmer))[colour]
                    sys.stdout.write(str(int(kmer_presence)))
                sys.stdout.write('\n')
        else:
            result = {}
            start = time.time()
            result['results'] = graph.search(seq,
                                             threshold=threshold,
                                             score=score)
            diff = time.time() - start
            result['time'] = diff
            print(json.dumps({gene_name: result}))
    else:
        results[gene_name] = {}
        start = time.time()
        results[gene_name]['results'] = graph.search(seq,
                                                     threshold=threshold,
                                                     score=score)
        diff = time.time() - start
        results[gene_name]['time'] = diff
    return results