def test_build_chunks():
    """Check that a memory-capped build matches an uncapped build.

    The same bloom filter files are built into two databases, one of them
    with ``max_memory=20``. Both are then re-instantiated from their paths
    and their first ten graph rows and their metadata are compared.
    """
    filter_count = 3
    bloom_paths = filter_count * ['bigsi/tests/data/test_kmers.bloom']
    names = generate_sample_names(len(bloom_paths))

    unbounded = BIGSI.create(
        db="./db-bigsi-no-max-mem/", m=10, k=9, h=1, force=True)
    build(bloom_paths, names, unbounded)

    bounded = BIGSI.create(
        db="./db-bigsi-max-mem/", m=10, k=9, h=1, force=True)
    build(bloom_paths, names, bounded, max_memory=20)  # 20 bytes

    # Reload both databases and check they are equal.
    unbounded = BIGSI("./db-bigsi-no-max-mem/")
    bounded = BIGSI("./db-bigsi-max-mem")

    for row in range(10):
        assert unbounded.graph[row] == bounded.graph[row]
    for key, value in bounded.metadata.items():
        assert unbounded.metadata[key] == value

    unbounded.delete_all()
    bounded.delete_all()
def test_inexact_search():
    """Exercise thresholded search against every storage configuration.

    Two sequences that differ only in their last base are indexed as samples
    "a" and "b". Searching the first sequence with a 0.5 threshold must
    report "a" first with all k-mers found and "b" second with five of six.
    """
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()

    # Bloom filters are created once, from the first configuration.
    first_cfg = CONFIGS[0]
    kmers_a = seq_to_kmers("ATACACAAT", first_cfg["k"])
    kmers_b = seq_to_kmers("ATACACAAC", first_cfg["k"])
    bloom_a = BIGSI.bloom(first_cfg, kmers_a)
    bloom_b = BIGSI.bloom(first_cfg, kmers_b)

    expected_a = {
        "percent_kmers_found": 100.0,
        "num_kmers": 6,
        "num_kmers_found": 6,
        "sample_name": "a",
    }
    expected_a_json = '{"percent_kmers_found": 100.0, "num_kmers": 6, "num_kmers_found": 6, "sample_name": "a"}'
    expected_b = {
        "percent_kmers_found": 83.33,
        "num_kmers": 6,
        "num_kmers_found": 5,
        "sample_name": "b",
    }

    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
        # Opening an index on emptied storage is expected to fail.
        with pytest.raises(BaseException):
            BIGSI(cfg)

        index = BIGSI.build(cfg, [bloom_a, bloom_b], ["a", "b"])
        assert index.search("ACAGTTAAC", 0.5) == []
        assert index.lookup("AAT") == {"AAT": bitarray("10")}

        hits = index.search("ATACACAAT", 0.5)
        assert hits[0] == expected_a
        # The serialised form is checked too, which also pins the key order.
        assert json.dumps(hits[0]) == expected_a_json
        assert hits[1] == expected_b

        index.delete()
def test_create():
    """Build a one-sample index per configuration and check its properties.

    Verifies the reported k-mer size, bloom filter size, hash count and
    sample count, a single k-mer lookup, and the colour/sample mapping.
    """
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()

        index = BIGSI.build(cfg, [BIGSI.bloom(cfg, ["ATC", "ATA"])], ["1"])

        assert index.kmer_size == 3
        assert index.bloomfilter_size == 1000
        assert index.num_hashes == 3
        assert index.num_samples == 1
        assert index.lookup("ATC") == {"ATC": bitarray("1")}
        assert index.colour_to_sample(0) == "1"
        assert index.sample_to_colour("1") == 0

        index.delete()
def test_unique_sample_names():
    """Inserting an already-used sample name must raise ``ValueError``.

    After the rejected insert the index must still hold exactly one sample
    and return the original lookup results.
    """
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()

        bloom_filter = BIGSI.bloom(cfg, ["ATC", "ATA"])
        index = BIGSI.build(cfg, [bloom_filter], ["1"])

        with pytest.raises(ValueError):
            index.insert(bloom_filter, "1")

        assert index.num_samples == 1
        expected_lookup = {
            "ATC": bitarray("1"),
            "ATA": bitarray("1"),
            "ATT": bitarray("0"),
        }
        assert index.lookup(["ATC", "ATA", "ATT"]) == expected_lookup

        index.delete()
def test_insert_search_cmd():
    """Drive init, build, insert and search through the hug command API.

    After building three randomly named samples, sample "s3" is inserted and
    must then appear in the results of a search for a known sequence.
    """
    Graph = BIGSI.create(m=100, force=True)
    db_path = Graph.db
    hug.test.delete(bigsi.__main__, '', {'db': db_path})
    hug.test.post(bigsi.__main__, 'init', {'db': db_path, 'm': 1000})

    sample_count = 3
    bloom_paths = ['bigsi/tests/data/test_kmers.bloom'] * sample_count
    alphabet = string.ascii_uppercase + string.digits
    sample_names = [
        ''.join(random.choice(alphabet) for _ in range(6))
        for _ in range(sample_count)
    ]
    build_args = {
        'db': db_path,
        'bloomfilters': bloom_paths,
        'samples': sample_names
    }
    hug.test.post(bigsi.__main__, 'build', build_args)

    # hug.test.post returns a Response object.
    insert_args = {
        'db': db_path,
        'bloomfilter': 'bigsi/tests/data/test_kmers.bloom',
        'sample': "s3"
    }
    insert_reply = hug.test.post(bigsi.__main__, 'insert', insert_args)
    assert insert_reply.data.get('result') == 'success'

    seq = 'GATCGTTTGCGGCCACAGTTGCCAGAGATGA'
    search_reply = hug.test.get(
        bigsi.__main__, 'search', {'db': db_path, 'seq': seq})
    assert "s3" in search_reply.data.get(seq).get('results')

    hug.test.delete(bigsi.__main__, '', {
        'db': db_path,
    })
def test_get_bloomfilter(sample, seq):
    """An inserted sample's bloom filter has the graph's bloom filter size.

    ``sample`` and ``seq`` are supplied by the caller; no decorator is
    visible here, so their source (presumably hypothesis) is unconfirmed.
    """
    kmer_list = seq_to_kmers(seq, 31)
    index = BIGSI.create(m=100, force=True)

    bloom_filter = index.bloom(kmer_list)
    index.insert(bloom_filter, sample)

    stored = index.get_bloom_filter(sample)
    assert stored.length() == index.graph.bloomfilter.size

    index.delete_all()
def test_get_bloomfilter(seq):
    """A built sample's bloom filter has the graph's bloom filter size.

    The k-mers of ``seq`` are built into a fresh index under the fixed
    sample name "1234", and the stored filter is read back.
    """
    sample_name = "1234"
    kmer_list = seq_to_kmers(seq, 31)
    index = BIGSI.create(m=10, force=True)

    bloom_filter = index.bloom(kmer_list)
    index.build([bloom_filter], [sample_name])

    stored = index.get_bloom_filter(sample_name)
    assert stored.length() == index.graph.bloomfilter.size

    index.delete_all()
def test_cant_build_chunks_if_max_memory_less_than_bf():
    """Building with ``max_memory=1`` must raise ``ValueError``.

    As the test name indicates, the one-byte budget is meant to be smaller
    than a single bloom filter.
    """
    filter_count = 3
    bloom_paths = filter_count * ['bigsi/tests/data/test_kmers.bloom']
    names = generate_sample_names(len(bloom_paths))
    capped = BIGSI.create(
        db="./db-bigsi-max-mem/", m=10, k=9, h=1, force=True)

    with pytest.raises(ValueError):
        # 1 byte (should fail)
        build(bloom_paths, names, capped, max_memory=1)
def test_merge():
    """Merging two one-sample indexes must match a directly built index.

    Samples "a" and "b" are built into separate indexes on the first two
    configurations and merged. A search on the merged index must equal the
    same search on an index built from both samples on the third.
    """
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()

    # Bloom filters are created from the first configuration only.
    base_cfg = CONFIGS[0]
    kmers_a = seq_to_kmers("ATACACAAT", base_cfg["k"])
    kmers_b = seq_to_kmers("ATACACAAC", base_cfg["k"])
    bloom_a = BIGSI.bloom(base_cfg, kmers_a)
    bloom_b = BIGSI.bloom(base_cfg, kmers_b)

    left = BIGSI.build(CONFIGS[0], [bloom_a], ["a"])
    right = BIGSI.build(CONFIGS[1], [bloom_b], ["b"])
    reference = BIGSI.build(CONFIGS[2], [bloom_a, bloom_b], ["a", "b"])

    left.merge(right)

    query = "ATACACAAT"
    assert left.search(query, 0.5) == reference.search(query, 0.5)

    left.delete()
    right.delete()
    reference.delete()
def test_search_doesnt_required_write_access():
    """Searching a read-only database must work; inserting into it must not.

    The database files are made read-only after the build. An insert is then
    expected to raise ``OSError``, while searches must still return scored
    results.

    File permissions are restored and the database is deleted in a
    ``finally`` clause, so a failing assertion does not leave a read-only
    database behind.
    """
    Graph = BIGSI.create(m=100, force=True)
    f = Graph.db
    hug.test.delete(bigsi.__main__, '', {'db': f})
    hug.test.post(bigsi.__main__, 'init', {'db': f, 'm': 1000})

    N = 3
    bloomfilter_filepaths = ['bigsi/tests/data/test_kmers.bloom'] * N
    alphabet = string.ascii_uppercase + string.digits
    samples = [
        ''.join(random.choice(alphabet) for _ in range(6)) for _ in range(N)
    ]
    hug.test.post(bigsi.__main__, 'build', {
        'db': f,
        'bloomfilters': bloomfilter_filepaths,
        'samples': samples
    })

    db_files = (Graph.graph_filename, Graph.metadata_filename)
    read_only = S_IREAD | S_IRGRP | S_IROTH
    owner_read_write = S_IWUSR | S_IREAD

    try:
        # Make the DB read only.
        for path in db_files:
            os.chmod(path, read_only)

        with pytest.raises(OSError):
            hug.test.post(
                bigsi.__main__, 'insert', {
                    'db': f,
                    'bloomfilter': 'bigsi/tests/data/test_kmers.bloom',
                    'sample': "s3"
                })

        # Search doesn't raise errors. The result of this first search is
        # not asserted; it only has to complete.
        seq = 'GATCGTTTGCGGCCACAGTTGCCAGAGATGA'
        hug.test.get(bigsi.__main__, 'search', {
            'db': f,
            'seq': seq,
            "score": True
        })

        seq = 'GATCGTTTGCGGCCACAGTTGCCAGAGATGAAAG'
        response = hug.test.get(bigsi.__main__, 'search', {
            'db': f,
            'seq': seq,
            'threshold': 0.1,
            "score": True
        })
        results = response.data.get(seq).get('results')
        assert results
        assert "score" in list(results.values())[0]
    finally:
        # Delete requires write access, so restore it before cleaning up.
        for path in db_files:
            os.chmod(path, owner_read_write)
        hug.test.delete(bigsi.__main__, '', {
            'db': f,
        })
def test_insert():
    """Insert a second sample into a one-sample index and check the result.

    Runs against every configuration. Index parameters must be unchanged,
    the sample count must be two, lookups must carry one bit per sample, and
    the colour/sample mapping must follow insertion order.
    """
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()

        first_bloom = BIGSI.bloom(cfg, ["ATC", "ATA"])
        index = BIGSI.build(cfg, [first_bloom], ["1"])

        second_bloom = BIGSI.bloom(cfg, ["ATC", "ATT"])
        index.insert(second_bloom, "2")

        assert index.kmer_size == 3
        assert index.bloomfilter_size == 1000
        assert index.num_hashes == 3
        assert index.num_samples == 2

        expected_lookup = {
            "ATC": bitarray("11"),
            "ATA": bitarray("10"),
            "ATT": bitarray("01"),
        }
        assert index.lookup(["ATC", "ATA", "ATT"]) == expected_lookup

        assert index.colour_to_sample(0) == "1"
        assert index.sample_to_colour("1") == 0
        assert index.colour_to_sample(1) == "2"
        assert index.sample_to_colour("2") == 1

        index.delete()
def test_exact_search():
    """Search with the default threshold must return only full matches.

    Two unrelated sequences are indexed as samples "a" and "b". Each
    sequence must hit its own sample first with all six k-mers found, and an
    unindexed sequence must return no results.
    """
    # Bloom filters are created once, from the first configuration.
    first_cfg = CONFIGS[0]
    kmers_a = seq_to_kmers("ATACACAAT", first_cfg["k"])
    kmers_b = seq_to_kmers("ACAGAGAAC", first_cfg["k"])
    bloom_a = BIGSI.bloom(first_cfg, kmers_a)
    bloom_b = BIGSI.bloom(first_cfg, kmers_b)

    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
        index = BIGSI.build(cfg, [bloom_a, bloom_b], ["a", "b"])

        top_hit_a = index.search("ATACACAAT")[0]
        assert top_hit_a == {
            "percent_kmers_found": 100,
            "num_kmers": 6,
            "num_kmers_found": 6,
            "sample_name": "a",
        }

        top_hit_b = index.search("ACAGAGAAC")[0]
        assert top_hit_b == {
            "percent_kmers_found": 100,
            "num_kmers": 6,
            "num_kmers_found": 6,
            "sample_name": "b",
        }

        assert index.search("ACAGTTAAC") == []

        index.delete()
def test_bloom_cmd():
    """The ``bloom`` command must write a bloom filter with some bits set.

    The output goes into a private temporary directory instead of a fixed
    ``/tmp`` path. Concurrent runs therefore cannot collide, and the file is
    removed even if the command or the read raises.
    """
    # Local import so this block does not depend on a file-level import.
    import tempfile

    Graph = BIGSI.create(m=100, force=True)

    with tempfile.TemporaryDirectory() as tmp_dir:
        f = os.path.join(tmp_dir, 'test_kmers.bloom')
        hug.test.post(bigsi.__main__, 'bloom', {
            'db': Graph.db,
            'ctx': 'bigsi/tests/data/test_kmers.ctx',
            'outfile': f
        })

        a = bitarray()
        with open(f, 'rb') as inf:
            a.fromfile(inf)

    # At least one bit must be set in the bloom filter that was written.
    assert sum(a) > 0
def test_search_for_amino_acid_mutation():
    """Genotype reference and alternate probes for the rpoB S450X mutation.

    Two variant names (entries 0 and 4 of the S450X list) are turned into
    probe sets. The first reference and alternate sequence of each set is
    inserted as its own sample. The search must call the reference samples
    "0/0" and the alternate samples "1/1" with the expected amino-acid
    change and variant name.
    """
    kmer_size = 21
    bigsi = BIGSI.create(m=1000, k=kmer_size, force=True)
    variant_search = BIGSIAminoAcidMutationSearch(
        bigsi, "bigsi/tests/data/ref.fasta", "bigsi/tests/data/ref.gb")

    var_name1 = variant_search.aa2dna.get_variant_names(
        "rpoB", "S450X", True)[0]
    var_name2 = variant_search.aa2dna.get_variant_names(
        "rpoB", "S450X", True)[4]
    print(var_name1)
    print(var_name2)

    # Add the reference seq and the alternate of each variant as samples.
    probe_set1 = variant_search.create_variant_probe_set(var_name1)
    probe_set2 = variant_search.create_variant_probe_set(var_name2)
    labelled_seqs = [
        ('ref1', probe_set1.refs[0]),
        ('alt1', probe_set1.alts[0]),
        ('ref2', probe_set2.refs[0]),
        ('alt2', probe_set2.alts[0]),
    ]
    # All bloom filters are created before the first insert.
    labelled_blooms = [
        (label, bigsi.bloom(bigsi.seq_to_kmers(sequence)))
        for label, sequence in labelled_seqs
    ]
    for label, bloom_filter in labelled_blooms:
        bigsi.insert(bloom_filter, label)

    results = variant_search.search_for_amino_acid_variant(
        "rpoB", "S", 450, "X")

    def field(sample, key):
        """Return one reported field for ``sample`` under rpoB_S450X."""
        return results.get("rpoB_S450X").get(sample).get(key)

    # Reference samples: homozygous reference. The final amino acid and the
    # last three characters of the variant name are not compared.
    assert field("ref1", "genotype") == "0/0"
    assert field("ref1", "aa_mut")[:-1] == "S450"
    assert field("ref1", "variant")[:-3] == var_name1[:-3]

    assert field("ref2", "genotype") == "0/0"
    assert field("ref2", "aa_mut")[:-1] == "S450"
    assert field("ref2", "variant")[:-3] == var_name2[:-3]

    # Alternate samples: homozygous alternate with an exact variant match.
    assert field("alt1", "genotype") == "1/1"
    assert field("alt1", "aa_mut") == "S450K"
    assert field("alt1", "variant") == var_name1

    assert field("alt2", "genotype") == "1/1"
    assert field("alt2", "aa_mut") == "S450I"
    assert field("alt2", "variant") == var_name2
def test_build_cmd():
    """Drive init, build and scored search through the hug command API.

    Three randomly named samples are built from the same bloom filter file.
    A scored search with a 0.1 threshold must then return non-empty results
    whose first entry carries a "score" field.
    """
    Graph = BIGSI.create(m=100, force=True)
    db_path = Graph.db
    hug.test.delete(bigsi.__main__, '', {'db': db_path})
    hug.test.post(bigsi.__main__, 'init', {'db': db_path, 'm': 1000})

    sample_count = 3
    bloom_paths = ['bigsi/tests/data/test_kmers.bloom'] * sample_count
    alphabet = string.ascii_uppercase + string.digits
    sample_names = [
        ''.join(random.choice(alphabet) for _ in range(6))
        for _ in range(sample_count)
    ]
    hug.test.post(bigsi.__main__, 'build', {
        'db': db_path,
        'bloomfilters': bloom_paths,
        'samples': sample_names
    })

    # TODO: the result of this first search is not asserted yet.
    seq = 'GATCGTTTGCGGCCACAGTTGCCAGAGATGA'
    hug.test.get(bigsi.__main__, 'search', {
        'db': db_path,
        'seq': seq,
        "score": True
    })

    seq = 'GATCGTTTGCGGCCACAGTTGCCAGAGATGAAAG'
    scored_reply = hug.test.get(bigsi.__main__, 'search', {
        'db': db_path,
        'seq': seq,
        'threshold': 0.1,
        "score": True
    })
    assert scored_reply.data.get(seq).get('results')
    first_hit = list(scored_reply.data.get(seq).get('results').values())[0]
    assert "score" in first_hit

    hug.test.delete(bigsi.__main__, '', {
        'db': db_path,
    })
import json
import os
import random
import tempfile

import hypothesis.strategies as st
import numpy as np
from bitarray import bitarray
from hypothesis import given

import bigsi.__main__
from bigsi import BIGSI
from bigsi.tests.base import ST_SEQ
from bigsi.tests.base import ST_KMER
from bigsi.tests.base import ST_SAMPLE_NAME
from bigsi.tests.base import ST_GRAPH
from bigsi.utils import seq_to_kmers

# NOTE(review): `hug` is used below but no import of it is visible in this
# part of the file; confirm it is imported elsewhere.

# Shared index created at import time; test_bloom_cmd passes its `db`.
Graph = BIGSI.create(m=100, force=True)


def test_bloom_cmd():
    """The ``bloom`` command must write a bloom filter with some bits set.

    The output goes into a private temporary directory instead of a fixed
    ``/tmp`` path. Concurrent runs therefore cannot collide, and the file is
    removed even if the command or the read raises.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        f = os.path.join(tmp_dir, 'test_kmers.bloom')
        hug.test.post(
            bigsi.__main__, 'bloom',
            {'db': Graph.db,
             'ctx': 'bigsi/tests/data/test_kmers.ctx',
             'outfile': f})

        a = bitarray()
        with open(f, 'rb') as inf:
            a.fromfile(inf)

    # At least one bit must be set in the bloom filter that was written.
    assert sum(a) > 0
def test_merge():
    """Merging two on-disk indexes must match an index built from both.

    A three-sample and a nine-sample index are built and merged into the
    first. After the first index is reopened, its first ten graph rows and
    its metadata must match an index built directly from all twelve bloom
    filters under the combined sample names.
    """
    kmers1 = 3 * ['AAAAAAAAA']
    kmers2 = 9 * ['AAAAAAAAT']

    bigsi1 = BIGSI.create(db="./db-bigsi1/", m=10, k=9, h=1, force=True)
    blooms1 = [bigsi1.bloom([kmer]) for kmer in kmers1]
    samples1 = list(map(str, range(len(kmers1))))
    bigsi1.build(blooms1, samples1)

    bigsi2 = BIGSI.create(db="./db-bigsi2/", m=10, k=9, h=1, force=True)
    blooms2 = [bigsi2.bloom([kmer]) for kmer in kmers2]
    samples2 = list(map(str, range(len(kmers2))))
    bigsi2.build(blooms2, samples2)

    combined_samples = combine_samples(samples1, samples2)
    # The combined database is created, then reopened in mode "c".
    bigsicombined = BIGSI.create(
        db="./db-bigsi-c/", m=10, k=9, h=1, force=True)
    bigsicombined = BIGSI(db="./db-bigsi-c/", mode="c")
    bigsicombined.build(blooms1 + blooms2, combined_samples)

    bigsi1.merge(bigsi2)
    # Reopen the merged index before comparing.
    bigsi1 = BIGSI(db="./db-bigsi1/")

    for row in range(10):
        assert bigsi1.graph[row] == bigsicombined.graph[row]
    for key, value in bigsicombined.metadata.items():
        assert bigsi1.metadata[key] == value

    for index in (bigsi1, bigsi2, bigsicombined):
        index.delete_all()