def test_create():
    """Build a tiny two-kmer index for every backend config and verify
    its metadata, a lookup, and the colour<->sample mappings."""
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
        blooms = [BIGSI.bloom(cfg, ["ATC", "ATA"])]
        index = BIGSI.build(cfg, blooms, ["1"])
        assert index.kmer_size == 3
        assert index.bloomfilter_size == 1000
        assert index.num_hashes == 3
        assert index.num_samples == 1
        assert index.lookup("ATC") == {"ATC": bitarray("1")}
        assert index.colour_to_sample(0) == "1"
        assert index.sample_to_colour("1") == 0
        index.delete()
def test_unique_sample_names():
    """Inserting a duplicate sample name must raise ValueError and leave
    the index contents unchanged."""
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
        bf = BIGSI.bloom(cfg, ["ATC", "ATA"])
        index = BIGSI.build(cfg, [bf], ["1"])
        with pytest.raises(ValueError):
            index.insert(bf, "1")
        assert index.num_samples == 1
        expected = {
            "ATC": bitarray("1"),
            "ATA": bitarray("1"),
            "ATT": bitarray("0"),
        }
        assert index.lookup(["ATC", "ATA", "ATT"]) == expected
        index.delete()
def test_large_build_cmd_success(num_rows: int, byte_values1: List[int], byte_values2: List[int]):
    """End-to-end check of large_build: every row stored in the backend must
    equal the corresponding row of the independently merged bloom matrix."""
    cols1 = math.floor(len(byte_values1) * 8 / num_rows)
    cols2 = math.floor(len(byte_values2) * 8 / num_rows)
    bits1 = bitarray()
    bits1.frombytes(bytes(byte_values1))
    bits2 = bitarray()
    bits2.frombytes(bytes(byte_values2))
    with NamedTemporaryFile() as in1, NamedTemporaryFile() as in2, NamedTemporaryFile() as tmp_db:
        bits1.tofile(in1)
        in1.flush()
        bits2.tofile(in2)
        in2.flush()
        paths = [in1.name, in2.name]
        col_counts = [cols1, cols2]
        config = _get_bigsi_index_config(num_rows, tmp_db.name)
        large_build(config, paths, col_counts, ["s1", "s2"])
        storage = get_storage(config)
        with NamedTemporaryFile() as merged_out:
            merge_blooms(zip(paths, col_counts), num_rows, merged_out.name)
            with open(merged_out.name, "rb") as merged_in:
                reader = BitMatrixReader(merged_in, num_rows, cols1 + cols2)
                for row_index, row in enumerate(reader):
                    assert storage.get_bitarray(row_index).tobytes() == row.tobytes()
def large_build(config: dict, input_path_list: List[str], num_cols_list: List[int], sample_list: List[str]):
    """Stream pre-built bloom-filter bit matrices into storage and record index metadata.

    Rows are read via BitMatrixGroupReader (which concatenates the per-input
    matrices column-wise) and written in fixed-size batches to bound memory.

    Args:
        config: index configuration mapping; must provide "m" (number of rows,
            i.e. bloom filter size) and "h" (number of hash functions).
            NOTE(review): the previous annotation said ``str``, but the body
            subscripts it like a mapping, so ``dict`` is the accurate type.
        input_path_list: paths of the on-disk bit-matrix inputs.
        num_cols_list: number of columns (samples) in each input matrix,
            parallel to ``input_path_list``.
        sample_list: sample names to register in the metadata.
    """
    storage = get_storage(config)
    num_rows = int(config["m"])
    with BitMatrixGroupReader(zip(input_path_list, num_cols_list), num_rows) as bmgr:
        keys = []
        bit_arrays = []
        for row_index in range(num_rows):
            keys.append(row_index)
            bit_arrays.append(next(bmgr))
            # Flush in fixed-size batches to bound memory and amortise syncs.
            if len(keys) == DB_INSERT_BATCH_SIZE:
                storage.set_bitarrays(keys, bit_arrays)
                storage.sync()
                keys = []
                bit_arrays = []
        if keys:  # final partial batch
            storage.set_bitarrays(keys, bit_arrays)
            storage.sync()
    SampleMetadata(storage).add_samples(sample_list)
    storage.set_integer(BLOOM_FILTERS_SIZE_KEY, num_rows)
    storage.set_integer(NUM_HASH_FUNCTIONS_KEY, int(config["h"]))
    storage.set_integer(NUM_ROWS_KEY, num_rows)
    storage.set_integer(NUM_COLS_KEY, sum(num_cols_list))
    storage.sync()
    storage.close()
def convert_metadata(infile, config):
    """Copy sample metadata from a legacy BerkeleyDB index into new storage.

    Samples whose name contains "DELETE" are tombstoned: their colour maps
    to the placeholder name "D3L3T3D" and the sample maps to colour -1.

    Returns the number of samples found in the legacy metadata.
    """
    legacy = db.DB()
    legacy.set_cachesize(4, 0)
    legacy.open(infile + "/metadata", flags=db.DB_RDONLY)
    bloom_filter_size = int.from_bytes(legacy[b'bloom_filter_size'], 'big')
    kmer_size = int.from_bytes(legacy[b'kmer_size'], 'big')
    num_hashes = int.from_bytes(legacy[b'num_hashes'], 'big')
    colours = pickle.loads(legacy[b'colours'])
    num_samples = len(colours)
    ## Create the sample metadata
    colour_sample = {}
    for colour in range(num_samples):
        raw_key = ("colour%i" % colour).encode("utf-8")
        colour_sample[colour] = legacy[raw_key].decode('utf-8')
    ## Add the sample metadata
    sm = SampleMetadata(get_storage(config))
    for colour, sample_name in colour_sample.items():
        if "DELETE" in sample_name:
            sample_name = "D3L3T3D"
            print(colour, sample_name)
            sm._set_colour_sample(colour, sample_name)
            sm._set_sample_colour(sample_name, -1)
        else:
            sm._set_sample_colour(sample_name, colour)
            sm._set_colour_sample(colour, sample_name)
    sm._set_integer(sm.colour_count_key, num_samples)
    legacy.close()
    return num_samples
def get_test_storages():
    """Return a storage instance for each test config, skipping backends
    that fail to initialise (e.g. server not running).

    Returns:
        List of storage objects, one per reachable backend in CONFIGS.
    """
    test_storages = []
    for config in CONFIGS:
        try:
            test_storages.append(get_storage(config))
        # BUGFIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; catch only real errors.
        except Exception:
            logger.warning("Skipping %s" % config["storage-engine"])
    return test_storages
def test_merge():
    """Merging two single-sample indexes must give the same search results
    as an index built from both samples at once."""
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
    base = CONFIGS[0]
    kmers_a = seq_to_kmers("ATACACAAT", base["k"])
    kmers_b = seq_to_kmers("ATACACAAC", base["k"])
    bloom_a = BIGSI.bloom(base, kmers_a)
    bloom_b = BIGSI.bloom(base, kmers_b)
    left = BIGSI.build(CONFIGS[0], [bloom_a], ["a"])
    right = BIGSI.build(CONFIGS[1], [bloom_b], ["b"])
    combined = BIGSI.build(CONFIGS[2], [bloom_a, bloom_b], ["a", "b"])
    left.merge(right)
    assert left.search("ATACACAAT", 0.5) == combined.search("ATACACAAT", 0.5)
    left.delete()
    right.delete()
    combined.delete()
def __init__(self, config=None):
    """Open a BIGSI index over the storage named by ``config``.

    Args:
        config: configuration mapping; falls back to DEFAULT_CONFIG when None.
    """
    self.config = DEFAULT_CONFIG if config is None else config
    self.storage = get_storage(self.config)
    SampleMetadata.__init__(self, self.storage)
    KmerSignatureIndex.__init__(self, self.storage)
    ## TODO this can be inferred and set at build time
    self.min_unique_kmers_in_query = MIN_UNIQUE_KMERS_IN_QUERY
    self.scorer = Scorer(self.num_samples)
def test_insert():
    """Insert a second sample into a built index and verify metadata,
    per-kmer colour bits, and colour<->sample mappings."""
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
        index = BIGSI.build(cfg, [BIGSI.bloom(cfg, ["ATC", "ATA"])], ["1"])
        index.insert(BIGSI.bloom(cfg, ["ATC", "ATT"]), "2")
        assert index.kmer_size == 3
        assert index.bloomfilter_size == 1000
        assert index.num_hashes == 3
        assert index.num_samples == 2
        expected = {
            "ATC": bitarray("11"),
            "ATA": bitarray("10"),
            "ATT": bitarray("01"),
        }
        assert index.lookup(["ATC", "ATA", "ATT"]) == expected
        assert index.colour_to_sample(0) == "1"
        assert index.sample_to_colour("1") == 0
        assert index.colour_to_sample(1) == "2"
        assert index.sample_to_colour("2") == 1
        index.delete()
def test_exact_search():
    """A full-length exact query returns 100% of kmers for the right sample,
    and a non-indexed sequence returns no hits."""
    base = CONFIGS[0]
    bloom_a = BIGSI.bloom(base, seq_to_kmers("ATACACAAT", base["k"]))
    bloom_b = BIGSI.bloom(base, seq_to_kmers("ACAGAGAAC", base["k"]))
    for cfg in CONFIGS:
        get_storage(cfg).delete_all()
        index = BIGSI.build(cfg, [bloom_a, bloom_b], ["a", "b"])
        expected_a = {
            "percent_kmers_found": 100,
            "num_kmers": 6,
            "num_kmers_found": 6,
            "sample_name": "a",
        }
        assert index.search("ATACACAAT")[0] == expected_a
        expected_b = {
            "percent_kmers_found": 100,
            "num_kmers": 6,
            "num_kmers_found": 6,
            "sample_name": "b",
        }
        assert index.search("ACAGAGAAC")[0] == expected_b
        assert index.search("ACAGTTAAC") == []
        index.delete()
def convert_index(infile, config, num_samples):
    """Copy the kmer signature matrix from a legacy BerkeleyDB graph file
    into the new storage backend.

    NOTE(review): the metadata keys here (BLOOMFILTER_SIZE_KEY,
    NUM_HASH_FUNCTS_KEY) differ in name from the ones used by large_build
    (BLOOM_FILTERS_SIZE_KEY, NUM_HASH_FUNCTIONS_KEY) — confirm they refer
    to the same storage keys or are intentionally distinct constants.
    """
    legacy_graph = db.DB()
    legacy_graph.set_cachesize(4, 0)
    legacy_graph.open(infile + "/graph", flags=db.DB_RDONLY)
    # Create the kmer signature index
    storage = get_storage(config)
    storage.set_integer(BLOOMFILTER_SIZE_KEY, config["m"])
    storage.set_integer(NUM_HASH_FUNCTS_KEY, config["h"])
    BitMatrix.create(
        storage=storage,
        rows=get_rows(legacy_graph, config["m"]),
        num_rows=config["m"],
        num_cols=num_samples,
    )
    legacy_graph.close()
def build(cls, config, bloomfilters, samples):
    """Build a new index from bloom filters and sample names, then reopen it.

    Args:
        cls: the index class; a fresh instance is returned.
        config: index configuration; uses "m" (rows), "h" (hashes) and the
            optional "low_mem_build" flag (defaults to False).
        bloomfilters: one bloom filter per sample.
        samples: sample names, parallel to ``bloomfilters``.

    Returns:
        An instance of ``cls`` opened on the freshly populated storage.
    """
    storage = get_storage(config)
    validate_build_params(bloomfilters, samples)
    logger.debug("Insert sample metadata")
    # Return values were previously bound to unused locals (sm, ksi);
    # both calls are used purely for their side effects on `storage`.
    SampleMetadata(storage).add_samples(samples)
    logger.debug("Create signature index")
    KmerSignatureIndex.create(
        storage,
        bloomfilters,
        config["m"],
        config["h"],
        config.get("low_mem_build", False),
    )
    storage.close()  ## Need to delete LOCK files before re init
    return cls(config)
def migrate(mapping_filepath, storage_engine, storage_filename=None):
    """Rename samples in the metadata according to an old-id -> new-id mapping.

    For each changed id, the new name takes over the old sample's colour and
    the old name is retired by mapping it to colour -1.

    Args:
        mapping_filepath: path to a pickled dict {old_sample_id: new_sample_id}.
        storage_engine: storage backend name used to locate the index.
        storage_filename: optional storage path override.
    """
    config = determine_config(storage_engine, storage_filename)
    storage = get_storage(config)
    current_metadata = SampleMetadata(storage)
    # SECURITY NOTE: pickle.load executes arbitrary code; only use mapping
    # files from a trusted source.
    with open(mapping_filepath, 'rb') as infile:
        mapping = pickle.load(infile)
    for old_id, new_id in mapping.items():
        if new_id and new_id != old_id:
            colour = current_metadata.sample_to_colour(old_id)
            # BUGFIX: colour 0 is a valid colour but falsy, so the old
            # truthiness test (`if colour:`) silently skipped the first sample.
            if colour is not None:
                current_metadata._validate_sample_name(new_id)
                current_metadata._set_sample_colour(new_id, colour)
                current_metadata._set_colour_sample(colour, new_id)
                current_metadata._set_sample_colour(old_id, -1)
    storage.sync()
    storage.close()
def delete(self, config: hug.types.text = None):
    """Resolve the config file and wipe every record from its storage."""
    resolved = get_config_from_file(config)
    get_storage(resolved).delete_all()