def test_unique_sample_names():
    """Adding a sample whose name is already registered must raise ValueError."""
    for storage in get_storages():
        storage.delete_all()
        sm = SampleMetadata(storage=storage)
        sample_name = "sample_name"
        # First insertion succeeds; the returned colour is irrelevant here,
        # so the previously-unused `colour` binding is dropped.
        sm.add_sample(sample_name)
        # A second insertion under the same name must be rejected.
        with pytest.raises(ValueError):
            sm.add_sample(sample_name)
def __init__(self, config=None):
    """Initialise the index from *config*, defaulting to DEFAULT_CONFIG.

    Wires up the storage backend, the sample metadata and k-mer signature
    index bases, and the scorer used to rank query hits.
    """
    self.config = DEFAULT_CONFIG if config is None else config
    self.storage = get_storage(self.config)
    SampleMetadata.__init__(self, self.storage)
    KmerSignatureIndex.__init__(self, self.storage)
    ## TODO this can be inferred and set at build time
    self.min_unique_kmers_in_query = MIN_UNIQUE_KMERS_IN_QUERY
    self.scorer = Scorer(self.num_samples)
def large_build(config: str, input_path_list: List[str], num_cols_list: List[int], sample_list: List[str]):
    """Build a large index by streaming bit-matrix rows into storage in batches.

    Rows are read one at a time from the grouped bit-matrix inputs and
    written in batches of DB_INSERT_BATCH_SIZE to keep memory bounded, then
    the sample names and index parameters are recorded.
    """
    storage = get_storage(config)
    num_rows = int(config["m"])
    with BitMatrixGroupReader(zip(input_path_list, num_cols_list), num_rows) as bmgr:
        pending_keys = []
        pending_rows = []
        for row_index in range(num_rows):
            pending_keys.append(row_index)
            pending_rows.append(next(bmgr))
            # Flush once a full batch has accumulated.
            if len(pending_keys) == DB_INSERT_BATCH_SIZE:
                storage.set_bitarrays(pending_keys, pending_rows)
                storage.sync()
                pending_keys = []
                pending_rows = []
        # Flush any remaining partial batch.
        if pending_keys:
            storage.set_bitarrays(pending_keys, pending_rows)
            storage.sync()
    SampleMetadata(storage).add_samples(sample_list)
    storage.set_integer(BLOOM_FILTERS_SIZE_KEY, num_rows)
    storage.set_integer(NUM_HASH_FUNCTIONS_KEY, int(config["h"]))
    storage.set_integer(NUM_ROWS_KEY, num_rows)
    storage.set_integer(NUM_COLS_KEY, sum(num_cols_list))
    storage.sync()
    storage.close()
def test_add_sample_metadata():
    """Samples receive sequential colours and both lookup directions agree."""
    for storage in get_storages():
        storage.delete_all()
        sm = SampleMetadata(storage=storage)
        first = "sample_name"
        colour = sm.add_sample(first)
        # Bulk and scalar lookups agree for the first sample.
        assert sm.samples_to_colours([first]) == {first: 0}
        assert sm.colours_to_samples([0]) == {0: first}
        assert sm.num_samples == 1
        assert sm.sample_name_exists(first)
        assert not sm.sample_name_exists("sample_name2")
        second = "sample_name2"
        colour = sm.add_sample(second)
        # The second sample gets the next colour.
        assert sm.sample_to_colour(second) == 1
        assert sm.colour_to_sample(1) == second
        assert sm.num_samples == 2
def test_delete_sample():
    """Deleting a sample removes its name lookup and tombstones its colour."""
    for storage in get_storages():
        storage.delete_all()
        sm = SampleMetadata(storage=storage)
        name_a = "sample_name"
        name_b = "sample_name2"
        sm.add_sample(name_a)
        sm.add_sample(name_b)
        ## Both samples should be present with sequential colours.
        assert sm.samples_to_colours([name_a, name_b]) == {
            name_a: 0,
            name_b: 1,
        }
        assert sm.colours_to_samples([0, 1]) == {0: name_a, 1: name_b}
        ## Remove the first sample.
        sm.delete_sample(name_a)
        ## Only the surviving sample resolves by name ...
        assert sm.samples_to_colours([name_a, name_b]) == {name_b: 1}
        ## ... and the freed colour is marked with the tombstone value.
        assert sm.colours_to_samples([0, 1]) == {0: "D3L3T3D", 1: name_b}
def build(cls, config, bloomfilters, samples):
    """Build a new index from bloom filters and sample names, then reopen it.

    Validates the inputs, writes sample metadata and the signature index to
    storage, and returns a fresh instance constructed from *config*.
    """
    storage = get_storage(config)
    validate_build_params(bloomfilters, samples)
    logger.debug("Insert sample metadata")
    # Return values are not needed here; the calls persist into `storage`.
    SampleMetadata(storage).add_samples(samples)
    logger.debug("Create signature index")
    KmerSignatureIndex.create(
        storage,
        bloomfilters,
        config["m"],
        config["h"],
        config.get("low_mem_build", False),
    )
    storage.close()  ## Need to delete LOCK files before re init
    return cls(config)
def migrate(mapping_filepath, storage_engine, storage_filename=None):
    """Rename samples in an existing index from a pickled old->new id mapping.

    Args:
        mapping_filepath: path to a pickled dict of {old_sample_id: new_sample_id}.
        storage_engine: storage backend identifier passed to determine_config.
        storage_filename: optional storage location override.
    """
    config = determine_config(storage_engine, storage_filename)
    storage = get_storage(config)
    current_metadata = SampleMetadata(storage)
    # NOTE(review): pickle.load executes arbitrary code on load — only use
    # mapping files from a trusted source.
    with open(mapping_filepath, 'rb') as infile:
        mapping = pickle.load(infile)
    for old_id, new_id in mapping.items():
        if new_id and new_id != old_id:
            colour = current_metadata.sample_to_colour(old_id)
            # BUGFIX: colour 0 is a valid colour (the first sample added),
            # but the previous truthiness check (`if colour:`) silently
            # skipped it. Negative colours mark deleted samples and are
            # not migrated.
            if colour is not None and colour >= 0:
                current_metadata._validate_sample_name(new_id)
                current_metadata._set_sample_colour(new_id, colour)
                current_metadata._set_colour_sample(colour, new_id)
                # Tombstone the old name so it no longer resolves.
                current_metadata._set_sample_colour(old_id, -1)
    storage.sync()
    storage.close()
def convert_metadata(infile, config):
    """Convert legacy Berkeley-DB metadata into the current SampleMetadata layout.

    Reads the old ``metadata`` database under *infile*, copies each
    colour -> sample-name mapping into the storage described by *config*,
    and returns the number of samples found in the legacy index.
    """
    in_metadata = db.DB()
    in_metadata.set_cachesize(4, 0)
    # Legacy metadata is opened read-only; it is never modified here.
    in_metadata.open(infile + "/metadata", flags=db.DB_RDONLY)
    # These legacy parameters are decoded but not used by this function.
    bloom_filter_size = int.from_bytes(in_metadata[b'bloom_filter_size'], 'big')
    kmer_size = int.from_bytes(in_metadata[b'kmer_size'], 'big')
    num_hashes = int.from_bytes(in_metadata[b'num_hashes'], 'big')
    colours = pickle.loads(in_metadata[b'colours'])
    num_samples = len(colours)
    ## Create the sample metadata
    # Legacy layout stores one key per colour: "colour0", "colour1", ...
    colour_sample = {}
    for colour in range(num_samples):
        key = "colour%i" % colour
        key = key.encode("utf-8")
        sample_name = in_metadata[key].decode('utf-8')
        colour_sample[colour] = sample_name
    ## Add the sample metadata
    storage = get_storage(config)
    sm = SampleMetadata(storage)
    for colour, sample_name in colour_sample.items():
        if "DELETE" in sample_name:
            # Deleted samples: tombstone the colour with the sentinel name.
            sample_name = "D3L3T3D"
            print(colour, sample_name)
            sm._set_colour_sample(colour, sample_name)
            # NOTE(review): sample_name was just reassigned to "D3L3T3D",
            # so this maps the sentinel (not the original sample name) to
            # -1 — confirm this is the intended tombstone behaviour.
            sm._set_sample_colour(sample_name, -1)
        else:
            sm._set_sample_colour(sample_name, colour)
            sm._set_colour_sample(colour, sample_name)
    sm._set_integer(sm.colour_count_key, num_samples)
    in_metadata.close()
    return num_samples