Beispiel #1
0
def test_unique_sample_names():
    """Registering the same sample name twice must raise ``ValueError``.

    The check is repeated for every backend yielded by ``get_storages()``.
    """
    duplicate_name = "sample_name"
    for backend in get_storages():
        # Clear the backend first so the initial insert is the only entry.
        backend.delete_all()
        metadata = SampleMetadata(storage=backend)
        metadata.add_sample(duplicate_name)
        # The second insert of the same name is the one expected to fail.
        with pytest.raises(ValueError):
            metadata.add_sample(duplicate_name)
Beispiel #2
0
 def __init__(self, config=None):
     """Initialise the instance from ``config``.

     When ``config`` is ``None`` the module-level ``DEFAULT_CONFIG`` is used.
     The storage returned by ``get_storage`` is shared by the
     ``SampleMetadata`` and ``KmerSignatureIndex`` initialisers.
     """
     effective_config = DEFAULT_CONFIG if config is None else config
     self.config = effective_config
     self.storage = get_storage(effective_config)
     # Run both initialisers explicitly against the same storage object.
     SampleMetadata.__init__(self, self.storage)
     KmerSignatureIndex.__init__(self, self.storage)
     ## TODO this can be inferred and set at build time
     self.min_unique_kmers_in_query = MIN_UNIQUE_KMERS_IN_QUERY
     # The scorer is constructed with the current number of samples.
     self.scorer = Scorer(self.num_samples)
Beispiel #3
0
def large_build(config: dict, input_path_list: List[str],
                num_cols_list: List[int], sample_list: List[str]) -> None:
    """Build an index from bit-matrix files, inserting rows in batches.

    Rows are pulled one at a time from a ``BitMatrixGroupReader`` and written
    to storage in groups of ``DB_INSERT_BATCH_SIZE``; each group is synced as
    soon as it is written. Afterwards the sample names and the index
    dimensions are recorded and the storage is closed.

    Args:
        config: Build configuration. It is passed to ``get_storage`` and is
            indexed with ``"m"`` (stored as both the bloom filter size and
            the number of rows) and ``"h"`` (number of hash functions); both
            values must be convertible to ``int``.
        input_path_list: Paths of the input bit matrices.
        num_cols_list: Column count of each input, paired positionally with
            ``input_path_list``.
        sample_list: Sample names handed to ``SampleMetadata.add_samples``.

    Raises:
        KeyError: If ``config`` lacks ``"m"`` or ``"h"``.
    """
    storage = get_storage(config)
    try:
        num_rows = int(config["m"])

        with BitMatrixGroupReader(zip(input_path_list, num_cols_list),
                                  num_rows) as bmgr:
            keys = []
            bit_arrays = []
            for row_index in range(num_rows):
                keys.append(row_index)
                bit_arrays.append(next(bmgr))
                # Flush a full batch so memory use stays bounded.
                if len(keys) == DB_INSERT_BATCH_SIZE:
                    storage.set_bitarrays(keys, bit_arrays)
                    storage.sync()
                    keys = []
                    bit_arrays = []
            # Write whatever is left over after the last full batch.
            if keys:
                storage.set_bitarrays(keys, bit_arrays)
                storage.sync()

        SampleMetadata(storage).add_samples(sample_list)
        storage.set_integer(BLOOM_FILTERS_SIZE_KEY, num_rows)
        storage.set_integer(NUM_HASH_FUNCTIONS_KEY, int(config["h"]))
        storage.set_integer(NUM_ROWS_KEY, num_rows)
        storage.set_integer(NUM_COLS_KEY, sum(num_cols_list))
        storage.sync()
    finally:
        # Close the storage even when the build fails part-way through.
        storage.close()
Beispiel #4
0
def test_add_sample_metadata():
    """Samples receive consecutive colours and are retrievable both ways.

    Runs against every backend yielded by ``get_storages()``: the first
    sample added is expected at colour 0 and the second at colour 1.
    """
    first_name, second_name = "sample_name", "sample_name2"
    for backend in get_storages():
        backend.delete_all()
        metadata = SampleMetadata(storage=backend)

        # First sample: bulk lookups, count, and existence checks.
        metadata.add_sample(first_name)
        assert metadata.samples_to_colours([first_name]) == {first_name: 0}
        assert metadata.colours_to_samples([0]) == {0: first_name}
        assert metadata.num_samples == 1
        assert metadata.sample_name_exists(first_name)
        assert not metadata.sample_name_exists(second_name)

        # Second sample: single-item lookups and the updated count.
        metadata.add_sample(second_name)
        assert metadata.sample_to_colour(second_name) == 1
        assert metadata.colour_to_sample(1) == second_name
        assert metadata.num_samples == 2
Beispiel #5
0
def test_delete_sample():
    """Deleting a sample drops its name lookup and tombstones its colour.

    The whole scenario runs inside the loop so that every backend yielded by
    ``get_storages()`` is verified, not only the last one.
    """
    for storage in get_storages():
        ## Add 2 samples
        storage.delete_all()
        sm = SampleMetadata(storage=storage)
        sample_name1 = "sample_name"
        sm.add_sample(sample_name1)
        sample_name2 = "sample_name2"
        sm.add_sample(sample_name2)

        ## Ensure both samples are there
        assert sm.samples_to_colours([sample_name1, sample_name2]) == {
            sample_name1: 0,
            sample_name2: 1,
        }
        assert sm.colours_to_samples([0, 1]) == {
            0: sample_name1,
            1: sample_name2,
        }

        ## Delete one sample
        sm.delete_sample(sample_name1)

        ## Ensure only one sample is returned; the deleted sample's colour
        ## slot is expected to hold the "D3L3T3D" placeholder name.
        assert sm.samples_to_colours([sample_name1, sample_name2]) == {
            sample_name2: 1
        }
        assert sm.colours_to_samples([0, 1]) == {
            0: "D3L3T3D",
            1: sample_name2,
        }
Beispiel #6
0
 def build(cls, config, bloomfilters, samples):
     """Create a new index in storage and return an instance opened on it.

     Args:
         config: Configuration passed to ``get_storage``; it is indexed with
             ``"m"`` and ``"h"`` and may carry an optional
             ``"low_mem_build"`` flag (treated as ``False`` when absent).
         bloomfilters: Bloom filters handed to ``KmerSignatureIndex.create``.
         samples: Sample names handed to ``SampleMetadata.add_samples``.

     Returns:
         ``cls(config)``, constructed after the build storage is closed.
     """
     storage = get_storage(config)
     try:
         validate_build_params(bloomfilters, samples)
         logger.debug("Insert sample metadata")
         SampleMetadata(storage).add_samples(samples)
         logger.debug("Create signature index")
         KmerSignatureIndex.create(
             storage,
             bloomfilters,
             config["m"],
             config["h"],
             config.get("low_mem_build", False),
         )
     finally:
         ## Need to delete LOCK files before re init; closing here also
         ## releases the storage when validation or the build itself fails.
         storage.close()
     return cls(config)
Beispiel #7
0
def migrate(mapping_filepath, storage_engine, storage_filename=None):
    """Rename samples in an existing store using a pickled id mapping.

    For each ``old_id -> new_id`` pair where ``new_id`` is truthy and differs
    from ``old_id``, the colour currently assigned to ``old_id`` is
    re-pointed at ``new_id`` and ``old_id`` is set to colour ``-1``.
    Entries whose ``old_id`` has no colour are skipped.

    Args:
        mapping_filepath: Path to a pickle file holding the mapping.
        storage_engine: Passed to ``determine_config``.
        storage_filename: Passed to ``determine_config``; optional.
    """
    config = determine_config(storage_engine, storage_filename)
    storage = get_storage(config)
    try:
        current_metadata = SampleMetadata(storage)
        # NOTE(security): unpickling can execute arbitrary code; only run
        # this against mapping files from a trusted source.
        with open(mapping_filepath, 'rb') as infile:
            mapping = pickle.load(infile)

        for old_id, new_id in mapping.items():
            if not new_id or new_id == old_id:
                continue
            colour = current_metadata.sample_to_colour(old_id)
            # Compare against None rather than truthiness: colour 0 is a
            # valid colour and must be migrated like any other.
            if colour is None:
                continue
            current_metadata._validate_sample_name(new_id)
            current_metadata._set_sample_colour(new_id, colour)
            current_metadata._set_colour_sample(colour, new_id)
            current_metadata._set_sample_colour(old_id, -1)

        storage.sync()
    finally:
        # Close the storage even if reading the mapping or a rename fails.
        storage.close()
def convert_metadata(infile, config):
    """Copy sample metadata from a legacy ``metadata`` DB into new storage.

    The legacy database at ``infile + "/metadata"`` is opened read-only. The
    name stored under each ``colour<N>`` key is read first; only then is the
    target storage opened and populated. Names containing ``"DELETE"`` are
    written as the ``"D3L3T3D"`` placeholder with colour ``-1``.

    Args:
        infile: Directory containing the legacy ``metadata`` database.
        config: Configuration passed to ``get_storage`` for the target store.

    Returns:
        The number of colours recorded in the legacy metadata.
    """
    in_metadata = db.DB()
    try:
        in_metadata.set_cachesize(4, 0)
        in_metadata.open(infile + "/metadata", flags=db.DB_RDONLY)
        # These three values are not used below; the reads are kept so a
        # store lacking the keys fails here exactly as it did before.
        bloom_filter_size = int.from_bytes(in_metadata[b'bloom_filter_size'],
                                           'big')
        kmer_size = int.from_bytes(in_metadata[b'kmer_size'], 'big')
        num_hashes = int.from_bytes(in_metadata[b'num_hashes'], 'big')
        # NOTE(security): unpickling can execute arbitrary code; only
        # convert databases from a trusted source.
        colours = pickle.loads(in_metadata[b'colours'])
        num_samples = len(colours)

        ## Create the sample metadata
        colour_sample = {
            colour: in_metadata[("colour%i" % colour).encode("utf-8")]
            .decode('utf-8')
            for colour in range(num_samples)
        }

        ## Add the sample metadata
        storage = get_storage(config)
        sm = SampleMetadata(storage)

        for colour, sample_name in colour_sample.items():
            if "DELETE" in sample_name:
                # Deleted samples keep their colour slot under a placeholder.
                sample_name = "D3L3T3D"
                print(colour, sample_name)
                sm._set_colour_sample(colour, sample_name)
                sm._set_sample_colour(sample_name, -1)
            else:
                sm._set_sample_colour(sample_name, colour)
                sm._set_colour_sample(colour, sample_name)
        sm._set_integer(sm.colour_count_key, num_samples)
    finally:
        # Release the legacy DB handle even if a read or write fails.
        in_metadata.close()
    return num_samples