Ejemplo n.º 1
0
def test_create():
    """Build a one-sample BIGSI per config and verify its core metadata."""
    for config in CONFIGS:
        # Start from a clean storage backend for every config.
        get_storage(config).delete_all()
        blooms = [BIGSI.bloom(config, ["ATC", "ATA"])]
        index = BIGSI.build(config, blooms, ["1"])
        assert index.kmer_size == 3
        assert index.bloomfilter_size == 1000
        assert index.num_hashes == 3
        assert index.num_samples == 1
        # Single sample -> single-bit presence vector per kmer.
        assert index.lookup("ATC") == {"ATC": bitarray("1")}
        assert index.colour_to_sample(0) == "1"
        assert index.sample_to_colour("1") == 0
        index.delete()
Ejemplo n.º 2
0
def test_unique_sample_names():
    """Inserting a duplicate sample name must raise and leave the index intact."""
    for config in CONFIGS:
        get_storage(config).delete_all()
        bloom = BIGSI.bloom(config, ["ATC", "ATA"])
        index = BIGSI.build(config, [bloom], ["1"])
        # Re-inserting under an existing sample name is rejected.
        with pytest.raises(ValueError):
            index.insert(bloom, "1")
        assert index.num_samples == 1
        expected = {
            "ATC": bitarray("1"),
            "ATA": bitarray("1"),
            "ATT": bitarray("0"),
        }
        assert index.lookup(["ATC", "ATA", "ATT"]) == expected
        index.delete()
Ejemplo n.º 3
0
def test_large_build_cmd_success(num_rows: int, byte_values1: List[int],
                                 byte_values2: List[int]):
    """End-to-end check: large_build must persist exactly the rows that
    merge_blooms produces for the same inputs."""
    num_cols1 = math.floor(len(byte_values1) * 8 / num_rows)
    num_cols2 = math.floor(len(byte_values2) * 8 / num_rows)

    # Materialise both byte lists as bit arrays to write to disk.
    bit_arrays = []
    for values in (byte_values1, byte_values2):
        bits = bitarray()
        bits.frombytes(bytes(values))
        bit_arrays.append(bits)

    with NamedTemporaryFile() as tmp_in_1, NamedTemporaryFile() as tmp_in_2, \
            NamedTemporaryFile() as tmp_db:
        for handle, bits in zip((tmp_in_1, tmp_in_2), bit_arrays):
            bits.tofile(handle)
            handle.flush()  # make the bytes visible to readers by path

        input_paths = [tmp_in_1.name, tmp_in_2.name]
        cols = [num_cols1, num_cols2]
        config = _get_bigsi_index_config(num_rows, tmp_db.name)
        large_build(config, input_paths, cols, ["s1", "s2"])

        storage = get_storage(config)
        with NamedTemporaryFile() as merged_write:
            # Independently merge the same inputs as the reference result.
            merge_blooms(zip(input_paths, cols), num_rows, merged_write.name)
            with open(merged_write.name, "rb") as merged_read:
                reader = BitMatrixReader(merged_read, num_rows,
                                         num_cols1 + num_cols2)
                for index, row in enumerate(reader):
                    stored = storage.get_bitarray(index).tobytes()
                    assert stored == row.tobytes()
Ejemplo n.º 4
0
def large_build(config: str, input_path_list: List[str],
                num_cols_list: List[int], sample_list: List[str]):
    """Stream merged bloom-filter rows into storage in batches, then record
    sample names and index metadata.

    Rows are read one at a time from a BitMatrixGroupReader over the input
    files and flushed every DB_INSERT_BATCH_SIZE rows to bound memory use.
    """
    storage = get_storage(config)
    num_rows = int(config["m"])

    with BitMatrixGroupReader(zip(input_path_list, num_cols_list),
                              num_rows) as reader:
        pending_keys = []
        pending_rows = []
        for row_index in range(num_rows):
            pending_keys.append(row_index)
            pending_rows.append(next(reader))
            if len(pending_keys) == DB_INSERT_BATCH_SIZE:
                storage.set_bitarrays(pending_keys, pending_rows)
                storage.sync()
                pending_keys, pending_rows = [], []
        # Flush any partial final batch.
        if pending_keys:
            storage.set_bitarrays(pending_keys, pending_rows)
            storage.sync()

    SampleMetadata(storage).add_samples(sample_list)
    storage.set_integer(BLOOM_FILTERS_SIZE_KEY, num_rows)
    storage.set_integer(NUM_HASH_FUNCTIONS_KEY, int(config["h"]))
    storage.set_integer(NUM_ROWS_KEY, num_rows)
    storage.set_integer(NUM_COLS_KEY, sum(num_cols_list))
    storage.sync()
    storage.close()
Ejemplo n.º 5
0
def convert_metadata(infile, config):
    """Copy sample metadata from a legacy Berkeley-DB index into the storage
    named by *config*.

    Samples whose name contains "DELETE" are mapped to the tombstone name
    "D3L3T3D" with colour -1. Returns the number of samples found.
    """
    legacy = db.DB()
    legacy.set_cachesize(4, 0)
    legacy.open(infile + "/metadata", flags=db.DB_RDONLY)
    # These header values are read from the legacy store (big-endian ints).
    bloom_filter_size = int.from_bytes(legacy[b'bloom_filter_size'], 'big')
    kmer_size = int.from_bytes(legacy[b'kmer_size'], 'big')
    num_hashes = int.from_bytes(legacy[b'num_hashes'], 'big')
    colours = pickle.loads(legacy[b'colours'])
    num_samples = len(colours)

    ## Read the colour -> sample-name mapping out of the legacy store.
    colour_sample = {}
    for colour in range(num_samples):
        legacy_key = ("colour%i" % colour).encode("utf-8")
        colour_sample[colour] = legacy[legacy_key].decode('utf-8')

    ## Write the sample metadata into the new storage.
    sm = SampleMetadata(get_storage(config))
    for colour, sample_name in colour_sample.items():
        if "DELETE" in sample_name:
            sample_name = "D3L3T3D"
            print(colour, sample_name)
            sm._set_colour_sample(colour, sample_name)
            sm._set_sample_colour(sample_name, -1)
        else:
            sm._set_sample_colour(sample_name, colour)
            sm._set_colour_sample(colour, sample_name)
    sm._set_integer(sm.colour_count_key, num_samples)
    legacy.close()
    return num_samples
Ejemplo n.º 6
0
def get_test_storages():
    """Return a storage instance for each entry in CONFIGS.

    Backends that fail to initialise are skipped with a warning rather than
    aborting the whole run, so tests can proceed with whatever storage
    engines are available.
    """
    test_storages = []
    for config in CONFIGS:
        try:
            test_storages.append(get_storage(config))
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; narrow to Exception so interrupts still propagate.
        except Exception:
            # Lazy %-style args avoid formatting when the level is disabled.
            logger.warning("Skipping %s", config["storage-engine"])
    return test_storages
Ejemplo n.º 7
0
def test_merge():
    """Merging two single-sample indexes must equal building both at once."""
    for config in CONFIGS:
        get_storage(config).delete_all()
    config = CONFIGS[0]
    bloom1 = BIGSI.bloom(config, seq_to_kmers("ATACACAAT", config["k"]))
    bloom2 = BIGSI.bloom(config, seq_to_kmers("ATACACAAC", config["k"]))

    target = BIGSI.build(CONFIGS[0], [bloom1], ["a"])
    other = BIGSI.build(CONFIGS[1], [bloom2], ["b"])
    # Reference: both samples built into a single index up front.
    combined = BIGSI.build(CONFIGS[2], [bloom1, bloom2], ["a", "b"])

    target.merge(other)

    assert target.search("ATACACAAT", 0.5) == combined.search("ATACACAAT", 0.5)
    target.delete()
    other.delete()
    combined.delete()
Ejemplo n.º 8
0
 def __init__(self, config=None):
     """Initialise the index from *config*, defaulting to DEFAULT_CONFIG.

     Sets up storage, then initialises the SampleMetadata and
     KmerSignatureIndex bases against the same storage instance.
     """
     self.config = DEFAULT_CONFIG if config is None else config
     self.storage = get_storage(self.config)
     SampleMetadata.__init__(self, self.storage)
     KmerSignatureIndex.__init__(self, self.storage)
     ## TODO this can be inferred and set at build time
     self.min_unique_kmers_in_query = MIN_UNIQUE_KMERS_IN_QUERY
     self.scorer = Scorer(self.num_samples)
Ejemplo n.º 9
0
def test_insert():
    """Insert a second sample and verify colours and lookup bit-vectors."""
    for config in CONFIGS:
        get_storage(config).delete_all()
        first_blooms = [BIGSI.bloom(config, ["ATC", "ATA"])]
        index = BIGSI.build(config, first_blooms, ["1"])
        second_bloom = BIGSI.bloom(config, ["ATC", "ATT"])
        index.insert(second_bloom, "2")
        assert index.kmer_size == 3
        assert index.bloomfilter_size == 1000
        assert index.num_hashes == 3
        assert index.num_samples == 2
        # Two samples -> two-bit presence vectors, ordered by colour.
        expected = {
            "ATC": bitarray("11"),
            "ATA": bitarray("10"),
            "ATT": bitarray("01"),
        }
        assert index.lookup(["ATC", "ATA", "ATT"]) == expected
        assert index.colour_to_sample(0) == "1"
        assert index.sample_to_colour("1") == 0
        assert index.colour_to_sample(1) == "2"
        assert index.sample_to_colour("2") == 1
        index.delete()
Ejemplo n.º 10
0
def test_exact_search():
    """Exact search must hit the right sample and miss absent sequences."""
    config = CONFIGS[0]
    bloom1 = BIGSI.bloom(config, seq_to_kmers("ATACACAAT", config["k"]))
    bloom2 = BIGSI.bloom(config, seq_to_kmers("ACAGAGAAC", config["k"]))
    exact_hit = {
        "percent_kmers_found": 100,
        "num_kmers": 6,
        "num_kmers_found": 6,
    }
    for config in CONFIGS:
        get_storage(config).delete_all()
        index = BIGSI.build(config, [bloom1, bloom2], ["a", "b"])
        assert index.search("ATACACAAT")[0] == dict(exact_hit,
                                                    sample_name="a")
        assert index.search("ACAGAGAAC")[0] == dict(exact_hit,
                                                    sample_name="b")
        # A sequence present in neither sample yields no hits.
        assert index.search("ACAGTTAAC") == []
        index.delete()
Ejemplo n.º 11
0
def convert_index(infile, config, num_samples):
    """Copy the kmer-signature bit matrix from a legacy Berkeley-DB graph
    into the storage backend named by *config*."""
    legacy_graph = db.DB()
    legacy_graph.set_cachesize(4, 0)
    legacy_graph.open(infile + "/graph", flags=db.DB_RDONLY)

    # Create the kmer signature index in the new storage.
    storage = get_storage(config)
    storage.set_integer(BLOOMFILTER_SIZE_KEY, config["m"])
    storage.set_integer(NUM_HASH_FUNCTS_KEY, config["h"])
    BitMatrix.create(
        storage=storage,
        rows=get_rows(legacy_graph, config["m"]),
        num_rows=config["m"],
        num_cols=num_samples,
    )
    legacy_graph.close()
Ejemplo n.º 12
0
 def build(cls, config, bloomfilters, samples):
     """Build a new index from *bloomfilters* labelled by *samples*.

     Validates the inputs, writes sample metadata and the kmer signature
     index into storage, then reopens the index via the normal constructor.
     """
     storage = get_storage(config)
     validate_build_params(bloomfilters, samples)
     logger.debug("Insert sample metadata")
     # Called for its side effect on storage; the returned object was
     # previously bound to an unused local (`sm`) — dropped.
     SampleMetadata(storage).add_samples(samples)
     logger.debug("Create signature index")
     # Likewise called for its side effect (unused `ksi` binding removed).
     KmerSignatureIndex.create(
         storage,
         bloomfilters,
         config["m"],
         config["h"],
         config.get("low_mem_build", False),
     )
     storage.close()  ## Need to delete LOCK files before re init
     return cls(config)
Ejemplo n.º 13
0
def migrate(mapping_filepath, storage_engine, storage_filename=None):
    """Rename samples in the index per a pickled old-id -> new-id mapping.

    For each old id with a different new id, the colour is re-pointed at the
    new name and the old name is tombstoned with colour -1.

    NOTE: *mapping_filepath* is unpickled — only use trusted mapping files.
    """
    config = determine_config(storage_engine, storage_filename)
    storage = get_storage(config)
    current_metadata = SampleMetadata(storage)
    with open(mapping_filepath, 'rb') as infile:
        mapping = pickle.load(infile)

    for old_id in mapping:
        new_id = mapping.get(old_id)
        if new_id and new_id != old_id:
            colour = current_metadata.sample_to_colour(old_id)
            # BUG FIX: was `if colour:`, which skipped colour 0 — a valid
            # colour (the first sample added gets colour 0 elsewhere in this
            # codebase), so the first sample could never be migrated.
            if colour is not None:
                current_metadata._validate_sample_name(new_id)
                current_metadata._set_sample_colour(new_id, colour)
                current_metadata._set_colour_sample(colour, new_id)
                # Tombstone the old name.
                current_metadata._set_sample_colour(old_id, -1)

    storage.sync()
    storage.close()
Ejemplo n.º 14
0
 def delete(self, config: hug.types.text = None):
     """Wipe every record from the storage backend named by *config*."""
     resolved = get_config_from_file(config)
     storage = get_storage(resolved)
     storage.delete_all()