def test_linear_index_gather():
    """Check LinearIndex.gather() results for two different queries.

    The index holds the '2', '47' and '63' test signatures; the test
    asserts how many matches each query yields and what their scores are.
    """
    # Only the '2.fa.sig' load is restricted with ksize=31; the other two
    # files are loaded without a ksize argument.
    sig_a = sourmash.load_one_signature(utils.get_test_data('2.fa.sig'),
                                        ksize=31)
    sig_b = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'))
    sig_c = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'))

    index = LinearIndex()
    for sig in (sig_a, sig_b, sig_c):
        index.insert(sig)

    # Querying with the '2' signature: exactly one hit, itself, scored 1.0.
    hits = index.gather(sig_a)
    assert len(hits) == 1
    only = hits[0]
    assert only[0] == 1.0
    assert only[1] == sig_a

    # Querying with the '47' signature: itself first (1.0), then '63'
    # with a score that rounds to 0.49.
    hits = index.gather(sig_b)
    assert len(hits) == 2
    best, runner_up = hits[0], hits[1]
    assert best[0] == 1.0
    assert best[1] == sig_b
    assert round(runner_up[0], 2) == 0.49
    assert runner_up[1] == sig_c
def test_linear_gather_threshold_1():
    """Exercise LinearIndex.gather() with very small queries.

    Queries are built from 0, 1 and 4 hashes taken from the '2' signature,
    and each is checked both without a threshold and with threshold_bp=5000.
    """
    sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31)
    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31)
    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31)

    index = LinearIndex()
    for sig in (sig47, sig63, sig2):
        index.insert(sig)

    # Build query signatures holding a specific number of hashes.
    # Note (carried over from the original test): these signatures all
    # have scaled=1000.
    available = sorted(sig2.minhash.get_mins())
    query_mh = sig2.minhash.copy_and_clear()

    # An empty query is falsy and gathers nothing.
    assert not query_mh
    assert not index.gather(SourmashSignature(query_mh))

    # One hash: a single, complete match against sig2.
    query_mh.add_hash(available.pop())
    assert len(query_mh) == 1

    hits = index.gather(SourmashSignature(query_mh))
    assert len(hits) == 1
    score, found, label = hits[0]
    assert score == 1.0
    assert found == sig2
    # The third element is expected to be None here (this index was
    # created without a filename argument).
    assert label is None

    # With threshold_bp=5000 the one-hash query should yield no results.
    assert not index.gather(SourmashSignature(query_mh), threshold_bp=5000)

    # Three more hashes bring the query to four in total.
    for _ in range(3):
        query_mh.add_hash(available.pop())
    assert len(query_mh) == 4

    hits = index.gather(SourmashSignature(query_mh))
    assert len(hits) == 1
    score, found, label = hits[0]
    assert score == 1.0
    assert found == sig2
    assert label is None

    # The threshold is still too high for four hashes -> no results.
    assert not index.gather(SourmashSignature(query_mh), threshold_bp=5000)
def test_linear_gather_threshold_5():
    """Exercise LinearIndex.gather() with a query that clears threshold_bp.

    The index is created with filename='foo', and the test expects that
    value back as the third element of each result.
    """
    sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31)
    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31)
    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31)

    index = LinearIndex(filename='foo')
    for sig in (sig47, sig63, sig2):
        index.insert(sig)

    # Build a query signature holding a specific number of hashes.
    # Note (carried over from the original test): these signatures all
    # have scaled=1000.
    available = sorted(sig2.minhash.get_mins())
    query_mh = sig2.minhash.copy_and_clear()

    # Five rounds of five add_hash() calls, i.e. 25 hashes popped from
    # the top of the sorted list.
    for _ in range(5 * 5):
        query_mh.add_hash(available.pop())

    # Without a threshold, any match at all is returned.
    hits = index.gather(SourmashSignature(query_mh))
    assert len(hits) == 1
    score, found, label = hits[0]
    assert score == 1.0
    assert found == sig2
    assert label == 'foo'

    # With a threshold_bp the query can meet, the same match comes back.
    hits = index.gather(SourmashSignature(query_mh), threshold_bp=5000)
    assert len(hits) == 1
    score, found, label = hits[0]
    assert score == 1.0
    assert found == sig2
    assert label == 'foo'