def test_linear_index_gather():
    # check gather() results on a LinearIndex holding 2, 47 and 63
    path2 = utils.get_test_data('2.fa.sig')
    path47 = utils.get_test_data('47.fa.sig')
    path63 = utils.get_test_data('63.fa.sig')

    ss2 = sourmash.load_one_signature(path2, ksize=31)
    ss47 = sourmash.load_one_signature(path47)
    ss63 = sourmash.load_one_signature(path63)

    lidx = LinearIndex()
    for ss in (ss2, ss47, ss63):
        lidx.insert(ss)

    # querying with 2: a single, perfect match against itself
    found = lidx.gather(ss2)
    assert len(found) == 1
    top = found[0]
    assert top[0] == 1.0
    assert top[1] == ss2

    # querying with 47: itself first, then 63
    found = lidx.gather(ss47)
    assert len(found) == 2
    top, runner_up = found[0], found[1]
    assert top[0] == 1.0
    assert top[1] == ss47
    assert round(runner_up[0], 2) == 0.49
    assert runner_up[1] == ss63
def test_sbt_dayhoff_command_index(c):
    # build an SBT database from dayhoff sigs on the command line,
    # then reload it and query it through the Python API
    sigfile1 = utils.get_test_data(
        'prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
    sigfile2 = utils.get_test_data(
        'prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig')
    db_out = c.output('dayhoff.sbt.zip')

    index_args = ['index', db_out, sigfile1, sigfile2,
                  '--scaled', '100', '-k', '57', '--dayhoff']
    c.run_sourmash(*index_args)

    db2 = sourmash.load_sbt_index(db_out)
    sig1 = sourmash.load_one_signature(sigfile1)
    sig2 = sourmash.load_one_signature(sigfile2)

    # both input minhashes must be recoverable from the database
    stored_minhashes = [stored.minhash for stored in db2.signatures()]
    assert len(stored_minhashes) == 2
    assert sig1.minhash in stored_minhashes
    assert sig2.minhash in stored_minhashes

    # search ...
    search_options = dict(threshold=0.0,
                          ignore_abundance=True,
                          do_containment=False,
                          best_only=False)
    results = db2.search(sig1, **search_options)
    assert len(results) == 2

    # ... and gather
    results = db2.gather(sig2)
    assert results[0][0] == 1.0
def test_sig_intersect_4(c):
    # --abundances-from should carry over the abundances of sig #47
    sig47 = utils.get_test_data('track_abund/47.fa.sig')
    sig63 = utils.get_test_data('track_abund/63.fa.sig')

    c.run_sourmash('sig', 'intersect', '--abundances-from', sig47, sig63)

    # the new signature is written to stdout
    out = c.last_result.out
    actual_intersect_sig = sourmash.load_one_signature(out)

    # build the expected result by hand
    mh47 = sourmash.load_one_signature(sig47).minhash
    mh63 = sourmash.load_one_signature(sig63).minhash

    abund_by_hash = mh47.get_mins(with_abundance=True)

    # hashes present in both signatures
    common_hashes = set(mh63.get_mins())
    common_hashes.intersection_update(abund_by_hash)

    # keep 47's abundance for each shared hash and load into a fresh minhash
    expected_abunds = {h: abund_by_hash[h] for h in common_hashes}
    test_mh = mh47.copy_and_clear()
    test_mh.set_abundances(expected_abunds)

    print(actual_intersect_sig.minhash)
    print(out)

    assert actual_intersect_sig.minhash == test_mh
def test_linear_index_save():
    # signatures saved from a LinearIndex should load back as the same set
    from sourmash import load_signatures

    ss2 = sourmash.load_one_signature(utils.get_test_data('2.fa.sig'),
                                      ksize=31)
    ss47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'))
    ss63 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'))

    linear = LinearIndex()
    for ss in (ss2, ss47, ss63):
        linear.insert(ss)

    with utils.TempDirectory() as location:
        filename = os.path.join(location, 'foo')
        linear.save(filename)

        si = set(load_signatures(filename))
        x = {ss2, ss47, ss63}

        # debugging output, shown by pytest on failure
        for item in (len(si), len(x), si, x):
            print(item)

        assert si == x, si
def test_sbt_gather_threshold_1():
    # exercise SBT.gather() with small hand-built queries and thresholds
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=2)

    sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31)
    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31)
    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31)

    for sig in (sig47, sig63, sig2):
        tree.insert(sig)

    # queries are built from sig2's hashes, a few at a time --
    # note, these signatures all have scaled=1000.
    mins = sorted(sig2.minhash.get_mins())
    new_mh = sig2.minhash.copy_and_clear()

    def check_only_match_is_sig2():
        # an unthresholded gather must return exactly sig2, fully contained
        hits = tree.gather(SourmashSignature(new_mh))
        assert len(hits) == 1
        containment, match_sig, name = hits[0]
        assert containment == 1.0
        assert match_sig == sig2
        assert name is None

    # an empty query finds nothing
    assert not new_mh
    assert not tree.gather(SourmashSignature(new_mh))

    # one hash
    new_mh.add_hash(mins.pop())
    assert len(new_mh) == 1
    check_only_match_is_sig2()

    # with a threshold there should be no results
    results = tree.gather(SourmashSignature(new_mh), threshold_bp=5000)
    assert not results

    # three more hashes => four in total
    for _ in range(3):
        new_mh.add_hash(mins.pop())
    assert len(new_mh) == 4
    check_only_match_is_sig2()

    # the threshold is still too high -> no results
    print('len mh', len(new_mh))
    results = tree.gather(SourmashSignature(new_mh), threshold_bp=5000)
    assert not results
def test_sig_cat_1(c):
    # 'sig cat' of 47 alone should reproduce 47
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'cat', sig47)

    # the signature is written to stdout
    cat_output = c.last_result.out

    expected = sourmash.load_one_signature(sig47)
    actual = sourmash.load_one_signature(cat_output)

    assert actual == expected
def test_import_export_1(c):
    # round trip: what 'sig export' writes must be readable by 'sig import'
    sig_path = utils.get_test_data('genome-s11.fa.gz.sig')
    exported_path = c.output('export.json')

    c.run_sourmash('sig', 'export', sig_path, '-o', exported_path,
                   '-k', '21', '--dna')
    c.run_sourmash('sig', 'import', exported_path)

    original = sourmash.load_one_signature(sig_path, ksize=21,
                                           select_moltype='DNA')
    roundtrip = sourmash.load_one_signature(c.last_result.out)

    assert original.minhash == roundtrip.minhash
def test_sig_extract_1(c):
    # extracting from a file that only holds 47 should give back 47
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'extract', sig47)

    # the extracted signature is written to stdout
    extracted_text = c.last_result.out

    expected = sourmash.load_one_signature(sig47)
    actual = sourmash.load_one_signature(extracted_text)

    assert actual == expected
def test_sig_split_1(c):
    # splitting a one-signature file yields a single output file
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'split', sig47)

    outname = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig'
    assert os.path.exists(c.output(outname))

    expected = sourmash.load_one_signature(sig47)
    actual = sourmash.load_one_signature(c.output(outname))

    assert actual == expected
def test_sig_merge_2(c):
    # merging 47 with nothing else should leave 47's hashes unchanged
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'merge', sig47)

    # the merged signature is written to stdout
    merged_text = c.last_result.out

    expected = sourmash.load_one_signature(sig47)
    merged = sourmash.load_one_signature(merged_text)

    print(merged_text)

    assert merged.minhash == expected.minhash
def test_sig_downsample_1_scaled(c):
    # CLI downsampling of a scaled signature should agree with
    # MinHash.downsample_scaled()
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'downsample', '--scaled', '10000', sig47)

    # the downsampled signature is written to stdout
    cli_output = c.last_result.out

    full_sig = sourmash.load_one_signature(sig47)
    cli_sig = sourmash.load_one_signature(cli_output)

    expected_mh = full_sig.minhash.downsample_scaled(10000)

    assert cli_sig.minhash == expected_mh
def test_import_export_2(c):
    # a mash JSON dump file should be importable.
    # NOTE: msh.json_dump file calculated like so:
    #   mash sketch -s 500 -k 21 ./tests/test-data/genome-s11.fa.gz
    #   mash info -d ./tests/test-data/genome-s11.fa.gz.msh > tests/test-data/genome-s11.fa.gz.msh.json_dump
    #
    reference_path = utils.get_test_data('genome-s11.fa.gz.sig')
    mash_dump_path = utils.get_test_data('genome-s11.fa.gz.msh.json_dump')

    c.run_sourmash('sig', 'import', mash_dump_path)

    imported = sourmash.load_one_signature(c.last_result.out)
    compare = sourmash.load_one_signature(reference_path, ksize=21,
                                          select_moltype='DNA')

    assert imported.minhash == compare.minhash
def test_sig_downsample_2_num(c):
    # CLI downsampling of a num signature should agree with
    # MinHash.downsample_n()
    sigs11 = utils.get_test_data('genome-s11.fa.gz.sig')
    c.run_sourmash('sig', 'downsample', '--num', '500',
                   '-k', '21', '--dna', sigs11)

    # the downsampled signature is written to stdout
    cli_output = c.last_result.out

    full_sig = sourmash.load_one_signature(sigs11, ksize=21,
                                           select_moltype='DNA')
    cli_sig = sourmash.load_one_signature(cli_output)

    expected_mh = full_sig.minhash.downsample_n(500)

    assert cli_sig.minhash == expected_mh
def test_save_zip(tmpdir):
    # load a zipped SBT, save it to a new zip, and check that searching
    # the re-saved tree gives the same answer as the one it came from.
    source_zip = utils.get_test_data("v6.sbt.zip")
    copied_zip = tmpdir.join("v6.sbt.zip")
    resaved_zip = tmpdir.join("new.sbt.zip")

    shutil.copyfile(source_zip, str(copied_zip))

    tree = SBT.load(str(copied_zip), leaf_loader=SigLeaf.load)
    tree.save(str(resaved_zip))
    assert resaved_zip.exists()

    new_tree = SBT.load(str(resaved_zip), leaf_loader=SigLeaf.load)
    assert isinstance(new_tree.storage, ZipStorage)
    assert new_tree.storage.list_sbts() == ['new.sbt.json']

    to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0]))

    def matches_in(sbt):
        # string form of every node found for the query at threshold 0.1
        return {str(s) for s in sbt.find(search_minhashes, to_search, 0.1)}

    print("*" * 60)
    print("{}:".format(to_search))
    old_result = matches_in(tree)
    new_result = matches_in(new_tree)
    print(*new_result, sep="\n")

    assert old_result == new_result
    assert len(new_result) == 2
def subtract(args):
    """
    subtract one or more signatures from another

    Parses its own command line from ``args`` (a list of argument strings),
    loads the single signature named by ``signature_from``, removes from its
    hashes every hash found in any signature loaded from ``subtraction_sigs``,
    and writes a new signature holding the remaining hashes to ``--output``
    (stdout by default).

    Exits with status 1 if any involved signature tracks abundance and
    ``--flatten`` was not given, and with status -1 if no signatures were
    loaded from the subtraction files.
    """
    p = SourmashArgumentParser(prog='sourmash signature subtract')
    p.add_argument('signature_from')
    p.add_argument('subtraction_sigs', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten', action='store_true',
                   help='remove abundance from signatures before subtracting')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)

    moltype = sourmash_args.calculate_moltype(args)
    abundance_error = ('Cannot use subtract on signatures with abundance '
                       'tracking, sorry!')

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile,
                                              ksize=args.ksize,
                                              select_moltype=moltype)

    from_mh = from_sigobj.minhash
    if from_mh.track_abundance and not args.flatten:
        error(abundance_error)
        sys.exit(1)

    # start from every hash in the source signature and whittle it down
    subtract_mins = set(from_mh.get_mins())

    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        for sigobj in sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if sigobj.minhash.track_abundance and not args.flatten:
                error(abundance_error)
                sys.exit(1)

            subtract_mins -= set(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...',
                   sigfile, end='\r')
            total_loaded += 1

    if not total_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)

    # same minhash parameters as the source signature, remaining hashes only
    subtract_mh = from_sigobj.minhash.copy_and_clear()
    subtract_mh.add_many(subtract_mins)

    subtract_sigobj = sourmash.SourmashSignature(subtract_mh)

    # written straight to args.output; the return value is not needed here
    sourmash.save_signatures([subtract_sigobj], fp=args.output)

    notify('loaded and subtracted {} signatures', total_loaded)
def test_sig_merge_3_abund_ab_ok(c):
    # merging 47 and 63 should work when both track abundance
    abund_paths = [utils.get_test_data('track_abund/47.fa.sig'),
                   utils.get_test_data('track_abund/63.fa.sig')]

    c.run_sourmash('sig', 'merge', *abund_paths)

    # nothing is asserted about the content; the output only has to load
    # as a single signature
    sourmash.load_one_signature(c.last_result.out)
def load_or_generate_sig_from_file(input_file, alphabet, ksize, scaled, ignore_abundance=False, translate=False):
    """
    Return a signature for ``input_file``.

    A path ending in ``.sig`` is loaded with ``sourmash.load_one_signature``
    at the requested ``ksize``. Any other path is passed to
    ``try_reading_fasta_file`` and its records are sketched into a fresh
    minhash: nucleotide input (or ``translate=True``) goes through
    ``add_sequence``, everything else through ``add_protein``. The resulting
    signature is named after the file's basename.

    If no records are obtained from a non-``.sig`` file, the empty string
    is returned.
    """
    if input_file.endswith(".sig"):
        # only the first signature matching ksize is used; multiple
        # signatures per file are not handled here
        return sourmash.load_one_signature(input_file, ksize=ksize)

    records = try_reading_fasta_file(input_file)
    mh = determine_appropriate_fresh_minhash(alphabet, ksize, scaled, ignore_abundance)

    if not records:
        return ""

    use_nucleotide = alphabet == "nucleotide" or translate
    for record in records:
        if use_nucleotide:
            mh.add_sequence(record.sequence, force=True)
        else:
            mh.add_protein(record.sequence)

    # the signature takes its name from the input filename
    return sourmash.SourmashSignature(mh, name=os.path.basename(input_file))
def export(args):
    """
    export a signature to mash format

    Parses its own command line from ``args`` (a list of argument strings),
    loads one signature from ``filename`` (filtered by ksize and molecule
    type), and prints a single JSON object to ``--output`` (stdout by
    default) with the keys ``kmer``, ``sketchSize``, ``hashType``,
    ``hashBits``, ``hashSeed`` and ``sketches``.
    """
    p = SourmashArgumentParser(prog='sourmash signature export')
    p.add_argument('filename')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)

    moltype = sourmash_args.calculate_moltype(args)

    ss = sourmash.load_one_signature(args.filename, ksize=args.ksize,
                                     select_moltype=moltype)
    mh = ss.minhash

    # key order is preserved in the emitted JSON
    x = {}
    x['kmer'] = mh.ksize
    x['sketchSize'] = len(mh)
    x['hashType'] = "MurmurHash3_x64_128"
    x['hashBits'] = 64
    x['hashSeed'] = mh.seed
    x['sketches'] = [{'hashes': list(mh.get_mins())}]

    print(json.dumps(x), file=args.output)
    notify("exported signature {} ({})", ss.name(), ss.md5sum()[:8])
def test_sig_filter_3_ksize_select(c):
    # abundance filtering combined with ksize selection
    psw_mag = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
    c.run_sourmash('sig', 'filter', '-m', '2', psw_mag, '-k', '31')

    # the filtered signature is written to stdout
    filtered_sig = sourmash.load_one_signature(c.last_result.out)
    test_sig = sourmash.load_one_signature(psw_mag, ksize=31)

    # expected: only the hashes with abundance of at least 2
    all_abunds = test_sig.minhash.get_mins(True)
    expected = {hashval: count
                for hashval, count in all_abunds.items()
                if count >= 2}
    assert expected

    assert filtered_sig.minhash.get_mins(True) == expected
def read_signature(sigfile, ksize_n):
    """Load one DNA signature from a signature file.

    Thin wrapper around :func:`sourmash.load_one_signature` from sourmash_:
    the file is read with the given k-mer size, molecule type ``DNA`` and
    ``ignore_md5sum=False``.

    :param sigfile: Signature file (MinHash stored in JSON format).
    :param ksize_n: Kmer size.

    :type sigfile: string
    :type ksize_n: integer

    :returns: The object returned by :func:`sourmash.load_one_signature`.

    .. seealso:: This function depends on sourmash python module (https://sourmash.readthedocs.io/en/latest/). Some functions employed are:

        - :func:`sourmash.load_one_signature`

    .. include:: ../../links.inc
    """
    # adapted from https://sourmash.readthedocs.io/en/latest/api-example.html
    return load_one_signature(
        sigfile,
        ksize=ksize_n,
        select_moltype='DNA',
        ignore_md5sum=False,
    )
def test_sig_filter_3(c):
    # basic abundance filtering with a minimum of 2
    sig47 = utils.get_test_data('track_abund/47.fa.sig')
    c.run_sourmash('sig', 'filter', '-m', '2', sig47)

    # the filtered signature is written to stdout
    filtered_sig = sourmash.load_one_signature(c.last_result.out)
    test_sig = sourmash.load_one_signature(sig47)

    # expected: only the hashes with abundance of at least 2
    all_abunds = test_sig.minhash.get_mins(True)
    expected = {hashval: count
                for hashval, count in all_abunds.items()
                if count >= 2}
    assert expected

    assert filtered_sig.minhash.get_mins(True) == expected
def export(args):
    """
    export a signature to mash format

    Expects already-parsed ``args`` providing ``quiet``, ``filename``,
    ``ksize``, ``output`` and the molecule-type options read by
    ``sourmash_args.calculate_moltype``.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    ss = sourmash.load_one_signature(args.filename, ksize=args.ksize,
                                     select_moltype=moltype)
    mh = ss.minhash

    # key order here is the key order of the emitted JSON
    mash_record = {
        'kmer': mh.ksize,
        'sketchSize': len(mh),
        'hashType': "MurmurHash3_x64_128",
        'hashBits': 64,
        'hashSeed': mh.seed,
        'sketches': [{'hashes': list(mh.get_mins())}],
    }

    with FileOutput(args.output, 'wt') as fp:
        print(json.dumps(mash_record), file=fp)

    notify("exported signature {} ({})", ss.name(), ss.md5sum()[:8])
def test_sbt_as_index_signatures():
    # the 'signatures' method from the Index base class should yield
    # every signature that was inserted
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=2)

    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))

    for sig in (sig47, sig63):
        tree.insert(sig)

    stored = list(tree.signatures())
    assert len(stored) == 2
    assert sig47 in stored
    assert sig63 in stored
def test_tree_save_load(n_children):
    # a tree saved to disk and loaded back must give the same search results
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for sig_name in utils.SIG_FILES:
        sig = load_one_signature(utils.get_test_data(sig_name))
        leaf = SigLeaf(os.path.basename(sig_name), sig)
        tree.add_node(leaf)
        # the last leaf added ends up being the query
        to_search = leaf

    def search(sbt):
        # string form of every node found for the query at threshold 0.1
        return {
            str(s)
            for s in sbt.find(search_minhashes, to_search.data, 0.1)
        }

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    old_result = search(tree)
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        saved_path = os.path.join(location, 'demo')
        tree.save(saved_path)
        tree = SBT.load(saved_path, leaf_loader=SigLeaf.load)

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        new_result = search(tree)
        print(*new_result, sep='\n')

        assert old_result == new_result
def test_binary_nary_tree():
    # trees built with different branching factors must hold the same
    # leaves and return the same search results
    factory = GraphFactory(31, 1e5, 4)
    trees = {
        2: SBT(factory),
        5: SBT(factory, d=5),
        10: SBT(factory, d=10),
    }

    n_leaves = 0
    for sig_name in utils.SIG_FILES:
        sig = load_one_signature(utils.get_test_data(sig_name))
        leaf = SigLeaf(os.path.basename(sig_name), sig)
        for tree in trees.values():
            tree.add_node(leaf)
        # the last leaf added ends up being the query
        to_search = leaf
        n_leaves += 1

    for tree in trees.values():
        assert len(list(tree.leaves())) == n_leaves

    print('*' * 60)
    print("{}:".format(to_search.metadata))

    results = {}
    for d, tree in trees.items():
        found = tree.find(search_minhashes, to_search.data, 0.1)
        results[d] = {str(s) for s in found}
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
def main():
    """
    Command-line entry point: load the minhash of each signature file in
    ``sigs``, load the LCA database ``lca_db``, and pass them to
    ``summarize_taxonomic_purity``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('sigs', nargs='+')
    parser.add_argument('lca_db')
    args = parser.parse_args()

    minhashes = [sourmash.load_one_signature(filename).minhash
                 for filename in args.sigs]

    # load the LCA database; only the first one returned is used
    dblist, _ksize, db_scaled = lca_utils.load_databases([args.lca_db], None)
    db = dblist[0]

    # double check scaled requirements, using the first minhash only
    sig_scaled = minhashes[0].scaled
    if db_scaled >= sig_scaled:
        warnings = (
            '** warning: many minhashes will go unclassified because LCA database scaled is {}'.format(db_scaled),
            '** warning: the minhash scaled is {}'.format(sig_scaled),
        )
        for message in warnings:
            print(message, file=sys.stderr)

    summarize_taxonomic_purity(minhashes, db, verbose=True,
                               filenames=args.sigs)
def test_sig_intersect_2(c):
    # intersecting 47 with nothing else should give back 47's hashes
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'intersect', sig47)

    # the new signature is written to stdout
    out = c.last_result.out

    expected = sourmash.load_one_signature(sig47)
    actual = sourmash.load_one_signature(out)

    for shown in (expected.minhash, actual.minhash, out):
        print(shown)

    assert actual.minhash == expected.minhash
def test_sig_extract_4(c):
    # pick out the signature matching 47's name from several inputs
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    c.run_sourmash('sig', 'extract', sig47, sig63, '--name', 'NC_009665.1')

    # the extracted signature is written to stdout
    extracted_text = c.last_result.out

    expected = sourmash.load_one_signature(sig47)
    actual = sourmash.load_one_signature(extracted_text)

    print(expected.minhash)
    print(actual.minhash)

    assert actual == expected
def test_sig_merge_1_multisig(c):
    # merging 47 & 63 should give the union of mins; here both signatures
    # come from the same file.
    multisig = utils.get_test_data('47+63-multisig.sig')
    sig47and63 = utils.get_test_data('47+63.fa.sig')
    c.run_sourmash('sig', 'merge', multisig, '--flatten')

    # the merged signature is written to stdout
    out = c.last_result.out

    expected = sourmash.load_one_signature(sig47and63)
    actual = sourmash.load_one_signature(out)

    for shown in (expected.minhash, actual.minhash, out):
        print(shown)

    assert actual.minhash == expected.minhash
def test_sig_rename_1(c):
    # renaming 47 changes the name and leaves the hashes alone
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'rename', sig47, 'fiz bar')

    # the renamed signature is written to stdout
    renamed_text = c.last_result.out

    before = sourmash.load_one_signature(sig47)
    after = sourmash.load_one_signature(renamed_text)

    print(before.minhash)
    print(after.minhash)

    assert after.minhash == before.minhash
    assert before.name() != after.name()
    assert after.name() == 'fiz bar'
def test_search_db_scaled_lt_sig_scaled():
    # searching an LCA database with a query downsampled to scaled=100000
    # is expected to raise ValueError
    dbfile = utils.get_test_data('lca/47+63.lca.json')
    db, _ksize, _scaled = lca_utils.load_single_database(dbfile)

    query = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'))
    query.minhash = query.minhash.downsample_scaled(100000)

    with pytest.raises(ValueError):
        db.search(query, threshold=.01, ignore_abundance=True)
def test_sig_subtract_1(c):
    # subtracting 63 from 47 leaves the hashes that are only in 47
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    c.run_sourmash('sig', 'subtract', sig47, sig63)

    # the new signature is written to stdout
    out = c.last_result.out

    minuend = sourmash.load_one_signature(sig47)
    subtrahend = sourmash.load_one_signature(sig63)
    actual_subtract_sig = sourmash.load_one_signature(out)

    expected_mins = set(minuend.minhash.get_mins())
    expected_mins -= set(subtrahend.minhash.get_mins())

    assert set(actual_subtract_sig.minhash.get_mins()) == expected_mins
def test_sig_intersect_1(c):
    # intersecting 47 and 63 should match the stored intersection signature
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    sig47and63 = utils.get_test_data('47+63-intersect.fa.sig')
    c.run_sourmash('sig', 'intersect', sig47, sig63)

    # the new signature is written to stdout
    out = c.last_result.out

    expected = sourmash.load_one_signature(sig47and63)
    actual = sourmash.load_one_signature(out)

    for shown in (expected.minhash, actual.minhash, out):
        print(shown)

    assert actual.minhash == expected.minhash
def test_sig_merge_1(c):
    # merging 47 & 63 should match the stored union signature
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    sig47and63 = utils.get_test_data('47+63.fa.sig')
    c.run_sourmash('sig', 'merge', sig47, sig63)

    # the merged signature is written to stdout
    out = c.last_result.out

    expected = sourmash.load_one_signature(sig47and63)
    actual = sourmash.load_one_signature(out)

    for shown in (expected.minhash, actual.minhash, out):
        print(shown)

    assert actual.minhash == expected.minhash
def test_sig_merge_1_ksize_moltype(c):
    # same merge as above, passing explicit ksize and moltype arguments
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    sig47and63 = utils.get_test_data('47+63.fa.sig')
    c.run_sourmash('sig', 'merge', sig47, sig63, '--dna', '-k', '31')

    # the merged signature is written to stdout
    out = c.last_result.out

    expected = sourmash.load_one_signature(sig47and63)
    actual = sourmash.load_one_signature(out)

    for shown in (expected.minhash, actual.minhash, out):
        print(shown)

    assert actual.minhash == expected.minhash
def test_sourmash_signature_api():
    # a saved signature should load back equal to the original through
    # both load_one_signature and load_signatures
    minhash = sourmash.MinHash(n=1, ksize=20)
    sig = sourmash.SourmashSignature(minhash)

    serialized = sourmash.save_signatures([sig])

    via_load_one = sourmash.load_one_signature(serialized)
    via_load_many = list(sourmash.load_signatures(serialized))[0]

    assert via_load_one == sig
    assert via_load_many == sig
def test_sig_merge_flatten_2(c):
    # merging 63 (no abundance) with 47 (abundance) works under --flatten
    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    sig47and63 = utils.get_test_data('47+63.fa.sig')
    c.run_sourmash('sig', 'merge', sig63, sig47abund, '--flatten')

    print(c.last_result)
    out = c.last_result.out

    expected = sourmash.load_one_signature(sig47and63)
    actual = sourmash.load_one_signature(out)

    for shown in (expected.minhash, actual.minhash, out):
        print(shown)

    assert actual.minhash == expected.minhash
def test_sig_intersect_2(c):
    # intersecting the abundance-tracking 47 and 63 should match the
    # non-abundance intersection, i.e. intersect 'flattens'
    sig47 = utils.get_test_data('track_abund/47.fa.sig')
    sig63 = utils.get_test_data('track_abund/63.fa.sig')
    sig47and63 = utils.get_test_data('47+63-intersect.fa.sig')
    c.run_sourmash('sig', 'intersect', sig47, sig63)

    # the new signature is written to stdout
    out = c.last_result.out

    expected = sourmash.load_one_signature(sig47and63)
    actual = sourmash.load_one_signature(out)

    for shown in (expected.minhash, actual.minhash, out):
        print(shown)

    assert actual.minhash == expected.minhash
def test_sig_intersect_2_multisig(c):
    # intersecting everything in the multisig file should leave no hashes
    multisig = utils.get_test_data('47+63-multisig.sig')
    c.run_sourmash('sig', 'intersect', multisig)

    # the new signature is written to stdout
    intersect_text = c.last_result.out
    actual_intersect_sig = sourmash.load_one_signature(intersect_text)

    assert not len(actual_intersect_sig.minhash)
def test_sig_downsample_2_num_to_scaled(c):
    # downsample a num signature and convert it into a scaled one
    sigs11 = utils.get_test_data('genome-s11.fa.gz.sig')
    c.run_sourmash('sig', 'downsample', '--scaled', '10000',
                   '-k', '21', '--dna', sigs11)

    # the new signature is written to stdout
    cli_output = c.last_result.out

    source_sig = sourmash.load_one_signature(sigs11, ksize=21,
                                             select_moltype='DNA')
    cli_sig = sourmash.load_one_signature(cli_output)

    source_mins = source_sig.minhash.get_mins()
    cli_mins = cli_sig.minhash.get_mins()

    # expected: the original hashes that fall below the new max hash
    max_hash = cli_sig.minhash.max_hash
    expected_mins = {hashval for hashval in source_mins if hashval < max_hash}

    assert expected_mins == set(cli_mins)
def test_sig_downsample_1_scaled_to_num(c):
    # downsample a scaled signature to num=500
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'downsample', '--num', '500', sig47)

    # the new signature is written to stdout
    cli_output = c.last_result.out

    cli_sig = sourmash.load_one_signature(cli_output)
    actual_mins = sorted(cli_sig.minhash.get_mins())

    # expected: the 500 smallest hashes of the original
    source_sig = sourmash.load_one_signature(sig47)
    expected_mins = sorted(source_sig.minhash.get_mins())[:500]

    assert actual_mins == expected_mins
def test_sig_rename_2_output_to_same(c):
    # rename a signature "in place": the output file is the input file
    original = utils.get_test_data('47.fa.sig')
    inplace = c.output('inplace.sig')

    # work on a copy so the shared test data stays untouched
    shutil.copyfile(original, inplace)
    print(inplace)

    c.run_sourmash('sig', 'rename', '-d', inplace, 'fiz bar', '-o', inplace)

    renamed = sourmash.load_one_signature(inplace)
    assert renamed.name() == 'fiz bar'
def test_sig_subtract_1_multisig(c):
    # subtracting everything in the multisig file from 47 leaves nothing
    sig47 = utils.get_test_data('47.fa.sig')
    multisig = utils.get_test_data('47+63-multisig.sig')
    c.run_sourmash('sig', 'subtract', sig47, multisig, '--flatten')

    # the new signature is written to stdout
    subtract_text = c.last_result.out
    actual_subtract_sig = sourmash.load_one_signature(subtract_text)

    remaining = set(actual_subtract_sig.minhash.get_mins())
    assert not remaining
def test_sig_flatten_1(c):
    # flattening the abundance-tracking 47 (selected by name) should give
    # the same minhash as the plain 47 signature
    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'flatten', sig47abund, '--name', 'Shewanella')

    # the new signature is written to stdout
    flattened = list(sourmash.load_signatures(c.last_result.out))
    assert len(flattened) == 1

    expected = sourmash.load_one_signature(sig47)
    assert expected.minhash == flattened[0].minhash
def overlap(args):
    """
    provide detailed comparison of two signatures

    Parses its own command line from ``args`` (a list of argument strings),
    loads one signature from each of ``signature1`` and ``signature2``
    (filtered by ksize and molecule type) and prints a report to stdout:
    per-signature details, similarity, containment in both directions, and
    hash counts (each side, in common, only in one side, union).

    The ksize, molecule, num and scaled values shown for both signatures
    are all read from the first signature.

    A ``ValueError`` raised by ``similarity()`` propagates to the caller.
    """
    p = SourmashArgumentParser(prog='sourmash signature overlap')
    p.add_argument('signature1')
    p.add_argument('signature2')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)

    moltype = sourmash_args.calculate_moltype(args)

    sig1 = sourmash.load_one_signature(args.signature1, ksize=args.ksize,
                                       select_moltype=moltype)
    sig2 = sourmash.load_one_signature(args.signature2, ksize=args.ksize,
                                       select_moltype=moltype)

    notify('loaded one signature each from {} and {}', args.signature1,
           args.signature2)

    similarity = sig1.similarity(sig2)
    cont1 = sig1.contained_by(sig2)
    cont2 = sig2.contained_by(sig1)

    # molecule type shown in the report comes from the loaded signature,
    # not from the command-line selection
    moltype = 'DNA'
    if sig1.minhash.is_protein:
        moltype = 'protein'

    hashes_1 = set(sig1.minhash.get_mins())
    hashes_2 = set(sig2.minhash.get_mins())

    # NOTE(review): the line breaks and column spacing of this template
    # were reconstructed -- confirm against the expected CLI output.
    report = '''\
first signature:
  signature filename: {sig1_file}
  signature: {name1}
  md5: {md5_1}
  k={ksize} molecule={moltype} num={num} scaled={scaled}

second signature:
  signature filename: {sig2_file}
  signature: {name2}
  md5: {md5_2}
  k={ksize} molecule={moltype} num={num} scaled={scaled}

similarity:                  {similarity:.5f}
first contained in second:   {cont1:.5f}
second contained in first:   {cont2:.5f}

number of hashes in first:   {size1}
number of hashes in second:  {size2}

number of hashes in common:  {num_common}
only in first:               {disjoint_1}
only in second:              {disjoint_2}
total (union):               {num_union}
'''

    # explicit keywords (rather than **locals()) keep the template's field
    # names independent of how the locals above happen to be named
    print(report.format(
        sig1_file=args.signature1,
        sig2_file=args.signature2,
        name1=sig1.name(),
        name2=sig2.name(),
        md5_1=sig1.md5sum(),
        md5_2=sig2.md5sum(),
        ksize=sig1.minhash.ksize,
        moltype=moltype,
        num=sig1.minhash.num,
        scaled=sig1.minhash.scaled,
        similarity=similarity,
        cont1=cont1,
        cont2=cont2,
        size1=len(sig1.minhash),
        size2=len(sig2.minhash),
        num_common=len(hashes_1.intersection(hashes_2)),
        disjoint_1=len(hashes_1 - hashes_2),
        disjoint_2=len(hashes_2 - hashes_1),
        num_union=len(hashes_1.union(hashes_2)),
    ))