def test_jaccard_on_real_data():
    """Compare two real genome sketches, then repeat after downsampling.

    The comparison is checked in both directions at every sketch size.
    """
    from sourmash.signature import load_signatures

    genome_files = (
        'n10000/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz',
        'n10000/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz',
    )
    mh1, mh2 = (
        list(load_signatures(utils.get_test_data(name)))[0].minhash
        for name in genome_files
    )

    # sketches exactly as stored on disk
    assert mh1.compare(mh2) == 0.0183
    assert mh2.compare(mh1) == 0.0183

    # each smaller sketch is derived from the previous (larger) one
    for num, expected in ((1000, 0.011), (100, 0.01), (10, 0.0)):
        mh1 = mh1.downsample_n(num)
        mh2 = mh2.downsample_n(num)
        assert mh1.compare(mh2) == expected
        assert mh2.compare(mh1) == expected
def test_scaled_on_real_data_2():
    """Compare two scaled=100 genome sketches, then repeat after downsampling.

    Results are rounded before comparison; the number of digits kept shrinks
    as the scaled value grows.
    """
    from sourmash.signature import load_signatures

    genome_files = (
        'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz',
        'scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz',
    )
    mh1, mh2 = (
        list(load_signatures(utils.get_test_data(name)))[0].minhash
        for name in genome_files
    )

    # sketches exactly as stored on disk
    assert round(mh1.compare(mh2), 5) == 0.01644
    assert round(mh2.compare(mh1), 5) == 0.01644

    # (scaled value, digits kept, expected rounded comparison)
    steps = (
        (1000, 4, 0.0187),
        (10000, 3, 0.01),
        (100000, 2, 0.01),
    )
    for scaled, digits, expected in steps:
        mh1 = mh1.downsample_scaled(scaled)
        mh2 = mh2.downsample_scaled(scaled)
        assert round(mh1.compare(mh2), digits) == expected
        assert round(mh2.compare(mh1), digits) == expected
def test_do_sourmash_compute_name():
    """`compute --merge foo` and `compute --name foo` both name the output.

    Each invocation writes its own signature file; the first signature
    loaded from each file must be named 'foo'.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('short.fa')
        status, out, err = utils.runscript('sourmash',
                                           ['compute', '-k', '31',
                                            '--merge', 'foo',
                                            testdata1, '-o', 'foo.sig'],
                                           in_directory=location)

        sigfile = os.path.join(location, 'foo.sig')
        assert os.path.exists(sigfile)

        sig = next(signature.load_signatures(sigfile))
        assert sig.name() == 'foo'

        status, out, err = utils.runscript('sourmash',
                                           ['compute', '-k', '31',
                                            '--name', 'foo',
                                            testdata1, '-o', 'foo2.sig'],
                                           in_directory=location)

        sigfile2 = os.path.join(location, 'foo2.sig')
        assert os.path.exists(sigfile2)

        # Load the output of the second run (foo2.sig). Re-reading foo.sig
        # here would leave the --name result completely unchecked.
        sig2 = next(signature.load_signatures(sigfile2))
        assert sig2.name() == 'foo'
        assert sig.name() == sig2.name()
def test_contig_search(location):
    """Run the contig search and compare its outputs with saved results.

    The run must return status 0 and create four output files under
    `location`; both CSV outputs and both match-signature outputs are then
    compared, as sets, with reference copies from the test data.
    """
    # test for same results
    args = utils.Args()
    args.genome = utils.get_testfile("test-data/proteomes/GB_GCA_002691795.1_protein.100contigs.faa.gz")
    args.genome_sig = utils.get_testfile("test-data/intermediate/signatures/GB_GCA_002691795.1_protein.100contigs.faa.gz.sig")
    args.matches_sig = utils.get_testfile("test-data/intermediate/search/GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11.matches.sig")
    args.lineages_csv = utils.get_testfile("test-data/databases/gtdb-nine.lineages.csv")
    args.alphabet = "protein"
    args.ksize = 33
    args.output_prefix = "GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11"
    for flag in ("no_search", "gather", "no_search_contigs", "search_genome"):
        setattr(args, flag, False)

    search_csv = os.path.join(location, f"{args.output_prefix}.contigs.search.csv")
    ranksearch_csv = os.path.join(location, f"{args.output_prefix}.contigs.ranksearch.csv")
    search_matches = os.path.join(location, f"{args.output_prefix}.contigs.search.matches.sig")
    ranksearch_matches = os.path.join(location, f"{args.output_prefix}.contigs.ranksearch.matches.sig")

    status = search_or_gather.main(args)
    assert status == 0

    for produced in (search_csv, ranksearch_csv, search_matches, ranksearch_matches):
        assert os.path.exists(produced)

    # (saved reference file, file produced by this run)
    csv_pairs = (
        ("test-data/intermediate/contig-search/GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11.contigs.search.csv",
         search_csv),
        ("test-data/intermediate/contig-search/GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11.contigs.ranksearch.csv",
         ranksearch_csv),
    )
    for saved_name, produced in csv_pairs:
        saved_path = utils.get_testfile(saved_name)
        with open(saved_path) as fp:
            saved_rows = get_csv_set(fp)
        with open(produced) as fp:
            produced_rows = get_csv_set(fp)
        assert saved_rows == produced_rows

    sig_pairs = (
        ("test-data/intermediate/contig-search/GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11.contigs.search.matches.sig",
         search_matches),
        ("test-data/intermediate/contig-search/GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11.contigs.ranksearch.matches.sig",
         ranksearch_matches),
    )
    for saved_name, produced in sig_pairs:
        saved_path = utils.get_testfile(saved_name)
        with open(saved_path) as handle:
            saved_sigs = set(sig.load_signatures(handle))
        with open(produced) as handle:
            produced_sigs = set(sig.load_signatures(handle))
        assert saved_sigs == produced_sigs
def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c):
    """Signatures computed with --rna must match the known-good benchmark.

    Same check as the --dna known-good comparison, but passing the --rna
    flag to `compute`.
    """
    genes = utils.get_test_data('ecoli.genes.fna')
    c.run_sourmash('compute', '-k', '21', '--singleton', '--rna', genes)

    sigpath = c.output('ecoli.genes.fna.sig')
    assert os.path.exists(sigpath)

    # exactly two signatures are expected; order them by name
    loaded = list(signature.load_signatures(sigpath))
    loaded.sort(key=lambda s: s.name())
    _first, second = loaded

    benchmark_path = utils.get_test_data('benchmark.dna.sig')
    benchmark = list(signature.load_signatures(benchmark_path))[0]

    assert second.similarity(benchmark) == 1.0
def test_do_sourmash_check_knowngood_dna_comparisons(c):
    """Signatures computed with --dna must match the known-good benchmark.

    The benchmark signature was calculated by
    utils/compute-dna-mh-another-way.py.
    """
    genes = utils.get_test_data('ecoli.genes.fna')
    c.run_sourmash('compute', '-k', '21', '--singleton', '--dna', genes)

    sigpath = c.output('ecoli.genes.fna.sig')
    assert os.path.exists(sigpath)

    # exactly two signatures are expected; order them by name
    loaded = list(signature.load_signatures(sigpath))
    loaded.sort(key=lambda s: s.name())
    _first, second = loaded

    benchmark_path = utils.get_test_data('benchmark.dna.sig')
    benchmark = list(signature.load_signatures(benchmark_path))[0]

    assert second.similarity(benchmark) == 1.0
def test_do_sourmash_check_protein_comparisons():
    """Check 2 x 2 protein comparisons with E. coli genes.

    Two signatures come from a protein FASTA (--input-is-protein) and two
    from a nucleotide FASTA computed with --protein; every pairing between
    the two groups is compared.
    """
    with utils.TempDirectory() as location:
        protein_fasta = utils.get_test_data('ecoli.faa')
        status, out, err = utils.runscript(
            'sourmash',
            ['compute', '-k', '21', '--input-is-protein', '--singleton',
             protein_fasta],
            in_directory=location)
        aa_sigfile = os.path.join(location, 'ecoli.faa.sig')
        assert os.path.exists(aa_sigfile)

        genes_fasta = utils.get_test_data('ecoli.genes.fna')
        status, out, err = utils.runscript(
            'sourmash',
            ['compute', '-k', '21', '--protein', '--no-dna', '--singleton',
             genes_fasta],
            in_directory=location)
        trans_sigfile = os.path.join(location, 'ecoli.genes.fna.sig')
        assert os.path.exists(trans_sigfile)

        # The loaded order is not relied upon; sort by name before unpacking.
        aa_sigs = list(signature.load_signatures(aa_sigfile))
        aa_sigs.sort(key=lambda s: s.name())
        sig1_aa, sig2_aa = aa_sigs

        trans_sigs = list(signature.load_signatures(trans_sigfile))
        trans_sigs.sort(key=lambda s: s.name())
        sig1_trans, sig2_trans = trans_sigs

        name1 = sig1_aa.name().split()[0]
        assert name1 == 'NP_414543.1'
        name2 = sig2_aa.name().split()[0]
        assert name2 == 'NP_414544.1'
        name3 = sig1_trans.name().split()[0]
        assert name3 == 'gi|556503834:2801-3733'
        name4 = sig2_trans.name().split()[0]
        assert name4 == 'gi|556503834:337-2799'

        # (protein sig, translated sig, their labels, expected similarity)
        comparisons = (
            (sig1_aa, sig1_trans, name1, name3, 0.0),
            (sig2_aa, sig1_trans, name2, name3, 0.166),
            (sig1_aa, sig2_trans, name1, name4, 0.174),
            (sig2_aa, sig2_trans, name2, name4, 0.0),
        )

        # report everything first, then check
        for aa, trans, aa_name, trans_name, _expected in comparisons:
            print(aa_name, trans_name, round(aa.similarity(trans), 3))

        for aa, trans, _aa_name, _trans_name, expected in comparisons:
            assert round(aa.similarity(trans), 3) == expected
def test_binary_nary_tree():
    """Trees built with different `d` values return the same search results.

    Every signature file is added to a default tree and to trees created
    with d=5 and d=10; searching with the last leaf must give identical
    result sets for all three.
    """
    factory = GraphFactory(31, 1e5, 4)
    trees = {
        2: SBT(factory),
        5: SBT(factory, d=5),
        10: SBT(factory, d=10),
    }

    n_leaves = 0
    for sigpath in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sigpath)))
        leaf = SigLeaf(os.path.basename(sigpath), loaded)
        for tree in trees.values():
            tree.add_node(leaf)
        # the last leaf added doubles as the search query
        to_search = leaf
        n_leaves += 1

    assert all(len(list(t.leaves())) == n_leaves for t in trees.values())

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    results = {
        d: {str(s) for s in tree.find(search_minhashes, to_search.data, 0.1)}
        for d, tree in trees.items()
    }
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
def test_do_sourmash_compute_10x_barcode():
    """Compute protein signatures from a 10x BAM restricted by a barcodes file.

    Skipped when bam2fasta is not importable.  Expects 16 signatures whose
    barcode prefixes (the text before the first "_" in each name) all appear
    in barcodes.tsv.
    """
    pytest.importorskip('bam2fasta')

    with utils.TempDirectory() as location:
        bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
        barcodes_file = utils.get_test_data('10x-example/barcodes.tsv')

        status, out, err = utils.runscript(
            'sourmash',
            ['compute', '-k', '21', '--line-count', '50',
             '--input-is-10x', '--protein',
             '--barcodes-file', barcodes_file, bam_path],
            in_directory=location)

        sigfile = os.path.join(location, 'possorted_genome_bam.bam.sig')
        assert os.path.exists(sigfile)

        siglist = list(signature.load_signatures(sigfile))
        assert len(siglist) == 16

        barcode_signatures = {s.name().split("_")[0] for s in siglist}

        with open(utils.get_test_data('10x-example/barcodes.tsv')) as f:
            true_barcodes = {line.strip() for line in f}

        # Every barcode that produced a signature must be listed in
        # barcodes.tsv.
        assert barcode_signatures <= true_barcodes
def test_tree_save_load(n_children):
    """Search results are unchanged after saving a tree and loading it back."""
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for sigpath in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sigpath)))
        leaf = SigLeaf(os.path.basename(sigpath), loaded)
        tree.add_node(leaf)
        # the last leaf added doubles as the search query
        to_search = leaf

    banner = '*' * 60

    print(banner)
    print("{}:".format(to_search.metadata))
    hits = tree.find(search_minhashes, to_search.data, 0.1)
    old_result = {str(s) for s in hits}
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree_path = os.path.join(location, 'demo')
        tree.save(tree_path)
        tree = SBT.load(os.path.join(location, 'demo'),
                        leaf_loader=SigLeaf.load)

        print(banner)
        print("{}:".format(to_search.metadata))
        hits = tree.find(search_minhashes, to_search.data, 0.1)
        new_result = {str(s) for s in hits}
        print(*new_result, sep='\n')

        assert old_result == new_result
def test_sbt_tarstorage():
    """Search results are unchanged after a round trip through TarStorage."""
    factory = GraphFactory(31, 1e5, 4)
    banner = '*' * 60

    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for sigpath in utils.SIG_FILES:
            loaded = next(
                signature.load_signatures(utils.get_test_data(sigpath)))
            leaf = SigLeaf(os.path.basename(sigpath), loaded)
            tree.add_node(leaf)
            # the last leaf added doubles as the search query
            to_search = leaf

        print(banner)
        print("{}:".format(to_search.metadata))
        hits = tree.find(search_minhashes, to_search.data, 0.1)
        old_result = {str(s) for s in hits}
        print(*old_result, sep='\n')

        with TarStorage(os.path.join(location, 'tree.tar.gz')) as storage:
            tree.save(os.path.join(location, 'tree'), storage=storage)

        with TarStorage(os.path.join(location, 'tree.tar.gz')) as storage:
            tree = SBT.load(os.path.join(location, 'tree'),
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            # search while the storage is still open
            print(banner)
            print("{}:".format(to_search.metadata))
            hits = tree.find(search_minhashes, to_search.data, 0.1)
            new_result = {str(s) for s in hits}
            print(*new_result, sep='\n')

            assert old_result == new_result
def test_do_sourmash_compute_multik_with_dayhoff_hp_dna_protein():
    """`compute -k 21,30 --dayhoff --hp --protein` writes eight signatures.

    Both ksizes must be present, with two signatures for each of the
    molecule types DNA, dayhoff, hp and protein.
    """
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        status, out, err = utils.runscript(
            'sourmash',
            ['compute', '-k', '21,30', '--dayhoff', '--hp', '--protein',
             fasta],
            in_directory=location)

        outfile = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(outfile)

        with open(outfile, 'rt') as fp:
            sigdata = fp.read()
        siglist = list(signature.load_signatures(sigdata))
        assert len(siglist) == 8

        ksizes = {s.minhash.ksize for s in siglist}
        assert 21 in ksizes
        assert 30 in ksizes

        # two signatures per molecule type: 4 types x 2 = 8 in total
        for moltype in ('DNA', 'dayhoff', 'hp', 'protein'):
            matching = sum(s.minhash.is_molecule_type(moltype)
                           for s in siglist)
            assert matching == 2
def test_binary_nary_tree():
    """Default, d=5 and d=10 trees must agree on search results.

    All signature files are inserted into each tree; the last inserted leaf
    is then used as the query against every tree.
    """
    factory = GraphFactory(31, 1e5, 4)

    trees = {}
    trees[2] = SBT(factory)
    for fanout in (5, 10):
        trees[fanout] = SBT(factory, d=fanout)

    n_leaves = 0
    for filename in utils.SIG_FILES:
        sig_path = utils.get_test_data(filename)
        sig = next(signature.load_signatures(sig_path))
        leaf = SigLeaf(os.path.basename(filename), sig)
        for tree in trees.values():
            tree.add_node(leaf)
        to_search = leaf
        n_leaves += 1

    # every tree holds every leaf
    leaf_counts = [len(list(t.leaves())) for t in trees.values()]
    assert all(count == n_leaves for count in leaf_counts)

    results = {}
    print('*' * 60)
    print("{}:".format(to_search.metadata))
    for fanout, tree in trees.items():
        found = tree.find(search_minhashes, to_search.data, 0.1)
        results[fanout] = {str(s) for s in found}
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
def test_sbt_combine(n_children):
    """Combining two partial trees matches a tree built from all leaves.

    The first four leaves go into `tree_1`, the rest into `tree_2`, and every
    leaf into `tree`.  After `tree_1.combine(tree_2)` the leaf sets and the
    search results of `tree_1` and `tree` must be equal.  Finally, adding a
    node to `tree_1` is checked against the expected `next_node` position.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    n_leaves = 0
    for sigpath in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sigpath)))
        leaf = SigLeaf(os.path.basename(sigpath), loaded)
        tree.add_node(leaf)
        # first four leaves go to tree_1, the remainder to tree_2
        target = tree_1 if n_leaves < 4 else tree_2
        target.add_node(leaf)
        n_leaves += 1

    tree_1.combine(tree_2)

    t1_leaves = {str(node) for node in tree_1.leaves()}
    t_leaves = {str(node) for node in tree.leaves()}

    assert len(t1_leaves) == n_leaves
    assert len(t_leaves) == len(t1_leaves)
    assert t1_leaves == t_leaves

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))
    t1_result = {str(s)
                 for s in tree_1.find(search_minhashes, to_search, 0.1)}
    tree_result = {str(s)
                   for s in tree.find(search_minhashes, to_search, 0.1)}
    assert t1_result == tree_result

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position
    next_empty = 0
    for index, position in enumerate(tree_1.nodes):
        if index != position:
            next_empty = index
            break
    if not next_empty:
        next_empty = index + 1

    tree_1.add_node(leaf)
    assert tree_1.next_node == next_empty
def test_sbt_combine(n_children):
    """`SBT.combine` must yield the same leaves and hits as one full tree.

    `tree` receives every leaf; `tree_1` receives the first four and
    `tree_2` the others.  Once `tree_2` is combined into `tree_1`, both the
    leaf sets and the search results must match those of `tree`.  The test
    ends by adding one more node to `tree_1` and checking `next_node`.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree, tree_1, tree_2 = (SBT(factory, d=n_children) for _ in range(3))

    n_leaves = 0
    for filename in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(filename)))
        leaf = SigLeaf(os.path.basename(filename), sig)
        tree.add_node(leaf)
        if n_leaves >= 4:
            tree_2.add_node(leaf)
        else:
            tree_1.add_node(leaf)
        n_leaves += 1

    tree_1.combine(tree_2)

    t1_leaves = set(map(str, tree_1.leaves()))
    t_leaves = set(map(str, tree.leaves()))

    assert len(t1_leaves) == n_leaves
    assert len(t_leaves) == len(t1_leaves)
    assert t1_leaves == t_leaves

    to_search = next(
        signature.load_signatures(utils.get_test_data(utils.SIG_FILES[0])))
    t1_result = set(map(str, tree_1.find(search_minhashes, to_search, 0.1)))
    tree_result = set(map(str, tree.find(search_minhashes, to_search, 0.1)))
    assert t1_result == tree_result

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position
    next_empty = 0
    for idx, pos in enumerate(tree_1.nodes):
        if idx != pos:
            next_empty = idx
            break
    if not next_empty:
        next_empty = idx + 1

    tree_1.add_node(leaf)
    assert tree_1.next_node == next_empty
def test_load_minified(track_abundance):
    """Re-saving loaded signatures gives shorter output with no newlines."""
    sigfile = utils.get_test_data('genome-s10+s11.sig')
    minified = save_signatures(load_signatures(sigfile))

    with open(sigfile, 'r') as handle:
        original_text = handle.read()

    assert len(minified) < len(original_text)
    assert '\n' not in minified
def test_load_textmode(track_abundance):
    """Signatures can still be loaded from a file opened in text mode.

    ijson requires a file in binary mode or bytes, but an API example in the
    docs used 'rt'.  The docs were fixed; this test stays to make sure text
    mode remains supported.
    """
    sigfile = utils.get_test_data('genome-s10+s11.sig')
    with open(sigfile, 'rt') as sigfp:
        siglist = list(signature.load_signatures(sigfp))

    first = siglist[0]
    assert first.name() == 's10+s11'
def test_do_sourmash_compute_outdir(c):
    """`compute --outdir DIR` writes the signature file into DIR."""
    fasta = utils.get_test_data('short.fa')
    command = ['compute', '-k', '31', fasta, '--outdir', c.location]
    status, out, err = utils.runscript('sourmash', command)

    sigfile = os.path.join(c.location, 'short.fa.sig')
    assert os.path.exists(sigfile)

    first = next(signature.load_signatures(sigfile))
    assert first.name().endswith('short.fa')
def test_roundtrip(track_abundance):
    """A signature saved and loaded again has similarity 1.0 to the original."""
    minhash = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    minhash.add("AT" * 10)
    sig = SourmashSignature(minhash)

    serialized = save_signatures([sig])
    sig2 = list(load_signatures(serialized))[0]
    # the reloaded minhash is accessed but its contents are not inspected
    reloaded_minhash = sig2.minhash

    assert sig.similarity(sig2) == 1.0
    assert sig2.similarity(sig) == 1.0
def test_load_compressed(track_abundance):
    """Signatures load from compressed data and from a compressed file.

    First a signature saved with ``compression=5`` must load back equal to
    the original.  Then the gzipped test file must yield at least one
    signature.
    """
    e1 = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig1 = SourmashSignature(e1)

    x = save_signatures([sig1], compression=5)

    y = load_one_signature(x)
    assert sig1 == y

    sigfile = utils.get_test_data('genome-s10+s11.sig.gz')
    # load_signatures hands back an iterator (other tests call next() on it).
    # Materialize it and check the result so the gzipped file is actually
    # verified instead of the return value being discarded.
    sigs = list(load_signatures(sigfile))
    assert sigs
def test_do_sourmash_check_knowngood_protein_comparisons():
    """Translated protein signatures must match the known-good benchmark.

    The benchmark signature was calculated by
    utils/compute-prot-mh-another-way.py.
    """
    with utils.TempDirectory() as location:
        genes = utils.get_test_data('ecoli.genes.fna')
        command = ['compute', '-k', '21', '--singleton', '--protein',
                   '--no-dna', genes]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)

        sigpath = os.path.join(location, 'ecoli.genes.fna.sig')
        assert os.path.exists(sigpath)

        # exactly two signatures are expected; order them by name
        loaded = list(signature.load_signatures(sigpath))
        loaded.sort(key=lambda s: s.name())
        _first_trans, second_trans = loaded

        benchmark_path = utils.get_test_data('benchmark.prot.sig')
        benchmark = list(signature.load_signatures(benchmark_path))[0]

        assert second_trans.similarity(benchmark) == 1.0
def test_roundtrip_empty(track_abundance):
    """Edge case: a signature over an empty minhash survives a round trip.

    The similarity between the original and the reloaded signature is 0 in
    both directions.
    """
    empty = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig = SourmashSignature(empty)

    serialized = save_signatures([sig])
    sig2 = list(load_signatures(serialized))[0]
    # the reloaded minhash is accessed but its contents are not inspected
    reloaded_minhash = sig2.minhash

    assert sig.similarity(sig2) == 0
    assert sig2.similarity(sig) == 0
def test_do_sourmash_compute():
    """A plain `compute -k 31` writes <input>.sig into the working directory."""
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        command = ['compute', '-k', '31', fasta]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)

        sigfile = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(sigfile)

        first = next(signature.load_signatures(sigfile))
        assert first.name().endswith('short.fa')
def test_roundtrip_max_hash(track_abundance):
    """`max_hash` is preserved when a signature is saved and loaded again."""
    original = sourmash.MinHash(n=0, ksize=20,
                                track_abundance=track_abundance,
                                max_hash=10)
    original.add_hash(5)
    sig = SourmashSignature(original)

    serialized = save_signatures([sig])
    sig2 = list(load_signatures(serialized))[0]
    reloaded = sig2.minhash

    assert original.max_hash == reloaded.max_hash

    assert sig.similarity(sig2) == 1.0
    assert sig2.similarity(sig) == 1.0
def test_do_sourmash_compute_multik_outfile():
    """`compute -k 21,31 -o FILE` writes one signature per ksize to FILE."""
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        outfile = os.path.join(location, 'FOO.xxx')
        command = ['compute', '-k', '21,31', fasta, '-o', outfile]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)
        assert os.path.exists(outfile)

        siglist = list(signature.load_signatures(outfile))
        assert len(siglist) == 2

        ksizes = {s.minhash.ksize for s in siglist}
        assert 21 in ksizes
        assert 31 in ksizes
def test_do_sourmash_compute_10x_filter_umis():
    """Compute DNA signatures from a 10x BAM with a valid-read count filter.

    Skipped when bam2fasta is not importable.  Checks the signature file,
    the number of lines in the barcode metadata CSV, and the single FASTA
    file saved for the renamed barcode.
    """
    pytest.importorskip('bam2fasta')

    with utils.TempDirectory() as location:
        bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')
        fastas_dir = os.path.join(location, "fastas")
        if not os.path.exists(fastas_dir):
            os.makedirs(fastas_dir)

        command = [
            'compute', '-k', '31', '--dna',
            '--count-valid-reads', '10',
            '--input-is-10x', bam_path,
            '--write-barcode-meta-csv', csv_path,
            '--barcodes', barcodes_path,
            '--rename-10x-barcodes', renamer_path,
            '--save-fastas', fastas_dir,
            '-o', '10x-example_dna.sig',
        ]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)

        sigfile = os.path.join(location, '10x-example_dna.sig')
        assert os.path.exists(sigfile)

        siglist = list(signature.load_signatures(sigfile))
        assert len(siglist) == 1

        # TODO PV This seems to randomly fail/pass - commenting out for now
        # but the min hashes should never be empty
        # min_hashes = [x.minhash.get_mins() for x in siglist]
        # assert all(mins != [] for mins in min_hashes)

        with open(csv_path, 'rb') as f:
            data = [line.split() for line in f]
        assert len(data) == 9

        fasta_files = os.listdir(fastas_dir)
        barcodes = [fname.replace(".fasta", "") for fname in fasta_files]
        assert len(barcodes) == 1
        assert len(fasta_files) == 1
        assert barcodes[0] == 'lung_epithelial_cell|AAATGCCCAAACTGCT-1'

        count = 0
        fasta_file_name = os.path.join(fastas_dir, fasta_files[0])
        for record in screed.open(fasta_file_name):
            name = record.name
            sequence = record.sequence
            count += 1
            assert name.startswith('lung_epithelial_cell|AAATGCCCAAACTGCT-1')
            assert sequence.count(">") == 0
            assert sequence.count("X") == 0
def test_save_minified(track_abundance):
    """Saved signatures form one line and keep their names when reloaded."""
    first_mh = sourmash.MinHash(n=1, ksize=20,
                                track_abundance=track_abundance)
    sig1 = SourmashSignature(first_mh, name="foo")

    second_mh = sourmash.MinHash(n=1, ksize=25,
                                 track_abundance=track_abundance)
    sig2 = SourmashSignature(second_mh, name="bar baz")

    serialized = save_signatures([sig1, sig2])
    assert '\n' not in serialized
    assert len(serialized.split('\n')) == 1

    reloaded = list(load_signatures(serialized))
    assert len(reloaded) == 2
    for expected_name in ('foo', 'bar baz'):
        assert any(s.name() == expected_name for s in reloaded)
def test_do_sourmash_compute_multik_only_protein(c):
    """Check `sourmash compute` with only protein, no nucleotide output.

    With `-k 21,30 --protein --no-dna` exactly two signatures are expected,
    one for each ksize.
    """
    fasta = utils.get_test_data('short.fa')
    c.run_sourmash('compute', '-k', '21,30', '--protein', '--no-dna', fasta)

    outfile = os.path.join(c.location, 'short.fa.sig')
    assert os.path.exists(outfile)

    with open(outfile, 'rt') as fp:
        sigdata = fp.read()
    siglist = list(signature.load_signatures(sigdata))
    assert len(siglist) == 2

    ksizes = {s.minhash.ksize for s in siglist}
    assert 21 in ksizes
    assert 30 in ksizes
def test_do_sourmash_compute_multik_with_protein():
    """`compute -k 21,30 --protein` writes four signatures over both ksizes."""
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        command = ['compute', '-k', '21,30', '--protein', fasta]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)

        outfile = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(outfile)

        with open(outfile, 'rt') as fp:
            sigdata = fp.read()
        siglist = list(signature.load_signatures(sigdata))
        assert len(siglist) == 4

        ksizes = {s.minhash.ksize for s in siglist}
        assert 21 in ksizes
        assert 30 in ksizes
def test_save_load_multisig(track_abundance):
    """Two signatures saved together are both present after loading."""
    first_mh = sourmash.MinHash(n=1, ksize=20,
                                track_abundance=track_abundance)
    sig1 = SourmashSignature(first_mh)

    second_mh = sourmash.MinHash(n=1, ksize=25,
                                 track_abundance=track_abundance)
    sig2 = SourmashSignature(second_mh)

    serialized = save_signatures([sig1, sig2])
    reloaded = list(load_signatures(serialized))

    print(serialized)

    assert len(reloaded) == 2
    # order not guaranteed, so test membership rather than position
    assert sig1 in reloaded
    assert sig2 in reloaded

    assert sig1 != sig2
def test_do_sourmash_compute_with_seed():
    """`compute --seed 43` records seed 43 on every signature it writes."""
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        outfile = os.path.join(location, 'FOO.xxx')
        command = ['compute', '-k', '21,31', '--seed', '43',
                   fasta, '-o', outfile]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)
        assert os.path.exists(outfile)

        siglist = list(signature.load_signatures(outfile))
        assert len(siglist) == 2

        seeds = [s.minhash.seed for s in siglist]
        assert len(seeds) == 2
        assert set(seeds) == {43}
def test_do_sourmash_compute_multik_only_protein_no_rna(c):
    """Protein-only `compute`, switching nucleotide output off via --no-rna.

    With `-k 21,30 --protein --no-rna` exactly two signatures are expected,
    one for each ksize.
    """
    fasta = utils.get_test_data('short.fa')
    c.run_sourmash('compute', '-k', '21,30', '--protein', '--no-rna', fasta)

    outfile = os.path.join(c.location, 'short.fa.sig')
    assert os.path.exists(outfile)

    with open(outfile, 'rt') as fp:
        sigdata = fp.read()
    siglist = list(signature.load_signatures(sigdata))
    assert len(siglist) == 2

    ksizes = {s.minhash.ksize for s in siglist}
    assert 21 in ksizes
    assert 30 in ksizes
def test_tree_v2_load():
    """Trees loaded from v2.sbt.json and v3.sbt.json give the same hits.

    A containment search with the first signature file must return the same
    four results from both trees.
    """
    tree_v2, tree_cur = (
        SBT.load(utils.get_test_data(index_name), leaf_loader=SigLeaf.load)
        for index_name in ('v2.sbt.json', 'v3.sbt.json')
    )

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))

    hits_v2 = tree_v2.find(search_minhashes_containment, to_search, 0.1)
    results_v2 = {str(s) for s in hits_v2}
    hits_cur = tree_cur.find(search_minhashes_containment, to_search, 0.1)
    results_cur = {str(s) for s in hits_cur}

    assert results_v2 == results_cur
    assert len(results_v2) == 4
def test_do_sourmash_compute_10x_no_filter_umis():
    """Compute DNA signatures from a 10x BAM without a valid-read filter.

    Skipped when bam2fasta is not importable.  Expects 32 signatures in the
    output file.
    """
    pytest.importorskip('bam2fasta')

    with utils.TempDirectory() as location:
        # test to check that all the lines in unfiltered_umi_to_sig are
        # called and tested
        bam_path = utils.get_test_data(
            '10x-example/possorted_genome_bam_filtered.bam')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")

        command = [
            'compute', '-k', '31', '--dna',
            '--input-is-10x', bam_path,
            '--write-barcode-meta-csv', csv_path,
            '--save-fastas', location,
            '-o', '10x-example_dna.sig',
        ]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)

        sigfile = os.path.join(location, '10x-example_dna.sig')
        assert os.path.exists(sigfile)

        siglist = list(signature.load_signatures(sigfile))
        assert len(siglist) == 32
def test_tree_repair():
    """Trees loaded from leaves.sbt.json and v3.sbt.json give the same hits.

    Searching with the first signature file must return the same two results
    from both trees.
    """
    tree_repair, tree_cur = (
        SBT.load(utils.get_test_data(index_name), leaf_loader=SigLeaf.load)
        for index_name in ('leaves.sbt.json', 'v3.sbt.json')
    )

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))

    hits_repair = tree_repair.find(search_minhashes, to_search, 0.1)
    results_repair = {str(s) for s in hits_repair}
    hits_cur = tree_cur.find(search_minhashes, to_search, 0.1)
    results_cur = {str(s) for s in hits_cur}

    assert results_repair == results_cur
    assert len(results_repair) == 2
def test_tree_repair_add_node():
    """Adding leaves to a tree loaded from leaves.sbt.json keeps it well-formed.

    After every signature file has been added, each non-root position must
    have an internal `Node` as its parent, and no `Leaf` may have children.
    """
    index_path = utils.get_test_data('leaves.sbt.json')
    tree_repair = SBT.load(index_path, leaf_loader=SigLeaf.load)

    for sigpath in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sigpath)))
        tree_repair.add_node(SigLeaf(os.path.basename(sigpath), loaded))

    for pos, node in list(tree_repair.nodes.items()):
        # Every parent of a node must be an internal node (and not a leaf),
        # except for node 0 (the root), whose parent is None.
        if pos != 0:
            parent = tree_repair.parent(pos)
            assert isinstance(parent.node, Node)

        # Leaf nodes can't have children
        if isinstance(node, Leaf):
            assert all(child.node is None
                       for child in tree_repair.children(pos))
def test_search_minhashes():
    """Every hit from `search_minhashes` meets the similarity threshold."""
    threshold = 0.08

    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory)

    for sigpath in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sigpath)))
        tree.add_node(SigLeaf(os.path.basename(sigpath), loaded))

    # query with the first leaf the tree reports
    to_search = next(iter(tree.leaves()))

    # this fails if 'search_minhashes' is calc containment and not similarity.
    results = tree.find(search_minhashes, to_search.data, threshold)
    for hit in results:
        assert to_search.data.similarity(hit.data) >= threshold

    print(results)
def test_distance_matrix(track_abundance):
    """A mirrored lower-triangle matrix equals the fully computed matrix.

    `D2` is filled by computing similarities only for pairs with j <= i and
    copying each value to the transposed cell; `D1` is filled by computing
    every pair.  Both matrices must be equal.
    """
    import numpy

    siglist = [
        next(signature.load_signatures(utils.get_test_data(sigpath)))
        for sigpath in utils.SIG_FILES
    ]
    size = len(siglist)

    D1 = numpy.zeros([size, size])
    D2 = numpy.zeros([size, size])

    # lower triangle (including the diagonal), mirrored across the diagonal
    for i, row_sig in enumerate(siglist):
        for j, col_sig in enumerate(siglist):
            if j <= i:
                value = row_sig.similarity(col_sig, track_abundance)
                D2[i][j] = value
                D2[j][i] = value

    # every cell computed directly
    for i, row_sig in enumerate(siglist):
        for j, col_sig in enumerate(siglist):
            D1[i][j] = row_sig.similarity(col_sig, track_abundance)

    assert numpy.array_equal(D1, D2)
def test_save_sparseness(n_children):
    """A tree saved with sparseness=1.0 reloads with no `Node` entries.

    The reloaded tree must still return the same search results, and after
    the search every non-root position must have a `Node` parent while no
    `Leaf` may have children.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for sigpath in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sigpath)))
        leaf = SigLeaf(os.path.basename(sigpath), loaded)
        tree.add_node(leaf)
        # the last leaf added doubles as the search query
        to_search = leaf

    banner = '*' * 60

    print(banner)
    print("{}:".format(to_search.metadata))
    hits = tree.find(search_minhashes, to_search.data, 0.1)
    old_result = {str(s) for s in hits}
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree.save(os.path.join(location, 'demo'), sparseness=1.0)
        tree_loaded = SBT.load(os.path.join(location, 'demo'),
                               leaf_loader=SigLeaf.load)
        # checked right after loading, before any search is run
        assert all(not isinstance(entry, Node)
                   for entry in tree_loaded.nodes.values())

        print(banner)
        print("{}:".format(to_search.metadata))
        hits = tree_loaded.find(search_minhashes, to_search.data, 0.1)
        new_result = {str(s) for s in hits}
        print(*new_result, sep='\n')

        assert old_result == new_result

        for pos, node in list(tree_loaded.nodes.items()):
            # Every parent of a node must be an internal node (and not a
            # leaf), except for node 0 (the root), whose parent is None.
            if pos != 0:
                parent = tree_loaded.parent(pos)
                assert isinstance(parent.node, Node)

            # Leaf nodes can't have children
            if isinstance(node, Leaf):
                assert all(child.node is None
                           for child in tree_loaded.children(pos))
def test_sbt_ipfsstorage():
    """Search results are unchanged after a round trip through IPFSStorage.

    Skipped when ipfsapi is not importable; marked xfail when saving raises
    an ipfsapi connection error.
    """
    ipfsapi = pytest.importorskip('ipfsapi')

    factory = GraphFactory(31, 1e5, 4)
    banner = '*' * 60

    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for sigpath in utils.SIG_FILES:
            loaded = next(
                signature.load_signatures(utils.get_test_data(sigpath)))
            leaf = SigLeaf(os.path.basename(sigpath), loaded)
            tree.add_node(leaf)
            # the last leaf added doubles as the search query
            to_search = leaf

        print(banner)
        print("{}:".format(to_search.metadata))
        hits = tree.find(search_minhashes, to_search.data, 0.1)
        old_result = {str(s) for s in hits}
        print(*old_result, sep='\n')

        try:
            with IPFSStorage() as storage:
                tree.save(os.path.join(location, 'tree'), storage=storage)
        except ipfsapi.exceptions.ConnectionError:
            pytest.xfail("ipfs not installed/functioning probably")

        with IPFSStorage() as storage:
            tree = SBT.load(os.path.join(location, 'tree'),
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            # search while the storage is still open
            print(banner)
            print("{}:".format(to_search.metadata))
            hits = tree.find(search_minhashes, to_search.data, 0.1)
            new_result = {str(s) for s in hits}
            print(*new_result, sep='\n')

            assert old_result == new_result