Exemple #1
0
def test_linear_index_gather():
    sig2 = utils.get_test_data('2.fa.sig')
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')

    ss2 = sourmash.load_one_signature(sig2, ksize=31)
    ss47 = sourmash.load_one_signature(sig47)
    ss63 = sourmash.load_one_signature(sig63)

    lidx = LinearIndex()
    lidx.insert(ss2)
    lidx.insert(ss47)
    lidx.insert(ss63)

    matches = lidx.gather(ss2)
    assert len(matches) == 1
    assert matches[0][0] == 1.0
    assert matches[0][1] == ss2

    matches = lidx.gather(ss47)
    assert len(matches) == 2
    assert matches[0][0] == 1.0
    assert matches[0][1] == ss47
    assert round(matches[1][0], 2) == 0.49
    assert matches[1][1] == ss63
Exemple #2
0
def test_sbt_dayhoff_command_index(c):
    # test command-line creation of SBT database with dayhoff sigs
    sigfile1 = utils.get_test_data(
        'prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
    sigfile2 = utils.get_test_data(
        'prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig')

    db_out = c.output('dayhoff.sbt.zip')

    c.run_sourmash('index', db_out, sigfile1, sigfile2, '--scaled', '100',
                   '-k', '57', '--dayhoff')

    db2 = sourmash.load_sbt_index(db_out)

    sig1 = sourmash.load_one_signature(sigfile1)
    sig2 = sourmash.load_one_signature(sigfile2)

    # check reconstruction --
    mh_list = [x.minhash for x in db2.signatures()]
    assert len(mh_list) == 2
    assert sig1.minhash in mh_list
    assert sig2.minhash in mh_list

    # and search, gather
    results = db2.search(sig1,
                         threshold=0.0,
                         ignore_abundance=True,
                         do_containment=False,
                         best_only=False)
    assert len(results) == 2

    results = db2.gather(sig2)
    assert results[0][0] == 1.0
def test_sig_intersect_4(c):
    # use --abundances-from to preserve abundances from sig #47
    sig47 = utils.get_test_data('track_abund/47.fa.sig')
    sig63 = utils.get_test_data('track_abund/63.fa.sig')
    c.run_sourmash('sig', 'intersect', '--abundances-from', sig47, sig63)

    # stdout should be new signature
    out = c.last_result.out

    actual_intersect_sig = sourmash.load_one_signature(out)

    # actually do an intersection ourselves for the test
    mh47 = sourmash.load_one_signature(sig47).minhash
    mh63 = sourmash.load_one_signature(sig63).minhash
    mh47_abunds = mh47.get_mins(with_abundance=True)
    mh63_mins = set(mh63.get_mins())

    # get the set of mins that are in common
    mh63_mins.intersection_update(mh47_abunds)

    # take abundances from mh47 & create new sig
    mh47_abunds = {k: mh47_abunds[k] for k in mh63_mins}
    test_mh = mh47.copy_and_clear()
    test_mh.set_abundances(mh47_abunds)

    print(actual_intersect_sig.minhash)
    print(out)

    assert actual_intersect_sig.minhash == test_mh
Exemple #4
0
def test_linear_index_save():
    sig2 = utils.get_test_data('2.fa.sig')
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')

    ss2 = sourmash.load_one_signature(sig2, ksize=31)
    ss47 = sourmash.load_one_signature(sig47)
    ss63 = sourmash.load_one_signature(sig63)

    linear = LinearIndex()
    linear.insert(ss2)
    linear.insert(ss47)
    linear.insert(ss63)

    with utils.TempDirectory() as location:
        filename = os.path.join(location, 'foo')
        linear.save(filename)

        from sourmash import load_signatures
        si = set(load_signatures(filename))

    x = {ss2, ss47, ss63}

    print(len(si))
    print(len(x))

    print(si)
    print(x)

    assert si == x, si
Exemple #5
0
def test_sbt_gather_threshold_1():
    # test gather() method, in some detail
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=2)

    sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31)
    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31)
    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31)

    tree.insert(sig47)
    tree.insert(sig63)
    tree.insert(sig2)

    # now construct query signatures with specific numbers of hashes --
    # note, these signatures all have scaled=1000.

    mins = list(sorted(sig2.minhash.get_mins()))
    new_mh = sig2.minhash.copy_and_clear()

    # query with empty hashes
    assert not new_mh
    assert not tree.gather(SourmashSignature(new_mh))

    # add one hash
    new_mh.add_hash(mins.pop())
    assert len(new_mh) == 1

    results = tree.gather(SourmashSignature(new_mh))
    assert len(results) == 1
    containment, match_sig, name = results[0]
    assert containment == 1.0
    assert match_sig == sig2
    assert name is None

    # check with a threshold -> should be no results.
    results = tree.gather(SourmashSignature(new_mh), threshold_bp=5000)
    assert not results

    # add three more hashes => length of 4
    new_mh.add_hash(mins.pop())
    new_mh.add_hash(mins.pop())
    new_mh.add_hash(mins.pop())
    assert len(new_mh) == 4

    results = tree.gather(SourmashSignature(new_mh))
    assert len(results) == 1
    containment, match_sig, name = results[0]
    assert containment == 1.0
    assert match_sig == sig2
    assert name is None

    # check with a too-high threshold -> should be no results.
    print('len mh', len(new_mh))
    results = tree.gather(SourmashSignature(new_mh), threshold_bp=5000)
    assert not results
def test_sig_cat_1(c):
    # cat 47 to 47...
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'cat', sig47)

    # stdout should be same signature
    out = c.last_result.out

    test_cat_sig = sourmash.load_one_signature(sig47)
    actual_cat_sig = sourmash.load_one_signature(out)

    assert actual_cat_sig == test_cat_sig
def test_import_export_1(c):
    # check to make sure we can import what we've exported!
    inp = utils.get_test_data('genome-s11.fa.gz.sig')
    outp = c.output('export.json')

    c.run_sourmash('sig', 'export', inp, '-o', outp, '-k', '21', '--dna')
    c.run_sourmash('sig', 'import', outp)

    original = sourmash.load_one_signature(inp, ksize=21, select_moltype='DNA')
    roundtrip = sourmash.load_one_signature(c.last_result.out)

    assert original.minhash == roundtrip.minhash
def test_sig_extract_1(c):
    # extract 47 from 47... :)
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'extract', sig47)

    # stdout should be new signature
    out = c.last_result.out

    test_extract_sig = sourmash.load_one_signature(sig47)
    actual_extract_sig = sourmash.load_one_signature(out)

    assert actual_extract_sig == test_extract_sig
def test_sig_split_1(c):
    # split 47 into 1 sig :)
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'split', sig47)

    outname = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig'

    assert os.path.exists(c.output(outname))

    test_split_sig = sourmash.load_one_signature(sig47)
    actual_split_sig = sourmash.load_one_signature(c.output(outname))

    assert actual_split_sig == test_split_sig
def test_sig_merge_2(c):
    # merge of 47 with nothing should be 47
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'merge', sig47)

    # stdout should be new signature
    out = c.last_result.out

    test_merge_sig = sourmash.load_one_signature(sig47)
    actual_merge_sig = sourmash.load_one_signature(out)

    print(out)

    assert actual_merge_sig.minhash == test_merge_sig.minhash
def test_sig_downsample_1_scaled(c):
    # downsample a scaled signature
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'downsample', '--scaled', '10000', sig47)

    # stdout should be new signature
    out = c.last_result.out

    test_downsample_sig = sourmash.load_one_signature(sig47)
    actual_downsample_sig = sourmash.load_one_signature(out)

    test_mh = test_downsample_sig.minhash.downsample_scaled(10000)

    assert actual_downsample_sig.minhash == test_mh
def test_import_export_2(c):
    # check to make sure we can import a mash JSON dump file.
    # NOTE: msh.json_dump file calculated like so:
    #   mash sketch -s 500 -k 21 ./tests/test-data/genome-s11.fa.gz
    #   mash info -d ./tests/test-data/genome-s11.fa.gz.msh > tests/test-data/genome-s11.fa.gz.msh.json_dump
    #
    sig1 = utils.get_test_data('genome-s11.fa.gz.sig')
    msh_sig = utils.get_test_data('genome-s11.fa.gz.msh.json_dump')

    c.run_sourmash('sig', 'import', msh_sig)
    imported = sourmash.load_one_signature(c.last_result.out)
    compare = sourmash.load_one_signature(sig1, ksize=21, select_moltype='DNA')

    assert imported.minhash == compare.minhash
def test_sig_downsample_2_num(c):
    # downsample a num signature
    sigs11 = utils.get_test_data('genome-s11.fa.gz.sig')
    c.run_sourmash('sig', 'downsample', '--num', '500',
                   '-k', '21', '--dna', sigs11)

    # stdout should be new signature
    out = c.last_result.out

    test_downsample_sig = sourmash.load_one_signature(sigs11, ksize=21,
                                                      select_moltype='DNA')
    actual_downsample_sig = sourmash.load_one_signature(out)
    test_mh = test_downsample_sig.minhash.downsample_n(500)

    assert actual_downsample_sig.minhash == test_mh
Exemple #14
0
def test_save_zip(tmpdir):
    # load from zipped SBT, save to zipped SBT, and then search.
    testdata = utils.get_test_data("v6.sbt.zip")
    testsbt = tmpdir.join("v6.sbt.zip")
    newsbt = tmpdir.join("new.sbt.zip")

    shutil.copyfile(testdata, str(testsbt))

    tree = SBT.load(str(testsbt), leaf_loader=SigLeaf.load)
    tree.save(str(newsbt))
    assert newsbt.exists()

    new_tree = SBT.load(str(newsbt), leaf_loader=SigLeaf.load)
    assert isinstance(new_tree.storage, ZipStorage)
    assert new_tree.storage.list_sbts() == ['new.sbt.json']

    to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0]))

    print("*" * 60)
    print("{}:".format(to_search))
    old_result = {str(s) for s in tree.find(search_minhashes, to_search, 0.1)}
    new_result = {
        str(s)
        for s in new_tree.find(search_minhashes, to_search, 0.1)
    }
    print(*new_result, sep="\n")

    assert old_result == new_result
    assert len(new_result) == 2
Exemple #15
0
def subtract(args):
    """
    subtract one or more signatures from another
    """
    p = SourmashArgumentParser(prog='sourmash signature subtract')
    p.add_argument('signature_from')
    p.add_argument('subtraction_sigs', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten', action='store_true',
                   help='remove abundance from signatures before subtracting')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize, select_moltype=moltype)

    from_mh = from_sigobj.minhash
    if from_mh.track_abundance and not args.flatten:
        error('Cannot use subtract on signatures with abundance tracking, sorry!')
        sys.exit(1)

    subtract_mins = set(from_mh.get_mins())

    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):

            if sigobj.minhash.track_abundance and not args.flatten:
                error('Cannot use subtract on signatures with abundance tracking, sorry!')
                sys.exit(1)

            subtract_mins -= set(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...', sigfile, end='\r')
            total_loaded += 1

    if not total_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)
        

    subtract_mh = from_sigobj.minhash.copy_and_clear()
    subtract_mh.add_many(subtract_mins)

    subtract_sigobj = sourmash.SourmashSignature(subtract_mh)

    output_json = sourmash.save_signatures([subtract_sigobj], fp=args.output)

    notify('loaded and subtracted {} signatures', total_loaded)
def test_sig_merge_3_abund_ab_ok(c):
    # merge of 47 and 63 with abund should work
    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
    sig63abund = utils.get_test_data('track_abund/63.fa.sig')

    c.run_sourmash('sig', 'merge', sig47abund, sig63abund)
    actual_merge_sig = sourmash.load_one_signature(c.last_result.out)
Exemple #17
0
def load_or_generate_sig_from_file(input_file,
                                   alphabet,
                                   ksize,
                                   scaled,
                                   ignore_abundance=False,
                                   translate=False):
    sig = ""
    if input_file.endswith(".sig"):
        # do I want to enable multiple sigs per file here?
        sig = sourmash.load_one_signature(input_file, ksize=ksize)
    else:
        # read file and add sigs
        records = try_reading_fasta_file(input_file)
        # build signature name from filename .. maybe just keep filename?
        #signame = os.path.basename(input_file.rsplit("_", 1)[0])
        # start with fresh minhash
        mh = determine_appropriate_fresh_minhash(alphabet, ksize, scaled,
                                                 ignore_abundance)
        if records:
            for record in records:
                if alphabet == "nucleotide" or translate:
                    mh.add_sequence(record.sequence, force=True)
                else:
                    mh.add_protein(record.sequence)
            # minhash --> signature, using filename as signature name ..i think this happens automatically if don't provide name?
            sig = sourmash.SourmashSignature(mh,
                                             name=os.path.basename(input_file))
    return sig
Exemple #18
0
def export(args):
    """
    export a signature to mash format
    """
    p = SourmashArgumentParser(prog='sourmash signature export')
    p.add_argument('filename')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    total_loaded = 0
    ss = sourmash.load_one_signature(args.filename, ksize=args.ksize,
                                     select_moltype=moltype)
    mh = ss.minhash

    x = {}
    x['kmer'] = mh.ksize
    x['sketchSize'] = len(mh)

    x['hashType'] = "MurmurHash3_x64_128"
    x['hashBits'] = 64
    x['hashSeed'] = mh.seed

    ll = list(mh.get_mins())
    x['sketches'] = [{ 'hashes': ll }]

    print(json.dumps(x), file=args.output)
    notify("exported signature {} ({})", ss.name(), ss.md5sum()[:8])
Exemple #19
0
def test_sig_filter_3_ksize_select(c):
    # test filtering with ksize selectiong
    psw_mag = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
    c.run_sourmash('sig', 'filter', '-m', '2', psw_mag, '-k', '31')

    # stdout should be new signature
    out = c.last_result.out

    filtered_sig = sourmash.load_one_signature(out)
    test_sig = sourmash.load_one_signature(psw_mag, ksize=31)

    abunds = test_sig.minhash.get_mins(True)
    abunds = {k: v for (k, v) in abunds.items() if v >= 2}
    assert abunds

    assert filtered_sig.minhash.get_mins(True) == abunds
def read_signature(sigfile, ksize_n):
	"""Loads signatures stored in files
	
	Signature generated are stored as MinHashes in JSON format file. Sourmash implements a function to load signatures.
	
	This function here is just a shortcut for the :func:`sourmash.load_one_signature` function in sourmash_.
	
	:param sigfile: Hash signature stored in JSON format
	:param ksize_n: Kmer size.
	
	:type sigfile: string
	:type ksize_n: integer
	
	
	.. seealso:: This function depends on sourmash python module (https://sourmash.readthedocs.io/en/latest/). Some functions employed are:
	
		- :func:`sourmash.load_one_signature`
		
	
	.. include:: ../../links.inc	 
	"""
	
	## code taken and adapted from: https://sourmash.readthedocs.io/en/latest/api-example.html
	#print ('\t+ Loading signature for comparison...')
	sig = load_one_signature(sigfile, ksize=ksize_n, select_moltype='DNA', ignore_md5sum=False)
	return (sig)
Exemple #21
0
def test_sig_filter_3(c):
    # test basic filtering
    sig47 = utils.get_test_data('track_abund/47.fa.sig')
    c.run_sourmash('sig', 'filter', '-m', '2', sig47)

    # stdout should be new signature
    out = c.last_result.out

    filtered_sig = sourmash.load_one_signature(out)
    test_sig = sourmash.load_one_signature(sig47)

    abunds = test_sig.minhash.get_mins(True)
    abunds = {k: v for (k, v) in abunds.items() if v >= 2}
    assert abunds

    assert filtered_sig.minhash.get_mins(True) == abunds
Exemple #22
0
def export(args):
    """
    export a signature to mash format
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    ss = sourmash.load_one_signature(args.filename, ksize=args.ksize,
                                     select_moltype=moltype)
    mh = ss.minhash

    x = {}
    x['kmer'] = mh.ksize
    x['sketchSize'] = len(mh)

    x['hashType'] = "MurmurHash3_x64_128"
    x['hashBits'] = 64
    x['hashSeed'] = mh.seed

    ll = list(mh.get_mins())
    x['sketches'] = [{ 'hashes': ll }]

    with FileOutput(args.output, 'wt') as fp:
        print(json.dumps(x), file=fp)
    notify("exported signature {} ({})", ss.name(), ss.md5sum()[:8])
Exemple #23
0
def test_sbt_as_index_signatures():
    # test 'signatures' method from Index base class.
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=2)

    sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
    sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))

    tree.insert(sig47)
    tree.insert(sig63)

    xx = list(tree.signatures())
    assert len(xx) == 2

    assert sig47 in xx
    assert sig63 in xx
Exemple #24
0
def test_tree_save_load(n_children):
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for f in utils.SIG_FILES:
        sig = load_one_signature(utils.get_test_data(f))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree.add_node(leaf)
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    old_result = {
        str(s)
        for s in tree.find(search_minhashes, to_search.data, 0.1)
    }
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree.save(os.path.join(location, 'demo'))
        tree = SBT.load(os.path.join(location, 'demo'),
                        leaf_loader=SigLeaf.load)

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        new_result = {
            str(s)
            for s in tree.find(search_minhashes, to_search.data, 0.1)
        }
        print(*new_result, sep='\n')

        assert old_result == new_result
Exemple #25
0
def test_binary_nary_tree():
    factory = GraphFactory(31, 1e5, 4)
    trees = {}
    trees[2] = SBT(factory)
    trees[5] = SBT(factory, d=5)
    trees[10] = SBT(factory, d=10)

    n_leaves = 0
    for f in utils.SIG_FILES:
        sig = load_one_signature(utils.get_test_data(f))
        leaf = SigLeaf(os.path.basename(f), sig)
        for tree in trees.values():
            tree.add_node(leaf)
        to_search = leaf
        n_leaves += 1

    assert all([len(list(t.leaves())) == n_leaves for t in trees.values()])

    results = {}
    print('*' * 60)
    print("{}:".format(to_search.metadata))
    for d, tree in trees.items():
        results[d] = {
            str(s)
            for s in tree.find(search_minhashes, to_search.data, 0.1)
        }
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('sigs', nargs='+')
    parser.add_argument('lca_db')
    args = parser.parse_args()

    minhashes = []
    for filename in args.sigs:
        ss = sourmash.load_one_signature(filename)
        minhashes.append(ss.minhash)

    # load the LCA database
    dblist, ksize, scaled = lca_utils.load_databases([args.lca_db], None)
    db = dblist[0]

    # double check scaled requirements
    some_mh = minhashes[0]
    mh_scaled = some_mh.scaled
    if scaled >= mh_scaled:
        print(
            '** warning: many minhashes will go unclassified because LCA database scaled is {}'
            .format(scaled),
            file=sys.stderr)
        print('** warning: the minhash scaled is {}'.format(mh_scaled),
              file=sys.stderr)

    summarize_taxonomic_purity(minhashes,
                               db,
                               verbose=True,
                               filenames=args.sigs)
def test_sig_intersect_2(c):
    # intersect of 47 and nothing should be self
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'intersect', sig47)

    # stdout should be new signature
    out = c.last_result.out

    test_intersect_sig = sourmash.load_one_signature(sig47)
    actual_intersect_sig = sourmash.load_one_signature(out)

    print(test_intersect_sig.minhash)
    print(actual_intersect_sig.minhash)
    print(out)

    assert actual_intersect_sig.minhash == test_intersect_sig.minhash
def test_sig_extract_4(c):
    # extract matches to 47's name from among several signatures
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    c.run_sourmash('sig', 'extract', sig47, sig63, '--name', 'NC_009665.1')

    # stdout should be new signature
    out = c.last_result.out

    test_extract_sig = sourmash.load_one_signature(sig47)
    actual_extract_sig = sourmash.load_one_signature(out)

    print(test_extract_sig.minhash)
    print(actual_extract_sig.minhash)

    assert actual_extract_sig == test_extract_sig
def test_sig_merge_1_multisig(c):
    # merge of 47 & 63 should be union of mins; here, sigs are in same file.
    multisig = utils.get_test_data('47+63-multisig.sig')
    sig47and63 = utils.get_test_data('47+63.fa.sig')
    c.run_sourmash('sig', 'merge', multisig, '--flatten')

    # stdout should be new signature
    out = c.last_result.out

    test_merge_sig = sourmash.load_one_signature(sig47and63)
    actual_merge_sig = sourmash.load_one_signature(out)

    print(test_merge_sig.minhash)
    print(actual_merge_sig.minhash)
    print(out)

    assert actual_merge_sig.minhash == test_merge_sig.minhash
def test_sig_rename_1(c):
    # set new name for 47
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'rename', sig47, 'fiz bar')

    # stdout should be new signature
    out = c.last_result.out

    test_rename_sig = sourmash.load_one_signature(sig47)
    actual_rename_sig = sourmash.load_one_signature(out)

    print(test_rename_sig.minhash)
    print(actual_rename_sig.minhash)

    assert actual_rename_sig.minhash == test_rename_sig.minhash
    assert test_rename_sig.name() != actual_rename_sig.name()
    assert actual_rename_sig.name() == 'fiz bar'
Exemple #31
0
def test_search_db_scaled_lt_sig_scaled():
    dbfile = utils.get_test_data('lca/47+63.lca.json')
    db, ksize, scaled = lca_utils.load_single_database(dbfile)
    sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'))
    sig.minhash = sig.minhash.downsample_scaled(100000)

    with pytest.raises(ValueError) as e:
        results = db.search(sig, threshold=.01, ignore_abundance=True)
def test_sig_subtract_1(c):
    # subtract of 63 from 47
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    c.run_sourmash('sig', 'subtract', sig47, sig63)

    # stdout should be new signature
    out = c.last_result.out

    test1_sig = sourmash.load_one_signature(sig47)
    test2_sig = sourmash.load_one_signature(sig63)
    actual_subtract_sig = sourmash.load_one_signature(out)

    mins = set(test1_sig.minhash.get_mins())
    mins -= set(test2_sig.minhash.get_mins())

    assert set(actual_subtract_sig.minhash.get_mins()) == set(mins)
def test_sig_intersect_1(c):
    # intersect of 47 and 63 should be intersection of mins
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    sig47and63 = utils.get_test_data('47+63-intersect.fa.sig')
    c.run_sourmash('sig', 'intersect', sig47, sig63)

    # stdout should be new signature
    out = c.last_result.out

    test_intersect_sig = sourmash.load_one_signature(sig47and63)
    actual_intersect_sig = sourmash.load_one_signature(out)

    print(test_intersect_sig.minhash)
    print(actual_intersect_sig.minhash)
    print(out)

    assert actual_intersect_sig.minhash == test_intersect_sig.minhash
def test_sig_merge_1(c):
    # merge of 47 & 63 should be union of mins
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    sig47and63 = utils.get_test_data('47+63.fa.sig')
    c.run_sourmash('sig', 'merge', sig47, sig63)

    # stdout should be new signature
    out = c.last_result.out

    test_merge_sig = sourmash.load_one_signature(sig47and63)
    actual_merge_sig = sourmash.load_one_signature(out)

    print(test_merge_sig.minhash)
    print(actual_merge_sig.minhash)
    print(out)

    assert actual_merge_sig.minhash == test_merge_sig.minhash
def test_sig_merge_1_ksize_moltype(c):
    # check ksize, moltype args
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    sig47and63 = utils.get_test_data('47+63.fa.sig')
    c.run_sourmash('sig', 'merge', sig47, sig63, '--dna', '-k', '31')

    # stdout should be new signature
    out = c.last_result.out

    test_merge_sig = sourmash.load_one_signature(sig47and63)
    actual_merge_sig = sourmash.load_one_signature(out)

    print(test_merge_sig.minhash)
    print(actual_merge_sig.minhash)
    print(out)

    assert actual_merge_sig.minhash == test_merge_sig.minhash
Exemple #36
0
def test_sourmash_signature_api():
    e = sourmash.MinHash(n=1, ksize=20)
    sig = sourmash.SourmashSignature(e)

    s = sourmash.save_signatures([sig])
    sig_x1 = sourmash.load_one_signature(s)
    sig_x2 = list(sourmash.load_signatures(s))[0]

    assert sig_x1 == sig
    assert sig_x2 == sig
def test_sig_merge_flatten_2(c):
    # merge of 47 with abund, with 63 with, will succeed with --flatten
    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')
    sig47and63 = utils.get_test_data('47+63.fa.sig')

    c.run_sourmash('sig', 'merge', sig63, sig47abund, '--flatten')

    print(c.last_result)
    out = c.last_result.out

    test_merge_sig = sourmash.load_one_signature(sig47and63)
    actual_merge_sig = sourmash.load_one_signature(out)

    print(test_merge_sig.minhash)
    print(actual_merge_sig.minhash)
    print(out)

    assert actual_merge_sig.minhash == test_merge_sig.minhash
def test_sig_intersect_2(c):
    # intersect of 47 with abund and 63 with abund should be same
    # as without abund, i.e. intersect 'flattens'
    sig47 = utils.get_test_data('track_abund/47.fa.sig')
    sig63 = utils.get_test_data('track_abund/63.fa.sig')
    sig47and63 = utils.get_test_data('47+63-intersect.fa.sig')
    c.run_sourmash('sig', 'intersect', sig47, sig63)

    # stdout should be new signature
    out = c.last_result.out

    test_intersect_sig = sourmash.load_one_signature(sig47and63)
    actual_intersect_sig = sourmash.load_one_signature(out)

    print(test_intersect_sig.minhash)
    print(actual_intersect_sig.minhash)
    print(out)

    assert actual_intersect_sig.minhash == test_intersect_sig.minhash
def test_sig_intersect_2_multisig(c):
    # intersect of all the multisig stuff should be nothing
    sig47 = utils.get_test_data('47+63-multisig.sig')
    c.run_sourmash('sig', 'intersect', sig47)

    # stdout should be new signature
    out = c.last_result.out

    actual_intersect_sig = sourmash.load_one_signature(out)

    assert not len(actual_intersect_sig.minhash)
def test_sig_downsample_2_num_to_scaled(c):
    # downsample a num signature and convert it into a scaled sig
    sigs11 = utils.get_test_data('genome-s11.fa.gz.sig')
    c.run_sourmash('sig', 'downsample', '--scaled', '10000',
                   '-k', '21', '--dna', sigs11)

    # stdout should be new signature
    out = c.last_result.out

    test_downsample_sig = sourmash.load_one_signature(sigs11, ksize=21,
                                                      select_moltype='DNA')
    actual_downsample_sig = sourmash.load_one_signature(out)

    test_mins = test_downsample_sig.minhash.get_mins()
    actual_mins = actual_downsample_sig.minhash.get_mins()

    # select those mins that are beneath the new max hash...
    max_hash = actual_downsample_sig.minhash.max_hash
    test_mins_down = { k for k in test_mins if k < max_hash }
    assert test_mins_down == set(actual_mins)
def test_sig_downsample_1_scaled_to_num(c):
    # downsample a scaled signature
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'downsample', '--num', '500', sig47)

    # stdout should be new signature
    out = c.last_result.out

    actual_downsample_sig = sourmash.load_one_signature(out)
    actual_mins = actual_downsample_sig.minhash.get_mins()
    actual_mins = list(actual_mins)
    actual_mins.sort()

    test_downsample_sig = sourmash.load_one_signature(sig47)
    test_mins = test_downsample_sig.minhash.get_mins()
    test_mins = list(test_mins)
    test_mins.sort()
    test_mins = test_mins[:500]           # take 500 smallest

    assert actual_mins == test_mins
def test_sig_rename_2_output_to_same(c):
    # change name of signature "in place", same output file
    sig47 = utils.get_test_data('47.fa.sig')
    inplace = c.output('inplace.sig')
    shutil.copyfile(sig47, inplace)

    print(inplace)

    c.run_sourmash('sig', 'rename', '-d', inplace, 'fiz bar', '-o', inplace)

    actual_rename_sig = sourmash.load_one_signature(inplace)
    assert actual_rename_sig.name() == 'fiz bar'
def test_sig_subtract_1_multisig(c):
    # subtract of everything from 47
    sig47 = utils.get_test_data('47.fa.sig')
    multisig = utils.get_test_data('47+63-multisig.sig')
    c.run_sourmash('sig', 'subtract', sig47, multisig, '--flatten')

    # stdout should be new signature
    out = c.last_result.out

    actual_subtract_sig = sourmash.load_one_signature(out)

    assert not set(actual_subtract_sig.minhash.get_mins())
def test_sig_flatten_1(c):
    # extract matches to several names from among several signatures & flatten
    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'flatten', sig47abund, '--name', 'Shewanella')

    # stdout should be new signature
    out = c.last_result.out

    siglist = sourmash.load_signatures(out)
    siglist = list(siglist)

    assert len(siglist) == 1

    test_flattened = sourmash.load_one_signature(sig47)
    assert test_flattened.minhash == siglist[0].minhash
Exemple #45
0
def overlap(args):
    """
    provide detailed comparison of two signatures
    """
    p = SourmashArgumentParser(prog='sourmash signature overlap')
    p.add_argument('signature1')
    p.add_argument('signature2')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')

    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)

    moltype = sourmash_args.calculate_moltype(args)

    sig1 = sourmash.load_one_signature(args.signature1, ksize=args.ksize,
                                       select_moltype=moltype)
    sig2 = sourmash.load_one_signature(args.signature2, ksize=args.ksize,
                                       select_moltype=moltype)

    notify('loaded one signature each from {} and {}', args.signature1,
           args.signature2)

    try:
        similarity = sig1.similarity(sig2)
    except ValueError:
        raise

    cont1 = sig1.contained_by(sig2)
    cont2 = sig2.contained_by(sig1)

    sig1_file = args.signature1
    sig2_file = args.signature2

    name1 = sig1.name()
    name2 = sig2.name()

    md5_1 = sig1.md5sum()
    md5_2 = sig2.md5sum()

    ksize = sig1.minhash.ksize
    moltype = 'DNA'
    if sig1.minhash.is_protein:
        moltype = 'protein'

    num = sig1.minhash.num
    size1 = len(sig1.minhash)
    size2 = len(sig2.minhash)

    scaled = sig1.minhash.scaled

    hashes_1 = set(sig1.minhash.get_mins())
    hashes_2 = set(sig2.minhash.get_mins())

    num_common = len(hashes_1.intersection(hashes_2))
    disjoint_1 = len(hashes_1 - hashes_2)
    disjoint_2 = len(hashes_2 - hashes_1)
    num_union = len(hashes_1.union(hashes_2))

    print('''\
first signature:
  signature filename: {sig1_file}
  signature: {name1}
  md5: {md5_1}
  k={ksize} molecule={moltype} num={num} scaled={scaled}

second signature:
  signature filename: {sig2_file}
  signature: {name2}
  md5: {md5_2}
  k={ksize} molecule={moltype} num={num} scaled={scaled}

similarity:                  {similarity:.5f}
first contained in second:   {cont1:.5f}
second contained in first:   {cont2:.5f}

number of hashes in first:   {size1}
number of hashes in second:  {size2}

number of hashes in common:  {num_common}
only in first:               {disjoint_1}
only in second:              {disjoint_2}
total (union):               {num_union}
'''.format(**locals()))