Ejemplo n.º 1
0
def test_jaccard_on_real_data():
    """Compare two real-genome sketches at progressively smaller sizes.

    Both signatures come from the ``n10000`` test-data directory. The
    comparison is asserted in both directions, first on the sketches as
    loaded and then after each ``downsample_n`` step.
    """
    from sourmash.signature import load_signatures

    path_a = utils.get_test_data(
        'n10000/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz')
    mh1 = list(load_signatures(path_a))[0].minhash

    path_b = utils.get_test_data(
        'n10000/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz')
    mh2 = list(load_signatures(path_b))[0].minhash

    # Sketches exactly as stored in the test data.
    assert mh1.compare(mh2) == 0.0183
    assert mh2.compare(mh1) == 0.0183

    # Each step shrinks the sketches produced by the previous step.
    for num, expected in ((1000, 0.011), (100, 0.01), (10, 0.0)):
        mh1 = mh1.downsample_n(num)
        mh2 = mh2.downsample_n(num)
        assert mh1.compare(mh2) == expected
        assert mh2.compare(mh1) == expected
Ejemplo n.º 2
0
def test_scaled_on_real_data_2():
    """Compare two ``scaled100`` sketches while downsampling by scaled value.

    The rounded comparison is asserted in both directions, first on the
    sketches as loaded and then after each ``downsample_scaled`` step.
    """
    from sourmash.signature import load_signatures

    path_a = utils.get_test_data(
        'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz')
    mh1 = list(load_signatures(path_a))[0].minhash

    path_b = utils.get_test_data(
        'scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz')
    mh2 = list(load_signatures(path_b))[0].minhash

    assert round(mh1.compare(mh2), 5) == 0.01644
    assert round(mh2.compare(mh1), 5) == 0.01644

    # (scaled value, rounding digits, expected rounded comparison);
    # each step downsamples the result of the previous one.
    steps = (
        (1000, 4, 0.0187),
        (10000, 3, 0.01),
        (100000, 2, 0.01),
    )
    for scaled, digits, expected in steps:
        mh1 = mh1.downsample_scaled(scaled)
        mh2 = mh2.downsample_scaled(scaled)
        assert round(mh1.compare(mh2), digits) == expected
        assert round(mh2.compare(mh1), digits) == expected
Ejemplo n.º 3
0
def test_do_sourmash_compute_name():
    """Check that ``--merge foo`` and ``--name foo`` both name the output 'foo'.

    ``sourmash compute`` is run twice on ``short.fa`` inside a temporary
    directory, once per option, writing ``foo.sig`` and ``foo2.sig``. Each
    output file must exist and its first signature must be named 'foo'.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('short.fa')
        status, out, err = utils.runscript('sourmash',
                                           ['compute', '-k', '31', '--merge', 'foo',
                                            testdata1, '-o', 'foo.sig'],
                                           in_directory=location)

        sigfile = os.path.join(location, 'foo.sig')
        assert os.path.exists(sigfile)

        sig = next(signature.load_signatures(sigfile))
        assert sig.name() == 'foo'

        status, out, err = utils.runscript('sourmash',
                                           ['compute', '-k', '31', '--name', 'foo',
                                            testdata1, '-o', 'foo2.sig'],
                                           in_directory=location)

        sigfile2 = os.path.join(location, 'foo2.sig')
        assert os.path.exists(sigfile2)

        # Load the output of the second run. Reloading `sigfile` here would
        # compare the first signature with itself and never check `--name`.
        sig2 = next(signature.load_signatures(sigfile2))
        assert sig2.name() == 'foo'
        assert sig.name() == sig2.name()
Ejemplo n.º 4
0
def test_jaccard_on_real_data():
    """Compare two real-genome sketches at progressively smaller sizes.

    Both signatures come from the ``n10000`` test-data directory. The
    comparison is asserted in both directions, first on the sketches as
    loaded and then after each ``downsample_n`` step.
    """
    from sourmash.signature import load_signatures

    path_a = utils.get_test_data(
        'n10000/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz')
    mh1 = list(load_signatures(path_a))[0].minhash

    path_b = utils.get_test_data(
        'n10000/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz')
    mh2 = list(load_signatures(path_b))[0].minhash

    # Sketches exactly as stored in the test data.
    assert mh1.compare(mh2) == 0.0183
    assert mh2.compare(mh1) == 0.0183

    # Each step shrinks the sketches produced by the previous step.
    for num, expected in ((1000, 0.011), (100, 0.01), (10, 0.0)):
        mh1 = mh1.downsample_n(num)
        mh2 = mh2.downsample_n(num)
        assert mh1.compare(mh2) == expected
        assert mh2.compare(mh1) == expected
Ejemplo n.º 5
0
def test_scaled_on_real_data_2():
    """Compare two ``scaled100`` sketches while downsampling by scaled value.

    The rounded comparison is asserted in both directions, first on the
    sketches as loaded and then after each ``downsample_scaled`` step.
    """
    from sourmash.signature import load_signatures

    path_a = utils.get_test_data(
        'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz')
    mh1 = list(load_signatures(path_a))[0].minhash

    path_b = utils.get_test_data(
        'scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz')
    mh2 = list(load_signatures(path_b))[0].minhash

    assert round(mh1.compare(mh2), 5) == 0.01644
    assert round(mh2.compare(mh1), 5) == 0.01644

    # (scaled value, rounding digits, expected rounded comparison);
    # each step downsamples the result of the previous one.
    steps = (
        (1000, 4, 0.0187),
        (10000, 3, 0.01),
        (100000, 2, 0.01),
    )
    for scaled, digits, expected in steps:
        mh1 = mh1.downsample_scaled(scaled)
        mh2 = mh2.downsample_scaled(scaled)
        assert round(mh1.compare(mh2), digits) == expected
        assert round(mh2.compare(mh1), digits) == expected
Ejemplo n.º 6
0
def test_contig_search(location):
    """Run ``search_or_gather.main`` and compare its outputs to saved copies.

    The run must return status 0 and produce four files under ``location``
    (two CSVs and two signature files). Each one is then compared, as a set,
    with the stored reference under ``test-data/intermediate/contig-search``.

    NOTE(review): ``location`` is presumably a fixture that also makes the
    run write into that directory -- confirm against the fixture definition.
    """
    # test for same results
    args = utils.Args()
    args.genome = utils.get_testfile("test-data/proteomes/GB_GCA_002691795.1_protein.100contigs.faa.gz")
    args.genome_sig = utils.get_testfile("test-data/intermediate/signatures/GB_GCA_002691795.1_protein.100contigs.faa.gz.sig")
    args.matches_sig = utils.get_testfile("test-data/intermediate/search/GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11.matches.sig")
    args.lineages_csv = utils.get_testfile("test-data/databases/gtdb-nine.lineages.csv")
    args.alphabet = "protein"
    args.ksize = 33
    args.output_prefix = "GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11"
    args.no_search = False
    args.gather = False
    args.no_search_contigs = False
    args.search_genome = False

    # Paths of the files the run is expected to produce.
    prefix = args.output_prefix
    search_csv = os.path.join(location, f"{prefix}.contigs.search.csv")
    ranksearch_csv = os.path.join(location, f"{prefix}.contigs.ranksearch.csv")
    search_matches = os.path.join(location, f"{prefix}.contigs.search.matches.sig")
    ranksearch_matches = os.path.join(location, f"{prefix}.contigs.ranksearch.matches.sig")

    assert search_or_gather.main(args) == 0

    for produced_path in (search_csv, ranksearch_csv,
                          search_matches, ranksearch_matches):
        assert os.path.exists(produced_path)

    # CSV outputs: compared through get_csv_set.
    csv_cases = (
        ("test-data/intermediate/contig-search/GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11.contigs.search.csv",
         search_csv),
        ("test-data/intermediate/contig-search/GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11.contigs.ranksearch.csv",
         ranksearch_csv),
    )
    for saved_name, produced_path in csv_cases:
        saved_path = utils.get_testfile(saved_name)
        with open(saved_path) as fp:
            saved_rows = get_csv_set(fp)
        with open(produced_path) as fp:
            produced_rows = get_csv_set(fp)
        assert saved_rows == produced_rows

    # Signature outputs: compared as sets of loaded signatures.
    sig_cases = (
        ("test-data/intermediate/contig-search/GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11.contigs.search.matches.sig",
         search_matches),
        ("test-data/intermediate/contig-search/GB_GCA_002691795.1_protein.100contigs.faa.gz.x.gtdb-nine.protein-k11.contigs.ranksearch.matches.sig",
         ranksearch_matches),
    )
    for saved_name, produced_path in sig_cases:
        saved_path = utils.get_testfile(saved_name)
        with open(saved_path) as handle:
            saved_sigs = set(sig.load_signatures(handle))
        with open(produced_path) as handle:
            produced_sigs = set(sig.load_signatures(handle))
        assert saved_sigs == produced_sigs
Ejemplo n.º 7
0
def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c):
    """Check ``--rna`` singleton sketches against the known-good DNA signature.

    Runs ``sourmash compute -k 21 --singleton --rna`` on ``ecoli.genes.fna``
    and requires the second signature (ordered by name) to have similarity
    1.0 with the first signature in ``benchmark.dna.sig``.
    """
    genes_fasta = utils.get_test_data('ecoli.genes.fna')
    c.run_sourmash('compute', '-k', '21', '--singleton', '--rna', genes_fasta)

    sig_path = c.output('ecoli.genes.fna.sig')
    assert os.path.exists(sig_path)

    loaded = list(signature.load_signatures(sig_path))
    # Exactly two signatures are expected; the unpacking fails otherwise.
    _first, second = sorted(loaded, key=lambda s: s.name())

    reference_path = utils.get_test_data('benchmark.dna.sig')
    reference = list(signature.load_signatures(reference_path))[0]

    assert second.similarity(reference) == 1.0
Ejemplo n.º 8
0
def test_do_sourmash_check_knowngood_dna_comparisons(c):
    """Check ``--dna`` singleton sketches against a known-good signature.

    The reference signature was calculated by
    utils/compute-dna-mh-another-way.py. The second computed signature
    (ordered by name) must have similarity 1.0 with it.
    """
    genes_fasta = utils.get_test_data('ecoli.genes.fna')
    c.run_sourmash('compute', '-k', '21', '--singleton', '--dna', genes_fasta)

    sig_path = c.output('ecoli.genes.fna.sig')
    assert os.path.exists(sig_path)

    loaded = list(signature.load_signatures(sig_path))
    # Exactly two signatures are expected; the unpacking fails otherwise.
    _first, second = sorted(loaded, key=lambda s: s.name())

    reference_path = utils.get_test_data('benchmark.dna.sig')
    reference = list(signature.load_signatures(reference_path))[0]

    assert second.similarity(reference) == 1.0
Ejemplo n.º 9
0
def test_do_sourmash_check_protein_comparisons():
    """Check 2 x 2 protein comparisons with E. coli genes.

    Sketches computed from a protein file (``--input-is-protein``) are
    compared with protein sketches computed from the nucleotide gene file
    (``--protein --no-dna``). Names and rounded similarities are asserted.
    """
    with utils.TempDirectory() as location:
        protein_fasta = utils.get_test_data('ecoli.faa')
        status, out, err = utils.runscript(
            'sourmash',
            ['compute', '-k', '21', '--input-is-protein', '--singleton',
             protein_fasta],
            in_directory=location)
        aa_sigfile = os.path.join(location, 'ecoli.faa.sig')
        assert os.path.exists(aa_sigfile)

        genes_fasta = utils.get_test_data('ecoli.genes.fna')
        status, out, err = utils.runscript(
            'sourmash',
            ['compute', '-k', '21', '--protein', '--no-dna', '--singleton',
             genes_fasta],
            in_directory=location)
        trans_sigfile = os.path.join(location, 'ecoli.genes.fna.sig')
        assert os.path.exists(trans_sigfile)

        # Load order is not relied upon: each pair is sorted by name.
        aa_sigs = list(signature.load_signatures(aa_sigfile))
        sig1_aa, sig2_aa = sorted(aa_sigs, key=lambda s: s.name())

        trans_sigs = list(signature.load_signatures(trans_sigfile))
        sig1_trans, sig2_trans = sorted(trans_sigs, key=lambda s: s.name())

        # The first whitespace-delimited token of each name is the identifier.
        name1 = sig1_aa.name().split()[0]
        assert name1 == 'NP_414543.1'
        name2 = sig2_aa.name().split()[0]
        assert name2 == 'NP_414544.1'
        name3 = sig1_trans.name().split()[0]
        assert name3 == 'gi|556503834:2801-3733'
        name4 = sig2_trans.name().split()[0]
        assert name4 == 'gi|556503834:337-2799'

        # (protein sig, translated sig, protein label, translated label,
        #  expected similarity rounded to 3 digits)
        comparisons = (
            (sig1_aa, sig1_trans, name1, name3, 0.0),
            (sig2_aa, sig1_trans, name2, name3, 0.166),
            (sig1_aa, sig2_trans, name1, name4, 0.174),
            (sig2_aa, sig2_trans, name2, name4, 0.0),
        )

        # Report all four values first, then assert them.
        for prot, trans, prot_label, trans_label, _ in comparisons:
            print(prot_label, trans_label, round(prot.similarity(trans), 3))

        for prot, trans, _, _, expected in comparisons:
            assert round(prot.similarity(trans), 3) == expected
Ejemplo n.º 10
0
def test_binary_nary_tree():
    """Trees built with different ``d`` values must return the same matches.

    Three SBTs are filled with the same leaves: one built without ``d``
    (stored under key 2), one with ``d=5`` and one with ``d=10``. The last
    leaf added is then used as the query for all three.
    """
    factory = GraphFactory(31, 1e5, 4)
    trees = {
        2: SBT(factory),
        5: SBT(factory, d=5),
        10: SBT(factory, d=10),
    }

    n_leaves = 0
    for sig_file in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sig_file)))
        leaf = SigLeaf(os.path.basename(sig_file), loaded)
        for sbt in trees.values():
            sbt.add_node(leaf)
        # Keep the most recent leaf as the query.
        to_search = leaf
        n_leaves += 1

    # Every tree must hold every leaf.
    leaf_counts = [len(list(sbt.leaves())) for sbt in trees.values()]
    assert all(count == n_leaves for count in leaf_counts)

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    results = {
        d: {str(hit)
            for hit in sbt.find(search_minhashes, to_search.data, 0.1)}
        for d, sbt in trees.items()
    }
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
Ejemplo n.º 11
0
def test_do_sourmash_compute_10x_barcode():
    """Compute protein sketches from a 10x BAM restricted by a barcodes file.

    Skipped when ``bam2fasta`` is not importable. Expects 16 signatures, and
    the barcode prefix of every signature name to be listed in barcodes.tsv.
    """
    pytest.importorskip('bam2fasta')

    with utils.TempDirectory() as location:
        bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
        barcodes_file = utils.get_test_data('10x-example/barcodes.tsv')
        command = [
            'compute', '-k', '21',
            '--line-count', '50',
            '--input-is-10x',
            '--protein',
            '--barcodes-file', barcodes_file,
            bam_path,
        ]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)

        sigfile = os.path.join(location, 'possorted_genome_bam.bam.sig')
        assert os.path.exists(sigfile)

        siglist = list(signature.load_signatures(sigfile))
        assert len(siglist) == 16

        # The barcode is the part of the signature name before the first "_".
        sig_barcodes = {s.name().split("_")[0] for s in siglist}

        with open(utils.get_test_data('10x-example/barcodes.tsv')) as f:
            true_barcodes = {line.strip() for line in f}

        # Every barcode that got a signature must be listed in barcodes.tsv.
        assert sig_barcodes <= true_barcodes
Ejemplo n.º 12
0
def test_tree_save_load(n_children):
    """A saved and reloaded SBT must return the same search results.

    ``n_children`` is passed to ``SBT`` as ``d``. The last leaf added is used
    as the query before saving and again after loading.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for sig_file in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sig_file)))
        leaf = SigLeaf(os.path.basename(sig_file), loaded)
        tree.add_node(leaf)
        # Keep the most recent leaf as the query.
        to_search = leaf

    def search_and_report(sbt):
        # Print a banner and the query metadata, search, then print the hits.
        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits = {
            str(hit)
            for hit in sbt.find(search_minhashes, to_search.data, 0.1)
        }
        print(*hits, sep='\n')
        return hits

    old_result = search_and_report(tree)

    with utils.TempDirectory() as location:
        demo_path = os.path.join(location, 'demo')
        tree.save(demo_path)
        reloaded = SBT.load(os.path.join(location, 'demo'),
                            leaf_loader=SigLeaf.load)

        new_result = search_and_report(reloaded)

        assert old_result == new_result
Ejemplo n.º 13
0
def test_sbt_tarstorage():
    """An SBT saved through ``TarStorage`` must search the same after loading.

    The tree is saved into ``tree.tar.gz`` and loaded back through a second
    ``TarStorage`` on the same archive. The last leaf added is the query.
    """
    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for sig_file in utils.SIG_FILES:
            loaded = next(
                signature.load_signatures(utils.get_test_data(sig_file)))
            leaf = SigLeaf(os.path.basename(sig_file), loaded)
            tree.add_node(leaf)
            # Keep the most recent leaf as the query.
            to_search = leaf

        def search_and_report(sbt):
            # Print a banner and the query metadata, search, print the hits.
            print('*' * 60)
            print("{}:".format(to_search.metadata))
            hits = {str(hit) for hit in sbt.find(search_minhashes,
                                                 to_search.data, 0.1)}
            print(*hits, sep='\n')
            return hits

        old_result = search_and_report(tree)

        with TarStorage(os.path.join(location, 'tree.tar.gz')) as storage:
            tree.save(os.path.join(location, 'tree'), storage=storage)

        # The reloaded tree is searched while the storage is still open.
        with TarStorage(os.path.join(location, 'tree.tar.gz')) as storage:
            reloaded = SBT.load(os.path.join(location, 'tree'),
                                leaf_loader=SigLeaf.load,
                                storage=storage)

            new_result = search_and_report(reloaded)

            assert old_result == new_result
Ejemplo n.º 14
0
def test_do_sourmash_compute_multik_with_dayhoff_hp_dna_protein():
    """Compute k=21,30 with ``--dayhoff --hp --protein`` and count the results.

    Eight signatures are expected: two each for the DNA, dayhoff, hp and
    protein molecule types, covering both k sizes.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('short.fa')
        command = [
            'compute', '-k', '21,30', '--dayhoff', '--hp', '--protein',
            testdata1
        ]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)
        outfile = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(outfile)

        with open(outfile, 'rt') as fp:
            sigdata = fp.read()

        # The signatures are parsed from the text that was read.
        siglist = list(signature.load_signatures(sigdata))
        assert len(siglist) == 8

        ksizes = {s.minhash.ksize for s in siglist}
        assert 21 in ksizes
        assert 30 in ksizes

        # Each molecule type must account for exactly two signatures.
        for moltype in ('DNA', 'dayhoff', 'hp', 'protein'):
            matching = sum(s.minhash.is_molecule_type(moltype)
                           for s in siglist)
            assert matching == 2
Ejemplo n.º 15
0
def test_binary_nary_tree():
    """Trees built with different ``d`` values must return the same matches.

    Three SBTs are filled with the same leaves: one built without ``d``
    (stored under key 2), one with ``d=5`` and one with ``d=10``. The last
    leaf added is then used as the query for all three.
    """
    factory = GraphFactory(31, 1e5, 4)
    trees = {
        2: SBT(factory),
        5: SBT(factory, d=5),
        10: SBT(factory, d=10),
    }

    n_leaves = 0
    for sig_file in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sig_file)))
        leaf = SigLeaf(os.path.basename(sig_file), loaded)
        for sbt in trees.values():
            sbt.add_node(leaf)
        # Keep the most recent leaf as the query.
        to_search = leaf
        n_leaves += 1

    # Every tree must hold every leaf.
    leaf_counts = [len(list(sbt.leaves())) for sbt in trees.values()]
    assert all(count == n_leaves for count in leaf_counts)

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    results = {
        d: {str(hit)
            for hit in sbt.find(search_minhashes, to_search.data, 0.1)}
        for d, sbt in trees.items()
    }
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
Ejemplo n.º 16
0
def test_sbt_combine(n_children):
    """Combining two partial trees must match a tree built from all leaves.

    The first four leaves go to ``tree_1`` and the rest to ``tree_2``; every
    leaf also goes to the reference ``tree``. After ``tree_1.combine(tree_2)``
    the leaf sets and search results must agree, and adding one more node
    must leave ``next_node`` at the expected position.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    n_leaves = 0
    for sig_file in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sig_file)))
        leaf = SigLeaf(os.path.basename(sig_file), loaded)
        tree.add_node(leaf)
        # First four leaves build tree_1, the remainder build tree_2.
        partial = tree_1 if n_leaves < 4 else tree_2
        partial.add_node(leaf)
        n_leaves += 1

    tree_1.combine(tree_2)

    combined_leaves = {str(node) for node in tree_1.leaves()}
    reference_leaves = {str(node) for node in tree.leaves()}

    assert len(combined_leaves) == n_leaves
    assert len(reference_leaves) == len(combined_leaves)
    assert combined_leaves == reference_leaves

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))
    combined_hits = {
        str(hit) for hit in tree_1.find(search_minhashes, to_search, 0.1)
    }
    reference_hits = {
        str(hit) for hit in tree.find(search_minhashes, to_search, 0.1)
    }
    assert combined_hits == reference_hits

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position:
    # take the first index that differs from the entry found there, or one
    # past the last index when no (non-zero) gap was found.
    next_empty = 0
    for position, entry in enumerate(tree_1.nodes):
        if position != entry:
            next_empty = position
            break
    if next_empty == 0:
        next_empty = position + 1

    # `leaf` is the last leaf built in the loop above.
    tree_1.add_node(leaf)
    assert tree_1.next_node == next_empty
Ejemplo n.º 17
0
def test_sbt_combine(n_children):
    """Combining two partial trees must match a tree built from all leaves.

    The first four leaves go to ``tree_1`` and the rest to ``tree_2``; every
    leaf also goes to the reference ``tree``. After ``tree_1.combine(tree_2)``
    the leaf sets and search results must agree, and adding one more node
    must leave ``next_node`` at the expected position.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    n_leaves = 0
    for sig_file in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sig_file)))
        leaf = SigLeaf(os.path.basename(sig_file), loaded)
        tree.add_node(leaf)
        # First four leaves build tree_1, the remainder build tree_2.
        partial = tree_1 if n_leaves < 4 else tree_2
        partial.add_node(leaf)
        n_leaves += 1

    tree_1.combine(tree_2)

    combined_leaves = {str(node) for node in tree_1.leaves()}
    reference_leaves = {str(node) for node in tree.leaves()}

    assert len(combined_leaves) == n_leaves
    assert len(reference_leaves) == len(combined_leaves)
    assert combined_leaves == reference_leaves

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))
    combined_hits = {
        str(hit) for hit in tree_1.find(search_minhashes, to_search, 0.1)
    }
    reference_hits = {
        str(hit) for hit in tree.find(search_minhashes, to_search, 0.1)
    }
    assert combined_hits == reference_hits

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position:
    # take the first index that differs from the entry found there, or one
    # past the last index when no (non-zero) gap was found.
    next_empty = 0
    for position, entry in enumerate(tree_1.nodes):
        if position != entry:
            next_empty = position
            break
    if next_empty == 0:
        next_empty = position + 1

    # `leaf` is the last leaf built in the loop above.
    tree_1.add_node(leaf)
    assert tree_1.next_node == next_empty
Ejemplo n.º 18
0
def test_load_minified(track_abundance):
    """Re-saved signatures must be shorter than the file and contain no newline.

    ``track_abundance`` is accepted but not used in the body.
    """
    sigfile = utils.get_test_data('genome-s10+s11.sig')
    minified = save_signatures(load_signatures(sigfile))

    with open(sigfile, 'r') as handle:
        original_text = handle.read()

    assert len(minified) < len(original_text)
    assert '\n' not in minified
Ejemplo n.º 19
0
def test_load_minified(track_abundance):
    """Re-saved signatures must be shorter than the file and contain no newline.

    ``track_abundance`` is accepted but not used in the body.
    """
    sigfile = utils.get_test_data('genome-s10+s11.sig')
    minified = save_signatures(load_signatures(sigfile))

    with open(sigfile, 'r') as handle:
        original_text = handle.read()

    assert len(minified) < len(original_text)
    assert '\n' not in minified
Ejemplo n.º 20
0
def test_load_textmode(track_abundance):
    """Signatures must still load from a file handle opened in text mode."""
    # ijson requires a file in binary mode or bytes,
    # but we had an API example in the docs using 'rt'.
    # The docs were fixed, but this test stays
    # to make sure text mode is still supported.
    path = utils.get_test_data('genome-s10+s11.sig')
    with open(path, 'rt') as text_fp:
        loaded = list(signature.load_signatures(text_fp))

    first_sig = loaded[0]
    assert first_sig.name() == 's10+s11'
Ejemplo n.º 21
0
def test_do_sourmash_compute_outdir(c):
    """``compute --outdir`` must write ``short.fa.sig`` into ``c.location``."""
    fasta = utils.get_test_data('short.fa')
    command = ['compute', '-k', '31', fasta, '--outdir', c.location]
    status, out, err = utils.runscript('sourmash', command)

    sigfile = os.path.join(c.location, 'short.fa.sig')
    assert os.path.exists(sigfile)

    first_sig = next(signature.load_signatures(sigfile))
    assert first_sig.name().endswith('short.fa')
Ejemplo n.º 22
0
def test_load_textmode(track_abundance):
    """Signatures must still load from a file handle opened in text mode."""
    # ijson requires a file in binary mode or bytes,
    # but we had an API example in the docs using 'rt'.
    # The docs were fixed, but this test stays
    # to make sure text mode is still supported.
    path = utils.get_test_data('genome-s10+s11.sig')
    with open(path, 'rt') as text_fp:
        loaded = list(signature.load_signatures(text_fp))

    first_sig = loaded[0]
    assert first_sig.name() == 's10+s11'
Ejemplo n.º 23
0
def test_roundtrip(track_abundance):
    """A saved and reloaded signature must be fully similar to the original."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    # The reloaded sketch is fetched but not inspected further here.
    restored_mh = restored.minhash

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
Ejemplo n.º 24
0
def test_load_compressed(track_abundance):
    """Check loading of compressed signature data, in memory and from a file.

    First a signature saved with ``compression=5`` is loaded back with
    ``load_one_signature`` and must equal the original. Then the gzipped
    test file ``genome-s10+s11.sig.gz`` is loaded and must yield at least
    one signature.
    """
    e1 = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig1 = SourmashSignature(e1)

    x = save_signatures([sig1], compression=5)

    y = load_one_signature(x)
    assert sig1 == y

    sigfile = utils.get_test_data('genome-s10+s11.sig.gz')
    # load_signatures appears to be lazy (other tests drive it with next()),
    # so the result has to be consumed; otherwise the gzipped file is never
    # read and this part of the test cannot fail.
    sigs = list(load_signatures(sigfile))
    assert sigs
Ejemplo n.º 25
0
def test_do_sourmash_check_knowngood_protein_comparisons():
    """Check translated-protein sketches against a known-good signature.

    The reference signature was calculated by
    utils/compute-prot-mh-another-way.py. The second computed signature
    (ordered by name) must have similarity 1.0 with it.
    """
    with utils.TempDirectory() as location:
        genes_fasta = utils.get_test_data('ecoli.genes.fna')
        command = [
            'compute', '-k', '21', '--singleton', '--protein', '--no-dna',
            genes_fasta
        ]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)

        sig_path = os.path.join(location, 'ecoli.genes.fna.sig')
        assert os.path.exists(sig_path)

        loaded = list(signature.load_signatures(sig_path))
        # Exactly two signatures are expected; the unpacking fails otherwise.
        _first_trans, second_trans = sorted(loaded, key=lambda s: s.name())

        reference_path = utils.get_test_data('benchmark.prot.sig')
        reference = list(signature.load_signatures(reference_path))[0]

        assert second_trans.similarity(reference) == 1.0
Ejemplo n.º 26
0
def test_roundtrip(track_abundance):
    """A saved and reloaded signature must be fully similar to the original."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    # The reloaded sketch is fetched but not inspected further here.
    restored_mh = restored.minhash

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
Ejemplo n.º 27
0
def test_roundtrip_empty(track_abundance):
    """Round-trip a signature with an empty sketch; similarity must be 0."""
    # edge case: nothing is ever added to this MinHash
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    # The reloaded sketch is fetched but not inspected further here.
    restored_mh = restored.minhash

    assert original.similarity(restored) == 0
    assert restored.similarity(original) == 0
Ejemplo n.º 28
0
def test_roundtrip_empty(track_abundance):
    """Round-trip a signature with an empty sketch; similarity must be 0."""
    # edge case: nothing is ever added to this MinHash
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    # The reloaded sketch is fetched but not inspected further here.
    restored_mh = restored.minhash

    assert original.similarity(restored) == 0
    assert restored.similarity(original) == 0
Ejemplo n.º 29
0
def test_do_sourmash_compute():
    """``compute -k 31`` on ``short.fa`` must write ``short.fa.sig``.

    The first signature in the output must have a name ending in 'short.fa'.
    """
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        command = ['compute', '-k', '31', fasta]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)

        sigfile = os.path.join(location, 'short.fa.sig')
        assert os.path.exists(sigfile)

        first_sig = next(signature.load_signatures(sigfile))
        assert first_sig.name().endswith('short.fa')
Ejemplo n.º 30
0
def test_roundtrip_max_hash(track_abundance):
    """``max_hash`` must survive a save/load round trip of the signature."""
    mh = sourmash.MinHash(n=0, ksize=20, track_abundance=track_abundance,
                          max_hash=10)
    mh.add_hash(5)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    restored = list(load_signatures(serialized))[0]
    restored_mh = restored.minhash

    assert mh.max_hash == restored_mh.max_hash

    assert original.similarity(restored) == 1.0
    assert restored.similarity(original) == 1.0
Ejemplo n.º 31
0
def test_do_sourmash_compute_multik_outfile():
    """``compute -k 21,31 -o FILE`` must write both k sizes to that file."""
    with utils.TempDirectory() as location:
        fasta = utils.get_test_data('short.fa')
        outfile = os.path.join(location, 'FOO.xxx')
        command = ['compute', '-k', '21,31', fasta, '-o', outfile]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)
        assert os.path.exists(outfile)

        siglist = list(signature.load_signatures(outfile))
        assert len(siglist) == 2

        ksizes = {s.minhash.ksize for s in siglist}
        assert 21 in ksizes
        assert 31 in ksizes
Ejemplo n.º 32
0
def test_do_sourmash_compute_10x_filter_umis():
    """Compute DNA sketches from a 10x BAM with read filtering and renaming.

    Skipped when ``bam2fasta`` is not importable. Checks the signature file
    (one signature), the barcode metadata CSV (nine lines) and the single
    saved FASTA file with its record names and sequences.
    """
    pytest.importorskip('bam2fasta')
    with utils.TempDirectory() as location:
        bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')
        fastas_dir = os.path.join(location, "fastas")
        if not os.path.exists(fastas_dir):
            os.makedirs(fastas_dir)

        command = [
            'compute', '-k', '31',
            '--dna', '--count-valid-reads', '10',
            '--input-is-10x',
            bam_path,
            '--write-barcode-meta-csv', csv_path,
            '--barcodes', barcodes_path,
            '--rename-10x-barcodes', renamer_path,
            '--save-fastas', fastas_dir,
            '-o', '10x-example_dna.sig',
        ]
        status, out, err = utils.runscript('sourmash', command,
                                           in_directory=location)

        sigfile = os.path.join(location, '10x-example_dna.sig')
        assert os.path.exists(sigfile)
        siglist = list(signature.load_signatures(sigfile))
        assert len(siglist) == 1
        # TODO PV This seems to randomly fail/pass - commenting out for now
        # but the min hashes should never be empty
        # min_hashes = [x.minhash.get_mins() for x in siglist]
        # assert all(mins != [] for mins in min_hashes)

        # Only the number of lines in the metadata CSV is checked.
        with open(csv_path, 'rb') as csv_fp:
            rows = [raw_line.split() for raw_line in csv_fp]
        assert len(rows) == 9

        fasta_files = os.listdir(fastas_dir)
        barcodes = [fname.replace(".fasta", "") for fname in fasta_files]
        assert len(barcodes) == 1
        assert len(fasta_files) == 1
        assert barcodes[0] == 'lung_epithelial_cell|AAATGCCCAAACTGCT-1'

        fasta_file_name = os.path.join(fastas_dir, fasta_files[0])
        # Records are tallied as they are read; the tally itself is not
        # asserted.
        count = 0
        for record in screed.open(fasta_file_name):
            record_name = record.name
            record_seq = record.sequence
            count += 1
            assert record_name.startswith(
                'lung_epithelial_cell|AAATGCCCAAACTGCT-1')
            assert record_seq.count(">") == 0
            assert record_seq.count("X") == 0
Ejemplo n.º 33
0
def test_save_minified(track_abundance):
    """Saved signatures must be a single line and load back with their names."""
    mh_foo = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig_foo = SourmashSignature(mh_foo, name="foo")

    mh_bar = sourmash.MinHash(n=1, ksize=25, track_abundance=track_abundance)
    sig_bar = SourmashSignature(mh_bar, name="bar baz")

    serialized = save_signatures([sig_foo, sig_bar])
    assert '\n' not in serialized
    assert len(serialized.split('\n')) == 1

    loaded = list(load_signatures(serialized))
    assert len(loaded) == 2
    # Load order is not relied upon, so only membership is checked.
    assert any(item.name() == 'foo' for item in loaded)
    assert any(item.name() == 'bar baz' for item in loaded)
Ejemplo n.º 34
0
def test_do_sourmash_compute_multik_only_protein(c):
    """Check ``sourmash compute`` with only protein, no nucleotide sketches.

    With ``-k 21,30 --protein --no-dna`` exactly two signatures are expected,
    one per k size.
    """
    fasta = utils.get_test_data('short.fa')
    c.run_sourmash('compute', '-k', '21,30',
                   '--protein', '--no-dna', fasta)
    outfile = os.path.join(c.location, 'short.fa.sig')
    assert os.path.exists(outfile)

    with open(outfile, 'rt') as fp:
        sigdata = fp.read()

    # The signatures are parsed from the text that was read.
    siglist = list(signature.load_signatures(sigdata))
    assert len(siglist) == 2

    ksizes = {s.minhash.ksize for s in siglist}
    assert 21 in ksizes
    assert 30 in ksizes
Ejemplo n.º 35
0
def test_save_minified(track_abundance):
    """Verify ``save_signatures`` output for two signatures is one line.

    Builds signatures named ``"foo"`` and ``"bar baz"``, serializes them
    together, asserts the text has no newline, then reloads it and asserts
    that both names are present among the two loaded signatures.
    """
    minhash_k20 = sourmash.MinHash(n=1, ksize=20,
                                   track_abundance=track_abundance)
    sig_foo = SourmashSignature(minhash_k20, name="foo")

    minhash_k25 = sourmash.MinHash(n=1, ksize=25,
                                   track_abundance=track_abundance)
    sig_bar = SourmashSignature(minhash_k25, name="bar baz")

    text = save_signatures([sig_foo, sig_bar])
    assert '\n' not in text
    assert len(text.split('\n')) == 1

    loaded = list(load_signatures(text))
    assert len(loaded) == 2

    found_foo = any(sig.name() == 'foo' for sig in loaded)
    assert found_foo
    found_bar = any(sig.name() == 'bar baz' for sig in loaded)
    assert found_bar
Ejemplo n.º 36
0
def test_do_sourmash_compute_multik_with_protein():
    """Run ``compute -k 21,30 --protein`` on ``short.fa`` in a temp dir.

    The resulting ``short.fa.sig`` must exist and contain four signatures
    whose ksizes include both 21 and 30.
    """
    with utils.TempDirectory() as workdir:
        input_path = utils.get_test_data('short.fa')
        cli_args = ['compute', '-k', '21,30', '--protein', input_path]
        _status, _out, _err = utils.runscript('sourmash', cli_args,
                                              in_directory=workdir)

        sig_path = os.path.join(workdir, 'short.fa.sig')
        assert os.path.exists(sig_path)

        with open(sig_path, 'rt') as handle:
            loaded = list(signature.load_signatures(handle.read()))
            assert len(loaded) == 4

            seen_ksizes = {sig.minhash.ksize for sig in loaded}
            assert 21 in seen_ksizes
            assert 30 in seen_ksizes
Ejemplo n.º 37
0
def test_save_load_multisig(track_abundance):
    """Round-trip two distinct signatures through save/load.

    Both originals must be found among the loaded signatures, and the two
    originals must compare unequal to each other.
    """
    sig_k20 = SourmashSignature(
        sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance))
    sig_k25 = SourmashSignature(
        sourmash.MinHash(n=1, ksize=25, track_abundance=track_abundance))

    serialized = save_signatures([sig_k20, sig_k25])
    reloaded = list(load_signatures(serialized))

    print(serialized)

    assert len(reloaded) == 2
    # Membership rather than position: load order is not guaranteed.
    for original in (sig_k20, sig_k25):
        assert original in reloaded
    assert sig_k20 != sig_k25
Ejemplo n.º 38
0
def test_do_sourmash_compute_with_seed():
    """Run ``compute -k 21,31 --seed 43`` and check the stored seeds.

    The output file must exist and hold two signatures, and every
    signature's minhash must report seed 43.
    """
    with utils.TempDirectory() as workdir:
        input_path = utils.get_test_data('short.fa')
        sig_path = os.path.join(workdir, 'FOO.xxx')

        cli_args = ['compute', '-k', '21,31', '--seed', '43',
                    input_path, '-o', sig_path]
        _status, _out, _err = utils.runscript('sourmash', cli_args,
                                              in_directory=workdir)
        assert os.path.exists(sig_path)

        loaded = list(signature.load_signatures(sig_path))
        assert len(loaded) == 2

        seen_seeds = [sig.minhash.seed for sig in loaded]
        assert len(seen_seeds) == 2
        assert set(seen_seeds) == {43}
Ejemplo n.º 39
0
def test_do_sourmash_compute_multik_only_protein_no_rna(c):
    """Run ``compute -k 21,30 --protein --no-rna`` on ``short.fa``.

    The output file ``short.fa.sig`` must exist in ``c.location`` and hold
    exactly two signatures whose ksizes include both 21 and 30.
    """
    # test --no-rna as well (otherwise identical to previous test)
    input_path = utils.get_test_data('short.fa')

    c.run_sourmash('compute', '-k', '21,30',
                   '--protein', '--no-rna', input_path)

    sig_path = os.path.join(c.location, 'short.fa.sig')
    assert os.path.exists(sig_path)

    with open(sig_path, 'rt') as handle:
        loaded = list(signature.load_signatures(handle.read()))
        assert len(loaded) == 2

        seen_ksizes = {sig.minhash.ksize for sig in loaded}
        for expected in (21, 30):
            assert expected in seen_ksizes
Ejemplo n.º 40
0
def test_save_load_multisig(track_abundance):
    """Save two signatures together, load them back, and compare.

    Asserts that exactly two signatures are loaded, that each original is
    contained in the loaded list, and that the originals differ.
    """
    minhash_k20 = sourmash.MinHash(n=1, ksize=20,
                                   track_abundance=track_abundance)
    minhash_k25 = sourmash.MinHash(n=1, ksize=25,
                                   track_abundance=track_abundance)
    originals = [SourmashSignature(minhash_k20),
                 SourmashSignature(minhash_k25)]

    text = save_signatures(originals)
    loaded = list(load_signatures(text))

    print(text)

    assert len(loaded) == 2
    # order not guaranteed, so test membership only.
    assert originals[0] in loaded
    assert originals[1] in loaded
    assert originals[0] != originals[1]
Ejemplo n.º 41
0
def test_tree_v2_load():
    """Compare search results from ``v2.sbt.json`` and ``v3.sbt.json``.

    Both trees are searched with ``search_minhashes_containment`` at
    threshold 0.1 using the first signature of ``utils.SIG_FILES[0]``; the
    stringified result sets must match and contain four entries.
    """
    def _load(tree_file):
        return SBT.load(utils.get_test_data(tree_file),
                        leaf_loader=SigLeaf.load)

    def _containment_hits(tree, query):
        hits = tree.find(search_minhashes_containment, query, 0.1)
        return set(str(hit) for hit in hits)

    tree_v2 = _load('v2.sbt.json')
    tree_cur = _load('v3.sbt.json')

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    query_sig = next(signature.load_signatures(query_path))

    results_v2 = _containment_hits(tree_v2, query_sig)
    results_cur = _containment_hits(tree_cur, query_sig)

    assert results_v2 == results_cur
    assert len(results_v2) == 4
Ejemplo n.º 42
0
def test_do_sourmash_compute_10x_no_filter_umis():
    """Run ``compute --input-is-10x`` on the example BAM file.

    Skipped when ``bam2fasta`` cannot be imported. The signature file
    ``10x-example_dna.sig`` must exist and contain 32 signatures.
    """
    pytest.importorskip('bam2fasta')
    with utils.TempDirectory() as workdir:
        # test to check if all the lines in unfiltered_umi_to_sig are called
        # and tested
        barcode_csv = os.path.join(workdir, "all_barcodes_meta.csv")
        bam_path = utils.get_test_data(
            '10x-example/possorted_genome_bam_filtered.bam')

        cli_args = ['compute', '-k', '31', '--dna', '--input-is-10x']
        cli_args.append(bam_path)
        cli_args += ['--write-barcode-meta-csv', barcode_csv]
        cli_args += ['--save-fastas', workdir]
        cli_args += ['-o', '10x-example_dna.sig']
        _status, _out, _err = utils.runscript('sourmash', cli_args,
                                              in_directory=workdir)

        sig_path = os.path.join(workdir, '10x-example_dna.sig')
        assert os.path.exists(sig_path)

        loaded = list(signature.load_signatures(sig_path))
        assert len(loaded) == 32
Ejemplo n.º 43
0
def test_tree_repair():
    """Compare search results from ``leaves.sbt.json`` and ``v3.sbt.json``.

    Both trees are searched with ``search_minhashes`` at threshold 0.1
    using the first signature of ``utils.SIG_FILES[0]``; the stringified
    result sets must match and contain two entries.
    """
    def _load(tree_file):
        return SBT.load(utils.get_test_data(tree_file),
                        leaf_loader=SigLeaf.load)

    def _hits(tree, query):
        return set(str(hit)
                   for hit in tree.find(search_minhashes, query, 0.1))

    tree_repair = _load('leaves.sbt.json')
    tree_cur = _load('v3.sbt.json')

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    query_sig = next(signature.load_signatures(query_path))

    results_repair = _hits(tree_repair, query_sig)
    results_cur = _hits(tree_cur, query_sig)

    assert results_repair == results_cur
    assert len(results_repair) == 2
Ejemplo n.º 44
0
def test_tree_repair_add_node():
    """Add every ``utils.SIG_FILES`` signature to the ``leaves.sbt.json`` tree.

    Afterwards each non-root position must have a ``Node`` as its parent,
    and every ``Leaf`` must have only empty children.
    """
    tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'),
                           leaf_loader=SigLeaf.load)

    for sig_file in utils.SIG_FILES:
        sig_path = utils.get_test_data(sig_file)
        loaded_sig = next(signature.load_signatures(sig_path))
        tree_repair.add_node(SigLeaf(os.path.basename(sig_file), loaded_sig))

    for position, entry in list(tree_repair.nodes.items()):
        # Position 0 is the root and has no parent (None); every other
        # position must hang off an internal node, never off a leaf.
        if position != 0:
            parent = tree_repair.parent(position)
            assert isinstance(parent.node, Node)

        # Leaf nodes can't have children
        if isinstance(entry, Leaf):
            for child in tree_repair.children(position):
                assert child.node is None
Ejemplo n.º 45
0
def test_search_minhashes():
    """Search a freshly built SBT and check every hit meets the threshold.

    A tree is built from all ``utils.SIG_FILES`` signatures, the first leaf
    yielded by ``tree.leaves()`` is used as the query, and each result of a
    ``search_minhashes`` search at threshold 0.08 must have a similarity to
    the query of at least 0.08.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory)

    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree.add_node(leaf)

    to_search = next(iter(tree.leaves()))

    # this fails if 'search_minhashes' is calc containment and not similarity.
    results = tree.find(search_minhashes, to_search.data, 0.08)
    for leaf in results:
        assert to_search.data.similarity(leaf.data) >= 0.08

    print(results)
Ejemplo n.º 46
0
def test_distance_matrix(track_abundance):
    """Check that a mirrored half matrix equals the full similarity matrix.

    ``D2`` is filled from the pairs with ``i >= j`` and mirrored across the
    diagonal; ``D1`` is filled by computing every pair. The two matrices
    must be exactly equal.
    """
    import numpy

    siglist = []
    for sig_file in utils.SIG_FILES:
        sig_path = utils.get_test_data(sig_file)
        siglist.append(next(signature.load_signatures(sig_path)))

    size = len(siglist)
    D1 = numpy.zeros([size, size])
    D2 = numpy.zeros([size, size])

    # Half matrix: compute each pair once and write it to both cells.
    for row, row_sig in enumerate(siglist):
        for col, col_sig in enumerate(siglist):
            if row >= col:
                value = row_sig.similarity(col_sig, track_abundance)
                D2[row][col] = value
                D2[col][row] = value

    # Full matrix: compute every ordered pair independently.
    for row, row_sig in enumerate(siglist):
        for col, col_sig in enumerate(siglist):
            D1[row][col] = row_sig.similarity(col_sig, track_abundance)

    assert numpy.array_equal(D1, D2)
Ejemplo n.º 47
0
def test_save_sparseness(n_children):
    """Save a tree with ``sparseness=1.0`` and check the reloaded tree.

    The reloaded tree must hold no ``Node`` instances in ``nodes``, return
    the same search results as the in-memory tree, and keep the invariants
    that non-root parents are ``Node`` and leaves have only empty children.
    """
    tree = SBT(GraphFactory(31, 1e5, 4), d=n_children)

    for sig_file in utils.SIG_FILES:
        sig_path = utils.get_test_data(sig_file)
        loaded_sig = next(signature.load_signatures(sig_path))
        leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)
        tree.add_node(leaf)
        # The last leaf added ends up as the search query.
        query = leaf

    print('*' * 60)
    print("{}:".format(query.metadata))
    hits_before = tree.find(search_minhashes, query.data, 0.1)
    old_result = set(str(hit) for hit in hits_before)
    print(*old_result, sep='\n')

    with utils.TempDirectory() as workdir:
        demo_path = os.path.join(workdir, 'demo')
        tree.save(demo_path, sparseness=1.0)
        tree_loaded = SBT.load(demo_path, leaf_loader=SigLeaf.load)
        assert not any(isinstance(entry, Node)
                       for entry in tree_loaded.nodes.values())

        print('*' * 60)
        print("{}:".format(query.metadata))
        hits_after = tree_loaded.find(search_minhashes, query.data, 0.1)
        new_result = set(str(hit) for hit in hits_after)
        print(*new_result, sep='\n')

        assert old_result == new_result

        for position, entry in list(tree_loaded.nodes.items()):
            # Position 0 is the root and has no parent (None); every other
            # position must hang off an internal node, never off a leaf.
            if position != 0:
                parent = tree_loaded.parent(position)
                assert isinstance(parent.node, Node)

            # Leaf nodes can't have children
            if isinstance(entry, Leaf):
                for child in tree_loaded.children(position):
                    assert child.node is None
Ejemplo n.º 48
0
def test_sbt_ipfsstorage():
    """Save a tree through ``IPFSStorage``, reload it, and compare searches.

    Skipped when ``ipfsapi`` cannot be imported. If saving raises
    ``ipfsapi.exceptions.ConnectionError`` the test is marked as an expected
    failure via ``pytest.xfail``.
    """
    ipfsapi = pytest.importorskip('ipfsapi')

    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for f in utils.SIG_FILES:
            sig = next(signature.load_signatures(utils.get_test_data(f)))
            leaf = SigLeaf(os.path.basename(f), sig)
            tree.add_node(leaf)
            # Rebound every iteration, so the last leaf added is the query.
            to_search = leaf

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        # Search results from the in-memory tree, before saving.
        old_result = {str(s) for s in tree.find(search_minhashes,
                                                to_search.data, 0.1)}
        print(*old_result, sep='\n')

        try:
            with IPFSStorage() as storage:
                tree.save(os.path.join(location, 'tree'), storage=storage)
        except ipfsapi.exceptions.ConnectionError:
            pytest.xfail("ipfs not installed/functioning probably")

        with IPFSStorage() as storage:
            # Rebinds ``tree`` to the copy loaded back through the storage.
            tree = SBT.load(os.path.join(location, 'tree'),
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print('*' * 60)
            print("{}:".format(to_search.metadata))
            new_result = {str(s) for s in tree.find(search_minhashes,
                                                    to_search.data, 0.1)}
            print(*new_result, sep='\n')

            # The reloaded tree must return the same hits as the original.
            assert old_result == new_result