Ejemplo n.º 1
0
def grow_sbt(args):
    """Add one signature per input file to an SBT, then save the tree.

    ``args`` is read for: ``input_files``, ``input_is_directory``,
    ``subset_csv``, ``subset_info_colname``, ``ksize``, ``alphabet``,
    ``sbt``, ``load_existing_sbt``, ``scaled``, ``ignore_abundance`` and
    ``translate``.  The tree comes from ``create_sbt_or_load_existing`` and
    is written back with ``sbt.save(args.sbt)``.
    """
    input_files = args.input_files
    if args.input_is_directory:
        # Only the first entry is handed to the directory collector.
        input_files = collect_input_files_from_dir(
            input_files[0], args.subset_csv, args.subset_info_colname)

    ksize = args.ksize
    alphabet = args.alphabet
    # ksize is tripled for these three alphabets.
    if alphabet in ("protein", "dayhoff", "hp"):
        ksize *= 3

    sbt = create_sbt_or_load_existing(args.sbt, args.load_existing_sbt)

    for n, filename in enumerate(input_files):
        # Progress line on stderr for every 100th file (including the first).
        if not n % 100:
            sys.stderr.write(
                f"... loading {filename} file {n} of {len(input_files)}\n")

        sig = load_or_generate_sig_from_file(
            filename,
            alphabet,
            ksize,
            args.scaled,
            args.ignore_abundance,
            args.translate,
        )

        # Skip falsy signatures and signatures whose minhash is falsy.
        if not sig or not sig.minhash:
            continue
        sbt.add_node(SigLeaf(sig.md5sum(), sig))

    sbt.save(args.sbt)
Ejemplo n.º 2
0
def test_binary_nary_tree():
    """Trees built with the default ``d`` and with d=5 / d=10 must agree.

    Every signature in ``utils.SIG_FILES`` is added to all three trees; the
    test then checks that each tree reports the same number of leaves and
    that searching for the last-added leaf at threshold 0.1 yields the same
    set of results in every tree.
    """
    factory = GraphFactory(31, 1e5, 4)
    # Keys are only labels; the tree under key 2 uses SBT's default ``d``.
    trees = {
        2: SBT(factory),
        5: SBT(factory, d=5),
        10: SBT(factory, d=10),
    }

    n_leaves = 0
    for sig_file in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(sig_file)))
        leaf = SigLeaf(os.path.basename(sig_file), sig)
        for current_tree in trees.values():
            current_tree.add_node(leaf)
        # The last leaf added becomes the search query below.
        to_search = leaf
        n_leaves += 1

    leaf_counts = [len(list(t.leaves())) for t in trees.values()]
    assert all(count == n_leaves for count in leaf_counts)

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    results = {
        d: {str(s) for s in tree.find(search_minhashes, to_search.data, 0.1)}
        for d, tree in trees.items()
    }
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
Ejemplo n.º 3
0
def test_tree_save_load(n_children):
    """Search results must be unchanged after saving and reloading a tree.

    ``n_children`` is passed to ``SBT`` as ``d`` (presumably supplied by a
    pytest fixture or parametrization -- confirm against the test module).
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for sig_file in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(sig_file)))
        # The last leaf added is kept as the search query.
        to_search = SigLeaf(os.path.basename(sig_file), sig)
        tree.add_node(to_search)

    def search_and_report(sbt):
        # Print a banner and the query metadata, run the search, print hits.
        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits = {
            str(s)
            for s in sbt.find(search_minhashes, to_search.data, 0.1)
        }
        print(*hits, sep='\n')
        return hits

    old_result = search_and_report(tree)

    with utils.TempDirectory() as location:
        tree_path = os.path.join(location, 'demo')
        tree.save(tree_path)
        reloaded = SBT.load(tree_path, leaf_loader=SigLeaf.load)

        new_result = search_and_report(reloaded)

        assert old_result == new_result
Ejemplo n.º 4
0
def test_sbt_combine(n_children):
    """Combining two partial trees must match a tree built in one pass.

    The first four signatures go into ``tree_1`` and the remainder into
    ``tree_2``, while every signature also goes into ``tree``.  After
    ``tree_1.combine(tree_2)`` the leaves and search results of ``tree_1``
    must equal those of ``tree``.  Finally, adding one more node is expected
    to leave ``tree_1.next_node`` at the position computed by the scan below.

    ``n_children`` is passed to ``SBT`` as ``d``.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    n_leaves = 0
    for sig_file in utils.SIG_FILES:
        sig = load_one_signature(utils.get_test_data(sig_file))
        leaf = SigLeaf(os.path.basename(sig_file), sig)
        tree.add_node(leaf)
        # First four leaves go to tree_1, everything after that to tree_2.
        partial = tree_1 if n_leaves < 4 else tree_2
        partial.add_node(leaf)
        n_leaves += 1

    tree_1.combine(tree_2)

    t1_leaves = set(map(str, tree_1.leaves()))
    t_leaves = set(map(str, tree.leaves()))

    assert len(t1_leaves) == n_leaves
    assert len(t_leaves) == len(t1_leaves)
    assert t1_leaves == t_leaves

    first_file = utils.SIG_FILES[0]
    to_search = load_one_signature(utils.get_test_data(first_file))
    t1_result = {
        str(hit) for hit in tree_1.find(search_minhashes, to_search, 0.1)
    }
    tree_result = {
        str(hit) for hit in tree.find(search_minhashes, to_search, 0.1)
    }
    assert t1_result == tree_result

    # TODO: save and load both trees

    # Check that adding a new node uses the next empty position: scan for
    # the first enumeration index that differs from the stored position.
    next_empty = 0
    for n, (position, _) in enumerate(tree_1):
        if n == position:
            continue
        next_empty = n
        break
    # A zero here means no (non-zero) gap was found; fall back to the slot
    # right after the last index visited.
    if next_empty == 0:
        next_empty = n + 1

    extra_leaf = SigLeaf(to_search.name(), to_search)
    tree_1.add_node(extra_leaf)
    assert tree_1.next_node == next_empty
Ejemplo n.º 5
0
def add_singleton_sigs(sbt,
                       input_file,
                       ksize,
                       scaled,
                       alphabet,
                       ignore_abundance=False,
                       translate=False):
    """Add one leaf per signature or per sequence record to ``sbt``.

    If ``input_file`` ends with ``.sig`` its signatures are loaded with
    ``sourmash.signature.load_signatures`` (filtered by ``ksize`` and
    ``alphabet``).  Otherwise the file is read with
    ``try_reading_fasta_file`` and a fresh minhash/signature is built for
    every record.  Signatures whose minhash is falsy are not added.

    Returns the same ``sbt`` object that was passed in.
    """

    def insert_if_nonempty(candidate):
        # Leaves are keyed by the signature's md5sum.
        if candidate.minhash:
            sbt.add_node(SigLeaf(candidate.md5sum(), candidate))

    if input_file.endswith(".sig"):
        loaded = sourmash.signature.load_signatures(input_file,
                                                    ksize=ksize,
                                                    select_moltype=alphabet)
        for sig in loaded:
            insert_if_nonempty(sig)
        return sbt

    # Not a signature file: read sequence records instead.
    records = try_reading_fasta_file(input_file)
    if not records:
        return sbt

    # Nucleotide input, or any input when translating, is added via
    # add_sequence; everything else goes through add_protein.
    add_as_sequence = alphabet == "nucleotide" or translate

    for n, record in enumerate(records):
        # Name is the record name with anything after the last tab removed.
        signame = (record.name).rsplit("\t", 1)[0]
        if not n % 10000:
            sys.stderr.write(f"... building {n}th sig, {signame}\n")

        # Each record starts from a fresh minhash.
        mh = determine_appropriate_fresh_minhash(
            alphabet, ksize, scaled, ignore_abundance)
        if add_as_sequence:
            mh.add_sequence(record.sequence, force=True)
        else:
            mh.add_protein(record.sequence)

        # minhash --> signature
        insert_if_nonempty(sourmash.SourmashSignature(mh, name=signame))

    return sbt
Ejemplo n.º 6
0
def test_tree_repair_add_node():
    """Adding nodes to the loaded 'leaves.sbt.json' tree keeps it well formed.

    After every signature in ``utils.SIG_FILES`` has been added, each
    non-root position must have an internal ``Node`` as its parent and no
    ``Leaf`` may have a child holding a node.
    """
    tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'),
                           leaf_loader=SigLeaf.load)

    for sig_file in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(sig_file)))
        tree_repair.add_node(SigLeaf(os.path.basename(sig_file), sig))

    # Iterate over a snapshot of the node mapping.
    snapshot = list(tree_repair.nodes.items())
    for position, current in snapshot:
        # Every parent of a node must be an internal node (and not a leaf),
        # except for node 0 (the root), whose parent is None.
        is_root = position == 0
        if not is_root:
            parent = tree_repair.parent(position)
            assert isinstance(parent.node, Node)

        # Leaf nodes can't have children
        if not isinstance(current, Leaf):
            continue
        assert all(child.node is None
                   for child in tree_repair.children(position))
Ejemplo n.º 7
0
def test_search_minhashes():
    """Every hit returned by ``search_minhashes`` must meet the threshold.

    Builds a tree from ``utils.SIG_FILES``, searches for the first leaf the
    tree yields, and checks that each result's similarity to the query is at
    least the search threshold.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory)

    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree.add_node(leaf)

    to_search = next(iter(tree.leaves()))

    # One named threshold so the search and the check cannot drift apart.
    threshold = 0.08

    # this fails if 'search_minhashes' is calc containment and not similarity.
    results = tree.find(search_minhashes, to_search.data, threshold)
    for leaf in results:
        assert to_search.data.similarity(leaf.data) >= threshold

    print(results)
Ejemplo n.º 8
0
def test_save_sparseness(n_children):
    """Saving with ``sparseness=1.0`` must keep search results and structure.

    After a save/load round trip the test checks that the loaded tree's
    ``nodes`` mapping holds no ``Node`` instances, that searching for the
    last-added leaf gives the same results as before saving, and that the
    parent/child invariants hold for every loaded position.

    ``n_children`` is passed to ``SBT`` as ``d``.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for sig_file in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(sig_file)))
        # The last leaf added is kept as the search query.
        to_search = SigLeaf(os.path.basename(sig_file), sig)
        tree.add_node(to_search)

    def search_and_report(sbt):
        # Print a banner and the query metadata, run the search, print hits.
        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits = {
            str(s)
            for s in sbt.find(search_minhashes, to_search.data, 0.1)
        }
        print(*hits, sep='\n')
        return hits

    old_result = search_and_report(tree)

    with utils.TempDirectory() as location:
        tree_path = os.path.join(location, 'demo')
        tree.save(tree_path, sparseness=1.0)
        tree_loaded = SBT.load(tree_path, leaf_loader=SigLeaf.load)
        assert all(not isinstance(loaded, Node)
                   for loaded in tree_loaded.nodes.values())

        new_result = search_and_report(tree_loaded)

        assert old_result == new_result

        # Iterate over a snapshot of the node mapping.
        snapshot = list(tree_loaded.nodes.items())
        for position, current in snapshot:
            # Every parent of a node must be an internal node (and not a
            # leaf), except for node 0 (the root), whose parent is None.
            is_root = position == 0
            if not is_root:
                parent = tree_loaded.parent(position)
                assert isinstance(parent.node, Node)

            # Leaf nodes can't have children
            if not isinstance(current, Leaf):
                continue
            assert all(child.node is None
                       for child in tree_loaded.children(position))
Ejemplo n.º 9
0
def test_sbt_ipfsstorage():
    """Round-trip a tree through ``IPFSStorage`` and compare search results.

    Skipped when ``ipfshttpclient`` cannot be imported; marked xfail when
    saving raises ``ipfshttpclient.exceptions.ConnectionError``.
    """
    ipfshttpclient = pytest.importorskip('ipfshttpclient')

    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for sig_file in utils.SIG_FILES:
            sig = load_one_signature(utils.get_test_data(sig_file))
            # The last leaf added is kept as the search query.
            to_search = SigLeaf(os.path.basename(sig_file), sig)
            tree.add_node(to_search)

        def search_and_report(sbt):
            # Print a banner and the query metadata, search, print hits.
            print('*' * 60)
            print("{}:".format(to_search.metadata))
            hits = {
                str(s)
                for s in sbt.find(search_minhashes, to_search.data, 0.1)
            }
            print(*hits, sep='\n')
            return hits

        old_result = search_and_report(tree)

        tree_path = os.path.join(location, 'tree')
        try:
            with IPFSStorage() as storage:
                tree.save(tree_path, storage=storage)
        except ipfshttpclient.exceptions.ConnectionError:
            pytest.xfail("ipfs not installed/functioning probably")

        with IPFSStorage() as storage:
            reloaded = SBT.load(tree_path,
                                leaf_loader=SigLeaf.load,
                                storage=storage)

            new_result = search_and_report(reloaded)

            assert old_result == new_result
Ejemplo n.º 10
0
def test_sbt_redisstorage():
    """Round-trip a tree through ``RedisStorage`` and compare search results.

    Skipped when ``redis`` cannot be imported; marked xfail when saving
    raises ``redis.exceptions.ConnectionError``.
    """
    redis = pytest.importorskip('redis')
    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for sig_file in utils.SIG_FILES:
            loaded = signature.load_signatures(utils.get_test_data(sig_file))
            sig = next(loaded)
            # The last leaf added is kept as the search query.
            to_search = SigLeaf(os.path.basename(sig_file), sig)
            tree.add_node(to_search)

        def search_and_report(sbt):
            # Print a banner and the query metadata, search, print hits.
            print('*' * 60)
            print("{}:".format(to_search.metadata))
            hits = {
                str(s)
                for s in sbt.find(search_minhashes, to_search.data, 0.1)
            }
            print(*hits, sep='\n')
            return hits

        old_result = search_and_report(tree)

        tree_path = os.path.join(location, 'tree')
        try:
            with RedisStorage() as storage:
                tree.save(tree_path, storage=storage)
        except redis.exceptions.ConnectionError:
            pytest.xfail("Couldn't connect to redis server")

        with RedisStorage() as storage:
            reloaded = SBT.load(tree_path,
                                leaf_loader=SigLeaf.load,
                                storage=storage)

            new_result = search_and_report(reloaded)

            assert old_result == new_result
Ejemplo n.º 11
0
def test_sbt_zipstorage(tmpdir):
    """Create a tree, save it to a zip, then load it and search again.

    ``tmpdir`` supplies the location for both the ``tree.sbt.zip`` archive
    and the ``tree`` path given to ``save``/``load``.
    """
    factory = GraphFactory(31, 1e5, 4)

    tree = SBT(factory)

    for sig_file in utils.SIG_FILES:
        sig = next(load_signatures(utils.get_test_data(sig_file)))
        # The last leaf added is kept as the search query.
        to_search = SigLeaf(os.path.basename(sig_file), sig)
        tree.add_node(to_search)

    def search_and_report(sbt):
        # Print a banner and the query metadata, run the search, print hits.
        print('*' * 60)
        print("{}:".format(to_search.metadata))
        hits = {
            str(s)
            for s in sbt.find(search_minhashes, to_search.data, 0.1)
        }
        print(*hits, sep='\n')
        return hits

    old_result = search_and_report(tree)

    zip_path = str(tmpdir.join("tree.sbt.zip"))
    tree_path = str(tmpdir.join("tree"))

    with ZipStorage(zip_path) as storage:
        tree.save(tree_path, storage=storage)

    with ZipStorage(zip_path) as storage:
        reloaded = SBT.load(tree_path,
                            leaf_loader=SigLeaf.load,
                            storage=storage)

        new_result = search_and_report(reloaded)

        assert old_result == new_result
Ejemplo n.º 12
0
def test_sbt_tarstorage():
    """Round-trip a tree through ``TarStorage`` and compare search results.

    The tree is saved into ``tree.tar.gz`` inside a temporary directory,
    loaded back from the same archive, and searched with the same query.
    """
    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for sig_file in utils.SIG_FILES:
            sig = load_one_signature(utils.get_test_data(sig_file))
            # The last leaf added is kept as the search query.
            to_search = SigLeaf(os.path.basename(sig_file), sig)
            tree.add_node(to_search)

        def search_and_report(sbt):
            # Print a banner and the query metadata, search, print hits.
            print('*' * 60)
            print("{}:".format(to_search.metadata))
            hits = {
                str(s)
                for s in sbt.find(search_minhashes, to_search.data, 0.1)
            }
            print(*hits, sep='\n')
            return hits

        old_result = search_and_report(tree)

        archive_path = os.path.join(location, 'tree.tar.gz')
        tree_path = os.path.join(location, 'tree')

        with TarStorage(archive_path) as storage:
            tree.save(tree_path, storage=storage)

        with TarStorage(archive_path) as storage:
            reloaded = SBT.load(tree_path,
                                leaf_loader=SigLeaf.load,
                                storage=storage)

            new_result = search_and_report(reloaded)

            assert old_result == new_result