def test_tree_save_load(n_children):
    """Check that an SBT gives the same search results after save + load."""
    bloom_factory = GraphFactory(31, 1e5, 4)
    tree = SBT(bloom_factory, d=n_children)

    for sig_file in utils.SIG_FILES:
        sig_path = utils.get_test_data(sig_file)
        loaded_sig = next(signature.load_signatures(sig_path))
        leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)
        tree.add_node(leaf)
        # The query ends up being the last leaf that was added.
        to_search = leaf

    banner = '*' * 60

    print(banner)
    print("{}:".format(to_search.metadata))
    matches_before = tree.find(search_minhashes, to_search.data, 0.1)
    old_result = {str(match) for match in matches_before}
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        index_path = os.path.join(location, 'demo')
        tree.save(index_path)
        tree = SBT.load(index_path, leaf_loader=SigLeaf.load)

        print(banner)
        print("{}:".format(to_search.metadata))
        matches_after = tree.find(search_minhashes, to_search.data, 0.1)
        new_result = {str(match) for match in matches_after}
        print(*new_result, sep='\n')

        assert old_result == new_result
def test_sbt_tarstorage():
    """Save an SBT through TarStorage, reload it, and compare search results."""
    bloom_factory = GraphFactory(31, 1e5, 4)
    banner = '*' * 60

    with utils.TempDirectory() as location:
        tree = SBT(bloom_factory)

        for sig_file in utils.SIG_FILES:
            sig_path = utils.get_test_data(sig_file)
            loaded_sig = next(signature.load_signatures(sig_path))
            leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)
            tree.add_node(leaf)
            # The query ends up being the last leaf that was added.
            to_search = leaf

        print(banner)
        print("{}:".format(to_search.metadata))
        matches_before = tree.find(search_minhashes, to_search.data, 0.1)
        old_result = {str(match) for match in matches_before}
        print(*old_result, sep='\n')

        archive_path = os.path.join(location, 'tree.tar.gz')
        tree_path = os.path.join(location, 'tree')

        with TarStorage(archive_path) as storage:
            tree.save(tree_path, storage=storage)

        # NOTE(review): the search stays inside the storage context, as in the
        # original; presumably node data is read from storage on demand.
        with TarStorage(archive_path) as storage:
            tree = SBT.load(tree_path,
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print(banner)
            print("{}:".format(to_search.metadata))
            matches_after = tree.find(search_minhashes, to_search.data, 0.1)
            new_result = {str(match) for match in matches_after}
            print(*new_result, sep='\n')

            assert old_result == new_result
def test_search_minhashes():
    """Check that every match from ``search_minhashes`` meets the threshold.

    Builds an SBT from the test signatures, queries it with the first leaf
    yielded by ``tree.leaves()``, and asserts that each returned leaf has a
    similarity of at least 0.08 to the query.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory)

    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree.add_node(leaf)

    to_search = next(iter(tree.leaves()))

    # this fails if 'search_minhashes' is calc containment and not similarity.
    results = tree.find(search_minhashes, to_search.data, 0.08)
    for match in results:
        assert to_search.data.similarity(match.data) >= 0.08

    print(results)
def test_save_sparseness(n_children):
    """Save an SBT with ``sparseness=1.0`` and check the reloaded tree.

    The reloaded tree must contain no ``Node`` instances right after loading,
    must return the same search results as the original tree, and must keep
    a consistent parent/child structure.
    """
    bloom_factory = GraphFactory(31, 1e5, 4)
    tree = SBT(bloom_factory, d=n_children)

    for sig_file in utils.SIG_FILES:
        sig_path = utils.get_test_data(sig_file)
        loaded_sig = next(signature.load_signatures(sig_path))
        leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)
        tree.add_node(leaf)
        # The query ends up being the last leaf that was added.
        to_search = leaf

    banner = '*' * 60

    print(banner)
    print("{}:".format(to_search.metadata))
    matches_before = tree.find(search_minhashes, to_search.data, 0.1)
    old_result = {str(match) for match in matches_before}
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        index_path = os.path.join(location, 'demo')
        tree.save(index_path, sparseness=1.0)
        tree_loaded = SBT.load(index_path, leaf_loader=SigLeaf.load)
        assert not any(isinstance(loaded, Node)
                       for loaded in tree_loaded.nodes.values())

        print(banner)
        print("{}:".format(to_search.metadata))
        matches_after = tree_loaded.find(search_minhashes,
                                         to_search.data, 0.1)
        new_result = {str(match) for match in matches_after}
        print(*new_result, sep='\n')

        assert old_result == new_result

        # Iterate over a snapshot of the items.
        # NOTE(review): parent()/children() may modify tree_loaded.nodes
        # while we walk it - confirm.
        for pos, node in list(tree_loaded.nodes.items()):
            # Every parent of a node must be an internal node (and not a
            # leaf), except for node 0 (the root), whose parent is None.
            if pos != 0:
                parent_entry = tree_loaded.parent(pos)
                assert isinstance(parent_entry.node, Node)

            # Leaf nodes can't have children
            if isinstance(node, Leaf):
                assert all(child.node is None
                           for child in tree_loaded.children(pos))
def test_sbt_ipfsstorage():
    """Save an SBT through IPFSStorage, reload it, and compare search results.

    Skipped when ``ipfshttpclient`` cannot be imported; marked as an expected
    failure when saving raises a connection error.
    """
    ipfshttpclient = pytest.importorskip('ipfshttpclient')

    bloom_factory = GraphFactory(31, 1e5, 4)
    banner = '*' * 60

    with utils.TempDirectory() as location:
        tree = SBT(bloom_factory)

        for sig_file in utils.SIG_FILES:
            loaded_sig = load_one_signature(utils.get_test_data(sig_file))
            leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)
            tree.add_node(leaf)
            # The query ends up being the last leaf that was added.
            to_search = leaf

        print(banner)
        print("{}:".format(to_search.metadata))
        matches_before = tree.find(search_minhashes, to_search.data, 0.1)
        old_result = {str(match) for match in matches_before}
        print(*old_result, sep='\n')

        tree_path = os.path.join(location, 'tree')

        try:
            with IPFSStorage() as storage:
                tree.save(tree_path, storage=storage)
        except ipfshttpclient.exceptions.ConnectionError:
            pytest.xfail("ipfs not installed/functioning probably")

        # NOTE(review): the search stays inside the storage context, as in the
        # original; presumably node data is read from storage on demand.
        with IPFSStorage() as storage:
            tree = SBT.load(tree_path,
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print(banner)
            print("{}:".format(to_search.metadata))
            matches_after = tree.find(search_minhashes, to_search.data, 0.1)
            new_result = {str(match) for match in matches_after}
            print(*new_result, sep='\n')

            assert old_result == new_result
def test_sbt_redisstorage():
    """Save an SBT through RedisStorage, reload it, and compare search results.

    Skipped when ``redis`` cannot be imported; marked as an expected failure
    when saving raises a connection error.
    """
    redis = pytest.importorskip('redis')

    bloom_factory = GraphFactory(31, 1e5, 4)
    banner = '*' * 60

    with utils.TempDirectory() as location:
        tree = SBT(bloom_factory)

        for sig_file in utils.SIG_FILES:
            sig_path = utils.get_test_data(sig_file)
            loaded_sig = next(signature.load_signatures(sig_path))
            leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)
            tree.add_node(leaf)
            # The query ends up being the last leaf that was added.
            to_search = leaf

        print(banner)
        print("{}:".format(to_search.metadata))
        matches_before = tree.find(search_minhashes, to_search.data, 0.1)
        old_result = {str(match) for match in matches_before}
        print(*old_result, sep='\n')

        tree_path = os.path.join(location, 'tree')

        try:
            with RedisStorage() as storage:
                tree.save(tree_path, storage=storage)
        except redis.exceptions.ConnectionError:
            pytest.xfail("Couldn't connect to redis server")

        # NOTE(review): the search stays inside the storage context, as in the
        # original; presumably node data is read from storage on demand.
        with RedisStorage() as storage:
            tree = SBT.load(tree_path,
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print(banner)
            print("{}:".format(to_search.metadata))
            matches_after = tree.find(search_minhashes, to_search.data, 0.1)
            new_result = {str(match) for match in matches_after}
            print(*new_result, sep='\n')

            assert old_result == new_result
def test_save_sparseness(n_children):
    """Round-trip an SBT saved with ``sparseness=1.0`` and validate it.

    Asserts that no ``Node`` instance is present right after loading, that
    searching the reloaded tree matches searching the original, and that
    every non-root position has a ``Node`` parent while leaves have no
    children.
    """
    bloom_factory = GraphFactory(31, 1e5, 4)
    tree = SBT(bloom_factory, d=n_children)

    for sig_file in utils.SIG_FILES:
        sig_path = utils.get_test_data(sig_file)
        loaded_sig = next(signature.load_signatures(sig_path))
        leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)
        tree.add_node(leaf)
        # The query ends up being the last leaf that was added.
        to_search = leaf

    banner = '*' * 60

    print(banner)
    print("{}:".format(to_search.metadata))
    matches_before = tree.find(search_minhashes, to_search.data, 0.1)
    old_result = {str(match) for match in matches_before}
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        index_path = os.path.join(location, 'demo')
        tree.save(index_path, sparseness=1.0)
        tree_loaded = SBT.load(index_path, leaf_loader=SigLeaf.load)
        assert not any(isinstance(loaded, Node)
                       for loaded in tree_loaded.nodes.values())

        print(banner)
        print("{}:".format(to_search.metadata))
        matches_after = tree_loaded.find(search_minhashes,
                                         to_search.data, 0.1)
        new_result = {str(match) for match in matches_after}
        print(*new_result, sep='\n')

        assert old_result == new_result

        # Iterate over a snapshot of the items.
        # NOTE(review): parent()/children() may modify tree_loaded.nodes
        # while we walk it - confirm.
        for pos, node in list(tree_loaded.nodes.items()):
            # Every parent of a node must be an internal node (and not a
            # leaf), except for node 0 (the root), whose parent is None.
            if pos != 0:
                parent_entry = tree_loaded.parent(pos)
                assert isinstance(parent_entry.node, Node)

            # Leaf nodes can't have children
            if isinstance(node, Leaf):
                assert all(child.node is None
                           for child in tree_loaded.children(pos))
def test_simple(n_children):
    """Check SBT k-mer lookups against a brute-force scan of the leaves."""
    factory = GraphFactory(5, 100, 3)
    root = SBT(factory, d=n_children)

    # (leaf name, k-mers counted into that leaf's data)
    leaf_contents = (
        ("a", ('AAAAA', 'AAAAT', 'AAAAC')),
        ("b", ('AAAAA', 'AAAAT', 'AAAAG')),
        ("c", ('AAAAA', 'AAAAT', 'CAAAA')),
        ("d", ('AAAAA', 'CAAAA', 'GAAAA')),
        ("e", ('AAAAA', 'AAAAT', 'GAAAA')),
    )

    leaves = []
    for name, leaf_kmers in leaf_contents:
        new_leaf = Leaf(name, factory())
        for leaf_kmer in leaf_kmers:
            new_leaf.data.count(leaf_kmer)
        leaves.append(new_leaf)

    # All leaves are built first, then inserted in order.
    for new_leaf in leaves:
        root.add_node(new_leaf)

    def search_kmer(obj, seq):
        return obj.data.get(seq)

    kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"]

    def search_kmer_in_list(kmer):
        # Reference answer: every leaf whose data reports the k-mer.
        return {candidate for candidate in leaves if candidate.data.get(kmer)}

    for kmer in kmers:
        assert set(root.find(search_kmer, kmer)) == search_kmer_in_list(kmer)

    print('-----')
    for kmer in kmers:
        print([found.metadata for found in root.find(search_kmer, kmer)])
def test_simple(n_children):
    """Compare SBT k-mer searches with a linear scan over the same leaves."""
    factory = GraphFactory(5, 100, 3)
    root = SBT(factory, d=n_children)

    # (leaf name, k-mers counted into that leaf's data)
    leaf_contents = (
        ("a", ('AAAAA', 'AAAAT', 'AAAAC')),
        ("b", ('AAAAA', 'AAAAT', 'AAAAG')),
        ("c", ('AAAAA', 'AAAAT', 'CAAAA')),
        ("d", ('AAAAA', 'CAAAA', 'GAAAA')),
        ("e", ('AAAAA', 'AAAAT', 'GAAAA')),
    )

    leaves = []
    for name, leaf_kmers in leaf_contents:
        new_leaf = Leaf(name, factory())
        for leaf_kmer in leaf_kmers:
            new_leaf.data.count(leaf_kmer)
        leaves.append(new_leaf)

    # All leaves are built first, then inserted in order.
    for new_leaf in leaves:
        root.add_node(new_leaf)

    def search_kmer(obj, seq):
        return obj.data.get(seq)

    kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"]

    def search_kmer_in_list(kmer):
        # Reference answer: every leaf whose data reports the k-mer.
        return {candidate for candidate in leaves if candidate.data.get(kmer)}

    for kmer in kmers:
        assert set(root.find(search_kmer, kmer)) == search_kmer_in_list(kmer)

    print('-----')
    for kmer in kmers:
        print([found.metadata for found in root.find(search_kmer, kmer)])
def test_sbt_zipstorage(tmpdir):
    """Save an SBT through ZipStorage, reload it, and compare search results."""
    # create tree, save to a zip, then load and search.
    bloom_factory = GraphFactory(31, 1e5, 4)
    tree = SBT(bloom_factory)

    for sig_file in utils.SIG_FILES:
        loaded_sig = next(load_signatures(utils.get_test_data(sig_file)))
        leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)
        tree.add_node(leaf)
        # The query ends up being the last leaf that was added.
        to_search = leaf

    banner = '*' * 60

    print(banner)
    print("{}:".format(to_search.metadata))
    matches_before = tree.find(search_minhashes, to_search.data, 0.1)
    old_result = {str(match) for match in matches_before}
    print(*old_result, sep='\n')

    zip_path = str(tmpdir.join("tree.sbt.zip"))
    tree_path = str(tmpdir.join("tree"))

    with ZipStorage(zip_path) as storage:
        tree.save(tree_path, storage=storage)

    # NOTE(review): the search stays inside the storage context, as in the
    # original; presumably node data is read from storage on demand.
    with ZipStorage(zip_path) as storage:
        tree = SBT.load(tree_path,
                        leaf_loader=SigLeaf.load,
                        storage=storage)

        print(banner)
        print("{}:".format(to_search.metadata))
        matches_after = tree.find(search_minhashes, to_search.data, 0.1)
        new_result = {str(match) for match in matches_after}
        print(*new_result, sep='\n')

        assert old_result == new_result
def test_sbt_ipfsstorage():
    """Save an SBT through IPFSStorage, reload it, and compare search results.

    Skipped when ``ipfsapi`` cannot be imported; marked as an expected
    failure when saving raises a connection error.
    """
    ipfsapi = pytest.importorskip('ipfsapi')

    bloom_factory = GraphFactory(31, 1e5, 4)
    banner = '*' * 60

    with utils.TempDirectory() as location:
        tree = SBT(bloom_factory)

        for sig_file in utils.SIG_FILES:
            sig_path = utils.get_test_data(sig_file)
            loaded_sig = next(signature.load_signatures(sig_path))
            leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)
            tree.add_node(leaf)
            # The query ends up being the last leaf that was added.
            to_search = leaf

        print(banner)
        print("{}:".format(to_search.metadata))
        matches_before = tree.find(search_minhashes, to_search.data, 0.1)
        old_result = {str(match) for match in matches_before}
        print(*old_result, sep='\n')

        tree_path = os.path.join(location, 'tree')

        try:
            with IPFSStorage() as storage:
                tree.save(tree_path, storage=storage)
        except ipfsapi.exceptions.ConnectionError:
            pytest.xfail("ipfs not installed/functioning probably")

        # NOTE(review): the search stays inside the storage context, as in the
        # original; presumably node data is read from storage on demand.
        with IPFSStorage() as storage:
            tree = SBT.load(tree_path,
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print(banner)
            print("{}:".format(to_search.metadata))
            matches_after = tree.find(search_minhashes, to_search.data, 0.1)
            new_result = {str(match) for match in matches_after}
            print(*new_result, sep='\n')

            assert old_result == new_result
def test_sbt_combine(n_children):
    """Check that combining two partial SBTs matches one SBT with all leaves.

    The first four signatures go into ``tree_1`` and the rest into
    ``tree_2``; every signature also goes into the reference ``tree``.
    After ``tree_1.combine(tree_2)`` the leaves and search results must
    match the reference tree.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    n_leaves = 0
    for index, sig_file in enumerate(utils.SIG_FILES):
        sig_path = utils.get_test_data(sig_file)
        loaded_sig = next(signature.load_signatures(sig_path))
        leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)

        tree.add_node(leaf)
        partial_tree = tree_1 if index < 4 else tree_2
        partial_tree.add_node(leaf)
        n_leaves = index + 1

    tree_1.combine(tree_2)

    t1_leaves = {str(entry) for entry in tree_1.leaves()}
    t_leaves = {str(entry) for entry in tree.leaves()}

    assert len(t1_leaves) == n_leaves
    assert len(t_leaves) == len(t1_leaves)
    assert t1_leaves == t_leaves

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))

    combined_matches = tree_1.find(search_minhashes, to_search, 0.1)
    t1_result = {str(match) for match in combined_matches}
    reference_matches = tree.find(search_minhashes, to_search, 0.1)
    tree_result = {str(match) for match in reference_matches}
    assert t1_result == tree_result

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position
    next_empty = 0
    for slot, position in enumerate(tree_1.nodes):
        if slot != position:
            next_empty = slot
            break
    # A value of 0 is treated as "no gap found": fall back to the slot right
    # after the last one enumerated.
    if next_empty == 0:
        next_empty = slot + 1

    tree_1.add_node(leaf)
    assert tree_1.next_node == next_empty
def test_sbt_tarstorage():
    """Save an SBT through TarStorage, reload it, and compare search results."""
    bloom_factory = GraphFactory(31, 1e5, 4)
    banner = '*' * 60

    with utils.TempDirectory() as location:
        tree = SBT(bloom_factory)

        for sig_file in utils.SIG_FILES:
            loaded_sig = load_one_signature(utils.get_test_data(sig_file))
            leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)
            tree.add_node(leaf)
            # The query ends up being the last leaf that was added.
            to_search = leaf

        print(banner)
        print("{}:".format(to_search.metadata))
        matches_before = tree.find(search_minhashes, to_search.data, 0.1)
        old_result = {str(match) for match in matches_before}
        print(*old_result, sep='\n')

        archive_path = os.path.join(location, 'tree.tar.gz')
        tree_path = os.path.join(location, 'tree')

        with TarStorage(archive_path) as storage:
            tree.save(tree_path, storage=storage)

        # NOTE(review): the search stays inside the storage context, as in the
        # original; presumably node data is read from storage on demand.
        with TarStorage(archive_path) as storage:
            tree = SBT.load(tree_path,
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print(banner)
            print("{}:".format(to_search.metadata))
            matches_after = tree.find(search_minhashes, to_search.data, 0.1)
            new_result = {str(match) for match in matches_after}
            print(*new_result, sep='\n')

            assert old_result == new_result
def test_simple_index(n_children):
    """Check that SBT k-mer searches agree with a LinearIndex of the leaves."""
    factory = GraphFactory(5, 100, 3)
    root = SBT(factory, d=n_children)

    # (leaf name, k-mers counted into that leaf's data)
    leaf_contents = (
        ("a", ("AAAAA", "AAAAT", "AAAAC")),
        ("b", ("AAAAA", "AAAAT", "AAAAG")),
        ("c", ("AAAAA", "AAAAT", "CAAAA")),
        ("d", ("AAAAA", "CAAAA", "GAAAA")),
        ("e", ("AAAAA", "AAAAT", "GAAAA")),
    )

    leaves = []
    for name, leaf_kmers in leaf_contents:
        new_leaf = Leaf(name, factory())
        for leaf_kmer in leaf_kmers:
            new_leaf.data.count(leaf_kmer)
        leaves.append(new_leaf)

    # All leaves are built first, then inserted into the tree in order.
    for new_leaf in leaves:
        root.add_node(new_leaf)

    def search_kmer(obj, seq):
        return obj.data.get(seq)

    kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"]

    # Reference index holding the same leaves, in the same order.
    linear = LinearIndex()
    for new_leaf in leaves:
        linear.insert(new_leaf)

    for kmer in kmers:
        from_tree = set(root.find(search_kmer, kmer))
        from_linear = set(linear.find(search_kmer, kmer))
        assert from_tree == from_linear

    print("-----")
    for kmer in kmers:
        print([found.metadata for found in root.find(search_kmer, kmer)])
def test_sbt_combine(n_children):
    """Check that combining two partial SBTs matches one SBT with all leaves.

    The first four signatures go into ``tree_1`` and the rest into
    ``tree_2``; every signature also goes into the reference ``tree``.
    After ``tree_1.combine(tree_2)`` the leaves and search results must
    match the reference tree.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    n_leaves = 0
    for index, sig_file in enumerate(utils.SIG_FILES):
        sig_path = utils.get_test_data(sig_file)
        loaded_sig = next(signature.load_signatures(sig_path))
        leaf = SigLeaf(os.path.basename(sig_file), loaded_sig)

        tree.add_node(leaf)
        partial_tree = tree_1 if index < 4 else tree_2
        partial_tree.add_node(leaf)
        n_leaves = index + 1

    tree_1.combine(tree_2)

    t1_leaves = {str(entry) for entry in tree_1.leaves()}
    t_leaves = {str(entry) for entry in tree.leaves()}

    assert len(t1_leaves) == n_leaves
    assert len(t_leaves) == len(t1_leaves)
    assert t1_leaves == t_leaves

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(query_path))

    combined_matches = tree_1.find(search_minhashes, to_search, 0.1)
    t1_result = {str(match) for match in combined_matches}
    reference_matches = tree.find(search_minhashes, to_search, 0.1)
    tree_result = {str(match) for match in reference_matches}
    assert t1_result == tree_result

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position
    next_empty = 0
    for slot, position in enumerate(tree_1.nodes):
        if slot != position:
            next_empty = slot
            break
    # A value of 0 is treated as "no gap found": fall back to the slot right
    # after the last one enumerated.
    if next_empty == 0:
        next_empty = slot + 1

    tree_1.add_node(leaf)
    assert tree_1.next_node == next_empty
def test_longer_search(n_children):
    """Search an SBT with sequences scored by their fraction of known k-mers."""
    ksize = 5
    factory = GraphFactory(ksize, 100, 3)
    root = SBT(factory, d=n_children)

    # (leaf name, k-mers counted into that leaf's data)
    leaf_contents = (
        ("a", ('AAAAA', 'AAAAT', 'AAAAC')),
        ("b", ('AAAAA', 'AAAAT', 'AAAAG')),
        ("c", ('AAAAA', 'AAAAT', 'CAAAA')),
        ("d", ('AAAAA', 'CAAAA', 'GAAAA')),
        ("e", ('AAAAA', 'AAAAT', 'GAAAA')),
    )

    leaves = []
    for name, leaf_kmers in leaf_contents:
        new_leaf = Leaf(name, factory())
        for leaf_kmer in leaf_kmers:
            new_leaf.data.count(leaf_kmer)
        leaves.append(new_leaf)

    # All leaves are built first, then inserted in order.
    for new_leaf in leaves:
        root.add_node(new_leaf)

    def kmers(k, seq):
        # Yield every window of length k across seq.
        last_start = len(seq) - k
        for start in range(last_start + 1):
            yield seq[start:start + k]

    def search_transcript(node, seq, threshold):
        # Match when the number of k-mers present reaches the truncated
        # threshold fraction of all k-mers in seq.
        hits = sum(node.data.get(kmer) for kmer in kmers(ksize, seq))
        required = int(threshold * (len(seq) - ksize + 1))
        return 1 if hits >= required else 0

    try1 = [found.metadata
            for found in root.find(search_transcript, "AAAAT", 1.0)]
    assert set(try1) == {'a', 'b', 'c', 'e'}, try1  # no 'd'

    try2 = [found.metadata
            for found in root.find(search_transcript, "GAAAAAT", 0.6)]
    assert set(try2) == {'a', 'b', 'c', 'd', 'e'}

    try3 = [found.metadata
            for found in root.find(search_transcript, "GAAAA", 1.0)]
    assert set(try3) == {'d', 'e'}, try3
def test_longer_search(n_children):
    """Query an SBT with longer sequences using a k-mer presence threshold."""
    ksize = 5
    factory = GraphFactory(ksize, 100, 3)
    root = SBT(factory, d=n_children)

    # (leaf name, k-mers counted into that leaf's data)
    leaf_contents = (
        ("a", ('AAAAA', 'AAAAT', 'AAAAC')),
        ("b", ('AAAAA', 'AAAAT', 'AAAAG')),
        ("c", ('AAAAA', 'AAAAT', 'CAAAA')),
        ("d", ('AAAAA', 'CAAAA', 'GAAAA')),
        ("e", ('AAAAA', 'AAAAT', 'GAAAA')),
    )

    leaves = []
    for name, leaf_kmers in leaf_contents:
        new_leaf = Leaf(name, factory())
        for leaf_kmer in leaf_kmers:
            new_leaf.data.count(leaf_kmer)
        leaves.append(new_leaf)

    # All leaves are built first, then inserted in order.
    for new_leaf in leaves:
        root.add_node(new_leaf)

    def kmers(k, seq):
        # Yield every window of length k across seq.
        last_start = len(seq) - k
        for start in range(last_start + 1):
            yield seq[start:start + k]

    def search_transcript(node, seq, threshold):
        # Match when the number of k-mers present reaches the truncated
        # threshold fraction of all k-mers in seq.
        hits = sum(node.data.get(kmer) for kmer in kmers(ksize, seq))
        required = int(threshold * (len(seq) - ksize + 1))
        return 1 if hits >= required else 0

    try1 = [found.metadata
            for found in root.find(search_transcript, "AAAAT", 1.0)]
    assert set(try1) == {'a', 'b', 'c', 'e'}, try1  # no 'd'

    try2 = [found.metadata
            for found in root.find(search_transcript, "GAAAAAT", 0.6)]
    assert set(try2) == {'a', 'b', 'c', 'd', 'e'}

    try3 = [found.metadata
            for found in root.find(search_transcript, "GAAAA", 1.0)]
    assert set(try3) == {'d', 'e'}, try3
def test_simple(n_children):
    """Check SBT k-mer lookups before and after a save/load round trip.

    Results from the tree are compared with a brute-force scan of the
    original leaves, first on the in-memory tree and then on the reloaded
    tree (compared through ``str``).
    """
    factory = GraphFactory(5, 100, 3)
    root = SBT(factory, d=n_children)

    # (leaf name, k-mers counted into that leaf's data)
    leaf_contents = (
        ("a", ('AAAAA', 'AAAAT', 'AAAAC')),
        ("b", ('AAAAA', 'AAAAT', 'AAAAG')),
        ("c", ('AAAAA', 'AAAAT', 'CAAAA')),
        ("d", ('AAAAA', 'CAAAA', 'GAAAA')),
        ("e", ('AAAAA', 'AAAAT', 'GAAAA')),
    )

    leaves = []
    for name, leaf_kmers in leaf_contents:
        new_leaf = Leaf(name, factory())
        for leaf_kmer in leaf_kmers:
            new_leaf.data.count(leaf_kmer)
        leaves.append(new_leaf)

    # All leaves are built first, then inserted in order.
    for new_leaf in leaves:
        root.add_node(new_leaf)

    def search_kmer(obj, seq):
        return obj.data.get(seq)

    kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"]

    def search_kmer_in_list(kmer):
        # Reference answer: every leaf whose data reports the k-mer.
        return {candidate for candidate in leaves if candidate.data.get(kmer)}

    for kmer in kmers:
        assert set(root.find(search_kmer, kmer)) == search_kmer_in_list(kmer)

    print('-----')
    for kmer in kmers:
        print([found.metadata for found in root.find(search_kmer, kmer)])

    with utils.TempDirectory() as location:
        index_path = os.path.join(location, 'demo')
        root.save(index_path)
        root = SBT.load(index_path)

        for kmer in kmers:
            new_result = {str(found) for found in root.find(search_kmer, kmer)}
            print(*new_result, sep='\n')

            expected = {str(found) for found in search_kmer_in_list(kmer)}
            assert new_result == expected