Esempio n. 1
0
def test_sbt_combine(n_children):
    """Combine two partial trees and compare them with a fully built tree.

    Every signature in ``utils.SIG_FILES`` is added to a reference tree; the
    first four leaves also go into one partial tree and the remaining ones
    into a second partial tree.  After ``combine`` the first partial tree
    must expose the same leaves and the same search results as the
    reference tree.
    """
    factory = GraphFactory(31, 1e5, 4)
    full_tree = SBT(factory, d=n_children)
    first_part = SBT(factory, d=n_children)
    second_part = SBT(factory, d=n_children)

    n_leaves = 0
    for n_leaves, sig_file in enumerate(utils.SIG_FILES, start=1):
        sig_path = utils.get_test_data(sig_file)
        sig = next(signature.load_signatures(sig_path))
        leaf = SigLeaf(os.path.basename(sig_file), sig)
        full_tree.add_node(leaf)
        # leaves 1-4 land in the first partial tree, later ones in the second
        destination = first_part if n_leaves <= 4 else second_part
        destination.add_node(leaf)

    first_part.combine(second_part)

    combined_leaves = set(map(str, first_part.leaves()))
    full_leaves = set(map(str, full_tree.leaves()))

    assert len(combined_leaves) == n_leaves
    assert len(full_leaves) == len(combined_leaves)
    assert combined_leaves == full_leaves

    query_path = utils.get_test_data(utils.SIG_FILES[0])
    query = next(signature.load_signatures(query_path))
    combined_hits = set(map(str, first_part.find(search_minhashes,
                                                 query, 0.1)))
    full_hits = set(map(str, full_tree.find(search_minhashes,
                                            query, 0.1)))
    assert combined_hits == full_hits

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position:
    # the first position holding None, otherwise one past the last
    # position visited while scanning the nodes.
    next_empty = 0
    for position, node in first_part.nodes.items():
        if node is None:
            next_empty = position
            break
    next_empty = next_empty or position + 1

    # `leaf` is still the last leaf created in the loop above
    first_part.add_node(leaf)
    assert first_part.max_node == next_empty
Esempio n. 2
0
def test_search_minhashes():
    """Check that every match returned by ``tree.find`` meets the threshold.

    Builds a tree from all files in ``utils.SIG_FILES``, queries it with the
    data of the first leaf yielded by ``tree.leaves()``, and asserts that
    each returned leaf has a similarity to the query of at least the
    threshold passed to ``find``.
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory)

    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        tree.add_node(SigLeaf(os.path.basename(f), sig))

    to_search = next(iter(tree.leaves()))
    # single source of truth for the search call and the assertion below
    threshold = 0.08

    # this fails if 'search_minhashes' computes containment and not
    # similarity.
    results = tree.find(search_minhashes, to_search.data, threshold)
    for match in results:
        assert to_search.data.similarity(match.data) >= threshold

    print(results)