Exemple #1
0
    def testSelfSimjoin(self):
        """Check the number of self-join results for word-set and 2-gram keys."""

        def count_matches(keyed_docs):
            # Build one joiner, run the self-join twice at threshold 0.4
            # (without and with the extra ``True`` argument), and report
            # both result sizes.
            # NOTE(review): ``True`` presumably selects a weighted variant
            # of the join -- confirm against the simjoin API.
            joiner = simjoin.SimJoin(keyed_docs)
            plain_hits = joiner.selfjoin(0.4)
            flag_on_hits = joiner.selfjoin(0.4, True)
            return len(plain_hits), len(flag_on_hits)

        # ---- Self-join keyed by simjoin.wordset ----
        sentences = ["a b c d", "a b c", "b c d", "a", "a b c d e f", "a b c d e"]
        keyed_sentences = [(simjoin.wordset(text), text) for text in sentences]
        assert count_matches(keyed_sentences) == (10, 4)

        # ---- Self-join keyed by simjoin.gramset with second argument 2 ----
        tokens = ["abcd", "abc", "bcd", "a", "abcdef", "abcde"]
        keyed_tokens = [(simjoin.gramset(text, 2), text) for text in tokens]
        assert count_matches(keyed_tokens) == (6, 1)
Exemple #2
0
    def testJoin(self):
        """Check the number of results when joining two separate collections."""

        def count_matches(indexed_docs, probe_docs):
            # Build the joiner over the first collection, then join the
            # second collection against it twice at threshold 0.4 (without
            # and with the extra ``True`` argument) and report both sizes.
            # NOTE(review): ``True`` presumably selects a weighted variant
            # of the join -- confirm against the simjoin API.
            joiner = simjoin.SimJoin(indexed_docs)
            plain_hits = joiner.join(probe_docs, 0.4)
            flag_on_hits = joiner.join(probe_docs, 0.4, True)
            return len(plain_hits), len(flag_on_hits)

        # ---- Join keyed by simjoin.wordset ----
        # NOTE(review): wordset receives a second argument (2) here, unlike
        # the one-argument call in the self-join test; confirm it is intended.
        left_sentences = ["a b c d", "a b c", "b c d", "a"]
        right_sentences = ["a b c d e f", "a b c d e"]
        keyed_left = [(simjoin.wordset(text, 2), text) for text in left_sentences]
        keyed_right = [(simjoin.wordset(text, 2), text) for text in right_sentences]
        assert count_matches(keyed_left, keyed_right) == (6, 1)

        # ---- Join keyed by simjoin.gramset with second argument 2 ----
        left_tokens = ["abcd", "abc", "a"]
        right_tokens = ["abcdef", "bcd", "abcde"]
        keyed_left = [(simjoin.gramset(text, 2), text) for text in left_tokens]
        keyed_right = [(simjoin.gramset(text, 2), text) for text in right_tokens]
        assert count_matches(keyed_left, keyed_right) == (4, 1)
Exemple #3
0
def results_simjoin(er_result, D1_ER, jaccard_thre):
    """
    Adapter between the similarity join and smart crawl.

    Builds a ``simjoin.SimJoin`` over ``D1_ER``, joins ``er_result`` against
    it (with the third ``join`` argument set to ``True``) and extracts the
    element at index 1 from both sides of every returned pair.

    :param er_result: documents returned by the api at the current iteration;
        handed to ``SimJoin.join`` as its first argument
    :param D1_ER: local database; handed to the ``simjoin.SimJoin`` constructor
    :param jaccard_thre: jaccard threshold handed to ``SimJoin.join``
    :return: tuple ``(match_ids, match_pair)`` where ``match_ids`` is the set
        of ``pair[0][1]`` values and ``match_pair`` is the list of
        ``(pair[0][1], pair[1][1])`` tuples, in the order the join returned them
    """
    # NOTE(review): the inputs are presumably (key, record) tuples, making
    # index 1 the record/identifier part -- confirm against the callers.
    joiner = simjoin.SimJoin(D1_ER)
    joined = joiner.join(er_result, jaccard_thre, True)

    match_pair = [(pair[0][1], pair[1][1]) for pair in joined]
    match_ids = {first for first, _ in match_pair}
    return match_ids, match_pair