Example #1
class JaccardTestCase(unittest.TestCase):
    def setUp(self):
        self.threshold = 0.3
        # Baseline: matches computed over the full cartesian product of A and
        # B; the filter-based joins in the tests must reproduce these exactly.
        self.matches_using_cart_prod = sim_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B, l_attr,
            r_attr, get_jaccard_fn(), self.threshold, ['id'], ['id'])
        self.size_filter = SizeFilter(table_A, tokenized_table_A, l_attr, tok)
        self.size_filter.build_index()
        self.prefix_filter = PrefixFilter(table_A, tokenized_table_A, l_attr,
                                          tok, self.threshold, token_ordering)
        self.prefix_filter.build_index()
        self.position_filter = PositionFilter(table_A, tokenized_table_A,
                                              l_attr, tok, self.threshold,
                                              token_ordering)
        self.position_filter.build_index()
        self.suffix_filter = SuffixFilter(table_A, tokenized_table_A, l_attr,
                                          tok, self.threshold, token_ordering)

    def test_jaccard_match(self):
        # test jaccard with position filter, size filter, suffix filter
        matches = jaccard_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B, l_attr,
            r_attr, self.threshold,
            [self.position_filter, self.size_filter, self.suffix_filter],
            ['id'], ['id'])
        self.assertTrue(compare_matches(self.matches_using_cart_prod, matches))

        # test jaccard with prefix filter, size filter, suffix filter
        matches = jaccard_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B, l_attr,
            r_attr, self.threshold,
            [self.prefix_filter, self.size_filter, self.suffix_filter], ['id'],
            ['id'])
        self.assertTrue(compare_matches(self.matches_using_cart_prod, matches))
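The test above checks that each filter-accelerated join reproduces exactly the
matches found by the cartesian-product baseline computed in setUp. For
reference, a minimal, library-independent sketch of such a brute-force Jaccard
join (the function names and the id-to-token-list mapping are illustrative,
not the actual sim_match API):

def jaccard(l_tokens, r_tokens):
    # Plain Jaccard similarity over token sets.
    l_set, r_set = set(l_tokens), set(r_tokens)
    if not l_set and not r_set:
        return 0.0
    return len(l_set & r_set) / len(l_set | r_set)

def brute_force_jaccard_join(tokenized_A, tokenized_B, threshold):
    # tokenized_A / tokenized_B: dict mapping row id -> list of tokens.
    return {(l_id, r_id)
            for l_id, l_toks in tokenized_A.items()
            for r_id, r_toks in tokenized_B.items()
            if jaccard(l_toks, r_toks) >= threshold}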
Example #2
def jaccard_join_auto(ltable,
                      rtable,
                      l_id_attr,
                      l_join_attr,
                      r_id_attr,
                      r_join_attr,
                      threshold,
                      ltable_output_attrs=None,
                      rtable_output_attrs=None):
    matches_list = []
    sim_function = get_jaccard_fn()
    # Global token ordering derived from the left table's join attribute; it
    # is used both to order probe tokens and to build the position filter index.
    token_ordering = gen_token_ordering(ltable, l_join_attr)
    position_filter = PositionFilter(ltable,
                                     l_id_attr,
                                     l_join_attr,
                                     threshold,
                                     token_ordering,
                                     adaptive_prefix=True)
    position_filter.build_index()

    prog_bar = pyprind.ProgBar(len(rtable.index))

    # Cache the rows of both tables by id so candidate pairs can be looked up
    # in O(1) during the join.
    l_row_dict = {}
    for idx, l_row in ltable.iterrows():
        l_id = l_row[l_id_attr]
        l_row_dict[l_id] = l_row

    r_row_dict = {}
    for idx, r_row in rtable.iterrows():
        r_id = r_row[r_id_attr]
        r_row_dict[r_id] = r_row

    # Probe the index with each right row: order its tokens, fetch candidate
    # left ids, then verify each surviving pair with the actual similarity.
    for r_id, r_row in r_row_dict.items():
        r_tokens = order_using_token_ordering(list(r_row[r_join_attr]),
                                              token_ordering)
        r_num_tokens = len(r_tokens)

        l_cand_ids = position_filter.find_candidates(r_tokens, r_num_tokens,
                                                     threshold)
        for l_id in l_cand_ids:
            l_row = l_row_dict[l_id]
            if sim_function(l_row[l_join_attr],
                            r_row[r_join_attr]) >= threshold:
                match_dict = get_output_attributes(l_row, r_row, l_id_attr,
                                                   l_id, r_id_attr, r_id,
                                                   ltable_output_attrs,
                                                   rtable_output_attrs)
                matches_list.append(match_dict)
        prog_bar.update()

    output_matches = pd.DataFrame(matches_list)
    return output_matches
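A hypothetical driver for jaccard_join_auto, assuming the helpers it relies on
(get_jaccard_fn, gen_token_ordering, PositionFilter, order_using_token_ordering,
get_output_attributes, pyprind) are importable in the surrounding module. The
tables, ids and column names below are made up; note that the join attribute is
treated as an iterable of tokens, since the function calls
list(r_row[r_join_attr]) before ordering.

import pandas as pd

ltable = pd.DataFrame({'id': [1, 2],
                       'name': [['data', 'science'], ['string', 'matching']]})
rtable = pd.DataFrame({'id': [7, 8],
                       'name': [['data', 'sciences'], ['string', 'match']]})

matches = jaccard_join_auto(ltable, rtable,
                            l_id_attr='id', l_join_attr='name',
                            r_id_attr='id', r_join_attr='name',
                            threshold=0.5,
                            ltable_output_attrs=['name'],
                            rtable_output_attrs=['name'])
print(matches)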
Example #3
class PositionFilterTestCase(unittest.TestCase):
    def setUp(self):
        # Build a position filter index over table A's tokenized 'str'
        # attribute at threshold 0.8.
        self.position_filter = PositionFilter(A, A_tokenized, 'str', tok, 0.8,
                                              token_ordering)
        self.position_filter.build_index()

    def test_apply_filter(self):
        # position filter satisfies
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering(['fg', 'cd', 'aa', 'ef'],
                                              token_ordering)
        self.assertTrue(
            self.position_filter.apply_filter(l_tokens,
                                              r_tokens, len(l_tokens),
                                              len(r_tokens), 0.8))

        # position filter doesn't satisfy
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering(['fg'], token_ordering)
        self.assertFalse(
            self.position_filter.apply_filter(l_tokens,
                                              r_tokens, len(l_tokens),
                                              len(r_tokens), 0.8))

        # prefix filter satisfies but position filter doesn't satisfy
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering(['aa'], token_ordering)
        self.assertFalse(
            self.position_filter.apply_filter(l_tokens,
                                              r_tokens, len(l_tokens),
                                              len(r_tokens), 0.8))
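        # Why this differs from a plain prefix check: 'aa' occurs in both
        # ordered prefixes, so a prefix-only filter would keep the pair.
        # Assuming the filter targets Jaccard at threshold 0.8 (as in the
        # other examples here), the pair would need an overlap of at least
        # ceil(0.8 / 1.8 * (5 + 1)) = 3 shared tokens, which is impossible
        # when r_tokens holds a single token - the positional information
        # lets the filter detect this and prune the pair.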

        # test empty list of tokens
        l_tokens = order_using_token_ordering(['aa', 'bb', 'cd', 'ef', 'fg'],
                                              token_ordering)
        r_tokens = order_using_token_ordering([], token_ordering)
        self.assertFalse(
            self.position_filter.apply_filter(l_tokens,
                                              r_tokens, len(l_tokens),
                                              len(r_tokens), 0.8))
        self.assertFalse(
            self.position_filter.apply_filter(r_tokens,
                                              l_tokens, len(r_tokens),
                                              len(l_tokens), 0.8))

    def test_find_candidates(self):
        # test default case (presence of candidates)
        tokens = order_using_token_ordering(['aa', 'ef', 'ab', 'cd'],
                                            token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set([0, 3]))

        # test empty set of candidates
        tokens = order_using_token_ordering(['op', 'lp', 'mp'], token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set())

        # prefix index returns 2 candidates whereas the position index prunes them
        tokens = order_using_token_ordering(['aa', 'ef', 'lp'], token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set())

        # test empty list of probe tokens
        tokens = order_using_token_ordering([], token_ordering)
        self.assertSetEqual(
            self.position_filter.find_candidates(tokens, len(tokens), 0.8),
            set())