class JaccardTestCase(unittest.TestCase):
    """Compare ``jaccard_match`` output with a reference ``sim_match`` result.

    ``setUp`` computes reference matches with ``sim_match`` using the
    Jaccard similarity function, and prepares the size, prefix, position
    and suffix filters that the test combines.
    """

    def setUp(self):
        self.threshold = 0.3

        # Reference result every filtered join is compared against.
        self.matches_using_cart_prod = sim_match(
            table_A, table_B,
            tokenized_table_A, tokenized_table_B,
            l_attr, r_attr,
            get_jaccard_fn(), self.threshold,
            ['id'], ['id'])

        self.size_filter = SizeFilter(
            table_A, tokenized_table_A, l_attr, tok)
        self.size_filter.build_index()

        self.prefix_filter = PrefixFilter(
            table_A, tokenized_table_A, l_attr, tok,
            self.threshold, token_ordering)
        self.prefix_filter.build_index()

        self.position_filter = PositionFilter(
            table_A, tokenized_table_A, l_attr, tok,
            self.threshold, token_ordering)
        self.position_filter.build_index()

        # No build_index() call is made for the suffix filter.
        self.suffix_filter = SuffixFilter(
            table_A, tokenized_table_A, l_attr, tok,
            self.threshold, token_ordering)

    def _assert_same_as_reference(self, filters):
        """Run ``jaccard_match`` with *filters* and compare to the reference."""
        found = jaccard_match(
            table_A, table_B,
            tokenized_table_A, tokenized_table_B,
            l_attr, r_attr,
            self.threshold,
            filters,
            ['id'], ['id'])
        self.assertTrue(
            compare_matches(self.matches_using_cart_prod, found))

    def test_jaccard_match(self):
        # Jaccard with position filter, size filter, suffix filter.
        self._assert_same_as_reference(
            [self.position_filter, self.size_filter, self.suffix_filter])

        # Jaccard with prefix filter, size filter, suffix filter.
        self._assert_same_as_reference(
            [self.prefix_filter, self.size_filter, self.suffix_filter])
def jaccard_join_auto(ltable, rtable,
                      l_id_attr, l_join_attr,
                      r_id_attr, r_join_attr,
                      threshold,
                      ltable_output_attrs=None, rtable_output_attrs=None):
    """Join two tables on Jaccard similarity, pruning with a position filter.

    A ``PositionFilter`` index is built over ``ltable``; every row of
    ``rtable`` probes that index, and each surviving candidate pair is
    verified with the Jaccard function before being emitted.

    Args:
        ltable: Left table; iterated with ``iterrows()``.
        rtable: Right table; iterated with ``iterrows()``.
        l_id_attr: Column of ``ltable`` used as the row key.
        l_join_attr: Column of ``ltable`` holding the join value.
        r_id_attr: Column of ``rtable`` used as the row key.
        r_join_attr: Column of ``rtable`` holding the join value
            (presumably a collection of tokens -- it is passed through
            ``list()`` before ordering; confirm against callers).
        threshold: Minimum similarity (inclusive) for a pair to match.
        ltable_output_attrs: Forwarded to ``get_output_attributes``.
        rtable_output_attrs: Forwarded to ``get_output_attributes``.

    Returns:
        A ``pandas.DataFrame`` built from the per-match records returned by
        ``get_output_attributes``.
    """
    matched_records = []
    jaccard = get_jaccard_fn()
    ordering = gen_token_ordering(ltable, l_join_attr)

    pos_filter = PositionFilter(ltable, l_id_attr, l_join_attr, threshold,
                                ordering, adaptive_prefix=True)
    pos_filter.build_index()

    # One tick per right-hand row probed.
    progress = pyprind.ProgBar(len(rtable.index))

    # Key both tables by id; a later row with a repeated id replaces the
    # earlier one, exactly as plain dict assignment does.
    left_by_id = {row[l_id_attr]: row for _, row in ltable.iterrows()}
    right_by_id = {row[r_id_attr]: row for _, row in rtable.iterrows()}

    for right_id, right_row in right_by_id.items():
        probe_tokens = order_using_token_ordering(
            list(right_row[r_join_attr]), ordering)
        candidate_ids = pos_filter.find_candidates(
            probe_tokens, len(probe_tokens), threshold)

        for left_id in candidate_ids:
            left_row = left_by_id[left_id]
            similarity = jaccard(left_row[l_join_attr],
                                 right_row[r_join_attr])
            if similarity >= threshold:
                matched_records.append(
                    get_output_attributes(left_row, right_row,
                                          l_id_attr, left_id,
                                          r_id_attr, right_id,
                                          ltable_output_attrs,
                                          rtable_output_attrs))

        progress.update()

    return pd.DataFrame(matched_records)
class PositionFilterTestCase(unittest.TestCase):
    """Tests for ``PositionFilter.apply_filter`` and ``find_candidates``.

    All checks use a filter built over ``A`` with threshold 0.8.
    """

    def setUp(self):
        self.position_filter = PositionFilter(A, A_tokenized, 'str', tok,
                                              0.8, token_ordering)
        self.position_filter.build_index()

    @staticmethod
    def _ordered(raw_tokens):
        """Return *raw_tokens* arranged according to ``token_ordering``."""
        return order_using_token_ordering(raw_tokens, token_ordering)

    def _passes(self, left_tokens, right_tokens):
        """Apply the position filter to two already-ordered token lists."""
        return self.position_filter.apply_filter(
            left_tokens, right_tokens,
            len(left_tokens), len(right_tokens),
            0.8)

    def _candidates(self, raw_tokens):
        """Order *raw_tokens* and probe the filter index with them."""
        probe = self._ordered(raw_tokens)
        return self.position_filter.find_candidates(probe, len(probe), 0.8)

    def test_apply_filter(self):
        # Position filter is satisfied.
        left = self._ordered(['aa', 'bb', 'cd', 'ef', 'fg'])
        right = self._ordered(['fg', 'cd', 'aa', 'ef'])
        self.assertTrue(self._passes(left, right))

        # Position filter is not satisfied.
        left = self._ordered(['aa', 'bb', 'cd', 'ef', 'fg'])
        right = self._ordered(['fg'])
        self.assertFalse(self._passes(left, right))

        # Prefix filter is satisfied, but the position filter is not.
        left = self._ordered(['aa', 'bb', 'cd', 'ef', 'fg'])
        right = self._ordered(['aa'])
        self.assertFalse(self._passes(left, right))

        # Empty token list, tried on either side of the comparison.
        left = self._ordered(['aa', 'bb', 'cd', 'ef', 'fg'])
        right = self._ordered([])
        self.assertFalse(self._passes(left, right))
        self.assertFalse(self._passes(right, left))

    def test_find_candidates(self):
        # Default case: candidates are present.
        self.assertSetEqual(
            self._candidates(['aa', 'ef', 'ab', 'cd']), {0, 3})

        # Empty set of candidates.
        self.assertSetEqual(self._candidates(['op', 'lp', 'mp']), set())

        # The prefix index returns 2 candidates, whereas the position
        # index prunes them.
        self.assertSetEqual(self._candidates(['aa', 'ef', 'lp']), set())

        # Empty list of probe tokens.
        self.assertSetEqual(self._candidates([]), set())