class JaccardTestCase(unittest.TestCase):
    """Check that filtered jaccard_match reproduces the cartesian-product baseline."""

    def setUp(self):
        self.threshold = 0.3
        # Baseline: all matching pairs computed over the full cartesian product.
        self.matches_using_cart_prod = sim_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B,
            l_attr, r_attr, get_jaccard_fn(), self.threshold,
            ['id'], ['id'])
        self.size_filter = SizeFilter(table_A, tokenized_table_A, l_attr, tok)
        self.size_filter.build_index()
        self.prefix_filter = PrefixFilter(
            table_A, tokenized_table_A, l_attr, tok,
            self.threshold, token_ordering)
        self.prefix_filter.build_index()
        self.position_filter = PositionFilter(
            table_A, tokenized_table_A, l_attr, tok,
            self.threshold, token_ordering)
        self.position_filter.build_index()
        # The suffix filter is not indexed here.
        self.suffix_filter = SuffixFilter(
            table_A, tokenized_table_A, l_attr, tok,
            self.threshold, token_ordering)

    def test_jaccard_match(self):
        # Each filter combination must yield exactly the baseline matches.
        filter_combos = (
            [self.position_filter, self.size_filter, self.suffix_filter],
            [self.prefix_filter, self.size_filter, self.suffix_filter],
        )
        for filters in filter_combos:
            matches = jaccard_match(
                table_A, table_B, tokenized_table_A, tokenized_table_B,
                l_attr, r_attr, self.threshold, filters, ['id'], ['id'])
            self.assertTrue(
                compare_matches(self.matches_using_cart_prod, matches))
def test_filter_pair(self, lstring, rstring, tokenizer, sim_measure_type,
                     threshold, allow_empty, allow_missing, expected_output):
    """Filtering a single string pair yields the expected boolean decision."""
    filt = SizeFilter(tokenizer, sim_measure_type, threshold,
                      allow_empty, allow_missing)
    assert_equal(filt.filter_pair(lstring, rstring), expected_output)
def setUp(self):
    """Build the cartesian-product baseline matches and the indexed filters."""
    self.threshold = 0.3
    self.matches_using_cart_prod = sim_match(
        table_A, table_B, tokenized_table_A, tokenized_table_B,
        l_attr, r_attr, get_jaccard_fn(), self.threshold, ['id'], ['id'])
    self.size_filter = SizeFilter(table_A, tokenized_table_A, l_attr, tok)
    self.size_filter.build_index()
    self.prefix_filter = PrefixFilter(table_A, tokenized_table_A, l_attr,
                                      tok, self.threshold, token_ordering)
    self.prefix_filter.build_index()
    self.position_filter = PositionFilter(table_A, tokenized_table_A, l_attr,
                                          tok, self.threshold, token_ordering)
    self.position_filter.build_index()
    # Only the suffix filter is created without building an index.
    self.suffix_filter = SuffixFilter(table_A, tokenized_table_A, l_attr,
                                      tok, self.threshold, token_ordering)
def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                       allow_empty, allow_missing, args, expected_pairs):
    """filter_tables emits the expected output schema and candidate pairs.

    `args` is the positional argument tuple forwarded to filter_tables;
    optional output attrs/prefixes live at indices 6-9 when present.
    """
    filt = SizeFilter(tokenizer, sim_measure_type, threshold,
                      allow_empty, allow_missing)
    candset = filt.filter_tables(*args)

    # Prefixes fall back to the test-class defaults when not passed in args.
    l_prefix = args[8] if len(args) > 8 else self.default_l_out_prefix
    r_prefix = args[9] if len(args) > 9 else self.default_r_out_prefix

    # Key columns are always expected; extra output attrs only when requested.
    expected_cols = ['_id', l_prefix + args[2], r_prefix + args[3]]
    if len(args) > 6 and args[6]:
        expected_cols.extend(
            l_prefix + attr
            for attr in remove_redundant_attrs(args[6], args[2]))
    if len(args) > 7 and args[7]:
        expected_cols.extend(
            r_prefix + attr
            for attr in remove_redundant_attrs(args[7], args[3]))

    # Verify the output table carries exactly the expected attributes.
    assert_list_equal(list(candset.columns.values), expected_cols)

    actual_pairs = {
        ','.join((str(row[l_prefix + args[2]]), str(row[r_prefix + args[3]])))
        for _, row in candset.iterrows()}

    # Verify the actual pairs and the expected pairs coincide.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(actual_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def test_filter_candset(self, tokenizer, sim_measure_type, threshold,
                        args, expected_pairs):
    """filter_candset preserves the input schema and keeps the expected pairs."""
    filt = SizeFilter(tokenizer, sim_measure_type, threshold)
    filtered = filt.filter_candset(*args)

    # The output must carry the input candset's columns unchanged.
    assert_list_equal(list(filtered.columns.values),
                      list(args[0].columns.values))

    actual_pairs = {','.join((str(row[args[1]]), str(row[args[2]])))
                    for _, row in filtered.iterrows()}

    # Verify the surviving pairs match the expected pairs exactly.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(actual_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
def test_filter_candset(self, tokenizer, sim_measure_type, threshold,
                        allow_empty, allow_missing, args, expected_pairs):
    """filter_candset (with empty/missing flags) keeps schema and expected pairs."""
    filt = SizeFilter(tokenizer, sim_measure_type, threshold,
                      allow_empty, allow_missing)
    filtered = filt.filter_candset(*args)

    # Output columns must be identical to the input candset's columns.
    assert_list_equal(list(filtered.columns.values),
                      list(args[0].columns.values))

    actual_pairs = set()
    for _, row in filtered.iterrows():
        # args[1]/args[2] name the left/right key columns of the candset.
        actual_pairs.add(','.join((str(row[args[1]]), str(row[args[2]]))))

    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(actual_pairs.intersection(expected_pairs)),
                 len(expected_pairs))
class SizeFilterTestCase(unittest.TestCase):
    """Unit tests for SizeFilter.apply_filter and SizeFilter.find_candidates."""

    def setUp(self):
        self.size_filter = SizeFilter(A, A_tokenized, 'str', tok)
        self.size_filter.build_index()

    def test_apply_filter(self):
        base = ['aa', 'bb', 'cd', 'ef', 'fg']

        # 5 vs 4 tokens passes the size bound at threshold 0.8.
        other = ['xx', 'yy', 'aa', 'bb']
        self.assertTrue(self.size_filter.apply_filter(
            base, other, len(base), len(other), 0.8))

        # 5 vs 1 tokens is pruned.
        other = ['xx']
        self.assertFalse(self.size_filter.apply_filter(
            base, other, len(base), len(other), 0.8))

        # An empty token list is pruned regardless of which side it is on.
        other = []
        self.assertFalse(self.size_filter.apply_filter(
            base, other, len(base), len(other), 0.8))
        self.assertFalse(self.size_filter.apply_filter(
            other, base, len(other), len(base), 0.8))

    def test_find_candidates(self):
        # Default case: some indexed records are size-compatible.
        probe = ['aa', 'xx', 'yy', 'uu']
        self.assertSetEqual(
            self.size_filter.find_candidates(probe, len(probe), 0.8),
            set([0, 3, 4]))

        # No indexed record is compatible with an 8-token probe.
        probe = ['aa', 'op', 'xx', 'yy', 'uu', 'yu', 'iu', 'lp']
        self.assertSetEqual(
            self.size_filter.find_candidates(probe, len(probe), 0.8),
            set())

        # An empty probe yields no candidates.
        probe = []
        self.assertSetEqual(
            self.size_filter.find_candidates(probe, len(probe), 0.8),
            set())
def test_invalid_threshold(self):
    """Constructing a SizeFilter with a threshold above 1 is invalid.

    NOTE(review): no assertion is visible here — presumably an @raises
    decorator outside this view captures the expected error; confirm.
    """
    bad_filter = SizeFilter(self.tokenizer, self.sim_measure_type, 1.2)
def test_invalid_r_out_attr(self):
    """Requesting a non-existent right output attribute must be rejected."""
    filt = SizeFilter(self.tokenizer, self.sim_measure_type, self.threshold)
    filt.filter_tables(self.A, self.B, 'A.id', 'B.id',
                       'A.attr', 'B.attr',
                       ['A.attr'], ['B.invalid_attr'])
def test_invalid_sim_measure_type(self):
    """An unrecognized similarity measure name must be rejected at construction."""
    bad_filter = SizeFilter(self.tokenizer, 'INVALID_TYPE', self.threshold)
def test_invalid_rtable(self):
    """Passing a non-dataframe (a list) as the right table must be rejected."""
    filt = SizeFilter(self.tokenizer, self.sim_measure_type, self.threshold)
    filt.filter_tables(self.A, [], 'A.id', 'B.id', 'A.attr', 'B.attr')
def test_invalid_rtable(self):
    """A list in place of the right table is not a valid input."""
    size_filt = SizeFilter(self.tokenizer, self.sim_measure_type,
                           self.threshold)
    size_filt.filter_tables(self.A, [],
                            'A.id', 'B.id', 'A.attr', 'B.attr')
def test_numeric_r_filter_attr(self):
    """Filtering on a numeric right attribute must be rejected."""
    filt = SizeFilter(self.tokenizer, self.sim_measure_type, self.threshold)
    filt.filter_tables(self.A, self.B, 'A.id', 'B.id',
                       'A.attr', 'B.int_attr')
def test_invalid_r_out_attr(self):
    """A right output attribute missing from table B must be rejected."""
    size_filt = SizeFilter(self.tokenizer, self.sim_measure_type,
                           self.threshold)
    size_filt.filter_tables(self.A, self.B,
                            'A.id', 'B.id', 'A.attr', 'B.attr',
                            ['A.attr'], ['B.invalid_attr'])
def setUp(self):
    """Create a size filter over table A and build its size index."""
    self.size_filter = SizeFilter(A, A_tokenized, 'str', tok)
    self.size_filter.build_index()
def test_filter_pair(self, lstring, rstring, tokenizer, sim_measure_type,
                     threshold, allow_empty, allow_missing, expected_output):
    """A single pair is filtered to the expected decision."""
    size_filt = SizeFilter(tokenizer, sim_measure_type, threshold,
                           allow_empty, allow_missing)
    result = size_filt.filter_pair(lstring, rstring)
    assert_equal(result, expected_output)
def test_invalid_tokenizer_for_edit_distance(self):
    """EDIT_DISTANCE with this tokenizer must be rejected at construction."""
    bad_filter = SizeFilter(self.tokenizer, 'EDIT_DISTANCE', 2)