def test_filter_pair(self, lstring, rstring, tokenizer, sim_measure_type,
                     threshold, allow_empty, allow_missing, expected_output):
    """Check ``SuffixFilter.filter_pair`` on a single pair of strings.

    A filter is built from the given tokenizer, similarity measure type,
    threshold and the ``allow_empty`` / ``allow_missing`` flags, and its
    verdict for ``(lstring, rstring)`` is compared to ``expected_output``.
    """
    filter_under_test = SuffixFilter(tokenizer, sim_measure_type, threshold,
                                     allow_empty, allow_missing)
    assert_equal(filter_under_test.filter_pair(lstring, rstring),
                 expected_output)
def test_filter_candset(self, tokenizer, sim_measure_type, threshold, args,
                        expected_pairs):
    """Check ``SuffixFilter.filter_candset`` against a known set of pairs.

    ``args`` is forwarded verbatim to ``filter_candset``; ``args[0]`` is the
    input candidate set and ``args[1]`` / ``args[2]`` name the two columns
    whose values identify a pair.  ``expected_pairs`` holds the surviving
    pairs encoded as ``"<left>,<right>"`` strings.
    """
    candset_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    filtered_candset = candset_filter.filter_candset(*args)

    # The filtered table must keep exactly the columns of the input candset.
    assert_list_equal(list(filtered_candset.columns.values),
                      list(args[0].columns.values))

    # Encode every surviving row as "<left>,<right>".
    actual_pairs = {
        ','.join((str(row[args[1]]), str(row[args[2]])))
        for _, row in filtered_candset.iterrows()
    }

    # Same size and full overlap together mean the two pair sets coincide.
    assert_equal(len(expected_pairs), len(actual_pairs))
    shared_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(shared_pairs), len(expected_pairs))
def test_filter_candset(self, tokenizer, sim_measure_type, threshold,
                        allow_empty, allow_missing, args, expected_pairs):
    """Check ``SuffixFilter.filter_candset`` against a known set of pairs.

    The filter is configured with the ``allow_empty`` and ``allow_missing``
    flags.  ``args`` is forwarded verbatim to ``filter_candset``; ``args[0]``
    is the input candidate set and ``args[1]`` / ``args[2]`` name the two
    columns whose values identify a pair.  ``expected_pairs`` holds the
    surviving pairs encoded as ``"<left>,<right>"`` strings.
    """
    candset_filter = SuffixFilter(tokenizer, sim_measure_type, threshold,
                                  allow_empty, allow_missing)
    filtered_candset = candset_filter.filter_candset(*args)

    # The filtered table must keep exactly the columns of the input candset.
    assert_list_equal(list(filtered_candset.columns.values),
                      list(args[0].columns.values))

    # Encode every surviving row as "<left>,<right>".
    actual_pairs = {
        ','.join((str(row[args[1]]), str(row[args[2]])))
        for _, row in filtered_candset.iterrows()
    }

    # Same size and full overlap together mean the two pair sets coincide.
    assert_equal(len(expected_pairs), len(actual_pairs))
    shared_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(shared_pairs), len(expected_pairs))
def test_filter_tables(self, tokenizer, sim_measure_type, threshold, args,
                       expected_pairs):
    """Check ``SuffixFilter.filter_tables`` against a known set of pairs.

    ``args`` is forwarded verbatim to ``filter_tables``.  The positions read
    here are: ``args[2]`` / ``args[3]`` (left / right key attributes),
    ``args[6]`` / ``args[7]`` (optional left / right output attributes) and
    ``args[8]`` / ``args[9]`` (optional left / right output prefixes).
    ``expected_pairs`` holds the expected pairs as ``"<l_key>,<r_key>"``.
    """
    table_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    actual_candset = table_filter.filter_tables(*args)

    num_args = len(args)

    # Start from the default prefixes and override them when args carries
    # explicit ones.
    l_out_prefix, r_out_prefix = (self.default_l_out_prefix,
                                  self.default_r_out_prefix)
    if num_args > 8:
        l_out_prefix = args[8]
    if num_args > 9:
        r_out_prefix = args[9]

    # Optional output attributes; absent or empty means none are requested.
    l_extra_attrs = args[6] if num_args > 6 and args[6] else []
    r_extra_attrs = args[7] if num_args > 7 and args[7] else []

    # Expected layout: _id, left key, left output attrs, right key,
    # right output attrs.
    expected_output_attrs = ['_id', l_out_prefix + args[2]]
    expected_output_attrs.extend(l_out_prefix + attr
                                 for attr in l_extra_attrs)
    expected_output_attrs.append(r_out_prefix + args[3])
    expected_output_attrs.extend(r_out_prefix + attr
                                 for attr in r_extra_attrs)

    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    # Encode every output row as "<l_key>,<r_key>".
    actual_pairs = {
        ','.join((str(row[l_out_prefix + args[2]]),
                  str(row[r_out_prefix + args[3]])))
        for _, row in actual_candset.iterrows()
    }

    # Same size and full overlap together mean the two pair sets coincide.
    assert_equal(len(expected_pairs), len(actual_pairs))
    shared_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(shared_pairs), len(expected_pairs))
class SuffixFilterTestCase(unittest.TestCase):
    """Tests for ``SuffixFilter.apply_filter`` on hand-written token lists."""

    def setUp(self):
        # Built from the module-level fixtures (A, A_tokenized, tok,
        # token_ordering) with 0.8 in the threshold position.
        self.suffix_filter = SuffixFilter(A, A_tokenized, 'str', tok, 0.8,
                                          token_ordering)

    def test_apply_filter(self):
        def ordered(tokens):
            # Arrange tokens according to the shared token ordering.
            return order_using_token_ordering(tokens, token_ordering)

        def passes(left, right):
            # Run the filter on two already-ordered token lists.
            return self.suffix_filter.apply_filter(
                left, right, len(left), len(right), 0.8)

        # suffix filter satisfies
        l_tokens = ordered(['aa', 'bb', 'cd', 'ef', 'fg'])
        r_tokens = ordered(['fg', 'cd', 'aa'])
        self.assertTrue(passes(l_tokens, r_tokens))

        # suffix filter doesn't satisfy
        l_tokens = ordered(['aa', 'bb', 'cd', 'ef', 'fg'])
        r_tokens = ordered(['fg'])
        self.assertFalse(passes(l_tokens, r_tokens))

        # position filter satisfies but suffix filter doesn't satisfy
        l_tokens = ordered(['aa', 'cd', 'ef', 'fg'])
        r_tokens = ordered(['cd', 'xx', 'xy', 'aa'])
        self.assertFalse(passes(l_tokens, r_tokens))

        # an empty token list on either side must be rejected
        l_tokens = ordered(['aa', 'bb', 'cd', 'ef', 'fg'])
        r_tokens = ordered([])
        self.assertFalse(passes(l_tokens, r_tokens))
        self.assertFalse(passes(r_tokens, l_tokens))
def setUp(self):
    """Build the reference matches and the filter fixtures for the tests.

    Uses the module-level tables, tokenized tables, join attributes,
    tokenizer and token ordering with a threshold of 0.3.
    """
    threshold = 0.3
    self.threshold = threshold

    # Reference result computed by sim_match with the Jaccard function.
    self.matches_using_cart_prod = sim_match(
        table_A, table_B, tokenized_table_A, tokenized_table_B,
        l_attr, r_attr, get_jaccard_fn(), threshold, ['id'], ['id'])

    # Size, prefix and position filters each get their index built here.
    size_filter = SizeFilter(table_A, tokenized_table_A, l_attr, tok)
    self.size_filter = size_filter
    size_filter.build_index()

    prefix_filter = PrefixFilter(table_A, tokenized_table_A, l_attr, tok,
                                 threshold, token_ordering)
    self.prefix_filter = prefix_filter
    prefix_filter.build_index()

    position_filter = PositionFilter(table_A, tokenized_table_A, l_attr, tok,
                                     threshold, token_ordering)
    self.position_filter = position_filter
    position_filter.build_index()

    # No build_index() call is made for the suffix filter.
    self.suffix_filter = SuffixFilter(table_A, tokenized_table_A, l_attr, tok,
                                      threshold, token_ordering)
def test_invalid_r_out_attr(self):
    """Call ``filter_tables`` with ``'B.invalid_attr'`` as a right output attr.

    NOTE(review): presumably decorated to expect an exception; the decorator
    is not visible in this block.
    """
    table_filter = SuffixFilter(self.tokenizer, self.sim_measure_type,
                                self.threshold)
    l_out_attrs = ['A.attr']
    r_out_attrs = ['B.invalid_attr']
    table_filter.filter_tables(self.A, self.B, 'A.id', 'B.id',
                               'A.attr', 'B.attr',
                               l_out_attrs, r_out_attrs)
def test_invalid_rtable(self):
    """Call ``filter_tables`` with a plain empty list as the right table.

    NOTE(review): presumably decorated to expect an exception; the decorator
    is not visible in this block.
    """
    table_filter = SuffixFilter(self.tokenizer, self.sim_measure_type,
                                self.threshold)
    not_a_table = []
    table_filter.filter_tables(self.A, not_a_table, 'A.id', 'B.id',
                               'A.attr', 'B.attr')
def test_numeric_r_filter_attr(self):
    """Call ``filter_tables`` with ``'B.int_attr'`` as the right filter attr.

    NOTE(review): presumably decorated to expect an exception; the decorator
    is not visible in this block.
    """
    table_filter = SuffixFilter(self.tokenizer, self.sim_measure_type,
                                self.threshold)
    r_filter_attr = 'B.int_attr'
    table_filter.filter_tables(self.A, self.B, 'A.id', 'B.id',
                               'A.attr', r_filter_attr)
def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                       allow_empty, allow_missing, args):
    """Check that ``filter_tables`` keeps every pair in the true join output.

    The join output is computed by brute force over all row pairs of
    ``args[0]`` x ``args[1]``, and must be fully contained in the filter's
    output.  ``args`` is forwarded verbatim to ``filter_tables``; positions
    read here are ``args[2]`` / ``args[3]`` (key attributes), ``args[4]`` /
    ``args[5]`` (join attributes), ``args[6]`` / ``args[7]`` (optional output
    attributes) and ``args[8]`` / ``args[9]`` (optional output prefixes).
    """
    table_filter = SuffixFilter(tokenizer, sim_measure_type, threshold,
                                allow_empty, allow_missing)
    sim_fn = get_sim_function(sim_measure_type)

    is_edit_distance = sim_measure_type == 'EDIT_DISTANCE'
    # The both-empty special case is skipped for these two measures.
    empty_rule_applies = sim_measure_type not in ['OVERLAP', 'EDIT_DISTANCE']

    # Brute-force computation of the join output pairs.
    join_output_pairs = set()
    for _, l_row in args[0].iterrows():
        for _, r_row in args[1].iterrows():
            if pd.isnull(l_row[args[4]]) or pd.isnull(r_row[args[5]]):
                # A pair with a missing value joins only if allow_missing.
                is_match = allow_missing
            else:
                l_text = str(l_row[args[4]])
                r_text = str(r_row[args[5]])
                if is_edit_distance:
                    # Edit distance compares raw strings, lower is better.
                    l_join_val, r_join_val = l_text, r_text
                else:
                    l_join_val = tokenizer.tokenize(l_text)
                    r_join_val = tokenizer.tokenize(r_text)
                comp_fn = COMP_OP_MAP['<=' if is_edit_distance else '>=']

                if (empty_rule_applies and len(l_join_val) == 0
                        and len(r_join_val) == 0):
                    # Two empty values join only if allow_empty.
                    is_match = allow_empty
                else:
                    # Otherwise apply the actual join condition.
                    is_match = comp_fn(sim_fn(l_join_val, r_join_val),
                                       threshold)

            if is_match:
                join_output_pairs.add(','.join((str(l_row[args[2]]),
                                                str(r_row[args[3]]))))

    actual_candset = table_filter.filter_tables(*args)

    num_args = len(args)

    # Start from the default prefixes and override them when args carries
    # explicit ones.
    l_out_prefix, r_out_prefix = (self.default_l_out_prefix,
                                  self.default_r_out_prefix)
    if num_args > 8:
        l_out_prefix = args[8]
    if num_args > 9:
        r_out_prefix = args[9]

    # Expected layout: _id, both keys, then the de-duplicated left and right
    # output attributes.
    expected_output_attrs = ['_id',
                             l_out_prefix + args[2],
                             r_out_prefix + args[3]]
    if num_args > 6 and args[6]:
        expected_output_attrs.extend(
            l_out_prefix + attr
            for attr in remove_redundant_attrs(args[6], args[2]))
    if num_args > 7 and args[7]:
        expected_output_attrs.extend(
            r_out_prefix + attr
            for attr in remove_redundant_attrs(args[7], args[3]))

    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    # Output keys are cast to int before being encoded as "<l_key>,<r_key>".
    actual_pairs = {
        ','.join((str(int(row[l_out_prefix + args[2]])),
                  str(int(row[r_out_prefix + args[3]]))))
        for _, row in actual_candset.iterrows()
    }

    # Every true join pair must be present in the filter output.
    shared_pairs = actual_pairs.intersection(join_output_pairs)
    assert_equal(len(shared_pairs), len(join_output_pairs))
def test_filter_tables(self, tokenizer, sim_measure_type, threshold,
                       allow_empty, allow_missing, args):
    """Check that ``filter_tables`` keeps every pair in the true join output.

    The join output is computed by brute force over all row pairs of
    ``args[0]`` x ``args[1]``, and must be fully contained in the filter's
    output.  ``args`` is forwarded verbatim to ``filter_tables``; positions
    read here are ``args[2]`` / ``args[3]`` (key attributes), ``args[4]`` /
    ``args[5]`` (join attributes), ``args[6]`` / ``args[7]`` (optional output
    attributes) and ``args[8]`` / ``args[9]`` (optional output prefixes).
    """
    table_filter = SuffixFilter(tokenizer, sim_measure_type, threshold,
                                allow_empty, allow_missing)
    sim_fn = get_sim_function(sim_measure_type)

    def pair_key(l_row, r_row):
        # Encode a row pair as "<l_key>,<r_key>".
        return ','.join((str(l_row[args[2]]), str(r_row[args[3]])))

    uses_edit_distance = sim_measure_type == 'EDIT_DISTANCE'
    # The both-empty special case is skipped for these two measures.
    skips_empty_rule = sim_measure_type in ['OVERLAP', 'EDIT_DISTANCE']

    # Brute-force computation of the join output pairs.
    join_output_pairs = set()
    for _, l_row in args[0].iterrows():
        for _, r_row in args[1].iterrows():
            # A pair with a missing value joins only if allow_missing.
            if pd.isnull(l_row[args[4]]) or pd.isnull(r_row[args[5]]):
                if allow_missing:
                    join_output_pairs.add(pair_key(l_row, r_row))
                continue

            l_text = str(l_row[args[4]])
            r_text = str(r_row[args[5]])
            if uses_edit_distance:
                # Edit distance compares raw strings, lower is better.
                l_join_val, r_join_val = l_text, r_text
                comp_fn = COMP_OP_MAP['<=']
            else:
                l_join_val = tokenizer.tokenize(l_text)
                r_join_val = tokenizer.tokenize(r_text)
                comp_fn = COMP_OP_MAP['>=']

            # Two empty values join only if allow_empty.
            both_empty = len(l_join_val) == 0 and len(r_join_val) == 0
            if both_empty and not skips_empty_rule:
                if allow_empty:
                    join_output_pairs.add(pair_key(l_row, r_row))
                continue

            # Neither missing nor empty: apply the actual join condition.
            if comp_fn(sim_fn(l_join_val, r_join_val), threshold):
                join_output_pairs.add(pair_key(l_row, r_row))

    actual_candset = table_filter.filter_tables(*args)

    arg_count = len(args)

    # Start from the default prefixes and override them when args carries
    # explicit ones.
    l_out_prefix = self.default_l_out_prefix
    r_out_prefix = self.default_r_out_prefix
    if arg_count > 8:
        l_out_prefix = args[8]
    if arg_count > 9:
        r_out_prefix = args[9]

    # Expected layout: _id, both keys, then the de-duplicated left and right
    # output attributes.
    expected_output_attrs = ['_id',
                             l_out_prefix + args[2],
                             r_out_prefix + args[3]]
    if arg_count > 6 and args[6]:
        expected_output_attrs += [
            l_out_prefix + attr
            for attr in remove_redundant_attrs(args[6], args[2])]
    if arg_count > 7 and args[7]:
        expected_output_attrs += [
            r_out_prefix + attr
            for attr in remove_redundant_attrs(args[7], args[3])]

    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    # Output keys are cast to int before being encoded as "<l_key>,<r_key>".
    l_key_col = l_out_prefix + args[2]
    r_key_col = r_out_prefix + args[3]
    actual_pairs = set()
    for _, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(int(row[l_key_col])),
                                   str(int(row[r_key_col])))))

    # Every true join pair must be present in the filter output.
    common_pairs = actual_pairs.intersection(join_output_pairs)
    assert_equal(len(common_pairs), len(join_output_pairs))
def _set_sim_join_split(ltable, rtable,
                        l_key_attr, r_key_attr,
                        l_join_attr, r_join_attr,
                        tokenizer, sim_measure_type, threshold,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix,
                        out_sim_score):
    """Perform set similarity join for a split of ltable and rtable.

    A position index is built over the tokenized ``l_join_attr`` values.
    Each ``rtable`` row is probed against it, candidates are pruned with the
    suffix filter, and surviving pairs are verified with the similarity
    function for ``sim_measure_type``.

    Args:
        ltable, rtable: input tables (objects exposing ``columns.values``).
        l_key_attr, r_key_attr: key column names in ltable / rtable.
        l_join_attr, r_join_attr: join column names in ltable / rtable.
        tokenizer: tokenizer passed to the tokenization, index and filter
            helpers.
        sim_measure_type: similarity measure identifier.
        threshold: a pair is output when its score is >= this value.
        l_out_attrs, r_out_attrs: extra columns to output, or None.
        l_out_prefix, r_out_prefix: prefixes used for the output header.
        out_sim_score: when truthy, a trailing ``_sim_score`` column is added.

    Returns:
        A ``pandas.DataFrame`` with one row per matching pair.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)

    # build a dictionary on each table
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    # generate token ordering using tokens in l_join_attr and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable_dict.values(), rtable_dict.values()],
        [l_join_attr_index, r_join_attr_index],
        tokenizer, sim_measure_type)

    # build a dictionary of tokenized (and ordered) l_join_attr values,
    # keyed by the ltable key
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = order_using_token_ordering(
            tokenize(str(row[l_join_attr_index]), tokenizer,
                     sim_measure_type),
            token_ordering)

    # build position index on l_join_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    position_index.build()

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    # one progress tick per rtable row
    prog_bar = pyprind.ProgBar(len(rtable_dict))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])

        # Skip empty strings, but still tick the progress bar: it was sized
        # to the full row count and would otherwise never complete.
        if not r_string:
            prog_bar.update()
            continue

        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens, sim_measure_type,
                                            threshold, tokenizer)

        candidate_overlap = find_candidates_position_filter(
            r_ordered_tokens, r_num_tokens, r_prefix_length,
            pos_filter, position_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = l_join_attr_dict[cand]
                l_num_tokens = position_index.get_size(cand)
                l_prefix_length = get_prefix_length(
                    l_num_tokens, sim_measure_type, threshold, tokenizer)

                # Only pairs the suffix filter does not drop are verified.
                if not suffix_filter._filter_suffix(
                        l_ordered_tokens[l_prefix_length:],
                        r_ordered_tokens[r_prefix_length:],
                        l_prefix_length, r_prefix_length,
                        l_num_tokens, r_num_tokens):
                    sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)
                    if sim_score >= threshold:
                        if has_output_attributes:
                            output_row = get_output_row_from_tables(
                                ltable_dict[cand], r_row,
                                cand, r_id,
                                l_out_attrs_indices, r_out_attrs_indices)
                        else:
                            output_row = [cand, r_id]

                        if out_sim_score:
                            output_row.append(sim_score)
                        output_rows.append(output_row)

        prog_bar.update()

    output_header = get_output_header_from_tables(
        l_key_attr, r_key_attr,
        l_out_attrs, r_out_attrs,
        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def setUp(self):
    """Create the ``SuffixFilter`` fixture used by the tests.

    Built from the module-level ``A``, ``A_tokenized``, ``tok`` and
    ``token_ordering`` objects, with ``'str'`` and ``0.8`` as the remaining
    positional arguments.
    """
    attr_name = 'str'
    threshold = 0.8
    self.suffix_filter = SuffixFilter(A, A_tokenized, attr_name, tok,
                                      threshold, token_ordering)
def test_invalid_threshold(self):
    """Construct a ``SuffixFilter`` with a threshold of 1.2.

    NOTE(review): presumably decorated to expect an exception; the decorator
    is not visible in this block.
    """
    # Only construction matters here; the instance itself is not used.
    SuffixFilter(self.tokenizer, self.sim_measure_type, 1.2)
def test_invalid_sim_measure_type(self):
    """Construct a ``SuffixFilter`` with ``'INVALID_TYPE'`` as the measure.

    NOTE(review): presumably decorated to expect an exception; the decorator
    is not visible in this block.
    """
    # Only construction matters here; the instance itself is not used.
    SuffixFilter(self.tokenizer, 'INVALID_TYPE', self.threshold)
def test_invalid_tokenizer_for_edit_distance(self):
    """Construct an ``'EDIT_DISTANCE'`` filter with ``self.tokenizer``.

    The threshold passed is 2.

    NOTE(review): presumably decorated to expect an exception; the decorator
    is not visible in this block.
    """
    # Only construction matters here; the instance itself is not used.
    SuffixFilter(self.tokenizer, 'EDIT_DISTANCE', 2)