def test_filter_tables(self, tokenizer, sim_measure_type, threshold, allow_empty, allow_missing, args, expected_pairs):
    """Run ``SizeFilter.filter_tables`` and validate the resulting candidate set.

    A ``SizeFilter`` is built from the first five parameters and
    ``filter_tables`` is invoked with ``*args``. Two things are then checked:

    * the output columns are exactly ``'_id'``, the prefixed left/right key
      attributes, and any requested (non-redundant) output attributes, in
      that order;
    * the set of ``"<left key>,<right key>"`` strings built from the output
      rows has the same size as ``expected_pairs`` and shares that many
      elements with it.

    Positions read from ``args``: ``[2]``/``[3]`` are the left/right key
    attribute names, ``[6]``/``[7]`` the optional left/right output
    attributes, and ``[8]``/``[9]`` the optional left/right output prefixes.
    When a prefix is not supplied, ``self.default_l_out_prefix`` /
    ``self.default_r_out_prefix`` is used.
    """
    candset = SizeFilter(tokenizer, sim_measure_type, threshold,
                         allow_empty, allow_missing).filter_tables(*args)

    num_args = len(args)

    # Start from the defaults and override with explicit prefixes if given.
    left_prefix = self.default_l_out_prefix
    right_prefix = self.default_r_out_prefix
    if num_args > 8:
        left_prefix = args[8]
    if num_args > 9:
        right_prefix = args[9]

    # Names of the key columns as they should appear in the output table.
    left_key_col = left_prefix + args[2]
    right_key_col = right_prefix + args[3]
    expected_columns = ['_id', left_key_col, right_key_col]

    # Requested output attributes follow the key columns: left ones first.
    if num_args > 6 and args[6]:
        expected_columns.extend(
            left_prefix + name
            for name in remove_redundant_attrs(args[6], args[2]))
    if num_args > 7 and args[7]:
        expected_columns.extend(
            right_prefix + name
            for name in remove_redundant_attrs(args[7], args[3]))

    # The output table must expose exactly the expected attributes.
    assert_list_equal(list(candset.columns.values), expected_columns)

    # Collapse each output row into a "left,right" key string.
    observed_pairs = {
        ','.join((str(record[left_key_col]), str(record[right_key_col])))
        for _, record in candset.iterrows()
    }

    # Same size and a full-size overlap means the two collections of pairs match.
    assert_equal(len(expected_pairs), len(observed_pairs))
    overlap = observed_pairs.intersection(expected_pairs)
    assert_equal(len(overlap), len(expected_pairs))
def test_invalid_r_out_attr(self):
    """Call ``filter_tables`` with ``'B.invalid_attr'`` as the right output attribute.

    The filter is built from ``self.tokenizer``, ``self.sim_measure_type``
    and ``self.threshold``; all other arguments are the regular fixture
    tables and attribute names.

    NOTE(review): presumably ``'B.invalid_attr'`` is not a column of
    ``self.B`` and the call is meant to fail, but no expected-exception
    assertion is visible on this method from here -- confirm one exists.
    NOTE(review): a test with this same name and body appears elsewhere in
    this file; if both live in one class, the later definition shadows the
    earlier one.
    """
    left_out_attrs = ['A.attr']
    right_out_attrs = ['B.invalid_attr']
    SizeFilter(self.tokenizer, self.sim_measure_type, self.threshold).filter_tables(
        self.A, self.B, 'A.id', 'B.id', 'A.attr', 'B.attr',
        left_out_attrs, right_out_attrs)
def test_invalid_rtable(self):
    """Call ``filter_tables`` with an empty list in the right-table position.

    The filter is built from ``self.tokenizer``, ``self.sim_measure_type``
    and ``self.threshold``; the left table is ``self.A`` and the key/filter
    attribute names are the usual fixture values.

    NOTE(review): the call is presumably meant to be rejected, but no
    expected-exception assertion is visible on this method from here --
    confirm one exists.
    NOTE(review): a test with this same name and body appears elsewhere in
    this file; if both live in one class, the later definition shadows the
    earlier one.
    """
    not_a_table = []
    SizeFilter(self.tokenizer, self.sim_measure_type, self.threshold).filter_tables(
        self.A, not_a_table, 'A.id', 'B.id', 'A.attr', 'B.attr')
def test_invalid_r_out_attr(self):
    """Call ``filter_tables`` with ``'B.invalid_attr'`` as the right output attribute.

    The filter is built from ``self.tokenizer``, ``self.sim_measure_type``
    and ``self.threshold``; all other arguments are the regular fixture
    tables and attribute names.

    NOTE(review): presumably ``'B.invalid_attr'`` is not a column of
    ``self.B`` and the call is meant to fail, but no expected-exception
    assertion is visible on this method from here -- confirm one exists.
    NOTE(review): a test with this same name and body appears elsewhere in
    this file; if both live in one class, the later definition shadows the
    earlier one.
    """
    filter_under_test = SizeFilter(self.tokenizer,
                                   self.sim_measure_type,
                                   self.threshold)
    filter_under_test.filter_tables(
        self.A, self.B,
        'A.id', 'B.id',
        'A.attr', 'B.attr',
        ['A.attr'], ['B.invalid_attr'])
def test_numeric_r_filter_attr(self):
    """Call ``filter_tables`` with ``'B.int_attr'`` as the right filter attribute.

    The filter is built from ``self.tokenizer``, ``self.sim_measure_type``
    and ``self.threshold``; the tables, key attributes and left filter
    attribute are the usual fixture values.

    NOTE(review): presumably ``'B.int_attr'`` is a numeric column of
    ``self.B`` and the call is meant to be rejected, but no
    expected-exception assertion is visible on this method from here --
    confirm one exists.
    """
    right_filter_attr = 'B.int_attr'
    SizeFilter(self.tokenizer, self.sim_measure_type, self.threshold).filter_tables(
        self.A, self.B, 'A.id', 'B.id', 'A.attr', right_filter_attr)
def test_invalid_rtable(self):
    """Call ``filter_tables`` with an empty list in the right-table position.

    The filter is built from ``self.tokenizer``, ``self.sim_measure_type``
    and ``self.threshold``; the left table is ``self.A`` and the key/filter
    attribute names are the usual fixture values.

    NOTE(review): the call is presumably meant to be rejected, but no
    expected-exception assertion is visible on this method from here --
    confirm one exists.
    NOTE(review): a test with this same name and body appears elsewhere in
    this file; if both live in one class, the later definition shadows the
    earlier one.
    """
    filter_under_test = SizeFilter(self.tokenizer,
                                   self.sim_measure_type,
                                   self.threshold)
    filter_under_test.filter_tables(
        self.A, [],
        'A.id', 'B.id',
        'A.attr', 'B.attr')