Beispiel #1
0
class JaccardTestCase(unittest.TestCase):
    """Check ``jaccard_match`` against a ``sim_match`` baseline.

    ``setUp`` computes reference matches with ``sim_match`` using
    ``get_jaccard_fn()`` and prepares size, prefix, position and suffix
    filters over ``table_A``.  The test then runs ``jaccard_match`` with
    different filter combinations and requires ``compare_matches`` to
    report agreement with the reference.
    """

    def setUp(self):
        # Reference result; judging by the attribute name this is presumably
        # an unfiltered cartesian-product comparison -- confirm in sim_match.
        self.threshold = 0.3
        self.matches_using_cart_prod = sim_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B,
            l_attr, r_attr, get_jaccard_fn(), self.threshold,
            ['id'], ['id'])

        # The size filter takes neither a threshold nor a token ordering.
        self.size_filter = SizeFilter(table_A, tokenized_table_A, l_attr, tok)
        self.size_filter.build_index()

        # The remaining filters share one positional argument list.
        ordered_filter_args = (table_A, tokenized_table_A, l_attr, tok,
                               self.threshold, token_ordering)

        self.prefix_filter = PrefixFilter(*ordered_filter_args)
        self.prefix_filter.build_index()

        self.position_filter = PositionFilter(*ordered_filter_args)
        self.position_filter.build_index()

        # No build_index() call is made on the suffix filter here.
        self.suffix_filter = SuffixFilter(*ordered_filter_args)

    def _assert_agrees_with_baseline(self, filters):
        """Run ``jaccard_match`` with ``filters`` and compare to the baseline."""
        found = jaccard_match(
            table_A, table_B, tokenized_table_A, tokenized_table_B,
            l_attr, r_attr, self.threshold, filters, ['id'], ['id'])
        self.assertTrue(compare_matches(self.matches_using_cart_prod, found))

    def test_jaccard_match(self):
        # position filter + size filter + suffix filter
        self._assert_agrees_with_baseline(
            [self.position_filter, self.size_filter, self.suffix_filter])

        # prefix filter + size filter + suffix filter
        self._assert_agrees_with_baseline(
            [self.prefix_filter, self.size_filter, self.suffix_filter])
Beispiel #2
0
class SizeFilterTestCase(unittest.TestCase):
    """Exercise ``SizeFilter.apply_filter`` and ``SizeFilter.find_candidates``.

    Every check uses a threshold of 0.8 against a filter indexed over the
    module-level fixture ``A`` (attribute ``'str'``).
    """

    def setUp(self):
        self.size_filter = SizeFilter(A, A_tokenized, 'str', tok)
        self.size_filter.build_index()

    def _passes(self, left, right):
        """Return ``apply_filter`` for two token lists at threshold 0.8."""
        return self.size_filter.apply_filter(
            left, right, len(left), len(right), 0.8)

    def _candidates(self, probe):
        """Return ``find_candidates`` for ``probe`` at threshold 0.8."""
        return self.size_filter.find_candidates(probe, len(probe), 0.8)

    def test_apply_filter(self):
        five_tokens = ['aa', 'bb', 'cd', 'ef', 'fg']

        # size filter satisfies: 5 tokens against 4 tokens
        self.assertTrue(self._passes(five_tokens, ['xx', 'yy', 'aa', 'bb']))

        # size filter doesn't satisfy: 5 tokens against 1 token
        self.assertFalse(self._passes(five_tokens, ['xx']))

        # an empty token list on either side must be rejected
        no_tokens = []
        self.assertFalse(self._passes(five_tokens, no_tokens))
        self.assertFalse(self._passes(no_tokens, five_tokens))

    def test_find_candidates(self):
        # default case (presence of candidates); the expected ids depend on
        # the contents of fixture ``A``, which is defined elsewhere
        self.assertSetEqual(
            self._candidates(['aa', 'xx', 'yy', 'uu']), {0, 3, 4})

        # an 8-token probe yields an empty candidate set
        self.assertSetEqual(
            self._candidates(
                ['aa', 'op', 'xx', 'yy', 'uu', 'yu', 'iu', 'lp']),
            set())

        # an empty probe yields an empty candidate set
        self.assertSetEqual(self._candidates([]), set())