コード例 #1
0
    def _build_index(self):
        """Populate the Aho-Corasick indexes and the always-check list.

        Iterates over ``self.rules`` once. For every rule with truthy
        ``options`` its ``include_domains`` are handed to
        ``self._add_keywords`` for the domain-option index. A rule whose
        ``get_keywords()`` result has neither URL nor domain keywords is
        recorded by its position in ``self._always_check_rules``
        (presumably because such a rule cannot be pre-filtered by keyword --
        confirm against the matching code); otherwise its URL and domain
        keywords are handed to ``self._add_keywords`` for the respective
        index.

        Sets ``self._url_index``, ``self._domain_index``,
        ``self._domain_opt_index`` and ``self._always_check_rules``.
        """
        # One (keywords, outputs) list pair per index; the pairs are filled
        # by self._add_keywords and consumed by AhoCorasickIndex below.
        url_kw, url_out = [], []
        dom_kw, dom_out = [], []
        opt_kw, opt_out = [], []
        unindexed_positions = []

        for position, rule in enumerate(self.rules):
            # Domain options are indexed for every rule that has options,
            # regardless of whether the rule itself yields keywords.
            if rule.options:
                self._add_keywords(
                    position, opt_kw, opt_out, rule.options.include_domains)

            extracted = rule.get_keywords()
            if extracted.url_keywords or extracted.domain_keywords:
                self._add_keywords(
                    position, url_kw, url_out, extracted.url_keywords)
                self._add_keywords(
                    position, dom_kw, dom_out, extracted.domain_keywords)
            else:
                # No keyword of either kind: remember the rule separately.
                unindexed_positions.append(position)

        self._url_index = AhoCorasickIndex.from_keywords(url_kw, url_out)
        self._domain_index = AhoCorasickIndex.from_keywords(dom_kw, dom_out)
        self._domain_opt_index = AhoCorasickIndex.from_keywords(
            opt_kw, opt_out)
        self._always_check_rules = unindexed_positions
コード例 #2
0
 def test_build_index(self):
     # Verifies the goto, failure and output tables built for the worked
     # example from:
     # Aho, Corasick. Efficient String Matching: An Aid to Bibliographic
     # search. Communications of the ACM, 18(6), 333-340. 1975.
     # PDF: https://cr.yp.to/bib/1975/aho.pdf
     words = ['he', 'she', 'his', 'hers']
     automaton = AhoCorasickIndex.from_keywords(words)

     # (state, character) -> next state
     expected_goto = {
         (0, 'h'): 1,
         (0, 's'): 3,
         (1, 'e'): 2,
         (1, 'i'): 6,
         (2, 'r'): 8,
         (3, 'h'): 4,
         (4, 'e'): 5,
         (6, 's'): 7,
         (8, 's'): 9
     }
     self.assertEqual(automaton._goto_fn, expected_goto)

     # Failure target per state, indexed by state number.
     expected_fail = [0, 0, 0, 0, 1, 2, 0, 3, 0, 3]
     self.assertEqual(automaton._fail_fn, expected_fail)

     # Outputs are expressed as positions of the keywords in ``words``.
     position_of = {word: pos for pos, word in enumerate(words)}
     expected_output = {
         2: {position_of['he']},
         5: {position_of['she'], position_of['he']},
         7: {position_of['his']},
         9: {position_of['hers']}
     }
     self.assertEqual(automaton._output_fn, expected_output)
コード例 #3
0
 def test_failfn_step2b(self):
     # 'labac' contains 'bac' (position 1) but only the leading 'a' of
     # 'ad' (position 0), so only position 1 may be reported.
     # NOTE(review): "step2b" presumably names a step of the failure
     # function construction -- confirm against the index implementation.
     automaton = AhoCorasickIndex.from_keywords(['ad', 'bac'])
     matched = sorted(automaton.get_matching_keywords('labac'))
     self.assertSequenceEqual([1], matched)
コード例 #4
0
 def test_empty_string_haystack(self):
     # Searching an empty haystack must not report any keyword.
     automaton = AhoCorasickIndex.from_keywords(['foo', 'bar', 'foobar'])
     matched = sorted(automaton.get_matching_keywords(''))
     self.assertSequenceEqual([], matched)
コード例 #5
0
 def test_empty_keywords(self):
     # An index built from no keywords must not report any match.
     automaton = AhoCorasickIndex.from_keywords([])
     matched = sorted(automaton.get_matching_keywords('foo'))
     self.assertSequenceEqual([], matched)
コード例 #6
0
 def test_keywords_match_full(self):
     # The haystack equals the keyword 'foo' (position 0) exactly; the
     # other keywords do not occur in it.
     automaton = AhoCorasickIndex.from_keywords(['foo', 'bar', 'foobar'])
     matched = sorted(automaton.get_matching_keywords('foo'))
     self.assertSequenceEqual([0], matched)
コード例 #7
0
 def test_keywords_match_both_consecutive(self):
     # 'foo' and 'bar' appear back to back inside the haystack, which
     # also forms 'foobar'; all three positions must be reported.
     automaton = AhoCorasickIndex.from_keywords(['foo', 'bar', 'foobar'])
     matched = sorted(automaton.get_matching_keywords('XXfoobarXX'))
     self.assertSequenceEqual([0, 1, 2], matched)
コード例 #8
0
 def test_no_match(self):
     # The haystack only contains partial keywords ('fo', 'ba'), so
     # nothing may be reported.
     automaton = AhoCorasickIndex.from_keywords(['foo', 'bar', 'foobar'])
     matched = sorted(automaton.get_matching_keywords('anfobaxdummy'))
     self.assertSequenceEqual([], matched)