コード例 #1
0
 def test_string_boundaries(self):
     # See http://bugs.python.org/issue10713
     self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
                      "abc")
     # There's a word boundary at the start of a string.
     self.assertTrue(re.match(r"\b", "abc"))
     # A non-empty string includes a non-boundary zero-length match.
     self.assertTrue(re.search(r"\B", "abc"))
     # There is no non-boundary match at the start of a string.
     self.assertFalse(re.match(r"\B", "abc"))
     # However, an empty string contains no word boundaries, and also no
     # non-boundaries.
     self.assertEqual(re.search(r"\B", ""), None)
     # This one is questionable and different from the perlre behaviour,
     # but describes current behavior.
     self.assertEqual(re.search(r"\b", ""), None)
     # A single word-character string has two boundaries, but no
     # non-boundary gaps.
     self.assertEqual(len(re.findall(r"\b", "a")), 2)
     self.assertEqual(len(re.findall(r"\B", "a")), 0)
     # If there are no words, there are no boundaries
     self.assertEqual(len(re.findall(r"\b", " ")), 0)
     self.assertEqual(len(re.findall(r"\b", "   ")), 0)
     # Can match around the whitespace.
     self.assertEqual(len(re.findall(r"\B", " ")), 2)
コード例 #2
0
 def test_re_findall(self):
     self.assertEqual(re.findall(":+", "abc"), [])
     self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
     self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
     self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
                                                            (":", ":"),
                                                            (":", "::")])
コード例 #3
0
 def test_re_escape_non_ascii_bytes(self):
     b = u'y\u2620y\u2620y'.encode('utf-8')
     b_escaped = re.escape(b)
     self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
     self.assertMatch(b_escaped, b)
     res = re.findall(re.escape(u'\u2620'.encode('utf-8')), b)
     self.assertEqual(len(res), 2)
コード例 #4
0
def extract_tags_from_text(text, tags):
    """Scan ``text`` piece by piece and report which tags occur in it.

    Args:
        text: The text to scan.  Line breaks are replaced by spaces and the
            result is split on ``'.'``; every piece is matched on its own.
        tags: A str holding a base64-encoded pickle.  The decoded object is
            iterated and each item is used as a dict with a ``'compiletag'``
            entry (handed to ``pcre.findall``) plus the ``'topic'``,
            ``'subtopic'`` and ``'tag'`` keys used for the final ordering.

    Returns:
        A dict with ``'status'`` (always ``'SUCCESS'``), ``'excerpt'`` (the
        flattened text, shortened when it exceeds
        ``config.SCANNED_TEXT_EXCERPT_SIZE``) and ``'result'``, which holds
        the sorted distinct ``'topics'`` and the sorted ``'tags'`` found.
    """
    # SECURITY: pickle.loads can execute arbitrary code when fed a crafted
    # payload; ``tags`` must only ever come from a trusted source.
    tags = pickle.loads(codecs.decode(tags.encode(), "base64"))

    tags_found = []
    text = ' '.join(text.splitlines())
    for line in text.split('.'):
        # Skip pieces with no value to make it faster.  The digit check has
        # to look at the piece itself, not at the whole text.
        if len(line) < 2 or line.isdigit():
            continue

        for tag in tags:
            try:
                result = pcre.findall(tag['compiletag'], line)
            except pcre.PCREError as e:
                # Best effort: report the failing pattern and keep scanning.
                print(e)
                continue

            times = len(result)
            if times > 0:
                # Hand over a copy without the pattern, annotated with the
                # number of hits in this piece.
                tag_copy = tag.copy()
                tag_copy.pop('compiletag')
                tag_copy['times'] = times
                # presumably merges repeated hits of the same tag; see the
                # helper's definition to confirm
                __append_tag_to_founds(tags_found, tag_copy)

    # NOTE(review): the shortened excerpt is SIZE - 3 characters plus the
    # six-character ' [...]' marker, i.e. 3 longer than the limit -- confirm
    # whether that is intended.
    if len(text) <= config.SCANNED_TEXT_EXCERPT_SIZE:
        excerpt = text
    else:
        excerpt = text[:config.SCANNED_TEXT_EXCERPT_SIZE - 3] + ' [...]'

    return {
        'status': 'SUCCESS',
        'excerpt': excerpt,
        'result': {
            'topics': sorted({tag['topic'] for tag in tags_found}),
            'tags': sorted(
                tags_found,
                key=lambda t: (t['topic'], t['subtopic'], t['tag'])),
        }
    }
コード例 #5
0
 def test_bug_13899(self):
     # Issue #13899: re pattern r"[\A]" should work like "A" but matches
     # nothing. Ditto B and Z.
     self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
                      ['A', 'B', '\b', 'C', 'Z'])
コード例 #6
0
 def test_bug_117612(self):
     self.assertEqual(re.findall(r"(a|(b))", "aba"),
                      [("a", ""),("b", "b"),("a", "")])