def test_string_boundaries(self): # See http://bugs.python.org/issue10713 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), "abc") # There's a word boundary at the start of a string. self.assertTrue(re.match(r"\b", "abc")) # A non-empty string includes a non-boundary zero-length match. self.assertTrue(re.search(r"\B", "abc")) # There is no non-boundary match at the start of a string. self.assertFalse(re.match(r"\B", "abc")) # However, an empty string contains no word boundaries, and also no # non-boundaries. self.assertEqual(re.search(r"\B", ""), None) # This one is questionable and different from the perlre behaviour, # but describes current behavior. self.assertEqual(re.search(r"\b", ""), None) # A single word-character string has two boundaries, but no # non-boundary gaps. self.assertEqual(len(re.findall(r"\b", "a")), 2) self.assertEqual(len(re.findall(r"\B", "a")), 0) # If there are no words, there are no boundaries self.assertEqual(len(re.findall(r"\b", " ")), 0) self.assertEqual(len(re.findall(r"\b", " ")), 0) # Can match around the whitespace. self.assertEqual(len(re.findall(r"\B", " ")), 2)
def test_re_findall(self): self.assertEqual(re.findall(":+", "abc"), []) self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"]) self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"]) self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""), (":", ":"), (":", "::")])
def test_re_escape_non_ascii_bytes(self):
    """Check re.escape() on UTF-8 encoded bytes containing non-ASCII data.

    The escaped pattern must still match the original bytes, and escaping
    the encoded non-ASCII character alone must find both of its occurrences.
    """
    # Local import: only needed to pick the version-specific expectation.
    import sys

    raw = u'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(raw)
    if sys.version_info >= (3, 7):
        # Since Python 3.7 re.escape() only escapes characters that can have
        # a special meaning in a pattern, so these bytes come back unchanged.
        expected = raw
    else:
        # Older versions put a backslash in front of every non-ASCII byte.
        expected = b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y'
    self.assertEqual(escaped, expected)
    # NOTE(review): assertMatch is not defined in this block; presumably it
    # is a helper on the enclosing test class that asserts the pattern
    # matches the given subject - confirm against the class.
    self.assertMatch(escaped, raw)
    res = re.findall(re.escape(u'\u2620'.encode('utf-8')), raw)
    self.assertEqual(len(res), 2)
def extract_tags_from_text(text, tags):
    """Count how often each tag pattern occurs in ``text``.

    Line breaks in ``text`` are replaced by single spaces, the result is
    split on ``'.'`` and every fragment is searched with every tag pattern.

    Args:
        text: The text to scan.
        tags: Base64 text wrapping a pickled collection of tag mappings.
            Each mapping is read for ``'compiletag'`` (used as the pattern
            given to ``pcre.findall``); the collected entries are later
            sorted by their ``'topic'``, ``'subtopic'`` and ``'tag'`` keys.

    Returns:
        A dict with ``'status'`` (always ``'SUCCESS'``), ``'excerpt'`` (the
        normalised text, truncated when longer than
        ``config.SCANNED_TEXT_EXCERPT_SIZE``) and ``'result'`` holding the
        sorted unique ``'topics'`` and the sorted ``'tags'`` entries.
    """
    # SECURITY: pickle.loads() can execute arbitrary code when fed a crafted
    # payload. `tags` must only ever come from a trusted producer.
    tags = pickle.loads(codecs.decode(tags.encode(), "base64"))
    tags_found = []
    text = ' '.join(text.splitlines())
    for line in text.split('.'):
        # Skip fragments with no value to make it faster. The digit check is
        # applied to the fragment itself, not to the whole text.
        if len(line) < 2 or line.isdigit():
            continue
        for tag in tags:
            try:
                result = pcre.findall(tag['compiletag'], line)
            except pcre.PCREError as e:
                # Best effort: report the failing pattern and keep scanning.
                print(e)
                continue
            times = len(result)
            if times > 0:
                # Record a copy without the pattern so it is not returned.
                tag_copy = tag.copy()
                tag_copy.pop('compiletag')
                tag_copy['times'] = times
                # NOTE(review): helper defined elsewhere in this module;
                # presumably it adds/merges the entry into tags_found -
                # confirm how repeated tags are combined.
                __append_tag_to_founds(tags_found, tag_copy)

    excerpt_size = config.SCANNED_TEXT_EXCERPT_SIZE
    if len(text) <= excerpt_size:
        excerpt = text
    else:
        # NOTE(review): the marker is 6 characters long but only 3 are
        # trimmed, so a truncated excerpt is 3 characters longer than the
        # configured size - confirm whether that is intended.
        excerpt = text[:excerpt_size - 3] + ' [...]'

    return {
        'status': 'SUCCESS',
        'excerpt': excerpt,
        'result': {
            'topics': sorted({tag['topic'] for tag in tags_found}),
            'tags': sorted(
                tags_found,
                key=lambda t: (t['topic'], t['subtopic'], t['tag']),
            ),
        },
    }
def test_bug_13899(self): # Issue #13899: re pattern r"[\A]" should work like "A" but matches # nothing. Ditto B and Z. self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), ['A', 'B', '\b', 'C', 'Z'])
def test_bug_117612(self): self.assertEqual(re.findall(r"(a|(b))", "aba"), [("a", ""),("b", "b"),("a", "")])