def test_bug_581080(self):
    iter = re.finditer(r"\s", "a b")
    self.assertEqual(iter.next().span(), (1, 2))
    self.assertRaises(StopIteration, iter.next)

    scanner = re.compile(r"\s").scanner("a b")
    self.assertEqual(scanner.search().span(), (1, 2))
    self.assertEqual(scanner.search(), None)
def test_dealloc(self):
    # PCRE: disabled, we're not testing _sre here
    # issue 3299: check for segfault in debug build
    #import _sre
    # the overflow limit is different on wide and narrow builds and it
    # depends on the definition of SRE_CODE (see sre.h).
    # 2**128 should be big enough to overflow on both. For smaller values
    # a RuntimeError is raised instead of OverflowError.
    #long_overflow = 2**128
    # PCRE: finditer is implemented as a generator function -- next() has
    # to be called to throw the error
    #self.assertRaises(TypeError, re.finditer, "a", {})
    self.assertRaises(TypeError, re.finditer("a", {}).next)
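# Why the assertion above calls .next before expecting the TypeError: a
# generator function runs no body code (including argument validation) until
# it is first advanced, so a finditer built as a generator raises on
# iteration rather than on the call itself. A minimal, runnable sketch of
# that behavior in generic Python -- not the PCRE module's actual
# implementation:
def finditer_like(pattern, string):
    # Hypothetical stand-in: validation lives inside the generator body,
    # so it cannot run before the first next().
    if not isinstance(string, str):
        raise TypeError("expected string")
    yield

it = finditer_like("a", {})   # no error yet: the body has not started
try:
    next(it)                  # the TypeError surfaces only now
except TypeError as e:
    print("raised on next():", e)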
def extract_entities(text, deduplication=False):
    sentences = split_setences(text)  # NOTE: result is currently unused
    # Normalize the text before matching: strip accents and digits.
    text_preprocessed = remove_accents(text)
    text_preprocessed = remove_digits(text_preprocessed)

    regexp2bat = build_regexpression()
    matches = re.finditer(regexp2bat, text_preprocessed)
    pt_patterns = PortuguesePatterns()

    # Phase 0: collect every raw regex match (the original match counter
    # was never used, so a comprehension suffices).
    phase_0_entities = [match.group() for match in matches]

    # Phase 1: strip leading tokens whose POS tag is in the exclusion list
    # (e.g. articles or prepositions preceding the entity).
    phase_1_entities = []
    for token in phase_0_entities:
        doc = nlp(token)
        prefix_pos = doc[0].pos_
        if prefix_pos in pt_patterns.tags_exclusions:
            token = token.replace(doc[0].text, '', 1)
            doc[0].pos_ = ''
        if doc[0].pos_ == '' and len(doc) > 1:
            if doc[1].pos_ in pt_patterns.tags_exclusions:
                token = token.replace(doc[1].text, '', 1)
        if token.strip() != '':
            phase_1_entities.append(token)

    # Phase 2: drop stopwords/prepositions and, optionally, duplicates.
    unique_tokens = []
    for token in phase_1_entities:
        if token != '' and len(token) > 2:
            token = token.strip()
            if not (token in pt_patterns.stopwords or
                    token in pt_patterns.preprositions):
                if deduplication:
                    if token not in unique_tokens:
                        unique_tokens.append(token)
                else:
                    unique_tokens.append(token)

    # Split the original text around the extracted tokens; escape them so
    # regex metacharacters inside a token cannot break the pattern.
    origin = "(" + "|".join(map(re.escape, unique_tokens)) + ")"
    text = re.split(origin, text)
    output = {'text': text, 'tokens': unique_tokens}
    return output
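# A self-contained sketch of the finditer/split pattern extract_entities is
# built around (Python 3; the sample text and the regex are illustrative
# assumptions, not the project's real build_regexpression() output):
import re

sample = "Maria foi ao Porto com Maria."
pattern = r"\b[A-Z][a-z]+\b"  # hypothetical stand-in for the entity regex

# Phase 0 equivalent: collect the raw matches.
tokens = [m.group() for m in re.finditer(pattern, sample)]

# Deduplicate while preserving order, as the deduplication=True branch does.
unique = list(dict.fromkeys(tokens))

# Split the original text around the tokens, escaping metacharacters.
parts = re.split("(" + "|".join(map(re.escape, unique)) + ")", sample)

print(unique)  # ['Maria', 'Porto']
print(parts)   # ['', 'Maria', ' foi ao ', 'Porto', ' com ', 'Maria', '.']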
def test_bug_817234(self):
    # After ".*" consumes the whole string, one final zero-width match at
    # the end position is returned, then iteration stops.
    iter = re.finditer(r".*", "asdf")
    self.assertEqual(iter.next().span(), (0, 4))
    self.assertEqual(iter.next().span(), (4, 4))
    self.assertRaises(StopIteration, iter.next)
def test_finditer(self):
    iter = re.finditer(r":+", "a:b::c:::d")
    self.assertEqual([item.group(0) for item in iter],
                     [":", "::", ":::"])