Example #1
    def test_bug_581080(self):
        # finditer returns a lazy iterator of match objects.
        it = re.finditer(r"\s", "a b")
        self.assertEqual(next(it).span(), (1, 2))
        self.assertRaises(StopIteration, next, it)

        # Pattern.scanner() is an undocumented CPython API that drives the
        # same matching machinery one search at a time.
        scanner = re.compile(r"\s").scanner("a b")
        self.assertEqual(scanner.search().span(), (1, 2))
        self.assertEqual(scanner.search(), None)
Example #2
    def test_dealloc(self):
        # PCRE: disabled, we're not testing _sre here
        # issue 3299: check for segfault in debug build
        #import _sre
        # the overflow limit is different on wide and narrow builds and it
        # depends on the definition of SRE_CODE (see sre.h).
        # 2**128 should be big enough to overflow on both. For smaller values
        # a RuntimeError is raised instead of OverflowError.
        #long_overflow = 2**128
        # PCRE: finditer is implemented as a generator function -- next() has
        # to be called to raise the error
        #self.assertRaises(TypeError, re.finditer, "a", {})
        self.assertRaises(TypeError, next, re.finditer("a", {}))
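
The distinction the comments above draw can be reproduced in plain Python: a generator function's body does not execute until the first next() call, so argument validation inside it is deferred. A minimal sketch (the wrapper name finditer_gen is hypothetical, not part of any library):

import re

def finditer_gen(pattern, string):
    # The body only runs on the first next() call, so a bad `string`
    # argument raises TypeError lazily rather than at call time.
    for m in re.compile(pattern).finditer(string):
        yield m

gen = finditer_gen("a", {})  # no error yet -- nothing has executed
# next(gen)                  # the TypeError is raised here instead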
Example #3
def extract_entities(text, deduplication=False):
    sentences = split_setences(text)  # currently unused
    text_preprocessed = remove_accents(text)
    text_preprocessed = remove_digits(text_preprocessed)

    regexp2bat = build_regexpression()
    matches = re.finditer(regexp2bat, text_preprocessed)

    pt_patterns = PortuguesePatterns()

    # Phase 0: collect the raw matched substrings.
    phase_0_entities = [match.group() for match in matches]

    # Phase 1: strip a leading token (and possibly the one after it) whose
    # part-of-speech tag is in the exclusion list.
    phase_1_entities = []
    for token in phase_0_entities:
        doc = nlp(token)
        prefix_pos = doc[0].pos_
        if prefix_pos in pt_patterns.tags_exclusions:
            token = token.replace(doc[0].text, '', 1)
            doc[0].pos_ = ''  # clear the tag so the next token is checked too
        if doc[0].pos_ == '' and len(doc) > 1:
            if doc[1].pos_ in pt_patterns.tags_exclusions:
                token = token.replace(doc[1].text, '', 1)
        if token.strip() != '':
            phase_1_entities.append(token)

    # Phase 2: drop stopwords and prepositions and, optionally, duplicates.
    unique_tokens = []
    for token in phase_1_entities:
        if token != '' and len(token) > 2:
            token = token.strip()
            if token not in pt_patterns.stopwords and token not in pt_patterns.preprositions:
                if deduplication:
                    if token not in unique_tokens:
                        unique_tokens.append(token)
                else:
                    unique_tokens.append(token)

    # Split the original text on the extracted tokens; the capturing group
    # keeps the tokens themselves in the result list.
    origin = "(" + "|".join(unique_tokens) + ")"
    text = re.split(origin, text)
    output = {'text': text, 'tokens': unique_tokens}

    return output
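
A minimal usage sketch, assuming the helpers the function relies on (split_setences, remove_accents, remove_digits, build_regexpression, PortuguesePatterns, and a loaded spaCy nlp model for Portuguese) are defined in the surrounding module; the sample sentence is illustrative only:

sample = "O Ministério da Saúde anunciou novas medidas em Lisboa."
result = extract_entities(sample, deduplication=True)
print(result['tokens'])  # candidate entities kept by the regex + PoS filter
print(result['text'])    # the input split around those tokens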
Example #4
    def test_bug_817234(self):
        # A zero-width match at the end of the string is still reported once.
        it = re.finditer(r".*", "asdf")
        self.assertEqual(next(it).span(), (0, 4))
        self.assertEqual(next(it).span(), (4, 4))
        self.assertRaises(StopIteration, next, it)
Example #5
    def test_finditer(self):
        it = re.finditer(r":+", "a:b::c:::d")
        self.assertEqual([item.group(0) for item in it],
                         [":", "::", ":::"])