def __init__(
        self,
        outstrm=sys.stdout,
        language="en",
        is_raw_text=False,
        do_lowercase=True,
        require_whitespace_on_markup=True,
        skip_first_tab=False,
    ):
        """Build the markup-detection regex and store the processing options.

        Args:
            outstrm: Output stream kept on the instance. The default is the
                ``sys.stdout`` object bound when the method was defined.
            language: Language code; stored and passed to
                ``txt2sufex.processor``.
            is_raw_text: When false, ``<...>`` and ``[...]`` spans are added
                to the set of detected elements.
            do_lowercase: Stored on the instance unchanged.
            require_whitespace_on_markup: When true, a detected element must
                be preceded by start-of-string or whitespace and followed by
                whitespace or end-of-string.
            skip_first_tab: Stored on the instance unchanged.
        """
        # NOTE(review): tag_translation_table is not a parameter of this
        # method; presumably a module-level table of 5-tuples -- confirm.
        self._tag_translation = {}
        detection_elements = []
        for (el, sur, pos, lem, cpos) in tag_translation_table:
            if el in self._tag_translation:
                # The first definition wins; later duplicates are only reported.
                log_stderr("Skipping double definition of tag translation for '%s'" % el)
                continue
            self._tag_translation[el] = (sur, pos, lem, cpos)
            # The element is inserted into the regex unescaped, i.e. it is
            # treated as a regex fragment.
            detection_elements.append(el)

        if not is_raw_text:
            # Non-greedy matches for <...> and [...] markup. Backslashes are
            # doubled so the literals contain no invalid string escapes.
            detection_elements.extend((u"<.*?>", u"\\[.*?\\]"))

        alternation = u"|".join(detection_elements)
        if require_whitespace_on_markup:
            pattern = u"(?:^|\\s)(" + alternation + u")(?=\\s|$)"
        else:
            pattern = u"(" + alternation + u")"
        self._detection_rgx = re.compile(pattern)

        self._language = language
        self._txt2sufex = txt2sufex.processor(False, language)
        self._space = u"<s> </s>"
        self._outstrm = outstrm
        self._is_raw_text = is_raw_text
        self._do_lowercase = do_lowercase
        self._skip_first_tab = skip_first_tab
# Example #2
# 0
 def __init__(self, lang='en'):
     """Set up the tagger, bracket punctuation sets and pre-tag regex.

     Args:
         lang: Language code passed to ``txt2sufex.processor`` as
             ``language``.
     """
     self.tagger = txt2sufex.processor(
         insert_sentence_markers=False, language=lang)
     self.left_bracket_punct = frozenset((u'(', u'[', u'{'))
     self.right_bracket_punct = frozenset(
         (u'.', u',', u';', u':', u')', u']', u'}', u'!', u'?'))
     self.annotators = []
     # Matches a leading "PP <...>" prefix plus trailing whitespace and
     # captures the text between the angle brackets. The backslash is
     # doubled so the literal contains no invalid string escape.
     self.pretag_processor = re.compile(u'^PP <([^>]+)>\\s+')