def __init__(
    self,
    outstrm=sys.stdout,
    language="en",
    is_raw_text=False,
    do_lowercase=True,
    require_whitespace_on_markup=True,
    skip_first_tab=False,
):
    """Set up the tag translation map, the detection regex and options.

    Args:
        outstrm: Output stream, stored as ``self._outstrm``.
        language: Language code, stored and passed to
            ``txt2sufex.processor``.
        is_raw_text: When false, ``<...>`` and ``[...]`` spans are added
            to the detected elements in addition to the entries of
            ``tag_translation_table``.
        do_lowercase: Stored as ``self._do_lowercase``.
        require_whitespace_on_markup: When true, a detected element must
            be preceded by start-of-string or whitespace and followed by
            whitespace or end-of-string; otherwise it matches anywhere.
        skip_first_tab: Stored as ``self._skip_first_tab``.
    """
    self._tag_translation = {}
    detection_elements = []
    for (el, sur, pos, lem, cpos) in tag_translation_table:
        if el in self._tag_translation:
            # First definition wins; later duplicates are only reported.
            log_stderr("Skipping double definition of tag translation for '%s'" % el)
        else:
            self._tag_translation[el] = (sur, pos, lem, cpos)
            detection_elements.append(el)

    if not is_raw_text:
        # Non-greedy patterns for angle-bracket and square-bracket markup.
        detection_elements.append(u"<.*?>")
        detection_elements.append(u"\\[.*?\\]")

    # NOTE: elements are inserted verbatim, so every entry of the
    # translation table is interpreted as a regular-expression fragment.
    # Backslashes are doubled (rather than relying on "\s" / "\[" in a
    # non-raw literal) because invalid escape sequences are deprecated.
    alternation = u"|".join(detection_elements)
    if require_whitespace_on_markup:
        self._detection_rgx = re.compile(
            u"(?:^|\\s)(" + alternation + u")(?=\\s|$)"
        )
    else:
        self._detection_rgx = re.compile(u"(" + alternation + u")")

    self._language = language
    # First positional argument is False -- presumably disables sentence
    # marker insertion; confirm against txt2sufex.processor's signature.
    self._txt2sufex = txt2sufex.processor(False, language)
    self._space = u"<s> </s>"
    self._outstrm = outstrm
    self._is_raw_text = is_raw_text
    self._do_lowercase = do_lowercase
    self._skip_first_tab = skip_first_tab
def __init__(self, lang='en'):
    """Create the tagger and the punctuation/pre-tag lookup structures.

    Args:
        lang: Language code passed to ``txt2sufex.processor``.
    """
    self.tagger = txt2sufex.processor(
        insert_sentence_markers=False,
        language=lang,
    )
    # Opening bracket characters.
    self.left_bracket_punct = frozenset((u'(', u'[', u'{'))
    # Closing brackets together with sentence/clause punctuation.
    self.right_bracket_punct = frozenset(
        (u'.', u',', u';', u':', u')', u']', u'}', u'!', u'?')
    )
    self.annotators = []
    # Matches a leading "PP <...>" followed by whitespace and captures the
    # text between the angle brackets. The backslash is doubled because
    # "\s" in a non-raw literal is a deprecated invalid escape sequence.
    self.pretag_processor = re.compile(u'^PP <([^>]+)>\\s+')