def __init__(
    self,
    outstrm=sys.stdout,
    language="en",
    is_raw_text=False,
    do_lowercase=True,
    require_whitespace_on_markup=True,
    skip_first_tab=False,
):
    """Build the tag translation map and the markup detection regex.

    Args:
        outstrm: Output stream, stored on the instance.
        language: Language code; stored and handed to
            ``txt2sufex.processor``.
        is_raw_text: When false, ``<...>`` and ``[...]`` spans are added
            to the set of detected elements.
        do_lowercase: Stored on the instance.
        require_whitespace_on_markup: When true, a detected element must
            be preceded by start-of-string or whitespace and followed by
            whitespace or end-of-string.
        skip_first_tab: Stored on the instance.
    """
    # The first definition of an element wins; repeats are only reported.
    translations = {}
    alternatives = []
    for (element, sur, pos, lem, cpos) in tag_translation_table:
        if element in translations:
            log_stderr("Skipping double definition of tag translation for '%s'" % element)
            continue
        translations[element] = (sur, pos, lem, cpos)
        alternatives.append(element)
    self._tag_translation = translations

    # Generic markup patterns go after the table elements so that the
    # table elements are tried first in the alternation.
    if not is_raw_text:
        alternatives.extend([u"<.*?>", u"\[.*?\]"])

    joined = u"|".join(alternatives)
    if require_whitespace_on_markup:
        pattern = u"(?:^|\s)(" + joined + u")(?=\s|$)"
    else:
        pattern = u"(" + joined + u")"
    self._detection_rgx = re.compile(pattern)

    self._language = language
    self._txt2sufex = txt2sufex.processor(False, language)
    self._space = u"<s> </s>"
    self._outstrm = outstrm
    self._is_raw_text = is_raw_text
    self._do_lowercase = do_lowercase
    self._skip_first_tab = skip_first_tab
def lengthvar(S, M, query, stream, left_end='$SB$', right_end='$SE$'):
    """Run ``query`` with length-constrained left and right paddings.

    For every total length from 1 to 9, each split into a left and a
    right part is turned into a modified query, which is printed, sent to
    ``S.query`` with matcher ``M`` and a context of 150, and whose matches
    are written UTF-8 encoded to ``stream``.

    Args:
        S: Object providing ``query(query=..., matcher=..., context=...)``.
        M: Matcher passed through to ``S.query``.
        query: Core query string placed between the two paddings.
        stream: Object with a ``write`` method receiving the output.
        left_end: Marker used at the left edge of the modified query.
        right_end: Marker used at the right edge of the modified query.

    Returns:
        None. Processing stops at the first query that raises
        ``StandardError``, after the error has been logged.
    """
    context = 150
    for length in range(1, 10):
        stream.write("Length %d:\n\n" % length)
        for left_length in range(length):
            right_length = length - left_length
            # NOTE(review): left_length < length here, so right_length is
            # always >= 1 and the bare right_end fallback is never taken;
            # confirm whether range(length + 1) was intended.
            if left_length > 0:
                prefix = '%s@=%d!|,+\(+\)|' % (left_end, left_length)
            else:
                prefix = left_end
            if right_length > 0:
                suffix = '@=%d!|,+\(+\)|%s' % (right_length, right_end)
            else:
                suffix = right_end
            modquery = prefix + query + suffix
            print(modquery)
            try:
                print("Trying context length %d" % context)
                results = S.query(query=modquery, matcher=M, context=context)
            except StandardError as msg:
                log_stderr("An exception was thrown: %s,%s" % (msg, sys.exc_info()))
                return None
            # The first item of the result holds the sets of text matches.
            for match_set in results[0]:
                for hit in match_set:
                    stream.write(hit[0].encode('utf-8'))
def process_line(self, uline, mark_beginning=True, mark_end=True):
    """Split one line into markup and plain text and write the result.

    The stripped line is scanned with the detection regex. Detected
    elements are converted through ``el2sufex``; the text between them is
    passed to the ``_txt2sufex`` processor. Everything is written to
    ``self._outstrm``.

    Args:
        uline: The input line (text string).
        mark_beginning: Write the ``SB`` marker before the line.
        mark_end: Write the ``SE`` marker and a newline after the line.
    """
    uline = uline.strip()

    # Optionally cut off the field before the first tab. It is emitted
    # separately and is not scanned for markup.
    if not self._skip_first_tab:
        first_tab = u""
    else:
        head, tab, tail = uline.partition(u"\t")
        if tab and tail:
            first_tab = head
            uline = tail
        else:
            first_tab = u"-"

    # Pipeline of (fragment, needs_tagging) pairs. Fragments flagged True
    # are untouched pieces of the line that still go through the
    # processor; fragments flagged False are ready-made output.
    process_pipe = []
    separator = (self._space, False)
    markup_enabled = not self._is_raw_text
    last_pos = 0
    for found in self._detection_rgx.finditer(uline):
        start = found.start(1)
        if start > last_pos:
            process_pipe.append((uline[last_pos:start], True))
            process_pipe.append(separator)

        element = found.group(1)
        if element:
            analysis = None
            if markup_enabled and element[0] == u"<" and element[-1] == u">":
                # <...>: the content is used as a lemmatized form.
                if len(element) > 2:
                    inner = element[1:-1]
                    analysis = (u"#" + inner + u"#", u"VV", inner, u"vi")
            elif markup_enabled and element[0] == u"[" and element[-1] == u"]":
                # [...]: like <...>, except that a POS may follow a
                # double hash (##) inside the brackets.
                if len(element) > 2:
                    inner = element[1:-1]
                    form, double_hash, tag = inner.partition(u"##")
                    if double_hash:
                        analysis = (form, tag, form, tag)
                    else:
                        analysis = (u"#" + inner + u"#", u"VV", inner, u"vi")
            elif element in self._tag_translation:
                analysis = self._tag_translation[element]
            else:
                log_stderr(u"Found unknown element '%s'" % element)

            if analysis is not None:
                (sur, pos, lem, cpos) = analysis
                process_pipe.append((el2sufex(sur, pos, lem, cpos), False))
                process_pipe.append(separator)

        last_pos = found.end(1)

    # Whatever follows the last detected element is plain text.
    if len(uline) > last_pos:
        process_pipe.append((uline[last_pos:], True))

    if mark_beginning:
        self._outstrm.write('<s tag="SB"> </s>')
    if self._skip_first_tab:
        self._outstrm.write((u'<m tag="m">%s</m><s> </s>' % first_tab).encode("utf-8"))
    for (fragment, need_tagging) in process_pipe:
        if not need_tagging:
            self._outstrm.write(fragment.encode("utf-8"))
        else:
            self._txt2sufex.process_one_line(fragment, self._outstrm, do_lowercase=self._do_lowercase)
    if mark_end:
        self._outstrm.write('<s tag="SE">.</s>\n')
def match(S,M,query,stream):
    """Run ``query`` against ``S`` using matcher ``M``.

    If ``S.query`` raises a ``StandardError`` the error is logged through
    ``log_stderr`` and None is returned.

    NOTE(review): in the code visible here ``results`` and ``stream`` are
    never used; the definition may continue beyond this excerpt - confirm
    before relying on this description.
    """
    try:
        results = S.query(query=query,matcher=M)
    except StandardError,msg:
        # Log the message together with the full exception info, then give up.
        log_stderr("An exception was thrown: %s,%s" % (msg,sys.exc_info()))
        return None