def __init__(
        self,
        outstrm=sys.stdout,
        language="en",
        is_raw_text=False,
        do_lowercase=True,
        require_whitespace_on_markup=True,
        skip_first_tab=False,
    ):
        """Build the markup-detection regex and store the processing options.

        Args:
            outstrm: Output stream kept in ``self._outstrm``. The default is
                the ``sys.stdout`` object bound when the function is defined.
            language: Language code; stored and passed on to
                ``txt2sufex.processor(False, language)``.
            is_raw_text: When false, ``<...>`` and ``[...]`` spans are added
                to the set of detected markup in addition to the entries of
                ``tag_translation_table``.
            do_lowercase: Stored in ``self._do_lowercase``.
            require_whitespace_on_markup: When true, markup is only detected
                if it is preceded by line start or whitespace and followed
                by whitespace or line end.
            skip_first_tab: Stored in ``self._skip_first_tab``.
        """
        self._tag_translation = {}
        detection_elements = []
        for (el, sur, pos, lem, cpos) in tag_translation_table:
            if el in self._tag_translation:
                log_stderr("Skipping double definition of tag translation for '%s'" % el)
                continue
            self._tag_translation[el] = (sur, pos, lem, cpos)
            # Matched text is later looked up in self._tag_translation as a
            # literal key, so the element has to be matched literally too.
            # Without escaping, an element such as "C++" or ":)" would break
            # or distort the compiled pattern.
            detection_elements.append(re.escape(el))
        if not is_raw_text:
            detection_elements.append(u"<.*?>")
            # Backslashes are doubled: "\[" is not a valid string escape.
            detection_elements.append(u"\\[.*?\\]")

        alternatives = u"|".join(detection_elements)
        if require_whitespace_on_markup:
            pattern = u"(?:^|\\s)(" + alternatives + u")(?=\\s|$)"
        else:
            pattern = u"(" + alternatives + u")"
        self._detection_rgx = re.compile(pattern)

        self._language = language
        self._txt2sufex = txt2sufex.processor(False, language)
        # Separator fragment placed between pieces of the processed line.
        self._space = u"<s> </s>"
        self._outstrm = outstrm
        self._is_raw_text = is_raw_text
        self._do_lowercase = do_lowercase
        self._skip_first_tab = skip_first_tab
Beispiel #2
0
def lengthvar(S, M, query, stream, left_end='$SB$', right_end='$SE$'):
    """Run ``query`` with varying length constraints on either side.

    For every total length from 1 to 9, the length is split into a left and
    a right part. Each part is turned into a length-constraint fragment that
    is placed between ``left_end``/``right_end`` and ``query``. The resulting
    query string is printed, sent to ``S.query`` and the first item of every
    match is written to ``stream`` encoded as UTF-8.

    Args:
        S: Object providing ``query(query=..., matcher=..., context=...)``.
        M: Matcher handed through to ``S.query``.
        query: Core query string.
        stream: Writable object receiving the headers and the matches.
        left_end: Text put in front of the modified query.
        right_end: Text put behind the modified query.

    Returns:
        Always ``None``. Processing stops at the first query that raises a
        ``StandardError``; the error is reported through ``log_stderr``.
    """
    # Length-constraint fragment; the backslashes are part of the query
    # text and are therefore written as explicit "\\" escapes.
    gap_template = '@=%d!|,+\\(+\\)|'
    context = 150

    for length in range(1, 10):
        stream.write("Length %d:\n\n" % length)
        # left_length runs from 0 to length - 1, so right_length is always
        # at least 1 and the bare right_end case is never produced.
        # NOTE(review): confirm whether range(length + 1) was intended.
        for left_length in range(length):
            right_length = length - left_length
            left_string = left_end
            right_string = right_end
            if left_length > 0:
                left_string = left_end + gap_template % left_length
            if right_length > 0:
                right_string = gap_template % right_length + right_end
            modquery = left_string + query + right_string
            print(modquery)
            try:
                print("Trying context length %d" % context)
                results = S.query(query=modquery, matcher=M, context=context)
            except StandardError as msg:  # StandardError exists in Python 2 only
                log_stderr("An exception was thrown: %s,%s" % (msg, sys.exc_info()))
                return None
            for match_set in results[0]:
                for match in match_set:
                    stream.write(match[0].encode('utf-8'))
    def process_line(self, uline, mark_beginning=True, mark_end=True):
        """Convert one line and write the result to ``self._outstrm``.

        The stripped line is cut into fragments at every match of
        ``self._detection_rgx``. Plain text fragments are handed to
        ``self._txt2sufex.process_one_line``; detected markup is converted
        with ``el2sufex`` and written UTF-8 encoded.

        Args:
            uline: The input line.
            mark_beginning: Write the ``SB`` marker before the line.
            mark_end: Write the ``SE`` marker and a newline after the line.
        """
        text = uline.strip()

        # Optionally split off everything before the first tab. "-" stands
        # in when there is no tab with text behind it.
        leading_field = u""
        if self._skip_first_tab:
            leading_field = u"-"
            split_at = text.find(u"\t")
            if 0 <= split_at < len(text) - 1:
                leading_field = text[:split_at]
                text = text[split_at + 1 :]

        # Each pipeline entry is (fragment, needs_tagging): True marks text
        # copied from the line that still has to be tagged, False marks
        # output that is already in its final form.
        pipeline = []

        def queue_markup(surface, tag, lemma, coarse_tag):
            # Converted markup is always followed by a separator fragment.
            pipeline.append((el2sufex(surface, tag, lemma, coarse_tag), False))
            pipeline.append((self._space, False))

        consumed = 0
        for hit in self._detection_rgx.finditer(text):
            start = hit.start(1)
            if start > consumed:
                pipeline.append((text[consumed:start], True))
                pipeline.append((self._space, False))
            token = hit.group(1)
            consumed = hit.end(1)
            if not token:
                continue

            markup_enabled = not self._is_raw_text
            if markup_enabled and token[0] == u"<" and token[-1] == u">":
                # <...> holds a lemma; an empty "<>" produces nothing.
                if len(token) > 2:
                    lemma = token[1:-1]
                    queue_markup(u"#" + lemma + u"#", u"VV", lemma, u"vi")
            elif markup_enabled and token[0] == u"[" and token[-1] == u"]":
                # [...] works like <...>, except that a "##" inside splits
                # the content into a form and a POS tag.
                if len(token) > 2:
                    inner = token[1:-1]
                    marker = inner.find(u"##")
                    if marker < 0:
                        queue_markup(u"#" + inner + u"#", u"VV", inner, u"vi")
                    else:
                        surface = inner[:marker]
                        tag = inner[marker + 2 :]
                        queue_markup(surface, tag, surface, tag)
            elif token in self._tag_translation:
                (surface, tag, lemma, coarse_tag) = self._tag_translation[token]
                queue_markup(surface, tag, lemma, coarse_tag)
            else:
                log_stderr(u"Found unknown element '%s'" % token)

        if consumed < len(text):
            pipeline.append((text[consumed:], True))

        out = self._outstrm
        if mark_beginning:
            out.write('<s tag="SB"> </s>')
        if self._skip_first_tab:
            out.write((u'<m tag="m">%s</m><s> </s>' % leading_field).encode("utf-8"))
        for (fragment, needs_tagging) in pipeline:
            if needs_tagging:
                self._txt2sufex.process_one_line(fragment, out, do_lowercase=self._do_lowercase)
            else:
                out.write(fragment.encode("utf-8"))
        if mark_end:
            out.write('<s tag="SE">.</s>\n')
Beispiel #4
0
def match(S,M,query,stream):
    """Run ``query`` through ``S.query`` with matcher ``M``.

    If the query raises a ``StandardError``, the error is reported through
    ``log_stderr`` and ``None`` is returned.

    NOTE(review): ``stream`` and ``results`` are not used in the lines
    visible here; the function may continue beyond this excerpt. Confirm
    before relying on the return value of the success path.
    """
    try:
        results = S.query(query=query,matcher=M)
    except StandardError,msg:
        # Report the message together with sys.exc_info() and give up.
        log_stderr("An exception was thrown: %s,%s" % (msg,sys.exc_info()))
        return None