def replace_punct(self, token):
    """Replaces unicode punctuation marks with ones understood by ocamorph.

    Punctuation that cannot be represented in the analyzer's encoding is
    swapped for an ASCII stand-in: ``"`` for quotation marks, ``,`` for
    any other mark. Non-punctuation tokens, and punctuation that encodes
    cleanly, pass through unchanged.
    """
    try:
        # Probe-encode only: the encoded bytes are discarded; we just
        # want to know whether the round trip would succeed.
        if ispunct(token):
            token.encode(self._encoding)
    except UnicodeError:
        # Token is punctuation the target encoding cannot express.
        return '"' if isquot(token) else ','
    return token
def correct(self, analysis, original):
    """Inverts the xmlcharreplacements in the lemma, as well as
    replace_punct for unicode punctuation marks.

    @param analysis: a ``(word, analysis_string)`` pair; the analysis
        string is expected to hold three ``|``-separated fields
        (lemma, stuff, derivation).
    @param original: the original token, before replace_punct was applied.
    @return: a ``(word, corrected_analysis)`` pair.
    @raise ValueError: if the analysis string does not split into exactly
        three ``|``-separated fields (logged, then re-raised).
    """
    word, crap = analysis
    # A literal '|' token would confuse the field split below, so tag it
    # as punctuation before attempting to parse the analysis string.
    if original == u'|':
        return (word, original + u'||PUNCT')
    try:
        lemma, stuff, derivation = crap.split('|')
        # If the original is a punctuation mark, tag it as such to avoid
        # problems with |, etc. Also, we include the original character,
        # not the one possibly replaced by replace_punct.
        if ispunct(original):
            return (word, original + u'||PUNCT')
        # Word not in the morphtable, or POS tag could not be determined.
        if crap == u'unknown||':
            lemma = word
            derivation = u'UNKNOWN'
        # Undo XML character replacements: the odd-indexed pieces of the
        # pattern split are converted back to characters. Presumably they
        # are decimal code points -- the int() call supports that; the
        # pattern itself is defined elsewhere (TODO confirm).
        pieces = MorphAnalyzer.UNICODE_PATTERN.split(lemma)
        if len(pieces) > 1:
            for i in xrange(1, len(pieces), 2):
                pieces[i] = unichr(int(pieces[i]))
            lemma = u''.join(pieces)
        # Empty derivation: try to recover it from a 'lemma?.../TAG'
        # suffix embedded in the lemma itself.
        if len(derivation) == 0:
            parts = lemma.rsplit('?', 1)
            if len(parts) >= 2:
                lemma = parts[0]
                derivation = parts[1].rsplit('/', 1)[-1].upper()
        return (word, lemma + u'|' + stuff + u'|' + derivation)
    except ValueError as ve:
        logging.debug(ve)
        logging.debug(word + u" // " + crap)
        # Bare `raise` preserves the original traceback; `raise ve`
        # would reset it to this frame under Python 2.
        raise