Example #1
0
 def replace_punct(self, token):
     """Substitutes punctuation that ocamorph cannot handle.

     A token is returned untouched when it is not punctuation (according
     to ispunct) or when it can be encoded in self._encoding. Punctuation
     that fails to encode is replaced by '"' if isquot accepts it, and by
     ',' otherwise."""
     try:
         if not ispunct(token):
             return token
         # Only the side effect matters: encode() raises for characters
         # that are not representable in the target encoding.
         token.encode(self._encoding)
     except UnicodeError:
         return '"' if isquot(token) else ','
     return token
Example #2
0
 def replace_punct(self, token):
     """Maps unencodable unicode punctuation to an ocamorph-safe mark.

     Non-punctuation tokens, and punctuation that encodes cleanly in
     self._encoding, are handed back as they are. Otherwise the result is
     '"' for quotation marks (as decided by isquot) and ',' for any other
     punctuation mark."""
     try:
         needs_check = ispunct(token)
         if needs_check:
             # The encoded value is discarded; this is purely a probe for
             # whether the character exists in the target encoding.
             token.encode(self._encoding)
     except UnicodeError:
         replacement = ','
         if isquot(token):
             replacement = '"'
         return replacement
     else:
         return token
Example #3
0
    def correct(self, analysis, original):
        """Inverts the xmlcharreplacements in the lemma, as well as
        replace_punct for unicode punctuation marks.

        Args:
            analysis: a (word, tag) pair. The tag is expected to consist of
                three '|'-separated fields: lemma, a middle field that is
                passed through unchanged, and the derivation.
            original: the token as it was before any replacement.

        Returns:
            A (word, tag) pair. For a literal '|' and for anything ispunct
            accepts, the tag is ``original + u'||PUNCT'``; otherwise it is
            the corrected ``lemma|stuff|derivation`` string.

        Raises:
            ValueError: if the tag does not split into exactly three fields
                (this is checked before the ispunct test, so it applies to
                punctuation other than '|' as well), or if a character code
                captured by UNICODE_PATTERN cannot be converted. The error
                is logged at debug level and re-raised with its original
                traceback.
        """
        word, crap = analysis
        # A bare pipe collides with the field separator, so it is tagged
        # before any splitting is attempted.
        if original == u'|':
            return (word, original + u'||PUNCT')
        try:
            lemma, stuff, derivation = crap.split('|')

            # If the original is a punctuation mark, tag it as such to avoid
            # problems with |, etc. Also, we include the original character,
            # not the one possibly replaced by replace_punct.
            if ispunct(original):
                return (word, original + u'||PUNCT')

            # Word not in the morphtable, or POS tag could not be determined.
            if crap == u'unknown||':
                lemma = word
                derivation = u'UNKNOWN'

            # split() on a pattern with a capturing group puts the captured
            # text at the odd indices; those are numeric character codes
            # that are turned back into the characters they stand for.
            pieces = MorphAnalyzer.UNICODE_PATTERN.split(lemma)
            if len(pieces) > 1:
                pieces[1::2] = [unichr(int(code)) for code in pieces[1::2]]
                lemma = u''.join(pieces)

            # No derivation field: fall back to whatever follows the last
            # '?' in the lemma, keeping only the part after its last '/'.
            if not derivation:
                parts = lemma.rsplit('?', 1)
                if len(parts) >= 2:
                    lemma = parts[0]
                    derivation = parts[1].rsplit('/', 1)[-1].upper()
            return (word, lemma + u'|' + stuff + u'|' + derivation)
        except ValueError as ve:
            logging.debug(ve)
            # Lazy arguments: the message is only built if debug logging is
            # enabled, and a formatting problem cannot mask the real error.
            logging.debug(u"%s // %s", word, crap)
            # Bare raise keeps the traceback of the original failure.
            raise
Example #4
0
    def correct(self, analysis, original):
        """Inverts the xmlcharreplacements in the lemma, as well as
        replace_punct for unicode punctuation marks.

        Args:
            analysis: a (word, tag) pair. The tag is expected to consist of
                three '|'-separated fields: lemma, a middle field that is
                passed through unchanged, and the derivation.
            original: the token as it was before any replacement.

        Returns:
            A (word, tag) pair. For a literal '|' and for anything ispunct
            accepts, the tag is ``original + u'||PUNCT'``; otherwise it is
            the corrected ``lemma|stuff|derivation`` string.

        Raises:
            ValueError: if the tag does not split into exactly three fields
                (this is checked before the ispunct test, so it applies to
                punctuation other than '|' as well), or if a character code
                captured by UNICODE_PATTERN cannot be converted. The error
                is logged at debug level and re-raised with its original
                traceback.
        """
        word, crap = analysis
        # A bare pipe collides with the field separator, so it is tagged
        # before any splitting is attempted.
        if original == u'|':
            return (word, original + u'||PUNCT')
        try:
            lemma, stuff, derivation = crap.split('|')

            # If the original is a punctuation mark, tag it as such to avoid
            # problems with |, etc. Also, we include the original character,
            # not the one possibly replaced by replace_punct.
            if ispunct(original):
                return (word, original + u'||PUNCT')

            # Word not in the morphtable, or POS tag could not be determined.
            if crap == u'unknown||':
                lemma = word
                derivation = u'UNKNOWN'

            # split() on a pattern with a capturing group puts the captured
            # text at the odd indices; those are numeric character codes
            # that are turned back into the characters they stand for.
            pieces = MorphAnalyzer.UNICODE_PATTERN.split(lemma)
            if len(pieces) > 1:
                pieces[1::2] = [unichr(int(code)) for code in pieces[1::2]]
                lemma = u''.join(pieces)

            # No derivation field: fall back to whatever follows the last
            # '?' in the lemma, keeping only the part after its last '/'.
            if not derivation:
                parts = lemma.rsplit('?', 1)
                if len(parts) >= 2:
                    lemma = parts[0]
                    derivation = parts[1].rsplit('/', 1)[-1].upper()
            return (word, lemma + u'|' + stuff + u'|' + derivation)
        except ValueError as ve:
            logging.debug(ve)
            # Lazy arguments: the message is only built if debug logging is
            # enabled, and a formatting problem cannot mask the real error.
            logging.debug(u"%s // %s", word, crap)
            # Bare raise keeps the traceback of the original failure.
            raise