예제 #1
0
 def __init__(self, max_consec=3):
     """
     Set up the character tokenizer and the sequence destutterer.

     Every character whose Unicode general category starts with 'N'
     (Nd, Nl, No) is collected and handed to SequenceDestutterer as its
     ``ignore`` argument.

     :param max_consec: forwarded unchanged to SequenceDestutterer;
         presumably the longest run of a repeated item that is kept --
         TODO confirm against SequenceDestutterer.
     """
     # ``unichr``/``xrange`` only exist on Python 2; fall back to their
     # Python 3 equivalents so the constructor works on both.
     try:
         to_char, int_range = unichr, xrange
     except NameError:
         to_char, int_range = chr, range
     # Scan lazily (no intermediate million-element list); the ``+ 1``
     # makes the scan include sys.maxunicode itself.
     ignore = [
         ch
         for ch in (to_char(cp) for cp in int_range(sys.maxunicode + 1))
         if unicodedata.category(ch).startswith('N')
     ]
     self._des = SequenceDestutterer(max_consec=max_consec, ignore=ignore)
     self._tok = CharacterTokenizer()
예제 #2
0
class TextDestutterer(Transformer):
    """
    Drop overly-repeated non-numeric characters.

    Characters in any Unicode number category (general category starting
    with 'N': Nd, Nl, No) are passed to the underlying SequenceDestutterer
    as its ``ignore`` list -- presumably so that runs of them are left
    intact; confirm against SequenceDestutterer.
    """

    def __init__(self, max_consec=3):
        """
        Set up the character tokenizer and the sequence destutterer.

        :param max_consec: forwarded unchanged to SequenceDestutterer;
            presumably the longest run of a repeated item that is kept --
            TODO confirm against SequenceDestutterer.
        """
        # ``unichr``/``xrange`` only exist on Python 2; fall back to their
        # Python 3 equivalents so the constructor works on both.
        try:
            to_char, int_range = unichr, xrange
        except NameError:
            to_char, int_range = chr, range
        # Scan lazily (no intermediate million-element list); the ``+ 1``
        # makes the scan include sys.maxunicode itself.
        ignore = [
            ch
            for ch in (to_char(cp) for cp in int_range(sys.maxunicode + 1))
            if unicodedata.category(ch).startswith('N')
        ]
        self._des = SequenceDestutterer(max_consec=max_consec, ignore=ignore)
        self._tok = CharacterTokenizer()

    def transform(self, texts):
        """
        Destutter every text in *texts*.

        Each text is run through the tokenizer, the resulting sequences are
        run through the destutterer, and every output sequence is joined
        back into a single string.

        :param texts: the texts to process, passed as-is to the tokenizer's
            ``transform``.
        :return: a list with one joined string per sequence produced by the
            destutterer.
        """
        tokenized = self._tok.transform(texts)
        destuttered = self._des.transform(tokenized)
        # A list comprehension yields a real list on Python 2 and 3 alike,
        # whereas ``map`` would be a lazy one-shot iterator on Python 3.
        return [''.join(chars) for chars in destuttered]