class Parser(object):
    """Split text into MeCab surface forms after normalizing it.

    Instance attributes (all set in ``__init__``):
      encoding     -- codec used to decode byte-string input and to encode the
                      normalized text that is handed to the tagger.
      mecab_option -- option value passed to ``MeCab.Tagger``.
      tagger       -- the ``MeCab.Tagger`` built from ``mecab_option``.
      normalizer   -- the ``TextNormalizer`` applied before parsing.
    """

    def __init__(self, encoding="utf8", mecab_option=default_option):
        """Create the tagger and the normalizer used by this parser."""
        self.encoding = encoding
        self.mecab_option = mecab_option
        self.tagger = MeCab.Tagger(self.mecab_option)
        self.normalizer = TextNormalizer()
        self._last_input = ""

    def node(self, s):
        """Normalize *s* and return the result of ``tagger.parseToNode``.

        A byte string (``str``) is first decoded with ``self.encoding``; the
        text is then normalized and re-encoded with the same codec before it
        is given to the tagger.  If any of those steps raises, an empty
        string is parsed instead, so this method is best-effort by design.
        """
        try:
            if isinstance(s, str):
                s = s.decode(self.encoding)
            s = self.normalizer.normalize(s)
            s = s.encode(self.encoding)
        except Exception:
            # Deliberate fallback to empty input, but without trapping
            # KeyboardInterrupt / SystemExit the way a bare ``except`` does.
            s = ""
        # Keep the encoded input alive for as long as the caller may walk the
        # node list.  NOTE(review): assumes the MeCab binding's nodes can point
        # into the caller's buffer rather than a private copy -- confirm
        # against the installed binding.
        self._last_input = s
        return self.tagger.parseToNode(s)

    def parse(self, s, to_unicode=False):
        """Return the list of non-empty surface strings found in *s*.

        The node list returned by ``node(s)`` is followed through ``.next``.
        Nodes with an empty surface are skipped (presumably the begin/end of
        sentence markers -- verify against MeCab).  When *to_unicode* is true
        every surface is decoded with ``self.encoding`` before being added.
        """
        node = self.node(s)
        ret = []
        while node:
            surface = node.surface
            if surface != "":
                if to_unicode:
                    surface = surface.decode(self.encoding)
                ret.append(surface)
            node = node.next
        return ret
class Parser(object):
    """Split text into MeCab surface forms after normalizing it.

    Instance attributes (all set in ``__init__``):
      encoding     -- codec used to decode byte-string input and to encode the
                      normalized text that is handed to the tagger.
      mecab_option -- option value passed to ``MeCab.Tagger``.
      tagger       -- the ``MeCab.Tagger`` built from ``mecab_option``.
      normalizer   -- the ``TextNormalizer`` applied before parsing.
    """

    def __init__(self, encoding="utf8", mecab_option=default_option):
        """Create the tagger and the normalizer used by this parser."""
        self.encoding = encoding
        self.mecab_option = mecab_option
        self.tagger = MeCab.Tagger(self.mecab_option)
        self.normalizer = TextNormalizer()
        self._last_input = ""

    def node(self, s):
        """Normalize *s* and return the result of ``tagger.parseToNode``.

        A byte string (``str``) is first decoded with ``self.encoding``; the
        text is then normalized and re-encoded with the same codec before it
        is given to the tagger.  If any of those steps raises, an empty
        string is parsed instead, so this method is best-effort by design.
        """
        try:
            if isinstance(s, str):
                s = s.decode(self.encoding)
            s = self.normalizer.normalize(s)
            s = s.encode(self.encoding)
        except Exception:
            # Deliberate fallback to empty input, but without trapping
            # KeyboardInterrupt / SystemExit the way a bare ``except`` does.
            s = ""
        # Keep the encoded input alive for as long as the caller may walk the
        # node list.  NOTE(review): assumes the MeCab binding's nodes can point
        # into the caller's buffer rather than a private copy -- confirm
        # against the installed binding.
        self._last_input = s
        return self.tagger.parseToNode(s)

    def parse(self, s, to_unicode=False):
        """Return the list of non-empty surface strings found in *s*.

        The node list returned by ``node(s)`` is followed through ``.next``.
        Nodes with an empty surface are skipped (presumably the begin/end of
        sentence markers -- verify against MeCab).  When *to_unicode* is true
        every surface is decoded with ``self.encoding`` before being added.
        """
        node = self.node(s)
        ret = []
        while node:
            surface = node.surface
            if surface != "":
                if to_unicode:
                    surface = surface.decode(self.encoding)
                ret.append(surface)
            node = node.next
        return ret
#!/usr/bin/python
#encoding: utf8
# Command-line filter: reads lines from standard input, decodes each one as
# UTF-8, and prints the result of TextNormalizer.normalize for it.
# Any command-line arguments are forwarded to the TextNormalizer constructor.
import sys

from NormalizeText import TextNormalizer


def main():
    """Normalize lines read from stdin until end of input is reached."""
    # With no extra arguments the slice is empty, so this is the same call as
    # TextNormalizer(); no separate branch is needed.
    normalizer = TextNormalizer(*sys.argv[1:])
    while True:
        try:
            s = raw_input()
        except EOFError:
            # End of input (closed pipe / Ctrl-D): stop cleanly instead of
            # terminating with an unhandled-exception traceback.
            break
        # Single parenthesized argument: behaves exactly like the Python 2
        # print statement.
        print(normalizer.normalize(s.decode('utf8')))


if __name__ == '__main__':
    main()