-
Notifications
You must be signed in to change notification settings - Fork 0
/
MeCabParser.py
43 lines (37 loc) · 1.24 KB
/
MeCabParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/python
import os
from NormalizeText import TextNormalizer
import MeCab
# Home directory used to locate the MeCab dictionaries below.
# os.path.expanduser("~") yields $HOME when it is set (same value as the
# former os.environ["HOME"] lookup) but does not raise KeyError at import
# time when the variable is missing.
HOME = os.path.expanduser("~")

# Default MeCab.Tagger option string: "-d" points at the IPA system
# dictionary directory and "-u" at a Wikipedia-derived user dictionary,
# both expected under HOME.
default_option = "-d %s/mecab-ipadic-2.7.0-20070801 -u %s/mecab-dic-overdrive/misc/dic/wikipedia.dic" % (HOME, HOME)
class Parser(object):
    """Split text into surface strings using a MeCab tagger.

    Input text is passed through ``TextNormalizer.normalize`` and then
    handed to MeCab as a byte string encoded with ``encoding``.
    """

    def __init__(self, encoding="utf8", mecab_option=default_option):
        """Create the MeCab tagger and the text normalizer.

        Args:
            encoding: Codec used to decode byte-string input, to encode the
                normalized text for MeCab, and to decode surfaces when
                ``parse(..., to_unicode=True)`` is requested.
            mecab_option: Option string passed as-is to ``MeCab.Tagger``.
        """
        self.encoding = encoding
        self.mecab_option = mecab_option
        self.tagger = MeCab.Tagger(self.mecab_option)
        self.normalizer = TextNormalizer()
        # Most recent byte string given to the tagger; see node().
        self._last_input = ""

    def node(self, s):
        """Normalize *s* and return the first node of MeCab's analysis.

        Args:
            s: Text to analyse. A byte string (``str``) is first decoded
                with ``self.encoding``; anything else is passed straight
                to the normalizer.

        Returns:
            Whatever ``self.tagger.parseToNode`` returns for the normalized,
            re-encoded text. If decoding, normalizing or encoding fails,
            the empty string is analysed instead (best-effort behaviour).
        """
        try:
            if isinstance(s, str):
                s = s.decode(self.encoding)
            s = self.normalizer.normalize(s)
            s = s.encode(self.encoding)
        except Exception:
            # Deliberate best-effort fallback: unusable input is treated as
            # empty text. ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            s = ""
        # NOTE(review): the MeCab binding is reported not to copy its input,
        # so node surfaces may point into this buffer. Keep a reference on
        # the instance so the string outlives this call - confirm against
        # the installed binding.
        self._last_input = s
        return self.tagger.parseToNode(s)

    def parse(self, s, to_unicode=False):
        """Return the non-empty surface strings MeCab produces for *s*.

        Args:
            s: Text to analyse; handled as described in ``node``.
            to_unicode: When true, each surface is decoded with
                ``self.encoding`` before being collected; otherwise the
                surfaces are returned exactly as MeCab provides them.

        Returns:
            A list of surfaces in node order. Nodes whose surface is the
            empty string are skipped (presumably MeCab's sentence-boundary
            nodes).
        """
        node = self.node(s)
        surfaces = []
        while node:
            surface = node.surface
            if surface != "":
                if to_unicode:
                    surface = surface.decode(self.encoding)
                surfaces.append(surface)
            node = node.next
        return surfaces
if __name__ == '__main__':
    # Interactive check: read a line, print one surface per line, repeat.
    parser = Parser()
    while True:
        try:
            line = raw_input("input:" )
        except (EOFError, KeyboardInterrupt):
            # End of input (Ctrl-D) or Ctrl-C: leave the loop instead of
            # dying with a traceback.
            break
        for x in parser.parse(line, to_unicode=True):
            # Parenthesised single-argument form prints the same as
            # ``print x`` under the Python 2 print statement.
            print(x)