def modern_chinese_tokenizer(raw_text):
    """Tokenize *raw_text* (a unicode string) using the modern-Chinese mmseg dictionary.

    Lazily re-initialises mmseg with 'modern words.dic' if a different
    dictionary is currently loaded, then segments the text and returns a
    list of unicode tokens with punctuation filtered out.

    :param raw_text: unicode string to segment.
    :returns: list of unicode tokens (punctuation excluded).
    """
    global TOKENIZER
    # Bug fix: the original compared with `is not 'Modern'` — an identity
    # test against a string literal, whose result is an interning accident.
    # Use value equality instead.
    if TOKENIZER != 'Modern':
        # Reload mmseg so the modern dictionary replaces whatever was loaded.
        reset_mmseg()
        # The dictionary file lives alongside this module.
        dirname = os.path.dirname(__file__)
        dictionary = os.path.join(dirname, 'modern words.dic')
        mmseg.dict_load_defaults()
        mmseg.Dictionary.load_words(dictionary)
        TOKENIZER = 'Modern'
    # mmseg operates on byte strings; 'utf-8-sig' also strips a leading BOM.
    tokenizer = mmseg.Algorithm(raw_text.encode('utf-8-sig'))
    tokens = []
    for token in tokenizer:
        # Decode back to unicode and drop any NUL characters in the output.
        token = token.text.decode('utf-8-sig', errors='replace').replace(u'\x00', '')
        if token and token not in chinese_punctuation:
            tokens.append(token)
    return tokens
def __init__(self, dict_chars=None, dict_words=None):
    """Set up an mmseg-backed tokenizer.

    Loads exactly one dictionary source, in priority order: an explicit
    character dictionary, an explicit word dictionary, or (when neither
    is given) mmseg's built-in defaults.

    :param dict_chars: optional path to a character dictionary.
    :param dict_words: optional path to a word dictionary.
    """
    if dict_chars:
        mmseg.mmseg_load_chars(dict_chars)
    else:
        if dict_words:
            mmseg.mmseg_load_words(dict_words)
        else:
            mmseg.dict_load_defaults()
    Tokenizer.__init__(self, mmseg.Algorithm)
def benchmark(text): import time dict_load_defaults() print ">>>> load dict done!" for i in range(100): begin = time.time() wlist = [word for word in Algorithm(text)] end = time.time() print ">>>> times: %f" % float(end-begin)
""" Easy to use Chinese segmenter. Uses mmseg to do the real work. This is just some easy-to-use wrappers. """ __copyright__ = \ "Copyright 2010 Laurence Gonsalves <*****@*****.**>. GNU GPL v2." import sys import unicodedata from StringIO import StringIO import mmseg mmseg.dict_load_defaults() def Segment(s): """ Given a unicode string performs Chinese segmentation. Result is a list of unicode strings, each being one "segment". Nte that the underlying segmented will ocasionally throw out bits of text (particularly punctuation). This wrapper will preserve these substrings by including them as distinct "segments". """ assert type(s) is unicode s = s.encode('utf-8') tokens = mmseg.Algorithm(s) result = [] pos = 0
""" Easy to use Chinese segmenter. Uses mmseg to do the real work. This is just some easy-to-use wrappers. """ __copyright__ = \ "Copyright 2010 Laurence Gonsalves <*****@*****.**>. GNU GPL v2." import sys import unicodedata from StringIO import StringIO import mmseg mmseg.dict_load_defaults() def Segment(s): """ Given a unicode string performs Chinese segmentation. Result is a list of unicode strings, each being one "segment". Nte that the underlying segmented will ocasionally throw out bits of text (particularly punctuation). This wrapper will preserve these substrings by including them as distinct "segments". """ assert type(s) is unicode s = s.encode('utf-8') tokens = mmseg.Algorithm(s) result = []
def __init__(self):
    """Initialise by loading mmseg's default dictionaries."""
    mmseg.dict_load_defaults()