コード例 #1
0
    def modern_chinese_tokenizer(raw_text):
        """Tokenize modern Chinese text with mmseg.

        Lazily (re)loads the modern-Chinese dictionary when a different
        dictionary was active, then segments ``raw_text``.

        Args:
            raw_text: unicode string to segment.

        Returns:
            list of unicode tokens, with empty tokens and entries found in
            ``chinese_punctuation`` filtered out.
        """
        global TOKENIZER
        # BUG FIX: the original used "TOKENIZER is not 'Modern'". `is` tests
        # object identity, and string literals are not guaranteed to be
        # interned, so the check could misfire and reload (or fail to reload)
        # the dictionary. Equality is what is meant here.
        if TOKENIZER != 'Modern':
            # reload mmseg to re-initialise its dictionary state
            reset_mmseg()

            # directory of the modern dictionary, next to this module
            dirname = os.path.dirname(__file__)
            dictionary = os.path.join(dirname, 'modern words.dic')
            mmseg.dict_load_defaults()
            mmseg.Dictionary.load_words(dictionary)
            TOKENIZER = 'Modern'

        # encode for mmseg; utf-8-sig matches the decode used on each token
        tokenizer = mmseg.Algorithm(raw_text.encode('utf-8-sig'))

        tokens = []
        for token in tokenizer:
            # decode back to unicode and strip NUL characters mmseg can emit
            token = token.text.decode('utf-8-sig', errors='replace').replace(u'\x00', '')
            if token and token not in chinese_punctuation:
                tokens.append(token)

        return tokens
コード例 #2
0
ファイル: getkeywords.py プロジェクト: zy-sunshine/Thesis
 def __init__(self, dict_chars=None, dict_words=None):
     """Set up an mmseg-backed tokenizer.

     Exactly one dictionary source is loaded: a character dictionary if
     ``dict_chars`` is given, else a word dictionary if ``dict_words`` is
     given, else mmseg's built-in defaults.
     """
     if not dict_chars and not dict_words:
         mmseg.dict_load_defaults()
     elif dict_chars:
         mmseg.mmseg_load_chars(dict_chars)
     else:
         mmseg.mmseg_load_words(dict_words)
     Tokenizer.__init__(self, mmseg.Algorithm)
コード例 #3
0
 def __init__(self, dict_chars=None, dict_words=None):
     """Initialise the tokenizer with an mmseg dictionary.

     One source is used, in priority order: ``dict_chars`` (character
     dictionary), then ``dict_words`` (word dictionary), then mmseg's
     built-in defaults.
     """
     if dict_chars:
         mmseg.mmseg_load_chars(dict_chars)
     elif dict_words:
         mmseg.mmseg_load_words(dict_words)
     else:
         mmseg.dict_load_defaults()
     Tokenizer.__init__(self, mmseg.Algorithm)
コード例 #4
0
ファイル: tests.py プロジェクト: JamzyWang/SMSClassification
def benchmark(text):
    import time
    dict_load_defaults()
    print ">>>> load dict done!"

    for i in range(100):
        begin = time.time()
        wlist = [word for word in Algorithm(text)]
        end = time.time()
        print ">>>> times: %f" % float(end-begin)
コード例 #5
0
ファイル: tests.py プロジェクト: ClarePhang/nlp-2
def benchmark(text):
    """Load the default mmseg dictionary, then time 100 segmentation runs.

    Prints the elapsed wall-clock seconds of each run; returns nothing.
    """
    import time
    dict_load_defaults()
    print ">>>> load dict done!"

    for i in range(100):
        begin = time.time()
        # wlist is built but never used; only the segmentation cost matters
        wlist = [word for word in Algorithm(text)]
        end = time.time()
        print ">>>> times: %f" % float(end-begin)
コード例 #6
0
ファイル: zhsegmenter.py プロジェクト: xenomachina/public
"""
Easy to use Chinese segmenter.

Uses mmseg to do the real work. This is just some easy-to-use wrappers.
"""

__copyright__ = \
    "Copyright 2010 Laurence Gonsalves <*****@*****.**>. GNU GPL v2."

import sys
import unicodedata
from StringIO import StringIO

import mmseg
# Load mmseg's built-in default dictionaries once at import time so the
# module's segmentation functions can be called without further setup.
mmseg.dict_load_defaults()

def Segment(s):
  """
  Given a unicode string performs Chinese segmentation.

  Result is a list of unicode strings, each being one "segment". Nte
  that the underlying segmented will ocasionally throw out bits of text
  (particularly punctuation). This wrapper will preserve these
  substrings by including them as distinct "segments".
  """
  assert type(s) is unicode
  s = s.encode('utf-8')
  tokens = mmseg.Algorithm(s)
  result = []
  pos = 0
コード例 #7
0
"""
Easy to use Chinese segmenter.

Uses mmseg to do the real work. This is just some easy-to-use wrappers.
"""

__copyright__ = \
    "Copyright 2010 Laurence Gonsalves <*****@*****.**>. GNU GPL v2."

import sys
import unicodedata
from StringIO import StringIO

import mmseg

# Load mmseg's built-in default dictionaries once at import time so the
# module's segmentation functions can be called without further setup.
mmseg.dict_load_defaults()


def Segment(s):
    """
  Given a unicode string performs Chinese segmentation.

  Result is a list of unicode strings, each being one "segment". Nte
  that the underlying segmented will ocasionally throw out bits of text
  (particularly punctuation). This wrapper will preserve these
  substrings by including them as distinct "segments".
  """
    assert type(s) is unicode
    s = s.encode('utf-8')
    tokens = mmseg.Algorithm(s)
    result = []
コード例 #8
0
ファイル: segmentor.py プロジェクト: SigmaQuan/NLP
 def __init__(self):
     # Load mmseg's built-in default dictionaries so segmentation can be
     # performed immediately after construction.
     mmseg.dict_load_defaults()
コード例 #9
0
 def __init__(self):
     """Initialise the segmenter by loading mmseg's default dictionaries."""
     mmseg.dict_load_defaults()