コード例 #1
0
def segment_sents(sents):
    """Segment ``sents`` with a default-constructed ``StanfordSegmenter``.

    If the ``STANFORD_SEGMENTER`` environment variable is missing or empty,
    it is first set to the ``STANFORD_SEGMENTER`` value that is in scope
    for this function.

    Returns whatever ``StanfordSegmenter.segment_sents`` returns for
    ``sents``.
    """
    env_key = 'STANFORD_SEGMENTER'
    # A falsy lookup covers both "unset" and "set to an empty string".
    if not os.environ.get(env_key):
        os.environ[env_key] = STANFORD_SEGMENTER

    return StanfordSegmenter().segment_sents(sents)
コード例 #2
0
def nltk_cn():
    """Demo: segment a fixed Chinese sentence with the Stanford segmenter.

    Sets the ``JAVAHOME`` environment variable, builds a
    ``StanfordSegmenter`` from jar, dictionary and model files under a
    hard-coded install directory, segments a sample sentence and prints
    the result. Returns nothing.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('--------------------nltk-cn分词------------------')
    # Note how the environment variable is set: it is assigned from within
    # the script before the segmenter is constructed.
    os.environ['JAVAHOME'] = "F://Java//jdk1.8.0_51//bin/"
    sentence = u"这是斯坦福中文分词器测试"
    # Raw string: the backslashes are literal path separators, not escapes.
    stanford_dir = r"D:\Software\stanford-segmenter-2014-08-27"
    segmenter = StanfordSegmenter(
        path_to_jar=os.path.join(stanford_dir, "stanford-segmenter-3.4.1.jar"),
        path_to_slf4j=os.path.join(stanford_dir, "slf4j-api-1.5.2.jar"),
        path_to_sihan_corpora_dict=os.path.join(stanford_dir, "./data"),
        path_to_model=os.path.join(stanford_dir, "./data/pku.gz"),
        path_to_dict=os.path.join(stanford_dir, "./data/dict-chris6.ser.gz"))
    segres = segmenter.segment(sentence)
    print(segres)