def segment_sents(sents):
    """Segment a list of Chinese sentences with the Stanford segmenter.

    Falls back to the module-level ``STANFORD_SEGMENTER`` path when the
    ``STANFORD_SEGMENTER`` environment variable is not already set, so an
    existing environment configuration is never overwritten.

    :param sents: iterable of sentence strings to segment
    :return: the segmenter's tokenised output (whatever
        ``StanfordSegmenter.segment_sents`` returns)
    """
    # setdefault replaces the explicit get-then-assign dance: it writes
    # the env var only when it is missing, in a single call.
    os.environ.setdefault('STANFORD_SEGMENTER', STANFORD_SEGMENTER)
    segmenter = StanfordSegmenter()
    tokens = segmenter.segment_sents(sents)
    return tokens
def nltk_cn(): print '--------------------nltk-cn分词------------------' os.environ['JAVAHOME'] = "F://Java//jdk1.8.0_51//bin/" #注意添加环境变量的方法 sentence = u"这是斯坦福中文分词器测试" stanford_dir="D:\Software\stanford-segmenter-2014-08-27" segmenter = StanfordSegmenter( path_to_jar=os.path.join(stanford_dir,"stanford-segmenter-3.4.1.jar"), path_to_slf4j = os.path.join(stanford_dir,"slf4j-api-1.5.2.jar"), path_to_sihan_corpora_dict=os.path.join(stanford_dir,"./data"), path_to_model=os.path.join(stanford_dir,"./data/pku.gz"), path_to_dict=os.path.join(stanford_dir,"./data/dict-chris6.ser.gz")) segres=segmenter.segment(sentence) print segres