# Author: Tian Feng (FontTian)
# Created: '2017/8/17'
"""Demo script: load a text through ``ftools.cnlp`` and tokenize it with NLTK.

The script configures matplotlib fonts, builds a ``cnlp.basis`` helper from a
text file and a stop-word file, registers one custom word, fetches the text
from the helper and passes it to ``nltk.word_tokenize``. Progress banners are
printed along the way.
"""
import nltk
import matplotlib as mpl

from ftools import cnlp

# Use the KaiTi typeface for both the sans-serif and serif families
# (presumably so that Chinese glyphs render in figures -- confirm that the
# font is installed on the target machine).
mpl.rcParams.update({
    u'font.sans-serif': [u'KaiTi'],
    u'font.serif': [u'KaiTi'],
})
# mpl.rcParams['axes.unicode_minus'] = False  # Fixes the minus sign '-' being drawn as a box in saved figures; alternatively convert the minus sign to a string

# Input files (absolute paths on the author's machine).
stopwords_path = u'/home/fonttian/Data/NLP/stopwords/CNENstopwords.txt'  # stop-word list
text_path = u'/home/fonttian/Data/NLP/txt/lz.txt'  # path of the text to analyse

# Project helper; the exact effect of ReadText=True is defined in
# ftools.cnlp -- NOTE(review): presumably it loads the text eagerly, confirm.
fontsTools = cnlp.basis(
    textPath=text_path,
    stopwordsPath=stopwords_path,
    ReadText=True,
)
fontsTools.addUserWords([u'路明非'])

# NOTE(review): judging by the variable name this is the segmented and
# stop-word-filtered text -- verify against cnlp.basis.getText.
cutAndclearStr = fontsTools.getText()

# Banners announcing the steps; note FreqDist is only announced here, the
# script itself never calls it.
print(u'nltk 文本分析基本操作')
print(u'nltk.word_tokenize(cutAndclearStr')
print(u'nltk.FreqDist(tokenstr)')

tokenstr = nltk.word_tokenize(cutAndclearStr)

print(u'====== 其他基本指标并绘图 ======')
print(u'下面是nltk的其他类 ')
# - * - coding: utf - 8 -*- # 作者:田丰(FontTian) # 创建时间:'2017/8/12' from os import path from scipy.misc import imread from ftools import cnlp d = path.dirname(__file__) text_path = d + '/txt/lztest.txt' # 设置要分析的文本路径 stopwords_path = d + '/stopwords/CNENstopwords.txt' # 停用词词表 fontsTools = cnlp.basis(textPath=text_path, stopwordsPath=stopwords_path) fontsTools.addUserWords([u'路明非']) font_path = d + '/Fonts/simkai.ttf' # 为worldcloud设置中文字体路径没 back_coloring_path = d + '/img/lz1.jpg' # 设置背景图片路径 imgname1 = d + '/WordCloudDefautColors.png' # 保存的图片名字1(只按照背景图片形状) imgname2 = d + '/WordCloudColorsByImg.png' # 保存的图片名字2(颜色按照背景图片颜色布局生成) back_coloring = imread(path.join(d, back_coloring_path)) # 设置词云的形状 # 设置词云属性 wc = fontsTools.wordcloud( text=None, image_path=back_coloring_path, to_file_path=imgname2, show=True, font_path=font_path, # 设置字体