def word_segment():
    """Segment data/xuezhong.txt with NLPIR and write the result file.

    Registers novel-specific proper names as user-dictionary entries so the
    segmenter keeps them whole, then segments the whole file in one native
    call to data/xuezhong_seg_1.txt.
    """
    pynlpir.open()
    # Custom dictionary: character names the default lexicon would split.
    custom_words = (
        "徐骁", "老怪物", "徐渭熊", "徐北枳", "白狐儿脸",
        "轩辕青锋", "姜泥", "大官子", "北凉", "小和尚",
    )
    for word in custom_words:
        nlpir.AddUserWord(c_char_p(word.encode()))
    # Segment the file; third argument False = no POS tagging in the output.
    nlpir.FileProcess('data/xuezhong.txt'.encode("utf-8"),
                      'data/xuezhong_seg_1.txt'.encode("utf-8"), False)
    pynlpir.close()
    # NOTE: the original printed `key_words`, but the line computing it was
    # commented out, so the print raised NameError; it (and the unused
    # in_text read) have been removed.
    print("segment finished")
def st_segment():
    """Segment data/st.txt with NLPIR into data/segdata/st_seg.txt."""
    pynlpir.open()
    # User dictionary: series-specific names the lexicon would split apart.
    for term in ("三体", "罗辑"):
        nlpir.AddUserWord(c_char_p(term.encode()))
    # Segment the source file; False = plain segmentation, no POS tags.
    nlpir.FileProcess('data/st.txt'.encode("utf-8"),
                      'data/segdata/st_seg.txt'.encode("utf-8"), False)
    pynlpir.close()
def st_WordCloud():
    """Render and save a word cloud for the novel "The Three-Body Problem".

    Extracts weighted keywords with NLPIR, removes stop words, draws the
    cloud shaped by resource/timg.jpg and saves it under data/wcimage/.
    """
    in_text = codecs.open('data/st.txt', 'r', encoding='UTF-8').read()
    pynlpir.open()
    nlpir.AddUserWord(c_char_p("三体".encode()))
    nlpir.AddUserWord(c_char_p("罗辑".encode()))
    # List of (word, weight) pairs, most relevant first.
    key_words = pynlpir.get_key_words(in_text, max_words=300, weighted=True)
    pynlpir.close()
    # Stop-word list: one word per line.
    stopwords = pd.read_csv("data/stop_words.txt", index_col=False, quoting=3,
                            sep="\n", names=['stopword'], encoding='utf-8')
    # Filter stop words with a set in O(n). The original built a DataFrame
    # and then re-matched each surviving word against key_words in a nested
    # loop (O(n^2)); the result order and values are identical.
    stop_set = set(stopwords.stopword.tolist())
    word_freq = [(word, weight) for word, weight in key_words
                 if word not in stop_set]
    print(word_freq)
    font = r'C:\Windows\Fonts\msyh.ttc'  # CJK-capable font; required, else WordCloud errors
    color_mask = imread("resource/timg.jpg")  # background image defines the cloud shape
    wcloud = WordCloud(
        font_path=font,
        # canvas colour
        background_color="white",
        # cloud outline mask
        mask=color_mask,
        # cap on rendered words
        max_words=2000,
        # largest glyph size
        max_font_size=80)
    wcloud.generate_from_frequencies(dict(word_freq))
    # Show the image on screen, then persist it.
    plt.imshow(wcloud)
    plt.axis("off")
    plt.show()
    wcloud.to_file("data/wcimage/三体词云_2.png")
def ImDict(fname):
    """Import a user dictionary file into NLPIR, one word per line.

    Blank lines are skipped. The file is assumed to be UTF-8 encoded.
    """
    # Open with an explicit encoding and a context manager. The original
    # relied on the platform default encoding and then called .decode()
    # on a str, which raises AttributeError on Python 3 (the rest of this
    # file is Python 3 code), and never closed the file.
    with open(fname, encoding="utf-8") as frd:
        for line in frd:
            word = line.strip()
            if word:
                nlpir.AddUserWord(word.encode("utf-8"))
def import_AddUserWord(self, word):
    """Register *word* as an NLPIR user-dictionary entry.

    The low-level AddUserWord binding expects bytes — every other call site
    in this file encodes first; the original passed a str, which fails (or
    mis-registers) under Python 3's ctypes marshalling.
    """
    nlpir.AddUserWord(str(word).encode("utf-8"))
def add_user_word(path):
    """Add every word in *path* (one per line, under TEXT_RESOURCE) to NLPIR."""
    # Plain loop instead of a comprehension used only for side effects, and
    # a context manager so the file handle is actually closed (the original
    # leaked it). Encoding is made explicit rather than platform-dependent.
    with open(os.path.join(TEXT_RESOURCE, path), encoding="utf-8") as fh:
        for line in fh:
            nlpir.AddUserWord(line.strip("\n"))