def preprocessing(single_comment):
    """
    Preprocess a single document: tokenization, stopword removal, digit removal,
    and special-symbol removal.
    :param single_comment: one standalone document (note: not the whole corpus;
        think of one review out of a set of shopping reviews)
    :return: a flat token list, e.g. ['word', 'word', 'word', ...]
    """
    jieba.load_userdict(r'D:\Pycharm\PycharmProjects\Class\jieba_dict\dict.txt')
    jieba.load_userdict(r'D:\Pycharm\PycharmProjects\Class\jieba_dict\coal_dict.txt')
    jieba.load_userdict(r'D:\Pycharm\PycharmProjects\Class\jieba_dict\user_dictionary.txt')
    comment0 = re.sub('\u3000', '', single_comment)  # drop ideographic spaces such as \u3000
    comment1 = re.sub(r'&[a-z]*', '', comment0)      # drop HTML entity remnants
    comment2 = re.sub('\ufffd', '', comment1)        # drop replacement characters
    # normalize measurement/unit expressions (12mm, 220V, ...) to the placeholder 'param'
    comment3 = re.sub(r'\d+(?:MM|mm|CM|cm|V|v|A|m|M|w|W)', 'param', comment2)
    comment4 = re.sub(r'\d+\.\d+|\d+', 'num', comment3)  # remaining numbers become 'num'
    comment5 = SnowNLP(comment4).han                     # traditional -> simplified Chinese
    comment6 = re.sub(r'博世|博士|Bosch|BOSCH|bosch', '博世', comment5)  # unify brand aliases
    comment7 = re.sub(r'小威|WORX|威克士|worx|wx|WX', '威克士', comment6)
    comment2words = jieba.lcut(comment7)
    with open('./stop_words.txt', 'r') as f:
        # strip trailing newlines, otherwise no token ever matches a stopword line
        stop_words = {line.strip() for line in f}
    # walk backwards so pop() does not shift indices that are still unvisited
    for i in reversed(range(len(comment2words))):
        if comment2words[i] in stop_words:  # remove stopwords
            comment2words.pop(i)
        elif comment2words[i].isdigit():
            comment2words.pop(i)
    return comment2words
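# A minimal usage sketch for preprocessing(), assuming jieba, re, and SnowNLP are
# already imported and that the user dictionaries plus ./stop_words.txt exist at
# the paths above; the sample review text is hypothetical.
if __name__ == '__main__':
    sample = '博士的电钻很好用,12mm钻头,220V,性价比很高!'
    print(preprocessing(sample))
    # expected shape: a flat token list, e.g. ['博世', '电钻', '好用', 'param', ...]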
def load_data(in_file):
    """Load parallel sentence pairs from a tab-separated file: English first,
    Chinese second. Each sentence is tokenized and wrapped in BOS/EOS markers."""
    cn = []
    en = []
    with open(in_file, 'r') as f:
        for line in f:
            line = line.strip().split("\t")
            en.append(["BOS"] + nltk.word_tokenize(line[0].lower()) + ["EOS"])
            cn.append(["BOS"] + jieba.lcut(line[1]) + ["EOS"])
    return en, cn
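# Usage sketch for load_data(), assuming nltk (with its 'punkt' tokenizer data
# downloaded) and jieba are importable. 'train.txt' is a hypothetical file with
# one "english<TAB>chinese" pair per line, e.g.:
#   Hello, world!<TAB>你好,世界!
en, cn = load_data('train.txt')
print(en[0])  # e.g. ['BOS', 'hello', ',', 'world', '!', 'EOS']
print(cn[0])  # e.g. ['BOS', '你好', ',', '世界', '!', 'EOS']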
def Get_cloud(records):  # renamed from 'list' to avoid shadowing the builtin
    # records = Get_txt()
    with open('test.txt', 'w') as file:
        for record in records:
            y = str(record)
            # slice off the fixed-length prefix/suffix of each record, segment once,
            # and keep tokens longer than one character that pass Check_word()
            tokens = jieba.lcut(y[16:-7])
            for token in tokens:
                if len(token) > 1 and Check_word(token) == 0:
                    file.write(token + " ")
            file.write("\n")
    with open('test.txt', 'r') as sentence:
        wc = wordcloud.WordCloud(font_path='STXINGKA.TTF', width=671, height=400,
                                 prefer_horizontal=0.8, max_words=50)
        wc.generate(sentence.read())
        wc.to_file("cloud.png")
def cut(sentence):
    """
    Tokenize a comment and drop stopwords.
    :param sentence: the comment text
    :return: list of tokens
    """
    jieba.load_userdict('./jieba_dict/user_dictionary.txt')
    with open('./stopwords.txt', 'r', encoding='UTF-8') as input_file:
        # strip trailing newlines, otherwise no token ever matches a stopword line
        stopwords = {line.strip() for line in input_file}
    words = []
    for word in jieba.lcut(sentence):
        if word not in stopwords:
            words.append(word)
    return words
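# Usage sketch for cut(), assuming ./jieba_dict/user_dictionary.txt and
# ./stopwords.txt exist; the sample sentence and output are illustrative only.
print(cut('这个电钻的做工非常好'))  # e.g. ['电钻', '做工', '好']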
def lda(lines, stopwords):
    """Fit an LDA topic model, aggregate each word's weight across topics,
    and plot the ten heaviest words as a pie chart."""
    sentences = []
    for line in lines:
        try:
            text = line[1].replace("\n", "").replace(" ", "").replace("\t", "")
            segs = jieba.lcut(text)
            segs = filter(lambda x: len(x) > 1, segs)
            segs = [seg for seg in list(segs) if seg not in stopwords]
            sentences.append(segs)
        except Exception as e:
            print(e)
    # bag-of-words model
    dictionary = corpora.Dictionary(sentences)
    corpus = [dictionary.doc2bow(_sentence) for _sentence in sentences]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                          num_topics=20)
    # print the fitted topics
    print(lda.print_topics())
    # print_topics() yields strings such as '0.025*"word" + 0.018*"word" + ...';
    # parse them and sum each word's weight over all topics
    word_weights = {}
    for topic in lda.print_topics():
        for term in topic[1].split("+"):
            ss = [part.replace(" ", "").replace("\"", "") for part in term.split("*")]
            word_weights[ss[1]] = word_weights.get(ss[1], 0) + float(ss[0])
    word_weights = {x: float('%.3f' % y) for x, y in word_weights.items()}
    # merge the words into a frame and keep the ten heaviest
    data_df = pd.DataFrame({'count': word_weights})
    data_df = data_df.reset_index().sort_values(by=["count"], ascending=False)
    print(data_df[:10])
    number = numpy.array(data_df[:10]["count"].values * 1000)
    labels = tuple(data_df[:10]["index"].values)
    plt.pie(x=number, labels=labels, autopct='%.0f%%')  # autopct shows percentages
    plt.show()
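# A hedged follow-up sketch: gensim's get_document_topics() reports which topics
# dominate a single document. The helper below is hypothetical; 'model',
# 'dictionary', and 'tokens' stand for the LdaModel, Dictionary, and one
# tokenized document from lda() above.
def document_topics(model, dictionary, tokens):
    bow = dictionary.doc2bow(tokens)        # one document as bag-of-words
    return model.get_document_topics(bow)   # e.g. [(3, 0.62), (11, 0.21), ...]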
def word_count(lines, stopwords):
    # word-frequency statistics
    segment = []
    for line in lines:
        try:
            text = line[1].replace("\n", "").replace(" ", "").replace("\t", "")
            for seg in jieba.lcut(text):
                if len(seg) > 1 and seg != '\r\n' and seg not in stopwords:
                    segment.append(seg)
        except Exception as e:
            print(e)
    words_df = pd.DataFrame({'segment': segment})
    words_stat = words_df.groupby(by=['segment'])['segment'].agg(["size"])
    words_stat = words_stat[1300:]  # skip the first 1300 groups (in segment order)
    words_stat = words_stat.reset_index().sort_values(by=["size"], ascending=False)
    print(words_stat[:1500])
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1500).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.show()
def test2word(value):
    # tokenize only string input; non-strings fall through and return None
    if isinstance(value, str):
        return jieba.lcut(value)
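# Usage sketch for test2word(): non-string values (e.g. NaN cells from a pandas
# column) return None, which callers must handle; outputs are illustrative.
print(test2word('机器学习很有趣'))  # e.g. ['机器', '学习', '很', '有趣']
print(test2word(float('nan')))      # None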
# replace punctuation (ASCII and full-width) with spaces
m = re.sub(r"[!%\[\],。()\-~]", " ", s)
# extract the digits
num = re.sub(r"\D", " ", m)
# extract the text
b = re.sub(r"\d", " ", m)
c = b.strip()
new = jieba.lcut(c)
print(new)
"""
Do not delete blanks with a plain for loop: if the tail is also blank it will be
missed, because removing items while iterating shifts the remaining elements.
Use one of:
1. while '' in test:
       test.remove('')
2. mytest = [i for i in test if i != '']
"""
# mytest = [i for i in new if i != ' ']
# print(mytest)
newlist = []
for i in new:
    if i != ' ':
        newlist.append(i)
""" import jieba content = "现如今,机器学习和深度学习带动人工智能飞速的发展,并在图片处理、语音识别领域取得巨大成功。" # 精准分词 segs_1 = jieba.cut(content, cut_all=False) print("*".join(segs_1)) # 全模式分词 segs_2 = jieba.cut(content, cut_all=True) print("*".join(segs_2)) # 搜索引擎模式分词 segs_3 = jieba.cut_for_search(content) print("*".join(segs_3)) # 封装成列表返回 segs_5 = jieba.__lcut(content) print(segs_5) # 获取词性 import jieba.posseg as psg print([(x.word, x.flag) for x in psg.__lcut_internal(content)]) # count 词出现的次数 from collections import Counter top5 = Counter(segs_5).most_common(5) print(top5) # text = "铁甲网是中国最大的工程机械交易平台。"