def k_means_mine(o_weight, o_features, k):
    """Run K-Means over the TF-IDF weights and dump each cluster's top words
    to a temp file for the word-cloud module.

    :param o_weight: word-weight matrix (one row per document)
    :param o_features: feature words aligned with the matrix columns
    :param k: number of clusters
    :return: None
    """
    weight, features, cluster = o_weight, o_features, k
    print('================== KMenas ====================')
    centroids, index_distance = my_k_means(weight, cluster)
    # Column indices of each centroid sorted by descending weight.
    tmp = centroids.argsort()[:, ::-1]
    for i in range(cluster):
        print("cluster %d:" % i, end='')
        # Bug fix: use a context manager so the temp file is closed even if
        # a write raises (it was previously closed only on the happy path).
        with open("./data/tmp4kmeans/Cluster{}.txt".format(i), "w",
                  encoding='utf-8') as tmp_words_txt:
            # Top 50 words of this cluster; .A converts the matrix row to ndarray.
            for j in tmp.A[i, :50]:
                print(' %s ' % features[j], end='')
                tmp_words_txt.write(features[j] + '\n')
        print()
    print('需要词云处理的kmeans临时文件已经生成,\n正在等待可视化模块的调用,请稍候...')
    clear_all_var()
    return
def analysis(start_time, end_time, topic_id, k):
    """Run the full analysis pipeline: fetch, segment, TF-IDF, cluster,
    and render both word clouds.

    :param start_time: query start time
    :param end_time: query end time
    :param topic_id: topic ID
    :param k: number of clusters
    :return: 0 when the date range holds no data, 1 on success
    """
    t0 = time.time()
    records = connect_mysql(start_time, end_time, topic_id)
    # Probe the first row; an exception here means the range is empty.
    try:
        print(records['CONTENT'][0])
    except Exception:
        print('您所查日期内并没有值,告辞')
        return 0
    tokens = jieba_cut(records)
    weights, features = tf_idf(tokens)
    k_means_mine(weights, features, k)
    # Word cloud from term frequencies.
    wc4tf_idf(topic_id, k)
    # Word cloud from the K-Means clusters.
    wc4k_means(topic_id, k)
    print("================== 词云已经生成 ====================")
    elapsed = time.time() - t0
    print("分析结束!运行时间: %f" % elapsed + 's')
    clear_all_var()
    return 1
def get_stop_words():
    """Load the custom stop-word list from data/setup.

    :return: set of stop words, one per line of the config file
    """
    stop_words = set()
    # with-block also fixes the previously manual open/close pairing.
    with open('./data/setup/word_cloud_stop_words.txt',
              encoding='utf-8') as stop_words_file:
        for tt in stop_words_file:
            stop_words.add(tt.strip('\n'))
    clear_all_var()
    # Bug fix: the set was built but never returned, so callers such as
    # the WordCloud constructors received None for their stopwords argument.
    return stop_words
def get_abstract(data, a, b, c):
    """Generate an Excel-openable abstract (summary) CSV via TextRank.

    :param data: rows fetched from MySQL
    :param a: start time (used in the output file name)
    :param b: end time
    :param c: topic ID
    :return: None
    """
    cwd = os.getcwd()
    path = cwd.replace('\\', '/')
    ranker = TextRank4Sentence()
    print('当前文章的摘要:')
    rows = []
    for idx in range(len(data['CONTENT'])):
        # Keep only CJK characters, common Chinese punctuation
        # (。 ; , : “ ” ( ) 、 ? 《 》) and digits, so the sentence
        # splitter still has delimiters to work with.
        cleaned = re.sub(
            "[^\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b0-9]",
            '', data['CONTENT'][idx])
        ranker.analyze(text=cleaned, lower=True)
        # Concatenate the three highest-weighted sentences.
        summary = ''.join(
            item.sentence for item in ranker.get_key_sentences(num=3))
        if summary:
            rows.append([
                data['UPTIME'][idx], data['TITLE'][idx],
                data['AUTHOR'][idx], summary
            ])
    column_name = ['更新时间', '标题/题目', '作者', '摘要']
    frame = pd.DataFrame(columns=column_name, data=rows)
    frame.to_csv('./data/textrank/topic{}_{}-{}abstract.csv'.format(c, a, b),
                 encoding='utf_8_sig')
    print('>>>>>>>>>>>>>> 已经保存到csv等待计算或查看 >>>>>>>>>>>>>>>')
    # Give any spreadsheet viewer time before downstream steps touch the file.
    time.sleep(10)
    clear_all_var()
    return
def wc4k_means(topic_id, k):
    """Render one fixed-size word-cloud slice per cluster, then stitch all
    slices side by side into a single image.

    :param topic_id: topic identifier used in the output file names
    :param k: number of clusters / slices
    """
    tmp_width = int(1920 / k)
    for i in range(k):
        words = set()
        # Bug fix: the cluster file was opened inline in the for statement
        # and never closed, leaking one handle per cluster.
        with open('./data/tmp4kmeans/Cluster{}.txt'.format(i), 'r',
                  encoding='utf-8') as cluster_file:
            for line in cluster_file:
                words.add(line.strip('\n'))
        words = ' '.join(words)  # <class 'str'>
        wc = WordCloud(font_path="./data/msyh.ttc",
                       max_words=100,
                       width=tmp_width,
                       height=1080,
                       background_color='white',
                       stopwords=get_stop_words(),
                       colormap=get_color())
        word_cloud = wc.generate(words)
        # Write the per-cluster word-cloud image.
        word_cloud.to_file("./data/wordcloud/wc_k_means_topic{}_{}.png".format(
            topic_id, i))
        # Draw it (display itself is left to the caller; plt.show() omitted).
        plt.imshow(word_cloud)
        plt.axis("off")
        print("第 {}/{} 张分图已经切片完成".format(i + 1, k))
    print('开始拼图')
    image_collector = []
    for i in range(k):
        tmp = Image.open('./data/wordcloud/wc_k_means_topic{}_{}.png'.format(
            topic_id, i))
        image_collector.append(tmp)
    width, height = image_collector[0].size
    # One canvas wide enough for all slices, pasted left to right.
    result = Image.new(image_collector[0].mode,
                       (width * len(image_collector), height))
    for i, im in enumerate(image_collector):
        result.paste(im, box=(i * width, 0))
    result.save(
        './data/wordcloud/wc_k_means_topic{}_result.png'.format(topic_id))
    print("图片拼接完成,正在弹出图片,请稍候...")
    clear_all_var()
def html_report(a, b, c):
    """Assemble the HTML report: pyecharts page plus word-cloud images and a
    link to the CSV abstract, then open it.

    :param a: start time
    :param b: end time
    :param c: topic id
    :return: None
    """
    x, y = get_tpm2(a, b, c)
    print(a, b, c, type(a), type(b), type(c))
    data = get_tpm(a, b, c)
    page = Page(layout=Page.SimplePageLayout, page_title='舆情分析统计报告')
    page.add(
        scatter_sentiment(x, y),
        pie_html(data),
    )
    page.render("./data/reportforms/report_topic{}_{}-{}.html".format(c, a, b))
    print('================== 动态网页组装完成,已保存 ==================')
    # Bug fix: the closing tags below were </h1> and </h2>, mismatching their
    # opening <h3> tags; they now close correctly.
    sao_operation = '''
    <h3 style="text-align:center">TF-IDF 关键词/主旨词汇提取</h3>
    <p style="text-align:center"><img src="../wordcloud/wc_tf_idf_topic{}.png" width=85% /></p>
    <h3 style="text-align:center">K-Means 聚类词汇展示</h3>
    <p style="text-align:center"><img src="../wordcloud/wc_k_means_topic{}_result.png" width=85% /></p>
    <h3 style="text-align:center">其他 资料</h3>
    <p style="text-align:center"><a href="../textrank/topic{}_{}-{}abstract.csv">点击打开您查询日期内的csv摘要文件</a></p>
    <p style="text-align:center"> 注: 以上所有报告均已去除无效值(仅含文本数据) </p>
    </body>
    </html>
    '''.format(c, c, c, a, b)
    # Strip the rendered page's closing tags, append our extra section
    # (which re-adds them), and write the file back.
    with open('./data/reportforms/report_topic{}_{}-{}.html'.format(c, a, b),
              'r', encoding='utf-8') as h:
        content = h.read()
    content = content.replace('</body>\n</html>', '')
    with open('./data/reportforms/report_topic{}_{}-{}.html'.format(c, a, b),
              'w', encoding='utf-8') as h:
        h.write(content + sao_operation)
    now_path = os.getcwd()
    path = now_path.replace('\\', '/')
    # os.startfile needs an absolute path (Windows-only API).
    os.startfile(path +
                 '/data/reportforms/report_topic{}_{}-{}.html'.format(c, a, b))
    clear_all_var()
    return
def analysis_sentiment(a, b, c):
    """Append a sentiment score column to the abstract CSV.

    :param a: start time
    :param b: end time
    :param c: topic ID
    :return: None
    """
    # Guard against read errors: close any Excel instance holding the CSV.
    time.sleep(3)
    excel_running = os.system('tasklist|find /i "excel.exe"')
    while excel_running == 0:
        os.system('taskkill /IM excel.exe')
        time.sleep(3)
        print('>>>> 正在尝试关闭了已经打开的csv文件, >>>>> >>>> >>>>')
        excel_running = os.system('tasklist|find /i "excel.exe"')
    print('>>>> 不存在打开的csv文件,开始执行情感分析 >>>>> >>>> >>>>')
    csv_file = './data/textrank/topic{}_{}-{}abstract.csv'.format(c, a, b)
    # low_memory=False suppresses the mixed-dtype warning.
    frame = pd.read_csv(csv_file, low_memory=False)
    # Score the abstracts rather than the raw database text, which is far
    # too large to analyse directly.
    frame['sentiment'] = [SnowNLP(text).sentiments for text in frame['摘要']]
    frame.to_csv('./data/textrank/topic{}_{}-{}abstract.csv'.format(c, a, b),
                 encoding='utf_8_sig',
                 index=False)
    print('............ 情感值追加注入完成 ...........')
    clear_all_var()
    return
def get_abs(start_time, end_time, topic_id):
    """Run the abstract + sentiment sub-pipeline for one topic.

    :param start_time: start time
    :param end_time: end time
    :param topic_id: topic id
    :return: None
    """
    t0 = time.time()
    rows = data_get(start_time, end_time, topic_id)
    get_abstract(rows, start_time, end_time, topic_id)
    # Appending to an open CSV fails, so analysis_sentiment first closes any
    # viewer holding the file; it also cannot run on empty content.
    analysis_sentiment(start_time, end_time, topic_id)
    print(' 文本摘要和情感分析模块 消耗了 {} s'.format(time.time() - t0))
    clear_all_var()
    return
def wc4tf_idf(topic_id, k):
    """Render the TF-IDF word cloud from the temp frequency file.

    :param topic_id: topic identifier used in the output file name
    :param k: unused here; kept for signature parity with wc4k_means
    """
    frequencies = {}
    # Bug fix: the temp file was opened inline in the for statement and
    # never closed; the with-block closes it deterministically.
    with open('./data/tmp4tfidf/tmp4wordcloud.txt',
              encoding='utf-8') as freq_file:
        for line in freq_file:
            # Each line: "<word> <weight>"; float() tolerates the newline.
            arr = line.split(" ")
            frequencies[arr[0]] = float(arr[1])
    wc = WordCloud(font_path="./data/msyh.ttc",
                   max_words=300,
                   width=1920,
                   height=1080,
                   background_color='white',
                   stopwords=get_stop_words())
    word_cloud = wc.generate_from_frequencies(frequencies)
    # Write the word-cloud image.
    word_cloud.to_file(
        "./data/wordcloud/wc_tf_idf_topic{}.png".format(topic_id))
    print('TF / IDF 词频图云已经完成,正在弹出图片,请稍候...')
    clear_all_var()
def real_time_mode(self): sys.stdout = EmittingStream(textWritten=self.hello) # schedule.every(7).day.at('08:00').do(self.analysis_btn_click()) # schedule.every(5).minutes.do(self.analysis_btn_click()) key_words, start_time, end_time, k = self.blank_check() if key_words == 0 and start_time == 0 and end_time == 0 and k == 0: QMessageBox.about(self, '无法查询', '请输入关键字,关键字是必须的') return elif key_words == 1 and start_time == 1 and end_time == 1 and k == 1: QMessageBox.about(self, '无法查询', '麻烦正确输入日期') return else: pass # print('实施模式下,为保证程序稳定性,只分析最近一个月的数据') end_time = datetime.datetime.now().strftime('%Y%m%d') if int(start_time) > int(end_time): self.info.setText('本系统不支持查询未来') return search = SearchTerm(key_words) # search.topic_id, note = get_topic_id(search.keywords, search.header) # IP出现问题时的临时解决办法:单机指定:测试模式 # 19570564 武汉ID 19639859 境外ID 19622792 痛苦ID 19564412 恋爱ID search.topic_id = "19564412" note = 'IP或网站出现问题,临时解决措施:测试模式' print(search.keywords, search.topic_id, note) if self.checkBox.isChecked(): # print('实时模式:准备就绪\n为保证程序稳定性,建议只分析最近一个月的数据') try: global t # 准确的时间 t = Timer(1200, self.real_time_mode) # 5min # 推荐是按天算的,所以没有判断文件存在的选项,必定不一样 t.start() # 测试模式:数据暂时无法获取 print("测试模式:使用本地数据,跳过数据获取") # spider(search.keywords, search.topic_id, search.header) time.sleep(5) get_abs(start_time, end_time, search.topic_id) time.sleep(5) analysis(start_time, end_time, search.topic_id, k) time.sleep(5) html_report(start_time, end_time, search.topic_id) time.sleep(5) mail_report(start_time, end_time, search.topic_id) time.sleep(5) # self.query_btn_click() # 自动查询 TODO 注意这里是默认执行程序,需要更改 # self.analysis_btn_click() # 自动分析 clear_all_var() except Exception: print('本轮出现了一个错误,跳过\n请检查日期内数据是否存在') # 看需不需要发送错误报告 finally: # global t # 模糊的时间 # t = Timer(300, self.real_time_mode) # 5min # 推荐是按天算的,所以没有判断文件存在的选项,必定不一样 # t.start() pass else: self.analysis_btn_click() return
def analysis_btn_click(self):
    """Handle the "analyse" button: run the pipeline once and build the report.

    Validates inputs, switches the window icon to the "busy" (red) state,
    runs analysis(), then html_report() (generating the abstract first if it
    is missing), optionally mails the report, and finally restores the
    idle (blue) icon.
    """
    key_words, start_time, end_time, k = self.blank_check()
    # blank_check signals "missing keyword" with all zeros and
    # "bad date" with all ones — NOTE(review): confirm against blank_check.
    if key_words == 0 and start_time == 0 and end_time == 0 and k == 0:
        QMessageBox.about(self, '无法查询', '请输入关键字,关键字是必须的')
        return
    elif key_words == 1 and start_time == 1 and end_time == 1 and k == 1:
        QMessageBox.about(self, '无法查询', '麻烦正确输入日期')
        return
    else:
        pass
    # Red icon = analysis in progress.
    self.img_change0.setStyleSheet("image: url(./data/tmp4design/舆情监控红500.png);")
    self.setWindowIcon(QIcon('./data/tmp4design/网络舆情红.png'))
    # Redirect stdout into the GUI log widget.
    sys.stdout = EmittingStream(textWritten=self.update_info)
    search = SearchTerm(key_words)
    # search.topic_id, note = get_topic_id(search.keywords, search.header)
    # Workaround while the IP/site is blocked: hard-coded topic = test mode.
    # 19570564 武汉ID 19639859 境外ID 19575118 奋斗ID 19622792 痛苦ID
    search.topic_id = "19622792"
    note = 'IP或网站出现问题,临时解决措施:测试模式'
    print(search.keywords, search.topic_id, note)
    try:
        print('================== 数据分析:开始 ====================')
        # analysis() returns 0 when the date range holds no data.
        cqc = analysis(start_time, end_time, search.topic_id, int(k))
        if cqc == 0:
            self.img_change0.setStyleSheet("image: url(./data/tmp4design/舆情监控蓝500.png);")
            self.setWindowIcon(QIcon('./data/tmp4design/负面舆情帽子蓝.png'))
            return
        print('================== 聚类完成,图片生成 ====================')
        time.sleep(3)
        # If html_report fails because the abstract CSV is missing, generate
        # the abstract first, then retry.
        # NOTE(review): the finally re-runs html_report even after a
        # successful try, so the report is built/opened twice — confirm intended.
        try:
            html_report(start_time, end_time, search.topic_id)
        except Exception:
            print('================== 没有生成文摘,优先生成文摘 ====================')
            time.sleep(5)
            get_abs(start_time, end_time, search.topic_id)
        finally:
            html_report(start_time, end_time, search.topic_id)
        print('================== 已经弹出了HTML报告 ====================')
        time.sleep(3)
        # Offer to mail the HTML report for archival.
        reply = QMessageBox.question(self, '来自邮件系统:',
                                     "需要讲HTML发送到邮箱作为历史保存嘛?(建议发送)\n如要发送,请关闭相关文件",
                                     QMessageBox.Yes | QMessageBox.No,
                                     QMessageBox.Yes)
        if reply == QMessageBox.Yes:
            try:
                mail_report(start_time, end_time, search.topic_id)
                print('================== 已发送相关文件 ====================')
            except Exception:
                print('发送失败,请检查')
        else:
            pass
        self.update_info(' {} : {} 分析完成 请检查\n相关文件存放在data下面,如有需要请自行查看'.format(key_words, search.topic_id))
    except Exception:
        self.warning_message('有错误发生,请检查!\n建议增加单词分析的数据量(增大时间跨度)')
        self.img_change0.setStyleSheet("image: url(./data/tmp4design/舆情监控蓝500.png);")
        self.setWindowIcon(QIcon('./data/tmp4design/负面舆情帽子蓝.png'))
        return
    # Blue icon = idle again.
    self.img_change0.setStyleSheet("image: url(./data/tmp4design/舆情监控蓝500.png);")
    self.setWindowIcon(QIcon('./data/tmp4design/负面舆情帽子蓝.png'))
    clear_all_var()
    return
def mail_report(a, b, topic_id):
    """Email the analysis artifacts: both word-cloud images, the CSV abstract
    and the HTML report as attachments, plus the images inline in the body.

    :param a: start time (used in file names)
    :param b: end time
    :param topic_id: topic identifier
    :return: None
    """
    now_path = os.getcwd()
    path = now_path.replace('\\', '/')
    x_code = "your code"  # SMTP authorization code placeholder
    img1_path = path + "/data/wordcloud/wc_tf_idf_topic{}.png".format(topic_id)
    img2_path = path + "/data/wordcloud/wc_k_means_topic{}_result.png".format(
        topic_id)
    csv_file = path + '/data/textrank/topic{}_{}-{}abstract.csv'.format(
        topic_id, a, b)
    html_file = path + '/data/reportforms/report_topic{}_{}-{}.html'.format(
        topic_id, a, b)
    content = MIMEMultipart()
    # Attachments — bug fix: every file is now opened in a with-block;
    # previously the open(...).read() pattern leaked all six handles.
    with open(img1_path, 'rb') as f:
        image1 = MIMEImage(f.read(), _subtype='octet-stream')
    image1.add_header('Content-Disposition', 'attachment', filename=img1_path)
    content.attach(image1)
    with open(img2_path, 'rb') as f:
        image2 = MIMEImage(f.read(), _subtype='octet-stream')
    image2.add_header('Content-Disposition', 'attachment', filename=img2_path)
    content.attach(image2)
    with open(csv_file, 'rb') as f:
        csv1 = MIMEApplication(f.read())
    csv1.add_header('Content-Disposition', 'attachment', filename=csv_file)
    content.attach(csv1)
    with open(html_file, 'rb') as f:
        html1 = MIMEApplication(f.read())
    html1.add_header('Content-Disposition', 'attachment', filename=html_file)
    content.attach(html1)
    # Inline copies of the images, referenced by cid: in the HTML body.
    with open(img1_path, 'rb') as im1:
        msg_image1 = MIMEImage(im1.read())
    msg_image1.add_header('Content-ID', '<image1>')
    content.attach(msg_image1)
    with open(img2_path, 'rb') as im2:
        msg_image2 = MIMEImage(im2.read())
    msg_image2.add_header('Content-ID', '<image2>')
    content.attach(msg_image2)
    # Bug fix: the </h1> closing tags mismatched their <h2> openers.
    msg = '''
    <h1>舆情分析简报:</h1>
    <h2 style="color:red">This is a tf-idf:</h2>
    <p><img src="cid:image1" width=75% /></p>
    <h2 style="color:blue">This is a k-means:</h2>
    <p><img src="cid:image2" width=75% /></p>
    <p>如果想看详细报告,请手动下载附件4:HTML网页报告</p>
    '''
    message = MIMEText(msg, 'html', 'utf-8')
    content.attach(message)
    content['Subject'] = '舆情监控系统 汇报 20分钟定时 模式 {}'.format(
        check_time(time.time()))
    content['To'] = 'to [email protected]'
    content['From'] = "from [email protected]"
    server = smtplib.SMTP_SSL("smtp.qq.com", port=465)
    try:
        server.login("from [email protected]", x_code)
        print('login success!!!')
        server.sendmail("from [email protected]", 'to [email protected]',
                        content.as_string())
        print('Send Successful!')
    except Exception:
        # Typo fixed: the directory is "reportforms", not "reporforms".
        print(
            '邮箱出现了一点异常,数据保存在文件夹下data文件中的reportforms中\n文件名为:report_topic{}_{}-{}.html'
            .format(topic_id, a, b))
    finally:
        # Bug fix: the SMTP connection was never closed.
        try:
            server.quit()
        except Exception:
            pass
        clear_all_var()
    return