Beispiel #1
0
def k_means_mine(o_weight, o_features, k):
    """
    Run K-Means on the term weights and dump each cluster's top terms.

    For every cluster, the (at most) 50 feature words with the largest
    centroid components are printed and written, one per line, to
    ``./data/tmp4kmeans/Cluster<i>.txt``.

    :param o_weight: term weights, passed unchanged to ``my_k_means``
    :param o_features: feature words, indexed by centroid column
    :param k: number of clusters
    :return: None
    """
    print('================== KMenas ====================')

    # The second value returned by my_k_means is not needed here.
    centroids, _ = my_k_means(o_weight, k)

    # Per centroid: column indices sorted by descending component value.
    # NOTE(review): `.A` below presumably means centroids is a numpy
    # matrix -- confirm against my_k_means.
    ranked = centroids.argsort()[:, ::-1]

    for i in range(k):
        print("cluster %d:" % i, end='')
        cluster_path = "./data/tmp4kmeans/Cluster{}.txt".format(i)
        # `with` guarantees the file is closed even if a lookup/write fails.
        with open(cluster_path, "w", encoding='utf-8') as cluster_file:
            for j in ranked.A[i, :50]:
                print(' %s ' % o_features[j], end='')
                cluster_file.write(o_features[j] + '\n')
        print()
    print('需要词云处理的kmeans临时文件已经生成,\n正在等待可视化模块的调用,请稍候...')
    clear_all_var()
Beispiel #2
0
def analysis(start_time, end_time, topic_id, k):
    """
    Run the whole analysis pipeline: fetch, segment, TF-IDF, cluster, draw.

    :param start_time: start of the queried period
    :param end_time: end of the queried period
    :param topic_id: topic ID
    :param k: number of clusters
    :return: 0 when the first CONTENT entry cannot be read, otherwise 1;
             callers use it as a continue/stop flag
    """
    began = time.time()
    data = connect_mysql(start_time, end_time, topic_id)

    # Probe the first record; any failure is treated as "no data in range".
    try:
        print(data['CONTENT'][0])
    except Exception:
        print('您所查日期内并没有值,告辞')
        return 0

    weights, feature_words = tf_idf(jieba_cut(data))
    k_means_mine(weights, feature_words, k)

    # Word cloud built from term frequencies.
    wc4tf_idf(topic_id, k)
    # Word cloud built from the K-Means clusters.
    wc4k_means(topic_id, k)
    print("================== 词云已经生成 ====================")

    elapsed = time.time() - began
    print("分析结束!运行时间: %f" % elapsed + 's')
    clear_all_var()
    return 1
Beispiel #3
0
def get_stop_words():
    """
    Load the custom word-cloud stop words stored under ``data/setup``.

    :return: set with one entry per line of ``word_cloud_stop_words.txt``
             (trailing newline removed)
    """
    with open('./data/setup/word_cloud_stop_words.txt',
              encoding='utf-8') as stop_words_file:
        stop_words = {line.strip('\n') for line in stop_words_file}
    clear_all_var()
    # The set used to be built and then dropped, so every caller received
    # None; returning it is what makes the custom stop words take effect.
    return stop_words
Beispiel #4
0
def get_abstract(data, a, b, c):
    """
    Build a TextRank summary per article and save them as an Excel-readable CSV.

    Each CONTENT entry is stripped down to Chinese characters, a fixed set
    of full-width punctuation marks and digits, summarised with the top 3
    key sentences, and stored together with its UPTIME, TITLE and AUTHOR.
    Articles whose summary is empty are skipped. The result is written to
    ``./data/textrank/topic<c>_<a>-<b>abstract.csv``.

    :param data: records fetched from MySQL; must provide the CONTENT,
                 UPTIME, TITLE and AUTHOR columns, indexable by position
    :param a: start time (used in the file name)
    :param b: end time (used in the file name)
    :param c: topic ID (used in the file name)
    :return: None
    """
    tr4s = TextRank4Sentence()
    # Everything that is NOT a CJK ideograph, one of
    # 。 ; , : “ ” ( ) 、 ? 《 》 (full-width forms), or a digit is removed.
    # Punctuation is kept because sentence splitting needs it.
    junk_pattern = re.compile(
        "[^\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b0-9]")

    print('当前文章的摘要:')
    results = []
    for i in range(len(data['CONTENT'])):
        cleaned = junk_pattern.sub('', data['CONTENT'][i])
        tr4s.analyze(text=cleaned, lower=True)
        summary = ''.join(
            item.sentence for item in tr4s.get_key_sentences(num=3))
        if summary:
            results.append([
                data['UPTIME'][i], data['TITLE'][i], data['AUTHOR'][i], summary
            ])
    column_name = ['更新时间', '标题/题目', '作者', '摘要']

    tmp_text = pd.DataFrame(columns=column_name, data=results)
    # utf_8_sig writes a BOM so that Excel detects the encoding.
    tmp_text.to_csv('./data/textrank/topic{}_{}-{}abstract.csv'.format(
        c, a, b),
                    encoding='utf_8_sig')
    print('>>>>>>>>>>>>>>  已经保存到csv等待计算或查看  >>>>>>>>>>>>>>>')
    # NOTE(review): this pause dates from when the CSV was popped up in
    # Excel right here; kept so the pipeline timing stays unchanged.
    time.sleep(10)
    clear_all_var()
Beispiel #5
0
def wc4k_means(topic_id, k):
    """
    Render one word cloud per K-Means cluster and stitch them side by side.

    Each ``./data/tmp4kmeans/Cluster<i>.txt`` is turned into an image of
    width ``1920 // k`` and height 1080 saved as
    ``./data/wordcloud/wc_k_means_topic<topic_id>_<i>.png``; the slices are
    then pasted left to right into
    ``./data/wordcloud/wc_k_means_topic<topic_id>_result.png``.

    :param topic_id: topic ID, used in the output file names
    :param k: number of clusters (must be at least 1)
    :raises ValueError: if ``k`` is smaller than 1
    """
    if k < 1:
        # Fail early instead of with ZeroDivisionError / IndexError below.
        raise ValueError('k must be at least 1, got {!r}'.format(k))

    slice_width = 1920 // k
    # The stop-word file does not change between slices: read it once.
    stop_words = get_stop_words()
    slice_paths = []
    for i in range(k):
        with open('./data/tmp4kmeans/Cluster{}.txt'.format(i),
                  'r',
                  encoding='utf-8') as cluster_file:
            words = {line.strip('\n') for line in cluster_file}
        text = ' '.join(words)
        wc = WordCloud(font_path="./data/msyh.ttc",
                       max_words=100,
                       width=slice_width,
                       height=1080,
                       background_color='white',
                       stopwords=stop_words,
                       colormap=get_color())
        word_cloud = wc.generate(text)
        # Write the slice to disk.
        slice_path = "./data/wordcloud/wc_k_means_topic{}_{}.png".format(
            topic_id, i)
        word_cloud.to_file(slice_path)
        slice_paths.append(slice_path)
        # Draw the slice on the current matplotlib figure (not shown).
        plt.imshow(word_cloud)
        plt.axis("off")
        print("第 {}/{} 张分图已经切片完成".format(i + 1, k))

    print('开始拼图')
    # The first slice defines the mode and tile size of the final canvas.
    with Image.open(slice_paths[0]) as first_slice:
        width, height = first_slice.size
        mode = first_slice.mode
    result = Image.new(mode, (width * len(slice_paths), height))
    for i, slice_path in enumerate(slice_paths):
        # Open each slice in a context manager so its file handle is released.
        with Image.open(slice_path) as im:
            result.paste(im, box=(i * width, 0))
    result.save(
        './data/wordcloud/wc_k_means_topic{}_result.png'.format(topic_id))
    print("图片拼接完成,正在弹出图片,请稍候...")
    clear_all_var()
0
def html_report(a, b, c):
    """
    Assemble the HTML report for one topic/period and open it.

    The charts are rendered to
    ``./data/reportforms/report_topic<c>_<a>-<b>.html``; the closing
    ``</body></html>`` is then cut off and a static section (both word-cloud
    images and a link to the abstract CSV) is appended before the document
    is closed again. Finally the file is opened with ``os.startfile``.

    :param a: start time
    :param b: end time
    :param c: topic id
    :return: None
    """
    report_rel = 'data/reportforms/report_topic{}_{}-{}.html'.format(c, a, b)
    report_path = './' + report_rel

    x, y = get_tpm2(a, b, c)
    print(a, b, c, type(a), type(b), type(c))
    data = get_tpm(a, b, c)
    page = Page(layout=Page.SimplePageLayout, page_title='舆情分析统计报告')
    page.add(
        scatter_sentiment(x, y),
        pie_html(data),
    )
    page.render(report_path)
    print('==================  动态网页组装完成,已保存 ==================')

    # Static tail of the report. The headings used to be closed with
    # </h1> / </h2> although they are opened as <h3>.
    sao_operation = '''
    <h3 style="text-align:center">TF-IDF 关键词/主旨词汇提取</h3>
    <p style="text-align:center"><img src="../wordcloud/wc_tf_idf_topic{}.png" width=85% /></p>
    <h3 style="text-align:center">K-Means 聚类词汇展示</h3>
    <p style="text-align:center"><img src="../wordcloud/wc_k_means_topic{}_result.png" width=85% /></p>
    <h3 style="text-align:center">其他 资料</h3>
    <p style="text-align:center"><a href="../textrank/topic{}_{}-{}abstract.csv">点击打开您查询日期内的csv摘要文件</a></p>
    <p style="text-align:center"> 注: 以上所有报告均已去除无效值(仅含文本数据) </p>
    </body>
    </html>
    '''.format(c, c, c, a, b)

    with open(report_path, 'r', encoding='utf-8') as h:
        # Drop the rendered document's closing tags; the tail re-adds them.
        content = h.read().replace('</body>\n</html>', '')
    with open(report_path, 'w', encoding='utf-8') as h:
        h.write(content + sao_operation)

    # os.startfile is given an absolute path with forward slashes.
    cwd = os.getcwd().replace('\\', '/')
    os.startfile(cwd + '/' + report_rel)
    clear_all_var()
Beispiel #7
0
def analysis_sentiment(a, b, c):
    """
    Append a sentiment score column to the abstract CSV of a topic/period.

    Before touching the file, any running ``excel.exe`` is asked to close
    (checked via ``tasklist`` / ``taskkill``) until none is listed. Every
    entry of the summary column is then scored with SnowNLP and the scores
    are stored in a new ``sentiment`` column of the same CSV.

    :param a: start time
    :param b: end time
    :param c: topic ID
    :return: None
    """
    excel_probe = 'tasklist|find /i "excel.exe"'

    # Guard against read errors caused by a file that is still open.
    time.sleep(3)
    # The probe exits with 0 while an excel.exe process is listed.
    while os.system(excel_probe) == 0:
        os.system('taskkill /IM excel.exe')
        time.sleep(3)
        print('>>>> 正在尝试关闭了已经打开的csv文件, >>>>> >>>> >>>>')
    print('>>>> 不存在打开的csv文件,开始执行情感分析 >>>>> >>>> >>>>')

    csv_file = './data/textrank/topic{}_{}-{}abstract.csv'.format(c, a, b)
    # low_memory=False keeps pandas from emitting a warning.
    csv_data = pd.read_csv(csv_file, low_memory=False)

    # Sentiment is computed on the summaries only; scoring the raw
    # database content would be far too much data.
    csv_data['sentiment'] = [
        SnowNLP(summary).sentiments for summary in csv_data['摘要']
    ]
    csv_data.to_csv(csv_file, encoding='utf_8_sig', index=False)
    print('............  情感值追加注入完成   ...........')

    clear_all_var()
Beispiel #8
0
def get_abs(start_time, end_time, topic_id):
    """
    Produce the abstract CSV for a topic/period and add sentiment scores.

    :param start_time: start time
    :param end_time: end time
    :param topic_id: topic id
    :return: None
    """
    began = time.time()

    records = data_get(start_time, end_time, topic_id)
    get_abstract(records, start_time, end_time, topic_id)

    # An open file cannot be rewritten, and an empty abstract leaves
    # nothing to score; analysis_sentiment deals with the former itself.
    analysis_sentiment(start_time, end_time, topic_id)

    elapsed = time.time() - began
    print(' 文本摘要和情感分析模块 消耗了 {} s'.format(elapsed))
    clear_all_var()
Beispiel #9
0
def wc4tf_idf(topic_id, k):
    """
    Draw the TF-IDF word cloud from the temporary frequency file.

    ``./data/tmp4tfidf/tmp4wordcloud.txt`` is read as space-separated
    ``word weight`` lines and rendered to
    ``./data/wordcloud/wc_tf_idf_topic<topic_id>.png``.

    :param topic_id: topic ID, used in the output file name
    :param k: unused; kept so the signature matches ``wc4k_means``
    """
    frequencies = {}
    with open('./data/tmp4tfidf/tmp4wordcloud.txt',
              encoding='utf-8') as freq_file:
        for line in freq_file:
            arr = line.split(" ")
            if len(arr) < 2:
                # Blank or malformed line: nothing to record.
                continue
            frequencies[arr[0]] = float(arr[1])
    wc = WordCloud(font_path="./data/msyh.ttc",
                   max_words=300,
                   width=1920,
                   height=1080,
                   background_color='white',
                   stopwords=get_stop_words())
    word_cloud = wc.generate_from_frequencies(frequencies)
    # Write the word-cloud image.
    word_cloud.to_file(
        "./data/wordcloud/wc_tf_idf_topic{}.png".format(topic_id))
    print('TF / IDF 词频图云已经完成,正在弹出图片,请稍候...')
    clear_all_var()
Beispiel #10
0
    def real_time_mode(self):
        """
        Run one real-time cycle: abstract, analysis, HTML report and mail.

        Standard output is redirected to ``self.hello``. The form values
        come from ``self.blank_check()``; the end time is always replaced
        by today's date. When the check box is ticked, the method re-arms
        itself through a 1200-second ``Timer`` and runs the whole pipeline;
        otherwise it falls back to ``self.analysis_btn_click()``.

        :return: None
        """
        sys.stdout = EmittingStream(textWritten=self.hello)
        key_words, start_time, end_time, k = self.blank_check()
        # blank_check signals problems with all-0 / all-1 tuples.
        if key_words == 0 and start_time == 0 and end_time == 0 and k == 0:
            QMessageBox.about(self, '无法查询', '请输入关键字,关键字是必须的')
            return
        if key_words == 1 and start_time == 1 and end_time == 1 and k == 1:
            QMessageBox.about(self, '无法查询', '麻烦正确输入日期')
            return

        # Real-time mode always analyses up to "today".
        end_time = datetime.datetime.now().strftime('%Y%m%d')
        if int(start_time) > int(end_time):
            self.info.setText('本系统不支持查询未来')
            return

        search = SearchTerm(key_words)
        # Temporary workaround while the IP/site lookup is broken: the
        # topic id is pinned instead of being resolved from the keywords.
        # 19570564 Wuhan, 19639859 overseas, 19622792 pain, 19564412 love
        search.topic_id = "19564412"
        note = 'IP或网站出现问题,临时解决措施:测试模式'
        print(search.keywords, search.topic_id, note)

        if not self.checkBox.isChecked():
            self.analysis_btn_click()
            return

        try:
            # Re-arm first so the next round starts 1200 s (20 min) after
            # this one started, however long the pipeline takes.
            global t
            t = Timer(1200, self.real_time_mode)
            t.start()
            # Test mode: the spider is skipped and local data is used.
            print("测试模式:使用本地数据,跳过数据获取")
            time.sleep(5)
            get_abs(start_time, end_time, search.topic_id)
            time.sleep(5)
            # int(k) mirrors analysis_btn_click: the cluster count is used
            # with range() and integer arithmetic further down.
            analysis(start_time, end_time, search.topic_id, int(k))
            time.sleep(5)
            html_report(start_time, end_time, search.topic_id)
            time.sleep(5)
            mail_report(start_time, end_time, search.topic_id)
            time.sleep(5)

            clear_all_var()

        except Exception as exc:
            # Best effort: a failed round is skipped, but the cause is
            # shown instead of being dropped.
            print('本轮出现了一个错误,跳过\n请检查日期内数据是否存在')
            print(repr(exc))
        return
Beispiel #11
0
    def analysis_btn_click(self):
        """
        Handle the "analyse" button: cluster, build the report, offer mailing.

        While the work is running the window shows its red "busy" artwork;
        the blue artwork is restored on every exit path after that point.
        If building the HTML report fails, the abstract is generated first
        and the report is built once more.

        :return: None
        """

        def restore_idle_look():
            # Put the blue (idle) artwork and window icon back.
            self.img_change0.setStyleSheet("image: url(./data/tmp4design/舆情监控蓝500.png);")
            self.setWindowIcon(QIcon('./data/tmp4design/负面舆情帽子蓝.png'))

        key_words, start_time, end_time, k = self.blank_check()
        # blank_check signals problems with all-0 / all-1 tuples.
        if key_words == 0 and start_time == 0 and end_time == 0 and k == 0:
            QMessageBox.about(self, '无法查询', '请输入关键字,关键字是必须的')
            return
        if key_words == 1 and start_time == 1 and end_time == 1 and k == 1:
            QMessageBox.about(self, '无法查询', '麻烦正确输入日期')
            return

        self.img_change0.setStyleSheet("image: url(./data/tmp4design/舆情监控红500.png);")
        self.setWindowIcon(QIcon('./data/tmp4design/网络舆情红.png'))
        sys.stdout = EmittingStream(textWritten=self.update_info)
        search = SearchTerm(key_words)
        # Temporary workaround while the IP/site lookup is broken: the
        # topic id is pinned instead of being resolved from the keywords.
        # 19570564 Wuhan, 19639859 overseas, 19575118 struggle, 19622792 pain
        search.topic_id = "19622792"
        note = 'IP或网站出现问题,临时解决措施:测试模式'
        print(search.keywords, search.topic_id, note)

        try:
            print('================== 数据分析:开始 ====================')

            cqc = analysis(start_time, end_time, search.topic_id, int(k))
            if cqc == 0:
                # analysis() found no data for the period.
                restore_idle_look()
                return
            print('================== 聚类完成,图片生成 ====================')
            time.sleep(3)

            try:
                html_report(start_time, end_time, search.topic_id)
            except Exception:
                print('================== 没有生成文摘,优先生成文摘 ====================')
                time.sleep(5)
                get_abs(start_time, end_time, search.topic_id)
                # Retry only after a failure. This call used to sit in a
                # `finally`, which rendered and opened the report twice
                # whenever the first attempt had already succeeded.
                html_report(start_time, end_time, search.topic_id)
            print('================== 已经弹出了HTML报告 ====================')
            time.sleep(3)

            reply = QMessageBox.question(self, '来自邮件系统:',
                                         "需要讲HTML发送到邮箱作为历史保存嘛?(建议发送)\n如要发送,请关闭相关文件", QMessageBox.Yes |
                                         QMessageBox.No, QMessageBox.Yes)
            if reply == QMessageBox.Yes:
                try:
                    mail_report(start_time, end_time, search.topic_id)
                    print('================== 已发送相关文件 ====================')
                except Exception:
                    print('发送失败,请检查')
            self.update_info(' {} : {} 分析完成 请检查\n相关文件存放在data下面,如有需要请自行查看'.format(key_words, search.topic_id))

        except Exception:
            self.warning_message('有错误发生,请检查!\n建议增加单词分析的数据量(增大时间跨度)')
            restore_idle_look()
            return
        restore_idle_look()
        clear_all_var()
        return
Beispiel #12
0
def mail_report(a, b, topic_id):
    """
    Mail the analysis results of one topic/period.

    The message carries four attachments (TF-IDF word cloud, K-Means word
    cloud, abstract CSV, HTML report -- in that order), the two word
    clouds again as inline images and a short HTML body referencing them.
    Login/send failures are reported on stdout and not raised; a missing
    file or a failed connection to the SMTP server does raise.

    :param a: start time (part of the CSV/HTML file names)
    :param b: end time (part of the CSV/HTML file names)
    :param topic_id: topic ID (part of all file names)
    :return: None
    """

    def read_bytes(file_path):
        # Read a whole file and release its handle right away.
        with open(file_path, 'rb') as handle:
            return handle.read()

    path = os.getcwd().replace('\\', '/')

    x_code = "your code"
    sender = "from [email protected]"
    recipient = 'to [email protected]'
    img1_path = path + "/data/wordcloud/wc_tf_idf_topic{}.png".format(topic_id)
    img2_path = path + "/data/wordcloud/wc_k_means_topic{}_result.png".format(
        topic_id)
    csv_file = path + '/data/textrank/topic{}_{}-{}abstract.csv'.format(
        topic_id, a, b)
    html_file = path + '/data/reportforms/report_topic{}_{}-{}.html'.format(
        topic_id, a, b)

    img1_bytes = read_bytes(img1_path)
    img2_bytes = read_bytes(img2_path)

    content = MIMEMultipart()

    # Attachments; the order matters because the body refers to
    # "attachment 4" as the HTML report.
    attachments = [
        (MIMEImage(img1_bytes, _subtype='octet-stream'), img1_path),
        (MIMEImage(img2_bytes, _subtype='octet-stream'), img2_path),
        (MIMEApplication(read_bytes(csv_file)), csv_file),
        (MIMEApplication(read_bytes(html_file)), html_file),
    ]
    for part, source_path in attachments:
        # Only the file name is advertised, not the sender's local path.
        part.add_header('Content-Disposition', 'attachment',
                        filename=os.path.basename(source_path))
        content.attach(part)

    # Inline copies referenced from the HTML body through cid: URLs.
    for content_id, raw in (('<image1>', img1_bytes), ('<image2>', img2_bytes)):
        inline_image = MIMEImage(raw)
        inline_image.add_header('Content-ID', content_id)
        content.attach(inline_image)

    # The <h2> headings used to be closed with </h1>.
    msg = '''
    <h1>舆情分析简报:</h1>
    <h2 style="color:red">This is a tf-idf:</h2>
    <p><img src="cid:image1" width=75% /></p>
    <h2 style="color:blue">This is a k-means:</h2>
    <p><img src="cid:image2" width=75% /></p>
    <p>如果想看详细报告,请手动下载附件4:HTML网页报告</p>
    '''

    message = MIMEText(msg, 'html', 'utf-8')
    content.attach(message)

    content['Subject'] = '舆情监控系统 汇报 20分钟定时 模式 {}'.format(
        check_time(time.time()))
    content['To'] = recipient
    content['From'] = sender

    server = smtplib.SMTP_SSL("smtp.qq.com", port=465)
    try:
        server.login(sender, x_code)
        print('login success!!!')
        server.sendmail(sender, recipient, content.as_string())
        print('Send Successful!')
    except Exception:
        print(
            '邮箱出现了一点异常,数据保存在文件夹下data文件中的reporforms中\n文件名为:report_topic{}_{}-{}.html'
            .format(topic_id, a, b))
    finally:
        # Always release the connection. No `return` here: returning from
        # `finally` would also swallow KeyboardInterrupt / SystemExit.
        try:
            server.quit()
        except (smtplib.SMTPException, OSError):
            server.close()
        clear_all_var()