def wechat0327():
    """
    Append the time of the first message of every talker to ``firstTime.txt``.

    For each row returned by ``select_distinct()`` on ``wechat_message`` the
    first ``createTime`` (rows ordered by ``createTime``) is formatted as local
    time and written next to the talker's nickname from ``wechat_contact``.

    :return: None
    """
    message_conn = cd.MySQLCommand()
    message_conn.connectMysql(table="wechat_message")
    contact_conn = cd.MySQLCommand()
    contact_conn.connectMysql(table="wechat_contact")
    try:
        chatroomNums = message_conn.select_distinct()
        with open("firstTime.txt", "a", encoding="utf-8") as f:
            for chatroomNum in chatroomNums:
                situation = "WHERE talker = '%s'" % chatroomNum
                cursor = message_conn.select_order(
                    ["createTime"], situation, order_title="createTime")
                informationTime = cursor.fetchone()[0]
                # createTime is divided by 1000 before conversion, i.e. it is
                # treated as a millisecond timestamp.
                firstTime = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.localtime(int(informationTime) / 1000))
                # The literal previously had no "%s" placeholder, so the "%"
                # operator raised TypeError instead of building the query.
                name_cursor = contact_conn.select_order(
                    ["nickname"], "WHERE username = '%s'" % chatroomNum)
                chatroomName = name_cursor.fetchone()[0]
                f.write(chatroomName + "\t第一次发言:\t" + firstTime + "\n")
    finally:
        # Release both connections even when a query or the file write fails.
        contact_conn.closeMysql()
        message_conn.closeMysql()
def getFile(chatroomNum):
    """
    Fetch the chat records of one chat room identified by its number.

    Opens one connection for ``wechat_message`` and one for ``wechat_contact``,
    hands both to ``getChatroomContent`` and closes them afterwards, also when
    the processing raises.

    :param chatroomNum: identifier of the chat room to process
    :return: None
    """
    # Connection to the message table.
    message_conn = cd.MySQLCommand()
    message_conn.connectMysql(table="wechat_message")
    try:
        # Connection to the contact table.
        contact_conn = cd.MySQLCommand()
        contact_conn.connectMysql(table="wechat_contact")
        try:
            getChatroomContent(message_conn, contact_conn, chatroomNum)
        finally:
            contact_conn.closeMysql()
    finally:
        message_conn.closeMysql()
def saveResult(chatroomId, chatroomName):
    """
    Process every untagged ``wechat_word`` row that belongs to one chat room.

    Rows with a non-empty ``atList`` are handed to ``dealByNickName`` together
    with the "/"-separated names; all other rows are passed to
    ``getInformationBymsgId`` with empty name lists.

    :param chatroomId: talker id used to select the chat room's messages
    :param chatroomName: chat room name, forwarded to ``dealByNickName``
    :return: None
    """
    # Open the database connection.
    word_conn = cd.MySQLCommand()
    word_conn.connectMysql(table="wechat_word")
    try:
        # Cross-table query so that rows are handled one chat room at a time,
        # which reduces repeated user lookups.
        sql = "SELECT msgId, context, atList From wechat_word where tag = 0 and msgId in (SELECT msgId from wechat_message where talker = '%s')" % chatroomId
        word_cursor = word_conn.cursor
        res = word_cursor.execute(sql)
        print("结果:", res)
        while True:
            res = word_cursor.fetchone()
            # No more rows: stop.
            if res is None:
                break
            msgId = res[0]
            atList = res[2]
            print("第%s条信息正在处理" % msgId)
            # Truthiness covers both None and "". The previous `is not ""`
            # compared identity with a literal, which is not reliable.
            if atList:
                dealByNickName(chatroomId, chatroomName, res, atList.split("/"))
                continue
            # Nobody was mentioned: store the row with empty name lists.
            getInformationBymsgId(update_conn, msgId, [], [])
    finally:
        # Close the database connection even if processing a row fails.
        word_conn.closeMysql()
def speed_word_vector(coreNum):
    """
    Run ``get_word_vector`` over ``wechat_word`` rows in batches of threads.

    ``coreNum`` pairs of connections (``tc_word_vec`` / ``wechat_vector``) are
    opened. Rows with ``msgId > 90909`` are then read one per connection pair,
    each handled in its own thread, and every batch is joined before the next
    one starts. The shared ``ignore_word`` set passed to the workers is written
    to ``data/ignore_word.txt`` at the end.

    :param coreNum: number of worker threads / connection pairs per batch
    :return: None
    :raises ValueError: if ``coreNum`` is smaller than 1 (an empty pool would
        otherwise loop forever without reading a single row)
    """
    if coreNum < 1:
        raise ValueError("coreNum must be at least 1")

    word_conn = cd.MySQLCommand()
    word_conn.connectMysql(table="wechat_word")
    opened = [word_conn]  # every connection that has to be closed again
    try:
        situation = "where msgId > %s" % str(90909)
        word_cursor = word_conn.select_order(["msgId", "jieba_word"], situation=situation)
        ignore_word = set()

        # Open one connection pair per worker.
        workers = []
        for _ in range(coreNum):
            vector_conn = cd.MySQLCommand()
            vector_conn.db = "tencent_word_vec"
            vector_conn.connectMysql(table="tc_word_vec")
            opened.append(vector_conn)
            new_conn = cd.MySQLCommand()
            new_conn.connectMysql(table="wechat_vector")
            opened.append(new_conn)
            workers.append((vector_conn, new_conn))

        exhausted = False
        while not exhausted:
            batch = []
            for vector_conn, new_conn in workers:
                row = word_cursor.fetchone()
                # Explicit end-of-data check instead of catching the TypeError
                # raised by unpacking None.
                if row is None:
                    exhausted = True
                    break
                msgId, words = row
                th = threading.Thread(
                    target=get_word_vector,
                    args=(vector_conn, new_conn, words, ignore_word))
                th.start()
                print("第%s条信息开始处理!" % str(msgId))
                batch.append(th)
            # Wait for the whole batch so no connection is used by two threads.
            for th in batch:
                th.join()

        with open("data/ignore_word.txt", "w", encoding="utf-8") as f:
            for word in ignore_word:
                f.write(str(word) + "\n")
    finally:
        # Worker connections first, the reading connection last.
        for conn in opened[1:]:
            conn.closeMysql()
        word_conn.closeMysql()
def multi_run(coreNum, targetTable, targetFunction):
    """
    Apply ``targetFunction`` to every ``wechat_message`` row using threads.

    ``coreNum`` connections to ``targetTable`` are opened. Rows
    (``msgId``, ``type``, ``talker``, ``content``) are read one per connection,
    each handled by ``targetFunction(connection, row)`` in its own thread, and
    the whole batch is joined before the next batch starts.

    :param coreNum: number of threads / connections per batch
    :param targetTable: name of the table the worker connections are opened on
    :param targetFunction: callable run in each thread as
        ``targetFunction(connection, message)``
    :return: None
    :raises ValueError: if ``coreNum`` is smaller than 1
    """
    if coreNum < 1:
        raise ValueError("coreNum must be at least 1")

    # Connection used to read the message table.
    message_conn = cd.MySQLCommand()
    message_conn.connectMysql(table="wechat_message")
    pool = []
    try:
        message_cursor = message_conn.select_order(["msgId", "type", "talker", "content"])
        for _ in range(coreNum):
            multi_conn = cd.MySQLCommand()
            multi_conn.connectMysql(table=targetTable)
            pool.append(multi_conn)

        exhausted = False
        while not exhausted:
            batch = []
            for multi_conn in pool:
                message = message_cursor.fetchone()
                # All rows consumed: finish the current batch and stop.
                if message is None:
                    exhausted = True
                    break
                th = threading.Thread(target=targetFunction, args=(multi_conn, message))
                th.start()
                batch.append(th)
            # Join every thread of the batch. Joining only the last one let
            # earlier workers overlap with the next batch (or the final close)
            # on the same connection, and failed with NameError when no thread
            # had been started at all.
            for th in batch:
                th.join()
    finally:
        # Close the worker connections, then the reading connection.
        for conn in pool:
            conn.closeMysql()
        message_conn.closeMysql()
def multiThread():
    """
    Start one ``getFile`` thread per chat room to hide I/O latency.

    Only talkers whose id contains ``"chatroom"`` are processed; private
    talkers such as "weixin" are skipped. The started threads are not joined
    here.

    :return: None
    """
    # Connection to the message table, used only to list the talkers.
    message_conn = cd.MySQLCommand()
    message_conn.connectMysql(table="wechat_message")

    for row in message_conn.select_distinct():
        talker = row[0]
        # Group ids contain "chatroom"; anything else is a private talker.
        if "chatroom" in talker:
            threading.Thread(target=getFile, args=(talker, )).start()

    message_conn.closeMysql()
def getStopWords():
    """
    Print every word whose ``vector`` is ``'0'`` and append it to
    ``data/ignore_word.txt``.

    :return: None
    """
    contact_conn = cd.MySQLCommand()
    contact_conn.connectMysql(table="wechat_vector")
    try:
        contact_cursor = contact_conn.cursor
        sql = "SELECT word FROM wechat_vector WHERE vector = '0'"
        contact_cursor.execute(sql)
        # The file handle was used but never opened (its open() call was
        # commented out); open it here so that it is always closed again.
        with open("data/ignore_word.txt", "a", encoding="utf-8") as f:
            while True:
                row = contact_cursor.fetchone()
                # End of the result set. The row itself must be checked:
                # indexing a None row raises TypeError.
                if row is None:
                    break
                word = row[0]
                if word is None:
                    # NULL word column: nothing to write for this row.
                    continue
                print(word)
                f.write(word + '\n')
    finally:
        contact_conn.closeMysql()
def clear_wechat_message():
    """
    Delete the records of every talker with fewer than 20 messages.

    For each talker the value returned by ``cursor.execute`` for a select on
    ``wechat_message`` is treated as the number of matching rows. Below 20,
    the talker's rows are removed from ``wechat_message`` and ``wechat_sender``
    and the deletion is committed.

    :return: None
    """
    message_conn = cd.MySQLCommand()
    message_conn.connectMysql(table="wechat_message")
    try:
        for row in message_conn.select_distinct():
            chatroom = row[0]
            res = message_conn.cursor.execute(
                "select talker from wechat_message where talker = '%s'" % chatroom)
            print("*****", res)
            if res >= 20:
                continue
            message_conn.cursor.execute(
                "delete from wechat_message where talker = '%s'" % chatroom)
            message_conn.cursor.execute(
                "delete from wechat_sender where chatroom = '%s'" % chatroom)
            message_conn.conn.commit()
            # The literal previously had no "%s" placeholder, so "%" raised
            # TypeError right after the commit and aborted the whole cleanup.
            message_conn.cursor.execute(
                "select nickname from wechat_contact where username = '%s'" % chatroom)
            contact_row = message_conn.cursor.fetchone()
            # Fall back to the talker id when no contact row exists, so a
            # missing nickname cannot stop the remaining deletions.
            nickname = contact_row[0] if contact_row is not None else chatroom
            print(nickname, "*****已删除")
    finally:
        message_conn.closeMysql()
# -*- coding: utf-8 -*- """ 修改备注的工具类 """ from commonTools import ConnectDatabase as cd from commonTools import wechatContent as wc # 声明全局变量 message_conn = cd.MySQLCommand() message_conn.connectMysql(table="wechat_message") contact_conn = cd.MySQLCommand() contact_conn.connectMysql(table="wechat_contact") update_conn = cd.MySQLCommand() update_conn.connectMysql(table="wechat_word") def getInformationBymsgId(word_conn, msgId, name_list, nicknames): """ 根据msgId获取消息的相关信息 :param name_list: @的备注列表 :param word_conn: 数据库连接 :param msgId: 信息ID :param nicknames: 被@的用户名昵称列表 :return: """ title_list = ["content"] situation = "WHERE msgId = '%s'" % msgId message_conn.select_order(title_list=title_list, situation=situation) content = message_conn.cursor.fetchone()[0] wechat_content = wc.WechatContent(content) result = wechat_content.splitContent()