def _elected_to_count(raw):
    """Convert one stored ``'elected'`` (like count) value to an int.

    The persisted values are inconsistent: plain ints, numeric strings, and
    strings using the 万 (10,000) unit such as ``'1.2万'``. Missing or empty
    values count as 0.
    """
    if raw is None or raw == '':
        return 0
    if isinstance(raw, str) and '万' in raw:
        # round() instead of truncation: the float product may land just
        # below the intended integer.
        return int(round(float(raw.split('万')[0]) * 10000))
    return int(raw)


def update_num_of_comment_commentlike():
    """Refresh ``comment_num`` and ``commentlike_num`` in the ``content`` table.

    Each row's ``comment`` column is expected to hold the textual form of a
    list of dict literals, one dict per comment, whose ``'elected'`` entry is
    that comment's like count. Rows whose ``comment`` is empty or at most two
    characters long (e.g. ``'[]'``) are left untouched.

    Raises:
        ValueError, SyntaxError: if a ``comment`` value is not a valid Python
            literal.
    """
    # Local import: the file's import block is not part of this function.
    import ast

    time0 = time.time()
    sql = "SELECT id, `comment` FROM content"
    result, _ = dbutil.query_with_sql_rowcount(conn, sql)
    for row_id, comment in result:
        if not comment or len(comment) <= 2:
            continue
        # The comment text is scraped data, so it must never go through
        # eval(). literal_eval accepts literals only, and parsing the whole
        # list at once also copes with comment bodies containing '}, {'.
        comment_list = ast.literal_eval(comment)
        comment_like = sum(
            _elected_to_count(item.get('elected')) for item in comment_list
        )
        sql2 = "update content set comment_num = %d, commentlike_num = %d WHERE id = %d" \
            % (len(comment_list), comment_like, row_id)
        dbutil.exec_sql(conn, sql2)
    print('用时 ', time.time()-time0)
def update_readnum():
    """Rewrite ``readnum`` values that use the 万 (10,000) unit as plain integers.

    Selects every ``content`` row whose ``readnum`` contains ``万``, takes the
    number in front of it, multiplies by 10,000 and stores the result back.
    Prints the elapsed time when done.
    """
    time0 = time.time()
    sql = "select id, readnum from content where readnum like '%万%'"
    result, _ = dbutil.query_with_sql_rowcount(conn, sql)
    for row_id, readnum in result:
        # round() rather than int(): the float product can land a hair below
        # the intended integer, and truncation would then be off by one.
        plain = int(round(float(readnum.split('万')[0]) * 10000))
        sql2 = "update content set readnum= %s where id = %d" % (plain, row_id)
        dbutil.exec_sql(conn, sql2)
    print('用时 ', time.time()-time0)
def drag_index_to_tabel():
    """Populate ``content.idx`` from the ``idx`` parameter of ``contenturl``.

    For every ``content`` row, the digits following ``&idx=`` in the URL are
    parsed and written to the ``idx`` column. Rows whose URL is empty or has
    no ``&idx=`` parameter are skipped with a warning instead of being parsed
    from an unrelated position in the string.
    """
    sql = "SELECT id, contenturl FROM content"
    result, _ = dbutil.query_with_sql_rowcount(conn, sql)
    for row_id, contenturl in result:
        # \d+ reads the whole number, not just the first digit.
        found = re.search(r'&idx=(\d+)', contenturl or '')
        if found is None:
            logging.warning('no idx parameter in contenturl, id=%s', row_id)
            continue
        update_sql = 'update content set idx= %d where id = %d' % (
            int(found.group(1)), row_id)
        dbutil.exec_sql(conn, update_sql)
def shard_action(driver):
    """Walk the history-message listing of the selected official accounts.

    Reads ``biz``, ``nickname`` and ``history_offset`` for the ``bizinfo``
    rows matched by the query below, opens the chat box, and for each account
    repeatedly sends a ``profile_ext`` URL as a message, clicks it, and hands
    over to ``get_article``. After every request the offset is advanced by 10
    and persisted to ``bizinfo.history_offset``.

    Paging for an account stops when ``get_article`` returns
    ``'cannot_continue'`` (the elapsed time is then recorded through
    ``dbutil.update_bizinfo_consume``). The whole run stops when it returns
    ``'banned'`` or when the request budget is used up.

    Args:
        driver: UI automation driver; it is always shut down with
            ``driver.quit()`` before this function returns or raises.
    """
    max_requests = 180  # request budget for a single run
    logging.info("【开始自动获取公众号所有的历史消息】")
    sql = 'SELECT biz,nickname,history_offset from bizinfo WHERE spider=0 and id between 1 and 58'
    try:
        official_accounts = dbutil.query(conn, sql)
        # Enter the chat box.
        utils.enter_talkbox(driver, 'com.tencent.mm:id/b4m')
        count = 0  # number of requests issued so far
        logging.info('查询总量:%d' % len(official_accounts))
        banned = False
        for biz, nickname, offset in official_accounts:
            if banned or count > max_requests:
                logging.info('接口总访问量:%d' % count)
                break
            time_start = time.time()
            logging.info('----------------当前测试biz:' + str(biz))
            logging.info('----------------当前测试nickname:' + str(nickname))
            # Original note: crawl messages from within the last half year.
            while count <= max_requests:
                count += 1
                # The message is the URL followed by a newline and the offset.
                bizurl = ('https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={0}'
                          '&f=json&offset={1}&count=10\n{1}').format(biz, offset)
                # Send the message.
                utils.send_msg(driver, bizurl)
                # Click the link that was just sent.
                utils.click_last_msg_in_talkbox(driver, 'com.tencent.mm:id/nl')
                # Collect the article URLs.
                can_continue = get_article(driver, biz)
                # Advance and persist the offset.
                offset += 10
                update_sql = "update bizinfo set history_offset= '{}' where biz= '{}'".format(
                    offset, biz)
                dbutil.exec_sql(conn, update_sql)
                logging.info('下一个偏移量:%s' % offset)
                if can_continue == 'cannot_continue':
                    sum_time = int(time.time() - time_start)
                    logging.info('单个公众号采集历史消息花费时间:%s' % str(sum_time))
                    dbutil.update_bizinfo_consume(conn, sum_time, biz)
                    break
                if can_continue == 'banned':
                    banned = True
                    break
    finally:
        # Release the driver session even when a step above raises.
        driver.quit()
def delete_white_line():
    """Tidy ``content.digest``: collapse newline runs and trim the ends.

    For every row with a non-empty ``digest``, leading and trailing newlines
    are removed, each run of two or more newlines becomes a single newline,
    and surrounding whitespace is stripped. Rows that are empty, NULL, or
    already clean are not written back. Prints the elapsed time when done.
    """
    # pymysql.escape_string was dropped from the package namespace in
    # PyMySQL 1.0; the converters module is used here instead.
    from pymysql.converters import escape_string

    time0 = time.time()
    sql = "SELECT id,digest FROM content"
    result = dbutil.query_with_sql(conn, sql)
    for row_id, digest in result:
        if not digest:
            continue
        cleaned = re.sub("\n{2,}", "\n", digest.strip("\n")).strip()
        if cleaned == digest:
            continue  # nothing to change, skip the write
        # digest is scraped text, so it is escaped before being placed in SQL.
        update_sql = "update content set digest = '{}' where id = {}"\
            .format(escape_string(cleaned), row_id)
        dbutil.exec_sql(conn, update_sql)
    print('用时 ', time.time()-time0)
def delete_white_space():
    """Tidy ``strong_content`` and ``color_content`` in the ``content`` table.

    For each non-empty value, leading and trailing ``。`` characters are
    removed, each run of two or more ``。`` becomes a single ``。``, and
    surrounding whitespace is stripped. Only columns whose value actually
    changes are written; NULL or empty columns are left as they are. Prints
    the elapsed time when done.
    """
    # pymysql.escape_string was dropped from the package namespace in
    # PyMySQL 1.0; the converters module is used here instead.
    from pymysql.converters import escape_string

    time0 = time.time()
    sql = "SELECT id,strong_content, color_content FROM content"
    result = dbutil.query_with_sql(conn, sql)
    for row_id, strong_content, color_content in result:
        assignments = []
        for column, text in (('strong_content', strong_content),
                             ('color_content', color_content)):
            if not text:
                continue
            cleaned = re.sub("。{2,}", "。", text.strip("。")).strip()
            if cleaned != text:
                # Values are scraped text, so they are escaped before use.
                assignments.append(
                    "{} = '{}'".format(column, escape_string(cleaned)))
        if not assignments:
            continue
        update_sql = "update content set {} where id = {}".format(
            " , ".join(assignments), row_id)
        dbutil.exec_sql(conn, update_sql)
    print('用时 ', time.time()-time0)
time_local = time.localtime(mintime) # 转换成新的时间格式(2016-05-05 20:28:54) mintime_format = time.strftime("%Y-%m-%d %H:%M:%S", time_local) sql2 = "select nickname from bizinfo where biz='{}'".format(biz) nickname = dbutil.query_with_sql_one(conn, sql2) if nickname: sql3 = "SELECT MAX(datetime) as maxtime FROM content where biz = '{}'".format( biz) result3 = dbutil.query_with_sql_one(conn, sql3) maxtime = result3[0] # 转换成localtime time_local = time.localtime(maxtime) # 转换成新的时间格式(2016-05-05 20:28:54) maxtime_format = time.strftime("%Y-%m-%d %H:%M:%S", time_local) meta = { 'mintime': mintime_format, 'maxtime': maxtime_format, 'nickname': nickname } sql4 = "update bizinfo set mintime='{}', maxtime='{}' where biz='{}' "\ .format(mintime_format, maxtime_format, biz) exec_sql = dbutil.exec_sql(conn, sql4) data[biz] = meta else: print('未查询到结果') print(len(data)) for key in data.keys(): print(key) print(data[key])