예제 #1
0
def _parse_like_count(raw):
    """Convert one stored like-count value to an integer.

    The persisted values are not uniform: they may be an int, a numeric
    string, or a string abbreviated with the character '万' (ten thousand),
    e.g. '1.2万'. A missing value (None) counts as 0.
    """
    if raw is None:
        return 0
    if isinstance(raw, str) and '万' in raw:
        # round() rather than truncation: the float product can land just
        # below the intended integer (e.g. 11299.999999999998).
        return round(float(raw.split('万')[0]) * 10000)
    return int(raw)


def update_num_of_comment_commentlike():
    """Recompute ``comment_num`` and ``commentlike_num`` for rows of ``content``.

    For every row whose ``comment`` column holds more than two characters,
    the column text is parsed as a Python literal (a list of dicts, one per
    comment). The number of comments and the sum of their 'elected' values
    are written back to the row. Rows with a NULL or near-empty ``comment``
    are left untouched. The elapsed time is printed at the end.
    """
    # Function-scope import so this block does not depend on the file header.
    import ast

    time0 = time.time()
    sql = "SELECT id, `comment` FROM content"
    result, _ = dbutil.query_with_sql_rowcount(conn, sql)
    for content_id, comment in result:
        # Two characters or fewer means an empty list ("[]") or nothing at all.
        if not comment or len(comment) <= 2:
            continue
        # NOTE(review): this used to split the text on '}, {' and eval() each
        # piece. eval() executes arbitrary expressions and the text comes from
        # scraped data, so it is replaced by ast.literal_eval, which only
        # accepts literals. Parsing the whole list at once also survives
        # '}, {' occurring inside a nested value or a comment body.
        comment_list = ast.literal_eval(comment)
        comment_like = sum(
            _parse_like_count(discuss.get('elected')) for discuss in comment_list
        )
        sql2 = "update content set comment_num = %d, commentlike_num = %d WHERE id = %d" \
               % (len(comment_list), comment_like, content_id)
        dbutil.exec_sql(conn, sql2)

    print('用时   ', time.time()-time0)
예제 #2
0
def update_readnum():
    """Expand abbreviated read counts in ``content.readnum`` to plain integers.

    Selects every row whose ``readnum`` contains the character '万'
    (ten thousand), multiplies the number in front of it by 10000 and writes
    the integer result back. The elapsed time is printed at the end.
    """
    time0 = time.time()
    sql = "select id, readnum from content where readnum like '%万%'"
    result, _ = dbutil.query_with_sql_rowcount(conn, sql)
    for content_id, readnum in result:
        # round() instead of int(): binary float error can leave the product
        # just under the intended value (float('1.13') * 10000 is
        # 11299.999999999998), which int() would truncate to 11299.
        expanded = round(float(readnum.split('万')[0]) * 10000)
        sql2 = "update content set readnum= %s where id = %d" % (expanded, content_id)
        dbutil.exec_sql(conn, sql2)

    print('用时   ', time.time()-time0)
예제 #3
0
def drag_index_to_tabel():
    """Copy the ``idx`` query parameter of each article URL into ``content.idx``.

    For every row of ``content`` the digits following '&idx=' in
    ``contenturl`` are parsed and stored in the ``idx`` column. Rows whose
    URL is empty, has no '&idx=' marker, or has no digits after it are
    skipped instead of raising or storing an unrelated character.
    """
    marker = '&idx='
    sql = "SELECT id, contenturl FROM content"
    result, _ = dbutil.query_with_sql_rowcount(conn, sql)
    for content_id, contenturl in result:
        if not contenturl:
            continue
        start = contenturl.find(marker)
        if start == -1:
            # No idx parameter: slicing from -1 + 5 would read a random character.
            continue
        # Collect every leading ASCII digit so multi-digit values are kept whole.
        digits = ''
        for ch in contenturl[start + len(marker):]:
            if ch not in '0123456789':
                break
            digits += ch
        if not digits:
            continue
        update_sql = 'update content set idx= %d where id = %d' % (int(digits), content_id)
        dbutil.exec_sql(conn, update_sql)
예제 #4
0
def shard_action(driver, max_requests=180):
    """Fetch the message history of pending official accounts via the chat UI.

    Reads (biz, nickname, history_offset) rows from ``bizinfo``, then for each
    account repeatedly sends a ``profile_ext?action=getmsg`` URL into the
    chat box, clicks it, and lets ``get_article`` process the result. After
    every request the account's ``history_offset`` is advanced by 10 and
    persisted.

    The inner loop for an account ends when ``get_article`` returns
    'cannot_continue' (the elapsed seconds are then stored with
    ``dbutil.update_bizinfo_consume``) or 'banned' (which also stops the
    whole crawl), or when the request counter exceeds ``max_requests``.

    Args:
        driver: UI automation driver handed to the ``utils`` helpers and
            ``get_article``. It is always quit before this function returns,
            including when an exception is raised.
        max_requests: Request budget for the whole run. Requests are sent
            while the counter is <= this value, so up to ``max_requests + 1``
            requests can be issued. Defaults to 180, the previously
            hard-coded limit.
    """
    logging.info("【开始自动获取公众号所有的历史消息】")
    try:
        sql = 'SELECT biz,nickname,history_offset from bizinfo WHERE spider=0 and id between 1 and 58'
        official_accounts = dbutil.query(conn, sql)
        # Open the chat dialog that the URLs are sent into.
        utils.enter_talkbox(driver, 'com.tencent.mm:id/b4m')

        count = 0  # number of requests issued so far
        logging.info('查询总量:%d', len(official_accounts))
        banned = False
        for (biz, nickname, offset) in official_accounts:
            if banned or count > max_requests:
                logging.info('接口总访问量:%d', count)
                break
            time_start = time.time()
            logging.info('----------------当前测试biz:%s', biz)
            logging.info('----------------当前测试nickname:%s', nickname)
            while count <= max_requests:
                count += 1
                # The message carries the URL plus the current offset on a second line.
                bizurl = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=' \
                         + biz + '&f=json&offset=' + str(offset) + '&count=10' + '\n' + str(offset)
                utils.send_msg(driver, bizurl)
                # Click the link that was just sent.
                utils.click_last_msg_in_talkbox(driver, 'com.tencent.mm:id/nl')
                # Collect the article URLs from the opened page.
                can_continue = get_article(driver, biz)
                # Persist the next offset so a later run resumes from here.
                offset += 10
                sql = "update bizinfo set history_offset= '{}' where biz= '{}'".format(
                    offset, biz)
                dbutil.exec_sql(conn, sql)
                logging.info('下一个偏移量:%s', offset)
                if can_continue == 'cannot_continue':
                    sum_time = int(time.time() - time_start)
                    logging.info('单个公众号采集历史消息花费时间:%s', sum_time)
                    dbutil.update_bizinfo_consume(conn, sum_time, biz)
                    break
                elif can_continue == 'banned':
                    banned = True
                    break
    finally:
        # Release the driver session even when a step above raised.
        driver.quit()
예제 #5
0
def delete_white_line():
    """
    Collapse newline runs in ``content.digest`` and trim surrounding whitespace.

    Every run of two or more consecutive newlines becomes a single newline,
    and leading/trailing whitespace is removed. Rows whose digest is NULL or
    empty, or already clean, are not written back. The elapsed time is
    printed at the end.
    """
    time0 = time.time()
    sql = "SELECT id,digest FROM content"
    result = dbutil.query_with_sql(conn, sql)
    for (content_id, digest) in result:
        # A NULL digest cannot be escaped and an empty one has nothing to clean.
        if not digest:
            continue
        cleaned = re.sub("\n{2,}", "\n", digest.strip("\n")).strip()
        if cleaned == digest:
            continue
        update_sql = "update content set digest = '{}' where id = {}"\
            .format(pymysql.escape_string(cleaned), content_id)
        dbutil.exec_sql(conn, update_sql)
    print('用时   ', time.time()-time0)
예제 #6
0
def _collapse_full_stops(text):
    """Strip leading/trailing '。', collapse repeated '。', then trim whitespace."""
    return re.sub("。{2,}", "。", text.strip("。")).strip()


def delete_white_space():
    """
    Remove redundant '。' characters from ``strong_content`` and ``color_content``.

    For each column that has content, leading and trailing '。' are removed,
    runs of two or more '。' are collapsed to one, and surrounding whitespace
    is trimmed. Only columns whose value actually changes are written back,
    so a NULL or empty column is never passed to the SQL escaper. The
    elapsed time is printed at the end.
    """
    time0 = time.time()
    sql = "SELECT id,strong_content, color_content FROM content"
    result = dbutil.query_with_sql(conn, sql)
    for (content_id, strong_content, color_content) in result:
        assignments = []
        if strong_content:
            cleaned = _collapse_full_stops(strong_content)
            if cleaned != strong_content:
                assignments.append(
                    "strong_content = '{}'".format(pymysql.escape_string(cleaned)))
        if color_content:
            cleaned = _collapse_full_stops(color_content)
            if cleaned != color_content:
                assignments.append(
                    "color_content='{}'".format(pymysql.escape_string(cleaned)))
        if not assignments:
            continue
        update_sql = "update content set {} where id = {}"\
            .format(" , ".join(assignments), content_id)
        dbutil.exec_sql(conn, update_sql)
    print('用时   ', time.time()-time0)
예제 #7
0
        time_local = time.localtime(mintime)
        # 转换成新的时间格式(2016-05-05 20:28:54)
        mintime_format = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
        sql2 = "select nickname from bizinfo where biz='{}'".format(biz)
        nickname = dbutil.query_with_sql_one(conn, sql2)
        if nickname:
            sql3 = "SELECT MAX(datetime) as maxtime FROM content where biz = '{}'".format(
                biz)
            result3 = dbutil.query_with_sql_one(conn, sql3)
            maxtime = result3[0]
            # 转换成localtime
            time_local = time.localtime(maxtime)
            # 转换成新的时间格式(2016-05-05 20:28:54)
            maxtime_format = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
            meta = {
                'mintime': mintime_format,
                'maxtime': maxtime_format,
                'nickname': nickname
            }
            sql4 = "update bizinfo set mintime='{}', maxtime='{}' where biz='{}' "\
                .format(mintime_format, maxtime_format, biz)
            exec_sql = dbutil.exec_sql(conn, sql4)
            data[biz] = meta
        else:
            print('未查询到结果')

    print(len(data))
    for key in data.keys():
        print(key)
        print(data[key])