Example #1
import time

def crawl_comment_page(mid):
    # start_url, base_url, get_page, comment, save_comments and crawler are
    # module-level helpers assumed to be in scope from the crawler project.
    limit = get_max_comment_page()  # upper bound on comment pages to fetch
    cur_page = 1
    next_url = ''
    while cur_page <= limit:
        cur_time = int(time.time()*1000)
        if cur_page == 1:
            url = start_url.format(mid, cur_time)
        else:
            url = base_url.format(next_url, cur_time)
        html = get_page(url, user_verify=False)
        comment_datas = comment.get_comment_list(html, mid)

        if not comment_datas and cur_page == 1:
            crawler.warning('Comments for weibo id {} were not crawled successfully, please check why'.format(mid))
            return

        save_comments(comment_datas)
        # Each page's URL is derived from the previous page's response, so the
        # pages have to be fetched sequentially; dispatching each page as a
        # separate network call is a poor fit here (mainly too cumbersome).
        next_url = comment.get_next_url(html)

        if not next_url:
            crawler.info('Comment crawling for weibo {} is complete'.format(mid))
            return
        cur_page += 1
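
For reference, a minimal sketch of what the two URL templates used above could look like. The query strings here are assumptions for illustration, not the project's confirmed endpoints: start_url takes the weibo id plus a millisecond timestamp, while base_url takes the pagination fragment parsed out of the previous page plus a timestamp.

# Hypothetical templates; the real project defines its own query strings.
start_url = 'https://weibo.com/aj/v6/comment/big?id={}&page=1&__rnd={}'
base_url = 'https://weibo.com/aj/v6/comment/big?{}&__rnd={}'

crawl_comment_page('4123456789012345')  # example mid; crawls all pages sequentially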
Example #2
import time

def crawl_comment_by_page(mid, page_num):
    # Crawl a single, explicitly numbered comment page; base_url, get_page,
    # comment, save_comments and wb_data are assumed project-level helpers.
    cur_time = int(time.time() * 1000)  # millisecond timestamp as cache buster
    cur_url = base_url.format(mid, page_num, cur_time)
    html = get_page(cur_url, user_verify=False)
    comment_datas = comment.get_comment_list(html, mid)
    save_comments(comment_datas)
    wb_data.set_weibo_comment_crawled(mid)  # note: marked on every page in this version
    return html
Example #3
import time

def crawl_comment_by_page(mid, page_num):
    # Variant of Example #2 that marks the weibo as crawled only once.
    cur_time = int(time.time() * 1000)  # millisecond timestamp as cache buster
    cur_url = base_url.format(mid, page_num, cur_time)
    html = get_page(cur_url, user_verify=False)
    comment_datas = comment.get_comment_list(html, mid)
    save_comments(comment_datas)
    if page_num == 1:
        # Mark the weibo only once, when its first comment page is fetched.
        wb_data.set_weibo_comment_crawled(mid)
    return html
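
Examples #2 and #3 are per-page workers, so a caller typically fetches page 1 first, derives the total page count from that HTML, and then walks the remaining pages; this is presumably why crawl_comment_by_page returns html. A minimal sketch of such a driver, assuming a hypothetical comment.get_total_page(html) parser helper:

def crawl_all_comment_pages(mid):
    # Page 1 also marks the weibo as crawled (see the guard in Example #3).
    first_html = crawl_comment_by_page(mid, 1)
    # get_total_page is a hypothetical helper that would read the page count
    # out of the pagination markup of the first page.
    total_page = comment.get_total_page(first_html)
    for page_num in range(2, total_page + 1):
        crawl_comment_by_page(mid, page_num)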