Ejemplo n.º 1
0
def crawl_praise_page(mid):
    """Crawl the first praise page for ``mid`` and then the follow-up pages.

    The first page is fetched, parsed and stored, and the weibo is flagged
    as praise-crawled. Pages 2 to 4 are then fetched one after another
    through ``crawl_praise_by_page``; each request is given the
    ``ext_param`` produced by the previous page.

    Crawling stops (with an error log) as soon as a page yields no
    ``ext_param``, because the next request cannot be built correctly
    without it.

    :param mid: identifier formatted into the request URL and handed to the
        praise parser
    :return: None
    """
    # Called locally (not dispatched as a task) so that the result is
    # available immediately.
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)

    WbDataOper.set_weibo_praise_crawled(mid)

    # Why no app.send_task, falling back to sequential execution instead:
    # weibo praise now requires a parameter called max_id, and a request
    # without it returns something different from a normal browser.

    # Original note: should work after 5
    for page_num in range(2, 5):
        # ext_param (mainly max_id) is refreshed by every page and consumed
        # by the next one, so check it before *each* follow-up request, not
        # only before page 2.
        # NOTE(review): an empty ext_param may also simply mean "no more
        # pages" - confirm whether that case deserves an error log.
        if not ext_param:
            crawler.error(
                'fail to get praise page {page} ext_param, mid is {mid}'.format(
                    page=page_num, mid=mid))
            return
        # Only the refreshed ext_param is needed from the returned triple.
        _, _, ext_param = crawl_praise_by_page(mid, ext_param)
    return
Ejemplo n.º 2
0
def crawl_praise_by_page(mid, page_num):
    """Fetch, parse and store a single praise page.

    The page URL is built from ``mid``, ``page_num`` and the current time in
    milliseconds. The parsed praise records are stored through
    ``PraiseOper.add_all``; when ``page_num`` is 1 the weibo is additionally
    flagged as praise-crawled.

    If ``SoftTimeLimitExceeded`` is raised while fetching or parsing, the
    error is logged and the same page is retried; the retry's result is
    returned.

    :param mid: identifier formatted into the request URL and handed to the
        praise parser
    :param page_num: page number to request
    :return: tuple ``(html, praise_datas)`` - the raw response and whatever
        ``praise.get_praise_list`` returned for it
    """
    try:
        cur_time = int(time.time() * 1000)
        cur_url = BASE_URL.format(mid, page_num, cur_time)
        html = get_page(cur_url, auth_level=2, is_ajax=True)
        praise_datas = praise.get_praise_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "praise SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        # Return the retry's result. The retry already stores its records
        # and sets the crawled flag itself; falling through here would hit
        # `html` / `praise_datas` while they may still be unbound and raise
        # UnboundLocalError.
        # NOTE(review): the retry is unbounded - repeated timeouts recurse
        # until the recursion limit; consider a retry cap.
        return crawl_praise_by_page(mid, page_num)
    PraiseOper.add_all(praise_datas)
    if page_num == 1:
        WbDataOper.set_weibo_praise_crawled(mid)
    return html, praise_datas
Ejemplo n.º 3
0
def crawl_praise_page(mid):
    """Crawl the first praise page for ``mid`` and then the follow-up pages.

    The first page is fetched, parsed and stored, and the weibo is flagged
    as praise-crawled. Pages 2 to 4 are then fetched one after another
    through ``crawl_praise_by_page``; each request is given the
    ``ext_param`` produced by the previous page.

    Crawling stops (with an error log) as soon as a page yields no
    ``ext_param``, because the next request cannot be built correctly
    without it.

    :param mid: identifier formatted into the request URL and handed to the
        praise parser
    :return: None
    """
    # Called locally (not dispatched as a task) so that the result is
    # available immediately.
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)

    WbDataOper.set_weibo_praise_crawled(mid)

    # Why no app.send_task, falling back to sequential execution instead:
    # weibo praise now requires a parameter called max_id, and a request
    # without it returns something different from a normal browser.

    # Original note: should work after 5
    for page_num in range(2, 5):
        # ext_param (mainly max_id) is refreshed by every page and consumed
        # by the next one, so check it before *each* follow-up request, not
        # only before page 2.
        # NOTE(review): an empty ext_param may also simply mean "no more
        # pages" - confirm whether that case deserves an error log.
        if not ext_param:
            crawler.error(
                'fail to get praise page {page} ext_param, mid is {mid}'.format(
                    page=page_num, mid=mid))
            return
        # Only the refreshed ext_param is needed from the returned triple.
        _, _, ext_param = crawl_praise_by_page(mid, ext_param)
    return