Example #1
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data for user id {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
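
Examples #1 through #3 (and #6) dispatch the two ajax URLs to a Celery worker with app.send_task rather than crawling them in-process. The sketch below shows how the consuming side could be wired up, assuming a Redis broker and a placeholder task body; only the task name, queue, and routing key come from the examples themselves.

from celery import Celery

# Broker URL is an assumption for this sketch; the real project supplies its own.
app = Celery('tasks', broker='redis://localhost:6379/0')

# Route the task onto the same queue / routing key that the producers above target.
app.conf.task_routes = {
    'tasks.home.crawl_ajax_page': {
        'queue': 'ajax_home_crawler',
        'routing_key': 'ajax_home_info',
    },
}

@app.task(name='tasks.home.crawl_ajax_page')
def crawl_ajax_page(url):
    # Placeholder body: the real task fetches and parses the ajax page.
    return url

A worker started with `celery -A tasks worker -Q ajax_home_crawler` would then pick up the URLs that the examples send.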
Example #2
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
    set_seed_home_crawled(uid)
Example #3
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        page_id = public.get_pageid(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_0, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_1, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
    set_seed_home_crawled(uid)
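
Compared with Example #1, this variant also passes a page_id and a millisecond timestamp into ajax_url; the timestamp works as a cache-buster, and the +100 offset keeps the two URLs distinct. The snippet below only illustrates the positional-argument shape that the format call expects; the query-string layout is a hypothetical stand-in, not the project's actual ajax_url template.

import time

# Hypothetical template with seven positional slots, in the same order as the
# call above: domain, ajax flag, page_id, uid, page, pre_page, timestamp.
ajax_url = ('https://weibo.com/p/aj/v6/mblog/mbloglist?domain={}&is_all={}'
            '&page_id={}&id={}&page={}&pre_page={}&__rnd={}')

domain, page_id, uid, cur_page = '100505', '1005051234567890', '1234567890', 1
cur_time = int(time.time() * 1000)  # millisecond timestamp used as a cache-buster
print(ajax_url.format(domain, 1, page_id, uid, cur_page, cur_page, cur_time))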
Example #4
def crawl_weibo_pics(uid):
    limit = get_max_home_page()
    cur_page = 1

    # Custom upper bound on the number of pages to crawl
    max_page = 20
    # end

    url = home_url.format(uid, cur_page)
    html = get_page(url)
    domain = public.get_userdomain(html)
    
    # Only crawl photos of personal Weibo users; skip non-personal accounts (government, organizations, etc.).
    if domain not in ['103505', '100306', '100505', '']:
        set_seed_home_crawled(uid, 2)
        return
    # end

    domain_uid = domain + uid
    page_domain = 'page_' + domain
    url = pic_url.format(domain_uid, page_domain)

    html = get_page(url)

    weibo_pics, next_ajax_url = get_wbdata_fromweb(html)

    if weibo_pics is None or next_ajax_url is None:
        crawler.warning('Failed to crawl the photo album of user id {}, please check the reason'.format(uid))
        set_seed_home_crawled(uid, 3)
        return

    if not weibo_pics:
        crawler.warning('Failed to crawl the photo album of user id {}; the user may never have posted any weibo with pictures'.format(uid))
        set_seed_home_crawled(uid, 5)
        return

    insert_weibo_pics(weibo_pics)

    if not next_ajax_url:
        crawler.warning('Finished crawling the photo album of user id {}'.format(uid))
        set_seed_home_crawled(uid, 4)
        return
    
    cur_page += 1

    while cur_page <= limit:

        # Some accounts have two or three thousand photos; crawling them all wastes time, so for now we
        # simply cap the crawl by the current page number. Around 10 pages turned out to be a reasonable figure.
        if cur_page > max_page:
            break
        # end

        cur_time = int(time.time()*1000)
        ajax_call = 1
        page_id = domain_uid
        url = ajax_url.format(page_id, cur_page, ajax_call, cur_time) + '&' + next_ajax_url
        html = get_page(url, user_verify=False)

        weibo_pics, next_ajax_url = get_pic_data_byajax(html)
        
        if weibo_pics is None or next_ajax_url is None:
            crawler.warning('Failed to crawl the photo album of user id {}, please check the reason'.format(uid))
            set_seed_home_crawled(uid, 3)
            return

        if not weibo_pics:
            crawler.warning('Failed to crawl the photo album of user id {}, please check the reason'.format(uid))
            set_seed_home_crawled(uid, 3)
            return
        
        insert_weibo_pics(weibo_pics)
        
        if not next_ajax_url:
            crawler.warning('Finished crawling the photo album of user id {}'.format(uid))
            set_seed_home_crawled(uid, 4)
            return

        cur_page += 1
        
    # After reaching the configured maximum number of pages, exit proactively and set the flag to 4.
    set_seed_home_crawled(uid, 4)
    return
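
crawl_weibo_pics above (like Example #6 below) only crawls personal accounts, identified by a small whitelist of domain codes. A minimal sketch of that filter, using only the codes that appear in these examples:

# Domain codes that the examples above do not skip: the personal-account codes
# plus the empty string (assumption: an empty domain is given the benefit of the doubt).
ALLOWED_DOMAINS = {'103505', '100306', '100505', ''}

def should_crawl(domain):
    return domain in ALLOWED_DOMAINS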
Example #5
def crawl_weibo(uid):
    debug_mode = 1

    limit = get_max_home_page()
    cur_page = 1

    pic_count = 0
    max_pic_count = 150

    max_retry_cnt = 2
    cur_retry_cnt = 0

    direct_get_sleep_time = 30

    containerid = '230413' + uid
    luicode = '10000011'
    lfid = '230283' + uid
    featurecode = '20000180'
    value = uid
    page_type = '03'
    page = cur_page

    # As long as the db has no proxy, assume we are temporarily without proxies and have to connect
    # directly, so the sleep time should be lengthened accordingly.
    proxy = get_a_random_proxy()
    if proxy == {}:
        direct_get_sleep_time = 60
    elif random_event_occur():
        proxy = {}
    print(proxy)
    # end

    if debug_mode == 1:
        direct_get_sleep_time = 1
    # test for getting empty proxy
    if proxy == {}:
        # crawler.warning('empty proxy!')
        # time.sleep(3)
        # proxy = get_a_random_proxy()
        # proxy_cnt = count_proxy()
        # crawler.warning('new proxy:{}, proxy count:{}'.format(proxy, proxy_cnt))
        # return
        time.sleep(randint(0, direct_get_sleep_time))
    # end

    url = ori_wb_temp_url.format(containerid, luicode, lfid, featurecode, value, page_type, page)
    html = get_page(url, user_verify=False, need_login=False, proxys=proxy)

    # An empty html may also have other causes, but a proxy problem is the most likely one, so deduct points from the proxy.
    # If the retry still returns empty html, record the uid as abnormal and return directly;
    # if a non-empty but invalid html comes back, points are deducted later in the flow.
    if html == '':
        if cur_retry_cnt < max_retry_cnt:
            cur_retry_cnt = cur_retry_cnt + 1
            proxy_handler(proxy, -1)
            proxy = get_a_random_proxy()

            if proxy == {}:
                time.sleep(randint(0, direct_get_sleep_time))

            html = get_page(url, user_verify=False, need_login=False, proxys=proxy)
            if html == '':
                proxy_handler(proxy, -1)
                return
        else:
            proxy_handler(proxy, -1)
            return
    # end

    weibo_pics = get_weibo_list(html)

    if weibo_pics == '':
        crawler.warning('Requests are too frequent')
        if proxy == {}:
            time.sleep(randint(0, direct_get_sleep_time))
        proxy_handler(proxy, -1)
        return

    if weibo_pics is None:
        proxy_handler(proxy, -1)
        return
    elif weibo_pics is False:
        finish_uid_handler(uid, proxy)
        return
    elif weibo_pics:
        insert_weibo_pics(weibo_pics)

    pic_count = pic_count + len(weibo_pics)

    cur_page += 1

    while cur_page <= limit and pic_count < max_pic_count:
        
        page = cur_page
        url = ori_wb_temp_url.format(containerid, luicode, lfid, featurecode, value, page_type, page)
        html = get_page(url, user_verify=False, need_login=False, proxys=proxy)

        # An empty html may also have other causes, but a proxy problem is the most likely one, so deduct points from the proxy.
        if html == '':
            if cur_retry_cnt < max_retry_cnt:
                cur_retry_cnt = cur_retry_cnt + 1
                proxy_handler(proxy, -1)
                proxy = get_a_random_proxy()

                if proxy == {}:
                    time.sleep(randint(0, direct_get_sleep_time))
                
                html = get_page(url, user_verify=False, need_login=False, proxys=proxy)
                if html == '':
                    exception_uid_handler(uid, 6, proxy)
                    return
            else:
                exception_uid_handler(uid, 3, proxy)
                return
        # end

        weibo_pics = get_weibo_list(html)

        # If the page fetched through the current proxy is a blocked page, downgrade the proxy's score and return directly.
        if weibo_pics == '':
            crawler.warning('Requests are too frequent')
            if proxy == {}:
                time.sleep(randint(0, direct_get_sleep_time))
            proxy_handler(proxy, -1)
            return

        if weibo_pics is None:
            exception_uid_handler(uid, 4, proxy, html)
            return
        elif weibo_pics is False:
            finish_uid_handler(uid, proxy)
            return
        elif weibo_pics:
            insert_weibo_pics(weibo_pics)
        
        pic_count = pic_count + len(weibo_pics)

        cur_page += 1

    finish_uid_handler(uid, proxy)
    return
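
The comments in crawl_weibo describe a proxy-scoring scheme: whenever a fetch through a proxy returns an empty or blocked page, proxy_handler(proxy, -1) deducts points from it, and get_a_random_proxy returns {} once no usable proxy is left. The sketch below illustrates that idea with an in-memory pool; it is an assumption for illustration, not the project's proxy module, which keeps its proxies in a database.

import random

# Illustrative in-memory pool mapping proxy address to score (assumed data model).
_PROXIES = {'http://1.2.3.4:8080': 10, 'http://5.6.7.8:3128': 10}

def get_a_random_proxy():
    # Return {} when no usable proxy remains, which the callers above treat as
    # the "connect directly and sleep longer" situation.
    alive = [addr for addr, score in _PROXIES.items() if score > 0]
    if not alive:
        return {}
    return {'http': random.choice(alive)}

def proxy_handler(proxy, delta):
    # Adjust a proxy's score; drop it from the pool once the score reaches zero.
    if not proxy:
        return
    addr = proxy.get('http')
    if addr in _PROXIES:
        _PROXIES[addr] += delta
        if _PROXIES[addr] <= 0:
            del _PROXIES[addr]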
Example #6
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1

    # Custom upper bound on the number of pages to crawl
    max_page = 10
    # end

    while cur_page <= limit:

        # Some accounts have as many as two or three thousand photos; crawling them all wastes time, so for now
        # we simply cap the crawl by the current page number. Around 10 pages turned out to be a reasonable figure:
        # the desktop version of Weibo shows 45 weibos per page, so that is about 450 weibos per account.
        if cur_page > max_page:
            break
        # end

        url = home_url.format(uid, cur_page)
        html = get_page(url)

        domain = public.get_userdomain(html)
        # Only crawl photos of personal Weibo users; skip non-personal accounts (government, organizations, etc.).
        if domain not in ['103505', '100306', '100505', '']:
            set_seed_home_crawled(uid, 2)
            return
        # end

        weibo_datas, weibo_pics = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data for user id {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        # If non-empty, insert weibo_pics into the database
        if weibo_pics:
            insert_weibo_pics(weibo_pics)
        # end

        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))

        if total_page < limit:
            limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_0, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page',
                      args=(ajax_url_1, ),
                      queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # After iterating over all pages, set the flag. Placing it here means every page has been visited;
    # it does not guarantee the flag is only set after a successful crawl. This may need further
    # optimization later, e.g. setting the flag from a callback.
    set_seed_home_crawled(uid)