def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data for user {}, please check the cause'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
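# public.get_userdomain() is used throughout the functions below; this is a minimal sketch of
# how it could extract the domain code from the home page html, assuming the desktop Weibo
# page embeds an inline $CONFIG['domain'] assignment. The regex and the empty-string fallback
# are assumptions for illustration, not the project's confirmed implementation.
import re

def get_userdomain_sketch(html):
    # e.g. the page source contains: $CONFIG['domain']='100505';
    match = re.search(r"\$CONFIG\['domain'\]='(\d+)'", html)
    return match.group(1) if match else ''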
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    set_seed_home_crawled(uid)
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        page_id = public.get_pageid(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    set_seed_home_crawled(uid)
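# A sketch of what the 'tasks.home.crawl_ajax_page' task dispatched above might look like.
# It reuses app, get_page and insert_weibo_datas already referenced in this file; the parser
# name get_ajax_data and the task options are assumptions, not the project's confirmed code.
@app.task(ignore_result=True)
def crawl_ajax_page(url):
    # fetch the lazily loaded half of the home page through the ajax endpoint
    ajax_html = get_page(url, user_verify=False)
    ajax_wbdatas = get_ajax_data(ajax_html)  # hypothetical parser for the ajax response
    if ajax_wbdatas:
        insert_weibo_datas(ajax_wbdatas)
    # return the raw response so the local caller can feed it to get_total_page()
    return ajax_html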
def crawl_weibo_pics(uid):
    limit = get_max_home_page()
    cur_page = 1
    # user-defined maximum number of pages to crawl
    max_page = 20
    # end
    url = home_url.format(uid, cur_page)
    html = get_page(url)
    domain = public.get_userdomain(html)

    # Only crawl photos of personal weibo users; non-personal users
    # (e.g. government, organizations) are not crawled.
    if domain not in ['103505', '100306', '100505', '']:
        set_seed_home_crawled(uid, 2)
        return
    # end

    domain_uid = domain + uid
    page_domain = 'page_' + domain
    url = pic_url.format(domain_uid, page_domain)
    html = get_page(url)
    weibo_pics, next_ajax_url = get_wbdata_fromweb(html)

    if weibo_pics is None or next_ajax_url is None:
        crawler.warning('Failed to crawl the photo album of user {}, please check the cause'.format(uid))
        set_seed_home_crawled(uid, 3)
        return

    if not weibo_pics:
        crawler.warning('No photos crawled for user {}; the user may never have posted a weibo with pictures'.format(uid))
        set_seed_home_crawled(uid, 5)
        return

    insert_weibo_pics(weibo_pics)

    if not next_ajax_url:
        crawler.warning('Finished crawling the photo album of user {}'.format(uid))
        set_seed_home_crawled(uid, 4)
        return

    cur_page += 1

    while cur_page <= limit:
        # Some weibo accounts have as many as two or three thousand photos; crawling all of
        # them wastes time, so for now we crudely cap the number of pages crawled. Around
        # 10 pages turned out to be a reasonable number in practice.
        if cur_page > max_page:
            break
        # end

        cur_time = int(time.time() * 1000)
        ajax_call = 1
        page_id = domain_uid
        url = ajax_url.format(page_id, cur_page, ajax_call, cur_time) + '&' + next_ajax_url
        html = get_page(url, user_verify=False)

        weibo_pics, next_ajax_url = get_pic_data_byajax(html)

        if weibo_pics is None or next_ajax_url is None:
            crawler.warning('Failed to crawl the photo album of user {}, please check the cause'.format(uid))
            set_seed_home_crawled(uid, 3)
            return

        if not weibo_pics:
            crawler.warning('Failed to crawl the photo album of user {}, please check the cause'.format(uid))
            set_seed_home_crawled(uid, 3)
            return

        insert_weibo_pics(weibo_pics)

        if not next_ajax_url:
            crawler.warning('Finished crawling the photo album of user {}'.format(uid))
            set_seed_home_crawled(uid, 4)
            return

        cur_page += 1

    # After finishing the configured maximum number of pages, exit proactively and set the flag to 4.
    set_seed_home_crawled(uid, 4)
    return
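# The integer passed to set_seed_home_crawled() acts as a per-uid crawl-status flag. The
# meanings below are inferred from the branches in crawl_weibo_pics(); the dict-backed helper
# is only a self-contained stand-in for the real database update, which this sketch assumes.
_seed_status = {}  # hypothetical in-memory stand-in for the seed table

def set_seed_home_crawled_sketch(uid, status=1):
    # inferred status codes:
    #   2 - skipped, not a personal account (domain not in 103505/100306/100505)
    #   3 - album crawl failed, needs manual inspection
    #   4 - album crawl finished, or the page cap was reached
    #   5 - the user has never posted a weibo with pictures
    _seed_status[uid] = status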
def crawl_weibo(uid):
    debug_mode = 1
    limit = get_max_home_page()
    cur_page = 1
    pic_count = 0
    max_pic_count = 150
    max_retry_cnt = 2
    cur_retry_cnt = 0
    direct_get_sleep_time = 30
    containerid = '230413' + uid
    luicode = '10000011'
    lfid = '230283' + uid
    featurecode = '20000180'
    value = uid
    page_type = '03'
    page = cur_page

    # If there is no proxy in the db, assume we have temporarily entered a state with no
    # proxies where direct connections are needed, so the sleep time should be lengthened
    # accordingly.
    proxy = get_a_random_proxy()
    if proxy == {}:
        direct_get_sleep_time = 60
    elif random_event_occur():
        proxy = {}
    print(proxy)
    # end

    if debug_mode == 1:
        direct_get_sleep_time = 1

    # test for getting empty proxy
    if proxy == {}:
        # crawler.warning('empty proxy!')
        # time.sleep(3)
        # proxy = get_a_random_proxy()
        # proxy_cnt = count_proxy()
        # crawler.warning('new proxy:{}, proxy count:{}'.format(proxy, proxy_cnt))
        # return
        time.sleep(randint(0, direct_get_sleep_time))
    # end

    url = ori_wb_temp_url.format(containerid, luicode, lfid, featurecode, value, page_type, page)
    html = get_page(url, user_verify=False, need_login=False, proxys=proxy)

    # An empty html may also have other causes, but a proxy problem is the most likely one,
    # so the proxy gets a penalty. If the retry still returns an empty html, neither proxy
    # is penalized again; record the uid as abnormal and return directly. If a non-empty but
    # invalid html is returned, the penalty is applied later in the flow.
    if html == '':
        if cur_retry_cnt < max_retry_cnt:
            cur_retry_cnt = cur_retry_cnt + 1
            proxy_handler(proxy, -1)
            proxy = get_a_random_proxy()
            if proxy == {}:
                time.sleep(randint(0, direct_get_sleep_time))
            html = get_page(url, user_verify=False, need_login=False, proxys=proxy)
            if html == '':
                proxy_handler(proxy, -1)
                return
        else:
            proxy_handler(proxy, -1)
            return
    # end

    weibo_pics = get_weibo_list(html)

    if weibo_pics == '':
        crawler.warning('Requests are too frequent')
        if proxy == {}:
            time.sleep(randint(0, direct_get_sleep_time))
        proxy_handler(proxy, -1)
        return

    if weibo_pics is None:
        proxy_handler(proxy, -1)
        return
    elif weibo_pics is False:
        finish_uid_handler(uid, proxy)
        return
    elif weibo_pics:
        insert_weibo_pics(weibo_pics)
        pic_count = pic_count + len(weibo_pics)

    cur_page += 1

    while cur_page <= limit and pic_count < max_pic_count:
        page = cur_page
        url = ori_wb_temp_url.format(containerid, luicode, lfid, featurecode, value, page_type, page)
        html = get_page(url, user_verify=False, need_login=False, proxys=proxy)

        # An empty html may also have other causes, but a proxy problem is the most likely
        # one, so the proxy gets a penalty.
        if html == '':
            if cur_retry_cnt < max_retry_cnt:
                cur_retry_cnt = cur_retry_cnt + 1
                proxy_handler(proxy, -1)
                proxy = get_a_random_proxy()
                if proxy == {}:
                    time.sleep(randint(0, direct_get_sleep_time))
                html = get_page(url, user_verify=False, need_login=False, proxys=proxy)
                if html == '':
                    exception_uid_handler(uid, 6, proxy)
                    return
            else:
                exception_uid_handler(uid, 3, proxy)
                return
        # end

        weibo_pics = get_weibo_list(html)

        # If the page fetched through the current proxy is a blocked page, lower the current
        # proxy's score and return directly.
        if weibo_pics == '':
            crawler.warning('Requests are too frequent')
            if proxy == {}:
                time.sleep(randint(0, direct_get_sleep_time))
            proxy_handler(proxy, -1)
            return

        if weibo_pics is None:
            exception_uid_handler(uid, 4, proxy, html)
            return
        elif weibo_pics is False:
            finish_uid_handler(uid, proxy)
            return
        elif weibo_pics:
            insert_weibo_pics(weibo_pics)
            pic_count = pic_count + len(weibo_pics)

        cur_page += 1

    finish_uid_handler(uid, proxy)
    return
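# A self-contained sketch of the proxy helpers used above, assuming a simple score-based pool:
# get_a_random_proxy() returns a requests-style proxies dict (or {} when the pool is empty),
# proxy_handler(proxy, delta) adjusts a proxy's score and discards it when the score runs out,
# and random_event_occur() occasionally forces a direct connection. The real helpers are
# backed by the database; everything here, including the sample address, is an assumption.
import random

_proxy_pool = {'http://1.2.3.4:8080': 5}  # hypothetical address -> score

def get_a_random_proxy():
    if not _proxy_pool:
        return {}
    addr = random.choice(list(_proxy_pool))
    return {'http': addr, 'https': addr}

def proxy_handler(proxy, delta):
    addr = proxy.get('http') if proxy else None
    if addr in _proxy_pool:
        _proxy_pool[addr] += delta
        if _proxy_pool[addr] <= 0:
            del _proxy_pool[addr]  # drop proxies that keep failing

def random_event_occur(p=0.1):
    # with probability p, drop the proxy and connect directly for this request
    return random.random() < p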
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    # user-defined maximum number of pages to crawl
    max_page = 10
    # end

    while cur_page <= limit:
        # Some weibo accounts have as many as two or three thousand photos; crawling all of
        # them wastes time, so for now we crudely cap the number of pages crawled. Around
        # 10 pages turned out to be a reasonable number: the desktop site shows 45 weibos
        # per page, so that is about 450 weibos per account.
        if cur_page > max_page:
            break
        # end

        url = home_url.format(uid, cur_page)
        html = get_page(url)

        domain = public.get_userdomain(html)

        # Only crawl photos of personal weibo users; non-personal users
        # (e.g. government, organizations) are not crawled.
        if domain not in ['103505', '100306', '100505', '']:
            set_seed_home_crawled(uid, 2)
            return
        # end

        weibo_datas, weibo_pics = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('Failed to crawl home page weibo data for user {}, please check the cause'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        # If weibo_pics is not empty, insert it into the database.
        if weibo_pics:
            insert_weibo_pics(weibo_pics)
        # end

        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    # Set the flag after all pages have been iterated over. Placing it here means every page
    # has been visited, not that each page was crawled successfully. This may need further
    # optimization later, e.g. setting it inside a callback.
    set_seed_home_crawled(uid)
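# Usage sketch: how a scheduler could fan the home crawl out over seed users with Celery,
# mirroring the crawl_ajax_page dispatch above. The task name, queue, routing_key and the
# get_home_ids() helper are assumptions modeled on that pattern, not confirmed project code.
def execute_home_task():
    for id_obj in get_home_ids():  # hypothetical helper yielding seed uid records
        app.send_task('tasks.home.crawl_weibo_datas', args=(id_obj.uid,),
                      queue='home_crawler', routing_key='home_info')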