def crawl_repost_by_page(mid, page_num):
    cur_url = BASE_URL.format(mid, page_num)
    html = get_page(cur_url, auth_level=1, is_ajax=True)
    repost_datas = repost.get_repost_list(html, mid)
    if page_num == 1:
        WbDataOper.set_weibo_repost_crawled(mid)
    return html, repost_datas

def crawl_praise_page(mid):
    # call locally here so that we can get the result right away
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time)
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    WbDataOper.set_weibo_praise_crawled(mid)
    if not ext_param:
        crawler.error(
            'fail to get praise page 2 ext_param, mid is {mid}'.format(mid=mid))
        return

    # Why no app.send_task, falling back to sequential execution instead?
    # Weibo praise pages now require a parameter called max_id, and a request
    # without it returns something different from what a normal browser gets.
    # should work after 5
    # TODO: retry or return depending on ext_param
    for __ in range(2, 5):
        # ext_param (mainly max_id) is updated on each call and used for the next one
        html, praise_data, ext_param = crawl_praise_by_page(mid, ext_param)
    return

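# The loop above expects a crawl_praise_by_page(mid, ext_param) that returns
# (html, praise_data, ext_param); the page_num-based variant further below has a
# different signature. Below is a minimal sketch of the ext_param-driven variant,
# assuming BASE_URL can take the extra max_id query part appended to it and that
# praise.get_praise_list returns the next ext_param (both are assumptions, not
# confirmed by this file).
def crawl_praise_by_page_with_ext(mid, ext_param):
    # hypothetical helper, named differently to avoid clashing with the real task
    cur_time = int(time.time() * 1000)
    cur_url = BASE_URL.format(mid, cur_time) + ext_param  # assumed URL composition
    html = get_page(cur_url, auth_level=2, is_ajax=True)
    praise_data, ext_param = praise.get_praise_list(html, mid)
    PraiseOper.add_all(praise_data)
    return html, praise_data, ext_param
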
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # currently requires login; we may crawl page one without login later
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has already
        # been stored in mysql, we do not need to crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid,),
                              queue='user_crawler',
                              routing_key='for_user_info')

        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

def crawl_comment_by_page(mid, page_num):
    cur_url = BASE_URL.format(mid, page_num)
    html = get_page(cur_url, auth_level=1, is_ajax=True)
    comment_datas = comment.get_comment_list(html, mid)
    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas

def crawl_dialogue_by_comment_page(mid, page_num):
    comment_url = COMMENT_URL.format(mid, page_num)
    html = get_page(comment_url, auth_level=1, is_ajax=True)
    comment_ids = dialogue.get_comment_id(html, mid)
    for cid in comment_ids:
        crawl_dialogue_by_comment_id(cid, mid)
    if page_num == 1:
        WbDataOper.set_weibo_dialogue_crawled(mid)
    return html

def search_items_v2(keyword, keyword_id, date_item):
    search_time_list = [
        "{}-{}:{}-{}".format(d, t, d, t + 2)
        for d, t in itertools.product([date_item], TIME_LIIT)
    ]
    for s_time in search_time_list:
        crawler.info('We are searching keyword "{}", {}'.format(keyword, s_time))
        cur_page = 1
        encode_keyword = url_parse.quote(keyword)
        while cur_page < LIMIT:
            cur_url = MAX_URL.format(encode_keyword, cur_page, s_time)
            # currently requires login; we may crawl page one without login later
            search_page = get_page(cur_url, auth_level=1, need_proxy=True)
            if not search_page:
                crawler.warning(
                    'No search result for keyword {}, the source page is {}'.format(
                        keyword, search_page))
                cur_page += 1
                continue
            # check the empty-result hint only after making sure search_page is
            # not empty, otherwise the `in` test would fail on None
            if "您可以尝试更换关键词,再次搜索" in search_page:
                break

            search_list = parse_search.get_search_info(search_page)

            if cur_page == 1:
                cur_page += 1
            elif 'noresult_tit' not in search_page:
                cur_page += 1
            else:
                crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
                return

            # Because the search results are sorted by time, if any result has already
            # been stored in mysql, we don't need to crawl the same keyword in this turn
            for wb_data in search_list:
                rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
                KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # todo: incremental crawling using time
                if rs:
                    crawler.info('Weibo {} has been crawled, skip it.'.format(
                        wb_data.weibo_id))
                    continue
                else:
                    WbDataOper.add_one(wb_data)
                    # todo: only add seed ids and remove this task
                    app.send_task('tasks.user.crawl_person_infos',
                                  args=(wb_data.uid,),
                                  queue='user_crawler',
                                  routing_key='for_user_info')

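# A quick, hedged illustration of the time windows built above, assuming date_item
# is a date string such as '2018-07-20' and TIME_LIIT holds even start hours (both
# are assumptions); each window then covers two hours of the same day.
def _demo_search_time_windows():
    import itertools
    date_item = '2018-07-20'   # hypothetical value
    time_hours = [0, 2, 4]     # hypothetical stand-in for TIME_LIIT
    return ["{}-{}:{}-{}".format(d, t, d, t + 2)
            for d, t in itertools.product([date_item], time_hours)]
    # -> ['2018-07-20-0:2018-07-20-2', '2018-07-20-2:2018-07-20-4', '2018-07-20-4:2018-07-20-6']
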
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)
        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether each weibo was created after the time given in spider.yaml
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        length_weibo_datas = len(weibo_datas)
        for i in range(0, len(weibo_datas)):
            weibo_time = time.mktime(
                time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
            if weibo_time < timeafter:
                weibo_datas = weibo_datas[0:i]
                break

        WbDataOper.add_all(weibo_datas)

        # If a weibo isn't created after the given time, jump out of the loop
        if i != length_weibo_datas - 1:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)

def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 stands for no login but need fake cookies, 2 stands for login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdatas = get_ajax_data(ajax_html)
    if not ajax_wbdatas:
        return ''
    WbDataOper.add_all(ajax_wbdatas)
    return ajax_html

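# Usage sketch for crawl_ajax_page, following the call patterns in crawl_weibo_datas
# above: a local call when the result is needed right away (getting the total page
# count), and a Celery task otherwise. ajax_url_0 and ajax_url_1 are assumed to be
# already-formatted AJAX urls; this is a demo helper, not part of the crawler.
def _demo_crawl_ajax_page(ajax_url_0, ajax_url_1):
    # local call with auth_level=2 (login) so the page count is available immediately
    total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
    # asynchronous dispatch for the other feed with auth_level=1 (fake cookies, no login)
    app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, 1),
                  queue='ajax_home_crawler', routing_key='ajax_home_info')
    return total_page
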
def crawl_comment_by_page(mid, page_num, session):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas, seed_ids = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded mid={mid} page_num={page_num}".format(
                mid=mid, page_num=page_num))
        # retry once and return, otherwise comment_datas would be unbound below
        return crawl_comment_by_page(mid, page_num, session)
    CommentOper.add_all(comment_datas, session)
    SeedidsOper.insert_seeds(seed_ids, session)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid, session)
    return html, comment_datas

def crawl_praise_by_page(mid, page_num):
    try:
        cur_time = int(time.time() * 1000)
        cur_url = BASE_URL.format(mid, page_num, cur_time)
        html = get_page(cur_url, auth_level=2, is_ajax=True)
        praise_datas = praise.get_praise_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "praise SoftTimeLimitExceeded mid={mid} page_num={page_num}".format(
                mid=mid, page_num=page_num))
        # retry once and return, otherwise praise_datas would be unbound below
        return crawl_praise_by_page(mid, page_num)
    PraiseOper.add_all(praise_datas)
    if page_num == 1:
        WbDataOper.set_weibo_praise_crawled(mid)
    return html, praise_datas

def test_weibodata_oper(self):
    db_session.execute("insert into {} ({}.weibo_id) values ('".format(
        weibo_data.name, weibo_data.name) + FAKE_ID + "')")
    assert WbDataOper.get_wb_by_mid(FAKE_ID) is not None
    assert len(WbDataOper.get_weibo_comment_not_crawled()) == 1
    assert len(WbDataOper.get_weibo_repost_not_crawled()) == 1

    WbDataOper.set_weibo_comment_crawled(FAKE_ID)
    WbDataOper.set_weibo_repost_crawled(FAKE_ID)

    assert len(WbDataOper.get_weibo_comment_not_crawled()) == 0
    assert len(WbDataOper.get_weibo_repost_not_crawled()) == 0

def execute_praise_task():
    weibo_datas = WbDataOper.get_weibo_praise_not_crawled()
    for weibo_data in weibo_datas:
        app.send_task('tasks.praise.crawl_praise_page',
                      args=(weibo_data.weibo_id,),
                      queue='praise_crawler',
                      routing_key='praise_info')

def determine(weibo_datum, timeafter):
    weibo_time = time.mktime(
        time.strptime(weibo_datum.create_time, '%Y-%m-%d %H:%M'))
    if weibo_time < timeafter:
        return False
    if WbDataOper.get_wb_by_mid(weibo_datum.weibo_id):
        return False
    return True

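# Usage sketch for determine(), mirroring how the determine-based variants of
# crawl_weibo_datas and crawl_ajax_page below filter their result lists: keep only
# weibos newer than the spider.yaml cut-off that are not yet stored. This is a demo
# helper, not part of the crawler.
def _demo_filter_weibo_datas(weibo_datas):
    timeafter = time.mktime(
        time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
    return [weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)]
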
def execute_dialogue_task():
    weibo_datas = WbDataOper.get_weibo_dialogue_not_crawled()
    for weibo_data in weibo_datas:
        # crawl_dialogue(weibo_data.weibo_id)
        app.send_task('tasks.dialogue.crawl_dialogue',
                      args=(weibo_data.weibo_id,),
                      queue='dialogue_crawler',
                      routing_key='dialogue_info')

def crawl_comment_by_page(mid, page_num):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded mid={mid} page_num={page_num}".format(
                mid=mid, page_num=page_num))
        # requeue the page and return, otherwise comment_datas would be unbound below
        app.send_task('tasks.comment.crawl_comment_by_page',
                      args=(mid, page_num),
                      queue='comment_page_crawler',
                      routing_key='comment_page_info')
        return
    CommentOper.add_all(comment_datas)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid)
    return html, comment_datas

def execute_repost_task():
    # regard the current weibo url as the original url; you can also analyse from the root url
    weibo_datas = WbDataOper.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))
    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page',
                      args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler',
                      routing_key='repost_info')

def execute_repost_task():
    # regard the current weibo url as the original url; you can also analyse from the root url
    weibo_datas = WbDataOper.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))
    for weibo_data in weibo_datas:
        crawl_repost_page(weibo_data.weibo_id, weibo_data.uid)

def execute_comment_task():
    # Only root comments are parsed; replies under the root comments are not
    # crawled. Adapt this if you need them.
    weibo_datas = WbDataOper.get_weibo_comment_not_crawled()
    for weibo_data in weibo_datas:
        app.send_task('tasks.comment.crawl_comment_page',
                      args=(weibo_data.weibo_id,),
                      queue='comment_crawler',
                      routing_key='comment_info')

def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        if cur_page == 1:
            search_page = get_page(cur_url, auth_level=1)
        else:
            search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored
        # in mysql, we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                WbDataOper.add_one(wb_data)
                KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid,),
                              queue='user_crawler',
                              routing_key='for_user_info')

        if cur_page == 1:
            cur_page += 1
        elif 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 stands for no login but need fake cookies, 2 stands for login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdata = get_ajax_data(ajax_html)
    if not ajax_wbdata:
        return ''

    timeafter = time.mktime(
        time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
    ajax_wbdata = [
        ajax_wbdatum for ajax_wbdatum in ajax_wbdata
        if determine(ajax_wbdatum, timeafter)
    ]

    WbDataOper.add_all(ajax_wbdata)
    return ajax_html

def crawl_ajax_page(url, auth_level):
    """
    :param url: user home ajax url
    :param auth_level: 1 stands for no login but need fake cookies, 2 stands for login
    :return: resp.text
    """
    ajax_html = get_page(url, auth_level, is_ajax=True)
    ajax_wbdatas = get_ajax_data(ajax_html)
    if not ajax_wbdatas:
        return ''

    timeafter = time.mktime(
        time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
    for i in range(0, len(ajax_wbdatas)):
        weibo_time = time.mktime(
            time.strptime(ajax_wbdatas[i].create_time, '%Y-%m-%d %H:%M'))
        if weibo_time < timeafter:
            ajax_wbdatas = ajax_wbdatas[0:i]
            break

    WbDataOper.add_all(ajax_wbdatas)
    return ajax_html

def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)
        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        WbDataOper.add_all(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)

def search_keyword_topic(keyword, keyword_id, start_time='', end_time=''):
    crawler.info('We are crawling weibo topic content with keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, start_time, end_time, cur_page)
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.info('No such result for keyword {}, the source page is {}'.format(
                keyword, search_page))
            return

        search_list = parse_topic.get_search_info(search_page)

        if cur_page == 1:
            cur_page += 1
        elif '您可以尝试更换关键词' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid,),
                              queue='user_crawler',
                              routing_key='for_user_info')

def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # currently requires login; we may crawl page one without login later
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

        # Because the search results are sorted by time, if any result has already
        # been stored in mysql, we don't need to crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            # todo: incremental crawling using time
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                # todo: only add seed ids and remove this task
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid,),
                              queue='user_crawler',
                              routing_key='for_user_info')

def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)
        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether each weibo was created after the time given in spider.yaml
        length_weibo_datas = len(weibo_datas)
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        weibo_datas = [
            weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)
        ]

        WbDataOper.add_all(weibo_datas)

        # If some weibos weren't created after the given time, jump out of the loop
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1

            if total_page < limit:
                limit = total_page

            # Since the second ajax request of page 1 has already been crawled
            # by the local call above and stored in the database,
            # we only have to crawl the first ajax request of page 1
            crawl_ajax_page(ajax_url_0, auth_level)
        else:
            auth_level = 2

            # Still the same as before
            # if total_page != limit:
            #     limit = total_page
            #     crawler.warning("total pagenum is {}".format(total_page))

            crawl_ajax_page(ajax_url_0, auth_level)
            crawl_ajax_page(ajax_url_1, auth_level)

        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)

def execute_praise_task():
    weibo_datas = WbDataOper.get_weibo_praise_not_crawled()
    for weibo_data in weibo_datas:
        crawl_praise_page(weibo_data.weibo_id)

def execute_comment_task():
    weibo_datas = WbDataOper.get_weibo_comment_not_crawled(db_session)
    crawl_comment_page(4253637545362266)

def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 36
    retry_count = 1
    while cur_page <= 36:
        crawler.warning("current page {}".format(cur_page))

        url = HOME_URL.format(uid, cur_page)
        # if cur_page == 1:
        #     html = get_page(url, auth_level=1)
        # else:
        html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)
        if not weibo_datas:
            if retry_count < 10:
                crawler.warning("user {} has no weibo, retry".format(uid))
                retry_count = retry_count + 1
                # time.sleep(240)
                continue
            else:
                crawler.warning("user {} has no weibo, return".format(uid))
                return

        # Check whether weibo created after time in spider.yaml
        # timeafter = time.mktime(
        #     time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        # length_weibo_datas = len(weibo_datas)
        # for i in range(0, len(weibo_datas)):
        #     weibo_time = time.mktime(
        #         time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
        #     if weibo_time < timeafter:
        #         weibo_datas = weibo_datas[0:i]
        #         break

        WbDataOper.add_all(weibo_datas)

        # # If the weibo isn't created after the given time, jump out of the loop
        # if i != length_weibo_datas - 1:
        #     break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        # if cur_page == 1:
        #     # here we use local call to get total page number
        #     total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
        #     auth_level = 1
        # else:
        auth_level = 2

        # if total_page < limit:
        #     limit = total_page

        crawler.warning("append tasks.home.crawl_ajax_page {}".format(uid))
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
