def crawl_person_infos(uid): """ Crawl user info and their fans and followers For the limit of weibo's backend, we can only crawl 5 pages of the fans and followers. We also have no permissions to view enterprise's followers and fans info :param uid: current user id :return: None """ if not uid: return try: user, is_crawled = get_profile(uid) # If it's enterprise user, just skip it if user and user.verify_type == 2: SeedidsOper.set_seed_other_crawled(uid) return # Crawl fans and followers if not is_crawled: app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers', routing_key='for_fans_followers') # By adding '--soft-time-limit secs' when you start celery, this will resend task to broker # e.g. celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10 except SoftTimeLimitExceeded: crawler.error("user SoftTimeLimitExceeded uid={uid}".format(uid=uid)) app.send_task('tasks.user.crawl_person_infos', args=(uid, ), queue='user_crawler', routing_key='for_user_info')
def crawl_person_infos(uid): """ Crawl user info and their fans and followers For the limit of weibo's backend, we can only crawl 5 pages of the fans and followers. We also have no permissions to view enterprise's followers and fans info :param uid: current user id :return: None """ if not uid: return try: user, is_crawled = get_profile(uid) # If it's enterprise user, just skip it if user and user.verify_type == 2: SeedidsOper.set_seed_other_crawled(uid) return # Crawl fans and followers if not is_crawled: app.send_task('tasks.user.crawl_follower_fans', args=(uid, ), queue='fans_followers', routing_key='for_fans_followers') # By adding '--soft-time-limit secs' when you start celery, this will resend task to broker # e.g. celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10 except SoftTimeLimitExceeded: crawler.error( "user SoftTimeLimitExceeded uid={uid}".format(uid=uid)) app.send_task('tasks.user.crawl_person_infos', args=(uid, ), queue='user_crawler', routing_key='for_user_info')
def crawl_follower_fans(uid):
    seed = SeedidsOper.get_seed_by_id(uid)
    if seed.other_crawled == 0:
        rs = get_fans_or_followers_ids(uid, 1, 1)
        rs.extend(get_fans_or_followers_ids(uid, 2, 1))
        datas = set(rs)
        # If data already exists, just skip it
        if datas:
            SeedidsOper.insert_seeds(datas)
        SeedidsOper.set_seed_other_crawled(uid)

def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibos were created after the time set in spider.yaml
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        length_weibo_datas = len(weibo_datas)
        for i in range(0, len(weibo_datas)):
            weibo_time = time.mktime(
                time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
            if weibo_time < timeafter:
                weibo_datas = weibo_datas[0:i]
                break

        WbDataOper.add_all(weibo_datas)

        # If any weibo was created before the given time, stop paging
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)

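# get_time_after() is not shown in these snippets. A hedged sketch of what it
# might do, assuming spider.yaml holds a 'time_after' value in
# '%Y-%m-%d %H:%M:%S' format; the key name and file layout are assumptions,
# not taken from the project's real config module.
import yaml


def get_time_after(path='spider.yaml'):
    with open(path) as f:
        config = yaml.safe_load(f)
    # Fall back to a very old cutoff so nothing is filtered if the key is missing
    return config.get('time_after', '1970-01-01 00:00:00')
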
def crawl_follower_fans(uid):
    user, is_crawled = get_profile(uid)
    if user and user.verify_type == 2:
        SeedidsOper.set_seed_other_crawled(uid)
        return

    rs = get_fans_or_followers_ids(uid, 1, 1)
    rs.extend(get_fans_or_followers_ids(uid, 2, 1))
    datas = set(rs)
    # Crawl each fan/follower profile directly; the loop variable must not
    # shadow the outer uid, or the wrong seed gets marked as crawled below
    for other_uid in datas:
        get_profile(other_uid)
    # If data already exists, just skip it
    # if datas:
    #     SeedidsOper.insert_seeds(datas)
    SeedidsOper.set_seed_other_crawled(uid)

def crawl_comment_by_page(mid, page_num, session):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas, seed_ids = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded mid={mid} page_num={page_num}".format(
                mid=mid, page_num=page_num))
        # Retry the same page and propagate its result instead of falling
        # through with undefined comment_datas
        return crawl_comment_by_page(mid, page_num, session)
    CommentOper.add_all(comment_datas, session)
    SeedidsOper.insert_seeds(seed_ids, session)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid, session)
    return html, comment_datas

def execute_home_task():
    # There are many possible strategies for crawling users' home pages; here
    # we pick the uids in the seed_ids table whose home_crawl flag is 0
    id_objs = SeedidsOper.get_home_ids()
    for id_obj in id_objs:
        app.send_task('tasks.home.crawl_weibo_datas', args=(id_obj.uid,),
                      queue='home_crawler', routing_key='home_info')

def execute_user_task():
    seeds = SeedidsOper.get_seed_ids()
    if seeds:
        for seed in seeds:
            app.send_task('tasks.user.crawl_person_infos', args=(seed.uid,),
                          queue='user_crawler', routing_key='for_user_info')

def test_seedids_oper(self):
    SeedidsOper.insert_seeds(FAKE_IDS)
    assert len(SeedidsOper.get_seed_ids()) == 2
    assert SeedidsOper.get_seed_by_id(FAKE_ID) is not None
    SeedidsOper.set_seed_crawled(FAKE_ID, 1)
    assert len(SeedidsOper.get_seed_ids()) == 1

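# A hedged guess at the fixtures the test above relies on: FAKE_IDS holds two
# seed uids and FAKE_ID is one of them, so that marking FAKE_ID as crawled
# leaves exactly one uncrawled seed. The real fixtures live in the test module
# and may be structured differently.
FAKE_ID = 1111111111
FAKE_IDS = [FAKE_ID, 2222222222]
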
def crawl_person_infos(uid):
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's an enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            crawl_follower_fans(uid)
    except SoftTimeLimitExceeded:
        crawler.error("user SoftTimeLimitExceeded uid={uid}".format(uid=uid))
        crawl_person_infos(uid)

def execute_user_task():
    # Look up all user ids recorded in the seed table that have not been
    # crawled yet (is_crawled = 0)
    seeds = SeedidsOper.get_seed_ids()
    if seeds:
        for seed in seeds:
            # Submit a crawl task for each seed
            app.send_task('tasks.user.crawl_person_infos', args=(seed.uid,),
                          queue='user_crawler', routing_key='for_user_info')

def crawl_person_infos(uid): """ Crawl user info and their fans and followers For the limit of weibo's backend, we can only crawl 5 pages of the fans and followers. We also have no permissions to view enterprise's followers and fans info :param uid: current user id :return: None """ if not uid: return user, is_crawled = get_profile(uid) # If it's enterprise user, just skip it if user and user.verify_type == 2: SeedidsOper.set_seed_other_crawled(uid) return # Crawl fans and followers if not is_crawled: app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers', routing_key='for_fans_followers')
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        WbDataOper.add_all(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)

def get_profile(user_id):
    """
    :param user_id: uid
    :return: a (user, is_crawled) tuple: the user info and whether it had
             already been crawled
    """
    user = UserOper.get_user_by_uid(user_id)

    if user:
        storage.info('user {id} has already been crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        is_crawled = 1
    else:
        user = get_url_from_web(user_id)
        if user is not None:
            SeedidsOper.set_seed_crawled(user_id, 1)
        else:
            SeedidsOper.set_seed_crawled(user_id, 2)
        is_crawled = 0

    return user, is_crawled

import datetime

import xlwt

from db.dao import CommentOper, SeedidsOper

if __name__ == '__main__':
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('test', cell_overwrite_ok=True)
    lists = [
        'id', 'comment_id', 'comment_cont', 'comment_screen_name',
        'weibo_id', 'user_id', 'create_time'
    ]
    # Write the header row; avoid shadowing the builtin `list`
    for i, col_name in enumerate(lists):
        sheet.write(0, i, col_name)
    infos = CommentOper.get_all_comment_by_weibo_id(4244968959004196)
    i = 1
    nowTime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    # Insert every commenter's user id into the seed table
    for info in infos:
        SeedidsOper.set_seed_id(info.user_id)
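    # A hedged sketch (not part of the original script) of how the header row
    # and timestamp prepared above might be used to export the comments,
    # assuming each comment object exposes attributes named like the columns
    # in `lists`; the output file name is illustrative.
    row = 1
    for info in infos:
        for col, field in enumerate(lists):
            sheet.write(row, col, getattr(info, field, ''))
        row += 1
    book.save('comments-{}.xls'.format(nowTime))
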
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 36
    retry_count = 1
    while cur_page <= 36:
        crawler.warning("current page {}".format(cur_page))

        url = HOME_URL.format(uid, cur_page)
        # if cur_page == 1:
        #     html = get_page(url, auth_level=1)
        # else:
        html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            if retry_count < 10:
                crawler.warning("user {} has no weibo, retry".format(uid))
                retry_count += 1
                # time.sleep(240)
                continue
            else:
                crawler.warning("user {} has no weibo, return".format(uid))
                return

        # Check whether weibo created after time in spider.yaml
        # timeafter = time.mktime(
        #     time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        # length_weibo_datas = len(weibo_datas)
        # for i in range(0, len(weibo_datas)):
        #     weibo_time = time.mktime(
        #         time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
        #     if weibo_time < timeafter:
        #         weibo_datas = weibo_datas[0:i]
        #         break

        WbDataOper.add_all(weibo_datas)

        # # If the weibo isn't created after the given time, jump out the loop
        # if i != length_weibo_datas - 1:
        #     break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        # if cur_page == 1:
        #     # here we use local call to get total page number
        #     total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
        #     auth_level = 1
        # else:
        auth_level = 2

        # if total_page < limit:
        #     limit = total_page

        crawler.warning("append tasks.home.crawl_ajax_page {}".format(uid))
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level),
                      queue='ajax_home_crawler', routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)

def execute_relation_task():
    seeds = SeedidsOper.get_other_ids()
    if seeds:
        for seed in seeds:
            app.send_task('tasks.relation.crawl_follower_fans', args=(seed.uid,),
                          queue='relation_crawler', routing_key='for_relation_info')

def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibos were created after the time set in spider.yaml
        length_weibo_datas = len(weibo_datas)
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        weibo_datas = [
            weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)
        ]

        WbDataOper.add_all(weibo_datas)

        # If any weibo was filtered out (created before the given time), stop paging
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1

            if total_page < limit:
                limit = total_page

            # Since the second ajax request of page 1 has already been crawled
            # above and stored in the database, we only have to crawl the
            # first ajax request of page 1
            crawl_ajax_page(ajax_url_0, auth_level)
        else:
            auth_level = 2

            # Still the same as before
            # if total_page != limit:
            #     limit = total_page
            #     crawler.warning("total pagenum is {}".format(total_page))
            crawl_ajax_page(ajax_url_0, auth_level)
            crawl_ajax_page(ajax_url_1, auth_level)

        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)

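# determine() is not defined in this snippet. A minimal sketch of what it
# might look like, assuming the same create_time format ('%Y-%m-%d %H:%M') and
# cutoff comparison used by the inline filter in the earlier variant of
# crawl_weibo_datas; the project's real helper may differ.
import time


def determine(weibo_datum, timeafter):
    # Keep only weibos created at or after the configured cutoff time
    weibo_time = time.mktime(
        time.strptime(weibo_datum.create_time, '%Y-%m-%d %H:%M'))
    return weibo_time >= timeafter
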
def execute_user_task():
    seeds = SeedidsOper.get_seed_ids()
    if seeds:
        for seed in seeds:
            crawl_person_infos(seed.uid)