def crawl_follower_fans(uid):
    user, is_crawled = get_profile(uid)
    if user and user.verify_type == 2:
        SeedidsOper.set_seed_other_crawled(uid)
        return

    rs = get_fans_or_followers_ids(uid, 1, 1)
    rs.extend(get_fans_or_followers_ids(uid, 2, 1))
    datas = set(rs)
    # use a separate loop variable so the seed uid isn't shadowed,
    # otherwise the final call below marks the wrong id as crawled
    for other_uid in datas:
        get_profile(other_uid)
    # If data already exists, just skip it
    # if datas:
    #     SeedidsOper.insert_seeds(datas)
    SeedidsOper.set_seed_other_crawled(uid)

def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Because of limits on weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans info.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's an enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            app.send_task('tasks.user.crawl_follower_fans', args=(uid,),
                          queue='fans_followers', routing_key='for_fans_followers')
    # If celery is started with '--soft-time-limit secs', the task is resent to the broker
    # when the limit is hit, e.g.
    # celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10
    except SoftTimeLimitExceeded:
        crawler.error("user SoftTimeLimitExceeded uid={uid}".format(uid=uid))
        app.send_task('tasks.user.crawl_person_infos', args=(uid,),
                      queue='user_crawler', routing_key='for_user_info')

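# A minimal dispatch sketch, not part of the original module: it assumes `app` is the
# Celery instance referenced above (the worker command in the comment points at
# tasks.workers) and reuses the same queue/routing_key that crawl_person_infos itself
# uses when it requeues on SoftTimeLimitExceeded. `push_seed_users` is a hypothetical
# helper name introduced only for illustration.
def push_seed_users(uids):
    for uid in uids:
        app.send_task('tasks.user.crawl_person_infos', args=(uid,),
                      queue='user_crawler', routing_key='for_user_info')
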
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user, _ = get_profile(uid)
    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if not cur_repost_datas:
            continue
        for repost_obj in cur_repost_datas:
            user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
            if not user_id and root_user:
                # if the parent can't be resolved by name, fall back to the root (original) poster
                repost_obj.parent_user_id = root_user.uid
                repost_obj.parent_user_name = root_user.name
            else:
                repost_obj.parent_user_id = user_id
        # append this page's reposts instead of overwriting the first page's entries
        repost_datas.extend(cur_repost_datas)

    RepostOper.add_all(repost_datas)

def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user, _ = get_profile(uid)
    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    for index, repost_obj in enumerate(repost_datas):
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # if the parent can't be resolved by name, fall back to the root (original) poster
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id
        repost_datas[index] = repost_obj

    RepostOper.add_all(repost_datas)

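# Worked example of the paging bound used in crawl_repost_page above (a reading aid,
# not project code): if get_max_repost_page() returns 5, limit starts at 6; with
# total_page == 3 it is lowered to 4, so range(2, 4) fetches pages 2-3 after page 1,
# while with total_page == 10 it stays at 6 and the crawl is capped at pages 1-5.
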
def get_praise_list(html: str, wb_id: str):
    """Get the praise (like) list from a weibo page

    Arguments:
        html {str} -- web page source
        wb_id {str} -- weibo mid

    Returns:
        praise_list -- list of WeiboPraise objects found in this html
        ext_param -- extra parameters used to request the next page
    """
    cont = get_html_cont(html)
    if not cont:
        return list(), ''

    soup = BeautifulSoup(cont, 'html.parser')
    praise_list = list()
    praises = soup.find_all(attrs={'class': 'list_li S_line1 clearfix'})
    # pattern = re.compile(r'<li uid=\"\d{10}\">')
    # praises = pattern.findall(cont)
    for praise in praises:
        try:
            user_id = praise.find('img').get('usercard')[3:]
            get_profile(user_id)
            wb_praise = WeiboPraise(user_id, wb_id)
        except Exception as e:
            parser.error('Failed to parse a praise item, the detail is {}'.format(e))
        else:
            praise_list.append(wb_praise)

    like_loading = soup.find(attrs={'node-type': 'like_loading'})
    feed_like_more = soup.find(attrs={'action-type': 'feed_like_more'})
    if like_loading:
        action_data = like_loading.get('action-data', '')
    elif feed_like_more:
        action_data = feed_like_more.get('action-data', '')
    else:
        action_data = ''
    ext_param = htmllib.unescape(action_data)
    return praise_list, ext_param

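# A hedged usage sketch (not from the original source): get_praise_list can be driven
# page by page with the ext_param it returns. `fetch_praise_page` is a hypothetical
# stand-in for whatever HTTP layer the project uses to download a praise page.
def collect_all_praises(wb_id, fetch_praise_page):
    praises, ext_param = [], ''
    while True:
        html = fetch_praise_page(wb_id, ext_param)
        page_praises, ext_param = get_praise_list(html, wb_id)
        if not page_praises:
            break
        praises.extend(page_praises)
        if not ext_param:
            break
    return praises
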
def crawl_user_info(name):
    """Crawl the info on a user's home page
    :param name: user name
    :return: None
    """
    if not name:
        return None

    crawler.info(f"received task crawl_user_info {name}")
    user, other_crawled = get_profile(name)
    if not other_crawled:
        crawler.info(f"send task crawl_follower_fans {user.name}")
        app.send_task("tasks.user.crawl_follower_fans", args=(user.name,))

def crawl_person_infos(uid):
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's an enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            crawl_follower_fans(uid)
    except SoftTimeLimitExceeded:
        crawler.error("user SoftTimeLimitExceeded uid={uid}".format(uid=uid))
        crawl_person_infos(uid)

def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Because of limits on weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans info.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    user, is_crawled = get_profile(uid)
    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        SeedidsOper.set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid,),
                      queue='fans_followers', routing_key='for_fans_followers')

def test_parse_user_info(uid, expect_name):
    user_info = get_profile(uid)[0]
    assert user_info.name == expect_name
    time.sleep(REQUEST_INTERNAL)

def get_repost_list(html, mid):
    """
    Get repost details
    :param html: page source
    :param mid: weibo mid
    :return: list of repost infos
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(
                attrs={'node-type': 'text'}).text.strip().split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO: push wb_repost.user_id into the crawl queue (seed_ids)
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            get_profile(wb_repost.user_id)
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(
                attrs={'class': 'WB_text'}).find('a').text
            wb_repost.repost_time = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title')
            wb_repost.weibo_url = REPOST_URL.format(
                repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('href'))
            parents = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid
            # Save the current repost user's name and id as an intermediate result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # We can't get the parent's uid; we can get the parent's nickname, but the name can be changed
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name, the detail is {}".format(e))
                    wb_repost.parent_user_name = ''
        except Exception as e:
            parser.error('repost parse error occurred, the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list

def test_crawl_userinfo(uid, expect):
    user = get_profile(uid)[0]
    assert user.name == expect
    time.sleep(REQUEST_INTERNAL)

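# A minimal sketch of how the two tests above are presumably wired up with
# pytest.mark.parametrize; the uid/name pair below is a placeholder, not real data
# from the project's fixtures.
import pytest

@pytest.mark.parametrize('uid, expect', [
    ('1234567890', 'placeholder_name'),  # hypothetical example pair
])
def test_crawl_userinfo_example(uid, expect):
    user = get_profile(uid)[0]
    assert user.name == expect
    time.sleep(REQUEST_INTERNAL)
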
def get_comment_list(html, wb_id):
    """
    Get the comment list
    :param html: comment page source
    :param wb_id: weibo mid
    :return: list of comments
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(
        attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            cont = []
            first_author = True
            first_colon = True
            for content in comment.find(attrs={'class': 'WB_text'}).contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        first_author = False
                        continue
                    else:
                        if content.text:
                            cont.append(content.text)
                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('Failed to parse the emoji, the detail is {},{}'.format(e, comment))
                            img_title = ''
                    cont.append(img_title)
                else:
                    if first_colon:
                        if content.find(':') == 0:
                            cont.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        cont.append(content)

            wb_comment.comment_cont = ''.join(cont)
            wb_comment.comment_screen_name = comment.find(attrs={'class': 'WB_text'}).find('a').text
            wb_comment.comment_id = comment['comment_id']
            # TODO: push wb_comment.user_id into the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
            # crawl the new user's basic profile
            if wb_comment.user_id:
                get_profile(wb_comment.user_id)

            # normalize the create time
            create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            if '分钟前' in create_time:
                now = datetime.datetime.now()
                reduce_minute = create_time.strip().split('分钟')[0]
                delta = datetime.timedelta(minutes=int(reduce_minute))
                real_time = now - delta
                wb_comment.create_time = str(real_time.strftime('%Y-%m-%d %H:%M'))
            elif '今天' in create_time:
                now = datetime.datetime.now().strftime('%Y-%m-%d')
                real_time = now + create_time.strip().split('今天')[-1]
                wb_comment.create_time = str(real_time)
            elif '楼' in create_time:
                wb_comment.create_time = str(re.sub(r'第\d*楼', '', create_time))
            else:
                wb_comment.create_time = create_time

            if not wb_comment.create_time.startswith('201'):
                wb_comment.create_time = str(datetime.datetime.now().year) + wb_comment.create_time

            # convert the Chinese-style timestamp into the standard "%Y-%m-%d %H:%M" format
            create_time_copy = wb_comment.create_time
            if '月' in create_time_copy and '日' in create_time_copy:
                month = create_time_copy.split("年")[-1].split("月")[0]
                day = create_time_copy.split("年")[-1].split("月")[-1].split("日")[0]
                # zero-pad single-digit month and day
                if month and int(month) < 10:
                    wb_comment.create_time = wb_comment.create_time.replace(
                        str(month) + "月", "0" + str(month) + "月")
                if day and int(day) < 10:
                    wb_comment.create_time = wb_comment.create_time.replace(
                        str(day) + "日", "0" + str(day) + "日")
                wb_comment.create_time = wb_comment.create_time.replace("月", "-")
                wb_comment.create_time = wb_comment.create_time.replace("日", "")

            if '年' in wb_comment.create_time:
                wb_comment.create_time = wb_comment.create_time.replace("年", "-")

            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('Failed to parse the comment, the detail is {}'.format(e))
        else:
            comment_list.append(wb_comment)

    return comment_list

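# A hedged, self-contained sketch (not in the original source) of the relative-time
# branches used in get_comment_list above; normalize_relative_time is a hypothetical
# helper that mirrors the '分钟前' and '今天' cases so they can be exercised in isolation.
import datetime

def normalize_relative_time(create_time):
    if '分钟前' in create_time:
        # e.g. '5分钟前' -> current time minus 5 minutes, formatted '%Y-%m-%d %H:%M'
        minutes = int(create_time.strip().split('分钟')[0])
        real_time = datetime.datetime.now() - datetime.timedelta(minutes=minutes)
        return real_time.strftime('%Y-%m-%d %H:%M')
    if '今天' in create_time:
        # e.g. '今天 12:30' -> "<today's date> 12:30"
        today = datetime.datetime.now().strftime('%Y-%m-%d')
        return today + create_time.strip().split('今天')[-1]
    return create_time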