def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If user domain is 100505, the url is just 100505 + userid;
    if user domain is 103505 or 100306, we need to request once more to get the info;
    if user type is enterprise or service, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        if user.name:
            save_user(user)
            storage.info('has stored user {id} info successfully'.format(id=user_id))
            return user
        else:
            return None
    else:
        return None

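# A minimal usage sketch for get_url_from_web; the uid below is illustrative (taken from the
# test url used in test_page_get further down), and base_url, get_page and the parsers are
# assumed to be imported from this project's user-crawling module.
user = get_url_from_web('1764222885')
if user is not None:
    print(user.name, user.verify_type, user.level)
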
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning('failed to crawl home weibo data of user {}, please check the reason'.format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page)

        if cur_page == 1:
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)

        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return

def crawl_comment_page(mid):
    limit = get_max_comment_page()
    cur_page = 1
    next_url = ''
    while cur_page <= limit:
        cur_time = int(time.time() * 1000)
        if cur_page == 1:
            url = start_url.format(mid, cur_time)
        else:
            url = base_url.format(next_url, cur_time)
        html = get_page(url, user_verify=False)
        comment_datas = comment.get_comment_list(html, mid)

        if not comment_datas and cur_page == 1:
            crawler.warning('failed to crawl comments of weibo {}, please check the reason'.format(mid))
            return

        save_comments(comment_datas)
        # each step here iterates based on the result of the previous one,
        # so a network (task) call is not suitable (mainly because it would be cumbersome)
        next_url = comment.get_next_url(html)

        if not next_url:
            crawler.info('comment crawling of weibo {} is done'.format(mid))
            return
        cur_page += 1

def crawl_ajax_page(url):
    ajax_html_0 = get_page(url)
    ajax_wbdatas_0 = get_home_wbdata_byajax(ajax_html_0)
    if not ajax_wbdatas_0:
        return
    insert_weibo_datas(ajax_wbdatas_0)

def get_fans_or_followers_ids(user_id, crawl_type):
    """
    Get fans or followed users
    :param user_id: user id
    :param crawl_type: 1 stands for fans, 2 stands for the users he/she follows
    :return: list of fans or followed users
    """
    # todo check fans and follows of special users, such as writers
    # todo deal with the case where fans or follows span more than 5 pages
    if crawl_type == 1:
        fans_or_follows_url = 'http://weibo.com/p/100505{}/follow?relate=fans&page={}#Pl_Official_HisRelation__60'
    else:
        fans_or_follows_url = 'http://weibo.com/p/100505{}/follow?page={}#Pl_Official_HisRelation__60'

    cur_page = 1
    max_page = 6
    user_ids = list()
    while cur_page < max_page:
        url = fans_or_follows_url.format(user_id, cur_page)
        page = get_page(url)

        if cur_page == 1:
            urls_length = public.get_max_crawl_pages(page)
            if max_page > urls_length:
                max_page = urls_length + 1

        # get ids and store relations
        user_ids.extend(public.get_fans_or_follows(page, user_id, crawl_type))
        cur_page += 1

    return user_ids

def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Due to the limit of weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise user's followers and fans info.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    url = home_url.format(uid)
    html = get_page(url)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)
    user, is_crawled = user_get.get_profile(uid, domain)

    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid, domain), queue='fans_followers',
                      routing_key='for_fans_followers')

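# crawl_person_infos is normally not called directly but dispatched as a Celery task; the
# send_task call below is copied verbatim from the keyword-search functions in this section,
# with wb_data.uid replaced by an illustrative literal uid.
app.send_task('tasks.user.crawl_person_infos', args=('1764222885',),
              queue='user_crawler', routing_key='for_user_info')
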
def crawl_repost_by_page(mid, page_num):
    cur_url = base_url.format(mid, page_num)
    html = get_page(cur_url, user_verify=False)
    repost_datas = repost.get_repost_list(html, mid)
    if page_num == 1:
        wb_data.set_weibo_repost_crawled(mid)
    return html, repost_datas

def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, domain, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1,), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    set_seed_home_crawled(uid)

def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    # crawler.info(limit)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        # crawler.info(search_page)

        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in last turn'.format(keyword))
                # continue
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return

def get_fans_or_followers_ids(user_id, crawl_type):
    """
    Get a user's fans or followed users
    :param user_id: user id
    :param crawl_type: 1 stands for fans, 2 stands for followed users
    :return: list of fans or followed users
    """
    # todo verify that fans/follows of special users such as writers are handled correctly;
    # todo deal with the case where fans or follows span more than 5 pages
    if crawl_type == 1:
        ff_url = 'http://weibo.com/p/100505{}/follow?relate=fans&page={}#Pl_Official_HisRelation__60'
    else:
        ff_url = 'http://weibo.com/p/100505{}/follow?page={}#Pl_Official_HisRelation__60'

    cur_page = 1
    max_page = 6
    user_ids = list()
    while cur_page < max_page:
        url = ff_url.format(user_id, cur_page)
        page = get_page(url)

        if cur_page == 1:
            urls_length = public.get_max_crawl_pages(page)
            if max_page > urls_length:
                max_page = urls_length + 1

        # collect ids on every page, not only the first one
        user_ids.extend(public.get_fans_or_follows(page))
        cur_page += 1

    return user_ids

def get_fans_or_followers_ids(user_id, crawl_type):
    """
    Get fans or followed users
    :param user_id: user id
    :param crawl_type: 1 stands for fans, 2 stands for the users he/she follows
    :return: list of fans or followed users
    """
    # todo check fans and follows of special users, such as writers
    # todo process the case where fans or follows span more than 5 pages
    if crawl_type == 1:
        fans_or_follows_url = 'http://weibo.com/p/100505{}/follow?relate=fans&page={}#Pl_Official_HisRelation__60'
    else:
        fans_or_follows_url = 'http://weibo.com/p/100505{}/follow?page={}#Pl_Official_HisRelation__60'

    cur_page = 1
    max_page = 6
    user_ids = list()
    while cur_page < max_page:
        url = fans_or_follows_url.format(user_id, cur_page)
        page = get_page(url)

        if cur_page == 1:
            urls_length = public.get_max_crawl_pages(page)
            if max_page > urls_length:
                max_page = urls_length + 1

        # get ids and store relations
        user_ids.extend(public.get_fans_or_follows(page, user_id, crawl_type))
        cur_page += 1

    return user_ids

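# A small sketch of the two crawl_type values described in the docstring above; the uid is
# illustrative, and relation storage happens inside public.get_fans_or_follows.
fans_ids = get_fans_or_followers_ids('1764222885', crawl_type=1)    # fans
follow_ids = get_fans_or_followers_ids('1764222885', crawl_type=2)  # users he/she follows
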
def _crawl_loop(page, page_counter, mid, uid, user_name, spread_other_and_caches,
                spread_others, spread_other_caches):
    while page > 0 and page_counter < page_max:
        ajax_url = base_url.format(mid=mid, currpage=page)
        repost_info = get_page(ajax_url, False)
        try:
            repost_json = json.loads(repost_info)
            repost_html = repost_json['data']['html']
        except Exception as why:
            # if an exception occurs, skip the weibo info behind this ajax_url by default
            parser.error('failed to parse repost info of {url} with json, the error is: {why}'.format(
                url=ajax_url, why=why))
        else:
            repost_urls = parse_status.get_reposturls(repost_html)

            # sorting logic for repost nodes
            # todo verify whether the spread effect is the same without fetching repost weibo info through repost_urls
            for repost_url in repost_urls:
                repost_cont = status.get_status_info(repost_url, uid, user_name, mid)

                if repost_cont is not None:
                    spread_other_and_caches.append(repost_cont)

            for soac in spread_other_and_caches:
                if soac.get_so().id != '':
                    spread_others.append(soac.get_so())
                    spread_other_caches.append(soac.get_soc())
        finally:
            print('currently on page {}'.format(page))
            page -= 1
            page_counter += 1

def _get_current_source(url, wb_mid):
    """
    :param url: url of the current weibo
    :param wb_mid: mid of the current weibo
    :return: repost count, weibo user id, user name
    """
    html = get_page(url)
    if not html or basic.is_404(html):
        return None

    reposts = parse_status.get_repostcounts(html)
    comments = parse_status.get_commentcounts(html)

    # update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=wb_mid, reposts=reposts, comments=comments)

    root_url = url
    user_id = parse_status.get_userid(html)
    user_name = parse_status.get_username(html)
    post_time = parse_status.get_statustime(html)
    device = parse_status.get_statussource(html)
    comments_count = parse_status.get_commentcounts(html)
    reposts_count = parse_status.get_repostcounts(html)
    root_user = user.get_profile(user_id)

    # save the info of the original weibo
    spread_original_dao.save(root_user, wb_mid, post_time, device, reposts_count, comments_count, root_url)
    crawler.info('the repost count of this weibo is {counts}'.format(counts=reposts_count))

    return reposts_count, user_id, user_name

def get_fans_or_followers_names(name, crawl_type):
    """
    Crawl a user's followees or fans
    :param name: user name
    :param crawl_type: crawl type. followees: users he/she follows, followers: fans
    :return:
    """
    LIMIT = 20
    page = 1
    is_end = False
    max_follow_page = get_max_follow_page()
    while (not is_end) and (page < max_follow_page):
        url = FOLLOW_URL.format(name, crawl_type, (page - 1) * LIMIT, LIMIT)
        html = get_page(url)
        user_names, is_end = get_fans_or_follows(html, name)
        storage.info(
            f"get {name} {crawl_type}: user_names: {user_names}, is_end:{is_end}"
        )
        SeedUser.insert_many(user_names)
        page += 1
    storage.info(
        f"get {name} page={page}, max_follow_page={max_follow_page}, is_end={is_end}"
    )

def get_mpproxy_to_db():
    url = 'http://proxy.mimvp.com/api/fetch.php?orderid=860170808163932696&num=100&country_group=1&http_type=2&anonymous=3,5&result_fields=1,2&result_format=json&ping_time=1'
    html = get_page(url, user_verify=False, need_login=False)
    proxy_dict = parse_json_to_dict(html)
    proxies = proxy_dict.get('result')
    proxy_list = []

    if proxies:
        for proxy in proxies:
            data = proxy.get('ip:port')
            data = data.split(':')
            if data:
                ip = data[0]
                port = data[1]
            else:
                return False

            new_proxy = Proxys()
            new_proxy.ip = ip
            new_proxy.port = port
            new_proxy.types = 2
            new_proxy.protocol = 2
            new_proxy.country = '国内'
            new_proxy.area = '米扑代理'
            new_proxy.speed = 0.00
            new_proxy.score = 5
            proxy_list.append(new_proxy)

    if proxy_list:
        insert_proxy(proxy_list)
        return True
    else:
        return False

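# parse_json_to_dict is referenced above and in parse_xdaili_return below but not shown in this
# section; a minimal sketch, assuming it simply wraps json.loads and falls back to an empty dict
# on malformed or empty input.
import json

def parse_json_to_dict(source):
    try:
        return json.loads(source)
    except (TypeError, ValueError):
        return {}
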
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)

        if not search_page:
            crawler.warning('no weibo found for keyword {} this time, the page source is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # First check whether the weibo already exists in the database; since results are sorted by time,
        # an existing record means it has already been crawled, so we exit the loop
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('weibo updates for keyword {} have all been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # use a network call rather than a local call here for now; weigh the pros and cons of both
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid, ), queue='user_crawler',
                              routing_key='for_user_info')

        # check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('search for keyword {} is done'.format(keyword))
            return

def test_page_get(self):
    """
    test crawling pages
    """
    from page_get import basic
    test_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
    text = basic.get_page(test_url)
    self.assertIn('深扒娱乐热点', text)

def crawl_comment_by_page(mid, page_num):
    cur_time = int(time.time() * 1000)
    cur_url = base_url.format(mid, page_num, cur_time)
    html = get_page(cur_url, user_verify=False)
    comment_datas = comment.get_comment_list(html, mid)
    save_comments(comment_datas)
    wb_data.set_weibo_comment_crawled(mid)
    return html

def test_page_get(self):
    """
    test the page crawling feature
    """
    from page_get import basic
    test_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
    text = basic.get_page(test_url)
    self.assertIn('深扒娱乐热点', text)

def crawl_ajax_page(url):
    ajax_html = get_page(url, user_verify=False)
    ajax_wbdatas = get_home_wbdata_byajax(ajax_html)
    if not ajax_wbdatas:
        return ''
    insert_weibo_datas(ajax_wbdatas)
    return ajax_html

def get_status_info(url, user_id, name, mid=''):
    soc = SpreadOtherCache()
    print('current repost weibo url is: ' + url)
    repost_cont = get_page(url)

    if not is_404(repost_cont):
        repost_user_id = parse_status.get_userid(repost_cont)
        if repost_user_id == '':
            return None

        repost_user_name = parse_status.get_username(repost_cont)
        soc.set_id(repost_user_id)
        soc.set_name(repost_user_name)

        so = SpreadOther()
        so.id = repost_user_id
        so.screen_name = repost_user_name
        so.upper_user_name = parse_status.get_upperusername(repost_cont, name)
        cur_user = user.get_profile(repost_user_id)

        try:
            so.province = cur_user.province
            so.city = cur_user.city
            so.location = cur_user.location
            so.description = cur_user.description
            so.domain_name = cur_user.domain_name
            so.blog_url = cur_user.blog_url
            so.gender = cur_user.gender
            so.headimg_url = cur_user.headimg_url
            so.followers_count = cur_user.followers_count
            so.friends_count = cur_user.friends_count
            so.status_count = cur_user.status_count
            so.verify_type = cur_user.verify_type
            so.verify_info = cur_user.verify_info
            so.register_time = cur_user.register_time

            if so.screen_name == name:
                so.id = user_id

            so.mid = parse_status.get_mid(repost_cont)
            so.status_post_time = parse_status.get_statustime(repost_cont)
            so.device = parse_status.get_statussource(repost_cont)
            if mid:
                so.original_status_id = mid
            else:
                so.original_status_id = parse_status.get_orignalmid(repost_cont)
            so.comments_count = parse_status.get_commentcounts(repost_cont)
            so.reposts_count = parse_status.get_repostcounts(repost_cont)
            so.like_count = parse_status.get_likecounts(repost_cont)
            so.status_url = url
        except AttributeError as e:
            # todo: find out the root cause of this failure
            logging.info('failed to parse {user_id}, the stack trace is {e}'.format(user_id=user_id, e=e))
            logging.info(r'the source code of this repost page is:\n{repost_cont}'.format(repost_cont=repost_cont))
            return None
        else:
            return SpreadOtherAndCache(so, soc)
    else:
        return None

def get_url_from_web(user_id):
    """
    Get user info by user id: if the user's domain is 100505, the detailed profile is returned directly;
    if it is 103505 or 100306, one more request is needed, because the base_url form only reaches the
    user's home page rather than the detail page; enterprise and service accounts are also redirected to
    their home page through base_url, and since their detail pages are of little value, we do not request them
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise accounts by default
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        # save user info into the database
        save_user(user)
        storage.info('has stored user {id} info successfully'.format(id=user_id))

        return user
    else:
        return None

def get_url_from_web(user_id):
    """
    Get user info according to user id.
    If user domain is 100505, the url is just 100505 + userid;
    if user domain is 103505 or 100306, we need to request once more to get the info;
    if user type is enterprise or service, we just crawl their home page info.
    :param user_id: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = base_url.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers (special users)
        if domain == '103505' or domain == '100306':
            url = base_url.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        user.verify_type = public.get_verifytype(html)
        user.verify_info = public.get_verifyreason(html, user.verify_type)
        user.level = public.get_level(html)

        save_user(user)
        storage.info('has stored user {id} info successfully'.format(id=user_id))

        return user
    else:
        return None

def crawl_comment_by_page(mid, page_num):
    cur_time = int(time.time() * 1000)
    cur_url = base_url.format(mid, page_num, cur_time)
    html = get_page(cur_url, user_verify=False)
    comment_datas = comment.get_comment_list(html, mid)
    save_comments(comment_datas)
    if page_num == 1:
        wb_data.set_weibo_comment_crawled(mid)
    return html

def search_one(keyword, session):
    url = 'http://s.weibo.com/weibo/' + keyword + '&Refer=STopic_box'
    search_page = get_page(url, session, headers)
    if search_page:
        search_list = search_parse.get_search_info(search_page)
        for s in search_list:
            s.keyword = keyword
            s.mk_primary = '_'.join([str(s.mid), keyword])
        add_search_cont(search_list)
    else:
        print('no search result parsed: {page}'.format(page=search_page))

def crawl_ajax_page(url):
    """
    :param url: user home ajax url
    :return: resp.text
    """
    ajax_html = get_page(url, user_verify=False)
    ajax_wbdatas = get_home_wbdata_byajax(ajax_html)
    if not ajax_wbdatas:
        return ''
    insert_weibo_datas(ajax_wbdatas)
    return ajax_html

def get_fans_list_return(uid, page):
    fans_wb_temp_url = 'https://m.weibo.cn/api/container/getIndex?containerid={}_-_followers_-_{}&luicode={}&lfid={}&featurecode={}&type=uid&value={}&page={}'
    containerid = '231051'
    luicode = '10000011'
    lfid = '100505' + str(uid)
    featurecode = '20000320'
    value = str(uid)
    url = fans_wb_temp_url.format(containerid, uid, luicode, lfid, featurecode, value, page)
    html = get_page(url, user_verify=False, need_login=False)
    return url, html

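# For reference, with an illustrative uid and page the template above expands to a url of the
# following form (derived purely from the string formatting in get_fans_list_return):
# https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_1764222885&luicode=10000011&lfid=1005051764222885&featurecode=20000320&type=uid&value=1764222885&page=1
url, html = get_fans_list_return('1764222885', 1)
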
def crawl_ajax_page(url):
    """
    The return value is mainly for the first local call (to get the total page number);
    network (task) calls ignore the return value
    :param url:
    :return:
    """
    ajax_html = get_page(url, user_verify=False)
    ajax_wbdatas = get_home_wbdata_byajax(ajax_html)
    if not ajax_wbdatas:
        return ''
    insert_weibo_datas(ajax_wbdatas)
    return ajax_html

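# The docstring above distinguishes a local call (used once to read the total page number)
# from asynchronous task calls; both patterns appear verbatim in crawl_weibo_datas and are
# reproduced here, with ajax_url_0 / ajax_url_1 standing in for the urls built via ajax_url.format(...).
total_page = get_total_page(crawl_ajax_page(ajax_url_1))  # local call: the return value is used
app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0,), queue='ajax_home_crawler',
              routing_key='ajax_home_info')                # task call: the return value is ignored
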
def get_hot_list_from_web(title):
    if not title:
        return None
    url = HOT_LIST_URL.format(title)
    html = get_page(url)
    all_lists = parse_hot_list(title, html)
    if all_lists:
        CommonOperate.add_all(all_lists)
        storage.info(f"Has stored hot_list {title} info successfully")
    return all_lists

def get_cont_of_weibo(mid):
    """
    :param mid: weibo's mid
    :return: all cont of the weibo
    """
    url = base_url.format(mid)
    html = get_page(url, user_verify=False)

    if html:
        try:
            # note: json.loads() no longer accepts an encoding argument on Python 3.9+
            html = json.loads(html).get('data').get('html')
            cont = filters.text_filter(html)
        except AttributeError:
            cont = ''
        return cont

def get_user_info_from_web(user_name):
    """
    Crawl user info from the web
    :param user_name: user name
    :return: user entity
    """
    if not user_name:
        return None
    url = USER_HOME_URL.format(user_name)
    html = get_page(url)
    user = get_user_detail(user_name, html)
    if user:
        CommonOperate.add_one(user)
        storage.info(f"Has stored user {user_name} info successfully")
    return user

def _get_total_page(wb_mid):
    page = 1
    ajax_url = base_url.format(mid=wb_mid, currpage=page)
    source = get_page(ajax_url, False)

    if source == '':
        crawler.error('failed to crawl the repost url {}'.format(ajax_url))
        return 0

    crawler.info('the repost info url of this crawl is {}'.format(ajax_url))

    try:
        repost_json = json.loads(source)
        total_page = int(repost_json['data']['page']['totalpage'])
    except Exception as why:
        parser.error('failed to parse repost info of {url} with json, the error is: {why}'.format(
            url=ajax_url, why=why))
        return 0
    else:
        return total_page

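# For clarity, the ajax response parsed by _get_total_page (and by _crawl_loop above) is assumed
# to look roughly like the structure below; only the keys the code actually reads
# ('data.page.totalpage' and 'data.html') are shown, and the values are illustrative.
example_repost_json = {
    'data': {
        'page': {'totalpage': 10},
        'html': '<div>...repost list markup...</div>',
    }
}
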
def parse_xdaili_return(url):
    html = get_page(url, user_verify=False, need_login=False)
    proxy_dict = parse_json_to_dict(html)
    proxies = proxy_dict.get('RESULT')
    err_code = int(proxy_dict.get('ERRORCODE'))
    proxy_list = []

    if proxies and err_code == 0:
        for proxy in proxies:
            port = proxy.get('port')
            ip = proxy.get('ip')

            new_proxy = Proxys()
            new_proxy.ip = ip
            new_proxy.port = port
            new_proxy.types = 2
            new_proxy.protocol = 2
            new_proxy.country = '国内'
            new_proxy.area = '讯代理'
            new_proxy.speed = 0.00
            new_proxy.score = 5
            proxy_list.append(new_proxy)

    return proxy_list

def search_keyword(row):
    cur_page = 1
    keyword = row.keyword
    if row.startTime:
        startTime = row.startTime.strftime('%Y-%m-%d')
        url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}&timescope=custom:{}'
    if row.endTime:
        endTime = row.endTime.strftime('%Y-%m-%d')
    encode_keyword = url_parse.quote(keyword)

    while cur_page < limit:
        if row.startTime and row.endTime:
            finalTime = startTime + ':' + endTime
            cur_url = url.format(encode_keyword, cur_page, finalTime)
        else:
            cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('no weibo found for keyword {} this time, the page source is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # First check whether the weibo already exists in the database; since results are sorted by time,
        # an existing record means it has already been crawled, so we exit the loop
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('weibo updates for keyword {} have all been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                # use a network call rather than a local call here for now; weigh the pros and cons of both
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        # check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('search for keyword {} is done'.format(keyword))
            return

def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = home_url.format(uid, cur_page)
        html = get_page(url)
        weibo_datas = get_wbdata_fromweb(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        insert_weibo_datas(weibo_datas)

        domain = public.get_userdomain(html)
        page_id = public.get_pageid(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = ajax_url.format(domain, 0, page_id, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = ajax_url.format(domain, 1, page_id, uid, cur_page, cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use local call to get total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1))
            if total_page < limit:
                limit = total_page

        cur_page += 1
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, ), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, ), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

    set_seed_home_crawled(uid)