def get_part_req_data(self, nickname):
    """
    Capture only the request data for read counts and comments.
    :param nickname: official account nickname
    :return: success ultimately depends on whether valid data is found in redis
    """
    TidyReqData.flush_data()
    redis_instance.set('current_nickname', nickname)
    self.home_to_gzh_search()
    self.search_gzh(nickname)
    self.click_a_message(args=1)
    self.check_comments()
    self.home()

def get_all_req_data(self, nickname, hand=False):
    """
    Capture all request data for one official account.
    The program currently uses the Baidu API; limited by the network and by
    concurrency, the results are not entirely ideal.
    :param nickname: official account nickname
    :param hand: whether to capture the parameters manually
    :return: success ultimately depends on whether valid data is found in redis
    """
    TidyReqData.flush_data("*.req")
    redis_instance.set('current_nickname', nickname)
    self.home_to_gzh_search()
    self.search_gzh(nickname)
    if not hand:
        self.all_message()
        self.click_a_message()
        # self.check_comments()
    else:
        input("Capture the parameters manually one by one; press Enter to exit")
    self.home()

def process_request(self, request, spider):
    # Rotate through the captured account request templates.
    current_req_data = self.req_data_list[self.counter % self.wx_num]
    req_data = TidyReqData.req_to_dict(
        current_req_data['load_more']['req_data'])
    request.set_method(req_data['method'])
    # Page through the article list using the offset carried in request.meta.
    req_data['url_param_dict']['offset'] = request.meta['list_offset']
    url = req_data['url'] + dict_to_str(req_data['url_param_dict'])
    request._set_url(url)
    request.set_headers(req_data['headers'])
    self.counter += 1
    return None

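# The middleware above assembles the load_more url with dict_to_str from
# tools.utils. The project's real helper is not shown in this section; the
# sketch below only illustrates the assumed contract (a flat dict rendered as
# 'k1=v1&k2=v2'), since the same function is also used to build request bodies
# further down. Both the name and the behaviour are assumptions.
def dict_to_str_sketch(param_dict):
    # Hypothetical stand-in for tools.utils.dict_to_str.
    return '&'.join('{}={}'.format(k, v) for k, v in param_dict.items())
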
def __init__(self, *args, **kwargs):
    """
    :param args:
    :param kwargs: functions that need to be called when the spider is instantiated
    """
    # Generator of all articles of the current official account that have no text content yet
    self.current_nickname = TidyReqData.get_nickname()
    self.articles_list = get_collection_article(self.current_nickname,
                                                article={"$exists": False},
                                                title={"$exists": True})
    self.crawler_begin_time = time()
    self.crawler_parse_counter = 0

def process_request(self, request, spider):
    current_req_data = self.req_data_list[self.counter % self.wx_num]
    req_data = TidyReqData.req_to_dict(
        current_req_data['content']['req_data'])
    url = request._get_url()
    # Store an http version of the original url as raw_url in the request's ext data.
    raw_url = copy(url)
    if "https" in raw_url:
        raw_url = raw_url.replace("https", "http")
    request.set_ext_data({"raw_url": raw_url})
    # Always fetch the article content over https.
    if "https" not in url:
        url = url.replace("http", "https")
    request._set_url(url)
    request.set_method(req_data['method'])
    # Drop any captured Cookie header before reusing these headers.
    if "Cookie" in req_data['headers']:
        req_data['headers'].pop("Cookie")
    request.set_headers(req_data['headers'])
    self.counter += 1
    return None

def process_request(self, request, spider):
    current_req_data = self.req_data_list[self.counter % self.wx_num]
    req_data = TidyReqData.req_to_dict(
        current_req_data['getappmsgext']['req_data'])
    # Parse the query parameters of the original article url and merge them into the body.
    content_url = request._get_url()
    content_url_param_dict = str_to_dict(
        content_url.split('?')[-1], '&', '=')
    body_dict = req_data['body_dict']
    body_dict.update(content_url_param_dict)
    body_dict['comment_id'] = request.get_ext_data['comment_id']
    body_dict['is_need_reward'] = 1
    url = req_data['url'] + req_data['url_param_str']
    request._set_url(url)
    request.set_method(req_data['method'])
    request.set_headers(req_data['headers'])
    body_str = dict_to_str(body_dict)
    request._set_body(body_str)
    self.counter += 1
    return None

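# The getappmsgext middleware above parses the query string of the original
# article url with str_to_dict(raw, '&', '='). The real implementation lives in
# the project's utils and is not shown here; this sketch only captures the
# behaviour implied by the call site and is an assumption, not the actual helper.
def str_to_dict_sketch(raw, item_sep='&', kv_sep='='):
    # Hypothetical stand-in: 'a=1&b=2' -> {'a': '1', 'b': '2'}
    result = {}
    for pair in raw.split(item_sep):
        if kv_sep in pair:
            key, value = pair.split(kv_sep, 1)
            result[key] = value
    return result
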
def __init__(self, *args, **kwargs):
    """
    :param args:
    :param kwargs: functions that need to be called when the spider is instantiated
    """
    # All articles of the current official account that have a comment_id
    # but no read-count data yet
    self.current_nickname = TidyReqData.get_nickname()
    print(self.current_nickname)
    articles_list = get_collection_article(self.current_nickname,
                                           read_num={"$exists": False},
                                           comment_id={"$exists": True})
    self.articles_list = []
    for article in articles_list:
        self.articles_list.append(article)
    self.task_num = len(self.articles_list)
    self.task_counter = 0
    self.begin_time = time()
    self.pre_time = time()

def prepare_req_data(self, current_req_data, request, _type):
    """
    :param current_req_data: request parameters to use in this round
    :param request: Request object
    :param _type: request type; must be 'getappmsgext' or 'appmsg_comment'
    :return: request parameters prepared for crawling read-count data
    """
    request_data = {}
    if _type in ['getappmsgext', 'appmsg_comment']:
        req_data = TidyReqData.req_to_dict(
            current_req_data[_type]['req_data'])
    else:
        return request_data
    # Build the body parameters from the original article url
    content_url = request._get_url()
    content_url_param_dict = str_to_dict(
        content_url.split('?')[-1], '&', '=')
    body_dict = copy(req_data['body_dict'])
    from tools.utils import update_dict_by_dict
    update_dict_by_dict(body_dict, content_url_param_dict,
                        ['mid', 'sn', 'idx', 'scene'])
    body_dict['comment_id'] = request.meta['comment_id']
    body_dict['is_need_reward'] = 1
    # The request asks for the comment content
    if "comment_id" in req_data['url_param_dict']:
        url_param_dict = copy(req_data['url_param_dict'])
        url_param_dict['comment_id'] = request.meta['comment_id']
        url_param_dict['idx'] = content_url_param_dict['idx']
        from tools.utils import dict_to_str
        url_param_str = dict_to_str(url_param_dict)
        request_data['url_str'] = req_data['url'] + url_param_str
    # The request asks for the read count
    else:
        request_data['url_str'] = req_data['url'] + req_data['url_param_str']
    request_data['header_dict'] = req_data['headers']
    request_data['body_dict'] = body_dict
    return request_data

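# prepare_req_data copies only a whitelist of keys ('mid', 'sn', 'idx',
# 'scene') from the article url parameters into the request body via
# update_dict_by_dict. The project's own helper is not reproduced in this
# section; the sketch below is an assumption about its contract based purely
# on how it is called above.
def update_dict_by_dict_sketch(target, source, keys):
    # Hypothetical stand-in for tools.utils.update_dict_by_dict.
    for key in keys:
        if key in source:
            target[key] = source[key]
    return target
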
def get_xcx_item_list(self, nickname, hand=False):
    """
    Capture all request data of the mini program.
    :param hand: whether to operate manually
    :param nickname: mini program name
    :return:
    """
    print(nickname)
    TidyReqData.flush_data("*.req")
    self.home_to_search()
    self.search_xcx(nickname)
    # After selecting the first result, enter the mini program and open the first column
    self.oap.tap(tuple(eval(self.data['BTN']['JIU_QIAN_ZFJY'])))
    time.sleep(1)
    # self.oap.tap(tuple(eval(self.data['BTN']['JIU_QIAN_HWYJ'])))
    # Take screenshots and match them against the records to obtain the related information
    # Option 1: pull the whole article list first, then iterate over it and fetch every article
    # Option 2: capture the information screen by screen while scrolling
    get_list_slide_num = 0
    while redis_instance.get("xcx_get_list_stop") is None:
        self.oap.swap([60, 1000], [60, 250])
        get_list_slide_num += 1
        time.sleep(0.5)
    # Scroll back to the top
    if redis_instance.get("xcx_get_list_stop"):
        for i in range(get_list_slide_num):
            self.oap.swap([60, 250], [60, 1000])
    # Get the mini program item list
    xcx_item_list = TidyReqData.get_xcx_req_data("*._xcx")
    # xcx_item_list = []
    for item in xcx_item_list:
        print("Current document", item['title'])
        if xcx.doc_exist("jqzt", item['id']):
            self.oap.swap([60, 500], [60, 250])
            continue
        # Visit each item and handle it via screenshots
        item_pos = self.vc.click_by_words(item['title'], tap=False)
        print(item_pos, "", item['title'])
        self.oap.tap(item_pos)
        time.sleep(3)
        self.oap.key(self.data['KEY']['BACK_KEYEVENT'])
        # Exit the loop once the limit has been reached
        if redis_instance.get("xcx_get_detail_stop"):
            break
        self.oap.swap([60, 500], [60, 250])  # Scroll to load the list; stop when it is exhausted
        time.sleep(1)
    self.oap.key(self.data['KEY']['BACK_KEYEVENT'])
    self.oap.key(self.data['KEY']['BACK_KEYEVENT'])
    print("Inserting raw data into mongo %s" % ("xcx_jqzt"))
    TidyReqData.insert_xcx_to_mongo("xcx_jqzt")
    print("Raw data inserted into mongo %s" % ("xcx_jqzt"))
    print("Building index for %s ..." % ("jqzt"))
    index_result = xcx.index_db_docs("jqzt")
    print("Indexing finished", index_result)
    print("Setting the cache expiry for the related redis data")
    ttl_result = TidyReqData.set_redis_ttl(60 * 60 * 5)
    print("The 5-hour redis expiry has been set")

def spider_opened(self, spider):
    self.wx_num, self.req_data_dict, self.req_data_list = TidyReqData.get_gzh_req_data()
    if self.wx_num == 0:
        self.wx_num = 1
    self.pre_crawl_time = time.time()

def spider_opened(self, spider):
    self.wx_num, self.req_data_dict, self.req_data_list = TidyReqData.get_gzh_req_data()
    if self.wx_num == 0:
        self.wx_num = 1

class ArticleListSpider(scrapy.Spider):
    name = 'article_list'
    allowed_domains = ['mp.weixin.qq.com']
    start_url = []
    custom_settings = get_global_settings()
    wx_num, _, _ = TidyReqData.get_gzh_req_data()
    if wx_num == 0:
        wx_num = 1
    custom_settings['DOWNLOAD_DELAY'] = round(2.0 / wx_num, 2)
    custom_settings['ITEM_PIPELINES'] = {
        'crawler.crawler.pipelines.load_more.ResponseArticleListPipeline': 300,
    }
    custom_settings['DOWNLOADER_MIDDLEWARES'] = {
        'crawler.crawler.middlewares.load_more.LoadMoreMiddleware': 543,
    }
    counter = 0
    list_offset = 0

    def __init__(self, *args, **kwargs):
        """
        :param args:
        :param kwargs: functions that need to be called when the spider is instantiated
        """
        self.current_nickname = ''

    def start_requests(self):
        """
        :return: entry point of the spider; without this override each url in
                 start_urls would be requested directly. Here Request is issued
                 manually with a callback such as self.parse.
        """
        yield Request(url='http://www.aii.com',
                      meta={"list_offset": self.list_offset},
                      callback=self.parse,
                      dont_filter=True)

    def parse(self, response):
        """
        :param response:
        :return: callback invoked after the request has completed
        """
        self.counter += 1
        cmc = response.get_ext_data['can_msg_continue']
        next_offset = response.get_ext_data['next_offset']
        item = LoadMoreItem()
        item['article_list'] = response.get_ext_data['data']
        item['nickname'] = response.get_ext_data['nickname']
        self.current_nickname = response.get_ext_data['nickname']
        gc.report_crawling({
            'nickname': item['nickname'],
            'percent': 'UNK',
            'more': cmc,
            'title': len(item['article_list'])
        })
        yield item
        if cmc == 1:
            yield Request(url='http://www.aii.com',
                          meta={"list_offset": next_offset},
                          callback=self.parse,
                          dont_filter=True)

    def close(self, reason):
        """
        :param reason:
        :return: callback used to close the spider once all urls have been requested
        """
        # Remove official accounts that have been deleted; their content_url is empty
        from db import delete
        delete(self.current_nickname, content_url="")
        print(self.name, "spider closed")

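# A minimal launch sketch for the spider above. It assumes ArticleListSpider is
# importable as defined here and that the captured request data is already in
# redis; CrawlerProcess and its methods are standard Scrapy API, but this
# runner function itself is hypothetical and not part of the project.
from scrapy.crawler import CrawlerProcess

def run_article_list_spider():
    process = CrawlerProcess()          # the spider's custom_settings still apply
    process.crawl(ArticleListSpider)    # the dummy start url is rewritten by LoadMoreMiddleware
    process.start()                     # blocks until the crawl finishes
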
def flush_req_data():
    from crawler_assist.tidy_req_data import TidyReqData
    TidyReqData.flush_data("*.req")
    return "The cached request data has been deleted"

def get_xcx_item_list_mini_batch(self, nickname, cur_phone):
    """
    Capture all request data of the mini program.
    :param cur_phone: current device
    :param nickname: mini program name
    :return:
    """
    print(nickname)
    # TidyReqData.flush_data("*.req")
    self.back_to_weixin_home()
    self.home_to_search()
    self.search_xcx(nickname)
    time.sleep(3)
    # Listen here for whether the device's search_key needs to be refreshed
    # print("config data", OPENID_PHONE)
    open_id = OPENID_PHONE[cur_phone]
    task_device_list = device_manager.get_task_type_devices("wxzs")
    open_id_device = dict(zip(OPENID_PHONE.values(), OPENID_PHONE.keys()))
    # Check whether the search_key of this account is still valid
    while True:
        need_update = TidyReqData.get_need_update_keys()
        # Verify that the devices pending an update still exist; otherwise drop their open_id
        if len(need_update) > 0:
            need_del_open_id_list = []
            for need_update_open_id in need_update:
                need_update_device_num = open_id_device[need_update_open_id]
                if need_update_device_num not in task_device_list:
                    need_del_open_id = OPENID_PHONE[need_update_device_num]
                    need_del_open_id_list.append(need_del_open_id)
            if need_del_open_id_list:
                req_res = TidyReqData.set_offline_wechat_index_accounts(
                    need_del_open_id_list)
                print("Removed unavailable accounts {}, result {}".format(
                    need_del_open_id_list, req_res))
                time.sleep(2)
                need_update = TidyReqData.get_need_update_keys()
        # Check whether any device needs a refresh
        if len(need_update) > 0:
            print("Received update request", need_update)
            print("Current device: {}, current open_id: {}".format(cur_phone, open_id))
            if open_id in need_update:
                # Tap the upper-right corner
                time.sleep(1)
                self.back_to_weixin_home()
                time.sleep(1)
                try:
                    device_manager.push(cur_phone)
                    print("Device {} released successfully".format(cur_phone))
                except TfMongoException as e:
                    # device_manager.push(cur_phone)
                    print(
                        TfMongoException(
                            -2,
                            "Device {} -- mongo device release failed, possibly a connection timeout".format(cur_phone),
                            cur_phone).processer())
                break
            # Two possibilities here: 1. a device from another thread took over;
            # 2. the device really is not responding (not handled for now)
            else:
                # The connected device is not responding; nudge it, then re-fetch the pending records
                print("Device {} needs a refresh or may have been disconnected; "
                      "re-fetching the pending update records".format(need_update))
                # Push the refreshed availability list to the server
                self.oap.swap([60, 400], [60, 350])
                time.sleep(randint(1, 3))
                self.oap.swap([60, 350], [60, 400])
                time.sleep(1)
                # break
        else:
            print("Device {} re-fetching the pending update records".format(cur_phone))
            self.oap.swap([60, 400], [60, 350])
            time.sleep(randint(3, 6))
            self.oap.swap([60, 350], [60, 400])

class ArticleSpider(scrapy.Spider):
    """
    Spider for the text content of official account articles
    """
    name = 'article'
    allowed_domains = ['mp.weixin.qq.com']
    start_url = []
    custom_settings = get_global_settings()
    wx_num, _, _ = TidyReqData.get_gzh_req_data()
    # A request delay can be enabled if there is a risk of the ip being banned
    # custom_settings['DOWNLOAD_DELAY'] = 0.5
    custom_settings['DOWNLOADER_MIDDLEWARES'] = {
        'crawler.crawler.middlewares.crawl_article.CrawlArticleMiddleware': 543,
    }
    custom_settings['ITEM_PIPELINES'] = {
        'crawler.crawler.pipelines.crawl_article.ResponseArticlePipeline': 300,
    }
    custom_settings['DOWNLOAD_TIMEOUT'] = 10
    custom_settings['CONCURRENT_REQUESTS'] = 16

    def __init__(self, *args, **kwargs):
        """
        :param args:
        :param kwargs: functions that need to be called when the spider is instantiated
        """
        # Generator of all articles of the current official account that have no text content yet
        self.current_nickname = TidyReqData.get_nickname()
        self.articles_list = get_collection_article(self.current_nickname,
                                                    article={"$exists": False},
                                                    title={"$exists": True})
        self.crawler_begin_time = time()
        self.crawler_parse_counter = 0

    def start_requests(self):
        """
        :return: entry point of the spider; without this override each url in
                 start_urls would be requested directly. Here Request is issued
                 manually with a callback such as self.parse.
        """
        for article in self.articles_list:
            if "weixin" in article['content_url']:
                yield Request(url=article['content_url'], callback=self.parse)

    def parse(self, response):
        """
        :param response:
        :return: callback invoked after the request has completed
        """
        item = CrawlArticleItem()
        item['article_data'] = response.get_ext_data['article_data']
        item['nickname'] = response.get_ext_data['nickname']
        item['raw_url'] = response.get_ext_data['raw_url']
        self.crawler_parse_counter += 1
        time_gap = time() - self.crawler_begin_time
        print(round(time_gap / self.crawler_parse_counter, 3),
              item['article_data']['article'].replace('\n', ''))
        # Report the crawling status to the front end
        crawling_item = {}
        crawling_item['nickname'] = item['nickname']
        crawling_item['percent'] = self.crawler_parse_counter
        crawling_item['more'] = round(time_gap / self.crawler_parse_counter, 3)
        crawling_item['title'] = find_one(item['nickname'],
                                          item['raw_url'])['title'][:10]
        gc.report_crawling(crawling_item)
        yield item

    def close(self, reason):
        """
        :param reason:
        :return: callback used to close the spider once all urls have been requested
        """
        time_gap = time() - self.crawler_begin_time
        if self.crawler_parse_counter != 0:
            print("%s spider closed, took %d s, crawled %d articles, %f s each on average" %
                  (self.name, time_gap, self.crawler_parse_counter,
                   time_gap / self.crawler_parse_counter))
        from instance.global_instance import gs
        print("Building index for %s ..." % (self.current_nickname))
        index_result = gs.index_db_docs(self.current_nickname)
        print("Indexing finished", index_result)
        from db.meta_data import insert_article_metadata
        insert_article_metadata(
            self.current_nickname, {
                'date': datetime.datetime.now(),
                'articles_num': self.crawler_parse_counter
            })

class ArticleReadDataSpider(scrapy.Spider):
    """
    Spider for the read-count data of official account articles
    """
    name = 'read_data'
    allowed_domains = ['mp.weixin.qq.com']
    start_url = []
    custom_settings = get_global_settings()
    wx_num, _, _ = TidyReqData.get_gzh_req_data()
    if wx_num == 0:
        wx_num = 1
    custom_settings['DOWNLOAD_DELAY'] = round(2.5 / wx_num, 2)
    custom_settings['DOWNLOADER_MIDDLEWARES'] = {
        'crawler.crawler.middlewares.crawl_article.ArticleReadDataMiddleware': 543,
    }
    custom_settings['ITEM_PIPELINES'] = {
        'crawler.crawler.pipelines.crawl_article.ResponseArticleReadDataPipeline': 300,
    }
    custom_settings['CONCURRENT_REQUESTS'] = 1

    def __init__(self, *args, **kwargs):
        """
        :param args:
        :param kwargs: functions that need to be called when the spider is instantiated
        """
        # All articles of the current official account that have a comment_id
        # but no read-count data yet
        self.current_nickname = TidyReqData.get_nickname()
        print(self.current_nickname)
        articles_list = get_collection_article(self.current_nickname,
                                               read_num={"$exists": False},
                                               comment_id={"$exists": True})
        self.articles_list = []
        for article in articles_list:
            self.articles_list.append(article)
        self.task_num = len(self.articles_list)
        self.task_counter = 0
        self.begin_time = time()
        self.pre_time = time()

    def start_requests(self):
        """
        :return: entry point of the spider; without this override each url in
                 start_urls would be requested directly. Here Request is issued
                 manually with a callback such as self.parse.
        """
        for article in self.articles_list:
            if ':' in article['content_url']:
                request = Request(url=article['content_url'],
                                  callback=self.parse,
                                  dont_filter=False)
                request.set_ext_data({
                    'content_url': article['content_url'],
                    'comment_id': article['comment_id']
                })
                yield request

    def parse(self, response):
        """
        :param response:
        :return: callback invoked after the request has completed
        """
        item = CrawlArticleReadDataItem()
        item['read_data'] = response.get_ext_data['read_data']
        item['nickname'] = response.get_ext_data['nickname']
        item['content_url'] = response.get_ext_data['content_url']
        # Print crawler status information
        self.task_counter += 1
        pre_time_gap = time() - self.pre_time
        total_time_gap = time() - self.begin_time
        time_need = (self.task_num - self.task_counter) * (total_time_gap /
                                                           self.task_counter)
        print(round(pre_time_gap, 2),
              round(total_time_gap / self.task_counter, 2),
              "%d/%d" % (self.task_counter, self.task_num),
              response.get_ext_data['read_data']['read_num'],
              response.get_ext_data['read_data']['like_num'],
              response.get_ext_data['read_data']['nick_name'],
              str(datetime.timedelta(seconds=time_need)).split('.')[0])
        self.pre_time = time()
        crawling_item = {}
        crawling_item['nickname'] = item['nickname']
        crawling_item['percent'] = '%d/%d' % (self.task_counter, self.task_num)
        crawling_item['more'] = response.get_ext_data['read_data']['read_num']
        crawling_item['title'] = find_one(item['nickname'],
                                          item['content_url'])['title'][:10]
        gc.report_crawling(crawling_item)
        yield item

    def close(self, reason):
        """
        :param reason:
        :return: callback used to close the spider once all urls have been requested
        """
        print(self.name, "spider closed")

def spider_opened(self, spider):
    spider.logger.info('Spider opened: %s' % spider.name)
    self.wx_num, self.req_data_dict, self.req_data_list = TidyReqData.get_gzh_req_data()
    if self.wx_num == 0:
        self.wx_num = 1

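# The spider_opened handlers above are normally connected to Scrapy's
# spider_opened signal in the middleware's from_crawler classmethod. The
# project's actual wiring is not shown in this section; the sketch below is
# the standard Scrapy pattern, and the class name is purely illustrative.
from scrapy import signals

class ReqDataMiddlewareSketch:
    @classmethod
    def from_crawler(cls, crawler):
        # Connect the handler so it fires once the spider has been opened.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
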