def deal_comment(self, req_url, text):
    """
    Parse the elected comments of an article and persist them.

    :param req_url: comment request url; carries the __biz and comment_id params
    :param text: json response body
    :return: None
    """
    data = tools.get_json(text)
    __biz = tools.get_param(req_url, "__biz")
    # comment_id ties the comments back to their article
    comment_id = tools.get_param(req_url, "comment_id")

    comment_datas = []
    for comment in data.get("elected_comment", []):
        comment_datas.append(
            dict(
                __biz=__biz,
                comment_id=comment_id,
                nick_name=comment.get("nick_name"),
                logo_url=comment.get("logo_url"),
                content=comment.get("content"),
                create_time=tools.timestamp_to_date(comment.get("create_time")),
                content_id=comment.get("content_id"),
                like_num=comment.get("like_num"),
                is_top=comment.get("is_top"),
                spider_time=tools.get_current_date(),
            )
        )

    if comment_datas:
        data_pipeline.save_article_commnet(comment_datas)
def deal_comment(self, req_url, text):
    """Parse elected comments from a comment response and save them in bulk."""
    payload = tools.get_json(text)
    biz = tools.get_param(req_url, '__biz')
    comment_id = tools.get_param(req_url, 'comment_id')  # links comments to the article

    rows = [
        {
            '__biz': biz,
            'comment_id': comment_id,
            'nick_name': item.get('nick_name'),
            'logo_url': item.get('logo_url'),
            'content': item.get('content'),
            'create_time': tools.timestamp_to_date(item.get('create_time')),
            'content_id': item.get('content_id'),
            'like_num': item.get('like_num'),
            'is_top': item.get('is_top'),
            'spider_time': tools.get_current_date(),
        }
        for item in payload.get('elected_comment', [])
    ]

    if rows:
        data_pipeline.save_article_commnet(rows)
def deal_article(self, req_url, text):
    """
    Parse an article page and persist it as plain text.

    :param req_url: article url; carries the sn param identifying the task
    :param text: html response body; falsy means the fetch failed
    :return: the next task from the task manager, or None when the fetch failed
    """
    sn = tools.get_param(req_url, "sn")
    if not text:
        # Empty response: mark the task failed so it can be retried/skipped.
        self._task_manager.update_article_task_state(sn, -1)
        return None

    selector = Selector(text)
    content = selector.xpath(
        '//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]'
    ).extract_first(default="")
    title = (selector.xpath('//h2[@class="rich_media_title"]/text()').
             extract_first(default="").strip())
    account = (selector.xpath('//a[@id="js_name"]/text()').extract_first(
        default="").strip())
    author = (selector.xpath(
        '//span[@class="rich_media_meta rich_media_meta_text"]//text()').
              extract_first(default="").strip())

    # Publish time is embedded in the page as a 10-digit unix timestamp.
    # FIX: raw string — '\d' in a plain string is an invalid escape sequence.
    publish_timestamp = selector.re_first(r'n="(\d{10})"')
    publish_timestamp = int(
        publish_timestamp) if publish_timestamp else None
    publish_time = (tools.timestamp_to_date(publish_timestamp)
                    if publish_timestamp else None)

    biz = tools.get_param(req_url, "__biz")
    text = remove_tags(content).strip()  # strip html tags; keep plain text only

    spider_name = 'wechat'
    collection_mode = 'spider'
    data_source_type = '微信公众号'

    article_data = {
        "data_type": account,
        "title": title,
        "data_address": req_url,
        "author": author,
        "publish_time": publish_time,
        "__biz": biz,
        "text": text,
        "spider_name": spider_name,
        "collection_mode": collection_mode,
        "data_source_type": data_source_type,
        "sn": sn,
        "collection_time": tools.get_current_date(),
    }

    # Persist; only mark the task done when the save reports success.
    if article_data and data_pipeline.save_article(
            article_data) is not None:
        self._task_manager.update_article_task_state(sn, 1)

    return self._task_manager.get_task()
def parse_article_info(article_info, release_time):
    """Cache one article's metadata and queue its url for crawling.

    NOTE(review): references ``self``, ``WechatAction`` and ``ONLY_TODAY_MSG``
    as free names — this looks like a closure lifted out of a method; confirm
    the enclosing scope before reuse.
    """
    if not article_info:
        return
    # log.debug(tools.dumps_json(article_info))

    title = article_info.get('title')
    summary = article_info.get('digest')
    url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
    source_url = article_info.get('source_url').replace('\\', '')  # quoted source link
    cover = article_info.get('cover').replace('\\', '')
    author = article_info.get('author')

    # Deleted articles have no url (and no mid); mall-type urls are skipped too.
    if not (url and url.startswith('http://mp.weixin.qq.com/')):
        return

    mid = tools.get_param(url, 'mid') or tools.get_param(url, 'appmsgid')  # message id, shared by a day's batch
    idx = tools.get_param(url, 'idx') or tools.get_param(url, 'itemidx')   # 1-based position within the batch
    article_id = mid + idx  # string concat uniquely identifies one article

    # Stop paging once we hit a known article, or (optionally) anything older than today.
    if WechatAction._wechat_service.is_exist(
            'wechat_article', article_id) or (
            ONLY_TODAY_MSG
            and release_time < tools.get_current_date('%Y-%m-%d')):
        self._is_need_get_more = False
        return

    biz = tools.get_param(url, '__biz')  # links the article to its account

    WechatAction._article_info[article_id] = {
        'article_id': int(article_id),
        'title': title,
        'summary': summary,
        'release_time': release_time,
        'url': url,
        'source_url': source_url,
        'cover': cover,
        'account': '',
        'author': author,
        '__biz': biz,
        'read_num': None,
        'like_num': None,
        'content': '',
        'comment': [],
        'record_time': tools.get_current_date()
    }

    # Queue the article url for fetching.
    WechatAction._todo_urls.append(url)
def get_read_watched_count(self, data, req_url):
    """Record an article's read/like counts from the appmsgstat response.

    ``data`` is a json string whose ``appmsgstat`` object carries
    ``read_num`` / ``like_num``; ``comment_enabled`` tells whether a
    comment request will follow. When no comment section exists the
    cached article is flushed to storage immediately.
    """
    log.debug('获取观看和点赞量')

    url = req_url.replace('amp;', '')
    mid = tools.get_param(url, 'mid')  # message id, shared by a day's batch
    idx = tools.get_param(url, 'idx')  # 1-based position within the batch
    article_id = mid + idx  # string concat uniquely identifies one article

    payload = tools.get_json(data)
    stat = payload.get('appmsgstat', {})

    # Stash the counters on the cached article record.
    cached = WechatAction._article_info[article_id]
    cached['read_num'] = stat.get('read_num')
    cached['like_num'] = stat.get('like_num')

    if not payload.get('comment_enabled'):
        # No comment section -> get_comment will never fire; store now.
        self._wechat_service.add_article_info(
            WechatAction._article_info.pop(article_id))
def deal_article(self, req_url, text):
    """
    Parse an article page (html + embedded js vars) and persist it.

    :param req_url: article url; carries sn and __biz params
    :param text: html response body; falsy means the fetch failed
    :return: the next task from the task manager, or None when the fetch failed
    """
    sn = tools.get_param(req_url, 'sn')
    if not text:
        # Empty response: mark the task failed so it can be retried/skipped.
        self._task_manager.update_article_task_state(sn, -1)
        return None

    selector = Selector(text)
    # SelectorList kept un-extracted so images/html can be pulled from it below.
    content = selector.xpath('//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]')
    title = selector.xpath('//h2[@class="rich_media_title"]/text()').extract_first(default='').strip()
    account = selector.xpath('//a[@id="js_name"]/text()').extract_first(default='').strip()
    author = selector.xpath('//span[@class="rich_media_meta rich_media_meta_text"]//text()').extract_first(default='').strip()

    # Publish time is embedded as a 10-digit unix timestamp.
    # FIX: raw strings — '\d' in a plain string is an invalid escape sequence.
    publish_timestamp = selector.re_first(r'n="(\d{10})"')
    publish_timestamp = int(publish_timestamp) if publish_timestamp else None
    publish_time = tools.timestamp_to_date(publish_timestamp) if publish_timestamp else None

    pics_url = content.xpath('.//img/@src|.//img/@data-src').extract()
    biz = tools.get_param(req_url, '__biz')
    # Metadata lives in inline js vars on the page.
    digest = selector.re_first('var msg_desc = "(.*?)"')
    cover = selector.re_first('var cover = "(.*?)";') or selector.re_first('msg_cdn_url = "(.*?)"')
    source_url = selector.re_first("var msg_source_url = '(.*?)';")
    content_html = content.extract_first(default='')
    comment_id = selector.re_first(r'var comment_id = "(\d+)"')

    article_data = {
        'account': account,
        'title': title,
        'url': req_url,
        'author': author,
        'publish_time': publish_time,
        '__biz': biz,
        'digest': digest,
        'cover': cover,
        "pics_url": pics_url,
        "content_html": content_html,
        "source_url": source_url,
        "comment_id": comment_id,
        "sn": sn,
        "spider_time": tools.get_current_date()
    }

    # Persist; only mark the task done when the save reports success.
    if article_data and data_pipeline.save_article(article_data) is not None:
        self._task_manager.update_article_task_state(sn, 1)

    return self._task_manager.get_task()
def get_article_content(self, data, req_url):
    """Extract an article's html content, cache it, and advance the crawl.

    When the page has real content it is cached (together with the account
    name) and the read/like request is expected to follow. Articles flagged
    as unverified rumours never trigger that request, so they are stored
    immediately with only the <title> as content.
    """
    log.debug('获取文章内容')
    if data:
        # Unverified ("rumour") articles return no content on first fetch and
        # redirect to https://mp.weixin.qq.com/mp/rumor
        req_url = req_url.replace('amp;', '')
        mid = tools.get_param(req_url, 'mid') or tools.get_param(
            req_url, 'appmsgid')  # message id, shared by same-day batch
        idx = tools.get_param(req_url, 'idx') or tools.get_param(
            req_url, 'itemidx')  # 1-based position within the batch
        # mid + idx (string concat) uniquely identifies one article,
        # e.g. mid=2650492260, idx=1 -> article_id=26504922601
        article_id = mid + idx
        # Remember the current article id so the comment handler can find it.
        # NOTE(review): attribute name 'aritcle' is misspelled; other modules
        # may depend on it, so it is kept as-is.
        WechatAction._current_aritcle_id = article_id
        print('当前id' + WechatAction._current_aritcle_id)

        regex = '(<div class="rich_media_content ".*?)<script nonce'
        content = tools.get_info(data, regex, fetch_one=True)
        if content:
            # Cache the article body.
            WechatAction._article_info[article_id]['content'] = content

            # The account name is in the page title.
            regex = '<title>(.*?)</title>'
            account = tools.get_info(data, regex, fetch_one=True)
            WechatAction._article_info[article_id]['account'] = account
        else:
            # Rumour-flagged article: no read/like request will follow,
            # so store the cached record immediately.
            regex = '<title>(.*?)</title>'
            content = tools.get_info(data, regex, fetch_one=True)
            WechatAction._article_info[article_id]['content'] = content

            print('被验证不实的文章,不会请求观看点赞数,此时直接入库')
            WechatAction._wechat_service.add_article_info(
                WechatAction._article_info.pop(article_id))

        # If the only remaining url is the article-list page, refresh its
        # appmsg_token so the list link does not expire.
        if (len(WechatAction._todo_urls) == 1) and ('/mp/profile_ext'
                                                    in WechatAction._todo_urls[-1]):
            regex = 'appmsg_token = "(.*?)"'
            appmsg_token = tools.get_info(data, regex,
                                          fetch_one=True).strip()
            WechatAction._todo_urls[-1] = tools.replace_str(
                WechatAction._todo_urls[-1], 'appmsg_token=.*?&',
                'appmsg_token=%s&' % appmsg_token)

        return self.__open_next_page()
    else:
        # No article content returned.
        pass
def get_comment(self, data, req_url):
    """Attach elected comments to the cached article and flush it to storage."""
    log.debug('获取评论信息')

    url = req_url.replace('amp;', '')
    msg_id = tools.get_param(url, 'appmsgid')  # message id, shared by a day's batch
    position = tools.get_param(url, 'idx')     # 1-based position within the batch
    article_id = msg_id + position  # string concat uniquely identifies one article

    elected = tools.get_json(data).get('elected_comment', [])  # featured comments

    # Comments are the last piece of data, so the record can be stored now.
    WechatAction._article_info[article_id]['comment'] = elected
    WechatAction._wechat_service.add_article_info(
        WechatAction._article_info.pop(article_id))
def __parse_account_info(self, data, req_url):
    """
    Parse public-account profile info out of the history-page html and save it.

    :param data: html of the account's history page
    :param req_url: request url; carries the __biz param
    :return: None
    """
    __biz = tools.get_param(req_url, "__biz")

    regex = 'id="nickname">(.*?)</strong>'
    account = tools.get_info(data, regex, fetch_one=True).strip()

    regex = 'profile_avatar">.*?<img src="(.*?)"'
    head_url = tools.get_info(data, regex, fetch_one=True)

    regex = 'class="profile_desc">(.*?)</p>'
    summary = tools.get_info(data, regex, fetch_one=True).strip()

    # Verification info (absent when viewing history of a followed account).
    regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
    verify = tools.get_info(data, regex, fetch_one=True)
    verify = verify.strip() if verify else ""

    # QR code: the page exposes the account's username in a js var.
    # FIX: raw string — the escaped || would otherwise be an invalid escape.
    regex = r'var username = "" \|\| "(.*?)";'
    qr_code = tools.get_info(data, regex, fetch_one=True)
    # NOTE(review): the original expression here was redacted ("******");
    # reconstructed as username -> qr-code url. Confirm against VCS history.
    qr_code = "http://open.weixin.qq.com/qr/code?username=" + qr_code

    account_data = {
        "__biz": __biz,
        "account": account,
        "head_url": head_url,
        "summary": summary,
        "qr_code": qr_code,
        "verify": verify,
        "spider_time": tools.get_current_date(),
    }

    if account_data:
        data_pipeline.save_account(account_data)
def deal_article_dynamic_info(self, req_data, text):
    """
    Parse an article's dynamic stats: reads, likes, comment count.

    :param req_data: POST request data, str form (carries sn and __biz)
    :param text: json response body
    :return: None
    """
    payload = tools.get_json(text)
    stat = payload.get("appmsgstat", {})

    dynamic_data = {
        "sn": tools.get_param(req_data, "sn"),
        # __biz arrives url-encoded; restore the trailing '='
        "__biz": tools.get_param(req_data, "__biz").replace("%3D", "="),
        "read_num": stat.get("read_num"),
        "like_num": stat.get("like_num"),
        "comment_count": payload.get("comment_count"),
        "spider_time": tools.get_current_date(),
    }

    if dynamic_data:
        data_pipeline.save_article_dynamic(dynamic_data)
def deal_article_dynamic_info(self, req_data, text):
    """
    Extract read/like/comment stats for one article and persist them.

    :param req_data: POST request data, str form (carries sn and __biz)
    :param text: json response body
    :return: None
    """
    resp = tools.get_json(text)
    sn = tools.get_param(req_data, 'sn')
    # __biz arrives url-encoded; restore the trailing '='
    biz = tools.get_param(req_data, '__biz').replace('%3D', '=')
    appmsgstat = resp.get('appmsgstat', {})

    dynamic_data = dict(
        sn=sn,
        __biz=biz,
        read_num=appmsgstat.get('read_num'),
        like_num=appmsgstat.get('like_num'),
        comment_count=resp.get('comment_count'),
        spider_time=tools.get_current_date(),
    )

    if dynamic_data:
        data_pipeline.save_article_dynamic(dynamic_data)
def __parse_account_info(self, data, req_url):
    '''
    @summary: Parse account profile info from the history-page html and store it
              (once per __biz).
    ---------
    @param data: html of the account's history page
    @param req_url: request url; carries the __biz param
    ---------
    @result: None
    '''
    __biz = tools.get_param(req_url, '__biz')
    WechatAction._current_account_biz = __biz  # remember which account is being crawled

    regex = 'id="nickname">(.*?)</strong>'
    account = tools.get_info(data, regex, fetch_one=True).strip()

    regex = 'profile_avatar">.*?<img src="(.*?)"'
    head_url = tools.get_info(data, regex, fetch_one=True)

    regex = 'class="profile_desc">(.*?)</p>'
    summary = tools.get_info(data, regex, fetch_one=True).strip()

    # Verification info (absent when viewing history of a followed account).
    regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
    verify = tools.get_info(data, regex, fetch_one=True)
    verify = verify.strip() if verify else ''

    # QR code: the page exposes the account's username in a js var.
    # FIX: raw string — the escaped || would otherwise be an invalid escape.
    regex = r'var username = "" \|\| "(.*?)";'
    qr_code = tools.get_info(data, regex, fetch_one=True)
    # NOTE(review): the original expression here was redacted ("******");
    # reconstructed as username -> qr-code url. Confirm against VCS history.
    qr_code = 'http://open.weixin.qq.com/qr/code?username=' + qr_code

    account_info = {
        '__biz': __biz,
        'account': account,
        'head_url': head_url,
        'summary': summary,
        'qr_code': qr_code,
        'verify': verify,
        # Pop the pending account_id (if one was queued for this __biz).
        # FIX: membership test on the dict directly instead of .keys().
        'account_id': WechatAction._account_info.pop(__biz)
        if __biz in WechatAction._account_info else '',
        'record_time': tools.get_current_date()
    }

    # Only insert accounts we have not seen before.
    if not WechatAction._wechat_service.is_exist('wechat_account', __biz):
        WechatAction._wechat_service.add_account_info(account_info)
def parse_article_info(article_info, comm_msg_info):
    """Build an article record dict from one list entry.

    Returns the record dict, or None when ``article_info`` is empty or the
    url carries no sn param.

    NOTE(review): ``__biz`` is referenced as a free name — this looks like a
    closure lifted out of a larger scope; confirm before reuse.
    """
    if not article_info:
        return
    # log.debug(tools.dumps_json(article_info))

    title = article_info.get("title")
    digest = article_info.get("digest")
    url = article_info.get("content_url").replace("\\", "").replace("amp;", "")
    source_url = article_info.get("source_url").replace("\\", "")  # quoted source link
    cover = article_info.get("cover").replace("\\", "")
    subtype = article_info.get("subtype")
    is_multi = article_info.get("is_multi")
    author = article_info.get("author")
    copyright_stat = article_info.get("copyright_stat")
    duration = article_info.get("duration")
    del_flag = article_info.get("del_flag")
    msg_type = comm_msg_info.get("type")  # renamed: avoid shadowing builtin `type`
    publish_time = tools.timestamp_to_date(comm_msg_info.get("datetime"))
    sn = tools.get_param(url, "sn")

    if sn:
        # Assemble and return the cached article record.
        return {
            "title": title,
            "digest": digest,
            "url": url,
            "source_url": source_url,
            "cover": cover,
            "subtype": subtype,
            "is_multi": is_multi,
            "author": author,
            "copyright_stat": copyright_stat,
            "duration": duration,
            "del_flag": del_flag,
            "type": msg_type,
            "publish_time": publish_time,
            "sn": sn,
            "__biz": __biz,
            "spider_time": tools.get_current_date(),
        }
def parse_article_info(article_info, comm_msg_info):
    """Turn one article-list entry into a storable record dict (or None).

    NOTE(review): ``__biz`` is a free name here — presumably closed over from
    an enclosing scope; verify.
    """
    if not article_info:
        return
    # log.debug(tools.dumps_json(article_info))

    content_url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
    quoted_url = article_info.get('source_url').replace('\\', '')  # quoted source link
    cover_url = article_info.get('cover').replace('\\', '')
    msg_type = comm_msg_info.get('type')  # renamed: avoid shadowing builtin `type`
    published = tools.timestamp_to_date(comm_msg_info.get('datetime'))
    sn = tools.get_param(content_url, 'sn')

    if not sn:
        return

    article_data = {
        'title': article_info.get('title'),
        'digest': article_info.get('digest'),
        'url': content_url,
        'source_url': quoted_url,
        'cover': cover_url,
        'subtype': article_info.get('subtype'),
        'is_multi': article_info.get('is_multi'),
        'author': article_info.get('author'),
        'copyright_stat': article_info.get('copyright_stat'),
        'duration': article_info.get('duration'),
        'del_flag': article_info.get('del_flag'),
        'type': msg_type,
        'publish_time': published,
        'sn': sn,
        '__biz': __biz,
        'spider_time': tools.get_current_date()
    }
    return article_data
def __parse_account_info(self, data, req_url):
    '''
    @summary: Parse account profile info from the history-page html and save it.
    ---------
    @param data: html of the account's history page
    @param req_url: request url; carries the __biz param
    ---------
    @result: None
    '''
    __biz = tools.get_param(req_url, '__biz')

    regex = 'id="nickname">(.*?)</strong>'
    account = tools.get_info(data, regex, fetch_one=True).strip()

    regex = 'profile_avatar">.*?<img src="(.*?)"'
    head_url = tools.get_info(data, regex, fetch_one=True)

    regex = 'class="profile_desc">(.*?)</p>'
    summary = tools.get_info(data, regex, fetch_one=True).strip()

    # Verification info (absent when viewing history of a followed account).
    regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
    verify = tools.get_info(data, regex, fetch_one=True)
    verify = verify.strip() if verify else ''

    # QR code: the page exposes the account's username in a js var.
    # FIX: raw string — the escaped || would otherwise be an invalid escape.
    regex = r'var username = "" \|\| "(.*?)";'
    qr_code = tools.get_info(data, regex, fetch_one=True)
    # NOTE(review): the original expression here was redacted ("******");
    # reconstructed as username -> qr-code url. Confirm against VCS history.
    qr_code = 'http://open.weixin.qq.com/qr/code?username=' + qr_code

    account_data = {
        '__biz': __biz,
        'account': account,
        'head_url': head_url,
        'summary': summary,
        'qr_code': qr_code,
        'verify': verify,
        'spider_time': tools.get_current_date()
    }

    if account_data:
        data_pipeline.save_account(account_data)
def __parse_article_list(self, article_list, req_url):
    '''
    @summary: Parse an article-list json string (key "list"): each entry has
              comm_msg_info (type/datetime) plus app_msg_ext_info for the
              first article and multi_app_msg_item_list for the rest of the
              same-day batch. Valid articles are cached in
              WechatAction._article_info and their urls queued.
    ---------
    @param article_list: article-list info, json str
    @param req_url: request url; carries the __biz param
    ---------
    @result: None
    '''
    # log.debug(tools.dumps_json(article_list))

    # Parse one article entry out of the json list.
    def parse_article_info(article_info, release_time):
        if not article_info:
            return
        # log.debug(tools.dumps_json(article_info))
        title = article_info.get('title')
        summary = article_info.get('digest')
        url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
        source_url = article_info.get('source_url').replace('\\', '')  # quoted source link
        cover = article_info.get('cover').replace('\\', '')
        author = article_info.get('author')
        # Deleted articles have no url (and no mid) and are not stored;
        # mall-type urls are skipped too.
        if url and url.startswith('http://mp.weixin.qq.com/'):
            mid = tools.get_param(url, 'mid') or tools.get_param(
                url, 'appmsgid')  # message id, shared by a same-day batch
            idx = tools.get_param(url, 'idx') or tools.get_param(
                url, 'itemidx')  # 1-based position within the batch
            # mid + idx (string concat) uniquely identifies one article,
            # e.g. mid=2650492260, idx=1 -> article_id=26504922601
            article_id = mid + idx

            # Stop once a known article (or, optionally, anything older than
            # today) is reached — everything after it is discarded.
            if WechatAction._wechat_service.is_exist(
                    'wechat_article', article_id) or (
                    ONLY_TODAY_MSG and
                    release_time < tools.get_current_date('%Y-%m-%d')):
                self._is_need_get_more = False
                return

            __biz = tools.get_param(url, '__biz')  # links article to account

            # Cache the article record; counts/content are filled in later.
            WechatAction._article_info[article_id] = {
                'article_id': int(article_id),
                'title': title,
                'summary': summary,
                'release_time': release_time,
                'url': url,
                'source_url': source_url,
                'cover': cover,
                'account': '',
                'author': author,
                '__biz': __biz,
                'read_num': None,
                'like_num': None,
                'content': '',
                'comment': [],
                'record_time': tools.get_current_date()
            }

            # Queue the article url for fetching.
            WechatAction._todo_urls.append(url)

    # log.debug(tools.dumps_json(article_list))
    article_list = tools.get_json(article_list)
    article_list = article_list.get('list', [])
    for article in article_list:
        article_type = article.get('comm_msg_info', {}).get('type')
        # Type 49 is the common rich-media message; text/audio/video messages
        # have other formats and are not collected.
        if article_type != 49:
            continue

        release_time = article.get('comm_msg_info', {}).get('datetime')
        release_time = tools.timestamp_to_date(release_time)

        # An account can publish several articles at once:
        # first article of the batch...
        app_msg_ext_info = article.get('app_msg_ext_info', {})
        parse_article_info(app_msg_ext_info, release_time)
        if not self._is_need_get_more:
            break

        # ...then the rest of the same-day batch.
        multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list')
        for multi_app_msg_item in multi_app_msg_item_list:
            parse_article_info(multi_app_msg_item, release_time)
            if not self._is_need_get_more:
                break

    # Queue a callback that marks this account's article count as updated.
    __biz = tools.get_param(req_url, '__biz')  # links articles to the account
    WechatAction._todo_urls.append(lambda: WechatAction._wechat_service.update_account_article_num(__biz))
def get_article_list(self, data, req_url):
    '''
    @summary: Fetch the article list. Two response shapes:
        1. First visit to the history page: html (includes account profile).
        2. Pull-down "load more": json. Either way the article list itself
           is json.
        Strategy:
        1. For the html shape, parse articles directly and build the
           next-page json url.
        2. For the json shape, parse and page via next_offset.
    ---------
    @param data: response body (html or json str)
    @param req_url: request url; carries __biz / pass_ticket / appmsg_token
    ---------
    @result: return value of self.__open_next_page()
    '''
    try:
        # Banned accounts have no article list at all.
        if 'list' in data:
            # html shape: article list embedded in the page.
            if 'action=home' in req_url:
                # Parse the account profile first.
                self.__parse_account_info(data, req_url)

                # Pull the article-list json out of the inline js var.
                regex = "msgList = '(.*?})';"
                article_list = tools.get_info(data, regex, fetch_one=True)
                # NOTE(review): this replace looks like it should decode the
                # html entity '&quot;' to '"' — the pattern appears mangled
                # by a re-encoding pass; verify against VCS history.
                article_list = article_list.replace('"', '"')
                # NOTE(review): called with one argument although
                # __parse_article_list (above) takes (article_list, req_url);
                # these blocks may come from different revisions — verify.
                self.__parse_article_list(article_list)

                # More articles? If none, move on to the next account;
                # otherwise emulate the pull-down "load more".
                regex = "can_msg_continue = '(\d)'"
                can_msg_continue = tools.get_info(data, regex, fetch_one=True)
                if can_msg_continue == '0':  # no more articles
                    pass
                elif self._is_need_get_more:
                    # Build the pull-down (next page) url.
                    # appmsg_token is in the html...
                    regex = 'appmsg_token = "(.*?)";'
                    appmsg_token = tools.get_info(data, regex, fetch_one=True)
                    # ...the other params are in the url.
                    __biz = tools.get_param(req_url, '__biz')
                    pass_ticket = tools.get_param(req_url, 'pass_ticket')
                    next_page_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json'.format(
                        __biz=__biz,
                        offset=10,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token)
                    WechatAction._todo_urls.append(next_page_url)
            else:
                # json shape.
                data = tools.get_json(data)
                article_list = data.get('general_msg_list', {})
                self.__parse_article_list(article_list)

                # More articles? If none, move on to the next account;
                # otherwise keep paging.
                can_msg_continue = data.get('can_msg_continue')
                if not can_msg_continue:  # no more articles
                    pass
                elif self._is_need_get_more:
                    # Build the next-page url; params come from the url,
                    # offset from the json.
                    __biz = tools.get_param(req_url, '__biz')
                    pass_ticket = tools.get_param(req_url, 'pass_ticket')
                    appmsg_token = tools.get_param(req_url, 'appmsg_token')
                    offset = data.get('next_offset', 0)
                    next_page_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json'.format(
                        __biz=__biz,
                        offset=offset,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token)
                    WechatAction._todo_urls.append(next_page_url)
        else:
            # This __biz account is banned.
            pass
    except Exception as e:
        log.error(e)

    return self.__open_next_page()
def deal_article_list(self, req_url, text):
    """
    @summary: Fetch the article list. Two response shapes:
        1. First visit to the history page: html (includes account profile).
        2. Pull-down "load more": json. Either way the article list itself
           is json.
        Strategy:
        1. For the html shape, parse articles directly and build the
           next-page json url.
        2. For the json shape, parse and page via next_offset.
    ---------
    @param req_url: request url; carries __biz / pass_ticket / appmsg_token
    @param text: response body (html or json str)
    ---------
    @result: next task from the task manager
    """
    try:
        # Banned accounts have no article list at all.
        __biz = tools.get_param(req_url, "__biz")
        if "list" in text:
            # html shape: article list embedded in the page.
            if "action=home" in req_url:
                # Parse the account profile first.
                self.__parse_account_info(text, req_url)

                # Pull the article-list json out of the inline js var.
                regex = "msgList = '(.*?})';"
                article_list = tools.get_info(text, regex, fetch_one=True)
                # FIX: decode the html entity before json parsing; the
                # original source had been mangled into replace(""", '"'),
                # which is a syntax error.
                article_list = article_list.replace('&quot;', '"')
                publish_time = self.__parse_article_list(
                    article_list, __biz, is_first_page=True)

                # More articles? If none, finish this account; otherwise
                # emulate the pull-down "load more".
                # FIX: raw string — '\d' in a plain string is an invalid escape.
                regex = r"can_msg_continue = '(\d)'"
                can_msg_continue = tools.get_info(text, regex, fetch_one=True)
                if can_msg_continue == "0":
                    # Reached the bottom of the list.
                    log.info("抓取到列表底部 无更多文章,公众号 {} 抓取完毕".format(__biz))
                    new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                        __biz)
                    if not new_last_publish_time:
                        # No articles at all: flag as a zombie account.
                        log.info("公众号 {} 为僵尸账号 不再监控".format(__biz))
                        self._task_manager.sign_account_is_zombie(__biz)
                    else:
                        self._task_manager.update_account_last_publish_time(
                            __biz, new_last_publish_time)
                elif publish_time:
                    # Build the pull-down (next page) url.
                    # appmsg_token is in the html...
                    regex = 'appmsg_token = "(.*?)";'
                    appmsg_token = tools.get_info(text, regex, fetch_one=True)
                    # ...the other params are in the url.
                    __biz = tools.get_param(req_url, "__biz")
                    pass_ticket = tools.get_param(req_url, "pass_ticket")
                    next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                        __biz=__biz,
                        offset=10,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token,
                    )
                    return self._task_manager.get_task(
                        next_page_url,
                        tip="正在抓取列表 next_offset {} 抓取到 {}".format(
                            10, publish_time),
                    )
            else:
                # json shape.
                text = tools.get_json(text)
                article_list = text.get("general_msg_list", {})
                publish_time = self.__parse_article_list(
                    article_list, __biz)

                # More articles? If none, finish this account; otherwise
                # keep paging.
                can_msg_continue = text.get("can_msg_continue")
                if not can_msg_continue:
                    # Reached the bottom of the list.
                    log.info("抓取到列表底部 无更多文章,公众号 {} 抓取完毕".format(__biz))
                    new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                        __biz)
                    self._task_manager.update_account_last_publish_time(
                        __biz, new_last_publish_time)
                elif publish_time:
                    # Build the next-page url; params come from the url,
                    # offset from the json.
                    __biz = tools.get_param(req_url, "__biz")
                    pass_ticket = tools.get_param(req_url, "pass_ticket")
                    appmsg_token = tools.get_param(req_url, "appmsg_token")
                    offset = text.get("next_offset", 0)
                    next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                        __biz=__biz,
                        offset=offset,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token,
                    )
                    return self._task_manager.get_task(
                        next_page_url,
                        tip="正在抓取列表 next_offset {} 抓取到 {}".format(
                            offset, publish_time),
                    )
        else:
            # This __biz account is banned: flag as zombie.
            self._task_manager.sign_account_is_zombie(__biz)
    except Exception as e:
        log.exception(e)

    return self._task_manager.get_task()
def deal_article(self, req_url, text): """ 解析文章 :param req_url: :param text: :return: """ sn = tools.get_param(req_url, "sn") if not text: self._task_manager.update_article_task_state(sn, -1) return None selector = Selector(text) content = selector.xpath( '//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]' ) title = (selector.xpath('//h2[@class="rich_media_title"]/text()'). extract_first(default="").strip()) account = (selector.xpath('//a[@id="js_name"]/text()').extract_first( default="").strip()) author = (selector.xpath( '//span[@class="rich_media_meta rich_media_meta_text"]//text()'). extract_first(default="").strip()) publish_timestamp = selector.re_first('n="(\d{10})"') publish_timestamp = int( publish_timestamp) if publish_timestamp else None publish_time = (tools.timestamp_to_date(publish_timestamp) if publish_timestamp else None) pics_url = content.xpath(".//img/@src|.//img/@data-src").extract() biz = tools.get_param(req_url, "__biz") digest = selector.re_first('var msg_desc = "(.*?)"') cover = selector.re_first('var cover = "(.*?)";') or selector.re_first( 'msg_cdn_url = "(.*?)"') source_url = selector.re_first("var msg_source_url = '(.*?)';") content_html = content.extract_first(default="") comment_id = selector.re_first('var comment_id = "(\d+)"') article_data = { "account": account, "title": title, "url": req_url, "author": author, "publish_time": publish_time, "__biz": biz, "digest": digest, "cover": cover, "pics_url": pics_url, "content_html": content_html, "source_url": source_url, "comment_id": comment_id, "sn": sn, "spider_time": tools.get_current_date(), } # 入库 if article_data and data_pipeline.save_article( article_data) is not None: self._task_manager.update_article_task_state(sn, 1) return self._task_manager.get_task()