def __init__(self, remote_url, local_save_path, project_path, main_lnk_paths,
             sync_files=None, ignore_files=None):
    '''
    @summary: initialize the code updater
    ---------
    @param remote_url: remote release URL of the code
    @param local_save_path: local path the code is downloaded to
    @param project_path: local project path
    @param main_lnk_paths: shortcut paths of the local project's launch files
    @param sync_files: files to sync; .* syncs everything
    @param ignore_files: files to ignore
    ---------
    @result:
    '''
    self._remote_url = remote_url
    self._local_save_path = local_save_path
    self._project_path = project_path
    self._main_lnk_paths = main_lnk_paths
    # None defaults avoid the shared mutable default-argument pitfall
    self._sync_files = sync_files or []
    self._ignore_files = ignore_files or []
    self._remote_zip_url = ''
    self._tag = ''
    self._zip_path = ''
    self._unpack_path = ''
    self._project_name = tools.get_info(remote_url, '/([^/]*?)/releases', fetch_one=True)
    self._tag_json = tools.get_json(tools.read_file(VERSION_FILE)) or {}
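
# Standalone illustration of the project-name extraction above, using the
# standard re module in place of tools.get_info (a minimal sketch; the sample
# URL is hypothetical):
import re

remote_url = 'https://github.com/example/wechat-spider/releases'
match = re.search(r'/([^/]*?)/releases', remote_url)
project_name = match.group(1) if match else None
assert project_name == 'wechat-spider'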
def deal_comment(self, req_url, text):
    data = tools.get_json(text)

    __biz = tools.get_param(req_url, '__biz')
    comment_id = tools.get_param(req_url, 'comment_id')  # links the comments to the article

    elected_comment = data.get('elected_comment', [])
    comment_datas = [
        dict(
            __biz=__biz,
            comment_id=comment_id,
            nick_name=comment.get('nick_name'),
            logo_url=comment.get('logo_url'),
            content=comment.get('content'),
            create_time=tools.timestamp_to_date(comment.get('create_time')),
            content_id=comment.get('content_id'),
            like_num=comment.get('like_num'),
            is_top=comment.get('is_top'),
            spider_time=tools.get_current_date(),
        )
        for comment in elected_comment
    ]

    if comment_datas:
        data_pipeline.save_article_commnet(comment_datas)
def deal_comment(self, req_url, text):
    """
    Parse comments
    :param req_url:
    :param text:
    :return:
    """
    data = tools.get_json(text)

    __biz = tools.get_param(req_url, "__biz")
    comment_id = tools.get_param(req_url, "comment_id")  # links the comments to the article

    elected_comment = data.get("elected_comment", [])
    comment_datas = [
        dict(
            __biz=__biz,
            comment_id=comment_id,
            nick_name=comment.get("nick_name"),
            logo_url=comment.get("logo_url"),
            content=comment.get("content"),
            create_time=tools.timestamp_to_date(comment.get("create_time")),
            content_id=comment.get("content_id"),
            like_num=comment.get("like_num"),
            is_top=comment.get("is_top"),
            spider_time=tools.get_current_date(),
        )
        for comment in elected_comment
    ]

    if comment_datas:
        data_pipeline.save_article_commnet(comment_datas)
def get_read_watched_count(self, data, req_url):
    '''
    @summary: parse the read and like counts
    ---------
    @param data:
    {
        "advertisement_num": 0,
        "advertisement_info": [],
        "appmsgstat": {
            "show": true,
            "is_login": true,
            "liked": false,
            "read_num": 38785,
            "like_num": 99,
            "ret": 0,
            "real_read_num": 0
        },
        "comment_enabled": 1,
        "reward_head_imgs": [],
        "only_fans_can_comment": false,
        "is_ios_reward_open": 0,
        "base_resp": {
            "wxtoken": 3465907592
        }
    }
    @param req_url:
    ---------
    @result:
    '''
    log.debug('Fetching read and like counts')
    req_url = req_url.replace('amp;', '')

    # 2018-04-13 WeChat update: the URL no longer carries the mid and idx
    # parameters, so article_id can no longer be assembled from them.
    # mid = tools.get_param(req_url, 'mid')  # message id; identical for articles pushed the same day
    # idx = tools.get_param(req_url, 'idx')  # position of the article in the push, starting at 1
    # article_id = mid + idx  # concatenating mid and idx identifies one article,
    #                         # e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601
    article_id = WechatAction._current_aritcle_id  # read directly instead

    data = tools.get_json(data)
    read_num = data.get('appmsgstat', {}).get('read_num')
    like_num = data.get('appmsgstat', {}).get('like_num')

    # Cache the article's read and like counts
    WechatAction._article_info[article_id]['read_num'] = read_num
    WechatAction._article_info[article_id]['like_num'] = like_num

    # if not data.get('comment_enabled'):  # no comment section, so get_comment
    # is never requested; persist the article right away
    WechatAction._wechat_service.add_article_info(
        WechatAction._article_info.pop(article_id))
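
# Standalone illustration of the article_id scheme the comments above
# describe: mid and idx come back from tools.get_param as strings, so the id
# is a string concatenation, not an addition.
mid = '2650492260'  # message id, shared by every article pushed that day
idx = '1'           # position of the article inside the push, 1-based
article_id = mid + idx
assert article_id == '26504922601'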
def __init__(self, table):
    self._record_time = tools.get_json(tools.read_file(SYNC_TIME_FILE)) or {}
    self._compare_keywords = CompareKeywords()
    self._summary = Summary()
    self._emotion = Emotion()
    self._word_cloud = WordCloud()
    self._es = ES()
    self._hot_sync = HotSync()
    self._vip_checked = VipChecked()
    self._table = table
    self._per_record_time_key = '{table}_record_time'.format(table=self._table)
def get_comment(self, data, req_url):
    log.debug('Fetching comment info')
    req_url = req_url.replace('amp;', '')
    mid = tools.get_param(req_url, 'appmsgid')  # message id; identical for articles pushed the same day
    idx = tools.get_param(req_url, 'idx')  # position of the article in the push, starting at 1
    # Concatenating mid and idx identifies one article,
    # e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601
    article_id = mid + idx

    data = tools.get_json(data)
    comment = data.get('elected_comment', [])  # featured comments

    # Cache the article's comments
    WechatAction._article_info[article_id]['comment'] = comment

    WechatAction._wechat_service.add_article_info(
        WechatAction._article_info.pop(article_id))
def __init__(self, table):
    self._sync_time_file = SYNC_TIME_FILE + table + '.txt'
    self._record_time = tools.get_json(tools.read_file(self._sync_time_file)) or {}
    self._compare_keywords = CompareKeywords()
    self._summary = Summary()
    self._emotion = Emotion()
    self._word_cloud = WordCloud()
    self._yqtj_es = ES(YQTJ)
    self._data_pool_es = ES(DATA_POOL)
    self._hot_sync = HotSync()
    self._vip_checked = VipChecked()
    self._province_filter = ProvinceFilter()
    self._table = table
    self._per_record_time_key = '{table}_record_time'.format(table=self._table)

    self._vip_checked.start()
    self._compare_keywords.start()
def deal_article_dynamic_info(self, req_data, text):
    """
    Parse an article's dynamic info: reads, likes, comments
    :param req_data: data of the POST request, str
    :param text:
    :return:
    """
    data = tools.get_json(text)
    dynamic_data = dict(
        sn=tools.get_param(req_data, "sn"),
        __biz=tools.get_param(req_data, "__biz").replace("%3D", "="),
        read_num=data.get("appmsgstat", {}).get("read_num"),
        like_num=data.get("appmsgstat", {}).get("like_num"),
        comment_count=data.get("comment_count"),
        spider_time=tools.get_current_date(),
    )

    if dynamic_data:
        data_pipeline.save_article_dynamic(dynamic_data)
def deal_article_dynamic_info(self, req_data, text):
    """
    Parse an article's dynamic info: reads, likes, comments
    :param req_data: data of the POST request, str
    :param text:
    :return:
    """
    data = tools.get_json(text)
    dynamic_data = dict(
        sn=tools.get_param(req_data, 'sn'),
        __biz=tools.get_param(req_data, '__biz').replace('%3D', '='),
        read_num=data.get('appmsgstat', {}).get('read_num'),
        like_num=data.get('appmsgstat', {}).get('like_num'),
        comment_count=data.get('comment_count'),
        spider_time=tools.get_current_date()
    )

    if dynamic_data:
        data_pipeline.save_article_dynamic(dynamic_data)
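
# The replace('%3D', '=') above undoes a single URL-escaped character. The
# standard-library unquote handles arbitrary percent-encoding and may be the
# safer general-purpose choice (a sketch with a made-up __biz value):
from urllib.parse import unquote

biz = unquote('MzA4MjEyNTA5Mw%3D%3D')
assert biz == 'MzA4MjEyNTA5Mw=='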
def __parse_article_list(self, article_list, __biz, is_first_page=False):
    '''
    @summary: parse the article list
    ---------
    @param article_list: article list info, str
    ---------
    @result: True / None (True: keep crawling; None: stop crawling)
    '''
    # log.debug(tools.dumps_json(article_list))

    # Parse one article's info out of the JSON
    def parse_article_info(article_info, comm_msg_info):
        if not article_info:
            return

        # log.debug(tools.dumps_json(article_info))

        title = article_info.get('title')
        digest = article_info.get('digest')
        url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
        source_url = article_info.get('source_url').replace('\\', '')  # link of the quoted article
        cover = article_info.get('cover').replace('\\', '')
        subtype = article_info.get('subtype')
        is_multi = article_info.get('is_multi')
        author = article_info.get('author')
        copyright_stat = article_info.get('copyright_stat')
        duration = article_info.get('duration')
        del_flag = article_info.get('del_flag')
        type = comm_msg_info.get('type')
        publish_time = tools.timestamp_to_date(comm_msg_info.get('datetime'))
        sn = tools.get_param(url, 'sn')

        if sn:
            # Cache the article info
            article_data = {
                'title': title,
                'digest': digest,
                'url': url,
                'source_url': source_url,
                'cover': cover,
                'subtype': subtype,
                'is_multi': is_multi,
                'author': author,
                'copyright_stat': copyright_stat,
                'duration': duration,
                'del_flag': del_flag,
                'type': type,
                'publish_time': publish_time,
                'sn': sn,
                '__biz': __biz,
                'spider_time': tools.get_current_date()
            }
            return article_data

    # log.debug(tools.dumps_json(article_list))
    article_list = tools.get_json(article_list)

    article_list_data = []
    publish_time = None
    is_need_get_more = True

    article_list = article_list.get('list', [])
    is_first_article = True
    for article in article_list:
        comm_msg_info = article.get('comm_msg_info', {})

        publish_timestamp = comm_msg_info.get('datetime')
        publish_time = tools.timestamp_to_date(publish_timestamp)

        # Record the latest publish time
        if is_first_page and is_first_article:
            self._task_manager.record_new_last_article_publish_time(__biz, publish_time)
            is_first_article = False

            # On the first page, check whether the account has published recently;
            # if not, it is a zombie account
            if publish_timestamp and self._task_manager.is_zombie_account(publish_timestamp):
                log.info('Account {} is a zombie account; monitoring stopped'.format(__biz))
                self._task_manager.sign_account_is_zombie(__biz, publish_time)
                is_need_get_more = False
                break

        # Compare times; if we reached the publish time of the last crawl, stop
        is_reach = self._task_manager.is_reach_last_article_publish_time(__biz, publish_time)
        if is_reach:
            log.info('Reached the last crawled publish time; account {} done'.format(__biz))
            new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
            self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
            is_need_get_more = False
            break
        elif is_reach is None:
            log.info('Account {} was clicked manually at spider startup; '
                     'skipping its history and moving on to the monitored pool'.format(__biz))
            return

        article_type = comm_msg_info.get('type')
        if article_type != 49:
            # Type 49 is the common image-and-text message; text, audio and video
            # messages have inconsistent formats and are not collected
            continue

        # Check whether the publish time is inside the crawl time range
        publish_time_status = self._task_manager.is_in_crawl_time_range(publish_time)
        if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
            log.info('Account {} is past the crawl time range; done'.format(__biz))
            new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
            self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
            is_need_get_more = False
            break
        elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
            log.info('Account {}: publish time {} has not reached the crawl '
                     'time range; skipped'.format(__biz, publish_time))
            continue

        # Inside the time range.
        # An official account can push several articles at once.
        # The first article:
        app_msg_ext_info = article.get('app_msg_ext_info', {})
        article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
        if article_data:
            article_list_data.append(article_data)

        # Articles attached to the same push
        multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list')
        for multi_app_msg_item in multi_app_msg_item_list:
            article_data = parse_article_info(multi_app_msg_item, comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

    if article_list_data:
        data_pipeline.save_article_list(article_list_data)

    if is_need_get_more:
        return publish_time
def get_article_list(self, data, req_url):
    '''
    @summary: fetch the article list. There are two response formats:
        1. The first view of the message history returns HTML that also
           contains the account info.
        2. Pulling down for more returns JSON.
        In both cases the article list itself is JSON with the same format.
        Crawl approach:
        1. For the first format, parse the article content directly and build
           the URL of the next (JSON) page.
        2. For the second format, parse the JSON and build the next page URL
           from next_offset.
    ---------
    @param data:
    ---------
    @result:
    '''
    try:
        # A banned account has no article list
        if 'list' in data:
            # Article list embedded in the HTML
            if 'action=home' in req_url:
                # Parse the account info
                self.__parse_account_info(data, req_url)

                # Parse the article list
                regex = "msgList = '(.*?})';"
                article_list = tools.get_info(data, regex, fetch_one=True)
                article_list = article_list.replace('&quot;', '"')
                self.__parse_article_list(article_list)

                # Check whether there are more articles; if not, move on to the
                # next account, otherwise pull down for more
                regex = r"can_msg_continue = '(\d)'"
                can_msg_continue = tools.get_info(data, regex, fetch_one=True)
                if can_msg_continue == '0':  # no more articles
                    pass
                elif self._is_need_get_more:
                    # Build the pull-down URL for more history articles
                    # appmsg_token is in the HTML
                    regex = 'appmsg_token = "(.*?)";'
                    appmsg_token = tools.get_info(data, regex, fetch_one=True)
                    # The other parameters are in the URL
                    __biz = tools.get_param(req_url, '__biz')
                    pass_ticket = tools.get_param(req_url, 'pass_ticket')
                    next_page_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json'.format(
                        __biz=__biz,
                        offset=10,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token)
                    WechatAction._todo_urls.append(next_page_url)

            else:  # JSON format
                data = tools.get_json(data)
                article_list = data.get('general_msg_list', {})
                self.__parse_article_list(article_list)

                # Check whether there are more articles; if not, move on to the
                # next account, otherwise pull down for more
                can_msg_continue = data.get('can_msg_continue')
                if not can_msg_continue:  # no more articles
                    pass
                elif self._is_need_get_more:
                    # Build the pull-down URL for more history articles
                    # These parameters are in the URL
                    __biz = tools.get_param(req_url, '__biz')
                    pass_ticket = tools.get_param(req_url, 'pass_ticket')
                    appmsg_token = tools.get_param(req_url, 'appmsg_token')
                    # offset is in the JSON
                    offset = data.get('next_offset', 0)
                    next_page_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json'.format(
                        __biz=__biz,
                        offset=offset,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token)
                    WechatAction._todo_urls.append(next_page_url)

        else:
            # This __biz account has been banned
            pass

    except Exception as e:
        log.error(e)

    return self.__open_next_page()
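
# A minimal sketch of the offset pagination the method above drives: each
# JSON page carries next_offset, which is fed back into the next request
# until can_msg_continue is falsy. fetch_page is a hypothetical stand-in for
# the real request issued through WechatAction._todo_urls.
def iter_history_pages(fetch_page, offset=10):
    while True:
        page = fetch_page(offset)  # -> parsed JSON dict
        yield page
        if not page.get('can_msg_continue'):
            break
        offset = page.get('next_offset', offset + 10)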
def __parse_article_list(self, article_list):
    '''
    @summary: parse the article list
    ---------
    @param article_list: article list info, str
    {
        "list":[
            {
                "comm_msg_info":{
                    "id":1000000513,
                    "type":49,
                    "datetime":1511354167,
                    "fakeid":"3082125093",
                    "status":2,
                    "content":""
                },
                "app_msg_ext_info":{
                    "title":"Python 内存优化",
                    "digest":"实际项目中,pythoner更加关注的是Python的性能问题。本文,关注的是Python的内存优化,一般说来,如果不发生内存泄露,运行在服务端的Python代码不用太关心内存,但是如果运行在客户端,那还是有优化的必要。",
                    "content":"",
                    "fileid":505083208,
                    "content_url":"http:\/\/mp.weixin.qq.com\/s?__biz=MzA4MjEyNTA5Mw==&mid=2652566858&idx=1&sn=d2a76f4a601f94d8acc7b436d18e9648&chksm=8464dd00b313541684c14f974325ea6ae725ffc901fd9888cc00d1acdd13619de3297a5d9a35&scene=27#wechat_redirect",
                    "source_url":"http:\/\/www.cnblogs.com\/xybaby\/p\/7488216.html",
                    "cover":"http:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/fhujzoQe7TpODTuicia4geCiaIj1AbZwVQQVbRHy3FhzwMHEvCvtzXVicHTaPEu8jZ2pgkCAgBqEHugYMvzg3tpoww\/0?wx_fmt=jpeg",
                    "subtype":9,
                    "is_multi":1,
                    "multi_app_msg_item_list":[
                        {
                            "title":"面向对象:With the wonder of your love, the sun above always shines",
                            "digest":"With the wonder of your love, the sun above always shines",
                            "content":"",
                            "fileid":505083209,
                            "content_url":"http:\/\/mp.weixin.qq.com\/s?__biz=MzA4MjEyNTA5Mw==&mid=2652566858&idx=2&sn=97f223783da7748080f8103654447c99&chksm=8464dd00b313541601938565a41487ea76209331fd6f4c8996a2ff5572f4fd465de9fa4cbaac&scene=27#wechat_redirect",
                            "source_url":"https:\/\/mp.weixin.qq.com\/s\/_uD9jY4nXQQ6CtA__dsN8w?scene=25#wechat_redirect",
                            "cover":"http:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/fhujzoQe7TpODTuicia4geCiaIj1AbZwVQQ5ukvwH1GPq5zlWxv05WvRiaw6BiaeyGRD1w17nAPGTlQgEvvDuZnB9HA\/0?wx_fmt=jpeg",
                            "author":"",
                            "copyright_stat":101,
                            "del_flag":1
                        }
                    ],
                    "author":"",
                    "copyright_stat":100,
                    "del_flag":1
                }
            }
        ]
    }
    ---------
    @result:
    '''
    # log.debug(tools.dumps_json(article_list))

    # Parse one article's info out of the JSON
    def parse_article_info(article_info, release_time):
        if not article_info:
            return

        # log.debug(tools.dumps_json(article_info))

        title = article_info.get('title')
        summary = article_info.get('digest')
        url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
        source_url = article_info.get('source_url').replace('\\', '')  # link of the quoted article
        cover = article_info.get('cover').replace('\\', '')
        author = article_info.get('author')

        # Articles deleted by the publisher have no url or other info, so no mid
        # can be extracted and they are not stored; mall-type urls are skipped too
        if url and url.startswith('http://mp.weixin.qq.com/'):
            mid = tools.get_param(url, 'mid') or tools.get_param(
                url, 'appmsgid')  # message id; identical for articles pushed the same day
            idx = tools.get_param(url, 'idx') or tools.get_param(
                url, 'itemidx')  # position of the article in the push, starting at 1
            # Concatenating mid and idx identifies one article,
            # e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601
            article_id = mid + idx

            # Check whether the article is already stored
            if WechatAction._wechat_service.is_exist('wechat_article', article_id) or (
                    ONLY_TODAY_MSG and release_time < tools.get_current_date('%Y-%m-%d')):
                self._is_need_get_more = False
                return  # stop here and discard the remaining articles

            __biz = tools.get_param(url, '__biz')  # used to link to the account

            # Cache the article info
            WechatAction._article_info[article_id] = {
                'article_id': int(article_id),
                'title': title,
                'summary': summary,
                'release_time': release_time,
                'url': url,
                'source_url': source_url,
                'cover': cover,
                'account': '',
                'author': author,
                '__biz': __biz,
                'read_num': None,
                'like_num': None,
                'content': '',
                'comment': [],
                'record_time': tools.get_current_date()
            }

            # Queue the article url for crawling
            WechatAction._todo_urls.append(url)

    # log.debug(tools.dumps_json(article_list))
    article_list = tools.get_json(article_list)
    article_list = article_list.get('list', [])
    for article in article_list:
        article_type = article.get('comm_msg_info', {}).get('type')
        if article_type != 49:
            # Type 49 is the common image-and-text message; text, audio and video
            # messages have inconsistent formats and are not collected
            continue

        release_time = article.get('comm_msg_info', {}).get('datetime')
        release_time = tools.timestamp_to_date(release_time)

        # An official account can push several articles at once.
        # The first article:
        app_msg_ext_info = article.get('app_msg_ext_info', {})
        parse_article_info(app_msg_ext_info, release_time)
        if not self._is_need_get_more:
            break

        # Articles attached to the same push
        multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list')
        for multi_app_msg_item in multi_app_msg_item_list:
            parse_article_info(multi_app_msg_item, release_time)
            if not self._is_need_get_more:
                break
def __parse_article_list(self, article_list, __biz, is_first_page=False):
    """
    @summary: parse the article list
    ---------
    @param article_list: article list info, str
    ---------
    @result: True / None (True: keep crawling; None: stop crawling)
    """
    # log.debug(tools.dumps_json(article_list))

    # Parse one article's info out of the JSON
    def parse_article_info(article_info, comm_msg_info):
        if not article_info:
            return

        # log.debug(tools.dumps_json(article_info))

        title = article_info.get("title")
        digest = article_info.get("digest")
        url = article_info.get("content_url").replace("\\", "").replace("amp;", "")
        source_url = article_info.get("source_url").replace("\\", "")  # link of the quoted article
        cover = article_info.get("cover").replace("\\", "")
        subtype = article_info.get("subtype")
        is_multi = article_info.get("is_multi")
        author = article_info.get("author")
        copyright_stat = article_info.get("copyright_stat")
        duration = article_info.get("duration")
        del_flag = article_info.get("del_flag")
        type = comm_msg_info.get("type")
        publish_time = tools.timestamp_to_date(comm_msg_info.get("datetime"))
        sn = tools.get_param(url, "sn")

        if sn:
            # Cache the article info
            article_data = {
                "title": title,
                "digest": digest,
                "url": url,
                "source_url": source_url,
                "cover": cover,
                "subtype": subtype,
                "is_multi": is_multi,
                "author": author,
                "copyright_stat": copyright_stat,
                "duration": duration,
                "del_flag": del_flag,
                "type": type,
                "publish_time": publish_time,
                "sn": sn,
                "__biz": __biz,
                "spider_time": tools.get_current_date(),
            }
            return article_data

    # log.debug(tools.dumps_json(article_list))
    article_list = tools.get_json(article_list)

    article_list_data = []
    publish_time = None
    is_need_get_more = True

    article_list = article_list.get("list", [])
    is_first_article = True
    for article in article_list:
        comm_msg_info = article.get("comm_msg_info", {})

        publish_timestamp = comm_msg_info.get("datetime")
        publish_time = tools.timestamp_to_date(publish_timestamp)

        # Record the latest publish time
        if is_first_page and is_first_article:
            self._task_manager.record_new_last_article_publish_time(__biz, publish_time)
            is_first_article = False

            # On the first page, check whether the account has published recently;
            # if not, it is a zombie account
            if publish_timestamp and self._task_manager.is_zombie_account(publish_timestamp):
                log.info("Account {} is a zombie account; monitoring stopped".format(__biz))
                self._task_manager.sign_account_is_zombie(__biz, publish_time)
                is_need_get_more = False
                break

        # Compare times; if we reached the publish time of the last crawl, stop
        is_reach = self._task_manager.is_reach_last_article_publish_time(__biz, publish_time)
        if is_reach:
            log.info("Reached the last crawled publish time; account {} done".format(__biz))
            new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
            self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
            is_need_get_more = False
            break
        elif is_reach is None:
            log.info("Account {} was clicked manually at spider startup; "
                     "skipping its history and moving on to the monitored pool".format(__biz))
            return

        article_type = comm_msg_info.get("type")
        if article_type != 49:
            # Type 49 is the common image-and-text message; text, audio and video
            # messages have inconsistent formats and are not collected
            continue

        # Check whether the publish time is inside the crawl time range
        publish_time_status = self._task_manager.is_in_crawl_time_range(__biz, publish_time)
        if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
            log.info("Account {} is past the crawl time range; done".format(__biz))
            new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
            self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
            is_need_get_more = False
            break
        elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
            log.info("Account {}: publish time {} has not reached the crawl "
                     "time range; skipped".format(__biz, publish_time))
            continue

        # Inside the time range.
        # An official account can push several articles at once.
        # The first article:
        app_msg_ext_info = article.get("app_msg_ext_info", {})
        article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
        if article_data:
            article_list_data.append(article_data)

        # Articles attached to the same push
        multi_app_msg_item_list = app_msg_ext_info.get("multi_app_msg_item_list")
        for multi_app_msg_item in multi_app_msg_item_list:
            article_data = parse_article_info(multi_app_msg_item, comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

    if article_list_data:
        data_pipeline.save_article_list(article_list_data)

    if is_need_get_more:
        return publish_time
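
# Standalone illustration of query-parameter extraction such as
# tools.get_param(url, "sn") above, using only the standard library (the
# sample URL is made up):
from urllib.parse import urlparse, parse_qs

url = "http://mp.weixin.qq.com/s?__biz=MzA4MjEyNTA5Mw==&mid=2650492260&idx=1&sn=d2a76f4a"
params = parse_qs(urlparse(url).query)
sn = params.get("sn", [None])[0]
assert sn == "d2a76f4a"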
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
        "Host": "weixin.sogou.com",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # Get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # Parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers=headers, proxies=proxies)
    # print(html)
    html, request = tools.get_html_by_requests(root_url, headers=headers, proxies=proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    print(root_url)
    log.debug('Fetching article link; captcha prompt: ' + check_info)
    if check_info:  # hit a captcha page; retry later
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # Account info block
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one=True)
    # url
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one=True)
    account_url = account_url.replace('&amp;', '&')
    log.debug('account_url = ' + account_url)

    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # Proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # using a proxy triggers captchas, so no proxy for now

    html, request = tools.get_html_by_requests(account_url, headers=headers, proxies=proxies)

    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('''
        Fetching article details  %s
        url                       %s
        request.headers           %s
        ''' % (check_info, account_url, request.headers))

    # print(html)
    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one=True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', {})
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title': title})
        if is_have:
            log.debug(title + ' already exists')
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # Download the cover image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + '/' + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;', '&')

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one=True)

        # Download the images inside the content, then replace the original
        # image addresses in the content with the local paths
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            # The extension comes from the wx_fmt query parameter, defaulting to jpg
            if 'wx_fmt=' in image:
                fmt_start = image.find('wx_fmt=') + len('wx_fmt=')
                fmt_end = image.find('&', fmt_start)
                ext = image[fmt_start:fmt_end if fmt_end != -1 else None]
            else:
                ext = 'jpg'
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + '/' + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.' + ext
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
            tools.delay_time(5)

        # Sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find(
                'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t '
                'where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

        # Violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find(
                'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t '
                'where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                if base_parser.is_violate(title + tools.del_html_tag(content),
                                          key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            title             %s
            summary           %s
            image url         %s
            article url       %s
            release time      %s
            content           %s
            local image path  %s
            violation status  %s
            sensitive event   %s
            image review urls %s
            ''' % (title, summary, image_url, article_url, release_time, content,
                   local_image_url, violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info(
            'WWA_wechat_article', site_id, official_accounts_id, title, summary,
            image_url, article_url, release_time, content, video_url='',
            local_image_url=local_image_url, violate_status=violate_id,
            sensitive_id=sensitive_id, sexy_image_url=sexy_image_url)

        # Articles released the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for article in oneday_article_list:
            title = tools.get_json_value(article, 'title')
            summary = tools.get_json_value(article, 'digest')
            image_url = tools.get_json_value(article, 'cover')

            sexy_image_url = []

            # Download the cover image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + '/' + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;', '&')

            content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one=True)

            # Download the images inside the content, then replace the original
            # image addresses in the content with the local paths
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                if 'wx_fmt=' in image:
                    fmt_start = image.find('wx_fmt=') + len('wx_fmt=')
                    fmt_end = image.find('&', fmt_start)
                    ext = image[fmt_start:fmt_end if fmt_end != -1 else None]
                else:
                    ext = 'jpg'
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + '/' + tools.get_current_date(
                        date_format='%Y%m%d%H%M%S.%f') + '.' + ext
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
                tools.delay_time(5)

            # Sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

            # Violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []
                if base_parser.is_violate(title + tools.del_html_tag(content),
                                          key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
                title             %s
                summary           %s
                image url         %s
                article url       %s
                release time      %s
                content           %s
                local image path  %s
                violation status  %s
                sensitive event   %s
                image review urls %s
                ''' % (title, summary, image_url, article_url, release_time, content,
                       local_image_url, violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info(
                'WWA_wechat_article', site_id, official_accounts_id, title, summary,
                image_url, article_url, release_time, content, video_url='',
                local_image_url=local_image_url, violate_status=violate_id,
                sensitive_id=sensitive_id, sexy_image_url=sexy_image_url)

    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
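
# Standalone illustration of pulling the embedded msgList JSON out of the
# history-page HTML, as the parsers above do with tools.get_info (the HTML
# snippet is a minimal stand-in):
import re

html = "var msgList = '{&quot;list&quot;:[]}';"
match = re.search(r"msgList = '(.*?})';", html)
article_list = match.group(1).replace('&quot;', '"') if match else None
assert article_list == '{"list":[]}'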
def deal_article_list(self, req_url, text):
    """
    @summary: fetch the article list. There are two response formats:
        1. The first view of the message history returns HTML that also
           contains the account info.
        2. Pulling down for more returns JSON.
        In both cases the article list itself is JSON with the same format.
        Crawl approach:
        1. For the first format, parse the article content directly and build
           the URL of the next (JSON) page.
        2. For the second format, parse the JSON and build the next page URL
           from next_offset.
    ---------
    @param data:
    ---------
    @result:
    """
    try:
        # A banned account has no article list
        __biz = tools.get_param(req_url, "__biz")
        if "list" in text:
            # Article list embedded in the HTML
            if "action=home" in req_url:
                # Parse the account info
                self.__parse_account_info(text, req_url)

                # Parse the article list
                regex = "msgList = '(.*?})';"
                article_list = tools.get_info(text, regex, fetch_one=True)
                article_list = article_list.replace("&quot;", '"')
                publish_time = self.__parse_article_list(
                    article_list, __biz, is_first_page=True)

                # Check whether there are more articles; if not, move on to the
                # next account, otherwise pull down for more
                regex = r"can_msg_continue = '(\d)'"
                can_msg_continue = tools.get_info(text, regex, fetch_one=True)
                if can_msg_continue == "0":  # no more articles
                    log.info("Reached the bottom of the list; account {} done".format(__biz))
                    new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
                    if not new_last_publish_time:
                        # Mark as a zombie account
                        log.info("Account {} is a zombie account; monitoring stopped".format(__biz))
                        self._task_manager.sign_account_is_zombie(__biz)
                    else:
                        self._task_manager.update_account_last_publish_time(
                            __biz, new_last_publish_time)

                elif publish_time:
                    # Build the pull-down URL for more history articles
                    # appmsg_token is in the HTML
                    regex = 'appmsg_token = "(.*?)";'
                    appmsg_token = tools.get_info(text, regex, fetch_one=True)
                    # The other parameters are in the URL
                    __biz = tools.get_param(req_url, "__biz")
                    pass_ticket = tools.get_param(req_url, "pass_ticket")
                    next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                        __biz=__biz,
                        offset=10,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token,
                    )
                    return self._task_manager.get_task(
                        next_page_url,
                        tip="Crawling list; next_offset {} reached {}".format(10, publish_time),
                    )

            else:  # JSON format
                text = tools.get_json(text)
                article_list = text.get("general_msg_list", {})
                publish_time = self.__parse_article_list(article_list, __biz)

                # Check whether there are more articles; if not, move on to the
                # next account, otherwise pull down for more
                can_msg_continue = text.get("can_msg_continue")
                if not can_msg_continue:  # no more articles
                    log.info("Reached the bottom of the list; account {} done".format(__biz))
                    new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
                    self._task_manager.update_account_last_publish_time(
                        __biz, new_last_publish_time)

                elif publish_time:
                    # Build the pull-down URL for more history articles
                    # These parameters are in the URL
                    __biz = tools.get_param(req_url, "__biz")
                    pass_ticket = tools.get_param(req_url, "pass_ticket")
                    appmsg_token = tools.get_param(req_url, "appmsg_token")
                    # offset is in the JSON
                    offset = text.get("next_offset", 0)
                    next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                        __biz=__biz,
                        offset=offset,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token,
                    )
                    return self._task_manager.get_task(
                        next_page_url,
                        tip="Crawling list; next_offset {} reached {}".format(offset, publish_time),
                    )

        else:
            # This __biz account has been banned
            self._task_manager.sign_account_is_zombie(__biz)

    except Exception as e:
        log.exception(e)

    return self._task_manager.get_task()
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    def get_tkey(t):
        def ror(val, key):
            i = 0
            while i < key:
                val = (0x7fffffff & (val >> 1)) | ((val & 1) << 31)
                i += 1
            return val

        key = 185025305
        val = ror(t, key % 17)
        val = val ^ key
        return val

    def getdownload(episode_download_url_json):
        episode_json = tools.get_json_by_requests(episode_download_url_json)
        # print(episode_download_url_json)
        episode_download_url = tools.get_json_value(episode_json, 'msgs.playurl.domain')
        episode_download_url = episode_download_url and episode_download_url[0] or ''
        # print('-----', episode_download_url)
        episode_download_url_definition = tools.get_json_value(
            episode_json, 'msgs.playurl.dispatch.1080p')
        episode_download_url_definition = (
            episode_download_url_definition and episode_download_url_definition[0] or '')
        print(episode_download_url_definition, '*********')
        episode_download_url = episode_download_url + episode_download_url_definition
        episode_download_url += (
            "&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux&tag=letv"
            "&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051&rateid={}".format(
                random.random(), '1080p'))
        episode_download_url_json = tools.get_json_by_requests(episode_download_url)
        episode_download_url = tools.get_json_value(episode_download_url_json, 'location')
        return episode_download_url

    if depth == 0:
        cs_regex = 'cs(.*?)_'
        o_regex = 'cs.*?_o(.*?)_p'
        cs = tools.get_info(source_url, cs_regex)
        cs_value = cs and cs[0] or ''
        o = tools.get_info(source_url, o_regex)
        o_value = o and o[0] or ''
        # print('1' + o_value + '2', '***', cs_value)
        url = 'http://list.le.com/apin/chandata.json?cs=' + cs_value + '&_o=' + o_value + '&_p='
        base_parser.add_url('PROGRAM_urls', site_id, url, depth + 1)

    if depth == 1:
        page = '1'

        # TV series
        if 'cs=2' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                json_list = tools.get_json_value(json, 'album_list')
                # print(source_url)
                for info in json_list:
                    image_url = tools.get_json_value(info, 'images.1080*608')
                    program_name = tools.get_json_value(info, 'name')
                    program_url = tools.get_json_value(info, 'aid')
                    program_url = 'http://www.le.com/tv/' + program_url + '.html'
                    episode = tools.get_json_value(info, 'nowEpisodes')

                    directors = tools.get_json_value(info, 'directory')
                    # print(type(directors))
                    directors = ','.join(tools.get_json(directors).values())

                    actors = tools.get_json_value(info, 'starring')
                    actors = ' '.join(actors.values())

                    summary = tools.get_json_value(info, 'description')

                    release_time = tools.get_json_value(info, 'releaseDate')
                    release_time = int(release_time) / 1000  # milliseconds to seconds
                    x = time.localtime(release_time)
                    release_time = time.strftime("%Y-%m-%d", x)

                    log.debug('''
                        depth        = %s
                        program_name = %s
                        program_url  = %s
                        image_url    = %s
                        episode      = %s
                        directors    = %s
                        actors       = %s
                        summary      = %s
                        release_time = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time)

                    episode_url = tools.get_json_value(info, 'vids')
                    episode_url = episode_url + ','
                    regex = r'(\d*?),'
                    episode_urls = tools.get_info(episode_url, regex)
                    for episode_url_num in episode_urls:
                        episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'
                        episode_download_url_json = (
                            'http://player-pc.le.com/mms/out/video/playJson'
                            '?id={}&platid=1&splatid=101&format=1&tkey={}'
                            '&domain=www.le.com&dvtype=1000'
                            '&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825'
                            '&region=cn&source=1000&accessyx=1'.format(
                                episode_url_num, get_tkey(int(time.time()))))

                        episode_json = tools.get_json_by_requests(episode_download_url_json)
                        episode_image_url = tools.get_json_value(episode_json, 'msgs.playurl.pic')
                        episode_name = tools.get_json_value(episode_json, 'msgs.playurl.title')

                        episode_num_regex = r"(\d*?)"
                        episode_num = tools.get_info(episode_name, episode_num_regex)
                        episode_num = episode_num and episode_num[0] or ''

                        episode_download_url = getdownload(episode_download_url_json)

                        time_length = ''
                        episode_summary = ''
                        download_status = ''

                        log.debug('''
                            depth             = %s
                            episode_num       = %s
                            time_length       = %s
                            episode_name      = %s
                            episode_url       = %s
                            download_url      = %s
                            episode_summary   = %s
                            episode_image_url = %s
                            ''' % (depth, episode_num, time_length, episode_name,
                                   episode_url, episode_download_url,
                                   episode_summary, episode_image_url))

                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                            time_length, episode_name, download_status,
                            episode_download_url, episode_url, episode_summary,
                            episode_image_url, '')

                page = str(int(page) + 1)
                if not json_list:
                    return False

        # Sports
        if 'cs=4' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')

                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '第(.*?)期'
                    episode_num = tools.get_info(episode_num, regex_episode_num)
                    episode_num = ''.join(episode_num)

                    episode_summary = tools.get_json_value(info, 'description')
                    episode_image_url = tools.get_json_value(info, 'images.1080*608')

                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://sports.le.com/video/' + episode_url_num + '.html'

                    # The vid is the id used in the playJson request
                    episode_download_url_json = (
                        'http://player-pc.le.com/mms/out/video/playJson'
                        '?id={}&platid=1&splatid=101&format=1&tkey={}'
                        '&domain=www.le.com&dvtype=1000'
                        '&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825'
                        '&region=cn&source=1000&accessyx=1'.format(
                            episode_url_num, get_tkey(int(time.time()))))
                    episode_download_url = getdownload(episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')
                    summary = ''
                    program_url = ''
                    actors = ''
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    download_status = 102
                    time_length = ''

                    log.debug('''
                        depth        = %s
                        program_name = %s
                        program_url  = %s
                        image_url    = %s
                        episode      = %s
                        directors    = %s
                        actors       = %s
                        summary      = %s
                        release_time = %s
                        aid          = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time,
                        _id=aid)

                    log.debug('''
                        depth             = %s
                        episode_num       = %s
                        time_length       = %s
                        episode_name      = %s
                        episode_url       = %s
                        download_url      = %s
                        episode_summary   = %s
                        episode_image_url = %s
                        ''' % (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url,
                               episode_summary, episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode,
                        time_length, program_name, download_status,
                        episode_download_url, program_url, summary, image_url, '')

                page = str(int(page) + 1)
                if not video_list:  # fixed: was json_list, which is undefined in this branch
                    return False

        # Variety shows
        if 'cs=11' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')

                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '第(.*?)期'
                    episode_num = tools.get_info(episode_num, regex_episode_num)
                    episode_num = ''.join(episode_num)

                    episode_summary = tools.get_json_value(info, 'description')
                    episode_image_url = tools.get_json_value(info, 'images.1080*608')

                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'

                    # The vid is the id used in the playJson request
                    episode_download_url_json = (
                        'http://player-pc.le.com/mms/out/video/playJson'
                        '?id={}&platid=1&splatid=101&format=1&tkey={}'
                        '&domain=www.le.com&dvtype=1000'
                        '&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825'
                        '&region=cn&source=1000&accessyx=1'.format(
                            episode_url_num, get_tkey(int(time.time()))))
                    episode_download_url = getdownload(episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')
                    summary = ''
                    actors = ''
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    program_url = ''
                    download_status = 102
                    time_length = ''

                    log.debug('''
                        depth        = %s
                        program_name = %s
                        program_url  = %s
                        image_url    = %s
                        episode      = %s
                        directors    = %s
                        actors       = %s
                        summary      = %s
                        release_time = %s
                        aid          = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time,
                        _id=aid)

                    log.debug('''
                        depth             = %s
                        episode_num       = %s
                        time_length       = %s
                        episode_name      = %s
                        episode_url       = %s
                        download_url      = %s
                        episode_summary   = %s
                        episode_image_url = %s
                        ''' % (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url,
                               episode_summary, episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                        time_length, episode_name, download_status,
                        episode_download_url, episode_url, episode_summary,
                        episode_image_url, '')

                page = str(int(page) + 1)
                if not video_list:
                    return False

        # Music
        if 'cs=9' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')

                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = r'(\d*?):'
                    episode_num = tools.get_info(episode_num, regex_episode_num)
                    episode_num = ''.join(episode_num)

                    episode_summary = tools.get_json_value(info, 'description')
                    episode_image_url = tools.get_json_value(info, 'images.1080*608')

                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'

                    # The vid is the id used in the playJson request
                    episode_download_url_json = (
                        'http://player-pc.le.com/mms/out/video/playJson'
                        '?id={}&platid=1&splatid=101&format=1&tkey={}'
                        '&domain=www.le.com&dvtype=1000'
                        '&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825'
                        '&region=cn&source=1000&accessyx=1'.format(
                            episode_url_num, get_tkey(int(time.time()))))
                    episode_download_url = getdownload(episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')
                    summary = ''
                    actors = tools.get_json_value(info, 'actor').values()
                    actors = ''.join(actors)
                    # print('**********', actors)
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    program_url = ''
                    download_status = 102
                    time_length = ''

                    log.debug('''
                        depth        = %s
                        program_name = %s
                        program_url  = %s
                        image_url    = %s
                        episode      = %s
                        directors    = %s
                        actors       = %s
                        summary      = %s
                        release_time = %s
                        aid          = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time,
                        _id=aid)

                    log.debug('''
                        depth             = %s
                        episode_num       = %s
                        time_length       = %s
                        episode_name      = %s
                        episode_url       = %s
                        download_url      = %s
                        episode_summary   = %s
                        episode_image_url = %s
                        ''' % (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url,
                               episode_summary, episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                        time_length, episode_name, download_status,
                        episode_download_url, episode_url, episode_summary,
                        episode_image_url, '')

                page = str(int(page) + 1)
                if not video_list:
                    return False

        # Movies
        if 'cs=1' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                json_list = tools.get_json_value(json, 'album_list')
                # print(source_url)
                for info in json_list:
                    image_url = tools.get_json_value(info, 'images.1080*608')
                    program_name = tools.get_json_value(info, 'name')
                    program_url = tools.get_json_value(info, 'aid')
                    program_url = 'http://www.le.com/movie/' + program_url + '.html'
                    episode = ' '

                    directors = tools.get_json_value(info, 'directory')
                    directors = ','.join(tools.get_json(directors).values())

                    actors = tools.get_json_value(info, 'starring')
                    actors = ' '.join(actors.values())

                    summary = tools.get_json_value(info, 'description')

                    release_time = tools.get_json_value(info, 'releaseDate')
                    release_time = int(release_time) / 1000  # milliseconds to seconds
                    x = time.localtime(release_time)
                    release_time = time.strftime("%Y-%m-%d", x)

                    log.debug('''
                        depth        = %s
                        program_name = %s
                        program_url  = %s
                        image_url    = %s
                        episode      = %s
                        directors    = %s
                        actors       = %s
                        summary      = %s
                        release_time = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time)

                    episode_url = tools.get_json_value(info, 'vids')
                    episode_url = episode_url + ','
                    regex = '(.*?),'
                    episode_urls = tools.get_info(episode_url, regex)
                    for episode_url_num in episode_urls:
                        episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'
                        episode_download_url_json = (
                            'http://player-pc.le.com/mms/out/video/playJson'
                            '?id={}&platid=1&splatid=101&format=1&tkey={}'
                            '&domain=www.le.com&dvtype=1000'
                            '&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825'
                            '&region=cn&source=1000&accessyx=1'.format(
                                episode_url_num, get_tkey(int(time.time()))))
                        print(episode_download_url_json)

                        episode_json = tools.get_json_by_requests(episode_download_url_json)
                        episode_image_url = tools.get_json_value(episode_json, 'msgs.playurl.pic')
                        episode_name = tools.get_json_value(episode_json, 'msgs.playurl.title')

                        episode_num_regex = "第(.*?)期"
                        episode_num = tools.get_info(episode_name, episode_num_regex)
                        episode_num = episode_num and episode_num[0] or ''

                        episode_download_url = getdownload(episode_download_url_json)

                        time_length = ''
                        episode_summary = ''
                        download_status = ''

                        log.debug('''
                            depth             = %s
                            episode_num       = %s
                            time_length       = %s
                            episode_name      = %s
                            episode_url       = %s
                            download_url      = %s
                            episode_summary   = %s
                            episode_image_url = %s
                            ''' % (depth, episode_num, time_length, episode_name,
                                   episode_url, episode_download_url,
                                   episode_summary, episode_image_url))

                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                            time_length, episode_name, download_status,
                            episode_download_url, episode_url, episode_summary,
                            episode_image_url, '')

                page = str(int(page) + 1)
                if not json_list:
                    return False
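
# Self-contained copy of the tkey computation used in the playJson URLs
# above: rotate the timestamp right by key % 17 bits within 32 bits, then
# XOR with the key.
import time

def tkey(t):
    key = 185025305
    for _ in range(key % 17):
        t = (0x7fffffff & (t >> 1)) | ((t & 1) << 31)
    return t ^ key

print(tkey(int(time.time())))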
def parser_episode_info(url_info):
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_mongo_id = remark['program_mongo_id']
    classify = remark['classify']

    if classify == '电视剧':  # TV series
        episode_url = root_url
        episode_name = remark['program_name'] + '_' + remark['episode_num']
        episode_num = remark['episode_num']

        # vid, e.g. http://v.qq.com/x/cover/nuijxf6k13t6z9b/u0023ickfto.html -> u0023ickfto
        vid = root_url[root_url.rfind('/') + 1:root_url.rfind('.')]
        download_url = get_download_url(vid)
        download_url = '^_^'.join(download_url)
        image_url = 'http://puui.qpic.cn/vpic/0/%s_160_90_3.jpg/0' % vid

        log.debug('''
            episode_name %s
            episode_num  %s
            episode_url  %s
            image_url    %s
            download_url %s
            ''' % (episode_name, episode_num, episode_url, image_url, download_url))

        base_parser.add_program_episode_info(
            'PROGRAM_EPISODE_info', site_id, program_mongo_id,
            episode_num=episode_num, time_length='', episode_name=episode_name,
            download_status='', download_url=download_url, episode_url=episode_url,
            summary='', image_url=image_url, sto_path='')

    elif classify == '综艺':  # variety shows; uses you-get's qq module
        html, request = tools.get_html_by_requests(root_url)
        if not html:
            base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
            return

        episodes_json = html[len('QZOutputJson='):-1]
        # print(episodes_json)
        episodes_json = tools.get_json(episodes_json)
        episodes = episodes_json.get('PlaylistItem', {}).get('videoPlayList', [])
        for episode in episodes:
            try:
                image_url = episode['pic']
                # e.g. http://v.qq.com/x/cover/3e70vfbgrss48n8/e00237dneke.html
                episode_url = episode['playUrl']
                episode_name = episode['title']
                vid = episode_url[episode_url.rfind('/') + 1:episode_url.rfind('.html')]

                print('qq_download_by_vid begin')
                import time
                b = time.time()
                download_url = qq.qq_download_by_vid(vid)
                print('qq_download_by_vid end time = %s' % str(time.time() - b))

                download_url = '^_^'.join(download_url)

                log.debug('''
                    episode_name %s
                    image_url    %s
                    episode_url  %s
                    download_url %s
                    ''' % (episode_name, image_url, episode_url, download_url))

                base_parser.add_program_episode_info(
                    'PROGRAM_EPISODE_info', site_id, program_mongo_id,
                    episode_num='', time_length='', episode_name=episode_name,
                    download_status='', download_url=download_url,
                    episode_url=episode_url, summary='', image_url=image_url,
                    sto_path='')
            except Exception as e:
                log.error(e)

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
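
# Standalone illustration of the vid extraction above: the video id is the
# last path segment of the episode URL with its extension stripped.
url = 'http://v.qq.com/x/cover/nuijxf6k13t6z9b/u0023ickfto.html'
vid = url[url.rfind('/') + 1:url.rfind('.')]
assert vid == 'u0023ickfto'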
def _get_per_record_time(self):
    # self._record_time is already a parsed dict (see __init__), so read the
    # key directly instead of re-parsing it with tools.get_json
    news_record_time = self._record_time.get('news_record_time')
    return news_record_time
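
# A minimal sketch of the round trip behind self._record_time, assuming the
# sync-time file holds one JSON object (the file name here is illustrative):
import json

with open('sync_time.txt', 'w', encoding='utf-8') as f:
    json.dump({'news_record_time': '2018-04-13 00:00:00'}, f)

with open('sync_time.txt', encoding='utf-8') as f:
    record_time = json.load(f) or {}

assert record_time.get('news_record_time') == '2018-04-13 00:00:00'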