def load_task(self):
    """Fetch one page of site tasks from Oracle and push it into the task ring buffer.

    Pages through TAB_IOPM_SITE using the class-level cursor ``TaskService._offset``.
    When a round starts (offset == 1) the round statistics are reset and the redis
    url / dupefilter tables are cleared. When the query returns no rows the round
    is finished: totals are logged, the offset is reset to 1 and the method
    recurses to immediately begin the next round.
    """
    if TaskService._offset == 1:
        # A new round begins: reset statistics and clear the url tables.
        log.info('开始新的一轮抓取')
        TaskService._spider_start_timestamp = tools.get_current_timestamp()
        TaskService._total_task_size = 0
        # clear the url tables
        TaskService._redisdb.clear('news:news_urls')
        TaskService._redisdb.clear('news:news_urls_dupefilter')

    # Oracle ROWNUM pagination: selects rows [offset, offset + TASK_BUFFER_SIZE)
    task_sql = ''' select * from (select t.id, t.name, t.position, t.url, t.depth, rownum r from TAB_IOPM_SITE t where classify = 1 and t.mointor_status = 701 and t.position != 35 and rownum < {page_size}) where r >= {offset} '''.format(page_size = TaskService._offset + TASK_BUFFER_SIZE, offset = TaskService._offset)
    TaskService._offset += TASK_BUFFER_SIZE

    print(task_sql)
    tasks = TaskService._db.find(task_sql)
    TaskService._total_task_size += len(tasks)

    if not tasks:
        # Round finished: log totals and elapsed time, then restart from page 1.
        TaskService._spider_end_timestamp = tools.get_current_timestamp()
        log.info('已做完一轮,共处理网站%s个 耗时%s'%(TaskService._total_task_size, tools.seconds_to_h_m_s(TaskService._spider_end_timestamp - TaskService._spider_start_timestamp)))
        TaskService._offset = 1
        self.load_task()

    # NOTE(review): after the recursive call above this still runs with an empty
    # `tasks` list — presumably put_data([]) is a no-op; confirm in RingBuff.
    TaskService._task_ring_buff.put_data(tasks)
def get_download_url(url):
    """Resolve the downloadable mp4 address for an iqiyi video page.

    Scrapes ``tvid`` and ``album_id`` out of the page html, queries the iface2
    download api with them, then follows the dispatcher response to the final
    address.

    @param url: iqiyi video page url
    @result: the download url string, or '' when any step fails

    Fixes vs. original: regex patterns are raw strings (``\d`` in a plain
    string is an invalid escape sequence in Python 3), and the bare ``except:``
    is narrowed to ``except Exception`` so SystemExit/KeyboardInterrupt are not
    swallowed.
    """
    html, r = tools.get_html_by_requests(url)

    # tvid appears either in the player markup or in the list markup
    tvid = re.compile(r'player-tvid="(\d{4,11})"').findall(str(html))
    if not tvid:
        tvid = re.compile(r'list-tvid="(\d{4,11})"').findall(str(html))
    for i in tvid:  # keep the last match as a plain string (original behavior)
        tvid = i

    # album_id can be embedded in several different layouts; try them in order
    album_id = ''.join(re.compile(r'player-albumid="(\d{4,11})"').findall(str(html)))
    if not album_id:
        album_id = ''.join(re.compile(r'list-albumid="(\d{4,11})"').findall(str(html)))
    if not album_id:
        album_id = ''.join(re.compile(r'albumId: ?(\d{4,11}),').findall(str(html)))
    if not album_id:
        album_id = ''.join(re.compile(r'param\[\'albumId\'\] ?= ?"(\d{4,11})"').findall(str(html)))

    current_time = str(tools.get_current_timestamp() * 1000)
    url = 'http://iface2.iqiyi.com/video/3.0/v_download?app_k=8e48946f144759d86a50075555fd5862&app_v=8.1&qyid=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&secure_p=iPhone&secure_v=1&dev_hw=%7B%22cpu%22:%22%22,%22mem%22:%222802%22%7D&net_sts=1&device_id=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&dev_os=10.2.1&dev_ua=iPhone9,2&net_ip=%7B%22country%22:%22%E4%B8%AD%E5%9B%BD%22,%22province%22:%22%E5%8C%97%E4%BA%AC%22,%22city%22:%22%E5%8C%97%E4%BA%AC%22,%22cc%22:%22%E5%9B%BD%E5%86%85%E5%85%B6%E4%BB%96%22,%22area%22:%22%E5%8D%8E%E5%8C%97%22,%22timeout%22:0,%22respcode%22:0%7D&album_id=' + album_id + '&tvid=' + tvid + '&req_times=1&play_core=0&platform_id=12&app_p=iphone&app_t=0&usr_res=16&ppid=1229289410&cookie=53igk5Vn7X1xpazWBjzW2HUN4XGjNSP4aQypF7affdnBUaC6rknOS4dzvIcU1pMm2m2Qfb&lang=zh_CN&app_lm=cn&pps=0&req_sn=' + current_time

    json_ = tools.get_json_by_requests(url, headers=DOWNLOAD_HEADER)

    try:
        # drill into the '1' resolution entry, then follow its dispatcher url
        video_download_url = ''.join(re.compile(r'\'1\': {(.+?)},').findall(str(json_)))
        video_download_url = ''.join(re.compile(r'\'url\': ?\'(.+?)\'').findall(str(video_download_url)))
        video_download_url, r = tools.get_html_by_requests(video_download_url)
        video_download_url = ''.join(re.compile(r'"l":"(.+?)"').findall(str(video_download_url)))
    except Exception:
        # best-effort: any scrape/network failure yields an empty result
        video_download_url = ''

    return video_download_url
def parser_comment(content_id, wall_id, page=1):
    """Crawl one page of comments for an article and recurse to the next page.

    Stops early (no recursion) as soon as ``deal_comment`` reports a comment
    that should not be processed further; otherwise, if the current page had
    any replies, fetches page + 1.

    @param content_id: article/content id the comments belong to
    @param wall_id: iqiyi wall (circle) id
    @param page: 1-based page number
    """
    log.debug('正在爬取第 %s 页文章评论 content_id = %s' % (page, content_id))
    flow_comment_url = 'http://sns-comment.iqiyi.com/v2/comment/get_comments.action?contentid={content_id}&page={page}&authcookie=null&page_size=40&wallId={wall_id}&agenttype=117&t={timestamp_m}'.format(
        content_id=content_id,
        page=page,
        wall_id=wall_id,
        timestamp_m=int(tools.get_current_timestamp() * 1000))

    comment_json = tools.get_json_by_requests(flow_comment_url)
    data = comment_json.get('data', {})

    # totals could be used to decide paging; currently unused
    total_count = data.get('totalCount', 0)
    count = data.get('count', 0)

    replies = data.get('replies', [])
    for reply in replies:
        # a reply may quote another comment; process the quoted one first
        reply_source = reply.get("replySource", {})
        if not deal_comment(reply_source):
            break
        if not deal_comment(reply):
            break
    else:
        # loop completed without break: fetch the next page (if any replies at all)
        if replies:
            parser_comment(content_id, wall_id, page + 1)
def __open_next_page(self):
    '''
    @summary: Jump to the next page (history-article list or next article).
        Returns an html snippet whose injected js redirects the embedded
        browser after a computed sleep interval.
    ---------
    @param __biz:
    @param pass_ticket:
    @param appmsg_token:
    @param offset:
    ---------
    @result: html string containing a setTimeout redirect script
    '''
    is_done = False  # whether one full round is finished
    is_all_done = False  # whether everything is done (today's publications of all accounts crawled)
    if WechatAction._todo_urls:
        # still have queued urls for the current account
        url = WechatAction._todo_urls.popleft()
    else:
        # current account finished: persist its article counters
        WechatAction._wechat_service.update_account_article_num(
            WechatAction._current_account_biz)
        # advance to the next account
        account_id, __biz, is_done, is_all_done = WechatAction._wechat_service.get_next_account(
        )
        WechatAction._account_info[__biz] = account_id or ''
        # url = 'http://mp.weixin.qq.com/mp/getmasssendmsg?__biz=%s#wechat_webview_type=1&wechat_redirect'%__biz
        url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % __biz
        log.debug(''' 下一个公众号 : %s ''' % url)

    # choose the sleep before the injected js redirect fires
    if is_all_done:  # today's articles all crawled: wait until tomorrow
        sleep_time = self.get_next_day_time_interval()
    elif is_done:  # one round done: take a longer rest
        sleep_time = self.get_wait_time()
    elif ONLY_TODAY_MSG and tools.get_current_date(
    ) < tools.get_current_date(
            "%Y-%m-%d"
    ) + ' ' + SPIDER_START_TIME:
        # only today's articles are wanted and it is before the configured
        # start-of-day; accounts rarely publish in the small hours, so wait
        sleep_time = self.get_spider_start_time_interval()
    else:  # one article done: short pause between requests
        sleep_time = self.get_sleep_time()

    # sleep_time is in milliseconds here (hence the / 1000 conversions below)
    log.debug(''' next_page_url : %s is_done: %s is_all_done: %s sleep_time: %s next_start_time %s ''' % (url, is_done, is_all_done, tools.seconds_to_h_m_s(sleep_time / 1000), tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time / 1000)))

    # inject a js snippet that redirects after sleep_time milliseconds
    next_page = "<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
        url, sleep_time)
    return next_page
def get_task(self, url=None, tip=''):
    """Produce the next task page for the embedded browser.

    When ``url`` is given, wraps it directly. Otherwise tries an account
    (list) task first, then an article (detail) task; with no task at all it
    sleeps for the configured no-task interval.

    :param url: explicit url to wrap into a redirect page, optional
    :param tip: status text shown in the returned page
    :return: html snippet whose js redirects (or reloads) after the sleep
    """
    # random pacing between requests
    sleep_time = random.randint(self._spider_interval_min,
                                self._spider_interval_max)

    if not url:
        account_task = self.get_account_task()
        if account_task:
            __biz = account_task.get('__biz')
            last_publish_time = account_task.get('last_publish_time')
            # remember where this account left off, for incremental crawling
            self.record_last_article_publish_time(__biz, last_publish_time)
            tip = '正在抓取列表'
            url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}&scene=124#wechat_redirect'.format(
                __biz)
        else:
            article_task = self.get_article_task()
            if article_task:
                tip = '正在抓取详情'
                url = article_task.get('article_url')
            else:
                # nothing to do: use the (longer) configured idle sleep
                sleep_time = config.get('spider').get('no_task_sleep_time')
                log.info('暂无任务 休眠 {}s'.format(sleep_time))
                tip = '暂无任务 '

    if url:
        # redirect to the task url after sleep_time seconds
        next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.href='{url}';}},{sleep_time_msec});</script>".format(
            tip=tip and tip + ' ',
            sleep_time=sleep_time,
            begin_spider_time=tools.timestamp_to_date(
                tools.get_current_timestamp() + sleep_time),
            url=url,
            sleep_time_msec=sleep_time * 1000)
    else:
        # no url: just reload the current page after the sleep
        next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.reload();}},{sleep_time_msec});</script>".format(
            tip=tip and tip + ' ',
            sleep_time=sleep_time,
            begin_spider_time=tools.timestamp_to_date(
                tools.get_current_timestamp() + sleep_time),
            sleep_time_msec=sleep_time * 1000)

    return next_page
def __open_next_page(self):
    '''
    @summary: Jump to the next page (queued url, next account, or a local
        wait page). Returns an html snippet whose injected js redirects the
        embedded browser after a computed sleep interval.
    ---------
    @param __biz:
    @param pass_ticket:
    @param appmsg_token:
    @param offset:
    ---------
    @result: html string containing a setTimeout redirect script
    '''
    is_done = False  # whether one full round is finished

    url = None
    while WechatAction._todo_urls:
        result = WechatAction._todo_urls.popleft()
        if callable(result):  # queued callback marking an account as finished
            result()  # run the callback
        else:
            url = result
            break

    if not url:
        # queue drained: advance to the next account
        account = WechatAction._wechat_service.get_next_account()
        if account:
            account_id, __biz = account
            WechatAction._account_info[__biz] = account_id or ''
            url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect'%__biz
            log.debug(''' 下一个公众号 : %s '''%url)
        else:
            is_done = True

    # choose the sleep before the injected js redirect fires
    if is_done:  # one round done: take a longer rest
        sleep_time = self.get_wait_time()
    elif ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME:
        # only today's articles are wanted and it is before the configured
        # start-of-day; accounts rarely publish in the small hours, so wait
        sleep_time = self.get_spider_start_time_interval()
    else:  # one article done: short pause between requests
        sleep_time = self.get_sleep_time()

    # sleep_time is in milliseconds (hence the / 1000 conversions)
    tip_sleep_time = tools.seconds_to_h_m_s(sleep_time / 1000)
    tip_next_start_time = tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time / 1000)
    if not url:
        # nothing to open: show the local wait page instead
        url = 'http://localhost:6210/tip/wait?sleep_time={}&next_start_time={}'.format(tip_sleep_time, tip_next_start_time)

    log.debug(''' next_page_url : %s is_done: %s sleep_time: %s next_start_time %s '''%(url, is_done, tip_sleep_time, tip_next_start_time))

    next_page = "休眠 %s 下次刷新时间 %s<script>setTimeout(function(){window.location.href='%s';},%d);</script>"%(tip_sleep_time, tip_next_start_time, url, sleep_time)
    return next_page
def add_root_url(parser_params={}):
    """Seed the WWA_app_urls task table with toutiao feed urls (local news + video)
    going back 30 days, paged by ``max_behot_time``.

    FIX(review): the original body read ``params = 泸州`` — a bare, undefined
    identifier that raises NameError on every call. Restored the request
    parameter dict used by the sibling ``add_root_url`` implementation for the
    same feed endpoint (user_city 泸州). The stale "一天" (one day) comment was
    also wrong: the window is 86400 * 30 seconds, i.e. 30 days.

    @param parser_params: opaque parameters forwarded by the scheduler (logged only)
    """
    log.debug(''' 添加根url parser_params : %s ''' % str(parser_params))

    base_url = 'http://is.snssdk.com/api/news/feed/v51/'
    params = {
        'version_code': '6.0.3',
        'app_name': 'news_article',
        'vid': 'B0DB5DD0-FF94-4773-85B1-EFC11132C2A4',
        'device_id': '34633749953',
        'channel': 'App Store',
        'resolution': '1242*2208',
        'aid': 13,
        'ab_version': '117912,112577,101786,117787,115757,101533,117646,118765,110341,113607,118273,114108,113114,106784,113608,101558,105475,112401,117714,105610,118581,118607,105821,112578,115570,118604,118850,116615,31210,118530,118216,114338',
        'ab_feature': 'z1',
        'openudid': '7064ff7d773ef8efeb5d6a25f62cd3d85035674f',
        'live_sdk_version': '1.6.5',
        'idfv': 'B0DB5DD0-FF94-4773-85_b1-EFC11132C2A4',
        'ac': 'WIFI',
        'os_version': '10.2.1',
        'ssmix': 'a',
        'device_platform': 'iphone',
        'iid': 8954368598,
        'ab_client': 'a1,f2,f7,e1',
        'device_type': 'iPhone 7 Plus',
        'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
        'LBS_status': 'deny',
        'category': 'news_local',
        'city': '',
        'concern_id': '',
        'count': 20,
        'cp': '5089F85eBd4BDq1',
        'detail': 1,
        'image': 1,
        'language': 'zh-Hans-CN',
        'last_refresh_sub_entrance_interval': 1482077184,
        'loc_mode': 0,
        'max_behot_time': 1481063762,
        'refer': 1,
        'strict': 0,
        'tt_from': 'load_more',
        'user_city': '泸州'
    }

    time_interval = ONE_PAGE_TIME_INTERVAL
    content_released_time = tools.get_current_timestamp() - 86400 * 30  # 30 days back
    current_timestamp = tools.get_current_timestamp()
    max_behot_time = current_timestamp

    # walk backwards through the feed, one page interval at a time
    while max_behot_time >= content_released_time:
        max_behot_time -= time_interval
        current_timestamp = current_timestamp + random.randint(60, 300)

        # local (泸州) news articles
        params['category'] = 'news_local'
        params[
            'last_refresh_sub_entrance_interval'] = current_timestamp  # + random.randint(60, 300)
        params['max_behot_time'] = max_behot_time
        url = tools.joint_url(base_url, params)
        base_parser.add_url('WWA_app_urls', SITE_ID, url, remark=NEWS_LOCAL)

        # videos
        params['category'] = 'video'
        params[
            'last_refresh_sub_entrance_interval'] = current_timestamp  # + random.randint(60, 300)
        params['max_behot_time'] = max_behot_time
        url = tools.joint_url(base_url, params)
        base_parser.add_url('WWA_app_urls', SITE_ID, url, remark=VIDEO)
def get_url(time_lenght = 60):
    '''
    @summary: Build the incremental-sync query url for the window
              [last persisted sync time, now].
    ---------
    @param time_lenght: window length in minutes, used only when no previous
                        sync time has been persisted yet
    ---------
    @result: query url (pageNo is left as a %d placeholder for the caller)
    '''
    now = tools.get_current_date()
    # first run: fall back to `time_lenght` minutes before now
    last_sync = tools.read_file(STO_PER_SYNC_TIME) or tools.timestamp_to_date(
        tools.get_current_timestamp() - time_lenght * 60)
    # persist the current time as the start of the next window
    tools.write_file(STO_PER_SYNC_TIME, now)
    return 'http://192.168.60.38:8001/hotspot_al/interface/getCluesDataSearchInfo?pageNo=%d&pageSize=100&updateSTime={per_date}&updateETime={current_date}&sort=5&isDesc=0'.format(per_date = last_sync, current_date = now)
def get_next_day_time_interval(self):
    '''
    @summary: Milliseconds until the spider's start time tomorrow.
        Used once all of today's newly published articles are crawled;
        crawling resumes at SPIDER_START_TIME (e.g. 9:00) the next day.
    ---------
    ---------
    @result: interval in milliseconds
    '''
    next_start = tools.get_tomorrow() + ' ' + SPIDER_START_TIME
    seconds_left = tools.date_to_timestamp(next_start) - tools.get_current_timestamp()
    # callers expect milliseconds
    return seconds_left * 1000
def get_wait_check_account(self):
    '''
    @summary: Fetch accounts that should be (re)checked for new articles.
    ---------
    @param :
    ---------
    @result: list of account rows (id, domain, name, last_release_time, biz)
    '''
    # Accounts already crawled (spider_status = 603) whose latest article was
    # published more than two hours ago get monitored again.
    cutoff = tools.timestamp_to_date(
        tools.get_current_timestamp() - 60 * 60 * 2)
    sql = ''' select t.id, t.domain, t.name, to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'), t.biz from TAB_IOPM_SITE t where t.biz is not null and mointor_status = 701 and t.spider_status = 603 and (t.last_article_release_time is null or t.last_article_release_time <= to_date('{}', 'yyyy-mm-dd hh24:mi:ss')) '''.format(cutoff)
    accounts = self._oracledb.find(sql)

    # No finished accounts and no pending redis tasks: rows still not at 603
    # are probably lost tasks, so re-issue all of them.
    if not accounts and not self._redisdb.sget_count('wechat:account'):
        sql = ''' select t.id, t.domain, t.name, to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'), t.biz from TAB_IOPM_SITE t where t.biz is not null and mointor_status = 701 and t.spider_status != 603 '''
        accounts = self._oracledb.find(sql)

    return accounts
def get_spider_start_time_interval(self):
    '''
    @summary: Milliseconds until today's spider start time.
        When the current time is before SPIDER_START_TIME (e.g. 9:00),
        crawling is deferred until then.
    ---------
    ---------
    @result: interval in milliseconds
    '''
    start_at = tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME
    seconds_left = tools.date_to_timestamp(start_at) - tools.get_current_timestamp()
    # callers expect milliseconds
    return seconds_left * 1000
def add_root_url(parser_params = {}):
    """Seed the urls task table with signed recommendation-feed urls for every
    category, page 0 plus pages 1..60.

    @param parser_params: opaque parameters forwarded by the scheduler (logged only)
    """
    log.debug(''' 添加根url parser_params : %s '''%str(parser_params))

    # (category id, category display name); -1 is the "recommend" feed
    category_infos = [(-1, "推荐"), (10, "体育"), (11, "资讯"), (12, "影视"), (13, "娱乐"), (17,"社会")]
    for cid, cname in category_infos:
        nwtime = tools.get_current_timestamp()
        # recommendation module: sign is derived from the request timestamp
        sign = get_md5_sign(nwtime)
        tj_base_url = 'http://user.xiaoyouzb.net/v3/vod/small_recommend?'
        para = {
            "nwtime": "{}".format(nwtime),
            "sign": "{}".format(sign),
            "type": "1",
            "cateId": "{}".format(cid),
            "pageNum": "0",
            "isFirst": "N",
            "_u": "edac2c15598946bd9ba7bda78a83489c",
            "version": "4.7.0",
            "platform": "android",
            "appx": "yuntu",
            "apppn": "org.fungo.fungolive",
            "enterprise": "0",
            "channel": "tencent",
            "market": "32",
            "os_version": "8.0.0",
            "device_model": "MIX%202",
            "device_code": "780493075490198",
            "udid": "77e2cb72797f20afdcaaa6265872cea9",
            "androidId": "220240afd2e0e640",
            "source": "android",
        }
        tj_url = tj_base_url + urlencode(para)
        # page 0 first ...
        base_parser.add_url('urls', SITE_ID, tj_url, remark={"category_name": cname}, depth=0)
        # ... then pages 1..60 with the same signed query string
        url = tj_url.replace("pageNum=0", "pageNum={}")
        url_pages = [url.format(page) for page in range(1, 61)]
        for url_page in url_pages:
            base_parser.add_url('urls', SITE_ID, url_page, remark={"category_name": cname}, depth=0)
def monitor_cookies(self):
    '''
    @summary: Maintain the cookie pool:
        1. drop cookies whose failure count exceeded the maximum
        2. re-enable cookies that have been idle for 24 hours
    ---------
    ---------
    @result:
    '''
    # 1. drop worn-out cookies
    delete_sql = 'delete from sogou_cookies where un_available_times > %d'%MAX_UN_AVAILABLE_TIMES
    self._sqlite3db.delete(delete_sql)

    # 2. revive cookies that have rested a full day
    revive_sql = ''' update sogou_cookies set is_available = 1 where un_available_time < '%s' '''%(tools.timestamp_to_date(tools.get_current_timestamp() - 24 * 60 * 60 ))
    self._sqlite3db.update(revive_sql)
def juji_parser(url, remark):
    """Parse one iqiyi episode page: scrape episode metadata and the mp4
    download address, store the episode row, then mark the url done.

    @param url: episode page url
    @param remark: the program id this episode belongs to
    """
    program_id = remark
    html, res = tools.get_html_by_requests(url)

    tvid = tools.get_info(
        html, ['player-tvid="(\d{4,11})"', 'list-tvid="(\d{4,11})"'],
        fetch_one=True)

    # pc-side mixin api holds play count / duration / episode order
    pcInfo_url = "http://mixer.video.iqiyi.com/jp/mixin/videos/" + str(tvid)
    # print(pcInfo_url)
    html2, res = tools.get_html_by_requests(pcInfo_url)

    # NOTE(review): the first pattern is missing its closing quote — compare
    # the sibling get_download_url which uses 'player-albumid="(\d{4,11})"';
    # it still captures the digits, just unanchored on the right. Confirm.
    album_id = tools.get_info(html, [
        'player-albumid="(\d{4,11})', 'list-albumid="(\d{4,11})"',
        'albumId: ?(\d{4,11}),', 'param\[\'albumId\'\] ?= ?"(\d{4,11})"'
    ],
                              fetch_one=True)

    episode_name = tools.get_info(html, ['meta.+?"irTitle" content="(.+?)"'],
                                  fetch_one=True)

    image_url = tools.get_info(html,
                               ['<meta property="og:image" content="(.+?)"/>'],
                               fetch_one=True)
    # swap in the 160x90 thumbnail variant
    image_url = image_url.replace('.jpg', '_160_90.jpg')

    play_count = tools.get_info(html2, ['"playCount":(.+?),'], fetch_one=True)
    time_length = tools.get_info(html2, ['"duration":\s*(.+?),'],
                                 fetch_one=True)
    episode_num = tools.get_info(html2, ['"order":\s*(.+?),'], fetch_one=True)

    current_time = tools.get_current_timestamp() * 1000
    current_time = str(current_time)

    # iface2 download api; album_id/tvid scraped above, req_sn is a ms timestamp
    download_json_url = 'http://iface2.iqiyi.com/video/3.0/v_download?app_k=8e48946f144759d86a50075555fd5862&app_v=8.1&qyid=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&secure_p=iPhone&secure_v=1&dev_hw=%7B%22cpu%22:%22%22,%22mem%22:%222802%22%7D&net_sts=1&device_id=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&dev_os=10.2.1&dev_ua=iPhone9,2&net_ip=%7B%22country%22:%22%E4%B8%AD%E5%9B%BD%22,%22province%22:%22%E5%8C%97%E4%BA%AC%22,%22city%22:%22%E5%8C%97%E4%BA%AC%22,%22cc%22:%22%E5%9B%BD%E5%86%85%E5%85%B6%E4%BB%96%22,%22area%22:%22%E5%8D%8E%E5%8C%97%22,%22timeout%22:0,%22respcode%22:0%7D&album_id=' + album_id + '&tvid=' + tvid + '&req_times=1&play_core=0&platform_id=12&app_p=iphone&app_t=0&usr_res=16&ppid=1229289410&cookie=53igk5Vn7X1xpazWBjzW2HUN4XGjNSP4aQypF7affdnBUaC6rknOS4dzvIcU1pMm2m2Qfb&lang=zh_CN&app_lm=cn&pps=0&req_sn=' + current_time

    json_ = tools.get_json_by_requests(download_json_url,
                                       headers=download_header)

    # dig the '1' resolution dispatcher url, then follow it to the final "l" link
    download_url = tools.get_json_value(json_, 'video.mp4_res.1.url')
    download_url, res = tools.get_html_by_requests(download_url)
    download_url = tools.get_info(download_url, ['"l":"(.+?)"'],
                                  fetch_one=True)

    log.debug(''' 节目id: %s 当前集数: %s 本集时长: %s 播放次数: %s 节目名称: %s 下载地址: %s 节目链接: %s 图片地址: %s ''' % (program_id, episode_num, time_length, play_count,
              episode_name, download_url, url, image_url))

    base_parser.add_program_episode_info('PROGRAM_EPISODE_info',
                                         SITE_ID,
                                         program_id=program_id,
                                         episode_num=episode_num,
                                         time_length=time_length,
                                         episode_name=episode_name,
                                         download_url=download_url,
                                         episode_url=url,
                                         image_url=image_url,
                                         play_count=play_count)

    base_parser.update_url('PROGRAM_urls', url, Constance.DONE)
class WechatService():
    """Coordinates wechat account crawling: task queue (redis), article/account
    storage (elasticsearch), status bookkeeping (oracle), and new-article
    detection via sogou-weixin with the wechat public platform as fallback.

    All state is class-level, shared across instances.
    """
    _db = OracleDB()
    _es = ES()
    _redisdb = RedisDB()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()
    _todo_accounts = collections.deque()
    _rownum = 1
    _is_done = False  # one full round finished
    _is_all_done = False  # all accounts' publications of the day crawled
    # wechat_sogou availability + last time it was found blocked
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()
    # wechat_public_platform availability + last time it was found blocked
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        pass

    def __load_todo_account(self):
        # Pull one batch of account tuples from redis into the local queue.
        accounts = WechatService._redisdb.sget('wechat:account', count=1)
        for account in accounts:
            # NOTE(review): eval of a redis payload — assumes the queue is
            # only fed by trusted producers; confirm.
            account = eval(account)
            WechatService._todo_accounts.append(account)

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: Check whether the account has newly published articles.
            Tries sogou-weixin first; on a captcha block it is disabled and
            retried only after TIME_INTERVAL. Falls back to the wechat public
            platform with the same disable/retry policy.
        ---------
        @param account_id:
        @param account_name:
        @param __biz:
        ---------
        @result: one of the constance.* status values (or '' if nothing ran)
        '''
        result = ''
        if WechatService._wechat_sogou_enable:  # sogou-weixin currently usable
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # new articles published: will be crawled
                pass
            elif result == constance.NOT_UPDATE:
                # nothing new
                pass
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                # captcha-blocked: disable sogou and remember when
                WechatService._wechat_sogou_enable = False
                WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp(
                )
        # sogou has been disabled for more than TIME_INTERVAL: try it again
        elif tools.get_current_timestamp(
        ) - WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL:
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # sogou works again
                WechatService._wechat_sogou_enable = True
            elif result == constance.NOT_UPDATE:
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                pass

            # push the retry window forward either way
            WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp(
            )

        # sogou unusable (no result or captcha): fall back to the public platform
        if not result or result == constance.VERIFICATION_CODE:
            if WechatService._wechat_public_platform_enable:
                result = WechatService._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.UPDATE:
                    # new articles published: will be crawled
                    pass
                elif result == constance.NOT_UPDATE:
                    # nothing new
                    pass
                    pass
                elif result == constance.ERROR:
                    # blocked / request failed: disable and remember when
                    WechatService._wechat_public_platform_enable = False
                    WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp(
                    )
            # disabled for more than TIME_INTERVAL: try it again
            elif tools.get_current_timestamp(
            ) - WechatService._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
                result = WechatService._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.UPDATE:
                    # platform works again
                    WechatService._wechat_public_platform_enable = True
                elif result == constance.NOT_UPDATE:
                    # nothing new
                    pass
                    pass
                elif result == constance.ERROR:
                    # still blocked
                    pass

                # push the retry window forward either way
                WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp(
                )

        return result

    def get_next_account(self):
        '''
        @summary: Pop the next account off the queue (refilling from redis
            when empty) and mark it as "being crawled" (spider_status = 602).
        ---------
        ---------
        @result: (account_id, biz) tuple, or None when no account is available.
            NOTE(review): the original docstring said "(biz, done-flag)" — the
            code actually returns (account_id, biz); callers unpack two values.
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()

        if not WechatService._todo_accounts:
            return None

        oralce_id, account_id, account_name, last_article_release_time, biz = WechatService._todo_accounts.popleft(
        )

        next_account_id = account_id
        next_account_biz = biz
        next_account_name = account_name

        next_account = next_account_id, next_account_biz

        # mark the account as in-progress
        sql = "update TAB_IOPM_SITE t set t.spider_status=602 where t.biz = '%s'" % (
            next_account_biz)
        WechatService._db.update(sql)

        return next_account

    def update_account_article_num(self, __biz):
        """Recount the account's articles in ES (today + all time) and write
        the totals back to oracle, marking the account done (603)."""
        # today's message count
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte": tools.get_current_date('%Y-%m-%d') + ' 00:00:00',
                                "lte": tools.get_current_date('%Y-%m-%d') + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        today_msg = result.get('hits', {}).get('total', 0)

        # all-time message count
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        total_msg = result.get('hits', {}).get('total', 0)

        # only overwrite total_msg when it is non-zero
        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, __biz)
        print(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        # True when a document with this id already exists in the ES table
        if WechatService._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary: Store one article document in ES (keyed by article_id).
        ---------
        @param article_info: dict with title/release_time/author/account/url/...
        ---------
        @result:
        '''
        log.debug(''' -----文章信息----- 标题 %s 发布时间 %s 作者 %s 公众号 %s url %s ''' % (article_info['title'], article_info['release_time'],
                  article_info['author'], article_info['account'],
                  article_info['url']))

        WechatService._es.add('wechat_article', article_info,
                              article_info.get('article_id'))

    def add_account_info(self, account_info):
        """Store one account document in ES (keyed by __biz)."""
        log.debug(''' -----公众号信息----- %s''' % tools.dumps_json(account_info))

        WechatService._es.add('wechat_account', account_info,
                              account_info.get('__biz'))
def is_zombie_account(self, last_publish_timestamp):
    """Return True when the account has been silent longer than the configured
    zombie threshold (``_zombie_account_not_publish_article_days`` days)."""
    idle_seconds = tools.get_current_timestamp() - last_publish_timestamp
    return idle_seconds > self._zombie_account_not_publish_article_days * 86400
def parser_next_page_article(video_id, wall_id, feed_id, sns_time, url):
    """Crawl one page of iqiyi circle feeds, store each as an article (with its
    comments), and recurse to the next page using the last feed's cursor.

    Stops as soon as ``add_article`` reports an already-seen article.

    @param video_id: program id the feeds belong to
    @param wall_id: circle (wall) id
    @param feed_id: paging cursor: last feed id of the previous page
    @param sns_time: paging cursor: last feed snsTime of the previous page
    @param url: source page url, stored with each article
    """
    article_json_url = 'http://api-t.iqiyi.com/feed/get_feeds?authcookie=&device_id=pc_web&m_device_id=a11e6ea94270eaaa0b46be30af84fc54&agenttype=118&wallId={wall_id}&feedTypes=1%2C7%2C8%2C9&count=20&top=1&hasRecomFeed=1&feedId={feed_id}&needTotal=1&notice=1&version=1&upOrDown=1&snsTime={sns_time}&_={timestamp_m}'.format(
        wall_id=wall_id,
        feed_id=feed_id,
        sns_time=sns_time,
        timestamp_m=int(tools.get_current_timestamp() * 1000))
    print(article_json_url)
    article_json = tools.get_json_by_requests(article_json_url)

    wall_id = article_json.get('data', {}).get('wallId')

    # feed array for this page
    feeds = article_json.get('data', {}).get('feeds', [])
    for feed in feeds:
        article_id = feed.get('commentId')
        head_url = feed.get('icon')
        name = feed.get('name')
        release_time = feed.get('releaseDate')
        release_time = tools.timestamp_to_date(release_time)

        title = feed.get('feedTitle')
        content = feed.get('description')
        image_urls = ','.join(
            [img.get('url') for img in feed.get('pictures', [])])  # comma separated
        watch_count = feed.get('uvCount')
        up_count = feed.get('agreeCount')
        comment_count = feed.get('commentCount')

        log.debug(''' id: %s 节目id %s 头像地址: %s 名字: %s 发布时间: %s 标题: %s 内容: %s 图片地址: %s 观看量: %s 点赞量: %s 评论量: %s ''' % (article_id, video_id, head_url, name, release_time, title,
                  content, image_urls, watch_count, up_count, comment_count))

        # add_article returns falsy when the article already exists -> stop paging
        if self_base_parser.add_article(article_id,
                                        head_url,
                                        name,
                                        release_time,
                                        title,
                                        content,
                                        image_urls,
                                        watch_count,
                                        up_count,
                                        comment_count,
                                        program_id=video_id,
                                        gender=random.randint(0, 1),
                                        url=url,
                                        info_type=3,
                                        emotion=random.randint(0, 2),
                                        collect=0,
                                        source='爱奇艺'):
            # crawl this article's comments
            parser_comment(article_id, wall_id)
        else:
            break
    else:
        # whole page was new: continue with the next page's cursor
        if feeds:
            feed_id = feeds[-1].get('feedId')
            sns_time = feeds[-1].get('snsTime')
            parser_next_page_article(video_id, wall_id, feed_id, sns_time, url)
def add_root_url(parser_params={}):
    """Seed the VAApp_urls task table with toutiao feed urls (local news +
    video) going back one day, paged by ``max_behot_time``.

    @param parser_params: opaque parameters forwarded by the scheduler (logged only)
    """
    log.debug(''' 添加根url parser_params : %s ''' % str(parser_params))

    base_url = 'http://is.snssdk.com/api/news/feed/v51/'
    # captured iOS client parameters; category/max_behot_time are rewritten per page
    params = {
        'version_code': '6.0.3',
        'app_name': 'news_article',
        'vid': 'B0DB5DD0-FF94-4773-85B1-EFC11132C2A4',
        'device_id': '34633749953',
        'channel': 'App Store',
        'resolution': '1242*2208',
        'aid': 13,
        'ab_version': '117912,112577,101786,117787,115757,101533,117646,118765,110341,113607,118273,114108,113114,106784,113608,101558,105475,112401,117714,105610,118581,118607,105821,112578,115570,118604,118850,116615,31210,118530,118216,114338',
        'ab_feature': 'z1',
        'openudid': '7064ff7d773ef8efeb5d6a25f62cd3d85035674f',
        'live_sdk_version': '1.6.5',
        'idfv': 'B0DB5DD0-FF94-4773-85_b1-EFC11132C2A4',
        'ac': 'WIFI',
        'os_version': '10.2.1',
        'ssmix': 'a',
        'device_platform': 'iphone',
        'iid': 8954368598,
        'ab_client': 'a1,f2,f7,e1',
        'device_type': 'iPhone 7 Plus',
        'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
        'LBS_status': 'deny',
        'category': 'news_local',
        'city': '',
        'concern_id': '',
        'count': 20,
        'cp': '5089F85eBd4BDq1',
        'detail': 1,
        'image': 1,
        'language': 'zh-Hans-CN',
        'last_refresh_sub_entrance_interval': 1482077184,
        'loc_mode': 0,
        'max_behot_time': 1481063762,
        'refer': 1,
        'strict': 0,
        'tt_from': 'load_more',
        'user_city': '泸州'
    }

    time_interval = ONE_PAGE_TIME_INTERVAL
    content_released_time = tools.get_current_timestamp() - 86400  # one day back
    current_timestamp = tools.get_current_timestamp()
    max_behot_time = current_timestamp

    # walk backwards through the feed, one page interval at a time
    while max_behot_time >= content_released_time:
        max_behot_time -= time_interval
        current_timestamp = current_timestamp + random.randint(60, 300)

        # local (泸州) news articles
        params['category'] = 'news_local'
        params[
            'last_refresh_sub_entrance_interval'] = current_timestamp  # + random.randint(60, 300)
        params['max_behot_time'] = max_behot_time
        url = tools.joint_url(base_url, params)
        base_parser.add_url('VAApp_urls', SITE_ID, url, remark=NEWS_LOCAL)

        # videos
        params['category'] = 'video'
        params[
            'last_refresh_sub_entrance_interval'] = current_timestamp  # + random.randint(60, 300)
        params['max_behot_time'] = max_behot_time
        url = tools.joint_url(base_url, params)
        base_parser.add_url('VAApp_urls', SITE_ID, url, remark=VIDEO)
sys.path.append('../../')
import init
import base.base_parser as base_parser
import base.constance as Constance
import utils.tools as tools
from utils.log import log

# required by the framework: site id
SITE_ID = 1
# required by the framework: site name
NAME = '映客'

# used to build request signatures
SECRET_KEY = "8D2E##1[5$^(38#%#d3z96;]35q#MD28"
CURRENT_TIMESTAMP = tools.get_current_timestamp()
S_SG = tools.get_md5(SECRET_KEY + str(CURRENT_TIMESTAMP))  # sig = md5(fixed secret key + timestamp)


@tools.run_safe_model(__name__)
# required by the framework: register site info
def add_site_info():
    """Register this site's metadata row in LiveApp_site_info."""
    log.debug('添加网站信息')
    site_id = SITE_ID
    name = NAME
    table = 'LiveApp_site_info'
    url = 'http://www.inke.cn/hotlive_list.html'
    base_parser.add_website_info(table, site_id, url, name)
def monitor_task():
    """Supervise the redis task pool forever.

    Polls the task count; while tasks exist the workers run. After the pool
    has been empty for longer than MAX_NULL_TASK_TIME the round is considered
    over: round statistics are logged, url fingerprints are cleared, and a new
    batch of tasks is pulled from oracle into redis.
    """
    task_manager = TaskManager()
    total_time = 0          # consecutive seconds the pool has been empty
    task_count = 0
    begin_time = None       # when the current round's tasks were added
    end_time = None
    spend_hours = None
    is_show_start_tip = False   # "monitoring started" logged once per idle phase
    is_show_have_task = False   # "pool has tasks" logged once per busy phase

    while True:
        task_count = task_manager.get_task_count()
        if not task_count:
            if not is_show_start_tip:
                log.info('开始监控任务池...')
                is_show_start_tip = True

            total_time += CHECK_HAVE_TASK_SLEEP_TIME
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)
        else:
            if not is_show_have_task:
                log.info('任务池中有%s条任务,work可以正常工作' % task_count)
                is_show_have_task = True

            total_time = 0
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)

        if total_time > MAX_NULL_TASK_TIME:
            is_show_start_tip = False
            is_show_have_task = False

            # a round just ended: gather statistics
            if begin_time:
                # effective end time excludes the idle waiting window
                end_time = tools.timestamp_to_date(
                    tools.get_current_timestamp() - MAX_NULL_TASK_TIME)
                spend_time = tools.date_to_timestamp(
                    end_time) - tools.date_to_timestamp(begin_time)
                spend_hours = tools.seconds_to_h_m_s(spend_time)

                # url counts per depth
                depth_count_info = task_manager.get_ever_depth_count(5)

                # article counts for the round
                article_count_msg = statistic_article_count.get_article_count_msg(
                    begin_time, end_time)

                log.info(''' ------- 已做完一轮 -------- \r开始时间:%s \r结束时间:%s \r耗时:%s \r网站数量:%s \rurl数量信息:%s \r文章数量信息:%s ''' % (begin_time, end_time, spend_hours, task_count,
                         tools.dumps_json(depth_count_info), article_count_msg))

                # clear url fingerprints for the next round
                # NOTE(review): placement inside `if begin_time:` inferred from
                # the statement order in the original; confirm it should not
                # also run on the very first round (begin_time still None).
                log.info('删除url指纹...')
                task_manager.clear_task()

            log.info('redis 中连续%s秒无任务,超过允许最大等待%s秒 开始添加任务' % (total_time, MAX_NULL_TASK_TIME))

            # refill the pool from oracle
            tasks = task_manager.get_task_from_oracle()
            if tasks:
                total_time = 0
                task_manager.add_task_to_redis(tasks)
                task_count = task_manager.get_task_count()
                if task_count:
                    begin_time = tools.get_current_date()
                    log.info('添加任务到redis中成功 共添加%s条任务。 work开始工作' % (task_count))
            else:
                log.error('未从oracle中取到任务')
def is_have_new_article(self, account_id, account_name, __biz):
    '''
    @summary: Check whether the account has newly published articles.
        Tries sogou-weixin first; on a captcha block it is disabled and only
        retried after TIME_INTERVAL has elapsed. Falls back to the wechat
        public platform with the same disable/retry policy.
    ---------
    @param account_id:
    @param account_name:
    @param __biz:
    ---------
    @result: one of the constance.* status values (or '' if nothing ran)
    '''
    result = ''
    if WechatService._wechat_sogou_enable:  # sogou-weixin currently usable
        result = WechatService._wechat_sogou.is_have_new_article(
            account_id=account_id, account=account_name)
        if result == constance.UPDATE:
            # new articles published: will be crawled
            pass
        elif result == constance.NOT_UPDATE:
            # nothing new
            pass
            pass
        elif result == constance.ERROR:
            pass
        elif result == constance.VERIFICATION_CODE:
            # captcha-blocked: disable sogou and remember when
            WechatService._wechat_sogou_enable = False
            WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp(
            )
    # sogou disabled for more than TIME_INTERVAL: worth one retry
    elif tools.get_current_timestamp(
    ) - WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL:
        result = WechatService._wechat_sogou.is_have_new_article(
            account_id=account_id, account=account_name)
        if result == constance.UPDATE:
            # sogou works again
            WechatService._wechat_sogou_enable = True
        elif result == constance.NOT_UPDATE:
            pass
        elif result == constance.ERROR:
            pass
        elif result == constance.VERIFICATION_CODE:
            pass

        # push the retry window forward either way
        WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp(
        )

    # sogou unusable (no result or captcha): fall back to the public platform
    if not result or result == constance.VERIFICATION_CODE:
        if WechatService._wechat_public_platform_enable:
            result = WechatService._wechat_public_platform.is_have_new_article(
                __biz)
            if result == constance.UPDATE:
                # new articles published: will be crawled
                pass
            elif result == constance.NOT_UPDATE:
                # nothing new
                pass
                pass
            elif result == constance.ERROR:
                # blocked / request failed: disable and remember when
                WechatService._wechat_public_platform_enable = False
                WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp(
                )
        # disabled for more than TIME_INTERVAL: worth one retry
        elif tools.get_current_timestamp(
        ) - WechatService._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
            result = WechatService._wechat_public_platform.is_have_new_article(
                __biz)
            if result == constance.UPDATE:
                # platform works again
                WechatService._wechat_public_platform_enable = True
            elif result == constance.NOT_UPDATE:
                # nothing new
                pass
                pass
            elif result == constance.ERROR:
                # still blocked
                pass

            # push the retry window forward either way
            WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp(
            )

    return result
class WechatService():
    # Service wrapping account scheduling and article bookkeeping for the
    # WeChat spider. All state below is class-level, i.e. shared by every
    # WechatService instance (instances are stateless cursors over it).
    _db = OracleDB()
    _es = ES()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()
    _todo_accounts = collections.deque()  # accounts waiting to be crawled
    _rownum = 1  # Oracle ROWNUM paging cursor for TAB_IOPM_SITE
    _is_done = False  # one full round of accounts finished
    _is_all_done = False  # every account's articles for today already crawled
    # whether wechat_sogou is usable, and when it was last disabled
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()
    # whether wechat_public_platform is usable, and when it was last disabled
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        pass

    def __load_todo_account(self):
        '''
        @summary: Refill the shared _todo_accounts queue with the next
        page (SIZE rows) of accounts from TAB_IOPM_SITE, using Oracle
        ROWNUM paging. When a page comes back empty the paging state is
        reset and the method recurses to start the next round.
        ---------
        @result: None; mutates class-level _todo_accounts / _rownum /
                 _is_done / _is_all_done
        '''
        if not WechatService._todo_accounts:
            sql = ''
            if not WechatService._is_all_done:
                # Normal round: only accounts with no message yet today.
                # (sic: 'mointor_status' — presumably the actual column
                # name in TAB_IOPM_SITE; verify against the schema)
                sql = '''
                    select * from (select rownum r, t.id, t.domain, t.biz, t.name from TAB_IOPM_SITE t where t.biz is not null and mointor_status = 701 and (today_msg is null or today_msg = 0) and rownum < {size}) where r >= {rownum}
                '''.format(rownum=WechatService._rownum,
                           size=WechatService._rownum + SIZE)
            else:
                # All of today's new articles already crawled: page over
                # every monitored account instead
                sql = '''
                    select * from (select rownum r, t.id, t.domain, t.biz, t.name from TAB_IOPM_SITE t where t.biz is not null and mointor_status = 701 and rownum < {size}) where r >= {rownum}
                '''.format(rownum=WechatService._rownum,
                           size=WechatService._rownum + SIZE)
            print(sql)
            results = WechatService._db.find(sql)
            if not results:
                if WechatService._rownum == 1:
                    # Even the first page is empty: today's articles are
                    # all crawled. Flag it (WeichatAction uses this to set
                    # a sleep interval) and load the next day's accounts.
                    # NOTE(review): if the fallback query also returns no
                    # rows while _rownum == 1 this recursion never
                    # terminates — confirm the table is never empty.
                    WechatService._is_all_done = True
                    self.__load_todo_account()
                else:
                    # End of a round: reset the cursor and start over
                    WechatService._is_done = True
                    WechatService._rownum = 1
                    self.__load_todo_account()
            else:
                # Convert result rows to a queue and advance the cursor
                WechatService._todo_accounts = collections.deque(
                    results)
                WechatService._rownum += SIZE

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: Check whether the account has newly published articles.
        Tries Sogou-Weixin first and falls back to the WeChat public
        platform when Sogou returned nothing or a verification code
        (captcha). Both backends are rate-limit sensitive, so class-level
        enable flags plus the timestamp of the last failure are kept; a
        disabled backend is retried only after TIME_INTERVAL has elapsed.
        ---------
        @param account_id: account id used for the Sogou-Weixin lookup
        @param account_name: account name used for the Sogou-Weixin lookup
        @param __biz: __biz token used for the WeChat public-platform lookup
        ---------
        @result: one of constance.UPDATE / NOT_UPDATE / ERROR /
                 VERIFICATION_CODE, or '' when no backend could be queried
        '''
        result = ''
        if WechatService._wechat_sogou_enable:  # Sogou-Weixin is currently usable
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # New articles published -> caller will crawl them
                pass
            elif result == constance.NOT_UPDATE:
                # No new articles
                pass
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                # Captcha returned (we are blocked): disable Sogou and
                # record when it became unusable
                WechatService._wechat_sogou_enable = False
                WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp(
                )
        # Sogou has been disabled for more than TIME_INTERVAL (24h):
        # it is worth one more try
        elif tools.get_current_timestamp(
        ) - WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL:
            # Sogou unusable, but a day has passed; retry once
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # Sogou works again
                WechatService._wechat_sogou_enable = True
            elif result == constance.NOT_UPDATE:
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                pass
            # Refresh the disabled-timestamp so the next retry waits a
            # full TIME_INTERVAL again
            WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp(
            )
        # If Sogou could not answer (empty result or captcha), fall back
        # to the WeChat public platform to check for new articles
        if not result or result == constance.VERIFICATION_CODE:
            if WechatService._wechat_public_platform_enable:
                # Public platform is currently usable
                result = WechatService._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.UPDATE:
                    # New articles published -> caller will crawl them
                    pass
                elif result == constance.NOT_UPDATE:
                    # No new articles
                    pass
                    pass
                elif result == constance.ERROR:
                    # Blocked / request failed: disable the platform and
                    # record the failure time
                    WechatService._wechat_public_platform_enable = False
                    WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp(
                    )
            elif tools.get_current_timestamp(
            ) - WechatService._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
                # Platform unusable, but a day has passed; retry once
                result = WechatService._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.UPDATE:
                    # New articles published; platform works again
                    WechatService._wechat_public_platform_enable = True
                elif result == constance.NOT_UPDATE:
                    # No new articles
                    pass
                    pass
                elif result == constance.ERROR:
                    # Blocked / request failed
                    pass
                # Refresh the disabled-timestamp
                WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp(
                )
        return result

    def get_next_account(self):
        '''
        @summary: Pop accounts from the shared queue until one is worth
        crawling (new articles, an error, or an early-break condition).
        ---------
        ---------
        @result: tuple (account_id, __biz, is_done, is_all_done) for the
                 selected account; is_done / is_all_done report the
                 round state at selection time and are reset afterwards
        '''
        while True:
            # Refill the queue when it runs dry
            if not WechatService._todo_accounts:
                self.__load_todo_account()
            next_account_info = WechatService._todo_accounts.popleft()
            # Row layout from __load_todo_account's SELECT:
            # (r, id, domain, biz, name)
            next_account_id = next_account_info[2]
            next_account_biz = next_account_info[3]
            next_account_name = next_account_info[4]
            next_account = next_account_id, next_account_biz, WechatService._is_done, WechatService._is_all_done
            if not WechatService._wechat_sogou_enable:
                log.debug('搜狗微信不可用')
            if not WechatService._wechat_public_platform_enable:
                log.debug('微信公众平台不可用')
            # New-article checking disabled: take the account as-is
            if not CHECK_NEW_ARTICLE:
                break
            # Both Sogou-Weixin and the public platform are blocked:
            # nothing to check with, take the account as-is
            if not WechatService._wechat_sogou_enable and not WechatService._wechat_public_platform_enable:
                break
            # With checking enabled, randomly (1 in 5) skip the check and
            # crawl via the WeChat client directly, to avoid triggering
            # Sogou-Weixin captchas through overuse
            if random.randint(1, 5) == 1:
                log.debug('跳出 防止搜狗微信被封')
                break
            # Check whether the account published anything today
            result = self.is_have_new_article(next_account_id,
                                              next_account_name,
                                              next_account_biz)
            if result == constance.UPDATE:
                break
            elif result == constance.NOT_UPDATE:
                if WechatService._is_done:
                    # A whole round was checked and nothing updated:
                    # break out to avoid spinning forever
                    break
                else:
                    # tools.delay_time(5)
                    continue
            elif result == constance.ERROR:
                break
            elif result == constance.VERIFICATION_CODE:
                break
            else:
                # Checking unavailable: crawl via the client directly
                break
        # Reset the round flags now that they were reported
        WechatService._is_done = False
        WechatService._is_all_done = False
        return next_account

    def update_account_article_num(self, __biz):
        '''
        @summary: Count the account's articles in Elasticsearch (today's
        and all-time) and write the counts back to TAB_IOPM_SITE.
        ---------
        @param __biz: the account's __biz token
        ---------
        @result: None; updates the database row matching __biz
        '''
        # Today's message count: filtered range query on record_time
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte": tools.get_current_date('%Y-%m-%d') + ' 00:00:00',
                                "lte": tools.get_current_date('%Y-%m-%d') + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        today_msg = result.get('hits', {}).get('total', 0)
        # All-time message count for the account
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        total_msg = result.get('hits', {}).get('total', 0)
        # Only write total_msg when it is non-zero; today_msg is always
        # written. NOTE(review): the SQL is built with %-formatting and
        # __biz is interpolated unescaped — SQL-injection risk if __biz
        # can ever contain a quote; prefer bind variables.
        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d where t.biz = '%s'" % (
                today_msg, __biz)
        print(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        '''
        @summary: Whether a document with data_id exists in the given
        ES index (doc_type is assumed to equal the index name).
        ---------
        @param table: ES index (and doc_type) name
        @param data_id: document id to look up
        ---------
        @result: True if the document exists, else False
        '''
        if WechatService._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary: Store one article document in the 'wechat_article'
        ES index, keyed by its 'article_id'.
        ---------
        @param article_info: dict with at least title / release_time /
               author / account / url / article_id keys
        ---------
        @result: None
        '''
        log.debug('''
            -----文章信息-----
            标题      %s
            发布时间  %s
            作者      %s
            公众号    %s
            url       %s
            ''' % (article_info['title'], article_info['release_time'],
                   article_info['author'], article_info['account'],
                   article_info['url']))
        WechatService._es.add('wechat_article', article_info,
                              article_info.get('article_id'))

    def add_account_info(self, account_info):
        '''
        @summary: Store one account document in the 'wechat_account'
        ES index, keyed by its '__biz' token.
        ---------
        @param account_info: dict describing the account; must carry
               a '__biz' key to serve as the document id
        ---------
        @result: None
        '''
        log.debug('''
            -----公众号信息-----
            %s''' % tools.dumps_json(account_info))
        WechatService._es.add('wechat_account', account_info,
                              account_info.get('__biz'))