def __open_next_page(self):
    '''
    @summary: Jump to the history-article page
    ---------
    @param __biz:
    @param pass_ticket:
    @param appmsg_token:
    @param offset:
    ---------
    @result:
    '''
    is_done = False  # Whether one full round is finished
    is_all_done = False  # Whether everything is finished (all messages published today by all accounts have been collected)
    if WechatAction._todo_urls:
        url = WechatAction._todo_urls.popleft()
    else:
        # One account finished; update its article count
        WechatAction._wechat_service.update_account_article_num(
            WechatAction._current_account_biz)
        # Jump to the next account
        account_id, __biz, is_done, is_all_done = WechatAction._wechat_service.get_next_account()
        WechatAction._account_info[__biz] = account_id or ''

        # url = 'http://mp.weixin.qq.com/mp/getmasssendmsg?__biz=%s#wechat_webview_type=1&wechat_redirect' % __biz
        url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % __biz
        log.debug('''
        Next account: %s
        ''' % url)

    # Inject a js snippet to redirect automatically
    if is_all_done:  # All of today's articles are crawled; crawl again tomorrow
        # Sleep until the next day
        sleep_time = self.get_next_day_time_interval()
    elif is_done:  # One round finished; take a break
        sleep_time = self.get_wait_time()
    elif ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME:
        # Only today's articles are crawled and the current time is before the configured start time,
        # so sleep instead of crawling, since accounts rarely publish in the small hours
        sleep_time = self.get_spider_start_time_interval()
    else:  # One article finished; wait for a while
        sleep_time = self.get_sleep_time()

    log.debug('''
    next_page_url : %s
    is_done: %s
    is_all_done: %s
    sleep_time: %s
    next_start_time %s
    ''' % (url, is_done, is_all_done,
           tools.seconds_to_h_m_s(sleep_time / 1000),
           tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time / 1000)))

    next_page = "<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
        url, sleep_time)
    return next_page
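# A tiny, self-contained illustration (hypothetical url and sleep_time values) of the snippet
# __open_next_page returns: the injected <script> makes the currently loaded page redirect
# itself to the next url after sleep_time milliseconds, which is what drives the crawler
# from one page to the next.
url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5MTAwMg==&scene=124#wechat_redirect'  # hypothetical __biz
sleep_time = 120 * 1000  # 2 minutes, in milliseconds
next_page = "<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (url, sleep_time)
print(next_page)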
def sign_account_is_zombie(self, __biz, last_publish_time=None):
    if last_publish_time:
        sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
            last_publish_time, tools.get_current_date(), __biz)
    else:
        sql = 'update wechat_account_task set last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
            tools.get_current_date(), __biz)
    self._mysqldb.update(sql)
def __open_next_page(self):
    '''
    @summary: Jump to the history-article page
    ---------
    @param __biz:
    @param pass_ticket:
    @param appmsg_token:
    @param offset:
    ---------
    @result:
    '''
    is_done = False  # Whether one full round is finished
    url = None
    while WechatAction._todo_urls:
        result = WechatAction._todo_urls.popleft()
        if callable(result):  # Callback that marks the account as finished
            result()  # Execute the callback
        else:
            url = result
            break

    if not url:
        # Jump to the next account
        account = WechatAction._wechat_service.get_next_account()
        if account:
            account_id, __biz = account
            WechatAction._account_info[__biz] = account_id or ''

            url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % __biz
            log.debug('''
            Next account: %s
            ''' % url)
        else:
            is_done = True

    # Inject a js snippet to redirect automatically
    if is_done:  # One round finished; take a break
        sleep_time = self.get_wait_time()
    elif ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME:
        # Only today's articles are crawled and the current time is before the configured start time,
        # so sleep instead of crawling, since accounts rarely publish in the small hours
        sleep_time = self.get_spider_start_time_interval()
    else:  # One article finished; wait for a while
        sleep_time = self.get_sleep_time()

    tip_sleep_time = tools.seconds_to_h_m_s(sleep_time / 1000)
    tip_next_start_time = tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time / 1000)
    if not url:
        url = 'http://localhost:6210/tip/wait?sleep_time={}&next_start_time={}'.format(
            tip_sleep_time, tip_next_start_time)

    log.debug('''
    next_page_url : %s
    is_done: %s
    sleep_time: %s
    next_start_time %s
    ''' % (url, is_done, tip_sleep_time, tip_next_start_time))

    next_page = "Sleeping %s; next refresh at %s<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
        tip_sleep_time, tip_next_start_time, url, sleep_time)
    return next_page
def update_account_article_num(self, __biz):
    # Query es for the counts
    # Today
    body = {
        "size": 0,
        "query": {
            "filtered": {
                "filter": {
                    "range": {
                        "record_time": {
                            "gte": tools.get_current_date('%Y-%m-%d') + ' 00:00:00',
                            "lte": tools.get_current_date('%Y-%m-%d') + ' 23:59:59'
                        }
                    }
                },
                "query": {
                    'match': {
                        "__biz": __biz
                    }
                }
            }
        }
    }
    result = WechatService._es.search('wechat_article', body)
    today_msg = result.get('hits', {}).get('total', 0)

    # Total number of historical messages
    body = {
        "size": 0,
        "query": {
            "filtered": {
                "query": {
                    'match': {
                        "__biz": __biz
                    }
                }
            }
        }
    }
    result = WechatService._es.search('wechat_article', body)
    total_msg = result.get('hits', {}).get('total', 0)

    if total_msg:
        sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d where t.biz = '%s'" % (
            today_msg, total_msg, __biz)
    else:
        sql = "update TAB_IOPM_SITE t set t.today_msg = %d where t.biz = '%s'" % (
            today_msg, __biz)
    print(sql)
    WechatService._db.update(sql)
def parse_article_info(article_info, release_time):
    if not article_info:
        return

    # log.debug(tools.dumps_json(article_info))
    title = article_info.get('title')
    summary = article_info.get('digest')
    url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
    source_url = article_info.get('source_url').replace('\\', '')  # Link of the quoted article
    cover = article_info.get('cover').replace('\\', '')
    author = article_info.get('author')
    if url and url.startswith('http://mp.weixin.qq.com/'):
        # Articles deleted by the publisher have no url or other info, so mid cannot be obtained
        # and they are not stored; mall-type urls are not stored either
        mid = tools.get_param(url, 'mid') or tools.get_param(url, 'appmsgid')  # Message id; messages published on the same day share one id
        idx = tools.get_param(url, 'idx') or tools.get_param(url, 'itemidx')  # Position of the message in the batch, starting from 1
        article_id = mid + idx  # mid and idx concatenated identify a unique article, e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601

        # Check whether the article already exists in the database
        if WechatAction._wechat_service.is_exist('wechat_article', article_id) or (
                ONLY_TODAY_MSG and release_time < tools.get_current_date('%Y-%m-%d')):
            self._is_need_get_more = False
            return  # Stop here; discard the remaining articles

        __biz = tools.get_param(url, '__biz')  # Used to relate the article back to the account

        # Cache the article info
        WechatAction._article_info[article_id] = {
            'article_id': int(article_id),
            'title': title,
            'summary': summary,
            'release_time': release_time,
            'url': url,
            'source_url': source_url,
            'cover': cover,
            'account': '',
            'author': author,
            '__biz': __biz,
            'read_num': None,
            'like_num': None,
            'content': '',
            'comment': [],
            'record_time': tools.get_current_date()
        }

        # Add the article url to the to-crawl queue
        WechatAction._todo_urls.append(url)
def __parse_account_info(self, data, req_url):
    __biz = tools.get_param(req_url, "__biz")

    regex = 'id="nickname">(.*?)</strong>'
    account = tools.get_info(data, regex, fetch_one=True).strip()

    regex = 'profile_avatar">.*?<img src="(.*?)"'
    head_url = tools.get_info(data, regex, fetch_one=True)

    regex = 'class="profile_desc">(.*?)</p>'
    summary = tools.get_info(data, regex, fetch_one=True).strip()

    # Verification info (accounts opened via "view history messages" while followed have none)
    regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
    verify = tools.get_info(data, regex, fetch_one=True)
    verify = verify.strip() if verify else ""

    # QR code
    regex = 'var username = "" \|\| "(.*?)";'  # || needs escaping
    qr_code = tools.get_info(data, regex, fetch_one=True)
    qr_code = "http://open.weixin.qq.com/qr/code?username=" + qr_code

    account_data = {
        "__biz": __biz,
        "account": account,
        "head_url": head_url,
        "summary": summary,
        "qr_code": qr_code,
        "verify": verify,
        "spider_time": tools.get_current_date(),
    }

    if account_data:
        data_pipeline.save_account(account_data)
def check_new_article(self, account):
    oralce_id, account_id, account_name, last_article_release_time, biz = account

    article_release_time = self._wechat_sogo.get_article_release_time(
        account_id=account_id, account=account_name)
    print(article_release_time)
    if article_release_time:
        last_article_release_time = last_article_release_time or ''
        if article_release_time >= tools.get_current_date('%Y-%m-%d') and article_release_time > last_article_release_time:
            print('{} has published a new article, waiting to be crawled. Release time: {}'.format(
                account_name, article_release_time))

            sql = '''
                update TAB_IOPM_SITE t set t.spider_status = 601,
                t.last_article_release_time = to_date('{}', 'yyyy-mm-dd hh24:mi:ss')
                where id = {}
            '''.format(article_release_time, oralce_id)

            # Multi-threaded: each thread must hold its own database connection
            oracledb = OracleDB()
            oracledb.update(sql)
            oracledb.close()

            # Put the task into redis as the wechat spider's task pool
            data = (oralce_id, account_id, account_name, last_article_release_time, biz)
            self._redisdb.sadd('wechat:account', data)
def add_website_info(table, site_id, url, name, domain='', ip='', address='',
                     video_license='', public_safety='', icp=''):
    '''
    @summary: Add website info
    ---------
    @param table: table name
    @param site_id: website id
    @param url: website url
    @param name: website name
    @param domain: domain
    @param ip: server ip
    @param address: server address
    @param video_license: online audio-video license
    @param public_safety: public security filing number
    @param icp: ICP number
    ---------
    @result:
    '''
    # Derive domain, ip, address, video_license, public_safety, icp programmatically
    domain = tools.get_domain(url)

    site_info = {
        'site_id': site_id,
        'name': name,
        'domain': domain,
        'url': url,
        'ip': ip,
        'address': address,
        'video_license': video_license,
        'public_safety': public_safety,
        'icp': icp,
        'read_status': 0,
        'record_time': tools.get_current_date()
    }
    mongodb.add(table, site_info)
def is_have_new_article(self, account_id='', account=''):
    '''
    @summary: Check whether the account has published an article today
    ---------
    @param account_id:
    @param account:
    ---------
    @result:
    '''
    account_block = self.__get_account_blocks(account_id, account)
    if account_block == constance.VERIFICATION_CODE:
        return constance.VERIFICATION_CODE

    regex = "timeConvert\('(\d*?)'\)"
    release_time = tools.get_info(account_block, regex, fetch_one=True)
    if release_time:
        release_time = int(release_time)
        release_time = tools.timestamp_to_date(release_time)
        log.debug("Latest publish time %s" % release_time)

        if release_time >= tools.get_current_date('%Y-%m-%d'):
            return constance.UPDATE
        else:
            return constance.NOT_UPDATE
    else:
        return constance.ERROR
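# A minimal standalone sketch (standard library only, hypothetical sample_block HTML) of the
# check performed above: the Sogou account page renders each publish time as
# timeConvert('<unix timestamp>'), so the timestamp is pulled out with a regex, turned into a
# date string, and compared against today's date.
import re
import time

sample_block = "<span>document.write(timeConvert('1538049600'))</span>"  # hypothetical snippet
match = re.search(r"timeConvert\('(\d+)'\)", sample_block)
if match:
    release_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(match.group(1))))
    print(release_time, release_time >= time.strftime('%Y-%m-%d'))  # has the account published today?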
def get_article(self):
    '''
    @summary: record_time is used for now to guarantee there is data; release_time should normally be used  TODO
    ---------
    ---------
    @result:
    '''
    per_record_time = self.get_per_record_time()
    today_time = tools.get_current_date('%Y-%m-%d')
    if per_record_time:
        sql = "select * from {table} where record_time > '{record_time}' and release_time >= '{today_time} 00:00:00' and release_time <= '{today_time} 23:59:59' order by record_time".format(
            table=self._table, record_time=per_record_time, today_time=today_time)
    else:
        sql = "select * from {table} where release_time >= '{today_time} 00:00:00' and release_time <= '{today_time} 23:59:59' order by record_time".format(
            table=self._table, today_time=today_time)

    url = 'http://{address}/_sql?sql={sql}'.format(address=ADDRESS, sql=sql)
    log.debug(url)

    article = tools.get_json_by_requests(url)
    return article.get('hits', {}).get('hits', [])
def image_predict(self, image_url):
    if not image_url:
        return -1

    # For a web image: download it first, recognize it, then delete it
    if image_url.startswith('http'):
        local_image_path = TEMP_IMAGE_SAVE_PATH + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_success = tools.download_file(image_url, local_image_path)
        image_url = local_image_path if is_success else image_url

        result = ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)

        tools.del_file(local_image_path)

    # For a local image that is not jpg, convert it to jpg first
    elif not image_url.endswith('jpg'):
        jpg_image_url = image_url[:image_url.rfind('.')] + '.jpg'
        is_success = ffmpeg_manager.convert_file_format(image_url, jpg_image_url)
        image_url = jpg_image_url if is_success else image_url

        result = ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)

    else:
        result = ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)

    return result
def get_release_time(mblog):
    try:
        release_time = mblog['created_at']
        data = tools.time.time()
        ltime = tools.time.localtime(data)
        timeStr = tools.time.strftime("%Y-%m-%d", ltime)
        if tools.re.compile('今天').findall(release_time):
            release_time = release_time.replace('今天', '%s' % timeStr)
        elif tools.re.compile('昨天').findall(release_time):
            today = datetime.date.today()
            yesterday = today - datetime.timedelta(days=1)
            release_time = release_time.replace('昨天', '%s' % yesterday)
        elif '小时前' in release_time:
            nhours = tools.re.compile('(\d+)小时前').findall(release_time)
            hours_ago = (tools.datetime.datetime.now() -
                         tools.datetime.timedelta(hours=int(nhours[0])))
            release_time = hours_ago.strftime("%Y-%m-%d %H:%M")
        elif tools.re.compile('分钟前').findall(release_time):
            nminutes = tools.re.compile('(\d+)分钟前').findall(release_time)
            minutes_ago = (tools.datetime.datetime.now() -
                           tools.datetime.timedelta(minutes=int(nminutes[0])))
            release_time = minutes_ago.strftime("%Y-%m-%d %H:%M")
        elif tools.re.compile('刚刚').findall(release_time):
            release_time = tools.get_current_date()
        else:
            if len(release_time) < 10:
                release_time = '%s-%s' % (timeStr[0:4], release_time)
    except:
        release_time = ''
    finally:
        return release_time
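# A minimal, self-contained sketch (standard library only, hypothetical helper name
# normalize_weibo_time) of the relative-time normalization done above through the project's
# tools wrapper: Weibo-style strings such as "今天 12:30", "昨天 08:00", "3小时前", "5分钟前"
# or "刚刚" are mapped onto absolute dates.
import re
import datetime


def normalize_weibo_time(created_at):
    now = datetime.datetime.now()
    if '今天' in created_at:
        return created_at.replace('今天', now.strftime('%Y-%m-%d'))
    if '昨天' in created_at:
        yesterday = (now - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        return created_at.replace('昨天', yesterday)
    match = re.search(r'(\d+)小时前', created_at)
    if match:
        return (now - datetime.timedelta(hours=int(match.group(1)))).strftime('%Y-%m-%d %H:%M')
    match = re.search(r'(\d+)分钟前', created_at)
    if match:
        return (now - datetime.timedelta(minutes=int(match.group(1)))).strftime('%Y-%m-%d %H:%M')
    if '刚刚' in created_at:
        return now.strftime('%Y-%m-%d %H:%M:%S')
    if len(created_at) < 10:  # e.g. "07-15 18:20", year omitted
        return '%s-%s' % (now.strftime('%Y'), created_at)
    return created_at


# print(normalize_weibo_time('3小时前'))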
def add_net_program(rank, rank_wave, url, name, video_id, image_url, mini_summary,
                    episode_msg, today_play_count, total_play_count, director, classify,
                    institution, release_year, description, actor, score, video_type,
                    net_source):
    '''
    @summary:
    ---------
    @param rank:
    @param rank_wave:
    @param url:
    @param name:
    @param video_id:
    @param image_url:
    @param mini_summary:
    @param episode_msg:
    @param today_play_count:
    @param total_play_count:
    @param director:
    @param classify:
    @param institution:
    @param release_year:
    @param description:
    @param actor:
    @param score:
    @param video_type: program type, e.g. movie 1, TV series 2, variety show, etc.
    @param net_source: source, e.g. iQIYI
    ---------
    @result:
    '''
    program = {
        'rank': rank,
        'rank_wave': rank_wave,
        'url': url,
        'program_name': name,
        'image_url': image_url,
        'keywords': mini_summary,
        'episode': episode_msg,
        'play_count_total': today_play_count,
        'total_play_count': total_play_count,
        'director': director,
        'classify': classify,
        'institution': institution,
        'release_year': release_year,
        'description': description,
        'actor': actor,
        'score': score,
        'type': video_type,
        'net_source': net_source,
        'record_time': tools.get_current_date(),
        'is_setmenu': 0,
        'baidu_score': None,
        'up_count': None,
        'collect': 0,
        'sensitive': 0,
        'program_id': video_id
    }

    es.add('tab_mms_net_program', program, video_id)
def add_wechat_account_info(table, site_id, name, account_id, account_url, image_url,
                            local_image_url, article_count, summary, certification,
                            is_verified, barcode_url, local_barcode_url):
    account_info = {
        'name': name,
        'account_id': account_id,
        'account_url': account_url,
        'image_url': image_url,
        'local_image_url': local_image_url,
        'article_count': article_count,
        'summary': summary,
        'certification': certification,
        'is_verified': is_verified,
        'barcode_url': barcode_url,
        'local_barcode_url': local_barcode_url,
        'read_status': 0,
        'record_time': tools.get_current_date(),
        'sexy_image_url': local_image_url,
        'sexy_image_status': '',
        'image_pron_status': 0
    }
    if not db.add(table, account_info):
        account_info.pop('_id')
        account_info.pop('image_pron_status')
        account_info.pop('sexy_image_status')
        account_info.pop('sexy_image_url')
        db.update(table, old_value={'account_id': account_id}, new_value=account_info)
def save_video_info(release_time='', content='', url='', author='', title='', image_url='',
                    site_name='', play_count=None, comment_count=None, praise_count=None,
                    summary='', time_length=None):
    domain = tools.get_domain(url)
    content_info = {
        'domain': domain,
        'uuid': tools.get_uuid(title, domain),
        'site_name': site_name,
        'image_url': image_url,
        'title': title,
        'author': author,
        'url': url,
        'content': content,
        'release_time': tools.format_date(release_time),
        'play_count': play_count,
        'comment_count': comment_count,
        'praise_count': praise_count,
        'time_length': time_length,
        'record_time': tools.get_current_date(),
        'summary': summary
    }
    log.debug(tools.dumps_json(content_info))

    es.add('video_news', content_info, content_info['uuid'])
def set_cookie_un_available(self, cookie):
    '''
    @summary: Mark a cookie as unavailable
    ---------
    @param cookie: (id, cookie, un_available_times)
    ---------
    @result:
    '''
    if not cookie:
        return

    try:
        # Remove it from the in-memory list
        self._cookies.remove(cookie)
        # Update the database
        sql = '''
            update sogou_cookies set is_available = 0,
            un_available_time = '%s',
            un_available_times = un_available_times + 1
            where id = %d
        ''' % (tools.get_current_date(), cookie[0])
        self._sqlite3db.update(sql)
    except Exception as e:
        log.error(e)
def add_url(table, site_id='', url='', depth=0, remark='', status=Constance.TODO, title='',
            origin='', domain='', retrieval_layer=0, image_url='', release_time=''):
    url_dict = {
        'site_id': site_id,
        'url': url,
        'depth': depth,
        'remark': remark,
        'status': status,
        'title': title,
        'origin': origin,
        'release_time': release_time,
        'domain': domain,
        'record_time': tools.get_current_date(),
        'image_url': image_url,
        'retrieval_layer': retrieval_layer
    }
    return db.add(table, url_dict)
def add_wp_content_episode_info(table, title='', image_url='', video_url='', watched_count='',
                                play_length='', comments_count='', release_time='',
                                content_id='', data_type=''):
    wp_content_episode_info_dict = {
        'content_id': content_id,
        'image_url': image_url,
        'title': title,
        'video_url': video_url,
        'watched_count': watched_count,
        'play_length': play_length,
        'comment_count': comments_count,
        'release_time': release_time,
        'image_pron_status': 0,
        'record_time': tools.get_current_date(),
        'data_type': data_type,
        'read_status': 0
    }
    db.add(table, wp_content_episode_info_dict)
def add_appsite_info(table, site_id, url, name, app_url='', summary='', update_info='',
                     author='', image_url='', classify='', size='', tag='',
                     platform='android', download_count='', release_time=''):
    '''
    @summary: Add app site info
    ---------
    @param table: table name
    @param site_id: website id
    @param url: page url
    @param name: app name
    @param app_url: app url
    @param summary: summary
    @param update_info: update info
    @param author: developer
    @param image_url: icon url
    @param classify: category
    @param size: size
    @param tag: version
    @param platform: platform, defaults to android
    @param download_count: download count
    @param release_time: release time
    ---------
    @result:
    '''
    app_info = {
        'site_id': site_id,
        'url': url,
        'name': name,
        'app_url': app_url,
        'summary': summary,
        'update_info': update_info,
        'author': author,
        'image_url': image_url,
        'classify': classify,
        'size': size,
        'tag': tag,
        'platform': platform,
        'download_count': download_count,
        'release_time': release_time,
        'read_status': 0,
        'record_time': tools.get_current_date(),
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0
    }
    db.add(table, app_info)
def add_root_url(keywords):
    log.debug('''
        Adding root urls
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        for page_index in range(1, 10):
            url = 'http://so.video.sina.com.cn/interface/s?from=video&wd=%s&s_id=w00001&p=%s&n=20&s=1' \
                  % (keyword, page_index)
            info_json = tools.get_json_by_requests(url)
            video_info_list = info_json['list']
            if not video_info_list:
                print(url)
                break

            for video_info in video_info_list:
                image_url = video_info['thumburl']
                title = tools.del_html_tag(video_info['videoname'])
                url = video_info['url']
                release_time = video_info['showtime']
                current_date = tools.get_current_date('%Y-%m-%d')
                if current_date > release_time:
                    next_keyword = True
                    break

                base_parser.save_video_info(
                    image_url=image_url, url=url, title=title,
                    release_time=release_time, site_name=NAME)

            if next_keyword:
                break
def deal_comment(self, req_url, text):
    """
    Parse comments
    :param req_url:
    :param text:
    :return:
    """
    data = tools.get_json(text)

    __biz = tools.get_param(req_url, "__biz")
    comment_id = tools.get_param(req_url, "comment_id")  # Relates the comments to the article

    elected_comment = data.get("elected_comment", [])
    comment_datas = [
        dict(
            __biz=__biz,
            comment_id=comment_id,
            nick_name=comment.get("nick_name"),
            logo_url=comment.get("logo_url"),
            content=comment.get("content"),
            create_time=tools.timestamp_to_date(comment.get("create_time")),
            content_id=comment.get("content_id"),
            like_num=comment.get("like_num"),
            is_top=comment.get("is_top"),
            spider_time=tools.get_current_date(),
        )
        for comment in elected_comment
    ]

    if comment_datas:
        data_pipeline.save_article_commnet(comment_datas)
def deal_comment(self, req_url, text):
    data = tools.get_json(text)

    __biz = tools.get_param(req_url, '__biz')
    comment_id = tools.get_param(req_url, 'comment_id')  # Relates the comments to the article

    elected_comment = data.get('elected_comment', [])
    comment_datas = [
        dict(
            __biz=__biz,
            comment_id=comment_id,
            nick_name=comment.get('nick_name'),
            logo_url=comment.get('logo_url'),
            content=comment.get('content'),
            create_time=tools.timestamp_to_date(comment.get('create_time')),
            content_id=comment.get('content_id'),
            like_num=comment.get('like_num'),
            is_top=comment.get('is_top'),
            spider_time=tools.get_current_date()
        )
        for comment in elected_comment
    ]

    if comment_datas:
        data_pipeline.save_article_commnet(comment_datas)
def deal_article(self, req_url, text):
    """
    Parse an article
    :param req_url:
    :param text:
    :return:
    """
    sn = tools.get_param(req_url, "sn")

    if not text:
        self._task_manager.update_article_task_state(sn, -1)
        return None

    selector = Selector(text)

    content = selector.xpath(
        '//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]'
    ).extract_first(default="")
    title = (selector.xpath('//h2[@class="rich_media_title"]/text()')
             .extract_first(default="").strip())
    account = (selector.xpath('//a[@id="js_name"]/text()')
               .extract_first(default="").strip())
    author = (selector.xpath('//span[@class="rich_media_meta rich_media_meta_text"]//text()')
              .extract_first(default="").strip())

    publish_timestamp = selector.re_first('n="(\d{10})"')
    publish_timestamp = int(publish_timestamp) if publish_timestamp else None
    publish_time = (tools.timestamp_to_date(publish_timestamp)
                    if publish_timestamp else None)

    biz = tools.get_param(req_url, "__biz")

    text = remove_tags(content).strip()
    spider_name = 'wechat'
    collection_mode = 'spider'
    data_source_type = '微信公众号'

    article_data = {
        "data_type": account,
        "title": title,
        "data_address": req_url,
        "author": author,
        "publish_time": publish_time,
        "__biz": biz,
        "text": text,
        "spider_name": spider_name,
        "collection_mode": collection_mode,
        "data_source_type": data_source_type,
        "sn": sn,
        "collection_time": tools.get_current_date(),
    }

    # Save to database
    if article_data and data_pipeline.save_article(article_data) is not None:
        self._task_manager.update_article_task_state(sn, 1)

    return self._task_manager.get_task()
def add_va_app_content_info(table, site_id, title, summary, image_url, img_stor_path, url,
                            release_time, video_url, video_stor_path, content, column_id,
                            is_download, sensitive_id, violate_id, storage_id):
    '''
    @summary:
    ---------
    @param table:
    @param site_id:
    @param title:
    @param summary:
    @param image_url:
    @param img_stor_path:
    @param url:
    @param release_time:
    @param video_url:
    @param video_stor_path:
    @param content:
    @param column_id:
    @param is_download:
    @param sensitive_id:
    @param violate_id:
    ---------
    @result:
    '''
    is_audio = video_url and 1 or 0

    content_info_dict = {
        'title': title,
        'summary': summary,
        'image_url': image_url,
        'img_stor_path': img_stor_path,
        'url': url,
        'release_time': release_time,
        'video_url': video_url,
        'video_stor_path': video_stor_path,
        'content': content,
        'column_id': column_id,
        'is_download': is_download,
        'sensitive_id': sensitive_id,
        'violate_id': violate_id,
        'storage_id': storage_id,
        'site_id': site_id,
        'record_time': tools.get_current_date(),
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0,
        'read_status': 0,
        'is_audio': is_audio
    }

    db.add(table, content_info_dict)
    if sensitive_id or violate_id:
        content_info_dict['content_id'] = content_info_dict['_id']
        db.add('VAApp_vioation_content_info', content_info_dict)
def save_es():
    while True:
        content_infos = db.find('TIANJIN_APP_content_info', {'es_read_status': 0}, limit=40000)
        # print(content_infos)
        if not content_infos:
            break
        for content_info in content_infos:
            mongo_id = content_info['_id']
            mongo_id = int(str(mongo_id)[-6:], 16)
            uuid = str(mongo_id) + '_4'
            # site_name = content_info['site_name']
            if not content_info['release_time']:
                content_info['release_time'] = tools.get_current_date()
            site_find_date = '2018-09-23 00:00:00'
            es_content_info = {
                'ID': mongo_id,
                'UUID': uuid,
                'ARTICLE_URL': content_info['url'],
                'FIND_DATE': content_info['record_time'],
                'PRAISE_COUNT': 0,
                # 'IMAGE_CODE': 5,
                'IMAGE_CODE': content_info['image_pron_status'],
                'RELEASE_TIME': content_info['release_time'],
                'CONTENT': content_info['content'],
                'IMAGE_URL': content_info['image_url'],
                'SOURCE_ID': content_info['site_id'],
                'SITE_FIND_DATE': site_find_date,
                'COMMENT_COUNT': 0,
                'SOURCE_NAME': content_info['site_name'],
                'OUT_CHAIN_STATUS': 1,
                'NAME': content_info['title'],
                'TRANSPOND_COUNT': 0,
                'TYPE_ID': '4',  # 1 licensed website  2 filed website  3 unlicensed website  4 APP  5 Weibo  6 WeChat  7 OTT
                'TYPE_NAME': 'APP',
                'TASK_ID': '',
                'VIOLATE_LIBRARY': '',
                'ADDVIOLATE_DATE': None,
                'READ': 1,
                'CHECK_STATUS': 1,
                'VIOLATE_STATUS': 1,
                'MATERIAL_CHECK_VIOLATE_TYPE': '',
                'VIOLATE_CHECK_VIOLATE_TYPE': '',
                'FIELD_STR1': '',
                'FIELD_STR2': '',
                'MATERIAL_LIBRARY': '',
                'ADDMATERIAL_DATE': None,
            }
            # print(es_content_info['UUID'])
            es.add('tab_iimp_all_program_info', es_content_info, uuid)
            # {"es_read_status": "1"}
            # es.add_batch(es_content_info, uuid, 'tab_iimp_all_program_info')
            # info.update({"_id": content_info['mongo_id']}, {"$set": {"es_read_status": "1"}})
            db.update('TIANJIN_APP_content_info', {"_id": content_info['_id']}, {"es_read_status": 1})
def export_callback(execute_type, sql, data_json):
    if execute_type != ExportData.EXCEPTION:
        infoIds = data['infoIds']
        url = root_url % infoIds
        json = tools.get_json_by_requests(url, headers=HEADERS)
        articles = json['data']

        # "EMOTION": 'vint_3',
        # "ACCOUNT": null,
        # "WEIGHT": 0,
        # "TITLE": "str_title",
        # "URL": "str_url",
        # "MAY_INVALID": ,
        # "CLUES_IDS": "",
        # "WEBSITE_NAME": "str_site",
        # "KEYWORDS_COUNT": 1,
        # "HOST": "str_site",
        # "INFO_TYPE": 'int_type',
        # "COMMENT_COUNT": null,
        # "HOT_ID": "vint_%d" % hot_id,
        # "REVIEW_COUNT": null,
        # "UUID": "73ec16038e074530ff109e3cfad2594c",
        # "ID": 'vint_%d' % article_id,
        # "IS_VIP": null,
        # "IMAGE_URL": 'str_picture',
        # "KEYWORDS": "str_keywords",
        # "KEYWORD_CLUES_ID": "{"中央电视台":"88758"}",
        # "RELEASE_TIME": "date_pubtime",
        # "AUTHOR": "江门日报",
        # "CONTENT": "clob_content",
        # "RECORD_TIME": 'vdate_%s' % tools.get_current_date(),
        # "UP_COUNT": 'vint_null'

        key_map = {
            'id': 'int_dataId',
            'content': 'clob_content',
            'url': 'str_url',
            'website_name': 'str_site',
            'image_url': 'str_picture',
            'release_time': 'date_pubtime',
            'keywords': 'str_keywords',
            'emotion': 'str_emotion',
            'host': 'str_site',
            'title': 'str_title',
            'info_type': 'int_type',
            'hot_id': "vint_%d" % hot_id,
            'record_time': 'vdate_%s' % tools.get_current_date()
        }

        export_data.export_to_oracle(
            key_map=key_map,
            aim_table='TAB_IOPM_ARTICLE_INFO',
            unique_key='url',
            datas=articles,
            unique_key_mapping_source_key={'url': 'str_url'},
            sync_to_es=True)
def update_task_status(self, tasks, status):
    TaskService._lock.acquire()  # Acquire the lock
    for task in tasks:
        website_id = task[0]
        sql = "update tab_iopm_site t set t.spider_time = to_date('%s', 'yyyy-mm-dd hh24:mi:ss'), t.spider_status = %s where id = %s" % (
            tools.get_current_date(), status, website_id)
        TaskService._db.update(sql)
    TaskService._lock.release()
def add_anchor_info(table, site_id, name='', image_url='', room_id='', room_url='',
                    video_path='', watched_count='', fans_count='', sex='', age='',
                    address='', live_view=1, watched_count_url=''):
    '''
    @summary:
    ---------
    @param table: table name
    @param site_id: website id
    @param name: anchor name
    @param image_url: thumbnail url
    @param room_id: room id
    @param room_url: url of the room page
    @param video_path: url of the room's video stream
    @param watched_count: number of viewers
    @param fans_count: number of fans
    @param sex: gender
    @param age: age
    @param address: anchor's location (city)
    @param live_view: live status (0 not live, 1 live)
    @param watched_count_url: url of the real-time viewer count
    ---------
    @result:
    '''
    anchor_info_dict = {
        'site_id': site_id,
        'name': name,
        'image_url': image_url,
        'sex': sex,
        'age': age,
        'address': address,
        'fans_count': fans_count,
        'watched_count': watched_count,
        'room_id': room_id,
        'room_url': room_url,
        'video_path': video_path,
        'live_view': live_view,
        'record_time': tools.get_current_date(),
        'watched_count_url': watched_count_url,
        'read_status': 0
    }

    if not db.add(table, anchor_info_dict):
        anchor_info_dict.pop('_id')
        db.update(table, {'name': name}, anchor_info_dict)
def add_article_info(table, website_id, source_url, title, content):
    content_info_dict = {
        'site_id': website_id,
        'url': source_url,
        'title': title,
        'content': content,
        'record_time': tools.get_current_date(),
        'read_status': 0
    }
    db.add(table, content_info_dict)
def add_program_episode_info(table, site_id, program_id, episode_num='', time_length='',
                             episode_name='', download_status='', download_url='',
                             episode_url='', summary='', image_url='', sto_path='',
                             play_count=''):
    '''
    @summary:
    ---------
    @param table:
    @param site_id:
    @param program_id: program id
    @param episode_num: current episode number
    @param time_length: duration
    @param episode_name: episode name
    @param download_status: download status
    @param download_url: download url
    @param episode_url: original page url
    @param summary: summary
    @param image_url: image url
    ---------
    @result:
    '''
    download_status = 101 if sto_path else 102
    sto_id = 1 if sto_path else ''

    episode_info = {
        'site_id': site_id,
        'program_id': program_id,
        'episode_num': episode_num,
        'time_length': time_length,
        'episode_name': episode_name,
        'download_status': download_status,
        'download_url': download_url,
        'episode_url': episode_url,
        'summary': summary,
        'image_url': image_url,
        'read_status': 0,
        'record_time': tools.get_current_date(),
        'sto_path': sto_path,
        'sto_id': sto_id,
        'play_count': play_count
    }
    db.add(table, episode_info)