def deal_null_biz(self):
    sql = 'select id, name, domain from TAB_IOPM_SITE t where classify = 2 and t.biz is null'
    accounts_info = self._db.find(sql)
    for account_info in accounts_info:
        print(account_info)
        _id = account_info[0]
        account = account_info[1]
        account_id = account_info[2]

        account_info = self.get_account_info(account_id, account)
        log.debug(tools.dumps_json(account_info))
        if account_info.get('__biz'):
            account = account or account_info.get('account')
            account_id = account_id or account_info.get('account_id')
            __biz = account_info.get('__biz') or ''

            sql = "update TAB_IOPM_SITE set name = '%s', domain = '%s', biz = '%s' where id = %s" % (
                account, account_id, __biz, _id)
            log.debug(sql)
            self._db.update(sql)
        elif not account_info.get('check_info'):
            log.debug('No such official account: %s' % account)

        tools.delay_time(60)
def run(self):
    while not self._thread_stop:
        try:
            self.__add_url_to_db()
            tools.delay_time(1)
        except Exception as e:
            log.error(e)
def run(self):
    while not self._thread_stop:
        try:
            self.__add_article_to_db()
        except Exception as e:
            log.error(e)

        log.debug('Number of articles in cache: %s' % len(self._articles_deque))
        tools.delay_time(1)
def run(self):
    while True:
        tools.delay_time(60 * 60)  # refresh the weights an hour later
        print('Updating clue weights...')
        self.load_clues_weight()
        self.load_classify_weight()
        self.load_related_factor()
        print('Clue weights updated')
def run(self):
    while True:
        try:
            datas = self.get_data_from_redis(SYNC_STEP)
            if not datas:
                print('No data; sleeping...')
            elif self.add_data_to_es(datas):
                self._sync_count += len(datas)
                tools.print_one_line('Synced %d records' % self._sync_count)
            tools.delay_time(1)
        except Exception as e:
            log.error(e)
def main():
    while True:
        if task_status.is_doing:
            log.debug('A task is in progress; not fetching a new one')
            tools.delay_time(SEARCH_TASK_SLEEP_TIME)
            continue

        task_status.is_doing = True

        # fetch a task
        get_task_url = MASTER_ADDRESS + '/task/get_task'
        print(get_task_url)
        update_task_url = MASTER_ADDRESS + '/task/update_task'

        data = tools.get_json_by_requests(get_task_url)
        # tasks = [[209690, '百度新闻', 11, 'http://news.baidu.com/?tn=news', 3]]
        print(data)
        tasks = data.get('tasks', [])
        parser_count = data.get('thread_count')

        def begin_callback():
            log.info('\n********** news begin **********')
            # mark the task as in progress (status 602)
            data = {'tasks': str(tasks), 'status': 602}
            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('Task status updated: in progress...')

        def end_callback():
            log.info('\n********** news end **********')
            task_status.is_doing = False

            # mark the task as done (status 603)
            data = {'tasks': str(tasks), 'status': 603}
            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('Task status updated: done!')

        # configure the spider
        spider = Spider(tab_urls='news:news_urls',
                        parser_count=parser_count,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        parser_params=tasks,
                        delete_tab_urls=False)

        # register the parser
        spider.add_parser(news_parser)
        spider.start()
def add_root_url(parser_params={}):
    log.debug('''
        add root url
        parser_params : %s
    ''' % str(parser_params))

    keywords = parser_params['keywords']
    for keyword in keywords:
        if keyword:
            url = 'http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=y&type=1&page=1&ie=utf8' % keyword
            if mongodb.find('WWA_wechat_account_url', {'url': url}):
                continue

            headers = {
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Cookie": "wuid=AAGPF/32GQAAAAqLFD2BdAAAGwY=; CXID=A468F618D67D4868DC83E6061B1B3CCC; ABTEST=0|1500285612|v1; weixinIndexVisited=1; SUV=006317867B7CC4C5596C8AAD6B089707; SUIR=0A14ACB4D0CA9B50A8ABB33CD0CA69FA; ld=ekllllllll2BbH49lllllVOm1tylllll1kecBlllll9lllll9Zlll5@@@@@@@@@@; ad=AZllllllll2Bzw7GlllllVOeQA6lllll1kectkllll9lllllVqxlw@@@@@@@@@@@; SUID=72780CD23D148B0A59688B0C0002AD65; IPLOC=CN1100; sct=11; SNUID=B4B50E097177247B9A6BE55E72153425; JSESSIONID=aaaVCfkabuJQTfaNW5f1v",
                "Host": "weixin.sogou.com"
            }

            html, r = tools.get_html_by_requests(url, headers=headers)

            # check whether the official account exists
            not_page_tip = '/new/pc/images/bg_404_2.png'
            if not_page_tip in html:
                continue

            # get the page count
            regex = 'id="pagebar_container">.*>(\d*?)</a>.*?<a id="sogou_next"'
            page_num = tools.get_info(html, regex, fetch_one=True)
            page_num = int(page_num) if page_num else 1

            for page in range(1, page_num + 1):
                url = 'http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=y&type=1&page=%d&ie=utf8' % (
                    keyword, page)
                base_parser.add_url('WWA_wechat_account_url', SITE_ID, url)

    tools.delay_time()
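# A minimal, standalone sketch of the page-count extraction above, assuming
# tools.get_info(html, regex, fetch_one=True) behaves roughly like
# re.search(...).group(1) with DOTALL. The markup below is hypothetical.
#
#   import re
#
#   demo_html = '<div id="pagebar_container"><a>9</a><a id="sogou_next">next</a></div>'
#   match = re.search(r'id="pagebar_container">.*>(\d*?)</a>.*?<a id="sogou_next"', demo_html, re.S)
#   demo_page_num = int(match.group(1)) if match and match.group(1) else 1
#   print(demo_page_num)  # -> 9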
def run(self):
    is_show_tip = False
    while True:
        try:
            datas = self.get_data_from_redis(SYNC_STEP)
            if not datas:
                if not is_show_tip:
                    print('\n{time} no data; sleeping... '.format(
                        time=tools.get_current_date()))
                    is_show_tip = True
            elif self.add_data_to_es(datas):
                is_show_tip = False
                self._sync_count += len(datas)
                tools.print_one_line('Synced %d records' % self._sync_count)
            tools.delay_time(1)
        except Exception as e:
            log.error(e)
def run(self):
    while True:
        tools.delay_time(60 * 60)
        print('Updating the event knowledge base...')
        self._event_knowledges = self.load_event_knowledges()
        print('Event knowledge base updated')
import sys
sys.path.append('..')
import init
import utils.tools as tools
from utils.log import log
from db.oracledb import OracleDB
from base.wechat_public_platform import WechatPublicPlatform
from base.wechat_sogou import WechatSogou

if __name__ == '__main__':
    db = OracleDB()
    # wechat_public_platform = WechatPublicPlatform()
    wechat_sogou = WechatSogou()

    # fetch the WeChat account names
    # sql = 'select t.name, t.keyword2 from TAB_IOPM_CLUES t where t.zero_id = 7 and t.first_id = 137 and t.second_id = 183'
    # accounts = db.find(sql)
    accounts = ['骨朵网络影视']
    for account in accounts:
        account_id = ''
        account_name = account

        biz = wechat_sogou.get_biz(account_id=account_id, account=account_name)
        if biz:
            sql = "insert into TAB_IOPM_SITE t (t.id, t.name, t.position, t.classify, t.mointor_status, t.biz, t.priority) values (seq_iopm_site.nextval, '{name}', 1, 2, 701, '{biz}', 1)".format(
                name=account_name, biz=biz)
            print(sql)
            db.add(sql)
        tools.delay_time(10)
        # break
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    weibo_id = url_info['remark']['search_keyword']
    monitor_type = url_info['remark']['monitor_type']

    for i in range(1, 100):
        weibo_content_url = root_url + '&page=%d' % i

        # proxy settings
        headers = {
            "Cache-Control": "max-age=0",
            "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host": "m.weibo.cn",
            "Accept-Encoding": "gzip, deflate, br",
            "Upgrade-Insecure-Requests": "1",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        proxies = base_parser.get_proxies()
        headers["User-Agent"] = base_parser.get_user_agent()
        proxies = {}

        html = tools.get_json_by_requests(weibo_content_url, headers=headers, proxies=proxies)
        cards = tools.get_json_value(html, 'cards')
        if len(cards) < 2:
            base_parser.update_url('WWA_weibo_info_urls', root_url, Constance.DONE)
            return

        tools.delay_time(10)
        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')

            # proxy settings
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            proxies = base_parser.get_proxies()
            headers["User-Agent"] = base_parser.get_user_agent()
            proxies = {}

            origin_html, r = tools.get_html_by_requests(url, headers=headers, proxies=proxies)
            if not origin_html:
                continue

            release_time = get_release_time(mblog)
            come_from = tools.get_json_value(mblog, 'source')

            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            sexy_image_url = []

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"', '').replace('\\n', '')
            if image_url:
                image_url = image_url.split(',')
                for j in range(len(image_url)):
                    image_url[j] = 'http://wx2.sinaimg.cn/large/' + image_url[j] + '.jpg'
                sexy_image_url = image_url
                image_url = ','.join(image_url)

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))

            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')

            # sensitive events
            sensitive_id = ''
            if monitor_type == 1 or monitor_type == 2:
                sensitive_event_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                    keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                    keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                    if base_parser.is_violate(content, key1=keyword1, key2=keyword2, key3=keyword3):
                        sensitive_id = _id
                        break

            # violation events
            violate_id = ''
            if monitor_type == 0 or monitor_type == 2:
                vioation_knowledge_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
                for vioation_knowledge_info in vioation_knowledge_infos:
                    _id = vioation_knowledge_info[0]
                    keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                    keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                    keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                    if base_parser.is_violate(content, key1=keyword1, key2=keyword2, key3=keyword3):
                        violate_id = _id
                        break

            # download the video
            is_mp4 = tools.is_file(video_url, 'mp4')
            if is_mp4:
                local_video_path = FILE_LOCAL_PATH + 'videos/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.mp4'
                is_download = tools.download_file(video_url, local_video_path)
                video_url = local_video_path if is_download else ''
            else:
                video_url = ''

            log.debug('''
                source url:      %s
                weibo id:        %s
                release time:    %s
                source:          %s
                content:         %s
                image url:       %s
                video url:       %s
                repost count:    %s
                praise count:    %s
                violation id:    %s
                sensitive id:    %s
                image-check urls %s
            ''' % (url, weibo_id, release_time, come_from, content, image_url, video_url,
                   transpond_count, praise_count, violate_id, sensitive_id, sexy_image_url))

            if content:
                base_parser.add_wwa_weibo_info_info('WWA_weibo_info_info', SITE_ID, url, weibo_id, release_time,
                                                    come_from, content, image_url, video_url, transpond_count,
                                                    praise_count, violate_id, sensitive_id=sensitive_id,
                                                    sexy_image_url=sexy_image_url)
            tools.delay_time()

    base_parser.update_url('WWA_weibo_info_urls', root_url, Constance.DONE)
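# Standalone illustration of the pic_ids expansion in the parser above: each
# id from the "pic_ids" JSON field maps to a large-image URL. The ids below
# are hypothetical.
#
#   demo_pic_ids = 'abc123,def456'
#   demo_image_urls = ['http://wx2.sinaimg.cn/large/%s.jpg' % pid for pid in demo_pic_ids.split(',')]
#   print(demo_image_urls)
#   # -> ['http://wx2.sinaimg.cn/large/abc123.jpg', 'http://wx2.sinaimg.cn/large/def456.jpg']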
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
        "Host": "weixin.sogou.com",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers = headers, proxies = proxies)
    # print(html)
    html, request = tools.get_html_by_requests(root_url, headers=headers, proxies=proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    print(root_url)
    log.debug('Fetching article links ' + check_info)
    if check_info:  # a captcha page came back; retry later
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # official-account info block
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one=True)
    # url
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one=True)
    account_url = account_url.replace('&amp;', '&')
    log.debug('account_url = ' + account_url)

    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # using a proxy triggers a captcha; disabled for now

    html, request = tools.get_html_by_requests(account_url, headers=headers, proxies=proxies)

    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('''
        fetching article details %s
        url                      %s
        request.headers          %s
    ''' % (check_info, account_url, request.headers))

    # print(html)
    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one=True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', {})
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title': title})
        if is_have:
            log.debug(title + ' already exists')
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # download the cover image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;', '&')

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one=True)

        # download the images embedded in content, then replace their original addresses
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            # derive the file extension from the wx_fmt url parameter, defaulting to jpg
            if 'wx_fmt=' in image:
                ext_start = image.find('wx_fmt=') + len('wx_fmt=')
                ext_end = image.find('&', ext_start)
                ext = image[ext_start:ext_end if ext_end != -1 else None]
            else:
                ext = 'jpg'
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.' + ext
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
            tools.delay_time(5)

        # sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

        # violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            title            %s
            summary          %s
            image url        %s
            article url      %s
            release time     %s
            content          %s
            local image url  %s
            violation id     %s
            sensitive id     %s
            image-check urls %s
        ''' % (title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary,
                                            image_url, article_url, release_time, content, video_url='',
                                            local_image_url=local_image_url, violate_status=violate_id,
                                            sensitive_id=sensitive_id, sexy_image_url=sexy_image_url)

        # articles published on the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for article in oneday_article_list:
            title = tools.get_json_value(article, 'title')
            summary = tools.get_json_value(article, 'digest')
            image_url = tools.get_json_value(article, 'cover')

            sexy_image_url = []

            # download the cover image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;', '&')

            content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one=True)

            # download the images embedded in content, then replace their original addresses
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                # derive the file extension from the wx_fmt url parameter, defaulting to jpg
                if 'wx_fmt=' in image:
                    ext_start = image.find('wx_fmt=') + len('wx_fmt=')
                    ext_end = image.find('&', ext_start)
                    ext = image[ext_start:ext_end if ext_end != -1 else None]
                else:
                    ext = 'jpg'
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.' + ext
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
                tools.delay_time(5)

            # sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

            # violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []
                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
                title            %s
                summary          %s
                image url        %s
                article url      %s
                release time     %s
                content          %s
                local image url  %s
                violation id     %s
                sensitive id     %s
                image-check urls %s
            ''' % (title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary,
                                                image_url, article_url, release_time, content, video_url='',
                                                local_image_url=local_image_url, violate_status=violate_id,
                                                sensitive_id=sensitive_id, sexy_image_url=sexy_image_url)

    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
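# Standalone sketch of the wx_fmt extension lookup used when saving content
# images above, here via urllib.parse instead of find() slicing. The URL is
# hypothetical.
#
#   from urllib.parse import urlparse, parse_qs
#
#   demo_image = 'http://mmbiz.qpic.cn/mmbiz_png/abc?wx_fmt=png&tp=webp'
#   demo_ext = parse_qs(urlparse(demo_image).query).get('wx_fmt', ['jpg'])[0]
#   print(demo_ext)  # -> 'png'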
def get_datas(root_url):
    count = 0
    page = 1
    retry_times = 0
    max_retry_times = 5
    while True:
        url = root_url % page
        print(url)
        datas = tools.get_json_by_requests(url, headers=HEADERS)
        if not datas:
            if retry_times > max_retry_times:
                break
            else:
                retry_times += 1
                tools.delay_time(2)
                continue
        else:
            retry_times = 0

        if datas['message'] == '查询记录为0':  # server reports "query returned 0 records"
            print('100 records per page; page %d is empty. Exported %d records in total' % (page, count))
            break

        messages = datas['data']['data']
        for msg in messages:
            if not msg['url']:
                continue

            weight = 0  # weight

            clues_ids = msg['cluesIds']

            # fetch an article id from the sequence
            sql = 'select SEQ_IOPM_ARTICLE.nextval from dual'
            article_id = db.find(sql)[0][0]

            def export_callback(execute_type, sql, data_json):
                if execute_type != ExportData.EXCEPTION:
                    for clues_id in clues_ids.split(','):
                        print(clues_id)
                        key_map = {
                            'id': 'vint_sequence.nextval',
                            'article_id': 'vint_%d' % article_id,
                            'clues_id': 'vint_%s' % clues_id
                        }
                        export_data.export_to_oracle(key_map=key_map,
                                                     aim_table='TAB_IOPM_ARTICLE_CLUES_SRC',
                                                     datas=[{}],
                                                     sync_to_es=True)

            is_negative_emotion = (msg['emotion'] == 2) and 1 or 0
            is_vip = vip_checked.is_vip(msg['url']) or vip_checked.is_vip(msg['websiteName']) or vip_checked.is_vip(msg['author'])

            # compute the weight
            print('===============================')
            url = IOPM_SERVICE_ADDRESS + '/related_sort?article_id=%d&clues_ids=%s&may_invalid=%s&vip_count=%s&negative_emotion_count=%s' % (
                article_id, msg['cluesIds'], msg['mayInvalid'] or '0', is_vip and 1 or 0, is_negative_emotion)
            weight = tools.get_json_by_requests(url).get('weight', 0)
            print(url)
            print('----------------------------')

            key_map = {
                'id': 'vint_%d' % article_id,
                'account': 'str_account',
                'author': 'str_author',
                'clues_ids': 'str_cluesIds',
                'comment_count': 'int_commtcount',
                'content': 'clob_content',
                'emotion': 'vint_%s' % (msg['emotion'] or 3),
                'host': 'str_host',
                'keywords': 'str_keywords',
                'image_url': 'str_picture',
                'release_time': 'date_pubtime',
                'review_count': 'int_reviewCount',
                'title': 'str_title',
                'info_type': 'int_type',
                'up_count': 'int_upCount',
                'url': 'str_url',
                'uuid': 'str_uuid',
                'website_name': 'str_websiteName',
                'MAY_INVALID': 'int_mayInvalid',
                'KEYWORD_CLUES_ID': 'str_keywordAndIds',
                'keywords_count': 'vint_%d' % len(msg['keywords'].split(',')),
                'is_vip': 'vint_%d' % (1 if is_vip else 0),  # fixed: the old '%' / 'or' chain applied the format before the 'or's
                'weight': 'vint_%s' % weight,
                'record_time': 'vdate_%s' % tools.get_current_date(),
                'transmit_count': 'str_forwardcount',
                'INTERACTION_COUNT': 'vint_%s' % get_interaction_count(msg['commtcount'], msg['reviewCount'], msg['forwardcount'], msg['upCount'])
            }

            export_data.export_to_oracle(key_map=key_map,
                                         aim_table='TAB_IOPM_ARTICLE_INFO',
                                         unique_key='url',
                                         datas=msg,
                                         callback=export_callback,
                                         unique_key_mapping_source_key={'url': 'str_url'},
                                         sync_to_es=True)
            count += 1
        page += 1
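# The 'str_'/'int_'/'vint_'/'date_'/'clob_' prefixes in key_map above appear
# to encode the column type, with a leading 'v' marking an inline literal
# value rather than a key into the source record. That reading is an
# assumption about ExportData's internals, illustrated by this stand-in:
#
#   def demo_resolve_key_map_value(value, record):
#       type_tag, _, rest = value.partition('_')
#       if type_tag.startswith('v'):  # e.g. 'vint_8' carries the literal 8
#           return rest
#       return record.get(rest)       # e.g. 'str_author' reads record['author']
#
#   print(demo_resolve_key_map_value('vint_8', {}))                     # '8'
#   print(demo_resolve_key_map_value('str_author', {'author': 'Tom'}))  # 'Tom'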
if __name__ == '__main__':
    # sample hot-spot payload:
    # "hot_value": 52.0,
    # "article_count": 8,
    # "clues_ids": "250,925,924,389,274,924,250,273,250,430,279,916,916,925,925,274,274,250,275,102,274,916,927,953,930,927,930,930,250,928,928,109,273,928",
    # "vip_count": 3,
    # "zero_ids": "6,2,5,7",
    # "negative_emotion_count": 8,
    # "hot_id": "f443d613-bc0e-330b-9643-7798e0c5ca97"
    related_sort = RelatedSortService()
    related_sort.start()

    clue_ids = '936,936,274,936'
    a = related_sort.deal_hot('25cd565c-4c0d-30a8-b853-21913e2dc6fa',
                              hot_value=52.0,
                              clues_id=clue_ids,
                              zero_ids='6,2,5,7',
                              article_count=8,
                              vip_count=3,
                              negative_emotion_count=8)
    print(a)
    tools.delay_time(5)

    # b = related_sort.get_article_releated_weight(1123802)
    # print(b)
    # related_sort.load_related_factor()
    # print(related_sort.get_related_factor(RelatedSortService.CLUES_FACTOR))
    # print(related_sort.get_related_factor(RelatedSortService.HOT_FACTOR))
    # print(0.23 * 0.3 + 0.25 * 0.7 + 0.2 * 1 + 0.5 * 0)
def monitor_task():
    task_manager = TaskManager()
    total_time = 0
    task_count = 0
    begin_time = None
    end_time = None
    spend_hours = None
    is_show_start_tip = False
    is_show_have_task = False

    while True:
        task_count = task_manager.get_task_count()
        if not task_count:
            if not is_show_start_tip:
                log.info('Started monitoring the task pool...')
                is_show_start_tip = True

            total_time += CHECK_HAVE_TASK_SLEEP_TIME
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)
        else:
            if not is_show_have_task:
                log.info('The task pool holds %s tasks; workers can proceed' % task_count)
                is_show_have_task = True

            total_time = 0
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)

        if total_time > MAX_NULL_TASK_TIME:
            is_show_start_tip = False
            is_show_have_task = False

            # one round finished; collect some statistics
            if begin_time:
                # elapsed time
                end_time = tools.timestamp_to_date(
                    tools.get_current_timestamp() - MAX_NULL_TASK_TIME)
                spend_time = tools.date_to_timestamp(
                    end_time) - tools.date_to_timestamp(begin_time)
                spend_hours = tools.seconds_to_h_m_s(spend_time)

                # url counts
                depth_count_info = task_manager.get_ever_depth_count(5)

                # article counts
                article_count_msg = statistic_article_count.get_article_count_msg(
                    begin_time, end_time)

                log.info(
                    '''
                    ------- round finished --------
                    \rbegin time: %s
                    \rend time: %s
                    \relapsed: %s
                    \rsite count: %s
                    \rurl count info: %s
                    \rarticle count info: %s
                    ''' % (begin_time, end_time, spend_hours, task_count,
                           tools.dumps_json(depth_count_info), article_count_msg))

                # clear url fingerprints
                log.info('Clearing url fingerprints...')
                task_manager.clear_task()

            log.info('redis has had no tasks for %s seconds, past the allowed maximum of %s seconds; adding tasks' % (total_time, MAX_NULL_TASK_TIME))

            # fetch tasks
            tasks = task_manager.get_task_from_oracle()
            if tasks:
                total_time = 0
                task_manager.add_task_to_redis(tasks)
                task_count = task_manager.get_task_count()
                if task_count:
                    begin_time = tools.get_current_date()
                    log.info('Added %s tasks to redis; workers start' % (task_count))
            else:
                log.error('No tasks fetched from oracle')
def run(self):
    while True:
        tools.delay_time(60 * 60)
        print('Updating keywords...')
        self._clues = self.get_clues()
        print('Keywords updated')
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    monitor_type = url_info['remark']

    for i in range(2, 100):
        list_url = root_url + '&page=%d' % i
        html = tools.get_json_by_requests(list_url)
        cards = tools.get_json_value(html, 'cards')

        card_group = []
        for card in cards:
            card_group = tools.get_json_value(card, 'card_group')
            if card_group:
                break
        if not card_group:
            break

        for info in card_group:
            user_info = tools.get_json_value(info, 'user')
            _id = tools.get_json_value(user_info, 'id')

            user_url = 'http://m.weibo.cn/api/container/getIndex?containerid=230283%s_-_INFO' % _id
            user_url_html = tools.get_json_by_requests(user_url)
            user_url_cards = tools.get_json_value(user_url_html, 'cards')
            user_url_card_group = tools.get_json_value(user_url_cards[0], 'card_group')

            area = ''
            for item in user_url_card_group:
                if tools.get_json_value(item, 'item_name') == '所在地':  # '所在地' = the "location" field returned by the API
                    area = tools.get_json_value(item, 'item_content')
                else:
                    continue

            name = tools.get_json_value(user_info, 'screen_name')

            is_verified_reason = 101
            verified_reason = tools.get_json_value(user_info, 'verified_reason')
            if verified_reason:
                is_verified_reason = 102

            sex = tools.get_json_value(user_info, 'gender')
            if sex == 'f':
                sex = 1
            elif sex == 'm':
                sex = 0
            else:
                sex = ''

            image_url = tools.get_json_value(user_info, 'profile_image_url')
            url = tools.get_json_value(user_info, 'profile_url')
            summary = tools.get_json_value(user_info, 'description')

            user_url_2 = 'http://m.weibo.cn/api/container/getIndex?containerid=100505%s' % _id
            user_url_html_2 = tools.get_json_by_requests(user_url_2)
            fans_count = tools.get_json_value(user_url_html_2, 'userInfo.followers_count')
            follow_count = tools.get_json_value(user_url_html_2, 'userInfo.follow_count')

            log.debug('''
                user id:        %s
                screen name:    %s
                weibo url:      %s
                avatar url:     %s
                verification:   %s
                is verified:    %s
                location:       %s
                gender:         %s
                summary:        %s
                fans count:     %s
                follow count:   %s
                monitor status: %s
            ''' % (_id, name, url, image_url, verified_reason, is_verified_reason, area, sex,
                   summary, fans_count, follow_count, monitor_type))

            base_parser.add_wwa_weibo_user_info('WWA_weibo_user_info', SITE_ID, _id, name, url, image_url,
                                                verified_reason, is_verified_reason, area, sex, summary,
                                                fans_count, follow_count, monitor_type)
            tools.delay_time()

    base_parser.update_url('WWA_weibo_user_urls', root_url, Constance.DONE)
    tools.delay_time()

# parser({'url': 'http://m.weibo.cn/api/container/getIndex?type=user&containerid=100103type%3D3%26q%3D%E9%87%8D%E5%BA%86%E7%94%B5%E8%A7%86%E5%8F%B0'})
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Cookie": "wuid=AAGPF/32GQAAAAqLFD2BdAAAGwY=; CXID=A468F618D67D4868DC83E6061B1B3CCC; ABTEST=0|1500285612|v1; weixinIndexVisited=1; SUV=006317867B7CC4C5596C8AAD6B089707; SUIR=0A14ACB4D0CA9B50A8ABB33CD0CA69FA; ld=ekllllllll2BbH49lllllVOm1tylllll1kecBlllll9lllll9Zlll5@@@@@@@@@@; ad=AZllllllll2Bzw7GlllllVOeQA6lllll1kectkllll9lllllVqxlw@@@@@@@@@@@; SUID=72780CD23D148B0A59688B0C0002AD65; IPLOC=CN1100; sct=11; SNUID=B4B50E097177247B9A6BE55E72153425; JSESSIONID=aaaVCfkabuJQTfaNW5f1v",
        "Host": "weixin.sogou.com"
    }

    # parse
    html, request = tools.get_html_by_requests(root_url, headers=headers)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('Fetching the official-account list ' + check_info)

    # official-account info blocks
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_blocks = tools.get_info(html, regex)
    if not account_blocks:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    # article-count url
    regex = '<script>var account_anti_url = "(.*?)";</script>'
    articles_count_url = tools.get_info(html, regex, fetch_one=True)
    articles_count_url = tools.get_full_url('http://weixin.sogou.com', articles_count_url)
    articles_count_json = tools.get_json_by_requests(articles_count_url).get('msg', {})

    for account_block in account_blocks:
        # print(account_block)
        regex = '<a.*?account_name.*?>(.*?)</a>'
        name = tools.get_info(account_block, regex, fetch_one=True)
        name = tools.del_html_tag(name)

        is_have = mongodb.find('WWA_wechat_official_accounts', {'name': name})
        if is_have:
            log.debug(name + ' already exists')
            continue

        regex = '<div class="img-box">.*?<img src="(.*?)"'
        image_url = tools.get_info(account_block, regex, fetch_one=True)

        # download the avatar
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''

        regex = '<p class="tit">.*?(<i></i>).*?<p class="info">'
        is_verified = 102 if tools.get_info(account_block, regex, fetch_one=True) else 101

        regex = '<label name="em_weixinhao">(.*?)</label>'
        account_id = tools.get_info(account_block, regex, fetch_one=True)

        regex = '<li id="sogou_vr_.*?d="(.*?)">'
        article_count_key = tools.get_info(account_block, regex, fetch_one=True)
        article_count = articles_count_json.get(article_count_key, '')
        article_count = article_count[:article_count.find(',')]

        regex = '<dt>功能介绍.*?<dd>(.*?)</dd>'  # '功能介绍' = the "feature description" section on the page
        summary = tools.get_info(account_block, regex, fetch_one=True)
        summary = tools.del_html_tag(summary)

        regex = "认证.*?<dd>(.*?)</dd>"  # '认证' = the "certification" section
        certification = tools.get_info(account_block, regex, fetch_one=True)

        regex = '微信扫一扫关注.*?<img.*?src="(.*?)"'  # '微信扫一扫关注' = the "scan to follow" QR block
        barcode_url = tools.get_info(account_block, regex, fetch_one=True)
        barcode_url = barcode_url.replace('&amp;', '&')

        # download the QR code
        local_barcode_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(barcode_url, local_barcode_url)
        local_barcode_url = local_barcode_url if is_download else ''

        regex = '<a.*?account_name.*?href="(.*?)">'
        account_url = tools.get_info(account_block, regex, fetch_one=True)
        account_url = account_url.replace('&amp;', '&')

        log.debug('''
            account name        %s
            account id          %s
            account url         %s
            image url           %s
            local image         %s
            article count       %s
            summary             %s
            certification       %s
            is verified         %s
            barcode url         %s
            local barcode       %s
        ''' % (name, account_id, account_url, image_url, local_image_url, article_count,
               summary, certification, is_verified, barcode_url, local_barcode_url))

        base_parser.add_wechat_account_info('WWA_wechat_official_accounts', site_id, name, account_id,
                                            account_url, image_url, local_image_url, article_count,
                                            summary, certification, is_verified, barcode_url,
                                            local_barcode_url)

    base_parser.update_url('WWA_wechat_account_url', root_url, Constance.DONE)
    tools.delay_time()
def main():
    # Opening the file in Notepad prepends \ufeff (a BOM) to the conf text; strip it
    content = tools.read_file('config.conf')
    tools.write_file('config.conf', content.replace('\ufeff', ''))

    # read the config
    cp = configparser.ConfigParser(allow_no_value=True)
    with codecs.open('config.conf', 'r', encoding='utf-8') as f:
        cp.read_file(f)

    sections = cp.sections()
    for section in sections:
        remote_url = cp.get(section, 'remote_url')
        local_save_path = cp.get(section, 'local_save_path')
        project_path = cp.get(section, 'project_path')
        main_lnk_paths = cp.get(section, 'main_lnk_paths').split(',')
        sync_files = cp.get(section, 'sync_files').split(',')
        ignore_files = cp.get(section, 'ignore_files').split(',')

        # run the update
        update_code = UpdateCode(remote_url, local_save_path, project_path, main_lnk_paths, sync_files, ignore_files)
        if update_code.check_remote_tag():
            update_code.download_code()
            update_code.copy_file()
            update_code.close_process()
            update_code.start_process()

if __name__ == '__main__':
    while True:
        main()
        tools.delay_time(60 * 60)
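# For reference, a config.conf section this loop can read (all values are
# hypothetical; only the key names are taken from the cp.get() calls above):
#
#   [project_a]
#   remote_url = http://example.com/project_a/code.zip
#   local_save_path = D:/download/project_a
#   project_path = D:/projects/project_a
#   main_lnk_paths = project_a.lnk
#   sync_files = .py,.conf
#   ignore_files = config.conf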
def add_root_url(keywords):
    log.debug('''
        add root url
        parser_params : %s
    ''' % str(keywords))

    # page_max_count = 236
    for keyword in keywords:
        next_keyword = False
        for page_num in range(1, 236):
            url = ('https://m.weibo.cn/api/container/getIndex?type=wb&queryVal=%s' % keyword
                   + '&featurecode=20000320&luicode=10000011&lfid=100103type%3D1%26q%3D' + keyword
                   + '&title=' + keyword
                   + '&containerid=100103type%3D2%26q%3D' + keyword
                   + '&page=%d' % page_num)
            # base_parser.add_url('WEIBO_urls', SITE_ID, url)
            print('-----------------------------------')
            print(keyword)
            print(url)

            info_json = tools.get_json_by_requests(url)
            # log.debug(info_json)
            info_list = info_json.get('data', {}).get('cards', [])
            if info_list:
                info_list = info_list[0]['card_group']
            else:
                info_list = []
                next_keyword = True

            for weibo_info in info_list:
                content = weibo_info['mblog']['text']
                _id = weibo_info['mblog']['id']
                release_time = weibo_info['mblog']['created_at']
                release_time = get_release_time(release_time)
                url = 'https://m.weibo.cn/status/' + _id
                user_name = weibo_info['mblog']['user']['screen_name']
                video_url = tools.get_info(str(weibo_info), 'stream_url":"(.+?)"', fetch_one=True)
                reposts_count = weibo_info['mblog']['reposts_count']
                comments_count = weibo_info['mblog']['comments_count']
                attitudes_count = weibo_info['mblog']['attitudes_count']

                is_continue = base_parser.save_weibo_info(
                    'WEIBO_info', site_id=SITE_ID, content=content,
                    release_time=release_time, user_name=user_name,
                    video_url=video_url, _id=_id, url=url,
                    reposts_count=reposts_count, comments_count=comments_count,
                    attitudes_count=attitudes_count)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break

    tools.delay_time(10)
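# Standalone sketch of how the percent-encoded containerid above is formed:
# it is the URL-encoded form of '100103type=2&q=<keyword>'. The keyword is
# hypothetical.
#
#   from urllib.parse import quote
#
#   demo_keyword = 'example'
#   demo_containerid = quote('100103type=2&q=' + demo_keyword, safe='')
#   print(demo_containerid)  # -> '100103type%3D2%26q%3Dexample'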
def read_data():
    try:
        with codecs.open('data.txt', 'r', encoding='utf-8') as datas:
            info = datas.readlines()[-1].strip()
        infos = info.split(',')
        return infos[-1]
    except Exception as err:
        print(err)
        return ''

while True:
    with open(packets_file_path, 'rb') as file:
        streams = file.read()
        # print(streams.decode('gbk', 'ignore'))
        tools.delay_time(2)

    stream_url = tools.get_info(streams.decode('gbk', 'ignore'), 'str_stream_url[a-z](.+?)\n', allow_repeat=False)
    try:
        print(len(stream_url))
        stream = stream_url[-1]
        url = read_data()
        if stream == url:
            print('Duplicate record; not saved')
        else:
            with open('data.txt', 'a', encoding='utf-8') as f:
                f.write(stream + '\n')
            print('Saved stream url: %s' % stream)
    except Exception as err:
        print(err)
            article_info['RELEASE_TIME'] = news.get('release_time')
            article_info['RECORD_TIME'] = news.get('record_time')
            # article_info['RELEASE_TIME'] = tools.get_current_date()
            article_info['URL'] = news.get('url')
            article_info['UUID'] = news.get('uuid')
            article_info['WEBSITE_NAME'] = news.get('site_name')
            article_info['AUTHOR'] = news.get('author')
            article_info['INFO_TYPE'] = 8
            article_info['ID'] = news.get('uuid')
            article_info['SUMMARY'] = news.get('title')
            article_info['IMAGE_URL'] = news.get('image_url')

            article_infos.append(article_info)
            max_record_time = news.get('record_time')

        self.deal_article(article_infos)
        self.record_now_record_time(max_record_time)

if __name__ == '__main__':
    video_sync = VideoSync()
    while True:
        video_news_list = video_sync.get_article()
        # print(video_news_list)
        if not video_news_list:
            log.debug('Synced up to the latest data; sleep %ds ...' % SLEEP_TIME)
            tools.delay_time(SLEEP_TIME)
        else:
            video_sync.deal_video_article(video_news_list)
        oracledb.close()

        # push into redis as the WeChat crawler's task pool
        data = (oralce_id, account_id, account_name, last_article_release_time, biz)
        self._redisdb.sadd('wechat:account', data)

if __name__ == '__main__':
    check_new_article = CheckNewArticle()
    while True:
        accounts = check_new_article.get_wait_check_account()
        while accounts:
            threads = []
            for i in range(MAX_THREAD_COUNT):
                if accounts:
                    thread = threading.Thread(
                        target=check_new_article.check_new_article,
                        args=(accounts.pop(0), ))
                    threads.append(thread)
                    thread.start()
                else:
                    break

            for thread in threads:
                thread.join()

        print('Sleeping 10 minutes before the next check round')
        tools.delay_time(600)
def run(self):
    while True:
        self.monitor_cookies()
        tools.delay_time(MONITOR_COOKIES_INTERVAL)
def deal_news(self):
    '''
    @summary: fetch records from tab_news_csr_result
    ---------
    ---------
    @result:
    '''
    while True:
        body = {
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "csr_res_id": {  # fetch records whose csr_res_id is greater than this
                                "gt": self._current_csr_res_id
                            }
                        }
                    }
                }
            },
            "_source": ["csr_res_id", "csr_content", "start_time"],
            "sort": [{
                "csr_res_id": "asc"
            }]
        }

        news_json = self._es.search('tab_news_csr_result', body)
        news_list = news_json.get('hits', {}).get('hits', [])
        if not news_list:
            log.debug(
                'tab_news_csr_result holds no csr_res_id greater than %s\nsleep %s...' %
                (self._current_csr_res_id, SLEEP_TIME))
            tools.delay_time(SLEEP_TIME)
            continue

        for news_info in news_list:
            news = news_info.get('_source')
            csr_res_id = news.get('csr_res_id')
            csr_content = news.get('csr_content')
            start_time = news.get('start_time')

            log.debug('''
                processing tab_news_csr_result
                csr_res_id   %s
                start_time   %s
                csr_content  %s
            ''' % (csr_res_id, start_time, csr_content))

            # look for a similar article
            similar_hot = None
            hots = self._get_same_day_hots(csr_content, start_time)

            # walk the candidate articles, comparing similarity
            for hot_info in hots:
                hot = hot_info.get('_source')
                hot_text = hot.get('csr_content')
                temp_similarity = compare_text(csr_content, hot_text)
                if temp_similarity > MIN_SIMILARITY:
                    similar_hot = hot
                    break  # hots is sorted by match score, so the first hit is the most similar; no need to look further

            # if a similar article exists, append csr_res_id and bump the hot count;
            # otherwise this record becomes a new hot topic
            if similar_hot:  # a similar hot topic was found
                log.debug('Found the hot topic it belongs to: %s' % similar_hot.get('csr_content'))

                data = {}
                # bump the topic's hot count and append the article id
                data["hot"] = similar_hot["hot"] + 1
                data["csr_res_ids"] = similar_hot["csr_res_ids"] + ',' + csr_res_id

                # update the hot topic
                self._es.update_by_id("tab_news_csr_hot",
                                      data_id=similar_hot.get("hot_id"),
                                      data=data)
            else:  # no similar hot topic found; make the current article a new one
                log.debug('No matching hot topic')
                hot_info = {
                    'hot_id': csr_res_id,
                    'hot': 1,
                    'start_time': start_time,
                    'csr_res_ids': csr_res_id,
                    'csr_content': csr_content
                }
                self._es.add('tab_news_csr_hot', hot_info, data_id=csr_res_id)

            # save the current id
            self._current_csr_res_id = csr_res_id
            self._save_current_id()
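# Self-contained sketch of the "attach to the most similar hot topic or start
# a new one" rule implemented above. The real compare_text is imported
# elsewhere; difflib stands in here, and MIN_SIMILARITY is assumed to be a
# ratio in [0, 1].
#
#   import difflib
#
#   def demo_compare_text(a, b):
#       return difflib.SequenceMatcher(None, a, b).ratio()
#
#   DEMO_MIN_SIMILARITY = 0.5
#   demo_hots = [{'csr_content': 'Flood warning issued for city A', 'hot': 1, 'csr_res_ids': '1001'}]
#   demo_text = 'Flood warning issued for city A today'
#   demo_similar = next((h for h in demo_hots
#                        if demo_compare_text(demo_text, h['csr_content']) > DEMO_MIN_SIMILARITY), None)
#   print('merge into existing hot' if demo_similar else 'create new hot')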