def image_predict(self, image_url):
    if not image_url:
        return -1

    # Remote image: download it first, run recognition, then delete the local copy
    if image_url.startswith('http'):
        local_image_path = TEMP_IMAGE_SAVE_PATH + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_success = tools.download_file(image_url, local_image_path)
        image_url = local_image_path if is_success else image_url
        result = ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)
        tools.del_file(local_image_path)

    # Local image that is not a jpg: convert it to jpg first
    elif not image_url.endswith('jpg'):
        jpg_image_url = image_url[:image_url.rfind('.')] + '.jpg'
        is_success = ffmpeg_manager.convert_file_format(image_url, jpg_image_url)
        image_url = jpg_image_url if is_success else image_url
        result = ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)

    else:
        result = ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)

    return result
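# Hedged usage sketch for image_predict. Assumptions not shown in the source:
# that ImagePornRecg can be constructed with no arguments and that -1 is the
# only failure value; the URL is a hypothetical placeholder.
# recognizer = ImagePornRecg()
# status = recognizer.image_predict('http://example.com/sample.png')
# if status != -1:
#     print('porn-recognition status:', status)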
def spider_picture(p_url, end):
    # Crawl pages 1-10; each page URL is p_url + page number + end
    for i in range(1, 11):
        url = p_url + str(i) + end
        html, r = tools.get_html_by_requests(url)

        regex = 'title=".*?".*?src = "(.*?)".*?<div class="wrapper-listTitle">'
        img_urls = tools.get_info(html, regex)
        regex_name = 'rseat="dsjp7".*?title="(.*?)".*?src = ".*?"'
        names = tools.get_info(html, regex_name)

        FILE_LOCAL_PATH = 'd:'
        # zip() pairs each image with its title and stops at the shorter list,
        # avoiding the IndexError the original manual counter could raise on a mismatch
        for name, img_url in zip(names, img_urls):
            name = tools.del_html_tag(name)
            sto_path = '/picture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
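# Hedged usage sketch: the page URL is assembled as p_url + page_number + end,
# so the caller passes the URL split around the page number. The fragments
# below are hypothetical placeholders, not taken from the source.
# spider_picture('http://example.com/list/', '.html')
# # fetches http://example.com/list/1.html ... http://example.com/list/10.html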
def add_root_urls(url):
    html, r = tools.get_html_by_requests(url)

    regex = '<div class="site-piclist_pic">(.*?)</li>'
    html_infos = tools.get_info(html, regex)
    for info in html_infos:
        regex = 'href = "(.*?)" class="site-piclist_pic_link"'
        url = tools.get_info(info, regex)
        url = url[0] if url else ''

        regex = 'rseat="bigTitle.*?title="(.*?)"'
        name = tools.get_info(info, regex)
        name = name[0] if name else ''
        name = tools.del_html_tag(name)

        video_download_url = get_download_url(url)
        FILE_LOCAL_PATH = 'd:'
        sto_path = '/videos/' + name + '.mp4'
        tools.download_file(video_download_url, FILE_LOCAL_PATH, sto_path)
        print(video_download_url, name)
def spider_picture(p_url, end):
    # Crawl pages 1-6; each page URL is p_url + page number + end
    for i in range(1, 7):
        url = p_url + str(i) + end
        html, r = tools.get_html_by_requests(url)

        regex = '<a class="figure.*?<img.*?src="(.*?)"/>'
        img_urls = tools.get_info(html, regex)
        regex_name = 'data-widget-searchlist-tvname="(.*?)"'
        names = tools.get_info(html, regex_name)

        FILE_LOCAL_PATH = 'd:'
        # zip() pairs each image with its title and stops at the shorter list,
        # avoiding the IndexError the original manual counter could raise on a mismatch
        for name, img_url in zip(names, img_urls):
            name = tools.del_html_tag(name)
            sto_path = '/ViolatePicture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    column_id = remark

    headers = {
        'Host': 'is.snssdk.com',
        'Accept': ' */*',
        'X-SS-Cookie': '_ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'tt-request-time': '1489990271848',
        'Cookie': ' _ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'User-Agent': 'News/6.0.1 (iPhone; iOS 10.2.1; Scale/3.00)',
        'Accept-Language': ' zh-Hans-CN;q=1, en-CN;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': ' keep-alive'
    }

    json = tools.get_json_by_requests(root_url)
    if not json:
        base_parser.update_url('VAApp_urls', root_url, Constance.EXCEPTION)
        return

    datas = json['data']
    for data in datas:
        data = tools.get_json_value(data, 'content')

        title = tools.get_json_value(data, 'title')
        # Skip entries that already exist in the database
        if db.find('VAApp_content_info', {'title': title}):
            continue

        abstract = tools.get_json_value(data, 'abstract')
        abstract = abstract or tools.get_json_value(data, 'content')

        img_url = tools.get_json_value(data, 'image_list.url')
        img_url = img_url or tools.get_json_value(data, 'middle_image.url')
        img_url = img_url or tools.get_json_value(data, 'large_image_list.url')
        img_url = img_url.replace('.webp', '.jpg') if img_url else img_url

        original_url = tools.get_json_value(data, 'article_url')
        original_url = original_url or tools.get_json_value(data, 'share_url')

        release_time = tools.get_json_value(data, 'publish_time')
        # note: the literal key '1481012423' is kept verbatim from the original
        release_time = release_time or tools.get_json_value(data, '1481012423')
        release_time = tools.timestamp_to_date(release_time) if release_time else release_time

        video_msg = tools.get_json_value(data, 'video_play_info')  # needs further parsing
        video_main_url = tools.get_json_value(video_msg, 'video_list.video_2.main_url')
        video_main_url = video_main_url or tools.get_json_value(video_msg, 'video_list.video_1.main_url')
        parse_video_url = tools.compile_js(PARSE_VIDEO_URL_JSFUNC)
        video_url = parse_video_url('base64decode', video_main_url)

        html = tools.get_html_auto_deal_code(original_url)
        regexs = [
            'class="article-content">(.*?)<div class="article-actions">',
            '<div class="content">(.*?)<div class="suggestion-list-con"',
            '<!-- 文章内容 -->(.*?)<!-- @end 文章内容 -->',
            'class="yi-content-text">(.*?)<div class="yi-normal"',
            '<p.*?>(.*?)</p>'
        ]

        if video_url:
            content = abstract
        else:
            content = ''.join(tools.get_info(html, regexs))
            content = tools.del_html_tag(content)
            if len(content) < len(abstract):
                content = abstract

        # Sensitive events
        sensitive_id = ''
        sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
        for sensitive_event_info in sensitive_event_infos:
            _id = sensitive_event_info[0]
            keyword1 = sensitive_event_info[3].split(' ') if sensitive_event_info[3] else []
            keyword2 = sensitive_event_info[4].split(' ') if sensitive_event_info[4] else []
            keyword3 = sensitive_event_info[5].split(' ') if sensitive_event_info[5] else []
            if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                sensitive_id = _id

        # Violation events
        violate_id = ''
        vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
        for vioation_knowledge_info in vioation_knowledge_infos:
            _id = vioation_knowledge_info[0]
            keyword1 = vioation_knowledge_info[2].split(' ') if vioation_knowledge_info[2] else []
            keyword2 = vioation_knowledge_info[3].split(' ') if vioation_knowledge_info[3] else []
            keyword3 = vioation_knowledge_info[4].split(' ') if vioation_knowledge_info[4] else []
            if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                violate_id = _id

        log.debug('''
            title:          %s
            abstract:       %s
            img_url:        %s
            original_url:   %s
            release_time:   %s
            video_main_url: %s
            video_url:      %s
            content:        %s
            column_id:      %d
            sensitive_id:   %d
            violate_id:     %d
            ''' % (title, abstract, img_url, original_url, release_time,
                   video_main_url, video_url, content, column_id,
                   sensitive_id or 0, violate_id or 0))

        # If this is the video column and the item is neither sensitive nor
        # violating, skip the download
        if column_id == VIDEO:
            if not sensitive_id and not violate_id:
                continue

        # Download
        base_path = FILE_LOCAL_PATH
        is_download = 0

        # Download the image
        img_name = ''
        if img_url:
            img_name = 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + '/' + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(img_url, base_path, img_name)
            if not is_download:
                img_name = ''

        # Download the video
        video_name = ''
        if video_url:
            video_name = 'videos/' + tools.get_current_date(date_format='%Y-%m-%d') + '/' + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.mp4'
            is_download = tools.download_file(video_url, base_path, video_name)
            if not is_download:
                video_name = ''

        if original_url:
            base_parser.add_va_app_content_info(
                'VAApp_content_info', SITE_ID, title, abstract, img_url, img_name,
                original_url, release_time, video_url, video_name, content,
                column_id, is_download, sensitive_id, violate_id, STORAGE_ID)

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    weibo_id = url_info['remark']['search_keyword']
    monitor_type = url_info['remark']['monitor_type']

    for page in range(1, 100):
        weibo_content_url = root_url + '&page=%d' % page

        # Proxy
        headers = {
            "Cache-Control": "max-age=0",
            "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host": "m.weibo.cn",
            "Accept-Encoding": "gzip, deflate, br",
            "Upgrade-Insecure-Requests": "1",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        proxies = base_parser.get_proxies()
        headers["User-Agent"] = base_parser.get_user_agent()
        proxies = {}  # proxies disabled for now

        content_json = tools.get_json_by_requests(weibo_content_url, headers=headers, proxies=proxies)

        cards = tools.get_json_value(content_json, 'cards')
        if len(cards) < 2:
            base_parser.update_url('WWA_weibo_info_urls', root_url, Constance.DONE)
            return

        tools.delay_time(10)

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')

            # Proxy
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            proxies = base_parser.get_proxies()
            headers["User-Agent"] = base_parser.get_user_agent()
            proxies = {}  # proxies disabled for now

            origin_html, r = tools.get_html_by_requests(url, headers=headers, proxies=proxies)
            if not origin_html:
                continue

            release_time = get_release_time(mblog)
            come_from = tools.get_json_value(mblog, 'source')

            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            content = content.replace('\\', '')

            sexy_image_url = []

            regexs = [r'"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"', '').replace('\\n', '')
            if image_url:
                image_url = image_url.split(',')
                for i in range(len(image_url)):
                    image_url[i] = 'http://wx2.sinaimg.cn/large/' + image_url[i] + '.jpg'
                sexy_image_url = image_url
                image_url = ','.join(image_url)

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))

            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')

            # Sensitive events
            sensitive_id = ''
            if monitor_type == 1 or monitor_type == 2:
                sensitive_event_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                    keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                    keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                    if base_parser.is_violate(content, key1=keyword1, key2=keyword2, key3=keyword3):
                        sensitive_id = _id
                        break

            # Violation events
            violate_id = ''
            if monitor_type == 0 or monitor_type == 2:
                vioation_knowledge_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
                for vioation_knowledge_info in vioation_knowledge_infos:
                    _id = vioation_knowledge_info[0]
                    keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                    keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                    keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                    if base_parser.is_violate(content, key1=keyword1, key2=keyword2, key3=keyword3):
                        violate_id = _id
                        break

            # Download the video
            is_mp4 = tools.is_file(video_url, 'mp4')
            if is_mp4:
                local_video_path = FILE_LOCAL_PATH + 'videos/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + '/' + tools.get_current_date(
                        date_format='%Y%m%d%H%M%S.%f') + '.mp4'
                is_download = tools.download_file(video_url, local_video_path)
                video_url = local_video_path if is_download else ''
            else:
                video_url = ''

            log.debug('''
                original url:           %s
                weibo id:               %s
                release time:           %s
                source:                 %s
                content:                %s
                image url:              %s
                video url:              %s
                repost count:           %s
                like count:             %s
                violation id:           %s
                sensitive event id:     %s
                image recognition urls: %s
                ''' % (url, weibo_id, release_time, come_from, content, image_url,
                       video_url, transpond_count, praise_count, violate_id,
                       sensitive_id, sexy_image_url))

            if content:
                base_parser.add_wwa_weibo_info_info(
                    'WWA_weibo_info_info', SITE_ID, url, weibo_id, release_time,
                    come_from, content, image_url, video_url, transpond_count,
                    praise_count, violate_id, sensitive_id=sensitive_id,
                    sexy_image_url=sexy_image_url)
                tools.delay_time()

    base_parser.update_url('WWA_weibo_info_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    column_id = remark

    while True:
        try:
            json = tools.get_json_by_requests(root_url, headers=HEADERS, data=data, params=PARAMS)
            newslist = tools.get_json_value(json, 'newslist')
            if not newslist:
                break

            # Advance the module-level paging state
            data['cachedCount'] += len(newslist)
            data['page'] += 1

            for news in newslist:
                title = tools.get_json_value(news, 'title')
                release_time = tools.get_json_value(news, 'time')
                abstract = tools.get_json_value(news, 'abstract')
                original_url = tools.get_json_value(news, 'url')
                img_url = tools.get_json_value(news, 'thumbnails_qqnews')[0] if tools.get_json_value(news, 'thumbnails_qqnews') else ''
                video_frame_url = tools.get_json_value(news, 'video_channel.video.playurl')

                # Fetch the article content
                html = tools.get_html_by_urllib(original_url)
                content = tools.get_tag(html, name='div', attrs={'class': "main"}, find_all=False)
                content = tools.del_html_tag(str(content))

                # Resolve the real video address
                video_url = ''
                if video_frame_url:
                    video_vid = tools.get_info(html, r'vid\s*=\s*"\s*([^"]+)"', fetch_one=True)
                    video_url = ''.join(qq.qq_download_by_vid(video_vid))

                # Check for violations
                # Sensitive events
                sensitive_id = ''
                sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[3].split(' ') if sensitive_event_info[3] else []
                    keyword2 = sensitive_event_info[4].split(' ') if sensitive_event_info[4] else []
                    keyword3 = sensitive_event_info[5].split(' ') if sensitive_event_info[5] else []
                    if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                        sensitive_id = _id

                # Violation events
                violate_id = ''
                vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
                for vioation_knowledge_info in vioation_knowledge_infos:
                    _id = vioation_knowledge_info[0]
                    keyword1 = vioation_knowledge_info[2].split(' ') if vioation_knowledge_info[2] else []
                    keyword2 = vioation_knowledge_info[3].split(' ') if vioation_knowledge_info[3] else []
                    keyword3 = vioation_knowledge_info[4].split(' ') if vioation_knowledge_info[4] else []
                    if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                        violate_id = _id

                log.debug('''
                    title:        %s
                    abstract:     %s
                    img_url:      %s
                    original_url: %s
                    release_time: %s
                    video_url:    %s
                    content:      %s
                    column_id:    %d
                    sensitive_id: %s
                    violate_id:   %s
                    ''' % (title, abstract, img_url, original_url, release_time,
                           video_url, content, column_id, sensitive_id, violate_id))

                # Download
                base_path = FILE_LOCAL_PATH
                is_download = 0

                # Download the image
                img_name = ''
                if img_url:
                    img_name = 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + '/' + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
                    is_download = tools.download_file(img_url, base_path, img_name)
                    if not is_download:
                        img_name = ''

                # Download the video
                video_name = ''
                if video_url:
                    video_name = 'videos/' + tools.get_current_date(date_format='%Y-%m-%d') + '/' + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.mp4'
                    is_download = tools.download_file(video_url, base_path, video_name)
                    if not is_download:
                        video_name = ''

                if original_url:
                    base_parser.add_va_app_content_info(
                        'VAApp_content_info', SITE_ID, title, abstract, img_url,
                        img_name, original_url, release_time, video_url, video_name,
                        content, column_id, is_download, sensitive_id, violate_id,
                        STORAGE_ID)

        except Exception as e:
            log.debug(e)
            break  # bail out rather than retrying the same page forever

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)
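# The loop above reads and mutates a module-level paging dict `data`
# (data['cachedCount'] += len(newslist); data['page'] += 1) and posts it as the
# request body. A minimal sketch of its assumed shape -- only these two keys
# are visible in the source, any further fields are unknown:
# data = {'page': 0, 'cachedCount': 0}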
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Cookie": "wuid=AAGPF/32GQAAAAqLFD2BdAAAGwY=; CXID=A468F618D67D4868DC83E6061B1B3CCC; ABTEST=0|1500285612|v1; weixinIndexVisited=1; SUV=006317867B7CC4C5596C8AAD6B089707; SUIR=0A14ACB4D0CA9B50A8ABB33CD0CA69FA; ld=ekllllllll2BbH49lllllVOm1tylllll1kecBlllll9lllll9Zlll5@@@@@@@@@@; ad=AZllllllll2Bzw7GlllllVOeQA6lllll1kectkllll9lllllVqxlw@@@@@@@@@@@; SUID=72780CD23D148B0A59688B0C0002AD65; IPLOC=CN1100; sct=11; SNUID=B4B50E097177247B9A6BE55E72153425; JSESSIONID=aaaVCfkabuJQTfaNW5f1v",
        "Host": "weixin.sogou.com"
    }

    # Parse
    html, request = tools.get_html_by_requests(root_url, headers=headers)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('fetching official-account list ' + check_info)

    # Official-account info blocks
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_blocks = tools.get_info(html, regex)
    if not account_blocks:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    # Article-count URL
    regex = '<script>var account_anti_url = "(.*?)";</script>'
    articles_count_url = tools.get_info(html, regex, fetch_one=True)
    articles_count_url = tools.get_full_url('http://weixin.sogou.com', articles_count_url)
    articles_count_json = tools.get_json_by_requests(articles_count_url).get('msg', {})

    for account_block in account_blocks:
        regex = '<a.*?account_name.*?>(.*?)</a>'
        name = tools.get_info(account_block, regex, fetch_one=True)
        name = tools.del_html_tag(name)

        is_have = mongodb.find('WWA_wechat_official_accounts', {'name': name})
        if is_have:
            log.debug(name + ' already exists')
            continue

        regex = '<div class="img-box">.*?<img src="(.*?)"'
        image_url = tools.get_info(account_block, regex, fetch_one=True)

        # Download the image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + '/' + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''

        regex = '<p class="tit">.*?(<i></i>).*?<p class="info">'
        is_verified = 102 if tools.get_info(account_block, regex, fetch_one=True) else 101

        regex = '<label name="em_weixinhao">(.*?)</label>'
        account_id = tools.get_info(account_block, regex, fetch_one=True)

        regex = '<li id="sogou_vr_.*?d="(.*?)">'
        article_count_key = tools.get_info(account_block, regex, fetch_one=True)
        article_count = articles_count_json.get(article_count_key, '')
        article_count = article_count[:article_count.find(',')]

        regex = '<dt>功能介绍.*?<dd>(.*?)</dd>'
        summary = tools.get_info(account_block, regex, fetch_one=True)
        summary = tools.del_html_tag(summary)

        regex = "认证.*?<dd>(.*?)</dd>"
        certification = tools.get_info(account_block, regex, fetch_one=True)

        regex = '微信扫一扫关注.*?<img.*?src="(.*?)"'
        barcode_url = tools.get_info(account_block, regex, fetch_one=True)
        barcode_url = barcode_url.replace('&amp;', '&')

        # Download the barcode image
        local_barcode_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + '/' + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(barcode_url, local_barcode_url)
        local_barcode_url = local_barcode_url if is_download else ''

        regex = '<a.*?account_name.*?href="(.*?)">'
        account_url = tools.get_info(account_block, regex, fetch_one=True)
        account_url = account_url.replace('&amp;', '&')

        log.debug('''
            account name         %s
            account id           %s
            account url          %s
            image url            %s
            local image          %s
            article count        %s
            summary              %s
            wechat certification %s
            verified (has V)     %s
            barcode url          %s
            local barcode        %s
            ''' % (name, account_id, account_url, image_url, local_image_url,
                   article_count, summary, certification, is_verified,
                   barcode_url, local_barcode_url))

        base_parser.add_wechat_account_info(
            'WWA_wechat_official_accounts', site_id, name, account_id, account_url,
            image_url, local_image_url, article_count, summary, certification,
            is_verified, barcode_url, local_barcode_url)

    base_parser.update_url('WWA_wechat_account_url', root_url, Constance.DONE)
    tools.delay_time()
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
        "Host": "weixin.sogou.com",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # Get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # Parse
    html, request = tools.get_html_by_requests(root_url, headers=headers, proxies=proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    print(root_url)
    log.debug('fetching article links ' + check_info)
    if check_info:  # a captcha page was returned; retry later
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # Official-account info block
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one=True)

    # url
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one=True)
    account_url = account_url.replace('&amp;', '&')
    log.debug('account_url = ' + account_url)

    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # Proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # using a proxy triggers captchas, so it is disabled for now

    html, request = tools.get_html_by_requests(account_url, headers=headers, proxies=proxies)

    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('''
        fetching article details %s
        url                      %s
        request.headers          %s
        ''' % (check_info, account_url, request.headers))

    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one=True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', {})
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title': title})
        if is_have:
            log.debug(title + ' already exists')
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # Download the cover image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + '/' + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;', '&')

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one=True)

        # Download the images inside the content, then replace the original
        # addresses with the local paths
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            # Take the extension from the wx_fmt query parameter, defaulting to jpg
            if 'wx_fmt=' in image:
                ext_start = image.find('wx_fmt=') + len('wx_fmt=')
                ext_end = image.find('&', ext_start)
                ext = image[ext_start:ext_end if ext_end != -1 else None]
            else:
                ext = 'jpg'
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + '/' + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.' + ext
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
            tools.delay_time(5)

        # Sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

        # Violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            title                  %s
            summary                %s
            image url              %s
            article url            %s
            release time           %s
            content                %s
            local image path       %s
            violation status       %s
            sensitive event id     %s
            image recognition urls %s
            ''' % (title, summary, image_url, article_url, release_time, content,
                   local_image_url, violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info(
            'WWA_wechat_article', site_id, official_accounts_id, title, summary,
            image_url, article_url, release_time, content, video_url='',
            local_image_url=local_image_url, violate_status=violate_id,
            sensitive_id=sensitive_id, sexy_image_url=sexy_image_url)

        # Articles published on the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for sub_article in oneday_article_list:  # renamed to avoid shadowing the outer loop variable
            title = tools.get_json_value(sub_article, 'title')
            summary = tools.get_json_value(sub_article, 'digest')
            image_url = tools.get_json_value(sub_article, 'cover')

            sexy_image_url = []

            # Download the cover image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + '/' + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(sub_article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;', '&')

            content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one=True)

            # Download the images inside the content, then replace the original
            # addresses with the local paths
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                if 'wx_fmt=' in image:
                    ext_start = image.find('wx_fmt=') + len('wx_fmt=')
                    ext_end = image.find('&', ext_start)
                    ext = image[ext_start:ext_end if ext_end != -1 else None]
                else:
                    ext = 'jpg'
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + '/' + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.' + ext
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
                tools.delay_time(5)

            # Sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

            # Violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []
                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
                title                  %s
                summary                %s
                image url              %s
                article url            %s
                release time           %s
                content                %s
                local image path       %s
                violation status       %s
                sensitive event id     %s
                image recognition urls %s
                ''' % (title, summary, image_url, article_url, release_time, content,
                       local_image_url, violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info(
                'WWA_wechat_article', site_id, official_accounts_id, title, summary,
                image_url, article_url, release_time, content, video_url='',
                local_image_url=local_image_url, violate_status=violate_id,
                sensitive_id=sensitive_id, sexy_image_url=sexy_image_url)

    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
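# The wx_fmt extension parsing above can also be done with the standard
# library; a minimal standalone sketch (not part of the original code):
from urllib.parse import urlparse, parse_qs

def image_ext_from_wx_url(url, default='jpg'):
    # Return the wx_fmt query value of an image URL, or the default
    values = parse_qs(urlparse(url).query).get('wx_fmt')
    return values[0] if values else default

# image_ext_from_wx_url('http://mmbiz.qpic.cn/pic?wx_fmt=png')  # -> 'png'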
def download_code(self):
    tools.download_file(self._remote_zip_url, self._zip_path)
    tools.unpack_zip(self._zip_path, self._unpack_path)
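# Hedged usage sketch: assumes the enclosing class's __init__ sets
# _remote_zip_url, _zip_path and _unpack_path; the class name below is
# hypothetical, only the three attributes appear in the source.
# updater = CodeUpdater()
# updater.download_code()  # downloads the zip, then unpacks it to _unpack_path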
def save_info(table, site_id, site_name='', url='', title='', content='',
              release_time='', image_url='', video_url='', is_out_link=1,
              download_image=False, is_debug=False, es_read_status='', info_type=''):
    # Image porn recognition is currently disabled; the original throttled
    # recognition to the first 2000 images and fell back to status 5:
    # global num
    # if num < 2000:
    #     num += 1
    #     image_recogs = image_recog(image_url)
    # else:
    #     image_recogs = 5
    image_recogs = 5  # fallback status; without this the dict below raises NameError

    if not download_image:
        sexy_image_url = image_url
        local_image_path = ''
    else:
        file_local_path = tools.get_conf_value('config.conf', 'files', 'zhejiang_app_save_path')
        if image_url:
            img_name = 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + '/' + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            tools.download_file(image_url, file_local_path, img_name)
            local_image_path = file_local_path + img_name
            sexy_image_url = local_image_path
        else:
            local_image_path = ''
            sexy_image_url = ''

    temporary_content = content[0:400] if len(content) > 400 else content

    try:
        release_time = tools.format_date(release_time)
    except Exception as e:
        log.debug(e, release_time, url)

    record_time = tools.get_current_date()
    # Skip records whose release time lies in the future
    if release_time > record_time:
        return

    content_info = {
        'site_name': site_name,
        'video_url': video_url,
        'image_url': image_url,
        'temporary_content': temporary_content,
        'title': title,
        'img_stor_path': local_image_path,
        'release_time': release_time,
        'is_out_link': is_out_link,
        'url': url,
        'es_read_status': 0,
        'site_id': site_id,
        'read_status': 0,
        'record_time': record_time,
        # 'sexy_image_url': sexy_image_url,
        'sexy_image_status': '',
        'image_pron_status': image_recogs
    }

    # Store the full content instead of the 400-character preview
    content_info.pop('temporary_content')
    content_info['content'] = content

    if db.add(table, content_info):
        log.debug(content_info)
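# Hedged usage sketch for save_info: the table name and argument values below
# are hypothetical placeholders, not taken from the source.
# save_info('zhejiang_app_content_info', site_id=1,
#           site_name='example-site', url='http://example.com/news/1',
#           title='some title', content='some content',
#           release_time='2017-09-29 12:00:00',
#           image_url='http://example.com/1.jpg', download_image=False)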