def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
    ''' % str(parser_params))

    _db = base_parser.MongoDB()
    _db.set_unique_key('PROGRAM_EPISODE_info', 'episode_url')
    _db.update('PROGRAM_urls', {'depth': 0, 'site_id': SITE_ID}, {'status': 0}, multi=True)

    for page_num in range(1, 14):
        urls = [
            'http://list.youku.com/category/show/c_85_g_热门网综_s_1_d_1_p_%d.html' % page_num,
            'http://list.youku.com/category/show/c_97_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
            'http://list.youku.com/category/show/c_96_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
        ]
        for url in urls:
            print(url)
            print('********************************************************')
            html = tools.get_html_by_urllib(url)
            if tools.get_info(html, ['小酷没有筛选到相关视频']):
                continue

            links = tools.get_tag(html, 'div', {'class': 'p-thumb'})
            for link in links:
                link_html = ''  # pre-bind so the except branch below cannot hit an unbound name
                try:
                    link = link.a['href']
                    link = tools.get_full_url('http:', link)
                    link_html = tools.get_html_by_urllib(link)
                    link = tools.get_tag(link_html, 'a', {'class': 'desc-link'}, find_all=False)
                    link = link['href']
                    link = tools.get_full_url('http:', link)
                    base_parser.add_url('PROGRAM_urls', SITE_ID, link, depth=0)
                except Exception as e:
                    log.error(e)
                    print(link_html)
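
# A minimal sketch (an assumption, not the project's real tools module) of the
# URL joining add_root_url relies on: Youku list pages use scheme-relative
# links such as //v.youku.com/..., so resolving them against 'http:' yields a
# complete URL. The function name here is hypothetical.
from urllib.parse import urljoin

def get_full_url_sketch(base, url):
    """E.g. get_full_url_sketch('http:', '//v.youku.com/x') -> 'http://v.youku.com/x'."""
    return urljoin(base, url)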

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset', 0)  # default to 0 so the pagination step below cannot add 50 to None

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return

    for header in headers:
        # "view more related news" link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one=True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth=1, remark={'offset': 0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain = ''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
            ''' % (uuid, title, author, release_time, website_domain, url, '...'))

            # store to DB
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time,
                                                                website_name, website_domain,
                                                                website_position, url, content)
                if not is_continue:
                    break
    else:
        # loop finished without break: the whole page was stored, so queue the next page
        offset += 50
        url = tools.replace_str(root_url, r'pn=\d*', 'pn=%d' % offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth=0, remark={'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
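
# Hedged sketch of the pagination step above: tools.replace_str is assumed to
# behave like re.sub, rewriting the pn= query parameter so the next request
# fetches the following page of Baidu News results. The name is illustrative.
import re

def next_page_url_sketch(url, offset):
    # e.g. '...&pn=0' with offset 50 -> '...&pn=50'
    return re.sub(r'pn=\d*', 'pn=%d' % offset, url)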

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)

    title = '<tr height="25"><td><a href=".*?" title="(.*?)"'
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?) ']
    file_size = ['资料大小: </span>(.*?) ']
    download_count = ['下载次数: </span>(.*?) ']

    titles = tools.get_info(html, title, allow_repeat=True)
    video_urls = tools.get_info(html, video_url, allow_repeat=True)
    authors = tools.get_info(html, author, allow_repeat=True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat=True)
    file_sizes = tools.get_info(html, file_size, allow_repeat=True)
    download_counts = tools.get_info(html, download_count, allow_repeat=True)

    for i in range(len(titles)):
        title = titles[i]
        title = tools.del_html_tag(title)
        video_url = video_urls[i]
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_url)
        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            标题:     %s
            视频地址: %s
            作者:     %s
            观看数    %s
            资料大小  %s
            下载次数  %s
        ''' % (title, video_url, author, watched_count, file_size, download_count))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title,
                                     file_size=file_size, file_name=title, author=author,
                                     watched_count=watched_count, download_count=download_count,
                                     search_type=search_type, keyword=contained_key,
                                     keyword_count=contained_key_count, task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
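
# The loop above walks six parallel regex-result lists by index; if any one
# pattern misses a row, the lists drift out of alignment and indexing can raise
# IndexError. A defensive variant (a sketch, not the original code):
def iter_rows_sketch(titles, video_urls, authors, watched_counts, file_sizes, download_counts):
    # zip stops at the shortest list, pairing only rows that matched everywhere
    return zip(titles, video_urls, authors, watched_counts, file_sizes, download_counts)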

def add_html_url(html, depth, spider_depth, website_url, website_name, website_domain, remark):
    # collect next-depth URLs while still above the crawl depth limit
    if depth < spider_depth - 1:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url(SITE_ID, url, depth + 1, remark=remark)
            elif website_domain in url:
                base_parser.add_url(SITE_ID, url, depth + 1, remark=remark)
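
# Sketch of the link discovery add_html_url depends on: tools.get_urls is
# assumed to pull every href out of the page. Both the name and the regex are
# illustrative, not the project's actual implementation.
import re

def get_urls_sketch(html):
    return re.findall(r'''href=["'](.*?)["']''', html)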

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    json = tools.get_json_by_requests(root_url)

    # anchor (streamer) info
    lives = tools.get_json_value(json, 'lives')
    # print(tools.dumps_json(lives))
    for live in lives:
        name = tools.get_json_value(live, 'creator.nick')
        image_url = tools.get_json_value(live, 'creator.portrait')
        image_url = tools.get_full_url('http://img2.inke.cn', image_url)
        room_id = tools.get_json_value(live, 'creator.id')
        room_url = tools.get_json_value(live, 'share_addr')
        video_path = tools.get_json_value(live, 'stream_addr')
        watched_count = tools.get_json_value(live, 'online_users')
        address = tools.get_json_value(live, 'city')

        # follower count
        params = {
            'lc': '0000000000000048',
            'cc': 'TG0001',
            'cv': 'IK3.8.60_Iphone',
            'proto': 7,
            'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
            'idfv': '5779214D-BC8F-446E-A547-913048F7F935',
            'devi': '0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7',
            'osversion': 'ios_10.200000',
            'ua': 'iPhone9_2',
            'imei': '',
            'imsi': '',
            'uid': 207821358,
            'sid': '20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1',
            'conn': 'wifi',
            'mtid': '987c70ecbcd643998ea6bcd3b8868934',
            'mtxid': 'b0958e29253f',
            'logid': 133,
            'id': room_id,
            's_sg': S_SG,
            's_sc': 100,
            's_st': CURRENT_TIMESTAMP
        }
        fans_json = tools.get_json_by_requests(
            'http://120.55.238.158/api/user/relation/numrelations', params)
        fans_count = tools.get_json_value(fans_json, 'num_followers')

        # request address for the audience count
        params = {
            'lc': '0000000000000048',
            'cc': 'TG0001',
            'cv': 'IK3.8.60_Iphone',
            'proto': 7,
            'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
            'idfv': '5779214D-BC8F-446E-A547-913048F7F935',
            'devi': '0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7',
            'osversion': 'ios_10.200000',
            'ua': 'iPhone9_2',
            'imei': '',
            'imsi': '',
            'uid': 207821358,
            'sid': '20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1',
            'conn': 'wifi',
            'mtid': '987c70ecbcd643998ea6bcd3b8868934',
            'mtxid': 'b0958e29253f',
            'logid': 133,
            'id': tools.get_json_value(live, 'id'),
            'multiaddr': 1,
            's_sg': S_SG,
            's_sc': 100,
            's_st': CURRENT_TIMESTAMP
        }
        watched_count_url = 'http://120.55.238.158/api/live/infos'
        # sample of the final URL:
        # ?lc=0000000000000048&cc=TG0001&cv=IK3.8.60_Iphone&proto=7&idfa=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&idfv=5779214D-BC8F-446E-A547-913048F7F935&devi=0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7&osversion=ios_10.200000&ua=iPhone9_2&imei=&imsi=&uid=207821358&sid=20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1&conn=wifi&mtid=987c70ecbcd643998ea6bcd3b8868934&mtxid=b0958e29253f&logid=133&id=1487572239333810%2C1487572432485069%2C1487572763094071%2C1487573160678176%2C1487571635332280&multiaddr=1&s_sg=c3493ab9d9b2e19cfc20f98bb75ff72f&s_sc=100&s_st=1487573119
        watched_count_url = tools.joint_url(watched_count_url, params)
        live_info = tools.get_json_by_requests(watched_count_url)

        sex = live_info['lives'][0]['creator']['sex']
        sex = 0 if sex == '1' else 1  # DB convention: 0 = male, 1 = female; Inke uses 1 for male, 0 and 3 for female
        age = ''

        log.debug('''
            名字:       %s
            贴图:       %s
            主播id:     %s
            房间url:    %s
            视频流地址: %s
            观看数:     %s
            地址:       %s
            粉丝数:     %s
            性别:       %s
            年龄:       %s
            观众数url:  %s
        ''' % (name, image_url, room_id, room_url, video_path, watched_count,
               address, fans_count, sex, age, watched_count_url))

        base_parser.add_anchor_info('LiveApp_anchor_info', SITE_ID, name=name, image_url=image_url,
                                    room_id=room_id, room_url=room_url, video_path=video_path,
                                    watched_count=watched_count, address=address, fans_count=fans_count,
                                    sex=sex, age=age, watched_count_url=watched_count_url)

    base_parser.update_url('LiveApp_urls', root_url, Constance.DONE)
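
# Sketch of the URL assembly assumed above: tools.joint_url is taken to append
# the params dict to the endpoint as an encoded query string (the helper name
# is hypothetical).
from urllib.parse import urlencode

def joint_url_sketch(url, params):
    separator = '&' if '?' in url else '?'
    return url + separator + urlencode(params)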

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    website_name = remark['website_name']
    website_position = remark['website_position']
    website_url = remark['website_url']
    website_domain = remark['website_domain']

    html = tools.get_html(root_url)
    if not html:
        base_parser.update_url('news_urls', root_url, Constance.EXCEPTION)
        return

    # collect next-depth URLs while still above the crawl depth limit
    if depth < DEPTH:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url('news_urls', SITE_ID, url, depth + 1, remark=remark)
            elif website_domain in url:
                base_parser.add_url('news_urls', SITE_ID, url, depth + 1, remark=remark)

    # parse the page itself
    content = title = release_time = author = ''
    article_extractor = ArticleExtractor(root_url, html)
    content = article_extractor.get_content()
    if content:
        title = article_extractor.get_title()
        release_time = article_extractor.get_release_time()
        author = article_extractor.get_author()
        uuid = tools.get_uuid(title, website_domain) if title != website_name else tools.get_uuid(root_url, ' ')

        log.debug('''
            uuid         %s
            title        %s
            author       %s
            release_time %s
            website_name %s
            domain       %s
            position     %s
            url          %s
            content      %s
        ''' % (uuid, title, author, release_time, website_name, website_domain,
               website_position, root_url, content))

        if tools.is_have_chinese(content):
            # store to DB
            self_base_parser.add_news_acticle(uuid, title, author, release_time, website_name,
                                              website_domain, website_position, root_url, content)

    log.debug('%s 处理完成' % root_url)
    base_parser.update_url('news_urls', root_url, Constance.DONE)
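
# The dedup key above is assumed to be a deterministic UUID derived from the
# two strings, so re-crawling the same title on the same domain maps to the
# same record. A plausible sketch (an assumption; the real tools.get_uuid may differ):
import uuid as _uuid

def get_uuid_sketch(part1, part2):
    return str(_uuid.uuid5(_uuid.NAMESPACE_URL, part1 + part2))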

def parser_program_info(url_info):
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    classify = remark['classify']

    # fetch and parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    # title
    regex = '<h1 class="video_title_cn" >.*?>(.*?)<'
    program_name = tools.get_info(html, regex, fetch_one=True)

    # program URL
    program_url = root_url

    # poster image URL
    regex = '<img src="(.*?)".*?_stat="info:poster"/>'
    image_url = tools.get_info(html, regex, fetch_one=True)
    image_url = tools.get_full_url('http://', image_url)

    # episode count
    regex = ['更新期数:</span>.*?>(.*?)</span>', '总集数:</span>.*?>(.*?)</span>']
    episode = tools.get_info(html, regex, fetch_one=True)

    # directors
    regex = '<span class="director">导演</span>.*?info:actor_name">(.*?)<'
    directors = tools.get_info(html, regex, split='/')

    # actors (the match also captures directors, stripped below)
    regex = '_stat="info:actor_name">(.*?)</span>'
    actors = tools.get_info(html, regex, split='/')
    actors = actors.replace(directors + '/', '') if directors else actors

    # release time
    regex = ['首播时间:</span>.*?>(.*?)<', '出品时间:</span>.*?>(.*?)<']
    release_time = tools.get_info(html, regex, fetch_one=True)

    # summary
    regex = 'desc_txt">(.*?)</span>'
    summary = tools.get_info(html, regex, fetch_one=True)
    summary = tools.del_html_tag(summary)

    log.debug('''
        program_name %s
        program_url  %s
        image_url    %s
        episode      %s
        directors    %s
        actors       %s
        release_time %s
        summary      %s
    ''' % (program_name, program_url, image_url, episode, directors, actors, release_time, summary))

    program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, program_name, program_url,
                                                    image_url, episode, directors, actors=actors,
                                                    summary=summary, release_time=release_time)

    # resolve the per-episode listing URLs
    # variety shows: the episode list is served as JSON, keyed by year and month
    if classify == '综艺':
        # months
        regex = r'_stat="series:tab".*?>(\d*)月'
        months = tools.get_info(html, regex)
        # print(months)

        # years
        regex = '<div class="year_slt_list">.*?data-value="(.*?)"'
        years = tools.get_info(html, regex)
        # print(years)

        for year in years:
            for month in months:
                episode_url = 'http://s.video.qq.com/get_playsource?id=%s&plat=2&type=4&data_type=3&video_type=10&year=%s&month=%s&plname=qq&otype=json' % (program_id, year, month)
                log.debug('%s分集json地址:%s' % (program_name, episode_url))
                base_parser.add_url('PROGRAM_urls', site_id, episode_url, depth=2,
                                    remark={'program_mongo_id': program_mongo_id, 'classify': '综艺'})

    # TV series: the page carries (url, episode number) pairs directly
    elif classify == '电视剧':
        regex = 'series:numbtn".*?<a href="(.*?)".*?<span itemprop="episodeNumber">(.*?)</span>'
        episode_msgs = tools.get_info(html, regex)
        for episode_msg in episode_msgs:
            episode_url = episode_msg[0]
            episode_num = episode_msg[1]
            log.debug('''
                episode_url %s
                episode_num %s
            ''' % (episode_url, episode_num))
            base_parser.add_url('PROGRAM_urls', site_id, episode_url, depth=2,
                                remark={'program_mongo_id': program_mongo_id, 'episode_num': episode_num,
                                        'program_name': program_name, 'classify': '电视剧'})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
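
# parser_program_info hands tools.get_info a list of patterns (更新期数 vs
# 总集数); the assumed semantics, sketched here, are "try each pattern in order
# and return the first hit". An assumption only; the real helper may differ.
import re

def get_info_first_match_sketch(html, regexes):
    for regex in (regexes if isinstance(regexes, list) else [regexes]):
        match = re.search(regex, html, re.S)
        if match:
            return match.group(1)
    return ''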

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Cookie": "wuid=AAGPF/32GQAAAAqLFD2BdAAAGwY=; CXID=A468F618D67D4868DC83E6061B1B3CCC; ABTEST=0|1500285612|v1; weixinIndexVisited=1; SUV=006317867B7CC4C5596C8AAD6B089707; SUIR=0A14ACB4D0CA9B50A8ABB33CD0CA69FA; ld=ekllllllll2BbH49lllllVOm1tylllll1kecBlllll9lllll9Zlll5@@@@@@@@@@; ad=AZllllllll2Bzw7GlllllVOeQA6lllll1kectkllll9lllllVqxlw@@@@@@@@@@@; SUID=72780CD23D148B0A59688B0C0002AD65; IPLOC=CN1100; sct=11; SNUID=B4B50E097177247B9A6BE55E72153425; JSESSIONID=aaaVCfkabuJQTfaNW5f1v",
        "Host": "weixin.sogou.com"
    }

    # fetch and parse
    html, request = tools.get_html_by_requests(root_url, headers=headers)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('取公众号列表' + check_info)

    # blocks of official-account info
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_blocks = tools.get_info(html, regex)
    if not account_blocks:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    # URL that serves the per-account article counts
    regex = '<script>var account_anti_url = "(.*?)";</script>'
    articles_count_url = tools.get_info(html, regex, fetch_one=True)
    articles_count_url = tools.get_full_url('http://weixin.sogou.com', articles_count_url)
    articles_count_json = tools.get_json_by_requests(articles_count_url).get('msg', {})

    for account_block in account_blocks:
        # print(account_block)
        regex = '<a.*?account_name.*?>(.*?)</a>'
        name = tools.get_info(account_block, regex, fetch_one=True)
        name = tools.del_html_tag(name)
        is_have = mongodb.find('WWA_wechat_official_accounts', {'name': name})
        if is_have:
            log.debug(name + " 已存在")
            continue

        regex = '<div class="img-box">.*?<img src="(.*?)"'
        image_url = tools.get_info(account_block, regex, fetch_one=True)

        # download the avatar image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''

        regex = '<p class="tit">.*?(<i></i>).*?<p class="info">'
        is_verified = 102 if tools.get_info(account_block, regex, fetch_one=True) else 101

        regex = '<label name="em_weixinhao">(.*?)</label>'
        account_id = tools.get_info(account_block, regex, fetch_one=True)

        regex = '<li id="sogou_vr_.*?d="(.*?)">'
        article_count_key = tools.get_info(account_block, regex, fetch_one=True)
        article_count = articles_count_json.get(article_count_key, '')
        article_count = article_count[:article_count.find(',')]

        regex = '<dt>功能介绍.*?<dd>(.*?)</dd>'
        summary = tools.get_info(account_block, regex, fetch_one=True)
        summary = tools.del_html_tag(summary)

        regex = "认证.*?<dd>(.*?)</dd>"
        certification = tools.get_info(account_block, regex, fetch_one=True)

        regex = '微信扫一扫关注.*?<img.*?src="(.*?)"'
        barcode_url = tools.get_info(account_block, regex, fetch_one=True)
        barcode_url = barcode_url.replace('&amp;', '&')

        # download the QR-code image
        local_barcode_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(barcode_url, local_barcode_url)
        local_barcode_url = local_barcode_url if is_download else ''

        regex = '<a.*?account_name.*?href="(.*?)">'
        account_url = tools.get_info(account_block, regex, fetch_one=True)
        account_url = account_url.replace('&amp;', '&')

        log.debug('''
            公众号名称        %s
            公众号账号        %s
            账号url           %s
            贴图              %s
            本地贴图          %s
            文章数量          %s
            简介              %s
            微信认证          %s
            是否加V(是否认证) %s
            二维码            %s
            本地二维码        %s
        ''' % (name, account_id, account_url, image_url, local_image_url, article_count,
               summary, certification, is_verified, barcode_url, local_barcode_url))

        base_parser.add_wechat_account_info('WWA_wechat_official_accounts', site_id, name, account_id,
                                            account_url, image_url, local_image_url, article_count,
                                            summary, certification, is_verified, barcode_url,
                                            local_barcode_url)

    base_parser.update_url('WWA_wechat_account_url', root_url, Constance.DONE)
    tools.delay_time()
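
# The dated image path above is built the same way at several call sites; a
# shared helper (a sketch, names assumed) would keep the pattern in one place:
from datetime import datetime

def dated_image_path_sketch(root, ext='jpg'):
    now = datetime.now()
    # e.g. <root>images/2017-09-29/20170929153012.123456.jpg
    return '%simages/%s/%s.%s' % (root, now.strftime('%Y-%m-%d'), now.strftime('%Y%m%d%H%M%S.%f'), ext)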

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
        "Host": "weixin.sogou.com",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # fetch and parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers=headers, proxies=proxies)
    # print(html)
    html, request = tools.get_html_by_requests(root_url, headers=headers, proxies=proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    print(root_url)
    log.debug('取文章链接' + check_info)
    if check_info:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # block of official-account info
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one=True)

    # url
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one=True)
    account_url = account_url.replace('&amp;', '&')
    log.debug('account_url = ' + account_url)
    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # going through the proxy triggers a captcha; disabled for now

    html, request = tools.get_html_by_requests(account_url, headers=headers, proxies=proxies)
    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('''
        取文章详细内容 %s
        url %s
        request.headers %s
    ''' % (check_info, account_url, request.headers))

    # print(html)
    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one=True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', {})
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title': title})
        if is_have:
            log.debug(title + " 已存在")
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # download the cover image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;', '&')

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one=True)

        # download the images inside the content, then point the content at the local copies
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.' + (
                image[image.find('wx_fmt=') + len('wx_fmt='):
                      (image.find('&', image.find('wx_fmt=') + len('wx_fmt='))
                       if image.find('&', image.find('wx_fmt=') + len('wx_fmt=')) != -1
                       else None)]
                if 'wx_fmt=' in image else 'jpg')
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
        tools.delay_time(5)

        # sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

        # violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            标题         %s
            简介         %s
            图片地址     %s
            文章地址     %s
            发布时间     %s
            内容         %s
            本地贴图地址 %s
            违规状态     %s
            敏感事件     %s
            图片鉴别地址 %s
        ''' % (title, summary, image_url, article_url, release_time, content, local_image_url,
               violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title,
                                            summary, image_url, article_url, release_time, content,
                                            video_url='', local_image_url=local_image_url,
                                            violate_status=violate_id, sensitive_id=sensitive_id,
                                            sexy_image_url=sexy_image_url)

        # articles published on the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for article in oneday_article_list:
            title = tools.get_json_value(article, 'title')
            summary = tools.get_json_value(article, 'digest')
            image_url = tools.get_json_value(article, 'cover')

            sexy_image_url = []

            # download the cover image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;', '&')

            # release_time is inherited from the parent article: items in
            # multi_app_msg_item_list share its publish day
            content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one=True)

            # download the images inside the content, then point the content at the local copies
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.' + (
                    image[image.find('wx_fmt=') + len('wx_fmt='):
                          (image.find('&', image.find('wx_fmt=') + len('wx_fmt='))
                           if image.find('&', image.find('wx_fmt=') + len('wx_fmt=')) != -1
                           else None)]
                    if 'wx_fmt=' in image else 'jpg')
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
            tools.delay_time(5)

            # sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

            # violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []
                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
                标题         %s
                简介         %s
                图片地址     %s
                文章地址     %s
                发布时间     %s
                内容         %s
                本地贴图地址 %s
                违规状态     %s
                敏感事件     %s
                图片鉴别地址 %s
            ''' % (title, summary, image_url, article_url, release_time, content, local_image_url,
                   violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title,
                                                summary, image_url, article_url, release_time, content,
                                                video_url='', local_image_url=local_image_url,
                                                violate_status=violate_id, sensitive_id=sensitive_id,
                                                sexy_image_url=sexy_image_url)
    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
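
# The inline wx_fmt slicing above is hard to audit; an equivalent sketch using
# the standard library (intended to match the original expression's behavior):
from urllib.parse import urlparse, parse_qs

def image_ext_sketch(image_url, default='jpg'):
    # WeChat image URLs carry the format in the wx_fmt query parameter,
    # e.g. ...?wx_fmt=png -> 'png'; fall back to jpg when absent.
    query = parse_qs(urlparse(image_url).query)
    return query.get('wx_fmt', [default])[0]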