# NOTE: the parsers below come from a crawler project and assume its own
# modules (tools, base_parser, self_base_parser, you_get, Constance, log,
# db / mongodb / oracledb) plus module-level constants (SITE_ID, STORAGE_ID,
# VIDEO, S_SG, CURRENT_TIMESTAMP, HEADER, SEARCH_TYPE, FILE_LOCAL_PATH,
# PARSE_VIDEO_URL_JSFUNC, layer) defined elsewhere in the repository.
import random
import time


def inner_add_url(url, keyword):
    # Follow the Tencent MyApp search API pagination: while the response
    # reports another page, build the next page URL and queue it.
    while url:
        html_json = tools.get_json_by_requests(url)
        json_value = tools.get_json_value(html_json, 'obj.pageNumberStack')
        has_next = tools.get_json_value(html_json, 'obj.hasNext')
        if has_next:
            url = ('http://sj.qq.com/myapp/searchAjax.htm?kw=%s&pns=' % keyword
                   + json_value + '&sid=0')
            base_parser.add_url('WWA_search_app_urls', SITE_ID, url)
        else:
            break
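# A minimal, self-contained sketch of the same pagination pattern, using only
# the standard library (the real code above goes through tools/base_parser;
# the fetch and field names mirror inner_add_url):
import json as _json
from urllib.request import urlopen

def iter_search_pages(first_url, keyword):
    """Yield successive myapp search page URLs until hasNext is falsy."""
    url = first_url
    while url:
        with urlopen(url) as resp:
            obj = _json.load(resp).get('obj', {})
        if not obj.get('hasNext'):
            break
        url = ('http://sj.qq.com/myapp/searchAjax.htm?kw=%s&pns=%s&sid=0'
               % (keyword, obj.get('pageNumberStack', '')))
        yield url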
def getdownload(episode_download_url_json):
    # Resolve a Letv playJson URL to a direct download location: read the
    # dispatch domain and 1080p path from the playJson response, append the
    # fixed PC-player query parameters, then follow the signed dispatch URL
    # and return its 'location' field.
    episode_json = tools.get_json_by_requests(episode_download_url_json)
    episode_download_url = tools.get_json_value(episode_json, 'msgs.playurl.domain')
    episode_download_url = episode_download_url and episode_download_url[0] or ''
    episode_download_url_definition = tools.get_json_value(
        episode_json, 'msgs.playurl.dispatch.1080p')
    episode_download_url_definition = (episode_download_url_definition and
                                       episode_download_url_definition[0] or '')
    episode_download_url = episode_download_url + episode_download_url_definition
    episode_download_url += ("&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux"
                             "&tag=letv&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051"
                             "&rateid={}").format(random.random(), '1080p')
    episode_download_url_json = tools.get_json_by_requests(episode_download_url)
    episode_download_url = tools.get_json_value(episode_download_url_json, 'location')
    return episode_download_url
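# For reference, the URL assembly getdownload performs, isolated as a pure
# function (a sketch; the parameter names come from the query string above,
# and the request plumbing is omitted):
def build_letv_dispatch_url(domain, dispatch_path, rateid='1080p'):
    """Join the playurl domain with a dispatch path plus the fixed PC params."""
    return (domain + dispatch_path
            + ("&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux"
               "&tag=letv&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051&rateid={}"
               ).format(random.random(), rateid))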
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    column_id = remark

    headers = {
        'Host': 'is.snssdk.com',
        'Accept': ' */*',
        'X-SS-Cookie': '_ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'tt-request-time': '1489990271848',
        'Cookie': ' _ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'User-Agent': 'News/6.0.1 (iPhone; iOS 10.2.1; Scale/3.00)',
        'Accept-Language': ' zh-Hans-CN;q=1, en-CN;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': ' keep-alive'
    }

    json = tools.get_json_by_requests(root_url)
    if not json:
        base_parser.update_url('VAApp_urls', root_url, Constance.EXCEPTION)
        return

    datas = json['data']
    for data in datas:
        data = tools.get_json_value(data, 'content')
        title = tools.get_json_value(data, 'title')

        # Skip records that already exist in the database
        if db.find('VAApp_content_info', {'title': title}):
            continue

        abstract = tools.get_json_value(data, 'abstract')
        abstract = abstract or tools.get_json_value(data, 'content')

        img_url = tools.get_json_value(data, 'image_list.url')
        img_url = img_url or tools.get_json_value(data, 'middle_image.url')
        img_url = img_url or tools.get_json_value(data, 'large_image_list.url')
        img_url = img_url and img_url.replace('.webp', '.jpg') or img_url

        original_url = tools.get_json_value(data, 'article_url')
        original_url = original_url or tools.get_json_value(data, 'share_url')

        release_time = tools.get_json_value(data, 'publish_time')
        # fallback key appears to be a leftover timestamp literal
        release_time = release_time or tools.get_json_value(data, '1481012423')
        release_time = release_time and tools.timestamp_to_date(release_time) or release_time

        # Video info still needs processing
        video_msg = tools.get_json_value(data, 'video_play_info')
        video_main_url = tools.get_json_value(video_msg, 'video_list.video_2.main_url')
        video_main_url = video_main_url or tools.get_json_value(
            video_msg, 'video_list.video_1.main_url')
        parse_video_url = tools.compile_js(PARSE_VIDEO_URL_JSFUNC)
        video_url = parse_video_url('base64decode', video_main_url)

        html = tools.get_html_auto_deal_code(original_url)
        regexs = [
            'class="article-content">(.*?)<div class="article-actions">',
            '<div class="content">(.*?)<div class="suggestion-list-con"',
            '<!-- 文章内容 -->(.*?)<!-- @end 文章内容 -->',
            'class="yi-content-text">(.*?)<div class="yi-normal"',
            '<p.*?>(.*?)</p>'
        ]
        if video_url:
            content = abstract
        else:
            content = ''.join(tools.get_info(html, regexs))
            content = tools.del_html_tag(content)
            if len(content) < len(abstract):
                content = abstract

        # Sensitive events
        sensitive_id = ''
        sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
        for sensitive_event_info in sensitive_event_infos:
            _id = sensitive_event_info[0]
            keyword1 = sensitive_event_info[3].split(' ') if sensitive_event_info[3] else []
            keyword2 = sensitive_event_info[4].split(' ') if sensitive_event_info[4] else []
            keyword3 = sensitive_event_info[5].split(' ') if sensitive_event_info[5] else []
            if base_parser.is_violate(title + content, key1=keyword1,
                                      key2=keyword2, key3=keyword3):
                sensitive_id = _id

        # Violation events
        violate_id = ''
        vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
        for vioation_knowledge_info in vioation_knowledge_infos:
            _id = vioation_knowledge_info[0]
            keyword1 = vioation_knowledge_info[2].split(' ') if vioation_knowledge_info[2] else []
            keyword2 = vioation_knowledge_info[3].split(' ') if vioation_knowledge_info[3] else []
            keyword3 = vioation_knowledge_info[4].split(' ') if vioation_knowledge_info[4] else []
            if base_parser.is_violate(title + content, key1=keyword1,
                                      key2=keyword2, key3=keyword3):
                violate_id = _id

        log.debug('''
            title:          %s
            abstract:       %s
            img_url:        %s
            original_url:   %s
            release_time:   %s
            video_main_url: %s
            video_url:      %s
            content:        %s
            column_id:      %d
            sensitive_id:   %d
            violate_id:     %d
            ''' % (title, abstract, img_url, original_url, release_time,
                   video_main_url, video_url, content, column_id,
                   sensitive_id or 0, violate_id or 0))

        # Items in the video column are only kept when they carry sensitive
        # or violating content; otherwise skip the download
        if column_id == VIDEO:
            if not sensitive_id and not violate_id:
                continue

        # Downloads
        base_path = FILE_LOCAL_PATH
        is_download = 0

        # Download the image
        img_name = ''
        if img_url:
            img_name = ('images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/"
                        + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg')
            is_download = tools.download_file(img_url, base_path, img_name)
            if not is_download:
                img_name = ''

        # Download the video
        video_name = ''
        if video_url:
            video_name = ('videos/' + tools.get_current_date(date_format='%Y-%m-%d') + "/"
                          + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.mp4')
            is_download = tools.download_file(video_url, base_path, video_name)
            if not is_download:
                video_name = ''

        if original_url:
            base_parser.add_va_app_content_info(
                'VAApp_content_info', SITE_ID, title, abstract, img_url, img_name,
                original_url, release_time, video_url, video_name, content,
                column_id, is_download, sensitive_id, violate_id, STORAGE_ID)

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    json = tools.get_json_by_requests(root_url)

    # Anchor (streamer) info
    lives = tools.get_json_value(json, 'lives')
    for live in lives:
        name = tools.get_json_value(live, 'creator.nick')
        image_url = tools.get_json_value(live, 'creator.portrait')
        image_url = tools.get_full_url('http://img2.inke.cn', image_url)
        room_id = tools.get_json_value(live, 'creator.id')
        room_url = tools.get_json_value(live, 'share_addr')
        video_path = tools.get_json_value(live, 'stream_addr')
        watched_count = tools.get_json_value(live, 'online_users')
        address = tools.get_json_value(live, 'city')

        # Fetch the fan count
        params = {
            'lc': '0000000000000048',
            'cc': 'TG0001',
            'cv': 'IK3.8.60_Iphone',
            'proto': 7,
            'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
            'idfv': '5779214D-BC8F-446E-A547-913048F7F935',
            'devi': '0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7',
            'osversion': 'ios_10.200000',
            'ua': 'iPhone9_2',
            'imei': '',
            'imsi': '',
            'uid': 207821358,
            'sid': '20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1',
            'conn': 'wifi',
            'mtid': '987c70ecbcd643998ea6bcd3b8868934',
            'mtxid': 'b0958e29253f',
            'logid': 133,
            'id': room_id,
            's_sg': S_SG,
            's_sc': 100,
            's_st': CURRENT_TIMESTAMP
        }
        fans_json = tools.get_json_by_requests(
            'http://120.55.238.158/api/user/relation/numrelations', params)
        fans_count = tools.get_json_value(fans_json, 'num_followers')

        # Request address for the anchor's viewer count
        params = {
            'lc': '0000000000000048',
            'cc': 'TG0001',
            'cv': 'IK3.8.60_Iphone',
            'proto': 7,
            'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
            'idfv': '5779214D-BC8F-446E-A547-913048F7F935',
            'devi': '0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7',
            'osversion': 'ios_10.200000',
            'ua': 'iPhone9_2',
            'imei': '',
            'imsi': '',
            'uid': 207821358,
            'sid': '20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1',
            'conn': 'wifi',
            'mtid': '987c70ecbcd643998ea6bcd3b8868934',
            'mtxid': 'b0958e29253f',
            'logid': 133,
            'id': tools.get_json_value(live, 'id'),
            'multiaddr': 1,
            's_sg': S_SG,
            's_sc': 100,
            's_st': CURRENT_TIMESTAMP
        }
        watched_count_url = 'http://120.55.238.158/api/live/infos'
        watched_count_url = tools.joint_url(watched_count_url, params)
        live_info = tools.get_json_by_requests(watched_count_url)

        sex = live_info['lives'][0]['creator']['sex']
        # DB convention: 0 = male, 1 = female; Inke reports 0 and 3 as female, 1 as male
        sex = 0 if sex == '1' else 1
        age = ''

        log.debug('''
            name:             %s
            avatar:           %s
            anchor id:        %s
            room url:         %s
            stream url:       %s
            viewer count:     %s
            location:         %s
            fan count:        %s
            sex:              %s
            age:              %s
            viewer-count url: %s
            ''' % (name, image_url, room_id, room_url, video_path, watched_count,
                   address, fans_count, sex, age, watched_count_url))

        base_parser.add_anchor_info(
            'LiveApp_anchor_info', SITE_ID, name=name, image_url=image_url,
            room_id=room_id, room_url=room_url, video_path=video_path,
            watched_count=watched_count, address=address, fans_count=fans_count,
            sex=sex, age=age, watched_count_url=watched_count_url)

    base_parser.update_url('LiveApp_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    html, r = tools.get_html_by_requests(source_url)
    regexs = '<ul class="st-list cfix">(.*?)<div class="ssPages area">'
    lis = tools.get_info(html, regexs)
    regexs = '<li>(.*?)</li>'
    html_lis = tools.get_info(lis, regexs)
    for html_li in html_lis:
        # URL of each program
        url_regex = '<a href="(.*?)"'
        url = tools.get_info(html_li, url_regex)
        url = url and url[0] or ''
        url = "http:" + url
        everyone_html, r = tools.get_html_by_requests(url)

        # Program name
        regexs_program_name = '<h3 class="lh-tit">.*?<a.*?>(.*?)</a>'
        program_name = tools.get_info(html_li, regexs_program_name)
        program_name = program_name and program_name[0] or ''

        # Program url
        program_url = url

        # Program release time ('发布时间:' is the on-page label)
        release_time_regex = '发布时间:(.*?)</p>'
        release_time = tools.get_info(html_li, release_time_regex)
        release_time = release_time and release_time[0] or ''

        # Program summary
        regexs_summary = '<span class="full_intro" style="display: none">(.*?)</span>'
        summary = tools.get_info(everyone_html, regexs_summary)
        summary = summary and summary[0] or ''
        summary = tools.del_html_tag(summary)

        # Program image
        img_url_regex = '<img.*?src="(.*?)".*?>'
        image_url = tools.get_info(html_li, img_url_regex)
        image_url = image_url and image_url[0] or ''
        image_url = "http:" + image_url

        program_id = base_parser.add_program_info(
            'PROGRAM_info', site_id, program_name, program_url,
            image_url=image_url, episode='', directors='', actors='',
            summary=summary, release_time=release_time)

        # The per-episode JSON URL needs the playlistId and variety_year params
        everyone_html, r = tools.get_html_by_requests(program_url)
        playlistId_regex = 'var playlistId="(\d*?)";'
        playlistId = tools.get_info(everyone_html, playlistId_regex)
        playlistId = ''.join(playlistId)

        # variety_year parameter for the episode JSON url
        variety_years_regex = '<li class="v-year">(.*?)</li>'
        variety_years_html = tools.get_info(everyone_html, variety_years_regex)
        variety_years_regex = '<em>(\d*?)</em>'
        variety_years = tools.get_info(variety_years_html, variety_years_regex)

        if playlistId and variety_years:
            for variety_year in variety_years:
                episode_json_url = ('http://tv.sohu.com/item/VideoServlet?callback=&source=sohu&id='
                                    + playlistId + '&year=' + variety_year + '&month=0&page=1')
                episode_json = tools.get_json_by_requests(episode_json_url)

                episode_json_infos = tools.get_json_value(episode_json, 'videos')
                for episode_json_info in episode_json_infos:
                    # Episode summary
                    episode_summary = tools.get_json_value(episode_json_info, 'videoDesc')
                    # Episode name
                    episode_name = tools.get_json_value(episode_json_info, 'title')
                    # Episode url
                    episode_url = tools.get_json_value(episode_json_info, 'url')
                    # Download url
                    episode_download_url = you_get.get_video_url(episode_url)
                    if episode_download_url:
                        episode_download_url = '^_^'.join(episode_download_url)
                    # Episode image url
                    episode_image_url = tools.get_json_value(episode_json_info, 'pic10')
                    # Episode number
                    episode_num = tools.get_json_value(episode_json_info, 'showDate')

                    download_status = 102
                    time_length = ''
                    if episode_download_url:
                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                            time_length, episode_name, download_status,
                            episode_download_url, episode_url, episode_summary,
                            episode_image_url, sto_path='')

        if playlistId and not variety_years:
            regexs = '<!-- start : juqing title -->(.*?)<!-- end : plot content -->'
            episode_infos = tools.get_info(everyone_html, regexs)
            for episode_info in episode_infos:
                # Episode name
                regex = '<h4><.*?>(.*?)<span></span></a></h4>'
                episode_name = tools.get_info(episode_info, regex)
                episode_name = episode_name and episode_name[0] or ''

                # Summary
                regex = '<p class="intro synopsis text">(.*?)</p>'
                episode_summary = tools.get_info(episode_info, regex)
                episode_summary = episode_summary and episode_summary[0] or ''
                episode_summary = tools.del_html_tag(episode_summary)

                # Image url
                regex = '<img src="(.*?)" width=".*?" height=".*?"'
                episode_image_url = tools.get_info(episode_info, regex)
                episode_image_url = episode_image_url and episode_image_url[0] or ''
                episode_image_url = "http:" + episode_image_url

                # Episode number (the pattern is hard-coded to the show 画心师 第一季)
                regex = '<h4><a href=.*?>画心师 第一季(.*?)<span></span></a></h4>'
                episode_num = tools.get_info(episode_info, regex)
                episode_num = episode_num and episode_num[0] or ''

                # url
                regex = '<h4><a href="(.*?)" title=".*?" target="_blank">'
                episode_url = tools.get_info(episode_info, regex)
                episode_url = episode_url and episode_url[0] or ''
                episode_url = "http:" + episode_url

                # Download url
                episode_download_url = you_get.get_video_url(episode_url)
                if episode_download_url:
                    episode_download_url = '^_^'.join(episode_download_url)

                download_status = 102
                time_length = ''
                if episode_download_url:
                    log.debug('''
                        depth                = %s
                        episode_num          = %s
                        time_length          = %s
                        episode_name         = %s
                        episode_url          = %s
                        episode_download_url = %s
                        episode_summary      = %s
                        episode_image_url    = %s
                        ''' % (depth + 1, episode_num, time_length, episode_name,
                               episode_url, episode_download_url, episode_summary,
                               episode_image_url))
                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                        time_length, episode_name, download_status,
                        episode_download_url, episode_url, episode_summary,
                        episode_image_url, sto_path='')

    # Mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
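# The episode-list endpoint above is assembled by string concatenation; an
# equivalent sketch using urllib.parse.urlencode (same parameters, easier to
# extend) might look like:
from urllib.parse import urlencode

def sohu_episode_json_url(playlist_id, year, month=0, page=1):
    """Build the tv.sohu.com VideoServlet URL for one playlist/year page."""
    params = {'callback': '', 'source': 'sohu', 'id': playlist_id,
              'year': year, 'month': month, 'page': page}
    return 'http://tv.sohu.com/item/VideoServlet?' + urlencode(params)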
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    weibo_id = url_info['remark']['search_keyword']
    monitor_type = url_info['remark']['monitor_type']

    for i in range(1, 100):
        weibo_content_url = root_url + '&page=%d' % i

        # Proxy headers
        headers = {
            "Cache-Control": "max-age=0",
            "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host": "m.weibo.cn",
            "Accept-Encoding": "gzip, deflate, br",
            "Upgrade-Insecure-Requests": "1",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        proxies = base_parser.get_proxies()
        headers["User-Agent"] = base_parser.get_user_agent()
        proxies = {}  # proxies disabled for now

        html = tools.get_json_by_requests(weibo_content_url, headers=headers, proxies=proxies)
        cards = tools.get_json_value(html, 'cards')
        if len(cards) < 2:
            base_parser.update_url('WWA_weibo_info_urls', root_url, Constance.DONE)
            return

        tools.delay_time(10)

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')

            # Proxy headers
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            proxies = base_parser.get_proxies()
            headers["User-Agent"] = base_parser.get_user_agent()
            proxies = {}

            origin_html, r = tools.get_html_by_requests(url, headers=headers, proxies=proxies)
            if not origin_html:
                continue

            release_time = get_release_time(mblog)
            come_from = tools.get_json_value(mblog, 'source')
            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            sexy_image_url = []

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"', '').replace('\\n', '')
            if image_url:
                image_url = image_url.split(',')
                for idx in range(len(image_url)):
                    image_url[idx] = 'http://wx2.sinaimg.cn/large/' + image_url[idx] + '.jpg'
                sexy_image_url = image_url
                image_url = ','.join(image_url)

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))
            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')

            # Sensitive events
            sensitive_id = ''
            if monitor_type == 1 or monitor_type == 2:
                sensitive_event_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
                )
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                    keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                    keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                    if base_parser.is_violate(content, key1=keyword1,
                                              key2=keyword2, key3=keyword3):
                        sensitive_id = _id
                        break

            # Violation events
            violate_id = ''
            if monitor_type == 0 or monitor_type == 2:
                vioation_knowledge_infos = oracledb.find(
                    'select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
                )
                for vioation_knowledge_info in vioation_knowledge_infos:
                    _id = vioation_knowledge_info[0]
                    keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                    keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                    keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                    if base_parser.is_violate(content, key1=keyword1,
                                              key2=keyword2, key3=keyword3):
                        violate_id = _id
                        break

            # Download the video
            is_mp4 = tools.is_file(video_url, 'mp4')
            if is_mp4:
                local_video_path = (FILE_LOCAL_PATH + 'videos/'
                                    + tools.get_current_date(date_format='%Y-%m-%d') + "/"
                                    + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.mp4')
                is_download = tools.download_file(video_url, local_video_path)
                video_url = local_video_path if is_download else ''
            else:
                video_url = ''

            log.debug('''
                source url:           %s
                weibo id:             %s
                release time:         %s
                source client:        %s
                content:              %s
                image urls:           %s
                video url:            %s
                repost count:         %s
                like count:           %s
                violation id:         %s
                sensitive event:      %s
                image-screening urls: %s
                ''' % (url, weibo_id, release_time, come_from, content, image_url,
                       video_url, transpond_count, praise_count, violate_id,
                       sensitive_id, sexy_image_url))

            if content:
                base_parser.add_wwa_weibo_info_info(
                    'WWA_weibo_info_info', SITE_ID, url, weibo_id, release_time,
                    come_from, content, image_url, video_url, transpond_count,
                    praise_count, violate_id, sensitive_id=sensitive_id,
                    sexy_image_url=sexy_image_url)
            tools.delay_time()

    base_parser.update_url('WWA_weibo_info_urls', root_url, Constance.DONE)
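# The pic_ids handling above reduces to a small pure function; a sketch
# (wx2.sinaimg.cn/large/ is the same prefix the parser itself uses):
def pic_ids_to_urls(pic_ids_csv):
    """Map the comma-separated "pic_ids" capture to full-size image URLs."""
    ids = [i.strip().strip('"') for i in pic_ids_csv.split(',') if i.strip()]
    return ['http://wx2.sinaimg.cn/large/%s.jpg' % i for i in ids]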
def parser(url_info):
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    if depth == 0:
        html = tools.get_html_by_urllib(root_url)
        header_info = tools.get_tag(html, 'div', {'class': 'p-thumb'}, find_all=False)
        program_name = header_info.a['title']
        recent_video_url = header_info.a['href']
        recent_video_url = 'http:' + recent_video_url
        recent_video_id = tools.get_info(recent_video_url, ['id_(.+?)='], fetch_one=True)
        if not recent_video_id:
            recent_video_id = tools.get_info(recent_video_url, ['id_(.+?)\.h'], fetch_one=True)

        # '主持人:' is the on-page "host:" label
        actors = tools.get_tag(html, 'li', {'class': 'p-row'})[2].get_text()
        actors = ''.join(tools.re.compile('主持人:(.+)').findall(actors))

        # '简介:' is the on-page "summary:" label
        summary = tools.get_tag(html, 'span', {'class': 'text'}, find_all=False).get_text()
        summary = ''.join(tools.re.compile('简介:(.+)').findall(summary))

        image_url = tools.get_tag(html, 'div', {'class': 'p-thumb'}, find_all=False)
        image_url = image_url.img['src']

        list_url = ('https://ups.youku.com/ups/get.json?vid=%s==&ccode=0401&client_ip='
                    '&utid=Y5NrEThaR2MCAdOcjEogCug8&client_ts=') % recent_video_id
        list_json = tools.get_json_by_requests(list_url)
        video_list = tools.get_json_value(list_json, 'data.videos.list')
        episode = tools.get_json_value(list_json, 'data.show.episode_total')

        log.debug('''
            recent_video_url: %s
            recent_video_id:  %s
            episode count:    %s
            hosts:            %s
            cover url:        %s
            album url:        %s
            summary:          %s
            program name:     %s
            video list:       %s
            list_url:         %s
            ''' % (recent_video_url, recent_video_id, episode, actors, image_url,
                   root_url, summary, program_name, video_list, list_url))

        program_id = base_parser.add_program_info(
            'PROGRAM_info', SITE_ID, actors=actors, image_url=image_url,
            program_url=root_url, summary=summary, program_name=program_name,
            episode=episode)

        for vl in video_list:
            vl_id = tools.get_json_value(vl, 'encodevid')
            vl_url = 'http://v.youku.com/v_show/id_%s.html' % vl_id
            base_parser.add_url('PROGRAM_urls', SITE_ID, vl_url, depth=1, remark=program_id)

        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)

    elif depth == 1:
        program_id = remark
        html, res = tools.get_html_by_requests(root_url)
        episode_name = tools.get_tag(html, 'h1', find_all=False)
        episode_name = episode_name.get_text()

        videoId = tools.get_info(html, ['videoId:"(.+?)"'], fetch_one=True)
        play_count, res = tools.get_html_by_requests(
            'http://v.youku.com/action/getVideoPlayInfo?vid=%s&callback=tuijsonp5' % videoId)
        play_count = tools.get_info(play_count, ['"vv":"(.+?)"'], fetch_one=True)
        play_count = play_count.replace(',', '')

        recent_video_id = tools.get_info(root_url, ['id_(.+?)='], fetch_one=True)
        if not recent_video_id:
            recent_video_id = tools.get_info(root_url, ['id_(.+?)\.h'], fetch_one=True)

        list_url = ('https://ups.youku.com/ups/get.json?vid=%s==&ccode=0401&client_ip='
                    '&utid=Y5NrEThaR2MCAdOcjEogCug8&client_ts=') % recent_video_id
        list_info = tools.get_json_by_requests(list_url)
        stream = tools.get_json_value(list_info, "data.stream")
        # `layer` selects the stream quality; it is a module-level constant
        download_url = stream[layer]['m3u8_url']
        time_length = tools.get_json_value(list_info, "data.video.seconds")
        episode_num = tools.get_json_value(list_info, "data.show.stage")
        image_url = tools.get_json_value(list_info, "data.video.logo")
        segs = stream[layer]['segs']
        cdn_url = [seg['cdn_url'] for seg in segs]

        log.debug('''
            program id:     %s
            episode number: %s
            duration:       %s
            play count:     %s
            episode name:   %s
            download url:   %s
            episode url:    %s
            image url:      %s
            ''' % (program_id, episode_num, time_length, play_count, episode_name,
                   download_url, root_url, image_url))

        base_parser.add_program_episode_info(
            'PROGRAM_EPISODE_info', SITE_ID, program_id=program_id,
            episode_num=episode_num, time_length=time_length,
            episode_name=episode_name, download_url=download_url,
            episode_url=root_url, image_url=image_url, play_count=play_count)
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html_json = tools.get_json_by_requests(root_url)
    html_json = tools.dumps_json(html_json)
    json_values = tools.get_json_value(html_json, 'obj.items')
    for json_value in json_values:
        try:
            url = tools.get_json_value(json_value, 'pkgName')
            url = 'http://sj.qq.com/myapp/detail.htm?apkName=' + url
            title = tools.get_json_value(json_value, 'appDetail.appName')
            author = tools.get_json_value(json_value, 'appDetail.authorName')
            icon_url = tools.get_json_value(json_value, 'appDetail.iconUrl')
            icon_url = icon_url.split()
            image_url = tools.get_json_value(json_value, 'appDetail.images')
            image_url = ','.join(icon_url + image_url)
            update_info = tools.get_json_value(json_value, 'appDetail.newFeature')
            tag = tools.get_json_value(json_value, 'appDetail.versionName')
            summary = tools.get_json_value(json_value, 'appDetail.description')
            app_url = tools.get_json_value(json_value, 'appDetail.apkUrl')
            release_time = tools.get_json_value(json_value, 'appDetail.apkPublishTime')
            release_time = int(release_time)
            release_time = tools.timestamp_to_date(release_time)
            score = tools.get_json_value(json_value, 'appDetail.averageRating')
            score = round(float(score), 1)
            software_size = tools.get_json_value(json_value, 'appDetail.fileSize')
            software_size = str(round(float(software_size) / 1024 / 1024, 1)) + 'MB'
            download_count = tools.get_json_value(json_value, 'appDetail.appDownCount')
            platform = 'android'
            language = '中文'

            log.debug('''
                title:          %s
                url:            %s
                summary:        %s
                update info:    %s
                score:          %.1f
                author:         %s
                apk url:        %s
                image urls:     %s
                size:           %s
                version:        %s
                platform:       %s
                download count: %s
                release time:   %s
                language:       %s
                ''' % (title, url, summary, update_info, score, author, app_url,
                       image_url, software_size, tag, platform, download_count,
                       release_time, language))

            base_parser.add_WWA_search_app_info(
                'WWA_search_app_content_info', site_id, url, title=title,
                summary=summary, update_info=update_info, score=score,
                author=author, app_url=app_url, image_url=image_url,
                software_size=software_size, tag=tag, platform=platform,
                download_count=download_count, release_time=release_time,
                language=language, sensitive_id='')
        except Exception as e:
            log.error(e)

    base_parser.update_url('WWA_search_app_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    user_id = url_info['remark']['user_id']
    head_url = url_info['remark']['head_url']
    user_name = url_info['remark']['user_name']
    gender = url_info['remark']['gender']
    program_id = url_info['remark']['program_id']

    page_count = 50
    is_continue = True

    for i in range(0, page_count + 1):
        if not is_continue:
            break

        weibo_content_url = root_url + '&page=%d' % i
        headers = {
            "Cache-Control": "max-age=0",
            "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host": "m.weibo.cn",
            "Accept-Encoding": "gzip, deflate, br",
            "Upgrade-Insecure-Requests": "1",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }

        html = tools.get_json_by_requests(weibo_content_url, headers=headers)
        cards = tools.get_json_value(html, 'data.cards')
        if len(cards) < 2:
            base_parser.update_url('mms_urls', root_url, Constance.DONE)
            return

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')
            article_id = tools.get_json_value(mblog, 'id')
            article_url = 'https://m.weibo.cn/status/' + article_id

            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            origin_html, r = tools.get_html_by_requests(url, headers=headers)
            if not origin_html:
                continue

            # For hour/minute/second precision the article_url page would
            # have to be fetched
            release_time = mblog['created_at']
            release_time = tools.format_time(release_time)
            # release_time = get_release_time(mblog)
            release_time = tools.format_date(release_time)

            come_from = tools.get_json_value(mblog, 'source')
            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"', '').replace('\\n', '')
            if image_url:
                image_url = image_url.split(',')
                for idx in range(len(image_url)):
                    image_url[idx] = 'http://wx2.sinaimg.cn/large/' + image_url[idx] + '.jpg'
                image_url = ','.join(image_url)

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))

            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')
            comments_count = tools.get_json_value(mblog, 'comments_count')

            log.debug('''
                source url:    %s
                blogger id:    %s
                article id:    %s
                release time:  %s
                source client: %s
                content:       %s
                image urls:    %s
                video url:     %s
                comment count: %s
                repost count:  %s
                like count:    %s
                ''' % (article_url, user_id, article_id, release_time, come_from,
                       content, image_url, video_url, comments_count,
                       transpond_count, praise_count))

            if self_base_parser.add_article(article_id, head_url, user_name,
                                            release_time, None, content,
                                            image_url, None, praise_count,
                                            comments_count, program_id=program_id,
                                            gender=gender, url=article_url,
                                            info_type=1,
                                            emotion=random.randint(0, 2),
                                            collect=0, source='新浪微博'):
                if comments_count > 0:
                    parser_comment(article_id)
            else:
                is_continue = False
                break

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    stid = remark['stid']
    title = remark['title']
    room_url = root_url
    image_url = remark['cover']

    infosv2_headers = {
        'Cookie': 'MMID=8a15a5674fa198503dcac35dff04bee0; __v3_c_review_10052=1; __v3_c_last_10052=1487233333348; __v3_c_visitor=1486539373375345; Hm_lvt_96a25bfd79bc4377847ba1e9d5dfbe8a=1486539374,1487233333; cId=23443128402874; L_V_T=db9ad5b0-800d-4f43-b6fa-33aa5e32afbc; L_V_T.sig=jOJ6vGu87WNyc-iYOuqGG0O75do; s_id=a68f007aa8644cc112f2b026a915e5c4; webmomo.sig=k0F5PIijTCK14gJvogLq-fqt978; web-imi-bew=s%3A434904554.1rDSKSZKt%2B0YDpAAi%2B2B3XBLPWR8s4QItn0tZZlA4aA; web-imi-bew.sig=J4cE69g51WFdUOxLDC--7QO8_mE; io=XTckP8nk8qE9G3U3AW1E; Hm_lvt_c391e69b0f7798b6e990aecbd611a3d4=1487664000,1487664487,1487667595,1487668109; Hm_lpvt_c391e69b0f7798b6e990aecbd611a3d4=1487668223',
        'Host': 'web.immomo.com',
        'Origin': 'https://web.immomo.com',
        'Referer': 'https://web.immomo.com/live/%s' % stid,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    infosv2_params = {'stid': stid, 'src': 'url'}
    infosv2 = tools.requests.post(
        'https://web.immomo.com/webmomo/api/scene/profile/infosv2',
        data=infosv2_params, headers=infosv2_headers)
    infosv2 = infosv2.json()
    infosv2_data = infosv2['data']

    watched_count = infosv2_data['on']
    rid = infosv2_data['rid']
    name = infosv2_data['name']
    token = infosv2_data['token']
    video_path = infosv2_data['url']
    watched_count_url = {
        'url': 'https://web.immomo.com/webmomo/api/scene/profile/userinfo',
        'header': infosv2_headers,
        'data': infosv2_params
    }

    userinfo_params = {
        'dmid': stid,
        'rd': rid,
        'token': token,
        'source': 'profile'
    }
    userinfo = tools.requests.post(
        'https://web.immomo.com/webmomo/api/scene/profile/userinfo',
        data=userinfo_params, headers=infosv2_headers)
    userinfo = userinfo.json()
    userinfo_data = userinfo['data']

    sex = userinfo_data['sex']
    sex = 1 if sex == 'F' else 0  # 0 = male, 1 = female
    age = userinfo_data['age']

    fanscount_headers = {
        'Host': 'live-api.immomo.com',
        'Accept': '*/*',
        'X-PTOKEN': '',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'X-LV': '1',
        'Content-Length': '176',
        'User-Agent': 'MomoChat/7.5.6 ios/664 (iPhone 7 Plus; iOS 10.2.1; zh_CN; iPhone9,2; S1)',
        'Connection': 'keep-alive',
        'X-KV': '88e95f44',
        'Cookie': 'SESSIONID=9C3DF7F1-C39D-06F7-EC1F-9DBE56DDBF15',
    }
    fanscount_data = {
        '_idfa_': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
        '_net_': 'wifi',
        '_uid_': 'e3bb287c00673c9a701c60bf79ca24b7',
        'lat': 39.90266719310277,
        'lng': 116.348690083085,
        'remoteid': stid,
        'roomid': rid,
    }
    fans_count_json = tools.requests.post(
        'https://live-api.immomo.com/v3/user/card/lite',
        data=fanscount_data, headers=fanscount_headers)
    fans_count_json = fans_count_json.json()
    fans_count = tools.get_json_value(fans_count_json, 'data.fansCount')
    city = tools.get_json_value(fans_count_json, 'data.city')

    log.debug('''
        room title:   %s
        anchor name:  %s
        avatar:       %s
        anchor id:    %s
        city:         %s
        room url:     %s
        stream url:   %s
        viewer count: %s
        fan count:    %s
        sex:          %s
        age:          %s
        ''' % (title, name, image_url, stid, city, room_url, video_path,
               watched_count, fans_count, sex, age))

    base_parser.add_anchor_info(
        'LiveApp_anchor_info', SITE_ID, title=title, name=name,
        image_url=image_url, room_id=stid, room_url=room_url,
        video_path=video_path, watched_count=watched_count,
        fans_count=fans_count, sex=sex, age=age, address=city,
        watched_count_url=watched_count_url)
    base_parser.update_url('LiveApp_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse
    html, request = tools.get_html_by_requests(root_url, headers=HEADER)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    news_box = tools.get_tag(html, name='div', attrs={'class': "news-box"})[0]
    news_list = tools.get_tag(news_box, name='li')
    for news in news_list:
        try:
            # Image
            image = tools.get_tag(news, name='img')[0]
            image = tools.get_json_value(image, 'src')

            # url
            url = tools.get_tag(news, name='h3')[0]
            try:
                url = tools.get_json_value(url.a, 'href')
            except:
                url = ''

            # Title
            title = tools.get_tag(news, name='h3')[0]
            title = tools.get_text(title)
            title = tools.del_html_tag(title)

            # Content
            content = tools.get_tag(news, name='p', attrs={'class': "txt-info"})[0]
            content = tools.get_text(content)
            content = tools.del_html_tag(content)

            # View count
            watched_count = ''

            # Origin
            origin = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            origin = ''.join(tools.get_info(origin, '<a.*?>(.*?)<'))

            # Date
            release_time = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            release_time = tools.get_json_value(release_time, 't')
            release_time = tools.timestamp_to_date(int(release_time))

            # Detect a video by the presence of the play icon
            regex = '<div class="img-box">.*?<i></i>.*?</div>'
            play_icon = tools.get_info(news, regex)
        except:
            continue

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])

        log.debug('''
            title:         %s
            content:       %s
            origin:        %s
            url:           %s
            image url:     %s
            view count:    %s
            date:          %s
            has video:     %d
            keywords:      %s
            keyword count: %s
            ''' % (title, content, origin, url, image, watched_count,
                   release_time, play_icon and True or False, contained_key,
                   contained_key_count))

        if not contained_key or not play_icon:
            continue

        base_parser.add_content_info(
            'VA_content_info', SITE_ID, url, title, content, image_url=image,
            release_time=release_time, origin=origin,
            watched_count=watched_count, search_type=SEARCH_TYPE,
            keyword=contained_key, keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    def get_tkey(t):
        # Sign the playJson request: rotate t right by (key % 17) bits in
        # 32-bit space, then xor with the key.
        def ror(val, key):
            i = 0
            while i < key:
                val = (0x7fffffff & (val >> 1)) | ((val & 1) << 31)
                i += 1
            return val

        key = 185025305
        val = ror(t, key % 17)
        val = val ^ key
        return val

    def getdownload(episode_download_url_json):
        # Same playJson -> direct-location resolution as the module-level
        # getdownload above.
        episode_json = tools.get_json_by_requests(episode_download_url_json)
        episode_download_url = tools.get_json_value(episode_json, 'msgs.playurl.domain')
        episode_download_url = episode_download_url and episode_download_url[0] or ''
        episode_download_url_definition = tools.get_json_value(
            episode_json, 'msgs.playurl.dispatch.1080p')
        episode_download_url_definition = (episode_download_url_definition and
                                           episode_download_url_definition[0] or '')
        episode_download_url = episode_download_url + episode_download_url_definition
        episode_download_url += ("&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux"
                                 "&tag=letv&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051"
                                 "&rateid={}").format(random.random(), '1080p')
        episode_download_url_json = tools.get_json_by_requests(episode_download_url)
        episode_download_url = tools.get_json_value(episode_download_url_json, 'location')
        return episode_download_url

    # Shared playJson URL template; id and tkey are filled in per episode
    play_json_url_template = (
        'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101'
        '&format=1&tkey={}&domain=www.le.com&dvtype=1000'
        '&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1')

    if depth == 0:
        cs_regex = 'cs(.*?)_'
        o_regex = 'cs.*?_o(.*?)_p'
        cs = tools.get_info(source_url, cs_regex)
        cs_value = cs and cs[0] or ''
        o = tools.get_info(source_url, o_regex)
        o_value = o and o[0] or ''
        url = 'http://list.le.com/apin/chandata.json?cs=' + cs_value + '&_o=' + o_value + '&_p='
        base_parser.add_url('PROGRAM_urls', site_id, url, depth + 1)

    if depth == 1:
        page = '1'

        # TV series
        if 'cs=2' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                json_list = tools.get_json_value(json, 'album_list')
                for info in json_list:
                    image_url = tools.get_json_value(info, 'images.1080*608')
                    program_name = tools.get_json_value(info, 'name')
                    program_url = tools.get_json_value(info, 'aid')
                    program_url = 'http://www.le.com/tv/' + program_url + '.html'
                    episode = tools.get_json_value(info, 'nowEpisodes')
                    directors = tools.get_json_value(info, 'directory')
                    directors = ','.join(tools.get_json(directors).values())
                    actors = tools.get_json_value(info, 'starring')
                    actors = ' '.join(actors.values())
                    summary = tools.get_json_value(info, 'description')
                    release_time = tools.get_json_value(info, 'releaseDate')
                    release_time = int(release_time) / 1000
                    x = time.localtime(release_time)
                    release_time = time.strftime("%Y-%m-%d", x)

                    log.debug('''
                        depth        = %s
                        program_name = %s
                        program_url  = %s
                        image_url    = %s
                        episode      = %s
                        directors    = %s
                        actors       = %s
                        summary      = %s
                        release_time = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time)

                    episode_url = tools.get_json_value(info, 'vids')
                    episode_url = episode_url + ','
                    regex = '(\d*?),'
                    episode_urls = tools.get_info(episode_url, regex)
                    for episode_url_num in episode_urls:
                        episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'
                        episode_download_url_json = play_json_url_template.format(
                            episode_url_num, get_tkey(int(time.time())))
                        episode_json = tools.get_json_by_requests(episode_download_url_json)
                        episode_image_url = tools.get_json_value(episode_json, 'msgs.playurl.pic')
                        episode_name = tools.get_json_value(episode_json, 'msgs.playurl.title')
                        episode_num_regex = '(\d+)'  # first run of digits in the episode title
                        episode_num = tools.get_info(episode_name, episode_num_regex)
                        episode_num = episode_num and episode_num[0] or ''
                        episode_download_url = getdownload(episode_download_url_json)
                        time_length = ''
                        episode_summary = ''
                        download_status = ''

                        log.debug('''
                            depth             = %s
                            episode_num       = %s
                            time_length       = %s
                            episode_name      = %s
                            episode_url       = %s
                            download_url      = %s
                            episode_summary   = %s
                            episode_image_url = %s
                            ''' % (depth, episode_num, time_length, episode_name,
                                   episode_url, episode_download_url,
                                   episode_summary, episode_image_url))

                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                            time_length, episode_name, download_status,
                            episode_download_url, episode_url, episode_summary,
                            episode_image_url, '')

                page = str(int(page) + 1)
                if not json_list:
                    return False

        # Sports
        if 'cs=4' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')
                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '第(.*?)期'  # "episode N" label in the title
                    episode_num = tools.get_info(episode_num, regex_episode_num)
                    episode_num = ''.join(episode_num)
                    episode_summary = tools.get_json_value(info, 'description')
                    episode_image_url = tools.get_json_value(info, 'images.1080*608')
                    episode_url_num = tools.get_json_value(info, 'vid')  # vid is the playJson id
                    episode_url = 'http://sports.le.com/video/' + episode_url_num + '.html'
                    episode_download_url_json = play_json_url_template.format(
                        episode_url_num, get_tkey(int(time.time())))
                    episode_download_url = getdownload(episode_download_url_json)
                    program_name = tools.get_json_value(info, 'albumName')
                    summary = ''
                    program_url = ''
                    actors = ''
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    download_status = 102
                    time_length = ''

                    log.debug('''
                        depth        = %s
                        program_name = %s
                        program_url  = %s
                        image_url    = %s
                        episode      = %s
                        directors    = %s
                        actors       = %s
                        summary      = %s
                        release_time = %s
                        aid          = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time,
                        _id=aid)

                    log.debug('''
                        depth             = %s
                        episode_num       = %s
                        time_length       = %s
                        episode_name      = %s
                        episode_url       = %s
                        download_url      = %s
                        episode_summary   = %s
                        episode_image_url = %s
                        ''' % (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url,
                               episode_summary, episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                        time_length, episode_name, download_status,
                        episode_download_url, episode_url, episode_summary,
                        episode_image_url, '')

                page = str(int(page) + 1)
                if not video_list:
                    return False

        # Variety shows
        if 'cs=11' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')
                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '第(.*?)期'
                    episode_num = tools.get_info(episode_num, regex_episode_num)
                    episode_num = ''.join(episode_num)
                    episode_summary = tools.get_json_value(info, 'description')
                    episode_image_url = tools.get_json_value(info, 'images.1080*608')
                    episode_url_num = tools.get_json_value(info, 'vid')  # vid is the playJson id
                    episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'
                    episode_download_url_json = play_json_url_template.format(
                        episode_url_num, get_tkey(int(time.time())))
                    episode_download_url = getdownload(episode_download_url_json)
                    program_name = tools.get_json_value(info, 'albumName')
                    summary = ''
                    actors = ''
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    program_url = ''
                    download_status = 102
                    time_length = ''

                    log.debug('''
                        depth        = %s
                        program_name = %s
                        program_url  = %s
                        image_url    = %s
                        episode      = %s
                        directors    = %s
                        actors       = %s
                        summary      = %s
                        release_time = %s
                        aid          = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time,
                        _id=aid)

                    log.debug('''
                        depth             = %s
                        episode_num       = %s
                        time_length       = %s
                        episode_name      = %s
                        episode_url       = %s
                        download_url      = %s
                        episode_summary   = %s
                        episode_image_url = %s
                        ''' % (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url,
                               episode_summary, episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                        time_length, episode_name, download_status,
                        episode_download_url, episode_url, episode_summary,
                        episode_image_url, '')

                page = str(int(page) + 1)
                if not video_list:
                    return False

        # Music
        if 'cs=9' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')
                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '(\d*?):'
                    episode_num = tools.get_info(episode_num, regex_episode_num)
                    episode_num = ''.join(episode_num)
                    episode_summary = tools.get_json_value(info, 'description')
                    episode_image_url = tools.get_json_value(info, 'images.1080*608')
                    episode_url_num = tools.get_json_value(info, 'vid')  # vid is the playJson id
                    episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'
                    episode_download_url_json = play_json_url_template.format(
                        episode_url_num, get_tkey(int(time.time())))
                    episode_download_url = getdownload(episode_download_url_json)
                    program_name = tools.get_json_value(info, 'albumName')
                    summary = ''
                    actors = tools.get_json_value(info, 'actor').values()
                    actors = ''.join(actors)
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    program_url = ''
                    download_status = 102
                    time_length = ''

                    log.debug('''
                        depth        = %s
                        program_name = %s
                        program_url  = %s
                        image_url    = %s
                        episode      = %s
                        directors    = %s
                        actors       = %s
                        summary      = %s
                        release_time = %s
                        aid          = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time,
                        _id=aid)

                    log.debug('''
                        depth             = %s
                        episode_num       = %s
                        time_length       = %s
                        episode_name      = %s
                        episode_url       = %s
                        download_url      = %s
                        episode_summary   = %s
                        episode_image_url = %s
                        ''' % (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url,
                               episode_summary, episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                        time_length, episode_name, download_status,
                        episode_download_url, episode_url, episode_summary,
                        episode_image_url, '')

                page = str(int(page) + 1)
                if not video_list:
                    return False

        # Movies
        if 'cs=1' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                json_list = tools.get_json_value(json, 'album_list')
                for info in json_list:
                    image_url = tools.get_json_value(info, 'images.1080*608')
                    program_name = tools.get_json_value(info, 'name')
                    program_url = tools.get_json_value(info, 'aid')
                    program_url = 'http://www.le.com/movie/' + program_url + '.html'
                    episode = ' '
                    directors = tools.get_json_value(info, 'directory')
                    directors = ','.join(tools.get_json(directors).values())
                    actors = tools.get_json_value(info, 'starring')
                    actors = ' '.join(actors.values())
                    summary = tools.get_json_value(info, 'description')
                    release_time = tools.get_json_value(info, 'releaseDate')
                    release_time = int(release_time) / 1000
                    x = time.localtime(release_time)
                    release_time = time.strftime("%Y-%m-%d", x)

                    log.debug('''
                        depth        = %s
                        program_name = %s
                        program_url  = %s
                        image_url    = %s
                        episode      = %s
                        directors    = %s
                        actors       = %s
                        summary      = %s
                        release_time = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time)

                    episode_url = tools.get_json_value(info, 'vids')
                    episode_url = episode_url + ','
                    regex = '(.*?),'
                    episode_urls = tools.get_info(episode_url, regex)
                    for episode_url_num in episode_urls:
                        episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'
                        episode_download_url_json = play_json_url_template.format(
                            episode_url_num, get_tkey(int(time.time())))
                        episode_json = tools.get_json_by_requests(episode_download_url_json)
                        episode_image_url = tools.get_json_value(episode_json, 'msgs.playurl.pic')
                        episode_name = tools.get_json_value(episode_json, 'msgs.playurl.title')
                        episode_num_regex = '第(.*?)期'
                        episode_num = tools.get_info(episode_name, episode_num_regex)
                        episode_num = episode_num and episode_num[0] or ''
                        episode_download_url = getdownload(episode_download_url_json)
                        time_length = ''
                        episode_summary = ''
                        download_status = ''

                        log.debug('''
                            depth             = %s
                            episode_num       = %s
                            time_length       = %s
                            episode_name      = %s
                            episode_url       = %s
                            download_url      = %s
                            episode_summary   = %s
                            episode_image_url = %s
                            ''' % (depth, episode_num, time_length, episode_name,
                                   episode_url, episode_download_url,
                                   episode_summary, episode_image_url))

                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                            time_length, episode_name, download_status,
                            episode_download_url, episode_url, episode_summary,
                            episode_image_url, '')

                page = str(int(page) + 1)
                if not json_list:
                    return False
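# get_tkey above signs the playJson request by repeatedly rotate-right-ing the
# timestamp as a 32-bit value; a standalone copy of that transform (same
# constant and rotation as the nested helper), handy for quick experiments:
def letv_tkey(t):
    """Rotate t right by (185025305 % 17) bits in 32-bit space, xor the key."""
    key = 185025305
    val = t
    for _ in range(key % 17):
        val = (0x7fffffff & (val >> 1)) | ((val & 1) << 31)
    return val ^ key

# e.g. letv_tkey(int(time.time())) produces the value for the tkey= parameter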
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
        "Host": "weixin.sogou.com",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # fetch a proxy and a random user agent
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers = headers, proxies = proxies)
    # print(html)
    html, request = tools.get_html_by_requests(root_url, headers=headers, proxies=proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    # Sogou serves a captcha page when it suspects automation; retry later if so
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    print(root_url)
    log.debug('fetching article links ' + check_info)
    if check_info:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # official-account info block
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one=True)
    # account url (sogou html-escapes the link, so unescape &amp;)
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one=True)
    account_url = account_url.replace('&amp;', '&')
    log.debug('account_url = ' + account_url)

    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # requests through proxies trigger captchas here; disabled for now

    html, request = tools.get_html_by_requests(account_url, headers=headers, proxies=proxies)

    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('''
        fetching article content %s
        url %s
        request.headers %s
        ''' % (check_info, account_url, request.headers))

    # print(html)
    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one=True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', {})
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        # skip articles that are already in the database
        is_have = mongodb.find('WWA_wechat_article', {'title': title})
        if is_have:
            log.debug(title + ' already exists')
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')
        sexy_image_url = []

        # download the cover image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;', '&')

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one=True)

        # download the images referenced in content, then point content at the local copies
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            # the file extension comes from the wx_fmt query parameter, else jpg
            if 'wx_fmt=' in image:
                ext_start = image.find('wx_fmt=') + len('wx_fmt=')
                ext_end = image.find('&', ext_start)
                ext = image[ext_start:ext_end if ext_end != -1 else None]
            else:
                ext = 'jpg'
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.' + ext
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
        tools.delay_time(5)

        # sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

        # violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            title             %s
            summary           %s
            image url         %s
            article url       %s
            release time      %s
            content           %s
            local image path  %s
            violation id      %s
            sensitive id      %s
            image check urls  %s
            ''' % (title, summary, image_url, article_url, release_time, content,
                   local_image_url, violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info('WWA_wechat_article', site_id,
                                            official_accounts_id, title, summary,
                                            image_url, article_url, release_time,
                                            content, video_url='',
                                            local_image_url=local_image_url,
                                            violate_status=violate_id,
                                            sensitive_id=sensitive_id,
                                            sexy_image_url=sexy_image_url)

        # articles published on the same day (they share this article's release_time)
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for article in oneday_article_list:  # note: this inner loop reuses the name `article`
            title = tools.get_json_value(article, 'title')
            summary = tools.get_json_value(article, 'digest')
            image_url = tools.get_json_value(article, 'cover')
            sexy_image_url = []

            # download the cover image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;', '&')

            content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one=True)

            # download the images referenced in content, then point content at the local copies
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                if 'wx_fmt=' in image:
                    ext_start = image.find('wx_fmt=') + len('wx_fmt=')
                    ext_end = image.find('&', ext_start)
                    ext = image[ext_start:ext_end if ext_end != -1 else None]
                else:
                    ext = 'jpg'
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.' + ext
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
            tools.delay_time(5)

            # sensitive events (this branch reads the keyword columns positionally)
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

            # violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []
                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
                title             %s
                summary           %s
                image url         %s
                article url       %s
                release time      %s
                content           %s
                local image path  %s
                violation id      %s
                sensitive id      %s
                image check urls  %s
                ''' % (title, summary, image_url, article_url, release_time, content,
                       local_image_url, violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info('WWA_wechat_article', site_id,
                                                official_accounts_id, title, summary,
                                                image_url, article_url, release_time,
                                                content, video_url='',
                                                local_image_url=local_image_url,
                                                violate_status=violate_id,
                                                sensitive_id=sensitive_id,
                                                sexy_image_url=sexy_image_url)

    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
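# The inline slicing above derives each image's file extension from the wx_fmt
# query parameter. A clearer equivalent, shown here as an illustrative helper
# (image_ext is not part of the original module), uses the standard library's
# query-string parser; it assumes the same fallback to 'jpg' when wx_fmt is absent.
from urllib.parse import parse_qs, urlparse

def image_ext(image_url, default='jpg'):
    """Return the wx_fmt value of a WeChat CDN image url, e.g. 'png', or `default`."""
    params = parse_qs(urlparse(image_url).query)
    return params.get('wx_fmt', [default])[0]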
def parser(url_info):
    url = url_info['url']
    list_datas = tools.get_json_by_requests(url)
    list_datas = list_datas['list']

    for list_data in list_datas:
        title = list_data['title']
        watched_count = list_data['playsCounts']
        image_url = list_data['coverLarge']
        comment_count = list_data['commentsCount']
        charge_type = list_data['priceTypeId']
        is_finished = list_data['isFinished']
        article_type = list_data['tags']
        origin = list_data['provider']
        episodes = list_data['tracks']
        # uid = list_data['uid']
        author = list_data['nickname']
        album_id = list_data['albumId']
        abstract = list_data['intro']
        score = tools.get_json_value(list_data, 'score')
        # id = list_data['id']

        # album details; timestamps are in milliseconds
        new_url_2 = 'http://mobile.ximalaya.com/mobile/v1/album/rich?albumId=%s' % album_id
        list_datas_2 = tools.get_json_by_requests(new_url_2)
        content = tools.get_json_value(list_datas_2, 'data.album.intro')
        release_time = tools.get_json_value(list_datas_2, 'data.album.createdAt')
        release_time = tools.timestamp_to_date(release_time / 1000)
        update_time = tools.get_json_value(list_datas_2, 'data.album.lastUptrackAt')
        update_time = tools.timestamp_to_date(update_time / 1000)
        subscribe_count = tools.get_json_value(list_datas_2, 'data.album.subscribeCount')

        # track list for the album
        new_url_3 = 'http://mobile.ximalaya.com/mobile/v1/album/track?albumId=%s&device=android&isAsc=true&pageId=1&' \
                    'pageSize=5000&pre_page=1' % album_id
        list_datas_3 = tools.get_json_by_requests(new_url_3)
        lists = tools.get_json_value(list_datas_3, 'data.list')

        log.debug('''
            Title:        %s
            Type:         %s
            Episodes:     %s
            Score:        %s (free albums have no score)
            Subscribers:  %s
            Author:       %s
            Created:      %s
            Last update:  %s
            Cover:        %s
            Plays:        %s
            Comments:     %s (free albums have no comments)
            Charge type:  %s (0: free, 1: per-episode, 2: whole album)
            Finished:     %s (0/1: ongoing, 2: finished)
            Provider:     %s
            Abstract:     %s
            Full intro:   %s
            ''' % (title, article_type, episodes, score, subscribe_count, author,
                   release_time, update_time, image_url, watched_count,
                   comment_count, charge_type, is_finished, origin, abstract,
                   content))

        content_id = base_parser.add_wp_content_info(
            'WP_content_info', SITE_ID, title=title, article_type=article_type,
            episodes=episodes, score=score, subscribe_count=subscribe_count,
            author=author, release_time=release_time, update_time=update_time,
            image_url=image_url, watched_count=watched_count,
            comment_count=comment_count, charge_type=charge_type,
            is_finished=is_finished, origin=origin, abstract=abstract,
            content=content, data_type=DATA_TYPE)

        for track in lists:  # renamed from `list` to avoid shadowing the builtin
            title = track['title']
            download_url = track['playPathAacv164']
            watched_count = track['playtimes']
            play_length = track['duration']
            comments_count = track['comments']
            create_time = track['createdAt']
            create_time = tools.timestamp_to_date(create_time / 1000)
            # log.debug('''
            #     Title:        %s
            #     Download url: %s
            #     Plays:        %s
            #     Duration:     %s
            #     Comments:     %s
            #     Created:      %s
            #     ''' % (title, download_url, watched_count, play_length,
            #            comments_count, create_time))
            base_parser.add_wp_content_episode_info(
                'WP_content_episode_info', content_id=content_id, title=title,
                video_url=download_url, watched_count=watched_count,
                play_length=play_length, comments_count=comments_count,
                release_time=create_time, data_type=DATA_TYPE)

    base_parser.update_url('WP_urls', url, Constance.DONE)
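# The album timestamps above (createdAt, lastUptrackAt) are in milliseconds,
# hence the division by 1000 before tools.timestamp_to_date. A standalone
# sketch of that conversion, assuming timestamp_to_date formats a local time
# much like time.strftime (ms_timestamp_to_date is illustrative, not the
# project helper):
import time

def ms_timestamp_to_date(ms, fmt='%Y-%m-%d %H:%M:%S'):
    """Format a millisecond epoch timestamp as a local date string."""
    return time.strftime(fmt, time.localtime(ms / 1000))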
def juji_parser(url, remark):
    program_id = remark
    html, res = tools.get_html_by_requests(url)

    tvid = tools.get_info(html,
                          ['player-tvid="(\d{4,11})"', 'list-tvid="(\d{4,11})"'],
                          fetch_one=True)
    # per-video metadata (play count, duration, episode order) from the mixer api
    pcInfo_url = "http://mixer.video.iqiyi.com/jp/mixin/videos/" + str(tvid)
    # print(pcInfo_url)
    html2, res = tools.get_html_by_requests(pcInfo_url)

    album_id = tools.get_info(html, [
        'player-albumid="(\d{4,11})',
        'list-albumid="(\d{4,11})"',
        'albumId: ?(\d{4,11}),',
        'param\[\'albumId\'\] ?= ?"(\d{4,11})"'
    ], fetch_one=True)

    episode_name = tools.get_info(html, ['meta.+?"irTitle" content="(.+?)"'],
                                  fetch_one=True)
    image_url = tools.get_info(html, ['<meta property="og:image" content="(.+?)"/>'],
                               fetch_one=True)
    image_url = image_url.replace('.jpg', '_160_90.jpg')
    play_count = tools.get_info(html2, ['"playCount":(.+?),'], fetch_one=True)
    time_length = tools.get_info(html2, ['"duration":\s*(.+?),'], fetch_one=True)
    episode_num = tools.get_info(html2, ['"order":\s*(.+?),'], fetch_one=True)

    current_time = tools.get_current_timestamp() * 1000
    current_time = str(current_time)
    # iface2 download api; download_header is defined at module level
    download_json_url = 'http://iface2.iqiyi.com/video/3.0/v_download?app_k=8e48946f144759d86a50075555fd5862&app_v=8.1&qyid=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&secure_p=iPhone&secure_v=1&dev_hw=%7B%22cpu%22:%22%22,%22mem%22:%222802%22%7D&net_sts=1&device_id=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&dev_os=10.2.1&dev_ua=iPhone9,2&net_ip=%7B%22country%22:%22%E4%B8%AD%E5%9B%BD%22,%22province%22:%22%E5%8C%97%E4%BA%AC%22,%22city%22:%22%E5%8C%97%E4%BA%AC%22,%22cc%22:%22%E5%9B%BD%E5%86%85%E5%85%B6%E4%BB%96%22,%22area%22:%22%E5%8D%8E%E5%8C%97%22,%22timeout%22:0,%22respcode%22:0%7D&album_id=' + album_id + '&tvid=' + tvid + '&req_times=1&play_core=0&platform_id=12&app_p=iphone&app_t=0&usr_res=16&ppid=1229289410&cookie=53igk5Vn7X1xpazWBjzW2HUN4XGjNSP4aQypF7affdnBUaC6rknOS4dzvIcU1pMm2m2Qfb&lang=zh_CN&app_lm=cn&pps=0&req_sn=' + current_time
    json_ = tools.get_json_by_requests(download_json_url, headers=download_header)
    download_url = tools.get_json_value(json_, 'video.mp4_res.1.url')
    # the first response is a redirector; the real address sits in its "l" field
    download_url, res = tools.get_html_by_requests(download_url)
    download_url = tools.get_info(download_url, ['"l":"(.+?)"'], fetch_one=True)

    log.debug('''
        program id:     %s
        episode number: %s
        duration:       %s
        play count:     %s
        episode name:   %s
        download url:   %s
        episode url:    %s
        image url:      %s
        ''' % (program_id, episode_num, time_length, play_count, episode_name,
               download_url, url, image_url))

    base_parser.add_program_episode_info('PROGRAM_EPISODE_info', SITE_ID,
                                         program_id=program_id,
                                         episode_num=episode_num,
                                         time_length=time_length,
                                         episode_name=episode_name,
                                         download_url=download_url,
                                         episode_url=url,
                                         image_url=image_url,
                                         play_count=play_count)
    base_parser.update_url('PROGRAM_urls', url, Constance.DONE)
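# tools.get_info is used above with a list of alternative regexes, since the
# markup varies between iQiyi page templates. A minimal stand-in with the
# semantics the code relies on -- the first pattern that matches wins, empty
# string otherwise -- is sketched below; first_match is an illustrative name,
# not the project helper.
import re

def first_match(text, patterns):
    """Return the first capture group of the first pattern that matches."""
    for pattern in patterns:
        m = re.search(pattern, text)
        if m:
            return m.group(1)
    return ''

# Usage mirroring the parser:
# tvid = first_match(html, [r'player-tvid="(\d{4,11})"', r'list-tvid="(\d{4,11})"'])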
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    def r1(pattern, text):
        m = re.search(pattern, text)
        if m:
            return m.group(1)

    # program metadata is hard-coded for this single program
    program_name = '风行星风范'
    actors = '姜武,秦海璐,黄海波,柳岩'
    release_time = '2011年07月23日'
    directors = ''
    program_url = 'http://www.fun.tv/vplay/g-98097/'
    summary = ''
    image_url = 'http://img3.funshion.com/sdw?oid=cc09e4ab792d4008d86efcbbbf4c55dc&w=200&h=280'
    id = '98097'

    json_episode_info = tools.get_json_by_requests(
        'http://pm.funshion.com/v5/media/episode?id=' + id + '&cl=aphone&uc=5')
    episode = len(tools.get_json_value(json_episode_info, 'episodes') or [])  # episode count

    log.debug('''
        depth        = %s
        program_name = %s
        program_url  = %s
        episode      = %s
        summary      = %s
        image_url    = %s
        ''' % (depth, program_name, program_url, episode, summary, image_url))

    program_id = base_parser.add_program_info('PROGRAM_info', site_id,
                                              program_name, program_url,
                                              image_url=image_url,
                                              episode=episode, directors='',
                                              actors=actors, summary='',
                                              release_time=release_time)

    if re.match(r'http://www.fun.tv/vplay/.*g-(\w+)', source_url):
        id = r1(r'http://www.fun.tv/vplay/.*g-(\d+)', source_url)
        json_info = tools.get_json_by_requests(
            'http://pm.funshion.com/v5/media/episode?id=' + id + '&cl=aphone&uc=5')
        json_episodes_info = tools.get_json_value(json_info, 'episodes')
        episode = len(json_episodes_info)  # total episode count

        for json_episode_info in json_episodes_info:
            vid = tools.get_json_value(json_episode_info, 'id')  # unused downstream
            episode_name = tools.get_json_value(json_episode_info, 'name')
            image_episode_info = tools.get_json_value(json_episode_info, 'still')
            episode_num = str(tools.get_json_value(json_episode_info, 'num'))
            episode_url = 'http://pm.funshion.com/v5/media/share?id=98097&num=' + episode_num
            time_length = ''
            episode_summary = ''
            download_status = ''
            download_url = ''
            log.debug('''
                depth              = %s
                episode_num        = %s
                time_length        = %s
                episode_name       = %s
                episode_url        = %s
                download_url       = %s
                episode_summary    = %s
                image_episode_info = %s
                ''' % (depth, episode_num, time_length, episode_name,
                       episode_url, download_url, episode_summary,
                       image_episode_info))
            base_parser.add_program_episode_info(
                'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                time_length, episode_name, download_status, download_url,
                episode_url, episode_summary, image_episode_info, '')

    # mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
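# The r1 helper above pulls the numeric media id out of a fun.tv play-page url.
# An equivalent standalone sketch (media_id is an illustrative name):
import re

def media_id(source_url):
    """Extract the id from urls like http://www.fun.tv/vplay/g-98097/ -> '98097'."""
    m = re.match(r'http://www\.fun\.tv/vplay/.*g-(\d+)', source_url)
    return m.group(1) if m else None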
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    program_id = url_info['remark']['program_id']
    program_name = url_info['remark']['program_name']
    chan_name = url_info['remark']['chan_name']

    is_continue = True
    for page in range(1, 2):  # only the first result page is fetched
        if not is_continue:
            break

        list_url = root_url + '&page=%d' % page
        html = tools.get_json_by_requests(list_url)
        cards = html.get('data', {}).get('cards')

        card_group = []
        for card in cards:
            card_group = tools.get_json_value(card, 'card_group')
            if card_group:
                break
        if not card_group:
            break

        for info in card_group:
            user_info = tools.get_json_value(info, 'user')
            user_id = tools.get_json_value(user_info, 'id')

            # profile cards, e.g. the user's location ('所在地')
            user_url = 'http://m.weibo.cn/api/container/getIndex?containerid=230283%s_-_INFO' % user_id
            user_url_html = tools.get_json_by_requests(user_url)
            user_url_cards = tools.get_json_value(user_url_html, 'data.cards')
            user_url_card_group = tools.get_json_value(user_url_cards[0], 'card_group')
            area = ''
            for item in user_url_card_group:
                if tools.get_json_value(item, 'item_name') == '所在地':
                    area = tools.get_json_value(item, 'item_content')

            name = tools.get_json_value(user_info, 'screen_name')
            verified_reason = tools.get_json_value(user_info, 'verified_reason')
            is_verified = 1 if verified_reason else 0

            sex = tools.get_json_value(user_info, 'gender')
            if sex == 'f':
                sex = 1
            elif sex == 'm':
                sex = 0
            else:
                sex = ''

            image_url = tools.get_json_value(user_info, 'profile_image_url')
            url = tools.get_json_value(user_info, 'profile_url')
            summary = tools.get_json_value(user_info, 'description')

            # follower/followee counts live in a second container
            user_url_2 = 'http://m.weibo.cn/api/container/getIndex?containerid=100505%s' % user_id
            user_url_html_2 = tools.get_json_by_requests(user_url_2)
            fans_count = tools.get_json_value(user_url_html_2, 'userInfo.followers_count')
            follow_count = tools.get_json_value(user_url_html_2, 'userInfo.follow_count')

            log.debug('''
                program id:      %s
                user id:         %s
                nickname:        %s
                profile url:     %s
                avatar url:      %s
                verified reason: %s
                is verified:     %s
                location:        %s
                gender:          %s
                description:     %s
                followers:       %s
                following:       %s
                ''' % (program_id, user_id, name, url, image_url,
                       verified_reason, is_verified, area, sex, summary,
                       fans_count, follow_count))

            if program_name in name or program_name in verified_reason:
                # found the target user; stop searching
                self_base_parser.add_weibo_user(program_id, user_id, name, url,
                                                image_url, verified_reason,
                                                is_verified, area, sex, summary,
                                                fans_count, follow_count)
                # flag the program as having an official blog
                sql = 'update TAB_MMS_PROGRAM set official_blog = 1 where program_id = %d' % program_id
                db.update(sql)
                is_continue = False
                break

    if is_continue:
        # no official blog found
        sql = 'update TAB_MMS_PROGRAM set official_blog = 0 where program_id = %d' % program_id
        db.update(sql)

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
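# The field normalization above maps m.weibo.cn user json onto storage flags.
# A compact sketch of just that mapping (normalize_user_fields is an
# illustrative name, not part of the original module):
def normalize_user_fields(user_info):
    """Return (sex, is_verified): gender 'f'/'m' -> 1/0 ('' otherwise);
    any non-empty verified_reason counts as verified."""
    sex = {'f': 1, 'm': 0}.get(user_info.get('gender'), '')
    is_verified = 1 if user_info.get('verified_reason') else 0
    return sex, is_verified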
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    column_id = remark

    # HEADERS, PARAMS and the mutable paging dict `data` are module-level globals
    while True:
        try:
            json = tools.get_json_by_requests(root_url, headers=HEADERS,
                                              data=data, params=PARAMS)
            newslist = tools.get_json_value(json, 'newslist')
            if not newslist:
                break

            data['cachedCount'] += len(newslist)
            data['page'] += 1

            for news in newslist:
                # print(tools.dumps_json(news))
                title = tools.get_json_value(news, 'title')
                release_time = tools.get_json_value(news, 'time')
                abstract = tools.get_json_value(news, 'abstract')
                original_url = tools.get_json_value(news, 'url')
                img_url = tools.get_json_value(news, 'thumbnails_qqnews')[0] if tools.get_json_value(news, 'thumbnails_qqnews') else ''
                video_frame_url = tools.get_json_value(news, 'video_channel.video.playurl')

                # fetch content
                html = tools.get_html_by_urllib(original_url)
                content = tools.get_tag(html, name='div', attrs={'class': "main"},
                                        find_all=False)
                content = tools.del_html_tag(str(content))

                # resolve the real video address
                video_url = ''
                if video_frame_url:
                    video_vid = tools.get_info(html, 'vid\s*=\s*"\s*([^"]+)"',
                                               fetch_one=True)
                    video_url = ''.join(qq.qq_download_by_vid(video_vid))

                # check for violations
                # sensitive events
                sensitive_id = ''
                sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[3].split(' ') if sensitive_event_info[3] else []
                    keyword2 = sensitive_event_info[4].split(' ') if sensitive_event_info[4] else []
                    keyword3 = sensitive_event_info[5].split(' ') if sensitive_event_info[5] else []
                    if base_parser.is_violate(title + content, key1=keyword1,
                                              key2=keyword2, key3=keyword3):
                        sensitive_id = _id

                # violation events
                violate_id = ''
                vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
                for vioation_knowledge_info in vioation_knowledge_infos:
                    _id = vioation_knowledge_info[0]
                    keyword1 = vioation_knowledge_info[2].split(' ') if vioation_knowledge_info[2] else []
                    keyword2 = vioation_knowledge_info[3].split(' ') if vioation_knowledge_info[3] else []
                    keyword3 = vioation_knowledge_info[4].split(' ') if vioation_knowledge_info[4] else []
                    if base_parser.is_violate(title + content, key1=keyword1,
                                              key2=keyword2, key3=keyword3):
                        violate_id = _id

                log.debug('''
                    title:        %s
                    abstract:     %s
                    img_url:      %s
                    original_url: %s
                    release_time: %s
                    video_url:    %s
                    content:      %s
                    column_id:    %d
                    sensitive_id: %s
                    violate_id:   %s
                    ''' % (title, abstract, img_url, original_url, release_time,
                           video_url, content, column_id, sensitive_id,
                           violate_id))

                # downloads
                base_path = FILE_LOCAL_PATH
                is_download = 0

                # download the image
                img_name = ''
                if img_url:
                    img_name = 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
                    is_download = tools.download_file(img_url, base_path, img_name)
                    if not is_download:
                        img_name = ''

                # download the video
                video_name = ''
                if video_url:
                    video_name = 'videos/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.mp4'
                    is_download = tools.download_file(video_url, base_path, video_name)
                    if not is_download:
                        video_name = ''

                if original_url:
                    base_parser.add_va_app_content_info(
                        'VAApp_content_info', SITE_ID, title, abstract, img_url,
                        img_name, original_url, release_time, video_url,
                        video_name, content, column_id, is_download,
                        sensitive_id, violate_id, STORAGE_ID)
        except Exception as e:
            log.debug(e)  # swallow and retry; a persistent error here would loop forever

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)
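# The paging pattern above mutates a shared `data` dict (page, cachedCount)
# until 'newslist' comes back empty. A self-contained sketch of that protocol;
# fetch_page stands in for tools.get_json_by_requests with HEADERS/PARAMS
# bound, and iter_newslist is an illustrative name:
def iter_newslist(fetch_page, data):
    """Yield every news item across pages, advancing the paging state in `data`."""
    while True:
        page_json = fetch_page(data) or {}
        newslist = page_json.get('newslist') or []
        if not newslist:
            break
        data['cachedCount'] += len(newslist)
        data['page'] += 1
        yield from newslist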
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='GBK')

    # the page embeds the episode-list endpoint as a js variable
    episode_list = 'var url = "(.*?)"'
    episode_list_json = tools.get_info(html, episode_list)
    episode_list_json = episode_list_json and episode_list_json[0] or ''
    episode_list_json_url = episode_list_json + '&cb=jsonp' + str(int(time.time()))
    episode_list_json_url = episode_list_json_url.replace("\\", "")
    # print(episode_list_json_url)
    # base_parser.add_url('PROGRAM_urls', site_id, url, depth+1)

    # title
    regexs_program_name = '<meta name="keywords" content="(.*?)" />'
    program_name = tools.get_info(html, regexs_program_name)
    program_name = program_name and program_name[0] or ''
    program_url = source_url

    # strip the jsonp wrapper and parse the payload
    episode_list_json_html, r = tools.get_html_by_requests(episode_list_json_url)
    regexs = 'jsonp\d*?\((.*)\)'
    episode_list_json = tools.get_info(episode_list_json_html, regexs)
    episode_list_json = episode_list_json and episode_list_json[0] or ''
    episode_list_json = tools.get_json(episode_list_json)
    episode_list_json_value_list = tools.get_json_value(episode_list_json, 'data.list')

    episode = len(episode_list_json_value_list)
    summary = ''
    log.debug('''
        depth        = %s
        program_name = %s
        program_url  = %s
        episode      = %s
        summary      = %s
        ''' % (depth, program_name, program_url, episode, summary))

    program_id = base_parser.add_program_info('PROGRAM_info', site_id,
                                              program_name, program_url,
                                              image_url='', episode=episode,
                                              directors='', actors='',
                                              summary=summary, release_time='')

    for episode_info in episode_list_json_value_list:
        episode_name = tools.get_json_value(episode_info, 'title')
        episode_image_url = tools.get_json_value(episode_info, 'picurl')
        episode_url = tools.get_json_value(episode_info, 'podurl')
        episode_summary = tools.get_json_value(episode_info, 'desc')

        episode_num = tools.get_json_value(episode_info, 'title')
        episode_num_regex = '第(\d*?)期'  # "episode N" pattern in the title
        episode_num = tools.get_info(episode_num, episode_num_regex)
        episode_num = episode_num and episode_num[0] or ''
        if episode_num:
            episode_num = '第' + episode_num + '期'

        download_url_json_str = tools.get_json_value(episode_info, 'vid')
        download_url_json_url = 'http://v.ku6.com/fetchVideo4Player/' + download_url_json_str + '.html'
        download_url_json = tools.get_json_by_requests(download_url_json_url)
        download_url = tools.get_json_value(download_url_json, 'data.f')

        download_status = 102
        time_length = ''
        if download_url:
            # sto_path = '/video/' + program_name + '.mp4'
            # is_download = tools.download_file(download_url, FILE_LOCAL_PATH, sto_path)
            # download_status = 101 if is_download else 102
            log.debug('''
                depth             = %s
                episode_num       = %s
                time_length       = %s
                episode_name      = %s
                episode_url       = %s
                download_url      = %s
                episode_summary   = %s
                episode_image_url = %s
                ''' % (depth + 1, episode_num, time_length, episode_name,
                       episode_url, download_url, episode_summary,
                       episode_image_url))
            base_parser.add_program_episode_info(
                'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                time_length, episode_name, download_status, download_url,
                episode_url, episode_summary, episode_image_url, '')

    # mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
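# The episode-list endpoint above answers with a jsonp wrapper
# (jsonpNNN({...})) that the parser strips with a regex before reading
# 'data.list'. A standalone sketch of that unwrapping step; unwrap_jsonp is an
# illustrative name, and json.loads stands in for tools.get_json:
import json
import re

def unwrap_jsonp(text):
    """Parse the payload of a jsonpNNN(...) response, or None when absent."""
    m = re.search(r'jsonp\d*\((.*)\)', text, re.S)
    return json.loads(m.group(1)) if m else None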
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    monitor_type = url_info['remark']

    for page in range(2, 100):  # note: scanning starts at page 2 in the source
        list_url = root_url + '&page=%d' % page
        html = tools.get_json_by_requests(list_url)
        cards = tools.get_json_value(html, 'cards')

        card_group = []
        for card in cards:
            card_group = tools.get_json_value(card, 'card_group')
            if card_group:
                break
        if not card_group:
            break

        for info in card_group:
            user_info = tools.get_json_value(info, 'user')
            _id = tools.get_json_value(user_info, 'id')

            # profile cards, e.g. the user's location ('所在地')
            user_url = 'http://m.weibo.cn/api/container/getIndex?containerid=230283%s_-_INFO' % _id
            user_url_html = tools.get_json_by_requests(user_url)
            user_url_cards = tools.get_json_value(user_url_html, 'cards')
            user_url_card_group = tools.get_json_value(user_url_cards[0], 'card_group')
            area = ''
            for item in user_url_card_group:
                if tools.get_json_value(item, 'item_name') == '所在地':
                    area = tools.get_json_value(item, 'item_content')

            name = tools.get_json_value(user_info, 'screen_name')

            is_verified_reason = 101
            verified_reason = tools.get_json_value(user_info, 'verified_reason')
            if verified_reason:
                is_verified_reason = 102

            sex = tools.get_json_value(user_info, 'gender')
            if sex == 'f':
                sex = 1
            elif sex == 'm':
                sex = 0
            else:
                sex = ''

            image_url = tools.get_json_value(user_info, 'profile_image_url')
            url = tools.get_json_value(user_info, 'profile_url')
            summary = tools.get_json_value(user_info, 'description')

            # follower/followee counts live in a second container
            user_url_2 = 'http://m.weibo.cn/api/container/getIndex?containerid=100505%s' % _id
            user_url_html_2 = tools.get_json_by_requests(user_url_2)
            fans_count = tools.get_json_value(user_url_html_2, 'userInfo.followers_count')
            follow_count = tools.get_json_value(user_url_html_2, 'userInfo.follow_count')

            log.debug('''
                user id:         %s
                nickname:        %s
                profile url:     %s
                avatar url:      %s
                verified reason: %s
                is verified:     %s
                location:        %s
                gender:          %s
                description:     %s
                followers:       %s
                following:       %s
                monitor type:    %s
                ''' % (_id, name, url, image_url, verified_reason,
                       is_verified_reason, area, sex, summary, fans_count,
                       follow_count, monitor_type))

            base_parser.add_wwa_weibo_user_info('WWA_weibo_user_info', SITE_ID,
                                                _id, name, url, image_url,
                                                verified_reason,
                                                is_verified_reason, area, sex,
                                                summary, fans_count,
                                                follow_count, monitor_type)
            tools.delay_time()

    base_parser.update_url('WWA_weibo_user_urls', root_url, Constance.DONE)
    tools.delay_time()

# parser({'url': 'http://m.weibo.cn/api/container/getIndex?type=user&containerid=100103type%3D3%26q%3D%E9%87%8D%E5%BA%86%E7%94%B5%E8%A7%86%E5%8F%B0'})
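# Both weibo parsers above derive two per-user endpoints from the uid: the
# 230283<uid>_-_INFO container (profile cards such as location) and the
# 100505<uid> container (follower/followee counts). A sketch of that url
# construction (weibo_profile_urls is an illustrative name):
def weibo_profile_urls(uid):
    """Return (info_url, counts_url) for an m.weibo.cn user id."""
    base = 'http://m.weibo.cn/api/container/getIndex?containerid='
    return (base + '230283%s_-_INFO' % uid, base + '100505%s' % uid)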