def add_root_url(parser_params=None):
    # Avoid a mutable default argument; normalize to an empty dict.
    parser_params = parser_params or {}
    log.debug('''
    add root urls
    parser_params : %s
    ''' % str(parser_params))

    _db = base_parser.MongoDB()
    _db.set_unique_key('PROGRAM_EPISODE_info', 'episode_url')
    _db.update('PROGRAM_urls', {'depth': 0, 'site_id': SITE_ID}, {'status': 0}, multi=True)

    for page_num in range(1, 14):
        urls = [
            'http://list.youku.com/category/show/c_85_g_热门网综_s_1_d_1_p_%d.html' % page_num,
            'http://list.youku.com/category/show/c_97_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
            'http://list.youku.com/category/show/c_96_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
        ]

        for url in urls:
            print(url)
            print('********************************************************')
            html = tools.get_html_by_urllib(url)
            # The page shows this message when no videos match the filter.
            if tools.get_info(html, ['小酷没有筛选到相关视频']):
                continue

            links = tools.get_tag(html, 'div', {'class': 'p-thumb'})
            for link in links:
                link_html = None  # keep defined for the except branch below
                try:
                    link = link.a['href']
                    link = tools.get_full_url('http:', link)
                    link_html = tools.get_html_by_urllib(link)
                    link = tools.get_tag(link_html, 'a', {'class': 'desc-link'}, find_all=False)
                    link = link['href']
                    link = tools.get_full_url('http:', link)
                    base_parser.add_url('PROGRAM_urls', SITE_ID, link, depth=0)
                except Exception as e:
                    log.error(e)
                    print(link_html)
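# A minimal, self-contained sketch (not part of the original project): the list pages use
# protocol-relative links ('//v.youku.com/...'), and tools.get_full_url('http:', link)
# presumably behaves like urljoin with a bare scheme, as shown here.
from urllib.parse import urljoin

link = '//v.youku.com/v_show/id_XMjY2NzY3MTE4NA==.html'
print(urljoin('http:', link))  # http://v.youku.com/v_show/id_XMjY2NzY3MTE4NA==.html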
def add_root_url(keywords):
    log.debug('''
    add root urls
    parser_params : %s
    ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        quote_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'http://www.soku.com/search_video/q_%s_orderby_2_limitdate_0?spm=a2h0k.8191407.0.0&site=14&' \
                  '_lg=10&page=%s' % (quote_keyword, page_index)
            log.debug('''
            processing: %s
            url       : %s''' % (keyword, url))

            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div', {'class': 'v-thumb'})
            video_list_url = tools.get_tag(html, 'div', {'class': 'v-meta'})
            video_list_time = tools.get_tag(html, 'div', {'class': 'v-meta-data'})
            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_info(str(video_info), 'src="(.+?)"', fetch_one=True)
                image_url = 'http:' + image_url
                print(image_url)
                title = tools.get_info(str(video_info), 'alt="(.+?)"', fetch_one=True)
                print(title)
                url = tools.get_info(str(video_list_url[info_index]), 'href="(.+?)"', fetch_one=True)
                url = 'http:' + url
                print(url)
                # Each result carries two v-meta-data blocks; the second holds the date.
                # The pattern deliberately matches the tail of class="r".
                release_time = tools.get_info(str(video_list_time[info_index * 2 + 1]),
                                              'lass="r">(.+?)<', fetch_one=True)
                release_time = get_release_time(release_time)
                print(release_time)

                is_continue = base_parser.save_video_info(
                    image_url=image_url, url=url, title=title,
                    release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
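# get_release_time is called above but not defined in this section. A hypothetical sketch
# (an assumption, not the project's actual implementation): normalize relative dates such
# as '3天前' (3 days ago) or '昨天' (yesterday) into YYYY-MM-DD.
import datetime
import re

def get_release_time(text):
    today = datetime.date.today()
    if '昨天' in text:
        return (today - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    match = re.search('(\\d+)天前', text)
    if match:
        return (today - datetime.timedelta(days=int(match.group(1)))).strftime('%Y-%m-%d')
    return text  # already an absolute date; leave as-is

print(get_release_time('3天前'))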
def spider_gonggao():
    urls = [
        'http://www.sapprft.gov.cn/sapprft/channels/6588.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_2.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_3.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_4.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_5.shtml'
    ]

    count = 0
    for url in urls:
        html, res = tools.get_html_by_requests(url)
        links = tools.get_tag(html, 'a', {'class': 'fl'})
        release_times = tools.get_tag(html, 'span', {'class': 'fr'})
        for link_num in range(len(links)):
            title = links[link_num].get_text()
            link = links[link_num]['href']
            link = 'http://www.sapprft.gov.cn' + link
            release_time = release_times[link_num].get_text()

            link_html, res = tools.get_html_by_requests(link)
            content = tools.get_tag(link_html, 'div', {'id': 'artibody'}, find_all=False)
            content = content.get_text()

            content_info = {
                'title': title,
                'url': link,
                'release_time': release_time,
                'content': content
            }
            print(title + ' ' + release_time)

            key_map = {
                'id': 'vint_sequence.nextval',
                'title': 'str_title',
                'content': 'clob_content',
                'url': 'str_url',
                'release_time': 'date_release_time'
            }

            def export_callback(execute_type, sql, data_json):
                if execute_type == ExportData.EXCEPTION:
                    print('exported %s notices in total' % count)
                    exit()

            count += export_data.export_to_oracle(key_map=key_map, aim_table='TAB_IOPM_notice',
                                                  unique_key='url', datas=content_info,
                                                  callback=export_callback)

    print('exported %s notices in total' % count)
def add_root_url(keywords):
    log.debug('''
    add root urls
    parser_params : %s
    ''' % str(keywords))

    for keyword in keywords:
        print(keyword)
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 20):
            url = 'http://so.iqiyi.com/so/q_%s_ctg__t_0_page_%s_p_1_qc_0_rd__site__m_4_bitrate_' % (
                keyword, page_index)
            print(url)
            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'a', {'class': 'figure-180101'})
            video_list_time = tools.get_tag(html, 'div', {'class': 'result_info'})
            if not video_list_time:
                print('no video list, moving to next keyword')
                break

            for info_index, video_info in enumerate(video_list_time):
                try:
                    image_url = tools.get_info(str(video_list_title[info_index]),
                                               'src="(.+?)"', fetch_one=True)
                    title = tools.get_info(str(video_list_title[info_index]),
                                           'title="(.+?)"', fetch_one=True)
                    url = tools.get_info(str(video_list_title[info_index]),
                                         'href="(.+?)"', fetch_one=True)
                    release_time = tools.get_tag(video_info, 'em', {'class': 'result_info_desc'},
                                                 find_all=False).get_text()

                    is_continue = base_parser.save_video_info(
                        image_url=image_url, url=url, title=title,
                        release_time=release_time, site_name=NAME)
                    if not is_continue:
                        next_keyword = True
                        break
                except Exception as e:
                    log.error(e)

            if next_keyword:
                break
def add_root_url(keywords):
    log.debug('''
    add root urls
    parser_params : %s
    ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'https://v.qq.com/x/search/?q=%s&filter=sort=1&&cur=%s' % (keyword, page_index)
            print(url)
            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div', {'class': 'result_item'})
            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                try:
                    image_url = tools.get_tag(video_info, 'img', find_all=False)['src']
                    image_url = 'http:' + image_url
                    title = tools.get_tag(video_info, 'h2', find_all=False).get_text()
                    print(title)
                    url = tools.get_tag(video_info, 'h2', find_all=False).a['href']
                    release_time = tools.get_tag(video_info, 'span', {'class': 'content'},
                                                 find_all=False).get_text()
                    print(release_time)
                    release_time = get_release_time(release_time)
                    print(release_time)
                except Exception as e:
                    log.error(e)
                    continue

                is_continue = base_parser.save_video_info(
                    image_url=image_url, url=url, title=title,
                    release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
def add_root_url(parser_params=None):
    # Avoid a mutable default argument; normalize to an empty dict.
    parser_params = parser_params or {}
    log.debug('''
    add root urls
    parser_params : %s
    ''' % str(parser_params))

    _db = base_parser.MongoDB()
    _db.set_unique_key('PROGRAM_EPISODE_info', 'episode_url')
    _db.update('PROGRAM_urls', {'depth': 0, 'site_id': SITE_ID}, {'status': 0}, multi=True)

    # Variety-show (zongyi) list pages
    urls_zongyi = [
        'http://list.iqiyi.com/www/6/-30279------------11-1-1-iqiyi--.html',
        'http://list.iqiyi.com/www/6/-30279------------11-2-1-iqiyi--.html',
        'http://list.iqiyi.com/www/6/-30279------------11-3-1-iqiyi--.html'
    ]
    for url in urls_zongyi:
        html, res = tools.get_html_by_requests(url)
        list_infos = tools.get_tag(html, 'div', {'class': 'site-piclist_pic'}, find_all=True)
        for list_info in list_infos:
            link = list_info.a['href']
            image_url = list_info.a.img['src']
            print(link + ' ' + image_url)
            base_parser.add_url('PROGRAM_urls', SITE_ID, link, remark=image_url)

    # Episode (juji) landing page
    urls_juji = 'http://www.iqiyi.com/kszt/iqiyizzj.html'
    html, res = tools.get_html_by_requests(urls_juji)

    list_part_A = tools.get_tag(html, 'div', {'class': 'pro-pic'})
    for i in list_part_A:
        url = i.a['href']
        image_url = i.a.img['src']
        print(url + ' ' + image_url)
        base_parser.add_url('PROGRAM_urls', SITE_ID, url, remark=image_url)

    list_part_B = tools.get_tag(html, 'div', {'class': 'partB'}, find_all=False)
    part_B_url = tools.get_info(str(list_part_B), '<a href="([^>]*?)"><img.*?src="(.*?)"')
    for pb in part_B_url:
        base_parser.add_url('PROGRAM_urls', SITE_ID, pb[0], remark=pb[1])
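# A minimal, self-contained sketch (not part of the original project) of the partB
# extraction above: a regex with two capture groups makes re.findall return
# (href, src) tuples, which is why each `pb` is indexed with pb[0] / pb[1].
import re

part_b_html = '<a href="http://www.iqiyi.com/a_demo.html"><img class="cover" src="http://pic.example.com/demo.jpg">'
pairs = re.findall('<a href="([^>]*?)"><img.*?src="(.*?)"', part_b_html)
print(pairs)  # [('http://www.iqiyi.com/a_demo.html', 'http://pic.example.com/demo.jpg')]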
def parser_program_url(url_info):
    log.debug('processing \n' + tools.dumps_json(url_info))
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    classify = remark['classify']

    # Parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    program_blocks = tools.get_tag(html, 'li', {'class': "list_item"})
    for program_block in program_blocks:
        program_block = str(program_block)

        # Program url
        regex = 'r-props="{id: \'(.*?)\''
        program_id = tools.get_info(program_block, regex, fetch_one=True)
        program_url = 'http://v.qq.com/detail/5/%s.html' % program_id
        base_parser.add_url("PROGRAM_urls", site_id, program_url, depth=1,
                            remark={'program_id': program_id, 'classify': classify})

    base_parser.update_url("PROGRAM_urls", root_url, Constance.DONE)
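# A minimal, self-contained sketch (not part of the original project) of the r-props
# extraction above: the page embeds the program id in an attribute like
# r-props="{id: 'abc123', ...}", and a lazy group pulls out just the id.
import re

block = '<li class="list_item" r-props="{id: \'mzc00200abc\', type: 5}">...</li>'
program_id = re.search('r-props="{id: \'(.*?)\'', block).group(1)
print('http://v.qq.com/detail/5/%s.html' % program_id)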
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset')

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return  # nothing to parse on this page

    for header in headers:
        # "More related news" link (查看更多相关新闻)
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one=True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url,
                                depth=1, remark={'offset': 0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain = ''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
            uuid         %s
            title        %s
            author       %s
            release_time %s
            domain       %s
            url          %s
            content      %s
            ''' % (uuid, title, author, release_time, website_domain, url, '...'))

            # Save to database
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(
                    uuid, title, author, release_time, website_name,
                    website_domain, website_position, url, content)
                if not is_continue:
                    break
    else:
        # The loop finished normally: every result on this page was saved,
        # so queue the next page.
        offset += 50
        url = tools.replace_str(root_url, 'pn=\d*', 'pn=%d' % offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth=0, remark={'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
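# A minimal, self-contained sketch (not part of the original project) of the paging step
# above: tools.replace_str presumably rewrites the pn= query parameter, which a plain
# re.sub can reproduce.
import re

root_url = 'http://news.baidu.com/ns?word=demo&pn=0&cl=2'
offset = 50
next_url = re.sub('pn=\\d*', 'pn=%d' % offset, root_url)
print(next_url)  # http://news.baidu.com/ns?word=demo&pn=50&cl=2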
def add_root_url(keywords):
    log.debug('''
    add root urls
    parser_params : %s
    ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        # Quote once per keyword; quoting inside the page loop would
        # percent-encode the already-encoded keyword on every page.
        keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'https://so.tv.sohu.com/mts?wd=%s&c=0&v=0&length=0&limit=0&site=0&o=3&p=%s&st=&suged=&filter=0' % \
                  (keyword, page_index)
            log.debug('processing url = %s' % url)

            html, res = tools.get_html_by_requests(url)
            video_list_time = tools.get_tag(html, 'a', {'class': 'tcount'})
            video_list_title = tools.get_tag(html, 'div', {'class': 'pic170'})
            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_tag(video_info, 'img', find_all=False)['src']
                image_url = 'http:' + image_url
                title = video_info.a['title']
                url = video_info.a['href']
                url = 'http:' + url
                release_time = video_list_time[info_index].get_text()
                print(release_time)
                release_time = get_release_time(release_time)
                print(release_time)

                is_continue = base_parser.save_video_info(
                    image_url=image_url, url=url, title=title,
                    release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
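# A minimal, self-contained sketch (not part of the original project): tools.quote is
# presumably a wrapper around urllib's percent-encoding, which is why the keyword must
# be encoded exactly once before being spliced into the search URL.
from urllib.parse import quote

keyword = '新闻联播'
print(quote(keyword))         # %E6%96%B0%E9%97%BB%E8%81%94%E6%92%AD
print(quote(quote(keyword)))  # double-encoding mangles it: %25E6%2596%25B0...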
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, requests = tools.get_html_by_requests(root_url)
    titles = tools.get_tag(html, 'h3')
    video_infos = tools.get_tag(html, 'dt')
    for i in range(0, len(titles)):
        title = tools.get_text(titles[i])
        title = tools.del_html_tag(title)
        try:
            url = titles[i].a['href']
        except:
            continue
        url = 'http://www.bturls.net' + url

        release_time = video_infos[i].span
        release_time = tools.get_text(release_time)
        file_size = video_infos[i].span.next_sibling.next_sibling
        file_size = tools.get_text(file_size)
        watched_count = video_infos[i].span.next_sibling.next_sibling.next_sibling.next_sibling
        watched_count = tools.get_text(watched_count)

        # The detail url ends with t/<info-hash>.<ext>; that hash builds the magnet link.
        regexs = ['t/(.+?)\.']
        magnet_link = 'magnet:?xt=urn:btih:' + ''.join(tools.get_info(url, regexs))

        log.debug('''
        title:         %s
        file size:     %s
        source url:    %s
        watched count: %s
        magnet link:   %s
        date:          %s
        ''' % (title, file_size, url, watched_count, magnet_link, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title,
                                     file_size=file_size, release_time=release_time,
                                     watched_count=watched_count, magnet_link=magnet_link,
                                     search_type=search_type, keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
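# A minimal, self-contained sketch (not part of the original project) of the magnet-link
# step above: the BitTorrent info-hash sits between 't/' and the extension in the detail
# URL, and 'magnet:?xt=urn:btih:<hash>' is the standard magnet form.
import re

url = 'http://www.bturls.net/t/0123456789abcdef0123456789abcdef01234567.html'
info_hash = re.search('t/(.+?)\\.', url).group(1)
print('magnet:?xt=urn:btih:' + info_hash)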
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'h3', {'class': 't'})
    for i in range(0, len(headers)):
        title = tools.get_text(headers[i])
        title = tools.del_html_tag(title)
        # Skip Baidu's own aggregation entries ("related videos on Baidu Video")
        if tools.re.compile('的相关视频在线观看_百度视频').findall(title):
            continue

        try:
            ssurl = headers[i].a["href"]
        except:
            continue
        # The result link is a redirect; resolve it to the real url.
        r = tools.requests.head(ssurl)
        url = r.headers['Location']

        try:
            img = headers[i].next_sibling()[0].img['src']
        except:
            img = ''

        # The release date (e.g. 2017年5月23日) may sit in any of the first few siblings.
        date_regex = tools.re.compile('\d\d\d\d年\d+?月\d+?日')
        release_time = ''
        try:
            for sibling_index in range(4):
                release_time = ''.join(
                    date_regex.findall(str(headers[i].next_sibling()[sibling_index])))
                if release_time:
                    break
            release_time = release_time.replace('年', '-').replace('月', '-').replace('日', '')
        except:
            release_time = ''

        content = ''
        for content in headers[i].next_sibling():
            content = tools.get_tag(content, 'div', {'class': 'c-abstract'}, find_all=False)
            if content:
                content = tools.get_text(content)
                break
        else:
            content = ''

        log.debug('''
        title:      %s
        content:    %s
        source url: %s
        image url:  %s
        date:       %s
        ''' % (title, content, url, img, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        # Three-stage video check: by site, by title/content, then by page content.
        is_video1 = base_parser.is_have_video_by_site(url)
        if not is_video1:
            is_video2 = base_parser.is_have_video_by_judge(title, content)
            if is_video2:
                html2, res = tools.get_html_by_requests(url)
                is_video3 = base_parser.is_have_video_by_common(html2)
                if not is_video3:
                    continue
            else:
                continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url=url, title=title,
                                     content=content, image_url=img,
                                     release_time=release_time, search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
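# A minimal, self-contained sketch (not part of the original project) of the date
# normalization above: '2017年5月23日' becomes '2017-5-23'.
import re

text = '发布于 2017年5月23日 来源:example'
release_time = ''.join(re.compile('\\d\\d\\d\\d年\\d+?月\\d+?日').findall(text))
release_time = release_time.replace('年', '-').replace('月', '-').replace('日', '')
print(release_time)  # 2017-5-23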
def parser(url_info):
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    # root_url = 'http://list.youku.com/show/id_ze7cc3b8ed96711e68ce4.html'
    # depth = 0
    # headers = {'Host': 'cmstool.youku.com',
    #            'Referer': 'http://v.youku.com/v_show/id_XMjY2NzY3MTE4NA.html',
    #            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    #            'Cookie': '__ysuid=1491380613750xxD; __yscnt=1; juid=01bg7f56tqm9e; __aryft=1495434329; yseid=1495503610725JmZw8d; yseidcount=11; seid=01bgpfc8rb2vm6; ykss=fe922359521ce2d462cbda53; cna=Y5NrEThaR2MCAdOcjEogCug8; __ayvstp=6; __aysvstp=110; l=AmdnSHROpJU3344cDsaqhZhFd5Ex5jvO; isg=AlZW_barEwKJtiefqvOnVZcapwzSXpoxTdXpV8C_SDnUg_YdKIfqQbwzbaiV; __ayft=1495503611023; __aysid=1495416942598jZ1; __arpvid=1495504158930FOANHy-1495504158944; __arycid=; __ayscnt=1; __arcms=; __aypstp=5; __ayspstp=140; ypvid=1495504161820uZFGHk; ysestep=5; yseidtimeout=1495511361821; ycid=0; ystep=237; referhost=; seidtimeout=1495505961826'}

    if depth == 0:
        html = tools.get_html_by_urllib(root_url)
        header_info = tools.get_tag(html, 'div', {'class': 'p-thumb'}, find_all=False)
        program_name = header_info.a['title']
        recent_video_url = header_info.a['href']
        recent_video_url = 'http:' + recent_video_url
        recent_video_id = tools.get_info(recent_video_url, ['id_(.+?)='], fetch_one=True)
        if not recent_video_id:
            recent_video_id = tools.get_info(recent_video_url, ['id_(.+?)\.h'], fetch_one=True)

        actors = tools.get_tag(html, 'li', {'class': 'p-row'})[2].get_text()
        actors = ''.join(tools.re.compile('主持人:(.+)').findall(actors))  # "host:"
        summary = tools.get_tag(html, 'span', {'class': 'text'}, find_all=False).get_text()
        summary = ''.join(tools.re.compile('简介:(.+)').findall(summary))  # "summary:"
        image_url = tools.get_tag(html, 'div', {'class': 'p-thumb'}, find_all=False)
        image_url = image_url.img['src']

        list_url = 'https://ups.youku.com/ups/get.json?vid=%s==&ccode=0401&client_ip=&utid=Y5NrEThaR2MCAdOcjEogCug8&client_ts=' % recent_video_id
        list_json = tools.get_json_by_requests(list_url)
        video_list = tools.get_json_value(list_json, 'data.videos.list')
        # print(video_list)
        episode = tools.get_json_value(list_json, 'data.show.episode_total')

        log.debug('''
        recent_video_url: %s
        recent_video_id:  %s
        episode count:    %s
        host:             %s
        cover url:        %s
        album url:        %s
        summary:          %s
        program name:     %s
        video list:       %s
        list_url:         %s
        ''' % (recent_video_url, recent_video_id, episode, actors, image_url,
               root_url, summary, program_name, video_list, list_url))

        program_id = base_parser.add_program_info('PROGRAM_info', SITE_ID, actors=actors,
                                                  image_url=image_url, program_url=root_url,
                                                  summary=summary, program_name=program_name,
                                                  episode=episode)
        for vl in video_list:
            vl_id = tools.get_json_value(vl, 'encodevid')
            vl_url = 'http://v.youku.com/v_show/id_%s.html' % vl_id
            base_parser.add_url('PROGRAM_urls', SITE_ID, vl_url, depth=1, remark=program_id)

        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)

    elif depth == 1:
        program_id = remark
        html, res = tools.get_html_by_requests(root_url)
        episode_name = tools.get_tag(html, 'h1', find_all=False)
        episode_name = episode_name.get_text()
        videoId = tools.get_info(html, ['videoId:"(.+?)"'], fetch_one=True)
        play_count, res = tools.get_html_by_requests(
            'http://v.youku.com/action/getVideoPlayInfo?vid=%s&callback=tuijsonp5' % videoId)
        if not play_count:
            print(1)  # debug leftover: the play-count endpoint returned nothing
        play_count = tools.get_info(play_count, ['"vv":"(.+?)"'], fetch_one=True)
        play_count = play_count.replace(',', '')

        # info_html, info_res = tools.get_html_by_requests(
        #     'http://cmstool.youku.com/cms/playlog/get?callback=tuijsonp7', headers)
        # print(info_html)
        # image_url = tools.get_info(info_html, ['"thumburl":"(.+?)",'], fetch_one=True)
        # image_url = image_url.replace('\\', '')
        # print(image_url)
        # episode_num = tools.get_info(info_html, ['"watchStage":"(.+?)",'], fetch_one=True)
        # episode_num = tools.to_chinese(episode_num)
        # print(episode_num)

        recent_video_id = tools.get_info(root_url, ['id_(.+?)='], fetch_one=True)
        if not recent_video_id:
            recent_video_id = tools.get_info(root_url, ['id_(.+?)\.h'], fetch_one=True)

        list_url = 'https://ups.youku.com/ups/get.json?vid=%s==&ccode=0401&client_ip=&utid=Y5NrEThaR2MCAdOcjEogCug8&client_ts=' % recent_video_id
        list_info = tools.get_json_by_requests(list_url)

        stream = tools.get_json_value(list_info, "data.stream")
        # `layer` is assumed to be a module-level index selecting the stream quality.
        download_url = stream[layer]['m3u8_url']
        time_length = tools.get_json_value(list_info, "data.video.seconds")
        episode_num = tools.get_json_value(list_info, "data.show.stage")
        image_url = tools.get_json_value(list_info, "data.video.logo")
        segs = stream[layer]['segs']
        cdn_url = [video_url['cdn_url'] for video_url in segs]
        # print(cdn_url)

        log.debug('''
        program id:      %s
        episode number:  %s
        duration:        %s
        play count:      %s
        episode name:    %s
        download url:    %s
        episode url:     %s
        image url:       %s
        ''' % (program_id, episode_num, time_length, play_count, episode_name,
               download_url, root_url, image_url))

        base_parser.add_program_episode_info('PROGRAM_EPISODE_info', SITE_ID,
                                             program_id=program_id, episode_num=episode_num,
                                             time_length=time_length, episode_name=episode_name,
                                             download_url=download_url, episode_url=root_url,
                                             image_url=image_url, play_count=play_count)
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
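# A minimal, self-contained sketch (not part of the original project) of the video-id
# extraction above: Youku play urls carry the id between 'id_' and either '=' or '.html',
# hence the two fallback patterns. extract_video_id is a hypothetical helper name.
import re

def extract_video_id(url):
    match = re.search('id_(.+?)=', url) or re.search('id_(.+?)\\.h', url)
    return match.group(1) if match else None

print(extract_video_id('http://v.youku.com/v_show/id_XMjY2NzY3MTE4NA==.html'))  # XMjY2NzY3MTE4NA
print(extract_video_id('http://v.youku.com/v_show/id_XMjY2NzY3MTE4NA.html'))    # XMjY2NzY3MTE4NA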
def zhuanji_parser(url, remark):
    image_url = remark
    html, res = tools.get_html_by_requests(url)
    program_name = tools.get_info(html, ['<title>(.+?)-.+?</title>'], fetch_one=True)
    if not program_name:
        base_parser.update_url('PROGRAM_urls', url, Constance.EXCEPTION)
        return

    ablum_list_id = tools.get_info(html, ['data-bodansubid="(.+?)"'], fetch_one=True)
    if ablum_list_id:
        video_list_url = 'http://cache.video.qiyi.com/jp/plst/%s/' % ablum_list_id
        list_json, res = tools.get_html_by_requests(video_list_url)
        video_list = tools.get_info(list_json, ['http://www.iqiyi.com/v_.+?.html'])
    else:
        ablum_list_id = tools.get_info(html, ['sourceId: (.+?),'], fetch_one=True)
        video_list_url = 'http://cache.video.qiyi.com/jp/sdvlst/6/%s/' % ablum_list_id
        list_json, res = tools.get_html_by_requests(video_list_url)
        # video_list = tools.get_info(list_json, ['http://www.iqiyi.com/v_.+?.html'])
        video_list = tools.get_info(list_json, ['http://www.iqiyi.com/.+?.html'])

    if ablum_list_id == '0':
        video_list = []

    # Fall back through several page layouts until a video list is found
    # (see the sketch after this function).
    if not video_list:
        video_list = tools.get_tag(html, 'div', {'class': 'wrapper-piclist'}, find_all=False)
        video_list = tools.get_info(str(video_list), ['(http://www.iqiyi.com/v_.+?.html)'])
        video_list = list(set(video_list))
    if not video_list:
        video_list = tools.get_tag(html, 'div', {'class': 'piclist-wrapper'}, find_all=False)
        video_list = tools.get_info(str(video_list), ['(http://www.iqiyi.com/v_.+?.html)'])
        video_list = list(set(video_list))
    if not video_list:
        video_list = tools.get_tag(html, 'ul', {'class': 'juji-list'}, find_all=False)
        video_list = tools.get_info(str(video_list), ['(http://www.iqiyi.com/v_.+?.html)'])
        video_list = list(set(video_list))
    if not video_list:
        video_list = tools.get_tag(html, 'div', {'class': 'videoList'}, find_all=False)
        video_list = tools.get_info(str(video_list), ['(http://www.iqiyi.com/v_.+?.html)'])
        video_list = list(set(video_list))

    summary = tools.get_tag(html, 'span', {'class': 'showMoreText'}, find_all=False)
    if summary:
        summary = summary.get_text().replace('简介:', '')
    if not summary:
        summary = tools.get_tag(html, 'div', {'data-moreorless': 'moreinfo'}, find_all=False)
        if summary:
            summary = summary.get_text()

    log.debug('''
    cover url:    %s
    album url:    %s
    summary:      %s
    program name: %s
    video list:   %s
    ''' % (image_url, url, summary, program_name, video_list))

    program_id = base_parser.add_program_info('PROGRAM_info', SITE_ID, image_url=image_url,
                                              program_url=url, summary=summary,
                                              program_name=program_name)
    for link in video_list:
        base_parser.add_url('PROGRAM_urls', SITE_ID, link, depth=1, remark=program_id)

    base_parser.update_url('PROGRAM_urls', url, Constance.DONE)
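# A minimal, hypothetical sketch (not part of the original project) of the fallback chain
# above: express the four repeated if-blocks as a list of candidate container classes and
# return links from the first container that yields any, instead of copy-pasting the block.
import re

CANDIDATE_CLASSES = ['wrapper-piclist', 'piclist-wrapper', 'juji-list', 'videoList']

def first_video_list(html):
    for css_class in CANDIDATE_CLASSES:
        # Crude stand-in for tools.get_tag: scan from the class marker onward.
        pos = html.find('class="%s"' % css_class)
        if pos == -1:
            continue
        links = list(set(re.findall('(http://www\\.iqiyi\\.com/v_.+?\\.html)', html[pos:])))
        if links:
            return links
    return []

demo = '<div class="juji-list"><a href="http://www.iqiyi.com/v_demo1.html"></a></div>'
print(first_video_list(demo))  # ['http://www.iqiyi.com/v_demo1.html']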
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse
    html, request = tools.get_html_by_requests(root_url, headers=HEADER)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    news_box = tools.get_tag(html, name='div', attrs={'class': "news-box"})[0]
    news_list = tools.get_tag(news_box, name='li')
    for news in news_list:
        try:
            # Image
            image = tools.get_tag(news, name='img')[0]
            image = tools.get_json_value(image, 'src')

            # Url
            url = tools.get_tag(news, name='h3')[0]
            try:
                url = tools.get_json_value(url.a, 'href')
            except:
                url = ''

            # Title
            title = tools.get_tag(news, name='h3')[0]
            title = tools.get_text(title)
            title = tools.del_html_tag(title)

            # Content
            content = tools.get_tag(news, name='p', attrs={'class': "txt-info"})[0]
            content = tools.get_text(content)
            content = tools.del_html_tag(content)

            # Watched count
            watched_count = ''

            # Origin
            origin = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            origin = ''.join(tools.get_info(origin, '<a.*?>(.*?)<'))

            # Date
            release_time = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            release_time = tools.get_json_value(release_time, 't')
            release_time = tools.timestamp_to_date(int(release_time))

            # Detect a video by the play icon in the thumbnail box
            regex = '<div class="img-box">.*?<i></i>.*?</div>'
            play_icon = tools.get_info(news, regex)
        except:
            continue

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])

        log.debug('''
        title:         %s
        content:       %s
        origin:        %s
        source url:    %s
        image url:     %s
        watched count: %s
        date:          %s
        has video:     %d
        keyword:       %s
        keyword count: %s
        ''' % (title, content, origin, url, image, watched_count, release_time,
               play_icon and True or False, contained_key, contained_key_count))

        if not contained_key or not play_icon:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title, content,
                                     image_url=image, release_time=release_time,
                                     origin=origin, watched_count=watched_count,
                                     search_type=SEARCH_TYPE, keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
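# A minimal, self-contained sketch (not part of the original project): the 't' field is
# presumably a Unix timestamp in seconds, and tools.timestamp_to_date formats it roughly
# like this.
import datetime

def timestamp_to_date(timestamp, date_format='%Y-%m-%d %H:%M:%S'):
    return datetime.datetime.fromtimestamp(timestamp).strftime(date_format)

print(timestamp_to_date(1495503610))  # e.g. 2017-05-23 09:40:10 (depends on local timezone)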
export_data = ExportData()
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, requests = tools.get_html_by_requests(root_url, headers=HEADER)
    titles = tools.get_tag(html, 'div', {'id': tools.re.compile('id_cse_content_item_mid_.')})
    for i in range(0, len(titles)):
        try:
            url = tools.get_tag(titles[i].previous_sibling.previous_sibling, 'a', find_all=False)
            url = url['href']
            html2 = tools.get_html_by_urllib(url)
            regexs = ['<title>(.+?)</title>']
            mark = ''.join(tools.get_info(html2, regexs))
            # Skip pages whose title says the share no longer exists (不存在)
            # or was cancelled (取消).
            regexs = ['不存在', '取消']
            if tools.get_info(mark, regexs):
                continue

            title = tools.get_text(titles[i].previous_sibling.previous_sibling)
            title = tools.del_html_tag(title)

            info = tools.get_text(titles[i])
            # The info line glues the fields together; each lazy regex cuts one out:
            # 文件名 (file name), 文件大小 (file size), 分享者 (sharer),
            # 时间 (time), 下载次数 (download count).
            file_name = tools.del_html_tag(''.join(tools.get_info(info, '文件名:(.+?)文')))
            file_size = tools.del_html_tag(''.join(tools.get_info(info, '文件大小:(.+?)分')))
            author = tools.del_html_tag(''.join(tools.get_info(info, '分享者:(.+?)时')))
            release_time = ''.join(tools.get_info(info, '时间:(.+?)下')).replace('\n', '')
            download_count = tools.del_html_tag(''.join(tools.get_info(info, '下载次数:(.+?)\.')))
        except:
            continue

        log.debug('''
        title:          %s
        file size:      %s
        file name:      %s
        author:         %s
        source url:     %s
        download count: %s
        date:           %s
        ''' % (title, file_size, file_name, author, url, download_count, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, url, title,
                                     file_size=file_size, file_name=file_name,
                                     author=author, release_time=release_time,
                                     download_count=download_count,
                                     search_type=search_type, keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
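# A minimal, self-contained sketch (not part of the original project) of the field
# slicing above: each lazy regex stops at the first character of the next label,
# e.g. '文件大小:(.+?)分' captures up to the 分 of 分享者.
import re

info = '文件名:demo.mp4文件大小:1.2GB分享者:someone时间:2017-05-23下载次数:42.'
print(re.findall('文件名:(.+?)文', info))     # ['demo.mp4']
print(re.findall('文件大小:(.+?)分', info))   # ['1.2GB']
print(re.findall('分享者:(.+?)时', info))     # ['someone']
print(re.findall('时间:(.+?)下', info))       # ['2017-05-23']
print(re.findall('下载次数:(.+?)\\.', info))  # ['42']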
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    column_id = remark

    while True:
        try:
            # HEADERS, PARAMS and `data` are module-level request settings;
            # `data` tracks the paging state across iterations.
            json = tools.get_json_by_requests(root_url, headers=HEADERS, data=data, params=PARAMS)
            newslist = tools.get_json_value(json, 'newslist')
            if not newslist:
                break

            data['cachedCount'] += len(newslist)
            data['page'] += 1

            for news in newslist:
                # print(tools.dumps_json(news))
                title = tools.get_json_value(news, 'title')
                release_time = tools.get_json_value(news, 'time')
                abstract = tools.get_json_value(news, 'abstract')
                original_url = tools.get_json_value(news, 'url')
                img_url = tools.get_json_value(news, 'thumbnails_qqnews')[0] \
                    if tools.get_json_value(news, 'thumbnails_qqnews') else ''
                video_frame_url = tools.get_json_value(news, 'video_channel.video.playurl')

                # Fetch the article content
                html = tools.get_html_by_urllib(original_url)
                content = tools.get_tag(html, name='div', attrs={'class': "main"}, find_all=False)
                content = tools.del_html_tag(str(content))

                # Resolve the real video address
                video_url = ''
                if video_frame_url:
                    video_vid = tools.get_info(html, 'vid\s*=\s*"\s*([^"]+)"', fetch_one=True)
                    video_url = ''.join(qq.qq_download_by_vid(video_vid))

                # Compliance checks
                # Sensitive events
                sensitive_id = ''
                sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
                for sensitive_event_info in sensitive_event_infos:
                    _id = sensitive_event_info[0]
                    keyword1 = sensitive_event_info[3].split(' ') if sensitive_event_info[3] else []
                    keyword2 = sensitive_event_info[4].split(' ') if sensitive_event_info[4] else []
                    keyword3 = sensitive_event_info[5].split(' ') if sensitive_event_info[5] else []
                    if base_parser.is_violate(title + content, key1=keyword1,
                                              key2=keyword2, key3=keyword3):
                        sensitive_id = _id

                # Violation knowledge
                violate_id = ''
                vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
                for vioation_knowledge_info in vioation_knowledge_infos:
                    _id = vioation_knowledge_info[0]
                    keyword1 = vioation_knowledge_info[2].split(' ') if vioation_knowledge_info[2] else []
                    keyword2 = vioation_knowledge_info[3].split(' ') if vioation_knowledge_info[3] else []
                    keyword3 = vioation_knowledge_info[4].split(' ') if vioation_knowledge_info[4] else []
                    if base_parser.is_violate(title + content, key1=keyword1,
                                              key2=keyword2, key3=keyword3):
                        violate_id = _id

                log.debug('''
                title:        %s
                abstract:     %s
                img_url:      %s
                original_url: %s
                release_time: %s
                video_url:    %s
                content:      %s
                column_id:    %d
                sensitive_id: %s
                violate_id:   %s
                ''' % (title, abstract, img_url, original_url, release_time, video_url,
                       content, column_id, sensitive_id, violate_id))

                # Downloads
                base_path = FILE_LOCAL_PATH
                is_download = 0

                # Download the image
                img_name = ''
                if img_url:
                    img_name = 'images/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + \
                               tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
                    is_download = tools.download_file(img_url, base_path, img_name)
                    if not is_download:
                        img_name = ''

                # Download the video
                video_name = ''
                if video_url:
                    video_name = 'videos/' + tools.get_current_date(date_format='%Y-%m-%d') + "/" + \
                                 tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.mp4'
                    is_download = tools.download_file(video_url, base_path, video_name)
                    if not is_download:
                        video_name = ''

                if original_url:
                    base_parser.add_va_app_content_info(
                        'VAApp_content_info', SITE_ID, title, abstract, img_url, img_name,
                        original_url, release_time, video_url, video_name, content,
                        column_id, is_download, sensitive_id, violate_id, STORAGE_ID)
        except Exception as e:
            log.debug(e)

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)
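# A minimal, self-contained sketch (not part of the original project) of the naming
# scheme above: media files land under a per-day folder with a microsecond timestamp,
# which tools.get_current_date presumably produces via strftime.
import datetime

now = datetime.datetime.now()
img_name = 'images/' + now.strftime('%Y-%m-%d') + '/' + now.strftime('%Y%m%d%H%M%S.%f') + '.jpg'
print(img_name)  # e.g. images/2017-05-23/20170523094010.123456.jpg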