def add_root_url(search_keyword1=None, search_keyword2=None, search_keyword3=None):
    # Avoid mutable default arguments; the original signature used empty lists.
    search_keyword1 = search_keyword1 or []
    search_keyword2 = search_keyword2 or []
    search_keyword3 = search_keyword3 or []
    log.debug('''
        add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keyword = search_keyword1 + search_keyword2
    for i in search_keyword:
        if not i:
            continue

        # Baidu's pn parameter is a result offset (10 results per page), so
        # range(0, 760, 10) walks the first 76 result pages. ' 视频' appends
        # the search term 'video' to each keyword.
        for num in range(0, 760, 10):
            link = "https://www.baidu.com/s?wd=%s%s&pn=%d" % (i, ' 视频', num)
            link = tools.quote(link, safe='#/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, link, remark=remark):
                base_parser.update_url('VA_urls', link, Constance.TODO)
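# Hypothetical local run of the Baidu variant above; the keyword values are
# illustrative only, and the framework globals (log, tools, base_parser,
# SITE_ID, Constance) must already be configured by this repo's bootstrap.
if __name__ == '__main__':
    # The two lists are concatenated, so each keyword gets its own 76-page
    # sweep, e.g. wd='山东卫视 视频' and wd='调查 视频' with pn=0..750.
    add_root_url(search_keyword1=['山东卫视'], search_keyword2=['调查'])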
def add_root_url(parser_params): log.debug(''' 添加根url parser_params : %s ''' % str(parser_params)) for program in parser_params: #[[91, '山东卫视', '调查', '新闻'], [...]] program_id = program[0] chan_name = program[1] program_name = program[2] program_type = program[3] image_url = program[4] is_have_official_blog = program[5] if is_have_official_blog == 2: search_keyword = tools.quote(chan_name + ' ' + program_name, safe='/:?=&%') url = 'http://m.weibo.cn/api/container/getIndex?type=user&containerid=100103type%3D3%26q%3D' + search_keyword base_parser.add_url('mms_urls', SITE_ID, url, remark={ 'program_id': program_id, 'chan_name': chan_name, 'program_name': program_name })
def add_root_url(parser_params=None):
    # Avoid a mutable default argument; the original signature used {}.
    parser_params = parser_params or {}
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    # Build the cross product of the two keyword lists, falling back to
    # whichever list is non-empty. The original attached an else clause to
    # the for loop; since a for/else always runs when the loop is not broken,
    # plain post-loop ifs are equivalent and clearer.
    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    if not search_keyword1:
        search_keywords = search_keyword2
    if not search_keyword2:
        search_keywords = search_keyword1

    for j in search_keywords:
        if not j.strip():
            continue

        for i in range(1, 109):
            url = 'https://m.weibo.cn/container/getIndex?type=all&queryVal=%s&luicode=10000011' % j + \
                  '&lfid=106003type%3D1&' + 'title=%s&containerid=100103type' % j + '%3D1%26q%3D' + '%s&' % j + \
                  'page=%d' % i
            url = tools.quote(url, safe='/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
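# Standalone sketch of the keyword-combination logic used above and in the
# Baidu variant that follows; input values in the example are illustrative,
# the real lists come from parser_params:
def combine_keywords(search_keyword1, search_keyword2):
    if not search_keyword1:
        return search_keyword2
    if not search_keyword2:
        return search_keyword1
    return [(k1 + k2).strip() for k1 in search_keyword1 for k2 in search_keyword2]

# combine_keywords(['山东卫视 '], ['调查', '新闻']) == ['山东卫视 调查', '山东卫视 新闻']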
def add_root_url(parser_params=None):
    # Avoid a mutable default argument; the original signature used {}.
    parser_params = parser_params or {}
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    # Cross product of the two keyword lists, falling back to whichever list
    # is non-empty (same restructured for/else as the Weibo variant above).
    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    if not search_keyword1:
        search_keywords = search_keyword2
    if not search_keyword2:
        search_keywords = search_keyword1

    for i in search_keywords:
        if not i.strip():
            continue

        # Baidu's pn parameter is a result offset: 76 pages of 10 results.
        for num in range(0, 760, 10):
            link = "https://www.baidu.com/s?wd=%s%s&pn=%d" % (i, ' 视频', num)
            link = tools.quote(link, safe='#/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, link, remark=remark):
                base_parser.update_url('VA_urls', link, Constance.TODO)
def add_root_url(keywords):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        quote_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'http://www.soku.com/search_video/q_%s_orderby_2_limitdate_0?spm=a2h0k.8191407.0.0&site=14&' \
                  '_lg=10&page=%s' % (quote_keyword, page_index)
            log.debug('''
                processing: %s
                url : %s''' % (keyword, url))

            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div', {'class': 'v-thumb'})
            video_list_url = tools.get_tag(html, 'div', {'class': 'v-meta'})
            video_list_time = tools.get_tag(html, 'div', {'class': 'v-meta-data'})
            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_info(str(video_info), 'src="(.+?)"', fetch_one=True)
                image_url = 'http:' + image_url
                title = tools.get_info(str(video_info), 'alt="(.+?)"', fetch_one=True)
                url = tools.get_info(str(video_list_url[info_index]), 'href="(.+?)"', fetch_one=True)
                url = 'http:' + url
                # Each result has two v-meta-data divs; the release time sits
                # in the second one, hence the info_index * 2 + 1 offset.
                release_time = tools.get_info(str(video_list_time[info_index * 2 + 1]),
                                              'class="r">(.+?)<', fetch_one=True)
                release_time = get_release_time(release_time)
                log.debug('image_url = %s title = %s url = %s release_time = %s'
                          % (image_url, title, url, release_time))

                is_continue = base_parser.save_video_info(
                    image_url=image_url, url=url, title=title,
                    release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
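# The get_release_time helper called above (and in the QQ and Sohu variants
# below) is not shown in this section. A minimal standalone sketch of what it
# might do, assuming it normalizes relative Chinese timestamps such as
# '3小时前' ('3 hours ago') or '昨天' ('yesterday'); the real helper may differ.
import re
from datetime import datetime, timedelta

def get_release_time_sketch(release_time):
    now = datetime.now()
    fmt = '%Y-%m-%d %H:%M:%S'
    match = re.search(r'(\d+)分钟前', release_time)  # 'N minutes ago'
    if match:
        return (now - timedelta(minutes=int(match.group(1)))).strftime(fmt)
    match = re.search(r'(\d+)小时前', release_time)  # 'N hours ago'
    if match:
        return (now - timedelta(hours=int(match.group(1)))).strftime(fmt)
    if '昨天' in release_time:  # 'yesterday'
        return (now - timedelta(days=1)).strftime(fmt)
    return release_time  # assume it is already an absolute date string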
def add_root_url(keywords):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 20):
            url = 'http://so.iqiyi.com/so/q_%s_ctg__t_0_page_%s_p_1_qc_0_rd__site__m_4_bitrate_' % (
                keyword, page_index)
            log.debug('url = %s' % url)

            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'a', {'class': 'figure-180101'})
            video_list_time = tools.get_tag(html, 'div', {'class': 'result_info'})
            if not video_list_time:
                log.debug('no video list, breaking out')
                break

            for info_index, video_info in enumerate(video_list_time):
                try:
                    image_url = tools.get_info(str(video_list_title[info_index]), 'src="(.+?)"', fetch_one=True)
                    title = tools.get_info(str(video_list_title[info_index]), 'title="(.+?)"', fetch_one=True)
                    url = tools.get_info(str(video_list_title[info_index]), 'href="(.+?)"', fetch_one=True)
                    release_time = tools.get_tag(video_info, 'em', {'class': 'result_info_desc'},
                                                 find_all=False).get_text()

                    is_continue = base_parser.save_video_info(
                        image_url=image_url, url=url, title=title,
                        release_time=release_time, site_name=NAME)
                    if not is_continue:
                        next_keyword = True
                        break
                except Exception as e:
                    log.error(e)

            if next_keyword:
                break
def add_root_url(keywords):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        log.debug('add root url, keyword: ' + keyword)
        keyword = tools.quote(keyword)
        # rn=50 results per page, starting at offset pn=0; the remark carries
        # the paging offset for the downstream parser.
        link = 'http://news.baidu.com/ns?word=%s&pn=0&cl=2&ct=0&tn=news&rn=50&ie=utf-8&bt=0&et=0' % keyword
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, link, remark={'offset': 0})
def add_root_url(keywords):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'https://v.qq.com/x/search/?q=%s&filter=sort=1&&cur=%s' % (keyword, page_index)
            log.debug('url = %s' % url)

            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div', {'class': 'result_item'})
            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                try:
                    image_url = tools.get_tag(video_info, 'img', find_all=False)['src']
                    image_url = 'http:' + image_url
                    title = tools.get_tag(video_info, 'h2', find_all=False).get_text()
                    url = tools.get_tag(video_info, 'h2', find_all=False).a['href']
                    release_time = tools.get_tag(video_info, 'span', {'class': 'content'},
                                                 find_all=False).get_text()
                    release_time = get_release_time(release_time)
                    log.debug('title = %s release_time = %s' % (title, release_time))
                except Exception as e:
                    log.error(e)
                    continue

                is_continue = base_parser.save_video_info(
                    image_url=image_url, url=url, title=title,
                    release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
def add_root_url(parser_params):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    result_list = parser_params['result_list']
    for result in result_list:
        monitor_type = result[1]
        keywords = str(result[0]).split(',')
        for search_keyword in keywords:
            if not search_keyword:
                continue

            search_keyword = tools.quote(search_keyword, safe='/:?=&%')
            url = 'http://m.weibo.cn/api/container/getIndex?type=user&containerid=100103type%3D3%26q%3D' + search_keyword
            base_parser.add_url('WWA_weibo_user_urls', SITE_ID, url, remark=monitor_type)
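# Illustrative input for the Weibo user-search variant above; the keyword
# values and the monitor_type are assumptions, not taken from this repo.
# result[0] is a comma-joined keyword string, result[1] the monitor type:
example_params = {'result_list': [('关键词一,关键词二', 'monitor_type_a')]}
# add_root_url(example_params)  # queues one user-search URL per keyword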
def add_root_url(keywords):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(keywords))

    for keyword in keywords:
        next_keyword = False
        # Quote once per keyword. The original quoted inside the page loop,
        # which re-escaped the already-quoted keyword from page 2 onward.
        keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'https://so.tv.sohu.com/mts?wd=%s&c=0&v=0&length=0&limit=0&site=0&o=3&p=%s&st=&suged=&filter=0' % \
                  (keyword, page_index)
            log.debug('processing url = %s' % url)

            html, res = tools.get_html_by_requests(url)
            video_list_time = tools.get_tag(html, 'a', {'class': 'tcount'})
            video_list_title = tools.get_tag(html, 'div', {'class': 'pic170'})
            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_tag(video_info, 'img', find_all=False)['src']
                image_url = 'http:' + image_url
                title = video_info.a['title']
                url = video_info.a['href']
                url = 'http:' + url
                release_time = video_list_time[info_index].get_text()
                release_time = get_release_time(release_time)
                log.debug('release_time = %s' % release_time)

                is_continue = base_parser.save_video_info(
                    image_url=image_url, url=url, title=title,
                    release_time=release_time, site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
def add_root_url(search_keyword1=None, search_keyword2=None, search_keyword3=None):
    # Avoid mutable default arguments; the original signature used empty lists.
    search_keyword1 = search_keyword1 or []
    search_keyword2 = search_keyword2 or []
    search_keyword3 = search_keyword3 or []
    log.debug('''
        add root url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {'search_keyword1': search_keyword1,
              'search_keyword2': search_keyword2,
              'search_keyword3': search_keyword3}

    search_keyword = search_keyword1 + search_keyword2
    for j in search_keyword:
        if not j:
            continue

        for i in range(1, 109):
            url = 'https://m.weibo.cn/container/getIndex?type=all&queryVal=%s&luicode=10000011' % j + \
                  '&lfid=106003type%3D1&' + 'title=%s&containerid=100103type' % j + '%3D1%26q%3D' + '%s&' % j + \
                  'page=%d' % i
            url = tools.quote(url, safe='/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
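# Worked example of the URL the concatenation above assembles for keyword
# '调查' and page 1, before tools.quote is applied (values illustrative):
#
# https://m.weibo.cn/container/getIndex?type=all&queryVal=调查&luicode=10000011
#     &lfid=106003type%3D1&title=调查&containerid=100103type%3D1%26q%3D调查&page=1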