Esempio n. 1
0
def add_root_url(keywords):
    """Search the Sina video interface for each keyword and save the results.

    For every keyword, result pages 1 through 9 are requested. Each listed
    video is passed to ``base_parser.save_video_info``. Paging for a keyword
    stops when a page has no results, or at the first video for which
    ``current_date > release_time`` holds (today's ``%Y-%m-%d`` value
    compared against the item's ``showtime`` value; presumably both are
    date strings -- confirm against ``tools.get_current_date``).

    Args:
        keywords: iterable of search keywords, not yet URL-encoded.
    """
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        # Encode once per keyword, as the sibling site parsers do, so that
        # characters such as '&', '#' or spaces cannot corrupt the query.
        quoted_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            page_url = 'http://so.video.sina.com.cn/interface/s?from=video&wd=%s&s_id=w00001&p=%s&n=20&s=1' \
                       % (quoted_keyword, page_index)
            info_json = tools.get_json_by_requests(page_url)
            # An empty response or one without a 'list' key is treated the
            # same as "no more results" instead of raising.
            video_info_list = info_json.get('list') if info_json else None
            if not video_info_list:
                print(page_url)
                break
            for video_info in video_info_list:
                image_url = video_info['thumburl']
                title = tools.del_html_tag(video_info['videoname'])
                video_url = video_info['url']
                release_time = video_info['showtime']
                current_date = tools.get_current_date('%Y-%m-%d')
                if current_date > release_time:
                    # Everything from here on is treated as too old:
                    # abandon the remaining pages of this keyword.
                    next_keyword = True
                    break
                base_parser.save_video_info(image_url=image_url,
                                            url=video_url,
                                            title=title,
                                            release_time=release_time,
                                            site_name=NAME)
            if next_keyword:
                break
Esempio n. 2
0
def add_root_url(keywords):
    """Search so.iqiyi.com for each keyword and save the results.

    For every keyword, result pages 1 through 9 are requested. The title
    anchors (``a.figure-180101``) and the info blocks (``div.result_info``)
    are paired by position. Each parsed video is passed to
    ``base_parser.save_video_info``. Paging for a keyword stops when a page
    has no info blocks, or at the first video for which
    ``current_date > release_time`` holds (presumably a comparison of
    ``%Y-%m-%d`` date strings -- confirm against the page format).

    A result that fails to parse or save is logged and skipped.

    Args:
        keywords: iterable of search keywords, not yet URL-encoded.
    """
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        print(keyword)
        next_keyword = False
        # Encode exactly once per keyword. Doing it inside the page loop
        # re-encodes the already encoded text ('%' becomes '%25'), so every
        # page after the first would search for a corrupted keyword.
        quoted_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            page_url = 'http://so.iqiyi.com/so/q_%s_ctg__t_0_page_%s_p_1_qc_0_rd__site__m_4_bitrate_' % (
                quoted_keyword, page_index)
            html, res = tools.get_html_by_requests(page_url)
            video_list_title = tools.get_tag(html, 'a',
                                             {'class': 'figure-180101'})
            video_list_time = tools.get_tag(html, 'div',
                                            {'class': 'result_info'})
            if not video_list_time:
                break

            for info_index, video_info in enumerate(video_list_time):
                try:
                    # The two tag lists may differ in length; an IndexError
                    # here is handled like any other unparsable result.
                    title_html = str(video_list_title[info_index])
                    image_url = tools.get_info(title_html,
                                               'src="(.+?)"',
                                               fetch_one=True)
                    title = tools.get_info(title_html,
                                           'title="(.+?)"',
                                           fetch_one=True)
                    video_url = tools.get_info(title_html,
                                               'href="(.+?)"',
                                               fetch_one=True)
                    release_time = tools.get_tag(
                        video_info,
                        'em', {
                            'class': 'result_info_desc'
                        },
                        find_all=False).get_text()
                    current_date = tools.get_current_date('%Y-%m-%d')
                    if current_date > release_time:
                        next_keyword = True
                        break
                    base_parser.save_video_info(image_url=image_url,
                                                url=video_url,
                                                title=title,
                                                release_time=release_time,
                                                site_name=NAME)
                except Exception as e:
                    # Still best-effort per result, but the failure is now
                    # logged, and KeyboardInterrupt/SystemExit are no longer
                    # swallowed as they were by the bare ``except``.
                    log.error(e)

            if next_keyword:
                break
Esempio n. 3
0
def add_root_url(keywords):
    """Search www.soku.com for each keyword and save the results.

    For every keyword, result pages 1 through 9 are requested. Thumbnail
    blocks (``div.v-thumb``), link blocks (``div.v-meta``) and metadata
    blocks (``div.v-meta-data``) are paired by position. Each parsed video
    is passed to ``base_parser.save_video_info``. Paging for a keyword stops
    when a page has no thumbnail blocks, or at the first video for which
    ``current_date > release_time`` holds after the page's time text has
    been converted by ``get_release_time``.

    Args:
        keywords: iterable of search keywords, not yet URL-encoded.
    """
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        # Encode exactly once per keyword. Doing it inside the page loop
        # re-encodes the already encoded text ('%' becomes '%25'), so every
        # page after the first would search for a corrupted keyword.
        quoted_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            page_url = 'http://www.soku.com/search_video/q_%s_orderby_2_limitdate_0?spm=a2h0k.8191407.0.0&site=14&' \
                       '_lg=10&page=%s' % (quoted_keyword, page_index)
            html, res = tools.get_html_by_requests(page_url)
            video_list_title = tools.get_tag(html, 'div', {'class': 'v-thumb'})
            video_list_url = tools.get_tag(html, 'div', {'class': 'v-meta'})
            video_list_time = tools.get_tag(html, 'div',
                                            {'class': 'v-meta-data'})

            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_info(str(video_info),
                                           'src="(.+?)"',
                                           fetch_one=True)
                # The extracted links are prefixed with the scheme
                # (presumably the page uses protocol-relative '//' URLs).
                image_url = 'http:' + image_url
                print(image_url)
                title = tools.get_info(str(video_info),
                                       'alt="(.+?)"',
                                       fetch_one=True)
                print(title)
                video_url = tools.get_info(str(video_list_url[info_index]),
                                           'href="(.+?)"',
                                           fetch_one=True)
                video_url = 'http:' + video_url
                print(video_url)
                # Reads every second metadata block (index 2*i + 1);
                # presumably each result has two and the second holds the
                # upload time -- confirm against the page markup.
                release_time = tools.get_info(str(
                    video_list_time[info_index * 2 + 1]),
                                              'lass="r">(.+?)<',
                                              fetch_one=True)
                release_time = get_release_time(release_time)
                print(release_time)
                current_date = tools.get_current_date('%Y-%m-%d')
                if current_date > release_time:
                    next_keyword = True
                    break
                base_parser.save_video_info(image_url=image_url,
                                            url=video_url,
                                            title=title,
                                            release_time=release_time,
                                            site_name=NAME)
            if next_keyword:
                break
Esempio n. 4
0
def add_root_url(keywords):
    """Search v.qq.com for each keyword and save every parsed result.

    Each keyword is URL-quoted once, then result pages 1 through 9 are
    requested. Every ``div.result_item`` on a page is parsed for its image,
    title, link and time text (converted by ``get_release_time``) and handed
    to ``base_parser.save_video_info``. A result that fails to parse is
    logged and skipped. Paging for a keyword ends when a page has no result
    items, or as soon as ``save_video_info`` returns a falsy value.

    Args:
        keywords: iterable of search keywords, not yet URL-encoded.
    """
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))

    for raw_keyword in keywords:
        quoted = tools.quote(raw_keyword)

        for page_no in range(1, 10):
            search_url = 'https://v.qq.com/x/search/?q=%s&filter=sort=1&&cur=%s' % (
                quoted, page_no)
            print(search_url)
            html, _ = tools.get_html_by_requests(search_url)

            result_items = tools.get_tag(html, 'div', {'class': 'result_item'})
            if not result_items:
                break

            for item in result_items:
                try:
                    cover = 'http:' + tools.get_tag(
                        item, 'img', find_all=False)['src']
                    heading_text = tools.get_tag(
                        item, 'h2', find_all=False).get_text()
                    print(heading_text)
                    link = tools.get_tag(item, 'h2', find_all=False).a['href']
                    raw_time = tools.get_tag(
                        item, 'span', {'class': 'content'},
                        find_all=False).get_text()
                    print(raw_time)
                    published = get_release_time(raw_time)
                    print(published)
                except Exception as e:
                    log.error(e)
                    continue

                keep_going = base_parser.save_video_info(
                    image_url=cover,
                    url=link,
                    title=heading_text,
                    release_time=published,
                    site_name=NAME)
                if not keep_going:
                    # The saver asked to stop: leave the item loop, which in
                    # turn ends paging for this keyword (see below).
                    break
            else:
                # Every item on this page was handled; fetch the next page.
                continue
            # Reached only when the item loop was broken out of.
            break
Esempio n. 5
0
def add_root_url(keywords):
    """Search so.tv.sohu.com for each keyword and save the results.

    For every keyword, result pages 1 through 9 are requested. Picture
    blocks (``div.pic170``) and time anchors (``a.tcount``) are paired by
    position. Each parsed video is passed to ``base_parser.save_video_info``.
    Paging for a keyword stops when a page has no picture blocks, or at the
    first video for which ``current_date > release_time`` holds after the
    page's time text has been converted by ``get_release_time``.

    Args:
        keywords: iterable of search keywords, not yet URL-encoded.
    """
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        # Encode exactly once per keyword. Doing it inside the page loop
        # re-encodes the already encoded text ('%' becomes '%25'), so every
        # page after the first would search for a corrupted keyword.
        quoted_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            page_url = 'https://so.tv.sohu.com/mts?wd=%s&c=0&v=0&length=0&limit=0&site=0&o=3&p=%s&st=&suged=&filter=0' % \
                       (quoted_keyword, page_index)
            log.debug('处理 url = %s' % page_url)
            html, res = tools.get_html_by_requests(page_url)
            video_list_time = tools.get_tag(html, 'a', {'class': 'tcount'})
            video_list_title = tools.get_tag(html, 'div', {'class': 'pic170'})
            if not video_list_title:
                break
            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_tag(video_info, 'img',
                                          find_all=False)['src']
                # The extracted links are prefixed with the scheme
                # (presumably the page uses protocol-relative '//' URLs).
                image_url = 'http:' + image_url
                title = video_info.a['title']
                video_url = video_info.a['href']
                video_url = 'http:' + video_url
                release_time = video_list_time[info_index].get_text()
                print(release_time)
                release_time = get_release_time(release_time)
                print(release_time)
                current_date = tools.get_current_date('%Y-%m-%d')
                if current_date > release_time:
                    next_keyword = True
                    break
                base_parser.save_video_info(image_url=image_url,
                                            url=video_url,
                                            title=title,
                                            release_time=release_time,
                                            site_name=NAME)
            if next_keyword:
                break