def add_root_url(parser_params={}):
    log.debug('''
        Add root URLs
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

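    # Build the seed keyword list as the cross product of search_keyword1 and
    # search_keyword2. The for/else below always runs its else branch (there is no
    # break), which falls back to the non-empty group when the other one is empty.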
    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    else:
        if not search_keyword1:
            search_keywords = search_keyword2
        if not search_keyword2:
            search_keywords = search_keyword1

    for j in search_keywords:
        if not j.strip():
            continue
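        # Enqueue start=0..90 in steps of 10, presumably 10 results per page on wangpansou.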
        for i in range(0, 91, 10):
            url = 'http://www.wangpansou.cn/s.php?q=%s&wp=0&start=%d' % (j, i)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
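
# --- Editorial sketch (not part of the original crawlers) -----------------------------
# The keyword cross-product plus fallback above is copy-pasted in several of the
# snippets below. A minimal standalone equivalent, assuming plain Python lists of
# keyword strings and no crawler framework, could look like this:
def combine_keywords(keywords1, keywords2):
    # Cross product of the two keyword groups, e.g. ['a'] and ['b'] -> ['ab'].
    combined = [(k1 + k2).strip() for k1 in keywords1 for k2 in keywords2]
    # Fall back to whichever group is non-empty when the other one is empty,
    # mirroring the for/else logic used in the snippets.
    if not keywords1:
        return list(keywords2)
    if not keywords2:
        return list(keywords1)
    return combined
# ---------------------------------------------------------------------------------------
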
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        Add root URLs
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' %
        (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keywords = search_keyword1 + search_keyword2

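    # For each keyword, fetch page 0 first to read the total page count from the
    # pager, then enqueue one search url per page.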
    for search_keyword in search_keywords:
        # Get the total page count
        url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=0' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = ['分页:1/(.*?)页']  # test: zero-page case
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)

        for page in range(0, page_count):
            url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=%d' % (
                search_keyword, page)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser_program_url(url_info):
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    classify = remark['classify']

    # Parse the page
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

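    # Each <li class="list_item"> block carries the program id in its r-props
    # attribute; use it to build the v.qq.com detail-page url.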
    program_blocks = tools.get_tag(html, 'li', {'class': "list_item"})
    for program_block in program_blocks:
        program_block = str(program_block)

        # Program detail url
        regex = 'r-props="{id: \'(.*?)\''
        program_id = tools.get_info(program_block, regex, fetch_one=True)
        program_url = 'http://v.qq.com/detail/5/%s.html' % program_id
        base_parser.add_url("PROGRAM_urls",
                            site_id,
                            program_url,
                            depth=1,
                            remark={
                                'program_id': program_id,
                                'classify': classify
                            })

    base_parser.update_url("PROGRAM_urls", root_url, Constance.DONE)
    def run(self):
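        # Dispatch loop: pull a batch of urls from the collector, hand each one to the
        # parser whose SITE_ID matches, mark failures as EXCEPTION, then sleep.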
        while True:
            try:
                urls = self._collector.get_urls(self._urlCount)
                log.debug("number of urls fetched: %d" % len(urls))

                # Check whether the collector has finished
                if self._collector.is_finished():
                    break

                for url in urls:
                    for parser in self._parsers:
                        if parser.SITE_ID == url['site_id']:
                            try:
                                parser.parser(url)
                            except Exception as e:
                                log.error(parser.NAME + " parser -- " + str(e))
                                print(self._tab_urls)
                                print(url['url'])
                                base_parser.update_url(self._tab_urls, url['url'], Constance.EXCEPTION)
                            break

                time.sleep(self._interval)
            except Exception as e:
                log.debug(e)
def add_root_url(parser_params={}):
    log.debug('''
        Add root URLs
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    else:
        if not search_keyword1:
            search_keywords = search_keyword2
        if not search_keyword2:
            search_keywords = search_keyword1

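    # Enqueue the first n result pages (ctime ordering) for every combined keyword.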
    n = 100
    for j in search_keywords:
        if not j.strip():
            continue
        for i in range(1, n + 1):
            url = 'http://www.bturls.net/search/%s_ctime_%d.html' % (j, i)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
    def run(self):
        while True:
            try:
                urls = self._collector.get_urls(self._url_count)
                log.debug("number of urls fetched: %d" % len(urls))

                # Check whether the collector has finished
                if self._collector.is_finished():
                    break

                for url in urls:
                    for parser in self._parsers:
                        if parser.SITE_ID == url['site_id']:
                            try:
                                parser.parser(url)
                            except Exception as e:
                                log.error('''
                                    -------------- parser error -------------
                                    parser name %s
                                    error       %s
                                    table       %s
                                    deal url    %s
                                    ''' % (parser.NAME, str(e), self._tab_urls,
                                           str(url)))

                                base_parser.update_url(self._tab_urls,
                                                       url['url'],
                                                       Constance.EXCEPTION)
                            break

                time.sleep(self._interval)
            except Exception as e:
                log.debug(e)
def add_root_url(parser_params={}):
    log.debug('''
        Add root URLs
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    else:
        if not search_keyword1:
            search_keywords = search_keyword2
        if not search_keyword2:
            search_keywords = search_keyword1

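    # Baidu web search: pn advances 10 per page, so pn=0..750 covers 76 result pages;
    # ' 视频' is appended to the query to bias results toward video content.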
    for i in search_keywords:
        # print(i)
        if not i.strip():
            continue
        for num in range(0, 760, 10):
            link = "https://www.baidu.com/s?wd=%s%s&pn=%d" % (i, ' 视频', num)
            # print(link)
            link = tools.quote(link, safe='#/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, link,
                                       remark=remark):
                base_parser.update_url('VA_urls', link, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        Add root URLs
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' %
        (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keyword = search_keyword1 + search_keyword2

    for i in search_keyword:
        if not i:
            continue
        for num in range(0, 760, 10):
            link = "https://www.baidu.com/s?wd=%s%s&pn=%d" % (i, ' 视频', num)
            link = tools.quote(link, safe='#/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, link,
                                       remark=remark):
                base_parser.update_url('VA_urls', link, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        Add root URLs
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' %
        (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keywords = search_keyword1 + search_keyword2

    for search_keyword in search_keywords:
        if not search_keyword:
            continue
        # At most 10 pages of results are shown
        for page in range(1, 11):
            url = 'http://weixin.sogou.com/weixin?type=2&query=' + search_keyword + '&page=%d&ie=utf8' % page
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        Add root URLs
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' %
        (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keywords = search_keyword1 + search_keyword2

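    # Read the page count from the last pager link of the first results page, then
    # enqueue one url per page (Douban paginates 15 results per page).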
    for search_keyword in search_keywords:
        # Get the total page count
        url = 'https://movie.douban.com/subject_search?start=0&search_text=%s&cat=1002' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = '<div class="paginator">.*<a href.*?>(.*?)</a><span class="next"'
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)

        for page in range(0, page_count):
            url = 'https://movie.douban.com/subject_search?start=%d&search_text=%s&cat=1002' % (
                page * 15, search_keyword)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        Add root URLs
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' %
        (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keyword = search_keyword1 + search_keyword2

    for j in search_keyword:
        if not j:
            continue
        for i in range(0, 91, 10):
            url = 'http://www.wangpansou.cn/s.php?q=%s&wp=0&start=%d' % (j, i)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
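    # Pull every video_info object out of the JSON response; info_type 1 marks items
    # that carry a playable video url, 2 those that do not.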
    html = tools.get_json_by_requests(root_url, headers=headers)
    data_info = jsonpath.jsonpath(html, '$..video_info')
    for data in data_info:
        title = data.get('title')
        video_url = data.get('play_url')
        img_url = data.get('cover_url')
        release_time = stamp_to_date(data.get('upline_time'))

        if video_url != '':
            info_type = 1
        else:
            info_type = 2

        base_parser.save_info('content_info', site_id=SITE_ID, url=video_url, title=title,site_name=NAME,
                                          content='', release_time=release_time, image_url=img_url,
                                          video_url=video_url, is_out_link=1, download_image=False, is_debug=False,
                                          info_type=info_type)

    base_parser.update_url('urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset')

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return

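    # Each result block may carry a "view more related news" link, which is enqueued
    # as a new root url; the result itself is then run through the article extractor.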
    for header in headers:
        # "View more related news" link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one = True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth = 1, remark = {'offset':0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain =''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
                '''%(uuid, title, author, release_time, website_domain, url, '...'))

            # Save to the database
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time, website_name , website_domain, website_position, url, content)

                if not is_continue:
                    break
    else:
        # Loop ended normally: every article on this page was saved, so crawl the next page
        offset += 50
        url = tools.replace_str(root_url, 'pn=\d*', 'pn=%d'%offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth = 0, remark = {'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
def add_root_url(parser_params={}):
    log.debug('''
        Add root URLs
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    else:
        if not search_keyword1:
            search_keywords = search_keyword2
        if not search_keyword2:
            search_keywords = search_keyword1

    for j in search_keywords:
        if not j.strip():
            continue

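        # m.weibo.cn search API: the containerid/query pieces are already
        # percent-encoded, so only '/:?=&%' are kept safe when quoting; up to 108
        # pages are enqueued per keyword.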
        for i in range(1, 109):
            url = 'https://m.weibo.cn/container/getIndex?type=all&queryVal=%s&luicode=10000011' % j + \
                  '&lfid=106003type%3D1&' + 'title=%s&containerid=100103type' % j + '%3D1%26q%3D' + '%s&' % j + \
                  'page=%d' % i
            url = tools.quote(url, safe='/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        Add root URLs
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' %
        (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keyword = search_keyword1 + search_keyword2
    n = 100
    for j in search_keyword:
        if not j:
            continue
        for i in range(1, n + 1):
            url = 'http://www.bturls.net/search/%s_ctime_%d.html' % (j, i)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    title = '<tr height="25"><td><a href=".*?"  title="(.*?)"'
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?)&nbsp']
    file_size = ['资料大小: </span>(.*?)&nbsp']
    download_count = ['下载次数: </span>(.*?)&nbsp']

    titles = tools.get_info(html, title, allow_repeat = True)
    video_urls = tools.get_info(html, video_url, allow_repeat = True)
    authors = tools.get_info(html, author, allow_repeat = True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat = True)
    file_sizes = tools.get_info(html, file_size, allow_repeat= True)
    download_counts = tools.get_info(html, download_count, allow_repeat = True)


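    # The regexes above each yield a parallel list; index them together to build one
    # record per search-result row.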
    for i in range(len(titles)):
        title = titles[i]
        title = tools.del_html_tag(title)

        video_url = video_urls[i]
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_url)

        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            title:          %s
            video url:      %s
            author:         %s
            watched count:  %s
            file size:      %s
            download count: %s
        ''' % (title, video_url, author, watched_count, file_size, download_count))

        contained_key, contained_key_count = base_parser.get_contained_key(title, '',
                                                            remark['search_keyword1'],
                                                            remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title, file_size = file_size,
                                     file_name = title, author = author, watched_count = watched_count,
                                     download_count = download_count, search_type = search_type,
                                     keyword = contained_key, keyword_count = contained_key_count, task_id = remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def parser_program(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse the page
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

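    # Each <li class="v-item-v5"> block is one program: pull its id, cover image,
    # episode label, title and actors, then fetch the detail page for the summary.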
    regex = '<li class="v-item-v5.*?">(.*?)</li>'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="u-video" href="(.*?)"'
        program_url = tools.get_info(video_block, regex, fetch_one = True)
        program_id = program_url[program_url.find('b/') + 2 : program_url.rfind('/')]
        program_url = 'http://www.mgtv.com/h/%s.html'%program_id

        regex = '<img class="u-image" src="(.*?)"'
        image_url = tools.get_info(video_block, regex, fetch_one = True)

        regex = 'em class="u-time">(.*?)</em>'
        episode = tools.get_info(video_block, regex, fetch_one = True)

        regex = '<a class="u-title".*?>(.*?)</a>'
        title = tools.get_info(video_block, regex, fetch_one = True)

        regex = '<span class="u-desc">(.*?)</span>'
        actors_block = tools.get_info(video_block, regex, fetch_one = True)
        regex = '<a .*?>(.*?)</a>'
        actors = tools.get_info(actors_block, regex)
        actors = '/'.join(actors) if actors else '暂无'

        detail_html, r = tools.get_html_by_requests(program_url)
        regex = '<em class="label">简介.*?<span>(.*?)</span>'
        summary = tools.get_info(detail_html, regex, fetch_one = True) if detail_html else ''

        log.debug('''
            program_url %s
            image_url   %s
            episode     %s
            title       %s
            actors      %s
            summary     %s
            '''%(program_url, image_url, episode, title, actors, summary))

        program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, title, program_url, image_url, episode, directors = '', actors = actors, summary = summary, release_time = '')

        # Episode-list url; without a month parameter the API defaults to the most recent month
        episode_detail_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=' + program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth = 1, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']

    # Get the urls matched by the search keyword
    start = 0
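    # mg.search_url pages through the results 50 at a time; the loop stops when a
    # batch comes back empty, and start only advances (for/else below) when the whole
    # batch was saved.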
    while True:
        urls = mg.search_url(query=root_url,
                             num=50,
                             start=start,
                             pause=random.randint(MIN_SLEEP_TIME,
                                                  MAX_SLEEP_TIME))
        if not urls:
            break

        for url in urls:
            url = url.replace('amp;', '')

            article_extractor = ArticleExtractor(url)
            content = title = release_time = author = website_domain = ''
            content = article_extractor.get_content()
            if content:
                title = article_extractor.get_title()
                release_time = article_extractor.get_release_time()
                author = article_extractor.get_author()
                website_domain = tools.get_domain(url)
                uuid = tools.get_uuid(title, website_domain)
                website_name = ''
                website_position = 35  # overseas

                log.debug('''
                    uuid         %s
                    title        %s
                    author       %s
                    release_time %s
                    domain       %s
                    url          %s
                    content      %s
                    ''' % (uuid, title, author, release_time, website_domain,
                           url, '...'))

                # Save to the database
                if tools.is_have_chinese(content):
                    is_continue = self_base_parser.add_news_acticle(
                        uuid, title, author, release_time, website_name,
                        website_domain, website_position, url, content)

                    if not is_continue:
                        break
        else:
            # Loop ended normally: every article on this page was saved, so crawl the next page
            start += 50

    base_parser.update_url('google_news_urls', root_url, Constance.DONE)
 def inner_add_url(base_url, url, remark):
     html = tools.get_html_by_urllib(base_url)
     regex = 'pg.pageCount = (.*?);'
     page_count = tools.get_info(html, regex, allow_repeat=True)
     page_count = ''.join(page_count)
     page_count = round(float(page_count))
     page_count = int(page_count)
     for i in range(0, page_count+1):
         # Format a fresh url for each page instead of overwriting the template string,
         # which would break the '%' formatting after the first iteration.
         page_url = url % i
         if not base_parser.add_url('GameApp_urls', SITE_ID, page_url, remark=remark):
             base_parser.update_url('GameApp_urls', page_url, Constance.TODO)
 def inner_add_url(url, remark):
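     # Walk each category link on the page, read that category's page count from its
     # pager, then enqueue every list_N.html page.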
     html = tools.get_html_by_urllib(url)
     regex = '<li><span></span><a  href="(.*?)">.*?</a></li>'
     infos = tools.get_info(html, regex)
     for info in infos:
         info = ''.join(info)
         type_url = 'http://shouji.baidu.com' + info
         type_html = tools.get_html_by_urllib(type_url)
         page_count = '<div class="pager">.*">(.*?)</a>.*?<li class="next">'
         page_count = tools.get_info(type_html, page_count)
         page_count = ''.join(page_count)
         if not page_count:
             page_count = '1'
         page_count = int(page_count)
         for page in range(1, page_count + 1):
             url = type_url + 'list_%d.html' % page
             if not base_parser.add_url(
                     'GameApp_urls', SITE_ID, url, remark=remark):
                 base_parser.update_url('GameApp_urls', url, Constance.TODO)
    def run(self):
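        # Same dispatch loop as above, but urls whose retry_times exceeds
        # MAX_RETRY_TIMES are marked EXCEPTION instead of being parsed again.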
        while True:
            try:
                urls = self._collector.get_urls(self._url_count)
                log.debug("number of urls fetched: %d" % len(urls))

                # Check whether the collector has finished
                if self._collector.is_finished():
                    break

                for url in urls:
                    for parser in self._parsers:
                        if parser.SITE_ID == url['site_id']:
                            try:
                                if url.get('retry_times', 0) > MAX_RETRY_TIMES:
                                    print('Exceeded max retry count, giving up on url = %s' % url['url'])
                                    base_parser.update_url(
                                        self._tab_urls, url['url'],
                                        Constance.EXCEPTION)
                                else:
                                    parser.parser(url)
                                # base_parser.update_url(self._tab_urls, url['url'], Constance.DONE)
                            except Exception as e:
                                log.error(
                                    '''
                                    -------------- parser error -------------
                                    parser name %s
                                    error       %s
                                    deal url    %s
                                    table       %s
                                    ''' %
                                    (parser.NAME, str(e), url, self._tab_urls))

                                base_parser.update_url(self._tab_urls,
                                                       url['url'],
                                                       Constance.EXCEPTION)
                            break

                time.sleep(self._interval)
            except Exception as e:
                log.debug(e)
def parser(url_info):
    # url  = 'http://user.xiaoyouzb.net/v3/vod/small_recommend?nwtime=1571816563&sign=883f96aee2655d8885e7815de3423df7&type=1&cateId=13&pageNum=0&isFirst=N&_u=edac2c15598946bd9ba7bda78a83489c&version=4.7.0&platform=android&appx=yuntu&apppn=org.fungo.fungolive&enterprise=0&channel=tencent&market=32&os_version=8.0.0&device_model=MIX%25202&device_code=780493075490198&udid=77e2cb72797f20afdcaaa6265872cea9&androidId=220240afd2e0e640&source=android'
    root_url = url_info['url']
    cname = url_info['remark']["category_name"]
    headers = {
        "User-Agent": "yuntutv/4.7.0 (Android 8.0.0)",
        "Host": "user.xiaoyouzb.net"
    }
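    # The API responds with a list under "data"; publishTime looks like a millisecond
    # timestamp, so the last three digits are dropped before converting to a date.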
    json_data = tools.get_json_by_requests(root_url, headers=headers)
    data_infos = json_data["data"]
    for data_info in data_infos:
        publishTime = data_info["publishTime"]
        release_time = tools.timestamp_to_date(str(publishTime)[:-3])
        title = data_info["content"]
        content = data_info["content"]
        video_url = data_info["videoUrl"]
        img_url = data_info["coverUrl"]
        base_parser.save_info('content_info', site_id=SITE_ID, url=video_url, title=title, site_name=NAME,
                              content=content, release_time=release_time, image_url=img_url,
                              video_url=video_url, is_out_link=1, download_image=False, is_debug=False,
                              )
    base_parser.update_url('urls', root_url, Constance.DONE)
def parser(url_info):
    root_url = url_info['url']
    para = url_info["remark"]["para_template"]
    headers = url_info["remark"]["header_template"]
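    # Each item under items..data is a JSON string, and its coverUrl / videoUrl fields
    # are JSON-encoded again, so they are decoded a second time below.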
    response = requests.get(root_url, params=para, headers=headers)
    time.sleep(2)
    json_info = response.json()
    cate = url_info["remark"]["cate_name"]
    data_jsons = jsonpath(json_info, "$..items..data")
    if cate != '':
        for data_info in data_jsons:
            data_json = json.loads(data_info)
            title = jsonpath(data_json, "$..title")[0]
            img_str = glom(data_json, "coverUrl")
            img_json = json.loads(img_str)
            img_url = img_json["L"][0]
            content = jsonpath(data_json, "$..summary")[0]
            updateTime = jsonpath(data_json, "$..updateTime")[0]
            video_str = glom(data_json, "videoUrl")
            video_json = json.loads(video_str)
            video_url = video_json["source"]["hd"]
            release_time = tools.timestamp_to_date(str(updateTime)[:-3])
            base_parser.save_info(
                'content_info',
                site_id=SITE_ID,
                url=video_url,
                title=title,
                site_name=NAME,
                content=content,
                release_time=release_time,
                image_url=img_url,
                video_url=video_url,
                is_out_link=1,
                download_image=False,
                is_debug=False,
            )

    base_parser.update_url('urls', root_url, Constance.DONE)
def add_root_url(parser_params = {}):
    log.debug('''
        Add root URLs
        parser_params : %s
        '''%str(parser_params))


    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    else:
        if not search_keyword1:
            search_keywords = search_keyword2
        if not search_keyword2:
            search_keywords = search_keyword1

    for search_keyword in search_keywords:
        if not search_keyword.strip():
            continue
        # Get the total page count
        url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=0' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = ['分页:1/(.*?)页'] # test: zero-page case
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)

        for page in range(0, page_count):
            url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=%d'%(search_keyword, page)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check that the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

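    # Collect in-site links matching FIT_URLS for further crawling before extracting
    # this page's article.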
    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)


    # Extract the article info on the current page
    # Title

    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="video_area">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="article_edit">'
              ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             '''%(depth+1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
def add_root_url(search_keyword1 = [], search_keyword2 = [], search_keyword3 = []):
    log.debug('''
        Add root URLs
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        '''%(str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {'search_keyword1': search_keyword1, 'search_keyword2': search_keyword2,
              'search_keyword3': search_keyword3}

    search_keyword = search_keyword1 + search_keyword2

    for j in search_keyword:
        if not j:
            continue
        for i in range(1, 109):
            url = 'https://m.weibo.cn/container/getIndex?type=all&queryVal=%s&luicode=10000011' % j + \
                  '&lfid=106003type%3D1&' + 'title=%s&containerid=100103type' % j + '%3D1%26q%3D' + '%s&' % j + \
                  'page=%d' % i
            url = tools.quote(url, safe='/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    data = tools.get_json_by_requests(root_url)
    data_info = data.get("returnData").get('news')
    for info in data_info:
        # print(info)
        url = info['url']
        release_time = info['publishDate']
        title = info['title']
        video_url = jsonpath.jsonpath(info['video'], '$..relativeUrl')[0]
        img_url = info['logo']

        if video_url != '':
            info_type = 1
        else:
            info_type = 2

        base_parser.save_info('content_info',
                              site_id=SITE_ID,
                              url=url,
                              title=title,
                              site_name=NAME,
                              content='',
                              release_time=release_time,
                              image_url=img_url,
                              video_url=video_url,
                              is_out_link=1,
                              download_image=False,
                              is_debug=False,
                              info_type=info_type)

    base_parser.update_url('urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check that the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html, STOP_URLS)

    urls = tools.fit_url(urls, "cctv.com")
    for url in urls:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info on the current page
    # Title

    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
def parser_episode_info(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    program_mongo_id = remark['program_mongo_id']

    episode_json = tools.get_json_by_requests(root_url)
    if not episode_json:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    code = episode_json.get('code')
    if code != 200:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    episode_data = episode_json.get('data', {})
    episode_info = episode_data.get('info', {})

    name = episode_info.get('title', '')
    url = episode_info.get('url', '')
    image_url = episode_info.get('thumb', '')
    episode_num = episode_info.get('series', '')
    summary = episode_info.get('desc', '')
    time_length = episode_info.get('duration', '')

    episode_download_url = episode_data.get('stream', [{'url':''}])[0].get('url')
    episode_download_url = 'http://disp.titan.mgtv.com' + episode_download_url

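    # The stream url under data.stream is relative; prefix the dispatcher host and
    # fetch it to resolve the real download url (returned in its "info" field).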
    episode_download_info = tools.get_json_by_requests(episode_download_url)
    if episode_download_info:
        episode_download_url = episode_download_info.get('info', '')
    else:
        episode_download_url = ''

    log.debug('''
        program_mongo_id     %s
        name                 %s
        url                  %s
        image_url            %s
        episode_num          %s
        summary              %s
        time_length          %s
        episode_download_url %s
        '''%(program_mongo_id, name, url, image_url, episode_num, summary, time_length, episode_download_url))

    base_parser.add_program_episode_info('PROGRAM_EPISODE_info', site_id, program_mongo_id, episode_num = episode_num, time_length = time_length, episode_name = name, download_status = '', download_url = episode_download_url, episode_url = url, summary = summary, image_url = image_url, sto_path = '')
    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def parser_episode_detail_url(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    program_mongo_id = remark['program_mongo_id']

    episode_json = tools.get_json_by_requests(root_url)
    if not episode_json:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    code = episode_json.get('code')
    if code != 200:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
        return

    episode_data = episode_json.get('data', {})
    # Parse the detail urls for each episode
    episode_list = episode_data.get('list', [])
    for episode in episode_list:
        episode_id = episode['video_id']
        episode_detail_url = 'http://pcweb.api.mgtv.com/player/video?video_id=' + episode_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth = 2, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : program_id})

    # Parse urls for other years and months
    episode_years = episode_data.get('tab_y', [])
    episode_months = episode_data.get('tab_m', [])
    for episode_year in episode_years:
        year = episode_year['t']
        temp_program_id = episode_year['id']
        episode_list_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s'%temp_program_id
        # Add url; without a month parameter the API defaults to the most recent month
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_list_url, depth = 1, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : temp_program_id})

    for episode_month in episode_months[1:]:  # skip the most recent month, already covered by the default url
        episode_month = episode_month['m']
        episode_list_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=%s&month=%s'%(program_id, episode_month)
        # Add url
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_list_url, depth = 1, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)