Example #1
    def get_download_url(url):
        html, r = tools.get_html_by_requests(url)

        tvid = re.compile('player-tvid="(\d{4,11})"').findall(str(html))
        if not tvid:
            tvid = re.compile('list-tvid="(\d{4,11})"').findall(str(html))
        # keep the last matched tvid; fall back to '' so the URL concatenation below cannot fail on a list
        tvid = tvid[-1] if tvid else ''

        album_id = ''.join(re.compile('player-albumid="(\d{4,11})"').findall(str(html)))
        if not album_id:
            album_id = ''.join(re.compile('list-albumid="(\d{4,11})"').findall(str(html)))
            if not album_id:
                album_id = ''.join(re.compile('albumId: ?(\d{4,11}),').findall(str(html)))
                if not album_id:
                    album_id = ''.join(re.compile('param\[\'albumId\'\] ?= ?"(\d{4,11})"').findall(str(html)))

        current_time = tools.get_current_timestamp() * 1000
        current_time = str(current_time)

        url = 'http://iface2.iqiyi.com/video/3.0/v_download?app_k=8e48946f144759d86a50075555fd5862&app_v=8.1&qyid=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&secure_p=iPhone&secure_v=1&dev_hw=%7B%22cpu%22:%22%22,%22mem%22:%222802%22%7D&net_sts=1&device_id=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&dev_os=10.2.1&dev_ua=iPhone9,2&net_ip=%7B%22country%22:%22%E4%B8%AD%E5%9B%BD%22,%22province%22:%22%E5%8C%97%E4%BA%AC%22,%22city%22:%22%E5%8C%97%E4%BA%AC%22,%22cc%22:%22%E5%9B%BD%E5%86%85%E5%85%B6%E4%BB%96%22,%22area%22:%22%E5%8D%8E%E5%8C%97%22,%22timeout%22:0,%22respcode%22:0%7D&album_id=' + album_id + '&tvid=' + tvid + '&req_times=1&play_core=0&platform_id=12&app_p=iphone&app_t=0&usr_res=16&ppid=1229289410&cookie=53igk5Vn7X1xpazWBjzW2HUN4XGjNSP4aQypF7affdnBUaC6rknOS4dzvIcU1pMm2m2Qfb&lang=zh_CN&app_lm=cn&pps=0&req_sn=' + current_time
        json_ = tools.get_json_by_requests(url, headers=DOWNLOAD_HEADER)

        try:
            video_download_url = ''.join(re.compile('\'1\': {(.+?)},').findall(str(json_)))
            video_download_url = ''.join(re.compile('\'url\': ?\'(.+?)\'').findall(str(video_download_url)))
            video_download_url, r = tools.get_html_by_requests(video_download_url)
            video_download_url = ''.join(re.compile('"l":"(.+?)"').findall(str(video_download_url)))
        except:
            video_download_url = ''
        return video_download_url
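
The album_id lookup above tries four regexes in turn, one nested `if not` per fallback. A minimal sketch of the same fallback idea written as a loop; `first_match` is a hypothetical helper, not part of the original `tools` module:

import re

def first_match(html, patterns, default=''):
    # Return the first capture group of the first pattern that matches, else the default.
    for pattern in patterns:
        found = re.findall(pattern, html)
        if found:
            return found[0]
    return default

# album_id = first_match(str(html), [
#     'player-albumid="(\d{4,11})"',
#     'list-albumid="(\d{4,11})"',
#     'albumId: ?(\d{4,11}),',
#     'param\[\'albumId\'\] ?= ?"(\d{4,11})"',
# ])
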
def parser_program(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse the page
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    regex = '<li class="v-item-v5.*?">(.*?)</li>'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="u-video" href="(.*?)"'
        program_url = tools.get_info(video_block, regex, fetch_one = True)
        program_id = program_url[program_url.find('b/') + 2 : program_url.rfind('/')]
        program_url = 'http://www.mgtv.com/h/%s.html'%program_id

        regex = '<img class="u-image" src="(.*?)"'
        image_url = tools.get_info(video_block, regex, fetch_one = True)

        regex = 'em class="u-time">(.*?)</em>'
        episode = tools.get_info(video_block, regex, fetch_one = True)

        regex = '<a class="u-title".*?>(.*?)</a>'
        title = tools.get_info(video_block, regex, fetch_one = True)

        regex = '<span class="u-desc">(.*?)</span>'
        actors_block = tools.get_info(video_block, regex, fetch_one = True)
        regex = '<a .*?>(.*?)</a>'
        actors = tools.get_info(actors_block, regex)
        actors = '/'.join(actors) if actors else '暂无'

        detail_html, r = tools.get_html_by_requests(program_url)
        regex = '<em class="label">简介.*?<span>(.*?)</span>'
        summary = tools.get_info(detail_html, regex, fetch_one = True) if detail_html else ''

        log.debug('''
            program_url %s
            image_url   %s
            episode     %s
            title       %s
            actors      %s
            summary     %s
            '''%(program_url, image_url, episode, title, actors, summary))

        program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, title, program_url, image_url, episode, directors = '', actors = actors, summary = summary, release_time = '')

        # Episode-list URL; without a month parameter it defaults to the most recent month
        episode_detail_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=' + program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth = 1, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
Example #3
def spider_gonggao():
    urls = [
        'http://www.sapprft.gov.cn/sapprft/channels/6588.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_2.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_3.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_4.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_5.shtml'
    ]

    count = 0

    for url in urls:
        html, res = tools.get_html_by_requests(url)

        links = tools.get_tag(html, 'a', {'class': 'fl'})
        release_times = tools.get_tag(html, 'span', {'class': 'fr'})

        for link_num in range(len(links)):
            title = links[link_num].get_text()
            link = links[link_num]['href']
            link = 'http://www.sapprft.gov.cn' + link
            release_time = release_times[link_num].get_text()
            link_html, res = tools.get_html_by_requests(link)
            content = tools.get_tag(link_html,
                                    'div', {'id': 'artibody'},
                                    find_all=False)
            content = content.get_text()

            content_info = {
                'title': title,
                'url': link,
                'release_time': release_time,
                'content': content
            }

            print(title + '    ' + release_time)

            key_map = {
                'id': 'vint_sequence.nextval',
                'title': 'str_title',
                'content': 'clob_content',
                'url': 'str_url',
                'release_time': 'date_release_time'
            }

            def export_callback(execute_type, sql, data_json):
                if execute_type == ExportData.EXCEPTION:
                    print('共导出 %s 条公告' % count)
                    exit()

            count += export_data.export_to_oracle(key_map=key_map,
                                                  aim_table='TAB_IOPM_notice',
                                                  unique_key='url',
                                                  datas=content_info,
                                                  callback=export_callback)

    print('共导出 %s 条公告' % count)
def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    _db = base_parser.MongoDB()
    _db.set_unique_key('PROGRAM_EPISODE_info', 'episode_url')
    _db.update('PROGRAM_urls', {
        'depth': 0,
        'site_id': SITE_ID
    }, {'status': 0},
               multi=True)

    urls_zongyi = [
        'http://list.iqiyi.com/www/6/-30279------------11-1-1-iqiyi--.html',
        'http://list.iqiyi.com/www/6/-30279------------11-2-1-iqiyi--.html',
        'http://list.iqiyi.com/www/6/-30279------------11-3-1-iqiyi--.html'
    ]
    for urls in urls_zongyi:
        html, res = tools.get_html_by_requests(urls)
        list_infos = tools.get_tag(html,
                                   'div', {'class': 'site-piclist_pic'},
                                   find_all=True)
        for list_info in list_infos:
            link = list_info.a['href']
            image_url = list_info.a.img['src']
            print(link + ' ' + image_url)
            base_parser.add_url('PROGRAM_urls',
                                SITE_ID,
                                link,
                                remark=image_url)

    urls_juji = 'http://www.iqiyi.com/kszt/iqiyizzj.html'
    html, res = tools.get_html_by_requests(urls_juji)

    list_part_A = tools.get_tag(html, 'div', {'class': 'pro-pic'})
    for i in list_part_A:
        url = i.a['href']
        image_url = i.a.img['src']
        print(url + ' ' + image_url)
        base_parser.add_url('PROGRAM_urls', SITE_ID, url, remark=image_url)

    list_part_B = tools.get_tag(html,
                                'div', {'class': 'partB'},
                                find_all=False)
    part_B_url = tools.get_info(list_part_B,
                                '<a href="([^>]*?)"><img.*?src="(.*?)"')
    for pb in part_B_url:
        base_parser.add_url('PROGRAM_urls', SITE_ID, pb[0], remark=pb[1])
def get_proxies():
    '''
    @summary: get a random proxy; requires a local IPProxyPool service to be running
    ---------
    @param :
    ---------
    @result: dict in the format expected by requests' `proxies` argument, or {} on failure
    '''

    try:
        proxies, r = tools.get_html_by_requests(
            'http://127.0.0.1:8000/?types=0&count=50')
        proxies = eval(proxies)
        proxie = random.choice(proxies)

        ip = proxie[0]
        port = proxie[1]

        return {
            'http': "http://{ip}:{port}".format(ip=ip, port=port),
            'https': "https://{ip}:{port}".format(ip=ip, port=port)
        }

    except:
        return {}
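
The dict returned above already matches the `proxies` argument format used by the requests library; a minimal usage sketch (the target URL is only a placeholder):

import requests

proxies = get_proxies()
try:
    # Route the request through the randomly chosen proxy; fall back to a
    # direct connection when get_proxies() returned an empty dict.
    response = requests.get('http://httpbin.org/ip', proxies=proxies or None, timeout=10)
    print(response.status_code, response.text)
except requests.RequestException as error:
    print('request failed:', error)
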
def parser_program_url(url_info):
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    classify = remark['classify']

    # Parse the page
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    program_blocks = tools.get_tag(html, 'li', {'class': "list_item"})
    for program_block in program_blocks:
        program_block = str(program_block)

        # Program URL
        regex = 'r-props="{id: \'(.*?)\''
        program_id = tools.get_info(program_block, regex, fetch_one=True)
        program_url = 'http://v.qq.com/detail/5/%s.html' % program_id
        base_parser.add_url("PROGRAM_urls",
                            site_id,
                            program_url,
                            depth=1,
                            remark={
                                'program_id': program_id,
                                'classify': classify
                            })

    base_parser.update_url("PROGRAM_urls", root_url, Constance.DONE)
Example #7
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        quote_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'http://www.soku.com/search_video/q_%s_orderby_2_limitdate_0?spm=a2h0k.8191407.0.0&site=14&' \
                  '_lg=10&page=%s' % (quote_keyword, page_index)
            log.debug('''
                处理: %s
                url : %s''' % (keyword, url))
            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div', {'class': 'v-thumb'})
            video_list_url = tools.get_tag(html, 'div', {'class': 'v-meta'})
            video_list_time = tools.get_tag(html, 'div',
                                            {'class': 'v-meta-data'})

            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_info(str(video_info),
                                           'src="(.+?)"',
                                           fetch_one=True)
                image_url = 'http:' + image_url
                print(image_url)
                title = tools.get_info(str(video_info),
                                       'alt="(.+?)"',
                                       fetch_one=True)
                print(title)
                url = tools.get_info(str(video_list_url[info_index]),
                                     'href="(.+?)"',
                                     fetch_one=True)
                url = 'http:' + url
                print(url)
                release_time = tools.get_info(str(
                    video_list_time[info_index * 2 + 1]),
                                              'lass="r">(.+?)<',
                                              fetch_one=True)
                release_time = get_release_time(release_time)
                print(release_time)

                is_continue = base_parser.save_video_info(
                    image_url=image_url,
                    url=url,
                    title=title,
                    release_time=release_time,
                    site_name=NAME)

                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
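
Several of the keyword crawlers in this listing use a `next_keyword` flag to break out of the page loop once `save_video_info` reports an already-seen item. A minimal sketch of the same control flow using `for`/`else` instead of a flag; `fetch_page` and `save_item` are caller-supplied stand-ins, not functions from this project:

def crawl(keywords, fetch_page, save_item, max_pages=10):
    # Walk result pages per keyword; abandon a keyword early once save_item returns False.
    for keyword in keywords:
        for page_index in range(1, max_pages):
            items = fetch_page(keyword, page_index)
            if not items:
                break              # empty result page: move on to the next keyword
            for item in items:
                if not save_item(item):
                    break          # already-seen item: stop paging this keyword
            else:
                continue           # every item saved: fetch the next page
            break                  # propagate the inner break up to the keyword loop
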
def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.lzy.edu.cn/"
    html, request = tools.get_html_by_requests(url)
    base_parser.add_url('op_urls', SITE_ID, url)
Example #9
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        print(keyword)
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 20):
            url = 'http://so.iqiyi.com/so/q_%s_ctg__t_0_page_%s_p_1_qc_0_rd__site__m_4_bitrate_' % (
                keyword, page_index)

            print(url)
            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'a',
                                             {'class': 'figure-180101'})
            video_list_time = tools.get_tag(html, 'div',
                                            {'class': 'result_info'})
            if not video_list_time:
                print('无视频列表  跳出')
                break

            for info_index, video_info in enumerate(video_list_time):
                try:
                    image_url = tools.get_info(str(
                        video_list_title[info_index]),
                                               'src="(.+?)"',
                                               fetch_one=True)
                    title = tools.get_info(str(video_list_title[info_index]),
                                           'title="(.+?)"',
                                           fetch_one=True)
                    url = tools.get_info(str(video_list_title[info_index]),
                                         'href="(.+?)"',
                                         fetch_one=True)
                    release_time = tools.get_tag(
                        video_info,
                        'em', {
                            'class': 'result_info_desc'
                        },
                        find_all=False).get_text()
                    is_continue = base_parser.save_video_info(
                        image_url=image_url,
                        url=url,
                        title=title,
                        release_time=release_time,
                        site_name=NAME)
                    if not is_continue:
                        next_keyword = True
                        break

                except Exception as e:
                    log.error(e)

            if next_keyword:
                break
def add_root_url(url, start, end):
    html, r = tools.get_html_by_requests(url)
    page_regex = '<div class="ssPages area">.*>(\d*?)</a>.*?<a title="下一页"'
    pages = tools.get_info(html, page_regex)
    pages = pages and pages[0] or ''
    if pages:
        pages = int(pages)
        for page in range(1, pages + 1):
            url = start + str(page) + end
            base_parser.add_url('PROGRAM_urls', SITE_ID, url)
Example #11
def add_root_url(parser_params = {}):
    log.debug('''
        添加根url
        parser_params : %s
        '''%str(parser_params))

    url = "http://www.luzhoutianli.com/"
    html, request = tools.get_html_by_requests(url)
    urls = tools.get_urls(html)
    for url in urls:
        base_parser.add_url('op_urls', SITE_ID, url)
Example #12
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'https://v.qq.com/x/search/?q=%s&filter=sort=1&&cur=%s' % (
                keyword, page_index)
            print(url)
            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div',
                                             {'class': 'result_item'})
            if not video_list_title:
                break
            for info_index, video_info in enumerate(video_list_title):
                try:
                    image_url = tools.get_tag(video_info,
                                              'img',
                                              find_all=False)['src']
                    image_url = 'http:' + image_url
                    title = tools.get_tag(video_info, 'h2',
                                          find_all=False).get_text()
                    print(title)
                    url = tools.get_tag(video_info, 'h2',
                                        find_all=False).a['href']
                    release_time = tools.get_tag(video_info,
                                                 'span', {
                                                     'class': 'content'
                                                 },
                                                 find_all=False).get_text()
                    print(release_time)
                    release_time = get_release_time(release_time)
                    print(release_time)
                except Exception as e:
                    log.error(e)
                    continue

                is_continue = base_parser.save_video_info(
                    image_url=image_url,
                    url=url,
                    title=title,
                    release_time=release_time,
                    site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
Example #13
def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    keywords = parser_params['keywords']

    for keyword in keywords:
        if keyword:
            url = 'http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=y&type=1&page=1&ie=utf8' % keyword
            if mongodb.find('WWA_wechat_account_url', {'url': url}):
                continue

            headers = {
                "Upgrade-Insecure-Requests": "1",
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Cookie":
                "wuid=AAGPF/32GQAAAAqLFD2BdAAAGwY=; CXID=A468F618D67D4868DC83E6061B1B3CCC; ABTEST=0|1500285612|v1; weixinIndexVisited=1; SUV=006317867B7CC4C5596C8AAD6B089707; SUIR=0A14ACB4D0CA9B50A8ABB33CD0CA69FA; ld=ekllllllll2BbH49lllllVOm1tylllll1kecBlllll9lllll9Zlll5@@@@@@@@@@; ad=AZllllllll2Bzw7GlllllVOeQA6lllll1kectkllll9lllllVqxlw@@@@@@@@@@@; SUID=72780CD23D148B0A59688B0C0002AD65; IPLOC=CN1100; sct=11; SNUID=B4B50E097177247B9A6BE55E72153425; JSESSIONID=aaaVCfkabuJQTfaNW5f1v",
                "Host": "weixin.sogou.com"
            }

            html, r = tools.get_html_by_requests(url, headers=headers)
            # Check whether any official account exists for this keyword
            not_page_tip = '/new/pc/images/bg_404_2.png'
            if not_page_tip in html:
                continue

            # Get the number of result pages
            regex = 'id="pagebar_container">.*>(\d*?)</a>.*?<a id="sogou_next"'
            page_num = tools.get_info(html, regex, fetch_one=True)
            page_num = int(page_num) if page_num else 1

            for page in range(1, page_num + 1):
                url = 'http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=y&type=1&page=%d&ie=utf8' % (
                    keyword, page)
                base_parser.add_url('WWA_wechat_account_url', SITE_ID, url)

            tools.delay_time()
Example #14
def spider_picture(p_url, end):
    for i in range(1,11):
        i = str(i)
        url = p_url+i+end
        html, r = tools.get_html_by_requests(url)
        regex = 'title=".*?".*?src = "(.*?)".*?<div class="wrapper-listTitle">'
        img_urls = tools.get_info(html, regex)
        regex_name = 'rseat="dsjp7".*?title="(.*?)".*?src = ".*?"'
        names = tools.get_info(html, regex_name)
        for img_url, name in zip(img_urls, names):
            name = tools.del_html_tag(name)
            #print(img_url, '---', name)
            FILE_LOCAL_PATH = 'd:'
            sto_path = '/picture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
Example #15
def add_root_urls(url):
    html, r = tools.get_html_by_requests(url)
    # print(html)
    regex = '<div class="site-piclist_pic">(.*?)</li>'
    html_infos = tools.get_info(html, regex)
    s = 0
    for info in html_infos:
        regex = 'href = "(.*?)" class="site-piclist_pic_link"'
        url = tools.get_info(info, regex)
        url = url and url[0] or ''
        regex = 'rseat="bigTitle.*?title="(.*?)"'
        name = tools.get_info(info, regex)
        name = name and name[0] or ''
        name = tools.del_html_tag(name)
        video_download_url = get_download_url(url)
        FILE_LOCAL_PATH = 'd:'
        sto_path = '/videos/' + name + '.mp4'
        tools.download_file(video_download_url, FILE_LOCAL_PATH, sto_path)
        print(video_download_url, name)
Example #16
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        keyword = tools.quote(keyword)  # quote once, before the page loop, to avoid double-encoding
        for page_index in range(1, 10):
            url = 'https://so.tv.sohu.com/mts?wd=%s&c=0&v=0&length=0&limit=0&site=0&o=3&p=%s&st=&suged=&filter=0' % \
                  (keyword, page_index)
            log.debug('处理 url = %s' % url)
            html, res = tools.get_html_by_requests(url)
            video_list_time = tools.get_tag(html, 'a', {'class': 'tcount'})
            video_list_title = tools.get_tag(html, 'div', {'class': 'pic170'})
            if not video_list_title:
                break
            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_tag(video_info, 'img',
                                          find_all=False)['src']
                image_url = 'http:' + image_url
                title = video_info.a['title']
                url = video_info.a['href']
                url = 'http:' + url
                release_time = video_list_time[info_index].get_text()
                print(release_time)
                release_time = get_release_time(release_time)
                print(release_time)

                is_continue = base_parser.save_video_info(
                    image_url=image_url,
                    url=url,
                    title=title,
                    release_time=release_time,
                    site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
def spider_picture(p_url, end):
    for i in range(1, 7):
        i = str(i)
        url = p_url + i + end
        #print(url)
        html, r = tools.get_html_by_requests(url)
        #print(html)
        regex = '<a class="figure.*?<img.*?src="(.*?)"/>'
        img_urls = tools.get_info(html, regex)

        regex_name = 'data-widget-searchlist-tvname="(.*?)"'
        names = tools.get_info(html, regex_name)
        for img_url, name in zip(img_urls, names):
            name = tools.del_html_tag(name)
            # if not re.match(".jpg", img_url):
            #     img_url = img_url + '.jpg'
            #print(img_url, '---', name)
            FILE_LOCAL_PATH = 'd:'
            sto_path = '/ViolatePicture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
Example #18
    def check_remote_tag(self):
        '''
        @summary: check the version of the remote code
        ---------
        ---------
        @result: True / False (update needed / no update needed)
        '''
        # Load the tag from the last sync
        log.info('检查版本更新:%s'%self._project_name)
        per_tag = self.__get_per_tag()

        html, r = tools.get_html_by_requests(self._remote_url)
        regex = '<span class="tag-name">(.*?)</span>'
        current_tag = tools.get_info(html, regex, fetch_one = True)

        if current_tag > per_tag:
            self._tag = current_tag
            self._remote_zip_url = self._remote_url.replace('releases', 'archive/{tag}.zip'.format(tag = current_tag))
            self._zip_path = tools.join_path(self._local_save_path, self._project_name + '-' + self._tag + '.zip')
            self._unpack_path = tools.join_path(self._local_save_path, self._project_name + '-' + self._tag)
            log.info('''
                项目 :   %s
                本地版本:%s
                同步版本:%s
                版本地址:%s
                正在同步 ...
                '''%(self._project_name, per_tag, current_tag, self._remote_zip_url))
            return True
        else:
            log.info('''
                项目 :   %s
                本地版本:%s
                同步版本:%s
                版本一致 不需要同步。
                '''%(self._project_name, per_tag, current_tag))

            return False
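
One caveat in the check above: `current_tag > per_tag` compares the tag strings lexicographically, so a tag like '1.10' sorts below '1.9'. A minimal sketch of a numeric comparison, assuming tags look like dotted version numbers:

def version_tuple(tag):
    # Turn a dotted tag such as 'v1.10.2' into a tuple of ints for comparison.
    return tuple(int(part) for part in tag.lstrip('v').split('.') if part.isdigit())

# version_tuple('1.10') > version_tuple('1.9')  ->  True
# '1.10' > '1.9'                                ->  False (plain string comparison)
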
Example #19
    def get_biz(self, account_id='', account=''):
        '''
        @summary: get the __biz parameter of an official account
        ---------
        @param account_id:
        @param account:
        ---------
        @result:
        '''
        account_block = self.__get_account_blocks(account_id, account)
        if account_block == constance.VERIFICATION_CODE:
            return constance.VERIFICATION_CODE

        keyword = account_id or account

        regex = '<a.*?account_name.*?>(.*?)</a>'
        account = tools.get_info(account_block, regex, fetch_one=True)
        account = tools.del_html_tag(account)

        regex = '<label name="em_weixinhao">(.*?)</label>'
        account_id = tools.get_info(account_block, regex, fetch_one=True)

        regex = '<a.*?account_name.*?href="(.*?)">'
        account_url = tools.get_info(account_block, regex, fetch_one=True)
        account_url = account_url.replace('&amp;', "&")

        # Extract __biz
        headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language":
            "zh-CN,zh;q=0.8",
            "Host":
            "mp.weixin.qq.com",
            "Connection":
            "keep-alive",
            "Referer":
            "http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_="
            % keyword,
            "Cookie":
            account_url,
            "Accept-Encoding":
            "gzip, deflate, br",
            "Cache-Control":
            "max-age=0",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
            "Upgrade-Insecure-Requests":
            "1"
        }

        # proxies = ip_proxies.get_proxies()
        # headers["User-Agent"] = ip_proxies.get_user_agent()

        html, request = tools.get_html_by_requests(
            account_url)  #, proxies = proxies)
        regex = '<div class="weui_cells_tips">(.*?)</div>'
        check_info = tools.get_info(html, regex, fetch_one=True)
        if check_info:
            log.debug('''取公众号文章页 : %s
                         url : %s
                      ''' % (check_info, account_url))
            return ''

        regex = 'var biz = "(.*?)"'

        __biz = tools.get_info(html, regex, fetch_one=True)

        log.debug('''
            公众号名称          %s
            公众号账号          %s
            账号url             %s
            __biz               %s
            ''' % (account, account_id, account_url, __biz))

        return __biz
Example #20
    def __get_account_blocks(self, account_id='', account=''):
        keyword = account_id or account  # the account id takes priority

        log.debug('search keywords ' + keyword)

        cookie = self._sogou_cookies_manager.get_cookie()

        headers = {
            "Upgrade-Insecure-Requests": "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Cookie": cookie[1] if cookie else
            "ABTEST=5|1518054397|v1; SNUID=EAEB52552E2B4B87BB3903692F2AC2DE; IPLOC=CN1100; SUID=C5C47C7B6E2F940A000000005A7BABFD; JSESSIONID=aaa2WHQuoILPuc70EEQfw; SUID=C5C47C7B2313940A000000005A7BABFE; SUV=00BC2C447B7CC4C55A7BABFE845F5410",
            "Host": "weixin.sogou.com"
        }

        proxies = ip_proxies.get_proxies()
        headers["User-Agent"] = ip_proxies.get_user_agent()

        url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_=' % (
            keyword)
        html, request = tools.get_html_by_requests(
            url, headers=headers)  #, proxies = proxies)

        # Official-account info blocks
        regex = '<!-- a -->(.*?)<!-- z -->'
        account_blocks = tools.get_info(html, regex)

        regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
        check_info = tools.get_info(html, regex, fetch_one=True)
        if check_info:
            log.debug('''取公众号列表 : %s
                         url : %s
                      ''' % (check_info, url))

            self._sogou_cookies_manager.set_cookie_un_available(cookie)
            self._sogou_cookies_manager.monitor_cookies()

            # return constance.VERIFICATION_CODE
        else:
            self._sogou_cookies_manager.set_cookie_available(cookie)

        for account_block in account_blocks:
            regex = '<a.*?account_name.*?>(.*?)</a>'
            account = tools.get_info(account_block, regex, fetch_one=True)
            account = tools.del_html_tag(account)

            regex = '<label name="em_weixinhao">(.*?)</label>'
            account_id = tools.get_info(account_block, regex, fetch_one=True)

            if account.lower() == keyword.lower() or account_id.lower(
            ) == keyword.lower():
                return account_block
        else:
            return ''
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("&#xD;&#xA", url):
            regex = '.*?(/GovPublicInfo.+?000)'
            new_url = tools.get_info(url, regex)
            new_url = new_url[0]
            new_url = 'http://www.luzhou.gov.cn' + new_url
        else:
            new_url = 'http://www.luzhou.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # Title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # Article source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # View count
    regexs = '<span>点击数.*?src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.luzhou.gov.cn' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)

    # # Parse the page
    # html, request = tools.get_html_by_requests(root_url)
    # if not html:
    #     base_parser.update_url('urls', root_url, Constance.EXCEPTION)


if __name__ == '__main__':
    depth = 1
    url = 'http://www.lzzjw.com/List.asp?ID=13781'
    html, r = tools.get_html_by_requests(url, code='gb2312')
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # Article source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
Example #23
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("/", url):
            new_url = 'http://www.naxi.gov.cn' + url
        else:
            new_url = 'http://www.naxi.gov.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # Title

    regexs = '<DIV class=news_conent_two_title>(.*?)</DIV>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = '<SPAN>日期:(.*?)</SPAN>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # Article source
    regexs = '<SPAN>来源:(.*?)</SPAN>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # View count
    regexs = '<SPAN>点击数:(\d*?)</SPAN>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = [
        '<DIV id=news_conent_two_text class=news_conent_two_text>(.*?)</DIV>'
    ]
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')

    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(source_url, code=code)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scpolicec.edu.cn' + url
        else:
            new_url = 'http://www.scpolicec.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)


    # Article info on the current page
    # Title

    regexs = ['<div class="main_title">(.*?)<div class="top_about">', '<h1>(.*?)</h1>', '<title>(.*?)</title>',
              '<div class="contentPageTitle">(.*?)</div>']
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>','<small>时间:</small>(.*?)<small>',
              '<h2><span>更新时间:(.*?)</span>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '</a> 发布时间:(.*?) 点击数'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        release_time = tools.format_date(release_time)


    # Author
    regexs = ['作者:(.*?) 【']
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    #author = tools.del_html_tag(author)

    # Article source
    regexs = '来源:(.*?)</a>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # View count
    regexs = ['浏览:<font id="hits">(\d*?)</font>次', '点击数:(\d*?)&#xA;发表时间']
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<p style="text-align: center;">(.*?)</table>',
              '<div class="contentPageContent">(.*?)</table>',
              '<div id="endtext" style="width:900px;">(.*?)<div id="pages"></div>',
              '<div id="articleContnet">(.*?)<div class="page_css">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                    depth               = %s
                    url                 = %s
                    title               = %s
                    release_time        = %s
                    author              = %s
                    origin              = %s
                    watched_count       = %s
                    content             = %s
                 ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id,url=source_url, title=title, release_time=release_time, author=author,
                                origin=origin, watched_count=watched_count, content=content)
    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)

if __name__ == '__main__':
    # depth=1
    url = "http://scjyzsjy.ncss.org.cn/job/index"
    html, request = tools.get_html_by_requests(url, code='gb2312')

    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(url, code=code)
    print(code)

    regexs = '<div class="main_title">(.*?)<div class="top_about">'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>']
    release_time = tools.get_info(html, regexs)
Example #26
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='GBK')
    episode_list = 'var url = "(.*?)"'
    episode_list_json = tools.get_info(html, episode_list)
    episode_list_json = episode_list_json and episode_list_json[0] or ''

    episode_list_json_url = episode_list_json + '&cb=jsonp' + str(
        int(time.time()))
    episode_list_json_url = episode_list_json_url.replace("\\", "")
    #print(episode_list_json_url)
    # base_parser.add_url('PROGRAM_urls', site_id, url, depth+1)

    # Get the type

    # Title
    regexs_program_name = '<meta name="keywords" content="(.*?)" />'
    program_name = tools.get_info(html, regexs_program_name)
    program_name = program_name and program_name[0] or ''

    program_url = source_url

    episode_list_json_html, r = tools.get_html_by_requests(
        episode_list_json_url)

    regexs = 'jsonp\d*?\((.*)\)'
    episode_list_json = tools.get_info(episode_list_json_html, regexs)
    episode_list_json = episode_list_json and episode_list_json[0] or ''
    episode_list_json = tools.dumps_json(episode_list_json)

    episode_list_json_value_list = tools.get_json_value(
        episode_list_json, 'data.list')

    episode = len(episode_list_json_value_list)

    summary = ''

    log.debug('''
                    depth                       = %s
                    program_name                = %s
                    program_url                 = %s
                    episode                     = %s
                    summary                     = %s

                 ''' % (depth, program_name, program_url, episode, summary))

    program_id = base_parser.add_program_info('PROGRAM_info',
                                              site_id,
                                              program_name,
                                              program_url,
                                              image_url='',
                                              episode=episode,
                                              directors='',
                                              actors='',
                                              summary=summary,
                                              release_time='')

    for episode_info in episode_list_json_value_list:
        episode_name = tools.get_json_value(episode_info, 'title')

        episode_image_url = tools.get_json_value(episode_info, 'picurl')

        episode_url = tools.get_json_value(episode_info, 'podurl')

        episode_summary = tools.get_json_value(episode_info, 'desc')

        episode_num = tools.get_json_value(episode_info, 'title')

        episode_num_regex = '第(\d*?)期'
        episode_num = tools.get_info(episode_num, episode_num_regex)
        episode_num = episode_num and episode_num[0] or ''
        if episode_num:
            episode_num = '第' + episode_num + '期'

        download_url_json_str = tools.get_json_value(episode_info, 'vid')

        download_url_json_url = 'http://v.ku6.com/fetchVideo4Player/' + download_url_json_str + '.html'
        download_url_json = tools.get_json_by_requests(download_url_json_url)
        download_url = tools.get_json_value(download_url_json, 'data.f')

        download_status = 102
        time_length = ''

        if download_url:
            #     sto_path = '/video/' + program_name + '.mp4'
            #     is_download = tools.download_file(download_url, FILE_LOCAL_PATH, sto_path)
            #     download_status = 101 if is_download else 102
            log.debug('''
                                depth                       = %s
                                episode_num                 = %s
                                time_length                 = %s
                                episode_name                = %s
                                episode_url                 = %s
                                download_url                = %s
                                episode_summary             = %s
                                episode_image_url           = %s

                             ''' % (depth + 1, episode_num, time_length,
                                    episode_name, episode_url, download_url,
                                    episode_summary, episode_image_url))
            base_parser.add_program_episode_info(
                'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                time_length, episode_name, download_status, download_url,
                episode_url, episode_summary, episode_image_url, '')

    # Mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
Example #27
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    user_id = url_info['remark']['user_id']
    head_url = url_info['remark']['head_url']
    user_name = url_info['remark']['user_name']
    gender = url_info['remark']['gender']
    program_id = url_info['remark']['program_id']

    page_count = 50
    is_continue = True

    for i in range(0, page_count + 1):
        if not is_continue: break

        weibo_content_url = root_url + '&page=%d' % i

        headers = {
            "Cache-Control":
            "max-age=0",
            "Cookie":
            "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language":
            "zh-CN,zh;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host":
            "m.weibo.cn",
            "Accept-Encoding":
            "gzip, deflate, br",
            "Upgrade-Insecure-Requests":
            "1",
            "Connection":
            "keep-alive",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        html = tools.get_json_by_requests(weibo_content_url, headers=headers)

        cards = tools.get_json_value(html, 'data.cards')
        if len(cards) < 2:
            base_parser.update_url('mms_urls', root_url, Constance.DONE)
            return

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')
            article_id = tools.get_json_value(mblog, 'id')
            article_url = 'https://m.weibo.cn/status/' + article_id

            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie":
                "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            origin_html, r = tools.get_html_by_requests(url, headers=headers)
            if not origin_html:
                continue

            # For the exact time (down to the second) you would need to parse article_url
            release_time = mblog['created_at']
            release_time = tools.format_time(release_time)
            # release_time = get_release_time(mblog)
            release_time = tools.format_date(release_time)

            come_from = tools.get_json_value(mblog, 'source')
            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"',
                                                              '').replace(
                                                                  '\\n', '')
            if image_url:
                image_url = image_url.split(',')
                for i in range(len(image_url)):
                    image_url[i] = 'http://wx2.sinaimg.cn/large/' + image_url[
                        i] + '.jpg'

                image_url = ','.join(image_url)

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))
            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')
            comments_count = tools.get_json_value(mblog, 'comments_count')

            log.debug('''
                原文地址:     %s
                博主ID:       %s
                文章id         %s
                发布时间:     %s
                来自:         %s
                内容:         %s
                图片地址:     %s
                视频地址:     %s
                评论数:       %s
                转发数:       %s
                点赞数:       %s
                ''' % (article_url, user_id, article_id, release_time,
                       come_from, content, image_url, video_url,
                       comments_count, transpond_count, praise_count))

            if self_base_parser.add_article(article_id,
                                            head_url,
                                            user_name,
                                            release_time,
                                            None,
                                            content,
                                            image_url,
                                            None,
                                            praise_count,
                                            comments_count,
                                            program_id=program_id,
                                            gender=gender,
                                            url=article_url,
                                            info_type=1,
                                            emotion=random.randint(0, 2),
                                            collect=0,
                                            source='新浪微博'):

                if comments_count > 0:
                    parser_comment(article_id)
            else:
                is_continue = False
                break

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
Example #28
             ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title, release_time=release_time, author=author,
                                origin=origin, watched_count=watched_count, content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)

    # # Parse the page
    # html, request = tools.get_html_by_requests(root_url)
    # if not html:
    #     base_parser.update_url('urls', root_url, Constance.EXCEPTION)
if __name__ == '__main__':
    url = "http://www.luzhoutianli.com/luzhotuianli/item_14864969_732306.html"
    html,request = tools.get_html_by_requests(url)
    print(html)
    regexs = '<strong class="NameTxt"><a>(.*?)</a></strong>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    #title = tools.del_html_tag(title)
    print(title)
    #urls = tools.get_urls(html)
    #print(urls)
    # for url in urls:
    #     print(url)
        #base_parser.add_url('article_urls', SITE_ID, url)



def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, requests = tools.get_html_by_requests(root_url, headers=HEADER)
    titles = tools.get_tag(
        html, 'div', {'id': tools.re.compile('id_cse_content_item_mid_.')})

    for i in range(0, len(titles)):
        try:
            url = tools.get_tag(titles[i].previous_sibling.previous_sibling,
                                'a',
                                find_all=False)
            url = url['href']

            html2 = tools.get_html_by_urllib(url)
            regexs = ['<title>(.+?)</title>']
            mark = ''.join(tools.get_info(html2, regexs))
            regexs = ['不存在', '取消']
            if not tools.get_info(mark, regexs):
                title = tools.get_text(
                    titles[i].previous_sibling.previous_sibling)
                title = tools.del_html_tag(title)
                info = tools.get_text(titles[i])

                file_name = tools.del_html_tag(''.join(
                    tools.get_info(info, '文件名:(.+?)文')))

                file_size = tools.del_html_tag(''.join(
                    tools.get_info(info, '文件大小:(.+?)分')))

                author = tools.del_html_tag(''.join(
                    tools.get_info(info, '分享者:(.+?)时')))

                release_time = ''.join(tools.get_info(info,
                                                      '时间:(.+?)下')).replace(
                                                          '\n', '')

                download_count = tools.del_html_tag(''.join(
                    tools.get_info(info, '下载次数:(.+?)\.')))

                log.debug('''
                    标题:    %s
                    文件大小:%s
                    文件名字:%s
                    作者:    %s
                    原文url: %s
                    下载数量:%s
                    日期:    %s
                       ''' % (title, file_size, file_name, author, url,
                              download_count, release_time))

                contained_key, contained_key_count = base_parser.get_contained_key(
                    title, '', remark['search_keyword1'],
                    remark['search_keyword2'], remark['search_keyword3'])
                if not contained_key:
                    continue

                base_parser.add_content_info('VA_content_info',
                                             SITE_ID,
                                             url,
                                             title,
                                             file_size=file_size,
                                             file_name=file_name,
                                             author=author,
                                             release_time=release_time,
                                             download_count=download_count,
                                             search_type=search_type,
                                             keyword=contained_key,
                                             keyword_count=contained_key_count,
                                             task_id=remark['task_id'])
        except:
            continue
    base_parser.update_url('VA_urls', root_url, Constance.DONE)
Example #30
if __name__ == '__main__':
    db.gonggao_content.ensure_index('url', unique=True)
    export_data = ExportData()

    urls = [
        'http://www.sapprft.gov.cn/sapprft/channels/6588.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_2.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_3.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_4.shtml',
        'http://www.sapprft.gov.cn/sapprft/channels/6588_5.shtml'
    ]

    count = 0

    for url in urls:
        html, res = tools.get_html_by_requests(url)

        links = tools.get_tag(html, 'a', {'class': 'fl'})
        release_times = tools.get_tag(html, 'span', {'class': 'fr'})

        for link_num in range(len(links)):
            title = links[link_num].get_text()
            link = links[link_num]['href']
            link = 'http://www.sapprft.gov.cn' + link
            release_time = release_times[link_num].get_text()
            link_html, res = tools.get_html_by_requests(link)
            content = tools.get_tag(link_html,
                                    'div', {'id': 'artibody'},
                                    find_all=False)
            content = content.get_text()