def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
    ''' % str(parser_params))

    _db = base_parser.MongoDB()
    _db.set_unique_key('PROGRAM_EPISODE_info', 'episode_url')
    _db.update('PROGRAM_urls', {'depth': 0, 'site_id': SITE_ID}, {'status': 0}, multi=True)

    for page_num in range(1, 14):
        urls = [
            'http://list.youku.com/category/show/c_85_g_热门网综_s_1_d_1_p_%d.html' % page_num,
            'http://list.youku.com/category/show/c_97_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
            'http://list.youku.com/category/show/c_96_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
        ]
        for url in urls:
            print(url)
            print('********************************************************')
            html = tools.get_html_by_urllib(url)
            if tools.get_info(html, ['小酷没有筛选到相关视频']):
                continue

            links = tools.get_tag(html, 'div', {'class': 'p-thumb'})
            for link in links:
                link_html = ''  # pre-bind so the except branch below cannot hit an unbound name
                try:
                    link = link.a['href']
                    link = tools.get_full_url('http:', link)
                    link_html = tools.get_html_by_urllib(link)
                    link = tools.get_tag(link_html, 'a', {'class': 'desc-link'}, find_all=False)
                    link = link['href']
                    link = tools.get_full_url('http:', link)
                    base_parser.add_url('PROGRAM_urls', SITE_ID, link, depth=0)
                except Exception as e:
                    log.error(e)
                    print(link_html)
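
# A minimal sketch (an assumption, not the project's real tools module) of the
# URL joining add_root_url relies on: Youku list pages use scheme-relative
# links such as //v.youku.com/..., so resolving them against 'http:' yields a
# complete URL. The function name here is hypothetical.
from urllib.parse import urljoin

def get_full_url_sketch(base, url):
    """E.g. get_full_url_sketch('http:', '//v.youku.com/x') -> 'http://v.youku.com/x'."""
    return urljoin(base, url)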

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset', 0)  # default to 0 so the pagination step below cannot add 50 to None

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return

    for header in headers:
        # "view more related news" link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one=True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth=1, remark={'offset': 0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain = ''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
            ''' % (uuid, title, author, release_time, website_domain, url, '...'))

            # store to DB
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time,
                                                                website_name, website_domain,
                                                                website_position, url, content)
                if not is_continue:
                    break
    else:
        # loop finished without break: the whole page was stored, so queue the next page
        offset += 50
        url = tools.replace_str(root_url, r'pn=\d*', 'pn=%d' % offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth=0, remark={'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
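
# Hedged sketch of the pagination step above: tools.replace_str is assumed to
# behave like re.sub, rewriting the pn= query parameter so the next request
# fetches the following page of Baidu News results. The name is illustrative.
import re

def next_page_url_sketch(url, offset):
    # e.g. '...&pn=0' with offset 50 -> '...&pn=50'
    return re.sub(r'pn=\d*', 'pn=%d' % offset, url)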

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)

    title = '<tr height="25"><td><a href=".*?" title="(.*?)"'
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?) ']
    file_size = ['资料大小: </span>(.*?) ']
    download_count = ['下载次数: </span>(.*?) ']

    titles = tools.get_info(html, title, allow_repeat=True)
    video_urls = tools.get_info(html, video_url, allow_repeat=True)
    authors = tools.get_info(html, author, allow_repeat=True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat=True)
    file_sizes = tools.get_info(html, file_size, allow_repeat=True)
    download_counts = tools.get_info(html, download_count, allow_repeat=True)

    for i in range(len(titles)):
        title = titles[i]
        title = tools.del_html_tag(title)
        video_url = video_urls[i]
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_url)
        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            标题:     %s
            视频地址: %s
            作者:     %s
            观看数    %s
            资料大小  %s
            下载次数  %s
        ''' % (title, video_url, author, watched_count, file_size, download_count))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title,
                                     file_size=file_size, file_name=title, author=author,
                                     watched_count=watched_count, download_count=download_count,
                                     search_type=search_type, keyword=contained_key,
                                     keyword_count=contained_key_count, task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
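
# The loop above walks six parallel regex-result lists by index; if any one
# pattern misses a row, the lists drift out of alignment and indexing can raise
# IndexError. A defensive variant (a sketch, not the original code):
def iter_rows_sketch(titles, video_urls, authors, watched_counts, file_sizes, download_counts):
    # zip stops at the shortest list, pairing only rows that matched everywhere
    return zip(titles, video_urls, authors, watched_counts, file_sizes, download_counts)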

def add_html_url(html, depth, spider_depth, website_url, website_name, website_domain, remark):
    # collect next-depth URLs while still above the crawl depth limit
    if depth < spider_depth - 1:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url(SITE_ID, url, depth + 1, remark=remark)
            elif website_domain in url:
                base_parser.add_url(SITE_ID, url, depth + 1, remark=remark)
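
# Sketch of the link discovery add_html_url depends on: tools.get_urls is
# assumed to pull every href out of the page. Both the name and the regex are
# illustrative, not the project's actual implementation.
import re

def get_urls_sketch(html):
    return re.findall(r'''href=["'](.*?)["']''', html)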

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    json = tools.get_json_by_requests(root_url)

    # anchor (streamer) info
    lives = tools.get_json_value(json, 'lives')
    # print(tools.dumps_json(lives))
    for live in lives:
        name = tools.get_json_value(live, 'creator.nick')
        image_url = tools.get_json_value(live, 'creator.portrait')
        image_url = tools.get_full_url('http://img2.inke.cn', image_url)
        room_id = tools.get_json_value(live, 'creator.id')
        room_url = tools.get_json_value(live, 'share_addr')
        video_path = tools.get_json_value(live, 'stream_addr')
        watched_count = tools.get_json_value(live, 'online_users')
        address = tools.get_json_value(live, 'city')

        # follower count
        params = {
            'lc': '0000000000000048',
            'cc': 'TG0001',
            'cv': 'IK3.8.60_Iphone',
            'proto': 7,
            'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
            'idfv': '5779214D-BC8F-446E-A547-913048F7F935',
            'devi': '0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7',
            'osversion': 'ios_10.200000',
            'ua': 'iPhone9_2',
            'imei': '',
            'imsi': '',
            'uid': 207821358,
            'sid': '20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1',
            'conn': 'wifi',
            'mtid': '987c70ecbcd643998ea6bcd3b8868934',
            'mtxid': 'b0958e29253f',
            'logid': 133,
            'id': room_id,
            's_sg': S_SG,
            's_sc': 100,
            's_st': CURRENT_TIMESTAMP
        }
        fans_json = tools.get_json_by_requests(
            'http://120.55.238.158/api/user/relation/numrelations', params)
        fans_count = tools.get_json_value(fans_json, 'num_followers')

        # request address for the audience count
        params = {
            'lc': '0000000000000048',
            'cc': 'TG0001',
            'cv': 'IK3.8.60_Iphone',
            'proto': 7,
            'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
            'idfv': '5779214D-BC8F-446E-A547-913048F7F935',
            'devi': '0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7',
            'osversion': 'ios_10.200000',
            'ua': 'iPhone9_2',
            'imei': '',
            'imsi': '',
            'uid': 207821358,
            'sid': '20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1',
            'conn': 'wifi',
            'mtid': '987c70ecbcd643998ea6bcd3b8868934',
            'mtxid': 'b0958e29253f',
            'logid': 133,
            'id': tools.get_json_value(live, 'id'),
            'multiaddr': 1,
            's_sg': S_SG,
            's_sc': 100,
            's_st': CURRENT_TIMESTAMP
        }
        watched_count_url = 'http://120.55.238.158/api/live/infos'
        # sample of the final URL:
        # ?lc=0000000000000048&cc=TG0001&cv=IK3.8.60_Iphone&proto=7&idfa=D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB&idfv=5779214D-BC8F-446E-A547-913048F7F935&devi=0a4392f06ab0ff10b44c6f88d95bf4d6db67f0e7&osversion=ios_10.200000&ua=iPhone9_2&imei=&imsi=&uid=207821358&sid=20RUXGrYPxpJy75btYQYlVp6lYxi0wj1xV50Ttnls6ty3DcXE5i1&conn=wifi&mtid=987c70ecbcd643998ea6bcd3b8868934&mtxid=b0958e29253f&logid=133&id=1487572239333810%2C1487572432485069%2C1487572763094071%2C1487573160678176%2C1487571635332280&multiaddr=1&s_sg=c3493ab9d9b2e19cfc20f98bb75ff72f&s_sc=100&s_st=1487573119
        watched_count_url = tools.joint_url(watched_count_url, params)
        live_info = tools.get_json_by_requests(watched_count_url)

        sex = live_info['lives'][0]['creator']['sex']
        sex = 0 if sex == '1' else 1  # DB convention: 0 = male, 1 = female; Inke uses 1 for male, 0 and 3 for female
        age = ''

        log.debug('''
            名字:       %s
            贴图:       %s
            主播id:     %s
            房间url:    %s
            视频流地址: %s
            观看数:     %s
            地址:       %s
            粉丝数:     %s
            性别:       %s
            年龄:       %s
            观众数url:  %s
        ''' % (name, image_url, room_id, room_url, video_path, watched_count,
               address, fans_count, sex, age, watched_count_url))

        base_parser.add_anchor_info('LiveApp_anchor_info', SITE_ID, name=name, image_url=image_url,
                                    room_id=room_id, room_url=room_url, video_path=video_path,
                                    watched_count=watched_count, address=address, fans_count=fans_count,
                                    sex=sex, age=age, watched_count_url=watched_count_url)

    base_parser.update_url('LiveApp_urls', root_url, Constance.DONE)
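
# Sketch of the URL assembly assumed above: tools.joint_url is taken to append
# the params dict to the endpoint as an encoded query string (the helper name
# is hypothetical).
from urllib.parse import urlencode

def joint_url_sketch(url, params):
    separator = '&' if '?' in url else '?'
    return url + separator + urlencode(params)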

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    website_name = remark['website_name']
    website_position = remark['website_position']
    website_url = remark['website_url']
    website_domain = remark['website_domain']

    html = tools.get_html(root_url)
    if not html:
        base_parser.update_url('news_urls', root_url, Constance.EXCEPTION)
        return

    # collect next-depth URLs while still above the crawl depth limit
    if depth < DEPTH:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url('news_urls', SITE_ID, url, depth + 1, remark=remark)
            elif website_domain in url:
                base_parser.add_url('news_urls', SITE_ID, url, depth + 1, remark=remark)

    # parse the page itself
    content = title = release_time = author = ''
    article_extractor = ArticleExtractor(root_url, html)
    content = article_extractor.get_content()
    if content:
        title = article_extractor.get_title()
        release_time = article_extractor.get_release_time()
        author = article_extractor.get_author()
        uuid = tools.get_uuid(title, website_domain) if title != website_name else tools.get_uuid(root_url, ' ')

        log.debug('''
            uuid         %s
            title        %s
            author       %s
            release_time %s
            website_name %s
            domain       %s
            position     %s
            url          %s
            content      %s
        ''' % (uuid, title, author, release_time, website_name, website_domain,
               website_position, root_url, content))

        if tools.is_have_chinese(content):
            # store to DB
            self_base_parser.add_news_acticle(uuid, title, author, release_time, website_name,
                                              website_domain, website_position, root_url, content)

    log.debug('%s 处理完成' % root_url)
    base_parser.update_url('news_urls', root_url, Constance.DONE)
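
# The dedup key above is assumed to be a deterministic UUID derived from the
# two strings, so re-crawling the same title on the same domain maps to the
# same record. A plausible sketch (an assumption; the real tools.get_uuid may differ):
import uuid as _uuid

def get_uuid_sketch(part1, part2):
    return str(_uuid.uuid5(_uuid.NAMESPACE_URL, part1 + part2))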

def parser_program_info(url_info):
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    classify = remark['classify']

    # fetch and parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    # title
    regex = '<h1 class="video_title_cn" >.*?>(.*?)<'
    program_name = tools.get_info(html, regex, fetch_one=True)

    # program URL
    program_url = root_url

    # poster image URL
    regex = '<img src="(.*?)".*?_stat="info:poster"/>'
    image_url = tools.get_info(html, regex, fetch_one=True)
    image_url = tools.get_full_url('http://', image_url)

    # episode count
    regex = ['更新期数:</span>.*?>(.*?)</span>', '总集数:</span>.*?>(.*?)</span>']
    episode = tools.get_info(html, regex, fetch_one=True)

    # directors
    regex = '<span class="director">导演</span>.*?info:actor_name">(.*?)<'
    directors = tools.get_info(html, regex, split='/')

    # actors (the match also captures directors, stripped below)
    regex = '_stat="info:actor_name">(.*?)</span>'
    actors = tools.get_info(html, regex, split='/')
    actors = actors.replace(directors + '/', '') if directors else actors

    # release time
    regex = ['首播时间:</span>.*?>(.*?)<', '出品时间:</span>.*?>(.*?)<']
    release_time = tools.get_info(html, regex, fetch_one=True)

    # summary
    regex = 'desc_txt">(.*?)</span>'
    summary = tools.get_info(html, regex, fetch_one=True)
    summary = tools.del_html_tag(summary)

    log.debug('''
        program_name %s
        program_url  %s
        image_url    %s
        episode      %s
        directors    %s
        actors       %s
        release_time %s
        summary      %s
    ''' % (program_name, program_url, image_url, episode, directors, actors, release_time, summary))

    program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, program_name, program_url,
                                                    image_url, episode, directors, actors=actors,
                                                    summary=summary, release_time=release_time)

    # resolve the per-episode listing URLs
    # variety shows: the episode list is served as JSON, keyed by year and month
    if classify == '综艺':
        # months
        regex = r'_stat="series:tab".*?>(\d*)月'
        months = tools.get_info(html, regex)
        # print(months)

        # years
        regex = '<div class="year_slt_list">.*?data-value="(.*?)"'
        years = tools.get_info(html, regex)
        # print(years)

        for year in years:
            for month in months:
                episode_url = 'http://s.video.qq.com/get_playsource?id=%s&plat=2&type=4&data_type=3&video_type=10&year=%s&month=%s&plname=qq&otype=json' % (program_id, year, month)
                log.debug('%s分集json地址:%s' % (program_name, episode_url))
                base_parser.add_url('PROGRAM_urls', site_id, episode_url, depth=2,
                                    remark={'program_mongo_id': program_mongo_id, 'classify': '综艺'})

    # TV series: the page carries (url, episode number) pairs directly
    elif classify == '电视剧':
        regex = 'series:numbtn".*?<a href="(.*?)".*?<span itemprop="episodeNumber">(.*?)</span>'
        episode_msgs = tools.get_info(html, regex)
        for episode_msg in episode_msgs:
            episode_url = episode_msg[0]
            episode_num = episode_msg[1]
            log.debug('''
                episode_url %s
                episode_num %s
            ''' % (episode_url, episode_num))
            base_parser.add_url('PROGRAM_urls', site_id, episode_url, depth=2,
                                remark={'program_mongo_id': program_mongo_id, 'episode_num': episode_num,
                                        'program_name': program_name, 'classify': '电视剧'})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
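
# parser_program_info hands tools.get_info a list of patterns (更新期数 vs
# 总集数); the assumed semantics, sketched here, are "try each pattern in order
# and return the first hit". An assumption only; the real helper may differ.
import re

def get_info_first_match_sketch(html, regexes):
    for regex in (regexes if isinstance(regexes, list) else [regexes]):
        match = re.search(regex, html, re.S)
        if match:
            return match.group(1)
    return ''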

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Cookie": "wuid=AAGPF/32GQAAAAqLFD2BdAAAGwY=; CXID=A468F618D67D4868DC83E6061B1B3CCC; ABTEST=0|1500285612|v1; weixinIndexVisited=1; SUV=006317867B7CC4C5596C8AAD6B089707; SUIR=0A14ACB4D0CA9B50A8ABB33CD0CA69FA; ld=ekllllllll2BbH49lllllVOm1tylllll1kecBlllll9lllll9Zlll5@@@@@@@@@@; ad=AZllllllll2Bzw7GlllllVOeQA6lllll1kectkllll9lllllVqxlw@@@@@@@@@@@; SUID=72780CD23D148B0A59688B0C0002AD65; IPLOC=CN1100; sct=11; SNUID=B4B50E097177247B9A6BE55E72153425; JSESSIONID=aaaVCfkabuJQTfaNW5f1v",
        "Host": "weixin.sogou.com"
    }

    # fetch and parse
    html, request = tools.get_html_by_requests(root_url, headers=headers)
    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('取公众号列表' + check_info)

    # blocks of official-account info
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_blocks = tools.get_info(html, regex)
    if not account_blocks:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    # URL that serves the per-account article counts
    regex = '<script>var account_anti_url = "(.*?)";</script>'
    articles_count_url = tools.get_info(html, regex, fetch_one=True)
    articles_count_url = tools.get_full_url('http://weixin.sogou.com', articles_count_url)
    articles_count_json = tools.get_json_by_requests(articles_count_url).get('msg', {})

    for account_block in account_blocks:
        # print(account_block)
        regex = '<a.*?account_name.*?>(.*?)</a>'
        name = tools.get_info(account_block, regex, fetch_one=True)
        name = tools.del_html_tag(name)
        is_have = mongodb.find('WWA_wechat_official_accounts', {'name': name})
        if is_have:
            log.debug(name + " 已存在")
            continue

        regex = '<div class="img-box">.*?<img src="(.*?)"'
        image_url = tools.get_info(account_block, regex, fetch_one=True)

        # download the avatar image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''

        regex = '<p class="tit">.*?(<i></i>).*?<p class="info">'
        is_verified = 102 if tools.get_info(account_block, regex, fetch_one=True) else 101

        regex = '<label name="em_weixinhao">(.*?)</label>'
        account_id = tools.get_info(account_block, regex, fetch_one=True)

        regex = '<li id="sogou_vr_.*?d="(.*?)">'
        article_count_key = tools.get_info(account_block, regex, fetch_one=True)
        article_count = articles_count_json.get(article_count_key, '')
        article_count = article_count[:article_count.find(',')]

        regex = '<dt>功能介绍.*?<dd>(.*?)</dd>'
        summary = tools.get_info(account_block, regex, fetch_one=True)
        summary = tools.del_html_tag(summary)

        regex = "认证.*?<dd>(.*?)</dd>"
        certification = tools.get_info(account_block, regex, fetch_one=True)

        regex = '微信扫一扫关注.*?<img.*?src="(.*?)"'
        barcode_url = tools.get_info(account_block, regex, fetch_one=True)
        barcode_url = barcode_url.replace('&amp;', '&')

        # download the QR-code image
        local_barcode_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(barcode_url, local_barcode_url)
        local_barcode_url = local_barcode_url if is_download else ''

        regex = '<a.*?account_name.*?href="(.*?)">'
        account_url = tools.get_info(account_block, regex, fetch_one=True)
        account_url = account_url.replace('&amp;', '&')

        log.debug('''
            公众号名称        %s
            公众号账号        %s
            账号url           %s
            贴图              %s
            本地贴图          %s
            文章数量          %s
            简介              %s
            微信认证          %s
            是否加V(是否认证) %s
            二维码            %s
            本地二维码        %s
        ''' % (name, account_id, account_url, image_url, local_image_url, article_count,
               summary, certification, is_verified, barcode_url, local_barcode_url))

        base_parser.add_wechat_account_info('WWA_wechat_official_accounts', site_id, name, account_id,
                                            account_url, image_url, local_image_url, article_count,
                                            summary, certification, is_verified, barcode_url,
                                            local_barcode_url)

    base_parser.update_url('WWA_wechat_account_url', root_url, Constance.DONE)
    tools.delay_time()
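
# The dated image path above is built the same way at several call sites; a
# shared helper (a sketch, names assumed) would keep the pattern in one place:
from datetime import datetime

def dated_image_path_sketch(root, ext='jpg'):
    now = datetime.now()
    # e.g. <root>images/2017-09-29/20170929153012.123456.jpg
    return '%simages/%s/%s.%s' % (root, now.strftime('%Y-%m-%d'), now.strftime('%Y%m%d%H%M%S.%f'), ext)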

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
        "Host": "weixin.sogou.com",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # fetch and parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers=headers, proxies=proxies)
    # print(html)
    html, request = tools.get_html_by_requests(root_url, headers=headers, proxies=proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    print(root_url)
    log.debug('取文章链接' + check_info)
    if check_info:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # block of official-account info
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one=True)

    # url
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one=True)
    account_url = account_url.replace('&amp;', '&')
    log.debug('account_url = ' + account_url)
    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # going through the proxy triggers a captcha; disabled for now

    html, request = tools.get_html_by_requests(account_url, headers=headers, proxies=proxies)
    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one=True)
    log.debug('''
        取文章详细内容 %s
        url %s
        request.headers %s
    ''' % (check_info, account_url, request.headers))

    # print(html)
    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one=True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', {})
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title': title})
        if is_have:
            log.debug(title + " 已存在")
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # download the cover image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
            date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;', '&')

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one=True)

        # download the images inside the content, then point the content at the local copies
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.' + (
                image[image.find('wx_fmt=') + len('wx_fmt='):
                      (image.find('&', image.find('wx_fmt=') + len('wx_fmt='))
                       if image.find('&', image.find('wx_fmt=') + len('wx_fmt=')) != -1
                       else None)]
                if 'wx_fmt=' in image else 'jpg')
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
        tools.delay_time(5)

        # sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

        # violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            标题         %s
            简介         %s
            图片地址     %s
            文章地址     %s
            发布时间     %s
            内容         %s
            本地贴图地址 %s
            违规状态     %s
            敏感事件     %s
            图片鉴别地址 %s
        ''' % (title, summary, image_url, article_url, release_time, content, local_image_url,
               violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title,
                                            summary, image_url, article_url, release_time, content,
                                            video_url='', local_image_url=local_image_url,
                                            violate_status=violate_id, sensitive_id=sensitive_id,
                                            sexy_image_url=sexy_image_url)

        # articles published on the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for article in oneday_article_list:
            title = tools.get_json_value(article, 'title')
            summary = tools.get_json_value(article, 'digest')
            image_url = tools.get_json_value(article, 'cover')

            sexy_image_url = []

            # download the cover image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;', '&')

            # release_time is inherited from the parent article: items in
            # multi_app_msg_item_list share its publish day
            content_html, request = tools.get_html_by_requests(article_url, headers=headers, proxies=proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one=True)

            # download the images inside the content, then point the content at the local copies
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.' + (
                    image[image.find('wx_fmt=') + len('wx_fmt='):
                          (image.find('&', image.find('wx_fmt=') + len('wx_fmt='))
                           if image.find('&', image.find('wx_fmt=') + len('wx_fmt=')) != -1
                           else None)]
                    if 'wx_fmt=' in image else 'jpg')
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
            tools.delay_time(5)

            # sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []
                if base_parser.is_violate(title + content, key1=keyword1, key2=keyword2, key3=keyword3):
                    sensitive_id = _id
                    break

            # violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []
                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
                标题         %s
                简介         %s
                图片地址     %s
                文章地址     %s
                发布时间     %s
                内容         %s
                本地贴图地址 %s
                违规状态     %s
                敏感事件     %s
                图片鉴别地址 %s
            ''' % (title, summary, image_url, article_url, release_time, content, local_image_url,
                   violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title,
                                                summary, image_url, article_url, release_time, content,
                                                video_url='', local_image_url=local_image_url,
                                                violate_status=violate_id, sensitive_id=sensitive_id,
                                                sexy_image_url=sexy_image_url)
    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
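
# The inline wx_fmt slicing above is hard to audit; an equivalent sketch using
# the standard library (intended to match the original expression's behavior):
from urllib.parse import urlparse, parse_qs

def image_ext_sketch(image_url, default='jpg'):
    # WeChat image URLs carry the format in the wx_fmt query parameter,
    # e.g. ...?wx_fmt=png -> 'png'; fall back to jpg when absent.
    query = parse_qs(urlparse(image_url).query)
    return query.get('wx_fmt', [default])[0]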