def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check for Chinese text; skip pages without any
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)


    # Extract the article info from the current page
    # Title

    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="video_area">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="article_edit">'
              ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             '''%(depth+1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
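
# A minimal standalone sketch (not part of the original spider) of the CJK
# guard used above; assuming tools.get_info is a thin re.findall wrapper,
# the same page filter can be written directly with the stdlib:
import re

def contains_chinese(text):
    """Return True if text holds at least one character in the common
    CJK range U+4E00..U+9FA5 that the parsers above match on."""
    return re.search('[\u4e00-\u9fa5]', text) is not None
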
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check for Chinese text; skip pages without any
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html, STOP_URLS)

    urls = tools.fit_url(urls, "cctv.com")
    for url in urls:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
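
# The `x and x[0] or ''` idiom above repeats after every tools.get_info
# call. A hedged sketch of the contract this code seems to rely on
# (get_info is assumed to behave like re.findall over one pattern or a
# list of fallback patterns, first hit wins):
import re

def first_match(html, patterns, default=''):
    """Try each pattern in order and return the first capture found."""
    if isinstance(patterns, str):
        patterns = [patterns]
    for pattern in patterns:
        found = re.findall(pattern, html, re.S)
        if found:
            return found[0]
    return default
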
Example #3
def add_root_url(parser_params={}):
    log.debug('''
        Add root URL
        parser_params : %s
        '''%str(parser_params))

    url = "http://www.luzhoutianli.com/"
    html, request = tools.get_html_by_requests(url)
    urls = tools.get_urls(html)
    for url in urls:
        base_parser.add_url('op_urls', SITE_ID, url)
def add_root_url(parser_params={}):
    log.debug('''
        Add root URL
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.cctv.com/"
    html = tools.get_html_by_urllib(url)
    regex = '<ul class="nav gwnav2">(.*?)</td></tr></table>'
    urls_html = tools.get_info(html, regex)
    urls = tools.get_urls(urls_html)
    for url in urls:
        base_parser.add_url('article_urls', SITE_ID, url)
Example #5
def add_html_url(html, depth, spider_depth, website_url, website_name,
                 website_domain, remark):
    # Go one step deeper and queue pending URLs
    if depth < spider_depth - 1:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url(SITE_ID, url, depth + 1, remark=remark)
            elif website_domain in url:
                base_parser.add_url(SITE_ID, url, depth + 1, remark=remark)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scmu.edu.cn' + url
        else:
            new_url = 'http://www.scmu.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = [
        '<div class="main_list_right_2_7">(.*?)<div class="main_list_right_2_7_1">',
        '<div class="articlett">(.*?)</div>'
    ]
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '发布时间:(.*?) 点击数'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # Author
    regexs = '作者:(.*?)  来源'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # Article source
    regexs = '来源: (.*?) 发布时间'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Comment count
    regexs = '评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = [
        '<div class="main_list_right_2_5">(.*?)<div class="main_list_right_2_7_2">',
        '<div class="content">(.*?)<div id="pages" class="text-c"></div>'
    ]
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth, source_url, title, release_time, author, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
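
# The re.match("http", ...) / leading-slash ladder above re-implements URL
# resolution by hand. A sketch of the same normalization with the stdlib
# (an alternative, not the project's tools API):
from urllib.parse import urljoin

def absolutize(base, href):
    """Resolve absolute, root-relative and page-relative hrefs uniformly."""
    return urljoin(base, href)

# absolutize('http://www.scmu.edu.cn/', '/news/1.html')
#   -> 'http://www.scmu.edu.cn/news/1.html'
# absolutize('http://www.scmu.edu.cn/', 'news/2.html')
#   -> 'http://www.scmu.edu.cn/news/2.html'
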

    # # Parse
    # html, request = tools.get_html_by_requests(root_url)
    # if not html:
    #     base_parser.update_url('urls', root_url, Constance.EXCEPTION)


if __name__ == '__main__':
    url = "http://www.cctv.com/"
    html = tools.get_html_by_urllib(url)
    regex = '<ul class="nav gwnav2">(.*?)</td></tr></table>'
    urls_html = tools.get_info(html, regex)
    print(urls_html)
    urls = tools.get_urls(urls_html)
    print(urls)
    for url in urls:
        print(url)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.lzy.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<p class="atcTitle1a">(.*?)</p>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<div class="atcTitle">.*?</script>(.*?)</div>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)
    release_time = release_time.strip('|')

    # # Author
    # regexs = '<span>作者:(.*?)</span>'
    # author = tools.get_info(html, regexs)
    # author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # Article source
    regexs = '文章来源:(.*?)点击数'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)
    # str.find returns -1 (truthy) when '|' is absent, so the original guard
    # was wrong; strip is a no-op without '|' anyway
    origin = origin.strip('|')

    # Click count (rendered by an external counter script)
    regexs = '点击数:<script type="text/javascript" src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.lzy.edu.cn/' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = [
        '<span style="font-size: 18px">(.*?)<div style="text-align:right">'
    ]
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth, source_url, title, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
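
# The click count above is rendered by an external counter script, so the
# parser downloads that script and pulls the quoted number out of it. A
# hedged requests-based sketch of that step (hypothetical helper, not the
# tools API):
import re
import requests

def fetch_counter(script_url):
    """Download a hit-counter JS file and return the first quoted number,
    e.g. document.write('1234') -> '1234'."""
    text = requests.get(script_url, timeout=10).text
    match = re.search(r"'(\d+)'", text)
    return match.group(1) if match else ''
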
Example #9
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url, code='GB2312')
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check for Chinese text; skip pages without any
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Extract all URLs on the current page
    urls = tools.get_urls(html, STOP_URLS)

    # Filter out external links; add the rest to the database
    fit_url = tools.fit_url(urls, "people.com.cn")
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.replace_str(title, '&.*?;')
    # Content
    regexs = [
        'div class="box_pic"></div>(.*?)<div class="box_pic"></div>',
        '<div class="content clear clearfix">(.*?)<div class="edit clearfix">',
        '<div class="show_text">(.*?)<div class="edit">'
    ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''

    content = tools.del_html_tag(content)

    log.debug('''
                    depth     =  %d
                    source_url = %s
                    title     =  %s
                    content   =  %s
                 ''' % (depth, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
Example #10
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check for Chinese text; skip pages without any
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.DONE)
        return

    # Extract all URLs on the current page
    urls = tools.get_urls(html, STOP_URLS)

    # Filter out external links; add the rest to the database
    fit_url = tools.fit_url(urls, "ifeng.com")
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info from the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = [
        '<div id="main_content".*?>(.*?)</div>',
        '<div class="yc_con_l">(.*?)<div class="txt_share_box"',
        '<div id="slide_no_insert_default"></div>(.*?)</div>'
    ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''

    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
Example #11
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.xuyong.gov.cn'+url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<td  class="titlestyle1037"  align="center">(.*?)</td></tr>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<span  class="timestyle1037" >(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # # Author
    # regexs = '<span>作者:(.*?)</span>'
    # author = tools.get_info(html, regexs)
    # author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # Article source
    # regexs = '采编: (.*?)阅读'
    # origin = tools.get_info(html, regexs)
    # origin = origin and origin[0] or ''
    # origin = tools.del_html_tag(origin)

    # # Read count
    # regexs = '阅读:(\d*?)次'
    # watched_count = tools.get_info(html, regexs)
    # watched_count = watched_count and watched_count[0] or ''
    # watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<tr><td  class="contentstyle1037" >(.*?) <tr><td  class="pagestyle1037"  align="left">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                content             = %s
             ''' % (depth+1, source_url, title, release_time,  content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title,
                                release_time=release_time, content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Example #12
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check for Chinese text; skip pages without any
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Extract all URLs on the current page
    urls = tools.get_urls(html)

    # Filter out external links; add the rest to the database
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    if title == '加载中...':
        # Title is still the client-side loading placeholder; requeue as TODO to retry
        base_parser.update_url('article_urls', source_url, Constance.TODO)
        return

    # Content
    regexs = [
        'id="artibody".*?>(.*?)<!-- 吸顶导航结束定位标记 -->',
        'id="artibody".*?>(.*?)<div id="left_hzh_ad">',
        '<!-- 正文内容 begin -->(.*?)<!-- 正文内容 end -->',
        'id="article_content".*?>(.*?)<div class="spacer"></div>'
    ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
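
# Pages rendered client-side still carry the placeholder title '加载中...'
# ("Loading..."), so the parser above requeues such URLs as TODO instead
# of marking them DONE. A tiny sketch of that guard (the placeholder set
# is an assumption):
PLACEHOLDER_TITLES = {'加载中...'}

def is_still_loading(title):
    """True when the extracted title is only a loading placeholder."""
    return title.strip() in PLACEHOLDER_TITLES
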
Example #13
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("/", url):
            new_url = 'http://www.naxi.gov.cn' + url
        else:
            new_url = 'http://www.naxi.gov.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<DIV class=news_conent_two_title>(.*?)</DIV>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<SPAN>日期:(.*?)</SPAN>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # Article source
    regexs = '<SPAN>来源:(.*?)</SPAN>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Click count
    regexs = '<SPAN>点击数:(\d*?)</SPAN>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = [
        '<DIV id=news_conent_two_text class=news_conent_two_text>(.*?)</DIV>'
    ]
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Example #14
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        base_parser.add_url('op_urls', website_id, url, depth + 1)


    # Extract the article info from the current page
    # Title

    regexs = '<strong class="NameTxt"><a >(.*?)</a></strong>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '发表时间:(.*?)&nbsp;'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # Author (the original assigned the pattern to `author`, reused the
    # stale `regexs`, and gated on `release_time`)
    regexs = '编辑:(.*?)</div>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # Article source
    regexs = '来源:(.*?)&nbsp'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Comment count
    regexs = '评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<td height="2" class="graphic10">(.*?)来源']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title, release_time=release_time, author=author,
                                origin=origin, watched_count=watched_count, content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Example #15
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.lzzjw.com' + url
        else:
            new_url = 'http://www.lzzjw.com/' + url

        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title
    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<h3>时间:(.*?) 点击'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # Author
    regexs = '<div id="copy">作者:(.*?)来源'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # Source
    regexs = ' <div id="copy">作者:.*? 来源:(.*?)</div>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Click count (via external counter script)
    regexs = 'ID=(.*)'
    times_script_url = tools.get_info(source_url, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.lzzjw.com/js/count.asp?id=' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<div id="content">(.*?)<div id="copy">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, author, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate, sdch',
        'Accept-Language':
        'zh-CN,zh;q=0.8',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        # 'Cookie':'__cfduid=d17ca6a5def98d8c14f73dcee28042c7f1492065760',
        'Host':
        'www.luzhoutianli.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }

    html, request = tools.get_html_by_requests(source_url,
                                               headers=headers,
                                               code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html)
    for url in urls:
        #new_url = tools.get_full_url('http://www.luzhoutianli.com', url)
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.luzhoutianli.com' + url
        else:
            new_url = 'http://www.luzhoutianli.com/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title
    #
    # print(html)
    regexs = '<strong class="NameTxt"><a >(.*?)</a></strong>.*?</td>'
    title = tools.get_info(html, regexs)
    if len(title) > 1:
        title = title[1]
    else:
        title = title and title[0] or ''
        title = tools.del_html_tag(title)

    # Release time
    regexs = ' <span class="FC_Time">时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # Author
    regexs = '<span class="FC_Time">作者:(.*?)</span>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)
    # #print(author)
    #
    # Article source
    regexs = '来源:(.*?)&nbsp'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Comment count
    regexs = '评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = [
        '<div class="articleDetails">.*?</script>(.*?)<td class="MoBodyR"'
    ]
    content = tools.get_info(html, regexs)
    # print(content[0])
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                        depth               = %s
                        url                 = %s
                        title               = %s
                        release_time        = %s
                        author              = %s
                        content             = %s
                     ''' %
              (depth, source_url, title, release_time, author, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
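
# A hedged sketch of the fetch above done with requests directly
# (tools.get_html_by_requests is assumed to wrap something similar):
# browser-like headers plus an explicit gb2312 decode for this
# legacy-encoded site.
import requests

def fetch_gb2312(url, headers):
    """GET a page and decode it as gb2312 instead of the guessed charset."""
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = 'gb2312'
    return resp.text
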
Example #17
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # Extract all URLs on the page
    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.lx2hs.com' + url
        else:
            new_url = 'http://www.lx2hs.com/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<title>(.*?)</title>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Publish time
    regexs = '发表时间:(.*?)&nbsp'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # Author
    regexs = '<span>编辑:(.*?)</div>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # # Article source
    # regexs = '<td align=\'center\'   class=\'info\'>(.*?) 点击数'
    # origin = tools.get_info(html, regexs)
    # origin = origin and origin[0] or ''
    # origin = tools.del_html_tag(origin)

    # Click/comment count
    regexs = '点击/评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['0</span>(.*?)来源']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                watched_count       = %s
                content             = %s
             ''' %
              (depth, source_url, title, release_time, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.jiangyang.gov.cn/template/default/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<div class="tit">(.*?)</div>'
    title = tools.get_info(html, regexs)
    if not title:
        regexs = '<h1>(.*?)</h1>'
        title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<label>(.*?)</label>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if release_time:
        release_time = tools.format_date(release_time)
    if not release_time:
        regexs = '<span class="time">发布时间:(.*?)</span><span class="source"></span></p>'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        #release_time = tools.format_date(release_time)

    # Article source
    regexs = '<label>来源:(.*?)</label>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Content
    regexs = ['<div class="content" id="nr" style="">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)
    if not content:
        regexs = '<p style="text-align: center;"(.*?)</div>.*?<div class="content">'
        content = tools.get_info(html, regexs)
        content = content and content[0] or ''
        content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                content             = %s
             ''' %
              (depth + 1, source_url, title, release_time, origin, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Example #19
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.gulin.gov.cn' + url
        else:
            new_url = 'http://www.gulin.gov.cn/template/default/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)


    # Extract the article info from the current page
    # Title

    regexs = '<div class="news_titile">(.*?)</div>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time (the original pattern hard-coded a sample date and had
    # no capture group)
    regexs = '<div class="news_info">时间:(.*?) 采编'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # # Author
    # regexs = '<span>作者:(.*?)</span>'
    # author = tools.get_info(html, regexs)
    # author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # Article source
    regexs = '采编: (.*?)阅读'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Read count
    regexs = '阅读:(\d*?)次'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = '<div class="news_content" id="news_content">(.*?)</span></b></p>'
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth+1, source_url, title, release_time, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title,
                                release_time=release_time, origin=origin, watched_count=watched_count, content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Example #20
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    website_name = remark['website_name']
    website_position = remark['website_position']
    website_url = remark['website_url']
    website_domain = remark['website_domain']

    html = tools.get_html(root_url)
    if not html:
        base_parser.update_url('news_urls', root_url, Constance.EXCEPTION)
        return

    # Go one step deeper and queue pending URLs
    if depth < DEPTH:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url('news_urls',
                                    SITE_ID,
                                    url,
                                    depth + 1,
                                    remark=remark)
            elif website_domain in url:
                base_parser.add_url('news_urls',
                                    SITE_ID,
                                    url,
                                    depth + 1,
                                    remark=remark)

    # Parse the page
    content = title = release_time = author = ''
    article_extractor = ArticleExtractor(root_url, html)
    content = article_extractor.get_content()
    if content:
        title = article_extractor.get_title()
        release_time = article_extractor.get_release_time()
        author = article_extractor.get_author()
        uuid = tools.get_uuid(
            title,
            website_domain) if title != website_name else tools.get_uuid(
                root_url, ' ')

        log.debug('''
            uuid         %s
            title        %s
            author       %s
            release_time %s
            website_name %s
            domain       %s
            position     %s
            url          %s
            content      %s
            ''' % (uuid, title, author, release_time, website_name,
                   website_domain, website_position, root_url, content))

        if tools.is_have_chinese(content):
            # Save to the database
            self_base_parser.add_news_acticle(uuid, title, author,
                                              release_time, website_name,
                                              website_domain, website_position,
                                              root_url, content)

    log.debug('%s processing complete' % root_url)
    base_parser.update_url('news_urls', root_url, Constance.DONE)
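
# Articles above are keyed by a UUID built from title + domain (or from
# the URL when the title is just the site name), so the same story seen
# twice collapses to one record. A sketch of one way such a key could be
# derived (tools.get_uuid's real scheme is not shown in this code):
import hashlib

def make_article_key(part1, part2):
    """Stable hex key for a (title, domain) or (url, '') pair."""
    return hashlib.md5(('%s|%s' % (part1, part2)).encode('utf-8')).hexdigest()
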
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.sccc.edu.cn/new' + url
        else:
            new_url = 'http://www.sccc.edu.cn/new/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)
    regexs = '<script type="text/javascript" language="JavaScript" src="(.*?)"'
    urls = tools.get_info(html, regexs)
    for url in urls:
        new_url = 'http://www.sccc.edu.cn/new/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = 'td height="60" align="center" valign="bottom" class="nrbt">(.*?)</td>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<td height="3" align="center" valign="top">(.*?)</td>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time[1] if len(release_time) > 1 else ''  # second match holds the date

    # Author
    regexs = '<td width="250">(.*?)</td>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    #author = tools.del_html_tag(author)

    # Article source
    regexs = '<td width="300">(.*?)</td>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Read count
    regexs = ' <td>阅读(\d*?)次</td>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<td class="nr">(.*?)</td>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                    depth               = %s
                    url                 = %s
                    title               = %s
                    release_time        = %s
                    author              = %s
                    origin              = %s
                    watched_count       = %s
                    content             = %s
                 ''' % (depth + 1, source_url, title, release_time, author,
                        origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)
    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')

    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(source_url, code=code)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scpolicec.edu.cn' + url
        else:
            new_url = 'http://www.scpolicec.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)


    # Extract the article info from the current page
    # Title

    regexs = ['<div class="main_title">(.*?)<div class="top_about">', '<h1>(.*?)</h1>', '<title>(.*?)</title>',
              '<div class="contentPageTitle">(.*?)</div>']
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>','<small>时间:</small>(.*?)<small>',
              '<h2><span>更新时间:(.*?)</span>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '</a> 发布时间:(.*?) 点击数'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        release_time = tools.format_date(release_time)


    # Author
    regexs = ['作者:(.*?) 【']
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    #author = tools.del_html_tag(author)

    # Article source
    regexs = '来源:(.*?)</a>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Click count
    regexs = ['浏览:<font id="hits">(\d*?)</font>次', '点击数:(\d*?)&#xA;发表时间']
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content (a missing comma in the original list silently concatenated
    # two patterns into one bogus pattern)
    regexs = ['<p style="text-align: center;">(.*?)</table>',
              '<div class="contentPageContent">(.*?)</table>',
              '<div id="endtext" style="width:900px;">(.*?)<div id="pages"></div>',
              '<div id="articleContnet">(.*?)<div class="page_css">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                    depth               = %s
                    url                 = %s
                    title               = %s
                    release_time        = %s
                    author              = %s
                    origin              = %s
                    watched_count       = %s
                    content             = %s
                 ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id,url=source_url, title=title, release_time=release_time, author=author,
                                origin=origin, watched_count=watched_count, content=content)
    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
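
# The double fetch above first decodes with a gb2312 guess, sniffs the
# real charset from the page's <meta> tag, then fetches again with that
# charset. A single-fetch sketch of the same idea (requests-based,
# re-decoding the bytes already downloaded instead of hitting the server
# twice):
import re
import requests

def fetch_with_sniffed_charset(url, fallback='gb2312'):
    resp = requests.get(url, timeout=10)
    match = re.search(r'charset=["\']?([\w-]+)', resp.text, re.I)
    resp.encoding = match.group(1) if match else fallback  # re-decode body
    return resp.text
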
Example #23

if __name__ == '__main__':
    url = "http://www.xuyong.gov.cn/content.jsp?urltype=news.NewsContentUrl&wbtreeid=1015&wbnewsid=48759"
    html = tools.get_html_by_urllib(url)
    print(html)

    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            print(url)
        else:
            url = 'http://www.xuyong.gov.cn'+url
            print(url)
        # print(url)
    #urls = tools.get_urls(html)
    #print(urls)
    # for url in urls:
    #     print(url)
        #base_parser.add_url('article_urls', SITE_ID, url)

def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("&#xD;&#xA", url):
            regex = '.*?(/GovPublicInfo.+?000)'
            new_url = tools.get_info(url, regex)
            new_url = new_url[0]
            new_url = 'http://www.luzhou.gov.cn' + new_url
        else:
            new_url = 'http://www.luzhou.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # Article source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Click count (via external counter script)
    regexs = '<span>点击数.*?src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.luzhou.gov.cn' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
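
# The "&#xD;&#xA" branch above works around hrefs polluted with encoded
# CR/LF entities. A hedged stdlib alternative is to unescape and strip the
# href before resolving it (not how tools.get_urls actually works):
import html as html_module
from urllib.parse import urljoin

def clean_href(base, href):
    """Strip HTML-entity CR/LF noise from an href, then make it absolute."""
    href = html_module.unescape(href).strip()
    return urljoin(base, href)
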
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # Check for Chinese text; skip pages without any
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("&#xD;&#xA", url):
            regex = '.*?(/Survey.+?html)'
            new_url = tools.get_info(url, regex)
            if new_url:
                new_url = new_url[0]
                new_url = 'http://www.longmatan.gov.cn' + new_url
        else:
            new_url = 'http://www.longmatan.gov.cn' + url

        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # Author
    regexs = '<span>作者:(.*?)</span>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # Article source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Click count
    regexs = '<span>点击数:(\d*?)<span'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, author, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)