Exemple #1
0
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("/", url):
            new_url = 'http://www.naxi.gov.cn' + url
        else:
            new_url = 'http://www.naxi.gov.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # 取当前页的文章信息
    # 标题

    regexs = '<DIV class=news_conent_two_title>(.*?)</DIV>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '<SPAN>日期:(.*?)</SPAN>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    #文章来源
    regexs = '<SPAN>来源:(.*?)</SPAN>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # #点击数
    regexs = '<SPAN>点击数:(\d*?)</SPAN>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = [
        '<DIV id=news_conent_two_text class=news_conent_two_text>(.*?)</DIV>'
    ]
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("&#xD;&#xA", url):
            regex = '.*?(/GovPublicInfo.+?000)'
            new_url = tools.get_info(url, regex)
            new_url = new_url[0]
            new_url = 'http://www.luzhou.gov.cn' + new_url
        else:
            new_url = 'http://www.luzhou.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # 取当前页的文章信息
    # 标题
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    #文章来源
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    #点击数
    regexs = '<span>点击数.*?src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.luzhou.gov.cn' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, title, source_url, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Exemple #3
0
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        base_parser.add_url('op_urls', website_id, url, depth + 1)


    # 取当前页的文章信息
    # 标题

    regexs = '<strong class="NameTxt"><a >(.*?)</a></strong>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '发表时间:(.*?)&nbsp;'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    #作者
    author = '编辑:(.*?)</div>'
    author = tools.get_info(html, regexs)
    author = release_time and author[0] or ''
    author = tools.del_html_tag(author)

    #文章来源
    regexs = '来源:(.*?)&nbsp'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    #点击数
    regexs = '评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = ['<td height="2" class="graphic10">(.*?)来源']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title, release_time=release_time, author=author,
                                origin=origin, watched_count=watched_count, content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')

    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(source_url, code=code)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scpolicec.edu.cn' + url
        else:
            new_url = 'http://www.scpolicec.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)


    # 取当前页的文章信息
    # 标题

    regexs = ['<div class="main_title">(.*?)<div class="top_about">', '<h1>(.*?)</h1>', '<title>(.*?)</title>',
              '<div class="contentPageTitle">(.*?)</div>']
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>','<small>时间:</small>(.*?)<small>',
              '<h2><span>更新时间:(.*?)</span>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
            regexs = '</a> 发布时间:(.*?) 点击数'
            release_time = tools.get_info(html, regexs)
            release_time = release_time and release_time[0] or ''
            release_time = tools.format_date(release_time)


    # #作者
    regexs = ['作者:(.*?) 【']
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    #author = tools.del_html_tag(author)

    #文章来源
    regexs = '来源:(.*?)</a>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # #点击数
    regexs = ['浏览:<font id="hits">(\d*?)</font>次', '点击数:(\d*?)&#xA;发表时间']
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = ['<p style="text-align: center;">(.*?)</table>', '<div class="contentPageContent">(.*?)</table>'
              '<div id="endtext" style="width:900px;">(.*?)<div id="pages"></div>',
              '<div id="articleContnet">(.*?)<div class="page_css">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                    depth               = %s
                    url                 = %s
                    title               = %s
                    release_time        = %s
                    author              = %s
                    origin              = %s
                    watched_count       = %s
                    content             = %s
                 ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id,url=source_url, title=title, release_time=release_time, author=author,
                                origin=origin, watched_count=watched_count, content=content)
    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.sccc.edu.cn/new' + url
        else:
            new_url = 'http://www.sccc.edu.cn/new/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)
    regexs = '<script type="text/javascript" language="JavaScript" src="(.*?)"'
    urls = tools.get_info(html, regexs)
    for url in urls:
        new_url = 'http://www.sccc.edu.cn/new/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # 取当前页的文章信息
    # 标题

    regexs = 'td height="60" align="center" valign="bottom" class="nrbt">(.*?)</td>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '<td height="3" align="center" valign="top">(.*?)</td>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[1] or ''

    # #作者
    regexs = '<td width="250">(.*?)</td>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    #author = tools.del_html_tag(author)

    #文章来源
    regexs = '<td width="300">(.*?)</td>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # #点击数
    regexs = ' <td>阅读(\d*?)次</td>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = ['<td class="nr">(.*?)</td>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                    depth               = %s
                    url                 = %s
                    title               = %s
                    release_time        = %s
                    author              = %s
                    origin              = %s
                    watched_count       = %s
                    content             = %s
                 ''' % (depth + 1, source_url, title, release_time, author,
                        origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)
    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Exemple #6
0
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.xuyong.gov.cn'+url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # 取当前页的文章信息
    # 标题

    regexs = '<td  class="titlestyle1037"  align="center">(.*?)</td></tr>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '<span  class="timestyle1037" >(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # #作者
    # regexs = '<span>作者:(.*?)</span>'
    # author = tools.get_info(html, regexs)
    # author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    #文章来源
    # regexs = '采编: (.*?)阅读'
    # origin = tools.get_info(html, regexs)
    # origin = origin and origin[0] or ''
    # origin = tools.del_html_tag(origin)

    # #点击数
    # regexs = '阅读:(\d*?)次'
    # watched_count = tools.get_info(html, regexs)
    # watched_count = watched_count and watched_count[0] or ''
    # watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = ['<tr><td  class="contentstyle1037" >(.*?) <tr><td  class="pagestyle1037"  align="left">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                content             = %s
             ''' % (depth+1, source_url, title, release_time,  content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title,
                                release_time=release_time, content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate, sdch',
        'Accept-Language':
        'zh-CN,zh;q=0.8',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        # 'Cookie':'__cfduid=d17ca6a5def98d8c14f73dcee28042c7f1492065760',
        'Host':
        'www.luzhoutianli.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }

    html, request = tools.get_html_by_requests(source_url,
                                               headers=headers,
                                               code='gb2312')
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html)
    for url in urls:
        #new_url = tools.get_full_url('http://www.luzhoutianli.com', url)
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.luzhoutianli.com' + url
        else:
            new_url = 'http://www.luzhoutianli.com/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # 取当前页的文章信息
    # 标题
    #
    # print(html)
    regexs = '<strong class="NameTxt"><a >(.*?)</a></strong>.*?</td>'
    title = tools.get_info(html, regexs)
    if len(title) > 1:
        title = title[1]
    else:
        title = title and title[0] or ''
        title = tools.del_html_tag(title)

    # 时间
    regexs = ' <span class="FC_Time">时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # 作者
    regexs = '<span class="FC_Time">作者:(.*?)</span>'
    author = tools.get_info(html, regexs)
    author = release_time and author[0] or ''
    author = tools.del_html_tag(author)
    # #print(author)
    #
    # 文章来源
    regexs = '来源:(.*?)&nbsp'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # 点击数
    regexs = '评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = [
        '<div class="articleDetails">.*?</script>(.*?)<td class="MoBodyR"'
    ]
    content = tools.get_info(html, regexs)
    # print(content[0])
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                        depth               = %s
                        url                 = %s
                        title               = %s
                        release_time        = %s
                        author              = %s
                        content             = %s
                     ''' %
              (depth, source_url, title, release_time, author, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scmu.edu.cn' + url
        else:
            new_url = 'http://www.scmu.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # 取当前页的文章信息
    # 标题

    regexs = [
        '<div class="main_list_right_2_7">(.*?)<div class="main_list_right_2_7_1">',
        '<div class="articlett">(.*?)</div>'
    ]
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '发布时间:(.*?) 点击数'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    #作者
    regexs = '作者:(.*?)  来源'
    author = tools.get_info(html, regexs)
    author = release_time and author[0] or ''
    author = tools.del_html_tag(author)

    #文章来源
    regexs = '来源: (.*?) 发布时间'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    #点击数
    regexs = '评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = [
        '<div class="main_list_right_2_5">(.*?)<div class="main_list_right_2_7_2">',
        '<div class="content">(.*?)<div id="pages" class="text-c"></div>'
    ]
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth, source_url, title, release_time, author, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.jiangyang.gov.cn/template/default/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # 取当前页的文章信息
    # 标题

    regexs = '<div class="tit">(.*?)</div>'
    title = tools.get_info(html, regexs)
    if not title:
        regexs = '<h1>(.*?)</h1>'
        title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '<label>(.*?)</label>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if release_time:
        release_time = tools.format_date(release_time)
    if not release_time:
        regexs = '<span class="time">发布时间:(.*?)</span><span class="source"></span></p>'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        #release_time = tools.format_date(release_time)

    #文章来源
    regexs = '<label>来源:(.*?)</label>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # 内容
    regexs = ['<div class="content" id="nr" style="">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)
    if not content:
        regexs = '<p style="text-align: center;"(.*?)</div>.*?<div class="content">'
        content = tools.get_info(html, regexs)
        content = content and content[0] or ''
        content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                content             = %s
             ''' %
              (depth + 1, source_url, title, release_time, origin, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Exemple #10
0
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # 判断中英文
    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.lx2hs.com' + url
        else:
            new_url = 'http://www.lx2hs.com/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # 取当前页的文章信息
    # 标题

    regexs = '<title>(.*?)</title>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #更新时间
    regexs = '发表时间:(.*?)&nbsp'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # #作者
    regexs = '<span>编辑:(.*?)</div>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # #文章来源
    # regexs = '<td align=\'center\'   class=\'info\'>(.*?) 点击数'
    # origin = tools.get_info(html, regexs)
    # origin = origin and origin[0] or ''
    # origin = tools.del_html_tag(origin)

    # #点击数
    regexs = '点击/评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = ['0</span>(.*?)来源']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                watched_count       = %s
                content             = %s
             ''' %
              (depth, source_url, title, release_time, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                watched_count=watched_count,
                                content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Exemple #11
0
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.gulin.gov.cn' + url
        else:
            new_url = 'http://www.gulin.gov.cn/template/default/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)


    # 取当前页的文章信息
    # 标题

    regexs = '<div class="news_titile">(.*?)</div>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '<div class="news_info">时间:2017-04-07 采编'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # #作者
    # regexs = '<span>作者:(.*?)</span>'
    # author = tools.get_info(html, regexs)
    # author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    #文章来源
    regexs = '采编: (.*?)阅读'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # #点击数
    regexs = '阅读:(\d*?)次'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = '<div class="news_content" id="news_content">(.*?)</span></b></p>'
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth+1, source_url, title, release_time, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title,
                                release_time=release_time, origin=origin, watched_count=watched_count, content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Exemple #12
0
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url, code='gb2312')
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.lzzjw.com' + url
        else:
            new_url = 'http://www.lzzjw.com/' + url

        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # 取当前页的文章信息
    # 标题
    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '<h3>时间:(.*?) 点击'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # 作者
    regexs = '<div id="copy">作者:(.*?)来源'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # 来源
    regexs = ' <div id="copy">作者:.*? 来源:(.*?)</div>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    #点击数
    regexs = 'ID=(.*)'
    times_script_url = tools.get_info(source_url, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.lzzjw.com/js/count.asp?id=' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = ['<div id="content">(.*?)<div id="copy">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, author, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url, code='gb2312')

    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # 取当前页的文章信息
    # 标题

    regexs = '<h1><b>.*?</b>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '<li>发表于:(.*?)</li>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    #点击数
    regexs = '<li>阅读:(\d*?)</li>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = ['新浪微博登录</a></p>(.*?)</td>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                watched_count       = %s
                content             = %s
             ''' %
              (depth, source_url, title, release_time, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                watched_count=watched_count,
                                content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.lzy.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, url, depth + 1)

    # 取当前页的文章信息
    # 标题

    regexs = '<p class="atcTitle1a">(.*?)</p>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '<div class="atcTitle">.*?</script>(.*?)</div>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)
    release_time = release_time.strip('|')

    # #作者
    # regexs = '<span>作者:(.*?)</span>'
    # author = tools.get_info(html, regexs)
    # author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    #文章来源
    regexs = '文章来源:(.*?)点击数'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)
    if origin.find('|'):
        origin = origin.strip('|')

    # #点击数
    regexs = '点击数:<script type="text/javascript" src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.lzy.edu.cn/' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = [
        '<span style="font-size: 18px">(.*?)<div style="text-align:right">'
    ]
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth, source_url, title, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html == None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # 判断中英文
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("&#xD;&#xA", url):
            regex = '.*?(/Survey.+?html)'
            new_url = tools.get_info(url, regex)
            if new_url:
                new_url = new_url[0]
                new_url = 'http://www.longmatan.gov.cn' + new_url
        else:
            new_url = 'http://www.longmatan.gov.cn' + url

        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # 取当前页的文章信息
    # 标题

    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    #时间
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    #作者
    regexs = '<span>作者:(.*?)</span>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    #文章来源
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # #点击数
    regexs = '<span>点击数:(\d*?)<span'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # 内容
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, author, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # 更新source_url为done
    base_parser.update_url('op_urls', source_url, Constance.DONE)