Example #1
def parser_first_page_article(html, video_id, url):
    regex = '(<div class="m-feedSection clearfix.*?)<!-- 评论列表 end-->'
    content_blocks = tools.get_info(html, regex)

    for content_block in content_blocks:
        regex = 'data-paopao-feedId="(.*?)"'
        article_id = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<img width="50".*?"(http.*?)"'
        head_url = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<a.*?data-paopao-ele="userUrl".*?title="(.*?)"'
        name = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<p class="feed_por_time">(.*?)</p>'
        release_time = tools.get_info(content_block, regex, fetch_one = True)
        release_time = tools.format_time(release_time)
        release_time = tools.format_date(release_time)

        regex = '<h3 class="title_icon_right" title="(.*?)">'
        title = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<span data-paopao-ele="dispalyContent.*?">(.*?)</span>'
        content = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<img width="100%" height="100%" data-lazy="(.*?)"'
        image_urls = tools.get_info(content_block, regex, split = ',')

        regex = '<em data-paopao-uvCnt=.*?>(.*?)</em>'
        watch_count = tools.get_info(content_block, regex, fetch_one = True)
        watch_count = tools.get_int(watch_count)

        regex = '<em data-paopao-agreeCnt="(.*?)">'
        up_count = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<em data-paopao-commentCnt="(.*?)">'
        comment_count = tools.get_info(content_block, regex, fetch_one = True)

        log.debug('''
            id:            %s
            program id:    %s
            avatar URL:    %s
            name:          %s
            release time:  %s
            title:         %s
            content:       %s
            image URLs:    %s
            view count:    %s
            like count:    %s
            comment count: %s
            ''' % (article_id, video_id, head_url, name, release_time, title, content, image_urls, watch_count, up_count, comment_count))

        if self_base_parser.add_article(article_id, head_url, name, release_time, title,
                                        content, image_urls, watch_count, up_count, comment_count,
                                        program_id=video_id, gender=random.randint(0, 1),
                                        url=url, info_type=3, emotion=random.randint(0, 2),
                                        collect=0, source='爱奇艺'):

            # parse the comments
            regex = "\['wallId'\] = \"(.*?)\""
            wall_id = tools.get_info(html, regex, fetch_one = True)
            parser_comment(article_id, wall_id)
        else:
            break
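tools.get_info is not defined in this listing; from its use above it appears to wrap re with DOTALL matching, returning all group matches, the first match, or a joined string. A minimal single-pattern stand-in (the real helper also accepts a list of patterns):

import re

def get_info(text, regex, fetch_one=False, split=None):
    # hypothetical re-implementation of the tools.get_info helper used above
    if fetch_one:
        m = re.search(regex, text, re.S)
        return m.group(1) if m else ''
    matches = re.findall(regex, text, re.S)
    return split.join(matches) if split else matches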
Example #2
def save_video_info(release_time='',
                    content='',
                    url='',
                    author='',
                    title='',
                    image_url='',
                    site_name='',
                    play_count=None,
                    comment_count=None,
                    praise_count=None,
                    summary='',
                    time_length=None):
    domain = tools.get_domain(url)
    content_info = {
        'domain': domain,
        'uuid': tools.get_uuid(title, domain),
        'site_name': site_name,
        'image_url': image_url,
        'title': title,
        'author': author,
        'url': url,
        'content': content,
        'release_time': tools.format_date(release_time),
        'play_count': play_count,
        'comment_count': comment_count,
        'praise_count': praise_count,
        'time_length': time_length,
        'record_time': tools.get_current_date(),
        'summary': summary
    }
    log.debug(tools.dumps_json(content_info))

    es.add('video_news', content_info, content_info['uuid'])
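tools.get_uuid(title, domain) is not shown; one plausible stand-in is a deterministic uuid5 over the same inputs, so re-saving the same title/domain pair overwrites rather than duplicates:

import uuid
from urllib.parse import urlparse

url = 'https://example.com/video/123'   # hypothetical input
title = 'some video title'
domain = urlparse(url).netloc           # stand-in for tools.get_domain
doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, title + domain))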
Example #3
        def get_release_time_in_paragraph(paragraph_pos):
            if self._paragraphs:
                while paragraph_pos >= 0:
                    content = self.__replace_str(self._paragraphs[paragraph_pos], '<(.|\n)*?>', '<>')
                    release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one = True)
                    if release_time:
                        return tools.format_date(release_time)

                    paragraph_pos -= 1

            return None
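DAY_TIME_REGEXS is defined elsewhere; assuming it matches dates such as 2018-05-02 10:30:00, the backward scan above can be reproduced standalone:

import re

DAY_TIME_REGEX = r'\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}(?::\d{2})?)?'  # assumed pattern

def find_release_time(paragraphs, pos):
    # walk backwards from pos until a paragraph yields a date
    while pos >= 0:
        text = re.sub(r'<(.|\n)*?>', '<>', paragraphs[pos])
        match = re.search(DAY_TIME_REGEX, text)
        if match:
            return match.group()
        pos -= 1
    return None

print(find_release_time(['<p>2018-05-02 10:30</p>', '<p>body</p>'], 1))  # 2018-05-02 10:30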
Example #4
    def get_release_time_old(self):

        if self._content_start_pos and self._content_end_pos:
            content = self.__replace_str('\n'.join(self._paragraphs[self._content_start_pos - RELEASE_TIME_OFFSET: self._content_end_pos + RELEASE_TIME_OFFSET]), '<(.|\n)*?>', '<>')
        else:
            content = self.__replace_str(self._text, '<(.|\n)*?>', '<>')

        release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one = True)
        if not release_time:
            release_time = tools.get_info(self.__replace_str(self._text, '<(.|\n)*?>', '<>'), DAY_TIME_REGEXS, fetch_one = True)

        release_time = tools.format_date(release_time)

        return release_time
Example #5
def parser_comment(article_id):
    page = 1
    is_continue = True
    while is_continue:
        url = 'https://m.weibo.cn/api/comments/show?id=%s&page=%s' % (
            article_id, page)
        comment_json = tools.get_json_by_requests(url)
        msg = comment_json.get('msg')
        if msg == '暂无数据':  # "no more data" -> stop paging
            break

        comment_datas = comment_json.get('data', {}).get('data', [])
        for comment_data in comment_datas:
            comment_id = comment_data.get('id')
            release_time = comment_data.get('created_at')
            release_time = tools.format_date(release_time)
            come_from = comment_data.get('source')
            content = comment_data.get('text')
            praise_count = comment_data.get('like_counts')
            user_name = comment_data.get('user', {}).get('screen_name')
            head_url = comment_data.get('user', {}).get('profile_image_url')

            emotion = random.randint(0, 2)
            hot_id = comment_id

            log.debug('''
                id:           %s
                release time: %s
                source:       %s
                content:      %s
                like count:   %s
                user name:    %s
                avatar URL:   %s
                ''' % (comment_id, release_time, come_from, content,
                       praise_count, user_name, head_url))

            if not self_base_parser.add_comment(
                    comment_id, None, article_id, user_name, head_url, None,
                    content, praise_count, release_time, emotion, hot_id):
                is_continue = False
                break

        page += 1
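The loop stops once the API answers msg == '暂无数据' ('no more data'). The same pagination, reduced to a self-contained generator over the endpoint used above:

import requests

def iter_comments(article_id):
    # yield weibo comments page by page until the API reports no more data
    page = 1
    while True:
        url = 'https://m.weibo.cn/api/comments/show?id=%s&page=%s' % (article_id, page)
        body = requests.get(url).json()
        if body.get('msg') == '暂无数据':  # "no more data"
            return
        for item in body.get('data', {}).get('data', []):
            yield item
        page += 1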
Example #6
def save_weibo_info(table, site_id='', release_time='', video_url='', user_name='', content='', _id='', url='',
                    reposts_count='', comments_count='', attitudes_count='', is_debug=False):

    if es.get('weibo_article', _id):
        log.debug('%s already exists' % content)
        return False

    content_info = {
        'transmit_count': reposts_count,  # repost count
        'comment_count': comments_count,
        'up_count': attitudes_count,
        'url': url,
        'id': _id,  # int
        'video_url': video_url,
        'content': content,
        'release_time': tools.format_date(release_time),
        'record_time': tools.get_current_date(),
        'user_name': user_name
    }

    log.debug(tools.dumps_json(content_info))
    es.add('weibo_article', content_info, data_id = _id)
    return True
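es here is a project-level wrapper; with the official elasticsearch-py client, the same exists-then-index pattern might look like the sketch below (index name and id come from the snippet; the wrapper's exact semantics are an assumption):

from elasticsearch import Elasticsearch

es = Elasticsearch()

def add_if_new(doc, _id):
    # skip documents already stored under the same id
    if es.exists(index='weibo_article', id=_id):
        return False
    es.index(index='weibo_article', id=_id, body=doc)
    return True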
Example #7
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("&#xD;&#xA", url):
            regex = '.*?(/GovPublicInfo.+?000)'
            new_url = tools.get_info(url, regex)
            new_url = new_url[0]
            new_url = 'http://www.luzhou.gov.cn' + new_url
        else:
            new_url = 'http://www.luzhou.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # extract the article info from the current page
    # title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # article source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # click count
    regexs = '<span>点击数.*?src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.luzhou.gov.cn' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
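The expression x = x and x[0] or '' recurs throughout these parsers: take the first regex match, or an empty string. A clearer equivalent (hypothetical helper, not part of the project):

def first_or_empty(matches):
    # first element of a match list, or '' when nothing matched
    return matches[0] if matches else ''

print(first_or_empty(['2017-05-02']))  # 2017-05-02
print(first_or_empty([]))              # ''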
Example #8
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')

    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(source_url, code=code)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scpolicec.edu.cn' + url
        else:
            new_url = 'http://www.scpolicec.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)


    # extract the article info from the current page
    # title

    regexs = ['<div class="main_title">(.*?)<div class="top_about">', '<h1>(.*?)</h1>', '<title>(.*?)</title>',
              '<div class="contentPageTitle">(.*?)</div>']
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>','<small>时间:</small>(.*?)<small>',
              '<h2><span>更新时间:(.*?)</span>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '</a> 发布时间:(.*?) 点击数'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        release_time = tools.format_date(release_time)


    # author
    regexs = ['作者:(.*?) 【']
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    #author = tools.del_html_tag(author)

    # article source
    regexs = '来源:(.*?)</a>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # click count
    regexs = ['浏览:<font id="hits">(\d*?)</font>次', '点击数:(\d*?)&#xA;发表时间']
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<p style="text-align: center;">(.*?)</table>', '<div class="contentPageContent">(.*?)</table>',
              '<div id="endtext" style="width:900px;">(.*?)<div id="pages"></div>',
              '<div id="articleContnet">(.*?)<div class="page_css">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                    depth               = %s
                    url                 = %s
                    title               = %s
                    release_time        = %s
                    author              = %s
                    origin              = %s
                    watched_count       = %s
                    content             = %s
                 ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id,url=source_url, title=title, release_time=release_time, author=author,
                                origin=origin, watched_count=watched_count, content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '<small>时间:</small>(.*?)<small>'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        if not release_time:
            regexs = '</a> 发布时间:(.*?) 点击数'
            release_time = tools.get_info(html, regexs)
            release_time = release_time and release_time[0] or ''
            release_time = tools.format_date(release_time)
    # author
    regexs = ['作者:(.*?) 【', '作者:(.*?) 来源']
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # # article source
    # regexs = '来源:(.*?)</a>'
    # origin = tools.get_info(html, regexs)
    # origin = origin and origin[0] or ''
    # origin = tools.del_html_tag(origin)
    #
    # # click count
    regexs = ['浏览:<font id="hits">(\d*?)</font>次', '点击数:(\d*?)&#xA;发表时间']
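The double fetch at the top of this example (request with a guessed gb2312 encoding, sniff the charset= declaration, request again) can be collapsed into one request by re-decoding the response, e.g. with requests:

import re
import requests

resp = requests.get('http://www.scpolicec.edu.cn/')       # URL taken from the snippet
match = re.search(r'charset=["\']?([\w-]+)', resp.text, re.I)
resp.encoding = match.group(1) if match else 'gb2312'      # fall back to the initial guess
html = resp.text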
Example #10
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    user_id = url_info['remark']['user_id']
    head_url = url_info['remark']['head_url']
    user_name = url_info['remark']['user_name']
    gender = url_info['remark']['gender']
    program_id = url_info['remark']['program_id']

    page_count = 50
    is_continue = True

    for i in range(0, page_count + 1):
        if not is_continue: break

        weibo_content_url = root_url + '&page=%d' % i

        headers = {
            "Cache-Control":
            "max-age=0",
            "Cookie":
            "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language":
            "zh-CN,zh;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host":
            "m.weibo.cn",
            "Accept-Encoding":
            "gzip, deflate, br",
            "Upgrade-Insecure-Requests":
            "1",
            "Connection":
            "keep-alive",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        html = tools.get_json_by_requests(weibo_content_url, headers=headers)

        cards = tools.get_json_value(html, 'data.cards')
        if len(cards) < 2:
            base_parser.update_url('mms_urls', root_url, Constance.DONE)
            return

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')
            article_id = tools.get_json_value(mblog, 'id')
            article_url = 'https://m.weibo.cn/status/' + article_id

            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie":
                "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            origin_html, r = tools.get_html_by_requests(url, headers=headers)
            if not origin_html:
                continue

            # getting the time precise to the second requires fetching article_url
            release_time = mblog['created_at']
            release_time = tools.format_time(release_time)
            # release_time = get_release_time(mblog)
            release_time = tools.format_date(release_time)

            come_from = tools.get_json_value(mblog, 'source')
            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"', '').replace('\\n', '')
            if image_url:
                image_url = image_url.split(',')
                # use a distinct loop variable: `i` is the outer page counter
                for j in range(len(image_url)):
                    image_url[j] = 'http://wx2.sinaimg.cn/large/' + image_url[j] + '.jpg'

                image_url = ','.join(image_url)

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))
            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')
            comments_count = tools.get_json_value(mblog, 'comments_count')

            log.debug('''
                original URL:  %s
                blogger id:    %s
                article id:    %s
                release time:  %s
                source:        %s
                content:       %s
                image URLs:    %s
                video URL:     %s
                comment count: %s
                repost count:  %s
                like count:    %s
                ''' % (article_url, user_id, article_id, release_time,
                       come_from, content, image_url, video_url,
                       comments_count, transpond_count, praise_count))

            if self_base_parser.add_article(article_id,
                                            head_url,
                                            user_name,
                                            release_time,
                                            None,
                                            content,
                                            image_url,
                                            None,
                                            praise_count,
                                            comments_count,
                                            program_id=program_id,
                                            gender=gender,
                                            url=article_url,
                                            info_type=1,
                                            emotion=random.randint(0, 2),
                                            collect=0,
                                            source='新浪微博'):

                if comments_count > 0:
                    parser_comment(article_id)
            else:
                is_continue = False
                break

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
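Inferred from the fields this parser reads, the url_info record it expects looks roughly like this (all values are placeholders):

url_info = {
    '_id': '507f1f77bcf86cd799439011',   # hypothetical Mongo ObjectId string
    'url': 'https://m.weibo.cn/api/container/getIndex?containerid=1076031560851875',  # hypothetical
    'remark': {
        'user_id': '1560851875',
        'head_url': 'http://tva1.sinaimg.cn/crop.jpg',
        'user_name': 'example_user',
        'gender': 1,
        'program_id': '42',
    },
}
parser(url_info)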
Example #11
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.xuyong.gov.cn'+url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # extract the article info from the current page
    # title

    regexs = '<td  class="titlestyle1037"  align="center">(.*?)</td></tr>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # time
    regexs = '<span  class="timestyle1037" >(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # # author
    # regexs = '<span>作者:(.*?)</span>'
    # author = tools.get_info(html, regexs)
    # author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # article source
    # regexs = '采编: (.*?)阅读'
    # origin = tools.get_info(html, regexs)
    # origin = origin and origin[0] or ''
    # origin = tools.del_html_tag(origin)

    # # click count
    # regexs = '阅读:(\d*?)次'
    # watched_count = tools.get_info(html, regexs)
    # watched_count = watched_count and watched_count[0] or ''
    # watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<tr><td  class="contentstyle1037" >(.*?) <tr><td  class="pagestyle1037"  align="left">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                content             = %s
             ''' % (depth+1, source_url, title, release_time,  content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title,
                                release_time=release_time, content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Example #12
    def get_release_time(self):
        content = self.__del_html_tag(self._text)
        release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one = True)
        release_time = tools.format_date(release_time)

        return release_time
Example #13
    def get_release_time(self):
        content = self.__replace_str(self._text, '<(.|\n)*?>', '<>')
        release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one=True)
        release_time = tools.format_date(release_time)

        return release_time
Example #14
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.jiangyang.gov.cn/template/default/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # extract the article info from the current page
    # title

    regexs = '<div class="tit">(.*?)</div>'
    title = tools.get_info(html, regexs)
    if not title:
        regexs = '<h1>(.*?)</h1>'
        title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # time
    regexs = '<label>(.*?)</label>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if release_time:
        release_time = tools.format_date(release_time)
    if not release_time:
        regexs = '<span class="time">发布时间:(.*?)</span><span class="source"></span></p>'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        #release_time = tools.format_date(release_time)

    # article source
    regexs = '<label>来源:(.*?)</label>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # content
    regexs = ['<div class="content" id="nr" style="">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)
    if not content:
        regexs = '<p style="text-align: center;"(.*?)</div>.*?<div class="content">'
        content = tools.get_info(html, regexs)
        content = content and content[0] or ''
        content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                content             = %s
             ''' %
              (depth + 1, source_url, title, release_time, origin, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Example #15
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("&#xD;&#xA", url):
            regex = '.*?(/Survey.+?html)'
            new_url = tools.get_info(url, regex)
            if not new_url:
                continue  # a failed match would otherwise reuse the previous new_url
            new_url = new_url[0]
            new_url = 'http://www.longmatan.gov.cn' + new_url
        else:
            new_url = 'http://www.longmatan.gov.cn' + url

        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # extract the article info from the current page
    # title

    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # author
    regexs = '<span>作者:(.*?)</span>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # article source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # click count
    regexs = '<span>点击数:(\d*?)<span'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, author, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
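These op parsers build absolute links by concatenating against a hard-coded host; urllib.parse.urljoin from the standard library covers the same cases (absolute URLs pass through, paths are resolved against the base):

from urllib.parse import urljoin

base = 'http://www.longmatan.gov.cn/some/page.html'
print(urljoin(base, 'http://other.example/x'))  # absolute URL kept as-is
print(urljoin(base, '/Survey/2017/abc.html'))   # root-relative path
print(urljoin(base, 'detail.html'))             # relative path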
Example #16
def save_info(table,
              site_id,
              site_name='',
              url='',
              title='',
              content='',
              release_time='',
              image_url='',
              video_url='',
              is_out_link=1,
              download_image=False,
              is_debug=False,
              es_read_status='',
              info_type=''):
    # global num
    # if num<2000:
    #     num+=1
    #     image_recogs=image_recog(image_url)
    # else:
    #     image_recogs=5

    if not download_image:
        sexy_image_url = image_url
        local_image_path = ''
    else:
        file_local_path = tools.get_conf_value('config.conf', 'files',
                                               'zhejiang_app_save_path')
        if image_url:
            img_name = 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            tools.download_file(image_url, file_local_path, img_name)
            local_image_path = file_local_path + img_name
            sexy_image_url = local_image_path
        else:
            local_image_path = ''
            sexy_image_url = ''

    if len(content) > 400:
        temporary_content = content[0:400]
    else:
        temporary_content = content

    # record_time = tools.get_current_date()
    # release_time = tools.format_date(release_time)
    try:
        release_time = tools.format_date(release_time)
    except Exception as e:
        log.debug(e, release_time, url)
    record_time = tools.get_current_date()
    if release_time > record_time:  # drop records dated in the future
        return
    content_info = {
        'site_name': site_name,
        'video_url': video_url,
        'image_url': image_url,
        'temporary_content': temporary_content,
        'title': title,
        # 'video_local_path': local_video_path,\
        'img_stor_path': local_image_path,
        'release_time': release_time,
        'is_out_link': is_out_link,
        'url': url,
        'es_read_status': 0,
        'site_id': site_id,
        'read_status': 0,
        'record_time': record_time,
        # 'sexy_image_url': sexy_image_url, 'sexy_image_status': '', 'image_pron_status': image_recogs
    }
    # swap the 400-character preview out for the full content before saving
    content_info.pop('temporary_content')
    content_info['content'] = content
    if db.add(table, content_info):
        log.debug(content_info)
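The future-date guard above compares formatted strings; that ordering is chronologically correct as long as both sides use the same zero-padded layout, which tools.format_date and tools.get_current_date presumably emit ('%Y-%m-%d %H:%M:%S' assumed):

import datetime

record_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
release_time = '2030-01-01 00:00:00'   # hypothetical future-dated record
if release_time > record_time:         # string compare == date compare for this layout
    print('skipped: release_time is in the future')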