Example #1
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset')

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return

    for header in headers:
        # "查看更多相关新闻" ("view more related news") link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one = True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth = 1, remark = {'offset':0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain = ''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
                '''%(uuid, title, author, release_time, website_domain, url, '...'))

            # Store in the database
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time, website_name, website_domain, website_position, url, content)

                if not is_continue:
                    break
    else:
        # The loop finished normally: every article on this page was stored, so queue the next page
        offset += 50
        url = tools.replace_str(root_url, 'pn=\d*', 'pn=%d'%offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth = 0, remark = {'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
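The pagination step rewrites the pn= offset of the Baidu result URL via tools.replace_str. A minimal standalone sketch of that rewrite, assuming tools.replace_str is a plain regex substitution (the URL is a made-up example):

import re

root_url = 'http://news.baidu.com/ns?word=spider&pn=0&cl=2'  # hypothetical result URL
offset = 50

# Assumed equivalent of tools.replace_str(root_url, 'pn=\d*', 'pn=%d' % offset)
next_url = re.sub(r'pn=\d*', 'pn=%d' % offset, root_url)
print(next_url)  # http://news.baidu.com/ns?word=spider&pn=50&cl=2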
Example #2
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)
    # Extract the article info from the current page
    # Title
    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.replace_str(title, '&.*?;')
    # Content
    regexs = [
        '<div id="end_text".*?>(.*?)<div class="post_btmshare">',
        '<div class="post_text".*?>(.*?)<div class="post_btmshare">'
    ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
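A standalone sketch of the title/content extraction above, assuming tools.get_info runs the regex with re.S and tools.replace_str/tools.del_html_tag are regex deletions (the HTML snippet is made up):

import re

html = '<h1>Example&nbsp;title</h1><div id="end_text">Body text<div class="post_btmshare">'  # made-up snippet

title = re.findall('<h1>(.*?)</h1>', html, re.S)
title = title[0] if title else ''
title = re.sub('&.*?;', '', title)  # strip HTML entities, as tools.replace_str is assumed to do

content = re.findall('<div id="end_text".*?>(.*?)<div class="post_btmshare">', html, re.S)
content = content[0] if content else ''
content = re.sub('<.*?>', '', content)  # rough stand-in for tools.del_html_tag
print(title, '|', content)  # Exampletitle | Body text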
Example #3
    def get_article_content(self, data, req_url):
        log.debug('Fetching article content')

        if data:  # Articles flagged as unverified return no content on the first request; they redirect to https://mp.weixin.qq.com/mp/rumor
            req_url = req_url.replace('amp;', '')
            mid = tools.get_param(req_url, 'mid') or tools.get_param(
                req_url, 'appmsgid')  # Message id; messages published on the same day share the same mid
            idx = tools.get_param(req_url, 'idx') or tools.get_param(
                req_url, 'itemidx')  # Position of the message within that day's batch, starting from 1
            article_id = mid + idx  # mid and idx concatenated identify a unique article, e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601
            WechatAction._current_aritcle_id = article_id  # Remember the current article id so its comment data can be matched later
            print('current article id ' + WechatAction._current_aritcle_id)
            regex = '(<div class="rich_media_content ".*?)<script nonce'
            content = tools.get_info(data, regex, fetch_one=True)
            if content:
                # Cache the article content
                WechatAction._article_info[article_id]['content'] = content
                # Extract the official-account name
                regex = '<title>(.*?)</title>'
                account = tools.get_info(data, regex, fetch_one=True)
                WechatAction._article_info[article_id]['account'] = account

            else:  # Articles flagged as false never request view/like counts, so store them right away
                regex = '<title>(.*?)</title>'
                content = tools.get_info(data, regex, fetch_one=True)
                WechatAction._article_info[article_id]['content'] = content

                # Store in the database
                print('Article flagged as false; view/like counts will not be requested, storing it directly')
                WechatAction._wechat_service.add_article_info(
                    WechatAction._article_info.pop(article_id))

            # If the next URL is the article-list link, refresh its appmsg_token so the list link does not expire
            if (len(WechatAction._todo_urls)
                    == 1) and ('/mp/profile_ext'
                               in WechatAction._todo_urls[-1]):
                regex = 'appmsg_token = "(.*?)"'
                appmsg_token = tools.get_info(data, regex,
                                              fetch_one=True).strip()

                WechatAction._todo_urls[-1] = tools.replace_str(
                    WechatAction._todo_urls[-1], 'appmsg_token=.*?&',
                    'appmsg_token=%s&' % appmsg_token)

            return self.__open_next_page()

        else:
            # No article content
            pass
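The article_id above is the concatenation of the mid and idx query parameters. A sketch of that step with the standard library, assuming tools.get_param simply reads a query parameter (the URL is made up):

from urllib.parse import urlparse, parse_qs

def get_param(url, key):
    # Assumed behaviour of tools.get_param: return the query parameter or ''
    return parse_qs(urlparse(url).query).get(key, [''])[0]

req_url = 'https://mp.weixin.qq.com/s?__biz=xyz&mid=2650492260&idx=1&sn=abc'  # hypothetical
article_id = (get_param(req_url, 'mid') or get_param(req_url, 'appmsgid')) + \
             (get_param(req_url, 'idx') or get_param(req_url, 'itemidx'))
print(article_id)  # 26504922601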
    def del_not_use_tag(content):
        content = tools.replace_str(content, '<script(.|\n)*?</script>')  # drop <script> blocks
        content = tools.replace_str(content, '<style(.|\n)*?</style>')  # drop <style> blocks
        content = tools.replace_str(content, '<!--(.|\n)*?-->')  # drop HTML comments
        content = content.replace('</p>', '/p')  # protect closing </p> tags
        content = tools.replace_str(content, '<[^p].*?>')  # strip every tag that does not start with 'p'
        content = content.replace('/p', '</p>')  # restore the protected </p> tags
        content = tools.replace_str(content, '&.*?;')  # drop HTML entities
        content = tools.replace_str(content, '[ \f\r\t\v]')  # drop whitespace except newlines

        return content
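A self-contained version of the same cleanup using re directly, assuming tools.replace_str(s, regex) deletes every match (the sample HTML is made up):

import re

def del_not_use_tag_sketch(content):
    content = re.sub('<script(.|\n)*?</script>', '', content)
    content = re.sub('<style(.|\n)*?</style>', '', content)
    content = re.sub('<!--(.|\n)*?-->', '', content)
    content = content.replace('</p>', '/p')       # protect closing </p>
    content = re.sub('<[^p].*?>', '', content)    # strip tags that do not start with 'p'
    content = content.replace('/p', '</p>')       # restore </p>
    content = re.sub('&.*?;', '', content)        # drop HTML entities
    content = re.sub('[ \f\r\t\v]', '', content)  # drop whitespace except newlines
    return content

print(del_not_use_tag_sketch('<div><p>Hello &nbsp; world</p><script>var a=1;</script></div>'))  # <p>Helloworld</p>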
Example #5
def format_keys(keywords):
    '''
    @summary: '&' means AND, '|' means OR; a parenthesized group is treated as one unit
    ---------
    @param keywords:
    ---------
    @result:
    '''

    keywords = keywords.replace('（', '(')  # normalize full-width parentheses
    keywords = keywords.replace('）', ')')
    keywords = keywords.replace(')(', ')&(')  # adjacent groups imply AND
    print(keywords)

    chinese_word = tools.get_chinese_word(keywords)
    keywords = keywords.split(',')
    for i in range(len(keywords)):
        keywords[i] = keywords[i].strip()
        # print('--------------------------')
        # print(keywords[i])
        # chinese_word = tools.get_chinese_word(keywords[i])
        regex = '[a-zA-Z 0-9:]+'
        english_words = tools.get_info(keywords[i], regex, allow_repeat=True)
        while ' ' in english_words:
            english_words.remove(' ')
        # print(english_words )
        print('=========================')
        for j in range(len(english_words)):
            english_words[j] = english_words[j].strip()
            if english_words[j]:
                keywords[i] = keywords[i].replace(english_words[j], '%s')

        keywords[i] = tools.replace_str(keywords[i], ' +', '&')
        print(keywords[i])
        print(english_words)
        if '%s' in keywords[i]:
            keywords[i] = keywords[i] % tuple(english_words)

    keywords = ')|('.join(keywords)
    if keywords and not keywords.startswith('('):
        keywords = '(' + keywords + ')'

    return keywords
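A standalone sketch of the same transformation (assuming tools.get_info and tools.replace_str behave like re.findall and re.sub), showing the intended output for a made-up keyword string:

import re

def format_keys_sketch(keywords):
    groups = []
    for group in keywords.split(','):
        group = group.strip()
        # Protect English phrases so their inner spaces are not turned into '&'
        english = [w.strip() for w in re.findall('[a-zA-Z 0-9:]+', group) if w.strip()]
        for word in english:
            group = group.replace(word, '%s')
        group = re.sub(' +', '&', group)   # remaining spaces mean AND
        if '%s' in group:
            group = group % tuple(english)
        groups.append(group)
    return '(' + ')|('.join(groups) + ')'  # comma-separated groups mean OR

print(format_keys_sketch('手机 电池, laptop battery'))  # (手机&电池)|(laptop battery)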
Example #6
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url, code='GB2312')
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Collect every url on the current page
    urls = tools.get_urls(html, STOP_URLS)

    # Filter out external links and add the rest to the database
    fit_url = tools.fit_url(urls, "people.com.cn")
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.replace_str(title, '&.*?;')
    # Content
    regexs = [
        'div class="box_pic"></div>(.*?)<div class="box_pic"></div>',
        '<div class="content clear clearfix">(.*?)<div class="edit clearfix">',
        '<div class="show_text">(.*?)<div class="edit">'
    ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''

    content = tools.del_html_tag(content)

    log.debug('''
                    depth     =  %d
                    source_url = %s
                    title     =  %s
                    content   =  %s
                 ''' % (depth, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
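people.com.cn pages are GB-encoded, hence code='GB2312' above. A minimal sketch of what tools.get_html_by_urllib is assumed to do, with a hypothetical entry URL:

from urllib import request

def get_html(url, code='GB2312'):
    # Fetch the page and decode it; return None on failure so callers can mark
    # the URL as EXCEPTION. (GB18030, a superset of GB2312, is more forgiving.)
    try:
        with request.urlopen(url, timeout=10) as response:
            return response.read().decode(code, errors='replace')
    except Exception:
        return None

html = get_html('http://politics.people.com.cn/')  # hypothetical entry URL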