Python Document Examples

Programming Language: Python

Namespace/Package Name: lib.readability.readability

Method/Function: Document

Examples at hotexamples.com: 7

Python Document - 7 examples found. These are the top-rated real-world Python examples of lib.readability.readability.Document, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

netloc = urlparse.urlsplit(url)[1]

print netloc
r.set(netloc,encoding)

print r.get(netloc)
'''
# Scratch script (Python 2): fetch one page, dump the raw bytes to disk and
# run readability / BeautifulSoup over the response.
# Alternative test URLs, kept for manual switching:
#url='http://tech.sina.com.cn/internet/'
#url='http://tech.sina.com.cn/i/2014-01-08/08039077686.shtml'
#url='http://blog.knownsec.com/2012/04/about-content-encoding-gzip/'
url = 'http://book.douban.com/review/6549990/'
zzh = URLOpener()
# NOTE(review): the name `re` would shadow the stdlib `re` module if that
# module is imported by this file -- confirm against the (unseen) imports.
re = zzh.open(url)
#print re.info()
#print re.content.decode('GBK').encode('utf-8')
#print re.content
# Save the raw response body so it can be inspected offline.
fout = open('zhang_test', 'wb')
fout.write(re.content)
fout.close()
# The string literal below is disabled debugging code (charset detection via
# chardet); as a bare string expression it has no effect at runtime.
'''
encoding = chardet.detect(re.content)['encoding']
print encoding
print re.headers
print isinstance(re.content,unicode)
print re.content.decode(encoding,'ignore').encode('utf-8')
'''
doc = readability.Document(re.content)
# `summary` is computed here but not used in the lines visible below.
summary = doc.summary(html_partial=True)
# The soup is built from the raw response, not from the readability summary.
soup = BeautifulSoup(re.content, 'lxml')
print soup.body.contents[0]

Example #2

Show file

    def readability(self, article, url, opts=None, user=None):
        """Extract the main article from a page with readability-lxml.

        This is a generator: image payloads are large, so every item is
        handed to the caller as soon as it is ready instead of being
        collected in memory.

        Yields:
            * ``(mime, image_url, file_name, image_bytes, None)`` for every
              image that was fetched and recognised (only when
              ``self.keep_image`` is true);
            * finally ``(title, None, None, html, brief)`` for the article.

        Nothing is yielded when readability cannot process the page.

        Args:
            article: raw page content; passed through ``self.preprocess``.
            url: page URL, used for logging, for resolving relative image
                URLs and for building the share links.
            opts: forwarded unchanged to ``self.process_image``.
            user: optional user object; when given, "save to Evernote/Wiz"
                links are appended according to its settings.
        """
        content = self.preprocess(article)

        # Extract the main content.
        try:
            doc = readability.Document(content)
            summary = doc.summary(html_partial=False)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            self.log.warn('article is invalid.[%s]' % url)
            return

        title = self.processtitle(doc.short_title())

        # Only the article body is left at this point, so parsing it with
        # BeautifulSoup is not a performance concern.
        soup = BeautifulSoup(summary, "lxml")
        head = soup.find('head')
        if not head:
            head = soup.new_tag('head')
            soup.html.insert(0, head)
        # Guarantee a <title> even when a <head> already existed without one.
        if not head.find('title'):
            title_tag = soup.new_tag('title')
            title_tag.string = title
            head.append(title_tag)

        # Add a heading when the article has none, or when the first h1/h2
        # is preceded by more than 40 characters of sibling text (it then
        # sits in the middle of the article and is not the article title).
        heading = soup.html.body.find(['h1', 'h2'])
        needs_heading = not heading
        if heading:
            preceding_len = 0
            for sibling in heading.previous_siblings:
                preceding_len += len(string_of_tag(sibling))
                if preceding_len > 40:
                    needs_heading = True
                    break
        if needs_heading:
            heading = soup.new_tag('h1')
            heading.string = title
            soup.html.body.insert(0, heading)

        self.soupbeforeimage(soup)

        # Strip configured tags / ids / classes / attributes and comments.
        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for tag_id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": tag_id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered : %s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content,
                    opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype
                                              == 'jpeg' else imgtype)
                        # Point the tag at the local file name that is
                        # yielded together with the image bytes.
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        # Payload is not a recognised image format.
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.status_code, imgurl))
                    img.decompose()

            # Unwrap images from the links that enclose them.
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)

        # Append the share links.
        if user:
            share_targets = (
                (user.evernote and user.evernote_mail, 'evernote',
                 SAVE_TO_EVERNOTE),
                (user.wiz and user.wiz_mail, 'wiz', SAVE_TO_WIZ),
            )
            for enabled, act, label in share_targets:
                if not enabled:
                    continue
                spacer = soup.new_tag('span')
                spacer.string = '    '
                soup.html.body.append(spacer)
                href = "%s/share?act=%s&u=%s&url=%s" % (DOMAIN, act,
                                                        user.name, url)
                if user.share_fuckgfw:
                    href = SHARE_FUCK_GFW_SRV % urllib.quote(href)
                ashare = soup.new_tag('a', href=href)
                ashare.string = label
                soup.html.body.append(ashare)

        # Serialise before the brief is built: building the brief removes
        # the headings from the tree.
        content = unicode(soup)

        # Use the beginning of the article text as its brief/description.
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            # Drop h1/h2 so the brief does not repeat the title.
            for h in body.find_all(['h1', 'h2']):
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        # Drop the reference to the parse tree before handing out the result.
        soup = None

        yield (title, None, None, content, brief)

Example #3

Show file

    def readability(self, article, url, opts=None, user=None):
        """Extract the main article from a page with readability-lxml.

        This is a generator: image payloads are large, so every item is
        handed to the caller as soon as it is ready instead of being
        collected in memory.

        Yields:
            * ``(mime, image_url, file_name, image_bytes, None, flag)`` for
              every usable image; ``flag`` is ``True`` for the first image
              (used as the thumbnail) and ``None`` otherwise;
            * finally ``(title, None, None, html, brief, thumbnail)`` where
              ``thumbnail`` is the URL of the first image or ``None``.

        If extraction raises and the content itself is an image, the image
        tuple and a minimal HTML wrapper page are yielded instead. Nothing
        is yielded for empty content, for articles without a title, or when
        neither extractor produces a ``<body>``.

        Args:
            article: raw page content; passed through ``self.preprocess``.
            url: page URL, used for logging and for resolving image URLs.
            opts: forwarded unchanged to ``self.process_image``.
            user: optional user object; when given, share links are added
                through ``self.AppendShareLinksToArticle``.
        """
        content = self.preprocess(article)
        if not content:
            return

        # Extract the main content.
        try:
            doc = readability.Document(
                content, positive_keywords=self.positive_classes)
            summary = doc.summary(html_partial=False)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            # Extraction failed: the "article" may be a bare image that is
            # not wrapped in any HTML.
            imgtype = imghdr.what(None, content)
            if imgtype:
                # Wrap the image in a minimal HTML container page.
                imgmime = r"image/" + imgtype
                fnimg = "img%d.%s" % (
                    self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                yield (imgmime, url, fnimg, content, None, None)
                tmphtml = '<html><head><title>Picture</title></head><body><img src="%s" /></body></html>' % fnimg
                yield ('Picture', None, None, tmphtml, '', None)
            else:
                self.log.warn('article is invalid.[%s]' % url)
            return

        title = doc.short_title()
        if not title:
            self.log.warn('article has no title.[%s]' % url)
            return

        title = self.processtitle(title)

        soup = BeautifulSoup(summary, "lxml")

        # If readability produced nothing, switch to the fallback extractor
        # (lower quality, but it works on any page).
        body = soup.find('body')
        head = soup.find('head')
        # ``body`` is None when the parsed summary has no <body> at all;
        # treat that like an empty body instead of raising AttributeError.
        if body is None or len(body.contents) == 0:
            from simpleextract import simple_extract
            summary = simple_extract(content)
            soup = BeautifulSoup(summary, "lxml")
            body = soup.find('body')
            if not body:
                self.log.warn('extract article content failed.[%s]' % url)
                return

            head = soup.find('head')
            # Tell the reader that the fallback algorithm was used.
            info = soup.new_tag(
                'p', style='color:#555555;font-size:60%;text-align:right;')
            info.string = 'extracted by alternative algorithm.'
            body.append(info)

            self.log.info('use alternative algorithm to extract content.')

        if not head:
            head = soup.new_tag('head')
            soup.html.insert(0, head)

        if not head.find('title'):
            t = soup.new_tag('title')
            t.string = title
            head.append(t)

        # Add a heading when the article has none, or when the first h1/h2
        # is preceded by more than 40 characters of sibling text (it then
        # sits in the middle of the article and is not the article title).
        t = body.find(['h1', 'h2'])
        if not t:
            t = soup.new_tag('h2')
            t.string = title
            body.insert(0, t)
        else:
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:
                    t = soup.new_tag('h2')
                    t.string = title
                    body.insert(0, t)
                    break

        # Strip configured tags / ids / classes / attributes and comments.
        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for tag_id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": tag_id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        # Remove every attribute of <body> so that InsertToc can match a
        # plain "<body>" with its regular expression.
        # Copy the names first: the dict is modified while deleting.
        for attr in list(body.attrs):
            del body[attr]

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        self.soupbeforeimage(soup)

        has_imgs = False
        thumbnail = None

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img'):
                # Many sites lazy-load images and keep the real URL in an
                # attribute such as data-src. This only helps when that
                # attribute really holds the URL.
                imgurl = img['src'] if 'src' in img.attrs else ''
                if not imgurl:
                    for attr in img.attrs:
                        if attr != 'src' and 'src' in attr:
                            imgurl = img[attr]
                            break
                if not imgurl:
                    img.decompose()
                    continue
                # data: URLs skip URL resolution and filtering.
                if not imgurl.startswith('data:'):
                    if not imgurl.startswith('http'):
                        imgurl = self.urljoin(url, imgurl)
                    if self.fetch_img_via_ssl and url.startswith('https://'):
                        imgurl = imgurl.replace('http://', 'https://')
                    if self.isfiltered(imgurl):
                        self.log.warn('img filtered : %s' % imgurl)
                        img.decompose()
                        continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content,
                    opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    # Discard images below the configured minimum size.
                    if len(imgcontent) < self.img_min_size:
                        img.decompose()
                        continue

                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (
                            self.imgindex,
                            'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg

                        # The first image doubles as the TOC thumbnail.
                        if not has_imgs:
                            has_imgs = True
                            thumbnail = imgurl
                            yield (imgmime, imgurl, fnimg, imgcontent, None,
                                   True)
                        else:
                            yield (imgmime, imgurl, fnimg, imgcontent, None,
                                   None)
                    else:
                        # Payload is not a recognised image format.
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.status_code, imgurl))
                    img.decompose()

            # Unwrap images from enclosing links so that an accidental tap
            # does not open the browser.
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        # Convert HTML5 structural tags to plain <div>.
        for x in soup.find_all(['article', 'aside', 'header', 'footer', 'nav',
            'figcaption', 'figure', 'section', 'time']):
            x.name = 'div'

        self.soupprocessex(soup)

        # Append the share links.
        if user:
            self.AppendShareLinksToArticle(soup, user, url)

        # Serialise before the brief is built: building the brief removes
        # the headings from the tree.
        content = unicode(soup)

        # Use the beginning of the article text as its brief/description.
        brief = u''
        if GENERATE_TOC_DESC:
            # Drop h1/h2 so the brief does not repeat the title.
            for h in body.find_all(['h1', 'h2']):
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        # Drop the reference to the parse tree before handing out the result.
        soup = None

        yield (title, None, None, content, brief, thumbnail)

Example #4

Show file

    def readability(self, article, url, opts=None):
        """Extract the main article from a page with readability-lxml.

        This is a generator.

        Yields:
            * ``(mime, image_url, file_name, image_bytes, None)`` for every
              image that was fetched and recognised (only when
              ``self.keep_image`` is true);
            * finally ``(title, None, None, html, brief)`` where ``html`` is
              the serialised first child of ``<body>`` only, not the whole
              document.

        The final tuple is not produced when readability cannot process the
        page or when the cleaned-up document has an empty body.

        Args:
            article: raw page content; passed through ``self.preprocess``.
            url: page URL, used for logging and for resolving image URLs.
            opts: forwarded unchanged to ``self.process_image``.
        """
        content = self.preprocess(article)

        # Extract the main content.
        try:
            doc = readability.Document(content)
            summary = doc.summary(html_partial=True)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            self.log.warn('article is invalid.[%s]' % url)
            return

        title = self.processtitle(doc.short_title())

        soup = BeautifulSoup(summary, 'lxml')
        self.soupbeforeimage(soup)

        # Strip configured tags / ids / classes / attributes and comments.
        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for tag_id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": tag_id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            # The summary is requested with html_partial=True, so the parsed
            # tree may not contain a <head>; create one on demand instead of
            # failing on ``soup.html.head`` being None.
            head = soup.find('head')
            if not head:
                head = soup.new_tag('head')
                soup.html.insert(0, head)
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            head.append(sty)

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                # Images declared 1-5 px high or wide are dropped unfetched.
                if img.get('height') in ('1','2','3','4','5') \
                 or img.get('width') in ('1','2','3','4','5'):
                    self.log.warn('img size too small,take it away : %s' %
                                  imgurl)
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered : %s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                # NOTE(review): this variant reads ``imgresult.code``; confirm
                # that the URLOpener in use exposes ``code`` on its result.
                imgcontent = self.process_image(
                    imgresult.content, opts) if imgresult.code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype
                                              == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        # Payload is not a recognised image format.
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.code, imgurl))
                    img.decompose()
            # Unwrap images from the links that enclose them.
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                 img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)

        # Only the first child of <body> is serialised. Guard against a
        # missing or empty body instead of raising IndexError/AttributeError.
        body = soup.find('body')
        if body is None or not body.contents:
            self.log.warn('extract article content failed.[%s]' % url)
            return
        content = unicode(body.contents[0])

        # Use the beginning of the article text as its brief/description.
        brief = u''
        if GENERATE_TOC_DESC:
            # Drop h1/h2 so the brief does not repeat the title.
            for h in body.find_all(['h1', 'h2']):
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break

        # Drop the reference to the parse tree before handing out the result.
        soup = None
        yield (title, None, None, content, brief)

Example #5

Show file

File: base.py Project: jackyueq/KindleEar

    def readability(self, article, url, opts=None):
        """Extract the main article from a page with readability-lxml.

        Generator: yields ``(mime, image_url, file_name, image_bytes, None)``
        for each image that was fetched and recognised (only when
        ``self.keep_image`` is true), then one
        ``(title, None, None, html, brief)`` tuple for the article.
        Items are yielded one by one because image data is memory-hungry.

        Errors raised by the readability extractor are not caught here.
        """
        page = self.preprocess(article)

        # Extract the main content and wrap the fragment in a full document.
        extractor = readability.Document(page)
        fragment = extractor.summary(html_partial=True)
        heading = self.processtitle(extractor.short_title())
        xhtml = self.FragToXhtml(fragment, heading, addtitleinbody=True)

        # Only the article body remains, so BeautifulSoup is cheap here.
        soup = BeautifulSoup(xhtml, "lxml")
        self.soupbeforeimage(soup)

        # Drop all id/class attributes and every HTML comment.
        for attr_name in ('id', 'class'):
            for node in soup.find_all(attrs={attr_name: True}):
                del node[attr_name]
        for comment in soup.find_all(
                text=lambda node: isinstance(node, Comment)):
            comment.extract()

        if not self.keep_image:
            for img in soup.find_all('img'):
                img.decompose()
        else:
            tiny_sizes = ('1', '2', '3', '4', '5')
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                src = img['src']

                # Images declared 1-5 px high or wide are dropped unfetched.
                if (img.get('height') in tiny_sizes
                        or img.get('width') in tiny_sizes):
                    self.log.warn('img size too small,take away it:%s' % src)
                    img.decompose()
                    continue

                if not src.startswith('http'):
                    src = urlparse.urljoin(url, src)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    src = src.replace('http://', 'https://')

                if self.isfiltered(src):
                    self.log.warn('img filtered:%s' % src)
                    img.decompose()
                    continue

                response = opener.open(src)
                if response.status_code == 200:
                    payload = self.process_image(response.content, opts)
                else:
                    payload = None

                if not payload:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (response.status_code, src))
                    img.decompose()
                    continue

                kind = imghdr.what(None, payload)
                if not kind:
                    # Payload is not a recognised image format.
                    img.decompose()
                    continue

                mime = r"image/" + kind
                number = random.randint(10000, 99999999)
                extension = 'jpg' if kind == 'jpeg' else kind
                filename = "%d.%s" % (number, extension)
                img['src'] = filename
                yield (mime, src, filename, payload, None)

        self.soupprocessex(soup)
        # Serialise first: building the brief below removes <h1> elements.
        content = unicode(soup)

        # The beginning of the article text serves as its brief.
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            # Remove <h1> so the brief does not repeat the title.
            for h1 in body.find_all('h1'):
                h1.decompose()
            for piece in body.stripped_strings:
                brief += unicode(piece) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (heading, None, None, content, brief)

Example #6

Show file

    def readability(self, article, url, opts=None):
        """Extract the main article from a page with readability-lxml.

        Generator: yields ``(mime, image_url, file_name, image_bytes, None)``
        for each image that was fetched and recognised (only when
        ``self.keep_image`` is true), then one
        ``(title, None, None, html, brief)`` tuple for the article.
        Items are yielded one by one because image data is memory-hungry.

        Errors raised by the readability extractor are not caught here.
        """
        page = self.preprocess(article)

        # Extract the main content.
        extractor = readability.Document(page)
        extracted = extractor.summary(html_partial=False)
        heading_text = self.processtitle(extractor.short_title())

        # Only the article body remains, so BeautifulSoup is cheap here.
        soup = BeautifulSoup(extracted, "lxml")
        if not soup.find('head'):
            head_tag = soup.new_tag('head')
            title_tag = soup.new_tag('title')
            title_tag.string = heading_text
            head_tag.append(title_tag)
            soup.html.insert(0, head_tag)

        # Insert a heading when there is none, or when the first h1/h2 has
        # more than 40 characters of sibling text in front of it (it then
        # sits mid-article and is not the article title).
        first_heading = soup.html.body.find(['h1', 'h2'])
        if first_heading:
            chars_before = 0
            for sibling in first_heading.previous_siblings:
                chars_before += len(string_of_tag(sibling))
                if chars_before > 40:
                    inserted = soup.new_tag('h1')
                    inserted.string = heading_text
                    soup.html.body.insert(0, inserted)
                    break
        else:
            inserted = soup.new_tag('h1')
            inserted.string = heading_text
            soup.html.body.insert(0, inserted)

        if self.extra_css:
            style_tag = soup.new_tag('style', type="text/css")
            style_tag.string = self.extra_css
            soup.html.head.append(style_tag)

        self.soupbeforeimage(soup)

        # Strip configured tags / ids / classes / attributes.
        if self.remove_tags:
            for node in soup.find_all(self.remove_tags):
                node.decompose()
        for wanted_id in self.remove_ids:
            for node in soup.find_all(attrs={"id": wanted_id}):
                node.decompose()
        for wanted_class in self.remove_classes:
            for node in soup.find_all(attrs={"class": wanted_class}):
                node.decompose()
        for attr_name in self.remove_attrs:
            for node in soup.find_all(attrs={attr_name: True}):
                del node[attr_name]

        if not self.keep_image:
            for img in soup.find_all('img'):
                img.decompose()
        else:
            tiny_sizes = ('1', '2', '3', '4', '5')
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                src = img['src']

                # Images declared 1-5 px high or wide are dropped unfetched.
                if (img.get('height') in tiny_sizes
                        or img.get('width') in tiny_sizes):
                    self.log.warn('img size too small,take away it:%s' % src)
                    img.decompose()
                    continue

                if not src.startswith('http'):
                    src = urlparse.urljoin(url, src)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    src = src.replace('http://', 'https://')

                if self.isfiltered(src):
                    self.log.warn('img filtered:%s' % src)
                    img.decompose()
                    continue

                response = opener.open(src)
                if response.status_code == 200:
                    payload = self.process_image(response.content, opts)
                else:
                    payload = None

                if not payload:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (response.status_code, src))
                    img.decompose()
                    continue

                kind = imghdr.what(None, payload)
                if not kind:
                    # Payload is not a recognised image format.
                    img.decompose()
                    continue

                mime = r"image/" + kind
                number = random.randint(10000, 99999999)
                extension = 'jpg' if kind == 'jpeg' else kind
                filename = "%d.%s" % (number, extension)
                img['src'] = filename
                yield (mime, src, filename, payload, None)

        self.soupprocessex(soup)
        # Serialise first: building the brief below removes <h1> elements.
        content = unicode(soup)

        # The beginning of the article text serves as its brief.
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            # Remove <h1> so the brief does not repeat the title.
            for h1 in body.find_all('h1'):
                h1.decompose()
            for piece in body.stripped_strings:
                brief += unicode(piece) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (heading_text, None, None, content, brief)

Example #7

Show file

    def readability(self, article, url, opts=None, user=None):
        """Extract the main article from a page with readability-lxml.

        This is a generator: image payloads are large, so every item is
        handed to the caller as soon as it is ready instead of being
        collected in memory.

        Yields:
            * ``(mime, image_url, file_name, image_bytes, None, flag)`` for
              every usable image; ``flag`` is ``True`` for the first image
              (used as the thumbnail) and ``None`` otherwise;
            * finally ``(title, None, None, html, brief, thumbnail)`` where
              ``thumbnail`` is the URL of the first image or ``None``.

        Nothing is yielded when readability cannot process the page, when
        the article has no title, or when the extracted document has no
        ``<body>``.

        Args:
            article: raw page content; passed through ``self.preprocess``.
            url: page URL, used for logging and for resolving image URLs.
            opts: forwarded unchanged to ``self.process_image``.
            user: optional user object; when given, share links are added
                through ``self.AppendShareLinksToArticle``.
        """
        content = self.preprocess(article)

        # Extract the main content.
        try:
            doc = readability.Document(
                content, positive_keywords=self.positive_classes)
            summary = doc.summary(html_partial=False)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            self.log.warn('article is invalid.[%s]' % url)
            return

        title = doc.short_title()
        if not title:
            self.log.warn('article has no title.[%s]' % url)
            return

        title = self.processtitle(title)

        # Only the article body is left at this point, so parsing it with
        # BeautifulSoup is not a performance concern.
        soup = BeautifulSoup(summary, "lxml")

        # Without a <body> nothing below can work; bail out with a log
        # entry instead of failing with AttributeError on None.
        body = soup.find('body')
        if body is None:
            self.log.warn('extract article content failed.[%s]' % url)
            return

        head = soup.find('head')
        if not head:
            head = soup.new_tag('head')
            soup.html.insert(0, head)
        # Guarantee a <title> even when a <head> already existed without one.
        if not head.find('title'):
            t = soup.new_tag('title')
            t.string = title
            head.append(t)

        # Add a heading when the article has none, or when the first h1/h2
        # is preceded by more than 40 characters of sibling text (it then
        # sits in the middle of the article and is not the article title).
        t = body.find(['h1', 'h2'])
        if not t:
            t = soup.new_tag('h2')
            t.string = title
            body.insert(0, t)
        else:
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:
                    t = soup.new_tag('h2')
                    t.string = title
                    body.insert(0, t)
                    break

        # Strip configured tags / ids / classes / attributes and comments.
        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for tag_id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": tag_id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        # Remove every attribute of <body> so that InsertToc can match a
        # plain "<body>" with its regular expression.
        # Copy the names first: the dict is modified while deleting.
        for attr in list(body.attrs):
            del body[attr]

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        self.soupbeforeimage(soup)

        has_imgs = False
        thumbnail = None

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered : %s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content,
                    opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    # Discard images below the configured minimum size.
                    if len(imgcontent) < self.img_min_size:
                        img.decompose()
                        continue

                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (
                            self.imgindex,
                            'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg

                        # The first image doubles as the TOC thumbnail.
                        if not has_imgs:
                            has_imgs = True
                            thumbnail = imgurl
                            yield (imgmime, imgurl, fnimg, imgcontent, None,
                                   True)
                        else:
                            yield (imgmime, imgurl, fnimg, imgcontent, None,
                                   None)
                    else:
                        # Payload is not a recognised image format.
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.status_code, imgurl))
                    img.decompose()

            # Unwrap images from enclosing links so that an accidental tap
            # does not open the browser.
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        # Convert HTML5 structural tags to plain <div>.
        for x in soup.find_all(['article', 'aside', 'header', 'footer', 'nav',
            'figcaption', 'figure', 'section', 'time']):
            x.name = 'div'

        self.soupprocessex(soup)

        # Append the share links.
        if user:
            self.AppendShareLinksToArticle(soup, user, url)

        # Serialise before the brief is built: building the brief removes
        # the headings from the tree.
        content = unicode(soup)

        # Use the beginning of the article text as its brief/description.
        brief = u''
        if GENERATE_TOC_DESC:
            # Drop h1/h2 so the brief does not repeat the title.
            for h in body.find_all(['h1', 'h2']):
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        # Drop the reference to the parse tree before handing out the result.
        soup = None

        yield (title, None, None, content, brief, thumbnail)