Example #1
# -*- coding: utf-8 -*-
# Ad-hoc test script: fetch a page, detect and cache its encoding,
# then extract the main content with readability-lxml.
import urlparse

import chardet
import readability
from bs4 import BeautifulSoup

#url = 'http://tech.sina.com.cn/internet/'
#url = 'http://tech.sina.com.cn/i/2014-01-08/08039077686.shtml'
#url = 'http://blog.knownsec.com/2012/04/about-content-encoding-gzip/'
url = 'http://book.douban.com/review/6549990/'

zzh = URLOpener()  # project-specific fetcher used throughout these examples
resp = zzh.open(url)  # renamed from 're' to avoid shadowing the stdlib module
#print resp.info()
#print resp.content.decode('GBK').encode('utf-8')
#fout = open('zhang_test', 'wb')
#fout.write(resp.content)
#fout.close()

encoding = chardet.detect(resp.content)['encoding']
print encoding
print resp.headers
print isinstance(resp.content, unicode)
print resp.content.decode(encoding, 'ignore').encode('utf-8')

# Cache the detected encoding in redis, keyed by host.
netloc = urlparse.urlsplit(url)[1]
print netloc
r.set(netloc, encoding)  # r: a redis client created elsewhere
print r.get(netloc)

# Extract the main article body.
doc = readability.Document(resp.content)
summary = doc.summary(html_partial=True)
soup = BeautifulSoup(summary, 'lxml')  # parse the extracted summary, not the raw page
print soup.body.contents[0]
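
The set/get pair above caches the detected encoding per host. A minimal sketch, assuming a local redis server (detect_encoding is a hypothetical helper, not part of these examples), of using that cache to skip chardet on repeat visits to the same site:

import urlparse

import chardet
import redis

r = redis.StrictRedis(host='localhost', port=6379)

def detect_encoding(url, content):
    # Reuse a previously detected encoding for this host, if any.
    netloc = urlparse.urlsplit(url)[1]
    cached = r.get(netloc)
    if cached:
        return cached
    encoding = chardet.detect(content)['encoding']
    if encoding:
        r.set(netloc, encoding)
    return encoding
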
Example #2
    def readability(self, article, url, opts=None, user=None):
        """ 使用readability-lxml处理全文信息 """
        #因为图片文件占内存,为了节省内存,这个函数也做为生成器
        content = self.preprocess(article)

        # Extract the main content
        try:
            doc = readability.Document(content)
            summary = doc.summary(html_partial=False)
        except Exception:
            self.log.warn('article is invalid.[%s]' % url)
            return

        title = doc.short_title()
        title = self.processtitle(title)

        #if summary.startswith('<body'): #readability failed to parse
        #    html = content
        #else:
        #html = self.FragToXhtml(summary, title, addtitleinbody=True)

        # Only the article content is left now, so BeautifulSoup will not cause performance problems.
        soup = BeautifulSoup(summary, "lxml")
        h = soup.find('head')
        if not h:
            h = soup.new_tag('head')
            t = soup.new_tag('title')
            t.string = title
            h.append(t)
            soup.html.insert(0, h)

        # Add a title heading to the content if it lacks one
        t = soup.html.body.find(['h1', 'h2'])
        if not t:
            t = soup.new_tag('h1')
            t.string = title
            soup.html.body.insert(0, t)
        else:
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:  # this H1/H2 appears mid-article, so it is not the article title
                    t = soup.new_tag('h1')
                    t.string = title
                    soup.html.body.insert(0, t)
                    break

        self.soupbeforeimage(soup)

        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered : %s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content,
                    opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype
                                              == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.status_code, imgurl))
                    img.decompose()

            # Remove links wrapping images
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)

        # Insert share links
        if user:
            if user.evernote and user.evernote_mail:
                span = soup.new_tag('span')
                span.string = '    '
                soup.html.body.append(span)
                href = "%s/share?act=evernote&u=%s&url=%s" % (DOMAIN,
                                                              user.name, url)
                if user.share_fuckgfw:
                    href = SHARE_FUCK_GFW_SRV % urllib.quote(href)
                ashare = soup.new_tag('a', href=href)
                ashare.string = SAVE_TO_EVERNOTE
                soup.html.body.append(ashare)
            if user.wiz and user.wiz_mail:
                span = soup.new_tag('span')
                span.string = '    '
                soup.html.body.append(span)
                href = "%s/share?act=wiz&u=%s&url=%s" % (DOMAIN, user.name,
                                                         url)
                if user.share_fuckgfw:
                    href = SHARE_FUCK_GFW_SRV % urllib.quote(href)
                ashare = soup.new_tag('a', href=href)
                ashare.string = SAVE_TO_WIZ
                soup.html.body.append(ashare)

        content = unicode(soup)

        # Use the beginning of the article content as the brief
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (title, None, None, content, brief)
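
The generator above yields two kinds of tuples: one per fetched image, whose first slot is an 'image/...' MIME type, and a final article tuple (title, None, None, content, brief). A minimal consumer sketch; obj, book, and the add_* methods are purely hypothetical, not KindleEar's actual API:

book = EbookBuilder()  # hypothetical container
for item in obj.readability(article, url, opts, user):
    if item[0].startswith('image/'):
        imgmime, imgurl, fnimg, imgcontent, _ = item
        book.add_image(fnimg, imgmime, imgcontent)  # hypothetical API
    else:
        title, _, _, content, brief = item
        book.add_article(title, content, brief)     # hypothetical API
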
Example #3
    def readability(self, article, url, opts=None, user=None):
        """ 使用readability-lxml处理全文信息
        因为图片文件占内存,为了节省内存,这个函数也做为生成器
        """
        content = self.preprocess(article)
        if not content:
            return
            
        # Extract the main content
        try:
            doc = readability.Document(content, positive_keywords=self.positive_classes)
            summary = doc.summary(html_partial=False)
        except Exception:
            # If extraction fails, the "article" may be a bare image (a single
            # image served as an article, without any HTML wrapping).
            imgtype = imghdr.what(None, content)
            if imgtype:  # it is an image, so use a simple html page as its container
                imgmime = r"image/" + imgtype
                fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype=='jpeg' else imgtype)
                yield (imgmime, url, fnimg, content, None, None)
                tmphtml = '<html><head><title>Picture</title></head><body><img src="%s" /></body></html>' % fnimg
                yield ('Picture', None, None, tmphtml, '', None)
            else:
                self.log.warn('article is invalid.[%s]' % url)
            return
        
        title = doc.short_title()
        if not title:
            self.log.warn('article has no title.[%s]' % url)
            return
        
        title = self.processtitle(title)
        
        soup = BeautifulSoup(summary, "lxml")
        
        # If readability failed to parse, fall back to a backup algorithm (not as good, but copes with anything)
        body = soup.find('body')
        head = soup.find('head')
        if len(body.contents) == 0:
            from simpleextract import simple_extract
            summary = simple_extract(content)
            soup = BeautifulSoup(summary, "lxml")
            body = soup.find('body')
            if not body:
                self.log.warn('extract article content failed.[%s]' % url)
                return
                
            head = soup.find('head')
            # Note that the fallback algorithm was used; a disclaimer of sorts for poor extractions :)
            info = soup.new_tag('p', style='color:#555555;font-size:60%;text-align:right;')
            info.string = 'extracted by alternative algorithm.'
            body.append(info)
            
            self.log.info('using alternative algorithm to extract content.')
            
        if not head:
            head = soup.new_tag('head')
            soup.html.insert(0, head)
            
        if not head.find('title'):
            t = soup.new_tag('title')
            t.string = title
            head.append(t)
            
        # Add a title heading to the content if it lacks one
        t = body.find(['h1','h2'])
        if not t:
            t = soup.new_tag('h2')
            t.string = title
            body.insert(0, t)
        else:
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:  # this H1/H2 appears mid-article, so it is not the article title
                    t = soup.new_tag('h2')
                    t.string = title
                    body.insert(0, t)
                    break
                    
        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for id in self.remove_ids:
            for tag in soup.find_all(attrs={"id":id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class":cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr:True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text:isinstance(text, Comment)):
            cmt.extract()

        # Strip all attributes from body so InsertToc can match <body> with a regular expression
        bodyattrs = [attr for attr in body.attrs]
        for attr in bodyattrs:
            del body[attr]

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        self.soupbeforeimage(soup)

        has_imgs = False
        thumbnail = None

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img'):
                # More and more sites lazy-load images these days, so handle that here.
                # Note: if data-src and friends do not hold the real url, we are out of luck.
                imgurl = img['src'] if 'src' in img.attrs else ''
                if not imgurl:
                    for attr in img.attrs:
                        if attr != 'src' and 'src' in attr:  # many sites use data-src
                            imgurl = img[attr]
                            break
                if not imgurl:
                    img.decompose()
                    continue
                if not imgurl.startswith('data:'):
                    if not imgurl.startswith('http'):
                        imgurl = self.urljoin(url, imgurl)
                    if self.fetch_img_via_ssl and url.startswith('https://'):
                        imgurl = imgurl.replace('http://', 'https://')
                    if self.isfiltered(imgurl):
                        self.log.warn('img filtered : %s' % imgurl)
                        img.decompose()
                        continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(imgresult.content,opts) if imgresult.status_code==200 else None
                if imgcontent:
                    if len(imgcontent) < self.img_min_size:  # rexdf: skip images that are too small
                        img.decompose()
                        continue

                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype=='jpeg' else imgtype)
                        img['src'] = fnimg

                        # Use the first image as the TOC thumbnail
                        if not has_imgs:
                            has_imgs = True
                            thumbnail = imgurl
                            yield (imgmime, imgurl, fnimg, imgcontent, None, True)
                        else:
                            yield (imgmime, imgurl, fnimg, imgcontent, None, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' % (imgresult.status_code,imgurl))
                    img.decompose()

            # Remove links wrapping images, so a stray tap does not open the browser
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()
        
        # Convert HTML5 tags to div
        for x in soup.find_all(['article', 'aside', 'header', 'footer', 'nav',
            'figcaption', 'figure', 'section', 'time']):
            x.name = 'div'
        
        self.soupprocessex(soup)

        # Insert share links
        if user:
            self.AppendShareLinksToArticle(soup, user, url)

        content = unicode(soup)

        # Use the beginning of the article content as the brief
        brief = u''
        if GENERATE_TOC_DESC:
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (title, None, None, content, brief, thumbnail)
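
This variant adds lazy-load handling: when an <img> has no src, any attribute whose name contains 'src' (data-src and the like) is promoted. The same logic as a standalone sketch (resolve_lazy_src is hypothetical):

from bs4 import BeautifulSoup

def resolve_lazy_src(img):
    # Prefer the real src; otherwise take the first attribute whose name contains 'src'.
    src = img.get('src', '')
    if not src:
        for attr in img.attrs:
            if attr != 'src' and 'src' in attr:
                src = img[attr]
                break
    return src

img = BeautifulSoup('<img data-src="http://example.com/a.png"/>', 'lxml').find('img')
print resolve_lazy_src(img)  # http://example.com/a.png
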
Example #4
    def readability(self, article, url, opts=None):
        """ 使用readability-lxml处理全文信息 """
        content = self.preprocess(article)
        # Extract the main content
        try:
            doc = readability.Document(content)
            summary = doc.summary(html_partial=True)
        except Exception:
            self.log.warn('article is invalid.[%s]' % url)
            return

        title = doc.short_title()
        title = self.processtitle(title)

        soup = BeautifulSoup(summary, 'lxml')
        #	soup = BeautifulSoup(content,'lxml')
        '''
		# no head present
		h = soup.find('head')
		if not h:
			h = soup.new_tag('head')
			t = soup.new_tag('title')
			t.string = title
			h.append(t)
			soup.html.insert(0,h)

		# no heading present
		t = soup.html.body.find(['h1','h2'])
		if not t:
			t = soup.new_tag('h1')
			t.string = title
			soup.html.body.insert(0,t)
		else:
			totallen = 0
			for ps in t.previous_siblings:
				totallen += len(string_of_tag(ps))
				if totallen > 40:
					t = soup.new_tag('h1')
					t.string = title
					soup.html.body.insert(0,t)
					break
		'''
        self.soupbeforeimage(soup)

        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if img.get('height') in ('1','2','3','4','5') \
                 or img.get('width') in ('1','2','3','4','5'):
                    self.log.warn('img size too small, taking it away: %s' %
                                  imgurl)
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered : %s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content, opts) if imgresult.code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype
                                              == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.code, imgurl))
                    img.decompose()
            # Remove links wrapping images
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                 img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)
        # Keep only the first element of the body, i.e. the readability fragment.
        cc = soup.body.contents[0]
        content = unicode(cc)

        # Use the beginning of the article content as the brief
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break

        soup = None
        yield (title, None, None, content, brief)
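
Unlike most of the other examples, this one calls doc.summary(html_partial=True) and then keeps only soup.body.contents[0]. A quick sketch of what the flag changes in readability-lxml (the sample text is illustrative):

import readability

raw = '<html><body><p>%s</p></body></html>' % ('some article text. ' * 30)
doc = readability.Document(raw)
# html_partial=True returns just the extracted fragment (no <html>/<body> wrapper),
# html_partial=False wraps the same fragment in a full document.
print doc.summary(html_partial=True)[:40]
print doc.summary(html_partial=False)[:40]
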
Example #5
    def readability(self, article, url, opts=None):
        # Process the full article with readability-lxml.
        # Image files take a lot of memory, so this function is also a generator to save memory.
        content = self.preprocess(article)

        # Extract the main content
        doc = readability.Document(content)
        summary = doc.summary(html_partial=True)
        title = doc.short_title()
        title = self.processtitle(title)
        #if summary.startswith('<body'): #readability failed to parse
        #    html = content
        #else:
        html = self.FragToXhtml(summary, title, addtitleinbody=True)

        # Only the article content is left now, so BeautifulSoup will not cause performance problems.
        soup = BeautifulSoup(html, "lxml")
        self.soupbeforeimage(soup)

        for attr in ['id', 'class']:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if img.get('height') in ('1','2','3','4','5') \
                    or img.get('width') in ('1','2','3','4','5'):
                    self.log.warn('img size too small, taking it away: %s' %
                                  imgurl)
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = urlparse.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered:%s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content,
                    opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "%d.%s" % (random.randint(
                            10000,
                            99999999), 'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.status_code, imgurl))
                    img.decompose()
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)
        content = unicode(soup)

        # Use the beginning of the article content as the brief
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h1 in body.find_all('h1'):  # drop H1 to avoid duplicating the title
                h1.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (title, None, None, content, brief)
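
This variant names image files with random.randint instead of self.imgindex, but the type sniffing is the same everywhere. A standalone sketch of that imghdr-based naming (name_image is hypothetical):

import imghdr

def name_image(index, data):
    # Sniff the image type from the raw bytes; imghdr reports 'jpeg',
    # which is mapped to the conventional 'jpg' extension.
    imgtype = imghdr.what(None, data)
    if not imgtype:
        return None, None
    return 'image/' + imgtype, 'img%d.%s' % (index, 'jpg' if imgtype == 'jpeg' else imgtype)

png_sig = '\x89PNG\r\n\x1a\n' + '\x00' * 16  # minimal PNG signature
print name_image(0, png_sig)  # ('image/png', 'img0.png')
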
Example #6
    def readability(self, article, url, opts=None):
        # Process the full article with readability-lxml.
        # Image files take a lot of memory, so this function is also a generator to save memory.
        content = self.preprocess(article)

        # Extract the main content
        doc = readability.Document(content)
        summary = doc.summary(html_partial=False)
        title = doc.short_title()
        title = self.processtitle(title)
        #if summary.startswith('<body'): #readability failed to parse
        #    html = content
        #else:
        #html = self.FragToXhtml(summary, title, addtitleinbody=True)

        # Only the article content is left now, so BeautifulSoup will not cause performance problems.
        soup = BeautifulSoup(summary, "lxml")
        h = soup.find('head')
        if not h:
            h = soup.new_tag('head')
            t = soup.new_tag('title')
            t.string = title
            h.append(t)
            soup.html.insert(0, h)

        # Add a title heading to the content if it lacks one
        t = soup.html.body.find(['h1', 'h2'])
        if not t:
            t = soup.new_tag('h1')
            t.string = title
            soup.html.body.insert(0, t)
        else:
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:  # this H1/H2 appears mid-article, so it is not the article title
                    t = soup.new_tag('h1')
                    t.string = title
                    soup.html.body.insert(0, t)
                    break

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        self.soupbeforeimage(soup)

        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if img.get('height') in ('1','2','3','4','5') \
                    or img.get('width') in ('1','2','3','4','5'):
                    self.log.warn('img size too small, taking it away: %s' %
                                  imgurl)
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = urlparse.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered:%s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content,
                    opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "%d.%s" % (random.randint(
                            10000,
                            99999999), 'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.status_code, imgurl))
                    img.decompose()
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)
        content = unicode(soup)

        # Use the beginning of the article content as the brief
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h1 in body.find_all('h1'):  # drop H1 to avoid duplicating the title
                h1.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (title, None, None, content, brief)
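
Note that this variant, unlike the others, never strips HTML comments. The step the other examples use, shown in isolation:

from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup('<p>text<!-- ad tracker --></p>', 'lxml')
for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
    cmt.extract()
print soup.p  # <p>text</p>
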
Example #7
    def readability(self, article, url, opts=None, user=None):
        """ 使用readability-lxml处理全文信息
        #因为图片文件占内存,为了节省内存,这个函数也做为生成器
        """
        content = self.preprocess(article)

        # Extract the main content
        try:
            doc = readability.Document(content, positive_keywords=self.positive_classes)
            summary = doc.summary(html_partial=False)
        except Exception:
            self.log.warn('article is invalid.[%s]' % url)
            return

        title = doc.short_title()
        if not title:
            self.log.warn('article has no title.[%s]' % url)
            return

        title = self.processtitle(title)

        #if summary.startswith('<body'): #readability failed to parse
        #    html = content
        #else:
        #html = self.FragToXhtml(summary, title, addtitleinbody=True)

        # Only the article content is left now, so BeautifulSoup will not cause performance problems.
        soup = BeautifulSoup(summary, "lxml")
        h = soup.find('head')
        if not h:
            h = soup.new_tag('head')
            t = soup.new_tag('title')
            t.string = title
            h.append(t)
            soup.html.insert(0, h)

        # Add a title heading to the content if it lacks one
        body = soup.html.body
        t = body.find(['h1','h2'])
        if not t:
            t = soup.new_tag('h2')
            t.string = title
            body.insert(0, t)
        else:
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:  # this H1/H2 appears mid-article, so it is not the article title
                    t = soup.new_tag('h2')
                    t.string = title
                    body.insert(0, t)
                    break

        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for id in self.remove_ids:
            for tag in soup.find_all(attrs={"id":id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class":cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr:True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text:isinstance(text, Comment)):
            cmt.extract()

        # Strip all attributes from body so InsertToc can match <body> with a regular expression
        bodyattrs = [attr for attr in body.attrs]
        for attr in bodyattrs:
            del body[attr]

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        self.soupbeforeimage(soup)

        has_imgs = False
        thumbnail = None

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img',attrs={'src':True}):
                imgurl = img['src']
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered : %s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(imgresult.content,opts) if imgresult.status_code==200 else None
                if imgcontent:
                    if len(imgcontent) < self.img_min_size:  # rexdf: skip images that are too small
                        img.decompose()
                        continue

                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype=='jpeg' else imgtype)
                        img['src'] = fnimg

                        # Use the first image as the TOC thumbnail
                        if not has_imgs:
                            has_imgs = True
                            thumbnail = imgurl
                            yield (imgmime, imgurl, fnimg, imgcontent, None, True)
                        else:
                            yield (imgmime, imgurl, fnimg, imgcontent, None, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' % (imgresult.status_code,imgurl))
                    img.decompose()

            # Remove links wrapping images, so a stray tap does not open the browser
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()
        
        # Convert HTML5 tags to div
        for x in soup.find_all(['article', 'aside', 'header', 'footer', 'nav',
            'figcaption', 'figure', 'section', 'time']):
            x.name = 'div'
        
        self.soupprocessex(soup)

        # Insert share links
        if user:
            self.AppendShareLinksToArticle(soup, user, url)

        content = unicode(soup)

        # Use the beginning of the article content as the brief
        brief = u''
        if GENERATE_TOC_DESC:
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (title, None, None, content, brief, thumbnail)
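
The body attributes are stripped above so that InsertToc (not shown in these examples) can splice a table of contents in with a plain regular expression. A minimal sketch of that idea; the TOC markup is purely illustrative:

import re

content = '<html><head></head><body><h2>Title</h2><p>article</p></body></html>'
toc = '<div class="toc">...</div>'
# With all attributes removed, '<body>' is a fixed string and trivially matched.
print re.sub(r'<body>', '<body>' + toc, content, count=1)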