Ejemplo n.º 1
0
 def Items(self):
     itemsprocessed = []
     cnt4debug = 0
     opener = URLOpener(self.host)
     decoder = AutoDecoder()
     for section, url in self.feeds:
         content = None
         cnt4debug += 1
         if IsRunInLocal and cnt4debug > 1:
             break
         
         result = opener.open(url)
         status_code, content = result.status_code, result.content
         if status_code != 200 and content:
             logging.error('err(%d) to fetch %s.' % (status_code,url))
             continue
         
         if self.feed_encoding:
             content = content.decode(self.feed_encoding)
         else:
             content = decoder.decode(content)
         
         content = self.preprocess(content)
         
         feed = feedparser.parse(content)
         for e in feed['entries']:
             # 全文RSS中如果有广告或其他不需要的内容,可以在postprocess去掉
             desc = self.postprocess(e.description)
             desc = self.FragToXhtml(desc, e.title, self.feed_encoding)
             
             if self.keep_image:
                 soup = BeautifulSoup(content)
                 self.soupbeforeimage(soup)
                 for img in soup.findAll('img'):
                     imgurl = img['src']
                     if not imgurl.startswith('http') and not imgurl.startswith('www'):
                         imgurl = self.urljoin(url, imgurl)
                     imgresult = opener.open(imgurl)
                     imgcontent = imgresult.content if imgresult.status_code == 200 else None
                     if imgcontent:
                         imgtype = imghdr.what(None, imgcontent)
                         if imgtype:
                             imgmime = r"image/" + imgtype
                             if imgtype == 'jpeg':
                                 fnimg = "%d.jpg" % random.randint(10000,99999999)
                             else:
                                 fnimg = "%d.%s" % (random.randint(10000,99999999), imgtype)
                             img['src'] = fnimg
                             yield (imgmime, imgurl, fnimg, imgcontent)
                 self.soupprocessex(soup)
                 desc = soup.renderContents('utf-8').decode('utf-8')
                 soup = None
             
             if e.title not in itemsprocessed and desc:
                 itemsprocessed.append(e.title)
                 yield (section, e.link, e.title, desc)
Ejemplo n.º 2
0
 def sanitize_contents(self, contents):
   soup = BeautifulSoup(contents)
   for tagname in ['script', 'meta', 'head', 'link']:
     [tag.extract() for tag in soup.findAll(tagname)]
   
   attr_re = re.compile('^on.*', re.I)
   for tag in soup.findAll():
     for attr, _ in tag.attrs:
       if attr_re.match(attr):
         del tag[attr]
   for tag in soup.findAll(attrs={'href': re.compile(r'^\s*javascript:.*', re.I)}):
     del tag['href']
   for tag in soup.findAll(attrs={'src': re.compile(r'^\s*javascript:.*', re.I)}):
     del tag['src']
     
   sanitized_contents = soup.renderContents()
   return sanitized_contents
Ejemplo n.º 3
0
 def Items(self):
     """
     生成器,返回一个元组
     对于HTML:section,url,title,content
     对于图片,mime,url,filename,content
     """
     cnt4debug = 0
     decoder = AutoDecoder()
     for section, url in self.feeds:
         cnt4debug += 1
         if IsRunInLocal and cnt4debug > 1:
             break
         
         opener = URLOpener(self.host)
         result = opener.open(url)
         status_code, content = result.status_code, result.content
         if status_code != 200 or not content:
             logging.error('err(%d) to fetch %s.' % (status_code,url))
             continue
         
         if self.page_encoding:
             content = content.decode(self.page_encoding)
         else:
             content = decoder.decode(content)
         
         content =  self.preprocess(content)
         soup = BeautifulSoup(content)
         
         try:
             title = soup.html.head.title.string
         except AttributeError:
             logging.error('object soup invalid!(%s)'%url)
             continue
         
         title = self.processtitle(title)
         
         if self.keep_only_tags:
             body = Tag(soup, 'body')
             try:
                 if isinstance(self.keep_only_tags, dict):
                     self.keep_only_tags = [self.keep_only_tags]
                 for spec in self.keep_only_tags:
                     for tag in soup.find('body').findAll(**spec):
                         body.insert(len(body.contents), tag)
                 soup.find('body').replaceWith(body)
             except AttributeError: # soup has no body element
                 pass
         
         def remove_beyond(tag, next): # 鍐呭祵鍑芥暟
             while tag is not None and getattr(tag, 'name', None) != 'body':
                 after = getattr(tag, next)
                 while after is not None:
                     ns = getattr(tag, next)
                     after.extract()
                     after = ns
                 tag = tag.parent
         
         if self.remove_tags_after:
             rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
             for spec in rt:
                 tag = soup.find(**spec)
                 remove_beyond(tag, 'nextSibling')
         
         if self.remove_tags_before:
             tag = soup.find(**self.remove_tags_before)
             remove_beyond(tag, 'previousSibling')
         
         remove_tags = self.insta_remove_tags + self.remove_tags
         remove_ids = self.insta_remove_ids + self.remove_ids
         remove_classes = self.insta_remove_classes + self.remove_classes
         remove_attrs = self.insta_remove_attrs + self.remove_attrs
         for tag in soup.findAll(remove_tags):
             tag.extract()
         for id in remove_ids:
             for tag in soup.findAll(attrs={"id":id}):
                 tag.extract()
         for cls in remove_classes:
             for tag in soup.findAll(attrs={"class":cls}):
                 tag.extract()
         for attr in remove_attrs:
             for tag in soup.findAll(attrs={attr:True}):
                 del tag[attr]
         for tag in soup.findAll(attrs={"type":"text/css"}):
             tag.extract()
         for cmt in soup.findAll(text=lambda text:isinstance(text, Comment)):
             cmt.extract
         
         if self.keep_image:
             self.soupbeforeimage(soup)
             for img in soup.findAll('img'):
                 imgurl = img['src']
                 if not imgurl.startswith('http') and not imgurl.startswith('www'):
                     imgurl = self.urljoin(url, imgurl)
                 imgresult = opener.open(imgurl)
                 imgcontent = imgresult.content if imgresult.status_code == 200 else None
                 if imgcontent:
                     imgtype = imghdr.what(None, imgcontent)
                     if imgtype:
                         imgmime = r"image/" + imgtype
                         if imgtype == 'jpeg':
                             fnimg = "%d.jpg" % random.randint(10000,99999999)
                         else:
                             fnimg = "%d.%s" % (random.randint(10000,99999999), imgtype)
                         img['src'] = fnimg
                         yield (imgmime, imgurl, fnimg, imgcontent)
         else:
             for img in soup.findAll('img'):
                 img.extract()
         
         self.soupprocessex(soup)
         content = soup.renderContents('utf-8').decode('utf-8')
         soup = None
         content =  self.postprocess(content)
         yield (section, url, title, content)
Ejemplo n.º 4
0
 def fulltext(self, url, decoder):
     #因为图片文件占内存,为了节省内存,这个函数也做为生成器
     if self.fulltext_by_instapaper:
         url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)
     opener = URLOpener(self.host)
     result = opener.open(url)
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         logging.error('err(%d) to fetch %s.' % (status_code,url))
         return
     
     if self.page_encoding:
         content = content.decode(self.page_encoding)
     else:
         content = decoder.decode(content)
     
     content = self.preprocess(content)
     soup = BeautifulSoup(content)
     
     try:
         title = soup.html.head.title.string
     except AttributeError:
         logging.error('object soup invalid!(%s)'%url)
         return
         
     title = self.processtitle(title)
     soup.html.head.title.string = title
     
     if self.keep_only_tags:
         body = Tag(soup, 'body')
         try:
             if isinstance(self.keep_only_tags, dict):
                 self.keep_only_tags = [self.keep_only_tags]
             for spec in self.keep_only_tags:
                 for tag in soup.find('body').findAll(**spec):
                     body.insert(len(body.contents), tag)
             soup.find('body').replaceWith(body)
         except AttributeError: # soup has no body element
             pass
     
     def remove_beyond(tag, next): # 内联函数
         while tag is not None and getattr(tag, 'name', None) != 'body':
             after = getattr(tag, next)
             while after is not None:
                 ns = getattr(tag, next)
                 after.extract()
                 after = ns
             tag = tag.parent
     
     if self.remove_tags_after:
         rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
         for spec in rt:
             tag = soup.find(**spec)
             remove_beyond(tag, 'nextSibling')
     
     if self.remove_tags_before:
         tag = soup.find(**self.remove_tags_before)
         remove_beyond(tag, 'previousSibling')
     
     remove_tags = self.insta_remove_tags + self.remove_tags
     remove_ids = self.insta_remove_ids + self.remove_ids
     remove_classes = self.insta_remove_classes + self.remove_classes
     remove_attrs = self.insta_remove_attrs + self.remove_attrs
     
     for tag in soup.findAll(remove_tags):
         tag.extract()
     for id in remove_ids:
         for tag in soup.findAll(attrs={"id":id}):
             tag.extract()
     for cls in remove_classes:
         for tag in soup.findAll(attrs={"class":cls}):
             tag.extract()
     for attr in remove_attrs:
         for tag in soup.findAll(attrs={attr:True}):
             del tag[attr]
     for tag in soup.findAll(attrs={"type":"text/css"}):
         tag.extract()
     for cmt in soup.findAll(text=lambda text:isinstance(text, Comment)):
         cmt.extract
     
     if self.keep_image:
         self.soupbeforeimage(soup)
         for img in soup.findAll('img'):
             imgurl = img['src']
             if not imgurl.startswith('http') and not imgurl.startswith('www'):
                 imgurl = self.urljoin(url, imgurl)
             imgresult = opener.open(imgurl)
             imgcontent = imgresult.content if imgresult.status_code == 200 else None
             if imgcontent:
                 imgtype = imghdr.what(None, imgcontent)
                 if imgtype:
                     imgmime = r"image/" + imgtype
                     if imgtype == 'jpeg':
                         fnimg = "%d.jpg" % random.randint(10000,99999999)
                     else:
                         fnimg = "%d.%s" % (random.randint(10000,99999999), imgtype)
                     img['src'] = fnimg
                     yield (imgmime, imgurl, fnimg, imgcontent)
     else:
         for img in soup.findAll('img'):
             img.extract()
     
     self.soupprocessex(soup)
     content = soup.renderContents('utf-8').decode('utf-8')
     soup = None
     
     yield (title, None, None, content)