def parse_content(self, content, ref):
    """Process an article's HTML: strip hidden elements, blacklisted
    attributes/tags, and rewrite <img> tags to local downloaded copies.

    Args:
        content: raw HTML string of the article.
        ref: reference (e.g. the article URL) passed to the image
            downloader so relative image paths can be resolved.

    NOTE(review): `soup` and `images` are built but not returned in this
    chunk — presumably consumed by code that follows; confirm against the
    full file before changing.
    """
    soup = BeautifulSoup(content)

    # Drop elements the page hides from readers (display: none).
    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()

    # Strip configured attributes, then remove configured tags entirely.
    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]
    for tag in soup.findAll(self.remove_tags):
        tag.extract()

    img_count = 0
    images = []  # images not yet on disk, queued for download
    # Iterate over a snapshot: extract() mutates the tree under iteration.
    for img in list(soup.findAll('img')):
        # Tag.get works on both BS3 and bs4 (Tag.has_key was removed in bs4).
        src = img.get('src')

        # Guard clauses: over the image limit, no usable src, or blocked URL.
        if ((self.max_image_number >= 0 and img_count >= self.max_image_number)
                or src is None or self.is_url_blocked(src)):
            img.extract()
            continue

        # Reject absurdly long URLs (likely inline data: URIs or garbage).
        if len(src) > 2048:
            logging.warning("img src is too long")
            img.extract()
            continue

        try:
            output_dir = self.output_dir
            localimage, fullname = ImageDownloadManager.parse_image(
                src, ref, output_dir)
            # Queue for download only if not already cached on disk.
            if not os.path.isfile(fullname):
                images.append({
                    'url': src,
                    'filename': fullname
                })
            if localimage:
                img['src'] = localimage
                img_count += 1
            else:
                img.extract()
        except Exception as e:
            # Best-effort: one bad image must not abort article parsing.
            logging.info("error: %s", e)
            img.extract()