# Module-level imports these methods rely on (the enclosing class is elided
# in this excerpt; BeautifulSoup here is the BS3-era API with findAll()).
import os
import logging
import traceback

from BeautifulSoup import BeautifulSoup


def parse_content(self, content, ref):
    """Clean up an article's HTML and collect the images it references."""
    soup = BeautifulSoup(content)
    # Drop elements that are hidden anyway.
    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()
    # Strip blacklisted attributes and tags.
    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]
    for tag in soup.findAll(self.remove_tags):
        tag.extract()
    img_count = 0
    images = []
    for img in list(soup.findAll('img')):
        src = img.get('src')
        if ((self.max_image_number >= 0 and img_count >= self.max_image_number)
                or src is None
                or self.is_url_blocked(src)):
            img.extract()
        elif len(src) > 2048:
            logging.warning("img src is too long")
            img.extract()
        else:
            try:
                # Map the remote URL to a local file; queue it for download
                # if it is not on disk yet.
                localimage, fullname = ImageDownloadManager.parse_image(
                    src, ref, self.output_dir)
                if not os.path.isfile(fullname):
                    images.append({'url': src, 'filename': fullname})
                if localimage:
                    img['src'] = localimage
                    img_count += 1
                else:
                    img.extract()
            except Exception as e:
                logging.info("error: %s" % e)
                img.extract()
    # The caller unpacks (content, images), so return both.
    return str(soup), images
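# --- Illustration (not part of the class above) ----------------------------
# A minimal standalone sketch of the same cleaning pipeline, assuming the
# bs4 package and made-up stand-ins for self.remove_attributes /
# self.remove_tags; it shows how hidden elements, blacklisted attributes and
# tags, and src-less images get stripped.
def _demo_clean_html():
    from bs4 import BeautifulSoup as BS4  # the class above may use the BS3 API

    html = ('<div style="display: none;">tracker</div>'
            '<p class="x" onclick="evil()">hello <script>bad()</script></p>'
            '<img alt="no source">')
    soup = BS4(html, 'html.parser')
    for hidden in list(soup.find_all(attrs={'style': 'display: none;'})):
        hidden.extract()                       # invisible elements
    for attr in ('onclick', 'class'):          # stand-in for remove_attributes
        for tag in soup.find_all(attrs={attr: True}):
            del tag[attr]
    for tag in soup.find_all(['script']):      # stand-in for remove_tags
        tag.extract()
    for img in list(soup.find_all('img')):
        if img.get('src') is None:             # same src check as above
            img.extract()
    print(soup)                                # -> <p>hello </p>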
def check_feeds_update(self, since=None, reorder=False):
    self.reader.buildSubscriptionList()
    categories = self.reader.getCategories()
    feeds = self.get_valid_feeds(categories)
    max_items_number = int(self.config['max_items_number'])
    mark_read = int(self.config['mark_read'])
    exclude_read = int(self.config['exclude_read'])
    try:
        self.max_image_number = int(self.config['max_image_per_article'])
    except (KeyError, TypeError, ValueError):
        pass  # keep the current limit when the config value is unusable
    if not max_items_number:
        max_items_number = 50
    updated_feeds = []
    current_feed = 0
    image_download_manager = ImageDownloadManager()
    for feed_id in feeds:
        feed = feeds[feed_id]
        current_feed += 1
        logging.info("[%s/%s]: %s" % (current_feed, len(feeds), feed.id))
        try:
            feed_data = self.reader.getFeedContent(
                feed, exclude_read, loadLimit=max_items_number, since=since)
            if not feed_data:
                continue
            for item in feed_data['items']:
                if not self.is_item_in_reading_list(item):
                    continue
                content = item.get('content', '')
                if not content:
                    content = item.get('summary', {}).get('content', '')
                if not content:
                    continue
                # Prefer the text/html alternate link as the article URL.
                url = None
                for alternate in item.get('alternate', []):
                    if alternate.get('type', '') == 'text/html':
                        url = alternate['href']
                        break
                item['content'], images = self.parse_content(content, url)
                # Item wraps the raw entry and is expected to attach itself to
                # feed.items, which feed.item_count below relies on.
                item = Item(self.reader, item, feed)
                image_download_manager.add_images(images)
            feed.item_count = len(feed.items)
            if mark_read:
                if feed.item_count >= max_items_number:
                    for item in feed.items:
                        item.markRead()
                elif feed.item_count > 0:
                    self.reader.markFeedAsRead(feed)
            if feed.item_count > 0:
                if reorder:
                    feed.items.sort(key=lambda item: item.published)
                updated_feeds.append(feed)
                logging.info("updated %s items." % feed.item_count)
            else:
                logging.info("no update.")
        except Exception:
            logging.error("fail: %s" % traceback.format_exc())
    # Download all queued images on multiple threads.
    image_download_manager.run()
    return updated_feeds
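# --- Illustration (not part of the class above) ----------------------------
# ImageDownloadManager's implementation is not part of this excerpt. Below is
# a minimal sketch of the add_images()/run() interface the code above relies
# on, assuming Python 3, a {'url': ..., 'filename': ...} queue (the shape
# built by parse_content), and a thread pool; the class name and worker body
# are illustrative, not the project's actual code.
class _DemoImageDownloadManager(object):
    def __init__(self, workers=8):
        self._images = []
        self._workers = workers

    def add_images(self, images):
        # images: list of {'url': ..., 'filename': ...} dicts
        self._images.extend(images)

    def run(self):
        from concurrent.futures import ThreadPoolExecutor
        from urllib.request import urlretrieve

        def fetch(image):
            try:
                urlretrieve(image['url'], image['filename'])
            except Exception as e:
                logging.warning("download failed %s: %s" % (image['url'], e))

        # map() submits every download immediately; leaving the with-block
        # waits for all of them to finish.
        with ThreadPoolExecutor(max_workers=self._workers) as pool:
            list(pool.map(fetch, self._images))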