# Example #1
    def parse_content(self, content, ref):
        """Clean up an article's HTML and localize its images.

        Strips hidden spans, blacklisted attributes and tags, then rewrites
        each ``<img>`` ``src`` to a locally downloaded copy, collecting the
        URLs that still need downloading.

        :param content: raw article HTML (string) to be parsed.
        :param ref: referrer URL passed to ImageDownloadManager.parse_image
            (presumably used to resolve relative image URLs — confirm).
        NOTE(review): the caller unpacks ``content, images`` from this
        method, but no ``return`` statement is visible in this excerpt;
        the tail of the method appears to be truncated here.
        """

        soup = BeautifulSoup(content)

        # Drop elements hidden via inline style — typically tracking pixels
        # or filler the site never displays (assumption; exact-match only).
        for span in list(soup.findAll(attrs={"style": "display: none;"})):
            span.extract()

        # self.remove_attributes / self.remove_tags are configured
        # blacklists defined elsewhere in the class.
        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr: True}):
                del x[attr]

        for tag in soup.findAll(self.remove_tags):
            tag.extract()

        img_count = 0
        images = []   # download jobs: [{'url': ..., 'filename': ...}, ...]
        for img in list(soup.findAll('img')):
            # Remove the image when: the per-article cap is reached
            # (negative cap = unlimited), it has no src, or its URL is
            # blocked.  has_key() is Python 2 / BeautifulSoup 3 style.
            if ((self.max_image_number >= 0 and
                img_count >= self.max_image_number) or
                img.has_key('src') is False or
                self.is_url_blocked(img['src'])):
                img.extract()
            else:
                # Guard against pathological (e.g. data:) URLs.
                if len(img['src']) > 2048:
                    logging.warning("img src is too long")
                    img.extract()
                else:
                    try:
                        output_dir = self.output_dir
                        # Maps the remote URL to a local path; localimage is
                        # falsy when the image cannot be handled.
                        localimage, fullname = ImageDownloadManager.parse_image(
                            img['src'], ref, output_dir)
                        # Queue for download only if not already on disk.
                        if os.path.isfile(fullname) is False:
                            images.append({
                                'url': img['src'],
                                'filename': fullname
                            })

                        if localimage:
                            img['src'] = localimage
                            img_count = img_count + 1
                        else:
                            img.extract()
                    except Exception, e:
                        # Best-effort: a bad image never kills the article.
                        logging.info("error: %s" % e)
                        img.extract()
# Example #2
    def check_feeds_update(self, since=None, reorder=False):
        """Fetch new items for every valid feed and queue image downloads.

        :param since: optional timestamp forwarded to the reader so only
            items newer than it are fetched.
        :param reorder: when True, sort each feed's items by publish time.
        :return: list of feed objects that received at least one new item.
        """
        self.reader.buildSubscriptionList()
        categories = self.reader.getCategories()
        feeds = self.get_valid_feeds(categories)

        max_items_number = int(self.config['max_items_number'])
        mark_read = int(self.config['mark_read'])
        exclude_read = int(self.config['exclude_read'])

        # The per-article image cap is optional config: ignore a missing or
        # non-numeric value, but let real errors (e.g. KeyboardInterrupt)
        # propagate instead of the previous bare except.
        try:
            self.max_image_number = int(self.config['max_image_per_article'])
        except (KeyError, TypeError, ValueError):
            pass

        if not max_items_number:
            max_items_number = 50   # default batch size

        updated_feeds = []
        current_feed = 0
        image_download_manager = ImageDownloadManager()
        for feed in feeds.values():
            current_feed += 1
            # Lazy %-args: formatting is skipped when INFO is disabled.
            logging.info("[%s/%s]: %s", current_feed, len(feeds), feed.id)
            try:
                feed_data = self.reader.getFeedContent(
                    feed, exclude_read, loadLimit=max_items_number,
                    since=since)

                if not feed_data:
                    continue

                for item in feed_data['items']:
                    if not self.is_item_in_reading_list(item):
                        continue

                    # Prefer full content; fall back to the summary body.
                    content = item.get('content', '')
                    if not content:
                        content = item.get('summary', {}).get(
                            'content', '')
                    if not content:
                        continue

                    # The first text/html alternate link is the article URL.
                    url = None
                    for alternate in item.get('alternate', []):
                        if alternate.get('type', '') == 'text/html':
                            url = alternate['href']
                            break
                    item['content'], images = self.parse_content(
                        content, url)
                    # Item() wraps the raw dict; presumably it registers
                    # itself on the feed — feed.items is read below. Confirm.
                    item = Item(self.reader, item, feed)
                    image_download_manager.add_images(images)

                feed.item_count = len(feed.items)
                if mark_read:
                    if feed.item_count >= max_items_number:
                        # A full batch may hide older unread items upstream,
                        # so mark only the fetched ones read individually.
                        for item in feed.items:
                            item.markRead()
                    elif feed.item_count > 0:
                        self.reader.markFeedAsRead(feed)

                if feed.item_count > 0:
                    if reorder:
                        feed.items.sort(key=lambda item: item.published)
                    updated_feeds.append(feed)
                    logging.info("update %s items.", feed.item_count)
                else:
                    logging.info("no update.")
            except Exception:
                # One broken feed must not abort the whole sweep.
                import traceback
                logging.error("fail: %s" % traceback.format_exc())

        # download image by multithreading
        image_download_manager.run()
        return updated_feeds