Example #1
    def build_index(self):
        #========== added =========
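        # Persistent record of article URLs downloaded on previous runs; used below to skip them.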
        downloaded_list = DownloadedArticlesList(self.download_history_file)
        #==========================

        self.report_progress(0, _('Fetching feeds...'))
        try:
            feeds = feeds_from_index(
                self.parse_index(),
                oldest_article=self.oldest_article,
                max_articles_per_feed=self.max_articles_per_feed,
                log=self.log)
            self.report_progress(0, _('Got feeds from index page'))
        except NotImplementedError:
            feeds = self.parse_feeds()

        #========== reworked =========
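        # Drop every article whose URL is already in the download history.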
        for feed in feeds:
            feed.articles = [article for article in feed.articles
                             if article.url not in downloaded_list]

        # Filter out empty feeds
        if self.ignore_duplicate_articles is not None:
            feeds = self.remove_duplicate_articles(feeds)
        feeds = [feed for feed in feeds if len(feed.articles)]
        if not feeds:
            raise ValueError('No articles found, aborting')
        #=============================

        #feeds = FeedCollection(feeds)

        self.has_single_feed = len(feeds) == 1

        index = os.path.join(self.output_dir, 'index.html')

        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)

        self.jobs = []

        if self.reverse_article_order:
            for feed in feeds:
                if hasattr(feed, 'reverse'):
                    feed.reverse()

        self.feed_objects = feeds
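        # Queue one download job per article; each feed and each article gets its own output directory.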
        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)

            for a, article in enumerate(feed):
                #========== refactored =========
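                # The per-feed settings object supplies both the print-version URL and the
                # fetch callable run by the worker thread for this article.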
                art_dir = os.path.join(feed_dir, 'article_%d' % a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)

                downloaded_list.add(article.url)
                url = self.feed_settings[feed.title].print_version_url(
                    article.url)

                req = WorkRequest(
                    self.feed_settings[feed.title].fetch,
                    (self, article, url, art_dir, f, a, len(feed)), {}, (f, a),
                    self.article_downloaded, self.error_in_article_download)
                #===============================
                req.feed = feed
                req.article = article
                req.feed_dir = feed_dir
                self.jobs.append(req)

        self.jobs_done = 0
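        # Hand the queued requests to a thread pool and poll until every download has finished or failed.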
        tp = ThreadPool(self.simultaneous_downloads)
        for req in self.jobs:
            tp.putRequest(req, block=True, timeout=0)

        self.report_progress(
            0,
            _('Starting download [%d thread(s)]...') %
            self.simultaneous_downloads)
        while True:
            try:
                tp.poll()
                time.sleep(0.1)
            except NoResultsPending:
                break

        for f, feed in enumerate(feeds):
            html = self.feed2index(f, feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                fi.write(html)
        self.create_opf(feeds)
        self.report_progress(1, _('Feeds downloaded to %s') % index)

        #========== added =========
        downloaded_list.close()
        #==========================

        return index
Example #2
    def build_index(self, data, browser):
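        # The caller passes in the already-scraped index data; its 'index' entry holds the
        # feed/article listing that feeds_from_index() turns into Feed objects.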
        sections = data.get('index', None)
        if not sections:
            raise ValueError('No articles found, aborting')

        feeds = feeds_from_index(sections, oldest_article=self.oldest_article,
                                 max_articles_per_feed=self.max_articles_per_feed,
                                 log=self.log)
        if not feeds:
            raise ValueError('No articles found, aborting')
        if self.ignore_duplicate_articles is not None:
            feeds = self.remove_duplicate_articles(feeds)
        if self.test:
            feeds = feeds[:self.test[0]]
        self.has_single_feed = len(feeds) == 1
        index = os.path.join(self.output_dir, 'index.html')

        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)

        if self.reverse_article_order:
            for feed in feeds:
                if hasattr(feed, 'reverse'):
                    feed.reverse()

        self.report_progress(0, _('Got feeds from index page'))
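        # Cache shared across articles so resources fetched once (e.g. images) are not downloaded again.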
        resource_cache = {}

        total = 0
        for feed in feeds:
            total += min(self.max_articles_per_feed, len(feed))
        num = 0

        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)

            for a, article in enumerate(feed):
                if a >= self.max_articles_per_feed:
                    break
                num += 1
                art_dir = os.path.join(feed_dir, 'article_%d' % a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
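                # Prefer a print-friendly version of the article when the recipe provides one;
                # otherwise fall back to the original URL.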
                try:
                    url = self.print_version(article.url)
                except NotImplementedError:
                    url = article.url
                except Exception:
                    self.log.exception('Failed to find print version for: '+article.url)
                    url = None
                if not url:
                    continue

                self.log.debug('Downloading article:', article.title, 'from', url)
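                # fetch_page saves the article and its linked resources into art_dir, applying the
                # recipe's keep/remove tag rules and pre/post-processing hooks.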
                try:
                    pages = fetch_page(
                        url,
                        load_complete=self.load_complete,
                        links=self.select_links,
                        remove=self.remove_tags,
                        keep_only=self.keep_only_tags,
                        preprocess_browser=partial(self._preprocess_browser, article),
                        postprocess_html=partial(self._postprocess_html, article, f, a, len(feed)),
                        remove_before=self.remove_tags_before,
                        remove_after=self.remove_tags_after,
                        remove_javascript=self.remove_javascript,
                        delay=self.delay,
                        resource_cache=resource_cache, output_dir=art_dir, browser=browser)
                except AbortFetch:
                    self.log.exception('Fetching of article: %r aborted' % article.title)
                    continue
                except Exception:
                    self.log.exception('Fetching of article: %r failed' % article.title)
                    continue
                self.log.debug('Downloaded article:', article.title, 'from', article.url)
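                # Point the article at its locally saved copy, keeping the original URL for reference.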
                article.orig_url = article.url
                article.url = 'article_%d/index.html' % a
                article.downloaded = True
                article.sub_pages = pages[1:]
                self.report_progress(float(num)/total,
                    _(u'Article downloaded: %s') % force_unicode(article.title))

        for f, feed in enumerate(feeds):
            html = self.feed2index(f, feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                fi.write(html)
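        # With stylesheets disabled, remove any CSS files saved alongside the articles.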
        if self.no_stylesheets:
            for f in walk(self.output_dir):
                if f.endswith('.css'):
                    os.remove(f)
        self.create_opf(feeds)
        self.report_progress(1, _('Download finished'))
        return index