Ejemplo n.º 1
0
    def run(self):
        """
        Extract an RSS Feed and create articles.

        Reads ``feed_url`` from ``self.options``, fetches its entries with
        ``get_feed`` and yields each entry after tagging it with
        ``type = 'article'``. Entries without a ``created`` value are
        stamped with ``dates.now()``.

        When ``self.max_date_last_run`` is falsy every entry is yielded;
        otherwise only entries whose ``created`` value is greater than it.
        The ``created`` value of every yielded entry is also appended to
        ``self.publish_dates``.
        """
        feed_url = self.options['feed_url']
        feed_domain = url.get_simple_domain(feed_url)

        # Work on a copy: appending to the list returned by ``self.org.get``
        # would otherwise grow the organisation's stored domains on every run.
        domains = list(self.org.get('domains', ['']))
        if feed_domain:
            domains.append(feed_domain)

        # iterate through RSS entries.
        self.log.info('Fetching {}'.format(feed_url))
        for article in get_feed(feed_url, domains):
            article['type'] = 'article'  # set this type as article.

            # since we poll often, we can assume this is a good
            # approximation of an article publish date.
            if not article.get('created'):
                article['created'] = dates.now()

            # if we haven't run before, yield everything; otherwise
            # only yield articles newer than the last run.
            first_run = not self.max_date_last_run
            if first_run or article['created'] > self.max_date_last_run:
                self.publish_dates.append(article['created'])
                yield article
Ejemplo n.º 2
0
    def run(self):
        """
        Fetch and format google alerts.

        Entries from the feed at ``self.options['feed_url']`` are passed
        through ``self.format`` on a worker pool (size taken from the
        ``max_workers`` option, default 5). Truthy results are yielded in
        whatever order the pool produces them. A ``RequestError`` raised
        while iterating is logged as a warning and ends the iteration.
        """
        feed_url = self.options['feed_url']
        allowed_domains = [url.get_simple_domain(feed_url)]

        # iterate through RSS entries.
        feed_entries = get_feed(feed_url, allowed_domains)
        worker_count = self.options.get('max_workers', 5)
        pool = Pool(worker_count)
        try:
            for formatted in pool.imap_unordered(self.format, feed_entries):
                # skip entries that produced nothing
                if not formatted:
                    continue
                yield formatted
        except RequestError:
            message = '{} had no entries at this time'.format(feed_url)
            self.log.warning(message)
Ejemplo n.º 3
0
 def test_get_simple_domain(self):
     # A full article URL should reduce to the bare site name.
     article_url = (
         'http://www.nytimes.com/2014/06/06/business/'
         'gm-ignition-switch-internal-recall-investigation-report.html'
         '?hp&_r=0'
     )
     simple_domain = url.get_simple_domain(article_url)
     assert simple_domain == 'nytimes'