def run(self):
    """
    Extract an RSS feed and create articles.
    """
    feed_url = self.options['feed_url']
    feed_domain = url.get_simple_domain(feed_url)
    domains = self.org.get('domains', [''])
    if feed_domain:
        domains.append(feed_domain)

    # iterate through RSS entries.
    self.log.info('Fetching {}'.format(feed_url))
    for article in get_feed(feed_url, domains):

        # set this type as article.
        article['type'] = 'article'

        # since we poll often, we can assume this is a good
        # approximation of an article's publish date.
        if not article.get('created'):
            article['created'] = dates.now()

        # if we haven't run yet, just yield all results.
        if not self.max_date_last_run:
            self.publish_dates.append(article['created'])
            yield article

        # otherwise, only yield new articles.
        elif article['created'] > self.max_date_last_run:
            self.publish_dates.append(article['created'])
            yield article
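
# The `get_feed` helper used above isn't shown in this section. Below is a
# minimal sketch of what such a helper might look like, assuming the
# `feedparser` package; the field names and domain filtering here are
# illustrative guesses, not the project's actual implementation.
import calendar
from datetime import datetime, timezone

import feedparser

def get_feed(feed_url, domains):
    """
    Parse an RSS feed and yield article dicts whose links match `domains`.
    """
    parsed = feedparser.parse(feed_url)
    for entry in parsed.entries:
        link = entry.get('link', '')
        # keep only entries that point at one of the given domains.
        if not any(d and d in link for d in domains):
            continue
        # feedparser exposes the publish date as a UTC struct_time
        # when the feed provides one.
        published = getattr(entry, 'published_parsed', None)
        created = None
        if published:
            created = datetime.fromtimestamp(
                calendar.timegm(published), tz=timezone.utc)
        yield {
            'url': link,
            'title': entry.get('title'),
            'created': created,
        }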
def run(self):
    """
    Fetch and format Google Alerts.
    """
    feed_url = self.options['feed_url']
    feed_domain = url.get_simple_domain(feed_url)

    # iterate through RSS entries, formatting them concurrently.
    entries = get_feed(feed_url, [feed_domain])
    p = Pool(self.options.get('max_workers', 5))
    try:
        for event in p.imap_unordered(self.format, entries):
            if event:
                yield event
    except RequestError:
        self.log.warning('{} had no entries at this time'.format(feed_url))
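
# A self-contained sketch of the worker-pool pattern above. `Pool` here is
# assumed to be gevent's or multiprocessing's; both expose imap_unordered
# with this signature. The worker function and inputs below are purely
# illustrative, not part of the project's API.
from multiprocessing.pool import ThreadPool as Pool

def format_entry(entry):
    # placeholder worker: return a formatted event, or None to skip it.
    if not entry.get('url'):
        return None
    return {'type': 'event', 'url': entry['url']}

if __name__ == '__main__':
    entries = [{'url': 'http://example.com/a'}, {'url': ''}]
    pool = Pool(5)
    # imap_unordered yields results as workers finish, not in input order;
    # falsy results are filtered out, mirroring the `if event:` check above.
    for event in pool.imap_unordered(format_entry, entries):
        if event:
            print(event)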
def test_get_simple_domain(self):
    case = ('http://www.nytimes.com/2014/06/06/business/'
            'gm-ignition-switch-internal-recall-investigation-report.html?hp&_r=0')
    assert url.get_simple_domain(case) == 'nytimes'
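
# `url.get_simple_domain` itself isn't shown here. A minimal sketch that
# satisfies the test above, assuming "simple domain" means the bare site
# name with the 'www.' prefix and public suffix stripped; this is an
# illustration, not the library's actual implementation.
from urllib.parse import urlparse

def get_simple_domain(u):
    """
    Return the bare site name from a URL,
    e.g. 'nytimes' from 'http://www.nytimes.com/...'.
    """
    netloc = urlparse(u).netloc.lower()
    if netloc.startswith('www.'):
        netloc = netloc[len('www.'):]
    # naively take the first label; real code would consult a
    # public-suffix list to handle domains like 'bbc.co.uk'.
    return netloc.split('.')[0]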