from datetime import datetime

# DatabaseStorage, GenericSourceCrawler, Scraper, the parser classes, the
# scraping tasks, ConsumerThread and TopicFreshnessSeconds are assumed to be
# imported from the project's own modules.


class SourceConsumer(ConsumerThread):
    def __init__(self, worker_id, task_queue, completed_queue):
        ConsumerThread.__init__(self, worker_id, task_queue, completed_queue)
        self._storage = DatabaseStorage()
        # Crawls a source URL into a source object carrying its list of topics.
        self._crawler = GenericSourceCrawler(Scraper([GenericSourceParser]),
                                             Scraper([RSSParser]))
        # Topic pages are parsed as RSS feeds or generic front pages; articles
        # go through the site-specific parsers first, with the generic content
        # parser as the fallback.
        topic_scraper = Scraper([RSSParser, GenericFrontPageParser])
        article_scraper = Scraper([TheOnionParser, EngadgetParser,
                                   TechCrunchParser, TorontoStarParser,
                                   TheGlobeAndMailParser, GenericContentParser])
        sub_task = ArticleScrapingTask(article_scraper)
        self._task = TopicScrapingTask(topic_scraper, sub_task)

    def consume(self, source_url):
        self._logger.info('Consuming source url %s.' % source_url)
        try:
            source = self._crawler.crawl(source_url)
            for topic in source.topics:
                # Skip topics scraped recently enough to still be fresh.
                last_scraped = self._storage.get_topic_last_update(topic.url)
                if (last_scraped is not None and
                        (datetime.utcnow() - last_scraped).total_seconds() < TopicFreshnessSeconds):
                    self._logger.info('Not scraping fresh topic: %s' % topic.url)
                    continue
                try:
                    for article in self._task.run(topic.url):
                        self._storage.insert(article, topic, source)
                except IOError as e:
                    self._logger.error('Failed scraping topic: %s' % e)
        except IOError as e:
            self._logger.error('Failed scraping source: %s' % e)
        except ValueError as e:
            self._logger.error('Failed scraping source: %s' % e)
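
# A minimal usage sketch, not part of the class above. It assumes
# ConsumerThread is a threading.Thread subclass that repeatedly pulls a URL
# off task_queue and hands it to consume(); the source URLs below are
# placeholders for whatever sources the project actually tracks.
if __name__ == '__main__':
    from Queue import Queue  # stdlib queue; named `queue` on Python 3

    task_queue = Queue()
    completed_queue = Queue()

    # Seed the queue with the source URLs to crawl (placeholder values).
    for url in ['http://www.theonion.com/', 'http://www.engadget.com/']:
        task_queue.put(url)

    # Spin up a small pool of consumers that drain the queue concurrently.
    consumers = [SourceConsumer(worker_id, task_queue, completed_queue)
                 for worker_id in range(4)]
    for consumer in consumers:
        consumer.start()
    for consumer in consumers:
        consumer.join()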