from datetime import datetime


class SourceConsumer(ConsumerThread):
    def __init__(self, worker_id, task_queue, completed_queue):
        ConsumerThread.__init__(self, worker_id, task_queue, completed_queue)
        self._storage = DatabaseStorage()
        self._crawler = GenericSourceCrawler(Scraper([GenericSourceParser]),
                                             Scraper([RSSParser]))
        # The topic scraper finds article links; the article scraper tries the
        # site-specific parsers first and falls back to the generic one.
        topic_scraper = Scraper([RSSParser, GenericFrontPageParser])
        article_scraper = Scraper([TheOnionParser, EngadgetParser, TechCrunchParser,
                                   TorontoStarParser, TheGlobeAndMailParser,
                                   GenericContentParser])
        sub_task = ArticleScrapingTask(article_scraper)
        self._task = TopicScrapingTask(topic_scraper, sub_task)

    def consume(self, source_url):
        self._logger.info('Consuming source url %s.' % source_url)
        try:
            source = self._crawler.crawl(source_url)
            for topic in source.topics:
                # Skip topics that were scraped within the freshness window.
                last_scraped = self._storage.get_topic_last_update(topic.url)
                if (last_scraped is not None) and \
                        (datetime.utcnow() - last_scraped).total_seconds() < TopicFreshnessSeconds:
                    self._logger.info('Not scraping fresh topic: %s' % topic.url)
                    continue
                try:
                    for article in self._task.run(topic.url):
                        self._storage.insert(article, topic, source)
                except IOError as e:
                    self._logger.error('Failed scraping topic: %s' % e)
        except (IOError, ValueError) as e:
            self._logger.error('Failed scraping source: %s' % e)
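For context, here is a minimal sketch of how a pool of these consumers might be driven, assuming ConsumerThread is a threading.Thread subclass that pops items off task_queue and hands each one to consume(). The queue wiring, worker count, and example URL are illustrative, not taken from the project.

import Queue  # Python 2 stdlib; named 'queue' on Python 3

task_queue = Queue.Queue()
completed_queue = Queue.Queue()

# Enqueue the source URLs to crawl (this URL is hypothetical).
task_queue.put('http://example.com/')

# Spawn a few consumers; each pulls URLs until the queue is drained.
workers = [SourceConsumer(i, task_queue, completed_queue) for i in range(4)]
for worker in workers:
    worker.start()

task_queue.join()  # assumes the consumer loop calls task_done() per item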
def seed_sources():
    crawler = GenericSourceCrawler(Scraper([GenericSourceParser]),
                                   Scraper([RSSParser]))
    with open('resources/sources.dat') as f:
        for source_url in f:
            source = crawler.crawl(source_url.strip())
            # Upsert the subscription, then create any topics it lacks.
            db_subscription = Subscription.update_or_create(
                Subscription.url == source.url,
                url=source.url, name=source.name, image=source.image_url)
            for topic in source.topics:
                kwargs = {'source': db_subscription, 'name': topic.name, 'url': topic.url}
                try:
                    Topic.get(**kwargs)
                except Topic.DoesNotExist:
                    Topic.create(**kwargs)
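Note that update_or_create is not part of peewee's stock Model API (peewee ships get_or_create), so it is presumably a project helper. A minimal sketch of what such a helper might look like on a peewee base model follows; the class name and exact semantics are assumptions.

from peewee import Model

class BaseModel(Model):
    @classmethod
    def update_or_create(cls, query, **kwargs):
        # Hypothetical helper: update the row matching `query` in place,
        # or create a new one from `kwargs` if no row matches.
        try:
            instance = cls.get(query)
            for field, value in kwargs.items():
                setattr(instance, field, value)
            instance.save()
            return instance
        except cls.DoesNotExist:
            return cls.create(**kwargs)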