コード例 #1
0
ファイル: workers.py プロジェクト: johnliu/journal-server
class SourceConsumer(ConsumerThread):
    """Consumer thread that crawls a news-source URL and stores its articles.

    Receives source URLs through the task queue (ConsumerThread machinery),
    crawls each source for its topics, and scrapes articles only from topics
    whose last update is older than TopicFreshnessSeconds.
    """

    def __init__(self, worker_id, task_queue, completed_queue):
        ConsumerThread.__init__(self, worker_id, task_queue, completed_queue)

        self._storage = DatabaseStorage()
        self._crawler = GenericSourceCrawler(Scraper([GenericSourceParser]), Scraper([RSSParser]))

        # Topic pages: RSS parser listed before the generic front-page parser.
        topic_scraper = Scraper([RSSParser, GenericFrontPageParser])
        # Article pages: site-specific parsers listed before the generic
        # content parser (presumably tried in order — verify in Scraper).
        article_scraper = Scraper([TheOnionParser, EngadgetParser, TechCrunchParser, TorontoStarParser, TheGlobeAndMailParser, GenericContentParser])

        sub_task = ArticleScrapingTask(article_scraper)
        self._task = TopicScrapingTask(topic_scraper, sub_task)

    def consume(self, source_url):
        """Crawl one source URL, scrape its stale topics, persist articles.

        Topics updated within the last TopicFreshnessSeconds are skipped.
        A failure on one topic is logged and does not abort the others;
        a failure crawling the source itself is logged and the task ends.
        """
        self._logger.info('Consuming source url %s.' % source_url)
        try:
            source = self._crawler.crawl(source_url)

            for topic in source.topics:
                last_scraped = self._storage.get_topic_last_update(topic.url)
                if (last_scraped is not None) and (datetime.utcnow() - last_scraped).total_seconds() < TopicFreshnessSeconds:
                    self._logger.info('Not scraping fresh topic: %s' % topic.url)
                    continue
                try:
                    for article in self._task.run(topic.url):
                        self._storage.insert(article, topic, source)
                # `except X, e` is Python-2-only syntax; `as` works on 2.6+ and 3.
                except IOError as e:
                    self._logger.error('Failed scraping topic: %s' % e)

        # The original had two identical handlers for IOError and ValueError;
        # merged into one tuple clause — same message, same behavior.
        except (IOError, ValueError) as e:
            self._logger.error('Failed scraping source: %s' % e)
コード例 #2
0
ファイル: seed.py プロジェクト: johnliu/journal-server
def seed_sources():
    """Seed the database from the source list in resources/sources.dat.

    Crawls every listed URL, upserts a Subscription row for the source,
    and ensures a Topic row exists for each topic the crawl discovered.
    """
    crawler = GenericSourceCrawler(Scraper([GenericSourceParser]), Scraper([RSSParser]))
    with open('resources/sources.dat') as sources_file:
        for line in sources_file:
            crawled = crawler.crawl(line.strip())

            # Upsert the subscription keyed on its URL.
            subscription = Subscription.update_or_create(
                Subscription.url == crawled.url,
                url=crawled.url,
                name=crawled.name,
                image=crawled.image_url)

            for crawled_topic in crawled.topics:
                topic_fields = dict(source=subscription,
                                    name=crawled_topic.name,
                                    url=crawled_topic.url)
                # EAFP: create the topic only if an identical row is absent.
                try:
                    Topic.get(**topic_fields)
                except Topic.DoesNotExist:
                    Topic.create(**topic_fields)
コード例 #3
0
ファイル: workers.py プロジェクト: johnliu/journal-server
    def __init__(self, worker_id, task_queue, completed_queue):
        """Wire up storage, the source crawler, and the scraping task chain."""
        ConsumerThread.__init__(self, worker_id, task_queue, completed_queue)

        # Persistence layer and the crawler that discovers a source's topics.
        self._storage = DatabaseStorage()
        self._crawler = GenericSourceCrawler(Scraper([GenericSourceParser]), Scraper([RSSParser]))

        # Article pages: site-specific parsers listed ahead of the generic
        # content parser (presumably tried in order — verify in Scraper).
        article_parsers = [TheOnionParser, EngadgetParser, TechCrunchParser, TorontoStarParser, TheGlobeAndMailParser, GenericContentParser]
        article_task = ArticleScrapingTask(Scraper(article_parsers))

        # Topic pages use RSS first with a generic front-page fallback;
        # the article task runs as the sub-task of the topic task.
        self._task = TopicScrapingTask(Scraper([RSSParser, GenericFrontPageParser]), article_task)