Example #1
    def testSchedulerAndCrawler(self):
        urls = [u"http://feeds.feedburner.com/43folders",
                u"http://advocacy.python.org/podcasts/littlebit.rss",
                u"http://friendfeed.com/alawrence?format=atom",
                u"http://feeds.feedburner.com/antiwar"]

        # enqueue one crawl job per feed url
        with transaction.manager:
            for url in urls:
                meta.Session().add(CrawlJobModel(url=url))

        # the jobs should now be visible both through the session and
        # through the get_crawl_jobs() helper
        self.assert_(len(list(meta.Session().query(CrawlJobModel).all())))
        self.assert_(len(list(get_crawl_jobs())))

        log.info("telling worker to use database %s" % self.db_url)
        scheduler_bind = "ipc:///tmp/scheduler_socket"
        crawl_bind = "ipc:///tmp/crawler_socket"
        from feederengine import crawler
        # serve canned rss instead of hitting the network while the workers run
        with mock(crawler, "proxy", mock_rss_server):
            w = SchedulerWorker(self.db_url, scheduler_bind)
            c = CrawlWorker(scheduler_bind, crawl_bind)

            w.start()
            c.start()

            self.assert_(w.is_alive())
            self.assert_(c.is_alive())

            context = zmq.Context()

            with pull_socket(context, crawl_bind) as subscription:
                # drain crawled feeds off the PULL socket until every url has
                # reported in, or give up after 100 polls
                count = 0
                tries = 0
                poller = zmq.Poller()
                poller.register(subscription, zmq.POLLIN)
                while count < len(urls) and tries < 100:
                    polled = dict(poller.poll(timeout=100))
                    if subscription in polled and polled[subscription] == zmq.POLLIN:
                        try:
                            url, data = subscription.recv_multipart(zmq.NOBLOCK)
                            count += 1
                        except zmq.ZMQError:
                            log.error("timeout", exc_info=True)
                            time.sleep(.1)
                        else:
                            log.info(data)
                    tries += 1
                    log.info("tries %s and results %s" % (tries, count))

            # shut both workers down and check every url was crawled
            w.terminate()
            c.terminate()
            time.sleep(1)
            self.assert_(not w.is_alive())
            self.assert_(not c.is_alive())
            self.assert_(count == len(urls), "didn't get all expected messages")
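The pull_socket helper is not shown on this page. A minimal sketch of what the test appears to rely on, assuming it is nothing more than a context manager that opens a zmq PULL socket on the given endpoint and closes it on exit (whether the real helper binds or connects is an assumption):

    from contextlib import contextmanager
    import zmq

    @contextmanager
    def pull_socket(context, endpoint):
        # open a PULL socket for the caller and always close it afterwards
        socket = context.socket(zmq.PULL)
        socket.connect(endpoint)
        try:
            yield socket
        finally:
            socket.close()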
Example #2
    def testCrawlWorker(self):

        from feederengine import crawler
        with mock(crawler, "proxy", mock_rss_server):
            urls = ["http://www.reddit.com/r/Python/",
                    "http://slashdot.org",
                    "http://news.ycombinator.com/"]
            workers = {}
            for url in urls:
                workers[url] = crawler.crawl(url=url)

            # force each deferred crawl to run and render its response body
            [(k, str(w())) for k, w in workers.items()]
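Each example patches crawler.proxy through a small mock() context manager. A minimal sketch, assuming it simply swaps the named attribute for the replacement and restores the original afterwards (the signature is inferred from how the tests call it, not from feederengine itself):

    from contextlib import contextmanager

    @contextmanager
    def mock(namespace, name, replacement):
        # temporarily rebind the named attribute, restoring it even on error
        original = getattr(namespace, name)
        setattr(namespace, name, replacement)
        try:
            yield replacement
        finally:
            setattr(namespace, name, original)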
Example #3
    def testCrawlerFailGettingResponse(self):
        """
        strictly for test coverage
        """
        def err_app(environ, start_response):
            # a stand-in wsgi app that always blows up
            raise Exception("fuuuuuuuu")

        from feederengine import crawler
        with mock(crawler, "proxy", err_app):
            # the mocked proxy raises, so rendering the crawl result must fail
            try:
                str(crawler.crawl(url="http://1.1.1.1")(.01))
            except Exception:
                pass
            else:
                self.fail("expected the crawl to raise")  # pragma no cover
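Examples #2 and #3 drive crawler.crawl() in a deferred style: crawl(url=...) returns a callable, and calling it (optionally with a timeout in seconds) renders the fetched document. A stand-in that mimics only that call shape, purely to illustrate the contract the tests assume (the timeout argument and return value are inferred from the calls above):

    def fake_crawl(url):
        # return a "deferred" which, when called, yields the response body
        def deferred(timeout=None):
            return "<rss version='2.0'></rss>"
        return deferred

    body = str(fake_crawl("http://example.com/feed")(.01))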