def testSchedulerAndCrawler(self):
    urls = [u"http://feeds.feedburner.com/43folders",
            u"http://advocacy.python.org/podcasts/littlebit.rss",
            u"http://friendfeed.com/alawrence?format=atom",
            u"http://feeds.feedburner.com/antiwar"]

    # seed the database with one crawl job per feed url
    with transaction.manager:
        for url in urls:
            meta.Session().add(CrawlJobModel(url=url))

    self.assert_(len(list(meta.Session().query(CrawlJobModel).all())))
    self.assert_(len(list(get_crawl_jobs())))

    log.info("telling worker to use database %s" % self.db_url)

    scheduler_bind = "ipc:///tmp/scheduler_socket"
    crawl_bind = "ipc:///tmp/crawler_socket"

    from feederengine import crawler
    with mock(crawler, "proxy", mock_rss_server):
        w = SchedulerWorker(self.db_url, scheduler_bind)
        c = CrawlWorker(scheduler_bind, crawl_bind)

        w.start()
        c.start()

        self.assert_(w.is_alive())
        self.assert_(c.is_alive())

        # pull crawl results off the crawler's outbound socket
        context = zmq.Context()
        with pull_socket(context, crawl_bind) as subscription:
            count = 0
            tries = 0
            poller = zmq.Poller()
            poller.register(subscription, zmq.POLLIN)

            while count < len(urls) and tries < 100:
                polled = dict(poller.poll(timeout=100))
                if subscription in polled and polled[subscription] == zmq.POLLIN:
                    try:
                        url, data = subscription.recv_multipart(zmq.NOBLOCK)
                        count += 1
                    except zmq.ZMQError:
                        log.error("timeout", exc_info=True)
                        time.sleep(.1)
                    else:
                        log.info(data)
                tries += 1

            log.info("tries %s and results %s" % (tries, count))

        [w.terminate(), c.terminate()]
        time.sleep(1)

        self.assert_(not w.is_alive())
        self.assert_(not c.is_alive())

    self.assert_(count == len(urls), "didn't get all expected messages")
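# testSchedulerAndCrawler above relies on a `pull_socket` helper that isn't
# shown in this section. The sketch below is only an assumption about what it
# does: a context manager that wraps a ZeroMQ PULL socket and closes it on
# exit. Whether the test side connects or binds depends on how CrawlWorker
# sets up its end of crawl_bind; connect() is assumed here.
from contextlib import contextmanager

import zmq


@contextmanager
def pull_socket(context, address):
    """Yield a PULL socket attached to `address`, closing it when done."""
    socket = context.socket(zmq.PULL)
    socket.connect(address)
    try:
        yield socket
    finally:
        socket.close()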
def testCrawlWorker(self):
    from feederengine import crawler
    with mock(crawler, "proxy", mock_rss_server):
        urls = ["http://www.reddit.com/r/Python/",
                "http://slashdot.org",
                "http://news.ycombinator.com/"]

        workers = {}
        for url in urls:
            workers[url] = crawler.crawl(url=url)

        # force each deferred crawl to run and render its response
        [(k, str(w())) for k, w in workers.items()]
def testCrawlerFailGettingResponse(self):
    """
    strictly for test coverage
    """
    def err_app(environ, start_response):
        raise Exception("fuuuuuuuu")

    from feederengine import crawler
    with mock(crawler, "proxy", err_app):
        crawler.proxy = err_app
        try:
            str(crawler.crawl(url="http://1.1.1.1")(.01))
        except Exception:
            pass
        else:
            self.fail()  # pragma no cover
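# All three tests patch `crawler.proxy` through a `mock` helper that isn't
# shown in this section. A minimal sketch of what such an attribute-patching
# context manager could look like (the name, signature, and behavior are
# assumptions, not the project's actual implementation):
from contextlib import contextmanager


@contextmanager
def mock(obj, attr, replacement):
    """Temporarily replace `obj.attr` with `replacement`, restoring it on exit."""
    original = getattr(obj, attr)
    setattr(obj, attr, replacement)
    try:
        yield replacement
    finally:
        setattr(obj, attr, original)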