def testGetCrawlJobs(self):
    """
    Test the get_crawl_jobs logic.

    get_crawl_jobs is a query that returns the list of urls that should
    be crawled. The list is determined by the last time each url was
    checked (if at all) and whether it is currently scheduled to be
    crawled. The goal is to crawl a url at most once every 5 minutes.
    """
    recs = self.make_data()

    # Everything is scheduled at this point; backdate the scheduled
    # marker (by 10 -- presumably minutes, like the checked backdate
    # below; confirm against the helper) so we have the right
    # starting conditions.
    self.scheduled_backdate_recs(recs, 10)

    # No job has been marked checked yet, so we should get 0 results.
    result = list(scheduler.get_crawl_jobs())
    self.assertEqual(len(result), 0)

    # If we mark one job checked and backdate it to 7 minutes ago we
    # should get exactly 1 result.
    self.checked_backdate_recs([recs[0]], 7)
    result = list(scheduler.get_crawl_jobs())
    self.assertEqual(len(result), 1)
def testSchedulerAndCrawler(self):
    """
    End-to-end test of SchedulerWorker together with CrawlWorker.

    Stores one CrawlJobModel per test url, starts both workers inside
    the ``mock(crawler, "proxy", mock_rss_server)`` context (presumably
    swapping the crawler's proxy for the fake rss server -- confirm
    against the helper), and then pulls multipart messages from the
    crawl socket until one message per url has arrived or 100 polls
    have been made.

    The workers are always terminated, even when an assertion fails
    part way through, so a failing run does not leave them behind.
    """
    urls = [
        u"http://feeds.feedburner.com/43folders",
        u"http://advocacy.python.org/podcasts/littlebit.rss",
        u"http://friendfeed.com/alawrence?format=atom",
        u"http://feeds.feedburner.com/antiwar",
    ]

    with transaction.manager:
        for url in urls:
            meta.Session().add(CrawlJobModel(url=url))

    # Sanity check: the jobs were stored and the scheduler query
    # returns at least one of them.
    self.assertTrue(list(meta.Session().query(CrawlJobModel).all()))
    self.assertTrue(list(get_crawl_jobs()))

    log.info("telling worker to use database %s", self.db_url)
    scheduler_bind = "ipc:///tmp/scheduler_socket"
    crawl_bind = "ipc:///tmp/crawler_socket"

    from feederengine import crawler
    with mock(crawler, "proxy", mock_rss_server):
        w = SchedulerWorker(self.db_url, scheduler_bind)
        c = CrawlWorker(scheduler_bind, crawl_bind)

        # Track what actually started so the cleanup below never calls
        # terminate() on a worker that was never launched.
        started = []
        try:
            for worker in (w, c):
                worker.start()
                started.append(worker)
            self.assertTrue(w.is_alive())
            self.assertTrue(c.is_alive())

            # NOTE(review): this context is never explicitly terminated;
            # confirm whether pull_socket takes care of that.
            context = zmq.Context()
            with pull_socket(context, crawl_bind) as subscription:
                count = 0
                tries = 0
                poller = zmq.Poller()
                poller.register(subscription, zmq.POLLIN)

                # Bounded polling: stop once every url produced a
                # message, or after 100 polls (timeout=100 each).
                while count < len(urls) and tries < 100:
                    polled = dict(poller.poll(timeout=100))
                    if polled.get(subscription) == zmq.POLLIN:
                        try:
                            _url, data = subscription.recv_multipart(
                                zmq.NOBLOCK)
                        except zmq.ZMQError:
                            log.error("timeout", exc_info=True)
                            time.sleep(.1)
                        else:
                            count += 1
                            log.info(data)
                    tries += 1

                log.info("tries %s and results %s", tries, count)
        finally:
            for worker in started:
                worker.terminate()

    # Give the workers a moment to actually exit before checking them.
    time.sleep(1)
    self.assertFalse(w.is_alive())
    self.assertFalse(c.is_alive())
    self.assertEqual(count, len(urls), "didn't get all expected messages")