def run(self):
    """Run the main loop."""
    try:
        self._running = True
        # Initialize components.
        tasks = TaskQueue(self._sites_info, self._tasks_dir)
        logging.info('There are %d tasks waiting for execution' % len(tasks))
        results = ResultQueue(self._sites_info, self._results_dir)
        logging.info('There are %d results waiting for processing'
                     % len(results))
        crawlers = CrawlerManager(self._sites_info, self._num_crawlers,
                                  tasks, results)
        processor = ProcessorManager(self._sites_info, self._database_dir,
                                     tasks, results)
        # Start components.
        crawlers.start()
        processor.start()
        # Run the main loop.
        while self._running:
            signal.pause()
        # Stop and close components.
        crawlers.stop()
        processor.stop()
        crawlers.join()
        processor.join()
        results.close()
        tasks.close()
        logging.info('Daemon stopped, exiting')
    except:
        logging.exception('Unhandled exception, printing traceback')
    finally:
        logging.shutdown()
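# A minimal sketch (not part of the original excerpt) of how the main loop
# above can be stopped.  It assumes the daemon registers this method as a
# handler for a termination signal elsewhere; when the signal arrives,
# signal.pause() returns, the loop re-checks self._running and exits.
def stop(self, signum=None, frame=None):
    """Ask the main loop to exit after the current iteration."""
    logging.info('Received signal, stopping')
    self._running = False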
def setUp(self):
    self._db_home = os.path.join(TESTDIR, 'testresultqueue')
    os.mkdir(self._db_home)
    self._sites_info = {
        'a78e6853355ad5cdc751ad678d15339382f9ed21':
            {'url': URL('ftp://atlantis.uh.cu/')},
        '7e019d6f671d336a0cc31f137ba034efb13fc327':
            {'url': URL('ftp://andromeda.uh.cu/')},
        'aa958756e769188be9f76fbdb291fe1b2ddd4777':
            {'url': URL('ftp://deltha.uh.cu/')},
        'd4af25db08f5fb6e768db027d51b207cd1a7f5d0':
            {'url': URL('ftp://anduin.uh.cu/')},
        '886b46f54bcd45d4dd5732e290c60e9639b0d101':
            {'url': URL('ftp://tigris.uh.cu/')},
        'ee5b017839d97507bf059ec91f1e5644a30b2fa6':
            {'url': URL('ftp://lara.uh.cu/')},
        '341938200f949daa356e0b62f747580247609f5a':
            {'url': URL('ftp://nimbo.uh.cu/')},
        'd64f2fc98d015a43da3be34668341e3ee6f79133':
            {'url': URL('ftp://liverpool.reduh.uh.cu/')},
        '0d3465f2b9fd5cf55748797c590ea621e3017a29':
            {'url': URL('ftp://london.reduh.uh.cu/')},
        'c5bcce5953866b673054f8927648d634a7237a9b':
            {'url': URL('ftp://bristol.reduh.uh.cu/')},
    }
    self._results = []
    self._results_per_site = 10
    for site_id, info in self._sites_info.iteritems():
        for name in (str(n) for n in xrange(self._results_per_site)):
            task = CrawlTask(site_id, info['url'].join(name))
            self._results.append(CrawlResult(task, True))
    self._queue = ResultQueue(self._sites_info, self._db_home)
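# The tests below call a _populate_queue() helper that is not shown in this
# excerpt.  A minimal sketch, assuming ResultQueue exposes a put() method
# that stores a result for later retrieval with get():
def _populate_queue(self):
    """Put all the generated results into the queue."""
    for result in self._results:
        self._queue.put(result)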
def test_persistence(self):
    self._populate_queue()
    for i, result in enumerate(self._results):
        if i % (self._results_per_site / 2) == 0:
            # When a few results have been removed, close the database to
            # write all the results to disk and open it again.
            self._queue.close()
            self._queue = ResultQueue(self._sites_info, self._db_home)
        returned = self._queue.get()
        self.assertEquals(str(returned.task.url), str(result.task.url))
        self._queue.report_done(returned)
def test_remove_site(self):
    self._populate_queue()
    self._queue.close()
    # Remove a site.  The queue should not return results from this site,
    # but it should preserve the order of the remaining results.
    del self._sites_info[self._sites_info.keys()[0]]
    self._queue = ResultQueue(self._sites_info, self._db_home)
    for result in self._results:
        if result.task.site_id in self._sites_info:
            returned = self._queue.get()
            self.assertEquals(str(returned.task.url), str(result.task.url))
            self._queue.report_done(returned)
    self.assertEquals(len(self._queue), 0)