Example #1
 def run(self):
     """Run the main loop.
     """
     try:
         self._running = True
         # Initialize components.
         tasks = TaskQueue(self._sites_info, self._tasks_dir)
         logging.info('There are %d tasks waiting for execution',
                      len(tasks))
         results = ResultQueue(self._sites_info, self._results_dir)
         logging.info('There are %d results waiting for processing',
                      len(results))
         crawlers = CrawlerManager(self._sites_info, self._num_crawlers,
                                   tasks, results)
         processor = ProcessorManager(self._sites_info, self._database_dir,
                                      tasks, results)
         # Start components.
         crawlers.start()
         processor.start()
         # Run the main loop.
         while self._running:
             signal.pause()
         # Stop and close components.
         crawlers.stop()
         processor.stop()
         crawlers.join()
         processor.join()
         results.close()
         tasks.close()
         logging.info('Daemon stopped, exiting')
     except Exception:
         logging.exception('Unhandled exception, printing traceback')
     finally:
         logging.shutdown()
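
The loop above blocks in signal.pause() until a signal arrives, so a clean shutdown depends on a handler that clears self._running. A minimal sketch of how such a handler might be wired up; the stop() method and the registration calls are assumptions, not part of the original example:

 def stop(self, signum=None, frame=None):
     """Hypothetical handler: clear the flag so the main loop exits."""
     # signal.pause() returns after any handled signal, so run() re-checks
     # self._running on the next iteration and falls through to cleanup.
     self._running = False

 # Assumed registration, e.g. in __init__:
 # signal.signal(signal.SIGTERM, self.stop)
 # signal.signal(signal.SIGINT, self.stop)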
Example #2
 def setUp(self):
     self._db_home = os.path.join(TESTDIR, 'testresultqueue')
     os.mkdir(self._db_home)
     self._sites_info = {
         'a78e6853355ad5cdc751ad678d15339382f9ed21':
             {'url': URL('ftp://atlantis.uh.cu/')},
         '7e019d6f671d336a0cc31f137ba034efb13fc327':
             {'url': URL('ftp://andromeda.uh.cu/')},
         'aa958756e769188be9f76fbdb291fe1b2ddd4777':
             {'url': URL('ftp://deltha.uh.cu/')},
         'd4af25db08f5fb6e768db027d51b207cd1a7f5d0':
             {'url': URL('ftp://anduin.uh.cu/')},
         '886b46f54bcd45d4dd5732e290c60e9639b0d101':
             {'url': URL('ftp://tigris.uh.cu/')},
         'ee5b017839d97507bf059ec91f1e5644a30b2fa6':
             {'url': URL('ftp://lara.uh.cu/')},
         '341938200f949daa356e0b62f747580247609f5a':
             {'url': URL('ftp://nimbo.uh.cu/')},
         'd64f2fc98d015a43da3be34668341e3ee6f79133':
             {'url': URL('ftp://liverpool.reduh.uh.cu/')},
         '0d3465f2b9fd5cf55748797c590ea621e3017a29':
             {'url': URL('ftp://london.reduh.uh.cu/')},
         'c5bcce5953866b673054f8927648d634a7237a9b':
             {'url': URL('ftp://bristol.reduh.uh.cu/')},
     }
     self._results = []
     self._results_per_site = 10
     for site_id, info in self._sites_info.iteritems():
         for name in (str(n) for n in xrange(self._results_per_site)):
             task = CrawlTask(site_id, info['url'].join(name))
             self._results.append(CrawlResult(task, True))
     self._queue = ResultQueue(self._sites_info, self._db_home)
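
A fixture that creates an on-disk database usually needs a matching cleanup so each test starts from a fresh directory. A minimal tearDown sketch, assuming a module-level shutil import and the queue's close() method (neither is shown in the original):

 def tearDown(self):
     # Hypothetical cleanup: close the queue and remove the on-disk state
     # so the next test's os.mkdir(self._db_home) does not fail.
     self._queue.close()
     shutil.rmtree(self._db_home)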
Example #3
 def test_persistence(self):
     self._populate_queue()
     for i, result in enumerate(self._results):
         if i % (self._results_per_site / 2) == 0:
             # After a few results have been removed, close the database
             # to flush everything to disk, then open it again.
             self._queue.close()
             self._queue = ResultQueue(self._sites_info, self._db_home)
         returned = self._queue.get()
         self.assertEqual(str(returned.task.url), str(result.task.url))
         self._queue.report_done(returned)
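
The _populate_queue() helper these tests call is not shown. A plausible sketch, assuming ResultQueue exposes a put() method (an assumption inferred from the get()/report_done() calls above):

 def _populate_queue(self):
     # Assumed helper: push every precomputed result into the queue in
     # the same order as self._results, which the tests rely on.
     for result in self._results:
         self._queue.put(result)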
Example #4
 def test_remove_site(self):
     self._populate_queue()
     self._queue.close()
     # Remove a site.  The queue should no longer return results from the
     # removed site, but it should preserve the order of the remaining
     # results.
     del self._sites_info[self._sites_info.keys()[0]]
     self._queue = ResultQueue(self._sites_info, self._db_home)
     for result in self._results:
         if result.task.site_id in self._sites_info:
             returned = self._queue.get()
             self.assertEqual(str(returned.task.url), str(result.task.url))
             self._queue.report_done(returned)
     self.assertEqual(len(self._queue), 0)
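
These test methods presumably belong to a unittest.TestCase subclass; a minimal runner sketch for executing them (module-level boilerplate, not part of the original):

 if __name__ == '__main__':
     # unittest is assumed to be imported at module level.
     unittest.main()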