def setUp(self):
    self._db_home = os.path.join(TESTDIR, 'testresultqueue')
    os.mkdir(self._db_home)
    self._sites_info = {
        'a78e6853355ad5cdc751ad678d15339382f9ed21':
            {'url': URL('ftp://atlantis.uh.cu/')},
        '7e019d6f671d336a0cc31f137ba034efb13fc327':
            {'url': URL('ftp://andromeda.uh.cu/')},
        'aa958756e769188be9f76fbdb291fe1b2ddd4777':
            {'url': URL('ftp://deltha.uh.cu/')},
        'd4af25db08f5fb6e768db027d51b207cd1a7f5d0':
            {'url': URL('ftp://anduin.uh.cu/')},
        '886b46f54bcd45d4dd5732e290c60e9639b0d101':
            {'url': URL('ftp://tigris.uh.cu/')},
        'ee5b017839d97507bf059ec91f1e5644a30b2fa6':
            {'url': URL('ftp://lara.uh.cu/')},
        '341938200f949daa356e0b62f747580247609f5a':
            {'url': URL('ftp://nimbo.uh.cu/')},
        'd64f2fc98d015a43da3be34668341e3ee6f79133':
            {'url': URL('ftp://liverpool.reduh.uh.cu/')},
        '0d3465f2b9fd5cf55748797c590ea621e3017a29':
            {'url': URL('ftp://london.reduh.uh.cu/')},
        'c5bcce5953866b673054f8927648d634a7237a9b':
            {'url': URL('ftp://bristol.reduh.uh.cu/')},
    }
    self._results = []
    self._results_per_site = 10
    for site_id, info in self._sites_info.iteritems():
        for name in (str(n) for n in xrange(self._results_per_site)):
            task = CrawlTask(site_id, info['url'].join(name))
            self._results.append(CrawlResult(task, True))
    self._queue = ResultQueue(self._sites_info, self._db_home)
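# A minimal tearDown sketch to pair with the setUp above.  It assumes the
# ResultQueue exposes a close() method and that removing the temporary
# database directory is enough cleanup; both are assumptions, not taken
# from the original code.
def tearDown(self):
    import shutil  # would normally live at module level
    self._queue.close()           # assumed API; release the DB handles
    if os.path.isdir(self._db_home):
        shutil.rmtree(self._db_home)  # remove the temporary DB home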
def setUp(self):
    self._db_home = os.path.join(TESTDIR, 'testtaskqueue')
    os.mkdir(self._db_home)
    self._request_wait = 2
    self._error_dir_wait = 3
    self._error_site_wait = 4
    self._min_revisit_wait = 2
    self._default_revisit_wait = 4
    self._sites_info = {
        'a78e6853355ad5cdc751ad678d15339382f9ed21':
            {'url': URL('ftp://atlantis.uh.cu/')},
        '7e019d6f671d336a0cc31f137ba034efb13fc327':
            {'url': URL('ftp://andromeda.uh.cu/')},
        'aa958756e769188be9f76fbdb291fe1b2ddd4777':
            {'url': URL('ftp://deltha.uh.cu/')},
        'd4af25db08f5fb6e768db027d51b207cd1a7f5d0':
            {'url': URL('ftp://anduin.uh.cu/')},
        '886b46f54bcd45d4dd5732e290c60e9639b0d101':
            {'url': URL('ftp://tigris.uh.cu/')},
        'ee5b017839d97507bf059ec91f1e5644a30b2fa6':
            {'url': URL('ftp://lara.uh.cu/')},
        '341938200f949daa356e0b62f747580247609f5a':
            {'url': URL('ftp://nimbo.uh.cu/')},
        'd64f2fc98d015a43da3be34668341e3ee6f79133':
            {'url': URL('ftp://liverpool.reduh.uh.cu/')},
        '0d3465f2b9fd5cf55748797c590ea621e3017a29':
            {'url': URL('ftp://london.reduh.uh.cu/')},
        'c5bcce5953866b673054f8927648d634a7237a9b':
            {'url': URL('ftp://bristol.reduh.uh.cu/')},
    }
    self._tasks = {}
    self._tasks_per_site = 10
    self._num_sites = len(self._sites_info)
    self._num_tasks = self._num_sites * self._tasks_per_site
    for site_id, info in self._sites_info.iteritems():
        # Set common information.
        info['max_depth'] = 100
        info['request_wait'] = self._request_wait
        info['error_dir_wait'] = self._error_dir_wait
        info['error_site_wait'] = self._error_site_wait
        info['min_revisit_wait'] = self._min_revisit_wait
        info['default_revisit_wait'] = self._default_revisit_wait
        # Create tasks for the site.
        task_list = []
        for name in (str(n) for n in xrange(self._tasks_per_site)):
            task_list.append(CrawlTask(site_id, info['url'].join(name)))
        self._tasks[site_id] = task_list
    self._queue = TaskQueue(self._sites_info, self._db_home)
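# The TaskQueue setUp above would need the same kind of cleanup; this is the
# analogous sketch, again assuming a close() method (not confirmed here).
def tearDown(self):
    import shutil  # would normally live at module level
    self._queue.close()           # assumed API, as in the sketch above
    shutil.rmtree(self._db_home)  # remove the 'testtaskqueue' directory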
def test_join_unicode_args(self):
    for url_str, is_root, attrs, join_info in self._urls:
        url = URL(url_str.decode(self._encoding), is_root)
        joined_url = url.join(join_info[0].decode(self._encoding))
        # Joined URLs should not be root.
        self.assertEquals(joined_url.is_root, False)
        self.assertEquals(str(joined_url), join_info[1])
def test_join(self):
    for url_str, is_root, attrs, join_info in self._urls:
        url = URL(url_str, is_root)
        joined_url = url.join(join_info[0])
        # Joined URLs should not be root.
        self.assertEquals(joined_url.is_root, False)
        self.assertEquals(str(joined_url), join_info[1])
def __init__(self, sites, num_crawlers, spool_dir, database_dir,
             log_file, log_level, pid_file):
    """Initialize the daemon.

    Creates the `TaskQueue`, `ResultQueue`, `CrawlerManager` and
    `ProcessorManager` instances.  The `sites` argument should be a
    list with the information for each site.
    """
    Daemon.__init__(self, pid_file=pid_file)
    logging.basicConfig(filename=log_file, level=log_level,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('Starting Arachne daemon %s' % __version__)
    logging.info('Running for %d sites' % len(sites))
    # Create URL instances and assign an id to each site.
    self._sites_info = {}
    for site in sites:
        site['url'] = URL(site['url'], True)
        self._sites_info[hashlib.sha1(str(site['url'])).hexdigest()] = site
    # Create or check required directories.
    self._results_dir = os.path.join(spool_dir, 'results')
    if not os.path.isdir(self._results_dir):
        os.mkdir(self._results_dir)
    self._tasks_dir = os.path.join(spool_dir, 'tasks')
    if not os.path.isdir(self._tasks_dir):
        os.mkdir(self._tasks_dir)
    self._database_dir = database_dir
    self._num_crawlers = num_crawlers
    self._running = False
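# A standalone illustration (not part of the daemon) of the `sites` argument
# format and of how site ids are derived: each entry is a dict with at least
# a 'url' key, and the daemon keys _sites_info by the SHA1 hex digest of the
# site URL string.  This sketch assumes str(URL(u, True)) returns the URL
# unchanged for an already-normalized root URL.
import hashlib

sites = [{'url': 'ftp://atlantis.uh.cu/'}]  # one dict per site
site_id = hashlib.sha1(sites[0]['url']).hexdigest()
# Under the assumption above, this digest should match the fixture id
# 'a78e6853355ad5cdc751ad678d15339382f9ed21' used in the setUp methods.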
def setUp(self):
    url = URL('ftp://deltha.uh.cu/')
    site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
    self._num_entries = 10
    self._found = True
    self._task = CrawlTask(site_id, url)
    self._entries = [(str(i), {'is_dir': i < (self._num_entries / 2)})
                     for i in range(self._num_entries)]
    self._result = CrawlResult(self._task, self._found)
def test_properties_unicode_args(self):
    for url_str, is_root, attrs, join_info in self._urls:
        url = URL(url_str.decode(self._encoding), is_root)
        self.assertEquals(url.is_root, is_root)
        self.assertEquals(url.scheme, attrs[0])
        self.assertEquals(url.username, attrs[1])
        self.assertEquals(url.password, attrs[2])
        self.assertEquals(url.hostname, attrs[3])
        self.assertEquals(url.port, attrs[4])
        self.assertEquals(url.path, attrs[5])
        self.assertEquals(url.dirname, attrs[6])
        self.assertEquals(url.basename, attrs[7])
def test_pickling(self):
    for url_str, is_root, attrs, join_info in self._urls:
        url = pickle.loads(pickle.dumps(URL(url_str, is_root)))
        self.assertEquals(url.is_root, is_root)
        self.assertEquals(url.scheme, attrs[0])
        self.assertEquals(url.username, attrs[1])
        self.assertEquals(url.password, attrs[2])
        self.assertEquals(url.hostname, attrs[3])
        self.assertEquals(url.port, attrs[4])
        self.assertEquals(url.path, attrs[5])
        self.assertEquals(url.dirname, attrs[6])
        self.assertEquals(url.basename, attrs[7])
def test_type_unicode(self):
    for url_str, is_root, attrs, join_info in self._urls:
        url = URL(url_str, is_root)
        self.assertTrue(type(url.scheme) is unicode)
        if attrs[1] is not None:
            self.assertTrue(type(url.username) is unicode)
        if attrs[2] is not None:
            self.assertTrue(type(url.password) is unicode)
        if attrs[3] is not None:
            self.assertTrue(type(url.hostname) is unicode)
        self.assertTrue(type(url.path) is unicode)
        self.assertTrue(type(url.dirname) is unicode)
        self.assertTrue(type(url.basename) is unicode)
        pickled_url = pickle.loads(pickle.dumps(url))
        self.assertTrue(type(pickled_url.scheme) is unicode)
        if attrs[1] is not None:
            self.assertTrue(type(pickled_url.username) is unicode)
        if attrs[2] is not None:
            self.assertTrue(type(pickled_url.password) is unicode)
        if attrs[3] is not None:
            self.assertTrue(type(pickled_url.hostname) is unicode)
        self.assertTrue(type(pickled_url.path) is unicode)
        self.assertTrue(type(pickled_url.dirname) is unicode)
        self.assertTrue(type(pickled_url.basename) is unicode)
def setUp(self):
    self._url = URL('ftp://deltha.uh.cu/')
    self._site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
    self._task = CrawlTask(self._site_id, self._url)