def setUp(self):
    self._db_home = os.path.join(TESTDIR, 'testresultqueue')
    os.mkdir(self._db_home)
    self._sites_info = {
        'a78e6853355ad5cdc751ad678d15339382f9ed21':
            {'url': URL('ftp://atlantis.uh.cu/')},
        '7e019d6f671d336a0cc31f137ba034efb13fc327':
            {'url': URL('ftp://andromeda.uh.cu/')},
        'aa958756e769188be9f76fbdb291fe1b2ddd4777':
            {'url': URL('ftp://deltha.uh.cu/')},
        'd4af25db08f5fb6e768db027d51b207cd1a7f5d0':
            {'url': URL('ftp://anduin.uh.cu/')},
        '886b46f54bcd45d4dd5732e290c60e9639b0d101':
            {'url': URL('ftp://tigris.uh.cu/')},
        'ee5b017839d97507bf059ec91f1e5644a30b2fa6':
            {'url': URL('ftp://lara.uh.cu/')},
        '341938200f949daa356e0b62f747580247609f5a':
            {'url': URL('ftp://nimbo.uh.cu/')},
        'd64f2fc98d015a43da3be34668341e3ee6f79133':
            {'url': URL('ftp://liverpool.reduh.uh.cu/')},
        '0d3465f2b9fd5cf55748797c590ea621e3017a29':
            {'url': URL('ftp://london.reduh.uh.cu/')},
        'c5bcce5953866b673054f8927648d634a7237a9b':
            {'url': URL('ftp://bristol.reduh.uh.cu/')},
    }
    self._results = []
    self._results_per_site = 10
    for site_id, info in self._sites_info.iteritems():
        for name in (str(n) for n in xrange(self._results_per_site)):
            task = CrawlTask(site_id, info['url'].join(name))
            self._results.append(CrawlResult(task, True))
    self._queue = ResultQueue(self._sites_info, self._db_home)
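# NOTE: The fixture above creates a database home directory on disk; the
# matching tearDown() is not shown in this section.  The following is only a
# minimal sketch of one, assuming the queue holds no open handles that block
# removal (whether ResultQueue needs an explicit close() call first is an
# assumption, not something the code above shows).
import shutil

def tearDown(self):
    # Sketch only: drop the queue and remove the temporary database home
    # created in setUp().
    del self._queue
    if os.path.exists(self._db_home):
        shutil.rmtree(self._db_home)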
def process(self, result):
    """Process a crawl result.
    """
    for entry_url, data in result:
        if data['is_dir']:
            task = CrawlTask(result.task.site_id, entry_url)
            self._tasks.put_new(task)
    self._results.report_done(result)
def setUp(self):
    self._db_home = os.path.join(TESTDIR, 'testtaskqueue')
    os.mkdir(self._db_home)
    self._request_wait = 2
    self._error_dir_wait = 3
    self._error_site_wait = 4
    self._min_revisit_wait = 2
    self._default_revisit_wait = 4
    self._sites_info = {
        'a78e6853355ad5cdc751ad678d15339382f9ed21': {
            'url': URL('ftp://atlantis.uh.cu/')
        },
        '7e019d6f671d336a0cc31f137ba034efb13fc327': {
            'url': URL('ftp://andromeda.uh.cu/')
        },
        'aa958756e769188be9f76fbdb291fe1b2ddd4777': {
            'url': URL('ftp://deltha.uh.cu/')
        },
        'd4af25db08f5fb6e768db027d51b207cd1a7f5d0': {
            'url': URL('ftp://anduin.uh.cu/')
        },
        '886b46f54bcd45d4dd5732e290c60e9639b0d101': {
            'url': URL('ftp://tigris.uh.cu/')
        },
        'ee5b017839d97507bf059ec91f1e5644a30b2fa6': {
            'url': URL('ftp://lara.uh.cu/')
        },
        '341938200f949daa356e0b62f747580247609f5a': {
            'url': URL('ftp://nimbo.uh.cu/')
        },
        'd64f2fc98d015a43da3be34668341e3ee6f79133': {
            'url': URL('ftp://liverpool.reduh.uh.cu/')
        },
        '0d3465f2b9fd5cf55748797c590ea621e3017a29': {
            'url': URL('ftp://london.reduh.uh.cu/')
        },
        'c5bcce5953866b673054f8927648d634a7237a9b': {
            'url': URL('ftp://bristol.reduh.uh.cu/')
        },
    }
    self._tasks = {}
    self._tasks_per_site = 10
    self._num_sites = len(self._sites_info)
    self._num_tasks = self._num_sites * self._tasks_per_site
    for site_id, info in self._sites_info.iteritems():
        # Set common information.
        info['max_depth'] = 100
        info['request_wait'] = self._request_wait
        info['error_dir_wait'] = self._error_dir_wait
        info['error_site_wait'] = self._error_site_wait
        info['min_revisit_wait'] = self._min_revisit_wait
        info['default_revisit_wait'] = self._default_revisit_wait
        # Create tasks for the site.
        task_list = []
        for name in (str(n) for n in xrange(self._tasks_per_site)):
            task_list.append(CrawlTask(site_id, info['url'].join(name)))
        self._tasks[site_id] = task_list
    self._queue = TaskQueue(self._sites_info, self._db_home)
def setUp(self):
    url = URL('ftp://deltha.uh.cu/')
    site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
    self._num_entries = 10
    self._found = True
    self._task = CrawlTask(site_id, url)
    self._entries = [(str(i), {'is_dir': i < (self._num_entries / 2)})
                     for i in range(self._num_entries)]
    self._result = CrawlResult(self._task, self._found)
def setUp(self):
    self._url = URL('ftp://deltha.uh.cu/')
    self._site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
    self._task = CrawlTask(self._site_id, self._url)
class TestCrawlTask(unittest.TestCase):

    def setUp(self):
        self._url = URL('ftp://deltha.uh.cu/')
        self._site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
        self._task = CrawlTask(self._site_id, self._url)

    def test_properties(self):
        self.assertEquals(self._task.site_id, self._site_id)
        self.assertEquals(str(self._task.url), str(self._url))
        self.assertEquals(self._task.revisit_wait, 0)
        self.assertEquals(self._task.revisit_count, -1)
        self.assertEquals(self._task.change_count, 0)

    def test_pickling(self):
        task = pickle.loads(pickle.dumps(self._task))
        self.assertEquals(self._task.site_id, task.site_id)
        self.assertEquals(str(self._task.url), str(task.url))
        self.assertEquals(self._task.revisit_wait, task.revisit_wait)
        self.assertEquals(self._task.revisit_count, task.revisit_count)
        self.assertEquals(self._task.change_count, task.change_count)

    def test_revisit_wait(self):
        self._task.report_visit(True)
        self._task.report_visit(False)
        self._task.revisit_wait = 60
        self.assertEquals(self._task.revisit_wait, 60)

    def test_reset_counters(self):
        self._task.report_visit(True)
        self._task.report_visit(True)
        self._task.revisit_wait = 60
        self._task.reset_change_count()
        self.assertEquals(self._task.revisit_wait, 60)
        self.assertEquals(self._task.revisit_count, 1)
        self.assertEquals(self._task.change_count, 0)

    def test_report_visit(self):
        self._task.report_visit(True)
        # Reporting visits without changes.
        self._task.report_visit(False)
        self._task.report_visit(False)
        self.assertEquals(self._task.revisit_count, 2)
        self.assertEquals(self._task.change_count, 0)
        # Reporting visits with changes.
        self._task.report_visit(True)
        self._task.report_visit(True)
        self.assertEquals(self._task.revisit_count, 4)
        self.assertEquals(self._task.change_count, 2)
def process(self, result):
    """Process a crawl result.
    """
    url = result.task.url
    site_id = result.task.site_id
    if not result.found:
        self._rmtree(site_id, url.path)
    else:
        enquire = xapian.Enquire(self._db)
        enquire.set_docid_order(xapian.Enquire.DONT_CARE)
        site_id_query = xapian.Query(self.SITE_ID_PREFIX + site_id)
        if url.is_root:
            # The parent of the root directory is not known, or it may not
            # even exist.  We should check that the root directory is
            # indexed because it is required when searching for files in a
            # selected number of sites.
            root_query = xapian.Query(self.IS_ROOT_PREFIX + self.TRUE_VALUE)
            query = xapian.Query(xapian.Query.OP_FILTER, site_id_query,
                                 root_query)
            enquire.set_query(query)
            mset = enquire.get_mset(0, 1)
            if mset.empty():
                # Index this root directory.
                data = {'url': url, 'is_dir': True}
                doc = self._create_document(site_id, data)
                self._db.add_document(doc)
        # Process entries of the directory.
        dir_changed = False
        doc_count = self._db.get_doccount()
        # Get all the entries of this directory in the index.
        dirname = url.path.rstrip(u'/') + u'/'
        dirname_query = xapian.Query(xapian.Query.OP_VALUE_RANGE,
                                     self.DIRNAME_SLOT, dirname, dirname)
        query = xapian.Query(xapian.Query.OP_FILTER, site_id_query,
                             dirname_query)
        enquire.set_query(query)
        indexed_entries = []
        for match in enquire.get_mset(0, doc_count):
            doc = match.get_document()
            is_dir = self._get_doc_value(doc, self.IS_DIR_SLOT)
            basename = self._get_doc_value(doc, self.BASENAME_SLOT)
            # This check is a workaround to avoid removing the root
            # directory of the site when processing the result for the
            # root directory itself.
            if basename != '/':
                try:
                    data = result[basename]
                except KeyError:
                    # Entry removed from the directory in the site.
                    dir_changed = True
                    if is_dir:
                        # Remove entries in the sub-tree of the directory.
                        self._rmtree(site_id, dirname + basename + u'/')
                    else:
                        self._db.delete_document(doc.get_docid())
                else:
                    # Check if metadata is updated.
                    if is_dir == data['is_dir']:
                        indexed_entries.append(basename)
                    else:
                        dir_changed = True
                        # Lazy solution.  Remove the document from the
                        # index and then add it again with the right data.
                        self._db.delete_document(doc.get_docid())
        # Add new or modified entries.
        for entry, data in result:
            if entry not in indexed_entries:
                # New entry found in the directory.  Mark the directory as
                # changed, index the entry and, if it is a directory, add a
                # new task to visit it.
                dir_changed = True
                doc = self._create_document(site_id, data)
                self._db.add_document(doc)
                if data['is_dir']:
                    task = CrawlTask(result.task.site_id, data['url'])
                    self._tasks.put_new(task)
        # Put a new task to visit the directory again.
        self._tasks.put_visited(result.task, dir_changed)
    # Result successfully processed.
    self._results.report_done(result)
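# NOTE: The _rmtree() helper used by process() is not shown in this section.
# The following is only a minimal sketch of one possible implementation,
# reusing the value-range query over DIRNAME_SLOT seen above; the assumption
# that DIRNAME_SLOT stores the full directory path of every entry (so that a
# prefix range covers the whole sub-tree) is inferred from the code above,
# not taken from the original helper.
def _rmtree(self, site_id, dirname):
    """Remove every indexed entry below the given directory path (sketch)."""
    enquire = xapian.Enquire(self._db)
    enquire.set_docid_order(xapian.Enquire.DONT_CARE)
    site_id_query = xapian.Query(self.SITE_ID_PREFIX + site_id)
    # Range from the prefix itself up to the prefix followed by a very high
    # code point, so every dirname starting with the prefix matches.
    subtree_query = xapian.Query(xapian.Query.OP_VALUE_RANGE,
                                 self.DIRNAME_SLOT,
                                 dirname, dirname + u'\uffff')
    enquire.set_query(xapian.Query(xapian.Query.OP_FILTER,
                                   site_id_query, subtree_query))
    # The MSet is computed up front, so documents can be deleted while
    # iterating over the matches.
    for match in enquire.get_mset(0, self._db.get_doccount()):
        self._db.delete_document(match.get_document().get_docid())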