Example #1
 def setUp(self):
     # Create a fresh database home directory for the queue under test.
     self._db_home = os.path.join(TESTDIR, 'testresultqueue')
     os.mkdir(self._db_home)
     self._sites_info = {
         'a78e6853355ad5cdc751ad678d15339382f9ed21':
             {'url': URL('ftp://atlantis.uh.cu/')},
         '7e019d6f671d336a0cc31f137ba034efb13fc327':
             {'url': URL('ftp://andromeda.uh.cu/')},
         'aa958756e769188be9f76fbdb291fe1b2ddd4777':
             {'url': URL('ftp://deltha.uh.cu/')},
         'd4af25db08f5fb6e768db027d51b207cd1a7f5d0':
             {'url': URL('ftp://anduin.uh.cu/')},
         '886b46f54bcd45d4dd5732e290c60e9639b0d101':
             {'url': URL('ftp://tigris.uh.cu/')},
         'ee5b017839d97507bf059ec91f1e5644a30b2fa6':
             {'url': URL('ftp://lara.uh.cu/')},
         '341938200f949daa356e0b62f747580247609f5a':
             {'url': URL('ftp://nimbo.uh.cu/')},
         'd64f2fc98d015a43da3be34668341e3ee6f79133':
             {'url': URL('ftp://liverpool.reduh.uh.cu/')},
         '0d3465f2b9fd5cf55748797c590ea621e3017a29':
             {'url': URL('ftp://london.reduh.uh.cu/')},
         'c5bcce5953866b673054f8927648d634a7237a9b':
             {'url': URL('ftp://bristol.reduh.uh.cu/')},
     }
     self._results = []
     self._results_per_site = 10
     # Build one found crawl result per generated URL of each site.
     for site_id, info in self._sites_info.iteritems():
         for name in (str(n) for n in xrange(self._results_per_site)):
             task = CrawlTask(site_id, info['url'].join(name))
             self._results.append(CrawlResult(task, True))
     self._queue = ResultQueue(self._sites_info, self._db_home)
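
setUp creates the database home on disk, so a matching tearDown is implied; a minimal sketch, assuming ResultQueue exposes a close method (an assumption, not shown above) and that shutil is imported:

 def tearDown(self):
     # Close the queue and remove the test database directory.
     self._queue.close()
     shutil.rmtree(self._db_home)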
Example #2
 def process(self, result):
     """Process a crawl result.

     Enqueue a new crawl task for each directory entry found in the
     result, then report the result as done.
     """
     for entry_url, data in result:
         if data['is_dir']:
             task = CrawlTask(result.task.site_id, entry_url)
             self._tasks.put_new(task)
     self._results.report_done(result)
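
The process method above relies only on the result iterating as (entry_url, data) pairs and exposing its originating task; a minimal stand-in for exercising it in isolation could look like this (FakeResult is a hypothetical name, not part of the project):

 class FakeResult(object):

     def __init__(self, task, entries):
         self.task = task
         self._entries = entries  # list of (entry_url, data) pairs

     def __iter__(self):
         return iter(self._entries)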
Example #3
 def setUp(self):
     self._db_home = os.path.join(TESTDIR, 'testtaskqueue')
     os.mkdir(self._db_home)
     self._request_wait = 2
     self._error_dir_wait = 3
     self._error_site_wait = 4
     self._min_revisit_wait = 2
     self._default_revisit_wait = 4
     self._sites_info = {
         'a78e6853355ad5cdc751ad678d15339382f9ed21': {
             'url': URL('ftp://atlantis.uh.cu/')
         },
         '7e019d6f671d336a0cc31f137ba034efb13fc327': {
             'url': URL('ftp://andromeda.uh.cu/')
         },
         'aa958756e769188be9f76fbdb291fe1b2ddd4777': {
             'url': URL('ftp://deltha.uh.cu/')
         },
         'd4af25db08f5fb6e768db027d51b207cd1a7f5d0': {
             'url': URL('ftp://anduin.uh.cu/')
         },
         '886b46f54bcd45d4dd5732e290c60e9639b0d101': {
             'url': URL('ftp://tigris.uh.cu/')
         },
         'ee5b017839d97507bf059ec91f1e5644a30b2fa6': {
             'url': URL('ftp://lara.uh.cu/')
         },
         '341938200f949daa356e0b62f747580247609f5a': {
             'url': URL('ftp://nimbo.uh.cu/')
         },
         'd64f2fc98d015a43da3be34668341e3ee6f79133': {
             'url': URL('ftp://liverpool.reduh.uh.cu/')
         },
         '0d3465f2b9fd5cf55748797c590ea621e3017a29': {
             'url': URL('ftp://london.reduh.uh.cu/')
         },
         'c5bcce5953866b673054f8927648d634a7237a9b': {
             'url': URL('ftp://bristol.reduh.uh.cu/')
         },
     }
     self._tasks = {}
     self._tasks_per_site = 10
     self._num_sites = len(self._sites_info)
     self._num_tasks = self._num_sites * self._tasks_per_site
     for site_id, info in self._sites_info.iteritems():
         # Set common information.
         info['max_depth'] = 100
         info['request_wait'] = self._request_wait
         info['error_dir_wait'] = self._error_dir_wait
         info['error_site_wait'] = self._error_site_wait
         info['min_revisit_wait'] = self._min_revisit_wait
         info['default_revisit_wait'] = self._default_revisit_wait
         # Create tasks for site.
         task_list = []
         for name in (str(n) for n in xrange(self._tasks_per_site)):
             task_list.append(CrawlTask(site_id, info['url'].join(name)))
         self._tasks[site_id] = task_list
     self._queue = TaskQueue(self._sites_info, self._db_home)
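
The individual tests presumably feed these prepared tasks into the queue; a sketch, assuming TaskQueue exposes the put_new method used in Examples #2 and #7:

     for task_list in self._tasks.itervalues():
         for task in task_list:
             self._queue.put_new(task)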
Example #4
 def setUp(self):
     url = URL('ftp://deltha.uh.cu/')
     site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
     self._num_entries = 10
     self._found = True
     self._task = CrawlTask(site_id, url)
     # The first half of the entries are directories, the second half files.
     self._entries = [(str(i), {'is_dir': i < (self._num_entries / 2)})
                      for i in range(self._num_entries)]
     self._result = CrawlResult(self._task, self._found)
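
How the entries get attached to the result is not shown in this snippet; Example #7 iterates a result and indexes it by entry basename, so a test would populate it roughly as follows (add_entry is a hypothetical method name, not confirmed by the source):

     for name, data in self._entries:
         self._result.add_entry(name, data)  # hypothetical API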
Example #5
 def setUp(self):
     self._url = URL('ftp://deltha.uh.cu/')
     self._site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
     self._task = CrawlTask(self._site_id, self._url)
Example #6
class TestCrawlTask(unittest.TestCase):

    def setUp(self):
        self._url = URL('ftp://deltha.uh.cu/')
        self._site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
        self._task = CrawlTask(self._site_id, self._url)

    def test_properties(self):
        self.assertEqual(self._task.site_id, self._site_id)
        self.assertEqual(str(self._task.url), str(self._url))
        self.assertEqual(self._task.revisit_wait, 0)
        self.assertEqual(self._task.revisit_count, -1)
        self.assertEqual(self._task.change_count, 0)

    def test_pickling(self):
        task = pickle.loads(pickle.dumps(self._task))
        self.assertEqual(self._task.site_id, task.site_id)
        self.assertEqual(str(self._task.url), str(task.url))
        self.assertEqual(self._task.revisit_wait, task.revisit_wait)
        self.assertEqual(self._task.revisit_count, task.revisit_count)
        self.assertEqual(self._task.change_count, task.change_count)

    def test_revisit_wait(self):
        self._task.report_visit(True)
        self._task.report_visit(False)
        self._task.revisit_wait = 60
        self.assertEqual(self._task.revisit_wait, 60)

    def test_reset_counters(self):
        self._task.report_visit(True)
        self._task.report_visit(True)
        self._task.revisit_wait = 60
        self._task.reset_change_count()
        self.assertEqual(self._task.revisit_wait, 60)
        self.assertEqual(self._task.revisit_count, 1)
        self.assertEqual(self._task.change_count, 0)

    def test_report_visit(self):
        # Initial visit (revisit_count starts at -1, so this one does
        # not count as a revisit).
        self._task.report_visit(True)
        # Report visits without changes.
        self._task.report_visit(False)
        self._task.report_visit(False)
        self.assertEqual(self._task.revisit_count, 2)
        self.assertEqual(self._task.change_count, 0)
        # Report visits with changes.
        self._task.report_visit(True)
        self._task.report_visit(True)
        self.assertEqual(self._task.revisit_count, 4)
        self.assertEqual(self._task.change_count, 2)
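
A standard unittest entry point makes the test module runnable on its own:

if __name__ == '__main__':
    unittest.main()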
Example #7
 def process(self, result):
     """Process a crawl result.
     """
     url = result.task.url
     site_id = result.task.site_id
     if not result.found:
         self._rmtree(site_id, url.path)
     else:
         enquire = xapian.Enquire(self._db)
         enquire.set_docid_order(xapian.Enquire.DONT_CARE)
         site_id_query = xapian.Query(self.SITE_ID_PREFIX + site_id)
         if url.is_root:
             # The parent of the root directory is unknown and may not
             # even exist, so make sure the root directory itself is
             # indexed; it is required when searching for files in a
             # selected set of sites.
             root_query = xapian.Query(self.IS_ROOT_PREFIX + self.TRUE_VALUE)
             query = xapian.Query(xapian.Query.OP_FILTER, site_id_query,
                                  root_query)
             enquire.set_query(query)
             mset = enquire.get_mset(0, 1)
             if mset.empty():
                 # Index this root directory.
                 data = {'url': url, 'is_dir': True}
                 doc = self._create_document(site_id, data)
                 self._db.add_document(doc)
         # Process entries of the directory.
         dir_changed = False
         doc_count = self._db.get_doccount()
         # Get all the entries of this directory in the index.  A value
         # range with equal endpoints matches documents whose dirname
         # value is exactly this directory.
         dirname = url.path.rstrip(u'/') + u'/'
         dirname_query = xapian.Query(xapian.Query.OP_VALUE_RANGE,
                                      self.DIRNAME_SLOT, dirname, dirname)
         query = xapian.Query(xapian.Query.OP_FILTER, site_id_query,
                              dirname_query)
         enquire.set_query(query)
         indexed_entries = []
         for match in enquire.get_mset(0, doc_count):
             doc = match.get_document()
             is_dir = self._get_doc_value(doc, self.IS_DIR_SLOT)
             basename = self._get_doc_value(doc, self.BASENAME_SLOT)
             # Workaround: skip the root directory entry itself to avoid
             # removing the site's root directory while processing the
             # result for the root directory.
             if basename != '/':
                 try:
                     data = result[basename]
                 except KeyError:
                     # Entry removed from the directory in the site.
                     dir_changed = True
                     if is_dir:
                         # Remove entries in the sub-tree of the directory.
                         self._rmtree(site_id, dirname + basename + u'/')
                     else:
                         self._db.delete_document(doc.get_docid())
                 else:
                     # Check if metadata is updated.
                     if is_dir == data['is_dir']:
                         indexed_entries.append(basename)
                     else:
                         dir_changed = True
                         # Simplest fix: delete the stale document; it
                         # is re-added below with the correct metadata.
                         self._db.delete_document(doc.get_docid())
         # Add new or modified entries.
         for entry, data in result:
             if entry not in indexed_entries:
                 # New entry found in the directory.  Mark the directory
                 # as changed, index the entry and, if it is a directory,
                 # add a new task to visit it.
                 dir_changed = True
                 doc = self._create_document(site_id, data)
                 self._db.add_document(doc)
                 if data['is_dir']:
                     task = CrawlTask(result.task.site_id, data['url'])
                     self._tasks.put_new(task)
         # Put a new task to visit the directory again.
         self._tasks.put_visited(result.task, dir_changed)
     # Result successfully processed.
     self._results.report_done(result)
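
The _rmtree helper is not shown; judging from its call sites, it deletes every indexed document in the sub-tree of a given directory. A sketch using only the slot and prefix conventions visible above (not the project's actual implementation; handling of the directory's own document, whose dirname is its parent, is glossed over here):

 def _rmtree(self, site_id, dirpath):
     # Match this site's documents whose dirname value starts with
     # dirpath, i.e. everything below dirpath.
     enquire = xapian.Enquire(self._db)
     enquire.set_docid_order(xapian.Enquire.DONT_CARE)
     site_id_query = xapian.Query(self.SITE_ID_PREFIX + site_id)
     subtree_query = xapian.Query(xapian.Query.OP_VALUE_RANGE,
                                  self.DIRNAME_SLOT, dirpath,
                                  dirpath + u'\uffff')
     enquire.set_query(xapian.Query(xapian.Query.OP_FILTER, site_id_query,
                                    subtree_query))
     for match in enquire.get_mset(0, self._db.get_doccount()):
         self._db.delete_document(match.get_document().get_docid())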