def pop(self):
    """Return the next document to fetch"""
    # Queues are drained in strict precedence order:
    # priority first, then normal, then refetch.
    document_metadata = DocumentMetadata()
    # Priority entries are time-gated: pop(now) only yields items that
    # are due at the current timestamp.
    item = self.priority_store.pop(int(time.time()))
    if item:
        # Priority/refetch item layout (inferred from the indices used
        # here — confirm against the store implementation):
        # item[1]=url, item[2]=delay, item[3]=depth
        logging.debug("Get priority:" + str(item[1]))
        document_metadata.url = item[1]
        document_metadata.depth = item[3]
        document_metadata.delay = item[2]
        document_metadata.source = Source.priority
    else:
        # Drain the normal queue until we find a url not yet in `seen`
        # (or the queue is empty).
        while not item:
            item = self.normal_store.pop()
            if not item:
                break
            # the following check is needed because urls are stored in seen
            # after seeing them
            # so we can have multiple identical url in normal list.
            # and we do not want to have multiple same urls in refetching list
            if not self.seen.is_new(item[1]):
                item = None
        if item:
            # In case of network error I repush url on normal queue
            # just to not loose them. So it is possible we have
            # something already seen here.
            # It is not a problem to refetch this cases
            logging.debug("Get normal:" + str(item[1]))
            # Normal item layout (inferred from usage): item[0]=depth,
            # item[1]=url; normal fetches carry no delay.
            document_metadata.url = item[1]
            document_metadata.depth = item[0]
            document_metadata.delay = 0
            document_metadata.source = Source.normal
        else:
            # Nothing priority or normal: fall back to the time-gated
            # refetch queue.
            item = self.refetch_store.pop(int(time.time()))
            if item:
                logging.debug("Get Refetch:" + str(item[1]))
                document_metadata.url = item[1]
                document_metadata.depth = item[3]
                document_metadata.delay = item[2]
                document_metadata.source = Source.refetch
    # If every queue was empty this is still the default-constructed
    # DocumentMetadata (its field defaults are set by the class, not here).
    return document_metadata
def test_pop_ordering(self, mc):
    '''
    Test adding url to priority, normal and refetch and
    checking the ordering of popping is correct
    '''
    mc.return_value = mongomock.MongoClient()
    manager = QueueManager("queues-names", START_DELAY, CONFIGURATION)

    # Seed the priority queue with a single url.
    manager.init_priority_list(["www.daniele.com"])

    # Seed the normal queue through the bootstrap path.
    manager.add_bootstrap_urls([{"url": "www.daniele1.com", "depth": "2"}])

    # Seed the refetch queue by rescheduling an already-seen document.
    meta = DocumentMetadata("http://www.randomurl8.it")
    meta.depth = 1
    meta.dhash = 121212
    meta.source = Source.normal
    meta.delay = 500
    meta.alternatives = ["http://www.randomurl8.it"]
    manager.add_seen_and_reschedule(meta)

    # Freeze time so every queued url is due to be popped.
    with mock.patch("time.time", mock_time):
        first = manager.pop()
        # Priority must come out first.
        self.assertEqual(first.depth, 0)
        self.assertEqual(first.source, Source.priority)
        second = manager.pop()
        # Then the normal queue.
        self.assertEqual(second.source, Source.normal)
        third = manager.pop()
        # And finally the refetch queue.
        self.assertEqual(third.source, Source.refetch)
def test_reschedule_priority(self):
    ############################################
    # Reschedule a document that came from the priority queue;
    # it should be handed back with the initial START_DELAY.
    meta = DocumentMetadata("http://www.randomurl8.it")
    meta.depth = 1
    meta.dhash = 121212
    meta.source = Source.priority
    meta.delay = 500
    meta.alternatives = ["http://www.randomurl8.it"]
    self.qm.add_seen_and_reschedule(meta)

    # The url is now tracked in seen: count reset to 1, hash recorded.
    self.assertEqual(self.qm.seen.get(meta.url).get("count"), 1)
    self.assertEqual(self.qm.seen.get(meta.url).get("page_hash"), meta.dhash)

    with mock.patch("time.time", mock_time):
        popped = self.qm.pop()
        self.assertEqual(popped.delay, START_DELAY)
        self.assertEqual(popped.source, Source.priority)
def test_reschedule_different_content(self):
    ############################################
    # The url is already in seen but with a different hash, and it was
    # not taken from the priority queue.
    # Expected: the refetch delay is halved and the seen counter is 1.
    meta = DocumentMetadata("http://www.randomurl1.it")
    meta.depth = 1
    meta.dhash = 1936
    meta.source = Source.normal
    meta.delay = 20
    meta.alternatives = ["http://www.randomurl1.it"]
    self.qm.add_seen_and_reschedule(meta)

    # Verify the seen bookkeeping.
    self.assertEqual(self.qm.seen.get(meta.url).get("count"), 1)
    self.assertEqual(self.qm.seen.get(meta.url).get("page_hash"), meta.dhash)

    with mock.patch("time.time", mock_time):
        rescheduled = self.qm.pop()
        self.assertEqual(rescheduled.delay, meta.delay / 2)
        self.assertEqual(rescheduled.source, Source.refetch)
def test_reschedule4(self):
    ############################################
    # Reschedule a brand-new url.
    # Expected: it gets the configured default refetching delay.
    meta = DocumentMetadata("http://www.randomurl8.it")
    meta.depth = 1
    meta.dhash = 121212
    meta.source = Source.normal
    meta.delay = 500
    meta.alternatives = ["http://www.randomurl8.it"]
    self.qm.add_seen_and_reschedule(meta)

    # Verify the seen bookkeeping.
    self.assertEqual(self.qm.seen.get(meta.url).get("count"), 1)
    self.assertEqual(self.qm.seen.get(meta.url).get("page_hash"), meta.dhash)

    with mock.patch("time.time", mock_time):
        rescheduled = self.qm.pop()
        self.assertEqual(rescheduled.delay,
                         CONFIGURATION["queues"]["refetching-delay"])
        self.assertEqual(rescheduled.source, Source.refetch)
def test_reschedule3(self):
    ############################################
    # As the previous case but with a small delay:
    # check that the delay is left unchanged.
    meta = DocumentMetadata("http://www.randomurl1.it")
    meta.depth = 1
    meta.dhash = 121212
    meta.source = Source.normal
    meta.delay = 500
    meta.alternatives = ["http://www.randomurl1.it"]
    self.qm.add_seen_and_reschedule(meta)

    # Verify the seen bookkeeping.
    self.assertEqual(self.qm.seen.get(meta.url).get("count"), 1)
    self.assertEqual(self.qm.seen.get(meta.url).get("page_hash"), meta.dhash)

    with mock.patch("time.time", mock_time):
        rescheduled = self.qm.pop()
        self.assertEqual(rescheduled.delay, meta.delay)
        self.assertEqual(rescheduled.source, Source.refetch)
def test_reschedule_samecontent_lastdelay(self):
    ############################################
    # Reschedule a url that is already in seen with the SAME hash and
    # was not taken from the priority queue.
    # Expected: the delay is doubled and the seen counter reset to 1.
    meta = DocumentMetadata("http://www.randomurl1.it")
    meta.depth = 1
    meta.dhash = 12345
    meta.source = Source.normal
    meta.delay = 40
    # alternatives contains always at least one url.
    meta.alternatives = ["http://www.randomurl1.it"]

    # The previously stored alternatives must also be updated, even
    # though the new alternatives field differs from the stored one.
    stored_alternatives = self.qm.seen.get(meta.url).get("alternatives")
    self.assertNotEqual(len(meta.alternatives), len(stored_alternatives))

    self.qm.add_seen_and_reschedule(meta)

    # Check the bookkeeping for the main url.
    self.assertEqual(self.qm.seen.get(meta.url).get("count"), 1)

    # Check every former alternative was updated as well.
    for alt in stored_alternatives:
        self.assertEqual(self.qm.seen.get(alt).get("count"), 1)
        # NOTE(review): the hash is read from meta.url here, not from
        # the alternative itself — confirm this is intentional.
        self.assertEqual(self.qm.seen.get(meta.url).get("page_hash"),
                         meta.dhash)

    with mock.patch("time.time", mock_time):
        popped = self.qm.pop()
        self.assertEqual(popped.url, "")
        self.assertEqual(popped.source, Source.unknown)