def test_with_multiple_active_queues(self):
    """Check that two queued hosts occupy both active queue slots.

    Two URIs on different hosts are added to a frontier configured with
    two active queues. Before ``_maybe_add_queues`` runs no queue is
    active; afterwards both slots must be in use, and ``get_next`` must
    complete without raising.
    """
    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"
    settings.FRONTIER_ACTIVE_QUEUES = 2
    settings.FRONTIER_QUEUE_BUDGET = 4
    settings.FRONTIER_QUEUE_BUDGET_PUNISH = 5

    frontier = MultipleHostFrontier(settings, StreamHandler(sys.stdout))

    curi1 = CrawlUri("http://localhost")
    curi1.current_priority = 2
    curi1.req_time = 0.4
    frontier.add_uri(curi1)

    curi2 = CrawlUri("http://www.google.de")
    curi2.current_priority = 1
    curi2.req_time = 1.4
    frontier.add_uri(curi2)

    # Adding URIs alone does not activate any queue.
    self.assertEqual(0, len(frontier._current_queues))
    frontier._maybe_add_queues()
    self.assertEqual(2, len(frontier._current_queues))

    # Smoke check only: the returned URI is intentionally not inspected,
    # the call just has to succeed with both queues active.
    frontier.get_next()
def test_with_multiple_active_queues(self):
    """Check that two queued hosts occupy both active queue slots.

    Two URIs on different hosts are added to a frontier configured with
    two active queues. Before ``_maybe_add_queues`` runs no queue is
    active; afterwards both slots must be in use, and ``get_next`` must
    complete without raising.
    """
    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"
    settings.FRONTIER_ACTIVE_QUEUES = 2
    settings.FRONTIER_QUEUE_BUDGET = 4
    settings.FRONTIER_QUEUE_BUDGET_PUNISH = 5

    frontier = MultipleHostFrontier(settings, StreamHandler(sys.stdout))

    curi1 = CrawlUri("http://localhost")
    curi1.current_priority = 2
    curi1.req_time = 0.4
    frontier.add_uri(curi1)

    curi2 = CrawlUri("http://www.google.de")
    curi2.current_priority = 1
    curi2.req_time = 1.4
    frontier.add_uri(curi2)

    # Adding URIs alone does not activate any queue.
    self.assertEqual(0, len(frontier._current_queues))
    frontier._maybe_add_queues()
    self.assertEqual(2, len(frontier._current_queues))

    # Smoke check only: the returned URI is intentionally not inspected,
    # the call just has to succeed with both queues active.
    frontier.get_next()
def test_that_time_based_politeness_works(self):
    """Check that a successful crawl pushes the next crawl time forward.

    After ``process_successful_crawl`` the frontier's
    ``_next_possible_crawl`` must be later than it was before and later
    than the current time, and ``get_next`` must raise ``Empty``.
    """
    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"
    frontier = SingleHostFrontier(settings, StreamHandler(sys.stdout))

    # Response date: two days ago, truncated to whole seconds.
    current = datetime.fromtimestamp(time.time())
    response_date = datetime(*current.timetuple()[0:6]) - timedelta(days=2)

    crawled = CrawlUri("http://localhost/test")
    crawled.current_priority = 3
    crawled.rep_header = {
        "Etag": "123",
        "Date": serialize_date_time(response_date),
    }
    crawled.req_time = 0.5

    heap_entry = frontier._uri_from_curi(crawled)
    frontier._add_to_heap(heap_entry, 0)

    earliest_before = frontier._next_possible_crawl
    frontier.process_successful_crawl(crawled)

    self.assertTrue(frontier._next_possible_crawl > earliest_before)
    self.assertTrue(frontier._next_possible_crawl > time.time())
    self.assertRaises(Empty, frontier.get_next)
def test_that_time_based_politeness_works(self):
    """Check that a successful crawl pushes the next crawl time forward.

    After ``process_successful_crawl`` the frontier's
    ``_next_possible_crawl`` must be later than it was before and later
    than the current time, and ``get_next`` must raise ``Empty``.
    """
    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"
    frontier = SingleHostFrontier(settings, StreamHandler(sys.stdout))

    # Response date: two days ago, truncated to whole seconds.
    current = datetime.fromtimestamp(time.time())
    response_date = datetime(*current.timetuple()[0:6]) - timedelta(days=2)

    crawled = CrawlUri("http://localhost/test")
    crawled.current_priority = 3
    crawled.rep_header = {
        "Etag": "123",
        "Date": serialize_date_time(response_date),
    }
    crawled.req_time = 0.5

    heap_entry = frontier._uri_from_curi(crawled)
    frontier._add_to_heap(heap_entry, 0)

    earliest_before = frontier._next_possible_crawl
    frontier.process_successful_crawl(crawled)

    self.assertTrue(frontier._next_possible_crawl > earliest_before)
    self.assertTrue(frontier._next_possible_crawl > time.time())
    self.assertRaises(Empty, frontier.get_next)
def test_queues_work(self):
    """Walk the single active queue through punishment and normal use.

    The frontier is configured with one active queue slot, a budget of 4
    and a punishment of 5. The test checks that:

    * the active queue starts with the full budget and keeps it across
      ``_cleanup_budget_politeness`` and ``_update_heap``;
    * a server error drops the budget to -1 (4 - 5);
    * after the next cleanup exactly one queue is active with a budget
      of 4 again;
    * a successful crawl reduces that budget to 3.
    """
    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"
    settings.FRONTIER_ACTIVE_QUEUES = 1
    settings.FRONTIER_QUEUE_BUDGET = 4
    settings.FRONTIER_QUEUE_BUDGET_PUNISH = 5

    frontier = MultipleHostFrontier(settings, StreamHandler(sys.stdout))

    curi1 = CrawlUri("http://localhost")
    curi1.current_priority = 2
    curi1.req_time = 0.4
    frontier.add_uri(curi1)

    curi2 = CrawlUri("http://foreignhost")
    curi2.current_priority = 1
    curi2.req_time = 1.4
    frontier.add_uri(curi2)

    # Adding URIs alone does not activate any queue.
    self.assertEqual(0, len(frontier._current_queues))
    frontier._maybe_add_queues()
    self.assertEqual(1, len(frontier._current_queues))

    # Exactly one queue is active (asserted above); grab its identifier.
    q1 = next(iter(frontier._current_queues.keys()))

    self.assertEqual(4, frontier._budget_politeness[q1])
    frontier._cleanup_budget_politeness()
    self.assertEqual(4, frontier._budget_politeness[q1])

    frontier._update_heap()
    self.assertEqual(1, len(frontier._current_queues))

    # Report a server error for the URI that belongs to the active queue.
    # NOTE(review): presumably queue 1 is curi1's host and any other id is
    # curi2's - confirm against the frontier's queue numbering.
    if q1 == 1:
        curi1.status_code = 500
        frontier.process_server_error(curi1)
    else:
        # The status code must be set on the URI that is actually processed.
        curi2.status_code = 500
        frontier.process_server_error(curi2)

    # Budget 4 minus punishment 5.
    self.assertEqual(-1, frontier._budget_politeness[q1])

    frontier._cleanup_budget_politeness()
    self.assertEqual(1, len(frontier._current_queues))

    q2 = next(iter(frontier._current_queues.keys()))

    self.assertEqual(4, frontier._budget_politeness[q2])
    frontier._cleanup_budget_politeness()
    self.assertEqual(4, frontier._budget_politeness[q2])

    frontier._update_heap()
    self.assertEqual(1, len(frontier._current_queues))

    if q2 == 1:
        curi1.status_code = 200
        frontier.process_successful_crawl(curi1)
    else:
        curi2.status_code = 200
        frontier.process_successful_crawl(curi2)

    # A successful crawl costs one unit of budget.
    self.assertEqual(3, frontier._budget_politeness[q2])

    frontier._cleanup_budget_politeness()
def test_queues_work(self):
    """Walk the single active queue through punishment and normal use.

    The frontier is configured with one active queue slot, a budget of 4
    and a punishment of 5. The test checks that:

    * the active queue starts with the full budget and keeps it across
      ``_cleanup_budget_politeness`` and ``_update_heap``;
    * a server error drops the budget to -1 (4 - 5);
    * after the next cleanup exactly one queue is active with a budget
      of 4 again;
    * a successful crawl reduces that budget to 3.
    """
    settings = Settings()
    settings.FRONTIER_STATE_FILE = ":memory:"
    settings.FRONTIER_ACTIVE_QUEUES = 1
    settings.FRONTIER_QUEUE_BUDGET = 4
    settings.FRONTIER_QUEUE_BUDGET_PUNISH = 5

    frontier = MultipleHostFrontier(settings, StreamHandler(sys.stdout))

    curi1 = CrawlUri("http://localhost")
    curi1.current_priority = 2
    curi1.req_time = 0.4
    frontier.add_uri(curi1)

    curi2 = CrawlUri("http://foreignhost")
    curi2.current_priority = 1
    curi2.req_time = 1.4
    frontier.add_uri(curi2)

    # Adding URIs alone does not activate any queue.
    self.assertEqual(0, len(frontier._current_queues))
    frontier._maybe_add_queues()
    self.assertEqual(1, len(frontier._current_queues))

    # Exactly one queue is active (asserted above); grab its identifier.
    q1 = next(iter(frontier._current_queues.keys()))

    self.assertEqual(4, frontier._budget_politeness[q1])
    frontier._cleanup_budget_politeness()
    self.assertEqual(4, frontier._budget_politeness[q1])

    frontier._update_heap()
    self.assertEqual(1, len(frontier._current_queues))

    # Report a server error for the URI that belongs to the active queue.
    # NOTE(review): presumably queue 1 is curi1's host and any other id is
    # curi2's - confirm against the frontier's queue numbering.
    if q1 == 1:
        curi1.status_code = 500
        frontier.process_server_error(curi1)
    else:
        # The status code must be set on the URI that is actually processed.
        curi2.status_code = 500
        frontier.process_server_error(curi2)

    # Budget 4 minus punishment 5.
    self.assertEqual(-1, frontier._budget_politeness[q1])

    frontier._cleanup_budget_politeness()
    self.assertEqual(1, len(frontier._current_queues))

    q2 = next(iter(frontier._current_queues.keys()))

    self.assertEqual(4, frontier._budget_politeness[q2])
    frontier._cleanup_budget_politeness()
    self.assertEqual(4, frontier._budget_politeness[q2])

    frontier._update_heap()
    self.assertEqual(1, len(frontier._current_queues))

    if q2 == 1:
        curi1.status_code = 200
        frontier.process_successful_crawl(curi1)
    else:
        curi2.status_code = 200
        frontier.process_successful_crawl(curi2)

    # A successful crawl costs one unit of budget.
    self.assertEqual(3, frontier._budget_politeness[q2])

    frontier._cleanup_budget_politeness()