def test_next_url_new_domain_block(self):
    """next_url yields the seed URL first, then the URL scheduled after it."""
    seed_url = "https://my.example.com/"
    later_url = "https://www.example.com/"

    # The constructor itself schedules its argument via schedule_url.
    scheduler = Scheduler(seed_url)
    scheduler.schedule_url(later_url)

    first_out = scheduler.next_url()
    self.assertEqual(first_out, seed_url)

    second_out = scheduler.next_url()
    self.assertEqual(second_out, later_url)
def test_schedule_multiple_url_different_subdomain(self):
    """URLs on two different subdomains end up in two distinct blocks.

    Both blocks popped from ``blocks_to_crawl`` must differ from each other
    and each must return a truthy value from ``next_url``.
    """
    # The constructor itself schedules its argument via schedule_url.
    scheduler = Scheduler("https://my.example.com/path/to/location")
    scheduler.schedule_url("https://www.example.com/path/to")

    # Pop order is up to the container, so the blocks get neutral names.
    first_block = scheduler.blocks_to_crawl.pop()
    second_block = scheduler.blocks_to_crawl.pop()

    self.assertNotEqual(first_block, second_block)
    for block in (first_block, second_block):
        self.assertTrue(block.next_url())
def test_schedule_url_already_crawled(self):
    """A URL already handed out by next_url cannot be scheduled again."""
    seed_url = "https://my.example.com/path/to/location"

    # The constructor itself schedules its argument via schedule_url.
    scheduler = Scheduler(seed_url)
    scheduler.next_url()  # Take the seed URL off the schedule.

    was_scheduled = scheduler.schedule_url(seed_url)
    self.assertFalse(was_scheduled)
def test_next_url_extensions(self):
    """next_url returns the scheduled URL with its query and fragment intact."""
    extended_url = (
        "https://my.example.com/path/to/location?key=val;word=bird#frag"
    )

    # The constructor itself schedules its argument via schedule_url.
    scheduler = Scheduler("https://my.example.com/path/to/location")
    scheduler.schedule_url(extended_url)

    self.assertEqual(scheduler.next_url(), extended_url)
def test_schedule_url_variety(self):
    """A URL carrying both a query string and a fragment is accepted."""
    base_url = "https://my.example.com/path/to/location"
    candidate = (
        "https://my.example.com/path/to/location?key1=val1;key2=val2#frag"
    )

    # The constructor itself schedules its argument via schedule_url.
    scheduler = Scheduler(base_url)

    self.assertTrue(scheduler.schedule_url(candidate))
def test_schedule_url_params(self):
    """A URL with a ``;key=val`` suffix on the seed path is accepted."""
    base_url = "https://my.example.com/path/to/location"
    candidate = "https://my.example.com/path/to/location;key=val"

    # The constructor itself schedules its argument via schedule_url.
    scheduler = Scheduler(base_url)

    self.assertTrue(scheduler.schedule_url(candidate))
def test_schedule_url_fragment(self):
    """A URL that adds only a fragment to the seed URL is accepted."""
    base_url = "https://my.example.com/path/to/location"
    candidate = "https://my.example.com/path/to/location#maincontent"

    # The constructor itself schedules its argument via schedule_url.
    scheduler = Scheduler(base_url)

    self.assertTrue(scheduler.schedule_url(candidate))
def test_schedule_url_different_domain(self):
    """A URL on ``example.org`` is rejected when the seed is on ``example.com``."""
    base_url = "https://my.example.com/path/to/location"
    foreign_url = "https://my.example.org/path/to/location"

    # The constructor itself schedules its argument via schedule_url.
    scheduler = Scheduler(base_url)

    self.assertFalse(scheduler.schedule_url(foreign_url))
def test_schedule_same_url_twice(self):
    """Scheduling the seed URL a second time is rejected."""
    seed_url = "https://my.example.com/path/to/location"

    # The constructor itself schedules its argument via schedule_url, so the
    # explicit call below is the second attempt for the same URL.
    scheduler = Scheduler(seed_url)

    self.assertFalse(scheduler.schedule_url(seed_url))
def test_schedule_multiple_url_same_netloc(self):
    """Two URLs on the same netloc share one block holding both paths."""
    # The constructor itself schedules its argument via schedule_url.
    scheduler = Scheduler("https://www.example.com/path/to/location")
    scheduler.schedule_url("https://www.example.com/path/to")

    shared_block = scheduler.blocks_to_crawl.pop()

    # Both sides are sorted so the comparison ignores queue order.
    actual_paths = sorted(shared_block.extensions_to_crawl)
    expected_paths = sorted(deque(["/path/to", "/path/to/location"]))
    self.assertEqual(actual_paths, expected_paths)