Ejemplo n.º 1
0
 def test_next_url_new_domain_block(self):
     """next_url hands back URLs on two different subdomains in the order scheduled."""
     seed_url = "https://my.example.com/"
     other_subdomain_url = "https://www.example.com/"
     # The seed goes in through the constructor (which, per the original
     # author's note, calls schedule_url itself); the second URL is added
     # explicitly afterwards.
     schedule = Scheduler(seed_url)
     schedule.schedule_url(other_subdomain_url)
     for expected_url in (seed_url, other_subdomain_url):
         self.assertEqual(schedule.next_url(), expected_url)
Ejemplo n.º 2
0
 def test_schedule_multiple_url_different_subdomain(self):
     """Scheduling URLs on two subdomains leaves two unequal blocks, each with a URL to give."""
     # Seed URL is scheduled by the constructor (per the original author's note).
     schedule = Scheduler("https://my.example.com/path/to/location")
     schedule.schedule_url("https://www.example.com/path/to")
     pending_blocks = schedule.blocks_to_crawl
     popped_first = pending_blocks.pop()
     popped_second = pending_blocks.pop()
     self.assertNotEqual(popped_first, popped_second)
     # Each block must yield a truthy value from its own next_url().
     self.assertTrue(popped_first.next_url())
     self.assertTrue(popped_second.next_url())
Ejemplo n.º 3
0
 def test_schedule_url_already_crawled(self):
     """schedule_url returns a falsy value for a URL that next_url has already handed out."""
     crawled_url = "https://my.example.com/path/to/location"
     # Seed URL is scheduled by the constructor (per the original author's note).
     schedule = Scheduler(crawled_url)
     # Consume the seed so that it counts as already handed out.
     schedule.next_url()
     rescheduled = schedule.schedule_url(crawled_url)
     self.assertFalse(rescheduled)
Ejemplo n.º 4
0
 def test_next_url_extensions(self):
     """next_url returns the later-scheduled URL with its query string and fragment intact."""
     url_with_extras = "https://my.example.com/path/to/location?key=val;word=bird#frag"
     # Seed URL is scheduled by the constructor (per the original author's note).
     schedule = Scheduler("https://my.example.com/path/to/location")
     schedule.schedule_url(url_with_extras)
     # The first call to next_url is expected to produce the URL added last.
     self.assertEqual(schedule.next_url(), url_with_extras)
Ejemplo n.º 5
0
 def test_schedule_url_variety(self):
     """A URL that adds a query string and a fragment to the seed path is accepted."""
     # Seed URL is scheduled by the constructor (per the original author's note).
     schedule = Scheduler("https://my.example.com/path/to/location")
     self.assertTrue(
         schedule.schedule_url(
             "https://my.example.com/path/to/location?key1=val1;key2=val2#frag"
         )
     )
Ejemplo n.º 6
0
 def test_schedule_url_params(self):
     """A URL that appends ';key=val' path parameters to the seed path is accepted."""
     seed_url = "https://my.example.com/path/to/location"
     # Seed URL is scheduled by the constructor (per the original author's note).
     schedule = Scheduler(seed_url)
     accepted = schedule.schedule_url("https://my.example.com/path/to/location;key=val")
     self.assertTrue(accepted)
Ejemplo n.º 7
0
 def test_schedule_url_fragment(self):
     """A URL that differs from the seed only by a '#maincontent' fragment is accepted."""
     # Seed URL is scheduled by the constructor (per the original author's note).
     schedule = Scheduler("https://my.example.com/path/to/location")
     url_with_fragment = "https://my.example.com/path/to/location#maincontent"
     self.assertTrue(schedule.schedule_url(url_with_fragment))
Ejemplo n.º 8
0
 def test_schedule_url_different_domain(self):
     """A URL on example.org is refused by a scheduler seeded with an example.com URL."""
     # Seed URL is scheduled by the constructor (per the original author's note).
     schedule = Scheduler("https://my.example.com/path/to/location")
     foreign_url = "https://my.example.org/path/to/location"
     self.assertFalse(schedule.schedule_url(foreign_url))
Ejemplo n.º 9
0
 def test_schedule_same_url_twice(self):
     """Scheduling the seed URL a second time returns a falsy value."""
     duplicate_url = "https://my.example.com/path/to/location"
     # The constructor already schedules the seed (per the original author's
     # note), so the explicit call below is the second attempt.
     schedule = Scheduler(duplicate_url)
     self.assertFalse(schedule.schedule_url(duplicate_url))
Ejemplo n.º 10
0
 def test_schedule_multiple_url_same_netloc(self):
     """Two URLs sharing a netloc end up as two path entries in the popped block."""
     # Seed URL is scheduled by the constructor (per the original author's note).
     schedule = Scheduler("https://www.example.com/path/to/location")
     schedule.schedule_url("https://www.example.com/path/to")
     block = schedule.blocks_to_crawl.pop()
     # Order inside the block is not under test, so the actual entries are
     # sorted and compared against an expectation written in sorted order.
     expected_paths = ["/path/to", "/path/to/location"]
     actual_paths = sorted(block.extensions_to_crawl)
     self.assertEqual(actual_paths, expected_paths)