Beispiel #1
0
 def test_interesting_files(self):
     """Process ``self.html`` and check the URLs reported by ``interesting_files()``."""
     expected_urls = [
         'http://domain.com/path/foo.php',
         'http://domain.com/path/error_log',
     ]
     processor = ProcessIndexOfRequest(None, self.get_crawler_url())
     soup = BeautifulSoup(self.html, 'html.parser')
     processor.process(self.html, soup)
     # Only the URL of each reported entry is compared, in the order returned.
     found_urls = [entry.url for entry in processor.interesting_files()]
     self.assertEqual(found_urls, expected_urls)
Beispiel #2
0
 def test_process(self):
     """Process the ``TestCommonDirectoryList`` HTML fixture and compare the collected URLs.

     The URLs found in ``process.files`` must equal ``TestCommonDirectoryList.urls``.
     """
     processor = ProcessIndexOfRequest(None, self.get_crawler_url())
     listing_html = TestCommonDirectoryList.html
     soup = BeautifulSoup(listing_html, 'html.parser')
     processor.process(listing_html, soup)
     collected = [entry.url for entry in processor.files]
     self.assertEqual(collected, TestCommonDirectoryList.urls)
Beispiel #3
0
 def test_process(self):
     """Process ``self.html`` and check the URLs collected in ``process.files``."""
     expected_urls = [
         'http://domain.com/',
         'http://domain.com/path/dir/',
         'http://domain.com/path/foo.php',
         'http://domain.com/path/error_log',
         'http://domain.com/spam/eggs',
     ]
     processor = ProcessIndexOfRequest(None, self.get_crawler_url())
     soup = BeautifulSoup(self.html, 'html.parser')
     processor.process(self.html, soup)
     # The comparison is order-sensitive: both sides are lists.
     collected = [entry.url for entry in processor.files]
     self.assertEqual(collected, expected_urls)
Beispiel #4
0
 def test_is_applicable(self):
     """Check that ``ProcessIndexOfRequest.is_applicable`` is truthy for ``self.html``.

     The response object is obtained from a mocked GET request that serves
     ``self.html`` with a ``text/html`` content type.
     """
     crawler_url = self.get_crawler_url()
     with requests_mock.mock() as mocker:
         mocker.get(
             'http://test.com',
             text=self.html,
             headers={'Content-Type': 'text/html'},
         )
         response = requests.get('http://test.com')
         soup = BeautifulSoup(self.html, 'html.parser')
         applicable = ProcessIndexOfRequest.is_applicable(
             response, self.html, crawler_url, soup)
         self.assertTrue(applicable)
Beispiel #5
0
 def test_str(self):
     """Smoke test: ``str()`` on a processor that has handled ``self.html`` must not raise."""
     processor = ProcessIndexOfRequest(None, self.get_crawler_url())
     soup = BeautifulSoup(self.html, 'html.parser')
     processor.process(self.html, soup)
     # The resulting string is deliberately discarded; only the absence of an
     # exception is being checked.
     str(processor)
Beispiel #6
0
 def test_flag_nothing(self):
     """Process an empty document and check the resulting ``flags`` set."""
     empty_html = ''
     expected_flags = {'index_of', 'index_of.nothing'}
     processor = ProcessIndexOfRequest(None, self.get_crawler_url())
     processor.process(empty_html, BeautifulSoup(empty_html, 'html.parser'))
     self.assertEqual(processor.flags, expected_flags)
Beispiel #7
0
 def get_processor(self):
     """Return a new ``ProcessIndexOfRequest`` built from ``None`` and this test's crawler URL."""
     crawler_url = self.get_crawler_url()
     processor = ProcessIndexOfRequest(None, crawler_url)
     return processor