Beispiel #1
0
    def test_robots_given_lower_path_allowed_url(self):
        """Check a /search/about URL and verify the robots index size stays at one.

        ``size()`` must be 0 on a fresh index, 1 after the first lookup, and
        still 1 after further lookups beneath the same URL.
        """
        robots = RobotsIndex(True, 'duckduckbot')
        self.assertEqual(robots.size(), 0)

        # allowed by /search/about after /search is forbidden
        base_url = "https://google.com/search/about"
        self.assertFalse(check_link(base_url, robots))
        self.assertEqual(robots.size(), 1)

        # subsequent checks should reuse known robots.txt file
        for suffix in ('/more', '/plus', '/extra'):
            self.assertFalse(check_link(base_url + suffix, robots))
        self.assertEqual(robots.size(), 1)
Beispiel #2
0
    def test_robots_given_forbidden_url(self):
        """Assert check_link is truthy for a URL expected to be explicitly prohibited."""
        robots = RobotsIndex(True, 'duckduckbot')

        # prohibited explicitly
        target = "https://github.com/search/"
        result = check_link(target, robots)
        self.assertTrue(result)
Beispiel #3
0
    def test_robots_given_allowed_url(self):
        """Assert check_link is falsy for a URL expected to be explicitly allowed."""
        robots = RobotsIndex(True, 'duckduckbot')

        # allowed explicitly
        target = "https://www.google.com/m/finance"
        result = check_link(target, robots)
        self.assertFalse(result)
Beispiel #4
0
    def test_robots_given_asterisk_path_allowed_url(self):
        """Assert check_link is falsy for a URL expected to match a wildcard rule."""
        robots = RobotsIndex(True, 'duckduckbot')

        # allowed by /*/*/tree/master
        target = "https://github.com/rivermont/spidy/tree/master"
        result = check_link(target, robots)
        self.assertFalse(result)
Beispiel #5
0
 def test_check_link_given_short_url(self):
     """Assert check_link is truthy for the very short URL "http://a"."""
     result = check_link("http://a")
     self.assertTrue(result)
Beispiel #6
0
 def test_check_link_given_invalid_url2(self):
     """Assert check_link is truthy for "github.com", which has no scheme."""
     result = check_link("github.com")
     self.assertTrue(result)
Beispiel #7
0
 def test_check_link_given_invalid_url(self):
     """Assert check_link is truthy for "www.blah.com", which has no scheme."""
     result = check_link("www.blah.com")
     self.assertTrue(result)
Beispiel #8
0
 def test_check_link_given_valid_url(self):
     """Assert check_link is falsy for the well-formed URL "http://www.github.com"."""
     result = check_link("http://www.github.com")
     self.assertFalse(result)
Beispiel #9
0
    def test_robots_given_lower_path_allowed_url(self):
        """Assert check_link is falsy for a /search/about URL.

        The checker is built by init_robot_checker from the same URL that is
        then checked.
        """
        # allowed by /search/about after /search is forbidden
        target = "https://google.com/search/about"
        robots = init_robot_checker(True, 'duckduckbot', target)

        result = check_link(target, robots)
        self.assertFalse(result)