    def test_should_respect_the_robots_txt_rules(self):
        rules = '''
        User-Agent: *
        Disallow: /login
        '''
        solution_one.RERP.parse(rules)
        url = 'http://test.com/login'
        self.assertFalse(solution_one.can_visit_link(url))
        url = 'http://test.com/logout'
        self.assertTrue(solution_one.can_visit_link(url))

    def test_should_not_allow_external_links(self):
        url = 'http://test.com/internal'
        self.assertTrue(solution_one.can_visit_link(url))
        url = 'http://external.com/home'
        self.assertFalse(solution_one.can_visit_link(url))

    def test_should_not_allow_the_same_url_twice(self):
        url = 'http://test.com/twice'
        self.assertTrue(solution_one.can_visit_link(url))
        solution_one.discovered = [url]
        self.assertFalse(solution_one.can_visit_link(url))

    def test_should_check_the_black_list(self):
        solution_one.BLACKLIST_REGEX = re.compile(r'/private')
        url = 'http://test.com/private'
        self.assertFalse(solution_one.can_visit_link(url))
        url = 'http://test.com/public'
        self.assertTrue(solution_one.can_visit_link(url))
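
# For reference, a minimal sketch of what solution_one.can_visit_link might
# look like, reconstructed only from the behaviour the tests above exercise.
# Everything here is an assumption, not the actual solution_one code: the
# globals USER_AGENT, DOMAIN, RERP, BLACKLIST_REGEX and discovered, and the
# use of robotexclusionrulesparser (which the name RERP suggests), are all
# hypothetical.
from urllib.parse import urlparse

from robotexclusionrulesparser import RobotExclusionRulesParser

USER_AGENT = '*'                    # hypothetical crawler identity
DOMAIN = 'test.com'                 # the tests only allow links on this host
RERP = RobotExclusionRulesParser()  # the tests feed rules via RERP.parse(...)
BLACKLIST_REGEX = None              # the tests assign a compiled regex here
discovered = []                     # URLs the crawler has already seen


def can_visit_link(url):
    """True only for internal, unseen, non-blacklisted, robots-allowed URLs."""
    if urlparse(url).netloc != DOMAIN:                # skip external links
        return False
    if url in discovered:                             # never visit a URL twice
        return False
    if BLACKLIST_REGEX and BLACKLIST_REGEX.search(url):
        return False                                  # skip blacklisted paths
    return RERP.is_allowed(USER_AGENT, url)           # honour robots.txt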