def test_basic(self):
    """Fetch a plain page and verify the session runs to completion."""
    client = RichClient(Client())
    session = client.session(Request.new(self.get_url('/')))

    # Nothing fetched yet.
    self.assertFalse(session.done)

    response = yield session.fetch()

    self.assertEqual(200, response.status_code)
    self.assertTrue(session.done)
def test_bad_redirect(self):
    """A malformed redirect target must surface as ProtocolError."""
    client = RichClient(Client())
    session = client.session(
        Request.new(self.get_url('/bad_redirect')))

    while not session.done:
        try:
            yield session.fetch()
        except ProtocolError:
            # Expected failure: the bad redirect cannot be followed.
            return
        else:
            # A fetch that succeeds means the bad redirect was accepted.
            self.fail()
def test_fetch_disallow(self):
    """A robots.txt with ``Disallow: /`` blocks the fetch with RobotsDenied."""
    http_client = MockHTTPClient()
    pool = RobotsTxtPool()
    client = RichClient(http_client, pool)
    session = MockRobotsTxtRichClientSession(
        client, Request.new('http://example.com'))

    self.assertEqual(RobotsState.unknown, session._robots_state)

    # The session must ask for robots.txt before anything else.
    request = session.next_request
    self.assertTrue(request.url_info.url.endswith('robots.txt'))

    # Serve a robots.txt that denies everything.
    response = Response('HTTP/1.0', 200, 'OK')
    response.body.content_file = io.StringIO('User-agent:*\nDisallow: /\n')
    http_client.response = response
    yield session.fetch()

    self.assertEqual(RobotsState.denied, session._robots_state)

    # Denied: no further request is offered, and fetching raises.
    self.assertIsNone(session.next_request)

    try:
        yield session.fetch()
    except RobotsDenied:
        pass
    else:
        self.fail()

    self.assertTrue(session.done)
def test_redirect_loop(self):
    """A robots.txt stuck in a redirect loop is eventually abandoned.

    After the session gives up on robots.txt, the original request is
    issued and the robots state settles to ``ok``.
    """
    http_client = MockHTTPClient()
    pool = RobotsTxtPool()
    client = RichClient(http_client, pool)
    session = MockRobotsTxtRichClientSession(
        client, Request.new('http://example.com')
    )

    self.assertEqual(RobotsState.unknown, session._robots_state)

    # Serve 21 consecutive self-redirects for robots.txt so the
    # session's redirect limit is exceeded.
    for dummy in range(21):
        request = session.next_request
        self.assertTrue(request.url_info.url.endswith('robots.txt'))

        response = Response('HTTP/1.0', 302, 'See else')
        response.url_info = request.url_info
        response.fields['location'] = '/robots.txt'
        http_client.response = response
        yield session.fetch()

    # The loop was abandoned; the original request is now issued.
    request = session.next_request
    self.assertTrue(request)

    response = Response('HTTP/1.0', 200, 'OK')
    http_client.response = response
    yield session.fetch()

    self.assertEqual(RobotsState.ok, session._robots_state)
    # Removed leftover debug print(session.next_request).
    self.assertTrue(session.done)
def test_server_error(self):
    """Persistent server errors while fetching robots.txt deny the fetch."""
    http_client = MockHTTPClient()
    pool = RobotsTxtPool()
    client = RichClient(http_client, pool)
    session = MockRobotsTxtRichClientSession(
        client, Request.new('http://example.com')
    )

    self.assertEqual(RobotsState.unknown, session._robots_state)

    # Keep answering the robots.txt request with a 500 until the
    # session's retry budget is exhausted.
    for dummy in range(21):
        request = session.next_request
        self.assertTrue(request.url_info.url.endswith('robots.txt'))

        response = Response('HTTP/1.0', 500, 'Opps')
        http_client.response = response
        yield session.fetch()

    # Gave up: no further request, and fetching raises RobotsDenied.
    self.assertIsNone(session.next_request)

    try:
        yield session.fetch()
    except RobotsDenied:
        pass
    else:
        self.fail()

    self.assertTrue(session.done)
def test_redirect(self):
    """Follow a redirect and check the response-type bookkeeping."""
    # NOTE(review): this file defines test_redirect twice with identical
    # bodies; the later definition shadows this one, so only one runs.
    client = RichClient(Client())
    session = client.session(Request.new(self.get_url('/redirect')))
    status_codes = []

    while not session.done:
        response = yield session.fetch()
        if not status_codes:
            # The first hop must be reported as a redirect.
            self.assertEqual(
                RichClientResponseType.redirect, session.response_type)
        status_codes.append(response.status_code)

    self.assertEqual([301, 200], status_codes)
    self.assertTrue(session.done)
    self.assertEqual(RichClientResponseType.normal, session.response_type)
def test_redirect(self):
    """The redirect hop is typed ``redirect``; the final one ``normal``."""
    # NOTE(review): duplicate of an earlier test_redirect in this file —
    # Python keeps only this later definition; consider removing one.
    http_client = Client()
    client = RichClient(http_client)
    session = client.session(Request.new(self.get_url('/redirect')))

    status_codes = []
    while not session.done:
        response = yield session.fetch()
        first_hop = not status_codes
        if first_hop:
            self.assertEqual(RichClientResponseType.redirect,
                             session.response_type)
        status_codes.append(response.status_code)

    self.assertEqual([301, 200], status_codes)
    self.assertTrue(session.done)
    self.assertEqual(RichClientResponseType.normal, session.response_type)
def test_fetch_allow_redirects(self):
    """Walk a redirect chain, fetching robots.txt for each new host.

    Each hop to a previously-unseen host forces a robots.txt fetch
    (which may itself redirect) before the real request is issued.
    The scripted sequence drives the session with a MockHTTPClient.
    """
    http_client = MockHTTPClient()
    pool = RobotsTxtPool()
    client = RichClient(http_client, pool)
    session = MockRobotsTxtRichClientSession(
        client, Request.new('http://example.com'))

    self.assertEqual(RobotsState.unknown, session._robots_state)

    # Try fetch example.com/ (need robots.txt)
    self.assertFalse(session.done)
    request = session.next_request
    self.assertEqual('http://example.com/robots.txt', request.url_info.url)
    response = Response('HTTP/1.0', 301, 'Moved')
    response.fields['location'] = 'http://www.example.com/robots.txt'
    http_client.response = response
    yield session.fetch()
    self.assertEqual(RobotsState.in_progress, session._robots_state)

    # Try fetch www.example.com/robots.txt
    self.assertFalse(session.done)
    request = session.next_request
    self.assertEqual('http://www.example.com/robots.txt',
                     request.url_info.url)
    response = Response('HTTP/1.0', 301, 'Moved')
    response.fields['location'] = 'http://www.example.net/robots.txt'
    http_client.response = response
    yield session.fetch()
    self.assertEqual(RobotsState.in_progress, session._robots_state)

    # Try fetch www.example.net/robots.txt
    self.assertFalse(session.done)
    request = session.next_request
    self.assertEqual('http://www.example.net/robots.txt',
                     request.url_info.url)
    response = Response('HTTP/1.0', 200, 'OK')
    response.body.content_file = io.StringIO('User-agent:*\nAllow: /\n')
    http_client.response = response
    yield session.fetch()
    self.assertEqual(RobotsState.ok, session._robots_state)

    # Try fetch example.com/ (robots.txt already fetched)
    self.assertFalse(session.done)
    request = session.next_request
    self.assertEqual('http://example.com/', request.url_info.url)
    response = Response('HTTP/1.0', 301, 'Moved')
    response.fields['location'] = 'http://www.example.com/'
    http_client.response = response
    yield session.fetch()
    self.assertEqual(RobotsState.ok, session._robots_state)

    # Try www.example.com/ (robots.txt already fetched)
    self.assertFalse(session.done)
    request = session.next_request
    self.assertEqual('http://www.example.com/', request.url_info.url)
    response = Response('HTTP/1.0', 301, 'Moved')
    response.fields['location'] = 'http://www.example.net/'
    http_client.response = response
    yield session.fetch()
    self.assertEqual(RobotsState.ok, session._robots_state)

    # Try www.example.net/ (robots.txt already fetched)
    self.assertFalse(session.done)
    request = session.next_request
    self.assertEqual('http://www.example.net/', request.url_info.url)
    response = Response('HTTP/1.0', 301, 'Moved')
    response.fields['location'] = 'http://lol.example.net/'
    http_client.response = response
    yield session.fetch()
    self.assertEqual(RobotsState.ok, session._robots_state)

    # Try lol.example.net/ (need robots.txt)
    self.assertFalse(session.done)
    request = session.next_request
    self.assertEqual('http://lol.example.net/robots.txt',
                     request.url_info.url)
    response = Response('HTTP/1.0', 200, 'OK')
    response.body.content_file = io.StringIO('User-agent:*\nAllow: /\n')
    http_client.response = response
    yield session.fetch()
    # NOTE(review): state is asserted as in_progress here, not ok —
    # presumably it flips to ok only after the pending request is
    # re-issued below; confirm against the session implementation.
    self.assertEqual(RobotsState.in_progress, session._robots_state)

    # Try lol.example.net/ (robots.txt already fetched)
    self.assertFalse(session.done)
    request = session.next_request
    self.assertEqual('http://lol.example.net/', request.url_info.url)
    response = Response('HTTP/1.0', 200, 'OK')
    http_client.response = response
    yield session.fetch()
    self.assertEqual(RobotsState.ok, session._robots_state)

    self.assertTrue(session.done)