def _process(self, item):
    """Run the pre-fetch checks for one crawl item and build its report.

    The visible part of this method validates the URL and consults the
    robots rules; every failing check is recorded in ``report["result"]``
    and the report is returned immediately.

    Args:
        item: mapping that must contain a ``"url"`` key.

    Returns:
        The report dict (keys ``url``, ``result``, ``status_code``,
        ``visited``) when one of the checks fails.
        NOTE(review): when every check passes, the code visible here falls
        off the end and returns ``None`` implicitly; the original method
        presumably continues with the actual fetch -- confirm against the
        full file.

    Raises:
        Stop: when ``self.ask_robots`` returns ``None`` (presumably unless
            ``Stop`` derives from ``CrawlError``, in which case it is turned
            into a report -- TODO confirm).
    """
    url = item["url"]
    log.debug(u"Crawling: %s", url)
    uri = httplib2.iri2uri(url)
    report = {
        "url": url,
        "result": None,
        "status_code": None,
        "visited": None,
    }

    # NOTE(review): not read anywhere in the visible part of this method;
    # kept because later (unseen) code may use it for timing.
    total_start_time = time.time()

    (scheme, authority, _path, _query, _fragment) = httplib2.parse_uri(uri)
    if scheme is None or authority is None:
        report["result"] = u"Invalid URI"
        return report

    try:
        # this line is copied from robotsparser.py:can_fetch
        # Only executed for its side effect of raising on bad quoting.
        urllib.quote(urlparse.urlparse(urllib.unquote(url))[2])
    except KeyError:
        report["result"] = u"Malformed URL quoting."
        return report

    try:
        robot_check_result = self.ask_robots(uri, scheme, authority)
        # Graceful stop thing.
        if robot_check_result is None:
            raise Stop()
    except CrawlError as e:
        # ``as`` form works on Python 2.6+ and 3; the old comma form is a
        # SyntaxError on Python 3.
        report["result"] = unicode(e)
        return report
def _follow_redirect(self, uri, method, body, headers, response, content,
                     max_redirects):
    """Internal function to follow a redirect received by L{request}.

    Normalises the ``Location`` header, records permanent redirects in the
    cache, strips the conditional request headers and re-issues the request
    through ``self.request`` with ``max_redirects`` decremented by one.

    Args:
        uri: URI of the request that produced the redirect.
        method: HTTP method of that request.
        body: request body, passed through to the follow-up request.
        headers: request headers dict; ``if-none-match`` and
            ``if-modified-since`` are removed in place.
        response: redirect response (dict-like with a ``status`` attribute);
            ``location`` may be rewritten in place to an absolute URI.
        content: body of the redirect response.
        max_redirects: remaining redirect budget of the caller.

    Returns:
        Whatever ``self.request`` returns for the redirect target.

    Raises:
        httplib2.RedirectMissingLocation: no ``Location`` header and the
            status is not 300.
        httplib2.RedirectLimit: the response carries no ``Location`` header
            (only reachable for status 300).
    """
    (_scheme, _authority, absolute_uri, defrag_uri) = httplib2.urlnorm(
        httplib2.iri2uri(uri))
    cachekey = defrag_uri if self.cache else None

    # Pick out the location header and basically start from the beginning
    # remembering first to strip the ETag header and decrement our 'depth'
    if 'location' not in response and response.status != 300:
        raise httplib2.RedirectMissingLocation(
            "Redirected but the response is missing a Location: header.",
            response, content)

    # Fix-up relative redirects (which violate an RFC 2616 MUST)
    if 'location' in response:
        location = response['location']
        (_scheme, authority, _path, _query, _fragment) = httplib2.parse_uri(
            location)
        if authority is None:
            response['location'] = httplib2.urlparse.urljoin(uri, location)
            logging.debug('Relative redirect: changed [%s] to [%s]',
                          location, response['location'])

    if response.status == 301 and method in ["GET", "HEAD"]:
        response['-x-permanent-redirect-url'] = response['location']
        if 'content-location' not in response:
            response['content-location'] = absolute_uri
        httplib2._updateCache(headers, response, content, self.cache,
                              cachekey)

    # The follow-up request must not be conditional on the old resource.
    headers.pop('if-none-match', None)
    headers.pop('if-modified-since', None)

    if 'location' in response:
        location = response['location']
        # A 303 turns any non-GET/HEAD request into a GET.
        if response.status == 303 and method not in ["GET", "HEAD"]:
            redirect_method = "GET"
        else:
            redirect_method = method
        return self.request(location, redirect_method, body=body,
                            headers=headers,
                            max_redirects=max_redirects - 1)
    else:
        raise httplib2.RedirectLimit(
            "Redirected more times than redirection_limit allows.",
            response, content)
def testFromStd66(self):
    """Check ``httplib2.parse_uri`` against a table of sample URIs.

    Each entry pairs the expected ``(scheme, authority, path, query,
    fragment)`` tuple with the URI that should produce it.
    """
    expectations = [
        (('http', 'example.com', '', None, None),
         "http://example.com"),
        (('https', 'example.com', '', None, None),
         "https://example.com"),
        (('https', 'example.com:8080', '', None, None),
         "https://example.com:8080"),
        (('http', 'example.com', '/', None, None),
         "http://example.com/"),
        (('http', 'example.com', '/path', None, None),
         "http://example.com/path"),
        (('http', 'example.com', '/path', 'a=1&b=2', None),
         "http://example.com/path?a=1&b=2"),
        (('http', 'example.com', '/path', 'a=1&b=2', 'fred'),
         "http://example.com/path?a=1&b=2#fred"),
        (('http', 'example.com', '/path', 'a=1&b=2', 'fred'),
         "http://example.com/path?a=1&b=2#fred"),
    ]
    for parts, uri in expectations:
        self.assertEqual(parts, httplib2.parse_uri(uri))
def _follow_redirect(self, uri, method, body, headers, response, content,
                     max_redirects):
    """Follow a redirect received by L{request} (internal helper).

    Makes a relative ``Location`` absolute, stores permanent (301)
    redirects of GET/HEAD requests in the cache, drops the conditional
    request headers and re-issues the request via ``self.request`` with
    ``max_redirects`` reduced by one.

    Raises ``httplib2.RedirectMissingLocation`` when the ``Location``
    header is absent on a non-300 response, and ``httplib2.RedirectLimit``
    when there is no ``Location`` header to follow.
    """
    normalised = httplib2.urlnorm(httplib2.iri2uri(uri))
    (scheme, authority, absolute_uri, defrag_uri) = normalised
    cachekey = defrag_uri if self.cache else None

    # Pick out the location header and basically start from the beginning
    # remembering first to strip the ETag header and decrement our 'depth'
    location_present = "location" in response
    if not location_present and response.status != 300:
        raise httplib2.RedirectMissingLocation(
            "Redirected but the response is missing a Location: header.",
            response, content)

    # Fix-up relative redirects (which violate an RFC 2616 MUST)
    if location_present:
        target = response['location']
        (scheme, authority, path, query, fragment) = httplib2.parse_uri(
            target)
        if authority is None:
            response['location'] = httplib2.urlparse.urljoin(uri, target)
            pywikibot.debug(
                u"Relative redirect: changed [%s] to [%s]"
                % (target, response['location']),
                _logger)

    safe_method = method in ["GET", "HEAD"]
    if response.status == 301 and safe_method:
        response['-x-permanent-redirect-url'] = response['location']
        if "content-location" not in response:
            response['content-location'] = absolute_uri
        httplib2._updateCache(headers, response, content, self.cache,
                              cachekey)

    for conditional in ('if-none-match', 'if-modified-since'):
        headers.pop(conditional, None)

    if "location" not in response:
        raise httplib2.RedirectLimit(
            "Redirected more times than redirection_limit allows.",
            response, content)

    # A 303 downgrades any non-GET/HEAD request to GET.
    if response.status == 303 and method not in ["GET", "HEAD"]:
        next_method = "GET"
    else:
        next_method = method
    return self.request(response['location'], next_method, body=body,
                        headers=headers, max_redirects=max_redirects - 1)
def test_from_std66():
    """Verify ``httplib2.parse_uri`` splits sample URIs into five parts.

    ``uris`` and ``expected`` are parallel sequences: the i-th tuple is the
    ``(scheme, authority, path, query, fragment)`` expected for the i-th URI.
    """
    uris = (
        'http://example.com',
        'https://example.com',
        'https://example.com:8080',
        'http://example.com/',
        'http://example.com/path',
        'http://example.com/path?a=1&b=2',
        'http://example.com/path?a=1&b=2#fred',
        'http://example.com/path?a=1&b=2#fred',
    )
    expected = (
        ('http', 'example.com', '', None, None),
        ('https', 'example.com', '', None, None),
        ('https', 'example.com:8080', '', None, None),
        ('http', 'example.com', '/', None, None),
        ('http', 'example.com', '/path', None, None),
        ('http', 'example.com', '/path', 'a=1&b=2', None),
        ('http', 'example.com', '/path', 'a=1&b=2', 'fred'),
        ('http', 'example.com', '/path', 'a=1&b=2', 'fred'),
    )
    for uri, parts in zip(uris, expected):
        assert httplib2.parse_uri(uri) == parts
def test_from_std66():
    """Table-driven check of ``httplib2.parse_uri``.

    Every row is ``(uri, parts)`` where ``parts`` is the expected
    ``(scheme, authority, path, query, fragment)`` tuple.
    """
    table = [
        ("http://example.com",
         ("http", "example.com", "", None, None)),
        ("https://example.com",
         ("https", "example.com", "", None, None)),
        ("https://example.com:8080",
         ("https", "example.com:8080", "", None, None)),
        ("http://example.com/",
         ("http", "example.com", "/", None, None)),
        ("http://example.com/path",
         ("http", "example.com", "/path", None, None)),
        ("http://example.com/path?a=1&b=2",
         ("http", "example.com", "/path", "a=1&b=2", None)),
        ("http://example.com/path?a=1&b=2#fred",
         ("http", "example.com", "/path", "a=1&b=2", "fred")),
        ("http://example.com/path?a=1&b=2#fred",
         ("http", "example.com", "/path", "a=1&b=2", "fred")),
    ]
    for uri, parts in table:
        parsed = httplib2.parse_uri(uri)
        assert parsed == parts
def OverrideRequest(self, conn, host, absolute_uri, request_uri, method, body,
                    headers, redirections, cachekey):
    """Do the actual request using the connection object.

    Also follow one level of redirects if necessary.

    Args:
        conn: connection object handed to ``self._conn_request``.
        host: host name used to select in-scope authorizations.
        absolute_uri: absolute URI of the request; used as the base for
            relative redirects and as the default ``content-location``.
        request_uri: URI sent on the request line.
        method: HTTP method.
        body: request body; dropped when a 302/303 redirect is followed.
        headers: request headers dict, modified in place.
        redirections: remaining redirect budget; a falsy value means a
            redirect cannot be followed.
        cachekey: key passed to ``httplib2._updateCache``.

    Returns:
        A ``(response, content)`` tuple. When a redirect was followed,
        ``response.previous`` holds a deep copy of the redirect response.

    Raises:
        httplib2.RedirectMissingLocation: redirect status other than 300
            without a ``Location`` header.
        httplib2.RedirectLimit: a redirect had to be followed but
            ``redirections`` is exhausted.
    """
    auths = [(auth.depth(request_uri), auth)
             for auth in self.authorizations
             if auth.inscope(host, request_uri)]
    # Lowest tuple first, i.e. the authorization with the smallest depth.
    auth = sorted(auths)[0][1] if auths else None
    if auth:
        auth.request(method, request_uri, headers, body)

    (response, content) = self._conn_request(conn, request_uri, method, body,
                                             headers)

    if auth:
        if auth.response(response, body):
            # The authorization asked for a retry with refreshed headers.
            auth.request(method, request_uri, headers, body)
            (response, content) = self._conn_request(conn, request_uri,
                                                     method, body, headers)
            response._stale_digest = 1

    if response.status == 401:
        for authorization in self._auth_from_challenge(
                host, request_uri, headers, response, content):
            authorization.request(method, request_uri, headers, body)
            (response, content) = self._conn_request(conn, request_uri,
                                                     method, body, headers)
            if response.status != 401:
                # Remember the first authorization the server accepted.
                self.authorizations.append(authorization)
                authorization.response(response, body)
                break

    if (self.follow_all_redirects or (method in ["GET", "HEAD"])
            or response.status == 303):
        if (self.follow_redirects
                and response.status in [300, 301, 302, 303, 307]):
            # Pick out the location header and basically start from the
            # beginning remembering first to strip the ETag header and
            # decrement our 'depth'
            if redirections:
                if 'location' not in response and response.status != 300:
                    raise httplib2.RedirectMissingLocation(
                        "Redirected but the response is missing a Location: header.",
                        response, content)
                # Fix-up relative redirects (which violate an RFC 2616 MUST)
                if 'location' in response:
                    location = response['location']
                    (_scheme, authority, _path, _query,
                     _fragment) = parse_uri(location)
                    if authority is None:
                        response['location'] = urlparse.urljoin(
                            absolute_uri, location)
                if response.status == 301 and method in ["GET", "HEAD"]:
                    response['-x-permanent-redirect-url'] = response[
                        'location']
                    if 'content-location' not in response:
                        response['content-location'] = absolute_uri
                    httplib2._updateCache(headers, response, content,
                                          self.cache, cachekey)
                # The follow-up request must not be conditional.
                headers.pop('if-none-match', None)
                headers.pop('if-modified-since', None)
                if ('authorization' in headers
                        and not self.forward_authorization_headers):
                    del headers['authorization']
                if 'location' in response:
                    location = response['location']
                    old_response = copy.deepcopy(response)
                    if 'content-location' not in old_response:
                        old_response['content-location'] = absolute_uri
                    redirect_method = method
                    if response.status in [302, 303]:
                        redirect_method = "GET"
                        body = None
                    (response, content) = self.request(
                        location, redirect_method, body=body,
                        headers=headers, redirections=redirections - 1,
                        connection_type=conn.__class__)
                    response.previous = old_response
            else:
                raise httplib2.RedirectLimit(
                    "Redirected more times than redirection_limit allows.",
                    response, content)
        elif response.status in [200, 203] and method in ["GET", "HEAD"]:
            # Don't cache 206's since we aren't going to handle byte range
            # requests
            if 'content-location' not in response:
                response['content-location'] = absolute_uri
            httplib2._updateCache(headers, response, content, self.cache,
                                  cachekey)

    return (response, content)
def OverrideRequest(self, conn, host, absolute_uri, request_uri, method, body,
                    headers, redirections, cachekey):
    """Do the actual request using the connection object.

    Also follow one level of redirects if necessary.

    Args:
        conn: connection object handed to ``self._conn_request``.
        host: host name used to select in-scope authorizations.
        absolute_uri: absolute URI of the request; base for relative
            redirects and default ``content-location``.
        request_uri: URI sent on the request line.
        method: HTTP method.
        body: request body; dropped when a 302/303 redirect is followed.
        headers: request headers dict, modified in place.
        redirections: remaining redirect budget; falsy means exhausted.
        cachekey: key passed to ``httplib2._updateCache``.

    Returns:
        A ``(response, content)`` tuple. After a followed redirect,
        ``response.previous`` is a deep copy of the redirect response.

    Raises:
        httplib2.RedirectMissingLocation: redirect status other than 300
            without a ``Location`` header.
        httplib2.RedirectLimit: a redirect must be followed but
            ``redirections`` is exhausted.
    """
    auths = [(auth.depth(request_uri), auth)
             for auth in self.authorizations
             if auth.inscope(host, request_uri)]
    # Smallest (depth, auth) tuple wins.
    auth = sorted(auths)[0][1] if auths else None
    if auth:
        auth.request(method, request_uri, headers, body)

    (response, content) = self._conn_request(conn, request_uri, method, body,
                                             headers)

    if auth and auth.response(response, body):
        # The authorization requested one retry with updated headers.
        auth.request(method, request_uri, headers, body)
        (response, content) = self._conn_request(conn, request_uri, method,
                                                 body, headers)
        response._stale_digest = 1

    if response.status == 401:
        for authorization in self._auth_from_challenge(
                host, request_uri, headers, response, content):
            authorization.request(method, request_uri, headers, body)
            (response, content) = self._conn_request(conn, request_uri,
                                                     method, body, headers)
            if response.status != 401:
                # Keep the first authorization the server accepted.
                self.authorizations.append(authorization)
                authorization.response(response, body)
                break

    # Neither redirects nor caching apply unless redirects are followed for
    # every method, the method is GET/HEAD, or the server answered 303.
    if not (self.follow_all_redirects or (method in ["GET", "HEAD"])
            or response.status == 303):
        return (response, content)

    if not (self.follow_redirects
            and response.status in [300, 301, 302, 303, 307]):
        if response.status in [200, 203] and method in ["GET", "HEAD"]:
            # Don't cache 206's since we aren't going to handle byte range
            # requests
            if 'content-location' not in response:
                response['content-location'] = absolute_uri
            httplib2._updateCache(headers, response, content, self.cache,
                                  cachekey)
        return (response, content)

    # Pick out the location header and basically start from the beginning
    # remembering first to strip the ETag header and decrement our 'depth'
    if not redirections:
        raise httplib2.RedirectLimit(
            "Redirected more times than redirection_limit allows.",
            response, content)

    if 'location' not in response and response.status != 300:
        raise httplib2.RedirectMissingLocation(
            "Redirected but the response is missing a Location: header.",
            response, content)

    # Fix-up relative redirects (which violate an RFC 2616 MUST)
    if 'location' in response:
        location = response['location']
        (_scheme, authority, _path, _query, _fragment) = parse_uri(location)
        if authority is None:
            response['location'] = urlparse.urljoin(absolute_uri, location)

    if response.status == 301 and method in ["GET", "HEAD"]:
        response['-x-permanent-redirect-url'] = response['location']
        if 'content-location' not in response:
            response['content-location'] = absolute_uri
        httplib2._updateCache(headers, response, content, self.cache,
                              cachekey)

    # The follow-up request must not be conditional.
    headers.pop('if-none-match', None)
    headers.pop('if-modified-since', None)
    if ('authorization' in headers
            and not self.forward_authorization_headers):
        del headers['authorization']

    if 'location' in response:
        location = response['location']
        old_response = copy.deepcopy(response)
        if 'content-location' not in old_response:
            old_response['content-location'] = absolute_uri
        redirect_method = method
        if response.status in [302, 303]:
            redirect_method = "GET"
            body = None
        (response, content) = self.request(
            location, redirect_method, body=body, headers=headers,
            redirections=redirections - 1, connection_type=conn.__class__)
        response.previous = old_response

    return (response, content)