def test_safename():
    cases = (
        ('http://example.org/fred/?a=b',
         'example.org,fred,a=b,58489f63a7a83c3b7794a6a398ee8b1f'),
        ('http://example.org/fred?/a=b',
         'example.org,fred,a=b,8c5946d56fec453071f43329ff0be46b'),
        ('http://www.example.org/fred?/a=b',
         'www.example.org,fred,a=b,499c44b8d844a011b67ea2c015116968'),
        ('https://www.example.org/fred?/a=b',
         'www.example.org,fred,a=b,692e843a333484ce0095b070497ab45d'),
        (httplib2.urlnorm('http://WWW')[-1],
         httplib2.safename(httplib2.urlnorm('http://www')[-1])),
        (u'http://\u2304.org/fred/?a=b',
         'xn--http,-4y1d.org,fred,a=b,579924c35db315e5a32e3d9963388193'),
    )
    for a, b in cases:
        assert httplib2.safename(a) == b

    assert httplib2.safename('http://www') != httplib2.safename('https://www')

    # Test the max length limits
    uri = 'http://' + ('w' * 200) + '.org'
    uri2 = 'http://' + ('w' * 201) + '.org'
    assert httplib2.safename(uri) != httplib2.safename(uri2)
    # Max length should be 200 + 1 (',') + 32
    assert len(httplib2.safename(uri2)) == 233
    assert len(httplib2.safename(uri)) == 233
def TestOneInput(data):
    fdp = atheris.FuzzedDataProvider(data)
    original = fdp.ConsumeUnicode(sys.maxsize)
    try:
        httplib2.urlnorm(original)
    except httplib2.RelativeURIError:
        return
    return
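For context, a fuzz target like the one above is normally wired up with the standard atheris entry point; a minimal sketch, assuming atheris, sys and httplib2 are the harness's only dependencies (they would normally sit at the top of the file):

# Minimal harness entry point (sketch); TestOneInput is the target above.
import sys

import atheris

with atheris.instrument_imports():
    import httplib2


def main():
    atheris.Setup(sys.argv, TestOneInput)
    atheris.Fuzz()


if __name__ == '__main__':
    main()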
def test(self):
    self.assertEqual(
        "http://example.org/",
        httplib2.urlnorm("http://example.org")[-1])
    self.assertEqual(
        "http://example.org/",
        httplib2.urlnorm("http://EXAMple.org")[-1])
    self.assertEqual(
        "http://example.org/?=b",
        httplib2.urlnorm("http://EXAMple.org?=b")[-1])
    self.assertEqual(
        "http://example.org/mypath?a=b",
        httplib2.urlnorm("http://EXAMple.org/mypath?a=b")[-1])
    self.assertEqual(
        "http://localhost:80/",
        httplib2.urlnorm("http://localhost:80")[-1])
    self.assertEqual(
        httplib2.urlnorm("http://localhost:80/"),
        httplib2.urlnorm("HTTP://LOCALHOST:80"))
    try:
        httplib2.urlnorm("/")
        self.fail("Non-absolute URIs should raise an exception")
    except httplib2.RelativeURIError:
        pass
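The tests above only look at the last element of urlnorm's return value; a minimal sketch of the full four-element tuple and of how the snippets below combine it with safename to build a cache file name (the cache_dir value is illustrative, not from any of the snippets):

import os

import httplib2

# urlnorm lowercases the scheme and authority, defaults the path to '/',
# and drops the fragment when building defrag_uri.
scheme, authority, request_uri, defrag_uri = httplib2.urlnorm(
    'HTTP://Example.ORG/fred?a=b#frag')
# scheme == 'http', authority == 'example.org',
# request_uri == '/fred?a=b', defrag_uri == 'http://example.org/fred?a=b'

# httplib2's file cache keys entries on safename(defrag_uri).
cache_dir = '.cache'  # illustrative path
cache_file = os.path.join(cache_dir, httplib2.safename(defrag_uri))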
def testGet304(self):
    # Test that we use ETags properly to validate our cache
    uri = urllib.parse.urljoin(base, "304/test_etag.txt")

    (response, content) = self.http.request(uri, "GET")
    self.assertNotEqual(response['etag'], "")

    (response, content) = self.http.request(uri, "GET")
    (response, content) = self.http.request(
        uri, "GET", headers={'cache-control': 'must-revalidate'})
    self.assertEqual(response.status, 200)
    self.assertEqual(response.fromcache, True)

    cache_file_name = os.path.join(
        cacheDirName, httplib2.safename(httplib2.urlnorm(uri)[-1]))
    f = open(cache_file_name, "r")
    status_line = f.readline()
    f.close()

    self.assertTrue(status_line.startswith("status:"))

    (response, content) = self.http.request(uri, "HEAD")
    self.assertEqual(response.status, 200)
    self.assertEqual(response.fromcache, True)

    (response, content) = self.http.request(
        uri, "GET", headers={'range': 'bytes=0-0'})
    self.assertEqual(response.status, 206)
    self.assertEqual(response.fromcache, False)
def _follow_redirect(self, uri, method, body, headers, response, content,
                     max_redirects):
    """Internal function to follow a redirect received by L{request}."""
    (scheme, authority, absolute_uri,
     defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
    if self.cache:
        cachekey = defrag_uri
    else:
        cachekey = None

    # Pick out the location header and basically start from the beginning
    # remembering first to strip the ETag header and decrement our 'depth'
    if not response.has_key('location') and response.status != 300:
        raise httplib2.RedirectMissingLocation(
            "Redirected but the response is missing a Location: header.",
            response, content)
    # Fix-up relative redirects (which violate an RFC 2616 MUST)
    if response.has_key('location'):
        location = response['location']
        (scheme, authority, path, query,
         fragment) = httplib2.parse_uri(location)
        if authority is None:
            response['location'] = httplib2.urlparse.urljoin(uri, location)
            logging.debug('Relative redirect: changed [%s] to [%s]'
                          % (location, response['location']))
    if response.status == 301 and method in ["GET", "HEAD"]:
        response['-x-permanent-redirect-url'] = response['location']
        if not response.has_key('content-location'):
            response['content-location'] = absolute_uri
        httplib2._updateCache(headers, response, content, self.cache,
                              cachekey)

    headers.pop('if-none-match', None)
    headers.pop('if-modified-since', None)

    if response.has_key('location'):
        location = response['location']
        redirect_method = (((response.status == 303) and
                            (method not in ["GET", "HEAD"]))
                           and "GET" or method)
        return self.request(location, redirect_method, body=body,
                            headers=headers,
                            max_redirects=max_redirects - 1)
    else:
        raise httplib2.RedirectLimit(
            "Redirected more times than redirection_limit allows.",
            response, content)
def request(self, uri, method="GET", body=None, headers=None,
            max_redirects=None, connection_type=None):
    """Start an HTTP request.

    @param uri: The uri to retrieve
    @param method: (optional) The HTTP method to use. Default is 'GET'
    @param body: (optional) The request body. Default is no body.
    @param headers: (optional) Additional headers to send. Defaults
           include C{connection: keep-alive}, C{user-agent} and
           C{content-type}.
    @param max_redirects: (optional) The maximum number of redirects to
           use for this request. The class instance's max_redirects is
           default
    @param connection_type: (optional) see L{httplib2.Http.request}

    @return: (response, content) tuple
    """
    if max_redirects is None:
        max_redirects = self.max_redirects
    if headers is None:
        headers = {}
    # Prepare headers
    headers.pop('cookie', None)
    req = DummyRequest(uri, headers)
    self.cookiejar.lock.acquire()
    try:
        self.cookiejar.add_cookie_header(req)
    finally:
        self.cookiejar.lock.release()
    headers = req.headers

    # Wikimedia squids: add connection: keep-alive to request headers
    # unless overridden
    headers['connection'] = headers.pop('connection', 'keep-alive')

    # determine connection pool key and fetch connection
    (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm(
        httplib2.iri2uri(uri))
    conn_key = scheme + ":" + authority

    connection = self.connection_pool.pop_connection(conn_key)
    if connection is not None:
        self.connections[conn_key] = connection

    # Redirect hack: we want to regulate redirects
    follow_redirects = self.follow_redirects
    self.follow_redirects = False
    pywikibot.debug(u"%r" % (
        (uri.replace("%7C", "|"), method, body, headers, max_redirects,
         connection_type),), _logger)
    try:
        (response, content) = httplib2.Http.request(
            self, uri, method, body, headers,
            max_redirects, connection_type)
    except Exception, e:  # what types?
        # return exception instance to be retrieved by the calling thread
        return e
def test_etag_used():
    # Test that we use ETags properly to validate our cache
    cache_path = tests.get_cache_path()
    http = httplib2.Http(cache=cache_path)
    response_kwargs = dict(
        add_date=True,
        add_etag=True,
        body=b'something',
        headers={
            'cache-control': 'public,max-age=300',
        },
    )

    def handler(request):
        if request.headers.get('range'):
            return tests.http_response_bytes(status=206, **response_kwargs)
        return tests.http_response_bytes(**response_kwargs)

    with tests.server_request(handler, request_count=2) as uri:
        response, _ = http.request(uri, 'GET',
                                   headers={'accept-encoding': 'identity'})
        assert response['etag'] == '"437b930db84b8079c2dd804a71936b5f"'

        http.request(uri, 'GET', headers={'accept-encoding': 'identity'})
        response, _ = http.request(
            uri, 'GET',
            headers={
                'accept-encoding': 'identity',
                'cache-control': 'must-revalidate',
            },
        )
        assert response.status == 200
        assert response.fromcache

        # TODO: API to read cache item, at least internal to tests
        cache_file_name = os.path.join(
            cache_path, httplib2.safename(httplib2.urlnorm(uri)[-1]))
        with open(cache_file_name, 'r') as f:
            status_line = f.readline()
        assert status_line.startswith("status:")

        response, content = http.request(
            uri, 'HEAD', headers={'accept-encoding': 'identity'})
        assert response.status == 200
        assert response.fromcache

        response, content = http.request(uri, 'GET', headers={
            'accept-encoding': 'identity',
            'range': 'bytes=0-0',
        })
        assert response.status == 206
        assert not response.fromcache
def _getCachedHeader(self, uri, header):
    """Retrieve a cached value for an HTTP header."""
    (scheme, authority, request_uri, cachekey) = urlnorm(uri)
    cached_value = self.get(cachekey)
    header_start = header + ':'
    if cached_value is not None:
        for line in StringIO(cached_value):
            if line.startswith(header_start):
                return line[len(header_start):].strip()
    return None
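A hedged usage sketch of a helper like the one above attached to httplib2's FileCache; the class name, cache directory and URL are illustrative, and the bytes handling assumes Python 3, where FileCache.get() returns bytes (the BytesIO variant further below does the same):

from io import BytesIO

import httplib2
from httplib2 import urlnorm


class PeekableFileCache(httplib2.FileCache):
    """FileCache plus the header-peeking helper shown above (Python 3 flavour)."""

    def _getCachedHeader(self, uri, header):
        (scheme, authority, request_uri, cachekey) = urlnorm(uri)
        cached_value = self.get(cachekey)  # bytes or None on Python 3
        header_start = (header + ':').encode('utf-8')
        if cached_value is not None:
            for line in BytesIO(cached_value):
                if line.startswith(header_start):
                    return line[len(header_start):].strip()
        return None


cache = PeekableFileCache('.cache-demo')    # illustrative directory
http = httplib2.Http(cache=cache)
http.request('http://example.org/', 'GET')  # primes the cache
# May print None if the origin sent no ETag.
print(cache._getCachedHeader('http://example.org/', 'etag'))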
def test_norm():
    cases = (
        ('http://example.org', 'http://example.org/'),
        ('http://EXAMple.org', 'http://example.org/'),
        ('http://EXAMple.org?=b', 'http://example.org/?=b'),
        ('http://EXAMple.org/mypath?a=b', 'http://example.org/mypath?a=b'),
        ('http://localhost:80', 'http://localhost:80/'),
    )
    for a, b in cases:
        assert httplib2.urlnorm(a)[-1] == b

    assert httplib2.urlnorm('http://localhost:80/') == httplib2.urlnorm(
        'HTTP://LOCALHOST:80')

    try:
        httplib2.urlnorm('/')
        assert False, 'Non-absolute URIs should raise an exception'
    except httplib2.RelativeURIError:
        pass
def _get_from_cache(self, url):
    """Get a given url from the cachedir even if it is expired,
    or return None if no data is available.
    """
    scheme = urlparse(url).scheme
    if self._http[scheme].cache:
        cached_value = self._http[scheme].cache.get(
            httplib2.urlnorm(url)[-1])
        if cached_value:
            info, content = cached_value.split('\r\n\r\n', 1)
            return content
def _get_from_cache(self, url):
    """Get a given url from the cachedir even if it is expired,
    or return None if no data is available.
    """
    http = httplib2.Http(cache=self._cachedir)
    if http.cache:
        cached_value = http.cache.get(httplib2.urlnorm(url)[-1])
        if cached_value:
            info, content = cached_value.split('\r\n\r\n', 1)
            return content
    return None
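The split on '\r\n\r\n' above relies on the on-disk layout httplib2's FileCache uses: a 'status:' line plus the response headers, a blank line, then the raw body. A minimal sketch of priming and reading back such an entry, assuming a reachable URL and an illustrative cache directory:

import os

import httplib2

cache_dir = '.demo-cache'                    # illustrative location
http = httplib2.Http(cache=cache_dir)
http.request('http://example.org/', 'GET')   # primes the cache

key = httplib2.urlnorm('http://example.org/')[-1]        # defrag_uri
path = os.path.join(cache_dir, httplib2.safename(key))   # cache file name
with open(path, 'rb') as f:
    info, content = f.read().split(b'\r\n\r\n', 1)       # headers / body

print(info.decode('utf-8').splitlines()[0])  # e.g. "status: 200"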
def _get_from_cache(self, url):
    """Get a given url from the cachedir even if it is expired,
    or return None if no data is available.
    """
    scheme = urlparse(url).scheme
    if self._http[scheme].cache:
        cached_value = self._http[scheme].cache.get(
            httplib2.urlnorm(url)[-1])
        if cached_value:
            info, content = cached_value.decode("utf-8").split(
                '\r\n\r\n', 1)
            return content
def _getCachedHeader(self, uri, header):
    """Retrieve a cached value for an HTTP header."""
    (scheme, authority, request_uri, cachekey) = urlnorm(uri)
    cached_value = self.get(cachekey)
    header_start = header + ':'
    if not isinstance(header_start, bytes):
        header_start = header_start.encode('utf-8')
    if cached_value is not None:
        for line in BytesIO(cached_value):
            if line.startswith(header_start):
                return line[len(header_start):].strip()
    return None
def _follow_redirect(self, uri, method, body, headers, response,
                     content, max_redirects):
    """Internal function to follow a redirect received by L{request}."""
    (scheme, authority, absolute_uri,
     defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
    if self.cache:
        cachekey = defrag_uri
    else:
        cachekey = None

    # Pick out the location header and basically start from the beginning
    # remembering first to strip the ETag header and decrement our 'depth'
    if "location" not in response and response.status != 300:
        raise httplib2.RedirectMissingLocation(
            "Redirected but the response is missing a Location: header.",
            response, content)
    # Fix-up relative redirects (which violate an RFC 2616 MUST)
    if "location" in response:
        location = response['location']
        (scheme, authority, path, query,
         fragment) = httplib2.parse_uri(location)
        if authority is None:
            response['location'] = httplib2.urlparse.urljoin(uri, location)
            pywikibot.debug(u"Relative redirect: changed [%s] to [%s]"
                            % (location, response['location']), _logger)
    if response.status == 301 and method in ["GET", "HEAD"]:
        response['-x-permanent-redirect-url'] = response['location']
        if "content-location" not in response:
            response['content-location'] = absolute_uri
        httplib2._updateCache(headers, response, content, self.cache,
                              cachekey)

    headers.pop('if-none-match', None)
    headers.pop('if-modified-since', None)

    if "location" in response:
        location = response['location']
        redirect_method = (((response.status == 303) and
                            (method not in ["GET", "HEAD"]))
                           and "GET" or method)
        return self.request(location, redirect_method, body=body,
                            headers=headers,
                            max_redirects=max_redirects - 1)
    else:
        raise httplib2.RedirectLimit(
            "Redirected more times than redirection_limit allows.",
            response, content)
def test(self):
    # Test that different URIs end up generating different safe names
    self.assertEqual(
        "example.org,fred,a=b,58489f63a7a83c3b7794a6a398ee8b1f",
        httplib2.safename("http://example.org/fred/?a=b"))
    self.assertEqual(
        "example.org,fred,a=b,8c5946d56fec453071f43329ff0be46b",
        httplib2.safename("http://example.org/fred?/a=b"))
    self.assertEqual(
        "www.example.org,fred,a=b,499c44b8d844a011b67ea2c015116968",
        httplib2.safename("http://www.example.org/fred?/a=b"))
    self.assertEqual(
        httplib2.safename(httplib2.urlnorm("http://www")[-1]),
        httplib2.safename(httplib2.urlnorm("http://WWW")[-1]))
    self.assertEqual(
        "www.example.org,fred,a=b,692e843a333484ce0095b070497ab45d",
        httplib2.safename("https://www.example.org/fred?/a=b"))
    self.assertNotEqual(
        httplib2.safename("http://www"),
        httplib2.safename("https://www"))

    # Test the max length limits
    uri = "http://" + ("w" * 200) + ".org"
    uri2 = "http://" + ("w" * 201) + ".org"
    self.assertNotEqual(
        httplib2.safename(uri2),
        httplib2.safename(uri))
    # Max length should be 200 + 1 (",") + 32
    self.assertEqual(233, len(httplib2.safename(uri2)))
    self.assertEqual(233, len(httplib2.safename(uri)))

    # Unicode
    if sys.version_info >= (2, 3):
        self.assertEqual(
            "xn--http,-4y1d.org,fred,a=b,579924c35db315e5a32e3d9963388193",
            httplib2.safename(u"http://\u2304.org/fred/?a=b"))
def send_http_request(method, request_url, body=None, request_headers={}):
    uri = httplib2.iri2uri(request_url)
    (scheme, authority, request_uri) = httplib2.urlnorm(uri)[:3]
    address = _get_hostport(authority)
    http_client = httplib2.HTTPConnectionWithTimeout(address[0],
                                                     port=address[1])
    if http_client.sock is None:
        http_client.connect()
    http_client.putrequest(method,
                           request_uri.encode(DEFAULT_HTTP_URI_CHARSET),
                           {'skip_host': 1, 'skip_accept_encoding': 1})
    for key, value in request_headers.items():
        http_client.putheader(key, value.encode(DEFAULT_HTTP_HEADER_CHARSET))
    http_client.endheaders()
    if body:
        http_client.send(body)
    return http_client.getresponse()
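The _get_hostport helper referenced above is not shown in the snippet; a hypothetical sketch of what such a helper might look like, purely for illustration and not the original project's implementation:

# Hypothetical sketch only; not the original project's helper.
def _get_hostport(authority, default_port=80):
    """Split 'host' or 'host:port' into a (host, port) tuple."""
    if ':' in authority:
        host, _, port = authority.rpartition(':')
        return host, int(port)
    return authority, default_port


assert _get_hostport('example.org') == ('example.org', 80)
assert _get_hostport('example.org:8080') == ('example.org', 8080)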
def make_request(self, request, host):
    if request.proxy_uri in self.fm.form_headers:
        h = self.fm.form_headers.pop(request.proxy_uri)
        h.pop('content-length')
        request.headers.update(h)
    uri = request.proxy_uri.replace(request.host, host, 1)
    headers = self.clean_request_headers(request, host)
    headers['host'] = host[host.rindex('/') + 1:]
    connection_type = None
    (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm(uri)
    if self.http.use_http_proxy and scheme == 'http':
        connection_type = HTTPProxyConnectionWithTimeout
        if self.http.use_http_proxy_auth:
            headers['proxy-authorization'] = self.http.http_proxy_auth
    resp, response = self.http.request(uri, method=request.method,
                                       body=str(request.body),
                                       headers=headers,
                                       connection_type=connection_type)
    self.set_response_headers(resp, response, request.host, host)
    response.request = request
    # print resp.status, uri
    return response
def dump(self):
    lines = ["*** HTTP Call Info ***\n"]
    uriNorm = httplib2.urlnorm(self.callInfo['uri'])
    lines.append("{0} {1} {2}/1.1\n".format(self.callInfo['method'],
                                            uriNorm[2], uriNorm[0].upper()))
    if self.callInfo['headers'] is not None:
        for chName, chValue in self.callInfo['headers']:
            lines.append("{0} : {1}\n".format(chName, chValue))
    else:
        lines.append("No request headers sent")
    lines.append("\n")
    lines.append(self.callBody)
    lines.append("\n*** HTTP Response Info ***\n")
    lines.append("HTTP/1.1 {0} {1}\n".format(self.responseInfo.status,
                                             self.responseInfo.reason))
    if self.responseHeaders is not None:
        for rhName, rhValue in self.responseHeaders.iteritems():
            lines.append("{0} : {1}\n".format(rhName, rhValue))
    lines.append("\n")
    lines.append(self.responseBody)
    return ''.join(lines)
def request(self, uri, method="GET", body=None, headers=None,
            max_redirects=None, connection_type=None):
    """Start an HTTP request.

    @param uri: The uri to retrieve
    @param method: (optional) The HTTP method to use. Default is 'GET'
    @param body: (optional) The request body. Default is no body.
    @param headers: (optional) Additional headers to send. Defaults
           include C{connection: keep-alive}, C{user-agent} and
           C{content-type}.
    @param max_redirects: (optional) The maximum number of redirects to
           use for this request. The class instance's max_redirects is
           default
    @param connection_type: (optional) see L{httplib2.Http.request}

    @return: (response, content) tuple
    """
    if max_redirects is None:
        max_redirects = self.max_redirects
    if headers is None:
        headers = {}
    # Prepare headers
    headers.pop('cookie', None)
    req = DummyRequest(uri, headers)
    self.cookiejar.add_cookie_header(req)
    headers = req.headers

    # Wikimedia squids: add connection: keep-alive to request headers
    # unless overridden
    headers['connection'] = headers.pop('connection', 'keep-alive')

    # determine connection pool key and fetch connection
    (scheme, authority, request_uri,
     defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
    conn_key = scheme + ":" + authority

    connection = self.connection_pool.pop_connection(conn_key)
    if connection is not None:
        self.connections[conn_key] = connection

    # Redirect hack: we want to regulate redirects
    follow_redirects = self.follow_redirects
    self.follow_redirects = False
    pywikibot.debug(u"%r" % (
        (uri.replace("%7C", "|"), method, body, headers, max_redirects,
         connection_type),), _logger)
    try:
        if authority in config.authenticate:
            self.add_credentials(*config.authenticate[authority])

        (response, content) = httplib2.Http.request(
            self, uri, method, body, headers,
            max_redirects, connection_type
        )
    except Exception as e:  # what types?
        # return exception instance to be retrieved by the calling thread
        return e
    finally:
        self.follow_redirects = follow_redirects

    # return connection to pool
    self.connection_pool.push_connection(conn_key,
                                         self.connections[conn_key])
    del self.connections[conn_key]

    # First write cookies
    self.cookiejar.extract_cookies(DummyResponse(response), req)

    # Check for possible redirects
    redirectable_response = ((response.status == 303) or
                             (response.status in [300, 301, 302, 307] and
                              method in ["GET", "HEAD"]))
    if (self.follow_redirects and (max_redirects > 0) and
            redirectable_response):
        # Return directly and not unpack the values in case the result was
        # an exception, which can't be unpacked
        return self._follow_redirect(
            uri, method, body, headers, response, content, max_redirects)
    else:
        return response, content
def request(self, uri, method="GET", body=None, headers=None,
            max_redirects=None, connection_type=None):
    """Starts an HTTP request.

    @param uri: The uri to retrieve
    @param method: (optional) The HTTP method to use. Default is 'GET'
    @param body: (optional) The request body. Default is no body.
    @param headers: (optional) Additional headers to send. Defaults
           include C{connection: keep-alive}, C{user-agent} and
           C{content-type}.
    @param max_redirects: (optional) The maximum number of redirects to
           use for this request. The class instance's max_redirects is
           the default
    @param connection_type: (optional) ?

    @returns: (response, content) tuple
    """
    if max_redirects is None:
        max_redirects = self.max_redirects
    if headers is None:
        headers = {}
    # Prepare headers
    headers.pop('cookie', None)
    req = DummyRequest(uri, headers)
    self.cookiejar.lock.acquire()
    try:
        self.cookiejar.add_cookie_header(req)
    finally:
        self.cookiejar.lock.release()
    headers = req.headers

    # Wikimedia squids: add connection: keep-alive to request headers
    # unless overridden
    headers['connection'] = headers.pop('connection', 'keep-alive')

    # determine connection pool key and fetch connection
    (scheme, authority, request_uri,
     defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
    conn_key = scheme + ":" + authority

    connection = self.connection_pool.pop_connection(conn_key)
    if connection is not None:
        self.connections[conn_key] = connection

    # Redirect hack: we want to regulate redirects
    follow_redirects = self.follow_redirects
    #print 'follow_redirects: %r %r' % (self.follow_redirects, follow_redirects)
    self.follow_redirects = False
    #print 'follow_redirects: %r %r' % (self.follow_redirects, follow_redirects)
    logging.debug('%r' % ((uri, method, headers, max_redirects,
                           connection_type),))
    (response, content) = httplib2.Http.request(self, uri, method, body,
                                                headers, max_redirects,
                                                connection_type)
    #print 'follow_redirects: %r %r' % (self.follow_redirects, follow_redirects)
    self.follow_redirects = follow_redirects
    #print 'follow_redirects: %r %r' % (self.follow_redirects, follow_redirects)

    # return connection to pool
    self.connection_pool.push_connection(conn_key,
                                         self.connections[conn_key])
    del self.connections[conn_key]

    # First write cookies
    self.cookiejar.lock.acquire()
    try:
        self.cookiejar.extract_cookies(DummyResponse(response), req)
    finally:
        self.cookiejar.lock.release()

    # Check for possible redirects
    redirectable_response = ((response.status == 303) or
                             (response.status in [300, 301, 302, 307] and
                              method in ["GET", "HEAD"]))
    if (self.follow_redirects and (max_redirects > 0) and
            redirectable_response):
        (response, content) = self._follow_redirect(
            uri, method, body, headers, response, content, max_redirects)

    return (response, content)
@pytest.mark.parametrize(
    'data', (
        ('', ',d41d8cd98f00b204e9800998ecf8427e'),
        ('http://example.org/fred/?a=b',
         'example.orgfreda=b,58489f63a7a83c3b7794a6a398ee8b1f'),
        ('http://example.org/fred?/a=b',
         'example.orgfreda=b,8c5946d56fec453071f43329ff0be46b'),
        ('http://www.example.org/fred?/a=b',
         'www.example.orgfreda=b,499c44b8d844a011b67ea2c015116968'),
        ('https://www.example.org/fred?/a=b',
         'www.example.orgfreda=b,692e843a333484ce0095b070497ab45d'),
        (httplib2.urlnorm('http://WWW')[-1],
         httplib2.safename(httplib2.urlnorm('http://www')[-1])),
        (u'http://\u2304.org/fred/?a=b',
         '.orgfreda=b,ecaf0f97756c0716de76f593bd60a35e'),
        ('normal-resource-name.js',
         'normal-resource-name.js,8ff7c46fd6e61bf4e91a0a1606954a54'),
        ('foo://dom/path/brath/carapath',
         'dompathbrathcarapath,83db942781ed975c7a5b7c24039f8ca3'),
        ('with/slash', 'withslash,17cc656656bb8ce2411bd41ead56d176'),
        ('thisistoomuch' * 42,
         ('thisistoomuch' * 6) + 'thisistoomuc,c4553439dd179422c6acf6a8ac093eb6'),
        (u'\u043f\u0440', ',9f18c0db74a9734e9d18461e16345083'),
        (u'\u043f\u0440'.encode('utf-8'), ',9f18c0db74a9734e9d18461e16345083'),
        (b'column\tvalues/unstr.zip',
         'columnvaluesunstr.zip,b9740dcd0553e11b526450ceb8f76683'),
    ),
    ids=str)
def test_safename(data):
    result = httplib2.safename(data[0])
    assert result == data[1]
    ## Strip error causing Line Feed ascii char
    import urllib2
    image_url = ''.join(image_url.split('%0A'))
    ########################################################
    ############           Finally              ###########
    #####   Replace ALL url encoding % escapes     ########
    ### TWICE TO ACCOUNT FOR EX. %2520 --> %20 --> ' '  ###
    #image_url = image_url.replace('/Flat%2520Images/', '/Flat%20Images/')
    print image_url, ' URL'
    #image_url = image_url.replace('/Flat%2520Images/', '/Flat%20Images/')
    # image_url = urllib2.unquote(image_url)
    regex_validurl = re.compile(r'^http[s]?://.+?$', re.U)
    if regex_validurl.findall(image_url):
        import httplib2
        image_url = httplib2.urlnorm(httplib2.urllib.unquote(image_url))[-1]
        #image_url = urllib2.unquote(image_url)
        #urllib2.unquote(image_url))
    ########################################################
    ########################################################
    print 'RRR'
    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:33.0) Gecko/20100101 Firefox/33.0'}
    try:
        print image_url, destpath  #.split('/' )[-1].replace('.jpg','_1200.jpg')
        #error_check = urllib.urlopen(image_url)
        #print error_check
        #urlcode_value = error_check.getcode()
        res = requests.get(image_url, stream=True, timeout=1, headers=headers)
        print 'ALMOST'
        urlcode_value = res.status_code
def request(self, uri, method="GET", body=None, headers=None,
            redirections=httplib2.DEFAULT_MAX_REDIRECTS):
    (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm(uri)
    connection_type = ((scheme == 'https') and HTTPSConnection
                       or HTTPConnection)
    return httplib2.Http.request(self, uri, method, body, headers,
                                 redirections, connection_type)
def url_download_file(image_url, filepath, errdir=None):
    import urllib, os, io, cStringIO, requests
    ## Split Vendor # to try again on fail of full VENDOR_STYLE_NO
    url_split = image_url.split('/')[-1]
    url_split = url_split.split('-')[1:]
    url_split = '-'.join(url_split)
    url_parent = image_url.split('/')[:-1]
    url_parent = '/'.join(url_parent)
    backupurl = image_url.replace('admin.swisswatchintl.com/Z/',
                                  'admin.swisswatchintl.com/H/')
    backup_spliturl = os.path.join(url_parent, url_split).replace(
        'admin.swisswatchintl.com/Z/', 'admin.swisswatchintl.com/H/')
    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:33.0) Gecko/20100101 Firefox/33.0'}
    error_check = requests.get(image_url, stream=True, timeout=1, headers=headers)
    urlcode_value = error_check.status_code  # status_code is a property, not a call
    print urlcode_value

    ### PRIMARY URL, AKA /Z/
    import httplib2
    image_url = httplib2.urlnorm(httplib2.urllib.unquote(image_url))[-1]
    print 'RRR'
    print image_url, filepath  #.split('/' )[-1].replace('.jpg','_1200.jpg')
    res = requests.get(image_url, stream=True, timeout=1, headers=headers)
    print 'ALMOST'
    urlcode_value = res.status_code
    print urlcode_value
    if urlcode_value == 200:
        res = requests.get(url_split, stream=True, timeout=1, headers=headers)
        with open(filepath, 'ab+') as f:
            f.write(res.content)
            f.close()
    elif urlcode_value == 404:
        ### Split URL, /Z/
        urlsplit = os.path.join(url_parent, url_split)
        error_check = requests.get(url_split, stream=True, timeout=1, headers=headers)
        split_urlcode_value = error_check.status_code
        ### Backup URL, AKA /H/
        error_check = requests.get(backupurl, stream=True, timeout=1, headers=headers)
        backup_urlcode_value = error_check.status_code
        ### BackupSplit
        error_check = urllib.urlopen(backup_spliturl)
        error_check = requests.get(backup_spliturl, stream=True, timeout=1, headers=headers)
        backup_spliturlcode_value = error_check.status_code
        if split_urlcode_value == 200:
            res = requests.get(url_split, stream=True, timeout=1, headers=headers)
            with open(filepath, 'ab+') as f:
                f.write(res.content)
                f.close()
            # print "On 2nd Attempt, Retrieved: " + urlsplit + " ---> " + filepath
        elif backup_urlcode_value == 200:
            # urllib.urlretrieve(backupurl, filepath.replace('.jpg', '_H.jpg'))
            res = requests.get(backupurl, stream=True, timeout=1, headers=headers)
            with open(filepath, 'ab+') as f:
                f.write(res.content)
                f.close()
            #print "Downloaded URL {0} Finally on 3rd and Final Attempt with Error Code {1}".format(backupurl, backup_urlcode_value)
        elif backup_spliturlcode_value == 200:
            # urllib.urlretrieve(backup_spliturl, filepath.replace('.jpg', '_HH.jpg'))
            res = requests.get(backup_spliturl, stream=True, timeout=1, headers=headers)
            with open(filepath, 'ab+') as f:
                f.write(res.content)
                f.close()
            #print "Didnt Fail Downloading URL {0} even on 3rd and Final Attempt with Error Code {1}".format(backup_spliturl, backup_spliturlcode_value)
        else:
            #print "AWFUL Totally Failed Downloading URL {0} on 2nd Attempt with Error Code {1}".format(image_url, urlcode_value)
            # print "TERRIBLE Failed Downloading URL {0} even on 3rd and Final Attempt with Error Code {1}".format(backupurl, backup_urlcode_value)
            try:
                errdir = os.path.join('/mnt', 'Post_Complete/Complete_Archive/MARKETPLACE/SWI/ERRORS')
                try:
                    os.makedirs(errdir, 16877)
                except OSError:
                    pass
                colorstyle = filepath.split('/')[-1][:9]
                alt = filepath.split('/')[-1].split('_')[-1][0]
                if alt.isdigit():
                    alt = str(alt)
                elif alt == 'a':
                    alt = str(alt)
                else:
                    alt = '1'
                try:
                    #info = cStringIO.StringIO()
                    with io.open(os.path.join(os.path.abspath(errdir),
                                              colorstyle + '_' + alt + '_error404.txt'),
                                 mode='wt+') as f:
                        info = "{0},{1},{2},{3}".format(str(colorstyle), str(alt),
                                                        str(urlcode_value), str(image_url))
                        outtext = unicode(info, 'utf-8')
                        print outtext
                        print >>f, outtext
                        #info.flush()
                        #f.write(info.getvalue())
                        #f.write()
                        #info.close()
                        f.flush()
                        f.close()
                except AttributeError:
                    pass
            except OSError:
                pass
    else:
        print "{0} Error:\v {1} is not a valid URL".format(urlcode_value, image_url)
def request(self, uri, method="GET", body=None, headers=None,
            redirections=httplib2.DEFAULT_MAX_REDIRECTS,
            connection_type=None):
    """request handler with thread safety hacked in"""
    try:
        if headers is None:
            headers = {}
        else:
            headers = httplib2._normalize_headers(headers)

        if not headers.has_key('user-agent'):
            headers['user-agent'] = "Python-httplib2/%s" % httplib2.__version__

        uri = httplib2.iri2uri(uri)

        (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm(uri)
        domain_port = authority.split(":")[0:2]
        if len(domain_port) == 2 and domain_port[1] == '443' and scheme == 'http':
            scheme = 'https'
            authority = domain_port[0]

        conn_key = scheme + ":" + authority

        def get_conn(conn_key):
            if conn_key in self.connections:
                conn = self.connections[conn_key]
                if type(conn) is list:
                    for c in conn:
                        if not getattr(c, 'busy', True):
                            return c
                else:
                    return conn
                if type(conn) is list:
                    return None

        conn = get_conn(conn_key)

        if conn is None:
            if not connection_type:
                connection_type = ((scheme == 'https') and
                                   httplib2.HTTPSConnectionWithTimeout or
                                   httplib2.HTTPConnectionWithTimeout)
            certs = list(self.certificates.iter(authority))
            if scheme == 'https' and certs:
                conn = connection_type(authority, key_file=certs[0][0],
                                       cert_file=certs[0][1],
                                       timeout=self.timeout,
                                       proxy_info=self.proxy_info)
                self.connections.setdefault(conn_key, []).append(conn)
            else:
                conn = connection_type(authority, timeout=self.timeout,
                                       proxy_info=self.proxy_info)
                self.connections.setdefault(conn_key, []).append(conn)
            conn.set_debuglevel(httplib2.debuglevel)
        conn.busy = True

        if method in ["GET", "HEAD"] and 'range' not in headers and 'accept-encoding' not in headers:
            headers['accept-encoding'] = 'deflate, gzip'

        info = httplib2.email.Message.Message()
        cached_value = None
        if self.cache:
            cachekey = defrag_uri
            cached_value = self.cache.get(cachekey)
            if cached_value:
                try:
                    info, content = cached_value.split('\r\n\r\n', 1)
                    feedparser = httplib2.email.FeedParser.FeedParser()
                    feedparser.feed(info)
                    info = feedparser.close()
                    feedparser._parse = None
                except IndexError:
                    self.cache.delete(cachekey)
                    cachekey = None
                    cached_value = None
        else:
            cachekey = None

        if method not in ["GET", "HEAD"] and self.cache and cachekey:
            # RFC 2616 Section 13.10
            self.cache.delete(cachekey)

        if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
            if info.has_key('-x-permanent-redirect-url'):
                (response, new_content) = self.request(
                    info['-x-permanent-redirect-url'], "GET",
                    headers=headers, redirections=redirections - 1)
                response.previous = Response(info)
                response.previous.fromcache = True
            else:
                entry_disposition = httplib2._entry_disposition(info, headers)

                if entry_disposition == "FRESH":
                    if not cached_value:
                        info['status'] = '504'
                        content = ""
                    response = Response(info)
                    if cached_value:
                        response.fromcache = True
                    return (response, content)

                if entry_disposition == "STALE":
                    if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
                        headers['if-none-match'] = info['etag']
                    if info.has_key('last-modified') and not 'last-modified' in headers:
                        headers['if-modified-since'] = info['last-modified']
                elif entry_disposition == "TRANSPARENT":
                    pass

                (response, new_content) = self._request(
                    conn, authority, uri, request_uri, method, body,
                    headers, redirections, cachekey)

            if response.status == 304 and method == "GET":
                # Rewrite the cache entry with the new end-to-end headers
                # Take all headers that are in response
                # and overwrite their values in info.
                # unless they are hop-by-hop, or are listed in the connection header.
                for key in httplib2._get_end2end_headers(response):
                    info[key] = response[key]
                merged_response = Response(info)
                if hasattr(response, "_stale_digest"):
                    merged_response._stale_digest = response._stale_digest
                httplib2._updateCache(headers, merged_response, content,
                                      self.cache, cachekey)
                response = merged_response
                response.status = 200
                response.fromcache = True
            elif response.status == 200:
                content = new_content
            else:
                self.cache.delete(cachekey)
                content = new_content
        else:
            cc = httplib2._parse_cache_control(headers)
            if cc.has_key('only-if-cached'):
                info['status'] = '504'
                response = Response(info)
                content = ""
            else:
                (response, content) = self._request(
                    conn, authority, uri, request_uri, method, body,
                    headers, redirections, cachekey)
    except Exception, e:
        if self.force_exception_to_status_code:
            if isinstance(e, httplib2.HttpLib2ErrorWithResponse):
                response = e.response
                content = e.content
                response.status = 500
                response.reason = str(e)
            elif isinstance(e, socket.timeout):
                content = "Request Timeout"
                response = Response({
                    "content-type": "text/plain",
                    "status": "408",
                    "content-length": len(content)
                })
                response.reason = "Request Timeout"
            else:
                content = str(e)
                response = Response({
                    "content-type": "text/plain",
                    "status": "400",
                    "content-length": len(content)
                })
                response.reason = "Bad Request"
        else:
            raise