def fetch_and_cache(self, mirror_url):
    """Fetch ``mirror_url``, transform the response and cache the result.

    When ``url_tools.is_absolute_url`` accepts ``mirror_url`` its host is
    remembered in ``self.record_last_host``; otherwise the URL is joined
    onto ``HTTP_PREFIX`` and the previously recorded host.

    The fetch is tried up to ``FETCH_ATTEMPTS`` times; every failed attempt
    is logged and the next one is started.

    Args:
        mirror_url: URL to fetch, either absolute or relative to the host
            of the last absolute request.

    Returns:
        The ``MirrorEntity`` built from the transformed response (also
        stored in memcache under ``mirror_url``), or ``None`` when no
        attempt succeeded.
    """
    if url_tools.is_absolute_url(mirror_url):
        # remember the host so that later relative requests can be resolved
        self.record_last_host = url_tools.get_host_from_url(mirror_url)
    else:
        mirror_url = url_tools.join(HTTP_PREFIX, self.record_last_host,
                                    mirror_url)
    host_name = url_tools.get_host_from_url(mirror_url)

    # http://www.w3.org/Protocols/rfc2616/rfc2616-sec8.html 8.1.3 Proxy Servers
    # work on a copy so the incoming request's headers are left untouched
    adjusted_headers = dict(self.request.headers)
    adjusted_headers['Connection'] = 'close'
    logging.debug("request headers '%s' of url: '%s'",
                  url_tools.dict_to_s(adjusted_headers), mirror_url)

    # fetch the requested url; the try lives inside the loop so that a
    # failed attempt moves on to the next one instead of aborting the loop
    # NOTE(review): this also re-sends non-idempotent requests (e.g. POST)
    # after a failure -- confirm that is acceptable for this proxy.
    for attempt in range(FETCH_ATTEMPTS):
        try:
            response = urlfetch.fetch(mirror_url, self.request.body,
                                      self.method, adjusted_headers)
        except urlfetch.Error:
            exception_type = sys.exc_info()[0]
            logging.error('url fetch exception "%s" for "%s"',
                          str(exception_type), mirror_url)
            continue
        logging.info('url fetch attempt %d for "%s" successful',
                     attempt + 1, mirror_url)
        break
    else:
        # every attempt failed (or FETCH_ATTEMPTS allows no attempt at all)
        return None

    transform_response = transform.ResponseTransformer(mirror_url, response)

    # cache the transformed entity and return
    mirror_content = MirrorEntity(mirror_url,
                                  host_name,
                                  transform_response.status_code,
                                  transform_response.headers,
                                  transform_response.content)
    memcache.add(mirror_url, mirror_content, config.EXPIRATION_RATE_S)
    return mirror_content
def fix_url(self, url):
    """Rewrite ``url`` relative to ``self.response_url`` for the proxy.

    URI lore: <http://www.ietf.org/rfc/rfc3986.txt>

    The URL is stripped of surrounding whitespace and then matched against
    the URL-shape patterns in order (absolute, protocol relative, root
    relative, directory relative); the first matching rule decides the
    rewritten form.

    Args:
        url: URL string to rewrite.

    Returns:
        The rewritten URL, or the stripped ``url`` unchanged when the
        scheme of ``self.response_url`` is not supported, when ``url`` is a
        fragment / javascript reference, or when no rewrite rule applies.
    """
    host_name = url_tools.get_host_from_url(self.response_url)
    url = url.strip()

    # check if the scheme is supported ('http' also covers 'https')
    scheme = url_tools.get_scheme_from_url(self.response_url)
    if not scheme.startswith(('http', '//')):
        logging.error('scheme not supported for url "%s" %s', url, scheme)
        return url

    # check if the 'url' field is a special type
    if url.startswith(('#', 'javascript')):
        return url

    # fix absolute URLs (e.g. href='http://hostname.net/path')
    # without a network location the URL falls through to the rules below
    if re.match(ABSOLUTE_URL_RE, url) and urlparse.urlsplit(url).netloc:
        new_url = '/' + url_tools.strip_scheme_from_url(url)
        logging.debug('absolute url transformation "%s" -> "%s"',
                      url, new_url)
        return new_url

    # fix protocol relative URLs (e.g. src='//hostname.com/path')
    if re.match(PROTOCOL_RELATIVE_URL_RE, url):
        new_url = '/' + url[len('//'):]
        logging.debug('protocol absolute url transformation "%s" -> "%s"',
                      url, new_url)
        return new_url

    # fix root relative URLs (e.g. src='/subpath_of_host_url/path')
    if re.match(ROOT_RELATIVE_URL_RE, url):
        if url == '/':
            new_url = config.PROXY_SITE + host_name
        else:
            new_url = '/' + host_name + url_tools.get_path_from_url(url)
        logging.debug('root relative transformation "%s" -> "%s"',
                      url, new_url)
        return new_url

    # fix directory relative URLs (e.g. src='file_in_the_same_directory/path')
    if re.match(DIRECTORY_RELATIVE_URL_RE, url):
        new_url = config.PROXY_SITE + popd(self.response_url) + '/' + url
        logging.debug('directory relative transformation "%s" -> "%s"',
                      url, new_url)
        return new_url

    # traversal relative URLs (e.g. src='../another_dir_file') are ignored
    # for now and returned as they are, like anything else left unmatched
    return url