Esempio n. 1
0
    def fetch_and_cache(self, mirror_url):
        """Fetch ``mirror_url``, transform the response and cache the result.

        When ``mirror_url`` is absolute its host is remembered in
        ``self.record_last_host``; otherwise the URL is resolved against that
        remembered host using ``HTTP_PREFIX``.

        The fetch is tried up to ``FETCH_ATTEMPTS`` times; each attempt that
        raises ``urlfetch.Error`` is logged and the next attempt is made.

        Returns:
            The ``MirrorEntity`` built from the transformed response, or
            ``None`` when no attempt produced a response.
        """
        if url_tools.is_absolute_url(mirror_url):
            self.record_last_host = url_tools.get_host_from_url(mirror_url)
        else:
            mirror_url = url_tools.join(HTTP_PREFIX, self.record_last_host, mirror_url)

        host_name = url_tools.get_host_from_url(mirror_url)

        # http://www.w3.org/Protocols/rfc2616/rfc2616-sec8.html 8.1.3 Proxy Servers
        # Work on a copy so the incoming request's headers are not mutated.
        adjusted_headers = dict(self.request.headers)
        adjusted_headers['Connection'] = 'close'
        logging.debug("request headers '%s' of url: '%s'", url_tools.dict_to_s(adjusted_headers), mirror_url)

        # fetch the requested url; the try lives inside the loop so that a
        # failed attempt moves on to the next one instead of aborting at once
        response = None
        for attempt in range(FETCH_ATTEMPTS):
            try:
                response = urlfetch.fetch(mirror_url, self.request.body, self.method, adjusted_headers)
            except urlfetch.Error:
                exception_type = sys.exc_info()[0]
                logging.error('url fetch exception "%s" for "%s"', str(exception_type), mirror_url)
                continue
            logging.info('url fetch attempt %d for "%s" successful', attempt + 1, mirror_url)
            break

        # every attempt failed (or FETCH_ATTEMPTS allowed none)
        if response is None:
            return None

        transform_response = transform.ResponseTransformer(mirror_url, response)

        # cache the transformed entity and return
        mirror_content = MirrorEntity(mirror_url,
                                      host_name,
                                      transform_response.status_code,
                                      transform_response.headers,
                                      transform_response.content)
        # presumably EXPIRATION_RATE_S is the cache lifetime in seconds -- confirm in config
        memcache.add(mirror_url, mirror_content, config.EXPIRATION_RATE_S)

        return mirror_content
Esempio n. 2
0
    def fix_url(self, url):
        """Rewrite a URL taken from the response so that it points at the proxy.

        URI lore: <http://www.ietf.org/rfc/rfc3986.txt>

        The URL is stripped of surrounding whitespace and then matched, in
        order, against the absolute, protocol relative, root relative and
        directory relative patterns; the first match decides the rewrite.

        Returns:
            The rewritten URL, or the stripped URL unchanged when the scheme
            of ``self.response_url`` is not supported, when the URL starts
            with ``#`` or ``javascript``, or when no pattern produced a
            rewrite (traversal relative URLs included).
        """
        host_name = url_tools.get_host_from_url(self.response_url)
        url = url.strip()

        # check if the scheme is supported
        # NOTE(review): the scheme is read from the response URL, not from
        # `url` itself -- confirm this is the intended check.
        scheme = url_tools.get_scheme_from_url(self.response_url)
        # 'http' also covers 'https', so two prefixes are enough
        if not scheme.startswith(('http', '//')):
            logging.error('scheme not supported for url "%s" %s', url, scheme)
            return url

        # check if the 'url' field is a special type
        if url.startswith(('#', 'javascript')):
            return url

        # fix absolute URLs (i.e. href='http://hostname.net/path')
        if re.match(ABSOLUTE_URL_RE, url):
            # only rewrite when a host is present; otherwise keep checking
            # the remaining patterns below
            if urlparse.urlsplit(url).netloc:
                new_url = '/' + url_tools.strip_scheme_from_url(url)
                logging.debug('absolute url transformation "%s" -> "%s"', url, new_url)
                return new_url

        # fix protocol relative URLs (i.e. src='//hostname.com/path')
        if re.match(PROTOCOL_RELATIVE_URL_RE, url):
            new_url = '/' + url[len('//'):]
            logging.debug('protocol absolute url transformation "%s" -> "%s"', url, new_url)
            return new_url

        # fix root relative URLs (i.e. src='/subpath_of_host_url/path')
        if re.match(ROOT_RELATIVE_URL_RE, url):
            if url == '/':
                new_url = config.PROXY_SITE + host_name
            else:
                new_url = '/' + host_name + url_tools.get_path_from_url(url)

            logging.debug('root relative transformation "%s" -> "%s"', url, new_url)
            return new_url

        # fix directory relative URLs (i.e. src='file_in_the_same_directory/path')
        if re.match(DIRECTORY_RELATIVE_URL_RE, url):
            new_url = config.PROXY_SITE + popd(self.response_url) + '/' + url
            logging.debug('directory relative transformation "%s" -> "%s"', url, new_url)
            return new_url

        # traversal relative URLs (i.e. src='../another_dir_file') are ignored
        # for now and returned unchanged like anything else that did not match
        return url