def do_deprefix(url, rel_prefix, full_prefix): encoded = urllib.quote_plus(full_prefix) url = url.replace(full_prefix, encoded) rewriter = UrlRewriter(url, rel_prefix, full_prefix) url = rewriter.deprefix_url() return urllib.unquote_plus(url)
def setup_class(cls): urlrewriter = UrlRewriter( '20161226/http://example.com/?callback=jQuery_ABC', '/web/', 'https://localhost/web/') cls.rewriter = JSONPRewriter(urlrewriter) urlrewriter = UrlRewriter('20161226/http://example.com/', '/web/', 'https://localhost/web/') cls.rewriter_missing_cb = JSONPRewriter(urlrewriter)
def parse(html_text, is_ajax=False): urlrewriter = UrlRewriter( '20131226101010/https://example.com/some/path.html', '/web/') if is_ajax: urlrewriter.rewrite_opts['is_ajax'] = True rewriter = HTMLInsertOnlyRewriter(urlrewriter, head_insert='<!--Insert-->') return rewriter.rewrite(html_text) + rewriter.final_read()
def _test_proxy_headers(http_cache=None): headers = _make_cache_headers() status = '200 OK' rewriter = UrlRewriter('20131010/http://example.com/', '/pywb/', rewrite_opts={'http_cache': http_cache}) rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), rewriter, rewriter.get_cookie_rewriter()) return rewritten.status_headers
def rewrite_record(self, headers, content, ts, url='http://example.com/', prefix='http://localhost:8080/prefix/', warc_headers=None, request_url=None, is_live=None, use_js_proxy=True, environ=None): record = self._create_response_record(url, headers, content, warc_headers) wburl = WbUrl(ts + '/' + (request_url or url)) url_rewriter = UrlRewriter(wburl, prefix) cdx = CDXObject() cdx['url'] = url cdx['timestamp'] = ts cdx['urlkey'] = canonicalize(url) if request_url != url: cdx['is_fuzzy'] = '1' cdx['is_live'] = is_live def insert_func(rule, cdx): return '' if use_js_proxy: rewriter = self.js_proxy_content_rewriter else: rewriter = self.content_rewriter return rewriter(record, url_rewriter, cookie_rewriter=None, head_insert_func=insert_func, cdx=cdx, environ=environ)
def rewrite_record(self, headers, content, ts, url='http://example.com/', prefix='http://localhost:8080/prefix/', warc_headers=None, request_url=None, is_live=None): record = self._create_response_record(url, headers, content, warc_headers) wburl = WbUrl(ts + '/' + (request_url or url)) url_rewriter = UrlRewriter(wburl, prefix) cdx = CDXObject() cdx['url'] = url cdx['timestamp'] = ts cdx['urlkey'] = canonicalize(url) if request_url != url: cdx['is_fuzzy'] = '1' cdx['is_live'] = is_live return self.content_rewriter(record, url_rewriter, None, cdx=cdx)
def parse(html_text): urlrewriter = UrlRewriter( '20131226101010/https://example.com/some/path.html', '/web/') rewriter = HTMLInsertOnlyRewriter(urlrewriter, head_insert='<!--Insert-->') return rewriter.rewrite(html_text) + rewriter.final_read()
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None): rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix) return rewriter.rewrite(rel_url, mod)
def render_content(self, wb_url, kwargs, environ): wb_url = WbUrl(wb_url) host_prefix = self.get_host_prefix(environ) rel_prefix = self.get_rel_prefix(environ) full_prefix = host_prefix + rel_prefix resp = self.handle_custom_response(environ, wb_url, full_prefix, host_prefix, kwargs) if resp is not None: content_type = 'text/html' # if not replay outer frame, specify utf-8 charset if not self.is_framed_replay(wb_url): content_type += '; charset=utf-8' return WbResponse.text_response(resp, content_type=content_type) urlrewriter = UrlRewriter(wb_url, prefix=full_prefix, full_prefix=full_prefix, rel_prefix=rel_prefix) self.unrewrite_referrer(environ) urlkey = canonicalize(wb_url.url) inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, self.content_rewriter) inputreq.include_post_query(wb_url.url) mod_url = None use_206 = False rangeres = None readd_range = False async_record_url = None if kwargs.get('type') in ('record', 'patch'): rangeres = inputreq.extract_range() if rangeres: mod_url, start, end, use_206 = rangeres # if bytes=0- Range request, # simply remove the range and still proxy if start == 0 and not end and use_206: wb_url.url = mod_url inputreq.url = mod_url del environ['HTTP_RANGE'] readd_range = True else: async_record_url = mod_url skip = async_record_url is not None setcookie_headers = None if self.cookie_tracker: cookie_key = self.get_cookie_key(kwargs) res = self.cookie_tracker.get_cookie_headers(wb_url.url, cookie_key) inputreq.extra_cookie, setcookie_headers = res r = self._do_req(inputreq, wb_url, kwargs, skip) if r.status_code >= 400: error = None try: error = r.raw.read() r.raw.close() except: pass if error: error = error.decode('utf-8') else: error = '' details = dict(args=kwargs, error=error) raise UpstreamException(r.status_code, url=wb_url.url, details=details) if async_record_url: environ.pop('HTTP_RANGE', '') gevent.spawn(self._do_async_req, inputreq, async_record_url, wb_url, kwargs, False) record = self.loader.parse_record_stream(r.raw) cdx = CDXObject() cdx['urlkey'] = urlkey cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime')) cdx['url'] = wb_url.url self._add_custom_params(cdx, r.headers, kwargs) if readd_range: content_length = (record.status_headers. get_header('Content-Length')) try: content_length = int(content_length) record.status_headers.add_range(0, content_length, content_length) except (ValueError, TypeError): pass if self.is_ajax(environ): head_insert_func = None urlrewriter.rewrite_opts['is_ajax'] = True else: top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs) head_insert_func = (self.head_insert_view. create_insert_func(wb_url, full_prefix, host_prefix, top_url, environ, self.framed_replay)) cookie_rewriter = None if self.cookie_tracker: cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter, cookie_key) result = self.content_rewriter.rewrite_content(urlrewriter, record.status_headers, record.stream, head_insert_func, urlkey, cdx, cookie_rewriter, environ) status_headers, gen, is_rw = result if setcookie_headers: status_headers.headers.extend(setcookie_headers) return WbResponse(status_headers, gen)
def setup_class(cls): cls.urlrewriter = UrlRewriter('20171226/http://example.com/', '/warc/') cls.default_rewriter = DefaultRewriter()
def render_content(self, wb_url, kwargs, environ): wb_url = wb_url.replace('#', '%23') wb_url = WbUrl(wb_url) is_timegate = self._check_accept_dt(wb_url, environ) host_prefix = self.get_host_prefix(environ) rel_prefix = self.get_rel_prefix(environ) full_prefix = host_prefix + rel_prefix is_proxy = ('wsgiprox.proxy_host' in environ) response = self.handle_custom_response(environ, wb_url, full_prefix, host_prefix, kwargs) if response: return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy) if is_proxy: environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host'] urlrewriter = IdentityUrlRewriter(wb_url, '') framed_replay = False else: urlrewriter = UrlRewriter(wb_url, prefix=full_prefix, full_prefix=full_prefix, rel_prefix=rel_prefix) framed_replay = self.framed_replay url_parts = urlsplit(wb_url.url) if not url_parts.path: scheme, netloc, path, query, frag = url_parts path = '/' url = urlunsplit((scheme, netloc, path, query, frag)) resp = WbResponse.redir_response(urlrewriter.rewrite(url), '307 Temporary Redirect') if self.enable_memento: resp.status_headers['Link'] = MementoUtils.make_link(url, 'original') return resp self.unrewrite_referrer(environ, full_prefix) urlkey = canonicalize(wb_url.url) if self.use_js_obj_proxy: content_rw = self.js_proxy_rw else: content_rw = self.default_rw inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw) inputreq.include_method_query(wb_url.url) range_start, range_end, skip_record = self._check_range(inputreq, wb_url) setcookie_headers = None if self.cookie_tracker: cookie_key = self.get_cookie_key(kwargs) res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key) inputreq.extra_cookie, setcookie_headers = res r = self._do_req(inputreq, wb_url, kwargs, skip_record) if r.status_code >= 400: error = None try: error = r.raw.read() r.raw.close() except: pass if error: error = error.decode('utf-8') else: error = '' details = dict(args=kwargs, error=error) raise UpstreamException(r.status_code, url=wb_url.url, details=details) stream = BufferedReader(r.raw, block_size=BUFF_SIZE) record = self.loader.parse_record_stream(stream, ensure_http_headers=True) memento_dt = r.headers.get('Memento-Datetime') target_uri = r.headers.get('WARC-Target-URI') cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8')) #cdx['urlkey'] = urlkey #cdx['timestamp'] = http_date_to_timestamp(memento_dt) #cdx['url'] = target_uri set_content_loc = False # Check if Fuzzy Match if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1': set_content_loc = True # if redir to exact, redir if url or ts are different if self.redirect_to_exact: if (set_content_loc or (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))): new_url = urlrewriter.get_new_url(url=target_uri, timestamp=cdx['timestamp'], mod=wb_url.mod) resp = WbResponse.redir_response(new_url, '307 Temporary Redirect') if self.enable_memento: if is_timegate and not is_proxy: self._add_memento_links(target_uri, full_prefix, memento_dt, cdx['timestamp'], resp.status_headers, is_timegate, is_proxy) else: resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original') return resp self._add_custom_params(cdx, r.headers, kwargs) if self._add_range(record, wb_url, range_start, range_end): wb_url.mod = 'id_' is_ajax = self.is_ajax(environ) if is_ajax: head_insert_func = None urlrewriter.rewrite_opts['is_ajax'] = True else: top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs) head_insert_func = (self.head_insert_view. create_insert_func(wb_url, full_prefix, host_prefix, top_url, environ, framed_replay, config=self.config)) cookie_rewriter = None if self.cookie_tracker: cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter, cookie_key) urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT') result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx) status_headers, gen, is_rw = result if setcookie_headers: status_headers.headers.extend(setcookie_headers) if ' ' not in status_headers.statusline: status_headers.statusline += ' None' if not is_ajax and self.enable_memento: self._add_memento_links(cdx['url'], full_prefix, memento_dt, cdx['timestamp'], status_headers, is_timegate, is_proxy, cdx.get('source-coll')) set_content_loc = True if set_content_loc and not self.redirect_to_exact: status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'], url=cdx['url']))) if not is_proxy: self.add_csp_header(wb_url, status_headers) response = WbResponse(status_headers, gen) return response
from pywb.rewrite.rewrite_live import LiveRewriter from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.wburl import WbUrl from pywb.utils.loaders import to_native_str from pywb import get_test_dir from io import BytesIO # This module has some rewriting tests against the 'live web' # As such, the content may change and the test may break urlrewriter = UrlRewriter( '20131226101010/http://example.com/some/path/index.html', '/pywb/') bn_urlrewriter = UrlRewriter( '20131226101010bn_/http://example.com/some/path/index.html', '/pywb/') def head_insert_func(rule, cdx): if rule.js_rewrite_location != 'urls': return '<script src="/static/__pywb/wombat.js"> </script>' else: return '' def test_csrf_token_headers(): rewriter = LiveRewriter() env = {'HTTP_X_CSRFTOKEN': 'wrong', 'HTTP_COOKIE': 'csrftoken=foobar'} req_headers = rewriter.translate_headers('http://example.com/',
""" from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter from pywb.rewrite.cookie_rewriter import get_cookie_rewriter from pywb.rewrite.url_rewriter import UrlRewriter import pytest import sys urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', 'http://*****:*****@pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported') def test_with_expires(): # keep expires res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll') assert len(res) == 1
def do_deprefix(url, rel_prefix, full_prefix): rewriter = UrlRewriter(url, rel_prefix, full_prefix) url = rewriter.deprefix_url() return urllib.unquote_plus(url)
def do_deprefix(url, rel_prefix, full_prefix): rewriter = UrlRewriter(url, rel_prefix, full_prefix) url = rewriter.deprefix_url() return unquote_plus(url)
def render_content(self, wb_url, kwargs, environ): wb_url = wb_url.replace('#', '%23') wb_url = WbUrl(wb_url) proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme) if proto: environ['wsgi.url_scheme'] = proto history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '') if history_page: wb_url.url = history_page is_ajax = True else: is_ajax = self.is_ajax(environ) is_timegate = self._check_accept_dt(wb_url, environ) host_prefix = self.get_host_prefix(environ) rel_prefix = self.get_rel_prefix(environ) full_prefix = host_prefix + rel_prefix environ['pywb.host_prefix'] = host_prefix pywb_static_prefix = host_prefix + environ.get( 'pywb.app_prefix', '') + environ.get('pywb.static_prefix', '/static/') is_proxy = ('wsgiprox.proxy_host' in environ) response = self.handle_custom_response(environ, wb_url, full_prefix, host_prefix, kwargs) if response: return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy) if is_proxy: environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host'] urlrewriter = IdentityUrlRewriter(wb_url, '') framed_replay = False else: urlrewriter = UrlRewriter(wb_url, prefix=full_prefix, full_prefix=full_prefix, rel_prefix=rel_prefix, pywb_static_prefix=pywb_static_prefix) framed_replay = self.framed_replay url_parts = urlsplit(wb_url.url) if not url_parts.path: return self.send_redirect('/', url_parts, urlrewriter) self.unrewrite_referrer(environ, full_prefix) urlkey = canonicalize(wb_url.url) if self.use_js_obj_proxy: content_rw = self.js_proxy_rw else: content_rw = self.default_rw inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw) inputreq.include_method_query(wb_url.url) range_start, range_end, skip_record = self._check_range( inputreq, wb_url) setcookie_headers = None cookie_key = None if self.cookie_tracker: cookie_key = self.get_cookie_key(kwargs) if cookie_key: res = self.cookie_tracker.get_cookie_headers( wb_url.url, urlrewriter, cookie_key, environ.get('HTTP_COOKIE', '')) inputreq.extra_cookie, setcookie_headers = res r = self._do_req(inputreq, wb_url, kwargs, skip_record) if r.status_code >= 400: error = None try: error = r.raw.read() except Exception: pass finally: no_except_close(r.raw) if error: error = error.decode('utf-8') else: error = '' details = dict(args=kwargs, error=error) raise UpstreamException(r.status_code, url=wb_url.url, details=details) cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8')) cdx_url_parts = urlsplit(cdx['url']) if cdx_url_parts.path.endswith( '/') and not url_parts.path.endswith('/'): # add trailing slash new_path = url_parts.path + '/' no_except_close(r.raw) return self.send_redirect(new_path, url_parts, urlrewriter) stream = BufferedReader(r.raw, block_size=BUFF_SIZE) record = self.loader.parse_record_stream(stream, ensure_http_headers=True) memento_dt = r.headers.get('Memento-Datetime') target_uri = r.headers.get('WARC-Target-URI') # cdx['urlkey'] = urlkey # cdx['timestamp'] = http_date_to_timestamp(memento_dt) # cdx['url'] = target_uri set_content_loc = False # Check if Fuzzy Match if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1': set_content_loc = True # if redir to exact, redir if url or ts are different if self.redirect_to_exact: if (set_content_loc or (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))): new_url = urlrewriter.get_new_url(url=target_uri, timestamp=cdx['timestamp'], mod=wb_url.mod) resp = WbResponse.redir_response(new_url, '307 Temporary Redirect') if self.enable_memento: if is_timegate and not is_proxy: self._add_memento_links(target_uri, full_prefix, memento_dt, cdx['timestamp'], resp.status_headers, is_timegate, is_proxy) else: resp.status_headers['Link'] = MementoUtils.make_link( target_uri, 'original') return resp self._add_custom_params(cdx, r.headers, kwargs, record) if self._add_range(record, wb_url, range_start, range_end): wb_url.mod = 'id_' if is_ajax: head_insert_func = None urlrewriter.rewrite_opts['is_ajax'] = True else: top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs) head_insert_func = (self.head_insert_view.create_insert_func( wb_url, full_prefix, host_prefix, top_url, environ, framed_replay, coll=kwargs.get('coll', ''), replay_mod=self.replay_mod, config=self.config)) cookie_rewriter = None if self.cookie_tracker and cookie_key: # skip add cookie if service worker is not 200 # it seems cookie headers from service workers are not applied, so don't update in cache if wb_url.mod == 'sw_': cookie_key = None cookie_rewriter = self.cookie_tracker.get_rewriter( urlrewriter, cookie_key) urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT') result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx, environ) status_headers, gen, is_rw = result if history_page: title = DefaultRewriter._extract_title(gen) if not title: title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', '')) if not title: title = history_page self._add_history_page(cdx, kwargs, title) return WbResponse.json_response({'title': title}) if setcookie_headers: status_headers.headers.extend(setcookie_headers) if ' ' not in status_headers.statusline: status_headers.statusline += ' None' if not is_ajax and self.enable_memento: self._add_memento_links(cdx['url'], full_prefix, memento_dt, cdx['timestamp'], status_headers, is_timegate, is_proxy, cdx.get('source-coll')) set_content_loc = True if set_content_loc and not self.redirect_to_exact: status_headers.headers.append( ('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'], url=cdx['url']))) if not is_proxy: self.add_csp_header(wb_url, status_headers) response = WbResponse(status_headers, gen) return response
# Test no parsing at all >>> p = HTMLRewriter(urlrewriter) >>> p.close() '' """ from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.html_rewriter import HTMLRewriter import pprint import urllib urlrewriter = UrlRewriter( '20131226101010/http://example.com/some/path/index.html', '/web/', rewrite_opts=dict(punycode_links=False)) urlrewriter_pencode = UrlRewriter( '20131226101010/http://example.com/some/path/index.html', '/web/', rewrite_opts=dict(punycode_links=True)) no_base_canon_rewriter = UrlRewriter( '20131226101010/http://example.com/some/path/index.html', '/web/', rewrite_opts=dict(rewrite_rel_canon=False, rewrite_base=False)) def parse(data, head_insert=None, urlrewriter=urlrewriter): parser = HTMLRewriter(urlrewriter, head_insert=head_insert)
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT') [('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')] # Cookie with invalid chars, not parsed >>> rewrite_cookie('abc@def=123') [] # ExactCookieRewriter >>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, ExactPathCookieRewriter) [('Set-Cookie', 'some=value')] >>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, ExactPathCookieRewriter) [('Set-Cookie', 'some=value')] """ from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, ExactPathCookieRewriter from pywb.rewrite.url_rewriter import UrlRewriter urlrewriter = UrlRewriter( '20131226101010/http://example.com/some/path/index.html', '/pywb/') urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/') def rewrite_cookie(cookie_str, rewriter=urlrewriter, cookie_rewriter=MinimalScopeCookieRewriter): return cookie_rewriter(rewriter).rewrite(cookie_str)
def new_rewriter(prefix='/web/', rewrite_opts=dict()): PROXY_PATH = '20131226101010/{0}'.format(ORIGINAL_URL) return UrlRewriter(PROXY_PATH, prefix, rewrite_opts=rewrite_opts)
[] >>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, 'removeall') [] """ from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter from pywb.rewrite.cookie_rewriter import get_cookie_rewriter from pywb.rewrite.url_rewriter import UrlRewriter import pytest import sys urlrewriter = UrlRewriter( '20131226101010/http://example.com/some/path/index.html', 'http://*****:*****@pytest.mark.skipif(sys.version_info < (2, 7), reason='Unsupported')
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'), ('Content-Encoding', 'gzip'), ('X-Archive-Orig-Transfer-Encoding', 'chunked')]), 'text_type': None} """ from pywb.rewrite.header_rewriter import HeaderRewriter from pywb.rewrite.url_rewriter import UrlRewriter from pywb.utils.statusandheaders import StatusAndHeaders import pprint urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/') headerrewriter = HeaderRewriter() def _test_headers(headers, status='200 OK'): rewritten = headerrewriter.rewrite(StatusAndHeaders(status, headers), urlrewriter, urlrewriter.get_cookie_rewriter()) return pprint.pprint(vars(rewritten)) if __name__ == "__main__": import doctest doctest.testmod()
>>> parse('') <BLANKLINE> # Test no parsing at all >>> p = HTMLRewriter(urlrewriter) >>> p.close() '' """ from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.html_rewriter import HTMLRewriter import pprint urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/', rewrite_opts=dict(rewrite_rel_canon=False, rewrite_base=False)) def parse(data, head_insert=None, urlrewriter=urlrewriter): parser = HTMLRewriter(urlrewriter, head_insert = head_insert) #data = data.decode('utf-8') result = parser.rewrite(data) + parser.close() # decode only for printing print result.decode('utf-8') if __name__ == "__main__": import doctest
>>> _test_css("@import (\"url.css\")") '@import ("url.css")' >>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") '@import url(/web/20131010cs_/http://example.com/url.css)\n@import url(/web/20131010cs_/http://example.com/anotherurl.css)\n @import url(/web/20131010cs_/http://example.com/and_a_third.css)' """ #================================================================= from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter, RxRules from pywb.rewrite.regex_rewriters import JSWombatProxyRewriter urlrewriter = UrlRewriter('20131010/http://example.com/', '/web/', 'https://localhost/web/') def _test_js(string, extra = []): return JSRewriter(urlrewriter, extra).rewrite(string) def _test_js_obj_proxy(string): rw = JSWombatProxyRewriter(urlrewriter) rw.first_buff = '' rw.close_string = '' return rw.rewrite(string) def _test_xml(string): return XMLRewriter(urlrewriter).rewrite(string) def _test_css(string):
def render_content(self, wb_url, kwargs, environ): wb_url = wb_url.replace('#', '%23') wb_url = WbUrl(wb_url) history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '') if history_page: wb_url.url = history_page is_ajax = True else: is_ajax = self.is_ajax(environ) is_timegate = self._check_accept_dt(wb_url, environ) self.prepare_env(environ) host_prefix = environ['pywb.host_prefix'] rel_prefix = self.get_rel_prefix(environ) full_prefix = host_prefix + rel_prefix pywb_static_prefix = environ['pywb.static_prefix'] + '/' is_proxy = ('wsgiprox.proxy_host' in environ) # if OPTIONS in proxy mode, just generate the proxy responss if is_proxy and self.is_preflight(environ): return WbResponse.options_response(environ) if self.use_js_obj_proxy: content_rw = self.js_proxy_rw else: content_rw = self.default_rw # no redirects if in proxy redirect_to_exact = self.redirect_to_exact and not is_proxy # Check Prefer pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ, content_rw, is_proxy) response = None keep_frame_response = False # prefer overrides custom response? if pref_mod is not None: # fast-redirect to preferred if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod: new_url = full_prefix + wb_url.to_str(mod=pref_mod) headers = [('Preference-Applied', pref_applied), ('Vary', 'Prefer')] return WbResponse.redir_response(new_url, '307 Temporary Redirect', headers=headers) else: wb_url.mod = pref_mod else: if kwargs.get('output'): response = self.handle_timemap(wb_url, kwargs, full_prefix) elif wb_url.is_query(): response = self.handle_query(environ, wb_url, kwargs, full_prefix) else: response = self.handle_custom_response(environ, wb_url, full_prefix, host_prefix, kwargs) keep_frame_response = (not kwargs.get('no_timegate_check') and is_timegate and not is_proxy) or redirect_to_exact if response and not keep_frame_response: return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy) if is_proxy: environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host'] urlrewriter = IdentityUrlRewriter(wb_url, '') framed_replay = False else: urlrewriter = UrlRewriter(wb_url, prefix=full_prefix, full_prefix=full_prefix, rel_prefix=rel_prefix, pywb_static_prefix=pywb_static_prefix) framed_replay = self.framed_replay url_parts = urlsplit(wb_url.url) if not url_parts.path: return self.send_redirect('/', url_parts, urlrewriter) self.unrewrite_referrer(environ, full_prefix) urlkey = canonicalize(wb_url.url) inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw) inputreq.include_method_query(wb_url.url) range_start, range_end, skip_record = self._check_range( inputreq, wb_url) setcookie_headers = None cookie_key = None if self.cookie_tracker: cookie_key = self.get_cookie_key(kwargs) if cookie_key: res = self.cookie_tracker.get_cookie_headers( wb_url.url, urlrewriter, cookie_key, environ.get('HTTP_COOKIE', '')) inputreq.extra_cookie, setcookie_headers = res r = self._do_req(inputreq, wb_url, kwargs, skip_record) if r.status_code >= 400: error = None try: error = r.raw.read() except Exception: pass finally: no_except_close(r.raw) if error: error = error.decode('utf-8') else: error = '' details = dict(args=kwargs, error=error) if r.status_code == 404: raise NotFoundException(url=wb_url.url, msg=details) else: raise UpstreamException(r.status_code, url=wb_url.url, details=details) cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8')) cdx_url_parts = urlsplit(cdx['url']) if cdx_url_parts.path.endswith( '/') and not url_parts.path.endswith('/'): # add trailing slash new_path = url_parts.path + '/' no_except_close(r.raw) return self.send_redirect(new_path, url_parts, urlrewriter) # only redirect to exact if not live, otherwise set to false redirect_to_exact = redirect_to_exact and not cdx.get('is_live') # return top-frame timegate response, with timestamp from cdx if response and keep_frame_response and (not redirect_to_exact or not is_timegate): no_except_close(r.raw) return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy, cdx['timestamp']) stream = BufferedReader(r.raw, block_size=BUFF_SIZE) record = self.loader.parse_record_stream(stream, ensure_http_headers=True) memento_dt = r.headers.get('Memento-Datetime') target_uri = r.headers.get('WARC-Target-URI') # cdx['urlkey'] = urlkey # cdx['timestamp'] = http_date_to_timestamp(memento_dt) # cdx['url'] = target_uri set_content_loc = False # Check if Fuzzy Match if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1': set_content_loc = True # if redirect to exact timestamp (only set if not live) if redirect_to_exact: if set_content_loc or is_timegate or wb_url.timestamp != cdx.get( 'timestamp'): new_url = urlrewriter.get_new_url(url=target_uri, timestamp=cdx['timestamp'], mod=wb_url.mod) resp = WbResponse.redir_response(new_url, '307 Temporary Redirect') if self.enable_memento: if is_timegate and not is_proxy: self._add_memento_links(target_uri, full_prefix, memento_dt, cdx['timestamp'], resp.status_headers, is_timegate, is_proxy, pref_applied=pref_applied, mod=pref_mod, is_memento=False) else: resp.status_headers['Link'] = MementoUtils.make_link( target_uri, 'original') return resp self._add_custom_params(cdx, r.headers, kwargs, record) if self._add_range(record, wb_url, range_start, range_end): wb_url.mod = 'id_' if is_ajax: head_insert_func = None urlrewriter.rewrite_opts['is_ajax'] = True else: top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs) head_insert_func = (self.head_insert_view.create_insert_func( wb_url, full_prefix, host_prefix, top_url, environ, framed_replay, coll=kwargs.get('coll', ''), replay_mod=self.replay_mod, metadata=kwargs.get('metadata', {}), config=self.config)) cookie_rewriter = None if self.cookie_tracker and cookie_key: # skip add cookie if service worker is not 200 # it seems cookie headers from service workers are not applied, so don't update in cache if wb_url.mod == 'sw_': cookie_key = None cookie_rewriter = self.cookie_tracker.get_rewriter( urlrewriter, cookie_key) urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT') result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx, environ) status_headers, gen, is_rw = result if history_page: title = DefaultRewriter._extract_title(gen) if not title: title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', '')) if not title: title = history_page self._add_history_page(cdx, kwargs, title) return WbResponse.json_response({'title': title}) if setcookie_headers: status_headers.headers.extend(setcookie_headers) if ' ' not in status_headers.statusline: status_headers.statusline += ' None' if not is_ajax and self.enable_memento: self._add_memento_links(cdx['url'], full_prefix, memento_dt, cdx['timestamp'], status_headers, is_timegate, is_proxy, cdx.get('source-coll'), mod=pref_mod, pref_applied=pref_applied) set_content_loc = True if set_content_loc and not redirect_to_exact and not is_proxy: status_headers.headers.append( ('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'], url=cdx['url']))) if not is_proxy: self.add_csp_header(wb_url, status_headers) response = WbResponse(status_headers, gen) if is_proxy and environ.get('HTTP_ORIGIN'): response.add_access_control_headers(environ) if r.status_code == 200 and kwargs.get( 'cache') == 'always' and environ.get('HTTP_REFERER'): response.status_headers[ 'Cache-Control'] = 'public, max-age=31536000, immutable' return response