def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
                       status_headers, is_timegate, is_proxy, coll=None):
    """Append the Memento response headers to ``status_headers``.

    Adds ``Memento-Datetime`` (when a datetime is supplied or can be derived
    from ``memento_ts``), a combined ``Link`` header and, for timegate
    responses, ``Vary: accept-datetime``.

    :param url: the original URL
    :param full_prefix: prefix prepended to the memento URL outside proxy mode
    :param memento_dt: HTTP date of the memento, may be empty
    :param memento_ts: timestamp of the memento, used to derive the date
        and to build the memento URL
    :param status_headers: object whose ``headers`` list is appended to
    :param is_timegate: whether a ``Vary`` header should be added
    :param is_proxy: in proxy mode only the memento link is emitted and it
        points at ``url`` itself
    :param coll: passed through to ``MementoUtils.make_memento_link``
    """
    # Fall back to the timestamp when no HTTP date was provided.
    if memento_ts and not memento_dt:
        memento_dt = timestamp_to_http_date(memento_ts)

    memento_url = None
    if memento_dt:
        status_headers.headers.append(('Memento-Datetime', memento_dt))
        if is_proxy:
            memento_url = url
        else:
            memento_url = full_prefix + memento_ts + self.replay_mod + '/' + url

    timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix)

    links = []
    if not is_proxy:
        related = (
            (url, 'original'),
            (timegate_url, 'timegate'),
            (timemap_url, 'timemap'),
        )
        for target, rel in related:
            links.append(MementoUtils.make_link(target, rel))

    if memento_dt:
        links.append(MementoUtils.make_memento_link(memento_url, 'memento',
                                                    memento_dt, coll))

    status_headers.headers.append(('Link', ', '.join(links)))

    if is_timegate:
        status_headers.headers.append(('Vary', 'accept-datetime'))
def test_agg_post_resolve_postreq(self):
    """POST a raw HTTP request via ``/posttest/resource/postreq`` and check
    the response headers and the echoed form fields."""
    target_url = 'http://httpbin.org/post'
    raw_request = """\
POST /post HTTP/1.1
content-length: 16
accept-encoding: gzip, deflate
accept: */*
host: httpbin.org
content-type: application/x-www-form-urlencoded

foo=bar&test=abc"""

    response = self.testapp.post(
        '/posttest/resource/postreq?url=http://httpbin.org/post', raw_request)

    assert response.headers['Warcserver-Source-Coll'] == 'post'

    self._check_uri_date(response, target_url, True)

    expected_link = MementoUtils.make_link(target_url, 'original')
    assert response.headers['Link'] == expected_link
    assert response.headers['Memento-Datetime'] != ''

    body = response.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body
    assert b'"test": "abc"' in body
    assert b'"url": "http://httpbin.org/post"' in body

    assert 'ResErrors' not in response.headers
def _test_warc_write(self, recorder_app, host, path, other_params='', link_url=''):
    """POST a templated request through ``recorder_app`` and return the response.

    Flushes one pending write when the recorder's write queue is non-empty,
    then checks the source collection, ``Link`` and ``Memento-Datetime``
    headers. When ``link_url`` is empty the unquoted request URL is expected
    in the ``Link`` header.
    """
    url = 'http://' + host + path
    request_body = general_req_data.format(host=host, path=path).encode('utf-8')

    app = webtest.TestApp(recorder_app)
    response = app.post('/live/resource/postreq?url=' + url + other_params,
                        request_body)

    if not recorder_app.write_queue.empty():
        recorder_app._write_one()

    assert response.headers['Warcserver-Source-Coll'] == 'live'

    expected_url = link_url or unquote(url)
    assert response.headers['Link'] == MementoUtils.make_link(expected_url, 'original')
    assert response.headers['Memento-Datetime'] != ''

    return response
def test_agg_local_revisit(self):
    """Fetch a revisit record from the local source and check the WARC
    headers parsed from the body plus the Memento response headers."""
    response = self.testapp.get(
        '/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')

    assert response.headers['Warcserver-Source-Coll'] == 'local:dupes.cdxj'

    warc_headers = StatusAndHeadersParser(['WARC/1.0']).parse(BytesIO(response.body))

    expected_warc_headers = (
        ('WARC-Target-URI', 'http://example.com'),
        ('WARC-Date', '2014-01-27T17:12:51Z'),
        ('WARC-Refers-To-Target-URI', 'http://example.com'),
        ('WARC-Refers-To-Date', '2014-01-27T17:12:00Z'),
    )
    for header_name, expected_value in expected_warc_headers:
        assert warc_headers.get_header(header_name) == expected_value

    assert response.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
    assert response.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

    assert b'HTTP/1.1 200 OK' in response.body
    assert b'<!doctype html>' in response.body

    assert 'ResErrors' not in response.headers
def test_agg_live_postreq(self):
    """POST a raw GET request via ``/many/resource/postreq`` with
    ``closest=now`` and expect the live source to answer, with a
    NotFoundException recorded for the 'rhiz' source."""
    target_url = 'http://httpbin.org/get?foo=bar'
    raw_request = """\
GET /get?foo=bar HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: httpbin.org
"""

    response = self.testapp.post(
        '/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now',
        raw_request)

    assert response.headers['Warcserver-Source-Coll'] == 'live'

    self._check_uri_date(response, target_url, True)

    assert response.headers['Link'] == MementoUtils.make_link(target_url, 'original')
    assert response.headers['Memento-Datetime'] != ''

    assert b'HTTP/1.1 200 OK' in response.body
    assert b'"foo": "bar"' in response.body

    # Only the prefix of the error text is compared.
    source_errors = json.loads(response.headers['ResErrors'])
    assert "NotFoundException('http://webenact.rhizome.org/vvork/" in source_errors['rhiz']
def test_live_video_loader_post(self):
    """POST a raw request for a video URL with the youtube-dl content type
    and expect a WARC metadata record. Skipped when ``youtube_dl`` is not
    importable."""
    pytest.importorskip('youtube_dl')

    raw_request = """\
GET /v/BfBgWtAIbRc HTTP/1.1
accept-encoding: gzip, deflate
accept: */*
host: www.youtube.com\
"""

    query = {
        'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
        'content_type': 'application/vnd.youtube-dl_formats+json',
    }
    metadata_url = 'metadata://www.youtube.com/v/BfBgWtAIbRc'

    response = self.testapp.post('/live/resource/postreq?&' + urlencode(query),
                                 raw_request)

    assert response.headers['Warcserver-Source-Coll'] == 'live'

    self._check_uri_date(response, metadata_url, True)

    assert response.headers['Link'] == MementoUtils.make_link(metadata_url, 'original')
    assert response.headers['Memento-Datetime'] != ''

    assert b'WARC-Type: metadata' in response.body
    assert b'Content-Type: application/vnd.youtube-dl_formats+json' in response.body
def test_agg_select_local_postreq(self):
    """POST a raw GET request for iana.org with a fixed ``closest`` value
    and expect the local ``iana.cdxj`` source to be selected."""
    raw_request = """\
GET / HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: iana.org
"""
    resolved_url = 'http://www.iana.org/'

    response = self.testapp.post(
        '/many/resource/postreq?url=http://iana.org/&closest=20140126200624',
        raw_request)

    assert response.headers['Warcserver-Source-Coll'] == 'local:iana.cdxj'

    self._check_uri_date(response, resolved_url, '2014-01-26T20:06:24Z')

    assert response.headers['Link'] == MementoUtils.make_link(resolved_url, 'original')
    assert response.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

    expected_errors = {
        "rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"
    }
    assert json.loads(response.headers['ResErrors']) == expected_errors
def test_agg_select_live(self):
    """Request vvork.com with ``closest=now`` and expect the live source."""
    target_url = 'http://vvork.com/'

    response = self.testapp.get('/many/resource?url=http://vvork.com/&closest=now')
    headers = response.headers

    assert headers['Warcserver-Source-Coll'] == 'live'

    self._check_uri_date(response, target_url, True)

    assert headers['Link'] == MementoUtils.make_link(target_url, 'original')
    assert headers['Memento-Datetime'] != ''

    assert 'ResErrors' not in headers
def test_url_agnost(self):
    """Register two WARC paths in the fake Redis hash ``test:foo:warc`` and
    check that the ``urlagnost`` collection resolves the URL-agnostic
    revisit."""
    f = FakeStrictRedis.from_url('redis://localhost/2')
    for warc_name in ('example-url-agnostic-revisit.warc.gz',
                      'example-url-agnostic-orig.warc.gz'):
        f.hset('test:foo:warc', warc_name, TEST_WARC_PATH + warc_name)

    # The query separator must be a literal '&param.arg'; it had been
    # corrupted into a pilcrow ('¶m.arg') by HTML-entity decoding of '&para'.
    resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')

    assert resp.status_int == 200
    # NOTE(review): the expected URL had been replaced by an e-mail
    # obfuscation placeholder ('[email protected]'). It is restored here as
    # 'test@example.com' -- confirm against the target URI stored in
    # example-url-agnostic-revisit.warc.gz.
    assert resp.headers['Link'] == MementoUtils.make_link('http://test@example.com/', 'original')
    assert resp.headers['Warcserver-Source-Coll'] == 'url-agnost'
    assert resp.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
def test_agg_select_local(self):
    """Request iana.org with a fixed ``closest`` value and expect the local
    ``iana.cdxj`` source, with a NotFoundException reported for 'rhiz'."""
    resolved_url = 'http://www.iana.org/'

    response = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')

    assert response.headers['Warcserver-Source-Coll'] == 'local:iana.cdxj'

    self._check_uri_date(response, resolved_url, '2014-01-26T20:06:24Z')

    assert response.headers['Link'] == MementoUtils.make_link(resolved_url, 'original')
    assert response.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

    expected_errors = {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
    assert json.loads(response.headers['ResErrors']) == expected_errors
def test_agg_select_local(self):
    """Request iana.org with a fixed ``closest`` value and expect the local
    ``iana.cdxj`` source; the 'rhiz' error is expected to cite the https
    form of the webenact URL."""
    resolved_url = 'http://www.iana.org/'

    response = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')

    assert response.headers['Warcserver-Source-Coll'] == 'local:iana.cdxj'

    self._check_uri_date(response, resolved_url, '2014-01-26T20:06:24Z')

    assert response.headers['Link'] == MementoUtils.make_link(resolved_url, 'original')
    assert response.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

    expected_errors = {"rhiz": "NotFoundException('https://webenact.rhizome.org/vvork/http://iana.org/',)"}
    assert json.loads(response.headers['ResErrors']) == expected_errors
def send_redirect(self, new_path, url_parts, urlrewriter):
    """Build a 307 redirect to the same URL with its path replaced.

    :param new_path: path substituted into ``url_parts``
    :param url_parts: 5-tuple (scheme, netloc, path, query, fragment)
    :param urlrewriter: rewriter whose ``rewrite`` yields the redirect target
    :return: the redirect response; carries an 'original' ``Link`` header
        when ``self.enable_memento`` is set
    """
    scheme, netloc, _replaced_path, query, frag = url_parts
    url = urlunsplit((scheme, netloc, new_path, query, frag))

    redirect = WbResponse.redir_response(urlrewriter.rewrite(url),
                                         '307 Temporary Redirect')

    if self.enable_memento:
        redirect.status_headers['Link'] = MementoUtils.make_link(url, 'original')

    return redirect
def test_agg_seq_fallback_1(self):
    """Request an httpbin status URL through ``/fallback`` and expect the
    live source to answer."""
    target_url = 'http://httpbin.org/status/200'

    response = self.testapp.get('/fallback/resource?url=http://httpbin.org/status/200')

    assert response.headers['Warcserver-Source-Coll'] == 'live'

    self._check_uri_date(response, target_url, True)

    assert response.headers['Link'] == MementoUtils.make_link(target_url, 'original')

    assert b'HTTP/1.1 200 OK' in response.body

    assert 'ResErrors' not in response.headers
def test_agg_seq_fallback_2(self):
    """Request www.example.com through ``/fallback`` and expect the
    'example' source with a fixed capture date."""
    resolved_url = 'http://example.com/'

    response = self.testapp.get('/fallback/resource?url=http://www.example.com/')

    assert response.headers['Warcserver-Source-Coll'] == 'example'

    self._check_uri_date(response, resolved_url, '2016-02-25T04:23:29Z')

    assert response.headers['Link'] == MementoUtils.make_link(resolved_url, 'original')
    assert response.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'

    assert b'HTTP/1.1 200 OK' in response.body

    assert 'ResErrors' not in response.headers
def test_agg_select_mem_1(self):
    """Request vvork.com closest to 20141001 and expect the 'rhiz' source."""
    resolved_url = 'http://www.vvork.com/'

    response = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')

    assert response.headers['Warcserver-Source-Coll'] == 'rhiz'

    self._check_uri_date(response, resolved_url, '2014-10-06T18:43:57Z')

    assert b'HTTP/1.1 200 OK' in response.body

    assert response.headers['Link'] == MementoUtils.make_link(resolved_url, 'original')
    assert response.headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT'

    assert 'ResErrors' not in response.headers
def test_agg_select_mem_2(self):
    """Request vvork.com closest to 20151231 and expect the 'ia' source."""
    resolved_url = 'http://vvork.com/'

    response = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')

    assert response.headers['Warcserver-Source-Coll'] == 'ia'

    self._check_uri_date(response, resolved_url, '2016-01-10T13:48:55Z')

    assert b'HTTP/1.1 200 OK' in response.body

    assert response.headers['Link'] == MementoUtils.make_link(resolved_url, 'original')
    assert response.headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT'

    assert 'ResErrors' not in response.headers
def test_live_post_resource(self):
    """POST a form field to httpbin through ``/live/resource`` and check
    that it is echoed back."""
    target_url = 'http://httpbin.org/post'
    form_fields = OrderedDict([('foo', 'bar')])

    response = self.testapp.post('/live/resource?url=http://httpbin.org/post',
                                 form_fields)

    assert response.headers['Warcserver-Source-Coll'] == 'live'

    self._check_uri_date(response, target_url, True)

    assert response.headers['Link'] == MementoUtils.make_link(target_url, 'original')
    assert response.headers['Memento-Datetime'] != ''

    assert b'HTTP/1.1 200 OK' in response.body
    assert b'"foo": "bar"' in response.body

    assert 'ResErrors' not in response.headers
def test_live_resource(self):
    """GET httpbin through ``/live/resource`` with an extra request header
    and check the Memento headers and echoed query argument."""
    target_url = 'http://httpbin.org/get?foo=bar'
    request_headers = {'foo': 'bar'}

    response = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar',
                                headers=request_headers)

    assert response.headers['Warcserver-Source-Coll'] == 'live'

    self._check_uri_date(response, target_url, True)

    assert response.headers['Link'] == MementoUtils.make_link(target_url, 'original')
    assert response.headers['Memento-Datetime'] != ''

    assert b'HTTP/1.1 200 OK' in response.body
    assert b'"foo": "bar"' in response.body

    assert 'ResErrors' not in response.headers
def test_live_video_loader(self):
    """GET a video URL with the youtube-dl content type and expect a WARC
    metadata record. Skipped when ``youtube_dl`` is not importable."""
    pytest.importorskip('youtube_dl')

    query = {
        'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
        'content_type': 'application/vnd.youtube-dl_formats+json',
    }
    metadata_url = 'metadata://www.youtube.com/v/BfBgWtAIbRc'

    response = self.testapp.get('/live/resource', params=query)

    assert response.headers['Warcserver-Source-Coll'] == 'live'

    self._check_uri_date(response, metadata_url, True)

    assert response.headers['Link'] == MementoUtils.make_link(metadata_url, 'original')
    assert response.headers['Memento-Datetime'] != ''

    assert b'WARC-Type: metadata' in response.body
    assert b'Content-Type: application/vnd.youtube-dl_formats+json' in response.body
def test_agg_post_resolve_fallback(self):
    """POST two form fields through ``/fallback/resource`` and expect the
    'post' source to answer with the echoed fields."""
    target_url = 'http://httpbin.org/post'
    form_fields = OrderedDict([('foo', 'bar'), ('test', 'abc')])

    response = self.testapp.post('/fallback/resource?url=http://httpbin.org/post',
                                 form_fields)

    assert response.headers['Warcserver-Source-Coll'] == 'post'

    self._check_uri_date(response, target_url, True)

    assert response.headers['Link'] == MementoUtils.make_link(target_url, 'original')

    body = response.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body
    assert b'"test": "abc"' in body
    assert b'"url": "http://httpbin.org/post"' in body

    assert 'ResErrors' not in response.headers
def test_agg_select_local_postreq(self):
    """POST a raw GET request for iana.org with a fixed ``closest`` value
    and expect the local ``iana.cdxj`` source to be selected."""
    raw_request = """\
GET / HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: iana.org
"""
    resolved_url = 'http://www.iana.org/'
    expected_errors = {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}

    response = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624',
                                 raw_request)
    headers = response.headers

    assert headers['Warcserver-Source-Coll'] == 'local:iana.cdxj'

    self._check_uri_date(response, resolved_url, '2014-01-26T20:06:24Z')

    assert headers['Link'] == MementoUtils.make_link(resolved_url, 'original')
    assert headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

    assert json.loads(headers['ResErrors']) == expected_errors
def _test_warc_write(self, recorder_app, host, path, other_params='', link_url=''):
    """Send a templated POST through ``recorder_app`` and return the response.

    One queued write is flushed if the recorder's write queue is non-empty.
    The ``Link`` header is compared against ``link_url`` or, when that is
    empty, against the unquoted request URL.
    """
    url = 'http://' + host + path
    req_url = '/live/resource/postreq?url=' + url + other_params
    payload = general_req_data.format(host=host, path=path).encode('utf-8')

    client = webtest.TestApp(recorder_app)
    result = client.post(req_url, payload)

    queue_has_items = not recorder_app.write_queue.empty()
    if queue_has_items:
        recorder_app._write_one()

    assert result.headers['Warcserver-Source-Coll'] == 'live'

    if not link_url:
        link_url = unquote(url)

    expected_link = MementoUtils.make_link(link_url, 'original')
    assert result.headers['Link'] == expected_link
    assert result.headers['Memento-Datetime'] != ''

    return result
def test_agg_local_revisit(self):
    """Fetch a revisit from the local source; verify the WARC record headers
    in the body and the Memento headers on the response."""
    request_path = '/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local'
    result = self.testapp.get(request_path)

    assert result.headers['Warcserver-Source-Coll'] == 'local:dupes.cdxj'

    body_stream = BytesIO(result.body)
    record_headers = StatusAndHeadersParser(['WARC/1.0']).parse(body_stream)

    assert record_headers.get_header('WARC-Target-URI') == 'http://example.com'
    assert record_headers.get_header('WARC-Date') == '2014-01-27T17:12:51Z'
    assert record_headers.get_header('WARC-Refers-To-Target-URI') == 'http://example.com'
    assert record_headers.get_header('WARC-Refers-To-Date') == '2014-01-27T17:12:00Z'

    expected_link = MementoUtils.make_link('http://example.com', 'original')
    assert result.headers['Link'] == expected_link
    assert result.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

    for marker in (b'HTTP/1.1 200 OK', b'<!doctype html>'):
        assert marker in result.body

    assert 'ResErrors' not in result.headers
def make_timemap(self, wb_url, res, full_prefix, output):
    """Convert an upstream timemap response into a text ``WbResponse``.

    ``wb_url.type`` is switched to ``wb_url.QUERY`` as a side effect.

    :param wb_url: parsed archival URL; ``url`` and ``mod`` are read
    :param res: upstream response exposing ``headers``, ``text``,
        ``status_code`` and ``reason``
    :param full_prefix: prefix passed to ``_get_timegate_timemap``
    :param output: requested output format; ``'link'`` triggers wrapping of
        a 200 response with the timemap header
    :return: ``WbResponse.text_response(...)`` carrying the (possibly
        wrapped) text, the upstream content type and the derived status
    """
    wb_url.type = wb_url.QUERY

    content_type = res.headers.get('Content-Type')

    # Read the body once and reuse it instead of re-reading ``res.text``.
    text = res.text

    # Previously ``status`` was left unbound (UnboundLocalError) when the
    # response had a body but a falsy status code.
    # NOTE(review): '200 OK' is assumed to be the right fallback here --
    # confirm it matches the default of WbResponse.text_response.
    status = '200 OK'

    if not text:
        status = '404 Not Found'

    elif res.status_code:
        status = str(res.status_code) + ' ' + res.reason

        if res.status_code == 200 and output == 'link':
            timegate, timemap = self._get_timegate_timemap(
                wb_url.url, full_prefix, wb_url.mod)

            text = MementoUtils.wrap_timemap_header(
                wb_url.url, timegate, timemap, text)

    return WbResponse.text_response(text,
                                    content_type=content_type,
                                    status=status)
def test_agg_live_postreq(self):
    """POST a raw GET request with ``closest=now`` through
    ``/many/resource/postreq``; the live source should answer and the 'rhiz'
    source should report a NotFoundException."""
    raw_request = """\
GET /get?foo=bar HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: httpbin.org
"""
    target_url = 'http://httpbin.org/get?foo=bar'

    result = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now',
                               raw_request)
    headers = result.headers

    assert headers['Warcserver-Source-Coll'] == 'live'

    self._check_uri_date(result, target_url, True)

    assert headers['Link'] == MementoUtils.make_link(target_url, 'original')
    assert headers['Memento-Datetime'] != ''

    for marker in (b'HTTP/1.1 200 OK', b'"foo": "bar"'):
        assert marker in result.body

    # The exact error text is not pinned; only its leading part is checked.
    rhiz_error = json.loads(headers['ResErrors'])['rhiz']
    assert "NotFoundException('http://webenact.rhizome.org/vvork/" in rhiz_error
def links_to_cdxobject(self, link_header, def_name):
    """Yield one ``CDXObject`` per memento found in a ``Link`` header.

    :param link_header: header value handed to ``MementoUtils.parse_links``
    :param def_name: second argument forwarded to ``parse_links``
    :return: generator of CDX objects with ``urlkey``, ``timestamp``,
        ``url``, ``mem_rel``, ``memento_url`` and ``load_url`` set
    """
    parsed = MementoUtils.parse_links(link_header, def_name)

    original = parsed['original']['url']
    key = canonicalize(original)

    for memento in parsed['mementos']:
        timestamp = http_date_to_timestamp(memento['datetime'])

        cdx = CDXObject()
        cdx['urlkey'] = key
        cdx['timestamp'] = timestamp
        cdx['url'] = original
        cdx['mem_rel'] = memento.get('rel', '')
        cdx['memento_url'] = memento['url']
        cdx['load_url'] = self._get_replay_url(cdx['timestamp'], original)

        yield cdx
def test_live_video_loader_post(self):
    """POST a raw request for a video URL using the youtube-dl content type;
    the response body should contain a WARC metadata record. Skipped when
    ``youtube_dl`` cannot be imported."""
    pytest.importorskip('youtube_dl')

    raw_request = """\
GET /v/BfBgWtAIbRc HTTP/1.1
accept-encoding: gzip, deflate
accept: */*
host: www.youtube.com\
"""

    params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
              'content_type': 'application/vnd.youtube-dl_formats+json'}
    request_path = '/live/resource/postreq?&' + urlencode(params)

    result = self.testapp.post(request_path, raw_request)

    assert result.headers['Warcserver-Source-Coll'] == 'live'

    self._check_uri_date(result, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True)

    expected_link = MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
    assert result.headers['Link'] == expected_link
    assert result.headers['Memento-Datetime'] != ''

    for marker in (b'WARC-Type: metadata',
                   b'Content-Type: application/vnd.youtube-dl_formats+json'):
        assert marker in result.body
def to_link(cdx_iter, fields, params):
    """Render a CDX iterator as a link-format timemap.

    :param cdx_iter: CDX iterator passed to ``MementoUtils.make_timemap``
    :param fields: accepted but not used by this function
    :param params: forwarded to ``MementoUtils.make_timemap``
    :return: tuple of (content type, timemap)
    """
    timemap = MementoUtils.make_timemap(cdx_iter, params)
    return 'application/link-format', timemap
def __call__(self, cdx, params):
    """Load the record for ``cdx`` and return ``(headers, body_iterator)``.

    Returns ``(None, None)`` when ``load_resource`` yields nothing. When the
    loaded entry has no WARC headers, the stream is passed through and the
    ``Link`` / ``Memento-Datetime`` (and, if not compressing,
    ``Content-Length``) values are copied from ``other_headers``. Otherwise
    the Memento headers are derived from the WARC headers and the serialized
    WARC headers are prepended to the stream. With ``compress=gzip`` in
    ``params`` the body iterator is gzip-wrapped and no content length is set.
    """
    loaded = self.load_resource(cdx, params)
    if not loaded:
        return None, None

    use_gzip = params.get('compress') == 'gzip'

    warc_headers, other_headers, stream = loaded

    source = self._get_source_id(cdx)

    out_headers = {
        'Warcserver-Type': 'warc',
        'Content-Type': 'application/warc-record',
    }

    # Must be set on the cdx before it is serialized below.
    if params.get('recorder_skip'):
        out_headers['Recorder-Skip'] = '1'
        cdx['recorder_skip'] = '1'

    out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
    out_headers['Warcserver-Source-Coll'] = to_native_str(source)

    if not warc_headers:
        if other_headers:
            out_headers['Link'] = other_headers.get('Link')
            out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
            if not use_gzip:
                out_headers['Content-Length'] = other_headers.get('Content-Length')

        return out_headers, StreamIter(stream, closer=call_release_conn)

    target_uri = warc_headers.get_header('WARC-Target-URI')

    out_headers['WARC-Target-URI'] = target_uri
    out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

    warc_date = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
    out_headers['Memento-Datetime'] = datetime_to_http_date(warc_date)

    header_bytes = warc_headers.to_bytes()

    # The result of _set_content_len was only consumed by a disabled
    # chunked-encoding fallback, so it is not kept.
    if not use_gzip:
        self._set_content_len(warc_headers.get_header('Content-Length'),
                              out_headers,
                              len(header_bytes))

    body_iter = StreamIter(stream,
                           header1=header_bytes,
                           header2=other_headers,
                           closer=call_release_conn)

    if use_gzip:
        body_iter = compress_gzip_iter(body_iter)
        out_headers['Content-Encoding'] = 'gzip'

    return out_headers, body_iter
def render_content(self, wb_url, kwargs, environ):
    """Fetch, rewrite and return the archived content for ``wb_url``.

    :param wb_url: archival URL string; ``#`` is escaped before parsing
        into a ``WbUrl``
    :param kwargs: routing arguments (``coll`` is read here; the rest is
        forwarded to helpers)
    :param environ: WSGI environ; several ``pywb.*`` / ``wsgi.*`` keys are
        read and written
    :return: a ``WbResponse`` -- a custom response, a 307 redirect, a JSON
        title response for history-page requests, or the rewritten content
    :raises UpstreamException: when the upstream request returns a status
        code >= 400
    """
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    # A forwarded protocol header (or the configured forced scheme)
    # overrides the WSGI url scheme.
    proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

    if proto:
        environ['wsgi.url_scheme'] = proto

    # A history-page request replaces the target URL and is always
    # treated as ajax.
    history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
    if history_page:
        wb_url.url = history_page
        is_ajax = True
    else:
        is_ajax = self.is_ajax(environ)

    is_timegate = self._check_accept_dt(wb_url, environ)

    host_prefix = self.get_host_prefix(environ)
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix
    environ['pywb.host_prefix'] = host_prefix
    pywb_static_prefix = host_prefix + environ.get(
        'pywb.app_prefix', '') + environ.get('pywb.static_prefix', '/static/')
    is_proxy = ('wsgiprox.proxy_host' in environ)

    response = self.handle_custom_response(environ, wb_url, full_prefix,
                                           host_prefix, kwargs)

    if response:
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy)

    # Proxy mode uses an identity rewriter and never frames the replay.
    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False

    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix,
                                  pywb_static_prefix=pywb_static_prefix)

        framed_replay = self.framed_replay

    # A URL with an empty path is redirected to the same URL with '/'.
    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        return self.send_redirect('/', url_parts, urlrewriter)

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(
        inputreq, wb_url)

    # Cookie headers are only looked up when a tracker is configured and
    # a cookie key is available.
    setcookie_headers = None
    cookie_key = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        if cookie_key:
            res = self.cookie_tracker.get_cookie_headers(
                wb_url.url, urlrewriter, cookie_key,
                environ.get('HTTP_COOKIE', ''))
            inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    if r.status_code >= 400:
        # Best-effort read of the upstream error body; the raw stream is
        # always closed afterwards.
        error = None
        try:
            error = r.raw.read()
        except Exception:
            pass
        finally:
            no_except_close(r.raw)

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        raise UpstreamException(r.status_code,
                                url=wb_url.url,
                                details=details)

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    cdx_url_parts = urlsplit(cdx['url'])

    if cdx_url_parts.path.endswith(
            '/') and not url_parts.path.endswith('/'):
        # add trailing slash
        new_path = url_parts.path + '/'

        # the upstream stream is not consumed on this path, so close it
        no_except_close(r.raw)

        return self.send_redirect(new_path, url_parts, urlrewriter)

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    # cdx['urlkey'] = urlkey
    # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
    # cdx['url'] = target_uri

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redir to exact, redir if url or ts are different
    if self.redirect_to_exact:
        if (set_content_loc
                or (wb_url.timestamp != cdx.get('timestamp')
                    and not cdx.get('is_live'))):

            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect')
            if self.enable_memento:
                # Timegate redirects outside proxy mode get the full set
                # of memento links; others only the 'original' link.
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy)

                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(
                        target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs, record)

    # A range response switches the modifier to 'id_'.
    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url,
            full_prefix,
            host_prefix,
            top_url,
            environ,
            framed_replay,
            coll=kwargs.get('coll', ''),
            replay_mod=self.replay_mod,
            config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker and cookie_key:
        # skip add cookie if service worker is not 200
        # it seems cookie headers from service workers are not applied, so don't update in cache
        if wb_url.mod == 'sw_':
            cookie_key = None

        cookie_rewriter = self.cookie_tracker.get_rewriter(
            urlrewriter, cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter,
                        head_insert_func, cdx, environ)

    status_headers, gen, is_rw = result

    # History-page requests only return the page title as JSON; the title
    # falls back to the request header and then to the page URL.
    if history_page:
        title = DefaultRewriter._extract_title(gen)
        if not title:
            title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

        if not title:
            title = history_page

        self._add_history_page(cdx, kwargs, title)
        return WbResponse.json_response({'title': title})

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    # A status line without a space gets ' None' appended as its second part.
    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix, memento_dt,
                                cdx['timestamp'], status_headers,
                                is_timegate, is_proxy,
                                cdx.get('source-coll'))

        set_content_loc = True

    if set_content_loc and not self.redirect_to_exact:
        status_headers.headers.append(
            ('Content-Location',
             urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    return response
def __call__(self, cdx, params):
    """Resolve ``cdx`` to a record and return ``(out_headers, iterator)``.

    ``(None, None)`` is returned when ``load_resource`` produces no entry.
    Entries without WARC headers are streamed through unchanged with header
    values copied from ``other_headers``; entries with WARC headers get
    ``WARC-Target-URI``, ``Link`` and ``Memento-Datetime`` derived from the
    record and the serialized WARC headers prepended to the stream. When
    ``params['compress'] == 'gzip'`` the iterator is gzip-compressed and no
    content length is computed.
    """
    entry = self.load_resource(cdx, params)
    if not entry:
        return None, None

    gzip_requested = params.get('compress') == 'gzip'
    warc_headers, other_headers, stream = entry
    source_id = self._get_source_id(cdx)

    out_headers = {}
    out_headers['Warcserver-Type'] = 'warc'
    out_headers['Content-Type'] = 'application/warc-record'

    skip_recording = params.get('recorder_skip')
    if skip_recording:
        out_headers['Recorder-Skip'] = '1'
        # recorded on the cdx so it is part of the serialized cdxj below
        cdx['recorder_skip'] = '1'

    cdxj_line = cdx.to_cdxj().rstrip()
    out_headers['Warcserver-Cdx'] = to_native_str(cdxj_line)
    out_headers['Warcserver-Source-Coll'] = to_native_str(source_id)

    if not warc_headers:
        # No WARC headers: pass the stream through as-is.
        if other_headers:
            copied = ['Link', 'Memento-Datetime']
            if not gzip_requested:
                copied.append('Content-Length')
            for header_name in copied:
                out_headers[header_name] = other_headers.get(header_name)

        passthrough = StreamIter(stream, closer=call_release_conn)
        return out_headers, passthrough

    target_uri = warc_headers.get_header('WARC-Target-URI')
    out_headers['WARC-Target-URI'] = target_uri
    out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

    record_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
    out_headers['Memento-Datetime'] = datetime_to_http_date(record_dt)

    warc_headers_buff = warc_headers.to_bytes()

    if gzip_requested:
        lenset = False
    else:
        lenset = self._set_content_len(
            warc_headers.get_header('Content-Length'),
            out_headers,
            len(warc_headers_buff))

    streamiter = StreamIter(stream,
                            header1=warc_headers_buff,
                            header2=other_headers,
                            closer=call_release_conn)

    if gzip_requested:
        streamiter = compress_gzip_iter(streamiter)
        out_headers['Content-Encoding'] = 'gzip'

    # ``lenset`` is not consulted: the chunked-encoding fallback that
    # used it is disabled.

    return out_headers, streamiter
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
                       status_headers, is_timegate, is_proxy, coll=None,
                       pref_applied=None, mod=None, is_memento=True):
    """Append the Memento headers to the given StatusAndHeaders instance.

    :param str url: The URI-R being rewritten
    :param str full_prefix: The replay prefix
    :param str|None memento_dt: The memento datetime for the URI-R; derived
        from ``memento_ts`` when empty
    :param str memento_ts: The memento timestamp
    :param warcio.StatusAndHeaders status_headers: headers are appended to
        its ``headers`` list
    :param bool is_timegate: adds ``accept-datetime`` to ``Vary`` when set
    :param bool is_proxy: in proxy mode only the memento link is emitted and
        it points at ``url`` itself
    :param str|None coll: The collection the URI-R is from
    :param str|None pref_applied: when set, emitted as
        ``Preference-Applied`` and ``Prefer`` is added to ``Vary``
    :param str|None mod: The rewrite modifier; ``self.replay_mod`` is used
        for the memento URL when empty
    :param bool is_memento: whether ``Memento-Datetime`` is emitted
    :rtype: None
    """
    replay_mod = mod or self.replay_mod

    # Derive the HTTP date from the timestamp when it was not supplied.
    if memento_ts and not memento_dt:
        memento_dt = timestamp_to_http_date(memento_ts)

    memento_url = None
    if memento_dt:
        if is_memento:
            status_headers.headers.append(('Memento-Datetime', memento_dt))

        if is_proxy:
            memento_url = url
        else:
            memento_url = full_prefix + memento_ts + replay_mod + '/' + url

    timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix, mod)

    links = []
    if not is_proxy:
        related = (
            (url, 'original'),
            (timegate_url, 'timegate'),
            (timemap_url, 'timemap'),
        )
        for target, rel in related:
            links.append(MementoUtils.make_link(target, rel))

    if memento_dt:
        links.append(MementoUtils.make_memento_link(memento_url, 'memento',
                                                    memento_dt, coll))

    status_headers.headers.append(('Link', ', '.join(links)))

    vary_values = []
    if is_timegate:
        vary_values.append('accept-datetime')

    if pref_applied:
        vary_values.append('Prefer')
        status_headers.headers.append(('Preference-Applied', pref_applied))

    if vary_values:
        status_headers.headers.append(('Vary', ', '.join(vary_values)))
def render_content(self, wb_url, kwargs, environ):
    """Fetch, rewrite and return the archived content for ``wb_url``.

    :param wb_url: archival URL string; ``#`` is escaped before parsing
        into a ``WbUrl``
    :param kwargs: routing arguments forwarded to helpers
    :param environ: WSGI environ
    :return: a ``WbResponse`` -- a custom response, a 307 redirect, or the
        rewritten content
    :raises UpstreamException: when the upstream request returns a status
        code >= 400
    """
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)
    is_timegate = self._check_accept_dt(wb_url, environ)

    host_prefix = self.get_host_prefix(environ)
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix

    is_proxy = ('wsgiprox.proxy_host' in environ)

    response = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix,
                                           kwargs)

    if response:
        return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

    # Proxy mode uses an identity rewriter and never frames the replay.
    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False

    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        framed_replay = self.framed_replay

    # A URL with an empty path is redirected to the same URL with '/'.
    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        scheme, netloc, path, query, frag = url_parts
        path = '/'
        url = urlunsplit((scheme, netloc, path, query, frag))
        resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                         '307 Temporary Redirect')

        if self.enable_memento:
            resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

        return resp

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

    setcookie_headers = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
        inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    if r.status_code >= 400:
        # Best-effort read of the upstream error body. Only ordinary
        # exceptions are swallowed (a bare ``except:`` would also hide
        # KeyboardInterrupt / SystemExit), and the raw stream is closed
        # even when the read itself fails.
        error = None
        try:
            error = r.raw.read()
        except Exception:
            pass
        finally:
            try:
                r.raw.close()
            except Exception:
                pass

        if error:
            error = error.decode('utf-8')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        raise UpstreamException(r.status_code, url=wb_url.url, details=details)

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redir to exact, redir if url or ts are different
    if self.redirect_to_exact:
        if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):

            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
            if self.enable_memento:
                # Timegate redirects outside proxy mode get the full set
                # of memento links; others only the 'original' link.
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy)

                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

            return resp

    self._add_custom_params(cdx, r.headers, kwargs)

    # A range response switches the modifier to 'id_'.
    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    is_ajax = self.is_ajax(environ)

    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.
                            create_insert_func(wb_url,
                                               full_prefix,
                                               host_prefix,
                                               top_url,
                                               environ,
                                               framed_replay,
                                               config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker:
        cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                           cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)

    status_headers, gen, is_rw = result

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    # A status line without a space gets ' None' appended as its second part.
    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix,
                                memento_dt, cdx['timestamp'], status_headers,
                                is_timegate, is_proxy, cdx.get('source-coll'))

        set_content_loc = True

    if set_content_loc and not self.redirect_to_exact:
        status_headers.headers.append(('Content-Location',
                                       urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                               url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    return response
def render_content(self, wb_url, kwargs, environ):
    """Fetch, rewrite and return the replay response for an archival url.

    The request is resolved in stages, any of which may return early:
    proxy-mode preflight, ``Prefer``-based redirect, timemap / query /
    custom (top-frame) responses, path-normalizing redirects, the
    redirect to the exact capture timestamp and, finally, the rewritten
    record itself.

    :param wb_url: archival url string; ``#`` is percent-encoded before
        it is parsed into a ``WbUrl``
    :param kwargs: per-request options; the keys read here are
        ``output``, ``no_timegate_check``, ``coll``, ``metadata`` and
        ``cache`` (the mapping is also forwarded to several helpers)
    :param environ: WSGI environ; ``HTTP_X_WOMBAT_HISTORY_PAGE`` is
        popped from it and ``pywb_proxy_magic`` is set in proxy mode
    :return: the response object for this request (a ``WbResponse`` on
        the main replay path)
    :raises NotFoundException: upstream request returned 404
    :raises UpstreamException: upstream request returned any other
        status >= 400
    """
    wb_url = wb_url.replace('#', '%23')
    wb_url = WbUrl(wb_url)

    # A history-page request replaces the url being loaded and is always
    # handled like an ajax request (no head insert); it ends with a JSON
    # title response further below.
    history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
    if history_page:
        wb_url.url = history_page
        is_ajax = True
    else:
        is_ajax = self.is_ajax(environ)

    is_timegate = self._check_accept_dt(wb_url, environ)

    self.prepare_env(environ)

    host_prefix = environ['pywb.host_prefix']
    rel_prefix = self.get_rel_prefix(environ)
    full_prefix = host_prefix + rel_prefix
    pywb_static_prefix = environ['pywb.static_prefix'] + '/'
    is_proxy = ('wsgiprox.proxy_host' in environ)

    # if OPTIONS in proxy mode, just generate the proxy response
    if is_proxy and self.is_preflight(environ):
        return WbResponse.options_response(environ)

    if self.use_js_obj_proxy:
        content_rw = self.js_proxy_rw
    else:
        content_rw = self.default_rw

    # no redirects if in proxy
    redirect_to_exact = self.redirect_to_exact and not is_proxy

    # Check Prefer
    pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                  content_rw, is_proxy)

    response = None
    keep_frame_response = False

    # prefer overrides custom response?
    if pref_mod is not None:
        # fast-redirect to preferred
        if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
            new_url = full_prefix + wb_url.to_str(mod=pref_mod)
            headers = [('Preference-Applied', pref_applied),
                       ('Vary', 'Prefer')]

            return WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect',
                                             headers=headers)
        else:
            wb_url.mod = pref_mod
    else:
        if kwargs.get('output'):
            response = self.handle_timemap(wb_url, kwargs, full_prefix)

        elif wb_url.is_query():
            response = self.handle_query(environ, wb_url, kwargs,
                                         full_prefix)

        else:
            response = self.handle_custom_response(environ, wb_url,
                                                   full_prefix,
                                                   host_prefix,
                                                   kwargs)

            # A custom response may need to be held back until the cdx
            # of the actual capture is known (timegate or exact-redirect).
            keep_frame_response = ((not kwargs.get('no_timegate_check') and
                                    is_timegate and not is_proxy) or
                                   redirect_to_exact)

    if response and not keep_frame_response:
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy)

    if is_proxy:
        environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
        urlrewriter = IdentityUrlRewriter(wb_url, '')
        framed_replay = False

    else:
        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix,
                                  pywb_static_prefix=pywb_static_prefix)

        framed_replay = self.framed_replay

    # A url with an empty path is redirected to the same url with '/'
    url_parts = urlsplit(wb_url.url)
    if not url_parts.path:
        return self.send_redirect('/', url_parts, urlrewriter)

    self.unrewrite_referrer(environ, full_prefix)

    urlkey = canonicalize(wb_url.url)

    inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

    inputreq.include_method_query(wb_url.url)

    range_start, range_end, skip_record = self._check_range(inputreq,
                                                            wb_url)

    setcookie_headers = None
    cookie_key = None
    if self.cookie_tracker:
        cookie_key = self.get_cookie_key(kwargs)
        if cookie_key:
            res = self.cookie_tracker.get_cookie_headers(
                wb_url.url, urlrewriter, cookie_key,
                environ.get('HTTP_COOKIE', ''))
            inputreq.extra_cookie, setcookie_headers = res

    r = self._do_req(inputreq, wb_url, kwargs, skip_record)

    if r.status_code >= 400:
        error = None
        try:
            error = r.raw.read()
        except Exception:
            # best-effort: the error body is only used for diagnostics
            pass
        finally:
            no_except_close(r.raw)

        # Decode leniently: the upstream body is arbitrary bytes, and a
        # decoding failure here must not mask the exception raised below.
        if error:
            error = error.decode('utf-8', 'replace')
        else:
            error = ''

        details = dict(args=kwargs, error=error)
        if r.status_code == 404:
            raise NotFoundException(url=wb_url.url, msg=details)

        else:
            raise UpstreamException(r.status_code, url=wb_url.url,
                                    details=details)

    cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

    cdx_url_parts = urlsplit(cdx['url'])

    if (cdx_url_parts.path.endswith('/') and
            not url_parts.path.endswith('/')):
        # add trailing slash
        new_path = url_parts.path + '/'

        no_except_close(r.raw)

        return self.send_redirect(new_path, url_parts, urlrewriter)

    # only redirect to exact if not live, otherwise set to false
    redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

    # return top-frame timegate response, with timestamp from cdx
    if (response and keep_frame_response and
            (not redirect_to_exact or not is_timegate)):
        no_except_close(r.raw)
        return self.format_response(response, wb_url, full_prefix,
                                    is_timegate, is_proxy,
                                    cdx['timestamp'])

    stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
    record = self.loader.parse_record_stream(stream,
                                             ensure_http_headers=True)

    memento_dt = r.headers.get('Memento-Datetime')
    target_uri = r.headers.get('WARC-Target-URI')

    set_content_loc = False

    # Check if Fuzzy Match
    if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
        set_content_loc = True

    # if redirect to exact timestamp (only set if not live)
    if redirect_to_exact:
        if (set_content_loc or is_timegate or
                wb_url.timestamp != cdx.get('timestamp')):
            new_url = urlrewriter.get_new_url(url=target_uri,
                                              timestamp=cdx['timestamp'],
                                              mod=wb_url.mod)

            resp = WbResponse.redir_response(new_url,
                                             '307 Temporary Redirect')
            if self.enable_memento:
                if is_timegate and not is_proxy:
                    self._add_memento_links(target_uri, full_prefix,
                                            memento_dt, cdx['timestamp'],
                                            resp.status_headers,
                                            is_timegate, is_proxy,
                                            pref_applied=pref_applied,
                                            mod=pref_mod,
                                            is_memento=False)

                else:
                    resp.status_headers['Link'] = MementoUtils.make_link(
                        target_uri, 'original')

            # The record body is not served on a redirect, so release the
            # upstream stream like the other early returns above do.
            no_except_close(r.raw)

            return resp

    self._add_custom_params(cdx, r.headers, kwargs, record)

    # a range response is served with the 'id_' modifier
    if self._add_range(record, wb_url, range_start, range_end):
        wb_url.mod = 'id_'

    if is_ajax:
        head_insert_func = None
        urlrewriter.rewrite_opts['is_ajax'] = True
    else:
        top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url,
            full_prefix,
            host_prefix,
            top_url,
            environ,
            framed_replay,
            coll=kwargs.get('coll', ''),
            replay_mod=self.replay_mod,
            metadata=kwargs.get('metadata', {}),
            config=self.config))

    cookie_rewriter = None
    if self.cookie_tracker and cookie_key:
        # skip add cookie if service worker is not 200
        # it seems cookie headers from service workers are not applied,
        # so don't update in cache
        if wb_url.mod == 'sw_':
            cookie_key = None

        cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                           cookie_key)

    urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

    result = content_rw(record, urlrewriter, cookie_rewriter,
                        head_insert_func, cdx, environ)

    status_headers, gen, is_rw = result

    if history_page:
        # Title fallback order: extracted from the rewritten content,
        # then the title request header, then the history page url.
        title = DefaultRewriter._extract_title(gen)
        if not title:
            title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

        if not title:
            title = history_page

        self._add_history_page(cdx, kwargs, title)
        return WbResponse.json_response({'title': title})

    if setcookie_headers:
        status_headers.headers.extend(setcookie_headers)

    # pad a status line that has no space (i.e. only a single token)
    if ' ' not in status_headers.statusline:
        status_headers.statusline += ' None'

    if not is_ajax and self.enable_memento:
        self._add_memento_links(cdx['url'], full_prefix,
                                memento_dt, cdx['timestamp'],
                                status_headers,
                                is_timegate, is_proxy,
                                cdx.get('source-coll'),
                                mod=pref_mod, pref_applied=pref_applied)

        set_content_loc = True

    if set_content_loc and not redirect_to_exact and not is_proxy:
        status_headers.headers.append(
            ('Content-Location',
             urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'])))

    if not is_proxy:
        self.add_csp_header(wb_url, status_headers)

    response = WbResponse(status_headers, gen)

    if is_proxy and environ.get('HTTP_ORIGIN'):
        response.add_access_control_headers(environ)

    # Long-lived caching only for 200 upstream responses, when requested
    # via kwargs and the request carries a Referer header.
    if (r.status_code == 200 and kwargs.get('cache') == 'always' and
            environ.get('HTTP_REFERER')):
        response.status_headers[
            'Cache-Control'] = 'public, max-age=31536000, immutable'

    return response
def to_link(cdx_iter, fields):
    """Produce the link-format timemap output for a cdx iterator.

    :param cdx_iter: cdx entries, handed unchanged to
        ``MementoUtils.make_timemap``
    :param fields: accepted but not used by this formatter
    :return: tuple of the ``application/link-format`` content type and
        the value returned by ``MementoUtils.make_timemap``
    """
    timemap = MementoUtils.make_timemap(cdx_iter)
    return 'application/link-format', timemap