def proxy_fetch(self, env, url):
    """Proxy mode only endpoint that handles OPTIONS requests and CORS
    fetches for the Preservation Worker.

    Due to normal cross-origin browser restrictions in proxy mode, the auto
    fetch worker cannot access the CSS rules of cross-origin style sheets and
    must re-fetch them in a manner that is CORS safe. This endpoint
    facilitates that by fetching the stylesheets for the auto fetch worker
    and then responding with their contents.

    :param dict env: The WSGI environment dictionary
    :param str url: The URL of the resource to be fetched (the value passed
        in is replaced by one derived from ``env['REQUEST_URI']``)
    :return: WbResponse that is either response to an Options request or
        the results of fetching url
    :rtype: WbResponse
    """
    if not self.is_proxy_enabled(env):
        # we are not in proxy mode so just respond with forbidden
        return WbResponse.text_response(
            'proxy mode must be enabled to use this endpoint',
            status='403 Forbidden')

    if env.get('REQUEST_METHOD') == 'OPTIONS':
        return WbResponse.options_response(env)

    # ensure full URL
    request_url = env['REQUEST_URI']

    # replace with /id_ so we do not get rewritten; only the first
    # occurrence is the endpoint prefix, any later '/proxy-fetch' belongs
    # to the URL being fetched and must be left untouched
    url = request_url.replace('/proxy-fetch', '/id_', 1)

    # update WSGI environment object
    env['REQUEST_URI'] = self.proxy_coll + url
    env['PATH_INFO'] = env['PATH_INFO'].replace(
        '/proxy-fetch', self.proxy_coll + '/id_', 1)

    # make request using normal serve_content
    response = self.serve_content(env, self.proxy_coll, url)

    # for WR
    if isinstance(response, WbResponse):
        response.add_access_control_headers(env=env)
    return response
def serve_cdx(self, environ, coll='$root'):
    """Run the upstream CDX query for a collection and stream back the result.

    The request's query string and, when ``self.query_limit`` is set, a
    ``limit`` parameter are appended to the CDX server URL.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection this CDX query is for
    :return: The WbResponse containing the results of the CDX query
    :rtype: WbResponse
    """
    def with_param(target, param):
        # '&' when the URL already carries a query part, '?' otherwise
        return target + ('&' if '?' in target else '?') + param

    cdx_url = self.rewriterapp.paths['cdx-server'].format(coll=coll)

    query_string = environ.get('QUERY_STRING')
    if query_string:
        cdx_url = with_param(cdx_url, query_string)

    if self.query_limit:
        cdx_url = with_param(cdx_url, 'limit=' + str(self.query_limit))

    try:
        upstream = requests.get(cdx_url, stream=True)
        content_type = upstream.headers.get('Content-Type')
        return WbResponse.bin_stream(StreamIter(upstream.raw),
                                     content_type=content_type)
    except Exception as e:
        # any failure talking to the CDX server is reported as a 400
        return WbResponse.text_response('Error: ' + str(e),
                                        status='400 Bad Request')
def put_custom_record(self, environ, coll):
    """PUT the request body as a custom WARC ``resource`` record.

    The body is read from ``wsgi.input`` and forwarded to the URL built from
    ``self.custom_record_path``. The ``url`` query parameter names the
    record's target URI; an optional ``timestamp`` query parameter is
    converted and sent as the ``WARC-Date`` header.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is written to
    :return: WbResponse with the JSON returned by the upstream PUT, or a
        ``400 Bad Request`` JSON error when no ``url`` parameter is given
    :rtype: WbResponse
    """
    # drain the request body; read() is repeated until it returns nothing
    body_stream = environ["wsgi.input"]
    chunks = []
    while True:
        buff = body_stream.read()
        if not buff:
            break
        chunks.append(buff)

    data = b"".join(chunks)

    params = dict(parse_qsl(environ.get("QUERY_STRING") or ""))

    rec_type = "resource"

    headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

    target_uri = params.get("url")
    if not target_uri:
        # a missing target is a client error, not a successful response
        return WbResponse.json_response({"error": "no url"},
                                        status="400 Bad Request")

    timestamp = params.get("timestamp")
    if timestamp:
        headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

    put_url = self.custom_record_path.format(url=target_uri, coll=coll,
                                             rec_type=rec_type)
    res = requests.put(put_url, headers=headers, data=data)
    return WbResponse.json_response(res.json())
def proxy_fetch(self, env, url):
    """Proxy mode only endpoint that handles OPTIONS requests and CORS
    fetches for the Preservation Worker.

    Due to normal cross-origin browser restrictions in proxy mode, the auto
    fetch worker cannot access the CSS rules of cross-origin style sheets and
    must re-fetch them in a manner that is CORS safe. This endpoint
    facilitates that by fetching the stylesheets for the auto fetch worker
    and then responding with their contents.

    :param dict env: The WSGI environment dictionary
    :param str url: The URL of the resource to be fetched (the value passed
        in is replaced by one derived from ``env['REQUEST_URI']``)
    :return: WbResponse that is either response to an Options request or
        the results of fetching url
    :rtype: WbResponse
    """
    if not self.is_proxy_enabled(env):
        # we are not in proxy mode so just respond with forbidden
        return WbResponse.text_response(
            'proxy mode must be enabled to use this endpoint',
            status='403 Forbidden')

    if env.get('REQUEST_METHOD') == 'OPTIONS':
        return WbResponse.options_response(env)

    # ensure full URL
    request_url = env['REQUEST_URI']

    # replace with /id_ so we do not get rewritten; count=1 keeps any
    # '/proxy-fetch' that appears inside the fetched URL itself intact
    url = request_url.replace('/proxy-fetch', '/id_', 1)

    # update WSGI environment object
    env['REQUEST_URI'] = self.proxy_coll + url
    env['PATH_INFO'] = env['PATH_INFO'].replace(
        '/proxy-fetch', self.proxy_coll + '/id_', 1)

    # make request using normal serve_content
    response = self.serve_content(env, self.proxy_coll, url)

    # for WR
    if isinstance(response, WbResponse):
        response.add_access_control_headers(env=env)
    return response
def serve_cdx(self, environ, coll='$root'):
    """Make the upstream CDX query for a collection and respond with the
    results of the query.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection this CDX query is for
    :return: The WbResponse containing the results of the CDX query
    :rtype: WbResponse
    """
    cdx_url = self.rewriterapp.paths['cdx-server'].format(coll=coll)

    query_string = environ.get('QUERY_STRING')
    if query_string:
        # extend an existing query part with '&', otherwise start one
        separator = '&' if '?' in cdx_url else '?'
        cdx_url = cdx_url + separator + query_string

    try:
        upstream = requests.get(cdx_url, stream=True)
        content_type = upstream.headers.get('Content-Type')
        return WbResponse.bin_stream(StreamIter(upstream.raw),
                                     content_type=content_type)
    except Exception as e:
        # any failure talking to the CDX server is reported as a 400
        return WbResponse.text_response('Error: ' + str(e),
                                        status='400 Bad Request')
def test_wbresponse_options_response():
    """options_response sets Access-Control-Allow-Origin from the request's
    Origin or Referer, and uses '*' when neither has a value."""
    cases = [
        (dict(HTTP_ORIGIN='http://example.com'), 'http://example.com'),
        (dict(HTTP_REFERER='http://example.com'), 'http://example.com'),
        (dict(), '*'),
        (dict(HTTP_ORIGIN=None), '*'),
        (dict(HTTP_REFERER=None), '*'),
    ]

    for env, allowed_origin in cases:
        res = WbResponse.options_response(env)
        assert ('Access-Control-Allow-Origin', allowed_origin) in res.status_headers.headers
def serve_static(self, environ, coll='', filepath=''):
    """Serve a static file associated with a specific collection or one of
    pywb's own static assets.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The collection the static file is associated with
    :param str filepath: The file path (relative to the collection) for the
        static asset
    :return: The WbResponse for the static asset
    :rtype: WbResponse
    """
    proxy_enabled = self.is_proxy_enabled(environ)

    is_options = environ.get('REQUEST_METHOD') == 'OPTIONS'
    if proxy_enabled and is_options:
        return WbResponse.options_response(environ)

    # collection assets live under the collection's own static directory
    environ['pywb.static_dir'] = (
        os.path.join(self.warcserver.root_dir, coll, self.static_dir)
        if coll else self.static_dir
    )

    try:
        response = self.static_handler(environ, filepath)
        if proxy_enabled:
            response.add_access_control_headers(env=environ)
        return response
    except Exception:
        # any failure while loading the asset is reported as not found
        self.raise_not_found(environ, 'static_file_not_found', filepath)
def _check_refer_redirect(self, environ): """Returns a WbResponse for a HTTP 307 redirection if the HTTP referer header is the same as the HTTP host header :param dict environ: The WSGI environment dictionary for the request :return: WbResponse HTTP 307 redirection :rtype: WbResponse """ referer = environ.get('HTTP_REFERER') if not referer: return host = environ.get('HTTP_HOST') if host not in referer: return inx = referer[1:].find('http') if not inx: inx = referer[1:].find('///') if inx > 0: inx + 1 if inx < 0: return url = referer[inx + 1:] host = referer[:inx + 1] orig_url = environ['PATH_INFO'] if environ.get('QUERY_STRING'): orig_url += '?' + environ['QUERY_STRING'] full_url = host + urljoin(url, orig_url) return WbResponse.redir_response(full_url, '307 Redirect')
def serve_coll_page(self, environ, coll='$root'):
    """Render and serve a collection's search page (search.html).

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection to serve the collections
        search page for
    :return: The WbResponse containing the collections search page
    :rtype: WbResponse
    """
    if not self.is_valid_coll(coll):
        self.raise_not_found(environ, 'coll_not_found', coll)

    self.setup_paths(environ, coll)

    coll_config = self.get_coll_config(coll)

    search_view = BaseInsertView(self.rewriterapp.jinja_env, 'search.html')

    # a non-empty script name is used as prefix with a trailing slash
    script_name = environ.get('SCRIPT_NAME', '')
    wb_prefix = script_name + '/' if script_name else script_name

    template_args = dict(wb_prefix=wb_prefix,
                         coll=coll,
                         coll_config=coll_config,
                         metadata=coll_config.get('metadata'))
    content = search_view.render_to_string(environ, **template_args)

    return WbResponse.text_response(
        content, content_type='text/html; charset="utf-8"')
def serve_listing(self, environ):
    """Serve the warcserver's fixed and dynamic routes as JSON.

    :param dict environ: The WSGI environment dictionary for the request
    :return: WbResponse whose JSON body has the keys ``fixed`` and ``dynamic``
    :rtype: WbResponse
    """
    warcserver = self.warcserver
    fixed_routes = warcserver.list_fixed_routes()
    dynamic_routes = warcserver.list_dynamic_routes()
    return WbResponse.json_response({'fixed': fixed_routes,
                                     'dynamic': dynamic_routes})
def _check_refer_redirect(self, environ): referer = environ.get('HTTP_REFERER') if not referer: return host = environ.get('HTTP_HOST') if host not in referer: return inx = referer[1:].find('http') if not inx: inx = referer[1:].find('///') if inx > 0: inx + 1 if inx < 0: return url = referer[inx + 1:] host = referer[:inx + 1] orig_url = environ['PATH_INFO'] if environ.get('QUERY_STRING'): orig_url += '?' + environ['QUERY_STRING'] full_url = host + urljoin(url, orig_url) return WbResponse.redir_response(full_url, '307 Redirect')
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy, timegate_closest_ts=None):
    """Wrap non-WbResponse content in an HTML response and add memento links.

    :param response: A WbResponse, or content to be wrapped in an HTML
        text response
    :param wb_url: The request's wb url; its ``url`` and ``timestamp`` are used
    :param full_prefix: Passed through to ``_add_memento_links``
    :param is_timegate: Passed through; memento links are flagged
        ``is_memento`` when this is false
    :param is_proxy: Passed through to ``_add_memento_links``
    :param timegate_closest_ts: Timestamp preferred over ``wb_url.timestamp``
        for a framed replay
    :return: The (possibly newly created) response
    :rtype: WbResponse
    """
    memento_ts = None

    if not isinstance(response, WbResponse):
        if self.is_framed_replay(wb_url):
            # framed replay: no charset, but remember the timestamp
            content_type = 'text/html'
            memento_ts = timegate_closest_ts or wb_url.timestamp
        else:
            # if not replay outer frame, specify utf-8 charset
            content_type = 'text/html; charset=utf-8'

        response = WbResponse.text_response(response,
                                            content_type=content_type)

    if self.enable_memento:
        status_headers = response.status_headers
        # memento links are only added to successful responses
        if status_headers.statusline.startswith('200'):
            self._add_memento_links(wb_url.url, full_prefix, None,
                                    memento_ts, status_headers,
                                    is_timegate, is_proxy,
                                    is_memento=not is_timegate)

    return response
def serve_record(self, environ, coll='$root', url=''):
    """Serve ``url`` through ``serve_content`` with recording enabled.

    Collections that are fixed warcserver routes are refused with an error
    text response.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection to record into
    :param str url: The URL to be served
    :return: WbResponse from ``serve_content`` or the error text response
    :rtype: WbResponse
    """
    if coll not in self.warcserver.list_fixed_routes():
        return self.serve_content(environ, coll, url, record=True)

    message = 'Error: Can Not Record Into Custom Collection "{0}"'.format(coll)
    return WbResponse.text_response(message)
def _check_refer_redirect(self, environ): """Returns a WbResponse for a HTTP 307 redirection if the HTTP referer header is the same as the HTTP host header :param dict environ: The WSGI environment dictionary for the request :return: WbResponse HTTP 307 redirection :rtype: WbResponse """ referer = environ.get('HTTP_REFERER') if not referer: return host = environ.get('HTTP_HOST') if host not in referer: return inx = referer[1:].find('http') if not inx: inx = referer[1:].find('///') if inx < 0: return url = referer[inx + 1:] host = referer[:inx + 1] orig_url = environ['PATH_INFO'] if environ.get('QUERY_STRING'): orig_url += '?' + environ['QUERY_STRING'] full_url = host + urljoin(url, orig_url) return WbResponse.redir_response(full_url, '307 Redirect')
def exit(self, environ=None):
    """Send SIGTERM to the uwsgi master process and return an empty JSON
    response.

    :param dict environ: The WSGI environment dictionary (unused)
    :return: WbResponse with an empty JSON object as body
    :rtype: WbResponse
    """
    # imported here so the module does not require uwsgi at import time
    import uwsgi
    import signal

    # the response is built before the signal is sent
    response = WbResponse.json_response({})
    master_pid = uwsgi.masterpid()
    os.kill(master_pid, signal.SIGTERM)
    return response
def get_wacz(self, environ, coll):
    """Package the files of a collection's archive directory into a WACZ.

    Every entry of ``collections/<coll>/archive`` is passed to ``wacz_main``
    together with ``-o /tmp/out/archive.wacz`` and, when the ``url`` query
    parameter is present, ``--url <url>``.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection to package
    :return: JSON WbResponse ``{"done": <output path>}`` on success, or
        ``{"error": <message>}`` with a 500 status when ``wacz_main`` raises
    :rtype: WbResponse
    """
    output_path = "/tmp/out/archive.wacz"

    params = dict(parse_qsl(environ.get("QUERY_STRING") or ""))

    archive_dir = os.path.join("collections", coll, "archive")
    wacz_args = [
        os.path.join(archive_dir, name) for name in os.listdir(archive_dir)
    ]
    wacz_args += ["-o", output_path]

    url = params.get("url")
    if url:
        wacz_args += ["--url", url]

    try:
        wacz_main(wacz_args)
    except Exception as e:
        print(e)
        # do not report success when the archive could not be written
        return WbResponse.json_response(
            {"error": str(e)}, status="500 Internal Server Error"
        )

    return WbResponse.json_response({"done": output_path})
def serve_coll_page(self, environ, coll='$root'):
    """Render and serve a collections search page (search.html).

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection to serve the collections
        search page for
    :return: The WbResponse containing the collections search page
    :rtype: WbResponse
    """
    if not self.is_valid_coll(coll):
        self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))

    self.setup_paths(environ, coll)

    metadata = self.get_metadata(coll)

    view = BaseInsertView(self.rewriterapp.jinja_env, 'search.html')

    # default to '' so the template never receives None as prefix when
    # SCRIPT_NAME is absent from the environment
    wb_prefix = environ.get('SCRIPT_NAME', '')
    if wb_prefix:
        wb_prefix += '/'

    content = view.render_to_string(environ,
                                    wb_prefix=wb_prefix,
                                    metadata=metadata,
                                    coll=coll)

    return WbResponse.text_response(
        content, content_type='text/html; charset="utf-8"')
def serve_static(self, environ, coll='', filepath=''):
    """Serve a static file associated with a specific collection or one of
    pywb's own static assets.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The collection the static file is associated with
    :param str filepath: The file path (relative to the collection) for the
        static asset
    :return: The WbResponse for the static asset
    :rtype: WbResponse
    """
    proxy_enabled = self.is_proxy_enabled(environ)
    if proxy_enabled and environ.get('REQUEST_METHOD') == 'OPTIONS':
        return WbResponse.options_response(environ)

    if coll:
        path = os.path.join(self.warcserver.root_dir, coll, self.static_dir)
    else:
        path = self.static_dir

    environ['pywb.static_dir'] = path

    try:
        response = self.static_handler(environ, filepath)
        if proxy_enabled:
            response.add_access_control_headers(env=environ)
        return response
    except Exception:
        # ``Exception`` rather than a bare except, so SystemExit and
        # KeyboardInterrupt are not turned into a not-found response
        self.raise_not_found(
            environ, 'Static File Not Found: {0}'.format(filepath))
def test_wbresponse_encode_stream():
    """encode_stream returns a generator yielding the encoded chunks."""
    # U+00C3 (LATIN CAPITAL LETTER A WITH TILDE) is two bytes in UTF-8
    text_chunks = [u'\u00c3']

    encoded = WbResponse.encode_stream(text_chunks)

    assert inspect.isgenerator(encoded)
    assert list(encoded) == [b'\xc3\x83']
def test_wbresponse_json_response():
    """json_response yields a 200 JSON response whose body round-trips."""
    payload = {'pywb': 1, 'wr': 2}

    response = WbResponse.json_response(payload)
    headers = response.status_headers

    assert headers.statusline == '200 OK'
    assert ('Content-Type', 'application/json; charset=utf-8') in headers.headers
    assert json.loads(response.body[0]) == payload
def _not_found_response(self, environ, url):
    """Render the not-found view for ``url`` as a 404 HTML response.

    :param dict environ: The WSGI environment dictionary for the request
    :param str url: The URL passed to the not-found template
    :return: WbResponse with status ``404 Not Found``
    :rtype: WbResponse
    """
    page = self.not_found_view.render_to_string(environ,
                                                url=url,
                                                err_msg="Not Found")
    return WbResponse.text_response(page,
                                    status='404 Not Found',
                                    content_type='text/html')
def _error_response(self, environ, wbe):
    """Render the error view for an exception as an HTML response.

    :param dict environ: The WSGI environment dictionary for the request
    :param wbe: The exception; its ``status()``, ``url``, ``msg`` and
        ``status_code`` are used
    :return: WbResponse carrying the status reported by ``wbe.status()``
    :rtype: WbResponse
    """
    status = wbe.status()
    template_args = dict(err_msg=wbe.url,
                         err_details=wbe.msg,
                         err_status=wbe.status_code)
    page = self.error_view.render_to_string(environ, **template_args)
    return WbResponse.text_response(page,
                                    status=status,
                                    content_type='text/html')
def lock_clear_all(self, environ):
    """Delete all session-list and lock keys, then redirect to ``/_locks``.

    :param dict environ: The WSGI environment dictionary; the redis client
        is taken from ``environ[SESSION_KEY].redis``
    :return: WbResponse redirecting to ``/_locks``
    :rtype: WbResponse
    """
    redis = environ[SESSION_KEY].redis

    # session lists are cleared first, then the locks themselves
    for pattern in (SESH_LIST.format('*'), 'lock:*'):
        for key in redis.scan_iter(pattern):
            redis.delete(key)

    return WbResponse.redir_response('/_locks')
def test_wbresponse_text_stream():
    """text_stream gives the same encoded plain-text response with an
    explicit ``text/plain`` content type and with the default one."""
    # U+00C3 (LATIN CAPITAL LETTER A WITH TILDE) is two bytes in UTF-8
    stream = [u'\u00c3']
    expected = [b'\xc3\x83']

    for kwargs in (dict(content_type='text/plain'), dict()):
        res = WbResponse.text_stream(stream, **kwargs)
        status_headers = res.status_headers

        assert status_headers.statusline == '200 OK'
        assert ('Content-Type', 'text/plain; charset=utf-8') in status_headers.headers
        assert inspect.isgenerator(res.body)
        assert list(res.body) == expected
def send_redirect(self, new_path, url_parts, urlrewriter):
    """Redirect to the same URL with its path replaced by ``new_path``.

    :param str new_path: The path to use in the redirect target
    :param url_parts: 5-item split URL (scheme, netloc, path, query, fragment)
    :param urlrewriter: Rewriter applied to the target URL
    :return: A ``307 Temporary Redirect`` WbResponse; when memento is
        enabled it also carries an ``original`` Link header
    :rtype: WbResponse
    """
    scheme, netloc, _, query, frag = url_parts
    url = urlunsplit((scheme, netloc, new_path, query, frag))

    resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                     '307 Temporary Redirect')

    if self.enable_memento:
        # the link points at the un-rewritten URL
        resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

    return resp
def put_custom_record(self, environ, coll="$root"):
    """
    When recording, PUT a custom WARC record to the specified collection
    (Available only when recording)

    The ``url`` query parameter is required; an optional ``timestamp``
    query parameter is converted and sent as the ``WARC-Date`` header.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is written to
    :return: WbResponse with the JSON returned by the upstream PUT, or a
        ``400 Bad Request`` JSON error when ``url`` is missing
    :rtype: WbResponse
    """
    # drain the request body; read() is repeated until it returns nothing
    body_stream = environ["wsgi.input"]
    chunks = []
    buff = body_stream.read()
    while buff:
        chunks.append(buff)
        buff = body_stream.read()
    data = b"".join(chunks)

    params = dict(parse_qsl(environ.get("QUERY_STRING")))

    headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

    target_uri = params.get("url")
    if not target_uri:
        return WbResponse.json_response({"error": "no url"},
                                        status="400 Bad Request")

    timestamp = params.get("timestamp")
    if timestamp:
        headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

    put_url = self.put_custom_record_path.format(url=target_uri,
                                                 coll=coll,
                                                 rec_type="resource")
    upstream = requests.put(put_url, headers=headers, data=data)
    return WbResponse.json_response(upstream.json())
def _error_response(self, environ, msg='', details='', status='404 Not Found'):
    """Render the error view as an HTML response.

    :param dict environ: The WSGI environment dictionary for the request
    :param str msg: The error message passed to the template
    :param str details: The error details passed to the template
    :param str status: The HTTP status line of the response
    :return: WbResponse containing the rendered error page
    :rtype: WbResponse
    """
    page = self.error_view.render_to_string(environ,
                                            err_msg=msg,
                                            err_details=details)
    return WbResponse.text_response(page,
                                    status=status,
                                    content_type='text/html')
def serve_listing(self, environ):
    """Serves the response for WARCServer fixed and dynamic listing (paths)

    :param dict environ: The WSGI environment dictionary for the request
    :return: WbResponse containing the frontend apps WARCServer URL paths
    :rtype: WbResponse
    """
    listing = dict(fixed=self.warcserver.list_fixed_routes(),
                   dynamic=self.warcserver.list_dynamic_routes())
    return WbResponse.json_response(listing)
def put_screenshot(self, environ, coll):
    """PUT the request body to the screenshot recorder for a collection.

    The ``target_uri`` query parameter names the URL the screenshot belongs
    to; the request's input stream is forwarded as the PUT body.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection (ensured to exist first)
    :return: WbResponse with the JSON returned by the upstream PUT, or a
        JSON error when no ``target_uri`` is given
    :rtype: WbResponse
    """
    self.ensure_coll_exists(coll)

    headers = {'Content-Type': environ.get('CONTENT_TYPE', 'text/plain')}

    # parse_qs maps each name to a list of values; only the first is used
    target_uris = parse_qs(environ.get('QUERY_STRING')).get('target_uri', [])
    url = target_uris[0] if target_uris else None
    if not url:
        return WbResponse.json_response({'error': 'no target_uri'})

    put_url = self.screenshot_recorder_path.format(url=url, coll=coll)
    upstream = requests.put(put_url,
                            headers=headers,
                            data=environ['wsgi.input'])
    return WbResponse.json_response(upstream.json())
def serve_listing(self, environ):
    """Serves the response for WARCServer fixed and dynamic listing (paths)

    :param dict environ: The WSGI environment dictionary for the request
    :return: WbResponse containing the frontend apps WARCServer URL paths
    :rtype: WbResponse
    """
    fixed_routes = self.warcserver.list_fixed_routes()
    dynamic_routes = self.warcserver.list_dynamic_routes()

    return WbResponse.json_response({
        'fixed': fixed_routes,
        'dynamic': dynamic_routes,
    })
def serve_record(self, environ, coll='$root', url=''):
    """Serve a URL's content from a WARC/ARC record in replay mode or from
    the live web in live, proxy, and record mode.

    Collections that are fixed warcserver routes are refused with an error
    text response instead.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is to be served
        from
    :param str url: The URL for the corresponding record to be served if it
        exists
    :return: WbResponse containing the contents of the record/URL
    :rtype: WbResponse
    """
    fixed_routes = self.warcserver.list_fixed_routes()
    if coll in fixed_routes:
        error_text = 'Error: Can Not Record Into Custom Collection "{0}"'.format(coll)
        return WbResponse.text_response(error_text)

    return self.serve_content(environ, coll, url, record=True)
def test_resp_1():
    """text_response('Test') is a 200 plain-text response of length 4."""
    actual = vars(WbResponse.text_response('Test'))

    expected_headers = StatusAndHeaders(
        protocol='',
        statusline='200 OK',
        headers=[('Content-Type', 'text/plain; charset=utf-8'),
                 ('Content-Length', '4')])

    assert actual == {'body': [b'Test'], 'status_headers': expected_headers}
def serve_cdx(self, environ, coll='$root'):
    """Query the upstream CDX server for a collection and stream the result.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection this CDX query is for
    :return: WbResponse streaming the CDX result, or a ``400 Bad Request``
        text response when the upstream request raises
    :rtype: WbResponse
    """
    url_template = self.rewriterapp.paths['cdx-server']
    cdx_url = url_template.format(coll=coll)

    request_query = environ.get('QUERY_STRING')
    if request_query:
        # extend an existing query part with '&', otherwise start one
        joiner = '&' if '?' in cdx_url else '?'
        cdx_url = ''.join((cdx_url, joiner, request_query))

    try:
        cdx_res = requests.get(cdx_url, stream=True)
        content_type = cdx_res.headers.get('Content-Type')
        body = StreamIter(cdx_res.raw)
        return WbResponse.bin_stream(body, content_type=content_type)
    except Exception as e:
        return WbResponse.text_response('Error: ' + str(e),
                                        status='400 Bad Request')
def test_resp_3():
    """redir_response defaults to an empty-bodied '302 Redirect'."""
    actual = vars(WbResponse.redir_response('http://example.com/otherfile'))

    expected_headers = StatusAndHeaders(
        protocol='',
        statusline='302 Redirect',
        headers=[('Location', 'http://example.com/otherfile'),
                 ('Content-Length', '0')])

    assert actual == {'body': [], 'status_headers': expected_headers}
def put_record(self, environ, coll, target_uri_format, rec_type, params, data):
    """PUT ``data`` as a custom record of type ``rec_type`` into a collection.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection (ensured to exist first)
    :param str target_uri_format: Format string with a ``{url}`` field used
        to build the record's target URI
    :param str rec_type: The record type placed into the PUT URL
    :param dict params: Request parameters; ``url`` is required and
        ``timestamp`` is optional
    :param data: The record payload sent as the PUT body
    :return: WbResponse with the JSON returned by the upstream PUT, or a
        JSON error when ``url`` is missing
    :rtype: WbResponse
    """
    self.ensure_coll_exists(coll)

    headers = {'Content-Type': environ.get('CONTENT_TYPE', 'text/plain')}

    url = params.get('url')
    if not url:
        return WbResponse.json_response({'error': 'no url'})

    timestamp = params.get('timestamp')
    if timestamp:
        headers['WARC-Date'] = timestamp_to_iso_date(timestamp)

    put_url = self.custom_record_path.format(
        url=target_uri_format.format(url=url),
        coll=coll,
        rec_type=rec_type,
    )
    upstream = requests.put(put_url, headers=headers, data=data)
    return WbResponse.json_response(upstream.json())
def handle_request(self, environ, start_response):
    """Retrieve the route handler for the request, call it and send its
    response.

    :param dict environ: The WSGI environment dictionary for the request
    :param start_response: Passed, together with ``environ``, to the
        response object that is finally called
    :return: The result of calling the selected response with
        ``(environ, start_response)``
    """
    url_adapter = self.url_map.bind_to_environ(environ)

    try:
        handler, handler_args = url_adapter.match()

        self.rewriterapp.prepare_env(environ)

        # store original script_name (original prefix) before modifications
        # are made
        environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME')

        lang = handler_args.pop('lang', '')
        if lang:
            pop_path_info(environ)
            environ['pywb_lang'] = lang

        response = handler(environ, **handler_args)

    except RequestRedirect as redirect_exc:
        # if werkzeug throws this, likely a missing slash redirect
        # also check referrer here to avoid another redirect later
        referer_redirect = self._check_refer_redirect(environ)
        if referer_redirect:
            return referer_redirect(environ, start_response)

        response = WbResponse.redir_response(redirect_exc.new_url,
                                             '307 Redirect')

    except WbException as wb_exc:
        if wb_exc.status_code == 404:
            # a 404 may stem from a relative URL; try the referer first
            referer_redirect = self._check_refer_redirect(environ)
            if referer_redirect:
                return referer_redirect(environ, start_response)

        response = self.rewriterapp.handle_error(environ, wb_exc)

    except Exception as e:
        if self.debug:
            traceback.print_exc()

        internal_error = WbException('Internal Error: ' + str(e))
        response = self.rewriterapp._error_response(environ, internal_error)

    return response(environ, start_response)
def serve_home(self, environ):
    """Render and serve the home page (index.html).

    The template receives all fixed and dynamic routes plus the cached
    metadata of the dynamic routes.

    :param dict environ: The WSGI environment dictionary for the request
    :return: The WbResponse containing the rendered home page
    :rtype: WbResponse
    """
    home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')

    fixed_routes = self.warcserver.list_fixed_routes()
    dynamic_routes = self.warcserver.list_dynamic_routes()

    template_args = dict(
        routes=fixed_routes + dynamic_routes,
        all_metadata=self.metadata_cache.get_all(dynamic_routes),
    )
    content = home_view.render_to_string(environ, **template_args)

    return WbResponse.text_response(
        content, content_type='text/html; charset="utf-8"')
def __call__(self, environ, url_str):
    """Load a static file and wrap it in a streaming binary response.

    The query string is stripped from ``url_str``. The file is looked up
    under ``environ['pywb.static_dir']`` when that is set and the file
    exists there, otherwise under ``self.static_path``.

    :param dict environ: The WSGI environment dictionary for the request
    :param str url_str: The requested static path, optionally with a query
        string
    :return: WbResponse streaming the file with a Content-Length header and
        a guessed content type (``application/octet-stream`` by default)
    :rtype: WbResponse
    :raises NotFoundException: if loading the file raises IOError
    """
    url = url_str.split('?')[0]

    # NOTE(review): ``url`` is joined onto the base directory unchecked --
    # confirm that path traversal ('..') is rejected before or by the loader
    full_path = environ.get('pywb.static_dir')
    if full_path:
        full_path = os.path.join(full_path, url)
        if not os.path.isfile(full_path):
            full_path = None

    if not full_path:
        full_path = os.path.join(self.static_path, url)

    try:
        data = self.block_loader.load(full_path)

        # seek to the end to learn the size, then rewind for reading
        data.seek(0, 2)
        size = data.tell()
        data.seek(0)
        headers = [('Content-Length', str(size))]

        reader = None

        if 'wsgi.file_wrapper' in environ:
            try:
                reader = environ['wsgi.file_wrapper'](data)
            except Exception:
                # best effort: if the server's file wrapper cannot take
                # this stream, fall back to the plain iterator below
                reader = None

        if not reader:
            reader = iter(lambda: data.read(), b'')

        content_type = 'application/octet-stream'

        guessed = mimetypes.guess_type(full_path)
        if guessed[0]:
            content_type = guessed[0]

        return WbResponse.bin_stream(reader,
                                     content_type=content_type,
                                     headers=headers)

    except IOError:
        raise NotFoundException('Static File Not Found: ' + url_str)
def serve_home(self, environ):
    """Serves the home (/) view of pywb (not a collections)

    :param dict environ: The WSGI environment dictionary for the request
    :return: The WbResponse for serving the home (/) path
    :rtype: WbResponse
    """
    warcserver = self.warcserver
    fixed_routes = warcserver.list_fixed_routes()
    dynamic_routes = warcserver.list_dynamic_routes()

    # metadata is looked up for the dynamic routes only
    all_metadata = self.metadata_cache.get_all(dynamic_routes)

    home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
    content = home_view.render_to_string(environ,
                                         routes=fixed_routes + dynamic_routes,
                                         all_metadata=all_metadata)

    return WbResponse.text_response(
        content, content_type='text/html; charset="utf-8"')