def __init__(self, is_framed_replay=False, proxies=None):
    """Set up the live rewriter, optionally recording via an upstream proxy.

    :param is_framed_replay: passed through to RewriteContent
    :param proxies: proxy spec; a single string is expanded to an
                    http/https proxy dict, any other truthy value is
                    used as-is
    """
    self.rewriter = RewriteContent(is_framed_replay=is_framed_replay)
    self.live_request = live_request
    self.proxies = proxies
    if not self.proxies:
        logging.debug('Live Rewrite Direct (no proxy)')
    else:
        logging.debug('Live Rewrite via proxy ' + str(proxies))
        # a bare string means: use the same proxy for both schemes
        if isinstance(proxies, str):
            self.proxies = {'http': proxies, 'https': proxies}
def __init__(self, config):
    """Initialize the handler: upstream endpoint, record loader, rewriter.

    :param config: mapping providing 'upstream_url' and 'framed_replay'
    """
    super(PlatformHandler, self).__init__(config)
    self.upstream_url = config.get('upstream_url')
    self.loader = ArcWarcRecordLoader()
    self.content_rewriter = RewriteContent(
        is_framed_replay=config.get('framed_replay'))
def __init__(self, content_loader, config):
    """Configure the replay view from a config mapping.

    :param content_loader: loader used to resolve archived content
    :param config: mapping of replay options (buffering, redirects,
                   memento support, optional reporter callback)
    """
    self.content_loader = content_loader
    self.content_rewriter = RewriteContent(
        is_framed_replay=config.get('framed_replay'))
    self.head_insert_view = HeadInsertView.init_from_config(config)
    self.buffer_response = config.get('buffer_response', True)
    self.redir_to_exact = config.get('redir_to_exact', True)
    # memento support swaps in a memento-aware response class
    if config.get('enable_memento', False):
        self.response_class = MementoResponse
    else:
        self.response_class = WbResponse
    self._reporter = config.get('reporter')
def _buffer_response(status_headers, iterator):
    """Spool the full response body, set an accurate Content-Length,
    and return a fresh streaming generator over the buffered data.

    :param status_headers: headers object whose Content-Length is updated
    :param iterator: iterable of body chunks (consumed fully)
    """
    spooled = SpooledTemporaryFile(ProxyRouter.BUFF_RESPONSE_MEM_SIZE)
    total = 0
    for chunk in iterator:
        total += len(chunk)
        spooled.write(chunk)

    # replace any existing content length with the true buffered size
    status_headers.replace_header('Content-Length', str(total))

    spooled.seek(0)
    return RewriteContent.stream_to_gen(spooled)
def __init__(self, content_loader, config):
    """Configure the replay view, including buffering limits and
    range-request caching.

    :param content_loader: loader used to resolve archived content
    :param config: mapping of replay options
    """
    self.content_loader = content_loader
    self.content_rewriter = RewriteContent(
        is_framed_replay=config.get("framed_replay"))
    self.head_insert_view = HeadInsertView.init_from_config(config)
    self.buffer_response = config.get("buffer_response", True)
    self.buffer_max_size = config.get("buffer_max_size", 16384)
    self.redir_to_exact = config.get("redir_to_exact", True)
    # memento support swaps in a memento-aware response class
    if config.get("enable_memento", False):
        self.response_class = MementoResponse
    else:
        self.response_class = WbResponse
    self.enable_range_cache = config.get("enable_ranges", True)
    self._reporter = config.get("reporter")
class LiveRewriter(object):
    """Fetches live (or local-file) content and rewrites it for replay.

    Requests may optionally be routed through a recording proxy
    (``proxies``); ``skip_recording`` bypasses the proxy per-request.
    """

    def __init__(self, is_framed_replay=False, proxies=None):
        self.rewriter = RewriteContent(is_framed_replay=is_framed_replay)
        self.proxies = proxies
        self.live_request = live_request
        if self.proxies:
            logging.debug('Live Rewrite via proxy ' + str(proxies))
            # a single proxy string applies to both http and https
            if isinstance(proxies, str):
                self.proxies = {'http': proxies,
                                'https': proxies}
        else:
            logging.debug('Live Rewrite Direct (no proxy)')

    def is_recording(self):
        """True if requests are routed through a recording proxy."""
        return self.proxies is not None

    def fetch_local_file(self, uri):
        """Load a local file and fabricate a 200 response for it."""
        #fh = open(uri)
        fh = LocalFileLoader().load(uri)

        content_type, _ = mimetypes.guess_type(uri)

        # create fake headers for local file
        status_headers = StatusAndHeaders('200 OK',
                                          [('Content-Type', content_type)])

        stream = fh

        return (status_headers, stream)

    def translate_headers(self, url, urlkey, env):
        """Convert WSGI environ keys into outgoing HTTP request headers.

        Host/Origin/X-Forwarded-Proto are rewritten against the target
        ``url``; cookies pass through per-rule cookie rewriting; internal
        pywb headers and the raw Referer are dropped (REL_REFERER becomes
        the outgoing Referer).
        """
        headers = {}

        splits = urlsplit(url)
        has_cookies = False

        for name, value in six.iteritems(env):
            if name == 'HTTP_HOST':
                name = 'Host'
                value = splits.netloc

            elif name == 'HTTP_ORIGIN':
                name = 'Origin'
                value = (splits.scheme + '://' + splits.netloc)

            elif name == 'HTTP_X_CSRFTOKEN':
                name = 'X-CSRFToken'
                cookie_val = extract_client_cookie(env, 'csrftoken')
                if cookie_val:
                    value = cookie_val

            elif name == 'HTTP_REFERER':
                continue

            elif name == 'HTTP_X_PYWB_REQUESTED_WITH':
                continue

            elif name == 'HTTP_X_FORWARDED_PROTO':
                name = 'X-Forwarded-Proto'
                value = splits.scheme

            elif name == 'HTTP_COOKIE':
                name = 'Cookie'
                value = self._req_cookie_rewrite(urlkey, value)
                has_cookies = True

            elif name.startswith('HTTP_'):
                # generic WSGI header: HTTP_FOO_BAR -> Foo-Bar
                name = name[5:].title().replace('_', '-')

            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                name = name.title().replace('_', '-')

            elif name == 'REL_REFERER':
                name = 'Referer'
            else:
                # not a header key: skip by blanking the value
                value = None

            if value:
                headers[name] = value

        if not has_cookies:
            # no client cookie header: rules may still inject cookies
            value = self._req_cookie_rewrite(urlkey, '')
            if value:
                headers['Cookie'] = value

        return headers

    def _req_cookie_rewrite(self, urlkey, value):
        """Apply per-rule request-cookie regex rewrites for ``urlkey``."""
        rule = self.rewriter.ruleset.get_first_match(urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                # rule entry missing 'rx'/'replace': skip it
                pass

        return value

    def fetch_http(self, url,
                   urlkey=None,
                   env=None,
                   req_headers=None,
                   follow_redirects=False,
                   skip_recording=False,
                   verify=True):
        """Perform the live HTTP fetch, mirroring method/body/headers
        from the WSGI ``env`` when provided.

        Returns (StatusAndHeaders, raw stream).
        """
        method = 'GET'
        data = None

        proxies = None
        if not skip_recording:
            proxies = self.proxies

        if not req_headers:
            req_headers = {}

        if env is not None:
            method = env['REQUEST_METHOD'].upper()
            input_ = env['wsgi.input']

            req_headers.update(self.translate_headers(url, urlkey, env))

            if method in ('POST', 'PUT'):
                len_ = env.get('CONTENT_LENGTH')
                if len_:
                    # bound the body read to the declared length
                    data = LimitReader(input_, int(len_))
                else:
                    data = input_

        response = self.live_request(method=method,
                                     url=url,
                                     data=data,
                                     headers=req_headers,
                                     allow_redirects=follow_redirects,
                                     proxies=proxies,
                                     stream=True,
                                     verify=verify)

        statusline = str(response.status_code) + ' ' + response.reason

        headers = response.headers.items()

        stream = response.raw

        # prefer the raw (unmodified, possibly duplicated) header list
        # from the underlying http response, per Python version
        try:  #pragma: no cover
            #PY 3
            headers = stream._original_response.headers._headers
        except:  #pragma: no cover
            #PY 2
            headers = []
            resp_headers = stream._original_response.msg.headers
            for h in resp_headers:
                n, v = h.split(':', 1)
                n = n.strip()
                v = v.strip()
                headers.append((n, v))

        status_headers = StatusAndHeaders(statusline, headers)

        return (status_headers, stream)

    # NOTE(review): req_headers={} is a mutable default; fetch_http's
    # update() can mutate the shared dict across calls -- confirm intent
    def fetch_request(self, url, urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
                      req_headers={},
                      timestamp=None,
                      follow_redirects=False,
                      skip_recording=False,
                      verify=True,
                      remote_only=True):
        """Fetch ``url`` (remote or local file), then rewrite the content.

        Returns the result of ``rewrite_content`` (status, iterator,
        is_rewritten). A synthetic cdx dict is attached to
        ``env['pywb.cdx']`` when env is supplied.
        """
        ts_err = url.split('///')

        # fixup for accidental erroneous rewrite which has ///
        # (unless file:///)
        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        if url.startswith('//'):
            url = 'http:' + url

        if remote_only or is_http(url):
            is_remote = True
        else:
            is_remote = False
            if not url.startswith('file:'):
                url = to_file_url(url)

        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)

        if is_remote:
            (status_headers, stream) = self.fetch_http(url, urlkey, env,
                                                       req_headers,
                                                       follow_redirects,
                                                       skip_recording,
                                                       verify)
        else:
            (status_headers, stream) = self.fetch_local_file(url)

        if timestamp is None:
            timestamp = timestamp_now()

        cdx = {'urlkey': urlkey,
               'timestamp': timestamp,
               'url': url,
               'status': status_headers.get_statuscode(),
               'mime': status_headers.get_header('Content-Type'),
               'is_live': True,
              }

        result = (self.rewriter.
                  rewrite_content(urlrewriter,
                                  status_headers,
                                  stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=urlkey,
                                  cdx=cdx))

        if env:
            env['pywb.cdx'] = cdx

        return result

    def fetch_async(self, url, headers):
        """Fire a GET and immediately close it (proxy records en route)."""
        resp = self.live_request(method='GET',
                                 url=url,
                                 headers=headers,
                                 proxies=self.proxies,
                                 verify=False,
                                 stream=True)

        # don't actually read whole response,
        # proxy response for writing it
        resp.close()

    def add_metadata(self, url, headers, data):
        """Send metadata via the custom PUTMETA method through the proxy."""
        return self.live_request(method='PUTMETA',
                                 url=url,
                                 data=data,
                                 headers=headers,
                                 proxies=self.proxies,
                                 verify=False)

    def get_rewritten(self, *args, **kwargs):
        """Like fetch_request(), but drain the body into a single bytes buffer."""
        result = self.fetch_request(*args, **kwargs)

        status_headers, gen, is_rewritten = result

        buff = b''.join(gen)

        return (status_headers, buff)

    def get_video_info(self, url):
        """Delegate video metadata extraction to youtube-dl."""
        return youtubedl.extract_info(url)
class ReplayView(object):
    """Renders an archived capture: picks the first loadable cdx entry,
    rejects self-redirects, optionally redirects to the exact timestamp,
    rewrites the content, and wraps it in a response object.
    """

    # group(1): optional scheme prefix; group(2): the rest of the url
    STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$')

    def __init__(self, content_loader, config):
        self.content_loader = content_loader

        framed = config.get('framed_replay')
        self.content_rewriter = RewriteContent(is_framed_replay=framed)

        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.buffer_response = config.get('buffer_response', True)

        self.redir_to_exact = config.get('redir_to_exact', True)

        memento = config.get('enable_memento', False)
        if memento:
            self.response_class = MementoResponse
        else:
            self.response_class = WbResponse

        self._reporter = config.get('reporter')

    def render_content(self, wbrequest, cdx_lines, cdx_loader):
        """Try each cdx entry in order until one replays successfully.

        :raises NotFoundException: if cdx_lines is empty
        :raises CaptureException/ArchiveLoadFailed: last failure, if all fail
        """
        last_e = None
        first = True

        #cdx_lines = args[0]
        #cdx_loader = args[1]

        # List of already failed w/arcs
        failed_files = []

        response = None

        # Iterate over the cdx until find one that works
        # The cdx should already be sorted in
        # closest-to-timestamp order (from the cdx server)
        for cdx in cdx_lines:
            try:
                # optimize: can detect if redirect is needed just from the cdx,
                # no need to load w/arc data if requiring exact match
                if first:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
                        return redir_response

                    first = False

                response = self.replay_capture(wbrequest,
                                               cdx,
                                               cdx_loader,
                                               failed_files)

            except (CaptureException, ArchiveLoadFailed) as ce:
                import traceback
                traceback.print_exc()
                last_e = ce
                pass

            if response:
                return response

        if not last_e:
            # can only get here if cdx_lines is empty somehow
            # should be filtered out before hand, but if not
            msg = 'No Captures found for: ' + wbrequest.wb_url.url
            last_e = NotFoundException(msg)

        raise last_e

    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        """Load, validate, rewrite and package a single capture."""
        (status_headers, stream) = (self.content_loader.
                                    resolve_headers_and_payload(cdx,
                                                                failed_files,
                                                                cdx_loader))

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        redir_response = self._redirect_if_needed(wbrequest, cdx)
        if redir_response:
            return redir_response

        length = status_headers.get_header('content-length')
        stream = LimitReader.wrap_stream(stream, length)

        # one more check for referrer-based self-redirect
        # TODO: evaluate this, as refreshing in browser may sometimes cause
        # referrer to be set to the same page, incorrectly skipping a capture
        # self._reject_referrer_self_redirect(wbrequest)

        urlrewriter = wbrequest.urlrewriter

        # if using url rewriter, use original url for rewriting purposes
        if wbrequest and wbrequest.wb_url:
            wbrequest.wb_url.url = cdx['original']

        head_insert_func = None
        if self.head_insert_view:
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wbrequest))

        result = (self.content_rewriter.
                  rewrite_content(urlrewriter,
                                  headers=status_headers,
                                  stream=stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=cdx['urlkey'],
                                  cdx=cdx))

        (status_headers, response_iter, is_rewritten) = result

        # buffer response if buffering enabled
        if self.buffer_response:
            response_iter = self.buffered_response(status_headers,
                                                   response_iter)

        response = self.response_class(status_headers,
                                       response_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)

        # notify reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)

        return response

    # Buffer rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator):
        out = BytesIO()

        try:
            for buff in iterator:
                out.write(bytes(buff))

        finally:
            content = out.getvalue()

            content_length_str = str(len(content))

            # remove existing content length
            status_headers.replace_header('Content-Length',
                                          content_length_str)
            out.close()

        return [content]

    def _redirect_if_needed(self, wbrequest, cdx):
        """Return an internal 302 to the exact capture timestamp, or None."""
        if wbrequest.options['is_proxy']:
            return None

        redir_needed = (wbrequest.options.get('is_timegate', False))

        if not redir_needed and self.redir_to_exact:
            redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)

        if not redir_needed:
            return None

        new_url = (wbrequest.urlrewriter.
                   get_new_url(timestamp=cdx['timestamp'],
                               url=cdx['original']))

        if wbrequest.method == 'POST':
            # FF shows a confirm dialog, so can't use 307 effectively
            # statusline = '307 Same-Method Internal Redirect'
            return None
        else:
            statusline = '302 Internal Redirect'

        status_headers = StatusAndHeaders(statusline,
                                          [('Location', new_url)])

        # don't include cdx to indicate internal redirect
        return self.response_class(status_headers,
                                   wbrequest=wbrequest)

    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop
        """
        if not status_headers.statusline.startswith('3'):
            return

        # skip all 304s
        if (status_headers.statusline.startswith('304') and
                not wbrequest.wb_url.is_identity):
            raise CaptureException('Skipping 304 Modified: ' + str(cdx))

        request_url = wbrequest.wb_url.url.lower()
        location_url = status_headers.get_header('Location')
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith('/'):
            # relative Location: resolve against the capture's host
            host = urlsplit(cdx['original']).netloc
            location_url = host + location_url

        if (ReplayView.strip_scheme(request_url) ==
                ReplayView.strip_scheme(location_url)):
            raise CaptureException('Self Redirect: ' + str(cdx))

    # TODO: reevaluate this, as it may reject valid refreshes of a page
    def _reject_referrer_self_redirect(self, wbrequest):  # pragma: no cover
        """
        Perform final check for referrer based self-redirect.
        This method should be called after verifying that
        the request timestamp == capture timestamp

        If referrer is same as current url,
        reject this response and try another capture.
        """
        if not wbrequest.referrer:
            return

        # build full url even if using relative-rewriting
        request_url = (wbrequest.host_prefix +
                       wbrequest.rel_prefix + str(wbrequest.wb_url))

        if (ReplayView.strip_scheme(request_url) ==
                ReplayView.strip_scheme(wbrequest.referrer)):
            raise CaptureException('Self Redirect via Referrer: ' +
                                   str(wbrequest.wb_url))

    @staticmethod
    def strip_scheme(url):
        """
        Remove the leading scheme (if any) for scheme-agnostic comparison.

        >>> ReplayView.strip_scheme('https://example.com') ==\
            ReplayView.strip_scheme('http://example.com')
        True

        >>> ReplayView.strip_scheme('https://example.com') ==\
            ReplayView.strip_scheme('http:/example.com')
        True

        >>> ReplayView.strip_scheme('https://example.com') ==\
            ReplayView.strip_scheme('example.com')
        True

        >>> ReplayView.strip_scheme('about://example.com') ==\
            ReplayView.strip_scheme('example.com')
        True

        >>> ReplayView.strip_scheme('http://') ==\
            ReplayView.strip_scheme('')
        True

        >>> ReplayView.strip_scheme('#!@?') ==\
            ReplayView.strip_scheme('#!@?')
        True
        """
        m = ReplayView.STRIP_SCHEME.match(url)
        match = m.group(2)
        return match
def test_type_detect_2():
    """JS payload declared as js stays js even if html is the fallback."""
    payload = b' function() { return 0; }'
    detected_type, stream = RewriteContent._resolve_text_type(
        'js', 'html', BytesIO(payload))
    assert detected_type == 'js'
    # the sniffed bytes must still be readable from the stream
    assert stream.read() == payload
def test_type_detect_1():
    """A payload that looks like html is re-typed from js to html."""
    payload = b' <html></html>'
    detected_type, stream = RewriteContent._resolve_text_type(
        'js', 'html', BytesIO(payload))
    assert detected_type == 'html'
    # the sniffed bytes must still be readable from the stream
    assert stream.read() == payload
class ReplayView(object):
    """Renders an archived capture with range-request caching support:
    picks the first loadable cdx entry, rejects self-redirects, optionally
    redirects to the exact timestamp, rewrites the content, and wraps the
    result in a response object.
    """

    # group(1): optional scheme + optional www/wwwN prefix; group(2): rest
    # NOTE(review): re.MULTILINE looks unnecessary for single-url input -- confirm
    STRIP_SCHEME_WWW = re.compile('^([\w]+:[/]*(?:www[\d]*\.)?)?(.*?)$', re.MULTILINE)

    def __init__(self, content_loader, config):
        self.content_loader = content_loader

        framed = config.get('framed_replay')
        self.content_rewriter = RewriteContent(is_framed_replay=framed)

        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.buffer_response = config.get('buffer_response', True)
        self.buffer_max_size = config.get('buffer_max_size', 16384)

        self.redir_to_exact = config.get('redir_to_exact', True)

        memento = config.get('enable_memento', False)
        if memento:
            self.response_class = MementoResponse
        else:
            self.response_class = WbResponse

        self.enable_range_cache = config.get('enable_ranges', True)

        self._reporter = config.get('reporter')

    def render_content(self, wbrequest, cdx_lines, cdx_loader):
        """Try each cdx entry in order until one replays successfully.

        :raises NotFoundException: if cdx_lines is empty
        :raises CaptureException/ArchiveLoadFailed: last failure, if all fail
        """
        last_e = None
        first = True

        #cdx_lines = args[0]
        #cdx_loader = args[1]

        # List of already failed w/arcs
        failed_files = []

        response = None

        # Iterate over the cdx until find one that works
        # The cdx should already be sorted in
        # closest-to-timestamp order (from the cdx server)
        for cdx in cdx_lines:
            try:
                # optimize: can detect if redirect is needed just from the cdx,
                # no need to load w/arc data if requiring exact match
                if first:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
                        return redir_response

                    first = False

                response = self.cached_replay_capture(wbrequest,
                                                      cdx,
                                                      cdx_loader,
                                                      failed_files)

            except (CaptureException, ArchiveLoadFailed) as ce:
                #import traceback
                #traceback.print_exc()
                logging.debug(ce)
                last_e = ce
                pass

            if response:
                return response

        if not last_e:
            # can only get here if cdx_lines is empty somehow
            # should be filtered out before hand, but if not
            msg = 'No Captures found for: ' + wbrequest.wb_url.url
            last_e = NotFoundException(msg)

        raise last_e

    def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        """Serve a Range request from the range cache when enabled;
        otherwise fall through to a normal capture replay."""
        def get_capture():
            return self.replay_capture(wbrequest,
                                       cdx,
                                       cdx_loader,
                                       failed_files)

        if not self.enable_range_cache:
            return get_capture()

        range_info = wbrequest.extract_range()

        if not range_info:
            return get_capture()

        range_status, range_iter = (range_cache.
                                    handle_range(wbrequest,
                                                 cdx.get('digest',
                                                         cdx['urlkey']),
                                                 get_capture,
                                                 *range_info))

        response = self.response_class(range_status,
                                       range_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)
        return response

    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        """Load, validate, rewrite and package a single capture."""
        (status_headers, stream) = (self.content_loader(cdx,
                                                        failed_files,
                                                        cdx_loader,
                                                        wbrequest))

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        redir_response = self._redirect_if_needed(wbrequest, cdx)
        if redir_response:
            return redir_response

        #length = status_headers.get_header('content-length')
        #stream = LimitReader.wrap_stream(stream, length)

        # one more check for referrer-based self-redirect
        # TODO: evaluate this, as refreshing in browser may sometimes cause
        # referrer to be set to the same page, incorrectly skipping a capture
        # self._reject_referrer_self_redirect(wbrequest)

        urlrewriter = wbrequest.urlrewriter

        # if using url rewriter, use original url for rewriting purposes
        if wbrequest and wbrequest.wb_url:
            wbrequest.wb_url.url = cdx['url']

        if wbrequest.options['is_ajax']:
            # propagate ajax flag to rewriter options
            wbrequest.urlrewriter.rewrite_opts['is_ajax'] = True

        head_insert_func = None
        if self.head_insert_view:
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wbrequest))

        result = (self.content_rewriter.
                  rewrite_content(urlrewriter,
                                  status_headers=status_headers,
                                  stream=stream,
                                  head_insert_func=head_insert_func,
                                  urlkey=cdx['urlkey'],
                                  cdx=cdx,
                                  env=wbrequest.env))

        (status_headers, response_iter, is_rewritten) = result

        # buffer response if buffering enabled
        if self.buffer_response:
            content_len = status_headers.get_header('content-length')
            try:
                content_len = int(content_len)
            except:
                content_len = 0

            # only buffer when the length is unknown or bogus
            if content_len <= 0:
                max_size = self.buffer_max_size
                response_iter = self.buffered_response(status_headers,
                                                       response_iter,
                                                       max_size)

        # Set Content-Location if not exact capture
        if not self.redir_to_exact:
            mod = wbrequest.options.get('replay_mod', wbrequest.wb_url.mod)
            canon_url = (wbrequest.urlrewriter.
                         get_new_url(timestamp=cdx['timestamp'],
                                     url=cdx['url'],
                                     mod=mod))
            status_headers.headers.append(('Content-Location', canon_url))

        if wbrequest.wb_url.mod == 'vi_':
            status_headers.headers.append(('access-control-allow-origin', '*'))

        response = self.response_class(status_headers,
                                       response_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)

        # notify reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)

        return response

    # Buffer rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator, max_size):
        out = BytesIO()
        size = 0
        read_all = True

        try:
            for buff in iterator:
                buff = bytes(buff)
                size += len(buff)
                out.write(buff)
                # stop buffering past max_size; stream the remainder
                if max_size > 0 and size > max_size:
                    read_all = False
                    break

        finally:
            content = out.getvalue()
            out.close()

        if read_all:
            content_length_str = str(len(content))

            # remove existing content length
            status_headers.replace_header('Content-Length',
                                          content_length_str)
            return [content]
        else:
            # partial buffer: length unknown, chain buffered + remaining
            status_headers.remove_header('Content-Length')
            return chain(iter([content]), iterator)

    def _redirect_if_needed(self, wbrequest, cdx):
        """Return a 302 redirect to the exact capture timestamp, or None."""
        if not self.redir_to_exact:
            return None

        if wbrequest.options['is_proxy']:
            return None

        if wbrequest.custom_params.get('noredir'):
            return None

        is_timegate = (wbrequest.options.get('is_timegate', False))
        if not is_timegate:
            is_timegate = wbrequest.wb_url.is_latest_replay()

        redir_needed = is_timegate or (cdx['timestamp'] != wbrequest.wb_url.timestamp)

        if not redir_needed:
            return None

        # range requests are served from the cache, never redirected
        if self.enable_range_cache and wbrequest.extract_range():
            return None

        #if is_timegate:
        #    timestamp = timestamp_now()
        #else:
        timestamp = cdx['timestamp']

        new_url = (wbrequest.urlrewriter.
                   get_new_url(timestamp=timestamp,
                               url=cdx['url']))

        if wbrequest.method == 'POST':
            # FF shows a confirm dialog, so can't use 307 effectively
            # was: statusline = '307 Same-Method Internal Redirect'
            return None
        elif is_timegate:
            statusline = '302 Found'
        else:
            # clear cdx line to indicate internal redirect
            statusline = '302 Internal Redirect'
            cdx = None

        status_headers = StatusAndHeaders(statusline,
                                          [('Location', new_url)])

        return self.response_class(status_headers,
                                   wbrequest=wbrequest,
                                   cdx=cdx,
                                   memento_is_redir=True)

    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop
        """
        if not status_headers.statusline.startswith('3'):
            return

        # skip all 304s
        if (status_headers.statusline.startswith('304') and
                not wbrequest.wb_url.is_identity):
            raise CaptureException('Skipping 304 Modified: ' + str(cdx))

        request_url = wbrequest.wb_url.url.lower()
        location_url = status_headers.get_header('Location')
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith('/'):
            # relative Location: resolve against the capture's host
            host = urlsplit(cdx['url']).netloc
            location_url = host + location_url

        if (ReplayView.strip_scheme_www(request_url) ==
                ReplayView.strip_scheme_www(location_url)):
            raise CaptureException('Self Redirect: ' + str(cdx))

    # TODO: reevaluate this, as it may reject valid refreshes of a page
    def _reject_referrer_self_redirect(self, wbrequest):  # pragma: no cover
        """
        Perform final check for referrer based self-redirect.
        This method should be called after verifying that
        the request timestamp == capture timestamp

        If referrer is same as current url,
        reject this response and try another capture.
        """
        if not wbrequest.referrer:
            return

        # build full url even if using relative-rewriting
        request_url = (wbrequest.host_prefix +
                       wbrequest.rel_prefix + str(wbrequest.wb_url))

        if (ReplayView.strip_scheme_www(request_url) ==
                ReplayView.strip_scheme_www(wbrequest.referrer)):
            raise CaptureException('Self Redirect via Referrer: ' +
                                   str(wbrequest.wb_url))

    @staticmethod
    def strip_scheme_www(url):
        """
        Remove the scheme and any www/wwwN prefix for url comparison.

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http://example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http:/example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http://www2.example.com')
        True

        >>> ReplayView.strip_scheme_www('about://example.com') ==\
            ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('http://') ==\
            ReplayView.strip_scheme_www('')
        True

        >>> ReplayView.strip_scheme_www('#!@?') ==\
            ReplayView.strip_scheme_www('#!@?')
        True
        """
        m = ReplayView.STRIP_SCHEME_WWW.match(url)
        match = m.group(2)
        return match
class ReplayView(object):
    """Renders an archived capture with range-request caching support:
    picks the first loadable cdx entry, rejects self-redirects, optionally
    redirects to the exact timestamp, rewrites the content, and wraps the
    result in a response object.
    """

    # group(1): optional scheme + optional www/wwwN prefix; group(2): rest
    STRIP_SCHEME_WWW = re.compile("^([\w]+:[/]*(?:www[\d]*\.)?)?(.*?)$")

    def __init__(self, content_loader, config):
        self.content_loader = content_loader

        framed = config.get("framed_replay")
        self.content_rewriter = RewriteContent(is_framed_replay=framed)

        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.buffer_response = config.get("buffer_response", True)
        self.buffer_max_size = config.get("buffer_max_size", 16384)

        self.redir_to_exact = config.get("redir_to_exact", True)

        memento = config.get("enable_memento", False)
        if memento:
            self.response_class = MementoResponse
        else:
            self.response_class = WbResponse

        self.enable_range_cache = config.get("enable_ranges", True)

        self._reporter = config.get("reporter")

    def render_content(self, wbrequest, cdx_lines, cdx_loader):
        """Try each cdx entry in order until one replays successfully.

        :raises NotFoundException: if cdx_lines is empty
        :raises CaptureException/ArchiveLoadFailed: last failure, if all fail
        """
        last_e = None
        first = True

        # cdx_lines = args[0]
        # cdx_loader = args[1]

        # List of already failed w/arcs
        failed_files = []

        response = None

        # Iterate over the cdx until find one that works
        # The cdx should already be sorted in
        # closest-to-timestamp order (from the cdx server)
        for cdx in cdx_lines:
            try:
                # optimize: can detect if redirect is needed just from the cdx,
                # no need to load w/arc data if requiring exact match
                if first:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
                        return redir_response

                    first = False

                response = self.cached_replay_capture(wbrequest,
                                                      cdx,
                                                      cdx_loader,
                                                      failed_files)

            except (CaptureException, ArchiveLoadFailed) as ce:
                # import traceback
                # traceback.print_exc()
                logging.debug(ce)
                last_e = ce
                pass

            if response:
                return response

        if not last_e:
            # can only get here if cdx_lines is empty somehow
            # should be filtered out before hand, but if not
            msg = "No Captures found for: " + wbrequest.wb_url.url
            last_e = NotFoundException(msg)

        raise last_e

    def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        """Serve a Range request from the range cache when enabled;
        otherwise fall through to a normal capture replay."""
        def get_capture():
            return self.replay_capture(wbrequest,
                                       cdx,
                                       cdx_loader,
                                       failed_files)

        if not self.enable_range_cache:
            return get_capture()

        range_info = wbrequest.extract_range()

        if not range_info:
            return get_capture()

        range_status, range_iter = range_cache.handle_range(
            wbrequest,
            cdx.get("digest", cdx["urlkey"]),
            get_capture,
            *range_info
        )

        response = self.response_class(range_status,
                                       range_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)
        return response

    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        """Load, validate, rewrite and package a single capture."""
        (status_headers, stream) = self.content_loader(cdx,
                                                       failed_files,
                                                       cdx_loader,
                                                       wbrequest)

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        redir_response = self._redirect_if_needed(wbrequest, cdx)
        if redir_response:
            return redir_response

        # length = status_headers.get_header('content-length')
        # stream = LimitReader.wrap_stream(stream, length)

        # one more check for referrer-based self-redirect
        # TODO: evaluate this, as refreshing in browser may sometimes cause
        # referrer to be set to the same page, incorrectly skipping a capture
        # self._reject_referrer_self_redirect(wbrequest)

        urlrewriter = wbrequest.urlrewriter

        # if using url rewriter, use original url for rewriting purposes
        if wbrequest and wbrequest.wb_url:
            wbrequest.wb_url.url = cdx["url"]

        head_insert_func = None
        if self.head_insert_view:
            head_insert_func = self.head_insert_view.create_insert_func(wbrequest)

        result = self.content_rewriter.rewrite_content(
            urlrewriter,
            status_headers=status_headers,
            stream=stream,
            head_insert_func=head_insert_func,
            urlkey=cdx["urlkey"],
            cdx=cdx,
        )

        (status_headers, response_iter, is_rewritten) = result

        # buffer response if buffering enabled
        if self.buffer_response:
            content_len = status_headers.get_header("content-length")
            try:
                content_len = int(content_len)
            except:
                content_len = 0

            # only buffer when the length is unknown or bogus
            if content_len <= 0:
                max_size = self.buffer_max_size
                response_iter = self.buffered_response(status_headers,
                                                       response_iter,
                                                       max_size)

        # Set Content-Location if not exact capture
        if not self.redir_to_exact:
            mod = wbrequest.options.get("replay_mod", wbrequest.wb_url.mod)
            canon_url = wbrequest.urlrewriter.get_new_url(timestamp=cdx["timestamp"],
                                                          url=cdx["url"],
                                                          mod=mod)
            status_headers.headers.append(("Content-Location", canon_url))

        if wbrequest.wb_url.mod == "vi_":
            status_headers.headers.append(("access-control-allow-origin", "*"))

        response = self.response_class(status_headers,
                                       response_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)

        # notify reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)

        return response

    # Buffer rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator, max_size):
        out = BytesIO()
        size = 0
        read_all = True

        try:
            for buff in iterator:
                buff = bytes(buff)
                size += len(buff)
                out.write(buff)
                # stop buffering past max_size; stream the remainder
                if max_size > 0 and size > max_size:
                    read_all = False
                    break

        finally:
            content = out.getvalue()
            out.close()

        if read_all:
            content_length_str = str(len(content))

            # remove existing content length
            status_headers.replace_header("Content-Length", content_length_str)
            return [content]
        else:
            # partial buffer: length unknown, chain buffered + remaining
            status_headers.remove_header("Content-Length")
            return chain(iter([content]), iterator)

    def _redirect_if_needed(self, wbrequest, cdx):
        """Return a 302 redirect to the exact capture timestamp, or None."""
        if not self.redir_to_exact:
            return None

        if wbrequest.options["is_proxy"]:
            return None

        if wbrequest.custom_params.get("noredir"):
            return None

        is_timegate = wbrequest.options.get("is_timegate", False)
        if not is_timegate:
            is_timegate = wbrequest.wb_url.is_latest_replay()

        redir_needed = is_timegate or (cdx["timestamp"] != wbrequest.wb_url.timestamp)

        if not redir_needed:
            return None

        # range requests are served from the cache, never redirected
        if self.enable_range_cache and wbrequest.extract_range():
            return None

        # if is_timegate:
        #     timestamp = timestamp_now()
        # else:
        timestamp = cdx["timestamp"]

        new_url = wbrequest.urlrewriter.get_new_url(timestamp=timestamp,
                                                    url=cdx["url"])

        if wbrequest.method == "POST":
            # FF shows a confirm dialog, so can't use 307 effectively
            # was: statusline = '307 Same-Method Internal Redirect'
            return None
        elif is_timegate:
            statusline = "302 Found"
        else:
            # clear cdx line to indicate internal redirect
            statusline = "302 Internal Redirect"
            cdx = None

        status_headers = StatusAndHeaders(statusline,
                                          [("Location", new_url)])

        return self.response_class(status_headers,
                                   wbrequest=wbrequest,
                                   cdx=cdx,
                                   memento_is_redir=True)

    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop
        """
        if not status_headers.statusline.startswith("3"):
            return

        # skip all 304s
        if status_headers.statusline.startswith("304") and not wbrequest.wb_url.is_identity:
            raise CaptureException("Skipping 304 Modified: " + str(cdx))

        request_url = wbrequest.wb_url.url.lower()
        location_url = status_headers.get_header("Location")
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith("/"):
            # relative Location: resolve against the capture's host
            host = urlsplit(cdx["url"]).netloc
            location_url = host + location_url

        if ReplayView.strip_scheme_www(request_url) == ReplayView.strip_scheme_www(location_url):
            raise CaptureException("Self Redirect: " + str(cdx))

    # TODO: reevaluate this, as it may reject valid refreshes of a page
    def _reject_referrer_self_redirect(self, wbrequest):  # pragma: no cover
        """
        Perform final check for referrer based self-redirect.
        This method should be called after verifying that
        the request timestamp == capture timestamp

        If referrer is same as current url,
        reject this response and try another capture.
        """
        if not wbrequest.referrer:
            return

        # build full url even if using relative-rewriting
        request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)

        if ReplayView.strip_scheme_www(request_url) == ReplayView.strip_scheme_www(wbrequest.referrer):
            raise CaptureException("Self Redirect via Referrer: " + str(wbrequest.wb_url))

    @staticmethod
    def strip_scheme_www(url):
        """
        Remove the scheme and any www/wwwN prefix for url comparison.

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http://example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http:/example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http://www2.example.com')
        True

        >>> ReplayView.strip_scheme_www('about://example.com') ==\
            ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('http://') ==\
            ReplayView.strip_scheme_www('')
        True

        >>> ReplayView.strip_scheme_www('#!@?') ==\
            ReplayView.strip_scheme_www('#!@?')
        True
        """
        m = ReplayView.STRIP_SCHEME_WWW.match(url)
        match = m.group(2)
        return match
class PlatformHandler(RewriteHandler):
    """Rewrite handler that reconstructs the client request, POSTs it to an
    upstream replay service, parses the returned WARC record, and rewrites
    its content for the client.
    """

    def __init__(self, config):
        super(PlatformHandler, self).__init__(config)
        self.upstream_url = config.get('upstream_url')
        self.loader = ArcWarcRecordLoader()

        framed = config.get('framed_replay')
        self.content_rewriter = RewriteContent(is_framed_replay=framed)

    def render_content(self, wbrequest):
        """Proxy the request upstream and return the rewritten response.

        :raises requests.HTTPError: if the upstream service returns an error
        """
        if wbrequest.wb_url.mod == 'vi_':
            # video-info modifier is handled separately
            return self._get_video_info(wbrequest)

        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            # un-rewrite the referrer back to the original url
            wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url

        urlkey = canonicalize(wbrequest.wb_url.url)
        url = wbrequest.wb_url.url

        inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        # NOTE(review): Content-Length value is an int here; most header
        # handling expects str -- confirm requests handles this as intended
        headers = {'Content-Length': len(req_data),
                   'Content-Type': 'application/request'}

        if wbrequest.wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wbrequest.wb_url.timestamp

        upstream_url = self.upstream_url.format(url=quote(url),
                                                closest=closest,
                                                #coll=wbrequest.coll,
                                                **wbrequest.matchdict)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True,
                          allow_redirects=False)

        r.raise_for_status()

        record = self.loader.parse_record_stream(r.raw)

        # build a minimal cdx entry for the rewriter from response metadata
        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
        result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx)

        status_headers, gen, is_rw = result

        return self._make_response(wbrequest, *result)