def _resolve_text_type(mod, text_type, stream): # only attempt to resolve between html and other text types if text_type != 'html': return mod, stream buff = stream.read(128) wrapped_stream = BufferedReader(stream, starting_data=buff) # check if starts with a tag, then likely html if RewriteContent.TAG_REGEX.match(buff): mod = 'html' return mod, wrapped_stream
def handle_connect(self, env):
    """Handle a proxy CONNECT request by terminating TLS locally.

    Replies ``200 Connection Established`` on the raw request socket,
    wraps that socket in a server-side SSL socket using a certificate
    obtained from ``self.ca``, then reads the tunnelled request line and
    headers and rewrites ``env`` to describe the inner https request.

    :param env: WSGI environ; ``REL_REQUEST_URI`` must hold the CONNECT
        target in ``host:port`` form. Mutated in place.
    :return: an error ``WbResponse`` when no raw socket is available,
        otherwise ``None`` (the result is the updated ``env``).
    :raises BadRequestException: if the TLS handshake or the first read
        fails, or if the tunnelled request line is malformed.
    """
    sock = self.get_request_socket(env)
    if not sock:
        return WbResponse.text_response('HTTPS Proxy Not Supported',
                                        '405 HTTPS Proxy Not Supported')

    sock.send('HTTP/1.0 200 Connection Established\r\n')
    sock.send('Server: pywb proxy\r\n')
    sock.send('\r\n')

    hostname, port = env['REL_REQUEST_URI'].split(':')

    if not self.use_wildcard:
        certfile = self.ca.cert_for_host(hostname)
    else:
        certfile = self.ca.get_wildcard_cert(hostname)

    try:
        ssl_sock = ssl.wrap_socket(sock,
                                   server_side=True,
                                   certfile=certfile,
                                   suppress_ragged_eofs=False,
                                   ssl_version=ssl.PROTOCOL_SSLv23)
        env['pywb.proxy_ssl_sock'] = ssl_sock

        buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)

        statusline = buffreader.readline().rstrip()

    except Exception as se:
        # ``.message`` is missing on Python 3 and empty for multi-arg
        # exceptions on Python 2, so fall back to str() to keep the
        # real cause visible.
        raise BadRequestException(getattr(se, 'message', '') or str(se))

    statusparts = statusline.split(' ')

    if len(statusparts) < 3:
        raise BadRequestException('Invalid Proxy Request: ' + statusline)

    # Drop the port only when it is exactly the https default; a plain
    # substring replace of ':443' would also corrupt e.g. 'host:4430'.
    if port == '443':
        authority = hostname
    else:
        authority = hostname + ':' + port

    env['REQUEST_METHOD'] = statusparts[0]
    env['REL_REQUEST_URI'] = 'https://' + authority + statusparts[1]
    env['SERVER_PROTOCOL'] = statusparts[2].strip()

    env['pywb.proxy_scheme'] = 'https'
    env['pywb.proxy_host'] = hostname
    env['pywb.proxy_port'] = port
    env['pywb.proxy_req_uri'] = statusparts[1]

    queryparts = env['REL_REQUEST_URI'].split('?', 1)
    env['PATH_INFO'] = queryparts[0]
    env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
    env['pywb.proxy_query'] = env['QUERY_STRING']

    # Copy the tunnelled request headers into the environ using the
    # WSGI naming scheme, stopping at the blank line (or EOF).
    while True:
        line = buffreader.readline()
        if line:
            line = line.rstrip()

        if not line:
            break

        parts = line.split(':', 1)
        if len(parts) < 2:
            # not a 'Name: value' line, ignore it
            continue

        name = parts[0].strip().replace('-', '_').upper()
        value = parts[1].strip()

        if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
            name = 'HTTP_' + name

        env[name] = value

    # If part of the body was already pulled into the buffer, hand it to
    # the application through a fresh reader seeded with those bytes.
    # NOTE(review): when nothing is buffered 'wsgi.input' is left as it
    # was -- confirm that is the intended input stream in that case.
    remain = buffreader.rem_length()
    if remain > 0:
        remainder = buffreader.read(self.BLOCK_SIZE)
        env['wsgi.input'] = BufferedReader(ssl_sock,
                                           block_size=self.BLOCK_SIZE,
                                           starting_data=remainder)
def rewrite_content(self, urlrewriter, status_headers, stream,
                    head_insert_func=None, urlkey='',
                    cdx=None, cookie_rewriter=None, env=None):
    """Rewrite a response (headers and, for text types, body).

    :param urlrewriter: url rewriter for this request; its ``wburl``
        selects the rewriting mode.
    :param status_headers: status line and headers of the response.
    :param stream: readable stream with the response body.
    :param head_insert_func: optional callable ``(rule, cdx)`` returning
        the text to insert into html.
    :param urlkey: key used to look up the matching rewrite rule.
    :param cdx: optional index record; a truthy ``is_live`` entry marks
        the rewriter options as live.
    :param cookie_rewriter: passed through to header rewriting.
    :param env: passed through to ``handle_custom_rewrite``.
    :return: whatever ``handle_custom_rewrite`` returned if it was
        truthy, otherwise a tuple ``(status_headers, generator, flag)``
        where ``flag`` is True only when the body goes through a text
        rewriter.
    """
    wb_url = urlrewriter.wburl

    # Identity mode, or banner-only without anything to insert:
    # pass the content through after sanitizing.
    if (wb_url.is_identity or
            (not head_insert_func and wb_url.is_banner_only)):
        status_headers, stream = self.sanitize_content(status_headers,
                                                       stream)
        return (status_headers, self.stream_to_gen(stream), False)

    if urlrewriter and cdx and cdx.get('is_live'):
        urlrewriter.rewrite_opts['is_live'] = True

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                        rule,
                                                        status_headers,
                                                        stream,
                                                        urlkey,
                                                        cookie_rewriter)

    res = self.handle_custom_rewrite(rewritten_headers,
                                     stream,
                                     urlrewriter,
                                     wb_url.mod,
                                     env)
    if res:
        return res

    # Handle text content rewriting
    # ====================================================================
    status_headers = rewritten_headers.status_headers
    text_type = rewritten_headers.text_type

    # an explicit js_/cs_ modifier may override the detected text type
    mod = wb_url.mod

    first_buff = b''

    # special case -- the body may need to be decompressed first;
    # try each supported decompressor until one matches
    for decomp_type in BufferedReader.get_supported_decompressors():
        matched, stream = self._check_encoding(rewritten_headers,
                                               stream,
                                               decomp_type)
        if matched:
            break

    if mod == 'js_':
        text_type, stream = self._resolve_text_type('js',
                                                    text_type,
                                                    stream)
    elif mod == 'cs_':
        text_type, stream = self._resolve_text_type('css',
                                                    text_type,
                                                    stream)

    # for proxy mode: use special js_proxy rewriter
    # which may be none rewriter + custom rules (if any)
    if text_type == 'js' and not urlrewriter.prefix:
        rewriter_class = rule.rewriters['js_proxy']
    else:
        rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == 'html':
        head_insert_str = ''
        charset = rewritten_headers.charset

        # if no charset set, attempt to extract from first 1024
        if not rewritten_headers.charset:
            first_buff = stream.read(1024)
            charset = self._extract_html_charset(first_buff,
                                                 status_headers)

        if head_insert_func and not wb_url.is_url_rewrite_only:
            head_insert_orig = head_insert_func(rule, cdx)

            if charset:
                try:
                    head_insert_str = webencodings.encode(head_insert_orig,
                                                          charset)
                except Exception:
                    # best effort: unknown or unsuitable charset, fall
                    # through to the utf-8 encoding below
                    pass

            if not head_insert_str:
                charset = 'utf-8'
                head_insert_str = head_insert_orig.encode(charset)

            # keep the encoded bytes for the banner-only generator; the
            # html rewriter gets a byte-for-byte latin-1 decoding
            head_insert_buf = head_insert_str
            head_insert_str = head_insert_str.decode('iso-8859-1')

        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_buf,
                                             stream,
                                             first_buff)

            # grow Content-Length by the size of the insert, if the
            # header holds a usable number
            content_len = status_headers.get_header('Content-Length')
            try:
                content_len = int(content_len)
            except Exception:
                content_len = None

            if content_len and content_len >= 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header('Content-Length',
                                              content_len)

            return (status_headers, gen, False)

        # if proxy, use js_proxy rewriter
        if not urlrewriter.prefix:
            js_rewriter_class = rule.rewriters['js_proxy']
        else:
            js_rewriter_class = rule.rewriters['js']

        css_rewriter_class = rule.rewriters['css']

        if wb_url.is_url_rewrite_only:
            js_rewriter_class = JSNoneRewriter

        rewriter = rewriter_class(urlrewriter,
                                  js_rewriter_class=js_rewriter_class,
                                  css_rewriter_class=css_rewriter_class,
                                  head_insert=head_insert_str,
                                  url=wb_url.url,
                                  defmod=self.defmod,
                                  parse_comments=rule.parse_comments)

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # url-rewrite-only mode on js: swap in the link-only js rewriter
        if wb_url.is_url_rewrite_only and text_type == 'js':
            rewriter_class = JSLinkOnlyRewriter

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # align to line end for all non-html rewriting
    align = (text_type != 'html')

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(stream,
                                          rewrite_func=rewriter.rewrite,
                                          final_read_func=rewriter.close,
                                          first_buff=first_buff,
                                          align_to_line=align)

    return (status_headers, gen, True)
def handle_connect(self, env):
    """Handle a proxy CONNECT request by terminating TLS locally.

    Replies ``200 Connection Established`` on the raw request socket,
    wraps that socket in a server-side SSL socket using a certificate
    obtained from ``self.ca``, then reads the tunnelled request line and
    headers and rewrites ``env`` to describe the inner https request.
    ``env['wsgi.input']`` is pointed at the buffered SSL reader.

    :param env: WSGI environ; ``REL_REQUEST_URI`` must hold the CONNECT
        target in ``host:port`` form. Mutated in place.
    :return: an error ``WbResponse`` when no raw socket is available,
        otherwise ``None`` (the result is the updated ``env``).
    :raises BadRequestException: if the TLS handshake or the first read
        fails, or if the tunnelled request line is malformed.
    """
    sock = self.get_request_socket(env)
    if not sock:
        return WbResponse.text_response('HTTPS Proxy Not Supported',
                                        '405 HTTPS Proxy Not Supported')

    sock.send(b'HTTP/1.0 200 Connection Established\r\n')
    sock.send(b'Proxy-Connection: close\r\n')
    sock.send(b'Server: pywb proxy\r\n')
    sock.send(b'\r\n')

    hostname, port = env['REL_REQUEST_URI'].split(':')

    if not self.use_wildcard:
        certfile = self.ca.cert_for_host(hostname)
    else:
        certfile = self.ca.get_wildcard_cert(hostname)

    try:
        ssl_sock = ssl.wrap_socket(sock,
                                   server_side=True,
                                   certfile=certfile,
                                   suppress_ragged_eofs=False,
                                   ssl_version=ssl.PROTOCOL_SSLv23)
        env['pywb.proxy_ssl_sock'] = ssl_sock

        buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)

        statusline = to_native_str(buffreader.readline().rstrip())

    except Exception as se:
        # ``.message`` is missing on Python 3 and empty for multi-arg
        # exceptions on Python 2, so fall back to str() to keep the
        # real cause visible.
        raise BadRequestException(getattr(se, 'message', '') or str(se))

    statusparts = statusline.split(' ')

    if len(statusparts) < 3:
        raise BadRequestException('Invalid Proxy Request: ' + statusline)

    # Drop the port only when it is exactly the https default; a plain
    # substring replace of ':443' would also corrupt e.g. 'host:4430'.
    if port == '443':
        authority = hostname
    else:
        authority = hostname + ':' + port

    env['REQUEST_METHOD'] = statusparts[0]
    env['REL_REQUEST_URI'] = 'https://' + authority + statusparts[1]
    env['SERVER_PROTOCOL'] = statusparts[2].strip()

    env['pywb.proxy_scheme'] = 'https'
    env['pywb.proxy_host'] = hostname
    env['pywb.proxy_port'] = port
    env['pywb.proxy_req_uri'] = statusparts[1]

    queryparts = env['REL_REQUEST_URI'].split('?', 1)
    env['PATH_INFO'] = queryparts[0]
    env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
    env['pywb.proxy_query'] = env['QUERY_STRING']

    # Copy the tunnelled request headers into the environ using the
    # WSGI naming scheme, stopping at the blank line (or EOF).
    while True:
        line = to_native_str(buffreader.readline())
        if line:
            line = line.rstrip()

        if not line:
            break

        parts = line.split(':', 1)
        if len(parts) < 2:
            # not a 'Name: value' line, ignore it
            continue

        name = parts[0].strip().replace('-', '_').upper()
        value = parts[1].strip()

        if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
            name = 'HTTP_' + name

        env[name] = value

    # The request body (if any) is read from the same buffered SSL reader.
    env['wsgi.input'] = buffreader
def rewrite_content(self, urlrewriter, status_headers, stream,
                    head_insert_func=None, urlkey='',
                    cdx=None, cookie_rewriter=None, env=None):
    """Rewrite a response (headers and, for text types, body).

    :param urlrewriter: url rewriter for this request; its ``wburl``
        selects the rewriting mode.
    :param status_headers: status line and headers of the response.
    :param stream: readable stream with the response body.
    :param head_insert_func: optional callable ``(rule, cdx)`` returning
        the text to insert into html.
    :param urlkey: key used to look up the matching rewrite rule.
    :param cdx: optional index record; a truthy ``is_live`` entry marks
        the rewriter options as live.
    :param cookie_rewriter: passed through to header rewriting.
    :param env: passed through to ``handle_custom_rewrite``.
    :return: whatever ``handle_custom_rewrite`` returned if it was
        truthy, otherwise a tuple ``(status_headers, generator, flag)``
        where ``flag`` is True only when the body goes through a text
        rewriter.
    """
    wb_url = urlrewriter.wburl

    # Identity mode, or banner-only without anything to insert:
    # pass the content through after sanitizing.
    if (wb_url.is_identity or
            (not head_insert_func and wb_url.is_banner_only)):
        status_headers, stream = self.sanitize_content(status_headers,
                                                       stream)
        return (status_headers, self.stream_to_gen(stream), False)

    if urlrewriter and cdx and cdx.get('is_live'):
        urlrewriter.rewrite_opts['is_live'] = True

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                        rule,
                                                        status_headers,
                                                        stream,
                                                        urlkey,
                                                        cookie_rewriter)

    status_headers = rewritten_headers.status_headers

    res = self.handle_custom_rewrite(rewritten_headers.text_type,
                                     status_headers,
                                     stream,
                                     env)
    if res:
        return res

    # Handle text content rewriting
    # ====================================================================
    text_type = rewritten_headers.text_type

    # an explicit js_/cs_ modifier may override the detected text type
    mod = wb_url.mod

    first_buff = b''

    # special case -- the body may need to be decompressed first;
    # try each supported decompressor until one matches
    for decomp_type in BufferedReader.get_supported_decompressors():
        matched, stream = self._check_encoding(rewritten_headers,
                                               stream,
                                               decomp_type)
        if matched:
            break

    if mod == 'js_':
        text_type, stream = self._resolve_text_type('js',
                                                    text_type,
                                                    stream)
    elif mod == 'cs_':
        text_type, stream = self._resolve_text_type('css',
                                                    text_type,
                                                    stream)

    rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == 'html':
        head_insert_str = ''
        charset = rewritten_headers.charset

        # if no charset set, attempt to extract from first 1024
        if not rewritten_headers.charset:
            first_buff = stream.read(1024)
            charset = self._extract_html_charset(first_buff,
                                                 status_headers)

        if head_insert_func and not wb_url.is_url_rewrite_only:
            head_insert_orig = head_insert_func(rule, cdx)

            if charset:
                try:
                    head_insert_str = webencodings.encode(head_insert_orig,
                                                          charset)
                except Exception:
                    # best effort: unknown or unsuitable charset, fall
                    # through to the utf-8 encoding below
                    pass

            if not head_insert_str:
                charset = 'utf-8'
                head_insert_str = head_insert_orig.encode(charset)

            # keep the encoded bytes for the banner-only generator; the
            # html rewriter gets a byte-for-byte latin-1 decoding
            head_insert_buf = head_insert_str
            head_insert_str = head_insert_str.decode('iso-8859-1')

        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_buf,
                                             stream,
                                             first_buff)

            # grow Content-Length by the size of the insert, if the
            # header holds a usable number
            content_len = status_headers.get_header('Content-Length')
            try:
                content_len = int(content_len)
            except Exception:
                content_len = None

            if content_len and content_len >= 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header('Content-Length',
                                              content_len)

            return (status_headers, gen, False)

        js_rewriter_class = rule.rewriters['js']
        css_rewriter_class = rule.rewriters['css']

        if wb_url.is_url_rewrite_only:
            js_rewriter_class = JSNoneRewriter

        rewriter = rewriter_class(urlrewriter,
                                  js_rewriter_class=js_rewriter_class,
                                  css_rewriter_class=css_rewriter_class,
                                  head_insert=head_insert_str,
                                  url=wb_url.url,
                                  defmod=self.defmod,
                                  parse_comments=rule.parse_comments)

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # url-rewrite-only mode on js: swap in the link-only js rewriter
        if wb_url.is_url_rewrite_only and text_type == 'js':
            rewriter_class = JSLinkOnlyRewriter

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # align to line end for all non-html rewriting
    align = (text_type != 'html')

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(stream,
                                          rewrite_func=rewriter.rewrite,
                                          final_read_func=rewriter.close,
                                          first_buff=first_buff,
                                          align_to_line=align)

    return (status_headers, gen, True)