def get_head_insert(self, rwinfo, rule, head_insert_func, cdx):
    """Return the head-insert markup encoded for the response charset.

    When ``rwinfo.charset`` is not set, ``rwinfo.read_and_keep(1024)`` is
    called and the result is handed to ``self.extract_html_charset``; the
    detected value is stored back on ``rwinfo.charset``.

    :param rwinfo: object exposing a ``charset`` attribute and a
        ``read_and_keep(size)`` method.
    :param rule: forwarded unchanged to ``head_insert_func``.
    :param head_insert_func: optional callable ``(rule, cdx) -> str`` that
        produces the text to insert; when falsy nothing is generated.
    :param cdx: forwarded unchanged to ``head_insert_func``.
    :return: ``''`` when ``head_insert_func`` is falsy; otherwise the encoded
        insert decoded as ISO-8859-1, i.e. a ``str`` holding exactly one
        character per encoded byte.
    """
    # if no charset set, attempt to extract from first 1024
    if not rwinfo.charset:
        first_buff = rwinfo.read_and_keep(1024)
        rwinfo.charset = self.extract_html_charset(first_buff)

    if not head_insert_func:
        return ''

    head_insert_orig = head_insert_func(rule, cdx)
    head_insert_buf = b''

    if rwinfo.charset:
        try:
            head_insert_buf = webencodings.encode(head_insert_orig,
                                                  rwinfo.charset)
        except Exception:
            # Best effort: an unknown label or unencodable text falls through
            # to the ASCII fallback below.  ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            head_insert_buf = b''

    # no charset detected (or encoding failed/was empty): encode the banner
    # as ascii html entities
    if not head_insert_buf:
        head_insert_buf = head_insert_orig.encode('ascii',
                                                  'xmlcharrefreplace')

    # ISO-8859-1 maps each byte to the code point of the same value, so the
    # encoded bytes survive the round trip through ``str`` untouched.
    return head_insert_buf.decode('iso-8859-1')
def get_head_insert(self, rwinfo, rule, head_insert_func, cdx):
    """Return the head-insert markup encoded for the response charset.

    The charset is taken from ``rwinfo.charset``; when that is not set,
    ``rwinfo.read_and_keep(1024)`` is called and the result is handed to
    ``self.extract_html_charset``.  The detected charset is only used
    locally -- ``rwinfo`` is not modified.

    :param rwinfo: object exposing a ``charset`` attribute and a
        ``read_and_keep(size)`` method.
    :param rule: forwarded unchanged to ``head_insert_func``.
    :param head_insert_func: optional callable ``(rule, cdx) -> str`` that
        produces the text to insert; when falsy nothing is generated.
    :param cdx: forwarded unchanged to ``head_insert_func``.
    :return: ``''`` when ``head_insert_func`` is falsy; otherwise the encoded
        insert decoded as ISO-8859-1, i.e. a ``str`` holding exactly one
        character per encoded byte.
    """
    charset = rwinfo.charset

    # if no charset set, attempt to extract from first 1024
    if not charset:
        first_buff = rwinfo.read_and_keep(1024)
        charset = self.extract_html_charset(first_buff)

    if not head_insert_func:
        return ''

    head_insert_orig = head_insert_func(rule, cdx)
    head_insert_buf = b''

    if charset:
        try:
            head_insert_buf = webencodings.encode(head_insert_orig, charset)
        except Exception:
            # Best effort: an unknown label or unencodable text falls through
            # to the UTF-8 fallback below.  ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            head_insert_buf = b''

    # no usable charset (or encoding failed/was empty): fall back to UTF-8
    if not head_insert_buf:
        head_insert_buf = head_insert_orig.encode('utf-8')

    # ISO-8859-1 maps each byte to the code point of the same value, so the
    # encoded bytes survive the round trip through ``str`` untouched.
    return head_insert_buf.decode('iso-8859-1')
def rewrite_content(self, urlrewriter, status_headers, stream,
                    head_insert_func=None, urlkey='', cdx=None,
                    cookie_rewriter=None, env=None):
    """Rewrite response headers and, for text content, the response body.

    :param urlrewriter: rewriter object; its ``wburl``, ``prefix`` and
        ``rewrite_opts`` attributes are used here.
    :param status_headers: original status/headers object, handed to
        ``sanitize_content`` or ``_rewrite_headers``.
    :param stream: body stream; ``read(1024)`` is called on it when the
        html charset has to be sniffed.
    :param head_insert_func: optional callable ``(rule, cdx) -> str``
        producing the markup to insert into html.
    :param urlkey: key used to select a rule via
        ``self.ruleset.get_first_match``.
    :param cdx: optional mapping; a truthy ``cdx.get('is_live')`` sets
        ``urlrewriter.rewrite_opts['is_live']``.
    :param cookie_rewriter: forwarded to ``_rewrite_headers``.
    :param env: forwarded to ``handle_custom_rewrite``.
    :return: whatever ``handle_custom_rewrite`` returns when that is truthy;
        otherwise a tuple ``(status_headers, body_generator, flag)`` where
        ``flag`` is ``True`` only when the body is produced by
        ``rewrite_text_stream_to_gen``.
    """
    wb_url = urlrewriter.wburl

    # identity replay, or banner-only with no banner to insert:
    # pass the (sanitized) content through unchanged
    if (wb_url.is_identity or
            (not head_insert_func and wb_url.is_banner_only)):
        status_headers, stream = self.sanitize_content(status_headers,
                                                       stream)
        return (status_headers, self.stream_to_gen(stream), False)

    if urlrewriter and cdx and cdx.get('is_live'):
        urlrewriter.rewrite_opts['is_live'] = True

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                        rule,
                                                        status_headers,
                                                        stream,
                                                        urlkey,
                                                        cookie_rewriter)

    res = self.handle_custom_rewrite(rewritten_headers,
                                     stream,
                                     urlrewriter,
                                     wb_url.mod,
                                     env)
    if res:
        return res

    # Handle text content rewriting
    # ====================================================================
    # special case -- need to ungzip the body
    status_headers = rewritten_headers.status_headers
    text_type = rewritten_headers.text_type

    # see known js/css modifier specified, the context should run
    # default text_type
    mod = wb_url.mod

    first_buff = b''

    # stop at the first decompressor that matches
    for decomp_type in BufferedReader.get_supported_decompressors():
        matched, stream = self._check_encoding(rewritten_headers,
                                               stream,
                                               decomp_type)
        if matched:
            break

    if mod == 'js_':
        text_type, stream = self._resolve_text_type('js', text_type, stream)
    elif mod == 'cs_':
        text_type, stream = self._resolve_text_type('css', text_type, stream)

    # for proxy mode: use special js_proxy rewriter
    # which may be none rewriter + custom rules (if any)
    if text_type == 'js' and not urlrewriter.prefix:
        rewriter_class = rule.rewriters['js_proxy']
    else:
        rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == 'html':
        head_insert_str = ''
        # bytes form of the insert; initialised so the banner-only branch
        # below never sees an unbound name
        head_insert_buf = b''
        charset = rewritten_headers.charset

        # if no charset set, attempt to extract from first 1024
        if not rewritten_headers.charset:
            first_buff = stream.read(1024)
            charset = self._extract_html_charset(first_buff, status_headers)

        if head_insert_func and not wb_url.is_url_rewrite_only:
            head_insert_orig = head_insert_func(rule, cdx)

            if charset:
                try:
                    head_insert_buf = webencodings.encode(head_insert_orig,
                                                          charset)
                except Exception:
                    # best effort: fall through to the utf-8 fallback
                    # (KeyboardInterrupt/SystemExit still propagate)
                    head_insert_buf = b''

            if not head_insert_buf:
                charset = 'utf-8'
                head_insert_buf = head_insert_orig.encode(charset)

            # iso-8859-1 maps one byte to one character, so the str form
            # has the same length as the encoded bytes
            head_insert_str = head_insert_buf.decode('iso-8859-1')

        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_buf,
                                             stream,
                                             first_buff)

            content_len = status_headers.get_header('Content-Length')
            try:
                content_len = int(content_len)
            except Exception:
                content_len = None

            # NOTE(review): a Content-Length of 0 is falsy and is therefore
            # left unchanged here -- confirm this is intended.
            if content_len and content_len >= 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header('Content-Length', content_len)

            return (status_headers, gen, False)

        # if proxy, use js_proxy rewriter
        if not urlrewriter.prefix:
            js_rewriter_class = rule.rewriters['js_proxy']
        else:
            js_rewriter_class = rule.rewriters['js']

        css_rewriter_class = rule.rewriters['css']

        if wb_url.is_url_rewrite_only:
            js_rewriter_class = JSNoneRewriter

        rewriter = rewriter_class(urlrewriter,
                                  js_rewriter_class=js_rewriter_class,
                                  css_rewriter_class=css_rewriter_class,
                                  head_insert=head_insert_str,
                                  url=wb_url.url,
                                  defmod=self.defmod,
                                  parse_comments=rule.parse_comments)

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # url-rewrite-only mode: js gets the link-only rewriter instead of
        # the rule's js rewriter
        if wb_url.is_url_rewrite_only and text_type == 'js':
            rewriter_class = JSLinkOnlyRewriter

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # align to line end for all non-html rewriting
    align = (text_type != 'html')

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(stream,
                                          rewrite_func=rewriter.rewrite,
                                          final_read_func=rewriter.close,
                                          first_buff=first_buff,
                                          align_to_line=align)

    return (status_headers, gen, True)
def rewrite_content(self, urlrewriter, status_headers, stream,
                    head_insert_func=None, urlkey='', cdx=None,
                    cookie_rewriter=None, env=None):
    """Rewrite response headers and, for text content, the response body.

    :param urlrewriter: rewriter object; its ``wburl`` and ``rewrite_opts``
        attributes are used here.
    :param status_headers: original status/headers object, handed to
        ``sanitize_content`` or ``_rewrite_headers``.
    :param stream: body stream; ``read(1024)`` is called on it when the
        html charset has to be sniffed.
    :param head_insert_func: optional callable ``(rule, cdx) -> str``
        producing the markup to insert into html.
    :param urlkey: key used to select a rule via
        ``self.ruleset.get_first_match``.
    :param cdx: optional mapping; a truthy ``cdx.get('is_live')`` sets
        ``urlrewriter.rewrite_opts['is_live']``.
    :param cookie_rewriter: forwarded to ``_rewrite_headers``.
    :param env: forwarded to ``handle_custom_rewrite``.
    :return: whatever ``handle_custom_rewrite`` returns when that is truthy;
        otherwise a tuple ``(status_headers, body_generator, flag)`` where
        ``flag`` is ``True`` only when the body is produced by
        ``rewrite_text_stream_to_gen``.
    """
    wb_url = urlrewriter.wburl

    # identity replay, or banner-only with no banner to insert:
    # pass the (sanitized) content through unchanged
    if (wb_url.is_identity or
            (not head_insert_func and wb_url.is_banner_only)):
        status_headers, stream = self.sanitize_content(status_headers,
                                                       stream)
        return (status_headers, self.stream_to_gen(stream), False)

    if urlrewriter and cdx and cdx.get('is_live'):
        urlrewriter.rewrite_opts['is_live'] = True

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                        rule,
                                                        status_headers,
                                                        stream,
                                                        urlkey,
                                                        cookie_rewriter)

    status_headers = rewritten_headers.status_headers

    res = self.handle_custom_rewrite(rewritten_headers.text_type,
                                     status_headers,
                                     stream,
                                     env)
    if res:
        return res

    # Handle text content rewriting
    # ====================================================================
    # special case -- need to ungzip the body
    text_type = rewritten_headers.text_type

    # see known js/css modifier specified, the context should run
    # default text_type
    mod = wb_url.mod

    first_buff = b''

    # stop at the first decompressor that matches
    for decomp_type in BufferedReader.get_supported_decompressors():
        matched, stream = self._check_encoding(rewritten_headers,
                                               stream,
                                               decomp_type)
        if matched:
            break

    if mod == 'js_':
        text_type, stream = self._resolve_text_type('js', text_type, stream)
    elif mod == 'cs_':
        text_type, stream = self._resolve_text_type('css', text_type, stream)

    rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == 'html':
        head_insert_str = ''
        # bytes form of the insert; initialised so the banner-only branch
        # below never sees an unbound name
        head_insert_buf = b''
        charset = rewritten_headers.charset

        # if no charset set, attempt to extract from first 1024
        if not rewritten_headers.charset:
            first_buff = stream.read(1024)
            charset = self._extract_html_charset(first_buff, status_headers)

        if head_insert_func and not wb_url.is_url_rewrite_only:
            head_insert_orig = head_insert_func(rule, cdx)

            if charset:
                try:
                    head_insert_buf = webencodings.encode(head_insert_orig,
                                                          charset)
                except Exception:
                    # best effort: fall through to the utf-8 fallback
                    # (KeyboardInterrupt/SystemExit still propagate)
                    head_insert_buf = b''

            if not head_insert_buf:
                charset = 'utf-8'
                head_insert_buf = head_insert_orig.encode(charset)

            # iso-8859-1 maps one byte to one character, so the str form
            # has the same length as the encoded bytes
            head_insert_str = head_insert_buf.decode('iso-8859-1')

        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_buf,
                                             stream,
                                             first_buff)

            content_len = status_headers.get_header('Content-Length')
            try:
                content_len = int(content_len)
            except Exception:
                content_len = None

            # NOTE(review): a Content-Length of 0 is falsy and is therefore
            # left unchanged here -- confirm this is intended.
            if content_len and content_len >= 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header('Content-Length', content_len)

            return (status_headers, gen, False)

        js_rewriter_class = rule.rewriters['js']
        css_rewriter_class = rule.rewriters['css']

        if wb_url.is_url_rewrite_only:
            js_rewriter_class = JSNoneRewriter

        rewriter = rewriter_class(urlrewriter,
                                  js_rewriter_class=js_rewriter_class,
                                  css_rewriter_class=css_rewriter_class,
                                  head_insert=head_insert_str,
                                  url=wb_url.url,
                                  defmod=self.defmod,
                                  parse_comments=rule.parse_comments)

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # url-rewrite-only mode: js gets the link-only rewriter instead of
        # the rule's js rewriter
        if wb_url.is_url_rewrite_only and text_type == 'js':
            rewriter_class = JSLinkOnlyRewriter

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # align to line end for all non-html rewriting
    align = (text_type != 'html')

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(stream,
                                          rewrite_func=rewriter.rewrite,
                                          final_read_func=rewriter.close,
                                          first_buff=first_buff,
                                          align_to_line=align)

    return (status_headers, gen, True)
def rewrite_content(self, urlrewriter, status_headers, stream,
                    head_insert_func=None, urlkey="", cdx=None):
    """Rewrite response headers and, for text content, the response body.

    :param urlrewriter: rewriter object; its ``wburl`` attribute is read
        here.  In banner-only mode ``None`` is passed on in its place.
    :param status_headers: original status/headers object, handed to
        ``sanitize_content`` or ``_rewrite_headers``.
    :param stream: body stream; ``read(1024)`` is called on it when the
        html charset has to be sniffed.
    :param head_insert_func: optional callable ``(rule, cdx)`` producing
        the markup to insert into html.
    :param urlkey: key used to select a rule via
        ``self.ruleset.get_first_match``.
    :param cdx: forwarded unchanged to ``head_insert_func``.
    :return: a tuple ``(status_headers, body_generator, flag)`` where
        ``flag`` is ``True`` only when the body is produced by
        ``rewrite_text_stream_to_gen``.
    """
    wb_url = urlrewriter.wburl

    # identity replay, or banner-only with no banner to insert:
    # pass the (sanitized) content through unchanged
    if wb_url.is_identity or (not head_insert_func and wb_url.is_banner_only):
        status_headers, stream = self.sanitize_content(status_headers, stream)
        return (status_headers, self.stream_to_gen(stream), False)

    # banner-only: headers are rewritten without a url rewriter
    if wb_url.is_banner_only:
        urlrewriter = None

    rule = self.ruleset.get_first_match(urlkey)

    (rewritten_headers, stream) = self._rewrite_headers(
        urlrewriter, rule, status_headers, stream
    )

    status_headers = rewritten_headers.status_headers

    # use rewritten headers, but no further rewriting needed
    if rewritten_headers.text_type is None:
        return (status_headers, self.stream_to_gen(stream), False)

    # Handle text content rewriting
    # ====================================================================
    # special case -- need to ungzip the body
    text_type = rewritten_headers.text_type

    # see known js/css modifier specified, the context should run
    # default text_type
    mod = wb_url.mod

    # bytes, not text: this buffer is filled from stream.read() below
    first_buff = b""

    stream = self._check_encoding(rewritten_headers, stream, "gzip")
    stream = self._check_encoding(rewritten_headers, stream, "deflate")

    if mod == "js_":
        text_type, stream = self._resolve_text_type("js", text_type, stream)
    elif mod == "cs_":
        text_type, stream = self._resolve_text_type("css", text_type, stream)

    rewriter_class = rule.rewriters[text_type]

    # for html, need to perform header insert, supply js, css, xml
    # rewriters
    if text_type == "html":
        head_insert_str = ""
        charset = rewritten_headers.charset

        # if no charset set, attempt to extract from first 1024
        if not rewritten_headers.charset:
            first_buff = stream.read(1024)
            charset = self._extract_html_charset(first_buff, status_headers)

        if head_insert_func and not wb_url.is_url_rewrite_only:
            head_insert_orig = head_insert_func(rule, cdx)

            if charset:
                try:
                    head_insert_str = webencodings.encode(head_insert_orig,
                                                          charset)
                except Exception:
                    # best effort: fall through to the utf-8 fallback
                    # (KeyboardInterrupt/SystemExit still propagate)
                    pass

            if not head_insert_str:
                charset = "utf-8"
                head_insert_str = head_insert_orig.encode(charset)

        if wb_url.is_banner_only:
            gen = self._head_insert_only_gen(head_insert_str, stream,
                                             first_buff)

            content_len = status_headers.get_header("Content-Length")
            try:
                content_len = int(content_len)
            except Exception:
                content_len = None

            # NOTE(review): a Content-Length of 0 is falsy and is therefore
            # left unchanged here -- confirm this is intended.
            if content_len and content_len >= 0:
                content_len = str(content_len + len(head_insert_str))
                status_headers.replace_header("Content-Length", content_len)

            return (status_headers, gen, False)

        js_rewriter_class = rule.rewriters["js"]
        css_rewriter_class = rule.rewriters["css"]

        if wb_url.is_url_rewrite_only:
            js_rewriter_class = JSNoneRewriter

        rewriter = rewriter_class(
            urlrewriter,
            js_rewriter_class=js_rewriter_class,
            css_rewriter_class=css_rewriter_class,
            head_insert=head_insert_str,
            url=wb_url.url,
            defmod=self.defmod,
            parse_comments=rule.parse_comments,
        )

    else:
        if wb_url.is_banner_only:
            return (status_headers, self.stream_to_gen(stream), False)

        # url-rewrite-only mode: js gets the link-only rewriter instead of
        # the rule's js rewriter
        if wb_url.is_url_rewrite_only and text_type == "js":
            rewriter_class = JSLinkOnlyRewriter

        # apply one of (js, css, xml) rewriters
        rewriter = rewriter_class(urlrewriter)

    # align to line end for all non-html rewriting
    align = text_type != "html"

    # Create rewriting generator
    gen = self.rewrite_text_stream_to_gen(
        stream,
        rewrite_func=rewriter.rewrite,
        final_read_func=rewriter.close,
        first_buff=first_buff,
        align_to_line=align,
    )

    return (status_headers, gen, True)