Ejemplo n.º 1
0
def test_err_compress_mix():
    """A gzip member followed by uncompressed bytes is considered invalid.

    The first read must return the decompressed member; after advancing to
    the next member, the trailing raw bytes must not be returned.
    """
    payload = compress('ABC') + b'123'
    reader = DecompressingBufferedReader(BytesIO(payload), decomp_type='gzip')

    # the valid gzip member decompresses normally
    first_member = reader.read()
    assert first_member == b'ABC'

    # the uncompressed tail is not surfaced as data
    reader.read_next_member()
    remainder = reader.read()
    assert remainder == b''
Ejemplo n.º 2
0
def test_err_compress_mix():
    """A gzip member followed by uncompressed bytes is considered invalid.

    After the compressed member has been consumed and the reader advanced to
    the next member, reading the uncompressed tail must raise ``zlib.error``.
    """
    raw = BytesIO(compress('ABC') + b'123')
    reader = DecompressingBufferedReader(raw, decomp_type='gzip')

    # Consume the valid member and advance; the return values are not needed
    # here, so they are intentionally not bound to a name.
    reader.read()
    reader.read_next_member()

    with pytest.raises(zlib.error):
        reader.read()
Ejemplo n.º 3
0
def extract_text(entry):
    """Return the complete body of ``entry`` as bytes, or ``b''``.

    The entry's buffer is rewound and read to exhaustion. If the record's
    status headers carry a ``content-encoding`` header, the buffer is wrapped
    in a ``DecompressingBufferedReader`` first.

    Returns ``b''`` when the entry has no buffer, or when
    ``is_binary_string`` reports the collected body as binary.
    """
    buff_reader = entry.buffer
    if not buff_reader:
        return b''

    buff_reader.seek(0)

    if entry.record.status_headers.get_header('content-encoding'):
        buff_reader = DecompressingBufferedReader(buff_reader)

    # Collect chunks and join once: repeated ``bytes +=`` in a loop copies
    # the accumulated data on every iteration.
    chunks = []
    while True:
        chunk = buff_reader.read()
        if not chunk:
            break
        chunks.append(chunk)

    buff = b''.join(chunks)

    if is_binary_string(buff):
        return b''

    return buff
Ejemplo n.º 4
0
def test_brotli():
    """Decompress the brotli 'quickfox' fixture and verify the decoded text."""
    path = get_test_dir() + 'text_content/quickfox_repeated.compressed'
    expected = b'The quick brown fox jumps over the lazy dog' * 4096

    with open(path, 'rb') as fh:
        reader = DecompressingBufferedReader(fh, decomp_type='br')
        # The comparison must be asserted; as a bare expression its result
        # is discarded and the test could never fail on wrong output.
        assert reader.read() == expected
Ejemplo n.º 5
0
    def rewrite_content(self, urlrewriter, headers, stream,
                        head_insert_func=None, urlkey='',
                        cdx=None):
        """Rewrite response headers and, for text content, the body stream.

        Args:
            urlrewriter: url rewriter for the request; its ``wburl``
                attribute supplies the ``is_identity``, ``is_banner_only``
                and ``mod`` values consulted here.
            headers: original response headers; passed to
                ``sanitize_content`` / ``_rewrite_headers`` and used to look
                up the original ``Content-Length``.
            stream: readable body stream.
            head_insert_func: optional callable invoked as
                ``head_insert_func(rule, cdx)``; its result is encoded with
                the detected charset and used as the html head insert.
            urlkey: key used to select the rule from ``self.ruleset``.
            cdx: passed through unchanged to ``head_insert_func``.

        Returns:
            A ``(status_headers, gen, is_rewritten)`` tuple. The third
            element is ``True`` only when the body is routed through
            ``rewrite_text_stream_to_gen``; every early-return path
            (identity, banner-only, non-text content) yields ``False``.
        """
        wb_url = urlrewriter.wburl

        # identity replay, or banner-only without anything to insert:
        # only sanitize, no rewriting at all
        if (wb_url.is_identity or
             (not head_insert_func and wb_url.is_banner_only)):
            status_headers, stream = self.sanitize_content(headers, stream)
            return (status_headers, self.stream_to_gen(stream), False)

        # banner-only: header rewriting proceeds without a url rewriter
        if wb_url.is_banner_only:
            urlrewriter = None

        rule = self.ruleset.get_first_match(urlkey)

        (rewritten_headers, stream) = self._rewrite_headers(urlrewriter,
                                                            rule,
                                                            headers,
                                                            stream)

        status_headers = rewritten_headers.status_headers

        # use rewritten headers, but no further rewriting needed
        if rewritten_headers.text_type is None:
            return (status_headers, self.stream_to_gen(stream), False)

        # Handle text content rewriting
        # ====================================================================

        text_type = rewritten_headers.text_type

        # if a known js/css modifier is specified, it is resolved against
        # the default text_type below
        mod = wb_url.mod

        # NOTE(review): str literal, while stream.read() below returns
        # whatever the stream yields -- confirm expected type on Python 3
        first_buff = ''

        # special case -- need to ungzip the body before rewriting text
        if (rewritten_headers.
             contains_removed_header('content-encoding', 'gzip')):

            #optimize: if already a ChunkedDataReader, add gzip
            if isinstance(stream, ChunkedDataReader):
                stream.set_decomp('gzip')
            else:
                stream = DecompressingBufferedReader(stream)

        if mod == 'js_':
            text_type, stream = self._resolve_text_type('js',
                                                        text_type,
                                                        stream)
        elif mod == 'cs_':
            text_type, stream = self._resolve_text_type('css',
                                                        text_type,
                                                        stream)

        rewriter_class = rule.rewriters[text_type]

        # for html, need to perform header insert, supply js, css, xml
        # rewriters
        if text_type == 'html':
            head_insert_str = ''
            charset = rewritten_headers.charset

            # if no charset set, attempt to extract from first 1024
            if not charset:
                first_buff = stream.read(1024)
                charset = self._extract_html_charset(first_buff,
                                                     status_headers)

            if head_insert_func:
                if not charset:
                    charset = 'utf-8'
                head_insert_str = head_insert_func(rule, cdx)
                head_insert_str = head_insert_str.encode(charset)

            if wb_url.is_banner_only:
                gen = self._head_insert_only_gen(head_insert_str,
                                                 stream,
                                                 first_buff)

                # a missing or non-numeric Content-Length is simply ignored
                content_len = headers.get_header('Content-Length')
                try:
                    content_len = int(content_len)
                except (TypeError, ValueError):
                    content_len = None

                # only a positive length is adjusted; zero, negative or
                # unparsable values leave the header untouched
                if content_len is not None and content_len > 0:
                    content_len = str(content_len + len(head_insert_str))
                    status_headers.replace_header('Content-Length',
                                                  content_len)

                return (status_headers, gen, False)

            rewriter = rewriter_class(urlrewriter,
                                      js_rewriter_class=rule.rewriters['js'],
                                      css_rewriter_class=rule.rewriters['css'],
                                      head_insert=head_insert_str,
                                      defmod=self.defmod,
                                      parse_comments=rule.parse_comments)

        else:
            if wb_url.is_banner_only:
                return (status_headers, self.stream_to_gen(stream), False)

            # apply one of (js, css, xml) rewriters
            rewriter = rewriter_class(urlrewriter)

        # align to line end for all non-html rewriting
        align = (text_type != 'html')

        # Create rewriting generator
        gen = self.rewrite_text_stream_to_gen(stream,
                                              rewrite_func=rewriter.rewrite,
                                              final_read_func=rewriter.close,
                                              first_buff=first_buff,
                                              align_to_line=align)

        return (status_headers, gen, True)
Ejemplo n.º 6
0
    def rewrite_content(self,
                        urlrewriter,
                        headers,
                        stream,
                        head_insert_func=None,
                        urlkey='',
                        cdx=None):
        """Rewrite response headers and, when the body is text, the body.

        Args:
            urlrewriter: url rewriter whose ``wburl`` attribute provides the
                ``is_identity``, ``is_banner_only`` and ``mod`` values used
                to choose the rewriting path.
            headers: original response headers; handed to
                ``sanitize_content`` / ``_rewrite_headers`` and queried for
                the original ``Content-Length``.
            stream: readable body stream.
            head_insert_func: optional callable, called as
                ``head_insert_func(rule, cdx)``; the returned text is encoded
                with the detected charset and supplied as the head insert.
            urlkey: lookup key for ``self.ruleset.get_first_match``.
            cdx: forwarded as-is to ``head_insert_func``.

        Returns:
            ``(status_headers, gen, is_rewritten)``. ``is_rewritten`` is
            ``True`` only for the final path that goes through
            ``rewrite_text_stream_to_gen``; all early returns give ``False``.
        """
        wb_url = urlrewriter.wburl

        # identity replay, or banner-only with nothing to insert:
        # sanitize only, skip all rewriting
        if (wb_url.is_identity
                or (not head_insert_func and wb_url.is_banner_only)):
            status_headers, stream = self.sanitize_content(headers, stream)
            return (status_headers, self.stream_to_gen(stream), False)

        # banner-only: headers are rewritten without a url rewriter
        if wb_url.is_banner_only:
            urlrewriter = None

        rule = self.ruleset.get_first_match(urlkey)

        (rewritten_headers,
         stream) = self._rewrite_headers(urlrewriter, rule, headers, stream)

        status_headers = rewritten_headers.status_headers

        # use rewritten headers, but no further rewriting needed
        if rewritten_headers.text_type is None:
            return (status_headers, self.stream_to_gen(stream), False)

        # Handle text content rewriting
        # ====================================================================

        text_type = rewritten_headers.text_type

        # if a known js/css modifier is specified, it is resolved against
        # the default text_type below
        mod = wb_url.mod

        # NOTE(review): str literal, while stream.read() below returns
        # whatever the stream yields -- confirm expected type on Python 3
        first_buff = ''

        # special case -- need to ungzip the body before rewriting text
        if (rewritten_headers.contains_removed_header('content-encoding',
                                                      'gzip')):

            #optimize: if already a ChunkedDataReader, add gzip
            if isinstance(stream, ChunkedDataReader):
                stream.set_decomp('gzip')
            else:
                stream = DecompressingBufferedReader(stream)

        if mod == 'js_':
            text_type, stream = self._resolve_text_type(
                'js', text_type, stream)
        elif mod == 'cs_':
            text_type, stream = self._resolve_text_type(
                'css', text_type, stream)

        rewriter_class = rule.rewriters[text_type]

        # for html, need to perform header insert, supply js, css, xml
        # rewriters
        if text_type == 'html':
            head_insert_str = ''
            charset = rewritten_headers.charset

            # if no charset set, attempt to extract from first 1024
            if not charset:
                first_buff = stream.read(1024)
                charset = self._extract_html_charset(first_buff,
                                                     status_headers)

            if head_insert_func:
                if not charset:
                    charset = 'utf-8'
                head_insert_str = head_insert_func(rule, cdx)
                head_insert_str = head_insert_str.encode(charset)

            if wb_url.is_banner_only:
                gen = self._head_insert_only_gen(head_insert_str, stream,
                                                 first_buff)

                # a missing or non-numeric Content-Length is simply ignored
                content_len = headers.get_header('Content-Length')
                try:
                    content_len = int(content_len)
                except (TypeError, ValueError):
                    content_len = None

                # only a positive length is adjusted; zero, negative or
                # unparsable values leave the header untouched
                if content_len is not None and content_len > 0:
                    content_len = str(content_len + len(head_insert_str))
                    status_headers.replace_header('Content-Length',
                                                  content_len)

                return (status_headers, gen, False)

            rewriter = rewriter_class(urlrewriter,
                                      js_rewriter_class=rule.rewriters['js'],
                                      css_rewriter_class=rule.rewriters['css'],
                                      head_insert=head_insert_str,
                                      defmod=self.defmod,
                                      parse_comments=rule.parse_comments)

        else:
            if wb_url.is_banner_only:
                return (status_headers, self.stream_to_gen(stream), False)

            # apply one of (js, css, xml) rewriters
            rewriter = rewriter_class(urlrewriter)

        # align to line end for all non-html rewriting
        align = (text_type != 'html')

        # Create rewriting generator
        gen = self.rewrite_text_stream_to_gen(stream,
                                              rewrite_func=rewriter.rewrite,
                                              final_read_func=rewriter.close,
                                              first_buff=first_buff,
                                              align_to_line=align)

        return (status_headers, gen, True)