Ejemplo n.º 1
0
    def load(self, url, offset, length):
        """Load one ARC/WARC record from ``url`` and parse its headers.

        The loader is chosen from ``self.loaders`` by the URL scheme and the
        archive format from ``ArchiveLoader.FORMAT_MAP`` by the URL suffix.

        :param url: location of the archive file; its scheme selects the
            loader and its suffix selects the format.
        :param offset: position of the record, passed to the loader after
            conversion with ``long``.
        :param length: record length; any value that cannot be converted to
            an int is replaced by ``-1`` (presumably "unknown length" for the
            loader and reader -- confirm against their implementations).
        :returns: a ``WBArchiveRecord`` built from ``(format, record type)``,
            the record headers, the positioned stream and the status headers.
        :raises wbexceptions.UnknownLoaderProtocolException: no loader is
            available for the URL scheme.
        :raises wbexceptions.UnknownArchiveFormatException: the URL suffix is
            not in ``FORMAT_MAP``, or it maps to a format other than
            ``'arc'`` / ``'warc'``.
        """
        url_parts = urlparse.urlsplit(url)

        # If self.loaders is a plain dict, .get() returns None for an
        # unregistered scheme instead of raising, so a missing loader has to
        # be detected explicitly rather than only through the except clause.
        try:
            loader = self.loaders.get(url_parts.scheme)
        except Exception:
            loader = None

        if loader is None:
            raise wbexceptions.UnknownLoaderProtocolException(url)

        # First suffix (in map iteration order) that matches the url wins.
        the_format = next(
            (iformat
             for ext, iformat in ArchiveLoader.FORMAT_MAP.iteritems()
             if url.endswith(ext)),
            None)

        if the_format is None:
            raise wbexceptions.UnknownArchiveFormatException(url)

        (a_format, is_gzip) = the_format

        # Reject formats this method cannot parse before anything is opened;
        # otherwise rec_headers / rec_type / empty would be left unbound below.
        if a_format not in ('arc', 'warc'):
            raise wbexceptions.UnknownArchiveFormatException(url)

        decomp = utils.create_decompressor() if is_gzip else None

        # Best effort: a missing or non-numeric length falls back to -1.
        try:
            length = int(length)
        except (TypeError, ValueError):
            length = -1

        raw = loader.load(url, long(offset), length)

        stream = LineReader(raw, length, self.chunk_size, decomp)

        if a_format == 'arc':
            rec_headers = self.arc_parser.parse(stream)
            rec_type = 'response'
            # NOTE(review): the WARC branch compares against the string '0';
            # accept both the int and the string form here since the type
            # returned by the ARC parser is not visible from this method.
            empty = rec_headers.get_header('length') in (0, '0')
        else:
            rec_headers = self.warc_parser.parse(stream)
            rec_type = rec_headers.get_header('WARC-Type')
            empty = (rec_headers.get_header('Content-Length') == '0')

        # special case: empty w/arc record (hopefully a revisit)
        if empty:
            status_headers = StatusAndHeaders('204 No Content', [])

        # special case: warc records that are not expected to have http headers
        # attempt to add 200 status and content-type
        elif rec_type == 'metadata' or rec_type == 'resource':
            status_headers = StatusAndHeaders(
                '200 OK',
                [('Content-Type', rec_headers.get_header('Content-Type'))])

        # response record: parse HTTP status and headers!
        # NOTE: HTTP/0.9 responses (no status line or headers) are not
        # special-cased; they go through the regular HTTP parser as well.
        else:
            status_headers = self.http_parser.parse(stream)

        return WBArchiveRecord((a_format, rec_type), rec_headers, stream, status_headers)
Ejemplo n.º 2
0
    def make_response(self, wbrequest, cdx, status_headers, stream, static_path):
        """Turn a loaded record into a response, rewriting text content.

        Steps, in order:

        1. Run the self-redirect and redirect checks (return values are
           ignored; presumably they raise to abort -- confirm in the helpers).
        2. Rewrite the headers and, if a chunked transfer-encoding header was
           removed, wrap the stream so the body is de-chunked.
        3. For ``id_`` requests return the stream with the original headers
           (minus ``transfer-encoding`` when de-chunking).
        4. For non-text content return the stream with the rewritten headers.
        5. Otherwise un-gzip if needed, pick an encoding, build the rewriter
           matching the text type and return the rewritten body, buffered or
           streamed depending on ``self.buffer_response``.

        :raises Exception: the rewritten headers report a text type other
            than ``html``, ``css``, ``js`` or ``xml``.
        """
        # check and reject self-redirect, then check if a redirect is needed
        self._reject_self_redirect(wbrequest, cdx, status_headers)
        self._redirect_if_needed(wbrequest, cdx)

        urlrewriter = wbrequest.urlrewriter
        rewritten = self.header_rewriter.rewrite(status_headers, urlrewriter)

        # handle transfer-encoding: chunked
        # (de-chunking in case chunk encoding is broken)
        # TODO: investigate further
        was_chunked = rewritten.contains_removed_header('transfer-encoding', 'chunked')
        if was_chunked:
            stream = archiveloader.ChunkedLineReader(stream)

        # transparent replay: original headers, though the body may have
        # been de-chunked above
        if wbrequest.wb_url.mod == 'id_':
            if was_chunked:
                status_headers.remove_header('transfer-encoding')
            return self.create_stream_response(status_headers, stream)

        # non-text content type: pass through with the rewritten headers
        text_type = rewritten.text_type
        if text_type is None:
            return self.create_stream_response(rewritten.status_headers, stream)

        # ---- text rewriting from here on ----

        # special case -- need to ungzip the body
        if rewritten.contains_removed_header('content-encoding', 'gzip'):
            stream = archiveloader.LineReader(stream, decomp=utils.create_decompressor())

        # TODO: is this right?
        # Prefer the charset from the headers; otherwise sniff it from the
        # stream, keeping whatever was read while sniffing in first_buff.
        encoding = rewritten.charset
        first_buff = None
        if not encoding:
            encoding, first_buff = self._detect_charset(stream)
            # if chardet thinks it's ascii, use utf-8
            if encoding == 'ascii':
                encoding = 'utf-8'

        out_headers = rewritten.status_headers

        if text_type == 'html':
            head_insert_str = None
            if self.head_insert_view:
                head_insert_str = self.head_insert_view.render_to_string(
                    wbrequest=wbrequest, cdx=cdx, static_path=static_path)
            rewriter = html_rewriter.HTMLRewriter(
                urlrewriter, outstream=None, head_insert=head_insert_str)
        elif text_type == 'css':
            rewriter = regex_rewriters.CSSRewriter(urlrewriter)
        elif text_type == 'js':
            rewriter = regex_rewriters.JSRewriter(urlrewriter)
        elif text_type == 'xml':
            rewriter = regex_rewriters.XMLRewriter(urlrewriter)
        else:
            raise Exception('Unknown Text Type for Rewrite: ' + text_type)

        # Create generator for response
        body_gen = self._create_rewrite_stream(rewriter, encoding, stream, first_buff)

        if not self.buffer_response:
            return WbResponse(out_headers, value=body_gen)
        return self._create_buffer_response(out_headers, body_gen)