Esempio n. 1
0
    def __call__(self,
                 record,
                 url_rewriter,
                 cookie_rewriter,
                 head_insert_func=None,
                 cdx=None,
                 environ=None):
        """Rewrite the headers and, when applicable, the content of a record.

        :param record: The record to process; handed to ``RewriteInfo``
        :param url_rewriter: The URL rewriter; its ``rewrite_opts['cdx']`` is
            set to ``cdx`` as a side effect of this call
        :param cookie_rewriter: The cookie rewriter handed to ``RewriteInfo``
        :param head_insert_func: Optional value forwarded to
            ``create_rewriter``
        :param cdx: The CDX entry used to look up the rewrite rule
        :param dict environ: Optional WSGI environ; only
            ``HTTP_ACCEPT_ENCODING`` is read from it here
        :return: A 3-tuple of the rewritten headers, an iterator over the
            response body, and a bool telling whether a content rewriter
            was obtained
        :rtype: tuple
        """
        environ = environ or {}
        rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
        content_rewriter = None

        url_rewriter.rewrite_opts['cdx'] = cdx

        rule = self.get_rule(cdx)

        # a 'mixin' rule supplies a text type when none was detected
        if rule.get('mixin') and not rwinfo.text_type:
            rwinfo.text_type = rule.get('mixin_type', 'json')

        if rwinfo.should_rw_content():
            content_rewriter = self.create_rewriter(rwinfo.text_type, rule,
                                                    rwinfo, cdx,
                                                    head_insert_func)

        gen = None

        # check if decoding is needed
        if not rwinfo.is_content_rw:
            content_encoding = rwinfo.record.http_headers.get_header(
                'Content-Encoding')
            accept_encoding = environ.get('HTTP_ACCEPT_ENCODING', '')

            # if content-encoding is set but encoding is not in accept encoding,
            # enable content_rw force decompression
            if content_encoding and content_encoding not in accept_encoding:
                rwinfo.is_content_rw = True

        if content_rewriter:
            gen = content_rewriter(rwinfo)
        elif rwinfo.is_content_rw:
            gen = StreamIter(rwinfo.content_stream)

        # headers are rewritten only after the content generator (if any)
        # has been set up
        rw_http_headers = self.rewrite_headers(rwinfo)

        if not gen:
            # if not rewriting content, still need to dechunk
            # to conform to WSGI spec
            if rwinfo.is_chunked:
                stream = ChunkedDataReader(rwinfo.record.raw_stream,
                                           decomp_type=None)
            else:
                stream = rwinfo.record.raw_stream

            gen = StreamIter(stream)

        # identity test: comparing to None with != may invoke a custom __ne__
        return rw_http_headers, gen, (content_rewriter is not None)
Esempio n. 2
0
    def serve_cdx(self, environ, coll='$root'):
        """Run the upstream CDX query for a collection and stream back the result.

        :param dict environ: The WSGI environment dictionary for the request
        :param str coll: The name of the collection this CDX query is for
        :return: The WbResponse containing the results of the CDX query, or a
            text WbResponse with status ``400 Bad Request`` if the query fails
        :rtype: WbResponse
        """
        def with_param(url, param):
            # use '&' when the URL already carries a query string, else '?'
            joiner = '&' if '?' in url else '?'
            return url + joiner + param

        upstream_url = self.rewriterapp.paths['cdx-server'].format(coll=coll)

        # forward the caller's own query string unchanged
        if environ.get('QUERY_STRING'):
            upstream_url = with_param(upstream_url,
                                      environ.get('QUERY_STRING'))

        # append the configured result limit, when one is set
        if self.query_limit:
            upstream_url = with_param(upstream_url,
                                      'limit=' + str(self.query_limit))

        try:
            upstream = requests.get(upstream_url, stream=True)
            mime = upstream.headers.get('Content-Type')
            body = StreamIter(upstream.raw)
            return WbResponse.bin_stream(body, content_type=mime)
        except Exception as err:
            message = 'Error: ' + str(err)
            return WbResponse.text_response(message,
                                            status='400 Bad Request')
Esempio n. 3
0
    def __call__(self,
                 record,
                 url_rewriter,
                 cookie_rewriter,
                 head_insert_func=None,
                 cdx=None):
        """Rewrite the headers and, when applicable, the content of a record.

        :param record: The record to process; handed to ``RewriteInfo``
        :param url_rewriter: The URL rewriter; its ``rewrite_opts['cdx']`` is
            set to ``cdx`` as a side effect of this call
        :param cookie_rewriter: The cookie rewriter handed to ``RewriteInfo``
        :param head_insert_func: Optional value forwarded to
            ``create_rewriter``
        :param cdx: The CDX entry used to look up the rewrite rule
        :return: A 3-tuple of the rewritten headers, an iterator over the
            response body, and a bool telling whether a content rewriter
            was obtained
        :rtype: tuple
        """
        rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
        content_rewriter = None

        url_rewriter.rewrite_opts['cdx'] = cdx

        rule = self.get_rule(cdx)

        # a 'mixin' rule supplies a text type when none was detected
        if rule.get('mixin') and not rwinfo.text_type:
            rwinfo.text_type = rule.get('mixin_type', 'json')

        if rwinfo.should_rw_content():
            content_rewriter = self.create_rewriter(rwinfo.text_type, rule,
                                                    rwinfo, cdx,
                                                    head_insert_func)

        gen = None

        if content_rewriter:
            gen = content_rewriter(rwinfo)
        elif rwinfo.is_content_rw:
            gen = StreamIter(rwinfo.content_stream)

        # headers are rewritten only after the content generator (if any)
        # has been set up
        rw_http_headers = self.rewrite_headers(rwinfo)

        if not gen:
            # if not rewriting content, still need to dechunk
            # to conform to WSGI spec
            if rwinfo.is_chunked:
                stream = ChunkedDataReader(rwinfo.record.raw_stream,
                                           decomp_type=None)
            else:
                stream = rwinfo.record.raw_stream

            gen = StreamIter(stream)

        # identity test: comparing to None with != may invoke a custom __ne__
        return rw_http_headers, gen, (content_rewriter is not None)
Esempio n. 4
0
    def __call__(self, rwinfo):
        """Buffer the full content stream, then rewrite it from the buffer.

        The content is copied into a ``SpooledTemporaryFile`` created with a
        max in-memory size of ``BUFF_SIZE * 4``. The copy is done in
        ``BUFF_SIZE`` pieces so the whole payload is never requested from the
        source in a single read.

        :param rwinfo: Object providing ``content_stream``; it is also passed
            on to ``self.rewrite_stream``
        :return: A ``StreamIter`` wrapping the result of
            ``self.rewrite_stream(buffer, rwinfo)``
        """
        stream_buffer = tempfile.SpooledTemporaryFile(BUFF_SIZE * 4)

        try:
            # closing() guarantees the source stream is closed after the copy
            with closing(rwinfo.content_stream) as fh:
                while True:
                    # bounded read: an argument-less read() may load the
                    # entire payload into memory, defeating the spooling
                    buff = fh.read(BUFF_SIZE)
                    if not buff:
                        break

                    stream_buffer.write(buff)

            # rewind so the rewriter reads from the beginning
            stream_buffer.seek(0)
        except BaseException:
            # do not leak the temp file if the copy fails part-way
            stream_buffer.close()
            raise

        return StreamIter(self.rewrite_stream(stream_buffer, rwinfo))
Esempio n. 5
0
        def read_all(infos):
            """Yield the collection info, then each recording's data in turn.

            For every entry in ``infos`` the warcinfo is yielded first,
            followed by the chunks of each file of the recording that
            ``loader`` could open. Files that fail to load are reported on
            stdout and skipped.
            """
            yield coll_info

            for recording, warcinfo, _ in infos:
                yield warcinfo

                for _num, path in recording.iter_all_files():
                    try:
                        warc_fh = loader.load(path)
                    except Exception:
                        print('Skipping invalid ' + path)
                    else:
                        # stream the opened file out piece by piece
                        for block in StreamIter(warc_fh):
                            yield block
Esempio n. 6
0
        def read_all(infos):
            """Yield the collection info, then each recording's data in turn.

            For every entry in ``infos`` the warcinfo is yielded first,
            followed by the chunks of each WARC of the recording that
            ``loader`` could open. WARCs that fail to load are reported on
            stdout and skipped.
            """
            yield coll_info

            for recording, warcinfo, _ in infos:
                yield warcinfo

                for warc_path in self._iter_all_warcs(user, coll, recording['id']):
                    try:
                        fh = loader.load(warc_path)
                    except Exception:
                        # only ordinary errors mean "invalid file"; a bare
                        # except would also swallow KeyboardInterrupt and
                        # SystemExit
                        print('Skipping invalid ' + warc_path)
                        continue

                    for chunk in StreamIter(fh):
                        yield chunk
Esempio n. 7
0
    def serve_cdx(self, environ, coll='$root'):
        """Run the upstream CDX query for a collection and stream back the result.

        :param dict environ: The WSGI environment dictionary for the request
        :param str coll: The collection name substituted into the
            ``cdx-server`` path template
        :return: A binary streaming WbResponse with the upstream body, or a
            text WbResponse with status ``400 Bad Request`` if the query fails
        """
        target = self.rewriterapp.paths['cdx-server'].format(coll=coll)

        # forward the caller's query string, joining with '&' if the
        # upstream URL already has a query component
        query = environ.get('QUERY_STRING')
        if query:
            joiner = '&' if '?' in target else '?'
            target = target + joiner + query

        try:
            upstream = requests.get(target, stream=True)
            mime = upstream.headers.get('Content-Type')
            body = StreamIter(upstream.raw)
            return WbResponse.bin_stream(body, content_type=mime)
        except Exception as err:
            message = 'Error: ' + str(err)
            return WbResponse.text_response(message,
                                            status='400 Bad Request')
Esempio n. 8
0
 def read_all(fh):
     """Yield, one at a time, every chunk ``StreamIter`` produces for ``fh``."""
     chunk_iter = StreamIter(fh)
     for piece in chunk_iter:
         yield piece
Esempio n. 9
0
    def handle_call(self, environ, start_response):
        """Forward a WSGI request to the upstream host and stream back the reply.

        Unless a skip filter applies, the request body is wrapped in a
        ``ReqWrapper`` and the response body in a ``RespWrapper``
        (presumably so both can be captured -- confirm against those classes).

        :param dict environ: The WSGI environment dictionary for the request
        :param start_response: The WSGI ``start_response`` callable
        :return: The result of ``_put_record`` for a put-record request, the
            result of ``send_error`` if the upstream request fails, otherwise
            a ``StreamIter`` over the (possibly wrapped) upstream response
        """
        input_req = DirectWSGIInputRequest(environ)

        params = self._get_params(environ)

        request_uri = input_req.get_full_request_uri()

        input_buff = input_req.get_req_body()

        headers = input_req.get_req_headers()

        method = input_req.get_req_method()

        path = environ['PATH_INFO']

        # write request body as metadata/resource
        put_record = params.get('put_record')
        if put_record and method in ('PUT', 'POST'):
            return self._put_record(request_uri, input_buff, put_record,
                                    headers, params, start_response)

        # any single filter can veto wrapping of the request
        skipping = any(
            x.skip_request(path, headers) for x in self.skip_filters)

        # tracks whether req_stream is a ReqWrapper (and so has an ``out``
        # attribute that must be closed on the paths that discard it)
        req_is_wrapped = False

        if not skipping:
            req_stream = ReqWrapper(input_buff, headers, params,
                                    self.create_buff_func)
            req_is_wrapped = True
        else:
            req_stream = input_buff

        # only send a body upstream when the request actually had one
        data = None
        if input_buff:
            data = req_stream

        try:
            # redirects are not followed and the body is not pre-loaded
            res = requests.request(url=self.upstream_host + request_uri,
                                   method=method,
                                   data=data,
                                   headers=headers,
                                   allow_redirects=False,
                                   stream=True)
            res.raise_for_status()
        except Exception as e:
            # the request wrapper will not be handed to a RespWrapper,
            # so close its output here
            if req_is_wrapped:
                no_except_close(req_stream.out)
            return self.send_error(e, start_response)

        # second chance to skip, now that the response headers are known
        if not skipping:
            skipping = any(
                x.skip_response(path, req_stream.headers, res.headers, params)
                for x in self.skip_filters)

        if not skipping:
            resp_stream = RespWrapper(res.raw, res.headers, req_stream, params,
                                      self.write_queue, path,
                                      self.create_buff_func)

        else:
            # pass the upstream body through untouched and close the
            # request wrapper's output, which is no longer needed
            resp_stream = res.raw
            if req_is_wrapped:
                no_except_close(req_stream.out)

        resp_iter = StreamIter(resp_stream)

        # ensure TE header from upstream is not included,
        # added automatically by wsgi app
        res.headers.pop('Transfer-Encoding', '')

        # NOTE(review): the status line is always '200 OK' here, whatever
        # status the upstream response carried
        start_response('200 OK', list(res.headers.items()))
        return resp_iter
Esempio n. 10
0
    def __call__(self, cdx, params):
        """Load the resource for a CDX entry and build its response headers.

        :param cdx: The CDX entry to load; ``cdx['recorder_skip']`` is set to
            ``'1'`` when ``params`` has a truthy ``recorder_skip``
        :param params: Request parameters; ``compress`` and ``recorder_skip``
            are read here
        :return: ``(None, None)`` when ``load_resource`` yields nothing,
            otherwise a tuple of the output header dict and an iterator over
            the body (gzip-compressed when ``compress`` is ``'gzip'``)
        :rtype: tuple
        """
        loaded = self.load_resource(cdx, params)
        if not loaded:
            return None, None

        use_gzip = (params.get('compress') == 'gzip')

        warc_headers, other_headers, stream = loaded

        # resolved before cdx is modified below
        source = self._get_source_id(cdx)

        out_headers = {
            'Warcserver-Type': 'warc',
            'Content-Type': 'application/warc-record',
        }

        if params.get('recorder_skip'):
            out_headers['Recorder-Skip'] = '1'
            cdx['recorder_skip'] = '1'

        out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
        out_headers['Warcserver-Source-Coll'] = to_native_str(source)

        if not warc_headers:
            # no WARC headers: copy what is available from other_headers
            # and stream the body as-is
            if other_headers:
                for name in ('Link', 'Memento-Datetime'):
                    out_headers[name] = other_headers.get(name)

                if not use_gzip:
                    out_headers['Content-Length'] = other_headers.get(
                        'Content-Length')

            return out_headers, StreamIter(stream, closer=call_release_conn)

        target_uri = warc_headers.get_header('WARC-Target-URI')
        out_headers['WARC-Target-URI'] = target_uri
        out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

        # WARC-Date is converted to an HTTP date for Memento-Datetime
        out_headers['Memento-Datetime'] = datetime_to_http_date(
            iso_date_to_datetime(warc_headers.get_header('WARC-Date')))

        header_bytes = warc_headers.to_bytes()

        if not use_gzip:
            # the helper is handed out_headers; its return value is unused
            self._set_content_len(
                warc_headers.get_header('Content-Length'), out_headers,
                len(header_bytes))

        body_iter = StreamIter(stream,
                               header1=header_bytes,
                               header2=other_headers,
                               closer=call_release_conn)

        if not use_gzip:
            return out_headers, body_iter

        gzipped_iter = compress_gzip_iter(body_iter)
        out_headers['Content-Encoding'] = 'gzip'
        return out_headers, gzipped_iter