def __call__(self, record, url_rewriter, cookie_rewriter, head_insert_func=None, cdx=None, environ=None):
    """Rewrite a record's HTTP headers and content stream for replay.

    :param record: the record whose headers/content are rewritten
    :param url_rewriter: rewriter applied to URLs found in the content
    :param cookie_rewriter: rewriter applied to cookie headers
    :param head_insert_func: optional callable producing a head insert
    :param cdx: the CDX entry for this capture (stored in rewrite_opts)
    :param environ: WSGI environ, used to read HTTP_ACCEPT_ENCODING
    :return: tuple of (rewritten http headers, content iterator,
             flag indicating whether a content rewriter was used)
    """
    environ = environ or {}

    rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
    content_rewriter = None

    url_rewriter.rewrite_opts['cdx'] = cdx

    rule = self.get_rule(cdx)

    # a rule 'mixin' can force a text type when none was detected
    if rule.get('mixin') and not rwinfo.text_type:
        rwinfo.text_type = rule.get('mixin_type', 'json')

    if rwinfo.should_rw_content():
        content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)

    gen = None

    # check if decoding is needed
    if not rwinfo.is_content_rw:
        content_encoding = rwinfo.record.http_headers.get_header('Content-Encoding')
        accept_encoding = environ.get('HTTP_ACCEPT_ENCODING', '')

        # if content-encoding is set but encoding is not in accept encoding,
        # enable content_rw to force decompression
        if content_encoding and content_encoding not in accept_encoding:
            rwinfo.is_content_rw = True

    if content_rewriter:
        gen = content_rewriter(rwinfo)
    elif rwinfo.is_content_rw:
        gen = StreamIter(rwinfo.content_stream)

    rw_http_headers = self.rewrite_headers(rwinfo)

    if not gen:
        # if not rewriting content, still need to dechunk
        # to conform to WSGI spec
        if rwinfo.is_chunked:
            stream = ChunkedDataReader(rwinfo.record.raw_stream, decomp_type=None)
        else:
            stream = rwinfo.record.raw_stream

        gen = StreamIter(stream)

    # use 'is not None' for singleton comparison (PEP 8), was '!= None'
    return rw_http_headers, gen, (content_rewriter is not None)
def serve_cdx(self, environ, coll='$root'):
    """Proxy a CDX query for a collection to the upstream CDX server.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection this CDX query is for
    :return: The WbResponse containing the results of the CDX query
    :rtype: WbResponse
    """
    def add_query(url, query):
        # append query text using '?' or '&' depending on what url has
        return url + ('&' if '?' in url else '?') + query

    cdx_url = self.rewriterapp.paths['cdx-server'].format(coll=coll)

    query_string = environ.get('QUERY_STRING')
    if query_string:
        cdx_url = add_query(cdx_url, query_string)

    if self.query_limit:
        cdx_url = add_query(cdx_url, 'limit=' + str(self.query_limit))

    try:
        res = requests.get(cdx_url, stream=True)
        return WbResponse.bin_stream(StreamIter(res.raw),
                                     content_type=res.headers.get('Content-Type'))
    except Exception as e:
        return WbResponse.text_response('Error: ' + str(e),
                                        status='400 Bad Request')
def __call__(self, record, url_rewriter, cookie_rewriter, head_insert_func=None, cdx=None):
    """Rewrite a record's HTTP headers and content stream for replay.

    :param record: the record whose headers/content are rewritten
    :param url_rewriter: rewriter applied to URLs found in the content
    :param cookie_rewriter: rewriter applied to cookie headers
    :param head_insert_func: optional callable producing a head insert
    :param cdx: the CDX entry for this capture (stored in rewrite_opts)
    :return: tuple of (rewritten http headers, content iterator,
             flag indicating whether a content rewriter was used)
    """
    rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
    content_rewriter = None

    url_rewriter.rewrite_opts['cdx'] = cdx

    rule = self.get_rule(cdx)

    # a rule 'mixin' can force a text type when none was detected
    if rule.get('mixin') and not rwinfo.text_type:
        rwinfo.text_type = rule.get('mixin_type', 'json')

    if rwinfo.should_rw_content():
        content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)

    gen = None

    if content_rewriter:
        gen = content_rewriter(rwinfo)
    elif rwinfo.is_content_rw:
        gen = StreamIter(rwinfo.content_stream)

    rw_http_headers = self.rewrite_headers(rwinfo)

    if not gen:
        # if not rewriting content, still need to dechunk
        # to conform to WSGI spec
        if rwinfo.is_chunked:
            stream = ChunkedDataReader(rwinfo.record.raw_stream, decomp_type=None)
        else:
            stream = rwinfo.record.raw_stream

        gen = StreamIter(stream)

    # use 'is not None' for singleton comparison (PEP 8), was '!= None'
    return rw_http_headers, gen, (content_rewriter is not None)
def __call__(self, rwinfo):
    """Buffer the full content stream, then rewrite it.

    The content is spooled to a temporary file (kept in memory until it
    grows past BUFF_SIZE * 4, then rolled over to disk) so the rewriter
    can operate on a seekable stream.

    :param rwinfo: RewriteInfo whose content_stream is consumed and closed
    :return: StreamIter over the rewritten stream
    """
    stream_buffer = tempfile.SpooledTemporaryFile(BUFF_SIZE * 4)

    with closing(rwinfo.content_stream) as fh:
        while True:
            # read bounded chunks: fh.read() with no size could pull the
            # entire stream into memory at once, defeating the spooling
            buff = fh.read(BUFF_SIZE)
            if not buff:
                break
            stream_buffer.write(buff)

    stream_buffer.seek(0)
    return StreamIter(self.rewrite_stream(stream_buffer, rwinfo))
def read_all(infos):
    """Yield the collection info record, then, for each recording, its
    warcinfo record followed by the raw bytes of all of its files.

    Files that fail to load are skipped with a console notice.
    """
    yield coll_info

    for recording, warcinfo, _ in infos:
        yield warcinfo

        # first element of the iter_all_files() pair is unused here
        for _, warc_path in recording.iter_all_files():
            try:
                fh = loader.load(warc_path)
            except Exception:
                print('Skipping invalid ' + warc_path)
                continue

            # forward file contents chunk by chunk
            yield from StreamIter(fh)
def read_all(infos):
    """Yield the collection info record, then, for each recording, its
    warcinfo record followed by the raw bytes of all of its WARC files.

    WARC files that fail to load are skipped with a console notice.
    """
    yield coll_info

    for recording, warcinfo, _ in infos:
        yield warcinfo

        for warc_path in self._iter_all_warcs(user, coll, recording['id']):
            try:
                fh = loader.load(warc_path)
            # narrowed from bare 'except:', which would also swallow
            # KeyboardInterrupt/SystemExit/GeneratorExit
            except Exception:
                print('Skipping invalid ' + warc_path)
                continue

            # forward file contents chunk by chunk
            yield from StreamIter(fh)
def serve_cdx(self, environ, coll='$root'):
    """Proxy a CDX query for a collection to the upstream CDX server.

    :param dict environ: WSGI environment of the incoming request
    :param str coll: name of the collection the query targets
    :return: WbResponse streaming the CDX results, or an error response
    """
    cdx_url = self.rewriterapp.paths['cdx-server'].format(coll=coll)

    query_string = environ.get('QUERY_STRING')
    if query_string:
        # append using '?' or '&' depending on what the url already has
        separator = '&' if '?' in cdx_url else '?'
        cdx_url = cdx_url + separator + query_string

    try:
        res = requests.get(cdx_url, stream=True)
        return WbResponse.bin_stream(StreamIter(res.raw),
                                     content_type=res.headers.get('Content-Type'))
    except Exception as e:
        return WbResponse.text_response('Error: ' + str(e),
                                        status='400 Bad Request')
def read_all(fh):
    """Yield successive chunks of *fh* as produced by StreamIter."""
    yield from StreamIter(fh)
def handle_call(self, environ, start_response):
    """Proxy a WSGI request to the upstream host, optionally capturing
    the request and response bodies through wrapper streams.

    :param dict environ: WSGI environment for the incoming request
    :param start_response: WSGI start_response callable
    :return: iterator over the upstream response body
    """
    input_req = DirectWSGIInputRequest(environ)

    params = self._get_params(environ)

    request_uri = input_req.get_full_request_uri()

    input_buff = input_req.get_req_body()

    headers = input_req.get_req_headers()

    method = input_req.get_req_method()

    path = environ['PATH_INFO']

    # write request body as metadata/resource
    put_record = params.get('put_record')
    if put_record and method in ('PUT', 'POST'):
        return self._put_record(request_uri,
                                input_buff,
                                put_record,
                                headers,
                                params,
                                start_response)

    # ask each skip filter whether this request should bypass capture
    skipping = any(x.skip_request(path, headers) for x in self.skip_filters)

    req_is_wrapped = False

    if not skipping:
        # wrap the request body so it is captured while being forwarded
        req_stream = ReqWrapper(input_buff,
                                headers,
                                params,
                                self.create_buff_func)
        req_is_wrapped = True
    else:
        req_stream = input_buff

    data = None

    if input_buff:
        data = req_stream

    try:
        res = requests.request(url=self.upstream_host + request_uri,
                               method=method,
                               data=data,
                               headers=headers,
                               allow_redirects=False,
                               stream=True)
        res.raise_for_status()
    except Exception as e:
        # close the capture buffer before reporting the error
        if req_is_wrapped:
            no_except_close(req_stream.out)
        return self.send_error(e, start_response)

    # response-side filters may still decide to skip capture
    if not skipping:
        skipping = any(x.skip_response(path,
                                       req_stream.headers,
                                       res.headers,
                                       params)
                       for x in self.skip_filters)

    if not skipping:
        # wrap the response stream so its body is pushed to write_queue
        # while being streamed back to the client
        resp_stream = RespWrapper(res.raw,
                                  res.headers,
                                  req_stream,
                                  params,
                                  self.write_queue,
                                  path,
                                  self.create_buff_func)
    else:
        resp_stream = res.raw
        # nothing will be recorded, so release the request buffer now
        if req_is_wrapped:
            no_except_close(req_stream.out)

    resp_iter = StreamIter(resp_stream)

    # ensure TE header from upstream is not included,
    # added automatically by wsgi app
    res.headers.pop('Transfer-Encoding', '')

    start_response('200 OK', list(res.headers.items()))
    return resp_iter
def __call__(self, cdx, params):
    """Load the resource for *cdx* and package it as a 'warc'-type
    upstream response: record metadata in the returned headers dict,
    record bytes in the returned stream iterator.

    :param cdx: CDX entry describing the capture to load
    :param dict params: request params; 'compress' == 'gzip' enables
                        gzip encoding, 'recorder_skip' marks the record
    :return: tuple (out_headers, stream iterator), or (None, None)
             when no resource could be loaded
    """
    entry = self.load_resource(cdx, params)
    if not entry:
        return None, None

    compress = params.get('compress') == 'gzip'

    warc_headers, other_headers, stream = entry

    source = self._get_source_id(cdx)

    out_headers = {}
    out_headers['Warcserver-Type'] = 'warc'
    out_headers['Content-Type'] = 'application/warc-record'

    if params.get('recorder_skip'):
        # propagate skip flag both to the response and back into the cdx
        out_headers['Recorder-Skip'] = '1'
        cdx['recorder_skip'] = '1'

    out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
    out_headers['Warcserver-Source-Coll'] = to_native_str(source)

    if not warc_headers:
        # no parsed WARC headers: pass through selected upstream headers
        # and stream the raw entry as-is
        if other_headers:
            out_headers['Link'] = other_headers.get('Link')
            out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
            # Content-Length only valid when body is not being re-encoded
            if not compress:
                out_headers['Content-Length'] = other_headers.get('Content-Length')

        return out_headers, StreamIter(stream, closer=call_release_conn)

    target_uri = warc_headers.get_header('WARC-Target-URI')

    out_headers['WARC-Target-URI'] = target_uri

    out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

    memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
    out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

    warc_headers_buff = warc_headers.to_bytes()

    if not compress:
        # content length = record length plus serialized WARC headers
        lenset = self._set_content_len(warc_headers.get_header('Content-Length'),
                                       out_headers,
                                       len(warc_headers_buff))
    else:
        lenset = False

    # prepend serialized WARC headers (and any other headers) to the body
    streamiter = StreamIter(stream,
                            header1=warc_headers_buff,
                            header2=other_headers,
                            closer=call_release_conn)

    if compress:
        streamiter = compress_gzip_iter(streamiter)
        out_headers['Content-Encoding'] = 'gzip'

    #if not lenset:
    #    out_headers['Transfer-Encoding'] = 'chunked'
    #    streamiter = chunk_encode_iter(streamiter)

    return out_headers, streamiter