def load_resource(self, cdx, params):
    """Load the WARC record(s) referenced by a CDX entry.

    Args:
        cdx: index entry. Read through ``get`` for ``_cached_result``,
            ``filename``, ``offset``, ``source`` and ``status``; a
            ``_formatter`` attribute is assigned on it.
        params: request parameters. Keys starting with ``param.`` are
            copied into any follow-up index query made by the resolver.

    Returns:
        The entry's truthy ``_cached_result`` if it has one; ``None`` if
        the entry has no ``filename`` or its ``offset`` is ``None``;
        otherwise a tuple ``(warc_headers, http_headers_buff,
        payload_stream)``. ``http_headers_buff`` is ``None`` unless the
        HTTP headers were parsed for the self-redirect check.

    Raises:
        Any exception raised while parsing the HTTP headers or by
        ``self.raise_on_self_redirect``; both record streams are closed
        before it propagates.
    """
    cached = cdx.get('_cached_result')
    if cached:
        return cached

    if not cdx.get('filename') or cdx.get('offset') is None:
        return None

    # Only the first ':'-separated component of 'source' is used
    orig_source = cdx.get('source', '').split(':')[0]
    formatter = ParamFormatter(params, orig_source)
    cdx._formatter = formatter

    def local_index_query(local_params):
        # Forward the caller's 'param.*' settings to the nested query
        for n, v in six.iteritems(params):
            if n.startswith('param.'):
                local_params[n] = v

        cdx_iter, _errs = self.cdx_source(local_params)
        for found in cdx_iter:
            found._formatter = formatter
            yield found

    failed_files = []
    headers, payload = self.resolve_loader.load_headers_and_payload(
        cdx, failed_files, local_index_query)

    http_headers_buff = None
    if payload.rec_type in ('response', 'revisit'):
        status = cdx.get('status')

        # status may not be set for 'revisit'
        if not status or status.startswith('3'):
            try:
                http_headers = self.headers_parser.parse(payload.raw_stream)
                self.raise_on_self_redirect(
                    params, cdx,
                    http_headers.get_statuscode(),
                    http_headers.get_header('Location'))
            except Exception:
                # The caller never receives the streams on failure, so
                # release them here instead of leaking them.
                no_except_close(headers.raw_stream)
                no_except_close(payload.raw_stream)
                raise

            http_headers_buff = http_headers.to_bytes()

    warc_headers = payload.rec_headers

    if headers != payload:
        # Headers and payload come from different records: keep the
        # payload record's URI/date as the "refers-to" values, then
        # present the headers record's URI/date as the primary ones.
        # The refers-to values must be read before being overwritten.
        warc_headers.replace_header(
            'WARC-Refers-To-Target-URI',
            payload.rec_headers.get_header('WARC-Target-URI'))

        warc_headers.replace_header(
            'WARC-Refers-To-Date',
            payload.rec_headers.get_header('WARC-Date'))

        warc_headers.replace_header(
            'WARC-Target-URI',
            headers.rec_headers.get_header('WARC-Target-URI'))

        warc_headers.replace_header(
            'WARC-Date',
            headers.rec_headers.get_header('WARC-Date'))

        # Best-effort close: only the payload stream is handed back
        no_except_close(headers.raw_stream)

    return (warc_headers, http_headers_buff, payload.raw_stream)
def load_child_source(self, name, source, params):
    """Query one child index source and tag its results with the source name.

    ``params`` is updated in place with ``_name`` and ``_formatter`` before
    ``source.load_index(params)`` is called. The source may return either
    an iterator or an ``(iterator, errors)`` tuple. A ``WbException``
    raised during the query is converted into an empty iterator plus a
    single ``(name, repr(exc))`` error entry.

    Unless ``params['nosource']`` equals ``'true'``, every entry that has a
    ``url`` gets its ``source`` prefixed with (or set to) the source label
    and its ``source-coll`` set from ``self._get_coll(label)``. The label
    is ``name``, extended with ``':' + src_coll`` when
    ``params['param.<name>.src_coll']`` is set.

    Returns:
        A ``(cdx_iter, err_list)`` tuple.
    """
    try:
        params['_name'] = name
        params['_formatter'] = ParamFormatter(params, name)

        result = source.load_index(params)
        if isinstance(result, tuple):
            cdx_iter, err_list = result
        else:
            cdx_iter, err_list = result, []

    except WbException as wbe:
        cdx_iter = iter([])
        err_list = [(name, repr(wbe))]

    # Source tagging was explicitly disabled by the caller
    if params.get('nosource') == 'true':
        return cdx_iter, err_list

    src_coll = params.get('param.' + name + '.src_coll')
    label = name + ':' + src_coll if src_coll else name

    def tag_with_source(entry):
        # Entries without a url are passed through untouched
        if not entry.get('url'):
            return entry

        existing = entry.get('source')
        entry['source'] = label + ':' + existing if existing else label
        entry['source-coll'] = self._get_coll(label)
        return entry

    tagged_iter = (tag_with_source(entry) for entry in cdx_iter)
    return tagged_iter, err_list
def _get_params(self, environ):
    """Build the request parameter dict from the WSGI query string.

    Args:
        environ: WSGI environ mapping; only ``QUERY_STRING`` is read.

    Returns:
        A dict of the parsed query parameters, plus a ``_formatter`` entry
        holding a ``ParamFormatter`` built from those parameters and
        ``self.rec_source_name``.
    """
    # QUERY_STRING may be absent from the environ (PEP 3333), so fall
    # back to an empty string rather than handing None to parse_qsl.
    query_string = environ.get('QUERY_STRING') or ''
    params = dict(parse_qsl(query_string))
    params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
    return params
def load_resource(self, cdx, params):
    """Load the WARC record(s) referenced by a CDX entry.

    Args:
        cdx: index entry. Read through ``get`` for ``_cached_result``,
            ``filename``, ``offset``, ``source`` and ``status``; a
            ``_formatter`` attribute is assigned on it.
        params: request parameters. Keys starting with ``param.`` are
            copied into any follow-up index query made by the resolver.

    Returns:
        The entry's truthy ``_cached_result`` if it has one; ``None`` if
        the entry has no ``filename`` or its ``offset`` is ``None``;
        otherwise a tuple ``(warc_headers, http_headers_buff,
        payload_stream)``. ``http_headers_buff`` is ``None`` unless the
        HTTP headers were parsed for the self-redirect check.

    Raises:
        LiveResourceException: propagated from
            ``self.raise_on_self_redirect`` after both record streams
            have been closed.
    """
    cached = cdx.get('_cached_result')
    if cached:
        return cached

    if not cdx.get('filename') or cdx.get('offset') is None:
        return None

    # Only the first ':'-separated component of 'source' is used
    orig_source = cdx.get('source', '').split(':')[0]
    formatter = ParamFormatter(params, orig_source)
    cdx._formatter = formatter

    def local_index_query(local_params):
        # Forward the caller's 'param.*' settings to the nested query
        for n, v in six.iteritems(params):
            if n.startswith('param.'):
                local_params[n] = v

        cdx_iter, _errs = self.cdx_source(local_params)
        for found in cdx_iter:
            found._formatter = formatter
            yield found

    failed_files = []
    headers, payload = self.resolve_loader.load_headers_and_payload(
        cdx, failed_files, local_index_query)

    http_headers_buff = None
    if payload.rec_type in ('response', 'revisit'):
        status = cdx.get('status')

        # if status is not set, or is not 2xx, 4xx or 5xx,
        # go through the self-redirect check just in case
        if not status or not status.startswith(('2', '4', '5')):
            http_headers = self.headers_parser.parse(payload.raw_stream)

            # Stream position after parsing, compared below against the
            # re-serialized header length.
            # NOTE(review): presumably the stream starts at the HTTP
            # headers, so this equals their original size -- confirm.
            try:
                orig_size = payload.raw_stream.tell()
            except Exception:
                # Stream cannot report a position; skip the
                # Content-Length adjustment below.
                orig_size = 0

            try:
                self.raise_on_self_redirect(
                    params, cdx,
                    http_headers.get_statuscode(),
                    http_headers.get_header('Location'))
            except LiveResourceException:
                # The caller never receives the streams on failure,
                # so release them before propagating.
                no_except_close(headers.raw_stream)
                no_except_close(payload.raw_stream)
                raise

            http_headers_buff = http_headers.to_bytes()

            # if new http_headers_buff is different length,
            # attempt to adjust content-length on the WARC record
            if orig_size and len(http_headers_buff) != orig_size:
                orig_cl = payload.rec_headers.get_header('Content-Length')
                if orig_cl:
                    new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size)
                    payload.rec_headers.replace_header(
                        'Content-Length', str(new_cl))

    warc_headers = payload.rec_headers

    if headers != payload:
        # Headers and payload come from different records: keep the
        # payload record's URI/date as the "refers-to" values, then
        # present the headers record's URI/date as the primary ones.
        # The refers-to values must be read before being overwritten.
        warc_headers.replace_header(
            'WARC-Refers-To-Target-URI',
            payload.rec_headers.get_header('WARC-Target-URI'))

        warc_headers.replace_header(
            'WARC-Refers-To-Date',
            payload.rec_headers.get_header('WARC-Date'))

        warc_headers.replace_header(
            'WARC-Target-URI',
            headers.rec_headers.get_header('WARC-Target-URI'))

        warc_headers.replace_header(
            'WARC-Date',
            headers.rec_headers.get_header('WARC-Date'))

        # Only the payload stream is handed back to the caller
        no_except_close(headers.raw_stream)

    return (warc_headers, http_headers_buff, payload.raw_stream)