def get_rendered_original():
    """Grab a rendered resource (e.g. a screenshot) for the requested URL.

    Only reason Wayback can't do this is that it does not like the
    extended URIs i.e. 'screenshot:http://' and replaces them with
    'http://screenshot:http://'.
    """
    url = request.args.get('url')
    app.logger.debug("Got URL: %s" % url)
    # BUG FIX: this assignment was commented out, so ``type`` resolved to
    # the *builtin* ``type`` and the query URL became "<class 'type'>:...".
    type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s" % type)

    # Build the extended query URI, e.g. 'screenshot:http://...'
    qurl = "%s:%s" % (type, url)

    # Query CDX Server for the item:
    (warc_filename, warc_offset) = lookup_in_cdx(qurl)

    # If not found, say so:
    if warc_filename is None:
        abort(404)

    # Grab the payload from the WARC (via WebHDFS) and return it.
    r = requests.get("%s%s%s?op=OPEN&user.name=%s&offset=%s" %
                     (systems().webhdfs, h3().hdfs_root_folder, warc_filename,
                      webhdfs().user, warc_offset))
    app.logger.info("Loading from: %s" % r.url)
    # Keep the raw (possibly gzipped) bytes; the reader decompresses.
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(
        DecompressingBufferedReader(stream=io.BytesIO(r.content)))
    # Use the app logger instead of bare print() for debug output.
    app.logger.debug(record)
    return send_file(record.stream, mimetype=record.content_type)
def load_test_archive(test_file, offset, length):
    """Load one record from a test WARC/ARC file and pretty-print it."""
    archive_path = test_warc_dir + test_file
    loader = ArcWarcRecordLoader()
    rec = loader.load(archive_path, offset, length)
    summary = ((rec.format, rec.rec_type), rec.rec_headers, rec.status_headers)
    pprint.pprint(summary)
def __init__(self, fileobj, no_record_parse=False, verify_http=False):
    """Wrap *fileobj* and prepare a record loader for iterating it."""
    self.fh = fileobj
    self.loader = ArcWarcRecordLoader(verify_http=verify_http)
    self.no_record_parse = no_record_parse

    # Parsing state, populated as records are read.
    self.reader = None
    self.offset = 0
    self.known_format = None
    self.member_info = None
def load_test_archive(test_file, offset, length):
    """Load one record from a test archive and pretty-print it, wide format."""
    full_path = test_warc_dir + test_file
    rec = ArcWarcRecordLoader().load(full_path, offset, length)
    # Widen header wrapping so long header lines stay on one line.
    pywb.utils.statusandheaders.WRAP_WIDTH = 160
    pprint.pprint(((rec.format, rec.rec_type), rec.rec_headers,
                   rec.status_headers),
                  indent=1, width=160)
def __init__(self, framed_replay=False, jinja_env=None, config=None):
    """Set up record loading, content rewriting and the Jinja views."""
    self.loader = ArcWarcRecordLoader()

    config = config or {}
    self.paths = config['url_templates']

    self.framed_replay = framed_replay
    self.frame_mod = ''
    self.replay_mod = 'mp_'

    frame_type = 'inverse' if framed_replay else False
    self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)

    if not jinja_env:
        jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
    self.jinja_env = jinja_env

    # Template views for banner insertion, framing, errors and queries.
    self.head_insert_view = HeadInsertView(self.jinja_env,
                                           'head_insert.html', 'banner.html')
    self.frame_insert_view = TopFrameView(self.jinja_env,
                                          'frame_insert.html', 'banner.html')
    self.error_view = BaseInsertView(self.jinja_env, 'error.html')
    self.query_view = BaseInsertView(self.jinja_env,
                                     config.get('query_html', 'query.html'))

    self.cookie_tracker = None
def __init__(self, config):
    """Configure the upstream URL, record loader and content rewriter."""
    super(PlatformHandler, self).__init__(config)
    self.upstream_url = config.get('upstream_url')
    self.loader = ArcWarcRecordLoader()

    is_framed = config.get('framed_replay')
    self.content_rewriter = RewriteContent(is_framed_replay=is_framed)
def _init_replay_view(self, config):
    """Build a ReplayView backed by a path-resolving record loader."""
    loader = ArcWarcRecordLoader(cookie_maker=config.get('cookie_maker'))
    archive_paths = config.get('archive_paths')
    resolvers = PathResolverMapper()(archive_paths)
    resolving_loader = ResolvingLoader(resolvers, record_loader=loader)
    return ReplayView(resolving_loader, config)
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset):
    """Open a WARC record over WebHDFS; return (stream, content_type)."""
    # Nothing to fetch if the CDX lookup found no record.
    if warc_filename is None:
        return None, None

    # Build the WebHDFS OPEN request for the record's byte range.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)

    logger.info("Requesting copy from HDFS: %s " % url)
    r = requests.get(url, stream=True)
    logger.info("Loading from: %s" % r.url)

    # Keep the raw (possibly gzipped) bytes; the buffered reader decompresses.
    r.raw.decode_content = False
    logger.info("Passing response to parser...")
    record = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(stream=r.raw))
    logger.info("RESULT:")
    logger.info(record)

    logger.info("Returning stream...")
    return record.stream, record.content_type
def get_rendered_original(url, type='screenshot', target_timestamp=30001201235900):
    """Grab a rendered resource as a (stream, content_type) tuple.

    Only reason Wayback can't do this is that it does not like the
    extended URIs i.e. 'screenshot:http://' and replaces them with
    'http://screenshot:http://'.
    """
    # Extended query URI, e.g. 'screenshot:http://...'
    qurl = "%s:%s" % (type, url)

    # Locate the record via the CDX server.
    warc_filename, warc_offset, compressedendoffset = lookup_in_cdx(
        qurl, target_timestamp)
    if warc_filename is None:
        return None

    # Fetch the raw record bytes over WebHDFS.
    WEBHDFS_PREFIX = os.environ['WEBHDFS_PREFIX']
    WEBHDFS_USER = os.environ['WEBHDFS_USER']
    hdfs_url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        hdfs_url = "%s&length=%s" % (hdfs_url, compressedendoffset)

    r = requests.get(hdfs_url, stream=True)
    # Keep the compressed bytes intact; the buffered reader decompresses.
    r.raw.decode_content = False

    record = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(stream=r.raw))
    return (record.stream, record.content_type)
def _init_replay_view(self, config):
    """Build a WebRecReplayView, optionally adding a redis WARC resolver."""
    loader = ArcWarcRecordLoader(cookie_maker=config.get('cookie_maker'))
    resolvers = PathResolverMapper()(config.get('archive_paths'))
    resolving_loader = ResolvingLoader(resolvers, record_loader=loader)

    redis_warc_resolver = config.get('redis_warc_resolver')
    if redis_warc_resolver:
        resolving_loader.path_resolvers.append(redis_warc_resolver)

    return WebRecReplayView(resolving_loader, config)
def get_rendered_original():
    """Grab a rendered resource (e.g. a screenshot) for the requested URL.

    Only reason Wayback can't do this is that it does not like the
    extended URIs i.e. 'screenshot:http://' and replaces them with
    'http://screenshot:http://'.
    """
    url = request.args.get('url')
    app.logger.debug("Got URL: %s" % url)
    # BUG FIX: this assignment was commented out, so ``type`` resolved to
    # the *builtin* ``type`` and the query URL became "<class 'type'>:...".
    type = request.args.get('type', 'screenshot')
    app.logger.debug("Got type: %s" % type)

    # Build the extended query URI, e.g. 'screenshot:http://...'
    qurl = "%s:%s" % (type, url)

    # Query CDX Server for the item:
    (warc_filename, warc_offset) = lookup_in_cdx(qurl)

    # If not found, say so:
    if warc_filename is None:
        abort(404)

    # Grab the payload from the WARC (via WebHDFS) and return it.
    r = requests.get("%s%s%s?op=OPEN&user.name=%s&offset=%s" %
                     (systems().webhdfs, h3().hdfs_root_folder, warc_filename,
                      webhdfs().user, warc_offset))
    app.logger.info("Loading from: %s" % r.url)
    # Keep the raw (possibly gzipped) bytes; the reader decompresses.
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(
        DecompressingBufferedReader(stream=io.BytesIO(r.content)))
    # Use the app logger instead of bare print() for debug output.
    app.logger.debug(record)
    return send_file(record.stream, mimetype=record.content_type)
def __init__(self, query_handler, config=None):
    """Wire up the index reader, replay view and optional fallback handler."""
    super(WBHandler, self).__init__(config)
    self.index_reader = query_handler

    record_loader = ArcWarcRecordLoader(
        cookie_maker=config.get('cookie_maker'))
    resolving_loader = ResolvingLoader(paths=config.get('archive_paths'),
                                       record_loader=record_loader)
    self.replay = ReplayView(resolving_loader, config)

    self.fallback_handler = None
    self.fallback_name = config.get('fallback')
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    # Error text shown when one gzip member spans multiple records,
    # i.e. the archive was compressed as a single stream and cannot
    # be seeked into per-record.
    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
    """

    # Warning emitted when a record is not terminated by a newline,
    # suggesting the record's Content-Length header was wrong.
    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
    """

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False):
        # Raw (possibly gzipped) archive file handle.
        self.fh = fileobj

        # Parses individual ARC/WARC records from the decompressed stream.
        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.reader = None

        # Byte offset of the current record within the underlying file.
        self.offset = 0
        # Cached format ('warc'/'arc') once detected, for faster parsing.
        self.known_format = None

        # When converting arc->warc, the stream may mix formats, so the
        # detected format must not be cached (see _next_record).
        self.mixed_arc_warc = arc2warc

        # (offset, length) of the most recently fully-consumed record.
        self.member_info = None
        self.no_record_parse = no_record_parse

    def __call__(self, block_size=16384):
        """ iterate over each record
        """
        decomp_type = 'gzip'

        # Transparently decompresses gzip members, or passes through
        # uncompressed data unchanged.
        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                curr_offset = self.fh.tell()
                record = self._next_record(self.next_line)
                # Deferred from the previous iteration: the prior gzip
                # member contained more than one record.
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be substracted from
        the record length for uncompressed

        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                # End of stream: no trailing line to hand back.
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = (self.fh.tell() - self.reader.rem_length() -
                                  empty_size)
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            # First non-blank line belongs to the next record.
            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        # Drain the record's payload stream in chunks, feeding the
        # optional callback (e.g. a digester) as we go.
        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        # New offset = raw file position minus any bytes still buffered.
        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        # Uncompressed: the blank separator lines are not part of the
        # record length, so subtract them.
        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        # (not safe when arc->warc conversion mixes formats).
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record
class RewriterApp(object):
    """WSGI application that fetches archived content from an upstream
    service, rewrites it for replay, and renders query/frame views."""

    # Content type produced by the upstream video-info endpoint.
    VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'

    def __init__(self, framed_replay=False, jinja_env=None, config=None):
        self.loader = ArcWarcRecordLoader()

        config = config or {}
        # Maps replay 'type' -> upstream URL template (see get_base_url).
        self.paths = config['url_templates']

        self.framed_replay = framed_replay
        self.frame_mod = ''
        self.replay_mod = 'mp_'

        frame_type = 'inverse' if framed_replay else False

        self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)

        if not jinja_env:
            jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})

        self.jinja_env = jinja_env

        # Template views for banner insertion, framing, errors, queries.
        self.head_insert_view = HeadInsertView(self.jinja_env,
                                               'head_insert.html',
                                               'banner.html')
        self.frame_insert_view = TopFrameView(self.jinja_env,
                                              'frame_insert.html',
                                              'banner.html')
        self.error_view = BaseInsertView(self.jinja_env, 'error.html')
        self.query_view = BaseInsertView(self.jinja_env,
                                         config.get('query_html',
                                                    'query.html'))

        self.cookie_tracker = None

    def call_with_params(self, **kwargs):
        """Return a WSGI callable that injects *kwargs* into the environ."""
        def run_app(environ, start_response):
            environ['pywb.kwargs'] = kwargs
            return self(environ, start_response)

        return run_app

    def __call__(self, environ, start_response):
        """WSGI entry point: render content or an error page."""
        wb_url = self.get_wburl(environ)
        kwargs = environ.get('pywb.kwargs', {})

        try:
            response = self.render_content(wb_url, kwargs, environ)
        except UpstreamException as ue:
            response = self.handle_error(environ, ue)

        return response(environ, start_response)

    def is_framed_replay(self, wb_url):
        """True when this request is the outer frame of a framed replay."""
        return (self.framed_replay and
                wb_url.mod == self.frame_mod and
                wb_url.is_replay())

    def render_content(self, wb_url, kwargs, environ):
        """Fetch the record for *wb_url* upstream and return a rewritten
        WbResponse. Raises UpstreamException on upstream HTTP errors."""
        wb_url = WbUrl(wb_url)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        # Query pages and top frames are handled without an upstream fetch.
        resp = self.handle_custom_response(environ, wb_url,
                                           full_prefix, host_prefix,
                                           kwargs)
        if resp is not None:
            content_type = 'text/html'

            # if not replay outer frame, specify utf-8 charset
            if not self.is_framed_replay(wb_url):
                content_type += '; charset=utf-8'

            return WbResponse.text_response(resp, content_type=content_type)

        urlrewriter = UrlRewriter(wb_url,
                                  prefix=full_prefix,
                                  full_prefix=full_prefix,
                                  rel_prefix=rel_prefix)

        self.unrewrite_referrer(environ)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url,
                                       self.content_rewriter)

        inputreq.include_post_query(wb_url.url)

        mod_url = None
        use_206 = False
        rangeres = None

        readd_range = False
        async_record_url = None

        # For record/patch requests, Range headers need special handling so
        # the full resource still gets recorded.
        if kwargs.get('type') in ('record', 'patch'):
            rangeres = inputreq.extract_range()

            if rangeres:
                mod_url, start, end, use_206 = rangeres

                # if bytes=0- Range request,
                # simply remove the range and still proxy
                if start == 0 and not end and use_206:
                    wb_url.url = mod_url
                    inputreq.url = mod_url

                    del environ['HTTP_RANGE']
                    readd_range = True
                else:
                    # Partial range: record the full URL asynchronously and
                    # tell upstream to skip recording this ranged response.
                    async_record_url = mod_url

        skip = async_record_url is not None

        setcookie_headers = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url,
                                                         cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
                r.raw.close()
            except:
                # Best-effort: surface whatever error body was readable.
                pass

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url,
                                    details=details)

        if async_record_url:
            # Record the un-ranged URL in the background.
            environ.pop('HTTP_RANGE', '')
            gevent.spawn(self._do_async_req,
                         inputreq,
                         async_record_url,
                         wb_url,
                         kwargs,
                         False)

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(
            r.headers.get('Memento-Datetime'))
        cdx['url'] = wb_url.url

        self._add_custom_params(cdx, r.headers, kwargs)

        if readd_range:
            # Re-add the Content-Range stripped earlier for a bytes=0- req.
            content_length = (record.status_headers.
                              get_header('Content-Length'))
            try:
                content_length = int(content_length)
                record.status_headers.add_range(0, content_length,
                                                content_length)
            except (ValueError, TypeError):
                pass

        if self.is_ajax(environ):
            # AJAX responses get no banner/head insert.
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wb_url,
                                                   full_prefix, host_prefix,
                                                   top_url, environ,
                                                   self.framed_replay))

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        result = self.content_rewriter.rewrite_content(urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx,
                                                       cookie_rewriter,
                                                       environ)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        return WbResponse(status_headers, gen)

    def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
        """Return the URL of the outer (top) frame for this replay URL."""
        top_url = full_prefix
        top_url += wb_url.to_str(mod='')
        return top_url

    def _do_async_req(self, *args):
        """Issue an upstream request and drain its body, ignoring errors.

        Used via gevent.spawn to record a resource in the background."""
        count = 0
        try:
            r = self._do_req(*args)
            while True:
                buff = r.raw.read(8192)
                count += len(buff)
                if not buff:
                    return
        except:
            import traceback
            traceback.print_exc()
        finally:
            try:
                r.raw.close()
            except:
                pass

    def handle_error(self, environ, ue):
        """Render the error template for an UpstreamException."""
        error_html = self.error_view.render_to_string(environ,
                                                      err_msg=ue.url,
                                                      err_details=ue.msg)

        return WbResponse.text_response(error_html,
                                        content_type='text/html')

    def _do_req(self, inputreq, wb_url, kwargs, skip):
        """POST the reconstructed client request to the upstream service
        and return the (streaming) requests response."""
        req_data = inputreq.reconstruct_request(wb_url.url)

        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if skip:
            # Tell the upstream recorder not to record this response.
            headers['Recorder-Skip'] = '1'

        if wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wb_url.timestamp

        params = {}
        params['url'] = wb_url.url
        params['closest'] = closest

        if wb_url.mod == 'vi_':
            params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE

        upstream_url = self.get_upstream_url(wb_url, kwargs, params)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True)

        return r

    def do_query(self, wb_url, kwargs):
        """Query the upstream CDX index endpoint; return the raw text."""
        params = {}
        params['url'] = wb_url.url
        params['output'] = 'json'
        params['from'] = wb_url.timestamp
        params['to'] = wb_url.end_timestamp

        upstream_url = self.get_upstream_url(wb_url, kwargs, params)
        # Rewrite the resource endpoint into the index endpoint.
        upstream_url = upstream_url.replace('/resource/postreq', '/index')

        r = requests.get(upstream_url)

        return r.text

    def handle_query(self, environ, wb_url, kwargs):
        """Render the query (calendar) page for *wb_url*."""
        res = self.do_query(wb_url, kwargs)

        def format_cdx(text):
            # Parse the newline-delimited JSON CDX response lazily.
            cdx_lines = text.rstrip().split('\n')
            for cdx in cdx_lines:
                if not cdx:
                    continue

                cdx = json.loads(cdx)
                self.process_query_cdx(cdx, wb_url, kwargs)
                yield cdx

        prefix = self.get_full_prefix(environ)

        params = dict(url=wb_url.url,
                      prefix=prefix,
                      cdx_lines=list(format_cdx(res)))

        extra_params = self.get_query_params(wb_url, kwargs)
        if extra_params:
            params.update(extra_params)

        return self.query_view.render_to_string(environ, **params)

    def process_query_cdx(self, cdx, wb_url, kwargs):
        """Hook for subclasses to post-process each query CDX line."""
        return

    def get_query_params(self, wb_url, kwargs):
        """Hook for subclasses to add extra query-template parameters."""
        return None

    def get_host_prefix(self, environ):
        """Reconstruct scheme://host[:port] from the WSGI environ."""
        #return request.urlparts.scheme + '://' + request.urlparts.netloc
        url = environ['wsgi.url_scheme'] + '://'

        if environ.get('HTTP_HOST'):
            url += environ['HTTP_HOST']
        else:
            url += environ['SERVER_NAME']
            # Only append the port when it is not the scheme default.
            if environ['wsgi.url_scheme'] == 'https':
                if environ['SERVER_PORT'] != '443':
                    url += ':' + environ['SERVER_PORT']
            else:
                if environ['SERVER_PORT'] != '80':
                    url += ':' + environ['SERVER_PORT']

        return url

    def get_rel_prefix(self, environ):
        """Return the app mount point (SCRIPT_NAME) with trailing slash."""
        #return request.script_name
        return environ.get('SCRIPT_NAME') + '/'

    def get_full_prefix(self, environ):
        """Return host prefix + relative prefix."""
        return self.get_host_prefix(environ) + self.get_rel_prefix(environ)

    def get_wburl(self, environ):
        """Extract the wayback-style URL (path + query) from the environ."""
        wb_url = environ.get('PATH_INFO', '/')[1:]
        if environ.get('QUERY_STRING'):
            wb_url += '?' + environ.get('QUERY_STRING')

        return wb_url

    def unrewrite_referrer(self, environ):
        """Strip the replay prefix from HTTP_REFERER, restoring the live
        URL. Returns True if the referrer was rewritten."""
        referrer = environ.get('HTTP_REFERER')
        if not referrer:
            return False

        full_prefix = self.get_full_prefix(environ)

        if referrer.startswith(full_prefix):
            referrer = referrer[len(full_prefix):]
            environ['HTTP_REFERER'] = WbUrl(referrer).url
            return True

        return False

    def is_ajax(self, environ):
        """True when the request carries an XMLHttpRequest marker header."""
        value = environ.get('HTTP_X_REQUESTED_WITH')
        value = value or environ.get('HTTP_X_PYWB_REQUESTED_WITH')
        if value and value.lower() == 'xmlhttprequest':
            return True

        return False

    def get_base_url(self, wb_url, kwargs):
        """Look up the upstream URL template for this request's type."""
        type = kwargs.get('type')
        return self.paths[type]

    def get_upstream_url(self, wb_url, kwargs, params):
        """Append *params* to the upstream base URL.

        NOTE(review): appends with '&', which presumes the template
        already contains a '?' — confirm against the url_templates config.
        """
        base_url = self.get_base_url(wb_url, kwargs)
        param_str = urlencode(params, True)
        if param_str:
            base_url += '&' + param_str

        return base_url

    def get_cookie_key(self, kwargs):
        """Subclass hook: return the cookie-tracking key for this request.

        NOTE(review): this should raise NotImplementedError;
        ``NotImplemented`` is not callable, so this actually raises a
        TypeError at runtime — confirm before relying on the exception type.
        """
        raise NotImplemented()

    def _add_custom_params(self, cdx, headers, kwargs):
        """Hook for subclasses to add extra CDX fields; marks live content."""
        cdx['is_live'] = 'true'
        pass

    def get_top_frame_params(self, wb_url, kwargs):
        """Hook for subclasses to pass extra params to the top frame."""
        return None

    def handle_custom_response(self, environ, wb_url, full_prefix,
                               host_prefix, kwargs):
        """Render query pages and framed-replay top frames directly;
        return None to fall through to normal content rendering."""
        if wb_url.is_query():
            return self.handle_query(environ, wb_url, kwargs)

        if self.is_framed_replay(wb_url):
            extra_params = self.get_top_frame_params(wb_url, kwargs)
            return self.frame_insert_view.get_top_frame(wb_url,
                                                        full_prefix,
                                                        host_prefix,
                                                        environ,
                                                        self.frame_mod,
                                                        self.replay_mod,
                                                        coll='',
                                                        extra_params=extra_params)

        return None
class ArchiveIterator(object):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    # Error text shown when one gzip member spans multiple records,
    # i.e. the archive was compressed as a single stream and cannot
    # be seeked into per-record.
    GZIP_ERR_MSG = """
    ERROR: Non-chunked gzip file detected, gzip block continues beyond single record.

    This file is probably not a multi-chunk gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip chunk and concatenated together.

    This file is likely still valid and you can use it by decompressing it:

    gunzip myfile.{0}.gz

    You can then also use the 'warc2warc' tool from the 'warc-tools'
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
    """

    # Warning emitted when a record is not terminated by a newline,
    # suggesting the record's Content-Length header was wrong.
    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
    """

    def __init__(self, fileobj, no_record_parse=False, verify_http=False):
        # Raw (possibly gzipped) archive file handle.
        self.fh = fileobj

        # Parses individual ARC/WARC records from the decompressed stream.
        self.loader = ArcWarcRecordLoader(verify_http=verify_http)
        self.reader = None

        # Byte offset of the current record within the underlying file.
        self.offset = 0
        # Cached format ('warc'/'arc') once detected, for faster parsing.
        self.known_format = None

        # (offset, length) of the most recently fully-consumed record.
        self.member_info = None
        self.no_record_parse = no_record_parse

    def __call__(self, block_size=16384):
        """ iterate over each record
        """
        decomp_type = 'gzip'

        # Transparently decompresses gzip members, or passes through
        # uncompressed data unchanged.
        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()

        self.next_line = None

        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                curr_offset = self.fh.tell()
                record = self._next_record(self.next_line)
                # Deferred from the previous iteration: the prior gzip
                # member contained more than one record.
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise Exception(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be substracted from
        the record length for uncompressed

        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                # End of stream: no trailing line to hand back.
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))

                first_line = False
                continue

            # First non-blank line belongs to the next record.
            return line, empty_size

    def read_to_end(self, record, payload_callback=None):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        num = 0
        curr_offset = self.offset

        # Drain the record's payload stream in chunks, feeding the
        # optional callback (e.g. a digester) as we go.
        while True:
            b = record.stream.read(8192)
            if not b:
                break
            num += len(b)
            if payload_callback:
                payload_callback(b)

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        # New offset = raw file position minus any bytes still buffered.
        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        # Uncompressed: the blank separator lines are not part of the
        # record length, so subtract them.
        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse)

        self.member_info = None

        # Track known format for faster parsing of other records
        self.known_format = record.format

        return record
def parse_stream_error(**params):
    """Attempt to parse a record stream, printing the exception class name
    on failure.

    Returns the parsed record on success, or None when parsing raises.
    """
    try:
        return ArcWarcRecordLoader().parse_record_stream(**params)
    except Exception as e:
        # BUG FIX: was a Python 2 print statement; the rest of the file
        # uses the Python 3 print() function.
        print('Exception: ' + e.__class__.__name__)
def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
    """Set up the resolving loader.

    :param path_resolvers: resolvers used to map WARC names to full paths
    :param record_loader: loader for parsing records; when omitted a fresh
        ArcWarcRecordLoader is created per instance.
        BUG FIX: the original default ``record_loader=ArcWarcRecordLoader()``
        was evaluated once at definition time, silently sharing a single
        loader instance across every ResolvingLoader.
    :param no_record_parse: skip parsing of record headers when True
    """
    self.path_resolvers = path_resolvers
    self.record_loader = (record_loader if record_loader is not None
                          else ArcWarcRecordLoader())
    self.no_record_parse = no_record_parse
class PlatformHandler(RewriteHandler):
    """Replay handler that proxies requests to an upstream platform service,
    parses the returned WARC record and rewrites it for presentation."""

    def __init__(self, config):
        super(PlatformHandler, self).__init__(config)
        self.upstream_url = config.get('upstream_url')
        self.loader = ArcWarcRecordLoader()

        framed = config.get('framed_replay')
        self.content_rewriter = RewriteContent(is_framed_replay=framed)

    def render_content(self, wbrequest):
        """Fetch the record for *wbrequest* from upstream and return the
        rewritten response."""
        if wbrequest.wb_url.mod == 'vi_':
            return self._get_video_info(wbrequest)

        # Restore the original (live) referrer URL for the upstream request.
        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url

        urlkey = canonicalize(wbrequest.wb_url.url)
        url = wbrequest.wb_url.url

        inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        # BUG FIX: Content-Length must be a string header value; it was
        # passed as an int (the sibling RewriterApp._do_req uses str()).
        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if wbrequest.wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wbrequest.wb_url.timestamp

        upstream_url = self.upstream_url.format(url=quote(url),
                                                closest=closest,
                                                #coll=wbrequest.coll,
                                                **wbrequest.matchdict)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True,
                          allow_redirects=False)

        r.raise_for_status()

        record = self.loader.parse_record_stream(r.raw)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(
            r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)

        result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
                                                       record.status_headers,
                                                       record.stream,
                                                       head_insert_func,
                                                       urlkey,
                                                       cdx)

        # (Removed an unused unpack of ``result`` before this call.)
        return self._make_response(wbrequest, *result)